diff --git a/.cloudbees/aws-nuke/bp-tf-ci-nuke.yaml b/.cloudbees/aws-nuke/bp-tf-ci-nuke.yaml deleted file mode 100644 index 3198a3c6..00000000 --- a/.cloudbees/aws-nuke/bp-tf-ci-nuke.yaml +++ /dev/null @@ -1,141 +0,0 @@ -regions: -- us-east-1 -- us-west-2 -- global - -account-blocklist: -- "999999999999" # production - -# aws-nuke resource-types ==> to list supported resource types -resource-types: - targets: - - ACMCertificate - - CloudWatchLogsLogGroup - - DynamoDBTable - - EC2Instance - - EC2InternetGateway - - EC2LaunchTemplate - - EC2NATGateway - - EC2NetworkACL - - EC2RouteTable - - EC2SecurityGroup - - EC2Snapshot - - EC2Subnet - - EC2Volume - - ELBv2 - - ELBv2TargetGroup - - KMSKey - - KMSAlias - - RDSSnapshot - - S3Bucket - - SecretsManagerSecret - # - IAMPolicy - # - IAMInstanceProfile - -accounts: - 324005994172: - filters: - ACMCertificate: - - property: 'tag:cb-user' - type: exact - value: "cb-platform" - invert: true - CloudWatchLogsLogGroup: - - property: 'tag:cb-user' - type: exact - value: "cb-platform" - invert: true - DynamoDBTable: - - property: 'tag:cb-user' - type: exact - value: "cb-platform" - invert: true - EC2Instance: - - property: 'tag:cb-user' - type: exact - value: "cb-platform" - invert: true - EC2InternetGateway: - - property: 'tag:cb-user' - type: exact - value: "cb-platform" - invert: true - EC2LaunchTemplate: - - property: 'tag:cb-user' - type: exact - value: "cb-platform" - invert: true - EC2NATGateway: - - property: 'tag:cb-user' - type: exact - value: "cb-platform" - invert: true - EC2NetworkACL: - - property: 'tag:cb-user' - type: exact - value: "cb-platform" - invert: true - EC2RouteTable: - - property: 'tag:cb-user' - type: exact - value: "cb-platform" - invert: true - EC2SecurityGroup: - - property: 'tag:cb-user' - type: exact - value: "cb-platform" - invert: true - EC2Snapshot: - - property: 'tag:cb-user' - type: exact - value: "cb-platform" - invert: true - EC2Subnet: - - property: 'tag:cb-user' - type: exact - value: 
"cb-platform" - invert: true - EC2Volume: - - property: 'tag:cb-user' - type: exact - value: "cb-platform" - invert: true - ELBv2: - - property: 'tag:cb-user' - type: exact - value: "cb-platform" - invert: true - ELBv2TargetGroup: - - property: 'tag:cb-user' - type: exact - value: "cb-platform" - invert: true - KMSKey: - - property: 'tag:cb-user' - type: exact - value: "cb-platform" - invert: true - KMSAlias: - # - property: 'tag:cb-user' - # type: exact - # value: "cb-platform" - # invert: true - - property: 'Name' - type: contains - value: "-ci-" - invert: true - RDSSnapshot: - - property: 'tag:cb-user' - type: exact - value: "cb-platform" - invert: true - S3Bucket: - - property: 'tag:cb-user' - type: exact - value: "cb-platform" - invert: true - SecretsManagerSecret: - - property: 'tag:cb-user' - type: exact - value: "cb-platform" - invert: true diff --git a/.cloudbees/workflows/bp-agent-ecr.yaml b/.cloudbees/workflows/bp-agent-ecr.yaml index 1ddc8ec9..6105f1b2 100644 --- a/.cloudbees/workflows/bp-agent-ecr.yaml +++ b/.cloudbees/workflows/bp-agent-ecr.yaml @@ -16,10 +16,10 @@ env: RESPOSITORY: cloudbees-labs/tf-aws-cb-ci-eks-addon-agent VERSION: latest AWS_REGION: us-west-2 + #TODO: Replace by rootless image when it is supported DOCKERFILE: .docker/agent/agent.root.Dockerfile jobs: - #TODO: Verify if the repository is created before running this job. If not, create it (including tags). build_and_push_images: steps: - name: Checkout code @@ -36,7 +36,6 @@ jobs: id: login-ecr uses: cloudbees-io/configure-ecr-credentials@v1 - #TODO: Replace by rootless image when it is supported - name: Build, tag, and push docker image to Amazon ECR uses: cloudbees-io/kaniko@v1 with: diff --git a/.cloudbees/workflows/bp-tf-cd.yaml b/.cloudbees/workflows/bp-tf-cd.yaml deleted file mode 100644 index a2153a70..00000000 --- a/.cloudbees/workflows/bp-tf-cd.yaml +++ /dev/null @@ -1,257 +0,0 @@ -# Copyright (c) CloudBees, Inc. 
- -# Stages -# CD: deploy,validate,onboarding -# Nuke: wipeout - -apiVersion: automation.cloudbees.io/v1alpha1 -kind: workflow -name: ci - -on: - workflow_dispatch: - -env: - AWS_REGION_TF_BUCKET: "us-east-1" - BUCKET_NAME_TF_STATE: "cbci-eks-addon-tf-state-cd" - AWS_ROLE_TO_ASSUME: "infra-admin-ci" - TF_VAR_suffix: "ci-v11" - TF_VAR_aws_region: "us-west-2" - TF_AUTO_VARS_FILE: | - tags = { - "cb-owner" : "professional-services" - "cb-user" : "cb-platform" - "cb-purpose" : "cd" - } - trial_license = { - first_name = "CloudBees.io" - last_name = "Platform" - email = "ci.user@cloudbees.io" - company = "CloudBees Inc." - } - ci = true - -jobs: - init: - steps: - - - name: Configure AWS Credentials - uses: cloudbees-io/configure-aws-credentials@v1 - with: - aws-region: ${{ env.AWS_REGION_TF_BUCKET }} - aws-access-key-id: ${{ secrets.AWS_TF_CBCI_EKS_AccessKeyID }} - aws-secret-access-key: ${{ secrets.AWS_TF_CBCI_EKS_SecretAccessKey }} - role-to-assume: ${{ env.AWS_ROLE_TO_ASSUME }} - role-external-id: cloudbees - role-duration-seconds: "3600" - - #TODO: Add tags for the bucket - - name: Create Terraform Backend Bucket if not exists - uses: docker://public.ecr.aws/r1n1q0e5/cloudbees-labs/tf-aws-cb-ci-eks-addon-agent:latest - shell: bash - run: | - set -x - aws s3api create-bucket \ - --bucket ${{ env.BUCKET_NAME_TF_STATE }} \ - --region ${{ env.AWS_REGION_TF_BUCKET }} || echo "Bucket ${{ env.BUCKET_NAME_TF_STATE }} already exists" - - bp01: - env: - ROOT: 01-getting-started - TF_VAR_hosted_zone: bp01-cd.aws.ps.beescloud.com - STAGES: "wipeout" - needs: - - init - steps: - - - name: Configure AWS Credentials - uses: cloudbees-io/configure-aws-credentials@v1 - with: - aws-region: ${{ env.TF_VAR_aws_region }} - aws-access-key-id: ${{ secrets.AWS_TF_CBCI_EKS_AccessKeyID }} - aws-secret-access-key: ${{ secrets.AWS_TF_CBCI_EKS_SecretAccessKey }} - role-to-assume: ${{ env.AWS_ROLE_TO_ASSUME }} - role-external-id: cloudbees - role-duration-seconds: "3600" - - - name: Checkout 
code - uses: cloudbees-io/checkout@v1 - - - name: 01-getting-started - Set - uses: docker://public.ecr.aws/r1n1q0e5/cloudbees-labs/tf-aws-cb-ci-eks-addon-agent:latest - shell: bash - run : | - cat <> blueprints/${{ env.ROOT }}/.auto.tfvars - ${{ env.TF_AUTO_VARS_FILE }} - EOT - cat blueprints/${{ env.ROOT }}/.auto.tfvars - cat <> blueprints/${{ env.ROOT }}/backend.tf - terraform { - backend "s3" { - bucket = "${{ env.BUCKET_NAME_TF_STATE }}" - key = "${{ env.ROOT }}/ci.terraform.tfstate" - region = "${{ env.AWS_REGION_TF_BUCKET }}" - } - } - EOT - - - name: 01-getting-started - Deploy - uses: docker://public.ecr.aws/r1n1q0e5/cloudbees-labs/tf-aws-cb-ci-eks-addon-agent:latest - if: contains(env.STAGES, 'deploy') - shell: bash - run : | - set -x - aws kms delete-alias --alias-name alias/eks/cbci-bp01-${{ env.TF_VAR_suffix }}-eks --region ${{ env.TF_VAR_aws_region }} || echo "Alias alias/eks/cbci-bp01-${{ env.TF_VAR_suffix }}-eks does not exist" - aws kms delete-alias --alias-name alias/eks/cbci-bp01-${{ env.TF_VAR_suffix }} --region ${{ env.TF_VAR_aws_region }} || echo "Alias alias/eks/cbci-bp01-${{ env.TF_VAR_suffix }} does not exist" - CI=true make deploy - aws s3api put-object \ - --bucket ${{ env.BUCKET_NAME_TF_STATE }} \ - --region ${{ env.AWS_REGION_TF_BUCKET }} \ - --key ${{ env.ROOT }}/${{ env.ROOT }}.terraform.output \ - --body blueprints/${{ env.ROOT }}/terraform.output - - - name: 01-getting-started - Validate - uses: docker://public.ecr.aws/r1n1q0e5/cloudbees-labs/tf-aws-cb-ci-eks-addon-agent:latest - if: contains(env.STAGES, 'validate') - shell: bash - run : | - CI=true make validate - - - name: 01-getting-started - Destroy - uses: docker://public.ecr.aws/r1n1q0e5/cloudbees-labs/tf-aws-cb-ci-eks-addon-agent:latest - if: contains(env.STAGES, 'destroy') - shell: bash - run : | - CI=true make destroy - - - name: 01-getting-started - Wipeout - uses: docker://public.ecr.aws/r1n1q0e5/cloudbees-labs/tf-aws-cb-ci-eks-addon-agent:latest - if: contains(env.STAGES, 
'wipeout') - shell: bash - run : | - terraform -chdir=blueprints/${{ env.ROOT }} init -reconfigure && CI=true make destroy - - - name: 01-getting-started - Role Onboarding - uses: docker://public.ecr.aws/r1n1q0e5/cloudbees-labs/tf-aws-cb-ci-eks-addon-agent:latest - if: contains(env.STAGES, 'onboarding') - env: - TARGET_ROLE: arn:aws:iam::324005994172:role/AWSReservedSSO_infra-admin_256addbf79cfacd1 - shell: bash - run : | - set -x - cd blueprints/${{ env.ROOT }} && eval $(terraform output --raw kubeconfig_export) - kubectl describe configmap aws-auth -n kube-system - eksctl create iamidentitymapping \ - --cluster $(terraform output --raw eks_cluster_name) \ - --region ${{ env.TF_VAR_aws_region }} \ - --arn ${{ env.TARGET_ROLE }} \ - --username k8s-admin-rol \ - --group system:masters - kubectl describe configmap aws-auth -n kube-system - - bp02: - env: - ROOT: 02-at-scale - TF_VAR_hosted_zone: bp02-cd.aws.ps.beescloud.com - STAGES: "wipeout" - needs: - - init - steps: - - - name: Configure AWS Credentials - uses: cloudbees-io/configure-aws-credentials@v1 - with: - aws-region: ${{ env.TF_VAR_aws_region }} - aws-access-key-id: ${{ secrets.AWS_TF_CBCI_EKS_AccessKeyID }} - aws-secret-access-key: ${{ secrets.AWS_TF_CBCI_EKS_SecretAccessKey }} - role-to-assume: ${{ env.AWS_ROLE_TO_ASSUME }} - role-external-id: cloudbees - role-duration-seconds: "3600" - - - name: Checkout code - uses: cloudbees-io/checkout@v1 - - - name: 02-at-scale - Set - uses: docker://public.ecr.aws/r1n1q0e5/cloudbees-labs/tf-aws-cb-ci-eks-addon-agent:latest - shell: bash - run : | - cat <> blueprints/${{ env.ROOT }}/.auto.tfvars - ${{ env.TF_AUTO_VARS_FILE }} - dh_reg_secret_auth = { - username = "${{ secrets.AWS_TF_CBCI_EKS_DHUser }}" - password = "${{ secrets.AWS_TF_CBCI_EKS_DHPass }}" - email = "${{ secrets.AWS_TF_CBCI_EKS_DHMail }}" - } - EOT - cat blueprints/${{ env.ROOT }}/.auto.tfvars - cat <> blueprints/${{ env.ROOT }}/backend.tf - terraform { - backend "s3" { - bucket = "${{ 
env.BUCKET_NAME_TF_STATE }}" - key = "${{ env.ROOT }}/ci.terraform.tfstate" - region = "${{ env.AWS_REGION_TF_BUCKET }}" - } - } - EOT - - - name: 02-at-scale - Deploy - uses: docker://public.ecr.aws/r1n1q0e5/cloudbees-labs/tf-aws-cb-ci-eks-addon-agent:latest - if: contains(env.STAGES, 'deploy') - shell: bash - run : | - set -x - aws kms delete-alias --alias-name alias/eks/cbci-bp02-${{ env.TF_VAR_suffix }}-eks --region ${{ env.TF_VAR_aws_region }} || echo "Alias alias/eks/cbci-bp02-${{ env.TF_VAR_suffix }}-eks does not exist" - aws kms delete-alias --alias-name alias/eks/cbci-bp02-${{ env.TF_VAR_suffix }} --region ${{ env.TF_VAR_aws_region }} || echo "Alias alias/eks/cbci-bp02-${{ env.TF_VAR_suffix }} does not exist" - CI=true make deploy - aws s3api put-object \ - --bucket ${{ env.BUCKET_NAME_TF_STATE }} \ - --region ${{ env.AWS_REGION_TF_BUCKET }} \ - --key ${{ env.ROOT }}/${{ env.ROOT }}.terraform.output \ - --body blueprints/${{ env.ROOT }}/terraform.output - # TODO: Add vault init log to s3 - # cd blueprints/${{ env.ROOT }} && eval $(terraform output --raw kubeconfig_export) - # cd blueprints/${{ env.ROOT }} && eval $(terraform output --raw vault_init) - # aws s3api put-object \ - # --bucket ${{ env.BUCKET_NAME_TF_STATE }} \ - # --region ${{ env.AWS_REGION_TF_BUCKET }} \ - # --key ${{ env.ROOT }}/${{ env.ROOT }}.vault.init.log \ - # --body $(cd blueprints/${{ env.ROOT }} && terraform output --raw vault_init_log_file) || echo "No vault-init.log found" - - - name: 02-at-scale - Validate - uses: docker://public.ecr.aws/r1n1q0e5/cloudbees-labs/tf-aws-cb-ci-eks-addon-agent:latest - if: contains(env.STAGES, 'validate') - shell: bash - run : | - CI=true make validate - - - name: 02-at-scale - Destroy - uses: docker://public.ecr.aws/r1n1q0e5/cloudbees-labs/tf-aws-cb-ci-eks-addon-agent:latest - if: contains(env.STAGES, 'destroy') - shell: bash - run : | - CI=true make destroy - - - name: 02-at-scale - Wipeout - uses: 
docker://public.ecr.aws/r1n1q0e5/cloudbees-labs/tf-aws-cb-ci-eks-addon-agent:latest - if: contains(env.STAGES, 'wipeout') - shell: bash - run : | - terraform -chdir=blueprints/${{ env.ROOT }} init -reconfigure && CI=true make destroy - - - name: 02-at-scale - Role Onboarding - uses: docker://public.ecr.aws/r1n1q0e5/cloudbees-labs/tf-aws-cb-ci-eks-addon-agent:latest - if: contains(env.STAGES, 'onboarding') - env: - TARGET_ROLE: arn:aws:iam::324005994172:role/AWSReservedSSO_infra-admin_256addbf79cfacd1 - shell: bash - run : | - set -x - cd blueprints/${{ env.ROOT }} && eval $(terraform output --raw kubeconfig_export) - kubectl describe configmap aws-auth -n kube-system - eksctl create iamidentitymapping \ - --cluster $(terraform output --raw eks_cluster_name) \ - --region ${{ env.TF_VAR_aws_region }} \ - --arn ${{ env.TARGET_ROLE }} \ - --username k8s-admin-rol \ - --group system:masters - kubectl describe configmap aws-auth -n kube-system diff --git a/.cloudbees/workflows/bp-tf-ci.yaml b/.cloudbees/workflows/bp-tf-ci.yaml index f00812e1..292c0cfe 100644 --- a/.cloudbees/workflows/bp-tf-ci.yaml +++ b/.cloudbees/workflows/bp-tf-ci.yaml @@ -1,10 +1,5 @@ # Copyright (c) CloudBees, Inc. 
-# Stages -# CI: deploy,validate,destroy -# Troubleshooting: deploy,validate,onboarding -# Nuke (Delete Dangling resource): wipeout - apiVersion: automation.cloudbees.io/v1alpha1 kind: workflow name: ci @@ -19,10 +14,11 @@ on: workflow_dispatch: env: - AWS_REGION_TF_BUCKET: "us-east-1" - BUCKET_NAME_TF_STATE: "cbci-eks-addon-tf-state-ci" + BUCKET_NAME_TF_STATE: cbci-eks-addon-bp + AWS_REGION_TF_BUCKET: us-east-1 + TAGS_TF_BUCKET: '[{Key=cb-owner,Value=professional-services},{Key=cb-user,Value=cb-platform},{Key=cb-purpose,Value=production shared cluster}]' AWS_ROLE_TO_ASSUME: "infra-admin-ci" - TF_VAR_suffix: "ci-v11" + TF_VAR_suffix: "ci" TF_VAR_aws_region: "us-west-2" TF_AUTO_VARS_FILE: | tags = { @@ -33,7 +29,7 @@ env: trial_license = { first_name = "CloudBees.io" last_name = "Platform" - email = "ci.user@cloudbees.io" + email = "ci.cbci.eks.bp@cloudbees.io" company = "CloudBees Inc." } ci = true @@ -58,15 +54,25 @@ jobs: shell: bash run: | set -x - aws s3api create-bucket \ - --bucket ${{ env.BUCKET_NAME_TF_STATE }} \ - --region ${{ env.AWS_REGION_TF_BUCKET }} || echo "Bucket ${{ env.BUCKET_NAME_TF_STATE }} already exists" + if aws s3api head-bucket --bucket ${{ env.BUCKET_NAME_TF_STATE }} 2>/dev/null; then + echo "Bucket ${{ env.BUCKET_NAME_TF_STATE }} already exists." + else + echo "Bucket ${{ env.BUCKET_NAME_TF_STATE }} does not exist. Creating now..." 
+ aws s3api create-bucket --bucket ${{ env.BUCKET_NAME_TF_STATE }} \ + --region ${{ env.AWS_REGION_TF_BUCKET }} + fi + aws s3api put-bucket-tagging --bucket ${{ env.BUCKET_NAME_TF_STATE }} \ + --tagging 'TagSet=${{ env.TAGS_TF_BUCKET }}' bp01: env: + # Stages + # CI: deploy,validate,destroy + # Troubleshooting: deploy,validate,onboarding + # Nuke (Delete Dangling resource): wipeout + STAGES: "deploy,validate,destroy" ROOT: 01-getting-started TF_VAR_hosted_zone: bp01-ci.aws.ps.beescloud.com - STAGES: "deploy,validate,destroy" needs: - init steps: @@ -96,30 +102,39 @@ jobs: terraform { backend "s3" { bucket = "${{ env.BUCKET_NAME_TF_STATE }}" - key = "${{ env.ROOT }}/ci.terraform.tfstate" + key = "${{ env.ROOT }}/ci/terraform.tfstate" region = "${{ env.AWS_REGION_TF_BUCKET }}" } } EOT + echo "Environment Variables:" + printenv - name: 01-getting-started - Deploy uses: docker://public.ecr.aws/r1n1q0e5/cloudbees-labs/tf-aws-cb-ci-eks-addon-agent:latest if: contains(env.STAGES, 'deploy') + kind: build shell: bash run : | set -x - aws kms delete-alias --alias-name alias/eks/cbci-bp01-${{ env.TF_VAR_suffix }}-eks --region ${{ env.TF_VAR_aws_region }} || echo "Alias alias/eks/cbci-bp01-${{ env.TF_VAR_suffix }}-eks does not exist" - aws kms delete-alias --alias-name alias/eks/cbci-bp01-${{ env.TF_VAR_suffix }} --region ${{ env.TF_VAR_aws_region }} || echo "Alias alias/eks/cbci-bp01-${{ env.TF_VAR_suffix }} does not exist" + # Delete the bp01 KMS aliases before deploying (resolves issue #66) + aliases=("alias/eks/cbci-bp01-${{ env.TF_VAR_suffix }}-eks" "alias/eks/cbci-bp01-${{ env.TF_VAR_suffix }}") + for alias in "${aliases[@]}"; do + aws kms delete-alias \ + --alias-name $alias \ + --region ${{ env.TF_VAR_aws_region }} || echo "$alias does not exist" + done CI=true make deploy aws s3api put-object \ - --bucket ${{ env.BUCKET_NAME_TF_STATE }} \ - --region ${{ env.AWS_REGION_TF_BUCKET }} \ - --key ${{ env.ROOT }}/${{ env.ROOT }}.terraform.output \ - --body blueprints/${{ env.ROOT }}/terraform.output + --bucket "${{ 
env.BUCKET_NAME_TF_STATE }}" \ + --region "${{ env.AWS_REGION_TF_BUCKET }}" \ + --body blueprints/${{ env.ROOT }}/terraform.output \ + --key ${{ env.ROOT }}/ci/${{ env.ROOT }}.terraform.output || echo "Failed to put blueprints/${{ env.ROOT }}/terraform.output object in ${{ env.BUCKET_NAME_TF_STATE }}" - name: 01-getting-started - Validate uses: docker://public.ecr.aws/r1n1q0e5/cloudbees-labs/tf-aws-cb-ci-eks-addon-agent:latest if: contains(env.STAGES, 'validate') + kind: test shell: bash run : | CI=true make validate @@ -158,9 +173,13 @@ jobs: bp02: env: + # Stages + # CI: deploy,validate,destroy + # Troubleshooting: deploy,validate,onboarding + # Nuke (Delete Dangling resource): wipeout + STAGES: "deploy,validate,destroy" ROOT: 02-at-scale TF_VAR_hosted_zone: bp02-ci.aws.ps.beescloud.com - STAGES: "deploy,validate,destroy" needs: - init steps: @@ -195,30 +214,39 @@ jobs: terraform { backend "s3" { bucket = "${{ env.BUCKET_NAME_TF_STATE }}" - key = "${{ env.ROOT }}/ci.terraform.tfstate" + key = "${{ env.ROOT }}/ci/terraform.tfstate" region = "${{ env.AWS_REGION_TF_BUCKET }}" } } EOT + echo "Environment Variables:" + printenv - name: 02-at-scale - Deploy uses: docker://public.ecr.aws/r1n1q0e5/cloudbees-labs/tf-aws-cb-ci-eks-addon-agent:latest if: contains(env.STAGES, 'deploy') + kind: build shell: bash run : | set -x - aws kms delete-alias --alias-name alias/eks/cbci-bp02-${{ env.TF_VAR_suffix }}-eks --region ${{ env.TF_VAR_aws_region }} || echo "Alias alias/eks/cbci-bp02-${{ env.TF_VAR_suffix }}-eks does not exist" - aws kms delete-alias --alias-name alias/eks/cbci-bp02-${{ env.TF_VAR_suffix }} --region ${{ env.TF_VAR_aws_region }} || echo "Alias alias/eks/cbci-bp02-${{ env.TF_VAR_suffix }} does not exist" + # It Resolves Issue #66 + aliases=("alias/eks/cbci-bp02-${{ env.TF_VAR_suffix }}-eks" "alias/eks/cbci-bp02-${{ env.TF_VAR_suffix }}") + for alias in "${aliases[@]}"; do + aws kms delete-alias \ + --alias-name $alias \ + --region ${{ env.TF_VAR_aws_region }} || echo "$alias does not exist" + done 
CI=true make deploy aws s3api put-object \ - --bucket ${{ env.BUCKET_NAME_TF_STATE }} \ - --region ${{ env.AWS_REGION_TF_BUCKET }} \ - --key ${{ env.ROOT }}/${{ env.ROOT }}.terraform.output \ - --body blueprints/${{ env.ROOT }}/terraform.output + --bucket "${{ env.BUCKET_NAME_TF_STATE }}" \ + --region "${{ env.AWS_REGION_TF_BUCKET }}" \ + --body blueprints/${{ env.ROOT }}/terraform.output \ + --key ${{ env.ROOT }}/ci/${{ env.ROOT }}.terraform.output || echo "Failed to put blueprints/${{ env.ROOT }}/terraform.output object in ${{ env.BUCKET_NAME_TF_STATE }}" - name: 02-at-scale - Validate uses: docker://public.ecr.aws/r1n1q0e5/cloudbees-labs/tf-aws-cb-ci-eks-addon-agent:latest if: contains(env.STAGES, 'validate') + kind: test shell: bash run : | CI=true make validate @@ -245,10 +273,10 @@ jobs: shell: bash run : | set -x - cd blueprints/${{ env.ROOT }} && eval $(terraform output --raw kubeconfig_export) + eval $(terraform -chdir="blueprints/${{ env.ROOT }}" output --raw kubeconfig_export) kubectl describe configmap aws-auth -n kube-system eksctl create iamidentitymapping \ - --cluster $(terraform output --raw eks_cluster_name) \ + --cluster $(terraform -chdir="blueprints/${{ env.ROOT }}" output --raw eks_cluster_name) \ --region ${{ env.TF_VAR_aws_region }} \ --arn ${{ env.TARGET_ROLE }} \ --username k8s-admin-rol \ diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 8c49e7ad..9918f5eb 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -14,10 +14,10 @@ This document provides guidelines for contributing to the CloudBees CI add-on fo - The `source` field in the `eks_blueprints_addon_cbci` at blueprints must point to the remote [terraform registry version](https://registry.terraform.io/modules/cloudbees/cloudbees-ci-eks-addon/aws/latest) and `version >= "x.x.x"`. It is important for the telemetry in https://registry.terraform.io/modules/cloudbees/cloudbees-ci-eks-addon/aws/latest. 
- The CasC bundles SCM configuration must point to the https://github.com/cloudbees/terraform-aws-cloudbees-ci-eks-addon repository and its `main` branch. - `develop` branch: - - It is the integration branch and it is used for testing new features and updates before merging them into the `main` branch. + - It is the integration branch, and is used for testing new features and updates before merging them into the `main` branch. - Requirements: - - The `source` field in the `eks_blueprints_addon_cbci` in the blueprints folder must point to the local root of the https://github.com/cloudbees/terraform-aws-cloudbees-ci-eks-addon repository (for example, `source = "../../"`). - - The CasC bundles SCM configuration must point to the https://github.com/cloudbees/terraform-aws-cloudbees-ci-eks-addon repository and its `develop` branch. + - The `source` field in the `eks_blueprints_addon_cbci` in the blueprints folder must point to the local root of the [terraform-aws-cloudbees-ci-eks-addon](https://github.com/cloudbees/terraform-aws-cloudbees-ci-eks-addon) repository (for example, `source = "../../"`). + - The CasC bundles SCM configuration must point to the `develop` branch in the [terraform-aws-cloudbees-ci-eks-addon](https://github.com/cloudbees/terraform-aws-cloudbees-ci-eks-addon) repository. ## Report bugs and feature requests @@ -54,7 +54,7 @@ To submit a pull request: 7. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. > [!IMPORTANT] -> If you make updates to embeded repository (e.g. CasC bundles), you must push the changes to the public upstream (repository/branch) before running `terraform apply` locally. The endpoint and/or branch can be updated via `set-casc-location` from the companion [Makefile](Makefile). +> If you make updates to embedded repository (for example, CasC bundles), you must push the changes to the public upstream (repository/branch) before running `terraform apply` locally. 
The endpoint and/or branch can be updated via `set-casc-location` from the companion [Makefile](Makefile). ### Pre-commits: Linting, formatting and secrets scanning @@ -73,7 +73,7 @@ Validate your pull request changes inside the blueprint agent described in the [ The [bp-tf-ci.yaml](.cloudbees/workflows/bp-tf-ci.yaml) blueprints are orchestrated into the [CloudBees platform](https://www.cloudbees.com/products/saas-platform) inside the [CloudBees Professional Services (PS) sub-organization](https://cloudbees.io/orgs/cloudbees~professional-services/components/94c50dcf-125e-4767-b9c5-58d6d669a1f6/runs). > [!NOTE] -> At the time of writing, the pipeline triggers on `push` events only and not for `pull_requests`. Although pull request event is supported, it is requires filters for file patters ()`*.tf`). +> The pipeline triggers on `push` events only, and does not trigger for `pull_requests`. Although the `pull_requests` event is supported, it requires filters for file patterns (for example, `*.tf`). #### Prerequisites @@ -82,7 +82,7 @@ The [bp-tf-ci.yaml](.cloudbees/workflows/bp-tf-ci.yaml) blueprints are orchestra - AWS Route 53 zone name, to create DNS records. > [!IMPORTANT] -> CloudBees Platform currently only supports push events. Therefore, pull requests are sent to the `develop` branch for integration. +> CloudBees platform currently only supports push events. Therefore, pull requests are sent to the `develop` branch for integration. ## Release @@ -90,7 +90,7 @@ CloudBees CI Terraform EKS Addon versions try to be in sync with the [CloudBees 1. Ensure that `develop` branch follows its requisites from the [Design principles](#design-principles) section. 2. Test locally the (`develop`) for all the blueprints. Use the `test-all` target in the companion [Makefile](Makefile). -3. Once all local tests passed successfully, create a PR against the `main` branch. **It requires to pass the COE Team validation**. +3. 
Once all local tests passed successfully, create a PR against the `main` branch. It **must pass** the Center of Excellence (CoE) team validation. 4. Once the pull request is merged, update the `main` branch following its requisites from the [Design principles](#design-principles) section. The [Blueprint Terraform CI pipeline](#blueprint-terraform-ci-pipeline) must validate the changes. 5. Create a [new release](https://github.com/cloudbees/terraform-aws-cloudbees-ci-eks-addon/releases). The release version semantics follow the Helm chart convention. diff --git a/Makefile b/Makefile index 5af64978..c81d8d00 100644 --- a/Makefile +++ b/Makefile @@ -6,7 +6,7 @@ BP_AGENT_USER := bp-agent MKFILEDIR := $(abspath $(dir $(lastword $(MAKEFILE_LIST)))) CBCI_REPO ?= https://github.com/cloudbees/terraform-aws-cloudbees-ci-eks-addon.git CBCI_BRANCH ?= main -NUKE_DRY_RUN ?= true +DESTROY_WL_ONLY ?= false define helpers source blueprints/helpers.sh && $(1) @@ -57,13 +57,17 @@ endif @$(call helpers,INFO "CloudBees CI Blueprint $(ROOT) Validation target finished succesfully.") .PHONY: destroy -destroy: ## Destroy Terraform Blueprint passed as parameter. Example: ROOT=02-at-scale make destroy -destroy: tfChecks agentCheck +destroy: ## Destroy Terraform Blueprint passed as parameter. Example: DESTROY_WL_ONLY=false ROOT=02-at-scale make destroy +destroy: tfChecks agentCheck guard-DESTROY_WL_ONLY ifeq ($(CI),false) - @$(call helpers,ask-confirmation "Destroy $(ROOT)") + @$(call helpers,ask-confirmation "Destroy $(ROOT) with Destroy Workloads Only=$(DESTROY_WL_ONLY)") endif - @$(call helpers,tf-destroy $(ROOT) $(CBCI_ONLY)) - @$(call helpers,INFO "CloudBees CI Blueprint $(ROOT) Destroy target finished succesfully.") +ifeq ($(DESTROY_WL_ONLY),false) + @$(call helpers,tf-destroy $(ROOT)) +else + @$(call helpers,tf-destroy-wl $(ROOT)) +endif + @$(call helpers,INFO "CloudBees CI Blueprint $(ROOT) Destroy target finished succesfully. 
Destroy Workloads Only=$(DESTROY_WL_ONLY)") .PHONY: clean clean: ## Clean Blueprint passed as parameter. Example: ROOT=02-at-scale make clean @@ -77,7 +81,7 @@ clean: guard-ROOT agentCheck .PHONY: test test: ## Runs a test for blueprint passed as parameters throughout their Terraform Lifecycle. Example: ROOT=02-at-scale make test -test: deploy validate destroy clean +test: clean deploy validate destroy @$(call helpers,INFO "Test target for $(ROOT) passed succesfully.") .PHONY: test-all @@ -102,15 +106,6 @@ set-cbci-location: agentCheck guard-CBCI_REPO guard-CBCI_BRANCH @$(call helpers,set-cbci-location $(CBCI_REPO) $(CBCI_BRANCH)) @$(call helpers,INFO "Setting new Casc location to $(CBCI_REPO) $(CBCI_BRANCH) finished succesfully.") -.PHONY: run-aws-nuke -run-aws-nuke: ## Run aws nuke by https://github.com/rebuy-de/aws-nuke. Example: NUKE_DRY_RUN=true make run-aws-nuke -run-aws-nuke: guard-NUKE_DRY_RUN -ifeq ($(NUKE_DRY_RUN),false) - @$(call helpers,ask-confirmation "Running AWS Nuke to destroy selected resources.") -endif - @$(call helpers,run-aws-nuke $(NUKE_DRY_RUN)) - @$(call helpers,INFO "AWS nuke finished successfully with DRY_RUN=$(NUKE_DRY_RUN).") - ########################## # Global ########################## diff --git a/README.md b/README.md index 1359fb05..a58f3aa8 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,14 @@ # CloudBees CI add-on for Amazon EKS blueprints

- CloudBees CI add-on for Amazon EKS blueprints -

Deploy CloudBees CI to Amazon Web Services (AWS) Elastic Kubernetes Service (EKS) clusters

+ + + + + CloudBees CI add-on for Amazon EKS blueprints +

+ +

Deploy CloudBees CI to Amazon Web Services (AWS) Elastic Kubernetes Service (EKS) clusters

--- @@ -24,7 +30,7 @@ The CloudBees CI [AWS partner add-on](https://aws-ia.github.io/terraform-aws-eks ## Usage -Implementation examples are included in the [blueprints](https://github.com/cloudbees/terraform-aws-cloudbees-ci-eks-addon/tree/main/blueprints) folder, however this is the simplest example of usage: +Implementation examples are included in the [blueprints](https://github.com/cloudbees/terraform-aws-cloudbees-ci-eks-addon/tree/main/blueprints) folder, however, this is the simplest example of usage: ```terraform module "eks_blueprints_addon_cbci" { @@ -81,6 +87,9 @@ The two main components of CloudBees CI - the operations center and managed cont This module runs with a [trial license for CloudBees CI](https://docs.cloudbees.com/docs/cloudbees-ci-migration/latest/trial-guide/). Once the trial has expired, refer to [CloudBees CI license expiration FAQ](https://docs.cloudbees.com/docs/general-kb/latest/faqs/jenkins-enterprise-license-expiration-faq) to determine your next steps. +> [!NOTE] +> This addon appends the string `[EKS_TF_ADDON]` to the Trial License last name for telemetry purposes. + ## Compatibility The CloudBees CI add-on uses `helms release` for its resources definition, making it compatible with [AWS EKS Blueprint v4](https://github.com/aws-ia/terraform-aws-eks-blueprints/tree/v4.32.1) and [AWS EKS Blueprint v5](https://github.com/aws-ia/terraform-aws-eks-blueprints/tree/v5.0.0). For more information, refer to [Amazon EKS Blueprints for Terraform: v4 to v5 migration](https://aws-ia.github.io/terraform-aws-eks-blueprints/v4-to-v5/motivation/). @@ -100,8 +109,9 @@ The CloudBees CI add-on uses `helms release` for its resources definition, makin | create_reg_secret | Create a Kubernetes dockerconfigjson secret for container registry authentication (cbci-sec-reg) for CI builds agents. | `bool` | `false` | no | | helm_config | CloudBees CI Helm chart configuration. | `any` |
{
"values": [
""
]
}
| no | | prometheus_target | Creates a service monitor to discover the CloudBees CI Prometheus target dynamically. It is designed to be enabled with the AWS EKS Terraform Addon Kube Prometheus Stack. | `bool` | `false` | no | +| prometheus_target_ns | Prometheus target namespace, designed to be enabled with the AWS EKS Terraform Addon Kube Prometheus Stack. It is required when prometheus_target is enabled. | `string` | `"observability"` | no | | reg_secret_auth | Registry server authentication details for cbci-sec-reg secret. It is required when create_reg_secret is enabled. | `map(string)` |
{
"email": "foo.bar@acme.com",
"password": "changeme1234",
"server": "my-registry.acme:5000",
"username": "foo"
}
| no | -| reg_secret_ns | Agent namespace to allocate cbci-sec-reg secret. It is required when create_reg_secret is enabled. | `string` | `"cbci"` | no | +| reg_secret_ns | Agent namespace to allocate the cbci-sec-reg secret. It is required when create_reg_secret is enabled. | `string` | `"cbci"` | no | ### Outputs @@ -115,7 +125,7 @@ The CloudBees CI add-on uses `helms release` for its resources definition, makin | cbci_oc_pod | Operations center pod for the CloudBees CI add-on. | | cbci_oc_url | Operations center URL for the CloudBees CI add-on using a subdomain and certificates. | | cbci_sec_casc | Optional. Kubernetes secrets name for CloudBees CI Casc. | -| cbci_sec_registry | Optional. Kubernetes secrets name for CloudBees CI agents to autheticate to registry. | +| cbci_sec_registry | Optional. Kubernetes secrets name for CloudBees CI agents to authenticate the registry. | | merged_helm_config | (merged) Helm configuration for CloudBees CI. | diff --git a/blueprints/.k8s.env b/blueprints/.k8s.env index f3bab735..90468088 100644 --- a/blueprints/.k8s.env +++ b/blueprints/.k8s.env @@ -1,8 +1,9 @@ # Kubernetes # # K8s version: https://kubernetes.io/releases/ # K8s support: https://docs.cloudbees.com/docs/cloudbees-common/latest/supported-platforms/cloudbees-ci-cloud#_kubernetes -vK8=1.28 +vK8=1.29 # CloudBees CI Chart versions: https://artifacthub.io/packages/helm/cloudbees/cloudbees-core/ -vCBCI_Helm=3.18306.0+b5ad27c80a6b +vCBCI_Helm=3.19313.0+1afe0458111d # AWS Terraform EKS Blueprint Addons Module https://github.com/aws-ia/terraform-aws-eks-blueprints-addons/releases -vEKSBpAddonsTFMod=1.15.1 +# Note: Validate cluster_autoscaler_image_tag contains the version of vK8 +vEKSBpAddonsTFMod=1.17.0 diff --git a/blueprints/01-getting-started/README.md b/blueprints/01-getting-started/README.md index 0c11a238..dbee24e3 100644 --- a/blueprints/01-getting-started/README.md +++ b/blueprints/01-getting-started/README.md @@ -52,8 +52,8 @@ This blueprint presents the 
minimum setup to run CloudBees CI on Amazon EKS; one | cbci_oc_url | URL of the CloudBees CI operations center for the CloudBees CI add-on. | | eks_cluster_arn | Amazon EKS cluster ARN. | | eks_cluster_name | Amazon EKS cluster Name. | -| kubeconfig_add | Add kubeconfig to your local configuration to access the Kubernetes API. | -| kubeconfig_export | Export the KUBECONFIG environment variable to access the Kubernetes API. | +| kubeconfig_add | Adds kubeconfig to your local configuration to access the Kubernetes API. | +| kubeconfig_export | Exports the KUBECONFIG environment variable to access the Kubernetes API. | | vpc_arn | VPC ID. | @@ -62,8 +62,8 @@ This blueprint presents the minimum setup to run CloudBees CI on Amazon EKS; one When preparing to deploy, you must complete the following steps: 1. Customize your Terraform values by copying `.auto.tfvars.example` to `.auto.tfvars`. -2. Initialize the root module and any associated configuration for providers. -3. Create the resources and deploy CloudBees CI to an EKS cluster. Refer to [Amazon EKS Blueprints for Terraform - Deploy](https://aws-ia.github.io/terraform-aws-eks-blueprints/getting-started/#deploy). +1. Initialize the root module and any associated configuration for providers. +1. Create the resources and deploy CloudBees CI to an EKS cluster. Refer to [Amazon EKS Blueprints for Terraform - Deploy](https://aws-ia.github.io/terraform-aws-eks-blueprints/getting-started/#deploy). For more information, refer to [The Core Terraform Workflow](https://www.terraform.io/intro/core-workflow) documentation. @@ -94,13 +94,13 @@ Once you can access the Kubernetes API from your terminal, complete the followin eval $(terraform output --raw cbci_oc_pod) ``` -2. Issue the following command to verify that the Ingress is ready and has assigned a valid `ADDRESS`: +1. Issue the following command to verify that the Ingress is ready and has assigned a valid `ADDRESS`: ```sh eval $(terraform output --raw cbci_oc_ing) ``` -3. 
Issue the following command to verify that the operations center service is running from inside the Kubernetes cluster: +1. Issue the following command to verify that the operations center service is running from inside the Kubernetes cluster: ```sh eval $(terraform output --raw cbci_liveness_probe_int) @@ -108,7 +108,7 @@ Once you can access the Kubernetes API from your terminal, complete the followin If the command is successful, no output is returned. -4. Issue the following command to verify that the operations center service is running from outside the Kubernetes cluster: +1. Issue the following command to verify that the operations center service is running from outside the Kubernetes cluster: ```sh eval $(terraform output --raw cbci_liveness_probe_ext) @@ -116,15 +116,15 @@ Once you can access the Kubernetes API from your terminal, complete the followin If the command is successful, no output is returned. -5. DNS propagation may take several minutes. Once propagation is complete, issue the following command, copy the output, and then paste it into a web browser. +1. DNS propagation may take several minutes. Once propagation is complete, issue the following command, copy the output, and then paste it into a web browser. ```sh terraform output cbci_oc_url ``` -6. Paste the output of the previous command into your browser to access the CloudBees CI setup wizard to complete the CloudBees CI operations center installation. +1. Paste the output of the previous command into your browser to access the CloudBees CI setup wizard to complete the CloudBees CI operations center installation. -7. Issue the following command to retrieve the first administrative user password (required): +1. 
Issue the following command to retrieve the first administrative user password (required): ```sh eval $(terraform output --raw cbci_initial_admin_password) diff --git a/blueprints/01-getting-started/img/getting-started.k8s.drawio.svg b/blueprints/01-getting-started/img/getting-started.k8s.drawio.svg index 2f732413..a9f00966 100644 --- a/blueprints/01-getting-started/img/getting-started.k8s.drawio.svg +++ b/blueprints/01-getting-started/img/getting-started.k8s.drawio.svg @@ -1,4 +1,4 @@ - + @@ -10,12 +10,12 @@ - - + + -
+
@@ -25,41 +25,42 @@
- + AWS Cloud - - - - - - - + + + + + -
+
- EKS + Kubernetes cluster +
+ on Amazon EKS +
- - EKS + + Kubernetes... - - - + + + -
+
@@ -71,25 +72,25 @@
- + Amazon EBS... - - - + + + -
+
AWS
- Load Balancer + load balancer

@@ -97,17 +98,17 @@
- + AWS... - - + + -
+
Amazon Route 53 @@ -117,20 +118,20 @@
- + Amazon Rout... - - - - - + + + + + -
+
Application load @@ -140,17 +141,17 @@
- + Applicatio... - - + + -
+
@@ -161,90 +162,87 @@
- + Amazon E... - + -
+
- - kube-system - + kube-system
- + kube-system - - - - - - - - + + + + + + + + -
+
- - external-dns - + external-dns
- + external-dns - - + + -
+
- - - cbci - + + + CloudBees CI +
- - cbci + + CloudBees CI - - - + + + - - cjoc + + Operations center + diff --git a/blueprints/01-getting-started/k8s/extdns-values.yml b/blueprints/01-getting-started/k8s/extdns-values.yml index 27f3dabd..9724a319 100644 --- a/blueprints/01-getting-started/k8s/extdns-values.yml +++ b/blueprints/01-getting-started/k8s/extdns-values.yml @@ -1,5 +1,5 @@ -#https://artifacthub.io/packages/helm/external-dns/external-dns -#https://github.com/kubernetes-sigs/external-dns/tree/master/charts/external-dns/Chart.yaml +# https://artifacthub.io/packages/helm/external-dns/external-dns +# https://github.com/kubernetes-sigs/external-dns/tree/master/charts/external-dns/Chart.yaml provider: "aws" domainFilters: [ "${zoneDNS}" ] policy: "sync" diff --git a/blueprints/01-getting-started/main.tf b/blueprints/01-getting-started/main.tf index a5589ef9..5749a035 100644 --- a/blueprints/01-getting-started/main.tf +++ b/blueprints/01-getting-started/main.tf @@ -31,8 +31,9 @@ locals { # CloudBees CI Add-on module "eks_blueprints_addon_cbci" { - source = "cloudbees/cloudbees-ci-eks-addon/aws" - version = ">= 3.18306.0" + #source = "cloudbees/cloudbees-ci-eks-addon/aws" + #version = ">= 3.18306.0" + source = "../../" depends_on = [module.eks_blueprints_addons] @@ -65,7 +66,7 @@ module "ebs_csi_driver_irsa" { module "eks_blueprints_addons" { source = "aws-ia/eks-blueprints-addons/aws" #vEKSBpAddonsTFMod# - version = "1.15.1" + version = "1.17.0" cluster_name = module.eks.cluster_name cluster_endpoint = module.eks.cluster_endpoint @@ -112,7 +113,7 @@ module "eks" { cluster_name = local.cluster_name cluster_endpoint_public_access = true #vK8# - cluster_version = "1.28" + cluster_version = "1.29" vpc_id = module.vpc.vpc_id subnet_ids = module.vpc.private_subnets @@ -163,8 +164,8 @@ module "eks" { } } - #https://docs.aws.amazon.com/eks/latest/userguide/choosing-instance-type.html - #https://docs.aws.amazon.com/eks/latest/APIReference/API_Nodegroup.html + # 
https://docs.aws.amazon.com/eks/latest/userguide/choosing-instance-type.html + # https://docs.aws.amazon.com/eks/latest/APIReference/API_Nodegroup.html eks_managed_node_groups = { mg_start = { node_group_name = "managed-start" @@ -242,13 +243,13 @@ module "acm" { source = "terraform-aws-modules/acm/aws" version = "5.0.0" - #Important: Application Services Hostname must be the same as the domain name or subject_alternative_names + # Important: Application Services Hostname must be the same as the domain name or subject_alternative_names domain_name = var.hosted_zone subject_alternative_names = [ "*.${var.hosted_zone}" # For subdomains example.${var.domain_name} ] - #https://docs.aws.amazon.com/acm/latest/userguide/dns-validation.html + # https://docs.aws.amazon.com/acm/latest/userguide/dns-validation.html zone_id = local.route53_zone_id validation_method = "DNS" @@ -269,8 +270,8 @@ module "vpc" { enable_nat_gateway = true single_nat_gateway = true - #https://docs.aws.amazon.com/eks/latest/userguide/network_reqs.html - #https://docs.aws.amazon.com/eks/latest/userguide/network-load-balancing.html + # https://docs.aws.amazon.com/eks/latest/userguide/network_reqs.html + # https://docs.aws.amazon.com/eks/latest/userguide/network-load-balancing.html public_subnet_tags = { "kubernetes.io/role/elb" = 1 } diff --git a/blueprints/01-getting-started/outputs.tf b/blueprints/01-getting-started/outputs.tf index 971a3474..50325dbe 100644 --- a/blueprints/01-getting-started/outputs.tf +++ b/blueprints/01-getting-started/outputs.tf @@ -1,10 +1,10 @@ output "kubeconfig_export" { - description = "Export the KUBECONFIG environment variable to access the Kubernetes API." + description = "Exports the KUBECONFIG environment variable to access the Kubernetes API." value = "export KUBECONFIG=${local.kubeconfig_file_path}" } output "kubeconfig_add" { - description = "Add kubeconfig to your local configuration to access the Kubernetes API." 
+ description = "Adds kubeconfig to your local configuration to access the Kubernetes API." value = "aws eks update-kubeconfig --region ${var.aws_region} --name ${local.cluster_name}" } diff --git a/blueprints/02-at-scale/README.md b/blueprints/02-at-scale/README.md index 286f4ce8..d89b4728 100644 --- a/blueprints/02-at-scale/README.md +++ b/blueprints/02-at-scale/README.md @@ -4,7 +4,7 @@ Once you have familiarized yourself with [CloudBees CI blueprint add-on: Get sta - An [Amazon Elastic File System (Amazon EFS) drive](https://aws.amazon.com/efs/) that is required by CloudBees CI High Availability/Horizontal Scalability (HA/HS) controllers and is optional for non-HA/HS controllers. - An [Amazon Simple Storage Service (Amazon S3) bucket](https://aws.amazon.com/s3/) to store assets from applications like CloudBees CI, Velero, and Fluent Bit. -- [Amazon Elastic Kubernetes Service (Amazon EKS) managed node groups](https://docs.aws.amazon.com/eks/latest/userguide/managed-node-groups.html) for different workloads: shared services, CI applications, CI Linux on-demand agents, CI Linux spot agents, and CI Microsoft Windows on-demand agents. +- [Amazon Elastic Kubernetes Service (Amazon EKS) managed node groups](https://docs.aws.amazon.com/eks/latest/userguide/managed-node-groups.html) for different workloads: shared services, CI applications, CI Linux on-demand agents, CI Linux Spot agents, and CI Microsoft Windows on-demand agents. - [Amazon CloudWatch Logs](https://docs.aws.amazon.com/AmazonCloudWatch/latest/logs/WhatIsCloudWatchLogs.html) to explode control plane logs and Fluent Bit logs. 
- The following [Amazon EKS blueprints add-ons](https://aws-ia.github.io/terraform-aws-eks-blueprints-addons/main/): @@ -13,22 +13,24 @@ Once you have familiarized yourself with [CloudBees CI blueprint add-on: Get sta | [AWS EFS CSI Driver](https://aws-ia.github.io/terraform-aws-eks-blueprints-addons/main/addons/aws-efs-csi-driver/)| Connects the Amazon Elastic File System (Amazon EFS) drive to the Amazon EKS cluster. | | [AWS for Fluent Bit](https://aws-ia.github.io/terraform-aws-eks-blueprints-addons/main/addons/aws-for-fluentbit/)| Acts as an applications log router for log observability in CloudWatch. | | [Cluster Autoscaler](https://aws-ia.github.io/terraform-aws-eks-blueprints-addons/main/addons/cluster-autoscaler/) | Watches Amazon EKS managed node groups to accomplish [CloudBees CI auto-scaling nodes on EKS](https://docs.cloudbees.com/docs/cloudbees-ci/latest/cloud-admin-guide/eks-auto-scaling-nodes). | - | [Kube Prometheus Stack](https://aws-ia.github.io/terraform-aws-eks-blueprints-addons/main/addons/kube-prometheus-stack/) | Used for metrics observability.| + | [Kube Prometheus Stack](https://aws-ia.github.io/terraform-aws-eks-blueprints-addons/main/addons/kube-prometheus-stack/) | Observability backbone.| | [Metrics Server](https://aws-ia.github.io/terraform-aws-eks-blueprints-addons/main/addons/metrics-server/) | This is a requirement for CloudBees CI HA/HS controllers for horizontal pod autoscaling.| | [Velero](https://aws-ia.github.io/terraform-aws-eks-blueprints-addons/main/addons/velero/)| Backs up and restores Kubernetes resources and volume snapshots. It is only compatible with Amazon Elastic Block Store (Amazon EBS).| - | [Bottlerocket Update Operator](https://aws-ia.github.io/terraform-aws-eks-blueprints-addons/main/addons/bottlerocket/) | Coordinates Bottlerocket updates on hosts in a cluster. 
It is configured for CloudBees CI Applications and Agents Node Groups at a specific time according to `scheduler_cron_expression`, when the build workload is minimal (weekend). In a case where the CI service cannot be interrupted at any time by the Update Operator, it could be excluded from planned updates by removing the [bottlerocket.aws/updater-interface-version=2.0.0](https://github.com/bottlerocket-os/bottlerocket-update-operator#label-nodes) label.| + | [Bottlerocket Update Operator](https://aws-ia.github.io/terraform-aws-eks-blueprints-addons/main/addons/bottlerocket/) | Coordinates Bottlerocket updates on hosts in a cluster. It is configured for CloudBees CI Applications and Agents Node Groups at a specific time according to `scheduler_cron_expression`, when the build workload is minimal (for example, on the weekend). In a case where the CI service cannot be interrupted at any time by the Update Operator, it could be excluded from planned updates by removing the [bottlerocket.aws/updater-interface-version=2.0.0](https://github.com/bottlerocket-os/bottlerocket-update-operator#label-nodes) label. [Cert-manager](https://aws-ia.github.io/terraform-aws-eks-blueprints-addons/main/addons/cert-manager/) is required for the API server to use a CA certificate when communicating over SSL with the agents. | -- [Amazon EKS blueprints Helm Release Add-on](https://aws-ia.github.io/terraform-aws-eks-blueprints-addons/main/helm-release/) is used to install the following applications: +- [Amazon EKS blueprints Helm Release add-on](https://aws-ia.github.io/terraform-aws-eks-blueprints-addons/main/helm-release/) is used to install the following applications: | Helm Chart | Description | |-------------------------------|-------------| | [Helm Openldap](https://github.com/jp-gouin/helm-openldap/tree/master) | LDAP server for Kubernetes. 
| | [AWS Node Termination Handler](https://github.com/aws/aws-node-termination-handler) | Gracefully handles EC2 instance shutdown within Kubernetes. Note that this add-on is not compatible with managed instance groups. For more information, refer to [issue #23](https://github.com/cloudbees/terraform-aws-cloudbees-ci-eks-addon/issues/23). | - | [Grafana Tempo](https://grafana.com/oss/tempo/) | Provides backend tracing for [Jenkins OpenTelemetry](https://plugins.jenkins.io/opentelemetry/). | | [Hashicorp Vault](https://github.com/hashicorp/vault-helm) | Secrets management system that is integrated via [CloudBees HashiCorp Vault Plugin](https://docs.cloudbees.com/docs/cloudbees-ci/latest/cloud-secure-guide/hashicorp-vault-plugin). | + | [OTEL collector](https://grafana.com/oss/tempo/) | The collector for [Jenkins OpenTelemetry](https://plugins.jenkins.io/opentelemetry/) observability data. | + | [Jaeger](https://www.jaegertracing.io/) | Provides the tracing backend for [Jenkins OpenTelemetry](https://plugins.jenkins.io/opentelemetry/). | + | [Grafana Loki](https://grafana.com/oss/loki/) | Provides the logs backend for [Jenkins OpenTelemetry](https://plugins.jenkins.io/opentelemetry/). | - Cloudbees CI uses [Configuration as Code (CasC)](https://docs.cloudbees.com/docs/cloudbees-ci/latest/casc-oc/casc-intro) (refer to the [casc](cbci/casc) folder) to enable [exciting new features for streamlined DevOps](https://www.cloudbees.com/blog/cloudbees-ci-exciting-new-features-for-streamlined-devops) and other enterprise features, such as [CloudBees CI hibernation](https://docs.cloudbees.com/docs/cloudbees-ci/latest/cloud-admin-guide/managing-controllers#hibernation-managed-controllers). - - The operations center is using the [CasC Bundle Retriever](https://docs.cloudbees.com/docs/cloudbees-ci/latest/casc-oc/bundle-retrieval-scm). + - The CloudBees operations center is using the [CasC Bundle Retriever](https://docs.cloudbees.com/docs/cloudbees-ci/latest/casc-oc/bundle-retrieval-scm).
- Managed controller configurations are managed from the operations center using [source control management (SCM)](https://docs.cloudbees.com/docs/cloudbees-ci/latest/casc-controller/add-bundle#_adding_casc_bundles_from_an_scm_tool). - The managed controllers are using [CasC bundle inheritance](https://docs.cloudbees.com/docs/cloudbees-ci/latest/casc-controller/advanced#_configuring_bundle_inheritance_with_casc) (refer to the [parent](cbci/casc/mc/parent) folder). This "parent" bundle is inherited by two types of "child" controller bundles: `ha` and `none-ha`, to accommodate [considerations about HA controllers](https://docs.cloudbees.com/docs/cloudbees-ci/latest/ha/ha-considerations). @@ -43,13 +45,14 @@ This blueprint divides scalable node groups for different types of workloads: - CloudBees CI node groups: - CI services (role: `cb-apps`): - Services instance type: [AWS Graviton Processor](https://aws.amazon.com/ec2/graviton/) and [Bottlerocket OS](https://aws.amazon.com/bottlerocket/) AMI type. - - It uses an [instance profile](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_roles_use_switch-role-ec2_instance-profiles.html) for operating with AWS services permissions (for example, S3 buckets). However, the recommended options are explained in [Issue 56](https://github.com/cloudbees/terraform-aws-cloudbees-ci-eks-addon/issues/56). + - Regarding storage classes, non-HA/HS controllers use `gp3-aza` (an Amazon EBS type that is tied to Availability Zone A to avoid issue [#195](https://github.com/cloudbees/terraform-aws-cloudbees-ci-eks-addon/issues/195)), and HA/HS controllers use `efs`. - CI agents (ephemeral): - Linux: [AWS Graviton Processor](https://aws.amazon.com/ec2/graviton/) and [Bottlerocket OS](https://aws.amazon.com/bottlerocket/) AMI type and includes on-demand (role: `build-linux`) and Spot (role: `build-linux-spot`) capacity types.
The Spot agent node groups follow the principles described in [Building for Cost Optimization and Resilience for EKS with Spot Instances](https://aws.amazon.com/blogs/compute/cost-optimization-and-resilience-eks-with-spot-instances/). + - Amazon Elastic Container Registry (Amazon ECR) authentication is done via [instance profile](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_roles_use_switch-role-ec2_instance-profiles.html) connected to `build-linux-spot` node pools. - Windows (role: `build-windows`): Windows 2019 AMI type. > [!IMPORTANT] -> The launch time for Linux containers is faster than Windows containers. This can be improved by using a cache container image strategy. Refer to [Speeding up Windows container launch times with EC2 Image builder and image cache strategy](https://aws.amazon.com/blogs/containers/speeding-up-windows-container-launch-times-with-ec2-image-builder-and-image-cache-strategy/) and more about [Windows Container Best Practices](https://aws.github.io/aws-eks-best-practices/windows/docs/ami/)). Another potential alternative is to use Windows VMs with a [shared agent](https://docs.cloudbees.com/docs/cloudbees-ci/latest/cloud-admin-guide/shared-agents). +> The launch time for Linux containers is faster than Windows containers. This can be improved by using a cache container image strategy. Refer to [Speeding up Windows container launch times with EC2 Image builder and image cache strategy](https://aws.amazon.com/blogs/containers/speeding-up-windows-container-launch-times-with-ec2-image-builder-and-image-cache-strategy/) and more about [Windows Container Best Practices](https://aws.github.io/aws-eks-best-practices/windows/docs/ami/). Another potential alternative is to use Windows VMs with a [shared agent](https://docs.cloudbees.com/docs/cloudbees-ci/latest/cloud-admin-guide/shared-agents). 
![Architecture](img/at-scale.architect.drawio.svg) @@ -57,7 +60,12 @@ This blueprint divides scalable node groups for different types of workloads: ![K8sApps](img/at-scale.k8s.drawio.svg) -## Terraform Docs +CloudBees CI services use [Pod Identity](https://aws.amazon.com/blogs/aws/amazon-eks-pod-identity-simplifies-iam-permissions-for-applications-on-amazon-eks-clusters/) to acquire permissions to operate with Amazon S3 for backup, restore, and cache operations. + +> [!IMPORTANT] +> Known issue: The operations center pod must be recreated for the AWS credentials to be injected. + +## Terraform documentation ### Inputs @@ -66,9 +74,9 @@ This blueprint divides scalable node groups for different types of workloads: |------|-------------|------|---------|:--------:| | hosted_zone | Amazon Route 53 hosted zone. CloudBees CI applications are configured to use subdomains in this hosted zone. | `string` | n/a | yes | | trial_license | CloudBees CI trial license details for evaluation. | `map(string)` | n/a | yes | -| aws_region | AWS region to deploy resources to. It requires at minimun 3 AZs. | `string` | `"us-west-2"` | no | +| aws_region | AWS region to deploy resources to. It requires a minimum of three availability zones. | `string` | `"us-west-2"` | no | | ci | Running in a CI service versus running locally. False when running locally, true when running in a CI service. | `bool` | `false` | no | -| dh_reg_secret_auth | Docker Hub Registry server authentication details for cbci-sec-reg secret. | `map(string)` |
{
"email": "foo.bar@acme.com",
"password": "changeme1234",
"username": "foo"
}
| no | +| dh_reg_secret_auth | Docker Hub registry server authentication details for cbci-sec-reg secret. | `map(string)` |
{
"email": "foo.bar@acme.com",
"password": "changeme1234",
"username": "foo"
}
| no | | suffix | Unique suffix to assign to all resources. When adding the suffix, changes are required in CloudBees CI for the validation phase. | `string` | `""` | no | | tags | Tags to apply to resources. | `map(string)` | `{}` | no | @@ -79,6 +87,7 @@ This blueprint divides scalable node groups for different types of workloads: | acm_certificate_arn | AWS Certificate Manager (ACM) certificate for Amazon Resource Names (ARN). | | aws_backup_efs_protected_resource | AWS description for the Amazon EFS drive that is used to back up protected resources. | | aws_logstreams_fluentbit | AWS CloudWatch log streams from Fluent Bit. | +| aws_region | AWS region. | | cbci_agent_linuxtempl_events | Retrieves a list of events related to Linux template agents. | | cbci_agent_sec_reg | Retrieves the container registry secret deployed in the agents namespace. | | cbci_agent_windowstempl_events | Retrieves a list of events related to Windows template agents. | @@ -100,20 +109,22 @@ This blueprint divides scalable node groups for different types of workloads: | efs_access_points | Amazon EFS access points. | | efs_arn | Amazon EFS ARN. | | eks_cluster_arn | Amazon EKS cluster ARN. | -| eks_cluster_name | Amazon EKS cluster Name. | +| eks_cluster_name | Amazon EKS cluster name. | | global_password | Random string that is used as the global password. | -| grafana_dashboard | Provides access to Grafana dashboards. | -| kubeconfig_add | Add kubeconfig to the local configuration to access the Kubernetes API. | -| kubeconfig_export | Export the KUBECONFIG environment variable to access the Kubernetes API. | -| prometheus_active_targets | Checks active Prometheus targets from the operations center. | +| grafana_url | Grafana URL. | +| kubeconfig_add | Adds kubeconfig to the local configuration to access the Kubernetes API. | +| kubeconfig_export | Exports the KUBECONFIG environment variable to access the Kubernetes API. | +| loki_labels | Lists all labels ingested in Loki. 
| +| prometheus_active_targets | Checks active Prometheus targets from the CloudBees operations center. | | prometheus_dashboard | Provides access to Prometheus dashboards. | | s3_cbci_arn | CloudBees CI Amazon S3 bucket ARN. | | s3_cbci_name | CloudBees CI Amazon S3 bucket name. It is required by CloudBees CI for workspace caching and artifact management. | | s3_list_objects | Recursively lists all objects stored in the Amazon S3 bucket. | -| vault_configure | Configure Vault with initial secrets and creates approle for integration with CloudBees CI (role-id and secret-id). It requires unseal keys and the root token from the vault_init output. | +| tempo_tags | Lists all tags ingested in Tempo. | +| vault_configure | Configures the vault with initial secrets and creates the application role for integration with CloudBees CI (role-id and secret-id). It requires unseal keys and the root token from the vault_init output. | | vault_dashboard | Provides access to Hashicorp Vault dashboard. It requires the root token from the vault_init output. | -| vault_init | Inicialization of Vault Service. | -| vault_init_log_file | Vault Inicialization log file. | +| vault_init | Initialization of the vault service. | +| vault_init_log_file | Vault initialization log file. | | velero_backup_on_demand | Takes an on-demand Velero backup from the schedule for the selected controller that is using block storage. | | velero_backup_schedule | Creates a Velero backup schedule for the selected controller that is using block storage, and then deletes the existing schedule, if it exists. | | velero_restore | Restores the selected controller that is using block storage from a backup. | @@ -132,8 +143,8 @@ This blueprint uses [DockerHub](https://hub.docker.com/) as a container registry When preparing to deploy, you must complete the following steps: 1. Customize your Terraform values by copying `.auto.tfvars.example` to `.auto.tfvars`. -2. 
Initialize the root module and any associated configuration for providers. -3. Create the resources and deploy CloudBees CI to an EKS cluster. Refer to [Amazon EKS Blueprints for Terraform - Deploy](https://aws-ia.github.io/terraform-aws-eks-blueprints/getting-started/#deploy). +1. Initialize the root module and any associated configuration for providers. +1. Create the resources and deploy CloudBees CI to an EKS cluster. Refer to [Amazon EKS Blueprints for Terraform - Deploy](https://aws-ia.github.io/terraform-aws-eks-blueprints/getting-started/#deploy). For more information, refer to [The Core Terraform Workflow](https://www.terraform.io/intro/core-workflow) documentation. @@ -152,13 +163,12 @@ Once the resources have been created, a `kubeconfig` file is created in the [/k8 eval $(terraform output --raw kubeconfig_export) ``` - If the command is successful, no output is returned. +If the command is successful, no output is returned. ### CloudBees CI 1. Complete the steps to [validate CloudBees CI](../01-getting-started/README.md#cloudbees-ci), if you have not done so already. - -2. Authentication in this blueprint is based on LDAP using the `cn` user (available in [k8s/openldap-stack-values.yml](./k8s/openldap-stack-values.yml)) and the global password. The authorization level defines a set of permissions configured using [RBAC](https://docs.cloudbees.com/docs/cloudbees-ci/latest/cloud-secure-guide/rbac). Additionally, the operations center and controller use [single sign-on (SS0)](https://docs.cloudbees.com/docs/cloudbees-ci/latest/cloud-secure-guide/using-sso), including a [fallback mechanism](https://docs.cloudbees.com/docs/cloudbees-ci-kb/latest/operations-center/how-ldap-plugin-works-on-cjoc-sso-context) that is enabled by default. Issue the following command to retrieve the global password (valid for all users): +1. 
Authentication in this blueprint is based on LDAP using the `cn` user (available in [k8s/openldap-stack-values.yml](./k8s/openldap-stack-values.yml)) and the global password. The authorization level defines a set of permissions configured using [RBAC](https://docs.cloudbees.com/docs/cloudbees-ci/latest/cloud-secure-guide/rbac). Additionally, the operations center and controller use [single sign-on (SSO)](https://docs.cloudbees.com/docs/cloudbees-ci/latest/cloud-secure-guide/using-sso), including a [fallback mechanism](https://docs.cloudbees.com/docs/cloudbees-ci-kb/latest/operations-center/how-ldap-plugin-works-on-cjoc-sso-context) that is enabled by default. Issue the following command to retrieve the global password (valid for all users): ```sh eval $(terraform output --raw global_password) @@ -166,7 +176,7 @@ Once the resources have been created, a `kubeconfig` file is created in the [/k8 There are differences in CloudBees CI permissions and folder restrictions when signed in as a user of the Admin group versus the Development group. For example, only Admin users have access to the agent validation jobs. -3. CasC is enabled for the [operations center](https://docs.cloudbees.com/docs/cloudbees-ci/latest/casc-oc/) (`cjoc`) and [controllers](https://docs.cloudbees.com/docs/cloudbees-ci/latest/casc-controller/) (`team-b` and `team-c-ha`). `team-a` is not using CasC, to illustrate the difference between the two approaches. Issue the following command to verify that all controllers are running: +1. CasC is enabled for the [operations center](https://docs.cloudbees.com/docs/cloudbees-ci/latest/casc-oc/) (`cjoc`) and [controllers](https://docs.cloudbees.com/docs/cloudbees-ci/latest/casc-controller/) (`team-b` and `team-c-ha`). `team-a` is not using CasC, to illustrate the difference between the two approaches.
Issue the following command to verify that all controllers are running: ```sh eval $(terraform output --raw cbci_controllers_pods) @@ -174,7 +184,7 @@ Once the resources have been created, a `kubeconfig` file is created in the [/k8 If successful, it should indicate that 2 replicas are running for `team-c-ha` since [CloudBees CI HA/HS](https://docs.cloudbees.com/docs/cloudbees-ci/latest/ha-install-guide/) is enabled on this controller. -4. Issue the following command to verify that horizontal pod autoscaling is enabled for `team-c-ha`: +1. Issue the following command to verify that horizontal pod autoscaling is enabled for `team-c-ha`: ```sh eval $(terraform output --raw cbci_controller_c_hpa) @@ -184,11 +194,11 @@ Once the resources have been created, a `kubeconfig` file is created in the [/k8 ##### Kubernetes secret -This blueprint Kubernetes secrets for different purposes. +This blueprint uses Kubernetes secrets for different purposes. > [!NOTE] -> - Beyond the CloudBees CI add-on (used for demo purposes), Kubernetes secrets can be managed via [External Secret Operators](https://aws-ia.github.io/terraform-aws-eks-blueprints-addons/main/addons/external-secrets/). -> - Kubernetes secrets could be also be retrived as Jenkins Credentials via using the [Kubernetes Credentials Provider plugin](https://jenkinsci.github.io/kubernetes-credentials-provider-plugin/). +> - Beyond the CloudBees CI add-on that is used for demo purposes, Kubernetes secrets can be managed via [External Secret Operators](https://aws-ia.github.io/terraform-aws-eks-blueprints-addons/main/addons/external-secrets/). +> - Kubernetes secrets can also be retrieved as Jenkins credentials using the [Kubernetes Credentials Provider plugin](https://jenkinsci.github.io/kubernetes-credentials-provider-plugin/).
###### CasC secrets @@ -203,7 +213,7 @@ DockerHub authentication is stored as Kubernetes secrets (`cbci-agent-sec-reg`) ``` > [!NOTE] -> ECR authentication is done via instance profile connected to `build-linux-spot` Node pools. +> Amazon Elastic Container Registry (Amazon ECR) authentication is done via an instance profile connected to `build-linux-spot` node pools. ##### HashiCorp Vault @@ -215,13 +225,13 @@ HashiCorp Vault is used as a credential provider for CloudBees CI Pipelines in t eval $(terraform output --raw vault_init) ``` -2. Run the configure Hashicorp Vault script. It configures Vault with initial secrets and creates `approle` for integration with CloudBees CI (role-id and secret-id) +1. Run the configure Hashicorp Vault script. It configures Vault with initial secrets and creates `approle` for integration with CloudBees CI (role-id and secret-id) ```sh eval $(terraform output --raw vault_configure) ``` -3. Access the HashiCorp Vault UI by issuing the following command. Enter the root token to log in from the _step 1_. +1. Access the HashiCorp Vault UI by issuing the following command. Enter the root token to log in from the _step 1_. ```sh eval $(terraform output --raw vault_dashboard) @@ -229,13 +239,13 @@ HashiCorp Vault is used as a credential provider for CloudBees CI Pipelines in t If successful, the Vault web service should be available at `http://localhost:50003` and you can view the secrets that were created in _step 2_. -4. Sign in to the CloudBees CI operations center as a user with the admin role. +1. Sign in to the CloudBees CI operations center as a user with the admin role. -5. Navigate to **Manage Jenkins > Credentials Providers > HashiCorp Vault Credentials Provider** and complete the configuration for the CloudBees CI Vault Plugin by entering the role ID and secret ID for the `cbci-oc` application role from _step 1_. +1. 
Navigate to **Manage Jenkins > Credentials Providers > HashiCorp Vault Credentials Provider** and complete the configuration for the CloudBees CI Vault Plugin by entering the role ID and secret ID for the `cbci-oc` application role from _step 1_. -6. Select **Test Connection** to verify the inputs are correct. +1. Select **Test Connection** to verify the inputs are correct. -7. Move to `team-b` or `team-c-ha` to run the Pipeline (**admin > validations > vault-credentials**) and validate that credentials are fetched correctly from the Hashicorp Vault. +1. Move to `team-b` or `team-c-ha` to run the Pipeline (**admin > validations > vault-credentials**) and validate that credentials are fetched correctly from the Hashicorp Vault. > [!NOTE] > Hashicorp Vault can be also be configured to be used for [Configuration as Code - Handling Secrets - Vault](https://github.com/jenkinsci/configuration-as-code-plugin/blob/master/docs/features/secrets.adoc#hashicorp-vault-secret-source). @@ -258,7 +268,7 @@ HashiCorp Vault is used as a credential provider for CloudBees CI Pipelines in t eval $(terraform output --raw cbci_liveness_probe_ext) ``` -2. Once you have retrieved the API token, issue the following commands to trigger builds using the [POST queue for hibernation API endpoint](https://docs.cloudbees.com/docs/cloudbees-ci/latest/cloud-admin-guide/managing-controllers#_post_queue_for_hibernation). If successful, an `HTTP/2 201` response is returned, indicating the REST API call has been correctly received by the CloudBees CI controller. +1. Once you have retrieved the API token, issue the following commands to trigger builds using the [POST queue for hibernation API endpoint](https://docs.cloudbees.com/docs/cloudbees-ci/latest/cloud-admin-guide/managing-controllers#_post_queue_for_hibernation). If successful, an `HTTP/2 201` response is returned, indicating the REST API call has been correctly received by the CloudBees CI controller. 
- For Linux node pools use: @@ -282,17 +292,17 @@ HashiCorp Vault is used as a credential provider for CloudBees CI Pipelines in t Note that the first build for a new Windows image container can take up to 10 minutes to run; subsequent builds should take seconds to run. This behavior can be improved, as explained in the section [Architecture](#architecture). -3. Right after triggering the builds, issue the following to validate pod agent provisioning to build the Pipeline code: +1. Right after triggering the builds, issue the following to validate pod agent provisioning to build the Pipeline code: ```sh eval $(terraform output --raw cbci_agents_pods) ``` -4. Check build logs by signing in to the `team-b` and `team-c-ha` controllers, respectively. Navigate to the Pipeline jobs and select the first build, indicated by the `#1` build number. [CloudBees Pipeline Explorer](https://docs.cloudbees.com/docs/cloudbees-ci/latest/pipelines/cloudbees-pipeline-explorer-plugin) is enabled by default. +1. Check build logs by signing in to the `team-b` and `team-c-ha` controllers, respectively. Navigate to the Pipeline jobs and select the first build, indicated by the `#1` build number. [CloudBees Pipeline Explorer](https://docs.cloudbees.com/docs/cloudbees-ci/latest/pipelines/cloudbees-pipeline-explorer-plugin) is enabled by default. ##### Container Registry -This blueprints use a couple of container registries for different purposes. +This blueprint uses a couple of container registries for different purposes: - The public registry uses DockerHub. - The private registry uses AWS ECR. @@ -300,10 +310,12 @@ This blueprints use a couple of container registries for different purposes. > [!NOTE] > Other Container Registry services can be used for the same purposes. -Sign in to the CloudBees CI to `team-b` or `team-c-ha` controllers with admin access.
Run the **admin > validations > kaniko** Pipeline and enter (using parameters) an existing DockerHub organization and an existing AWS ECR Repository to test that building and pushing to all repositories works as expected. +1. In the CloudBees CI UI, sign in to the `team-b` or `team-c-ha` controllers with admin access. +1. Navigate to the **admin > validations > kaniko** Pipeline. +1. Using parameters, enter an existing DockerHub organization and an existing Amazon ECR repository to test that building and pushing to all repositories works as expected. > [!NOTE] -> Besides Kaniko, there are [other alternitives tools](https://docs.cloudbees.com/docs/cloudbees-ci/latest/cloud-admin-guide/using-kaniko#_alternatives). +> Besides Kaniko, there are [other alternative tools](https://docs.cloudbees.com/docs/cloudbees-ci/latest/cloud-admin-guide/using-kaniko#_alternatives) for building images in K8s. #### Back up and restore @@ -312,8 +324,8 @@ For backup and restore operations, you can use the [preconfigured CloudBees CI C [Velero](#create-a-velero-backup-schedule) is an alternative for services only for controllers using Amazon EBS. Velero commands and configuration in this blueprint follow [Using Velero back up and restore Kubernetes cluster resources](https://docs.cloudbees.com/docs/cloudbees-ci/latest/backup-restore/velero-dr). > [!NOTE] -> - An installation that has been completely converted to CasC may not need traditional backups; a restore operation could consist simply of running a CasC bootstrap script. This is only an option for a customer who has translated every significant system setting and job configuration to CasC. Even then it may be desirable to perform a filesystem-level restore from backup in order to preserve transient data such as build history. -> - There is no alternative for services using Amazon EFS storage. 
Although [AWS Backup](https://aws.amazon.com/backup/) includes this Amazon EFS drive as a protected resource, there is not currently a best practice to dynamically restore Amazon EFS PVCs. For more information, refer to [Issue 39](https://github.com/cloudbees/terraform-aws-cloudbees-ci-eks-addon/issues/39). +> - An installation that has been completely converted to CasC may not need traditional backups; a restore operation could consist simply of running a CasC bootstrap script. This is only an option if you have translated every significant system setting and job configuration to CasC. Even then, it may be desirable to perform a filesystem-level restore from backup to preserve transient data, such as build history. +> - There is no alternative for services using Amazon EFS storage. Although [AWS Backup](https://aws.amazon.com/backup/) includes the Amazon EFS drive as a protected resource, there is not currently a best practice to dynamically restore Amazon EFS PVCs. For more information, refer to [Issue 39](https://github.com/cloudbees/terraform-aws-cloudbees-ci-eks-addon/issues/39). ##### Create daily backups using a CloudBees CI Cluster Operations job @@ -322,8 +334,8 @@ The [CloudBees Backup plugin](https://docs.cloudbees.com/docs/cloudbees-ci/lates To view the **backup-all-controllers** job: 1. Sign in to the CloudBees CI operations center UI as a user with **Administer** privileges. Note that access to back up jobs is restricted to admin users via RBAC. -2. From the operations center dashboard, select **All** to view all folders on the operations center. -3. Navigate to the **admin** folder, and then select the **backup-all-controllers** Cluster Operations job. +1. From the operations center dashboard, select **All** to view all folders on the operations center. +1. Navigate to the **admin** folder, and then select the **backup-all-controllers** Cluster Operations job. 
Restore operations can be done on-demand at the controller level from the preconfigured restore job. @@ -356,12 +368,14 @@ Issue the following command to restore the controller from the last backup: ### Observability -#### Metrics and Tracing +> [!IMPORTANT] +> Regarding the observability stack described in the following sections, note that the CloudBees Prometheus plugin is a CloudBees Tier 1 plugin, while the Jenkins OpenTelemetry plugin is a Tier 3 plugin. For more information, refer to the [CloudBees plugin support policies](https://docs.cloudbees.com/docs/cloudbees-common/latest/plugin-support-policies). + +#### Metrics -Grafana is used to visualize and query: +Prometheus is used to store metrics that are retrieved from the [Jenkins Metrics plugin](https://plugins.jenkins.io/metrics/) and the [Jenkins OpenTelemetry plugin](https://github.com/jenkinsci/opentelemetry-plugin/blob/main/docs/monitoring-metrics.md). -- [Jenkins Metrics](https://plugins.jenkins.io/metrics/) that are stored in Prometheus. -- [Jenkins Tracing via OpenTelemetry](https://plugins.jenkins.io/opentelemetry/) that stored into Grafana Tempo. +Grafana imports Prometheus as a datasource and provides metrics dashboards for CloudBees CI. 1. Issue the following command to verify that the CloudBees CI targets are connected to Prometheus: @@ -369,55 +383,71 @@ Grafana is used to visualize and query: eval $(terraform output --raw prometheus_active_targets) | jq '.data.activeTargets[] | select(.labels.container=="jenkins") | {job: .labels.job, instance: .labels.instance, status: .health}' ``` -2. Issue the following command to access Kube Prometheus Stack dashboards from your web browser and verify that [Jenkins metrics](https://plugins.jenkins.io/metrics/) are available. +1. Issue the following command to access Kube Prometheus Stack dashboards from your web browser and verify that targets are correctly collecting metrics. 
```sh eval $(terraform output --raw prometheus_dashboard) ``` - If successful, the Prometheus web service is available at `http://localhost:50001` and you can view the configured alerts for CloudBees CI. Additionally, check _Status_ > _Targets_ shows targets in `UP` status. + If successful, the Prometheus web service is available at `http://localhost:50001` and you can view the configured alerts for CloudBees CI. Additionally, you can select **Status > Targets** to show targets with an `UP` status. -3. Issue the following command to access Grafana dashboards at `localhost:50002`. For the username, use `admin` and set the password using the `global_password` terraform variable: +1. Issue the following command to access the Grafana URL. For the username, use `admin` and set the password using the `global_password` terraform variable: ```sh - eval $(terraform output --raw grafana_dashboard) + eval $(terraform output --raw grafana_url) ``` - If successful, the Grafana web service is available `http://localhost:50002`. +1. To explore Metrics dashboards, navigate to **Home > Dashboards > CloudBees CI**, and then select the controller pod to view the metrics. The following image shows metrics for `team-b`: - - For Jenkins Metrics Dashboards navigate to **Home > Dashboards > CloudBees CI**. Then, select the controller pod to view the metrics. The following image shows metrics for team-b. + ![CloudBees CI Metrics Dashboard](img/observability/cbci-metrics-dashboard.png) - ![CloudBees CI Dashboard](img/observability/cbci-dashboard.png) +##### Tracing - - For Tracing Data, navigate to **Home > Explore > Select Tempo > Select `Query Type: Search`**. Then, select the `service name: jenkins` and the desired `Span Name` to `Run Query`. The following image shows an example of the ws-cache pipeline build. 
+Tempo is used as the Tracing/APM backend for Jenkins tracing data via the Jenkins OpenTelemetry plugin: [HTTP](https://github.com/jenkinsci/opentelemetry-plugin/blob/main/docs/http-requests-traces.md) and [Jobs](https://github.com/jenkinsci/opentelemetry-plugin/blob/main/docs/job-traces.md). - ![CloudBees CI Tracing Example](img/observability/cbci-tracing-example.png) +Grafana imports Tempo as a datasource and provides tracing dashboards per CI/CD pipeline Trace ID. -> [!NOTE] -> Grafana Ingress can be enabled as explained in Issue [#165](https://github.com/cloudbees/terraform-aws-cloudbees-ci-eks-addon/issues/165), but currently is incompatible with `terrafrom destroy`. +In CloudBees CI, the Jenkins OpenTelemetry plugin is configured to use Grafana as a visualization backend. Then, it offers a **View pipeline with Grafana** link for every pipeline run, which redirects to Grafana Explorer using Tempo as a datasource and passing a Trace ID. Other system traces can be visualized in Grafana Explorer as well. + +![CloudBees CI Tracing Tempo](img/observability/cbci-tracing-tempo.png) -#### Logs +##### Logs -For application logs, Fluent Bit acts as a router. +###### Build Logs -- Short-term application logs live in the [Amazon CloudWatch Logs](https://docs.aws.amazon.com/AmazonCloudWatch/latest/logs/WhatIsCloudWatchLogs.html) group, under `/aws/eks//aws-fluentbit-logs` and contains log streams for all the Kubernetes services running in the cluster, including CloudBees CI applications and agents. The following image shows an example of team b controller logs. +The recommended approach for build logs is using [CloudBees Pipeline Explorer](https://docs.cloudbees.com/docs/cloudbees-ci/latest/pipelines/cloudbees-pipeline-explorer-plugin).
+ +> [!IMPORTANT] +> Although [pipeline build logs can be sent to external storage via the Jenkins OpenTelemetry plugin](https://github.com/jenkinsci/opentelemetry-plugin/blob/main/docs/build-logs.md), it is not compatible with CloudBees Pipeline Explorer. + +###### Container logs + +Fluent Bit acts as a router for container logs. + +- Short-term logs and log aggregation systems: + + - [Amazon CloudWatch Logs](https://docs.aws.amazon.com/AmazonCloudWatch/latest/logs/WhatIsCloudWatchLogs.html) group: Stores log streams for all the Kubernetes services running in the cluster, including CloudBees CI applications and agents in `/aws/eks/<CLUSTER_NAME>/aws-fluentbit-logs`. ```sh eval $(terraform output --raw aws_logstreams_fluentbit) | jq '.[] ' ``` - ![CloudBees CI Logs Example](img/observability/cbci-fluenbit-example.png) + The following image shows an example of `team-b` controller logs: + + ![CloudBees CI logs from CloudWatch](img/observability/cbci-logs-cloudwatch.png) + + - CloudWatch log group: Stores control plane logs in `/aws/eks/<CLUSTER_NAME>/cluster`. -- Long-term application logs live in an Amazon S3 bucket. + - [Loki](https://grafana.com/oss/loki/): In Grafana, navigate to the **Explore** section, select **Loki** as the datasource, filter by `com_cloudbees_cje_tenants`, and then select a CloudBees CI application log. -For CloudBees CI build logs: + ![CloudBees CI logs from Loki](img/observability/cbci-logs-loki.png) -- Short-term build logs live in the CloudBees CI controller and are managed using the [Build Discarder](https://plugins.jenkins.io/build-discarder/) Jenkins plugin, which is installed and configured using CasC. -- Long-term logs can be handled (like any other artifact that is sent to an Amazon S3 bucket) using the [Artifact Manager on Amazon S3](https://plugins.jenkins.io/artifact-manager-s3/) Jenkins plugin, which is installed and configured by CasC. +- Long-term logs are stored in an Amazon S3 bucket under the `fluentbit` path.
## Destroy To tear down and remove the resources created in the blueprint, refer to [Amazon EKS Blueprints for Terraform - Destroy](https://aws-ia.github.io/terraform-aws-eks-blueprints/getting-started/#destroy). > [!TIP] -> The `destroy` phase can be orchestrated via the companion [Makefile](../../Makefile). +> - To avoid [#165](https://github.com/cloudbees/terraform-aws-cloudbees-ci-eks-addon/issues/165), run `kube-prometheus-destroy.sh` after destroying the EKS cluster. +> - The `destroy` phase can be orchestrated via the companion [Makefile](../../Makefile). diff --git a/blueprints/02-at-scale/cbci/casc/mc/ha/bundle.yaml b/blueprints/02-at-scale/cbci/casc/mc/ha/bundle.yaml index dbb47c13..59927d65 100644 --- a/blueprints/02-at-scale/cbci/casc/mc/ha/bundle.yaml +++ b/blueprints/02-at-scale/cbci/casc/mc/ha/bundle.yaml @@ -7,3 +7,5 @@ allowCapExceptions: true jcascMergeStrategy: "errorOnConflict" jcasc: - jcasc +variables: + - variables diff --git a/blueprints/02-at-scale/cbci/casc/mc/ha/jcasc/main.yaml b/blueprints/02-at-scale/cbci/casc/mc/ha/jcasc/main.yaml index b5dbc48f..5e0ae815 100644 --- a/blueprints/02-at-scale/cbci/casc/mc/ha/jcasc/main.yaml +++ b/blueprints/02-at-scale/cbci/casc/mc/ha/jcasc/main.yaml @@ -1,3 +1,20 @@ unclassified: globalDefaultFlowDurabilityLevel: durabilityHint: MAX_SURVIVABILITY + globalLibraries: + libraries: + - defaultVersion: ${sharedLibBranch} + name: "common" + retriever: + modernSCM: + clone: true + libraryPath: ${sharedLibPath} + scm: + git: + remote: ${sharedLibRepo} + traits: + - cloneOption: + extension: + depth: 1 + noTags: false + shallow: true diff --git a/blueprints/02-at-scale/cbci/casc/mc/ha/variables/variables.yaml b/blueprints/02-at-scale/cbci/casc/mc/ha/variables/variables.yaml new file mode 100644 index 00000000..4930abbf --- /dev/null +++ b/blueprints/02-at-scale/cbci/casc/mc/ha/variables/variables.yaml @@ -0,0 +1,4 @@ +variables: + - sharedLibRepo: 
"https://github.com/cloudbees/terraform-aws-cloudbees-ci-eks-addon.git" + - sharedLibBranch: develop + - sharedLibPath: "blueprints/02-at-scale/cbci/shared-lib" diff --git a/blueprints/02-at-scale/cbci/casc/mc/none-ha/bundle.yaml b/blueprints/02-at-scale/cbci/casc/mc/none-ha/bundle.yaml index 750e7631..69ededd1 100644 --- a/blueprints/02-at-scale/cbci/casc/mc/none-ha/bundle.yaml +++ b/blueprints/02-at-scale/cbci/casc/mc/none-ha/bundle.yaml @@ -7,3 +7,5 @@ allowCapExceptions: true jcascMergeStrategy: "errorOnConflict" jcasc: - jcasc +variables: + - variables diff --git a/blueprints/02-at-scale/cbci/casc/mc/none-ha/jcasc/main.yaml b/blueprints/02-at-scale/cbci/casc/mc/none-ha/jcasc/main.yaml index 265cf97b..b413baa0 100644 --- a/blueprints/02-at-scale/cbci/casc/mc/none-ha/jcasc/main.yaml +++ b/blueprints/02-at-scale/cbci/casc/mc/none-ha/jcasc/main.yaml @@ -1,3 +1,15 @@ unclassified: globalDefaultFlowDurabilityLevel: durabilityHint: PERFORMANCE_OPTIMIZED + globalLibraries: + libraries: + - defaultVersion: ${sharedLibBranch} + name: "common" + retriever: + modernSCM: + libraryPath: ${sharedLibPath} + scm: + git: + remote: ${sharedLibRepo} + cachingConfiguration: + refreshTimeMinutes: 0 diff --git a/blueprints/02-at-scale/cbci/casc/mc/none-ha/variables/variables.yaml b/blueprints/02-at-scale/cbci/casc/mc/none-ha/variables/variables.yaml new file mode 100644 index 00000000..4930abbf --- /dev/null +++ b/blueprints/02-at-scale/cbci/casc/mc/none-ha/variables/variables.yaml @@ -0,0 +1,4 @@ +variables: + - sharedLibRepo: "https://github.com/cloudbees/terraform-aws-cloudbees-ci-eks-addon.git" + - sharedLibBranch: develop + - sharedLibPath: "blueprints/02-at-scale/cbci/shared-lib" diff --git a/blueprints/02-at-scale/cbci/casc/mc/parent/items/admin-folder.yaml b/blueprints/02-at-scale/cbci/casc/mc/parent/items/admin-folder.yaml index a407cf25..d0ef8f69 100644 --- a/blueprints/02-at-scale/cbci/casc/mc/parent/items/admin-folder.yaml +++ 
b/blueprints/02-at-scale/cbci/casc/mc/parent/items/admin-folder.yaml @@ -216,9 +216,13 @@ items: sandbox: true script: | pipeline { + options { + retry(3) + timeout(time: 1, unit: 'HOURS') + } agent any environment { - U1=credentials('cbci-oc-secret-a') + U1=credentials('cbci-oc_secret-a') S1=credentials('cbci-oc_secret-b') } stages { @@ -244,6 +248,10 @@ items: @Library('common') _ pipeline { + options { + retry(3) + timeout(time: 1, unit: 'HOURS') + } agent { label 'linux-mavenAndKaniko-XL' } diff --git a/blueprints/02-at-scale/cbci/casc/mc/parent/jcasc/k8s-agents.yaml b/blueprints/02-at-scale/cbci/casc/mc/parent/jcasc/k8s-agents.yaml index 882410ca..5752c18f 100644 --- a/blueprints/02-at-scale/cbci/casc/mc/parent/jcasc/k8s-agents.yaml +++ b/blueprints/02-at-scale/cbci/casc/mc/parent/jcasc/k8s-agents.yaml @@ -48,12 +48,13 @@ kube: path: config.json nodeSelector: kubernetes.io/os: linux - role: "build-linux" + role: "build-linux-l" + size: "2x" tolerations: - effect: "NoSchedule" key: "dedicated" operator: "Equal" - value: "build-linux" + value: "build-linux-l" - name: "linux-mavenAndKaniko-XL" label: "linux-mavenAndKaniko-XL" nodeUsageMode: "NORMAL" @@ -114,12 +115,13 @@ kube: path: config.json nodeSelector: kubernetes.io/os: linux - role: "build-linux-spot" + role: "build-linux-xl" + size: "4x" tolerations: - effect: "NoSchedule" key: "dedicated" operator: "Equal" - value: "build-linux-spot" + value: "build-linux-xl" - name: "windows-powershell" label: "windows-powershell" yaml: |- diff --git a/blueprints/02-at-scale/cbci/casc/mc/parent/jcasc/main.yaml b/blueprints/02-at-scale/cbci/casc/mc/parent/jcasc/main.yaml index 301da1c9..d672234c 100644 --- a/blueprints/02-at-scale/cbci/casc/mc/parent/jcasc/main.yaml +++ b/blueprints/02-at-scale/cbci/casc/mc/parent/jcasc/main.yaml @@ -36,18 +36,14 @@ unclassified: gracePeriod: 3600 openTelemetry: endpoint: ${ot_endpoint} + observabilityBackends: + - grafana: + grafanaBaseUrl: ${sec_grafana_url} + 
tempoDataSourceIdentifier: ${tempoDatasource} + #Note: Not compatible with CPE + #grafanaLogsBackend: "grafanaLogsBackendBackendWithLogMirroringInJenkins" cascItemsConfiguration: variableInterpolationEnabledForAdmin: true - globalLibraries: - libraries: - - defaultVersion: ${sharedLibBranch} - name: "common" - retriever: - modernSCM: - libraryPath: ${sharedLibPath} - scm: - git: - remote: ${sharedLibRepo} aws: awsCredentials: region: "${sec_awsRegion}" diff --git a/blueprints/02-at-scale/cbci/casc/mc/parent/variables/variables.yaml b/blueprints/02-at-scale/cbci/casc/mc/parent/variables/variables.yaml index 37fe9a7f..80977cdd 100644 --- a/blueprints/02-at-scale/cbci/casc/mc/parent/variables/variables.yaml +++ b/blueprints/02-at-scale/cbci/casc/mc/parent/variables/variables.yaml @@ -1,7 +1,5 @@ variables: - - ot_endpoint: "http://tempo.kube-prometheus-stack.svc.cluster.local:4317" + - ot_endpoint: "http://otel-collector-opentelemetry-collector.observability.svc.cluster.local:4317" - s3bucketPreffix: "cbci" - regSecretsName: "cbci-sec-reg" - - sharedLibRepo: "https://github.com/cloudbees/terraform-aws-cloudbees-ci-eks-addon.git" - - sharedLibBranch: main - - sharedLibPath: "blueprints/02-at-scale/cbci/shared-lib" + - tempoDatasource: "tempoDatasource" diff --git a/blueprints/02-at-scale/cbci/casc/oc/items/admin-folder.yaml b/blueprints/02-at-scale/cbci/casc/oc/items/admin-folder.yaml index 00735694..475cc726 100644 --- a/blueprints/02-at-scale/cbci/casc/oc/items/admin-folder.yaml +++ b/blueprints/02-at-scale/cbci/casc/oc/items/admin-folder.yaml @@ -72,12 +72,9 @@ items: - jobConfigurationSubject: { } - systemConfigurationSubject: - omitMasterKey: false + omitMasterKey: true format: - zipFormat: { - } - retentionPolicy: - noRetentionPolicy: { + tarGzFormat: { } safeDelaySeconds: 0 store: @@ -85,9 +82,40 @@ items: bucketName: "${sec_s3bucketName}" bucketFolder: "${s3bucketPreffix}/backup" region: "${sec_awsRegion}" + retentionPolicy: + upToNRetentionPolicy: + n: 5 
itemSource: jenkinsRootItemSource: { } filters: - isMasterOnlineFilter: { } + - kind: backupAndRestore + name: backup-cjoc + triggers: + - cron: + spec: '@daily' + buildersList: + - backupBuilder: + subjects: + - buildRecordSubject: { + } + - jobConfigurationSubject: { + } + - systemConfigurationSubject: + omitMasterKey: true + format: + tarGzFormat: { + } + exclusive: false + store: + s3Store: + bucketName: "${sec_s3bucketName}" + bucketFolder: "${s3bucketPreffix}/backup" + region: "${sec_awsRegion}" + retentionPolicy: + upToNRetentionPolicy: + n: 5 + safeDelaySeconds: 0 + concurrentBuild: false diff --git a/blueprints/02-at-scale/cbci/casc/oc/items/root.yaml b/blueprints/02-at-scale/cbci/casc/oc/items/root.yaml index 9a0be7ca..9f2f4646 100644 --- a/blueprints/02-at-scale/cbci/casc/oc/items/root.yaml +++ b/blueprints/02-at-scale/cbci/casc/oc/items/root.yaml @@ -9,7 +9,7 @@ items: kubernetes: memory: 2048 cpus: 1.0 - disk: 20 + disk: 10 storageClassName: "gp3" # Casc, Non-HA - kind: managedController @@ -53,7 +53,7 @@ items: "cloudbees.prometheus": "true" properties: - configurationAsCode: - bundle: "main/none-ha" + bundle: "develop/none-ha" # Casc, HA - kind: managedController name: team-c-ha @@ -62,6 +62,7 @@ items: memory: 6144 cpus: 2.0 disk: 5 + storageClassName: "efs" replication: config: maxReplicas: 4 @@ -100,4 +101,4 @@ items: "cloudbees.prometheus": "true" properties: - configurationAsCode: - bundle: "main/ha" + bundle: "develop/ha" diff --git a/blueprints/02-at-scale/cbci/casc/oc/jcasc/main.yaml b/blueprints/02-at-scale/cbci/casc/oc/jcasc/main.yaml index 36a52eb3..a6f3c506 100644 --- a/blueprints/02-at-scale/cbci/casc/oc/jcasc/main.yaml +++ b/blueprints/02-at-scale/cbci/casc/oc/jcasc/main.yaml @@ -1,6 +1,7 @@ jenkins: systemMessage: "${message}" - numExecutors: 0 + # Setting 1 executor for Backup OC + numExecutors: 1 unclassified: cascItemsConfiguration: variableInterpolationEnabledForAdmin: true @@ -32,7 +33,7 @@ unclassified: logRotator: numToKeepStr: 
"3" cloudBeesCasCServer: - defaultBundle: "main/none-ha" + defaultBundle: ${cascBranch}/none-ha visibility: true beekeeper: enabled: true @@ -55,9 +56,9 @@ masterprovisioning: kind: "StatefulSet" spec: template: - spec: - tolerations: - - key: "dedicated" - operator: "Equal" - value: "cb-apps" - effect: "NoSchedule" + spec: + tolerations: + - key: "dedicated" + operator: "Equal" + value: "cb-apps" + effect: "NoSchedule" diff --git a/blueprints/02-at-scale/cbci/casc/oc/variables/variables.yaml b/blueprints/02-at-scale/cbci/casc/oc/variables/variables.yaml index 5d887643..82f69637 100644 --- a/blueprints/02-at-scale/cbci/casc/oc/variables/variables.yaml +++ b/blueprints/02-at-scale/cbci/casc/oc/variables/variables.yaml @@ -1,7 +1,7 @@ variables: - message: "Welcome to the CloudBees CI blueprint add-on: At scale!" - cascRepo: "https://github.com/cloudbees/terraform-aws-cloudbees-ci-eks-addon.git" - - cascBranch: main + - cascBranch: develop - cascPathController: "/blueprints/02-at-scale/cbci/casc/mc/" - ldapManagerDN: "cn=admin,dc=acme,dc=org" - ldapRootDN: "dc=acme,dc=org" diff --git a/blueprints/02-at-scale/img/at-scale.architect.drawio.svg b/blueprints/02-at-scale/img/at-scale.architect.drawio.svg index 1561c961..0a76581c 100644 --- a/blueprints/02-at-scale/img/at-scale.architect.drawio.svg +++ b/blueprints/02-at-scale/img/at-scale.architect.drawio.svg @@ -1,4 +1,4 @@ - + @@ -22,70 +22,124 @@ - - + + -
-
-
- Region +
+
+
+ + AWS Cloud +
- - Region + + AWS Cloud - - + + -
-
-
- - Custom VPC - +
+
+
+ Availability zone a
- - Custom VPC + + Availability zone a - - + + -
+
+
+
+ Availability zone c +
+
+
+ + + Availability zone c + + + + + + + + +
+
+
+ Availability zone b +
+
+
+
+ + Availability zone b + +
+
+ + + + + +
+
+
+ Region +
+
+
+
+ + Region + +
+
+ + + + + +
- - AWS Cloud + + Custom VPC
- - AWS Cloud + + Custom VPC
- - + + -
+
Amazon Route 53 @@ -95,17 +149,17 @@
- + Amazon Rou... - - + + -
+
K8s Cluster @@ -115,17 +169,17 @@
- + K8s Cluste... - - + + -
+
Amazon EFS @@ -133,17 +187,17 @@
- + Amazon EFS - - + + -
+
Amazon S3 bucket @@ -151,17 +205,17 @@
- + Amazon S3... - - + + -
+
CloudWatch @@ -169,17 +223,17 @@
- + CloudWatch - - + + -
+
Application load @@ -189,16 +243,16 @@
- + Application... - + -
+
Text @@ -206,17 +260,17 @@
- + Text - - + + -
+
AWS certificate @@ -226,18 +280,18 @@
- + AWS certif... - - - + + + -
+
@@ -247,40 +301,40 @@
- + Internet... - - + + -
+
Amazon EBS
- gp3 + gp3-aza
- + Amazon EB... - - + + -
+
Public subnet @@ -288,17 +342,17 @@
- + Public subnet - - + + -
+
NAT @@ -308,17 +362,17 @@
- + NAT... - - + + -
+
Private Subnet @@ -326,17 +380,17 @@
- + Private Subnet - - + + -
+
shared @@ -344,35 +398,17 @@
- + shared - - + + -
-
-
- build-linux -
-
-
-
- - build-lin... - -
-
- - - - - -
+
cb-apps @@ -380,98 +416,40 @@
- + cb-apps - - - - - -
-
-
- Availability zone 1 -
-
-
-
- - Availability zone 1 - -
-
- - - + + -
+
- - build-windows - -
-
-
- - - build-win... - - - - - - - -
-
-
- build-linux-spot -
-
-
-
- - build-lin... - -
-
- - - - - - - -
-
-
Amazon EBS
- gp3 + gp3-azb
- + Amazon EB...
- - + + -
+
Public subnet @@ -479,17 +457,17 @@
- + Public subnet - - + + -
+
NAT @@ -499,17 +477,17 @@
- + NAT... - - + + -
+
Private Subnet @@ -517,17 +495,17 @@
- + Private Subnet - - + + -
+
shared @@ -535,35 +513,17 @@
- + shared - - + + -
-
-
- build-linux -
-
-
-
- - build-lin... - -
-
- - - - - -
+
cb-apps @@ -571,55 +531,16 @@
- + cb-apps - - - - - -
-
-
- Availability zone 2 -
-
-
-
- - Availability zone 2 - -
-
- - - - - - -
-
-
- - build-windows - -
-
-
-
- - build-win... - -
-
- + -
+
build-linux-spot @@ -627,42 +548,42 @@
- + build-lin... - - - - + + + + -
+
Amazon EBS
- gp3 + gp3-azc
- + Amazon EB... - - + + -
+
Public subnet @@ -670,17 +591,17 @@
- + Public subnet - - + + -
+
NAT @@ -690,17 +611,17 @@
- + NAT... - - + + -
+
Private Subnet @@ -708,17 +629,17 @@
- + Private Subnet - - + + -
+
shared @@ -726,35 +647,17 @@
- + shared - - - - - -
-
-
- build-linux -
-
-
-
- - build-lin... - -
-
- - + + -
+
cb-apps @@ -762,75 +665,76 @@
- + cb-apps - - + -
-
-
- Availability zone 3 +
+
+
+ build-linux-spot
- - Availability zone 3 + + build-lin... - - - + + + + -
-
-
- - build-windows - +
+
+
+ Autoscaling +
+ group
- - build-win... + + Autoscal... - + + + -
+
- build-linux-spot + Autoscaling +
+ group
- - build-lin... + + Autoscal... - - - - + -
+
Autoscaling @@ -840,149 +744,146 @@
- + Autoscal... - - - - + -
+
- s3 + Autoscaling
- Profile + group
- - s3... + + Autoscal... - - + + -
+
- Autoscaling + Elastic Container
- group + Registry
- - Autoscal... + + Elastic Co... - + -
+
- Autoscaling + ECR
- group + Profile
- - Autoscal... + + ECR... - + -
+
- Autoscaling -
- group + build-linux-spot
- - Autoscal... + + build-lin... - + + + -
+
- Autoscaling -
- group + build windows spot
- - Autoscal... + + build win... - - + + + -
+
- Elastic Container -
- Registry + build windows spot
- - Elastic Co... + + build win... - + + + -
+
- ECR -
- Profile + build windows spot
- - ECR... + + build win... + + + diff --git a/blueprints/02-at-scale/img/at-scale.k8s.drawio.svg b/blueprints/02-at-scale/img/at-scale.k8s.drawio.svg index 57d4079d..a84bd081 100644 --- a/blueprints/02-at-scale/img/at-scale.k8s.drawio.svg +++ b/blueprints/02-at-scale/img/at-scale.k8s.drawio.svg @@ -1,4 +1,4 @@ - + @@ -18,8 +18,7 @@ - - + @@ -39,35 +38,20 @@ - - - - - + + + + + + + + + + -
-
-
- - /velero - -
-
-
-
- - /velero - -
-
- - - - - -
+
Amazon S3 bucket @@ -75,18 +59,18 @@
- + Amazon S3 b... - - - + + + -
+
@@ -97,18 +81,18 @@
- + Metric serv... - - - + + + -
+
@@ -120,7 +104,7 @@
- + Amazon EFS... @@ -148,13 +132,13 @@ - - - + + + -
+
@@ -165,18 +149,18 @@
- + Autoscaler - - - + + + -
+
@@ -189,20 +173,20 @@
- + AWS Load... - - - - - + + + + + -
+
Autoscaling @@ -212,17 +196,17 @@
- + Autoscal... - - + + -
+
Amazon EFS @@ -230,7 +214,7 @@
- + Amazon EFS @@ -256,11 +240,11 @@ - + -
+
@@ -272,7 +256,7 @@
- + kube-system @@ -280,22 +264,22 @@ - - - - - - - - - - - - + + + + + + + + + + + + -
+
@@ -305,17 +289,17 @@
- + /cbci - - + + -
+
CloudWatch @@ -323,20 +307,19 @@
- + CloudWatch - - - - - + + + + -
+
EKS @@ -344,17 +327,17 @@
- + EKS - - + + -
+
ALB @@ -362,7 +345,7 @@
- + ALB @@ -370,7 +353,7 @@ -
+
@@ -382,17 +365,17 @@
- + external-dns - - + + -
+
@@ -404,18 +387,17 @@
- + Backup/Restore - - - + + -
+
@@ -427,40 +409,19 @@
- + Long-term logs - - + + + -
-
-
- - - Fluent Bit - - -
-
-
-
- - Fluent Bit - -
-
- - - - - -
-
+
+
Amazon Route 53 @@ -473,15 +434,16 @@
- + Amazon Route 53... + -
+
@@ -491,41 +453,38 @@
- + /... - - -
+
- kube-prometheus-stack + observability
- - kube-prometheus-stack + + observability - - - - + + + -
+
@@ -538,7 +497,7 @@
- + Node Termin... @@ -546,7 +505,7 @@ -
+
@@ -558,18 +517,18 @@
- + bottle-rocket-oper... - - - + + + -
+
@@ -581,7 +540,7 @@
- + Short-term logs @@ -589,7 +548,7 @@ -
+
@@ -603,7 +562,7 @@
- + velero @@ -611,31 +570,7 @@ -
-
-
- - - - cert-manager - - - -
-
-
-
- - cert-manager - -
-
- - - - - -
+
@@ -650,7 +585,7 @@
- + cbci-agents @@ -679,36 +614,35 @@ - - - - - + + + + -
+
- cbci-sec-reg +
- - cbci-sec-reg - + - - - + + + + + -
+
@@ -718,36 +652,37 @@
- + - - - - - + + + + + + -
+
-
+ cbci-sec-reg
- + + cbci-sec-reg + - - -
+
@@ -761,64 +696,17 @@
- + auth - - - - - - - - Tempo - - - - - - Prometheus - - - - - - Grafana - - - - - - - - - -
-
-
- - - - vault - - - -
-
-
-
- - vault - -
-
- + + -
+
@@ -833,16 +721,16 @@
- + cbci - + -
+
@@ -852,18 +740,18 @@
- + Workspace cache - - - + + + -
+
@@ -875,16 +763,16 @@
- + Backup/Restore - + -
+
@@ -896,20 +784,19 @@
- + Artifacts - - - - - + + + + -
+
@@ -919,38 +806,38 @@
- - cbci-sec-casc + + cbci-sec-ca... - + - + cjoc - + - + team-b - + - + team-a - - - - - + + + + + -
+
@@ -967,29 +854,29 @@
- + Hibernati... - - - - + + + + - + team-c-ha - - - - - + + + + + -
+
@@ -1006,7 +893,7 @@
- + DockerHub... @@ -1016,7 +903,7 @@ -
+
@@ -1030,7 +917,7 @@
- + Elastic Container... @@ -1044,7 +931,7 @@
- Private Images + Private Registry
@@ -1052,35 +939,299 @@
- Private Images + Private Regist... - + -
+
- Public Images + Public Registry
- - Public Images + + Public Registry - + - - + + + + + + +
+
+
+ + + Bottle Rocket +
+ Worker Nodes +
+
+
+
+
+
+
+ + Bottle Rocket... + +
+
+ + + + + +
+
+
+ + /velero + +
+
+
+
+ + /vele... + +
+
+ + + + OTEL + + + Collector + + + + + + Prometheus + + + (Metrics) + + + + + + Grafana + + + + + + + + Loki + + + (Logs) + + + + + + + + + + + + Tempo + + + (APM) + + + + + + +
+
+
+ + + Fluent Bit +
+
+
+
+
+
+
+ + Fluent Bi... + +
+
+ + + +
+
+
+ + + Short-term logs + + +
+
+
+
+ + Short-term logs + +
+
+ + + +
+
+
+ + + metrics + + +
+
+
+
+ + metrics + +
+
+ + + +
+
+
+ + + tracing + + +
+
+
+
+ + tracing + +
+
+ + + +
+
+
+ + + build logs + + +
+
+
+
+ + build lo... + +
+
+ + + + + + +
+
+
+ + + + cert-manager + + + +
+
+
+
+ + cert-manager + +
+
+ + + + + +
+
+
+ + + + vault + + + +
+
+
+
+ + vault + +
+
+ + + + +
+
+
+ s3 +
+ Profile +
+
+
+
+ + s3... + +
+
diff --git a/blueprints/02-at-scale/img/observability/cbci-fluenbit-example.png b/blueprints/02-at-scale/img/observability/cbci-logs-cloudwatch.png similarity index 100% rename from blueprints/02-at-scale/img/observability/cbci-fluenbit-example.png rename to blueprints/02-at-scale/img/observability/cbci-logs-cloudwatch.png diff --git a/blueprints/02-at-scale/img/observability/cbci-logs-loki.png b/blueprints/02-at-scale/img/observability/cbci-logs-loki.png new file mode 100644 index 00000000..b486889a Binary files /dev/null and b/blueprints/02-at-scale/img/observability/cbci-logs-loki.png differ diff --git a/blueprints/02-at-scale/img/observability/cbci-dashboard.png b/blueprints/02-at-scale/img/observability/cbci-metrics-dashboard.png similarity index 100% rename from blueprints/02-at-scale/img/observability/cbci-dashboard.png rename to blueprints/02-at-scale/img/observability/cbci-metrics-dashboard.png diff --git a/blueprints/02-at-scale/img/observability/cbci-tracing-example.png b/blueprints/02-at-scale/img/observability/cbci-tracing-tempo.png similarity index 100% rename from blueprints/02-at-scale/img/observability/cbci-tracing-example.png rename to blueprints/02-at-scale/img/observability/cbci-tracing-tempo.png diff --git a/blueprints/02-at-scale/k8s/aws-alb-controller-values.yml b/blueprints/02-at-scale/k8s/aws-alb-controller-values.yml index 4696cec1..3962fbf2 100644 --- a/blueprints/02-at-scale/k8s/aws-alb-controller-values.yml +++ b/blueprints/02-at-scale/k8s/aws-alb-controller-values.yml @@ -1,6 +1,7 @@ # Copyright (c) CloudBees, Inc. 
-#https://github.com/aws/eks-charts/blob/master/stable/aws-load-balancer-controller/values.yaml +# https://artifacthub.io/packages/helm/aws/aws-load-balancer-controller +# https://github.com/aws/eks-charts/blob/master/stable/aws-load-balancer-controller/values.yaml nodeSelector: kubernetes.io/os: linux diff --git a/blueprints/02-at-scale/k8s/aws-for-fluent-bit-values.yml b/blueprints/02-at-scale/k8s/aws-for-fluent-bit-values.yml index e370fcd9..d0670f2b 100644 --- a/blueprints/02-at-scale/k8s/aws-for-fluent-bit-values.yml +++ b/blueprints/02-at-scale/k8s/aws-for-fluent-bit-values.yml @@ -1,10 +1,26 @@ # Copyright (c) CloudBees, Inc. -#https://artifacthub.io/packages/helm/aws/aws-for-fluent-bit -#https://github.com/aws/eks-charts/blob/master/stable/aws-for-fluent-bit/Chart.yaml +# https://artifacthub.io/packages/helm/aws/aws-for-fluent-bit +# https://github.com/aws/eks-charts/blob/master/stable/aws-for-fluent-bit/values.yaml -nodeSelector: - kubernetes.io/os: linux +tolerations: +- effect: "NoSchedule" + key: "${cbciAppsTolerationKey}" + operator: "Equal" + value: "${cbciAppsTolerationValue}" +- effect: "NoSchedule" + key: "dedicated" + operator: "Equal" + value: "build-linux-l" +- effect: "NoSchedule" + key: "dedicated" + operator: "Equal" + value: "build-linux-xl" +# Note: It is not compatible with Windows nodes by default. 
+# - effect: "NoSchedule" +# key: "dedicated" +# operator: "Equal" +# value: "build-windows" cloudWatchLogs: enabled: true @@ -20,3 +36,12 @@ s3: region: "${region}" bucket: "${bucketName}" s3KeyFormat: /fluentbit/$TAG/%Y-%m-%d/%H-%M-%S + +additionalOutputs: | + [OUTPUT] + name loki + host loki.observability.svc.cluster.local + port 3100 + match * + labels job=fluentbit + auto_kubernetes_labels on diff --git a/blueprints/02-at-scale/k8s/aws-node-term-handler-values.yml b/blueprints/02-at-scale/k8s/aws-node-term-handler-values.yml index 47ed69a6..557f308b 100644 --- a/blueprints/02-at-scale/k8s/aws-node-term-handler-values.yml +++ b/blueprints/02-at-scale/k8s/aws-node-term-handler-values.yml @@ -1,4 +1,7 @@ -#https://github.com/aws/aws-node-termination-handler/tree/main/config/helm/aws-node-termination-handler +# Copyright (c) CloudBees, Inc. + +# https://artifacthub.io/packages/helm/aws/aws-node-termination-handler +# https://github.com/aws/aws-node-termination-handler/tree/main/config/helm/aws-node-termination-handler nodeSelector: role: build-linux-spot diff --git a/blueprints/02-at-scale/k8s/br-update-operator-values.yml b/blueprints/02-at-scale/k8s/br-update-operator-values.yml index eb3bf3d9..e33c8d4a 100644 --- a/blueprints/02-at-scale/k8s/br-update-operator-values.yml +++ b/blueprints/02-at-scale/k8s/br-update-operator-values.yml @@ -1,11 +1,11 @@ # Copyright (c) CloudBees, Inc. -#https://github.com/bottlerocket-os/bottlerocket-update-operator/blob/develop/README.md#configuration +# https://github.com/bottlerocket-os/bottlerocket-update-operator/blob/develop/README.md#configuration -#Planned Update: checks every Saturday at 23H / 11PM +# Planned Update: checks every Saturday at 23H / 11PM scheduler_cron_expression: "0 0 23 * * Sat *" -#TODO: Add toleration for bottle rockets nodes +# TODO: Add toleration for bottle rockets nodes placement: agent: # The agent is a daemonset, so the only controls that apply to it are tolerations. 
@@ -22,7 +22,7 @@ placement: operator: Equal value: cb-apps effect: NoSchedule - #TODO: Add selector for bottle rockets nodes + # TODO: Add selector for bottle rockets nodes controller: nodeSelector: diff --git a/blueprints/02-at-scale/k8s/cbci-values.yml b/blueprints/02-at-scale/k8s/cbci-values.yml index ada333af..bb00c96e 100644 --- a/blueprints/02-at-scale/k8s/cbci-values.yml +++ b/blueprints/02-at-scale/k8s/cbci-values.yml @@ -1,7 +1,7 @@ # Copyright (c) CloudBees, Inc. -#https://artifacthub.io/packages/helm/cloudbees/cloudbees-core/ -#https://docs.cloudbees.com/docs/cloudbees-ci/latest/eks-install-guide/installing-eks-using-helm +# https://artifacthub.io/packages/helm/cloudbees/cloudbees-core/ +# https://docs.cloudbees.com/docs/cloudbees-ci/latest/eks-install-guide/installing-eks-using-helm OperationsCenter: NodeSelector: @@ -16,7 +16,7 @@ OperationsCenter: Retriever: Enabled: true scmRepo: "https://github.com/cloudbees/terraform-aws-cloudbees-ci-eks-addon.git" - scmBranch: main + scmBranch: develop scmBundlePath: blueprints/02-at-scale/cbci/casc/oc scmPollingInterval: PT20M Persistence: diff --git a/blueprints/02-at-scale/k8s/cert-manager-values.yml b/blueprints/02-at-scale/k8s/cert-manager-values.yml index fae36a89..642807ef 100644 --- a/blueprints/02-at-scale/k8s/cert-manager-values.yml +++ b/blueprints/02-at-scale/k8s/cert-manager-values.yml @@ -1,5 +1,6 @@ # Copyright (c) CloudBees, Inc. +# https://artifacthub.io/packages/helm/cert-manager/cert-manager # https://github.com/cert-manager/cert-manager/blob/master/deploy/charts/cert-manager/Chart.template.yaml nodeSelector: diff --git a/blueprints/02-at-scale/k8s/cluster-autoscaler-values.yml b/blueprints/02-at-scale/k8s/cluster-autoscaler-values.yml index a02b8788..b1084c36 100644 --- a/blueprints/02-at-scale/k8s/cluster-autoscaler-values.yml +++ b/blueprints/02-at-scale/k8s/cluster-autoscaler-values.yml @@ -1,5 +1,6 @@ # Copyright (c) CloudBees, Inc. 
+# https://artifacthub.io/packages/helm/cluster-autoscaler/cluster-autoscaler # https://github.com/kubernetes/autoscaler/blob/master/charts/cluster-autoscaler/Chart.yaml nodeSelector: diff --git a/blueprints/02-at-scale/k8s/extdns-values.yml b/blueprints/02-at-scale/k8s/extdns-values.yml index 070e0254..cb6ea450 100644 --- a/blueprints/02-at-scale/k8s/extdns-values.yml +++ b/blueprints/02-at-scale/k8s/extdns-values.yml @@ -1,7 +1,7 @@ # Copyright (c) CloudBees, Inc. -#https://artifacthub.io/packages/helm/external-dns/external-dns -#https://github.com/kubernetes-sigs/external-dns/tree/master/charts/external-dns/Chart.yaml +# https://artifacthub.io/packages/helm/external-dns/external-dns +# https://github.com/kubernetes-sigs/external-dns/tree/master/charts/external-dns/Chart.yaml provider: "aws" domainFilters: [ "${zoneDNS}" ] diff --git a/blueprints/02-at-scale/k8s/grafana-loki-values.yml b/blueprints/02-at-scale/k8s/grafana-loki-values.yml new file mode 100644 index 00000000..8b5c6983 --- /dev/null +++ b/blueprints/02-at-scale/k8s/grafana-loki-values.yml @@ -0,0 +1,33 @@ +# Copyright (c) CloudBees, Inc. + +# https://artifacthub.io/packages/helm/grafana/loki +# https://github.com/grafana/helm-charts/blob/main/charts/loki/README.md +# https://github.com/grafana/loki/blob/main/production/helm/loki/values.yaml +# https://grafana.com/docs/loki/next/setup/install/helm/ + +deploymentMode: SingleBinary +loki: + commonConfig: + replication_factor: 1 + storage: + type: 'filesystem' + schemaConfig: + configs: + - from: "2024-01-01" + store: tsdb + index: + prefix: loki_index_ + period: 24h + object_store: filesystem # we're storing on filesystem so there's no real persistence here. 
+ schema: v13 + auth_enabled: false +singleBinary: + replicas: 1 + nodeSelector: + kubernetes.io/os: linux +read: + replicas: 0 +backend: + replicas: 0 +write: + replicas: 0 diff --git a/blueprints/02-at-scale/k8s/grafana-tempo-values.yml b/blueprints/02-at-scale/k8s/grafana-tempo-values.yml new file mode 100644 index 00000000..aef0edcc --- /dev/null +++ b/blueprints/02-at-scale/k8s/grafana-tempo-values.yml @@ -0,0 +1,10 @@ +# Copyright (c) CloudBees, Inc. + +# https://github.com/grafana/helm-charts/blob/main/charts/tempo/values.yaml +# https://artifacthub.io/packages/helm/grafana/tempo + +nodeSelector: + kubernetes.io/os: linux + +tempoQuery: + enabled: true diff --git a/blueprints/02-at-scale/k8s/grafana-tempo.yml b/blueprints/02-at-scale/k8s/grafana-tempo.yml deleted file mode 100644 index 8a61fb2d..00000000 --- a/blueprints/02-at-scale/k8s/grafana-tempo.yml +++ /dev/null @@ -1,7 +0,0 @@ -#https://github.com/grafana/helm-charts/blob/main/charts/tempo/values.yaml - -tempoQuery: - enabled: true - -nodeSelector: - kubernetes.io/os: linux diff --git a/blueprints/02-at-scale/k8s/kube-prom-stack-values.yml b/blueprints/02-at-scale/k8s/kube-prom-stack-values.yml index 41839d10..c6f22753 100644 --- a/blueprints/02-at-scale/k8s/kube-prom-stack-values.yml +++ b/blueprints/02-at-scale/k8s/kube-prom-stack-values.yml @@ -1,7 +1,10 @@ # Copyright (c) CloudBees, Inc. 
-#https://artifacthub.io/packages/helm/prometheus-community/kube-prometheus-stack -#https://github.com/prometheus-community/helm-charts/blob/main/charts/kube-prometheus-stack/Chart.yaml +# https://artifacthub.io/packages/helm/prometheus-community/kube-prometheus-stack +# https://github.com/prometheus-community/helm-charts/blob/main/charts/kube-prometheus-stack/values.yaml +# https://artifacthub.io/packages/helm/prometheus-community/prometheus +# https://artifacthub.io/packages/helm/grafana/grafana +# https://artifacthub.io/packages/helm/prometheus-community/alertmanager alertmanager: alertmanagerSpec: @@ -9,12 +12,13 @@ alertmanager: kubernetes.io/os: linux prometheus: prometheusSpec: - # Not Used since OC uses ServiceMonitor - # additionalScrapeConfigs: - # - job_name: "cjoc" - # metrics_path: "/prometheus/" - # static_configs: - # - targets: ["cjoc.cbci.svc.cluster.local:80"] + additionalScrapeConfigs: + - job_name: "otel-collector" + static_configs: + #OTEL collector own metrics https://opentelemetry.io/docs/collector/internal-telemetry/ + - targets: ["otel-collector-opentelemetry-collector:8888"] + #OTEL collector exporter prometheus metrics + - targets: ["otel-collector-opentelemetry-collector:8889"] serviceMonitorSelector: # Note: For all Service Monitors, use a common label matchLabels: @@ -118,16 +122,45 @@ kube-state-metrics: grafana: nodeSelector: kubernetes.io/os: linux + ingress: + enabled: true + annotations: + kubernetes.io/ingress.class: alb + alb.ingress.kubernetes.io/scheme: internet-facing + alb.ingress.kubernetes.io/target-type: ip + alb.ingress.kubernetes.io/listen-ports: '[{"HTTP": 80}, {"HTTPS":443}]' + alb.ingress.kubernetes.io/certificate-arn: ${cert_arn} + alb.ingress.kubernetes.io/actions.ssl-redirect: '{"Type": "redirect", "RedirectConfig": { "Protocol": "HTTPS", "Port": "443", "StatusCode": "HTTP_301"}}' + external-dns.alpha.kubernetes.io/hostname: ${grafana_hostname} + hosts: + - ${grafana_hostname} adminPassword: ${grafana_password} + 
plugins: + - grafana-lokiexplore-app datasources: datasources.yaml: apiVersion: 1 datasources: + - name: Loki + type: loki + access: proxy + url: http://loki-gateway + uid: lokiDatasource + derivedFields: + - datasourceUid: tempoDatasource + matcherRegex: trace_id=(\w+) + name: TraceID + url: '$${__value.raw}' - name: Tempo type: tempo - url: http://tempo.kube-prometheus-stack.svc.cluster.local:3100 + url: http://tempo:3100 access: proxy - isDefault: false + uid: tempoDatasource + jsonData: + tracesToLogsV2: + datasourceUid: lokiDatasource + tracesToMetrics: + datasourceUid: prometheus dashboardProviders: dashboardproviders.yaml: apiVersion: 1 @@ -150,19 +183,22 @@ grafana: path: /var/lib/grafana/dashboards/grafana-dashboards-cloudbees dashboards: grafana-dashboards-kubernetes: - k8s-views-global: - url: https://raw.githubusercontent.com/dotdc/grafana-dashboards-kubernetes/master/dashboards/k8s-views-global.json - token: "" - k8s-views-namespaces: - url: https://raw.githubusercontent.com/dotdc/grafana-dashboards-kubernetes/master/dashboards/k8s-views-namespaces.json - token: "" - k8s-views-nodes: - url: https://raw.githubusercontent.com/dotdc/grafana-dashboards-kubernetes/master/dashboards/k8s-views-nodes.json - token: "" - k8s-views-pods: - url: https://raw.githubusercontent.com/dotdc/grafana-dashboards-kubernetes/master/dashboards/k8s-views-pods.json - token: "" + k8s-views-global: + url: https://raw.githubusercontent.com/dotdc/grafana-dashboards-kubernetes/master/dashboards/k8s-views-global.json + token: "" + k8s-views-namespaces: + url: https://raw.githubusercontent.com/dotdc/grafana-dashboards-kubernetes/master/dashboards/k8s-views-namespaces.json + token: "" + k8s-views-nodes: + url: https://raw.githubusercontent.com/dotdc/grafana-dashboards-kubernetes/master/dashboards/k8s-views-nodes.json + token: "" + k8s-views-pods: + url: https://raw.githubusercontent.com/dotdc/grafana-dashboards-kubernetes/master/dashboards/k8s-views-pods.json + token: "" 
grafana-dashboards-cloudbees: - cb-controllers: - url: https://raw.githubusercontent.com/cloudbees/terraform-aws-cloudbees-ci-eks-addon/main/blueprints/02-at-scale/k8s/kube-prom-stack-grafana-db.json + prometheus-plugin: + url: https://raw.githubusercontent.com/cloudbees/terraform-aws-cloudbees-ci-eks-addon/develop/blueprints/02-at-scale/k8s/prometheus-plugin-db.json + token: "" + otel-plugin: + url: https://raw.githubusercontent.com/cloudbees/terraform-aws-cloudbees-ci-eks-addon/develop/blueprints/02-at-scale/k8s/opentelemetry-plugin-db.json token: "" diff --git a/blueprints/02-at-scale/k8s/metrics-server-values.yml b/blueprints/02-at-scale/k8s/metrics-server-values.yml index bc5cbe55..2c727561 100644 --- a/blueprints/02-at-scale/k8s/metrics-server-values.yml +++ b/blueprints/02-at-scale/k8s/metrics-server-values.yml @@ -1,5 +1,6 @@ # Copyright (c) CloudBees, Inc. +# https://artifacthub.io/packages/helm/metrics-server/metrics-server # https://github.com/kubernetes-sigs/metrics-server/blob/master/charts/metrics-server/Chart.yaml nodeSelector: diff --git a/blueprints/02-at-scale/k8s/openldap-stack-values.yml b/blueprints/02-at-scale/k8s/openldap-stack-values.yml index 2c7eeae2..b3fb08eb 100644 --- a/blueprints/02-at-scale/k8s/openldap-stack-values.yml +++ b/blueprints/02-at-scale/k8s/openldap-stack-values.yml @@ -1,6 +1,6 @@ # Copyright (c) CloudBees, Inc. 
-#https://artifacthub.io/packages/helm/helm-openldap/openldap-stack-ha +# https://artifacthub.io/packages/helm/helm-openldap/openldap-stack-ha global: ldapDomain: "acme.org" diff --git a/blueprints/02-at-scale/k8s/opentelemetry-plugin-db.json b/blueprints/02-at-scale/k8s/opentelemetry-plugin-db.json new file mode 100644 index 00000000..deb15c78 --- /dev/null +++ b/blueprints/02-at-scale/k8s/opentelemetry-plugin-db.json @@ -0,0 +1,582 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 35, + "links": [], + "liveNow": false, + "panels": [ + { + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 9, + "title": "JVM", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 0, + "y": 1 + }, + "id": 10, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.0.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "avg(process_runtime_jvm_cpu_utilization_ratio{k8s_pod_name=\"$pod\"})", + "instant": false, + "range": true, + "refId": "A" + } + ], + "title": "JVM CPU", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + 
"fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 4, + "y": 1 + }, + "id": 11, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.0.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "avg(process_runtime_jvm_gc_duration_seconds_count{k8s_pod_name=\"$pod\"})", + "instant": false, + "range": true, + "refId": "A" + } + ], + "title": "JVM GC Duration", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 6 + }, + "id": 3, + "panels": [], + "title": "Agents", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 0, + "y": 7 + }, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.0.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "avg(jenkins_cloud_agents_completed_total{k8s_pod_name=\"$pod\"})", + "instant": false, + "range": true, + "refId": "A" + } + ], + "title": 
"Cloud Agents Completed Total", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 4, + "y": 7 + }, + "id": 7, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.0.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "avg(jenkins_agents_online_ratio{k8s_pod_name=\"$pod\"})", + "instant": false, + "range": true, + "refId": "A" + } + ], + "title": "Agents Online Ratio", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 8, + "y": 7 + }, + "id": 8, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.0.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "avg(jenkins_agents_offline_ratio{k8s_pod_name=\"$pod\"})", + "instant": false, + "range": true, + "refId": "A" + } + ], + "title": "Agents Offline Ratio", + "type": "stat" + }, + { 
+ "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 12 + }, + "id": 2, + "panels": [], + "title": "SCM", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 4, + "x": 0, + "y": 13 + }, + "id": 1, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.0.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "avg(jenkins_scm_event_active_threads{k8s_pod_name=\"$pod\"})", + "instant": false, + "range": true, + "refId": "A" + } + ], + "title": "SCM Active Threads", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 4, + "x": 4, + "y": 13 + }, + "id": 5, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.0.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "avg(jenkins_scm_event_completed_tasks_total{k8s_pod_name=\"$pod\"})", + 
"instant": false, + "range": true, + "refId": "A" + } + ], + "title": "SCM Completed Task", + "type": "stat" + } + ], + "refresh": "", + "schemaVersion": 38, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "current": { + "selected": true, + "text": "team-b-0", + "value": "team-b-0" + }, + "definition": "query_result(up{container=\"jenkins\"})", + "hide": 0, + "includeAll": false, + "label": "pod", + "multi": false, + "name": "pod", + "options": [], + "query": { + "query": "query_result(up{container=\"jenkins\"})", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": ".*pod=\"(.*?)\".*", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "current": { + "selected": true, + "text": "team-c-ha", + "value": "team-c-ha" + }, + "definition": "query_result(up{container=\"jenkins\"})", + "hide": 0, + "includeAll": false, + "label": "service", + "multi": false, + "name": "service", + "options": [], + "query": { + "query": "query_result(up{container=\"jenkins\"})", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": ".*service=\"(.*?)\".*", + "skipUrlSync": false, + "sort": 0, + "type": "query" + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Jenkins - OpenTelemetry", + "uid": "be8efb28-3273-4659-a7fe-1baf4ff6880c", + "version": 4, + "weekStart": "" +} diff --git a/blueprints/02-at-scale/k8s/otel-collector-values.yml b/blueprints/02-at-scale/k8s/otel-collector-values.yml new file mode 100644 index 00000000..44bef63e --- /dev/null +++ b/blueprints/02-at-scale/k8s/otel-collector-values.yml @@ -0,0 +1,84 @@ +# Copyright (c) CloudBees, Inc. 
+ +# https://artifacthub.io/packages/helm/opentelemetry-helm/opentelemetry-collector +# https://github.com/open-telemetry/opentelemetry-helm-charts/blob/main/charts/opentelemetry-collector/values.yaml + +nodeSelector: + kubernetes.io/os: linux + +image: + repository: "otel/opentelemetry-collector-contrib" + +mode: "deployment" +presets: + kubernetesAttributes: + enabled: true + extractAllPodLabels: true + extractAllPodAnnotations: true +service: + type: ClusterIP +ports: + metrics: + enabled: true + prometheus: + enabled: true + containerPort: 8889 + servicePort: 8889 + protocol: TCP +config: + receivers: + otlp: + protocols: + grpc: + endpoint: '${env:MY_POD_IP}:4317' + http: + endpoint: '${env:MY_POD_IP}:4318' + exporters: + # Note: Not compatible with CPE + # otlphttp/loki: + # endpoint: 'http://loki.observability.svc.cluster.local:3100/otlp' + otlphttp/tempo: + endpoint: http://tempo.observability.svc.cluster.local:4318 + prometheus: + endpoint: '${env:MY_POD_IP}:8889' + resource_to_telemetry_conversion: + enabled: true + debug: + verbosity: detailed + sampling_initial: 5 + sampling_thereafter: 200 + extensions: + health_check: {} + pprof: + endpoint: :1888 + zpages: + endpoint: :55679 + + service: + extensions: [pprof, zpages, health_check] + pipelines: + # Note: Not compatible with CPE + # logs: + # receivers: + # - otlp/jenkins + # processors: + # - batch + # exporters: + # - otlphttp/loki + # - debug + traces: + receivers: + - otlp + processors: + - batch + exporters: + - otlphttp/tempo + - debug + metrics: + receivers: + - otlp + processors: + - batch + exporters: + - prometheus + - debug diff --git a/blueprints/02-at-scale/k8s/kube-prom-stack-grafana-db.json b/blueprints/02-at-scale/k8s/prometheus-plugin-db.json similarity index 75% rename from blueprints/02-at-scale/k8s/kube-prom-stack-grafana-db.json rename to blueprints/02-at-scale/k8s/prometheus-plugin-db.json index f579e52e..da6306d8 100644 --- 
a/blueprints/02-at-scale/k8s/kube-prom-stack-grafana-db.json +++ b/blueprints/02-at-scale/k8s/prometheus-plugin-db.json @@ -20,7 +20,6 @@ "fiscalYearStartMonth": 0, "gnetId": 9964, "graphTooltip": 0, - "id": 30, "links": [ { "asDropdown": false, @@ -45,9 +44,9 @@ "x": 0, "y": 0 }, - "id": 32, + "id": 42, "panels": [], - "title": "Performance", + "title": "CBCI Pod - Readiness", "type": "row" }, { @@ -59,15 +58,19 @@ "defaults": { "mappings": [], "thresholds": { - "mode": "absolute", + "mode": "percentage", "steps": [ { - "color": "green", + "color": "red", "value": null }, { - "color": "red", - "value": 80 + "color": "orange", + "value": 70 + }, + { + "color": "green", + "value": 85 } ] }, @@ -85,18 +88,16 @@ "links": [], "maxDataPoints": 100, "options": { - "colorMode": "background", - "graphMode": "none", - "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": [ - "lastNotNull" + "mean" ], "fields": "", "values": false }, - "textMode": "auto" + "showThresholdLabels": false, + "showThresholdMarkers": true }, "pluginVersion": "10.0.2", "targets": [ @@ -106,7 +107,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(jenkins_health_check_score{pod=\"$pod\"}) by (pod)", + "expr": "avg(jenkins_health_check_score{pod=~\"$pod\"})", "format": "time_series", "intervalFactor": 1, "range": true, @@ -114,8 +115,8 @@ "textEditor": false } ], - "title": "Jenkins health", - "type": "stat" + "title": "Jenkins Health Score", + "type": "gauge" }, { "datasource": { @@ -129,18 +130,23 @@ }, "mappings": [], "thresholds": { - "mode": "absolute", + "mode": "percentage", "steps": [ { - "color": "green", + "color": "red", "value": null }, { - "color": "red", - "value": 80 + "color": "orange", + "value": 97 + }, + { + "color": "green", + "value": 99 } ] - } + }, + "unit": "percentunit" }, "overrides": [] }, @@ -155,7 +161,7 @@ "orientation": "auto", "reduceOptions": { "calcs": [ - "lastNotNull" + "mean" ], "fields": "", "values": false @@ -171,7 +177,7 @@ 
"uid": "prometheus" }, "editorMode": "code", - "expr": "vm_uptime_milliseconds{pod=~\"$pod\"} / 3600000", + "expr": "avg(up{pod=~\"$pod\"})", "instant": false, "range": true, "refId": "A" @@ -187,9 +193,6 @@ }, "fieldConfig": { "defaults": { - "color": { - "mode": "thresholds" - }, "mappings": [], "thresholds": { "mode": "absolute", @@ -200,23 +203,24 @@ }, { "color": "red", - "value": 1 + "value": 80 } ] - } + }, + "unit": "short" }, "overrides": [] }, "gridPos": { "h": 4, - "w": 4, + "w": 3, "x": 8, "y": 1 }, - "id": 39, + "id": 33, "options": { - "colorMode": "value", - "graphMode": "area", + "colorMode": "background", + "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { @@ -235,14 +239,14 @@ "type": "prometheus", "uid": "prometheus" }, - "editorMode": "builder", - "expr": "vm_deadlock_count{pod=\"$pod\"}", + "editorMode": "code", + "expr": "jenkins_plugins_failed{pod=~\"$pod\"}", "instant": false, "range": true, "refId": "A" } ], - "title": "Number of Deadlocks", + "title": "Plugins Failed", "type": "stat" }, { @@ -252,9 +256,6 @@ }, "fieldConfig": { "defaults": { - "color": { - "mode": "thresholds" - }, "mappings": [], "thresholds": { "mode": "absolute", @@ -265,24 +266,24 @@ }, { "color": "red", - "value": 0.7997 + "value": 80 } ] }, - "unit": "percentunit" + "unit": "short" }, "overrides": [] }, "gridPos": { "h": 4, - "w": 4, - "x": 12, + "w": 3, + "x": 11, "y": 1 }, - "id": 40, + "id": 39, "options": { - "colorMode": "value", - "graphMode": "area", + "colorMode": "background", + "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { @@ -302,13 +303,13 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "vm_file_descriptor_ratio{pod=\"$pod\"}", + "expr": "avg(vm_deadlock_count{pod=~\"$pod\"})", "instant": false, "range": true, "refId": "A" } ], - "title": "File Descriptor Ratio", + "title": "Number of Deadlocks", "type": "stat" }, { @@ -316,10 +317,162 @@ "type": "prometheus", "uid": 
"prometheus" }, + "description": "", "fieldConfig": { "defaults": { "color": { - "mode": "thresholds" + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 14, + "x": 0, + "y": 5 + }, + "id": 43, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "10.0.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "avg(process_max_fds{pod=~\"$pod\"}) ", + "instant": false, + "legendFormat": "Max Open Files", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "avg(process_open_fds{pod=~\"$pod\"}) ", + "hide": false, + "instant": false, + "legendFormat": "Open files", + "range": true, + "refId": "B" + } + ], + "title": "Process Open Files", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 14 + }, + "id": 32, + "panels": [], + "title": "CBCI Pod - JVM status", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + 
"fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } }, "mappings": [], "thresholds": { @@ -344,27 +497,28 @@ "overrides": [] }, "gridPos": { - "h": 4, - "w": 4, - "x": 16, - "y": 1 + "h": 7, + "w": 24, + "x": 0, + "y": 15 }, "id": 5, "links": [], "maxDataPoints": 100, "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true }, - "textMode": "auto" + "timezone": [ + "utc" + ], + "tooltip": { + "mode": "single", + "sort": "none" + } }, "pluginVersion": "10.0.2", "targets": [ @@ -373,16 +527,17 @@ "type": "prometheus", "uid": "prometheus" }, - "editorMode": "builder", - "expr": "vm_cpu_load{pod=\"$pod\"}", + "editorMode": "code", + "expr": "avg(vm_cpu_load{pod=~\"$pod\"})", "format": "time_series", "intervalFactor": 1, + "legendFormat": "{{pod}} JVM CPU Load", "range": true, "refId": "A" } ], - "title": "CPU Core Usage", - "type": "stat" + "title": "JVM CPU Load", + "type": "timeseries" }, { "datasource": { @@ -409,15 +564,15 @@ } ] }, - "unit": "percentunit" + "unit": "percent" }, "overrides": [] }, "gridPos": { - "h": 4, - "w": 4, - "x": 20, - "y": 1 + "h": 6, + "w": 5, + "x": 0, + "y": 22 }, "id": 15, "links": [], @@ -442,7 +597,7 @@ "uid": "prometheus" }, "editorMode": 
"code", - "expr": "vm_memory_heap_usage{pod=\"$pod\"}", + "expr": "(avg(vm_memory_total_committed {pod=\"$pod\"}) - avg(vm_memory_total_used {pod=\"$pod\"})) * 100 / avg(vm_memory_total_committed {pod=\"$pod\"}) ", "format": "time_series", "intervalFactor": 1, "range": true, @@ -457,7 +612,6 @@ "type": "prometheus", "uid": "prometheus" }, - "description": "The 99% percentile of HTTP Requests handled by Jenkins masters.", "fieldConfig": { "defaults": { "color": { @@ -470,7 +624,7 @@ "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", - "fillOpacity": 8, + "fillOpacity": 0, "gradientMode": "none", "hideFrom": { "legend": false, @@ -478,15 +632,12 @@ "viz": false }, "lineInterpolation": "linear", - "lineStyle": { - "fill": "solid" - }, "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, - "showPoints": "never", + "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", @@ -498,40 +649,51 @@ }, "mappings": [], "thresholds": { - "mode": "absolute", + "mode": "percentage", "steps": [ { "color": "green", "value": null }, + { + "color": "orange", + "value": 70 + }, { "color": "red", - "value": 80 + "value": 85 } ] - } + }, + "unit": "short" }, "overrides": [] }, "gridPos": { - "h": 8, - "w": 8, - "x": 0, - "y": 5 + "h": 6, + "w": 19, + "x": 5, + "y": 22 }, - "id": 36, + "id": 45, + "links": [], + "maxDataPoints": 100, "options": { "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", - "showLegend": false + "showLegend": true }, + "timezone": [ + "utc" + ], "tooltip": { "mode": "single", "sort": "none" } }, + "pluginVersion": "10.0.2", "targets": [ { "datasource": { @@ -539,56 +701,64 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(http_requests{pod=~\"$pod\",quantile=\"0.99\"} ) by (pod)", - "instant": false, + "expr": "avg(vm_memory_total_used{pod=\"$pod\"})", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "JVM Total Memory", "range": true, "refId": "A" + }, + { + 
"datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "avg(vm_memory_total_committed{pod=\"$pod\"})", + "hide": false, + "instant": false, + "legendFormat": "JVM Comitted Memory", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "avg(vm_memory_heap_max{pod=\"$pod\"})", + "hide": false, + "instant": false, + "legendFormat": "JVM Heap Memory Limit", + "range": true, + "refId": "C" } ], - "title": "HTTP Request Duration (99%)", + "title": "JVM Memory Usage", "type": "timeseries" }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 28 + }, + "id": 24, + "panels": [], + "title": "CBCI Pod - Plugins Status", + "type": "row" + }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, - "description": "The ratio of ok (200) request out of all requests.", + "description": "", "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 13, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, "mappings": [], "thresholds": { "mode": "absolute", @@ -597,34 +767,46 @@ "color": "green", "value": null }, + { + "color": "orange", + "value": 200 + }, + { + "color": "#EAB839", + "value": 300 + }, { "color": "red", - "value": 80 + "value": 500 } ] - } + }, + "unit": "short" }, "overrides": [] }, "gridPos": { - "h": 8, - "w": 8, - "x": 8, - "y": 5 + "h": 4, + "w": 3, + "x": 0, + "y": 29 }, - "id": 41, + "id": 
28, "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": false + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false }, - "tooltip": { - "mode": "single", - "sort": "none" - } + "textMode": "auto" }, + "pluginVersion": "10.0.2", "targets": [ { "datasource": { @@ -632,56 +814,22 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(http_responseCodes_ok_total{pod=~\"$pod\"}) \nby (pod) / \nsum(http_requests_count{pod=~\"$pod\"}) \nby (pod)", + "expr": "sum(jenkins_plugins_active{pod=\"$pod\"})", "instant": false, "range": true, "refId": "A" } ], - "title": "Good HTTP Request Ratio", - "type": "timeseries" + "title": "Plugins Active", + "type": "stat" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, - "description": "Http Server Errors (500)", "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, "mappings": [], "thresholds": { "mode": "absolute", @@ -692,31 +840,34 @@ }, { "color": "red", - "value": 1 + "value": 80 } ] - } + }, + "unit": "short" }, "overrides": [] }, "gridPos": { - "h": 8, - "w": 8, - "x": 16, - "y": 5 + "h": 4, + "w": 3, + "x": 3, + "y": 29 }, - "id": 37, + "id": 29, "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": 
false + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false }, - "tooltip": { - "mode": "single", - "sort": "none" - } + "textMode": "auto" }, "pluginVersion": "10.0.2", "targets": [ @@ -726,34 +877,20 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "rate(http_responseCodes_serverError_total{pod=~\"$pod\"}[1m])", + "expr": "sum(jenkins_plugins_withUpdate{pod=\"$pod\"})", "instant": false, "range": true, "refId": "A" } - ], - "title": "HTTP Server Errors", - "type": "timeseries" - }, - { - "collapsed": false, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 13 - }, - "id": 24, - "panels": [], - "title": "Plugins", - "type": "row" + ], + "title": "Plugins Update Available", + "type": "stat" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, - "description": "", "fieldConfig": { "defaults": { "mappings": [], @@ -766,15 +903,11 @@ }, { "color": "orange", - "value": 200 - }, - { - "color": "#EAB839", - "value": 300 + "value": 20 }, { "color": "red", - "value": 500 + "value": 30 } ] }, @@ -785,10 +918,10 @@ "gridPos": { "h": 4, "w": 3, - "x": 0, - "y": 14 + "x": 6, + "y": 29 }, - "id": 28, + "id": 34, "options": { "colorMode": "background", "graphMode": "none", @@ -810,23 +943,70 @@ "type": "prometheus", "uid": "prometheus" }, - "editorMode": "builder", - "expr": "jenkins_plugins_active{pod=\"$pod\"}", + "editorMode": "code", + "expr": "sum(jenkins_plugins_inactive{pod=\"$pod\"})", "instant": false, "range": true, "refId": "A" } ], - "title": "Plugins Active", + "title": "Plugins inactive", "type": "stat" }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 33 + }, + "id": 44, + "panels": [], + "title": "CBCI Pod - HTTP Request Status", + "type": "row" + }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, + "description": "The 99% percentile of HTTP Requests handled 
by Jenkins masters.", "fieldConfig": { "defaults": { + "color": { + "mode": "continuous-GrYlRd" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "scheme", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 3, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, "mappings": [], "thresholds": { "mode": "absolute", @@ -840,56 +1020,86 @@ "value": 80 } ] - }, - "unit": "short" + } }, "overrides": [] }, "gridPos": { - "h": 4, - "w": 3, - "x": 3, - "y": 14 + "h": 8, + "w": 8, + "x": 0, + "y": 34 }, - "id": 29, + "id": 36, "options": { - "colorMode": "background", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true }, - "textMode": "auto" + "tooltip": { + "mode": "single", + "sort": "none" + } }, - "pluginVersion": "10.0.2", "targets": [ { "datasource": { "type": "prometheus", "uid": "prometheus" }, - "editorMode": "builder", - "expr": "jenkins_plugins_withUpdate{pod=\"$pod\"}", + "editorMode": "code", + "expr": "avg(http_requests{pod=~\"$pod\",quantile=\"0.99\"})", "instant": false, "range": true, "refId": "A" } ], - "title": "Plugins Update Available", - "type": "stat" + "title": "HTTP Request Duration (99%)", + "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, + "description": "The ratio of ok (200) request out of all requests.", "fieldConfig": { "defaults": { + "color": { + "mode": "continuous-RdYlGr" + }, + "custom": { + "axisCenteredZero": 
false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "scheme", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 3, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, "mappings": [], "thresholds": { "mode": "absolute", @@ -903,33 +1113,29 @@ "value": 80 } ] - }, - "unit": "short" + } }, "overrides": [] }, "gridPos": { - "h": 4, - "w": 3, - "x": 6, - "y": 14 + "h": 8, + "w": 8, + "x": 8, + "y": 34 }, - "id": 33, + "id": 41, "options": { - "colorMode": "background", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true }, - "textMode": "auto" + "tooltip": { + "mode": "single", + "sort": "none" + } }, - "pluginVersion": "10.0.2", "targets": [ { "datasource": { @@ -937,22 +1143,56 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "jenkins_plugins_failed{pod=\"$pod\"}", + "expr": "avg(http_responseCodes_ok_total{pod=~\"$pod\"})/ \navg(http_requests_count{pod=~\"$pod\"}) ", "instant": false, "range": true, "refId": "A" } ], - "title": "Plugins Failed", - "type": "stat" + "title": "Good HTTP Request Ratio", + "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, + "description": "Http Server Errors (500)", "fieldConfig": { "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + 
"hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, "mappings": [], "thresholds": { "mode": "absolute", @@ -961,40 +1201,33 @@ "color": "green", "value": null }, - { - "color": "orange", - "value": 20 - }, { "color": "red", - "value": 30 + "value": 1 } ] - }, - "unit": "short" + } }, "overrides": [] }, "gridPos": { - "h": 4, - "w": 3, - "x": 9, - "y": 14 + "h": 8, + "w": 8, + "x": 16, + "y": 34 }, - "id": 34, + "id": 37, "options": { - "colorMode": "background", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false }, - "textMode": "auto" + "tooltip": { + "mode": "single", + "sort": "none" + } }, "pluginVersion": "10.0.2", "targets": [ @@ -1004,14 +1237,14 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "jenkins_plugins_inactive{pod=\"$pod\"}", + "expr": "avg(rate(http_responseCodes_serverError_total{pod=~\"$pod\"}[1m]))", "instant": false, "range": true, "refId": "A" } ], - "title": "Plugins inactive", - "type": "stat" + "title": "HTTP Server Errors", + "type": "timeseries" }, { "collapsed": false, @@ -1019,11 +1252,11 @@ "h": 1, "w": 24, "x": 0, - "y": 18 + "y": 42 }, "id": 23, "panels": [], - "title": "Builds", + "title": "CBCI Service - Builds", "type": "row" }, { @@ -1057,7 +1290,7 @@ "h": 4, "w": 4, "x": 0, - "y": 19 + "y": 43 }, "id": 25, "options": { @@ -1082,7 +1315,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "jenkins_job_count_value{pod=\"$pod\"}", + "expr": "avg(jenkins_job_count_value{service=~\"$service\"})", "instant": false, "range": 
true, "refId": "A" @@ -1124,7 +1357,7 @@ "h": 4, "w": 4, "x": 4, - "y": 19 + "y": 43 }, "id": 9, "links": [], @@ -1149,7 +1382,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "(sum(jenkins_runs_success_total{pod=\"$pod\"})\nby (pod) * 100) /\nsum(jenkins_runs_total_total{pod=\"$pod\"})\nby (pod)", + "expr": "(avg(jenkins_runs_success_total{service=~\"$service\"}) * 100) /\navg(jenkins_runs_total_total{service=~\"$service\"})", "format": "time_series", "intervalFactor": 1, "range": true, @@ -1193,7 +1426,7 @@ "h": 4, "w": 12, "x": 8, - "y": 19 + "y": 43 }, "id": 31, "options": { @@ -1218,7 +1451,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "(sum(jenkins_runs_failure_total{pod=~\"$pod\"})\nby (pod) * 100) /\nsum(jenkins_runs_total_total{pod=~\"$pod\"})\nby (pod)", + "expr": "(avg(jenkins_runs_failure_total{service=~\"$service\"}) * 100) /\navg(jenkins_runs_total_total{service=~\"$service\"})", "instant": false, "interval": "", "legendFormat": "Failed", @@ -1231,7 +1464,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "(sum(jenkins_runs_aborted_total{pod=~\"$pod\"})\nby (pod) * 100) /\nsum(jenkins_runs_total_total{pod=~\"$pod\"})\nby (pod)", + "expr": "(avg(jenkins_runs_aborted_total{service=~\"$service\"}) * 100) /\navg(jenkins_runs_total_total{service=~\"$service\"})", "hide": false, "instant": false, "legendFormat": "Aborted", @@ -1244,7 +1477,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "(sum(jenkins_runs_unstable_total{pod=~\"$pod\"})\nby (pod) * 100) /\nsum(jenkins_runs_total_total{pod=~\"$pod\"})\nby (pod)", + "expr": "(avg(jenkins_runs_unstable_total{pod=~\"$pod\"}) * 100) /\navg(jenkins_runs_total_total{pod=~\"$pod\"})", "hide": false, "instant": false, "legendFormat": "Unstable", @@ -1301,7 +1534,7 @@ "h": 4, "w": 4, "x": 20, - "y": 19 + "y": 43 }, "id": 11, "links": [], @@ -1327,8 +1560,8 @@ "type": "prometheus", "uid": "prometheus" }, - "editorMode": "builder", - "expr": 
"jenkins_queue_size_value{pod=\"$pod\"}", + "editorMode": "code", + "expr": "avg(jenkins_queue_size_value{service=~\"$service\"})", "format": "time_series", "intervalFactor": 1, "range": true, @@ -1386,7 +1619,7 @@ "h": 7, "w": 4, "x": 0, - "y": 23 + "y": 47 }, "id": 12, "links": [], @@ -1412,8 +1645,8 @@ "type": "prometheus", "uid": "prometheus" }, - "editorMode": "builder", - "expr": "rate(jenkins_job_building_duration_count{pod=\"$pod\"}[1m])", + "editorMode": "code", + "expr": "avg(rate(jenkins_job_building_duration_count{service=~\"$service\"}[1m]))", "range": true, "refId": "A" } @@ -1483,7 +1716,7 @@ "h": 7, "w": 16, "x": 4, - "y": 23 + "y": 47 }, "id": 4, "links": [], @@ -1507,7 +1740,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "jenkins_job_queuing_duration{pod=\"$pod\"}", + "expr": "avg(jenkins_job_queuing_duration{service=~\"$service\"})", "format": "time_series", "intervalFactor": 1, "range": true, @@ -1560,7 +1793,7 @@ "h": 7, "w": 4, "x": 20, - "y": 23 + "y": 47 }, "id": 13, "links": [], @@ -1587,7 +1820,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "rate(jenkins_job_queuing_duration_count{pod=\"$pod\"}[1m])", + "expr": "avg(rate(jenkins_job_queuing_duration_count{service=~\"$service\"}[1m]))", "range": true, "refId": "A" } @@ -1596,7 +1829,7 @@ "type": "stat" } ], - "refresh": "", + "refresh": "10s", "schemaVersion": 38, "style": "dark", "tags": [ @@ -1619,7 +1852,7 @@ "definition": "query_result(up{container=\"jenkins\"})", "hide": 0, "includeAll": false, - "label": "Select Jenkins Pod", + "label": "Pod", "multi": false, "name": "pod", "options": [], @@ -1632,11 +1865,33 @@ "skipUrlSync": false, "sort": 0, "type": "query" + }, + { + "current": { + "selected": true, + "text": "team-b", + "value": "team-b" + }, + "definition": "query_result(up{container=\"jenkins\"})", + "hide": 0, + "includeAll": false, + "multi": false, + "name": "service", + "options": [], + "query": { + "query": 
"query_result(up{container=\"jenkins\"})", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": ".*service=\"(.*?)\".*", + "skipUrlSync": false, + "sort": 0, + "type": "query" } ] }, "time": { - "from": "now-30m", + "from": "now-6h", "to": "now" }, "timepicker": { @@ -1665,8 +1920,8 @@ ] }, "timezone": "browser", - "title": "CloudBees CI v2", + "title": "CloudBees - Prometheus Plugin", "uid": "f22bb554-2024-45e2-9795-af2ca85aea82", - "version": 3, + "version": 1, "weekStart": "" } diff --git a/blueprints/02-at-scale/k8s/secrets-values.yml b/blueprints/02-at-scale/k8s/secrets-values.yml index 71ebebd4..335e26c6 100644 --- a/blueprints/02-at-scale/k8s/secrets-values.yml +++ b/blueprints/02-at-scale/k8s/secrets-values.yml @@ -6,3 +6,4 @@ sec_globalPassword: ${global_password} sec_adminMail : ${adminMail} sec_s3bucketName: ${s3bucketName} sec_awsRegion: ${awsRegion} +sec_grafana_url: ${grafana_url} diff --git a/blueprints/02-at-scale/k8s/velero-values.yml b/blueprints/02-at-scale/k8s/velero-values.yml index 43cfac79..e1bd8c9d 100644 --- a/blueprints/02-at-scale/k8s/velero-values.yml +++ b/blueprints/02-at-scale/k8s/velero-values.yml @@ -1,7 +1,7 @@ # Copyright (c) CloudBees, Inc. 
-#https://artifacthub.io/packages/helm/vmware-tanzu/velero -#https://github.com/vmware-tanzu/helm-charts/blob/main/charts/velero/values.yaml +# https://artifacthub.io/packages/helm/vmware-tanzu/velero +# https://github.com/vmware-tanzu/helm-charts/blob/main/charts/velero/values.yaml nodeSelector: kubernetes.io/os: linux diff --git a/blueprints/02-at-scale/main.k8s.tf b/blueprints/02-at-scale/main.k8s.tf new file mode 100644 index 00000000..142564f7 --- /dev/null +++ b/blueprints/02-at-scale/main.k8s.tf @@ -0,0 +1,506 @@ + +locals { + + kubeconfig_file = "kubeconfig_${local.name}.yaml" + kubeconfig_file_path = abspath("k8s/${local.kubeconfig_file}") + + global_password = random_string.global_pass_string.result + global_pass_jsonpath = "'{.data.sec_globalPassword}'" + + bottlerocket_bootstrap_extra_args = <<-EOT + [settings.host-containers.admin] + enabled = false + [settings.host-containers.control] + enabled = true + [settings.kernel] + lockdown = "integrity" + [settings.kubernetes.node-labels] + "bottlerocket.aws/updater-interface-version" = "2.0.0" + EOT + + # Velero Backups: Only for controllers using block storage (for example, Amazon EBS volumes in AWS) + velero_controller_backup = "team-b" + velero_controller_backup_selector = "tenant=${local.velero_controller_backup}" + velero_schedule_name = "schedule-${local.velero_controller_backup}" + + hibernation_monitor_url = "https://hibernation-${module.eks_blueprints_addon_cbci.cbci_namespace}.${module.eks_blueprints_addon_cbci.cbci_domain_name}" + cbci_admin_user = "admin_cbci_a" + cbci_agents_ns = "cbci-agents" + # K8S agent template name from the CasC bundle + cbci_agent_linuxtempl = "linux-mavenAndKaniko-" + cbci_agent_windowstempl = "windows-powershell" + + vault_ns = "vault" + vault_config_file_path = abspath("k8s/vault-config.sh") + vault_init_file_path = abspath("k8s/vault-init.log") + + observability_ns = "observability" + grafana_hostname = "grafana.${var.hosted_zone}" + grafana_url = 
"https://${local.grafana_hostname}" + +} + +resource "random_string" "global_pass_string" { + length = 16 + special = false + upper = true + lower = true +} + +resource "time_static" "epoch" { + depends_on = [module.eks_blueprints_addons] +} + +################################################################################ +# Workloads +################################################################################ + +# CloudBees CI Add-on + +module "eks_blueprints_addon_cbci" { + # source = "cloudbees/cloudbees-ci-eks-addon/aws" + # version = ">= 3.18072.0" + source = "../../" + depends_on = [module.eks_blueprints_addons] + + hosted_zone = var.hosted_zone + cert_arn = module.acm.acm_certificate_arn + trial_license = var.trial_license + + helm_config = { + values = [templatefile("k8s/cbci-values.yml", { + cbciAppsNodeRole = local.mng["cbci_apps"]["labels"].role + cbciAppsTolerationKey = local.mng["cbci_apps"]["taints"].key + cbciAppsTolerationValue = local.mng["cbci_apps"]["taints"].value + cbciAgentsNamespace = local.cbci_agents_ns + })] + } + + create_casc_secrets = true + casc_secrets_file = templatefile("k8s/secrets-values.yml", { + global_password = local.global_password + s3bucketName = local.bucket_name + awsRegion = var.aws_region + adminMail = var.trial_license["email"] + grafana_url = local.grafana_url + }) + + create_reg_secret = true + reg_secret_ns = local.cbci_agents_ns + # Note: This blueprint tests DockerHub as container registry but different registries can be used. 
+ reg_secret_auth = { + server = "https://index.docker.io/v1/" + username = var.dh_reg_secret_auth["username"] + password = var.dh_reg_secret_auth["password"] + email = var.dh_reg_secret_auth["email"] + } + + prometheus_target = true + prometheus_target_ns = local.observability_ns + +} + +# EKS Blueprints Add-ons + +module "ebs_csi_driver_irsa" { + source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks" + version = "5.29.0" + + role_name_prefix = "${module.eks.cluster_name}-ebs-csi-driv" + + attach_ebs_csi_policy = true + + oidc_providers = { + main = { + provider_arn = module.eks.oidc_provider_arn + namespace_service_accounts = ["kube-system:ebs-csi-controller-sa"] + } + } + + tags = var.tags +} + +module "eks_blueprints_addons" { + source = "aws-ia/eks-blueprints-addons/aws" + #vEKSBpAddonsTFMod# + version = "1.17.0" + depends_on = [module.eks] + + cluster_name = module.eks.cluster_name + cluster_endpoint = module.eks.cluster_endpoint + oidc_provider_arn = module.eks.oidc_provider_arn + cluster_version = module.eks.cluster_version + + eks_addons = { + aws-ebs-csi-driver = { + service_account_role_arn = module.ebs_csi_driver_irsa.iam_role_arn + configuration_values = jsonencode( + { + # ensure any PVC created also includes the custom tags + controller = { + extraVolumeTags = local.tags + } + # Deploy on the nodes that need Amazon EBS storage + node = { + nodeSelector = { + storage = "enabled" + } + } + } + ) + } + coredns = { + timeouts = { + create = "25m" + delete = "10m" + } + } + vpc-cni = { + configuration_values = jsonencode( + { + enableWindowsIpam = "true" + } + ) + } + kube-proxy = {} + eks-pod-identity-agent = {} + } + ##################### + #01-getting-started + ##################### + enable_external_dns = true + external_dns = { + values = [templatefile("k8s/extdns-values.yml", { + zoneDNS = var.hosted_zone + })] + } + external_dns_route53_zone_arns = [local.route53_zone_arn] + enable_aws_load_balancer_controller = true + 
aws_load_balancer_controller = { + values = [file("k8s/aws-alb-controller-values.yml")] + } + ##################### + #02-at-scale + ##################### + enable_aws_efs_csi_driver = true + aws_efs_csi_driver = { + values = [file("k8s/aws-efs-csi-driver-values.yml")] + } + enable_metrics_server = true + metrics_server = { + values = [file("k8s/metrics-server-values.yml")] + } + enable_cluster_autoscaler = true + cluster_autoscaler = { + values = [file("k8s/cluster-autoscaler-values.yml")] + } + enable_velero = true + velero = { + values = [file("k8s/velero-values.yml")] + s3_backup_location = local.velero_s3_location + set = [{ + name = "initContainers" + value = <<-EOT + - name: velero-plugin-for-aws + image: velero/velero-plugin-for-aws:v1.7.1 + imagePullPolicy: IfNotPresent + volumeMounts: + - mountPath: /target + name: plugins + #https://docs.cloudbees.com/docs/cloudbees-ci/latest/pipelines/restart-aborted-builds#_restarting_builds_after_a_restore + - name: inject-metadata-velero-plugin + image: ghcr.io/cloudbees-oss/inject-metadata-velero-plugin:main + imagePullPolicy: Always + volumeMounts: + - mountPath: /target + name: plugins + EOT + }] + } + # Cert Manager - Requirement for Bottlerocket Update Operator + enable_cert_manager = true + cert_manager = { + wait = true + } + # Important: Update timing can be customized + # Bottlerocket Update Operator + enable_bottlerocket_update_operator = true + bottlerocket_update_operator = { + values = [file("k8s/br-update-operator-values.yml")] + } + enable_kube_prometheus_stack = true + kube_prometheus_stack = { + namespace = local.observability_ns + chart_version = "62.3.0" + create_namespace = true + values = [templatefile("k8s/kube-prom-stack-values.yml", { + grafana_password = local.global_password + grafana_hostname = local.grafana_hostname + cert_arn = module.acm.acm_certificate_arn + })] + } + enable_aws_for_fluentbit = true + aws_for_fluentbit_cw_log_group = { + create = true + use_name_prefix = true # Set this 
to true to enable name prefix + name_prefix = "eks-cluster-logs-" + retention = local.cloudwatch_logs_expiration_days + } + aws_for_fluentbit = { + # Enable Container Insights just for troubleshooting + # https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/ContainerInsights.html + enable_containerinsights = false + namespace = local.observability_ns + create_namespace = true + values = [templatefile("k8s/aws-for-fluent-bit-values.yml", { + region = var.aws_region + bucketName = module.cbci_s3_bucket.s3_bucket_id + log_retention_days = local.cloudwatch_logs_expiration_days + cbciAppsTolerationKey = local.mng["cbci_apps"]["taints"].key + cbciAppsTolerationValue = local.mng["cbci_apps"]["taints"].value + })] + kubelet_monitoring = true + chart_version = "0.1.28" + s3_bucket_arns = [ + module.cbci_s3_bucket.s3_bucket_arn, + "${local.fluentbit_s3_location}/*" + ] + # Note: This values are duplicated in k8s/aws-for-fluent-bit-values.yml but they are required here to not be overwrite by default values. 
+ set = [{ + name = "cloudWatchLogs.autoCreateGroup" + value = true + }, + { + name = "hostNetwork" + value = true + }, + { + name = "dnsPolicy" + value = "ClusterFirstWithHostNet" + } + ] + } + helm_releases = { + openldap-stack = { + chart = "openldap-stack-ha" + chart_version = "4.2.2" + namespace = "auth" + create_namespace = true + repository = "https://jp-gouin.github.io/helm-openldap/" + values = [templatefile("k8s/openldap-stack-values.yml", { + password = local.global_password + admin_user_outputs = local.cbci_admin_user + })] + } + aws-node-termination-handler = { + name = "aws-node-termination-handler" + namespace = "kube-system" + create_namespace = false + chart = "aws-node-termination-handler" + chart_version = "0.21.0" + repository = "https://aws.github.io/eks-charts" + values = [file("k8s/aws-node-term-handler-values.yml")] + } + # Based on hashicorp/hashicorp-vault-eks-addon/aws + vault = { + name = "vault" + namespace = local.vault_ns + create_namespace = true + chart = "vault" + chart_version = "0.28.0" + repository = "https://helm.releases.hashicorp.com" + values = [file("k8s/vault-values.yml")] + } + otel-collector = { + name = "otel-collector" + namespace = local.observability_ns + create_namespace = true + chart = "opentelemetry-collector" + chart_version = "0.105.1" + repository = "https://open-telemetry.github.io/opentelemetry-helm-charts" + values = [file("k8s/otel-collector-values.yml")] + } + tempo = { + name = "tempo" + namespace = local.observability_ns + create_namespace = true + chart = "tempo" + chart_version = "1.7.2" + repository = "https://grafana.github.io/helm-charts" + values = [file("k8s/grafana-tempo-values.yml")] + } + loki = { + name = "loki" + namespace = local.observability_ns + create_namespace = true + chart = "loki" + chart_version = "6.12.0" + repository = "https://grafana.github.io/helm-charts" + values = [file("k8s/grafana-loki-values.yml")] + } + } + tags = local.tags +} + 
+################################################################################ +# Storage Classes +################################################################################ + +resource "kubernetes_annotations" "gp2" { + api_version = "storage.k8s.io/v1" + kind = "StorageClass" + # This is true because the resources was already created by the ebs-csi-driver addon + force = "true" + depends_on = [module.eks] + + metadata { + name = "gp2" + } + + annotations = { + "storageclass.kubernetes.io/is-default-class" = "false" + } +} + +resource "kubernetes_storage_class_v1" "gp3_aza" { + metadata { + name = "gp3" + + annotations = { + "storageclass.kubernetes.io/is-default-class" = "true" + } + } + depends_on = [module.eks] + + storage_provisioner = "ebs.csi.aws.com" + allow_volume_expansion = true + reclaim_policy = "Delete" + volume_binding_mode = "WaitForFirstConsumer" + # # Issue #195 + # allowed_topologies { + # match_label_expressions { + # key = "topology.ebs.csi.aws.com/zone" + # values = ["${var.aws_region}a"] + # } + # } + + parameters = { + encrypted = "true" + fsType = "ext4" + type = "gp3" + } + +} + +resource "kubernetes_storage_class_v1" "efs" { + + metadata { + name = "efs" + } + depends_on = [module.eks] + + storage_provisioner = "efs.csi.aws.com" + reclaim_policy = "Delete" + parameters = { + # Dynamic provisioning + provisioningMode = "efs-ap" + fileSystemId = module.efs.id + directoryPerms = "700" + # Issue #190 + uid = "1000" + gid = "1000" + } + + mount_options = [ + "iam" + ] +} + +################################################################################ +# Pod Identity +################################################################################ + +data "aws_iam_policy_document" "assume_role" { + statement { + effect = "Allow" + + principals { + type = "Service" + identifiers = ["pods.eks.amazonaws.com"] + } + + actions = [ + "sts:AssumeRole", + "sts:TagSession" + ] + } +} + +resource "aws_iam_role" "s3" { + name = 
local.cbci_iam_role_s3 + assume_role_policy = data.aws_iam_policy_document.assume_role.json +} + +resource "aws_iam_role_policy" "s3_policy" { + name = "${local.name}-iam_inline_policy" + role = aws_iam_role.s3.id + policy = jsonencode( + { + "Version" : "2012-10-17", + #https://docs.cloudbees.com/docs/cloudbees-ci/latest/pipelines/cloudbees-cache-step#_s3_configuration + "Statement" : [ + { + "Sid" : "cbciS3BucketputGetDelete", + "Effect" : "Allow", + "Action" : [ + "s3:PutObject", + "s3:GetObject", + "s3:DeleteObject" + ], + "Resource" : "${local.cbci_s3_location}/*" + }, + { + "Sid" : "cbciS3BucketList", + "Effect" : "Allow", + "Action" : "s3:ListBucket", + "Resource" : module.cbci_s3_bucket.s3_bucket_arn, + "Condition" : { + "StringLike" : { + "s3:prefix" : "${local.cbci_s3_prefix}/*" + } + } + } + ] + } + ) +} + +resource "aws_eks_pod_identity_association" "oc_s3" { + depends_on = [module.eks_blueprints_addon_cbci] + cluster_name = module.eks.cluster_name + namespace = module.eks_blueprints_addon_cbci.cbci_namespace + service_account = "cjoc" + role_arn = aws_iam_role.s3.arn +} + +resource "aws_eks_pod_identity_association" "controllers_s3" { + depends_on = [module.eks_blueprints_addon_cbci] + cluster_name = module.eks.cluster_name + namespace = module.eks_blueprints_addon_cbci.cbci_namespace + service_account = "jenkins" + role_arn = aws_iam_role.s3.arn +} + +################################################################################ +# Kubeconfig +################################################################################ + +resource "terraform_data" "create_kubeconfig" { + depends_on = [module.eks] + + triggers_replace = var.ci ? 
[timestamp()] : [] + + provisioner "local-exec" { + command = "aws eks update-kubeconfig --name ${module.eks.cluster_name} --region ${var.aws_region} --kubeconfig ${local.kubeconfig_file_path}" + } +} diff --git a/blueprints/02-at-scale/main.tf b/blueprints/02-at-scale/main.tf index 2d478e2c..963c9eb0 100644 --- a/blueprints/02-at-scale/main.tf +++ b/blueprints/02-at-scale/main.tf @@ -6,24 +6,16 @@ data "aws_availability_zones" "available" {} locals { - ############ - # Infra - ############ - name = var.suffix == "" ? "cbci-bp02" : "cbci-bp02-${var.suffix}" vpc_name = "${local.name}-vpc" cluster_name = "${local.name}-eks" efs_name = "${local.name}-efs" resource_group_name = "${local.name}-rg" bucket_name = "${local.name}-s3" - cbci_instance_profile_s3 = "${local.name}-instance_profile_s3" - cbci_iam_role_s3 = "${local.name}-iam_role_s3" - cbci_inline_policy_s3 = "${local.name}-iam_inline_policy_s3" cbci_instance_profile_ecr = "${local.name}-instance_profile_ecr" cbci_iam_role_ecr = "${local.name}-iam_role_ecr" cbci_inline_policy_ecr = "${local.name}-iam_inline_policy_ecr" - kubeconfig_file = "kubeconfig_${local.name}.yaml" - kubeconfig_file_path = abspath("k8s/${local.kubeconfig_file}") + cbci_iam_role_s3 = "${local.name}-iam_role_s3" vpc_cidr = "10.0.0.0/16" azs = slice(data.aws_availability_zones.available.names, 0, 3) @@ -57,320 +49,12 @@ locals { "tf-repository" = "github.com/cloudbees/terraform-aws-cloudbees-ci-eks-addon" }) - ############ - # K8s Apps - ############ - - global_password = random_string.global_pass_string.result - global_pass_jsonpath = "'{.data.sec_globalPassword}'" - - bottlerocket_bootstrap_extra_args = <<-EOT - [settings.host-containers.admin] - enabled = false - [settings.host-containers.control] - enabled = true - [settings.kernel] - lockdown = "integrity" - [settings.kubernetes.node-labels] - "bottlerocket.aws/updater-interface-version" = "2.0.0" - EOT - - #Velero Backups: Only for controllers using block storage (for example, Amazon EBS 
volumes in AWS) - velero_controller_backup = "team-b" - velero_controller_backup_selector = "tenant=${local.velero_controller_backup}" - velero_schedule_name = "schedule-${local.velero_controller_backup}" - - hibernation_monitor_url = "https://hibernation-${module.eks_blueprints_addon_cbci.cbci_namespace}.${module.eks_blueprints_addon_cbci.cbci_domain_name}" - cbci_admin_user = "admin_cbci_a" - cbci_agents_ns = "cbci-agents" - #K8S agent template name from the CasC bundle - cbci_agent_linuxtempl = "linux-mavenAndKaniko-" - cbci_agent_windowstempl = "windows-powershell" - - vault_ns = "vault" - vault_config_file_path = abspath("k8s/vault-config.sh") - vault_init_file_path = abspath("k8s/vault-init.log") -} - -resource "random_string" "global_pass_string" { - length = 16 - special = false - upper = true - lower = true -} - -resource "time_static" "epoch" { - depends_on = [module.eks_blueprints_addons] } ################################################################################ -# EKS: Add-ons +# EKS Cluster ################################################################################ -# CloudBees CI Add-on - -module "eks_blueprints_addon_cbci" { - source = "cloudbees/cloudbees-ci-eks-addon/aws" - version = ">= 3.18306.0" - - depends_on = [module.eks_blueprints_addons] - - hosted_zone = var.hosted_zone - cert_arn = module.acm.acm_certificate_arn - trial_license = var.trial_license - - helm_config = { - values = [templatefile("k8s/cbci-values.yml", { - cbciAppsNodeRole = local.mng["cbci_apps"]["labels"].role - cbciAppsTolerationKey = local.mng["cbci_apps"]["taints"].key - cbciAppsTolerationValue = local.mng["cbci_apps"]["taints"].value - cbciAgentsNamespace = local.cbci_agents_ns - })] - } - - create_casc_secrets = true - casc_secrets_file = templatefile("k8s/secrets-values.yml", { - global_password = local.global_password - s3bucketName = local.bucket_name - awsRegion = var.aws_region - adminMail = var.trial_license["email"] - }) - - create_reg_secret = true 
- reg_secret_ns = local.cbci_agents_ns - #Note: This blueprint tests DockerHub as container registry but different registries can be used. - reg_secret_auth = { - server = "https://index.docker.io/v1/" - username = var.dh_reg_secret_auth["username"] - password = var.dh_reg_secret_auth["password"] - email = var.dh_reg_secret_auth["email"] - } - - prometheus_target = true - -} - -# EKS Blueprints Add-ons - -module "ebs_csi_driver_irsa" { - source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks" - version = "5.29.0" - - role_name_prefix = "${module.eks.cluster_name}-ebs-csi-driv" - - attach_ebs_csi_policy = true - - oidc_providers = { - main = { - provider_arn = module.eks.oidc_provider_arn - namespace_service_accounts = ["kube-system:ebs-csi-controller-sa"] - } - } - - tags = var.tags -} - -# It must be separate to correctly purge the kube_prometheus_stack -resource "kubernetes_namespace" "kube_prometheus_stack" { - - depends_on = [module.eks] - metadata { - name = "kube-prometheus-stack" - } - -} - -module "eks_blueprints_addons" { - source = "aws-ia/eks-blueprints-addons/aws" - #vEKSBpAddonsTFMod# - version = "1.15.1" - - cluster_name = module.eks.cluster_name - cluster_endpoint = module.eks.cluster_endpoint - oidc_provider_arn = module.eks.oidc_provider_arn - cluster_version = module.eks.cluster_version - - eks_addons = { - aws-ebs-csi-driver = { - service_account_role_arn = module.ebs_csi_driver_irsa.iam_role_arn - configuration_values = jsonencode( - { - # ensure any PVC created also includes the custom tags - controller = { - extraVolumeTags = local.tags - } - # Deploy on the nodes that need Amazon EBS storage - node = { - nodeSelector = { - storage = "enabled" - } - } - } - ) - } - coredns = { - timeouts = { - create = "25m" - delete = "10m" - } - } - vpc-cni = { - configuration_values = jsonencode( - { - enableWindowsIpam = "true" - } - ) - } - kube-proxy = {} - } - ##################### - #01-getting-started - 
##################### - enable_external_dns = true - external_dns = { - values = [templatefile("k8s/extdns-values.yml", { - zoneDNS = var.hosted_zone - })] - } - external_dns_route53_zone_arns = [local.route53_zone_arn] - enable_aws_load_balancer_controller = true - aws_load_balancer_controller = { - values = [file("k8s/aws-alb-controller-values.yml")] - } - ##################### - #02-at-scale - ##################### - enable_aws_efs_csi_driver = true - aws_efs_csi_driver = { - values = [file("k8s/aws-efs-csi-driver-values.yml")] - } - enable_metrics_server = true - metrics_server = { - values = [file("k8s/metrics-server-values.yml")] - } - enable_cluster_autoscaler = true - cluster_autoscaler = { - values = [file("k8s/cluster-autoscaler-values.yml")] - } - enable_velero = true - velero = { - values = [file("k8s/velero-values.yml")] - s3_backup_location = local.velero_s3_location - set = [{ - name = "initContainers" - value = <<-EOT - - name: velero-plugin-for-aws - image: velero/velero-plugin-for-aws:v1.7.1 - imagePullPolicy: IfNotPresent - volumeMounts: - - mountPath: /target - name: plugins - #https://docs.cloudbees.com/docs/cloudbees-ci/latest/pipelines/restart-aborted-builds#_restarting_builds_after_a_restore - - name: inject-metadata-velero-plugin - image: ghcr.io/cloudbees-oss/inject-metadata-velero-plugin:main - imagePullPolicy: Always - volumeMounts: - - mountPath: /target - name: plugins - EOT - }] - } - enable_kube_prometheus_stack = true - kube_prometheus_stack = { - namespace = kubernetes_namespace.kube_prometheus_stack.metadata[0].name - create_namespace = false - values = [templatefile("k8s/kube-prom-stack-values.yml", { - grafana_password = local.global_password - })] - } - enable_aws_for_fluentbit = true - aws_for_fluentbit_cw_log_group = { - create = true - use_name_prefix = true # Set this to true to enable name prefix - name_prefix = "eks-cluster-logs-" - retention = local.cloudwatch_logs_expiration_days - } - aws_for_fluentbit = { - #Enable 
Container Insights just for troubleshooting - #https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/ContainerInsights.html - enable_containerinsights = false - values = [templatefile("k8s/aws-for-fluent-bit-values.yml", { - region = var.aws_region - bucketName = module.cbci_s3_bucket.s3_bucket_id - log_retention_days = local.cloudwatch_logs_expiration_days - })] - kubelet_monitoring = true - chart_version = "0.1.28" - s3_bucket_arns = [ - module.cbci_s3_bucket.s3_bucket_arn, - "${local.fluentbit_s3_location}/*" - ] - } - #Cert Manager - Requirement for Bottlerocket Update Operator - enable_cert_manager = true - cert_manager = { - wait = true - } - #Important: Update timing can be customized - #Bottlerocket Update Operator - enable_bottlerocket_update_operator = true - bottlerocket_update_operator = { - values = [file("k8s/br-update-operator-values.yml")] - } - ##################### - #Additional Helm Releases - ##################### - helm_releases = { - openldap-stack = { - chart = "openldap-stack-ha" - chart_version = "4.2.2" - namespace = "auth" - create_namespace = true - repository = "https://jp-gouin.github.io/helm-openldap/" - values = [templatefile("k8s/openldap-stack-values.yml", { - password = local.global_password - admin_user_outputs = local.cbci_admin_user - })] - } - aws-node-termination-handler = { - name = "aws-node-termination-handler" - namespace = "kube-system" - create_namespace = false - chart = "aws-node-termination-handler" - chart_version = "0.21.0" - repository = "https://aws.github.io/eks-charts" - values = [file("k8s/aws-node-term-handler-values.yml")] - } - grafana-tempo = { - name = "tempo" - namespace = kubernetes_namespace.kube_prometheus_stack.metadata[0].name - create_namespace = false - chart = "tempo" - chart_version = "1.7.2" - repository = "https://grafana.github.io/helm-charts" - values = [file("k8s/grafana-tempo.yml")] - } - #Based on hashicorp/hashicorp-vault-eks-addon/aws - vault = { - name = "vault" - namespace = 
local.vault_ns - create_namespace = true - chart = "vault" - chart_version = "0.28.0" - repository = "https://helm.releases.hashicorp.com" - values = [file("k8s/vault-values.yml")] - } - } - - tags = local.tags -} - -################################################################################ -# EKS: Infra -################################################################################ - -# EKS Cluster - module "eks" { source = "terraform-aws-modules/eks/aws" version = "19.17.1" @@ -378,7 +62,7 @@ module "eks" { cluster_name = local.cluster_name cluster_endpoint_public_access = true #vK8# - cluster_version = "1.28" + cluster_version = "1.29" vpc_id = module.vpc.vpc_id subnet_ids = module.vpc.private_subnets @@ -429,54 +113,58 @@ module "eks" { } } - #https://docs.aws.amazon.com/eks/latest/userguide/choosing-instance-type.html - #https://docs.aws.amazon.com/eks/latest/APIReference/API_Nodegroup.html + # https://docs.aws.amazon.com/eks/latest/userguide/choosing-instance-type.html + # https://docs.aws.amazon.com/eks/latest/APIReference/API_Nodegroup.html eks_managed_node_group_defaults = { capacity_type = "ON_DEMAND" disk_size = 50 } eks_managed_node_groups = { - #Note: Openldap is not compatible with Bottlerocket or Graviton. + # Note: Openldap is not compatible with Bottlerocket or Graviton. 
shared_apps = { - node_group_name = "mg-shared" + node_group_name = "shared" instance_types = ["m5d.xlarge"] ami_type = "AL2023_x86_64_STANDARD" platform = "linux" min_size = 1 max_size = 3 - desired_size = 1 + desired_size = 2 labels = { role = "shared" storage = "enabled" } } cb_apps = { - node_group_name = "mg-cb-apps" + node_group_name = "cb-apps" instance_types = ["m7g.2xlarge"] #Graviton min_size = 1 max_size = 6 - desired_size = 1 + desired_size = 2 taints = [local.mng["cbci_apps"]["taints"]] labels = { role = local.mng["cbci_apps"]["labels"].role storage = "enabled" } - create_iam_role = false - iam_role_arn = aws_iam_role.managed_ng_s3.arn ami_type = "BOTTLEROCKET_ARM_64" platform = "bottlerocket" enable_bootstrap_user_data = true bootstrap_extra_args = local.bottlerocket_bootstrap_extra_args + disk_size = 100 } - cb_agents_2x = { - node_group_name = "mg-agent-2x" - instance_types = ["m7g.large"] #Graviton - min_size = 1 - max_size = 3 - desired_size = 1 - taints = [{ key = "dedicated", value = "build-linux", effect = "NO_SCHEDULE" }] + # https://aws.amazon.com/blogs/compute/cost-optimization-and-resilience-eks-with-spot-instances/ + # https://www.eksworkshop.com/docs/fundamentals/managed-node-groups/spot/instance-diversification + cb_agents_lin_2x = { + node_group_name = "agent-lin-2x" + # ec2-instance-selector --vcpus 2 --memory 8 --region us-east-1 --deny-list 't.*' --current-generation -a arm64 --gpus 0 --usage-class spot + instance_types = ["im4gn.large", "m6g.large", "m6gd.large", "m7g.large", "m7gd.large"] #Graviton + capacity_type = "SPOT" + min_size = 1 + max_size = 3 + desired_size = 1 + taints = [{ key = "dedicated", value = "build-linux-l", effect = "NO_SCHEDULE" }] labels = { - role = "build-linux" + role = "build-linux-l" + size = "2x" } create_iam_role = false iam_role_arn = aws_iam_role.managed_ng_ecr.arn @@ -485,19 +173,18 @@ module "eks" { enable_bootstrap_user_data = true bootstrap_extra_args = local.bottlerocket_bootstrap_extra_args } - 
#https://aws.amazon.com/blogs/compute/cost-optimization-and-resilience-eks-with-spot-instances/ - #https://www.eksworkshop.com/docs/fundamentals/managed-node-groups/spot/instance-diversification - cb_agents_spot_4x = { - node_group_name = "mng-agent-spot-4x" - #ec2-instance-selector --vcpus 4 --memory 16 --region us-east-1 --deny-list 't.*' --current-generation -a arm64 --gpus 0 --usage-class spot + cb_agents_lin_4x = { + node_group_name = "agent-lin-4x" + # ec2-instance-selector --vcpus 4 --memory 16 --region us-east-1 --deny-list 't.*' --current-generation -a arm64 --gpus 0 --usage-class spot instance_types = ["im4gn.xlarge", "m6g.xlarge", "m6gd.xlarge", "m7g.xlarge", "m7gd.xlarge"] #Graviton capacity_type = "SPOT" min_size = 0 max_size = 3 desired_size = 0 - taints = [{ key = "dedicated", value = "build-linux-spot", effect = "NO_SCHEDULE" }] + taints = [{ key = "dedicated", value = "build-linux-xl", effect = "NO_SCHEDULE" }] labels = { - role = "build-linux-spot" + role = "build-linux-xl" + size = "4x" } create_iam_role = false iam_role_arn = aws_iam_role.managed_ng_ecr.arn @@ -506,17 +193,18 @@ module "eks" { enable_bootstrap_user_data = true bootstrap_extra_args = local.bottlerocket_bootstrap_extra_args } - cb_agents_spot_8x = { - node_group_name = "mng-agent-spot-8x" - #ec2-instance-selector --vcpus 8 --memory 32 --region us-east-1 --deny-list 't.*' --current-generation -a arm64 --gpus 0 --usage-class spot + cb_agents_lin_8x = { + node_group_name = "agent-lin-8x" + # ec2-instance-selector --vcpus 8 --memory 32 --region us-east-1 --deny-list 't.*' --current-generation -a arm64 --gpus 0 --usage-class spot instance_types = ["im4gn.2xlarge", "m6g.2xlarge", "m6gd.2xlarge", "m7g.2xlarge", "m7gd.2xlarge"] #Graviton capacity_type = "SPOT" min_size = 0 max_size = 3 desired_size = 0 - taints = [{ key = "dedicated", value = "build-linux-spot", effect = "NO_SCHEDULE" }] + taints = [{ key = "dedicated", value = "build-linux-xl", effect = "NO_SCHEDULE" }] labels = { - role 
= "build-linux-spot" + role = "build-linux-xl" + size = "8x" } create_iam_role = false iam_role_arn = aws_iam_role.managed_ng_ecr.arn @@ -525,23 +213,26 @@ module "eks" { enable_bootstrap_user_data = true bootstrap_extra_args = local.bottlerocket_bootstrap_extra_args } - mg_windows = { + cb_agents_win = { + node_group_name = "agent-win-4x" min_size = 1 max_size = 3 desired_size = 1 platform = "windows" ami_type = "WINDOWS_CORE_2019_x86_64" use_name_prefix = true - instance_types = ["m5d.xlarge", "m5ad.xlarge"] - taints = [{ key = "dedicated", value = "build-windows", effect = "NO_SCHEDULE" }] + # ec2-instance-selector --vcpus 4 --memory 16 --region us-east-1 --deny-list 't.*' --current-generation -a amd64 --gpus 0 --usage-class spot + instance_types = ["m5.xlarge", "m5a.xlarge", "m5d.xlarge", "m5dn.xlarge", "m5n.xlarge", "m5zn.xlarge", "m6a.xlarge", "m6i.xlarge", "m6id.xlarge", "m6idn.xlarge", "m6in.xlarge", "m7a.xlarge", "m7i.xlarge"] + capacity_type = "SPOT" + taints = [{ key = "dedicated", value = "build-windows", effect = "NO_SCHEDULE" }] labels = { role = "build-windows" } } } - #https://docs.aws.amazon.com/eks/latest/userguide/control-plane-logs.html - #https://aws.amazon.com/blogs/containers/understanding-and-cost-optimizing-amazon-eks-control-plane-logs/ + # https://docs.aws.amazon.com/eks/latest/userguide/control-plane-logs.html + # https://aws.amazon.com/blogs/containers/understanding-and-cost-optimizing-amazon-eks-control-plane-logs/ create_cloudwatch_log_group = true cluster_enabled_log_types = ["audit", "api", "authenticator", "controllerManager", "scheduler"] cloudwatch_log_group_retention_in_days = local.cloudwatch_logs_expiration_days @@ -550,6 +241,7 @@ module "eks" { } # AWS Instance Permissions + data "aws_iam_policy_document" "managed_ng_assume_role_policy" { statement { sid = "EKSWorkerAssumeRole" @@ -564,67 +256,6 @@ data "aws_iam_policy_document" "managed_ng_assume_role_policy" { } } -resource "aws_iam_role" "managed_ng_s3" { - name = 
local.cbci_iam_role_s3 - description = "EKS Managed Node group IAM Role s3" - assume_role_policy = data.aws_iam_policy_document.managed_ng_assume_role_policy.json - path = "/" - force_detach_policies = true - # Mandatory for EKS Managed Node Group - managed_policy_arns = [ - "arn:aws:iam::aws:policy/AmazonEKSWorkerNodePolicy", - "arn:aws:iam::aws:policy/AmazonEKS_CNI_Policy", - "arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly", - "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore" - ] - # Additional Permissions for for EKS Managed Node Group per https://docs.aws.amazon.com/eks/latest/userguide/create-node-role.html - inline_policy { - name = local.cbci_inline_policy_s3 - policy = jsonencode( - { - "Version" : "2012-10-17", - #https://docs.cloudbees.com/docs/cloudbees-ci/latest/pipelines/cloudbees-cache-step#_s3_configuration - "Statement" : [ - { - "Sid" : "cbciS3BucketputGetDelete", - "Effect" : "Allow", - "Action" : [ - "s3:PutObject", - "s3:GetObject", - "s3:DeleteObject" - ], - "Resource" : "${local.cbci_s3_location}/*" - }, - { - "Sid" : "cbciS3BucketList", - "Effect" : "Allow", - "Action" : "s3:ListBucket", - "Resource" : module.cbci_s3_bucket.s3_bucket_arn - "Condition" : { - "StringLike" : { - "s3:prefix" : "${local.cbci_s3_prefix}/*" - } - } - } - ] - } - ) - } - tags = var.tags -} - -resource "aws_iam_instance_profile" "managed_ng_s3" { - name = local.cbci_instance_profile_s3 - role = aws_iam_role.managed_ng_s3.name - path = "/" - - lifecycle { - create_before_destroy = true - } - - tags = var.tags -} - resource "aws_iam_role" "managed_ng_ecr" { name = local.cbci_iam_role_ecr description = "EKS Managed Node group IAM Role ECR" @@ -678,75 +309,6 @@ resource "aws_iam_instance_profile" "managed_ng_ecr" { tags = var.tags } -# Storage Classes - -resource "kubernetes_annotations" "gp2" { - api_version = "storage.k8s.io/v1" - kind = "StorageClass" - # This is true because the resources was already created by the ebs-csi-driver addon - force = 
"true" - - metadata { - name = "gp2" - } - - annotations = { - "storageclass.kubernetes.io/is-default-class" = "false" - } -} - -resource "kubernetes_storage_class_v1" "gp3" { - metadata { - name = "gp3" - - annotations = { - "storageclass.kubernetes.io/is-default-class" = "true" - } - } - - storage_provisioner = "ebs.csi.aws.com" - allow_volume_expansion = true - reclaim_policy = "Delete" - volume_binding_mode = "WaitForFirstConsumer" - - parameters = { - encrypted = "true" - fsType = "ext4" - type = "gp3" - } - -} - -resource "kubernetes_storage_class_v1" "efs" { - - metadata { - name = "efs" - } - - storage_provisioner = "efs.csi.aws.com" - reclaim_policy = "Delete" - parameters = { - provisioningMode = "efs-ap" # Dynamic provisioning - fileSystemId = module.efs.id - directoryPerms = "700" - } - - mount_options = [ - "iam" - ] -} - -# Kubeconfig -resource "terraform_data" "create_kubeconfig" { - depends_on = [module.eks] - - triggers_replace = var.ci ? [timestamp()] : [] - - provisioner "local-exec" { - command = "aws eks update-kubeconfig --name ${module.eks.cluster_name} --region ${var.aws_region} --kubeconfig ${local.kubeconfig_file_path}" - } -} - ################################################################################ # Supported Resources ################################################################################ diff --git a/blueprints/02-at-scale/outputs.tf b/blueprints/02-at-scale/outputs.tf index d57b8239..a5ef5581 100644 --- a/blueprints/02-at-scale/outputs.tf +++ b/blueprints/02-at-scale/outputs.tf @@ -1,10 +1,10 @@ output "kubeconfig_export" { - description = "Export the KUBECONFIG environment variable to access the Kubernetes API." + description = "Exports the KUBECONFIG environment variable to access the Kubernetes API." value = "export KUBECONFIG=${local.kubeconfig_file_path}" } output "kubeconfig_add" { - description = "Add kubeconfig to the local configuration to access the Kubernetes API." 
+ description = "Adds kubeconfig to the local configuration to access the Kubernetes API." value = "aws eks update-kubeconfig --region ${var.aws_region} --name ${local.cluster_name}" } @@ -99,6 +99,11 @@ output "cbci_agent_sec_reg" { value = "kubectl get secret ${module.eks_blueprints_addon_cbci.cbci_sec_registry} -n ${local.cbci_agents_ns} -o jsonpath='{.data.*}' | base64 -d" } +output "aws_region" { + description = "AWS region." + value = var.aws_region +} + output "acm_certificate_arn" { description = "AWS Certificate Manager (ACM) certificate for Amazon Resource Names (ARN)." value = module.acm.acm_certificate_arn @@ -114,9 +119,11 @@ output "eks_cluster_arn" { value = module.eks.cluster_arn } +#Issue #165 +#not using module.eks.cluster_name because we need to get this value after the cluster is destroyed output "eks_cluster_name" { - description = "Amazon EKS cluster Name." - value = module.eks.cluster_name + description = "Amazon EKS cluster name." + value = local.cluster_name } output "s3_cbci_arn" { @@ -169,19 +176,21 @@ output "velero_restore" { value = "kubectl delete all,pvc -n ${module.eks_blueprints_addon_cbci.cbci_namespace} -l ${local.velero_controller_backup_selector}; velero restore create --from-schedule ${local.velero_schedule_name} --restore-volumes=true" } + output "prometheus_dashboard" { description = "Provides access to Prometheus dashboards." - value = "kubectl port-forward svc/kube-prometheus-stack-prometheus 50001:9090 -n kube-prometheus-stack" + value = "kubectl port-forward svc/kube-prometheus-stack-prometheus 50001:9090 -n ${local.observability_ns}" } +# https://prometheus.io/docs/prometheus/latest/querying/api/ output "prometheus_active_targets" { - description = "Checks active Prometheus targets from the operations center." 
- value = "kubectl exec -n cbci -ti cjoc-0 --container jenkins -- curl -sSf kube-prometheus-stack-prometheus.kube-prometheus-stack.svc.cluster.local:9090/api/v1/targets" + description = "Checks active Prometheus targets from the CloudBees operations center." + value = "kubectl exec -n cbci -ti cjoc-0 --container jenkins -- curl -sSf kube-prometheus-stack-prometheus.${local.observability_ns}.svc.cluster.local:9090/api/v1/targets" } -output "grafana_dashboard" { - description = "Provides access to Grafana dashboards." - value = "kubectl port-forward svc/kube-prometheus-stack-grafana 50002:80 -n kube-prometheus-stack" +output "grafana_url" { + description = "Grafana URL." + value = local.grafana_url } output "global_password" { @@ -190,17 +199,17 @@ output "global_password" { } output "vault_init" { - description = "Inicialization of Vault Service." + description = "Initialization of the vault service." value = "kubectl exec -it vault-0 -n ${local.vault_ns} -- vault operator init | tee ${local.vault_init_file_path} || echo \"Vault initialization failed.\"" } output "vault_init_log_file" { - description = "Vault Inicialization log file." + description = "Vault initialization log file." value = local.vault_init_file_path } output "vault_configure" { - description = "Configure Vault with initial secrets and creates approle for integration with CloudBees CI (role-id and secret-id). It requires unseal keys and the root token from the vault_init output." + description = "Configures the vault with initial secrets and creates the application role for integration with CloudBees CI (role-id and secret-id). It requires unseal keys and the root token from the vault_init output." value = "bash ${local.vault_config_file_path} ${local.vault_ns}" } @@ -208,3 +217,15 @@ output "vault_dashboard" { description = "Provides access to Hashicorp Vault dashboard. It requires the root token from the vault_init output." 
value = "kubectl port-forward svc/vault 50003:8200 -n ${local.vault_ns}" } + +# https://grafana.com/docs/tempo/latest/api_docs/ +output "tempo_tags" { + description = "Lists all tags ingested in Tempo." + value = "kubectl exec -n cbci -ti cjoc-0 --container jenkins -- curl -sG tempo.${local.observability_ns}.svc.cluster.local:3100/api/search/tags" +} + +# https://grafana.com/docs/loki/latest/reference/loki-http-api/ +output "loki_labels" { + description = "Lists all labels ingested in Loki." + value = "kubectl exec -n cbci -ti cjoc-0 --container jenkins -- curl -sG loki.${local.observability_ns}.svc.cluster.local:3100/loki/api/v1/labels" +} diff --git a/blueprints/02-at-scale/providers.tf b/blueprints/02-at-scale/providers.tf index 28c2b9df..b722a90e 100644 --- a/blueprints/02-at-scale/providers.tf +++ b/blueprints/02-at-scale/providers.tf @@ -28,6 +28,7 @@ terraform { source = "hashicorp/random" version = ">= 3.6.1" } + } } diff --git a/blueprints/02-at-scale/scripts/kube-prometheus-destroy.sh b/blueprints/02-at-scale/scripts/kube-prometheus-destroy.sh new file mode 100644 index 00000000..2fba66d2 --- /dev/null +++ b/blueprints/02-at-scale/scripts/kube-prometheus-destroy.sh @@ -0,0 +1,77 @@ +#!/bin/bash + +set -o pipefail +set -x + +# Constants +TAG_KEY1="ingress.k8s.aws/stack" +TAG_VALUE1="kube-prometheus-stack-grafana" +TAG_KEY2="elbv2.k8s.aws/cluster" +OBSERVABABILITY_NS="observability" +#Paranmeters cluster name +EKS_CLUSTER_NAME="$1" +REGION="$2" + +retry () { + local retries="$1" + local command="$2" + local options="$-" + local wait=150 + + if [[ $options == *e* ]]; then + set +e + fi + + $command + local exit_code=$? + + if [[ $options == *e* ]]; then + set -e + fi + + if [[ $exit_code -ne 0 && $retries -gt 0 ]]; then + echo "$command failed. Retrying in $wait seconds..." 
+ sleep $wait + retry $((retries - 1)) "$command" + else + return $exit_code + fi +} + + +#https://github.com/cloudbees/terraform-aws-cloudbees-ci-eks-addon/issues/165 + +# List all ALBs +load_balancers=$(aws elbv2 describe-load-balancers --query 'LoadBalancers[*].LoadBalancerArn' --output text --region "$REGION") + +# Loop through all load balancers to find the one with the desired tag +for lb_arn in $load_balancers; do + # Describe tags for the current load balancer + tags=$(aws elbv2 describe-tags --resource-arns "$lb_arn" --region "$REGION") + if echo "$tags" | jq -e --arg key1 "$TAG_KEY1" --arg value1 "$OBSERVABABILITY_NS/$TAG_VALUE1" --arg key2 "$TAG_KEY2" --arg value2 "$EKS_CLUSTER_NAME" ' + .TagDescriptions[].Tags | + any(.[]; .Key == $key1 and .Value == $value1) and + any(.[]; .Key == $key2 and .Value == $value2) + ' > /dev/null; then + + # Describe the load balancer to get its security groups + lb_desc=$(aws elbv2 describe-load-balancers --load-balancer-arns "$lb_arn" --region "$REGION") + security_groups=$(echo "$lb_desc" | jq -r '.LoadBalancers[].SecurityGroups[]') + + # Delete the load balancer + aws elbv2 delete-load-balancer --load-balancer-arn "$lb_arn" --region "$REGION" + echo "Load Balancer with ARN: $lb_arn deleted" + + fi +done + +# Delete the security groups + +if [ -n "$security_groups" ]; then + for sg in $security_groups; do + retry 5 "aws ec2 delete-security-group --group-id $sg --region $REGION" + echo "Security Group: $sg deleted" + done +else + echo "No security groups found for Load Balancer with ARN: $lb_arn" +fi diff --git a/blueprints/02-at-scale/variables.tf b/blueprints/02-at-scale/variables.tf index a29be22a..a7becea5 100644 --- a/blueprints/02-at-scale/variables.tf +++ b/blueprints/02-at-scale/variables.tf @@ -13,7 +13,7 @@ variable "trial_license" { } variable "dh_reg_secret_auth" { - description = "Docker Hub Registry server authentication details for cbci-sec-reg secret." 
+ description = "Docker Hub registry server authentication details for cbci-sec-reg secret." type = map(string) default = { username = "foo" @@ -38,7 +38,7 @@ variable "suffix" { #Check number of AZ: aws ec2 describe-availability-zones --region var.aws_region variable "aws_region" { - description = "AWS region to deploy resources to. It requires at minimun 3 AZs." + description = "AWS region to deploy resources to. It requires a minimum of three availability zones." type = string default = "us-west-2" } diff --git a/blueprints/helpers.sh b/blueprints/helpers.sh index 4be30e9c..552850e3 100755 --- a/blueprints/helpers.sh +++ b/blueprints/helpers.sh @@ -81,23 +81,46 @@ tf-output () { tf-apply () { local root="$1" export TF_LOG_PATH="$SCRIPTDIR/$root/terraform.log" + rm TF_LOG_PATH || INFO "No previous log found." retry 3 "terraform -chdir=$SCRIPTDIR/$root apply -target=module.vpc -auto-approve" + INFO "Apply target module.vpc completed." retry 3 "terraform -chdir=$SCRIPTDIR/$root apply -target=module.eks -auto-approve" + INFO "Apply target module.eks completed." retry 3 "terraform -chdir=$SCRIPTDIR/$root apply -auto-approve" + INFO "Apply the rest completed." terraform -chdir="$SCRIPTDIR/$root" output > "$SCRIPTDIR/$root/terraform.output" + INFO "Outputs saved corretely." } #https://aws-ia.github.io/terraform-aws-eks-blueprints/getting-started/#destroy tf-destroy () { local root="$1" export TF_LOG_PATH="$SCRIPTDIR/$root/terraform.log" - retry 3 "terraform -chdir=$SCRIPTDIR/$root destroy -target=module.eks_blueprints_addon_cbci -auto-approve" - retry 3 "terraform -chdir=$SCRIPTDIR/$root destroy -target=module.eks_blueprints_addons -auto-approve" + rm "$TF_LOG_PATH" || INFO "No previous log found." + tf-destroy-wl "$root" retry 3 "terraform -chdir=$SCRIPTDIR/$root destroy -target=module.eks -auto-approve" + INFO "Destroy target module.eks completed." 
+ #Prevent Issue #165 + if [ "$root" == "${BLUEPRINTS[1]}" ]; then + eks_cluster_name=$(tf-output "$root" eks_cluster_name) + aws_region=$(tf-output "$root" aws_region) + bash "$SCRIPTDIR/$root/scripts/kube-prometheus-destroy.sh" "$eks_cluster_name" "$aws_region" + INFO "kube-prometheus-destroy.sh completed." + fi retry 3 "terraform -chdir=$SCRIPTDIR/$root destroy -auto-approve" + INFO "Destroy the rest completed." rm -f "$SCRIPTDIR/$root/terraform.output" } +tf-destroy-wl () { + local root="$1" + export TF_LOG_PATH="$SCRIPTDIR/$root/terraform.log" + retry 3 "terraform -chdir=$SCRIPTDIR/$root destroy -target=module.eks_blueprints_addon_cbci -auto-approve" + INFO "Destroy target module.eks_blueprints_addon_cbci completed." + retry 3 "terraform -chdir=$SCRIPTDIR/$root destroy -target=module.eks_blueprints_addons -auto-approve" + INFO "Destroy target module.eks_blueprints_addons completed." +} + probes () { local root="$1" local wait=5 @@ -111,13 +134,11 @@ probes () { OC_URL=$(tf-output "$root" cbci_oc_url) until eval "$(tf-output "$root" cbci_liveness_probe_ext)"; do sleep $wait && echo "Waiting for Operation Center Service to pass Health Check from outside the clustery..."; done ;\ INFO "Operation Center Service passed Health Check outside the cluster. It is available at $OC_URL." - if [ "$root" == "01-getting-started" ]; then + if [ "$root" == "${BLUEPRINTS[0]}" ]; then INITIAL_PASS=$(eval "$(tf-output "$root" cbci_initial_admin_password)"); \ INFO "Initial Admin Password: $INITIAL_PASS." fi - if [ "$root" == "02-at-scale" ]; then - until [ "$(eval "$(tf-output "$root" cbci_controllers_pods)" | awk '{ print $3 }' | grep -v STATUS | grep -v -c Running)" == 0 ]; do sleep $wait && echo "Waiting for Controllers Pod to get into Ready State..."; done ;\ - eval "$(tf-output "$root" cbci_controllers_pods)" && INFO "All Controllers Pods are Ready." 
+ if [ "$root" == "${BLUEPRINTS[1]}" ]; then GLOBAL_PASS=$(eval "$(tf-output "$root" global_password)") && \ if [ -n "$GLOBAL_PASS" ]; then INFO "Password for admin_cbci_a: $GLOBAL_PASS." @@ -139,6 +160,8 @@ probes () { fi until eval "$(tf-output "$root" cbci_controller_c_hpa)"; do sleep $wait && echo "Waiting for Team C HPA to get Ready..."; done ;\ INFO "Team C HPA is Ready." + until [ "$(eval "$(tf-output "$root" cbci_controllers_pods)" | awk '{ print $3 }' | grep -v STATUS | grep -v -c Running)" == 0 ]; do sleep $wait && echo "Waiting for Controllers Pod to get into Ready State..."; done ;\ + eval "$(tf-output "$root" cbci_controllers_pods)" && INFO "All Controllers Pods are Ready." until [ "$(eval "$(tf-output "$root" cbci_agent_windowstempl_events)" | grep -c 'Allocated Resource vpc.amazonaws.com')" -ge 1 ]; do sleep $wait && echo "Waiting for Windows Template Pod to allocate resource vpc.amazonaws.com"; done ;\ eval "$(tf-output "$root" cbci_agent_windowstempl_events)" && INFO "Windows Template Example is OK." until [ "$(eval "$(tf-output "$root" cbci_agent_linuxtempl_events)" | grep -c 'Created container maven')" -ge 2 ]; do sleep $wait && echo "Waiting for both Linux Template Pods (On demand and Spot) to create maven container"; done ;\ @@ -153,9 +176,12 @@ probes () { fi until eval "$(tf-output "$root" prometheus_active_targets)" | jq '.data.activeTargets[] | select(.labels.container=="jenkins") | {job: .labels.job, instance: .labels.instance, status: .health}'; do sleep $wait && echo "Waiting for CloudBees CI Prometheus Targets..."; done ;\ INFO "CloudBees CI Targets are loaded in Prometheus." - # Note: Sometimes the log streams are not created in CloudWatch for CI builds - # until eval "$(tf-output "$root" aws_logstreams_fluentbit)" | jq '.[] '; do sleep $wait && echo "Waiting for CloudBees CI Log streams in CloudWatch..."; done ;\ - # INFO "CloudBees CI Log Streams are already in Cloud Watch." 
+ until eval "$(tf-output "$root" aws_logstreams_fluentbit)" | jq '.[] '; do sleep $wait && echo "Waiting for CloudBees CI Log streams in CloudWatch..."; done ;\ + INFO "CloudBees CI Log Streams are already in Cloud Watch." + until [ "$(eval "$(tf-output "$root" tempo_tags)" | grep -c 'jenkins.pipeline')" -ge 1 ]; do sleep $wait && echo "Waiting for Tempo to inject jenkins.pipeline* tags from Open Telemetry plugin"; done ;\ + eval "$(tf-output "$root" tempo_tags)" | jq .tagNames && INFO "Tempo has injested tags from Open Telemetry plugin." + until [ "$(eval "$(tf-output "$root" loki_labels)" | grep -c 'com_cloudbees')" -ge 1 ]; do sleep $wait && echo "Waiting for Loki to inject com_cloudbees* labels from FluentBit"; done ;\ + eval "$(tf-output "$root" loki_label)" && INFO "Loki has injested labels from FluentBit." fi } @@ -182,8 +208,9 @@ set-kube-env () { for bp in "${BLUEPRINTS[@]}" do # shellcheck disable=SC2154 - sed -i -e "/#vK8#/{n;s/\".*\"/\"$vK8\"/;}" \ - -e "/#vEKSBpAddonsTFMod#/{n;s/\".*\"/\"$vEKSBpAddonsTFMod\"/;}" "$SCRIPTDIR/$bp/main.tf" + find "$SCRIPTDIR/$bp" -type f -name "*.tf" -print0 \ + | xargs -0 sed -i -e "/#vK8#/{n;s/\".*\"/\"$vK8\"/;}" \ + -e "/#vEKSBpAddonsTFMod#/{n;s/\".*\"/\"$vEKSBpAddonsTFMod\"/;}" done } @@ -193,26 +220,15 @@ set-cbci-location () { #Repo sed -i "s|scmRepo: .*|scmRepo: \"$repo\"|g" "$SCRIPTDIR/02-at-scale/k8s/cbci-values.yml" sed -i "s|scmCascMmStore: .*|scmCascMmStore: \"$repo\"|g" "$SCRIPTDIR/02-at-scale/cbci/casc/oc/variables/variables.yaml" + sed -i "s|sharedLibRepo: .*|sharedLibRepo: \"$repo\"|g" "$SCRIPTDIR/02-at-scale/cbci/casc/mc/ha/variables/variables.yaml" + sed -i "s|sharedLibRepo: .*|sharedLibRepo: \"$repo\"|g" "$SCRIPTDIR/02-at-scale/cbci/casc/mc/none-ha/variables/variables.yaml" #Branch sed -i "s|scmBranch: .*|scmBranch: $branch|g" "$SCRIPTDIR/02-at-scale/k8s/cbci-values.yml" sed -i "s|cascBranch: .*|cascBranch: $branch|g" "$SCRIPTDIR/02-at-scale/cbci/casc/oc/variables/variables.yaml" - sed -i 
"s|sharedLibBranch: .*|sharedLibBranch: $branch|g" "$SCRIPTDIR/02-at-scale/cbci/casc/mc/parent/variables/variables.yaml" + sed -i "s|sharedLibBranch: .*|sharedLibBranch: $branch|g" "$SCRIPTDIR/02-at-scale/cbci/casc/mc/ha/variables/variables.yaml" + sed -i "s|sharedLibBranch: .*|sharedLibBranch: $branch|g" "$SCRIPTDIR/02-at-scale/cbci/casc/mc/none-ha/variables/variables.yaml" sed -i "s|bundle: \".*/none-ha\"|bundle: \"$branch/none-ha\"|g" "$SCRIPTDIR/02-at-scale/cbci/casc/oc/items/root.yaml" sed -i "s|bundle: \".*/ha\"|bundle: \"$branch/ha\"|g" "$SCRIPTDIR/02-at-scale/cbci/casc/oc/items/root.yaml" -} - -run-aws-nuke () { - local dry_run="$1" - local aws_nuke_file="$SCRIPTDIR/../.cloudbees/aws-nuke/bp-tf-ci-nuke.yaml" - local aws_nuke_file_log="$SCRIPTDIR/../.cloudbees/aws-nuke/aws-nuke.log" - if [ "$dry_run" == "true" ]; then - INFO "Running AWS Nuke in Dry Run Mode..." - rm "$aws_nuke_file_log" || INFO "No log file to remove." - aws-nuke -c "$aws_nuke_file" | tee "$aws_nuke_file_log" - INFO "Listing candidated resources to be deleted by using $aws_nuke_file" - grep "remove" "$aws_nuke_file_log" || INFO "No candidates to delete." - else - WARN "Running AWS Nuke in Not Dry Run Mode..." 
- aws-nuke -c "$aws_nuke_file" --no-dry-run - fi + sed -i "s|https://raw.githubusercontent.com/cloudbees/terraform-aws-cloudbees-ci-eks-addon/.*/blueprints/02-at-scale/k8s/prometheus-plugin-db.json|https://raw.githubusercontent.com/cloudbees/terraform-aws-cloudbees-ci-eks-addon/$branch/blueprints/02-at-scale/k8s/prometheus-plugin-db.json|g" "$SCRIPTDIR/02-at-scale/k8s/kube-prom-stack-values.yml" + sed -i "s|https://raw.githubusercontent.com/cloudbees/terraform-aws-cloudbees-ci-eks-addon/.*/blueprints/02-at-scale/k8s/opentelemetry-plugin-db.json|https://raw.githubusercontent.com/cloudbees/terraform-aws-cloudbees-ci-eks-addon/$branch/blueprints/02-at-scale/k8s/opentelemetry-plugin-db.json|g" "$SCRIPTDIR/02-at-scale/k8s/kube-prom-stack-values.yml" } diff --git a/main.tf b/main.tf index 0690cfd7..112545b6 100644 --- a/main.tf +++ b/main.tf @@ -27,7 +27,7 @@ locals { hosted_zone = var.hosted_zone cert_arn = var.cert_arn LicFirstName = var.trial_license["first_name"] - LicLastName = var.trial_license["last_name"] + LicLastName = "${var.trial_license["last_name"]} [EKS_TF_ADDON]" LicEmail = var.trial_license["email"] LicCompany = var.trial_license["company"] } @@ -49,6 +49,12 @@ resource "kubernetes_namespace" "cbci" { } +resource "time_sleep" "wait_30_seconds" { + depends_on = [kubernetes_namespace.cbci] + + destroy_duration = "30s" +} + # Kubernetes Secrets to be passed to Casc # https://github.com/jenkinsci/configuration-as-code-plugin/blob/master/docs/features/secrets.adoc#kubernetes-secrets resource "kubernetes_secret" "cbci_sec_casc" { @@ -100,7 +106,7 @@ apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor metadata: name: servicemonitor-cbci - namespace: kube-prometheus-stack + namespace: ${var.prometheus_target_ns} labels: release: kube-prometheus-stack app.kubernetes.io/part-of: kube-prometheus-stack @@ -141,7 +147,7 @@ resource "helm_release" "cloudbees_ci" { description = try(var.helm_config.description, null) chart = "cloudbees-core" #vCBCI_Helm# - 
version = try(var.helm_config.version, "3.18306.0+b5ad27c80a6b") + version = try(var.helm_config.version, "3.19313.0+1afe0458111d") repository = try(var.helm_config.repository, "https://public-charts.artifacts.cloudbees.com/repository/public/") values = local.create_secret_casc ? concat(var.helm_config.values, local.oc_secrets_mount, [templatefile("${path.module}/values.yml", local.cbci_template_values)]) : concat(var.helm_config.values, [templatefile("${path.module}/values.yml", local.cbci_template_values)]) timeout = try(var.helm_config.timeout, 1200) @@ -201,16 +207,3 @@ resource "helm_release" "cloudbees_ci" { depends_on = [time_sleep.wait_30_seconds] } - -# Need to wait a few seconds when removing the cbci resource to give helm -# time to finish cleaning up. -# -# Otherwise, after `terraform destroy`: -# │ Error: uninstallation completed with 1 error(s): uninstall: Failed to purge -# the release: release: not found - -resource "time_sleep" "wait_30_seconds" { - depends_on = [kubernetes_namespace.cbci] - - destroy_duration = "30s" -} diff --git a/outputs.tf b/outputs.tf index 2f349e3e..be23699b 100644 --- a/outputs.tf +++ b/outputs.tf @@ -46,6 +46,6 @@ output "cbci_sec_casc" { } output "cbci_sec_registry" { - description = "Optional. Kubernetes secrets name for CloudBees CI agents to autheticate to registry." + description = "Optional. Kubernetes secrets name for CloudBees CI agents to authenticate the registry." value = local.create_secret_reg ? kubernetes_secret.cbci_sec_reg[0].metadata[0].name : "No secrets created" } diff --git a/variables.tf b/variables.tf index ec2fa0fa..d3e2c8b4 100644 --- a/variables.tf +++ b/variables.tf @@ -47,7 +47,7 @@ variable "trial_license" { } validation { condition = length(var.trial_license) == 4 - error_message = "The map must contain 4 keys." + error_message = "The map must contain four keys." 
} } @@ -74,7 +74,7 @@ variable "create_reg_secret" { } variable "reg_secret_ns" { - description = "Agent namespace to allocate cbci-sec-reg secret. It is required when create_reg_secret is enabled." + description = "Agent namespace to allocate the cbci-sec-reg secret. It is required when create_reg_secret is enabled." default = "cbci" type = string validation { @@ -98,7 +98,7 @@ variable "reg_secret_auth" { } validation { condition = length(var.reg_secret_auth) == 4 - error_message = "The reg_secret_auth must contain 4 keys." + error_message = "The reg_secret_auth must contain four keys." } } @@ -107,3 +107,13 @@ variable "prometheus_target" { default = false type = bool } + +variable "prometheus_target_ns" { + description = "Prometheus target namespace, designed to be enabled with the AWS EKS Terraform Addon Kube Prometheus Stack. It is required when prometheus_target is enabled." + default = "observability" + type = string + validation { + condition = length(trimspace(var.prometheus_target_ns)) > 0 + error_message = "Prometheus target namespace must not be an empty string." + } +}