From f0a74370b8f16316551c282b271c9392b59f707a Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
Date: Wed, 6 Nov 2024 19:44:56 +0000
Subject: [PATCH] deploy: d9e1292dcb3be35a219ba92546357ad1e8f27196

78 files changed, 159 insertions(+), 154 deletions(-)

How it works

Bootstrapping

When idpbuilder creates an environment for you, it performs the following tasks:

  1. Create a local cluster if one does not exist yet.
  2. Create a self-signed certificate, then set it as the default TLS certificate for ingress-nginx.
  3. Configure CoreDNS to ensure names are resolved correctly.
  4. Install the Core Packages, then hand control over to ArgoCD.

[Diagram: idpbuilder bootstrapping flow]

Self Signed Certificate

To ensure applications inside the cluster can talk to other services, idpbuilder creates a self-signed TLS certificate. The certificate is a wildcard certificate for the domain name given by the --host flag and any of its subdomains. For example, if you use the default domain name cnoe.localtest.me, the certificate is issued for cnoe.localtest.me and *.cnoe.localtest.me.

This certificate is then used by ingress-nginx as the default TLS certificate (https://kubernetes.github.io/ingress-nginx/user-guide/tls/#default-ssl-certificate). This means you can override the TLS certificate used at the ingress level if desired.

The certificate is also imported to ArgoCD as one of its trusted CAs (https://argo-cd.readthedocs.io/en/stable/operator-manual/declarative-setup/#repositories-using-self-signed-tls-certificates-or-are-signed-by-custom-ca). This is necessary to make sure ArgoCD can talk to Gitea services without disabling TLS.

Finally, the certificate is exposed as a secret named idpbuilder-cert in the default namespace. To retrieve it, run the following command:

  kubectl get secret -n default idpbuilder-cert

Networking

Overview

With the default configuration on Docker on Linux, the kind cluster is set up as follows:

  1. A Docker container runs as the Kubernetes node, and container port 443 is exposed on host port 8443. You can confirm this by running docker container ls.
  2. The ingress-nginx service is configured as NodePort and listens on port 443. You can confirm this with kubectl get service -n ingress-nginx ingress-nginx-controller.

With this setup, HTTPS traffic for https://gitea.cnoe.localtest.me:8443 roughly looks like this:

  1. The domain name resolves to the local loopback address.
  2. A request is made to 127.0.0.1:8443 with the host set to gitea.cnoe.localtest.me:8443.
  3. The request is sent to container port 443.
  4. Ingress-nginx looks at the SNI and the host header, then routes the traffic to the Gitea service.
  5. Gitea receives the request, then sends back a response.
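
This traffic path can be spot-checked from the host with a few commands. This is only a sketch; exact container names, ports, and output depend on your configuration.

  # Confirm the kind node publishes container port 443 on host port 8443
  docker container ls --format '{{.Names}}\t{{.Ports}}'

  # Confirm ingress-nginx is a NodePort service listening on 443
  kubectl get service -n ingress-nginx ingress-nginx-controller

  # Confirm the name resolves to the loopback address, then send a request through the ingress
  getent hosts gitea.cnoe.localtest.me
  curl -kv https://gitea.cnoe.localtest.me:8443 -o /dev/null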
endpoint. "),(0,a.kt)("p",null,"As described above, the default domain name, ",(0,a.kt)("inlineCode",{parentName:"p"},"cnoe.localtest.me"),", resolves to a local loopback address such as ",(0,a.kt)("inlineCode",{parentName:"p"},"127.0.0.1"),".\nThis works for accessing the ingress-nginx service from outside the cluster because the service port is exposed as NodePort on the local machine. "),(0,a.kt)("p",null,"This approach does not work for in-cluster traffic because the address resolves to local loopback interface.\nFor example, if ArgoCD pod wants to access Gitea at ",(0,a.kt)("inlineCode",{parentName:"p"},"gitea.cnoe.localtest.me"),", the address resolves to ",(0,a.kt)("inlineCode",{parentName:"p"},"127.0.0.1")," which is the local loopback address within the node.\nTo ensure ArgoCD can talk to Gitea services, in-cluster DNS must be configured like so:"),(0,a.kt)("pre",null,(0,a.kt)("code",{parentName:"pre"},"rewrite name gitea.cnoe.localtest.me ingress-nginx-controller.ingress-nginx.svc.cluster.local\n")),(0,a.kt)("p",null,"This CoreDNS rewrite rule instructs CoreDNS to resolve requests made for ",(0,a.kt)("inlineCode",{parentName:"p"},"gitea.cnoe.localtest.me")," using the address given by ",(0,a.kt)("inlineCode",{parentName:"p"},"ingress-nginx-controller.ingress-nginx.svc.cluster.local")),(0,a.kt)("h3",{id:"domain-based-and-path-based-routing"},"Domain-based and Path-based routing"),(0,a.kt)("p",null,"idpbuilder supports two modes of routing requests to in-cluster resources: domain-based and path-based.\nThe behavior is configured with the ",(0,a.kt)("inlineCode",{parentName:"p"},"--use-path-routing")," flag, which defaults to ",(0,a.kt)("inlineCode",{parentName:"p"},"false"),"."),(0,a.kt)("h4",{id:"domain-based-routing"},"Domain-based routing"),(0,a.kt)("p",null,"This is the default behavior of idpbuilder. In this mode, services are exposed under their own domain names.\nFor example:"),(0,a.kt)("ul",null,(0,a.kt)("li",{parentName:"ul"},"ArgoCD UI is accessed via ",(0,a.kt)("inlineCode",{parentName:"li"},"https://argocd.cnoe.localtest.me")),(0,a.kt)("li",{parentName:"ul"},"Gitea UI is accessed via ",(0,a.kt)("inlineCode",{parentName:"li"},"https://gitea.cnoe.localtest.me"))),(0,a.kt)("p",null,"This approach is generally cleaner and offers more flexible routing options because it requires less complex ingress configurations."),(0,a.kt)("h4",{id:"path-based-routing"},"Path-based routing"),(0,a.kt)("p",null,"When you use the ",(0,a.kt)("inlineCode",{parentName:"p"},"--use-path-routing")," flag, idpbuilder configures all services under a single domain name, with routing based on path parameters.\nFor example:"),(0,a.kt)("ul",null,(0,a.kt)("li",{parentName:"ul"},"ArgoCD UI is accessed via ",(0,a.kt)("inlineCode",{parentName:"li"},"https://cnoe.localtest.me/argocd")),(0,a.kt)("li",{parentName:"ul"},"Gitea UI is accessed via ",(0,a.kt)("inlineCode",{parentName:"li"},"https://cnoe.localtest.me/gitea"))),(0,a.kt)("p",null,"This is useful when you are constrained to using a single domain name and cannot use subdomains.\nA good example is when using GitHub Codespaces. 
When ",(0,a.kt)("a",{parentName:"p",href:"https://docs.github.com/en/codespaces/developing-in-a-codespace/forwarding-ports-in-your-codespace"},"forwarding ports")," in Codespaces, you are given a single domain name (like ",(0,a.kt)("inlineCode",{parentName:"p"},"wild-broomstick-abc.github.dev"),") to reach all services running in your codespace.\nIn such situations, you cannot use subdomains (e.g., ",(0,a.kt)("inlineCode",{parentName:"p"},"argocd.wild-broomstick-abc.github.dev")," would not work), making path-based routing the appropriate choice."),(0,a.kt)("h2",{id:"core-packages"},"Core Packages"),(0,a.kt)("p",null,"idpbuilder installs the following packages to the cluster."),(0,a.kt)("ul",null,(0,a.kt)("li",{parentName:"ul"},(0,a.kt)("strong",{parentName:"li"},"ArgoCD")," is the GitOps solution to deploy manifests to Kubernetes clusters. In this project, a package is an ArgoCD application."),(0,a.kt)("li",{parentName:"ul"},(0,a.kt)("strong",{parentName:"li"},"Gitea")," server is the in-cluster Git server that ArgoCD can be configured to sync resources from. You can sync from local file systems to this."),(0,a.kt)("li",{parentName:"ul"},(0,a.kt)("strong",{parentName:"li"},"Ingress-nginx")," is used as a method to access in-cluster resources such as ArgoCD UI and Gitea UI.")),(0,a.kt)("p",null,"Once installed, idpbuilder passes control over these packages to ArgoCD by storing manifests in Gitea repositories then creating ArgoCD applications. From here on, ArgoCD manages them based on manifests checked into Git repositories."),(0,a.kt)("h2",{id:"getting-relevant-secrets"},"Getting Relevant Secrets"),(0,a.kt)("p",null,"The ",(0,a.kt)("inlineCode",{parentName:"p"},"idpbuilder get secrets")," command retrieves the following:"),(0,a.kt)("ul",null,(0,a.kt)("li",{parentName:"ul"},"ArgoCD initial admin password."),(0,a.kt)("li",{parentName:"ul"},"Gitea admin user credentials."),(0,a.kt)("li",{parentName:"ul"},"Any secrets labeled with ",(0,a.kt)("inlineCode",{parentName:"li"},"cnoe.io/cli-secret=true"),".")),(0,a.kt)("p",null,"You can think of the command as executing the following kubectl commands:"),(0,a.kt)("pre",null,(0,a.kt)("code",{parentName:"pre",className:"language-bash"},"kubectl -n argocd get secret argocd-initial-admin-secret\nkubectl get secrets -n gitea gitea-admin-secret\nkubectl get secrets -A -l cnoe.io/cli-secret=true\n")),(0,a.kt)("p",null,"If you want to retrieve secrets for a package, you can use the ",(0,a.kt)("inlineCode",{parentName:"p"},"-p")," flag. 

Core Packages

idpbuilder installs the following packages to the cluster:

  • ArgoCD is the GitOps solution used to deploy manifests to Kubernetes clusters. In this project, a package is an ArgoCD application.
  • Gitea is the in-cluster Git server that ArgoCD can be configured to sync resources from. You can sync from local file systems to it.
  • Ingress-nginx is used to access in-cluster resources such as the ArgoCD UI and the Gitea UI.

Once installed, idpbuilder passes control over these packages to ArgoCD by storing manifests in Gitea repositories, then creating ArgoCD applications. From here on, ArgoCD manages them based on the manifests checked into the Git repositories.

Getting Relevant Secrets

The idpbuilder get secrets command retrieves the following:

  • The ArgoCD initial admin password.
  • The Gitea admin user credentials.
  • Any secrets labeled with cnoe.io/cli-secret=true.

You can think of the command as executing the following kubectl commands:

  kubectl -n argocd get secret argocd-initial-admin-secret
  kubectl get secrets -n gitea gitea-admin-secret
  kubectl get secrets -A -l cnoe.io/cli-secret=true

If you want to retrieve secrets for a single package, you can use the -p flag. To get secrets for a package named gitea:

  idpbuilder get secrets -p gitea

For the -p flag to work, you must label the secret with cnoe.io/package-name. For example, to make secret values available in a secret named my-secret for a package named foo:

  kubectl label secret my-secret "cnoe.io/package-name=foo" "cnoe.io/cli-secret=true"

The secret will then be listed when issuing the idpbuilder get secrets command. Alternatively, you can retrieve the individual secret with:

  idpbuilder get secrets -p foo
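
If you prefer raw kubectl over the CLI, a sketch like the following lists everything idpbuilder would pick up and decodes one labeled secret. The secret name and namespace are the hypothetical values from the example above.

  # List every secret the idpbuilder get secrets command would pick up
  kubectl get secrets -A -l cnoe.io/cli-secret=true

  # Decode all keys of the hypothetical labeled secret from the example above
  kubectl get secret -n default my-secret \
    -o go-template='{{range $k, $v := .data}}{{$k}}={{$v | base64decode}}{{"\n"}}{{end}}'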

Use the continuous integration pipeline as the source of truth

…expands entities in the original git repository with intended application specifications.
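
As a rough sketch of what such an integration step might look like, the snippet below enriches an entity file during CI and pushes it to a separate catalog repository. The repository URL, file paths, annotation key, and use of yq are placeholders, not part of any specific CNOE tooling.

  # Hypothetical CI step: enrich the entity definition produced by the build...
  export BUILD_ID="${BUILD_ID:-local-dev}"
  yq eval '.metadata.annotations["example.com/build-id"] = strenv(BUILD_ID)' catalog-info.yaml > rendered.yaml

  # ...and publish the expanded entity to a second, catalog-only repository
  git clone https://git.example.com/platform/catalog.git
  mkdir -p catalog/entities
  cp rendered.yaml catalog/entities/my-service.yaml
  git -C catalog add entities/my-service.yaml
  git -C catalog commit -m "Update my-service entity from CI"
  git -C catalog push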

On the positive side:

  • This is a relatively simple approach and works for smaller teams with a smaller number of applications or systems
  • Having a second git repository to capture the end state of an entity stays closer to the core GitOps practices
  • Does not require significant modification to the developer portal

On the negative side:

  • There is inherent duplication of entity definitions across repositories
  • Adding custom metadata is not trivial for application teams, since it requires changes to the integration workflow and therefore adds load and demand on the DevOps teams
  • There is less abstraction in place, as end application users are directly exposed to the YAML specification of the entities
  • Does not scale well as the number of systems and entities grows

[Figure: CI pipeline as the source of truth]

Use a central control plane as the source of truth

The hub-and-spoke model is the most widely advocated model for applying GitOps practices. Your control plane cluster runs and manages your platform tools: your CI, your CD, your developer portal, your infrastructure-as-code tooling, and so on.

On the positive side:

  • There really is a single place to inspect the status of entities. For example, Argo applications can tell you the status of deployed applications, and you can likewise inspect the status of workflows, infrastructure resources, and any other entity that the control plane cluster manages (see the sketch after this list).
  • You can use the Backstage Kubernetes plugin seamlessly, perhaps with a few small tweaks. Alternatively, this can be achieved by introducing fairly lightweight Backstage custom entity providers that pull and show the status of entities in the Backstage portal.
  • In an organization with a diverse set of distributed systems, the control plane cluster can be used as the integration layer by wrapping legacy APIs and/or implementing native controllers.
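
For instance, with Argo CD and Argo Workflows running on a control plane cluster, entity status is one kubectl call away. The resource names and namespaces below are typical defaults, not guaranteed for every installation.

  # Deployment status of applications, as reported by Argo CD on the control plane cluster
  kubectl get applications.argoproj.io -n argocd

  # Status of workflow runs managed by Argo Workflows (namespaces vary by installation)
  kubectl get workflows.argoproj.io -A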

On the negative side:

  • Most organizations do not have a central control plane and adopting one as the source of truth is often a significant change, especially if an organization is early in their GitOps transition.
  • For organizations deep into a federated model of operation with different teams running and managing their platforms separately and rather independently, it could be challenging to offer a single control plane that aggregates data across all teams.
  • Management of change could become cumbersome. A single control plane can create bottlenecks when changes touch a set of entities or practices. Changes in organizations or systems may ripple into changes to entities managed across several teams, and once GitOps practices are added to the mix, chains of approvals must happen across multiple entities and several repositories before deployments start flowing. Depending on the size of the organization, this could lead to organizational nightmares.
  • You may need to jump through a few hoops to get from the representation of the application to its actual deployment, e.g., going from git to your continuous delivery system and from there to your target cluster.

[Figure: Control plane as the source of truth]

Use Backstage as the source of truth

Where control planes and compute workloads are scattered, the unifying layer is the developer portal, i.e. Backstage. Hence, it is reasonable to construct an entity by collecting and aggregating data from various data sources, each providing partial data on the entity, making Backstage the source of truth. This generally starts with Backstage querying git for the entities that exist, then using the entity identifiers to collect metadata on how each entity contributes to a system. This could involve querying the control plane clusters and the workload clusters via custom entity providers that look for certain information, then putting the collected pieces together to come close to the core promise of a developer portal: providing reliable information on the entities.
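
In shell terms, the aggregation a custom entity provider performs looks roughly like the sketch below: discover entity definitions in git, then enrich each one with live status from the clusters. The org name, service name, and use of the GitHub code-search API are placeholders for whatever your environment provides.

  # 1) Discover entity definitions across the organization (placeholder org name)
  curl -s -H "Authorization: Bearer $GITHUB_TOKEN" \
    "https://api.github.com/search/code?q=filename:catalog-info.yaml+org:my-org" |
    jq -r '.items[].repository.full_name'

  # 2) For each discovered entity, collect live status from the control plane or workload clusters
  kubectl get applications.argoproj.io -n argocd my-service -o jsonpath='{.status.sync.status}'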

On the positive side:

  • This model copes better with legacy systems
  • Users are not exposed to, and often not even aware of, the underlying platforms; the underlying platform and tooling are more rigorously abstracted away
  • Changes to the system are isolated to the entities of the particular system, as managed by the underlying resources and platform. This causes less chaos when definitions, metadata, or properties of entities need to change.

On the negative side:

  • The git service may not be able to scale, technically or financially. This is particularly because Backstage may hit the git service endpoints too frequently and exceed the API limits (see the quota check after this list). This can cause delays in displaying data for end users, or wrong information to be displayed if partially available data is mishandled. It can be mitigated by, for example, using an eventing mechanism to notify Backstage of changes, or by storing entity definitions in an alternative storage space (e.g. Amazon S3). There are challenges to such approaches too: when using Amazon S3, for example, change history is lost, and an eventing mechanism can introduce the security challenges discussed next.
  • Securing Backstage could be a challenge. For Backstage to proactively receive updates on entity changes, it works best to configure event hooks that call back into Backstage when changes occur. Backstage, being the entry point for user workflows, sits on the critical path of platform operations. As such, platform engineers need to solve a chicken-and-egg problem: deciding how to expose Backstage endpoints to receive events while still limiting access for security reasons. The authentication methods that GitHub supports may not satisfy the security standards that an organization requires.
  • Changes to entities may not be trivial. DevOps engineers need to manage entities that they may not control. For example, if a new mandatory field is introduced to a catalog file, DevOps engineers may need to talk to the respective repository owners, create PRs, and then get approvals across all affected repositories.
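
The API-limit concern above is easy to quantify before committing to this model; GitHub, for example, reports the remaining quota per token.

  # Check how much REST and search quota the token Backstage would use has left
  curl -s -H "Authorization: Bearer $GITHUB_TOKEN" https://api.github.com/rate_limit |
    jq '.resources.core, .resources.search'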

[Figure: Backstage as the source of truth]

Conclusion

We discussed multiple approaches to creating a reliable representation of system entities in developer portals. We do not necessarily recommend one approach over another, but it is important to find the right approach given the patterns and practices in your organization. It is also worth noting that you can combine multiple approaches depending on the requirements of your teams. For example, while continuous integration can still be used to construct the actual state of the world by collecting status data and other related information, Backstage extensions can be introduced to expand on entity relations, providing a better representation of a system. Stating the obvious here, but properly selecting the patterns that work for you will go a long way toward increasing your overall team velocity down the road.

Reach out on the #cnoe-interest CNCF Slack channel to share thoughts and get involved in developing CNOE.

Nima Kaviani · 5 min read

Adobe, Amazon Web Services, Autodesk, Salesforce, and Twilio have come together to launch an open source initiative for building internal developer platforms (IDPs). Cloud Native Operational Excellence (aka, CNOE, pronounced Kuh.no) is a joint effort to share developer tooling, thoughts, and patterns to help organizations make informed technology choices and resolve common pain points. CNOE will enable organizations to navigate tooling sprawl and technology churn by coordinating contributions, offering tools, and providing neutral and unbiased guidance on technology choices to deliver internal developer platforms.

Developer productivity is increasingly important for organizations to compete in today’s fast-paced marketplace. To increase productivity, many organizations are taking a platform engineering approach to build internal developer platforms that abstract away complexity and enable faster, more secure software delivery. These internal developer platforms are long-term strategic investments, and the choice of open source technologies and architectures used to build these platforms can greatly impact their long-term success and viability.

CNOE is a community for organizations passionate about evolving experiences in developer productivity and efficiency. Contributors to this community are sharing their open source developer platform tooling choices to bring awareness to the best practices that have helped their respective teams. With such awareness comes alignment and the ability to de-risk their technology choices over the long term.

The CNOE community will navigate their operational technology decisions together, coordinate contributions, and offer guidance on which Cloud Native Computing Foundation (CNCF) technologies to use to achieve cloud efficiencies. CNOE will aim to:

  • Create an open source first strategy for internal developer platform capabilities, prioritizing CNCF technologies.
  • Build community alignment on technology choices and best practices.
  • Elevate tools and practices that can benefit a wide range of organizations building their own internal developer platforms.
  • Build for the infrastructure and customize to developer needs, making the solutions and patterns flexible for adoption.
  • Provide artifacts about tools, patterns, and practices to be easily consumable by the community.

“The work of building secure, reliable, compliant, and regionalized software is becoming more and more complicated. Development teams need the right separation of concerns to build efficiently and move fast. Internal developer platforms enable just that. They abstract away complexity so a team can focus fully on their key goals. I’m excited to see the CNOE community share experiences, expand ideas beyond a single company’s viewpoint, and de-risk our technology strategies to build better together.” - Ben Cochran, VP Developer Enablement at Autodesk

"As a technology company, CNOE is an extension of our DNA, and open source is key to our platform. CNOE fosters collaboration within the industry, minimizes duplicated work, and emphasizes unique products. I'm eager to see our contributions to CNOE and others benefiting from it." - Chris Lyon, VP of Engineering Segment at Twilio.

"Open source software is a core component that many organizations leverage to power their internal developer platforms. Organizations often anchor on specific capabilities to power their developer platforms like Continuous Integration/Continuous Delivery, Infrastructure as Code, Service Mesh, Policy controls, Artifact management, and developer portals. As a result, they have been seeking a forum to share best practices and to share their findings on the tooling choices they have been using. I’m incredibly excited to see AWS contribute to CNOE and CNOE be the vehicle that creates industry alignment based on the intrinsic gravity of the tooling choices being made at scale.” - said Paul Roberts, Sr. Principal Solutions Architect at AWS.

“Adobe believes in the transformative power of open source software. We are excited to be a founding member of CNOE and to partner with other industry thought leaders to define and share our vision of a cloud native stack for rapidly building Internal Developer Platforms.” - Dave Weinstein, VP of Engineering at Adobe.

“Salesforce is deeply engaged in the Open Source community, which was integral in building Hyperforce, a reimagination of our trusted platform architecture for the public cloud. Salesforce is honored to serve as a launch partner for CNOE, further advancing the adoption of open source technologies and assuring companies of sound technology decisions and sustained support for years to come.” - Josh Meier, Hyperforce Lead Architect

With the launch of CNOE, members will contribute tooling, plugins, and reference implementations that facilitate building internal developer platforms. Members are also releasing a capability map that captures key open technologies and their relevance in building internal developer platforms across these organizations.

As we move forward, each member organization will continue to share their approach on adopting and composing the tooling and technologies recommended by the CNOE working group to deliver on their IDPs.

CNOE invites more companies to join us. To learn more about CNOE, visit https://cnoe.io, where we share extended details about patterns and practices we are developing. Explore options to get involved and contact us via the CNCF slack channel #cnoe-public.

Special thanks to the many people who helped with the launch: Andrew Lee, Omar Kahil, Ben Fields, Bryan Landes, Vikram Venkataraman, Rick Sostheim, Manabu McCloskey, Praseeda Sathaye, and Vara Bonthu from AWS, Rob Hilton (formerly AWS, now Google), Jesse Sanford, Greg Haynes, Mani Kandadai Venkatesh, Sara Mesing, and Brandon Leach from Autodesk, Jesse Adametz and Wes Medford from Twilio, Rohan Kapoor and Vikram Sethi from Adobe.

Member Announcements


Argo CD Benchmarking - Pushing the Limits and Sharding Deep Dive

· 21 min read
Andrew Lee
Michael Crenshaw
Gaurav Dhamija

Introduction

In Part 1 of our Argo CD benchmarking blog post, we analyzed the impact of various Argo CD configuration parameters on the performance of Argo CD. In particular, we measured the impact of status and operation processors, client QPS, burst QPS, and sharding algorithms on the overall synchronization and reconciliation behavior in Argo CD. We showed that with the right configuration and sharding strategy, particularly by properly setting client and burst QPS and by splitting the workload across multiple workload clusters using Argo CD sharding, overall sync time can be improved by a factor of 4.

Here, and in Part 2 of our scalability work, we push our scalability experiments for Argo CD further. In particular, among other tests, we run our scalability metrics against a maximum of 500 workload clusters, deploying 50,000 Argo applications. This is, to the best of our knowledge, the largest scalability testing ever done for Argo CD. We also report on a much deeper set of sharding experiments, utilizing different sharding algorithms for distribution of load across 100 workload clusters. While we report on running our experiments against a legacy sharding algorithm and a round robin algorithm that already exist in Argo CD 2.8, we also discuss results of workload distribution using 3 new sharding algorithms we developed in collaboration with Red Hat, namely: a greedy minimum algorithm, a weighted ring hash algorithm, and a consistent hash with bounded loads algorithm. We show that, depending on the optimization goals one has in mind, choosing from the new sharding algorithms can improve CPU utilization by a factor of 3 and reduce application-to-shard rebalancing by a factor of 5, significantly improving the performance of a highly distributed and massively scaled Argo CD deployment.

Experiment 1: How Client QPS/Burst QPS affects the Kubernetes API Server

Objective:

The objective of the first experiment is to understand the impact of the QPS & Burst Rate parameters on (1) the Kubernetes control plane, for both the Argo CD cluster and the remote application clusters, and (2) the overall sync duration for Argo CD applications. To understand the impact on the Kubernetes API server, we observed the following control plane metrics:

  • Latency (apiserver_request_duration_seconds_bucket)
  • Throughput (apiserver_request_total)
  • Error Rate (apiserver_request_total{code=~"[45].."}) for any request returning an error code 4xx or 5xx.

To analyze impact on application synchronization, we observed Sync Duration and No. of Goroutines Argo CD server metrics.
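As a sketch of how these signals can be tracked, the Prometheus recording rules below express the three control plane metrics above as ready-to-graph series. The group and rule names are our own invention; the underlying metric names and label matchers are the standard kube-apiserver metrics listed above.

groups:
  - name: apiserver-signals
    rules:
      # p90 request latency over the last 5 minutes
      - record: apiserver:request_duration_seconds:p90
        expr: histogram_quantile(0.90, sum(rate(apiserver_request_duration_seconds_bucket[5m])) by (le))
      # overall throughput in requests per second
      - record: apiserver:requests:rate5m
        expr: sum(rate(apiserver_request_total[5m]))
      # share of requests answered with a 4xx or 5xx code
      - record: apiserver:request_errors:ratio5m
        expr: sum(rate(apiserver_request_total{code=~"[45].."}[5m])) / sum(rate(apiserver_request_total[5m]))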

Test Infrastructure:

In terms of test infrastructure and workload configuration, we had one central Amazon EKS cluster with the Argo CD Server running on it. This central cluster connected to three remote Amazon EKS clusters, each hosting 5000 Argo CD applications. Each application is a 2KB ConfigMap provisioned in a dedicated namespace. All four clusters, one central and three remote, had a dedicated monitoring stack composed of Prometheus and Grafana installed on them.

Observations:

Observation 1 - Impact on Argo CD application synchronization

The table and graphs below highlight the impact of QPS & Burst Rate on “Sync Duration” as well as the average and maximum no. of goroutines active during the test run.

QPS | Burst Rate | Sync Duration | No. of GoRoutines (Avg) | No. of GoRoutines (Max)
50 | 100 | 61.5 mins | 1760 | 1810
100 | 200 | 29.5 mins | 2120 | 2310
150 | 300 | 19.0 mins | 2520 | 2760
200 | 400 | 18.0 mins | 2620 | 2780
250 | 500 | 17.5 mins | 2590 | 2760
300 | 600 | 18.0 mins | 2540 | 2760

alt_text

To summarize, we immediately observed a ~52% reduction in sync duration (from 61.5 mins to 29.5 mins) as we increased QPS & Burst Rate from the default values to 100 & 200 respectively. This correlated with a corresponding increase in the number of goroutines processing application synchronization requests. Increasing these parameters further provided diminishing returns: beyond a QPS & Burst Rate of 150 & 300 respectively, there was no measurable improvement. This again correlated with the number of goroutines actively processing sync requests.
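For readers who want to reproduce these settings, the abbreviated manifest below is one way to raise the client QPS and Burst Rate on the application controller. It assumes the stock argocd-application-controller StatefulSet and the ARGOCD_K8S_CLIENT_QPS / ARGOCD_K8S_CLIENT_BURST environment variables supported by recent Argo CD releases; the values mirror the 150/300 sweet spot observed above, and all unrelated fields are omitted.

apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: argocd-application-controller
  namespace: argocd
spec:
  template:
    spec:
      containers:
        - name: argocd-application-controller
          env:
            # Client-side rate limits for the controller's Kubernetes client
            - name: ARGOCD_K8S_CLIENT_QPS
              value: "150"
            - name: ARGOCD_K8S_CLIENT_BURST
              value: "300"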

Observation 2 - Impact on central Amazon EKS cluster control plane hosting Argo CD Server

The table and graphs below highlight the impact of QPS & Burst Rate on throughput and latency for the Amazon EKS control plane hosting the Argo CD Server. We can observe an increase in the request rate per second to the Kubernetes control plane, which is in line with the previous observations about the increase in the number of goroutines processing sync requests. The increased activity related to sync operations translates into increased requests to the Amazon EKS control plane, tapering off at a QPS of 150 and Burst Rate of 300. Additional increases in the QPS and Burst Rate parameters don’t noticeably impact the request rate per second.

QPS | Burst Rate | Request Rate (Max) | Latency p50 (Max) | Latency p90 (Max)
50 | 100 | 27.2 rps | 13.0 ms | 22.6 ms
100 | 200 | 31.9 rps | 13.3 ms | 23.1 ms
150 | 300 | 39.8 rps | 14.3 ms | 24.0 ms
200 | 400 | 41.4 rps | 14.9 ms | 24.4 ms
250 | 500 | 39.0 rps | 15.1 ms | 24.4 ms
300 | 600 | 40.7 rps | 16.4 ms | 34.5 ms

From a latency perspective, over the course of testing the average (p50) request duration remained within the range of 13 to 16.5 ms and p90 latency within 22 to 34 ms. The error rate remained consistently around ~0.22%, with a brief spike to ~0.25% (an increase of ~0.03%).

The relatively low latency numbers and low error rate (<0.25%) indicate that the Amazon EKS control plane was able to handle the load comfortably. Increasing QPS and Burst Rate alone stretched the control plane only to a limited extent, indicating it still had resources to process additional requests as long as the Argo CD server could generate request traffic.

alt_text

Observation 3 - Impact on remote Amazon EKS cluster control plane hosting applications

We had similar observations regarding latency, throughput, and error rate for the Amazon EKS control planes of the remote application clusters. These are the clusters hosting ~5000 Argo CD applications each and connected to the Argo CD Server on the central Amazon EKS cluster. The throughput peaked at ~35 requests per second with a QPS and Burst Rate of 150 & 300 respectively. Average latency remained consistently in the single-digit milliseconds, hovering around ~5 ms.

alt_text

Experiment 2: Revisiting Status/Operation Processors

Objective:

The objective of the second experiment is to explore why status/operation processors did not have an effect on sync times in our previous experiments. It is possible that the simple nature of the ConfigMap applications, which take less than a second to deploy, is causing this behavior. Most real-world applications consist of tens to hundreds of resources and take longer to deploy. In this experiment, we simulate a more complex application that takes longer to deploy than the original ConfigMap application.

Test Infrastructure:

Central Argo CD cluster running on a single m5.2xlarge managing 100 application clusters. In order to simulate larger applications, each application will execute a PreSync job which waits 10 seconds before deploying the original ConfigMap application.

Example of the PreSync Job:

apiVersion: batch/v1
kind: Job
metadata:
  name: before
  annotations:
    argocd.argoproj.io/hook: PreSync
    argocd.argoproj.io/hook-delete-policy: HookSucceeded
spec:
  template:
    spec:
      containers:
        - name: sleep
          image: alpine:latest
          command: ["sleep", "10"]
      restartPolicy: Never
  backoffLimit: 0

Observations:

Observation 1 - Syncing never finishes and requires a restart of the application controller to continue syncing

The screenshot below shows that from the start of the sync test at 17:02 till around 17:41, the sync process was deadlocked. We observed no changes to synced apps and the app_operation_processing_queue was pinned at 10k operations.

alt_text

Looking at the Argo CD console for a single application we see that the PreSync job finished 17 mins ago, but the application stayed in the Syncing phase.

alt_text

Observation 2: There is a link between client QPS/burst QPS and operation/status processor settings

In order to fix the sync freezing issue, we increased the client QPS/burst QPS from the default 50/100 to 100/200. After the change we were able to collect data on operation/status processor settings.

operation/status processors | Sync time
25/50 | 45 mins
50/100 | 30 mins
alt_text

We can see that there is a link between status/operation processors and client QPS/burst QPS settings. Changing one or the other could be required to improve sync times and Argo CD performance depending on your environment. Our recommendation is to first change the status/operation processor settings. If you run into Argo CD locking up or the performance not increasing further, and you have sufficient resources, you can try increasing the client QPS/burst QPS. But as mentioned in the first experiment, ensure you are monitoring the k8s api-server.
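As a concrete sketch of that recommendation, the snippet below tunes the processors first and the client QPS/Burst second through the argocd-cmd-params-cm ConfigMap. The key names follow the argocd-cmd-params-cm reference in the Argo CD documentation, and the values mirror the 50/100 processors and 100/200 QPS/Burst combination used in this experiment; adjust them for your own environment.

apiVersion: v1
kind: ConfigMap
metadata:
  name: argocd-cmd-params-cm
  namespace: argocd
data:
  # Step 1: raise the operation/status processors
  controller.operation.processors: "50"
  controller.status.processors: "100"
  # Step 2: if syncs still stall and resources allow, raise client QPS/Burst
  controller.kubernetes.client.qps: "100"
  controller.kubernetes.client.burst: "200"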

Experiment 3: Cluster Scaling

Objective:

The following experiment is designed to test the compute demands of the Argo CD app controller when managing 100 or more application clusters.

Test Infrastructure:

Central Argo CD cluster with 10 app controller shards running on a single m5.2xlarge node managing 100/250/500 application clusters and 10k 2KB ConfigMap applications.
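For context, the sketch below shows roughly how a 10-shard application controller can be declared. It assumes the standard argocd-application-controller StatefulSet, where the replica count and the ARGOCD_CONTROLLER_REPLICAS environment variable need to agree for clusters to be distributed across shards; everything not related to sharding is omitted.

apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: argocd-application-controller
  namespace: argocd
spec:
  # One pod per shard
  replicas: 10
  template:
    spec:
      containers:
        - name: argocd-application-controller
          env:
            # Must match spec.replicas so each shard knows the total shard count
            - name: ARGOCD_CONTROLLER_REPLICAS
              value: "10"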

Observations:

From earlier experiments, we can see that when managing 100 clusters, we are close to the limit of a single m5.2xlarge node. As we push further to 250/500 clusters, we have two observations. The first is that the graph data is less smooth than in the 100 cluster sync test. This can indicate that Prometheus is running out of compute as Argo CD is consuming most of it. Please note that we are not using any resource limits/requests in our experiments. If proper resource limits/requests are set, most likely we would only see performance issues with Argo CD and not Prometheus when operating at the limit of your compute resources. The second observation is that on both the 250/500 cluster tests, there is some drop-off in metric data. For the 250 cluster test, there is a blip at the 16:16 mark for Memory Usage. For the 500 cluster test there are blips in data at the 21:05 mark on the Workqueue depth, CPU usage, and Memory usage. In spite of these observations, the sync process completes in a reasonable time.

Clusters | Sync time
100 | 9 mins
250 | 9 mins
500 | 11 mins
alt_text
From this experiment, you can see that as you approach the limit of your compute resources, Argo CD and other applications running in your k8s environment could experience issues. It is recommended that you set proper resource limits/requests for your monitoring stack to ensure you have insights into what could be causing your performance issues.

Experiment 4: Application Scaling

Objective:

This experiment is meant to push the Argo CD app controller beyond 10k applications. As the previous rounds of experiments were performed with 10k apps, the intention of these experiments is to scale the Argo CD app controller up to 50k apps.

Test Infrastructure:

We will be performing this experiment on a central Argo CD cluster with 10 app controller shards and 500 downstream application clusters. As we scale the applications up to 10k, 15k, 20k, 25k, 30k, and 50k 2KB ConfigMap applications, we will add additional m5.2xlarge nodes to the Argo CD cluster.

Observations:

Sync test at 15k applications with a single m5.2xlarge. You can see blips in data indicating unhealthy behavior on the cluster. CPU and Memory Usage is near 100% utilization of 8 vCPUs and 30 GB of memory. After adding another node for a total of two m5.2xlarge, we were able to perform a sync in 9 mins.
alt_text

After adding another node, we were able to continue our application scaling tests. You can see in the graphs below that syncing 20k and 25k apps was not a problem. The sync test of 30k apps shown on the third graph shows some blips in data, indicating that we are at the limits of two nodes.

Apps | Sync time
20000 | 12 mins
25000 | 11 mins
30000 | 19 mins
alt_text

For the final test in this experiment, we pushed the cluster to sync 50k apps.

While the cluster was able to manage reconciliation for the 50k apps, as shown by a stable Sync Status graph from 8:40, when we start the sync at the 9:02 mark you can see unhealthy behavior in the graph data. From examining the CPU/Memory Usage, you can see we have 100% CPU utilization across the cluster. After scaling the cluster to three m5.2xlarge nodes, we were able to perform a sync in 22 mins.
alt_text

From the scaling tests, we can see that the Argo CD app controller scales effectively by adding compute resources as we increase the number of applications to sync.

Experiment 5: How Many Shards?

Objective:

In previous experiments, we utilized ten app controller shards running across multiple nodes. In this experiment, we will explore how the number of app controller shards affects performance.

Test Infrastructure:

Central Argo CD cluster with 3, 6, or 9 app controller shards running on 3 m5.2xlarge nodes managing 500 application clusters and 50k 2KB ConfigMap applications.

Observations:

For the baseline of three shards it took 75 mins to perform a sync. Adding additional shards saw further improvements with a sync time of 37 mins for six shards and a sync time of 21 mins for nine shards. Further increasing shards beyond nine did not yield any improvements.

Shards | Sync time
3 | 75 mins
6 | 37 mins
9 | 21 mins
alt_text

Looking at the CPU and Memory utilization, you can see that adding shards can improve performance only if there are free resources to consume. With the baseline of three shards, CPU utilization of the nodes is well below the eight vCPUs that each node is allocated. As we add more shards, we can see CPU utilization increasing until we are close to 100% CPU utilization with nine shards. Adding any more shards would not yield any performance benefits unless we add more nodes.

Shards: 3 | Shards: 6 | Shards: 9
alt_text

From the experiments, the Argo CD app controller sharding mechanism is able to scale as you add more compute resources. Sharding allows both horizontal and vertical scaling. As you add more shards, you can horizontally scale by adding more nodes or vertically scale by utilizing a larger node with more compute resources.

Experiment 6: Sharding Deep Dive

Objective:

With the release of Argo CD 2.8, a new round-robin sharding algorithm was introduced. The existing legacy sharding algorithm performed a modulo of the number of replicas and the hash sum of the cluster id to determine the shard that should manage the cluster. This led to an imbalance in the number of clusters being managed by each shard. The new round-robin sharding algorithm is supposed to ensure an equal distribution of clusters being managed by each shard. We will also introduce 3 new algorithms: greedy minimum, weighted ring hash, and consistent hash with bounded loads. This experiment will evaluate all the algorithms on shard balance, application distribution, and rebalancing on changes to the environment.
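For reference, switching between the built-in algorithms is a configuration change. The sketch below assumes the controller.sharding.algorithm key in argocd-cmd-params-cm that Argo CD 2.8 introduced, with legacy and round-robin as the two built-in values compared here; the three new algorithms discussed below are prototypes and not part of an Argo CD release.

apiVersion: v1
kind: ConfigMap
metadata:
  name: argocd-cmd-params-cm
  namespace: argocd
data:
  # "legacy" (hash-modulo) is the default; "round-robin" was added in Argo CD 2.8
  controller.sharding.algorithm: "round-robin"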

Test Infrastructure:

Central Argo CD cluster with 10 app controller shards running on 1 m5.2xlarge node managing 100 application clusters and 10k 2KB ConfigMap applications.

Observations:

Note: For all the observations, we start the monitoring period when we see items in the operations queue and end it when all the applications are synced. We then look at the average CPU/Memory usage during the monitoring period.

Legacy

The graph below shows the CPU Usage/Memory Usage of the 10 different Argo CD App Controller shards. Looking at the averages, you can see a large variation in how much each shard is utilizing its resources. To make an accurate comparison between the different sharding methods, we calculate the variability by determining the range of the data for both avg CPU usage and Memory usage. The CPU usage variability is calculated by subtracting the CPU usage of the least-utilized shard from that of the most-utilized shard: 0.55 - 0.23 = 0.32. The Memory usage variability is 452 MiB - 225 MiB = 227 MiB.

Variability:

CPU: 0.32
Memory: 227 MiB

alt_text

Round-Robin

With the newly introduced Round-Robin algorithm, you can see improved balance across the shards.

Variability:

CPU: 0.02
Memory: 110 MiB

alt_text

Better but not perfect

The new round-robin algorithm does a better job of keeping the number of clusters balanced across the shards. But in a real-world environment, you would not have an equal number of applications running on each cluster, and the work done by each shard is determined not by the number of clusters but by the number of applications. A new experiment was run which deploys a random number of applications to each cluster, with the results below. Even with the round-robin algorithm, you can see some high variability in CPU/Memory usage.

Variability:

CPU: 0.27
Memory: 136 MiB

alt_text

Greedy Minimum Algorithm, sharding by the Number of Apps

A new algorithm is introduced in order to shard by the number of applications that are running on each cluster. It utilizes a greedy minimum algorithm to always choose the shard with the least number of apps when assigning shards. A description of the algorithm is shown below:

For each cluster in the cluster list:

1. Determine the number of applications on the cluster.
2. Find the shard currently assigned the fewest applications.
3. Assign the cluster to that shard and add the cluster's application count to that shard's total.

The same experiment with a random number of applications running on each cluster is run again with the results shown below. With the new algorithm, there is better balance across the shards.

Variability:

CPU: 0.06
Memory: 109 MiB

alt_text

While there is better balance when utilizing the greedy minimum algorithm, there is an issue when changing any aspect of the Argo CD sharding parameters. If you are adding shards, removing shards, adding clusters and/or removing clusters, the algorithm can trigger large-scale changes in the shard assignments. Changes to the shard assignments cause shards to waste resources when switching to manage new clusters. This is especially true when utilizing ephemeral clusters in AI/ML training and big data operations where clusters come and go. Starting from the previous experiment, we changed the number of shards from 10 to 9 and observed over 75 cluster-to-shard assignment changes out of 100 clusters, excluding the changes associated with the removed shard.

Weighted Ring Hash

In order to decrease the number of shard assignment changes, a well-known method called consistent hashing is explored for our use case (Reference). Consistent hashing algorithms utilize a ring hash to make distribution decisions. This method is already widely utilized by network load balancing applications to evenly distribute traffic in a distributed manner, independent of the number of servers/nodes. By utilizing a ring hash algorithm to determine shard assignments, we were able to decrease the number of shard assignment changes when we changed the number of shards from 10 to 9. We observed 48 cluster-to-shard assignment changes, excluding the changes associated with the removed shard.

alt_text

To ensure balance, weighting is applied at each shard assignment so that the shard with the fewest apps is given the highest weight when choosing shards for assignment. The balancing is not perfect: CPU variability increased from 0.06 with the greedy minimum algorithm to 0.12.

Variability:

CPU: 0.12
Memory: 163 MiB

Consistent Hash with Bounded Loads

The ring hash algorithm was never designed to allow dynamically updating the weights based on load. While we were able to utilize it for this purpose, we also looked at another algorithm called Consistent Hashing with Bounded Loads (Reference), which aims to provide consistent hashing while maintaining load uniformity. By utilizing this algorithm, we were able to significantly decrease the redistribution of cluster-to-shard assignments. When we changed the number of shards from 10 to 9, we observed only 15 cluster-to-shard assignment changes, excluding the changes associated with the removed shard.

alt_text

The trade-off is slightly worse cluster/app balancing than the weighted ring hash, with CPU variability increasing from 0.12 to 0.17.

Variability:

CPU: 0.17
Memory: 131 MiB

There are no direct recommendations about which algorithm you should utilize, as each of them has its pros and cons. You should evaluate each for your environment, depending on whether you are looking for strict balancing of clusters/apps across the shards or want to minimize the impact of making frequent changes to your Argo CD environment.

Conclusion

In this blog post, we continued our scalability tests of the Argo CD app controller by answering some questions we had from our first scalability tests about the common scalability parameters. We showed how QPS/Burst QPS affects the k8s api server, determined why status/operation processors did not affect our previous scalability tests, and how those parameters are linked together. We then continued our scalability tests by pushing the Argo CD app controller to 500 clusters and 50,000 apps. We ended our tests by showing that a key component of scaling the Argo CD app controller is how it performs sharding. By doing a deep dive into how the app controller performs sharding we also determined some ways to improve sharding by adding in and evaluating new sharding algorithms. We are currently evaluating how to contribute these changes back to Argo CD. Stay tuned for those contributions and reach out on the CNCF #argo-sig-scalability or the #cnoe-interest Slack channel to get help optimizing for your use-cases and scenarios.


Argo Workflows Controller Scalability Testing on Amazon EKS

· 18 min read
Andrew Lee
Vikram Sethi

Introduction

In our earlier blog posts, we have discussed scalability tests for Argo CD, where in two consecutive experiments, we pushed the limits of Argo CD to deploy 10,000 applications on ~100 clusters and then 50,000 applications on 500 clusters along with configuration and fine-tuning required to make Argo CD scale effectively. Argo CD deployments, however, do not happen in isolation, and similar to a CNOE stack, Argo CD is often deployed on a cluster along with other tooling which collectively contribute to the performance and scalability bottlenecks we see users run into.

Argo Workflows is one common tool we often see users deploy alongside Argo CD to enable workflow executions (e.g. building images, running tests, cutting releases, etc). Our early experiments with Argo Workflows revealed that, if not tuned properly, it can negatively impact the scalability of a given Kubernetes cluster, particularly if the Kubernetes cluster happens to be the control cluster managing developer workflows across a large group of users. A real world example of some of the scaling challenges you can encounter with Argo Workflows is explored in our recent ArgoCon talk: Key Takeaways from Scaling Adobe's CI/CD Solution to Support 50K Argo CD Apps.

For us to better understand the limitations and tuning requirements for Argo Workflows, in this blog post we publish details on the scalability experiments we ran for Argo Workflows, executing Workflows in two different load patterns: an increasing rate of up to 2100 workflows/min and queued reconciliation of 5000 workflows on an Amazon EKS cluster with 50x m5.large nodes. We show the correlation between the various Argo Workflows knobs and controls and the processing time, as well as the performance improvements you can get depending on how you supply the workflows to the control plane.

Test Parameters

Test Workflow

The test workflow is based on the lightweight whalesay container from Docker, which prints out some text and ASCII art to the terminal. The reason we chose a lightweight container is that we wanted to stress the Argo Workflows controller in managing the Workflow lifecycle (pod creation, scheduling, and cleanup) and minimize the extra overhead on the Kubernetes control plane in dealing with the data plane workloads. An example of the Workflow is below:

// Assumed imports for this snippet (Argo Workflows v3 and Kubernetes API types):
import (
    corev1 "k8s.io/api/core/v1"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

    wfv1 "github.com/argoproj/argo-workflows/v3/pkg/apis/workflow/v1alpha1"
)

// A minimal "hello world" Workflow that deletes its pod on success (PodGC).
var helloWorldWorkflow = wfv1.Workflow{
    ObjectMeta: metav1.ObjectMeta{
        GenerateName: "hello-world-",
    },
    Spec: wfv1.WorkflowSpec{
        Entrypoint:         "whalesay",
        ServiceAccountName: "argo",
        Templates: []wfv1.Template{
            {
                Name: "whalesay",
                Container: &corev1.Container{
                    Image:   "docker/whalesay:latest",
                    Command: []string{"cowsay", "hello world"},
                },
            },
        },
        PodGC: &wfv1.PodGC{
            Strategy: "OnPodSuccess",
        },
    },
}

Argo Workflows Settings

We will be detailing how each of these settings affect Argo Workflow in various experiments later in this blog post.

  • Controller workers: The Argo Workflows controller utilizes different workers for various operations in a Workflow lifecycle. We will be looking at two types of workers for our scalability testing (the relevant controller flags are summarized in the sketch after this list).

    • workflow-workers (default: 32): These workers are threads in a single Argo Workflows controller that reconcile Argo Workflow Custom Resources (CRs). When a Workflow is created, a workflow-worker will handle the end-to-end operations of the Workflow from ensuring the pod is scheduled to ensuring the pod has finished. The number of workers can be specified by passing the --workflow-workers flag to the controller.

    • pod-cleanup-workers (default: 4): These workers clean up finished Workflows. When a Workflow has finished executing, depending on your clean-up settings, a pod-cleanup-worker will handle cleaning up the pod from the Workflow. The number of workers can be specified by passing the --pod-cleanup-workers flag to the controller.

  • Client queries per second (QPS)/Burst QPS settings (default: 20/30): These settings control when the Argo Workflows controller’s Kubernetes (K8s) client starts to throttle requests to the K8S API server. The client QPS setting is for limiting sustained QPS for the k8s client while burst QPS is for allowing a burst request rate in excess of the client QPS for a short period of time. The client QPS/burst QPS can be set by passing the --qps and --burst flag to the controller.

  • Sharding: Sharding with multiple Argo Workflows controllers is possible by running each controller in its own namespace. Each controller then only reconciles Workflows submitted in that particular namespace, and can be scoped to it with the --namespaced flag.
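The sketch below consolidates these settings into the container args of the workflow-controller Deployment, abbreviated to the relevant fields; the values shown are the defaults quoted above, and the Deployment and namespace names are the usual ones from the Argo Workflows install manifests.

apiVersion: apps/v1
kind: Deployment
metadata:
  name: workflow-controller
  namespace: argo
spec:
  template:
    spec:
      containers:
        - name: workflow-controller
          args:
            - --workflow-workers=32     # threads reconciling Workflow CRs
            - --pod-cleanup-workers=4   # threads cleaning up finished pods
            - --qps=20                  # sustained client QPS to the K8s API server
            - --burst=30                # short-lived burst allowance above --qps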

Key Metrics

We chose a set of key metrics for the scalability testing because we wanted to measure how many workflows the Argo Workflows controller can reconcile and process. We will also be looking into K8s control plane metrics which might indicate your control plane cannot keep up with the Argo Workflows workload. 

  • Workqueue depth: The workqueue depth shows workflows which have not been reconciled. If the depth starts to increase, it indicates that the Argo Workflows controller is unable to handle the submission rate of Workflows.

  • Workqueue latency: The workqueue latency is the average time workflows spent waiting in the workqueue. A lower value indicates that the Argo Workflows controller is processing workflows faster so that they are not waiting in the workqueue.

  • K8S api server requests per second: The read and write requests per second being made to the K8S api server.

We didn’t include CPU/Memory as a key metric because during our testing we did not see any significant impact on either, most likely because of the simplistic workflows utilized for this benchmark.

Environment

We ran the experiments in an AWS environment utilizing a single Amazon EKS cluster. The Kubernetes version is 1.27 and the Argo Workflows version is 3.5.4. No resource quotas were utilized on the Argo Workflows controller. For the cluster, we start by provisioning 1x m5.8xlarge Amazon Elastic Compute Cloud (Amazon EC2) instance, which runs the Argo Workflows controller, and 50x m5.large instances for executing workflows. The number of execution instances is sufficient to run all 5000 workflows in parallel, ensuring that pods are not waiting on resources to execute. Monitoring and metrics for Argo Workflows were provided by Prometheus/Grafana.

Methodology

There will be two types of load patterns evaluated:

Increasing Rate Test: Workflows are submitted at an increasing rate (workflows/min) until the Argo Workflows controller cannot keep up. The controller is considered unable to keep up when there are >0 workflows in the workqueue or queue latency is increasing. The last sustainable submission rate is noted as the maximum rate at which Workflows can be processed with the current settings.

Queued Reconciliation Test: 5000 workflows are submitted in less than a minute. Metrics will be monitored from when the Argo Workflows controller starts processing workflows to when it has reconciled all 5000 workflows. The number of nodes is sufficient for running all the workflows simultaneously.

Experiments

Experiment 1: Baseline

In our baseline experiment, we are running in a single Argo Workflows shard (namespace) with default settings.

Increasing Rate Test:

As you can see below, the Argo Workflows controller can process up to 270 workflows/min. The average workqueue latency and workqueue depth are nearly zero. At 300 workflows/min, workqueue latency and workqueue depth start to increase.

Enter image alt description

Queued Reconciliation Test:

It took around 17 mins to reconcile 5000 workflows, and the peak avg workqueue latency was 5.38 minutes.

Enter image alt description

Experiment 2: Workflow Workers

For this experiment, we increase the number of workflow workers from the default of 32 to 128 where the workers use the maximum QPS and burst settings available to them. We also had to increase the number of pod-cleanup-workers to 32 as the Argo Workflows controller was experiencing some instability, where the controller pod was consistently crashing with the default value of 4.

Increasing Rate Test:

For the increasing workflow rate test, we can see exactly when the number of workflow workers is not sufficient to process the load. Both workqueue latency and depth start to increase, indicating that workflows are waiting to be reconciled. When we increase the number of workers, the controller is able to reconcile the current load until additional load is placed on it. For 32 workers, that limit is 300 workflows/min. When we increase the number of workers to 64, it is able to process that load until the load is increased to 330 workflows/min. Then we increase the number of workers to 96 and it can process the additional load again. When we increase to 360 workflows/min, we need to bump the number of workers to 128.

Workers | Max workflows/minute
32 | 270
64 | 300
96 | 330
128 | 360

Enter image alt description

For the K8S api server, we see sustained 180 writes/sec and 70 reads/sec during the increasing rate tests.

Enter image alt description

Queued Reconciliation Test:

For the queued reconciliation test, the time it took to reconcile all the workflows did not change significantly. With 32 workers it took 17 mins to reconcile while with 96 workers it took 16 mins. The peak workqueue latency did decrease from 5.38 mins with 32 workers to 3.19 mins with 96 workers. With 128 workers, the Argo Workflows controller kept crashing.

Workers | Peak avg latency (mins) | Reconcile time (mins)
32 | 5.38 | 17
64 | 5.06 | 18
96 | 3.19 | 16
128 | N/A | N/A

Enter image alt description

For the K8S api server, we see peaks of up to 260 writes/sec and 90 reads/sec during the queued reconciliation tests. Notice that for the last test there is no K8S api server activity, as the Argo Workflows controller was misbehaving due to client-side throttling.

Enter image alt description

Observations from Experiment 2:

Workers play a big part in how fast the Argo Workflows controller is able to reconcile the rate of workflows being submitted. If you are observing increasing workqueue latency and a backed-up workqueue depth, changing the number of workers is a potential way to improve performance. There are a few observations that we want to call out. One is that if we compare the two different patterns, one where we submit workflows at a constant rate and one in which we load up the workqueue all at once, we can see variations in calculated throughput. We can actually calculate the time it takes to reconcile 5000 workflows utilizing the increasing rate test results and compare it to the queued reconciliation test.

Workers | Increasing rate test time to reconcile 5000 workflows (mins) | Reconcile time of 5000 workflows queued all at once (mins)
32 | 18.5 | 17
64 | 16.6 | 18
96 | 15.1 | 16
128 | 13.8 | N/A

We do get some conflicting results when we make this comparison. With 32 and 64 workers, the increasing rate test is actually slower than the queued reconciliation test. But if we increase to 96 workers, we can see that the increasing rate test results are faster. We were unable to compare with 128 workers as the Argo Workflows controller crashed when trying to run the queued reconciliation test. When investigating the cause of the crash, the logs have several messages like the following:

Waited for 6.185558715s due to client-side throttling, not priority and fairness, request: DELETE:https://10.100.0.1:443/api/v1/namespaces/argoworkflows1/pods/hello-world-57cfda8a-dc8b-4854-83a0-05785fb25e4b-3gwthk

These messages indicate that we should increase the Client QPS settings which we will evaluate in the next experiment.

Experiment 3: Client QPS Settings

For this experiment, we set the number of workflow workers back to the default of 32. We will then increase the QPS/Burst by increments of 10/10, from 20/30 to 50/60. We chose to only increase by 10/10 because any large increase past 50/60 did not yield any performance improvements. We believe that this is partly because we kept the workers at 32.

Initial Testing

Increasing Rate Test:

The QPS/Burst settings had a significant impact on the increasing rate test. By increasing the QPS/Burst from 20/30 to 30/40, we see ~50% improvement in max workflows/min from 270 to 420. When we increase the QPS/Burst from 30/40 to 40/50, we see another 28% improvement in max workflows/min from 420 to 540. When increasing from 40/50 to 50/60 there was only an additional 5% improvement. For 32 workers, increasing past 50/60 did not yield any significant improvements to the max workflows/min.

QPS/Burst | Max workflows/minute
20/30 | 270
30/40 | 420
40/50 | 540
50/60 | 570

Enter image alt description

When changing QPS/Burst, we also need to monitor the K8S API server. Looking at the K8S API server req/s, we see sustained 390 writes/sec and 85 reads/sec.

Enter image alt description

Queued Reconciliation Test:

Again, the QPS/Burst settings make a big difference in the queued reconciliation test when compared to just changing the workflow workers. Starting from the default settings of 20/30, we see decreasing reconcile times from 19 mins to 12 mins to 8 mins and finally to 6 mins when setting the QPS/Burst to 50/60. The peak average latency also decreased from 4.79 mins to 1.94 mins. We did note that there was a higher peak avg latency with 30/40 vs 20/30 but if you examine the graph you can see a steeper drop in latency accounting for the shorter reconcile time. Similar to the increasing rate test, increasing the QPS/Burst further did not yield any improvements.

QPS/Burst | Peak avg latency (mins) | Reconcile time (mins)
20/30 | 4.79 | 19
30/40 | 5.66 | 12
40/50 | 2.98 | 8
50/60 | 1.94 | 6

Enter image alt description

When looking at the K8S API server, we see peaks of up to 700 writes/sec and 200 reads/sec during the tests.

Enter image alt description

When compared to the workflow workers testing, you can see that increasing the QPS/Burst is able to push the K8S API server harder and improve Argo Workflows' overall performance. We do see some diminishing returns when increasing QPS/Burst past 50/60, even though it appears that the K8S API server has plenty of capacity for additional load. For the next test, we will increase both the workflow workers and the QPS/burst to see how far we can push Argo Workflows and the K8s API server.

Max Load Test

Queued Reconciliation Test:

We increased the number of workers to 128 and QPS/burst to 60/70 and observed peak average latency of 54 secs and a reconciliation time of 5 mins. Increasing either the workers or QPS/Burst did not improve these numbers.

Enter image alt description

Looking at the K8s API server, we saw peaks of 800 writes/sec and 190 reads/sec.

Enter image alt description

Increasing Rate Test:

Starting with 128 workers and QPS/Burst of 60/70, we were able to push Argo Workflows to 810 workflows/min. But past that point, there were no improvements with more workers or increased QPS/Burst limits.

Enter image alt description

We can see increased K8s API server activity with sustained 700 writes/sec and 160 reads/sec.

Enter image alt description

Observations from Experiment 3

One observation we made in the previous experiment with workflow workers is that the two different patterns of submitting workflows can be compared. We made that comparison again with the QPS/Burst tests and saw the following results:

QPS/Burst | Workers | Increasing rate test time to reconcile 5000 workflows (mins) | Reconcile time of 5000 workflows queued all at once (mins)
20/30 | 32 | 18.5 | 19
30/40 | 32 | 11.9 | 12
50/60 | 32 | 9.2 | 8
60/70 | 32 | 8.7 | 6
70/80 | 128 | 6.1 | 5

When we take the data from the comparison in the workflow workers experiment together with the data above, we can see a slight improvement in submitting all workflows together vs staggering them. We are not sure why this is the case, and more experiments are required to understand this behavior.

It seems that we have hit a wall with 128 workers and a QPS/burst of 60/70 for a single Argo Workflows Controller. We will now evaluate Sharding and see if we can improve our performance from this point.

Experiment 4: Sharding

For this experiment, we will evaluate 1 shard, 2 shards, and 5 shards of the Argo Workflows controller with the default settings. We will then try for a maximum load test utilizing workflow workers, QPS/burst, and sharding to see the maximum performance on our current infrastructure.
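As a sketch of what one shard looks like in this setup, the abbreviated Deployment below runs a namespace-scoped controller using the --namespaced flag described earlier; the namespace name matches the argoworkflows1 namespace visible in the throttling log from Experiment 2, and additional shards are simply the same Deployment repeated in further namespaces.

apiVersion: apps/v1
kind: Deployment
metadata:
  name: workflow-controller
  namespace: argoworkflows1   # shard 1; repeat in argoworkflows2 ... argoworkflows5
spec:
  template:
    spec:
      containers:
        - name: workflow-controller
          args:
            # Only reconcile Workflows submitted in this controller's namespace
            - --namespaced
            - --workflow-workers=32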

Initial Testing

Increasing Rate Test:

Sharding the Argo Workflows controller has a linear impact on performance with the increasing rate test. By increasing the number of shards from 1 to 2, we see a 100% improvement in max workflows/min from 270 to 540. When we increase the shards from 2 to 5, we see an additional 150% improvement in max workflows/min from 540 to 1350.

Shards | Max workflows/min
1 | 270
2 | 540
5 | 1350

One thing to note is that each shard's submission rate is increased by 30 workflows/min at every step when increasing the rate. This means the difference between two consecutive rates with 2 shards is 2 x 30 = 60 workflows/min, and with 5 shards it is 5 x 30 = 150 workflows/min. That is why, for 2 shards, when the breaking point was determined at 600 workflows/min, we step down one rate to 600 - 60 = 540 workflows/min.

Enter image alt description

You can see a significant impact on the K8s API server with sustained 1400 writes/sec and 300 reads/sec.

Enter image alt description

Queued Reconciliation Test:

As shown in the Increasing Rate Test, sharding has a huge impact on performance for the queued reconciliation test. With 1 shard it takes 18 mins to reconcile 5000 workflows, while with 2 shards it takes 9 mins. With 5 shards the reconcile time is further reduced to 4 mins.

Shards | Peak avg latency (mins) | Reconcile time (mins)
1 | 5.43 | 18
2 | 3.81 | 9
5 | 1.42 | 4

Enter image alt description

The impact on the K8s API server was not as significant when compared to previous experiments.

Max Load Test

Increasing Rate Test:

When increasing the workflow workers to 128, QPS/burst to 60/70 and shards to 5, the Argo Workflows controller is able to process up to 2100 workflows/min. Any higher than this seems to run into K8s API Priority and Fairness (APF) limits.

Enter image alt description

When looking at the K8s API server, we are seeing significant impact with peaks of 1500 writes/sec and 350 reads/sec.

Enter image alt description

When investigating why we are unable to push higher on the K8s API server, we see that APF limits are coming into effect by looking at the apiserver_flowcontrol_current_inqueue_requests metric, which shows the number of requests waiting in the APF flow control queue.

Enter image alt description

Queued Reconciliation Test:

With the max load settings, we observed that the peak workqueue latency is only 20 seconds and the reconcile time is 2 minutes.

Enter image alt description

The impact on K8s API server is actually less than the previous max load queued reconciliation tests.

Enter image alt description

Observations from Experiment 4

As we did in previous experiments, we again make the comparison between the two different load patterns:

Shards | Increasing rate test time to reconcile 5000 workflows (mins) | Reconcile time of 5000 workflows queued all at once (mins)
1 | 18.5 | 18
2 | 9.2 | 9
5 | 3.7 | 4
Max load (5 shards) | 2.3 | 2

In general, it appears that submitting all workflows at once performs slightly better than submitting workflows at a steady rate. More experiments will need to be done to further investigate this behavior.

Conclusion

In this blog post we discussed our initial efforts in documenting and understanding the scaling characteristics of the Argo Workflows controller. Our findings show that the existing mechanisms for increasing workflow workers, increasing client and burst QPS settings and sharding the controller can help Argo Workflows scale better. Another interesting observation is that we saw differences in performance with how you submit your workflows. For the next set of experiments, we plan to evaluate more environmental variables and different types of workflows: multi-step and/or long running. Stay tuned for the report on our next round of experiments and reach out on the CNCF #argo-sig-scalability Slack channel to get help optimizing for your use-cases and scenarios.

hydrated entities are often kept in a separate git repository that mirrors and expands entities in the original git repository with intended application specifications.

On the positive side:

  • This is a relatively simple approach and works for smaller teams with a smaller number of applications or systems
  • Having a second git repository to capture the end state of an entity stays closer to the core GitOps practices
  • Does not require significant modification to the developer portal

On the negative side:

  • There is inherent duplication happening
  • Adding custom metadata by application teams is not trivial, as it requires making changes to the integration workflow, thus bringing load and demand to the DevOps teams
  • There is less abstraction in place, as end application users are directly exposed to the YAML specification of the entities
  • Does not scale well as the number of systems and entities grows

ci-as-source-of-truth

Use a central control plane as the source of truth

The hub-and-spoke model is the most widely advocated model when applying GitOps practices. Your control plane cluster runs and manages your platform tools: your CI, your CD, your developer portal, your infrastructure-as-code tooling, etc.

On the positive side:

  • There really is a single place to inspect the status of entities. E.g., Argo applications can tell you the status of deployed applications. You can also inspect the status of workflows, infrastructure resources, and any other entity that the control plane cluster manages.
  • You can use the Backstage Kubernetes plugin seamlessly, perhaps with some small tweaks. Alternatively, this can be achieved by introducing fairly lightweight Backstage custom entity providers which pull and show the status of entities in the Backstage portal.
  • In an organization with a diverse set of distributed systems, the control plane cluster can be used as the integration layer by wrapping legacy APIs and/or implementing native controllers.

On the negative side:

  • Most organizations do not have a central control plane and adopting one as the source of truth is often a significant change, especially if an organization is early in their GitOps transition.
  • For organizations deep into a federated model of operation with different teams running and managing their platforms separately and rather independently, it could be challenging to offer a single control plane that aggregates data across all teams.
  • Management of change could become cumbersome. The existence of a single control plane could create bottlenecks when changes occur to a set of entities or practices. Changes in organizations or systems may result in changes to various entities managed across several teams. Bringing GitOps practices into the mix, this requires chains of approvals across multiple entities and several repositories before deployments start flowing. Depending on the size of the organization, this could lead to organizational nightmares.
  • You may need to jump through a few hoops to get from the representation of the application to its actual deployment, e.g., going from git to your continuous delivery tooling and from there to your target cluster.

controlplane-as-source-of-truth

Use Backstage as the source of truth

Where control planes and compute workloads are scattered, the unifying layer lies in the developer portal, i.e. Backstage. Hence, it is reasonable to construct an entity by collecting and aggregating data from various data sources, each providing partial data on the entity, making Backstage the source of truth. This generally starts with Backstage querying git for the entities that exist, then using the entity identifiers to collect metadata on how each entity contributes to a system. This could involve querying the control plane clusters and the workload clusters via a custom entity provider that looks for certain information, and putting the collected pieces together to come close to the core promise of a developer portal: providing reliable information on the entities.
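For readers less familiar with Backstage, the snippet below is a minimal catalog file of the kind being queried and enriched here; the component name, owner, and annotation value are hypothetical, and the backstage.io/kubernetes-id annotation is what the Kubernetes plugin and similar entity providers use to correlate a catalog entity with resources in the clusters.

apiVersion: backstage.io/v1alpha1
kind: Component
metadata:
  name: payments-service            # hypothetical entity name
  annotations:
    # Used by entity providers/plugins to match cluster resources to this entity
    backstage.io/kubernetes-id: payments-service
spec:
  type: service
  lifecycle: production
  owner: team-payments              # hypothetical owning team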

On the positive side:

  • This model copes better with legacy systems
  • Users are not exposed to, and oftentimes not even aware of, the underlying platforms; hence the underlying platform and tooling are more rigorously abstracted away
  • Changes to the system are isolated to the entities of that particular system, as managed by the underlying resources and platform. This causes less chaos when definitions, metadata, or properties of entities need to change.

On the negative side:

  • The git service may not be able to scale, technically or financially, particularly because Backstage may hit the git service endpoints too frequently and exceed API limits. This could cause delays in displaying data for end users, or cause wrong information to be displayed if partially available data is mishandled. This can be mitigated via approaches like using an eventing mechanism to notify Backstage of changes, or by storing entity definitions in a separate storage space (e.g. Amazon S3). There are challenges to such approaches too; for example, when using Amazon S3, change history will be lost. Also, using an eventing mechanism could introduce security challenges that we discuss next.
  • Securing Backstage could be a challenge. For Backstage to proactively receive updates on entity changes, it would work best to configure event hooks to provide callbacks to Backstage when changes occur. Backstage, being the entry point for user workflows, sits on the critical path of platform operations. As such, platform engineers need to solve a chicken-and-egg problem: deciding how to expose Backstage endpoints to receive events while limiting access for security reasons. The authentication methods that GitHub supports may not satisfy the security standards that an organization requires.
  • Changes to entities may not be trivial. DevOps engineers need to manage entities that they may not control. For example, if a new mandatory field is introduced to a catalog file, DevOps engineers may need to talk to the respective repository owners, create PRs, and then get approvals across all affected repositories.

backstage-as-source-of-truth

Conclusion

We discussed multiple approaches to creating a reliable representation of system entities in developer portals. We do not necessarily recommend one approach over another, but it is important to find the right approach given the patterns and practices in your organization. It is also worth noting that you can choose to combine multiple approaches depending on the requirements of your teams. For example, while continuous integration can still be used to construct the actual state of the world by collecting status data and other related information, Backstage extensions can be introduced to expand on entity relations, providing a better representation of a system. Stating the obvious here, but proper selection of the patterns that work for you will go a long way toward increasing your overall team velocity down the road.

Reach out on the #cnoe-interest CNCF Slack channel to share thoughts and get involved in developing CNOE.


One post tagged with "argo"

View All Tags

· 18 min read
Andrew Lee
Vikram Sethi

Introduction

In our earlier blog posts, we have discussed scalability tests for Argo CD, where in two consecutive experiments, we pushed the limits of Argo CD to deploy 10,000 applications on ~100 clusters and then 50,000 applications on 500 clusters along with configuration and fine-tuning required to make Argo CD scale effectively. Argo CD deployments, however, do not happen in isolation, and similar to a CNOE stack, Argo CD is often deployed on a cluster along with other tooling which collectively contribute to the performance and scalability bottlenecks we see users run into.

Argo Workflows is one common tool we often see users deploy alongside Argo CD to enable workflow executions (e.g. building images, running tests, cutting releases, etc). Our early experiments with Argo Workflows revealed that, if not tuned properly, it can negatively impact the scalability of a given Kubernetes cluster, particularly if the Kubernetes cluster happens to be the control cluster managing developer workflows across a large group of users. A real world example of some of the scaling challenges you can encounter with Argo Workflows is explored in our recent ArgoCon talk: Key Takeaways from Scaling Adobe's CI/CD Solution to Support 50K Argo CD Apps.

For us to better understand the limitations and tuning requirements for Argo Workflows, in this blog post we publish details on the scalability experiments we ran for Argo Workflows executing Workflows in two different load patterns: increasing rate up to 2100 workflows/min and queued reconciliation of 5000 workflows on an Amazon EKS cluster with 50x m5.large nodes. We show the correlation between the various Argo Workflow's knobs and controls and the processing time as well as performance improvements you can get by determining how you supply the workflows to the control plane.

Test Parameters

Test Workflow

The test workflow is based on the lightweight whalesay container from docker which prints out some text and ASCII art to the terminal. The reason we chose a lightweight container is that we wanted to stress the Argo Workflows controller in managing the Workflow lifecycle (pod creation, scheduling, and cleanup) and minimize the extra overhead on the Kubernetes control plane in dealing with the data plane workloads. An example of the Workflow is below:

// Imports for the aliases used below (paths assume Argo Workflows v3 and the standard Kubernetes API packages).
import (
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	wfv1 "github.com/argoproj/argo-workflows/v3/pkg/apis/workflow/v1alpha1"
)

// helloWorldWorkflow is the test Workflow: a single whalesay container whose pod
// is garbage-collected as soon as it succeeds.
var helloWorldWorkflow = wfv1.Workflow{
	ObjectMeta: metav1.ObjectMeta{
		GenerateName: "hello-world-",
	},
	Spec: wfv1.WorkflowSpec{
		Entrypoint:         "whalesay",
		ServiceAccountName: "argo",
		Templates: []wfv1.Template{
			{
				Name: "whalesay",
				Container: &corev1.Container{
					Image:   "docker/whalesay:latest",
					Command: []string{"cowsay", "hello world"},
				},
			},
		},
		PodGC: &wfv1.PodGC{
			Strategy: "OnPodSuccess",
		},
	},
}
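For readers who submit Workflows as manifests rather than through the Go client, the equivalent YAML custom resource is sketched below. It mirrors the struct above and assumes the same "argo" service account; adjust names to your environment.

apiVersion: argoproj.io/v1alpha1
kind: Workflow
metadata:
  generateName: hello-world-
spec:
  entrypoint: whalesay
  serviceAccountName: argo
  templates:
    - name: whalesay
      container:
        image: docker/whalesay:latest
        command: ["cowsay", "hello world"]
  podGC:
    strategy: OnPodSuccess   # delete the pod as soon as it succeeds, matching the Go example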

Argo Workflows Settings

We will be detailing how each of these settings affects Argo Workflows in various experiments later in this blog post. A sketch of how these flags might be passed to the controller is shown after the list.

  • Controller workers: The Argo Workflows controller utilizes different workers for various operations in a Workflow lifecycle. We will be looking at two types of workers for our scalability testing.

    • workflow-workers (default: 32): These workers are threads in a single Argo Workflows controller that reconcile Argo Workflow Custom Resources (CRs). When a Workflow is created, a workflow-worker will handle the end-to-end operations of the Workflow from ensuring the pod is scheduled to ensuring the pod has finished. The number of workers can be specified by passing the --workflow-workers flag to the controller.

    • pod-cleanup-workers (default: 4): These workers clean up finished Workflows. When a Workflow has finished executing, depending on your clean-up settings, a pod-cleanup-worker will handle cleaning up the pod from the Workflow. The number of workers can be specified by passing the --pod-cleanup-workers flag to the controller.

  • Client queries per second (QPS)/Burst QPS settings (default: 20/30): These settings control when the Argo Workflows controller’s Kubernetes (K8s) client starts to throttle requests to the K8S API server. The client QPS setting is for limiting sustained QPS for the k8s client while burst QPS is for allowing a burst request rate in excess of the client QPS for a short period of time. The client QPS/burst QPS can be set by passing the --qps and --burst flag to the controller.

  • Sharding: Sharding with multiple Argo Workflows controllers is possible by running each controller in its own namespace. The controller then only reconciles Workflows submitted in that particular namespace. This namespaced mode is enabled by passing the --namespaced flag to the controller.
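As referenced above, here is a minimal sketch of how these flags might be wired into the workflow-controller Deployment. The Deployment name, namespace, and container name assume a default install into an "argo" namespace; the values shown are illustrative, not recommendations.

apiVersion: apps/v1
kind: Deployment
metadata:
  name: workflow-controller
  namespace: argo
spec:
  template:
    spec:
      containers:
        - name: workflow-controller
          args:
            - --workflow-workers=128    # reconciliation threads (default 32)
            - --pod-cleanup-workers=32  # pod cleanup threads (default 4)
            - --qps=50                  # sustained K8s client QPS (default 20)
            - --burst=60                # short-lived burst above the sustained QPS (default 30)
            - --namespaced              # only reconcile Workflows in the controller's own namespace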

Key Metrics

We chose a set of key metrics for the scalability testing because we wanted to measure how many workflows the Argo Workflows controller can reconcile and process. We will also be looking into K8s control plane metrics which might indicate your control plane cannot keep up with the Argo Workflows workload. 

  • Workqueue depth: The workqueue depth shows workflows which have not been reconciled. If the depth starts to increase, it indicates that the Argo Workflows controller is unable to handle the submission rate of Workflows.

  • Workqueue latency: The workqueue latency is the average time workflows spent waiting in the workqueue. A lower value indicates that the Argo Workflows controller is processing workflows faster so that they are not waiting in the workqueue.

  • K8S api server requests per second: The read and write requests per second being made to the K8S api server.

We didn’t include CPU/memory as a key metric because during our testing we did not see any significant impact on either, most likely because of the simplistic workflows utilized for this benchmark.

Environment

We ran the experiments in an AWS environment utilizing a single Amazon EKS cluster. The Kubernetes version is 1.27 and the Argo Workflows version is 3.5.4. No resource quotas were set on the Argo Workflows controller. For the cluster, we start by provisioning 1x m5.8xlarge Amazon Elastic Compute Cloud (Amazon EC2) instance, which runs the Argo Workflows controller, and 50x m5.large instances for executing workflows. The number of execution instances is sufficient to run all 5000 workflows in parallel, ensuring that pods are not waiting on resources to execute. Monitoring and metrics for Argo Workflows were provided by Prometheus/Grafana.

Methodology

There will be two types of load patterns evaluated:

Increasing Rate Test: Workflows will be submitted at an increasing rate (workflows/min) until the Argo Workflows controller cannot keep up. The state at which the controller cannot keep up is when there are >0 workflows in the workflow queue or there is increasing queue latency. That rate of Workflow submissions will be noted as the maximum rate at which the Argo Workflows can be processed with the current settings.

Queued Reconciliation Test: 5000 workflows are submitted in less than a minute. Metrics will be monitored from when the Argo Workflows controller starts processing workflows to when it has reconciled all 5000 workflows. The number of nodes is sufficient for running all the workflows simultaneously.

Experiments

Experiment 1: Baseline

In our baseline experiment, we are running in a single Argo Workflows shard (namespace) with default settings.

Increasing Rate Test:

As you can see below, the Argo Workflows controller can process up to 270 workflows/min. The average workqueue latency and workqueue depth are nearly zero. At 300 workflows/min, workqueue latency and workqueue depth start to increase.


Queued Reconciliation Test:

It takes around 17 mins to reconcile 5000 workflows and peak avg workqueue latency was 5.38 minutes.


Experiment 2: Workflow Workers

For this experiment, we increase the number of workflow workers from the default of 32 to 128 where the workers use the maximum QPS and burst settings available to them. We also had to increase the number of pod-cleanup-workers to 32 as the Argo Workflows controller was experiencing some instability, where the controller pod was consistently crashing with the default value of 4.

Increasing Rate Test:

For the increasing workflow rate test, we can see exactly when the number of workflow workers is not sufficient to process the load. Both workqueue latency and depth start to increase indicating that workflows are waiting to be reconciled. When we increase the number of workers, the controller is able to reconcile the current load until an additional load is placed on it. For 32 workers, that limit is 300 workflows/min. When we increase the number of workers to 64, it is able to process that load until load is increased to 330 workflows/min. Then we increase the number of workers to 96 and it can process the additional load again. When we increase to 360 workflows/min, we need to bump the number of workers to 128.

Workers | Max workflows/minute
32      | 270
64      | 300
96      | 330
128     | 360


For the K8S api server, we see sustained 180 writes/sec and 70 reads/sec during the increasing rate tests.


Queued Reconciliation Test:

For the queued reconciliation test, the time it took to reconcile all the workflows did not change significantly. With 32 workers it took 17 mins to reconcile while with 96 workers it took 16 mins. The peak workqueue latency did decrease from 5.38 mins with 32 workers to 3.19 mins with 96 workers. With 128 workers, the Argo Workflows controller kept crashing.

Workers | Peak avg latency (mins) | Reconcile time (mins)
32      | 5.38                    | 17
64      | 5.06                    | 18
96      | 3.19                    | 16
128     | N/A                     | N/A


For the K8S api server, we see peaks of up to 260 writes/sec and 90 reads/sec during the queued reconciliation tests. You notice for the last test that there is no K8S api server activity as the Argo Workflows controller was misbehaving due to client-side throttling.


Observations from Experiment 2:

Workers play a big part in how fast the Argo Workflows controller is able to reconcile the rate of workflows being submitted. If you are observing increasing workqueue latency and a backed-up workqueue depth, changing the number of workers is a potential way to improve performance. There are a few observations that we want to call out. One is that if we compare the two different patterns, one where we submit workflows at a constant rate and one in which we load up the workqueue all at once, we can see variations in calculated throughput. We can actually calculate the time it takes to reconcile 5000 workflows utilizing the increasing rate test results and compare it to the queued reconciliation test.

Workers | Increasing rate test: time to reconcile 5000 workflows (mins) | Queued all at once: reconcile time (mins)
32      | 18.5                                                           | 17
64      | 16.6                                                           | 18
96      | 15.1                                                           | 16
128     | 13.8                                                           | N/A

We do get some conflicting results when we make this comparison. With 32 and 64 workers, the increasing rate test is actually slower than the queued reconciliation test. But if we increase to 96 workers, we can see that the increasing rate test results are faster. We were unable to compare with 128 workers as the Argo Workflows controller crashed when trying to run the queued reconciliation test. When investigating the cause of the crash, the logs have several messages like the following:

Waited for 6.185558715s due to client-side throttling, not priority and fairness, request: DELETE:https://10.100.0.1:443/api/v1/namespaces/argoworkflows1/pods/hello-world-57cfda8a-dc8b-4854-83a0-05785fb25e4b-3gwthk

These messages indicate that we should increase the Client QPS settings which we will evaluate in the next experiment.

Experiment 3: Client QPS Settings

For this experiment, we set the number of workflow workers back to the default of 32. We will then increase the QPS/Burst by increments of 10/10, from 20/30 to 50/60. We chose to only increase by 10/10 because any large increase past 50/60 did not yield any performance improvements. We believe that this is partly because we kept the workers at 32.

Initial Testing

Increasing Rate Test:

The QPS/Burst settings had a significant impact on the increasing rate test. By increasing the QPS/Burst from 20/30 to 30/40, we see ~50% improvement in max workflows/min from 270 to 420. When we increase the QPS/Burst from 30/40 to 40/50, we see another 28% improvement in max workflows/min from 420 to 540. When increasing from 40/50 to 50/60 there was only an additional 5% improvement. For 32 workers, increasing past 50/60 did not yield any significant improvements to the max workflows/min.

QPS/Burst | Max workflows/minute
20/30     | 270
30/40     | 420
40/50     | 540
50/60     | 570


When changing QPS/Burst, we need to also monitor the K8S API server. Looking at the K8S API server req/s, we see sustained 390 writes/sec and 85 reads/sec.


Queued Reconciliation Test:

Again, the QPS/Burst settings make a big difference in the queued reconciliation test when compared to just changing the workflow workers. Starting from the default settings of 20/30, we see decreasing reconcile times from 19 mins to 12 mins to 8 mins and finally to 6 mins when setting the QPS/Burst to 50/60. The peak average latency also decreased from 4.79 mins to 1.94 mins. We did note that there was a higher peak avg latency with 30/40 vs 20/30 but if you examine the graph you can see a steeper drop in latency accounting for the shorter reconcile time. Similar to the increasing rate test, increasing the QPS/Burst further did not yield any improvements.

QPS/Burst | Peak avg latency (mins) | Reconcile time (mins)
20/30     | 4.79                    | 19
30/40     | 5.66                    | 12
40/50     | 2.98                    | 8
50/60     | 1.94                    | 6


When looking at the K8S API server, we see peaks of up to 700 writes/sec and 200 reads/sec during the tests.


When compared to the workflow workers testing, you can see increasing the QPS/Burst is able to push the K8S API server and improve Argo Workflows overall performance. We do see some diminishing returns when increasing QPS/Burst past 50/60 even though it appears that the K8S API server has plenty of capacity for additional load. For the next test, we will increase both the workflow workers with the QPS/burst to see how far we can push Argo Workflows and the K8s API server.

Max Load Test

Queued Reconciliation Test:

We increased the number of workers to 128 and QPS/burst to 60/70 and observed peak average latency of 54 secs and a reconciliation time of 5 mins. Increasing either the workers or QPS/Burst did not improve these numbers.


Looking at the K8s API server, we saw peaks of 800 writes/sec and 190 reads/sec.


Increasing Rate Test:

Starting with 128 workers and QPS/Burst of 60/70, we were able to push Argo Workflows to 810 workflows/min. But past that point, there were no improvements with more workers or increased QPS/Burst limits.


We can see increased K8s API server activity with sustained 700 writes/sec and 160 reads/sec.


Observations from Experiment 3

One observation we made in the previous experiment with workflow workers is that the two different patterns of submitting workflows can be compared. We made that comparison again with the QPS/Burst tests and saw the following results:

QPS/Burst | Workers | Increasing rate test: time to reconcile 5000 workflows (mins) | Queued all at once: reconcile time (mins)
20/30     | 32      | 18.5                                                           | 19
30/40     | 32      | 11.9                                                           | 12
50/60     | 32      | 9.2                                                            | 8
60/70     | 32      | 8.7                                                            | 6
70/80     | 128     | 6.1                                                            | 5

When we combine the comparison data from Experiment 2 with the data above, we can see a slight improvement from submitting all workflows together vs staggering them. We are not sure why this is the case, and more experiments are required to understand this behavior.

It seems that we have hit a wall with 128 workers and a QPS/burst of 60/70 for a single Argo Workflows Controller. We will now evaluate Sharding and see if we can improve our performance from this point.

Experiment 4: Sharding

For this experiment, we will evaluate 1 shard, 2 shards, and 5 shards of the Argo Workflows controller with the default settings. We will then try for a maximum load test utilizing workflow workers, QPS/burst, and sharding to see the maximum performance on our current infrastructure.

Initial Testing

Increasing Rate Test:

Sharding the Argo Workflows controller has a linear impact on performance with the increasing rate test. By increasing the number of shards from 1 to 2, we see a 100% improvement in max workflows/min from 270 to 540. When we increase the shards from 2 to 5, we see an additional 150% improvement in max workflows/min from 540 to 1350.

Shards | Max workflows/min
1      | 270
2      | 540
5      | 1350

One thing to note is that the submission rate of each shard is increased by 30 workflows/min at each step. This means that the difference between two consecutive rates with 2 shards is 2 x 30 = 60 workflows/min, and with 5 shards it is 5 x 30 = 150 workflows/min. That is why, for 2 shards, when the max load was determined at 600 workflows/min, we go down one rate step, which is 600 - 60 = 540 workflows/min.


You can see a significant impact on the K8s API server with sustained 1400 writes/sec and 300 reads/sec.


Queued Reconciliation Test:

As shown in the Increasing Rate Test, sharding has a huge impact on performance for the queued reconciliation test. With 1 shard it takes 18 mins to reconcile 5000 workflows, while with 2 shards it takes 9 mins. With 5 shards the reconcile time is further reduced to 4 mins.

Shards | Peak avg latency (mins) | Reconcile time (mins)
1      | 5.43                    | 18
2      | 3.81                    | 9
5      | 1.42                    | 4


The impact on the K8s API server was not as significant when compared to previous experiments.

Max Load Test

Increasing Rate Test:

When increasing the workflow workers to 128, QPS/burst to 60/70 and shards to 5, the Argo Workflows controller is able to process up to 2100 workflows/min. Any higher than this seems to run into K8s API Priority and Fairness (APF) limits.


When looking at the K8s API server, we are seeing significant impact with peaks of 1500 writes/sec and 350 reads/sec.


When investigating why we are unable to push higher on the K8s API server, we see that APF limits are coming into effect by looking at the apiserver_flowcontrol_current_inqueue_requests. This metric shows the number of requests waiting in the APF flowcontrol queue.


Queued Reconciliation Test:

With the max load settings, we observed that the peak workqueue latency is only 20 seconds and the reconcile time is 2 minutes.


The impact on K8s API server is actually less than the previous max load queued reconciliation tests.


Observations from Experiment 4

As we did in previous experiments, we again make the comparison between the two different load patterns:

Shards              | Increasing rate test: time to reconcile 5000 workflows (mins) | Queued all at once: reconcile time (mins)
1                   | 18.5                                                           | 18
2                   | 9.2                                                            | 9
5                   | 3.7                                                            | 4
Max load (5 shards) | 2.3                                                            | 2

In general, it appears that submitting all workflows at once performs slightly better than submitting workflows at a steady rate. More experiments will need to be done to further investigate this behavior.

Conclusion

In this blog post we discussed our initial efforts in documenting and understanding the scaling characteristics of the Argo Workflows controller. Our findings show that the existing mechanisms for increasing workflow workers, increasing client and burst QPS settings and sharding the controller can help Argo Workflows scale better. Another interesting observation is that we saw differences in performance with how you submit your workflows. For the next set of experiments, we plan to evaluate more environmental variables and different types of workflows: multi-step and/or long running. Stay tuned for the report on our next round of experiments and reach out on the CNCF #argo-sig-scalability Slack channel to get help optimizing for your use-cases and scenarios.

diff --git a/blog/tags/argocd.html b/blog/tags/argocd.html

One post tagged with "argocd"


· 21 min read
Andrew Lee
Michael Crenshaw
Gaurav Dhamija

Introduction

In Part 1 of our Argo CD benchmarking blog post, we analyzed the impacts of various Argo CD configuration parameters on the performance of Argo CD. In particular, we measured the impact of status and operation processors, client QPS, burst QPS, and sharding algorithms on the overall synchronization and reconciliation behavior in Argo CD. We showed that using the right configuration and sharding strategy, particularly by properly setting client and burst QPS, as well as by splitting the workload across multiple workload clusters using Argo CD sharding, overall sync time can be improved by a factor of 4.

Here, and in Part 2 of our scalability work, we push our scalability experiments for Argo CD further. In particular, among other tests, we run our scalability metrics against a maximum of 500 workload clusters, deploying 50,000 Argo applications. This is, to the best of our knowledge, the largest scalability test ever done for Argo CD. We also report on a much deeper set of sharding experiments, utilizing different sharding algorithms for distribution of load across 100 workload clusters. While we report on running our experiments against a legacy sharding algorithm and a round robin algorithm that already exist in Argo CD 2.8, we also discuss results of workload distribution using 3 new sharding algorithms we developed in collaboration with RedHat, namely: a greedy minimum algorithm, a weighted ring hash algorithm, and a consistent hash with bounded loads algorithm. We show that, depending on the optimization goals one has in mind, choosing from the new sharding algorithms can improve CPU utilization by a factor of 3 and reduce application-to-shard rebalancing by a factor of 5, significantly improving the performance of a highly distributed and massively scaled Argo CD deployment.

Experiment 1: How Client QPS/Burst QPS affects the Kubernetes API Server

Objective:

The objective of the first experiment is to understand the impact of the QPS & Burst Rate parameters on (1) the Kubernetes control plane, for both the Argo CD cluster and the remote application clusters, and (2) the overall sync duration for Argo CD applications. To understand the impact on the Kubernetes API server, we observed the following control plane metrics:

  • Latency (apiserver_request_duration_seconds_bucket)
  • Throughput (apiserver_request_total)
  • Error Rate (apiserver_request_total{code=~"[45].."}) for any request returning an error code 4xx or 5xx.

To analyze the impact on application synchronization, we observed the Sync Duration and No. of Goroutines metrics from the Argo CD server.

Test Infrastructure:

In terms of test infrastructure and workload configuration, we had one central Amazon EKS cluster with the Argo CD server running on it. This central cluster connected to three remote Amazon EKS clusters, each hosting 5000 Argo CD applications. Each application is a ConfigMap (2KB) provisioned in a dedicated namespace. All four clusters, one central and three remote, had a dedicated monitoring stack composed of Prometheus and Grafana installed on them.
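To make the workload concrete, a hypothetical sketch of one of these Applications is shown below: a git path containing a single 2KB ConfigMap, synced into a dedicated namespace on a remote cluster. The repository URL, path, and cluster address are placeholders, not the values used in the experiment.

apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  name: configmap-app-0001
  namespace: argocd
spec:
  project: default
  source:
    repoURL: https://example.com/argocd-scalability-test.git   # placeholder repository
    targetRevision: HEAD
    path: apps/configmap-app-0001                              # directory holding the 2KB ConfigMap
  destination:
    server: https://remote-cluster-1.example.com               # placeholder remote cluster endpoint
    namespace: configmap-app-0001                              # dedicated namespace per application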

Observations:

Observation 1 - Impact on Argo CD application synchronization

The table and graphs below highlight the impact of QPS & Burst Rate on “Sync Duration” as well as the average and maximum no. of goroutines active during the test run.

QPS | Burst Rate | Sync Duration | No. of Goroutines (Avg) | No. of Goroutines (Max)
50  | 100        | 61.5 mins     | 1760                    | 1810
100 | 200        | 29.5 mins     | 2120                    | 2310
150 | 300        | 19.0 mins     | 2520                    | 2760
200 | 400        | 18.0 mins     | 2620                    | 2780
250 | 500        | 17.5 mins     | 2590                    | 2760
300 | 600        | 18.0 mins     | 2540                    | 2760


To summarize, during the test we immediately observed a ~52% reduction in sync duration (from 61.5 mins to 29.5 mins) as we increased QPS & Burst Rate from the default values to 100 & 200 respectively. This also correlated with a corresponding increase in the no. of Goroutines processing application synchronization requests. Increasing these parameters further provided diminishing returns with subsequent runs; beyond a QPS & Burst Rate of 150 & 300 respectively, there wasn’t a measurable improvement. This again correlated with the number of Goroutines actively processing sync requests.

Observation 2 - Impact on central Amazon EKS cluster control plane hosting Argo CD Server

The table and graphs below highlight the impact of QPS & Burst Rate on throughput and latency for the Amazon EKS control plane hosting the Argo CD server. We can observe an increase in the request rate per second to the Kubernetes control plane, which is in line with the previous observations related to the increase in the no. of goroutines processing the sync requests. The increased activity related to sync operations translates into increased requests to the Amazon EKS control plane, tapering off at a QPS of 150 and Burst Rate of 300. Additional increases in the QPS and Burst Rate parameters don’t noticeably impact the request rate per second.

QPS | Burst Rate | Request Rate (Max) | Latency p50 (Max) | Latency p90 (Max)
50  | 100        | 27.2 rps           | 13.0 ms           | 22.6 ms
100 | 200        | 31.9 rps           | 13.3 ms           | 23.1 ms
150 | 300        | 39.8 rps           | 14.3 ms           | 24.0 ms
200 | 400        | 41.4 rps           | 14.9 ms           | 24.4 ms
250 | 500        | 39.0 rps           | 15.1 ms           | 24.4 ms
300 | 600        | 40.7 rps           | 16.4 ms           | 34.5 ms

From a latency perspective, overall during the course of testing, average (p50) duration remained within range of 13 to 16.5 ms and p90 latency within 22 ms to 34 ms. The error rate remained consistently around ~0.22% with a brief spike to ~0.25% (increase of ~0.03%).

The relatively low latency numbers and low error rate (<0.25%) indicate that the Amazon EKS control plane was able to handle the load comfortably. Increasing QPS and Burst Rate stretched the control plane only to a limited extent, indicating it still has resources to process additional requests as long as the Argo CD server can generate request traffic.


Observation 3 - Impact on remote Amazon EKS cluster control plane hosting applications

We had similar observations regarding latency, throughput and error rate for Amazon EKS control plane of remote application clusters. These are the clusters hosting ~5000 Argo CD applications each and connected to Argo CD Server on the central Amazon EKS cluster. The throughput peaked at ~35 requests per second with QPS and burst rate of 150 & 300 respectively. From an average latency perspective, it remained consistently within single digit millisecond hovering around ~5ms.


Experiment 2: Revisiting Status/Operation Processors

Objective:

The objective of the second experiment is to explore why status/operation processors did not have an effect on sync times in our previous experiments. It is possible that the simple nature of the ConfigMap applications, which take <1s to deploy, is causing this behavior. Most real-world applications consist of tens to hundreds of resources and take longer to deploy. During this experiment, we will simulate a more complex application which takes longer to deploy than the original ConfigMap application.

Test Infrastructure:

Central Argo CD cluster running on a single m5.2xlarge managing 100 application clusters. In order to simulate larger applications, each application will execute a PreSync job which waits 10 seconds before deploying the original ConfigMap application.

Example of the PreSync Job:

apiVersion: batch/v1
kind: Job
metadata:
  name: before
  annotations:
    argocd.argoproj.io/hook: PreSync
    argocd.argoproj.io/hook-delete-policy: HookSucceeded
spec:
  template:
    spec:
      containers:
      - name: sleep
        image: alpine:latest
        command: ["sleep", "10"]
      restartPolicy: Never
  backoffLimit: 0

Observations:

Observation 1 - Syncing never finishes and requires a restart of the application controller to continue syncing

The screenshot below shows that from the start of the sync test at 17:02 till around 17:41, the sync process was deadlocked. We observed no changes to synced apps and the app_operation_processing_queue was pinned at 10k operations.


Looking at the Argo CD console for a single application we see that the PreSync job finished 17 mins ago, but the application stayed in the Syncing phase.


Observation 2: There is a link between client QPS/burst QPS and operation/status processor settings

In order to fix the sync freezing issue, we increased the client QPS/burst QPS from the default 50/100 to 100/200. After the change we were able to collect data on operation/status processor settings.

Operation/status processors | Sync time
25/50                       | 45 mins
50/100                      | 30 mins

We can see that there is a link between status/operation processors and client QPS/burst QPS settings. Changing one or the other could be required to improve sync times and Argo CD performance depending on your environment. Our recommendation is to first change the status/operation processor settings. If you run into Argo CD locking up or the performance not increasing further, and you have sufficient resources, you can try increasing the client QPS/burst QPS. But as mentioned in the first experiment, ensure you are monitoring the k8s api-server.
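For reference, the knobs discussed in these two experiments can typically be set as follows in a standard Argo CD install: the processor counts via the argocd-cmd-params-cm ConfigMap and the client QPS/burst QPS via environment variables on the application controller. This is a hedged sketch mirroring the 50/100 processors and 100/200 QPS/burst values used above; verify the key and variable names against the Argo CD version you run.

apiVersion: v1
kind: ConfigMap
metadata:
  name: argocd-cmd-params-cm
  namespace: argocd
data:
  controller.operation.processors: "50"
  controller.status.processors: "100"
---
# Partial patch for the application controller workload (a StatefulSet in most installs)
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: argocd-application-controller
  namespace: argocd
spec:
  template:
    spec:
      containers:
        - name: argocd-application-controller
          env:
            - name: ARGOCD_K8S_CLIENT_QPS
              value: "100"
            - name: ARGOCD_K8S_CLIENT_BURST
              value: "200"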

Experiment 3: Cluster Scaling

Objective:

The following experiment is designed to test the compute demands of the Argo CD app controller when managing more than 100 application clusters.

Test Infrastructure:

Central Argo CD cluster with 10 app controller shards running on a single m5.2xlarge node managing 100/250/500 application clusters and 10k 2KB ConfigMap applications.

Observations:

From earlier experiments, we can see that when managing 100 clusters, we are close to the limit of a single m5.2xlarge node. As we push further to 250/500 clusters, we have two observations. The first observation is that the graph data is less smooth than in the sync test of 100 clusters. This can indicate that Prometheus is running out of compute as Argo CD is consuming most of it. Please note that we are not using any resource limits/requests in our experiments. If proper resource limits/requests are set, most likely we would only see performance issues with Argo CD and not Prometheus when operating at the limit of your compute resources. The second observation is that on both the 250/500 cluster tests, there is some drop-off in metric data. For the 250 cluster test, there is a blip at the 16:16 mark for Memory Usage. For the 500 cluster test there are blips in data at the 21:05 mark for the Workqueue depth, CPU usage, and Memory usage. In spite of these observations, the sync process completes in a reasonable time.

Clusters | Sync time
100      | 9 mins
250      | 9 mins
500      | 11 mins
From this experiment, you can see that as you approach the limit of your compute resources, Argo CD and other applications running in your k8s environment could experience issues. It is recommended that you set proper resource limits/requests for your monitoring stack to ensure you have insights into what could be causing your performance issues.

Experiment 4: Application Scaling

Objective:

This experiment is meant to push the Argo CD app controller beyond 10k applications. As the previous rounds of experiments were performed with 10k apps, the intention of these experiments is to scale the Argo CD app controller up to 50k apps.

Test Infrastructure:

We will be performing this experiment on a central Argo CD cluster with 10 app controller shards and 500 downstream application clusters. As we scale up to 10k, 15k, 20k, 25k, 30k, and 50k 2KB ConfigMap applications, we will add additional m5.2xlarge node(s) to the Argo CD cluster.

Observations:

For the sync test at 15k applications with a single m5.2xlarge, you can see blips in the data indicating unhealthy behavior on the cluster. CPU and memory usage is near 100% utilization of the 8 vCPUs and 30 GB of memory. After adding another node, for a total of two m5.2xlarge instances, we were able to perform a sync in 9 mins.

After adding another node, we were able to continue our application scaling tests. You can see in the graphs below that syncing 20k and 25k apps was not a problem. The sync test of 30k apps shown on the third graph shows some blips in data, indicating that we are at the limits of two nodes.

Apps  | Sync time
20000 | 12 mins
25000 | 11 mins
30000 | 19 mins

For the final test in this experiment, we pushed the cluster to sync 50k apps.

While the cluster was able to manage reconciliation for the 50k apps, as shown by a stable Sync Status graph from 8:40, when we start the sync at the 9:02 mark you can see unhealthy behavior in the graph data. Examining the CPU/memory usage, you can see 100% CPU utilization across the cluster. After scaling the cluster to three m5.2xlarge nodes, we were able to perform a sync in 22 mins.

From the scaling tests, we can see that the Argo CD app controller scales effectively by adding compute resources as we increase the number of applications to sync.

Experiment 5: How Many Shards?

Objective:

In previous experiments, we utilized ten app controller shards running across multiple nodes. In this experiment, we will explore how the number of app controller shards affects performance.

Test Infrastructure:

Central Argo CD cluster with 3, 6, 9 app controller shards running on 3 m5.2xlarge node(s) managing 500 application clusters and 50k 2KB ConfigMap applications.
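A sketch of how a shard count like this might be configured: scale the application controller StatefulSet and tell each replica how many shards exist. The names below assume the default HA manifests, and ARGOCD_CONTROLLER_REPLICAS is assumed to be the variable your Argo CD version uses to derive shard assignments; adjust as needed.

apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: argocd-application-controller
  namespace: argocd
spec:
  replicas: 9                            # number of app controller shards
  template:
    spec:
      containers:
        - name: argocd-application-controller
          env:
            - name: ARGOCD_CONTROLLER_REPLICAS
              value: "9"                 # should match spec.replicas so clusters map to a valid shard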

Observations:

For the baseline of three shards it took 75 mins to perform a sync. Adding additional shards saw further improvements with a sync time of 37 mins for six shards and a sync time of 21 mins for nine shards. Further increasing shards beyond nine did not yield any improvements.

Shards | Sync time
3      | 75 mins
6      | 37 mins
9      | 21 mins

Looking at the CPU and Memory utilization, you can see that adding shards can improve performance only if there are free resources to consume. With the baseline of three shards, the CPU utilization of the nodes is well below the eight vCPUs that each node is allocated. As we add more shards, we can see CPU utilization increasing until we are close to 100% CPU utilization with nine shards. Adding any more shards would not yield any performance benefits unless we add more nodes.


From the experiments, the Argo CD app controller sharding mechanism is able to scale as you add more compute resources. Sharding allows both horizontal and vertical scaling. As you add more shards, you can horizontally scale by adding more nodes or vertically scale by utilizing a larger node with more compute resources.

Experiment 6: Sharding Deep Dive

Objective:

With the release of Argo CD 2.8, a new sharding algorithm: round-robin was released. The existing legacy sharding algorithm performed a modulo of the number of replicas and the hash sum of the cluster id to determine the shard that should manage the cluster. This led to an imbalance in the number of clusters being managed by each shard. The new round-robin sharding algorithm is supposed to ensure an equal distribution of clusters being managed by each shard. We will also introduce 3 new algorithms: greedy minimum, weighted ring hash, and consistent hash with bounded loads. This experiment will evaluate all the algorithms on shard balance, application distribution and rebalancing on changes to the environment.
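For the two algorithms that ship with Argo CD 2.8, the selection is typically made through the argocd-cmd-params-cm ConfigMap; the key name below is assumed from the upstream configuration reference, and the three new algorithms evaluated later in this experiment are not selectable this way since they are not part of the 2.8 release.

apiVersion: v1
kind: ConfigMap
metadata:
  name: argocd-cmd-params-cm
  namespace: argocd
data:
  controller.sharding.algorithm: round-robin   # or "legacy" (the default)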

Test Infrastructure:

Central Argo CD cluster with 10 app controller shards running on 1 m5.2xlarge node managing 100 application clusters and 10k 2KB ConfigMap applications.

Observations:

Note: For all the observations, we start the monitoring period when we see items in the operations queue. We end the monitoring period when all the applications are synced. We then look at the average CPU/Memory usage during the monitoring period.

Legacy

The graph below shows the CPU Usage/Memory Usage of the 10 different Argo CD App Controller shards. Looking at the averages, you can see a large variation in how much each shard is utilizing its resources. To make an accurate comparison between the different sharding methods, we calculate the variability by determining the range of the data for both avg CPU usage and Memory usage. The CPU usage variability is calculated by subtracting the CPU usage of the shard with the least usage from that of the shard with the highest usage: 0.55 - 0.23 = 0.32. The Memory usage variability is 452 MiB - 225 MiB = 227 MiB.

Variability:

CPU:0.32
Memory:227 MiB


Round-Robin

With the newly introduced Round-Robin algorithm, you can see improved balance across the shards.

Variability:

CPU:0.02
Memory:110 MiB


Better but not perfect

The new round-robin algorithm does a better job of keeping the number of clusters balanced across the shards. But in a real world environment, you would not have an equal number of applications running on each cluster and the work done by each shard is determined not by the number of clusters, but the number of applications. A new experiment was run which deploys a random number of applications to each cluster with the results below. Even with the round-robin algorithm, you can see some high variability in CPU/Memory usage.

Variability:

CPU:0.27
Memory:136 MiB


Greedy Minimum Algorithm, sharding by the Number of Apps

A new algorithm is introduced in order to shard by the number of applications that are running on each cluster. It utilizes a greedy minimum algorithm to always choose the shard with the least number of apps when assigning shards. A description of the algorithm is shown below:

Iterate through the cluster list:

1. Determine the number of applications per cluster.
2. Find the shard with the least number of applications.
3. Add the number of applications to the assigned shard.

The same experiment with a random number of applications running on each cluster is run again with the results shown below. With the new algorithm, there is better balance across the shards.

Variability:

CPU:0.06
Memory:109 MiB


While there is better balance when utilizing the greedy minimum algorithm, there is an issue when changing any aspect of the Argo CD sharding parameters. If you are adding shards, removing shards, adding clusters and/or removing clusters, the algorithm can trigger large scale changes in the shard assignments. Changes to the shard assignments cause shards to waste resources when switching to manage new clusters. This is especially true when utilizing ephemeral clusters in AI/ML training and big data operations where clusters come and go. Starting from the previous experiment from before, we changed the number of shards from 10 to 9 and observed over 75 cluster to shard assignment changes out of 100 clusters excluding the changes associated with the removed shard.

Weighted Ring Hash

In order to decrease the number of shard assignment changes, a well known method called consistent hashing is explored for our use case (Reference). Consistent hashing algorithms utilize a ring hash to determine distribution decisions. This method is already widely utilized by network load balancing applications to evenly distribute traffic in a distributed manner independent of the number of servers/nodes. By utilizing a ring hash algorithm to determine shard assignments, we were able to decrease the number of shard assignment changes when we changed the number of shards from 10 to 9. We observed 48 cluster to shard assignment changes, excluding the changes associated with the removed shard.


To ensure balance, weighting is applied at each shard assignment to ensure the shard with the least number of apps is given the highest weight when choosing shards for assignment. The balancing is not perfect as you can see that CPU variability has increased from the greedy minimum algorithm of 0.06 to 0.12.

Variability:

CPU:0.12
Memory:163 MiB

Consistent Hash with Bounded Loads

The ring hash algorithm was never designed to allow dynamically updating the weights based on load. While we were able to utilize it for this purpose, we looked at another algorithm called Consistent Hashing with Bounded Loads (Reference) which looks to solve the problem of consistent hashing and load uniformity. By utilizing this new algorithm, we were able to significantly decrease the redistribution of cluster to shard assignments. When we change the number of shards from 10 to 9, we only observed 15 cluster to shard assignment changes excluding the changes associated with the removed shard.


The trade off is slightly worse cluster/app balancing than the weighted ring hash which increased CPU variability from 0.12 to 0.17.

Variability:

CPU:0.17
Memory:131 MiB

There are no direct recommendations about which algorithm you should utilize, as each of them has its pros and cons. You should evaluate each for your environment, depending on whether you are looking for strict balancing of clusters/apps across the shards or whether you want to minimize the impact of making frequent changes to your Argo CD environment.

Conclusion

In this blog post, we continued our scalability tests of the Argo CD app controller by answering some questions we had from our first scalability tests about the common scalability parameters. We showed how QPS/Burst QPS affects the k8s api server, determined why status/operation processors did not affect our previous scalability tests, and how those parameters are linked together. We then continued our scalability tests by pushing the Argo CD app controller to 500 clusters and 50,000 apps. We ended our tests by showing that a key component of scaling the Argo CD app controller is how it performs sharding. By doing a deep dive into how the app controller performs sharding we also determined some ways to improve sharding by adding in and evaluating new sharding algorithms. We are currently evaluating how to contribute these changes back to Argo CD. Stay tuned for those contributions and reach out on the CNCF #argo-sig-scalability or the #cnoe-interest Slack channel to get help optimizing for your use-cases and scenarios.

diff --git a/blog/tags/backstage.html b/blog/tags/backstage.html

hydrated entities are often kept in a separate git repository that mirrors and expands entities in the original git repository with intended application specifications.

On the positive side:

  • This is a relatively simple approach and works for smaller teams with a smaller number of applications or systems
  • Having a second git repository to capture the end state of an entity stays closer to the core GitOps practices
  • Does not require significant modification to the developer portal

On the negative side:

  • There is inherent duplication of entity definitions
  • Adding custom metadata is not trivial for application teams, as it requires making changes to the integration workflow, thus bringing load and demand to the DevOps teams
  • There is less abstraction in place, as end application users are directly exposed to the YAML specification of the entities
  • Does not scale well as the number of systems and entities grows

ci-as-source-of-truth

Use a central control plane as the source of truth

The hub and spoke model is the most commonly advocated model when applying GitOps practices. Your control plane cluster runs and manages your platform tools: your CI, your CD, developer portal, infrastructure as code tooling, etc.

On the positive side:

  • There really is a single place to inspect the status of entities. E.g., Argo applications can tell you the status of deployed applications. You can also inspect the status of workflows, infrastructure resources, and any other entity that the control plane cluster manages.
  • You can use the Backstage Kubernetes plugin seamlessly and maybe with some little tweaks. Alternatively this can be achieved by introducing fairly light-weight Backstage custom entity providers which pull and show the status of entities in the Backstage portal.
  • In an organization with a diverse set of distributed systems, the control plane cluster can be used as the integration layer by wrapping legacy APIs and or implementing native controllers.

On the negative side:

  • Most organizations do not have a central control plane and adopting one as the source of truth is often a significant change, especially if an organization is early in their GitOps transition.
  • For organizations deep into a federated model of operation with different teams running and managing their platforms separately and rather independently, it could be challenging to offer a single control plane that aggregates data across all teams.
  • Management of change could become cumbersome. Existence of a single control plane could create bottlenecks where changes occur to a set of entities or practices. Changes in organizations or systems may result in changes to various entities managed across several teams. Bringing GitOps practices to the mix, this requires chains of approvals to happen across multiple entities and across several repositories for deployments to start flowing. Depending on the size of the organization, this could lead to organizational nightmares.
  • You may need to jump through a few hoops before getting from the representation of the application, to the actual deployment of it, e.g., going from git to your continuous delivery and from there to your target cluster.

controlplane-as-source-of-truth

Use Backstage as the source of truth

Where control planes and compute workloads are scattered, the unifying layer lies in the developer portal, i.e. Backstage. Hence, it is reasonable to construct an entity by collecting and aggregating data from various data sources, each providing partial data on the entity, making Backstage the source of truth. This generally starts with Backstage querying git for the entities that exist, then using the identifiers of those entities to collect metadata on how each entity contributes to a system. This could involve querying the control plane clusters and the workload clusters via custom entity providers that look for certain information, then putting the collected pieces together to come close to the core promise of a developer portal: providing reliable information on the entities.
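As a point of reference, the git side of this flow usually begins with a catalog file like the minimal, hypothetical example below; Backstage discovers it, and entity providers and plugins (for example, via a Kubernetes annotation such as the one shown) then enrich the entity with live status from the clusters.

apiVersion: backstage.io/v1alpha1
kind: Component
metadata:
  name: payments-service                         # hypothetical component name
  annotations:
    backstage.io/kubernetes-id: payments-service # used by the Kubernetes plugin to find workloads
spec:
  type: service
  lifecycle: production
  owner: team-payments                           # hypothetical owning team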

On the positive side:

  • This model copes better with legacy systems
  • Users are not exposed to, and oftentimes not even aware of, the underlying platforms; hence the underlying platform and tooling are more rigorously abstracted away
  • Changes to the system are only isolated to the entities of the particular system as managed by the underlying resources and platform. This causes less chaos when definitions, metadata, or properties of entities need to change.

On the negative side:

  • The git service may not be able to scale, technically or financially. This is particularly because Backstage may hit the git service endpoints too frequently and exceed the API limits. This could cause delays in displaying data for end users, or display wrong information if partially available data is mishandled. This can be mitigated via approaches like using an eventing mechanism to notify Backstage of changes, or by storing entity definitions in a different storage space (e.g. Amazon S3). There are challenges to such approaches too; for example, when using Amazon S3, change history will be lost. Also, using an eventing mechanism could introduce security challenges that we discuss next.
  • Securing Backstage could be a challenge. For Backstage to proactively receive updates on entity changes, it would work best to configure event hooks to provide callbacks to Backstage when changes occur. Backstage, being the entry point for user workflows, sits on the critical path of platform operations. As such, platform engineers need to solve a chicken-and-egg problem: how to expose Backstage endpoints to receive events while still limiting access for security reasons. The authentication methods that GitHub supports may not satisfy the security standards that an organization requires.
  • Changes to entities may not be trivial. DevOps engineers need to manage entities that they may not control. For example, if a new mandatory field is introduced to a catalog file, DevOps engineers may need to talk to the respective repository owners, create a PR, and then get it approved for all affected repositories.

backstage-as-source-of-truth

Conclusion

We discussed multiple approaches to creating reliable representation of system entities in the developer portals. We do not necessarily recommend one approach over another, but it is important to find the right approach given the patterns and practices in your organization. It is also worth noting that you can choose to combine multiple approaches depending on the requirements of your teams. For example, while continuous integration can still be used to construct the actual state of the world by collecting status data and other related information, Backstage extensions can be introduced to expand on entity relations, providing better representation of a system. Stating the obvious here, but your proper selection of patterns that work for you will go a long way in increasing your overall team velocity down the road.

Reach out on #cnoe-interest CNCF slack channel to share thoughts and get involved in developing CNOE.

diff --git a/blog/tags/benchmarking.html b/blog/tags/benchmarking.html

2 posts tagged with "benchmarking"


· 18 min read
Andrew Lee
Vikram Sethi

Introduction

In our earlier blog posts, we have discussed scalability tests for Argo CD, where in two consecutive experiments, we pushed the limits of Argo CD to deploy 10,000 applications on ~100 clusters and then 50,000 applications on 500 clusters along with configuration and fine-tuning required to make Argo CD scale effectively. Argo CD deployments, however, do not happen in isolation, and similar to a CNOE stack, Argo CD is often deployed on a cluster along with other tooling which collectively contribute to the performance and scalability bottlenecks we see users run into.

Argo Workflows is one common tool we often see users deploy alongside Argo CD to enable workflow executions (e.g. building images, running tests, cutting releases, etc). Our early experiments with Argo Workflows revealed that, if not tuned properly, it can negatively impact the scalability of a given Kubernetes cluster, particularly if the Kubernetes cluster happens to be the control cluster managing developer workflows across a large group of users. A real world example of some of the scaling challenges you can encounter with Argo Workflows is explored in our recent ArgoCon talk: Key Takeaways from Scaling Adobe's CI/CD Solution to Support 50K Argo CD Apps.

For us to better understand the limitations and tuning requirements for Argo Workflows, in this blog post we publish details on the scalability experiments we ran for Argo Workflows executing Workflows in two different load patterns: increasing rate up to 2100 workflows/min and queued reconciliation of 5000 workflows on an Amazon EKS cluster with 50x m5.large nodes. We show the correlation between the various Argo Workflow's knobs and controls and the processing time as well as performance improvements you can get by determining how you supply the workflows to the control plane.

Test Parameters

Test Workflow

The test workflow is based on the lightweight whalesay container from docker which prints out some text and ASCII art to the terminal. The reason we chose a lightweight container is that we wanted to stress the Argo Workflows controller in managing the Workflow lifecycle (pod creation, scheduling, and cleanup) and minimize the extra overhead on the Kubernetes control plane in dealing with the data plane workloads. An example of the Workflow is below:

// Imports for the aliases used below (paths assume Argo Workflows v3 and the standard Kubernetes API packages).
import (
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	wfv1 "github.com/argoproj/argo-workflows/v3/pkg/apis/workflow/v1alpha1"
)

// helloWorldWorkflow is the test Workflow: a single whalesay container whose pod
// is garbage-collected as soon as it succeeds.
var helloWorldWorkflow = wfv1.Workflow{
	ObjectMeta: metav1.ObjectMeta{
		GenerateName: "hello-world-",
	},
	Spec: wfv1.WorkflowSpec{
		Entrypoint:         "whalesay",
		ServiceAccountName: "argo",
		Templates: []wfv1.Template{
			{
				Name: "whalesay",
				Container: &corev1.Container{
					Image:   "docker/whalesay:latest",
					Command: []string{"cowsay", "hello world"},
				},
			},
		},
		PodGC: &wfv1.PodGC{
			Strategy: "OnPodSuccess",
		},
	},
}

Argo Workflows Settings

We will be detailing how each of these settings affects Argo Workflows in various experiments later in this blog post.

  • Controller workers: The Argo Workflows controller utilizes different workers for various operations in a Workflow lifecycle. We will be looking at two types of workers for our scalability testing.

    • workflow-workers (default: 32): These workers are threads in a single Argo Workflows controller that reconcile Argo Workflow Custom Resources (CRs). When a Workflow is created, a workflow-worker will handle the end-to-end operations of the Workflow from ensuring the pod is scheduled to ensuring the pod has finished. The number of workers can be specified by passing the --workflow-workers flag to the controller.

    • pod-cleanup-workers (default: 4): These workers clean up finished Workflows. When a Workflow has finished executing, depending on your clean-up settings, a pod-cleanup-worker will handle cleaning up the pod from the Workflow. The number of workers can be specified by passing the --pod-cleanup-workers flag to the controller.

  • Client queries per second (QPS)/Burst QPS settings (default: 20/30): These settings control when the Argo Workflows controller’s Kubernetes (K8s) client starts to throttle requests to the K8S API server. The client QPS setting is for limiting sustained QPS for the k8s client while burst QPS is for allowing a burst request rate in excess of the client QPS for a short period of time. The client QPS/burst QPS can be set by passing the --qps and --burst flag to the controller.

  • Sharding: Sharding with multiple Argo Workflows controllers is possible by running each controller in its own namespace. The controller then only reconciles Workflows submitted in that particular namespace. This namespaced mode is enabled by passing the --namespaced flag to the controller.

Key Metrics

We chose a set of key metrics for the scalability testing because we wanted to measure how many workflows the Argo Workflows controller can reconcile and process. We will also be looking into K8s control plane metrics which might indicate your control plane cannot keep up with the Argo Workflows workload. 

  • Workqueue depth: The workqueue depth shows workflows which have not been reconciled. If the depth starts to increase, it indicates that the Argo Workflows controller is unable to handle the submission rate of Workflows.

  • Workqueue latency: The workqueue latency is the average time workflows spent waiting in the workqueue. A lower value indicates that the Argo Workflows controller is processing workflows faster so that they are not waiting in the workqueue.

  • K8S api server requests per second: The read and write requests per second being made to the K8S api server.

We didn’t include CPU/memory as a key metric because during our testing we did not see any significant impact on either, most likely because of the simplistic workflows utilized for this benchmark.

Environment

We ran the experiments in an AWS environment utilizing a single Amazon EKS cluster. The Kubernetes version is 1.27 and the Argo Workflows version is 3.5.4. No resource quotas were set on the Argo Workflows controller. For the cluster, we start by provisioning 1x m5.8xlarge Amazon Elastic Compute Cloud (Amazon EC2) instance, which runs the Argo Workflows controller, and 50x m5.large instances for executing workflows. The number of execution instances is sufficient to run all 5000 workflows in parallel, ensuring that pods are not waiting on resources to execute. Monitoring and metrics for Argo Workflows were provided by Prometheus/Grafana.

Methodology

There will be two types of load patterns evaluated:

Increasing Rate Test: Workflows will be submitted at an increasing rate (workflows/min) until the Argo Workflows controller cannot keep up. The state at which the controller cannot keep up is when there are >0 workflows in the workflow queue or there is increasing queue latency. That rate of Workflow submissions will be noted as the maximum rate at which the Argo Workflows can be processed with the current settings.

Queued Reconciliation Test: 5000 workflows are submitted in less than a minute. Metrics will be monitored from when the Argo Workflows controller starts processing workflows to when it has reconciled all 5000 workflows. The number of nodes is sufficient for running all the workflows simultaneously.

Experiments

Experiment 1: Baseline

In our baseline experiment, we run a single Argo Workflows shard (namespace) with default settings.

Increasing Rate Test:

As you can see below, the Argo Workflows controller can process up to 270 workflows/min: the average workqueue latency and workqueue depth are nearly zero. At 300 workflows/min, workqueue latency and workqueue depth start to increase.

Enter image alt description

Queued Reconciliation Test:

It took around 17 mins to reconcile 5000 workflows, and the peak average workqueue latency was 5.38 minutes.

Enter image alt description

Experiment 2: Workflow Workers

For this experiment, we increase the number of workflow workers from the default of 32 up to 128, where the workers use the maximum QPS and burst budget available to them. We also had to increase the number of pod-cleanup-workers to 32, as the Argo Workflows controller was unstable with the default value of 4, with the controller pod consistently crashing.

Increasing Rate Test:

For the increasing rate test, we can see exactly when the number of workflow workers is no longer sufficient to process the load: both workqueue latency and depth start to increase, indicating that workflows are waiting to be reconciled. When we increase the number of workers, the controller is able to reconcile the current load until additional load is placed on it. For 32 workers, the breaking point is 300 workflows/min. When we increase the number of workers to 64, the controller handles that load until it is increased to 330 workflows/min. Increasing the number of workers to 96 lets it absorb the additional load again, and when we increase the rate to 360 workflows/min, we need to bump the number of workers to 128.

Workers | Max workflows/minute
32 | 270
64 | 300
96 | 330
128 | 360

Enter image alt description

For the K8s API server, we see sustained 180 writes/sec and 70 reads/sec during the increasing rate tests.

Enter image alt description

Queued Reconciliation Test:

For the queued reconciliation test, the time it took to reconcile all the workflows did not change significantly: with 32 workers it took 17 mins, while with 96 workers it took 16 mins. The peak workqueue latency did decrease, from 5.38 mins with 32 workers to 3.19 mins with 96 workers. With 128 workers, the Argo Workflows controller kept crashing.

Workers | Peak avg latency (mins) | Reconcile time (mins)
32 | 5.38 | 17
64 | 5.06 | 18
96 | 3.19 | 16
128 | N/A | N/A

Enter image alt description

For the K8s API server, we see peaks of up to 260 writes/sec and 90 reads/sec during the queued reconciliation tests. Notice that for the last test there is no K8s API server activity, as the Argo Workflows controller was misbehaving due to client-side throttling.

Enter image alt description

Observations from Experiment 2:

Workers play a big part in how fast the Argo Workflows controller is able to reconcile the rate of workflows being submitted. If you are observing increasing workqueue latency and a backed-up workqueue depth, changing the number of workers is a potential way to improve performance. There are a few observations that we want to call out. One is that if we compare the two different patterns, one where we submit workflows at a constant rate and one where we load up the workqueue all at once, we can see variations in calculated throughput. We can calculate the time it would take to reconcile 5000 workflows using the increasing rate test results and compare it to the queued reconciliation test.

Workers | Increasing rate test time to reconcile 5000 workflows (mins) | Reconcile time of 5000 workflows queued all at once (mins)
32 | 18.5 | 17
64 | 16.6 | 18
96 | 15.1 | 16
128 | 13.8 | N/A

We do get some conflicting results when we make this comparison. With 32 and 64 workers, the increasing rate test is actually slower than the queued reconciliation test, but if we increase to 96 workers, the increasing rate test results are faster. We were unable to compare with 128 workers, as the Argo Workflows controller crashed when trying to run the queued reconciliation test. When investigating the cause of the crash, we found several log messages like the following:

Waited for 6.185558715s due to client-side throttling, not priority and fairness, request: DELETE:https://10.100.0.1:443/api/v1/namespaces/argoworkflows1/pods/hello-world-57cfda8a-dc8b-4854-83a0-05785fb25e4b-3gwthk

These messages indicate that we should increase the client QPS settings, which we evaluate in the next experiment.

Experiment 3: Client QPS Settings

For this experiment, we set the number of workflow workers back to the default of 32. We then increase the QPS/Burst in increments of 10/10, from 20/30 to 50/60. We chose these small increments because larger increases past 50/60 did not yield any performance improvements, which we believe is partly because we kept the workers at 32.

Initial Testing

Increasing Rate Test:

The QPS/Burst settings had a significant impact on the increasing rate test. By increasing the QPS/Burst from 20/30 to 30/40, we see a ~50% improvement in max workflows/min, from 270 to 420. When we increase the QPS/Burst from 30/40 to 40/50, we see another 28% improvement, from 420 to 540. When increasing from 40/50 to 50/60, there was only an additional 5% improvement. For 32 workers, increasing past 50/60 did not yield any significant improvements to the max workflows/min.

QPS/Burst | Max workflows/minute
20/30 | 270
30/40 | 420
40/50 | 540
50/60 | 570

Enter image alt description

When changing QPS/Burst, we also need to monitor the K8s API server. Looking at the K8s API server req/s, we see sustained 390 writes/sec and 85 reads/sec.

Enter image alt description

Queued Reconciliation Test:

Again, the QPS/Burst settings make a big difference in the queued reconciliation test when compared to just changing the workflow workers. Starting from the default settings of 20/30, we see reconcile times decreasing from 19 mins to 12 mins to 8 mins, and finally to 6 mins when setting the QPS/Burst to 50/60. The peak average latency also decreased from 4.79 mins to 1.94 mins. We did note a higher peak avg latency with 30/40 vs 20/30, but if you examine the graph you can see a steeper drop in latency, accounting for the shorter reconcile time. Similar to the increasing rate test, increasing the QPS/Burst further did not yield any improvements.

QPS/Burst | Peak avg latency (mins) | Reconcile time (mins)
20/30 | 4.79 | 19
30/40 | 5.66 | 12
40/50 | 2.98 | 8
50/60 | 1.94 | 6

Enter image alt description

When looking at the K8s API server, we see peaks of up to 700 writes/sec and 200 reads/sec during the tests.

Enter image alt description

When compared to the workflow worker testing, you can see that increasing the QPS/Burst pushes the K8s API server harder and improves Argo Workflows' overall performance. We do see diminishing returns when increasing QPS/Burst past 50/60, even though it appears that the K8s API server has plenty of capacity for additional load. For the next test, we will increase both the workflow workers and the QPS/burst to see how far we can push Argo Workflows and the K8s API server.

Max Load Test

Increasing Rate Test:

Starting with 128 workers and a QPS/burst of 60/70, we were able to push Argo Workflows to 810 workflows/min. Past that point, there were no improvements from adding more workers or further increasing the QPS/Burst limits.

Enter image alt description

We can see increased K8s API server activity, with sustained 700 writes/sec and 160 reads/sec.

Enter image alt description

Queued Reconciliation Test:

With 128 workers and a QPS/burst of 60/70, we observed a peak average workqueue latency of 54 secs and a reconciliation time of 5 mins. Increasing either the workers or the QPS/Burst did not improve these numbers.

Enter image alt description

Looking at the K8s API server, we saw peaks of 800 writes/sec and 190 reads/sec.

Enter image alt description

Observations from Experiment 3

One observation we made in the previous experiment with workflow workers is that the two different patterns of submitting workflows can be compared. We made that comparison again with the QPS/Burst tests and saw the following results:

QPS/Burst | Workers | Increasing rate test time to reconcile 5000 workflows (mins) | Reconcile time of 5000 workflows queued all at once (mins)
20/30 | 32 | 18.5 | 19
30/40 | 32 | 11.9 | 12
50/60 | 32 | 9.2 | 8
60/70 | 32 | 8.7 | 6
70/80 | 128 | 6.1 | 5

When we take the comparison data from the previous experiment together with the data above, we can see a slight improvement from submitting all workflows at once vs staggering them. We are not sure why this is the case, and more experiments are required to understand this behavior.

It seems that we have hit a wall at 128 workers and a QPS/burst of 60/70 for a single Argo Workflows controller. We will now evaluate sharding to see if we can improve performance beyond this point.

Experiment 4: Sharding

For this experiment, we will evaluate 1, 2, and 5 shards of the Argo Workflows controller with the default settings. We will then try a maximum load test combining workflow workers, QPS/burst, and sharding to see the maximum performance on our current infrastructure.
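
One way (not necessarily the way we scripted it) to lay out such shards is to install the same controller manifests once per namespace with --namespaced added. A sketch using a Kustomize overlay, where the ../base path and the shard namespaces are assumptions:

apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
# Illustrative overlay for one shard; repeat per shard namespace (argoworkflows1..argoworkflows5).
namespace: argoworkflows2
resources:
  - ../base          # assumed path to the baseline workflow-controller install
patches:
  - target:
      kind: Deployment
      name: workflow-controller
    patch: |-
      - op: add
        path: /spec/template/spec/containers/0/args/-
        value: --namespaced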

Initial Testing

Increasing Rate Test:

Sharding the Argo Workflows controller has a linear impact on performance with the increasing rate test. By increasing the number of shards from 1 to 2, we see a 100% improvement in max workflows/min from 270 to 540. When we increase the shards from 2 to 5, we see an additional 150% improvement in max workflows/min from 540 to 1350.

Shards | Max workflows/min
1 | 270
2 | 540
5 | 1350

One thing to note is that the submission rate of each shard is increased in steps of 30 workflows/min. This means the difference between two successive total rates is 2 shards x 30 = 60 workflows/min with 2 shards, and 5 shards x 30 = 150 workflows/min with 5 shards. That is why, for 2 shards, when the max load was determined at 600 workflows/min, we step down one rate, which is 600 - 60 = 540 workflows/min.

Enter image alt description

You can see a significant impact on the K8s API server with sustained 1400 writes/sec and 300 reads/sec.

Enter image alt description

Queued Reconciliation Test:

As shown in the Increasing Rate Test, sharding has a huge impact on performance for the queued reconciliation test. With 1 shard it takes 18 mins to reconcile 5000 workflows, while with 2 shards it takes 9 mins. With 5 shards the reconcile time is further reduced to 4 mins.

Shards | Peak avg latency (mins) | Reconcile time (mins)
1 | 5.43 | 18
2 | 3.81 | 9
5 | 1.42 | 4

Enter image alt description

The impact on the K8s API server was not as significant when compared to previous experiments.

Max Load Test

Increasing Rate Test:

When increasing the workflow workers to 128, the QPS/burst to 60/70, and the shards to 5, the Argo Workflows controllers are able to process up to 2100 workflows/min. Pushing any higher than this seems to run into K8s API Priority and Fairness (APF) limits.

Enter image alt description

When looking at the K8s API server, we see a significant impact, with peaks of 1500 writes/sec and 350 reads/sec.

Enter image alt description

When investigating why we are unable to push the K8s API server any higher, we can see that APF limits are coming into effect by looking at apiserver_flowcontrol_current_inqueue_requests. This metric shows the number of requests waiting in the APF flow-control queues.

Enter image alt description

Queued Reconciliation Test:

With the max load settings, we observed that the peak workqueue latency is only 20 seconds and the reconcile time is 2 minutes.

Enter image alt description

The impact on the K8s API server is actually lower than in the previous max load queued reconciliation tests.

Enter image alt description

Observations from Experiment 4

As we did in previous experiments, we again make the comparison between the two different load patterns:

Shards | Increasing rate test time to reconcile 5000 workflows (mins) | Reconcile time of 5000 workflows queued all at once (mins)
1 | 18.5 | 18
2 | 9.2 | 9
5 | 3.7 | 4
Max load (5 shards) | 2.3 | 2

In general, it appears that submitting all workflows at once performs slightly better than submitting workflows at a steady rate. More experiments will need to be done to further investigate this behavior.

Conclusion

In this blog post we discussed our initial efforts in documenting and understanding the scaling characteristics of the Argo Workflows controller. Our findings show that the existing mechanisms for increasing workflow workers, increasing client and burst QPS settings, and sharding the controller can help Argo Workflows scale better. Another interesting observation is that we saw differences in performance depending on how workflows are submitted. For the next set of experiments, we plan to evaluate more environmental variables and different types of workflows: multi-step and/or long-running. Stay tuned for the report on our next round of experiments, and reach out on the CNCF #argo-sig-scalability Slack channel to get help optimizing for your use-cases and scenarios.

· 21 min read
Andrew Lee
Michael Crenshaw
Gaurav Dhamija

Introduction

In Part 1 of our Argo CD benchmarking blog post, we analyzed the impacts of various Argo CD configuration parameters on the performance of Argo CD. In particular we measured the impact of status and operation processes, client QPS, burst QPS, and sharding algorithms on the overall synchronization and reconciliation behavior in Argo CD. We showed that using the right configuration and sharding strategy, particularly by properly setting client and burst QPS, as well as by splitting the workload across multiple workload clusters using Argo CD sharding, overall sync time can be improved by a factor of 4.

Here, and in Part 2 of our scalability work, we push our scalability experiments for Argo CD further. In particular, among other tests, we run our scalability metrics against a maximum of 500 workload clusters, deploying 50,000 Argo applications. This is, to the best of our knowledge, the largest scalability test ever done for Argo CD. We also report on a much deeper set of sharding experiments, utilizing different sharding algorithms for distribution of load across 100 workload clusters. While we report on running our experiments against the legacy sharding algorithm and the round-robin algorithm that already exist in Argo CD 2.8, we also discuss the results of workload distribution using 3 new sharding algorithms we developed in collaboration with Red Hat, namely: a greedy minimum algorithm, a weighted ring hash algorithm, and a consistent hash with bounded loads algorithm. We show that, depending on the optimization goals one has in mind, choosing from the new sharding algorithms can improve CPU utilization by a factor of 3 and reduce application-to-shard rebalancing by a factor of 5, significantly improving the performance of a highly distributed and massively scaled Argo CD deployment.

Experiment 1: How Client QPS/Burst QPS affects the Kubernetes API Server

Objective:

The objective of the first experiment is to understand the impact of the QPS & Burst Rate parameters on (1) the Kubernetes control plane, for both the Argo CD cluster and the remote application clusters, and (2) the overall sync duration for Argo CD applications. To understand the impact on the Kubernetes API server, we observed the following control plane metrics:

  • Latency (apiserver_request_duration_seconds_bucket)
  • Throughput (apiserver_request_total)
  • Error Rate (apiserver_request_total{code=~"[45].."}) for any request returning an error code 4xx or 5xx.

To analyze the impact on application synchronization, we observed the Sync Duration and No. of Goroutines Argo CD server metrics.
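
Since the monitoring stack here is Prometheus/Grafana, one convenient way to track these signals is a set of recording rules over the API server metrics listed above. A sketch, assuming the Prometheus Operator's PrometheusRule CRD is available; the rule and resource names are illustrative:

apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: apiserver-scalability-signals   # illustrative name
  namespace: monitoring                 # assumed monitoring namespace
spec:
  groups:
    - name: apiserver.scalability
      rules:
        # Throughput: total API server requests per second.
        - record: apiserver:request_rate:sum
          expr: sum(rate(apiserver_request_total[5m]))
        # Error rate: share of requests returning a 4xx or 5xx code.
        - record: apiserver:error_rate:ratio
          expr: |
            sum(rate(apiserver_request_total{code=~"[45].."}[5m]))
            /
            sum(rate(apiserver_request_total[5m]))
        # Latency: p90 request duration derived from the histogram buckets.
        - record: apiserver:request_duration_seconds:p90
          expr: |
            histogram_quantile(0.90, sum(rate(apiserver_request_duration_seconds_bucket[5m])) by (le))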

Test Infrastructure:

In terms of test infrastructure and workload configuration, we had one central Amazon EKS cluster with the Argo CD server running on it. This central cluster was connected to three remote Amazon EKS clusters, each hosting 5000 Argo CD applications. Each application is a ConfigMap (2KB) provisioned in a dedicated namespace. All four clusters, one central and three remote, had a dedicated monitoring stack composed of Prometheus and Grafana installed on them.
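
For reference, each of these test applications can be expressed as a small Argo CD Application pointing at a ConfigMap manifest. A sketch, where the name, repository URL, path, and destination cluster endpoint are all placeholders rather than the values used in our runs:

apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  name: configmap-app-0001           # placeholder name; one Application per dedicated namespace
  namespace: argocd
spec:
  project: default
  source:
    repoURL: https://github.com/example-org/scalability-manifests.git   # placeholder repo
    targetRevision: main
    path: apps/configmap-app-0001    # directory containing the ~2KB ConfigMap
  destination:
    server: https://remote-cluster-1.example.com   # placeholder remote EKS API endpoint
    namespace: configmap-app-0001
  syncPolicy:
    syncOptions:
      - CreateNamespace=true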

Observations:

Observation 1 - Impact on Argo CD application synchronization

The table and graphs below highlight the impact of QPS & Burst Rate on “Sync Duration” as well as the average and maximum no. of goroutines active during the test run.

QPS | Burst Rate | Sync Duration | No. of Goroutines (Avg) | No. of Goroutines (Max)
50 | 100 | 61.5 mins | 1760 | 1810
100 | 200 | 29.5 mins | 2120 | 2310
150 | 300 | 19.0 mins | 2520 | 2760
200 | 400 | 18.0 mins | 2620 | 2780
250 | 500 | 17.5 mins | 2590 | 2760
300 | 600 | 18.0 mins | 2540 | 2760

alt_text

To summarize, during the test we immediately observed a ~52% reduction in sync duration (from 61.5 mins to 29.5 mins) as we increased the QPS & Burst Rate from the default values to 100 & 200 respectively. This also correlated with a corresponding increase in the number of Goroutines processing application synchronization requests. Increasing these parameters further provided diminishing returns with subsequent runs; beyond a QPS & Burst Rate of 150 & 300 respectively, there was no measurable improvement observed. This again correlated with the number of Goroutines actively processing sync requests.

Observation 2 - Impact on central Amazon EKS cluster control plane hosting Argo CD Server

The table and graphs below highlight the impact of QPS & Burst Rate on throughput and latency for the Amazon EKS control plane hosting the Argo CD server. We can observe an increase in the request rate per second to the Kubernetes control plane, which is in line with the previous observations on the increase in the number of goroutines processing the sync requests. The increased activity related to sync operations translates into increased requests to the Amazon EKS control plane, tapering off at a QPS of 150 and a Burst Rate of 300. Additional increases in the QPS and Burst Rate parameters do not noticeably impact the request rate per second.

QPS | Burst Rate | Request Rate (Max) | Latency p50 (Max) | Latency p90 (Max)
50 | 100 | 27.2 rps | 13.0 ms | 22.6 ms
100 | 200 | 31.9 rps | 13.3 ms | 23.1 ms
150 | 300 | 39.8 rps | 14.3 ms | 24.0 ms
200 | 400 | 41.4 rps | 14.9 ms | 24.4 ms
250 | 500 | 39.0 rps | 15.1 ms | 24.4 ms
300 | 600 | 40.7 rps | 16.4 ms | 34.5 ms

From a latency perspective, over the course of testing the average (p50) duration remained within the range of 13 to 16.5 ms, and the p90 latency within 22 ms to 34.5 ms. The error rate remained consistently around ~0.22%, with a brief spike to ~0.25% (an increase of ~0.03%).

The relatively low latency numbers and low error rate (<0.25%) indicate that the Amazon EKS control plane was able to handle the load comfortably. Increasing the QPS and Burst Rate stretched the control plane only to a limited extent, indicating it still had resources to process additional requests as long as the Argo CD server could generate the request traffic.

alt_text

Observation 3 - Impact on remote Amazon EKS cluster control plane hosting applications

We had similar observations regarding latency, throughput, and error rate for the Amazon EKS control planes of the remote application clusters. These are the clusters hosting ~5000 Argo CD applications each, connected to the Argo CD server on the central Amazon EKS cluster. The throughput peaked at ~35 requests per second with a QPS and burst rate of 150 & 300 respectively. The average latency remained consistently in the single-digit milliseconds, hovering around ~5 ms.

alt_text

Experiment 2: Revisiting Status/Operation Processors

Objective:

The objective of the second experiment is to explore why status/operation processors did not have an effect on sync times in our previous experiments. It is possible that the simple nature of the ConfigMap applications, which take <1s to deploy, caused this behavior; most real-world applications consist of tens to hundreds of resources and take longer to deploy. During this experiment, we simulate a more complex application which takes longer to deploy than the original ConfigMap application.

Test Infrastructure:

Central Argo CD cluster running on a single m5.2xlarge managing 100 application clusters. In order to simulate larger applications, each application executes a PreSync Job which waits 10 seconds before the original ConfigMap application is deployed.

Example of the PreSync Job:

apiVersion: batch/v1
kind: Job
metadata:
  name: before
  annotations:
    argocd.argoproj.io/hook: PreSync
    argocd.argoproj.io/hook-delete-policy: HookSucceeded
spec:
  template:
    spec:
      containers:
        - name: sleep
          image: alpine:latest
          command: ["sleep", "10"]
      restartPolicy: Never
  backoffLimit: 0

Observations:

Observation 1 - Syncing never finishes and requires a restart of the application controller to continue syncing

The screenshot below shows that from the start of the sync test at 17:02 until around 17:41, the sync process was deadlocked: we observed no changes to synced apps, and the app_operation_processing_queue was pinned at 10k operations.

alt_text

Looking at the Argo CD console for a single application, we see that the PreSync job finished 17 mins earlier, but the application stayed in the Syncing phase.

alt_text

Observation 2: There is a link between client QPS/burst QPS and operation/status processor settings

In order to fix the sync freezing issue, we increased the client QPS/burst QPS from the default 50/100 to 100/200. After the change we were able to collect data on operation/status processor settings.

Operation/status processors | Sync time
25/50 | 45 mins
50/100 | 30 mins

alt_text alt_text

We can see that there is a link between the status/operation processor and client QPS/burst QPS settings. Changing one or the other could be required to improve sync times and Argo CD performance, depending on your environment. Our recommendation is to first change the status/operation processor settings. If you run into Argo CD locking up or performance not increasing further, and you have sufficient resources, you can try increasing the client QPS/burst QPS. But as mentioned in the first experiment, ensure you are monitoring the K8s API server.
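
For readers who want to try these knobs, they are typically exposed through the argocd-cmd-params-cm ConfigMap and environment variables on the application controller in recent Argo CD versions. A sketch mirroring the 25/50 processors and 100/200 QPS/burst used above; the exact key and variable names are worth double-checking against your Argo CD version's documentation:

apiVersion: v1
kind: ConfigMap
metadata:
  name: argocd-cmd-params-cm
  namespace: argocd
data:
  controller.operation.processors: "25"   # operation processors used in the first run above
  controller.status.processors: "50"      # status processors used in the first run above
# Client QPS/burst are commonly raised through environment variables on the
# argocd-application-controller workload, e.g. (names worth verifying):
#   - name: ARGOCD_K8S_CLIENT_QPS
#     value: "100"
#   - name: ARGOCD_K8S_CLIENT_BURST
#     value: "200"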

Experiment 3: Cluster Scaling

Objective:

The following experiment is designed to test the compute demands of the Argo CD app controller when managing more than 100 application clusters.

Test Infrastructure:

Central Argo CD cluster with 10 app controller shards running on a single m5.2xlarge node managing 100/250/500 application clusters and 10k 2KB ConfigMap applications.

Observations:

From earlier experiments, we can see that when managing 100 clusters, we are close to the limit of a single m5.2xlarge node. As we push further to 250/500 clusters, we have two observations. The first is that the graph data is less smooth than in the sync test with 100 clusters. This can indicate that Prometheus is running out of compute, as Argo CD is consuming most of it. Please note that we are not using any resource limits/requests in our experiments; if proper resource limits/requests are set, most likely only Argo CD, and not Prometheus, would see performance issues when operating at the limit of your compute resources. The second observation is that in both the 250 and 500 cluster tests, there is some drop-off in metric data. For the 250 cluster test, there is a blip at the 16:16 mark for memory usage. For the 500 cluster test, there are blips in the data at the 21:05 mark on the workqueue depth, CPU usage, and memory usage. In spite of these observations, the sync process completes in a reasonable time.

Clusters | Sync time
100 | 9 mins
250 | 9 mins
500 | 11 mins

alt_text alt_text alt_text
From this experiment, you can see that as you approach the limit of your compute resources, Argo CD and other applications running in your k8s environment could experience issues. It is recommended that you set proper resource limits/requests for your monitoring stack to ensure you have insights into what could be causing your performance issues.
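
As a concrete illustration, if the monitoring stack is managed by the Prometheus Operator, requests and limits can be set on the Prometheus resource itself. The values below are placeholders, not sizing guidance, and the resource name and namespace are assumptions:

apiVersion: monitoring.coreos.com/v1
kind: Prometheus
metadata:
  name: k8s                # illustrative name
  namespace: monitoring    # assumed monitoring namespace
spec:
  resources:
    requests:
      cpu: "1"
      memory: 4Gi
    limits:
      cpu: "2"
      memory: 8Gi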

Experiment 4: Application Scaling

Objective:

This experiment is meant to push the Argo CD app controller beyond 10k applications. As the previous rounds of experiments were performed with 10k apps, the intention of these experiments is to scale the Argo CD app controller up to 50k apps.

Test Infrastructure:

We will be performing this experiment on a central Argo CD cluster with 10 app controller shards and 500 downstream application clusters. As we scale up to 10k, 15k, 20k, 25k, 30k, and 50k 2KB ConfigMap applications, we will add additional m5.2xlarge nodes to the Argo CD cluster.

Observations:

Sync test at 15k applications with a single m5.2xlarge: you can see blips in the data indicating unhealthy behavior on the cluster. CPU and memory usage is near 100% utilization of the 8 vCPUs and 30 GB of memory. After adding another node, for a total of two m5.2xlarge instances, we were able to perform a sync in 9 mins.
alt_text alt_text alt_text

After adding another node, we were able to continue our application scaling tests. You can see in the graphs below that syncing 20k and 25k apps was not a problem. The sync test of 30k apps shown on the third graph shows some blips in data, indicating that we are at the limits of two nodes.

Apps | Sync time
20000 | 12 mins
25000 | 11 mins
30000 | 19 mins

alt_text alt_text alt_text

For the final test in this experiment, we pushed the cluster to sync 50k apps.

While the cluster was able to manage reconciliation for the 50k apps, as shown by a stable Sync Status graph from 8:40, when we start the sync at the 9:02 mark you can see unhealthy behavior in the graph data. Examining the CPU/memory usage, you can see 100% CPU utilization across the cluster. After scaling the cluster to three m5.2xlarge nodes, we were able to perform a sync in 22 mins.
alt_text alt_text alt_text

From the scaling tests, we can see that the Argo CD app controller scales effectively by adding compute resources as we increase the number of applications to sync.

Experiment 5: How Many Shards?

Objective:

In previous experiments, we utilized ten app controller shards running across multiple nodes. In this experiment, we will explore how the number of app controller shards affects performance.

Test Infrastructure:

Central Argo CD cluster with 3, 6, or 9 app controller shards running on 3 m5.2xlarge nodes managing 500 application clusters and 50k 2KB ConfigMap applications.

Observations:

For the baseline of three shards, it took 75 mins to perform a sync. Adding shards brought further improvements, with a sync time of 37 mins for six shards and 21 mins for nine shards. Increasing the number of shards beyond nine did not yield any improvements.

Shards | Sync time
3 | 75 mins
6 | 37 mins
9 | 21 mins

alt_text alt_text alt_text

Looking at the CPU and memory utilization, you can see that adding shards can improve performance only if there are free resources to consume. With the baseline of three shards, the CPU utilization of the nodes is well below the eight vCPUs that each node is allocated. As we add more shards, we can see CPU utilization increasing until we are close to 100% with nine shards. Adding any more shards would not yield any performance benefits unless we add more nodes.

Shards: 3 | Shards: 6 | Shards: 9
alt_text alt_text alt_text

From the experiments, the Argo CD app controller sharding mechanism is able to scale as you add more compute resources. Sharding allows both horizontal and vertical scaling. As you add more shards, you can horizontally scale by adding more nodes or vertically scale by utilizing a larger node with more compute resources.
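
To make the scaling knob concrete: in a typical Argo CD install the shard count follows the application controller replica count, which the controller reads from an environment variable. A sketch for nine shards, showing only the fields relevant to sharding; the variable name is worth verifying against your Argo CD version:

apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: argocd-application-controller
  namespace: argocd
spec:
  serviceName: argocd-application-controller
  replicas: 9                              # nine shards
  selector:
    matchLabels:
      app.kubernetes.io/name: argocd-application-controller
  template:
    metadata:
      labels:
        app.kubernetes.io/name: argocd-application-controller
    spec:
      containers:
        - name: argocd-application-controller
          env:
            - name: ARGOCD_CONTROLLER_REPLICAS   # assumed variable; must match spec.replicas
              value: "9"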

Experiment 6: Sharding Deep Dive

Objective:

With the release of Argo CD 2.8, a new sharding algorithm, round-robin, was introduced. The existing legacy sharding algorithm takes the hash sum of the cluster ID modulo the number of replicas to determine the shard that should manage a cluster, which led to an imbalance in the number of clusters managed by each shard. The new round-robin sharding algorithm is supposed to ensure an equal distribution of clusters across shards. We will also introduce 3 new algorithms: greedy minimum, weighted ring hash, and consistent hash with bounded loads. This experiment evaluates all the algorithms on shard balance, application distribution, and rebalancing on changes to the environment.
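
For reference, in Argo CD 2.8 the algorithm in use is selected through configuration; a sketch of switching to round-robin via the argocd-cmd-params-cm ConfigMap (the key name is worth confirming against your version's documentation):

apiVersion: v1
kind: ConfigMap
metadata:
  name: argocd-cmd-params-cm
  namespace: argocd
data:
  controller.sharding.algorithm: round-robin   # "legacy" is the default algorithm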

Test Infrastructure:

Central Argo CD cluster with 10 app controller shards running on 1 m5.2xlarge node managing 100 application clusters and 10k 2KB ConfigMap applications.

Observations:

Note: For all the observations, we start the monitoring period when we see items in the operations queue and end it when all the applications are synced. We then look at the average CPU/memory usage during the monitoring period.

Legacy

The graph below shows the CPU and memory usage of the 10 different Argo CD app controller shards. Looking at the averages, you can see a large variation in how much each shard utilizes its resources. To make an accurate comparison between the different sharding methods, we quantify the variability as the range of the data for both avg CPU usage and memory usage. The CPU usage variability is calculated by subtracting the lowest shard's CPU usage from the highest: 0.55 - 0.23 = 0.32. The memory usage variability is 452 MiB - 225 MiB = 227 MiB.

Variability:

CPU: 0.32
Memory: 227 MiB

alt_text

Round-Robin

With the newly introduced Round-Robin algorithm, you can see improved balance across the shards.

Variability:

CPU: 0.02
Memory: 110 MiB

alt_text

Better but not perfect

The new round-robin algorithm does a better job of keeping the number of clusters balanced across the shards. But in a real-world environment, you would not have an equal number of applications running on each cluster, and the work done by each shard is determined not by the number of clusters but by the number of applications. A new experiment was run which deploys a random number of applications to each cluster, with the results below. Even with the round-robin algorithm, you can see high variability in CPU/memory usage.

Variability:

CPU: 0.27
Memory: 136 MiB

alt_text

Greedy Minimum Algorithm: Sharding by the Number of Apps

A new algorithm is introduced in order to shard by the number of applications running on each cluster. It uses a greedy minimum approach, always choosing the shard with the fewest apps when assigning a cluster to a shard. A description of the algorithm is shown below:

Iterate through the cluster list:

1. Determine the number of applications on the cluster.
2. Find the shard with the fewest applications assigned so far.
3. Assign the cluster to that shard and add the cluster's application count to the shard's total.

The same experiment with a random number of applications running on each cluster is run again with the results shown below. With the new algorithm, there is better balance across the shards.

Variability:

CPU: 0.06
Memory: 109 MiB

alt_text

While there is better balance when utilizing the greedy minimum algorithm, there is an issue when changing any aspect of the Argo CD sharding parameters. If you are adding shards, removing shards, adding clusters, and/or removing clusters, the algorithm can trigger large-scale changes in the shard assignments. Changes to the shard assignments cause shards to waste resources when switching to manage new clusters. This is especially true when utilizing ephemeral clusters in AI/ML training and big data operations, where clusters come and go. Starting from the previous experiment, we changed the number of shards from 10 to 9 and observed over 75 cluster-to-shard assignment changes out of 100 clusters, excluding the changes associated with the removed shard.

Weighted Ring Hash

In order to decrease the number of shard assignment changes, a well-known method called consistent hashing is explored for our use case (Reference). Consistent hashing algorithms utilize a ring hash to make distribution decisions. This method is already widely used by network load balancing applications to distribute traffic evenly and in a distributed manner, independent of the number of servers/nodes. By utilizing a ring hash algorithm to determine shard assignments, we were able to decrease the number of shard assignment changes when we changed the number of shards from 10 to 9: we observed 48 cluster-to-shard assignment changes, excluding the changes associated with the removed shard.

alt_text

To ensure balance, weighting is applied at each shard assignment so that the shard with the fewest apps is given the highest weight when choosing shards for assignment. The balancing is not perfect: CPU variability increased from 0.06 with the greedy minimum algorithm to 0.12.

Variability:

CPU: 0.12
Memory: 163 MiB

Consistent Hash with Bounded Loads

The ring hash algorithm was never designed to allow dynamically updating the weights based on load. While we were able to utilize it for this purpose, we also looked at another algorithm called Consistent Hashing with Bounded Loads (Reference), which aims to solve the problem of combining consistent hashing with load uniformity. By utilizing this algorithm, we were able to significantly decrease the redistribution of cluster-to-shard assignments. When we changed the number of shards from 10 to 9, we observed only 15 cluster-to-shard assignment changes, excluding the changes associated with the removed shard.

alt_text

The trade-off is slightly worse cluster/app balancing than the weighted ring hash, with CPU variability increasing from 0.12 to 0.17.

Variability:

CPU: 0.17
Memory: 131 MiB

There are no direct recommendations about which algorithm you should utilize, as each of them has its pros and cons. You should evaluate each for your environment, depending on whether you are looking for strict balancing of clusters/apps across the shards or want to minimize the impact of making frequent changes to your Argo CD environment.

Conclusion

In this blog post, we continued our scalability tests of the Argo CD app controller by answering some questions we had from our first scalability tests about the common scalability parameters. We showed how QPS/Burst QPS affects the K8s API server, determined why status/operation processors did not affect our previous scalability tests, and how those parameters are linked together. We then continued our scalability tests by pushing the Argo CD app controller to 500 clusters and 50,000 apps. We ended our tests by showing that a key component of scaling the Argo CD app controller is how it performs sharding. By doing a deep dive into how the app controller performs sharding, we also identified ways to improve it by adding and evaluating new sharding algorithms. We are currently evaluating how to contribute these changes back to Argo CD. Stay tuned for those contributions and reach out on the CNCF #argo-sig-scalability or the #cnoe-interest Slack channel to get help optimizing for your use-cases and scenarios.



· 5 min read
Nima Kaviani

Adobe, Amazon Web Services, Autodesk, Salesforce, and Twilio have come together to launch an open source initiative for building internal developer platforms (IDPs). Cloud Native Operational Excellence (aka, CNOE, pronounced Kuh.no) is a joint effort to share developer tooling, thoughts, and patterns to help organizations make informed technology choices and resolve common pain points. CNOE will enable organizations to navigate tooling sprawl and technology churn by coordinating contributions, offering tools, and providing neutral and unbiased guidance on technology choices to deliver internal developer platforms.

Developer productivity is increasingly important for organizations to compete in today’s fast-paced marketplace. To increase productivity, many organizations are taking a platform engineering approach to build internal developer platforms that abstract away complexity and enable faster, more secure software delivery. These internal developer platforms are long-term strategic investments, and the choice of open source technologies and architectures used to build these platforms can greatly impact their long-term success and viability.

CNOE is a community for organizations passionate about evolving experiences in developer productivity and efficiency. Contributors to this community are sharing their open source developer platform tooling choices to bring awareness to the best practices that have helped their respective teams. With such awareness comes alignment and the ability to de-risk their technology choices over the long term.

The CNOE community will navigate their operational technology decisions together, coordinate contributions, and offer guidance on which Cloud Native Computing Foundation (CNCF) technologies to use to achieve cloud efficiencies. CNOE will aim to:

  • Create an open source first strategy for internal developer platform capabilities, prioritizing CNCF technologies.
  • Build community alignment on technology choices and best practices.
  • Elevate tools and practices that can benefit a wide range of organizations building their own internal developer platforms.
  • Build for the infrastructure and customize to developer needs, making the solutions and patterns flexible for adoption.
  • Provide artifacts about tools, patterns, and practices to be easily consumable by the community.

“The work of building secure, reliable, compliant, and regionalized software is becoming more and more complicated. Development teams need the right separation of concerns to build efficiently and move fast. Internal developer platforms enable just that. They abstract away complexity so a team can focus fully on their key goals. I’m excited to see the CNOE community share experiences, expand ideas beyond a single company’s viewpoint, and de-risk our technology strategies to build better together.” - Ben Cochran, VP Developer Enablement at Autodesk

"As a technology company, CNOE is an extension of our DNA, and open source is key to our platform. CNOE fosters collaboration within the industry, minimizes duplicated work, and emphasizes unique products. I'm eager to see our contributions to CNOE and others benefiting from it." - Chris Lyon, VP of Engineering Segment at Twilio.

"Open source software is a core component that many organizations leverage to power their internal developer platforms. Organizations often anchor on specific capabilities to power their developer platforms like Continuous Integration/Continuous Delivery, Infrastructure as Code, Service Mesh, Policy controls, Artifact management, and developer portals. As a result, they have been seeking a forum to share best practices and to share their findings on the tooling choices they have been using. I’m incredibly excited to see AWS contribute to CNOE and CNOE be the vehicle that creates industry alignment based on the intrinsic gravity of the tooling choices being made at scale.” - said Paul Roberts, Sr. Principal Solutions Architect at AWS.

“Adobe believes in the transformative power of open source software. We are excited to be a founding member of CNOE and to partner with other industry thought leaders to define and share our vision of a cloud native stack for rapidly building Internal Developer Platforms.” - Dave Weinstein, VP of Engineering at Adobe.

“Salesforce is deeply engaged in the Open Source community, which was integral in building Hyperforce, a reimagination of our trusted platform architecture for the public cloud. Salesforce is honored to serve as a launch partner for CNOE, further advancing the adoption of open source technologies and assuring companies of sound technology decisions and sustained support for years to come.” - Josh Meier, Hyperforce Lead Architect

With the launch of CNOE, members will contribute tooling, plugins, and reference implementations that facilitate building internal developer platforms. Members are also releasing a capability map that captures key open technologies and their relevance in building internal developer platforms across these organizations.

As we move forward, each member organization will continue to share their approach on adopting and composing the tooling and technologies recommended by the CNOE working group to deliver on their IDPs.

CNOE invites more companies to join us. To learn more about CNOE, visit https://cnoe.io, where we share extended details about patterns and practices we are developing. Explore options to get involved and contact us via the CNCF slack channel #cnoe-public.

Special thanks to the many people who helped with the launch: Andrew Lee, Omar Kahil, Ben Fields, Bryan Landes, Vikram Venkataraman, Rick Sostheim, Manabu McCloskey, Praseeda Sathaye, and Vara Bonthu from AWS; Rob Hilton (formerly AWS, now Google); Jesse Sanford, Greg Haynes, Mani Kandadai Venkatesh, Sara Mesing, and Brandon Leach from Autodesk; Jesse Adametz and Wes Medford from Twilio; and Rohan Kapoor and Vikram Sethi from Adobe.

Member Announcements


On the positive side:

  • This is a relatively simple approach and works for smaller teams with a smaller number of applications or systems
  • Having a second git repository that captures the end state of an entity stays close to core GitOps practices
  • Does not require significant modification to the developer portal

On the negative side:

  • There is inherent duplication of entity definitions
  • Adding custom metadata is not trivial for application teams, as it requires changes to the integration workflow, adding load and demand on the DevOps teams
  • There is less abstraction in place, as end application users are directly exposed to the YAML specification of the entities
  • It does not scale well as the number of systems and entities grows

ci-as-source-of-truth

Use a central control plane as the source of truth

The hub-and-spoke model is the most advocated model when applying GitOps practices. Your control plane cluster runs and manages your platform tools: your CI, your CD, your developer portal, your infrastructure as code tooling, etc.

On the positive side:

  • There really is a single place to inspect the status of entities. E.g., Argo applications can tell you the status of deployed applications. You can also inspect the status of workflows, infrastructure resources, and any other entity that the control plane cluster manages.
  • You can use the Backstage Kubernetes plugin seamlessly, or with some small tweaks. Alternatively, this can be achieved by introducing fairly lightweight Backstage custom entity providers which pull and show the status of entities in the Backstage portal.
  • In an organization with a diverse set of distributed systems, the control plane cluster can be used as the integration layer by wrapping legacy APIs and/or implementing native controllers.

On the negative side:

  • Most organizations do not have a central control plane and adopting one as the source of truth is often a significant change, especially if an organization is early in their GitOps transition.
  • For organizations deep into a federated model of operation with different teams running and managing their platforms separately and rather independently, it could be challenging to offer a single control plane that aggregates data across all teams.
  • Management of change could become cumbersome. The existence of a single control plane can create bottlenecks when changes occur to a set of entities or practices. Changes in organizations or systems may result in changes to various entities managed across several teams. Bringing GitOps practices into the mix, this requires chains of approvals across multiple entities and several repositories before deployments start flowing. Depending on the size of the organization, this could lead to organizational nightmares.
  • You may need to jump through a few hoops to get from the representation of the application to its actual deployment, e.g., going from git to your continuous delivery system and from there to your target cluster.

controlplane-as-source-of-truth

Use Backstage as the source of truth

Where control planes and compute workloads are scattered, the unifying layer lies in the developer portal, i.e. Backstage. Hence, it is reasonable to construct an entity by collecting and aggregating data from various data sources, each providing partial data on the entity, making Backstage the source of truth. This generally starts with Backstage querying git for the entities that exist, then using the entity identifiers to collect metadata on how each entity contributes to a system. This could involve querying the control plane clusters and the workload clusters via custom entity providers that look for certain information, putting the collected pieces together to come close to the core promise of a developer portal: providing reliable information on the entities.
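
As a deliberately minimal illustration of what that discovery step starts from, a catalog file in git might look like the sketch below. The component name, owner, and annotations are illustrative, and which annotations matter depends entirely on the plugins and entity providers in use:

apiVersion: backstage.io/v1alpha1
kind: Component
metadata:
  name: payments-service                          # illustrative component name
  annotations:
    backstage.io/kubernetes-id: payments-service  # used by the Kubernetes plugin to locate workloads
    argocd/app-name: payments-service             # example annotation an Argo CD plugin/provider might key on
spec:
  type: service
  lifecycle: production
  owner: team-payments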

On the positive side:

  • This model copes better with legacy systems
  • Users are not exposed to, and oftentimes not even aware of, the underlying platforms; hence the underlying platform and tooling is more rigorously abstracted away
  • Changes to the system are isolated to the entities of the particular system, as managed by the underlying resources and platform. This causes less chaos when definitions, metadata, or properties of entities need to change.

On the negative side:

  • The git service may not be able to scale, technically or financially. This is particularly because Backstage may hit the git service endpoints too frequently and exceed the API limits. This could cause delays in displaying data for end users or display wrong information if partially available data is mishandled. This can be mitigated via approaches like using an eventing mechanism to notify Backstage of changes, or alternatively to store entity definitions in an alternative storage space (e.g. Amazon S3). There are challenges to such approaches too, for example when using Amazon S3, change history will be lost. Also, using an eventing mechanism could introduce security challenges that we discuss next.
  • Securing Backstage could be a challenge. For Backstage to proactively receive updates on entity changes, it would work best to configure event hooks to provide callbacks to Backstage when changes occur. Backstage, being the entry point for user workflows, sits on the critical path of platform operations. As such, platform engineers need to solve for a chicken and egg problem by deciding how to expose Backstage endpoints to receive events and yet to limit access for security reasons. The authentication methods that GitHub supports may not satisfy the security standards that an organization requires.
  • Changes to entities may not be trivial. DevOps engineers need to manage entities that they may not control. For example, if a new mandatory field is introduced to a catalog file, DevOps engineers may need to talk to the respective repository owners, create a PR, and then get approval across all affected repositories.

backstage-as-source-of-truth

Conclusion

We discussed multiple approaches to creating a reliable representation of system entities in developer portals. We do not necessarily recommend one approach over another, but it is important to find the right approach given the patterns and practices in your organization. It is also worth noting that you can choose to combine multiple approaches depending on the requirements of your teams. For example, while continuous integration can still be used to construct the actual state of the world by collecting status data and other related information, Backstage extensions can be introduced to expand on entity relations, providing a better representation of a system. Stating the obvious here, but proper selection of the patterns that work for you will go a long way toward increasing your overall team velocity down the road.

Reach out on #cnoe-interest CNCF slack channel to share thoughts and get involved in developing CNOE.


“Salesforce is deeply engaged in the Open Source community, which was integral in building Hyperforce, a reimagination of our trusted platform architecture for the public cloud. Salesforce is honored to serve as a launch partner for CNOE, further advancing the adoption of open source technologies and assuring companies of sound technology decisions and sustained support for years to come.” - Josh Meier, Hyperforce Lead Architect

With the launch of CNOE, members will contribute tooling, plugins, and reference implementations that facilitate building internal developer platforms. Members are also releasing a capability map that captures key open technologies and their relevance in building internal developer platforms across these organizations.

As we move forward, each member organization will continue to share their approach on adopting and composing the tooling and technologies recommended by the CNOE working group to deliver on their IDPs.

CNOE invites more companies to join us. To learn more about CNOE, visit https://cnoe.io, where we share extended details about patterns and practices we are developing. Explore options to get involved and contact us via the CNCF slack channel #cnoe-public.

Special thanks to the many people who helped with the launch, Andrew Lee, Omar Kahil, Ben Fields, Bryan Landes, Vikram Venkataraman, Rick Sostheim, Manabu McCloskey, Praseeda Sathaye, and Vara Bonthu from AWS, Rob Hilton (formerly AWS, now Google), Jesse Sanford, Greg Haynes, Mani Kandadai Venkatesh, Sara Mesing, and Brandon Leach from Autodesk, Jesse Adametz and Wes Medford from Twilio, Rohan Kapoor and Vikram Sethi from Adobe.

Member Announcements

diff --git a/blog/tags/launch.html b/blog/tags/launch.html
index 3df649bf..431839ea 100644
--- a/blog/tags/launch.html
+++ b/blog/tags/launch.html

One post tagged with "launch"

View All Tags


diff --git a/blog/tags/scalability.html b/blog/tags/scalability.html
index c8e547cb..fbaa55f2 100644
--- a/blog/tags/scalability.html
+++ b/blog/tags/scalability.html

2 posts tagged with "scalability"

View All Tags

· 18 min read
Andrew Lee
Vikram Sethi

Introduction

In our earlier blog posts, we have discussed scalability tests for Argo CD, where in two consecutive experiments, we pushed the limits of Argo CD to deploy 10,000 applications on ~100 clusters and then 50,000 applications on 500 clusters along with configuration and fine-tuning required to make Argo CD scale effectively. Argo CD deployments, however, do not happen in isolation, and similar to a CNOE stack, Argo CD is often deployed on a cluster along with other tooling which collectively contribute to the performance and scalability bottlenecks we see users run into.

Argo Workflows is one common tool we often see users deploy alongside Argo CD to enable workflow executions (e.g. building images, running tests, cutting releases, etc). Our early experiments with Argo Workflows revealed that, if not tuned properly, it can negatively impact the scalability of a given Kubernetes cluster, particularly if the Kubernetes cluster happens to be the control cluster managing developer workflows across a large group of users. A real world example of some of the scaling challenges you can encounter with Argo Workflows is explored in our recent ArgoCon talk: Key Takeaways from Scaling Adobe's CI/CD Solution to Support 50K Argo CD Apps.

To better understand the limitations and tuning requirements for Argo Workflows, in this blog post we publish details on the scalability experiments we ran for Argo Workflows, executing Workflows in two different load patterns: an increasing rate of up to 2100 workflows/min, and queued reconciliation of 5000 workflows, on an Amazon EKS cluster with 50x m5.large nodes. We show the correlation between the various Argo Workflows knobs and controls and the processing time, as well as the performance improvements you can get depending on how you supply the workflows to the control plane.

Test Parameters

Test Workflow

The test workflow is based on the lightweight whalesay container from docker which prints out some text and ASCII art to the terminal. The reason we chose a lightweight container is that we wanted to stress the Argo Workflows controller in managing the Workflow lifecycle (pod creation, scheduling, and cleanup) and minimize the extra overhead on the Kubernetes control plane in dealing with the data plane workloads. An example of the Workflow is below:

var helloWorldWorkflow = wfv1.Workflow{
    ObjectMeta: metav1.ObjectMeta{
        GenerateName: "hello-world-",
    },
    Spec: wfv1.WorkflowSpec{
        Entrypoint:         "whalesay",
        ServiceAccountName: "argo",
        Templates: []wfv1.Template{
            {
                Name: "whalesay",
                Container: &corev1.Container{
                    Image:   "docker/whalesay:latest",
                    Command: []string{"cowsay", "hello world"},
                },
            },
        },
        PodGC: &wfv1.PodGC{
            Strategy: "OnPodSuccess",
        },
    },
}

Argo Workflows Settings

We will be detailing how each of these settings affects Argo Workflows in the various experiments later in this blog post.

  • Controller workers: The Argo Workflows controller utilizes different workers for various operations in a Workflow lifecycle. We will be looking at two types of workers for our scalability testing.

    • workflow-workers (default: 32): These workers are threads in a single Argo Workflows controller that reconcile Argo Workflow Custom Resources (CRs). When a Workflow is created, a workflow-worker will handle the end-to-end operations of the Workflow from ensuring the pod is scheduled to ensuring the pod has finished. The number of workers can be specified by passing the --workflow-workers flag to the controller.

    • pod-cleanup-workers (default: 4): These workers clean up finished Workflows. When a Workflow has finished executing, depending on your clean-up settings, a pod-cleanup-worker will handle cleaning up the pod from the Workflow. The number of workers can be specified by passing the --pod-cleanup-workers flag to the controller.

  • Client queries per second (QPS)/Burst QPS settings (default: 20/30): These settings control when the Argo Workflows controller’s Kubernetes (K8s) client starts to throttle requests to the K8S API server. The client QPS setting limits the sustained QPS of the k8s client, while burst QPS allows a request rate in excess of the client QPS for a short period of time. The client QPS/burst QPS can be set by passing the --qps and --burst flags to the controller (a generic client-go sketch of this throttling follows the list).

  • Sharding: Sharding with multiple Argo Workflows controllers is possible by running each controller in its own namespace. Each controller then only reconciles Workflows submitted in that particular namespace. Namespace-scoped operation is enabled by passing the --namespaced flag to the controller.
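
To make the client-side settings above more concrete, here is a generic client-go sketch of how QPS/Burst throttling is configured on a Kubernetes client. This illustrates the mechanism the --qps and --burst flags tune; it is not the Argo Workflows controller's actual code, and the kubeconfig path and printed output are our own assumptions.

// Generic client-go sketch of client-side QPS/Burst throttling.
package main

import (
    "fmt"

    "k8s.io/client-go/kubernetes"
    "k8s.io/client-go/tools/clientcmd"
)

func main() {
    // Build a config from the local kubeconfig; the real controller would
    // typically use in-cluster configuration instead.
    config, err := clientcmd.BuildConfigFromFlags("", clientcmd.RecommendedHomeFile)
    if err != nil {
        panic(err)
    }

    // Mirror the defaults discussed above (--qps=20, --burst=30).
    config.QPS = 20   // sustained requests per second before client-side throttling kicks in
    config.Burst = 30 // short-term burst allowance above the sustained QPS

    clientset, err := kubernetes.NewForConfig(config)
    if err != nil {
        panic(err)
    }
    fmt.Printf("client ready (%T) with QPS=%.0f Burst=%d\n", clientset, config.QPS, config.Burst)
}

Every request the client makes is gated by this rate limiter, which is why raising these values shifts load from client-side queuing onto the K8S API server.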

Key Metrics

We chose a set of key metrics for the scalability testing because we wanted to measure how many workflows the Argo Workflows controller can reconcile and process. We will also be looking into K8s control plane metrics which might indicate your control plane cannot keep up with the Argo Workflows workload. 

  • Workqueue depth: The workqueue depth shows workflows which have not been reconciled. If the depth starts to increase, it indicates that the Argo Workflows controller is unable to handle the submission rate of Workflows. (A generic sketch of such a controller workqueue follows this list.)

  • Workqueue latency: The workqueue latency is the average time workflows spent waiting in the workqueue. A lower value indicates that the Argo Workflows controller is processing workflows faster so that they are not waiting in the workqueue.

  • K8S api server requests per second: The read and write requests per second being made to the K8S api server.
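
To make the workqueue metrics above more concrete, here is a generic client-go workqueue sketch. It is an illustration of the concept only, not Argo's implementation: items wait in the queue until a worker picks them up, and the queue length corresponds to the "depth" being measured; the longer an item sits before Get, the higher the latency.

// Generic sketch of a controller-style workqueue and a single worker.
package main

import (
    "fmt"
    "time"

    "k8s.io/client-go/util/workqueue"
)

func main() {
    queue := workqueue.New() // FIFO queue, analogous to the controller's workflow queue

    // Producer: enqueue reconcile keys (e.g. namespace/name of a Workflow).
    for i := 0; i < 5; i++ {
        queue.Add(fmt.Sprintf("argo/hello-world-%d", i))
    }
    fmt.Println("workqueue depth:", queue.Len())

    // Worker: drain the queue the way a workflow-worker thread would.
    go func() {
        for {
            item, shutdown := queue.Get()
            if shutdown {
                return
            }
            fmt.Println("reconciling", item)
            queue.Done(item)
        }
    }()

    time.Sleep(time.Second)
    queue.ShutDown()
}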

We didn’t include CPU/Memory as key metrics because during our testing we did not see any significant impact on either, most likely because of the simplistic workflows utilized for this benchmark.

Environment

We ran the experiments in an AWS environment utilizing a single Amazon EKS cluster. The Kubernetes version is 1.27 and the Argo Workflows version is 3.5.4. No resource quotas were applied to the Argo Workflows controller. For the cluster, we start by provisioning 1x m5.8xlarge Amazon Elastic Compute Cloud (Amazon EC2) instance, which runs the Argo Workflows controller, and 50x m5.large instances for executing workflows. The number of execution instances is sufficient to run all 5000 workflows in parallel, ensuring that pods are not waiting on resources to execute. Monitoring and metrics for Argo Workflows were provided by Prometheus/Grafana.

Methodology

There will be two types of load patterns evaluated:

Increasing Rate Test: Workflows will be submitted at an increasing rate (workflows/min) until the Argo Workflows controller cannot keep up. The state at which the controller cannot keep up is when there are >0 workflows in the workflow queue or there is increasing queue latency. That rate of Workflow submissions will be noted as the maximum rate at which the Argo Workflows can be processed with the current settings.

Queued Reconciliation Test: 5000 workflows are submitted in less than a minute. Metrics are monitored from when the Argo Workflows controller starts processing workflows to when it has reconciled all 5000 workflows. The number of nodes is sufficient for running all the workflows simultaneously.
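
As an illustration of how such load patterns might be generated, the sketch below submits a Workflow manifest at a fixed rate by shelling out to kubectl. The manifest name, namespace, and rate are placeholders, and this is not the exact harness used for these experiments.

// Hypothetical load generator for the "increasing rate" pattern.
package main

import (
    "log"
    "os/exec"
    "time"
)

func main() {
    const workflowsPerMinute = 300
    interval := time.Minute / workflowsPerMinute

    ticker := time.NewTicker(interval)
    defer ticker.Stop()

    for range ticker.C {
        // generateName in the manifest keeps Workflow names unique; for the
        // queued pattern you would instead submit all 5000 manifests in one burst.
        out, err := exec.Command("kubectl", "create", "-n", "argo", "-f", "whalesay-workflow.yaml").CombinedOutput()
        if err != nil {
            log.Printf("submit failed: %v: %s", err, out)
        }
    }
}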

Experiments

Experiment 1: Baseline

In our baseline experiment, we are running in a single Argo Workflows shard (namespace) with default settings.

Increasing Rate Test:

As you can see below, the Argo Workflows controller can process up to 270 workflows/min. The average workqueue latency and workqueue depth are nearly zero. At 300 workflows/min, workqueue latency and workqueue depth start to increase.

Enter image alt description

Queued Reconciliation Test:

It took around 17 mins to reconcile 5000 workflows, and the peak average workqueue latency was 5.38 minutes.

Enter image alt description

Experiment 2: Workflow Workers

For this experiment, we increase the number of workflow workers from the default of 32 to 128 where the workers use the maximum QPS and burst settings available to them. We also had to increase the number of pod-cleanup-workers to 32 as the Argo Workflows controller was experiencing some instability, where the controller pod was consistently crashing with the default value of 4.

Increasing Rate Test:

For the increasing workflow rate test, we can see exactly when the number of workflow workers is not sufficient to process the load. Both workqueue latency and depth start to increase indicating that workflows are waiting to be reconciled. When we increase the number of workers, the controller is able to reconcile the current load until an additional load is placed on it. For 32 workers, that limit is 300 workflows/min. When we increase the number of workers to 64, it is able to process that load until load is increased to 330 workflows/min. Then we increase the number of workers to 96 and it can process the additional load again. When we increase to 360 workflows/min, we need to bump the number of workers to 128.

Workers    Max workflows/minute
32         270
64         300
96         330
128        360

Enter image alt description

For the K8S api server, we see sustained 180 writes/sec and 70 reads/sec during the increasing rate tests.

Enter image alt description

Queued Reconciliation Test:

For the queued reconciliation test, the time it took to reconcile all the workflows did not change significantly. With 32 workers it took 17 mins to reconcile while with 96 workers it took 16 mins. The peak workqueue latency did decrease from 5.38 mins with 32 workers to 3.19 mins with 96 workers. With 128 workers, the Argo Workflows controller kept crashing.

Workers    Peak avg latency (mins)    Reconcile time (mins)
32         5.38                       17
64         5.06                       18
96         3.19                       16
128        N/A                        N/A

Enter image alt description

For the K8S api server, we see peaks of up to 260 writes/sec and 90 reads/sec during the queued reconciliation tests. You will notice that for the last test there is no K8S api server activity, as the Argo Workflows controller was misbehaving due to client-side throttling.

Enter image alt description

Observations from Experiment 2:

Workers play a big part in how fast the Argo Workflows controller is able to reconcile the workflows being submitted. If you are observing increasing workqueue latency and a growing workqueue depth, changing the number of workers is a potential way to improve performance. There are a few observations that we want to call out. One is that if we compare the two different patterns, one where we submit workflows at a constant rate and one in which we load up the workqueue all at once, we can see variations in calculated throughput. We can calculate the time it would take to reconcile 5000 workflows from the increasing rate test results (e.g. at 270 workflows/min, 5000 / 270 ≈ 18.5 minutes) and compare it to the queued reconciliation test.

Workers    Increasing rate test time to reconcile 5000 workflows (mins)    Reconcile time of 5000 workflows queued all at once (mins)
32         18.5                                                             17
64         16.6                                                             18
96         15.1                                                             16
128        13.8                                                             N/A

We do get some conflicting results when we make this comparison. With 32 and 64 workers, the increasing rate test is actually slower than the queued reconciliation test. But if we increase to 96 workers, we can see that the increasing rate test results are faster. We were unable to compare with 128 workers as the Argo Workflows controller crashed when trying to run the queued reconciliation test. When investigating the cause of the crash, the logs have several messages like the following:

Waited for 6.185558715s due to client-side throttling, not priority and fairness, request: DELETE:https://10.100.0.1:443/api/v1/namespaces/argoworkflows1/pods/hello-world-57cfda8a-dc8b-4854-83a0-05785fb25e4b-3gwthk

These messages indicate that we should increase the Client QPS settings which we will evaluate in the next experiment.

Experiment 3: Client QPS Settings

For this experiment, we set the number of workflow workers back to the default of 32. We will then increase the QPS/Burst by increments of 10/10, from 20/30 to 50/60. We chose to only increase by 10/10 because any large increase past 50/60 did not yield any performance improvements. We believe that this is partly because we kept the workers at 32.

Initial Testing

Increasing Rate Test:

The QPS/Burst settings had a significant impact on the increasing rate test. By increasing the QPS/Burst from 20/30 to 30/40, we see ~50% improvement in max workflows/min from 270 to 420. When we increase the QPS/Burst from 30/40 to 40/50, we see another 28% improvement in max workflows/min from 420 to 540. When increasing from 40/50 to 50/60 there was only an additional 5% improvement. For 32 workers, increasing past 50/60 did not yield any significant improvements to the max workflows/min.

QPS/Burst    Max workflows/minute
20/30        270
30/40        420
40/50        540
50/60        570

Enter image alt description

When changing QPS/Burst, we also need to monitor the K8S API server. Looking at the K8S API server req/s, we see sustained 390 writes/sec and 85 reads/sec.

Enter image alt description

Queued Reconciliation Test:

Again, the QPS/Burst settings make a big difference in the queued reconciliation test when compared to just changing the workflow workers. Starting from the default settings of 20/30, we see decreasing reconcile times from 19 mins to 12 mins to 8 mins and finally to 6 mins when setting the QPS/Burst to 50/60. The peak average latency also decreased from 4.79 mins to 1.94 mins. We did note a higher peak avg latency with 30/40 vs 20/30, but if you examine the graph you can see a steeper drop in latency, accounting for the shorter reconcile time. Similar to the increasing rate test, increasing the QPS/Burst further did not yield any improvements.

QPS/Burst    Peak avg latency (mins)    Reconcile time (mins)
20/30        4.79                       19
30/40        5.66                       12
40/50        2.98                       8
50/60        1.94                       6

Enter image alt description

When looking at the K8S API server, we see peaks of up to 700 writes/sec and 200 reads/sec during the tests.

Enter image alt description

When compared to the workflow workers testing, you can see that increasing the QPS/Burst is able to push the K8S API server harder and improve Argo Workflows' overall performance. We do see some diminishing returns when increasing QPS/Burst past 50/60, even though it appears that the K8S API server has plenty of capacity for additional load. For the next test, we will increase both the workflow workers and the QPS/burst to see how far we can push Argo Workflows and the K8s API server.

Max Load Test

Queued Reconciliation Test:

We increased the number of workers to 128 and QPS/burst to 60/70 and observed peak average latency of 54 secs and a reconciliation time of 5 mins. Increasing either the workers or QPS/Burst did not improve these numbers.

Enter image alt description

Looking at the K8s API server, we saw peaks of 800 writes/sec and 190 reads/sec.

Enter image alt description

Increasing Rate Test:

Starting with 128 workers and QPS/Burst of 60/70, we were able to push Argo Workflows to 810 workflows/min. But past that point, there were no improvements with more workers or increased QPS/Burst limits.

Enter image alt description

We can see increased K8s API server activity with sustained 700 writes/sec and 160 reads/sec.

Enter image alt description

Observations from Experiment 3

One observation we made in the previous experiment with workflow workers is that the two different patterns of submitting workflows can be compared. We made that comparison again with the QPS/Burst tests and saw the following results:

QPS/Burst    Workers    Increasing rate test time to reconcile 5000 workflows (mins)    Reconcile time of 5000 workflows queued all at once (mins)
20/30        32         18.5                                                             19
30/40        32         11.9                                                             12
50/60        32         9.2                                                              8
60/70        32         8.7                                                              6
70/80        128        6.1                                                              5

When we take the comparison data from experiment 2 together with the data above, we can see a slight improvement from submitting all workflows together vs staggering them. We are not sure why this is the case, and more experiments are required to understand this behavior.

It seems that we have hit a wall with 128 workers and a QPS/burst of 60/70 for a single Argo Workflows Controller. We will now evaluate Sharding and see if we can improve our performance from this point.

Experiment 4: Sharding

For this experiment, we will evaluate 1 shard, 2 shards, and 5 shards of the Argo Workflows controller with the default settings. We will then try for a maximum load test utilizing workflow workers, QPS/burst, and sharding to see the maximum performance on our current infrastructure.

Initial Testing

Increasing Rate Test:

Sharding the Argo Workflows controller has a linear impact on performance with the increasing rate test. By increasing the number of shards from 1 to 2, we see a 100% improvement in max workflows/min from 270 to 540. When we increase the shards from 2 to 5, we see an additional 150% improvement in max workflows/min from 540 to 1350.

Shards    Max workflows/min
1         270
2         540
5         1350

One thing to note is that, when increasing the rate, the submission rate is increased by 30 workflows/min per shard. This means that the difference between two consecutive rates with 2 shards is 2 x 30 = 60 workflows/min, and with 5 shards it is 5 x 30 = 150 workflows/min. That is why, for 2 shards, when the maximum load was determined at 600 workflows/min, we step down one rate, which is 600 - 60 = 540 workflows/min.

Enter image alt description

You can see a significant impact on the K8s API server with sustained 1400 writes/sec and 300 reads/sec.

Enter image alt description

Queued Reconciliation Test:

As shown in the Increasing Rate Test, sharding has a huge impact on performance for the queued reconciliation test. With 1 shard it takes 18 mins to reconcile 5000 workflows, while with 2 shards it takes 9 mins. With 5 shards the reconcile time is further reduced to 4 mins.

Shards    Peak avg latency (mins)    Reconcile time (mins)
1         5.43                       18
2         3.81                       9
5         1.42                       4

Enter image alt description

The impact on the K8s API server was not as significant when compared to previous experiments.

Max Load Test

Increasing Rate Test:

When increasing the workflow workers to 128, QPS/burst to 60/70 and shards to 5, the Argo Workflows controller is able to process up to 2100 workflows/min. Any higher than this seems to run into K8s API Priority and Fairness (APF) limits.

Enter image alt description

When looking at the K8s API server, we are seeing significant impact with peaks of 1500 writes/sec and 350 reads/sec.

Enter image alt description

When investigating why we are unable to push higher on the K8s API server, we see that APF limits are coming into effect by looking at the apiserver_flowcontrol_current_inqueue_requests. This metric shows the number of requests waiting in the APF flowcontrol queue.

Enter image alt description

Queued Reconciliation Test:

With the max load settings, we observed that the peak workqueue latency is only 20 seconds and the reconcile time is 2 minutes.

Enter image alt description

The impact on the K8s API server is actually less than in the previous max load queued reconciliation tests.

Enter image alt description

Observations from Experiment 4

As we did in previous experiments, we again make the comparison between the two different load patterns:

Shards                 Increasing rate test time to reconcile 5000 workflows (mins)    Reconcile time of 5000 workflows queued all at once (mins)
1                      18.5                                                             18
2                      9.2                                                              9
5                      3.7                                                              4
Max load (5 shards)    2.3                                                              2

In general, it appears that submitting all workflows at once performs slightly better than submitting workflows at a steady rate. More experiments will need to be done to further investigate this behavior.

Conclusion

In this blog post we discussed our initial efforts in documenting and understanding the scaling characteristics of the Argo Workflows controller. Our findings show that the existing mechanisms for increasing workflow workers, increasing client and burst QPS settings and sharding the controller can help Argo Workflows scale better. Another interesting observation is that we saw differences in performance with how you submit your workflows. For the next set of experiments, we plan to evaluate more environmental variables and different types of workflows: multi-step and/or long running. Stay tuned for the report on our next round of experiments and reach out on the CNCF #argo-sig-scalability Slack channel to get help optimizing for your use-cases and scenarios.

· 21 min read
Andrew Lee
Michael Crenshaw
Gaurav Dhamija

Introduction

In Part 1 of our Argo CD benchmarking blog post, we analyzed the impacts of various Argo CD configuration parameters on the performance of Argo CD. In particular we measured the impact of status and operation processes, client QPS, burst QPS, and sharding algorithms on the overall synchronization and reconciliation behavior in Argo CD. We showed that using the right configuration and sharding strategy, particularly by properly setting client and burst QPS, as well as by splitting the workload across multiple workload clusters using Argo CD sharding, overall sync time can be improved by a factor of 4.

Here, and in Part 2 of our scalability work, we push our scalability experiments for Argo CD further. In particular, among other tests, we run our scalability metrics against a maximum of 500 workload clusters, deploying 50,000 Argo applications. This, to the best of our knowledge, is the largest scalability test ever done for Argo CD. We also report on a much deeper set of sharding experiments, utilizing different sharding algorithms for distribution of load across 100 workload clusters. While we report on running our experiments against a legacy sharding algorithm and a round-robin algorithm that already exist in Argo CD 2.8, we also discuss results of workload distribution using 3 new sharding algorithms we developed in collaboration with Red Hat, namely: a greedy minimum algorithm, a weighted ring hash algorithm, and a consistent hash with bounded loads algorithm. We show that, depending on the optimization goals one has in mind, choosing from the new sharding algorithms can improve CPU utilization by a factor of 3 and reduce application-to-shard rebalancing by a factor of 5, significantly improving the performance of a highly distributed and massively scaled Argo CD deployment.

Experiment 1: How Client QPS/Burst QPS affects the Kubernetes API Server

Objective:

The objective of the first experiment is to understand the impact of the QPS & Burst Rate parameters on 1/ the Kubernetes control plane for both the Argo CD cluster and the remote application clusters, and 2/ the overall sync duration for Argo CD applications. To understand the impact on the Kubernetes API server, we observed the following control plane metrics:

  • Latency (apiserver_request_duration_seconds_bucket)
  • Throughput (apiserver_request_total)
  • Error Rate (apiserver_request_total{code=~"[45].."}) for any request returning an error code 4xx or 5xx.

To analyze impact on application synchronization, we observed Sync Duration and No. of Goroutines Argo CD server metrics.

Test Infrastructure:

In terms of test infrastructure and workload configuration, we had one central Amazon EKS cluster with Argo CD Server running on it. This central cluster connected with three remote Amazon EKS clusters with each one of them hosting 5000 Argo CD applications. Each application is a Configmap (2KB) provisioned in a dedicated namespace. All of the four clusters, one central and three remote, had a dedicated monitoring stack composed of Prometheus and Grafana installed on them.

Observations:

Observation 1 - Impact on Argo CD application synchronization

The table and graphs below highlight the impact of QPS & Burst Rate on “Sync Duration” as well as the average and maximum no. of goroutines active during the test run.

QPS    Burst Rate    Sync Duration    No. of GoRoutines (Avg)    No. of GoRoutines (Max)
50     100           61.5 mins        1760                       1810
100    200           29.5 mins        2120                       2310
150    300           19.0 mins        2520                       2760
200    400           18.0 mins        2620                       2780
250    500           17.5 mins        2590                       2760
300    600           18.0 mins        2540                       2760

alt_text

To summarize, during the test we immediately observed a ~52% reduction in sync duration (from 61.5 mins to 29.5 mins) as we increased QPS & Burst Rate from the default values to 100 & 200 respectively. This also correlated with a corresponding increase in the no. of Goroutines processing application synchronization requests. The benefit from increasing these parameters showed diminishing returns in subsequent runs: beyond a QPS & Burst Rate of 150 & 300 respectively, there wasn't any measurable improvement. This again correlated with the number of Goroutines actively processing sync requests.

Observation 2 - Impact on central Amazon EKS cluster control plane hosting Argo CD Server

The table and graphs below highlight the impact of QPS & Burst Rate on throughput and latency for the Amazon EKS control plane hosting the Argo CD Server. We can observe an increase in the request rate per second to the Kubernetes control plane, which is in line with the previous observations about the increase in the no. of goroutines processing sync requests. The increased activity related to sync operations translates into increased requests to the Amazon EKS control plane, tapering off at a QPS of 150 and Burst Rate of 300. Additional increases in the QPS and Burst Rate parameters do not noticeably impact the request rate per second.

QPS    Burst Rate    Request Rate (Max)    Latency p50 (Max)    Latency p90 (Max)
50     100           27.2 rps              13.0 ms              22.6 ms
100    200           31.9 rps              13.3 ms              23.1 ms
150    300           39.8 rps              14.3 ms              24.0 ms
200    400           41.4 rps              14.9 ms              24.4 ms
250    500           39.0 rps              15.1 ms              24.4 ms
300    600           40.7 rps              16.4 ms              34.5 ms

From a latency perspective, over the course of testing the average (p50) duration remained within the range of 13 to 16.5 ms, and the p90 latency within 22 ms to 34 ms. The error rate remained consistently around ~0.22%, with a brief spike to ~0.25% (an increase of ~0.03%).

The relatively low latency numbers and low error rate (<0.25%) indicate that the Amazon EKS control plane was able to handle the load comfortably. Increasing QPS and Burst Rate stretched the control plane only to a limited extent, indicating it still had resources to process additional requests as long as the Argo CD server could generate request traffic.

alt_text

Observation 3 - Impact on remote Amazon EKS cluster control plane hosting applications

We had similar observations regarding latency, throughput, and error rate for the Amazon EKS control planes of the remote application clusters. These are the clusters hosting ~5000 Argo CD applications each, connected to the Argo CD Server on the central Amazon EKS cluster. The throughput peaked at ~35 requests per second with a QPS and burst rate of 150 & 300 respectively. From an average latency perspective, it remained consistently within single-digit milliseconds, hovering around ~5 ms.

alt_text

Experiment 2: Revisiting Status/Operation Processors

Objective:

The objective of the second experiment is to explore why status/operation processors did not have an effect on the sync times in our previous experiments. It is possible that the simple nature of the ConfigMap applications, which take <1s to deploy, is causing this behavior. Most real-world applications consist of tens to hundreds of resources and take longer to deploy. In this experiment, we simulate a more complex application which takes longer to deploy than the original ConfigMap application.

Test Infrastructure:

Central Argo CD cluster running on a single m5.2xlarge managing 100 application clusters. In order to simulate larger applications, each application will execute a PreSync job which waits 10 seconds before deploying the original ConfigMap application.

Example of the PreSync Job:

apiVersion: batch/v1
kind: Job
metadata:
  name: before
  annotations:
    argocd.argoproj.io/hook: PreSync
    argocd.argoproj.io/hook-delete-policy: HookSucceeded
spec:
  template:
    spec:
      containers:
      - name: sleep
        image: alpine:latest
        command: ["sleep", "10"]
      restartPolicy: Never
  backoffLimit: 0

Observations:

Observation 1 - Syncing never finishes and requires a restart of the application controller to continue syncing

The screenshot below shows that from the start of the sync test at 17:02 till around 17:41, the sync process was deadlocked. We observed no changes to synced apps and the app_operation_processing_queue was pinned at 10k operations.

alt_text

Looking at the Argo CD console for a single application we see that the PreSync job finished 17 mins ago, but the application stayed in the Syncing phase.

alt_text

Observation 2: There is a link between client QPS/burst QPS and operation/status processor settings

In order to fix the sync freezing issue, we increased the client QPS/burst QPS from the default 50/100 to 100/200. After the change we were able to collect data on operation/status processor settings.

operation/status processors: 25/50     Sync time: 45 mins
operation/status processors: 50/100    Sync time: 30 mins

We can see that there is a link between status/operation processors and client QPS/burst QPS settings. Changing one or the other could be required to improve sync times and Argo CD performance depending on your environment. Our recommendation is to first change the status/operation processor settings. If you run into Argo CD locking up or the performance not increasing further, and you have sufficient resources, you can try increasing the client QPS/burst QPS. But as mentioned in the first experiment, ensure you are monitoring the k8s api-server.

Experiment 3: Cluster Scaling

Objective:

The following experiment is designed to test the compute demands of the Argo CD app controller when managing more than 100 application clusters.

Test Infrastructure:

Central Argo CD cluster with 10 app controller shards running on a single m5.2xlarge node managing 100/250/500 application clusters and 10k 2KB ConfigMap applications.

Observations:

From earlier experiments, we can see that when managing 100 clusters, we are close to the limit of a single m5.2xlarge node. As we push further to 250/500 clusters, we have two observations. The first is that the graph data is less smooth than in the sync test of 100 clusters. This can indicate that Prometheus is running out of compute as Argo CD consumes most of it. Please note that we are not using any resource limits/requests in our experiments. If proper resource limits/requests were set, we would most likely only see performance issues with Argo CD, and not Prometheus, when operating at the limit of the compute resources. The second observation is that in both the 250 and 500 cluster tests, there is some drop-off in metric data. For the 250 cluster test, there is a blip at the 16:16 mark for Memory Usage. For the 500 cluster test, there are blips in data at the 21:05 mark in the Workqueue depth, CPU usage, and Memory usage. In spite of these observations, the sync process completes in a reasonable time.

Clusters: 100    Sync time: 9 mins
Clusters: 250    Sync time: 9 mins
Clusters: 500    Sync time: 11 mins

From this experiment, you can see that as you approach the limit of your compute resources, Argo CD and other applications running in your k8s environment could experience issues. It is recommended that you set proper resource limits/requests for your monitoring stack to ensure you have insights into what could be causing your performance issues.

Experiment 4: Application Scaling

Objective:

This experiment is meant to push the Argo CD app controller beyond 10k applications. As the previous rounds of experiments were performed with 10k apps, the intention of these experiments is to scale the Argo CD app controller up to 50k apps.

Test Infrastructure:

We will be performing this experiment on a central Argo CD cluster with 10 app controller shards and 500 downstream application clusters. As we scale up to 10k, 15k, 20k, 25k, 30k, and 50k 2KB ConfigMap applications, we will add additional m5.2xlarge nodes to the Argo CD cluster.

Observations:

Sync test at 15k applications with a single m5.2xlarge: you can see blips in data indicating unhealthy behavior on the cluster. CPU and Memory usage is near 100% utilization of the 8 vCPUs and 30 GB of memory. After adding another node, for a total of two m5.2xlarge, we were able to perform a sync in 9 mins.

After adding another node, we were able to continue our application scaling tests. You can see in the graphs below that syncing 20k and 25k apps was not a problem. The sync test of 30k apps shown on the third graph shows some blips in data, indicating that we are at the limits of two nodes.

Apps: 20000    Sync time: 12 mins
Apps: 25000    Sync time: 11 mins
Apps: 30000    Sync time: 19 mins

For the final test in this experiment, we pushed the cluster to sync 50k apps.

While the cluster was able to manage reconciliation for the 50k apps, as shown by a stable Sync Status graph from 8:40, when we start the sync at the 9:02 mark you can see unhealthy behavior in the graph data. Examining the CPU/Memory usage, you can see 100% CPU utilization across the cluster. After scaling the cluster to three m5.2xlarge nodes, we were able to perform a sync in 22 mins.

From the scaling tests, we can see that the Argo CD app controller scales effectively by adding compute resources as we increase the number of applications to sync.

Experiment 5: How Many Shards?

Objective:

In previous experiments, we utilized ten app controller shards running across multiple nodes. In this experiment, we will explore how the number of app controller shards affects performance.

Test Infrastructure:

Central Argo CD cluster with 3, 6, 9 app controller shards running on 3 m5.2xlarge node(s) managing 500 application clusters and 50k 2KB ConfigMap applications.

Observations:

For the baseline of three shards it took 75 mins to perform a sync. Adding additional shards saw further improvements with a sync time of 37 mins for six shards and a sync time of 21 mins for nine shards. Further increasing shards beyond nine did not yield any improvements.

Shards: 3    Sync time: 75 mins
Shards: 6    Sync time: 37 mins
Shards: 9    Sync time: 21 mins

Looking at the CPU and Memory utilization, you can see that adding shards can improve performance only if there are free resources to consume. With the baseline of three shards, CPU utilization of the nodes is well below the eight vCPUs allocated to each node. As we add more shards, CPU utilization increases until we are close to 100% CPU utilization with nine shards. Adding any more shards would not yield any performance benefits unless we add more nodes.

Shards: 3    Shards: 6    Shards: 9

From the experiments, the Argo CD app controller sharding mechanism is able to scale as you add more compute resources. Sharding allows both horizontal and vertical scaling. As you add more shards, you can horizontally scale by adding more nodes or vertically scale by utilizing a larger node with more compute resources.

Experiment 6: Sharding Deep Dive

Objective:

With the release of Argo CD 2.8, a new sharding algorithm, round-robin, was released. The existing legacy sharding algorithm takes the hash sum of the cluster id modulo the number of replicas to determine the shard that should manage the cluster. This led to an imbalance in the number of clusters being managed by each shard. The new round-robin sharding algorithm is supposed to ensure an equal distribution of clusters across shards. We also introduce 3 new algorithms: greedy minimum, weighted ring hash, and consistent hash with bounded loads. This experiment evaluates all the algorithms on shard balance, application distribution, and rebalancing on changes to the environment.
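
To make the legacy scheme concrete, here is a minimal sketch of the hash-modulo idea described above. This is our own illustration using the FNV hash; Argo CD's actual implementation differs in its details.

// Illustrative sketch: hash the cluster id, modulo the number of replicas.
package main

import (
    "fmt"
    "hash/fnv"
)

func legacyShard(clusterID string, replicas int) int {
    h := fnv.New32a()
    h.Write([]byte(clusterID))
    return int(h.Sum32()) % replicas
}

func main() {
    for _, c := range []string{"cluster-a", "cluster-b", "cluster-c"} {
        fmt.Printf("%s -> shard %d\n", c, legacyShard(c, 10))
    }
}

Because the hash values are effectively arbitrary, nothing forces the resulting assignments to be evenly spread, which is the imbalance the round-robin and newer algorithms address.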

Test Infrastructure:

Central Argo CD cluster with 10 app controller shards running on 1 m5.2xlarge node managing 100 application clusters and 10k 2KB ConfigMap applications.

Observations:

Note: For all the observations, we start the monitoring period when we see items in the operations queue and end it when all the applications are synced. We then look at the average CPU/Memory usage during the monitoring period.

Legacy

The graph below shows the CPU Usage/Memory Usage of the 10 different Argo CD App Controller shards. Looking at the averages, you can see a large variation in how much each shard utilizes its resources. To make an accurate comparison between the different sharding methods, we calculate the variability as the range of the data for both avg CPU usage and Memory usage. The CPU usage variability is calculated by subtracting the lowest shard's CPU usage from the highest shard's: 0.55 - 0.23 = 0.32. The Memory usage variability is 452 MiB - 225 MiB = 227 MiB.

Variability:

CPU: 0.32
Memory: 227 MiB

alt_text

Round-Robin

With the newly introduced Round-Robin algorithm, you can see improved balance across the shards.

Variability:

CPU: 0.02
Memory: 110 MiB

alt_text

Better but not perfect

The new round-robin algorithm does a better job of keeping the number of clusters balanced across the shards. But in a real-world environment, you would not have an equal number of applications running on each cluster, and the work done by each shard is determined not by the number of clusters but by the number of applications. A new experiment was run which deploys a random number of applications to each cluster, with the results below. Even with the round-robin algorithm, you can see high variability in CPU/Memory usage.

Variability:

CPU: 0.27
Memory: 136 MiB

alt_text

Greedy Minimum Algorithm, sharding by the Number of Apps

A new algorithm is introduced in order to shard by the number of applications running on each cluster. It utilizes a greedy minimum approach: always choose the shard with the least number of apps when assigning a cluster. A description of the algorithm is shown below, followed by a minimal illustrative sketch:

Iterate through the cluster list:

1. Determine the number of applications per cluster.
2. Find the shard with the least number of applications.
3. Assign the cluster to that shard and add its applications to the shard's count.
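
A minimal sketch of this greedy assignment follows; the names and types are illustrative rather than Argo CD's implementation, and clusters are sorted first so the result is deterministic.

// Greedy minimum assignment: each cluster goes to the least-loaded shard.
package main

import (
    "fmt"
    "sort"
)

func assignShards(appsPerCluster map[string]int, shardCount int) map[string]int {
    // Sort cluster names for a stable iteration order.
    clusters := make([]string, 0, len(appsPerCluster))
    for c := range appsPerCluster {
        clusters = append(clusters, c)
    }
    sort.Strings(clusters)

    shardLoad := make([]int, shardCount) // running application count per shard
    assignment := make(map[string]int)   // cluster -> shard index

    for _, cluster := range clusters {
        // Pick the shard currently managing the fewest applications.
        least := 0
        for s := 1; s < shardCount; s++ {
            if shardLoad[s] < shardLoad[least] {
                least = s
            }
        }
        assignment[cluster] = least
        shardLoad[least] += appsPerCluster[cluster] // add this cluster's apps to that shard
    }
    return assignment
}

func main() {
    clusters := map[string]int{"cluster-a": 120, "cluster-b": 40, "cluster-c": 300, "cluster-d": 75}
    fmt.Println(assignShards(clusters, 3))
}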

The same experiment with a random number of applications running on each cluster is run again with the results shown below. With the new algorithm, there is better balance across the shards.

Variability:

CPU: 0.06
Memory: 109 MiB

alt_text

While there is better balance when utilizing the greedy minimum algorithm, there is an issue when changing any aspect of the Argo CD sharding parameters. Adding shards, removing shards, adding clusters, and/or removing clusters can trigger large-scale changes in the shard assignments. Changes to the shard assignments cause shards to waste resources switching to manage new clusters. This is especially true when utilizing ephemeral clusters in AI/ML training and big data operations, where clusters come and go. Starting from the previous experiment, we changed the number of shards from 10 to 9 and observed over 75 cluster-to-shard assignment changes out of 100 clusters, excluding the changes associated with the removed shard.

Weighted Ring Hash

In order to decrease the number of shard assignment changes, we explored a well-known method called consistent hashing for our use case (Reference). Consistent hashing algorithms utilize a ring hash to make distribution decisions. This method is already widely used by network load balancing applications to evenly distribute traffic independent of the number of servers/nodes. By utilizing a ring hash algorithm to determine shard assignments, we were able to decrease the number of shard assignment changes when we changed the number of shards from 10 to 9: we observed 48 cluster-to-shard assignment changes, excluding the changes associated with the removed shard. (A generic sketch of the ring-hash idea appears at the end of this subsection.)

alt_text

To ensure balance, weighting is applied at each shard assignment so that the shard with the least number of apps is given the highest weight when choosing shards for assignment. The balancing is not perfect: CPU variability has increased from 0.06 with the greedy minimum algorithm to 0.12.

Variability:

CPU: 0.12
Memory: 163 MiB
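
For reference, here is a minimal, generic ring-hash (consistent hashing) sketch of the idea referenced above. The virtual-node count, hash choice, and names are illustrative; this is not the weighted implementation evaluated in the post.

// Generic ring hash: shards own points on a ring; a cluster maps to the
// owner of the first point at or after its hash, so removing one shard
// only moves the clusters that hashed to that shard's points.
package main

import (
    "fmt"
    "hash/fnv"
    "sort"
    "strconv"
)

type ring struct {
    points []uint32          // sorted hash points on the ring
    owner  map[uint32]string // hash point -> shard name
}

func hash32(s string) uint32 {
    h := fnv.New32a()
    h.Write([]byte(s))
    return h.Sum32()
}

func newRing(shards []string, virtualNodes int) *ring {
    r := &ring{owner: map[uint32]string{}}
    for _, s := range shards {
        for v := 0; v < virtualNodes; v++ {
            p := hash32(s + "#" + strconv.Itoa(v))
            r.points = append(r.points, p)
            r.owner[p] = s
        }
    }
    sort.Slice(r.points, func(i, j int) bool { return r.points[i] < r.points[j] })
    return r
}

// lookup returns the shard owning the first ring point at or after the cluster's hash.
func (r *ring) lookup(cluster string) string {
    h := hash32(cluster)
    i := sort.Search(len(r.points), func(i int) bool { return r.points[i] >= h })
    if i == len(r.points) {
        i = 0 // wrap around the ring
    }
    return r.owner[r.points[i]]
}

func main() {
    r := newRing([]string{"shard-0", "shard-1", "shard-2"}, 100)
    for _, c := range []string{"cluster-a", "cluster-b", "cluster-c"} {
        fmt.Println(c, "->", r.lookup(c))
    }
}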

Consistent Hash with Bounded Loads

The ring hash algorithm was never designed to allow dynamically updating the weights based on load. While we were able to utilize it for this purpose, we also looked at another algorithm called Consistent Hashing with Bounded Loads (Reference), which aims to solve the problem of consistent hashing with load uniformity. By utilizing this new algorithm, we were able to significantly decrease the redistribution of cluster-to-shard assignments. When we changed the number of shards from 10 to 9, we observed only 15 cluster-to-shard assignment changes, excluding the changes associated with the removed shard.

alt_text

The trade-off is slightly worse cluster/app balancing than the weighted ring hash, with CPU variability increasing from 0.12 to 0.17.

Variability:

CPU: 0.17
Memory: 131 MiB

There is no direct recommendation about which algorithm you should utilize, as each of them has its pros and cons. You should evaluate each for your environment, depending on whether you are looking for strict balancing of clusters/apps across the shards or you want to minimize the impact of making frequent changes to your Argo CD environment.

Conclusion

In this blog post, we continued our scalability tests of the Argo CD app controller by answering some questions we had from our first scalability tests about the common scalability parameters. We showed how QPS/Burst QPS affects the k8s api server, determined why status/operation processors did not affect our previous scalability tests, and how those parameters are linked together. We then continued our scalability tests by pushing the Argo CD app controller to 500 clusters and 50,000 apps. We ended our tests by showing that a key component of scaling the Argo CD app controller is how it performs sharding. By doing a deep dive into how the app controller performs sharding we also determined some ways to improve sharding by adding in and evaluating new sharding algorithms. We are currently evaluating how to contribute these changes back to Argo CD. Stay tuned for those contributions and reach out on the CNCF #argo-sig-scalability or the #cnoe-interest Slack channel to get help optimizing for your use-cases and scenarios.

diff --git a/blog/tags/workflows.html b/blog/tags/workflows.html
index 7e83f5b0..44a3e854 100644
--- a/blog/tags/workflows.html
+++ b/blog/tags/workflows.html

One post tagged with "workflows"

View All Tags

· 18 min read
Andrew Lee
Vikram Sethi

Introduction

In our earlier blog posts, we have discussed scalability tests for Argo CD, where in two consecutive experiments, we pushed the limits of Argo CD to deploy 10,000 applications on ~100 clusters and then 50,000 applications on 500 clusters along with configuration and fine-tuning required to make Argo CD scale effectively. Argo CD deployments, however, do not happen in isolation, and similar to a CNOE stack, Argo CD is often deployed on a cluster along with other tooling which collectively contribute to the performance and scalability bottlenecks we see users run into.

Argo Workflows is one common tool we often see users deploy alongside Argo CD to enable workflow executions (e.g. building images, running tests, cutting releases, etc). Our early experiments with Argo Workflows revealed that, if not tuned properly, it can negatively impact the scalability of a given Kubernetes cluster, particularly if the Kubernetes cluster happens to be the control cluster managing developer workflows across a large group of users. A real world example of some of the scaling challenges you can encounter with Argo Workflows is explored in our recent ArgoCon talk: Key Takeaways from Scaling Adobe's CI/CD Solution to Support 50K Argo CD Apps.

For us to better understand the limitations and tuning requirements for Argo Workflows, in this blog post we publish details on the scalability experiments we ran for Argo Workflows executing Workflows in two different load patterns: increasing rate up to 2100 workflows/min and queued reconciliation of 5000 workflows on an Amazon EKS cluster with 50x m5.large nodes. We show the correlation between the various Argo Workflow's knobs and controls and the processing time as well as performance improvements you can get by determining how you supply the workflows to the control plane.

Test Parameters

Test Workflow

The test workflow is based on the lightweight whalesay container from docker which prints out some text and ASCII art to the terminal. The reason we chose a lightweight container is that we wanted to stress the Argo Workflows controller in managing the Workflow lifecycle (pod creation, scheduling, and cleanup) and minimize the extra overhead on the Kubernetes control plane in dealing with the data plane workloads. An example of the Workflow is below:

// Workflow definition used for the tests, expressed with the Argo Workflows Go types.
// Imports shown for completeness:
//
//   corev1 "k8s.io/api/core/v1"
//   metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
//   wfv1   "github.com/argoproj/argo-workflows/v3/pkg/apis/workflow/v1alpha1"
var helloWorldWorkflow = wfv1.Workflow{
    ObjectMeta: metav1.ObjectMeta{
        // GenerateName gives every submitted Workflow a unique name.
        GenerateName: "hello-world-",
    },
    Spec: wfv1.WorkflowSpec{
        Entrypoint:         "whalesay",
        ServiceAccountName: "argo",
        Templates: []wfv1.Template{
            {
                Name: "whalesay",
                Container: &corev1.Container{
                    Image:   "docker/whalesay:latest",
                    Command: []string{"cowsay", "hello world"},
                },
            },
        },
        // Delete the pod as soon as it succeeds so completed pods do not accumulate.
        PodGC: &wfv1.PodGC{
            Strategy: "OnPodSuccess",
        },
    },
}
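
For context on how such a Workflow reaches the cluster, here is a minimal sketch that uses the Argo Workflows Go clientset to submit the definition above. It is our illustration rather than the exact harness we used; the kubeconfig path is an assumption, the argoworkflows1 namespace is borrowed from the controller logs shown later, and helloWorldWorkflow is the variable defined above (same package).

package main

import (
    "context"
    "fmt"

    wfclientset "github.com/argoproj/argo-workflows/v3/pkg/client/clientset/versioned"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/client-go/tools/clientcmd"
)

func main() {
    // Build client configuration from a local kubeconfig (path is an assumption).
    config, err := clientcmd.BuildConfigFromFlags("", "/home/user/.kube/config")
    if err != nil {
        panic(err)
    }

    // Create the Argo Workflows clientset and submit the Workflow into the
    // namespace watched by the controller under test.
    wfClient := wfclientset.NewForConfigOrDie(config).ArgoprojV1alpha1().Workflows("argoworkflows1")
    created, err := wfClient.Create(context.Background(), &helloWorldWorkflow, metav1.CreateOptions{})
    if err != nil {
        panic(err)
    }
    fmt.Println("submitted:", created.Name)
}

Because the Workflow uses GenerateName, every Create call yields a uniquely named Workflow, which is what lets a load generator reuse the same definition for thousands of submissions.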

Argo Workflows Settings

We will be detailing how each of these settings affects Argo Workflows in various experiments later in this blog post.

  • Controller workers: The Argo Workflows controller utilizes different workers for various operations in a Workflow lifecycle. We will be looking at two types of workers for our scalability testing.

    • workflow-workers (default: 32): These workers are threads in a single Argo Workflows controller that reconcile Argo Workflow Custom Resources (CRs). When a Workflow is created, a workflow-worker will handle the end-to-end operations of the Workflow from ensuring the pod is scheduled to ensuring the pod has finished. The number of workers can be specified by passing the --workflow-workers flag to the controller.

    • pod-cleanup-workers (default: 4): These workers clean up the pods of finished Workflows. When a Workflow has finished executing, depending on your clean-up settings, a pod-cleanup-worker will handle removing the Workflow's pod. The number of workers can be specified by passing the --pod-cleanup-workers flag to the controller.

  • Client queries per second (QPS)/Burst QPS settings (default: 20/30): These settings control when the Argo Workflows controller's Kubernetes (K8s) client starts to throttle requests to the K8s API server. The client QPS setting limits the sustained QPS of the K8s client, while burst QPS allows a request rate in excess of the client QPS for a short period of time. The client QPS/Burst QPS can be set by passing the --qps and --burst flags to the controller (see the client-go sketch after this list).

  • Sharding: Sharding with multiple Argo Workflows controllers is possible by running each controller in its own namespace. Each controller then only reconciles Workflows submitted in that particular namespace. Namespaced mode is enabled by passing the --namespaced flag to each controller.
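
To make the client-side throttling behavior concrete, here is a minimal client-go sketch showing where QPS and Burst live on a Kubernetes client configuration. It illustrates the general client-go mechanism rather than the controller's actual startup code; the function name and the 50/60 values are ours, chosen only to mirror the flag values evaluated later in this post.

package main

import (
    "k8s.io/client-go/kubernetes"
    "k8s.io/client-go/rest"
)

// newThrottledClient returns a clientset whose requests are rate limited on the
// client side: sustained traffic above QPS is delayed, and Burst allows short
// spikes above that rate. The Argo Workflows controller applies the same limits
// to its own K8s client via the --qps and --burst flags.
func newThrottledClient(config *rest.Config) (*kubernetes.Clientset, error) {
    config.QPS = 50   // sustained requests per second before throttling kicks in
    config.Burst = 60 // short-lived burst allowance above the sustained rate
    return kubernetes.NewForConfig(config)
}

When these limits are too low for the load, requests queue up inside the client, which is exactly the kind of "client-side throttling" message that shows up in the controller logs in Experiment 2.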

Key Metrics

We chose a set of key metrics for the scalability testing because we wanted to measure how many workflows the Argo Workflows controller can reconcile and process. We will also be looking into K8s control plane metrics which might indicate your control plane cannot keep up with the Argo Workflows workload. 

  • Workqueue depth: The workqueue depth shows workflows which have not been reconciled. If the depth starts to increase, it indicates that the Argo Workflows controller is unable to handle the submission rate of Workflows.

  • Workqueue latency: The workqueue latency is the average time workflows spent waiting in the workqueue. A lower value indicates that the Argo Workflows controller is processing workflows faster so that they are not waiting in the workqueue.

  • K8S api server requests per second: The read and write requests per second being made to the K8S api server.

We didn't include CPU/memory as key metrics because we did not see any significant impact on either during our testing, most likely because of the simplistic workflows used for this benchmark.

Environment

We ran the experiments in an AWS environment utilizing a single Amazon EKS cluster. The Kubernetes version is 1.27 and the Argo Workflows version is 3.5.4. No resource quotas were applied to the Argo Workflows controller. For the cluster, we provisioned 1x m5.8xlarge Amazon Elastic Compute Cloud (Amazon EC2) instance to run the Argo Workflows controller and 50x m5.large instances for executing workflows. The number of execution instances is sufficient to run all 5000 workflows in parallel, ensuring that pods are not waiting on resources to execute. Monitoring and metrics for Argo Workflows were provided by Prometheus/Grafana.

Methodology

There will be two types of load patterns evaluated:

Increasing Rate Test: Workflows will be submitted at an increasing rate (workflows/min) until the Argo Workflows controller cannot keep up, which we define as a workqueue depth greater than zero or an increasing workqueue latency. The last rate at which the controller keeps up is noted as the maximum rate at which Workflows can be processed with the current settings.

Queued Reconciliation Test: 5000 workflows are submitted in less than a minute. Metrics will be monitored from when the Argo Workflows controller starts processing workflows to when it has reconciled all 5000 workflows. The number of nodes is sufficient to run all the workflows simultaneously.
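
To illustrate the two load patterns, here is a simplified Go sketch of a load generator built on the Argo Workflows clientset. It is a sketch of the approach rather than the exact harness we used; the function names are ours, and it assumes the helloWorldWorkflow definition from the Test Workflow section is available in the same package and that perMinute is greater than zero.

package loadgen

import (
    "context"
    "time"

    wfclientset "github.com/argoproj/argo-workflows/v3/pkg/client/clientset/versioned"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// submitAtRate drives the increasing-rate pattern: workflows are submitted at a
// steady rate (workflows per minute) for the given duration; the rate is stepped
// up between runs until workqueue depth or latency starts to grow.
func submitAtRate(ctx context.Context, client wfclientset.Interface, namespace string, perMinute int, d time.Duration) error {
    wfs := client.ArgoprojV1alpha1().Workflows(namespace)
    ticker := time.NewTicker(time.Minute / time.Duration(perMinute))
    defer ticker.Stop()
    deadline := time.After(d)
    for {
        select {
        case <-ticker.C:
            // GenerateName on the Workflow gives each submission a unique name.
            if _, err := wfs.Create(ctx, helloWorldWorkflow.DeepCopy(), metav1.CreateOptions{}); err != nil {
                return err
            }
        case <-deadline:
            return nil
        case <-ctx.Done():
            return ctx.Err()
        }
    }
}

// submitAll drives the queued-reconciliation pattern: all workflows are created
// up front and the controller then works the queue down.
func submitAll(ctx context.Context, client wfclientset.Interface, namespace string, n int) error {
    wfs := client.ArgoprojV1alpha1().Workflows(namespace)
    for i := 0; i < n; i++ {
        if _, err := wfs.Create(ctx, helloWorldWorkflow.DeepCopy(), metav1.CreateOptions{}); err != nil {
            return err
        }
    }
    return nil
}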

Experiments

Experiment 1: Baseline

In our baseline experiment, we are running in a single Argo Workflows shard (namespace) with default settings.

Increasing Rate Test:

As you can see below, the Argo Workflows controller can process up to 270 workflows/min. The average workqueue latency and workqueue depth are nearly zero. At 300 workflows/min, workqueue latency and workqueue depth start to increase.


Queued Reconciliation Test:

It took around 17 mins to reconcile 5000 workflows, and the peak average workqueue latency was 5.38 minutes.


Experiment 2: Workflow Workers

For this experiment, we increase the number of workflow workers from the default of 32 up to 128, where the workers use the maximum QPS and Burst available to them. We also had to increase the number of pod-cleanup-workers to 32, as the Argo Workflows controller was experiencing some instability: the controller pod was consistently crashing with the default value of 4.

Increasing Rate Test:

For the increasing workflow rate test, we can see exactly when the number of workflow workers is not sufficient to process the load. Both workqueue latency and depth start to increase indicating that workflows are waiting to be reconciled. When we increase the number of workers, the controller is able to reconcile the current load until an additional load is placed on it. For 32 workers, that limit is 300 workflows/min. When we increase the number of workers to 64, it is able to process that load until load is increased to 330 workflows/min. Then we increase the number of workers to 96 and it can process the additional load again. When we increase to 360 workflows/min, we need to bump the number of workers to 128.

Workers | Max workflows/minute
32      | 270
64      | 300
96      | 330
128     | 360


For the K8S api server, we see sustained 180 writes/sec and 70 reads/sec during the increasing rate tests.


Queued Reconciliation Test:

For the queued reconciliation test, the time it took to reconcile all the workflows did not change significantly. With 32 workers it took 17 mins to reconcile while with 96 workers it took 16 mins. The peak workqueue latency did decrease from 5.38 mins with 32 workers to 3.19 mins with 96 workers. With 128 workers, the Argo Workflows controller kept crashing.

Workers | Peak avg latency (mins) | Reconcile time (mins)
32      | 5.38                    | 17
64      | 5.06                    | 18
96      | 3.19                    | 16
128     | N/A                     | N/A


For the K8s API server, we see peaks of up to 260 writes/sec and 90 reads/sec during the queued reconciliation tests. Notice that for the last test there is no K8s API server activity, as the Argo Workflows controller was misbehaving due to client-side throttling.


Observations from Experiment 2:

Workers play a big part in how fast the Argo Workflows controller is able to reconcile the workflows being submitted. If you are observing increasing workqueue latency and a growing workqueue depth, changing the number of workers is a potential way to improve performance. There are a few observations that we want to call out. One is that if we compare the two different patterns, one where we submit workflows at a constant rate and one in which we load up the workqueue all at once, we can see variations in calculated throughput. We can calculate the time it takes to reconcile 5000 workflows using the increasing rate test results (for example, with 32 workers the maximum sustainable rate is 270 workflows/min, so 5000 / 270 ≈ 18.5 mins) and compare it to the queued reconciliation test.

Workers | Increasing rate test time to reconcile 5000 workflows (mins) | Reconcile time of 5000 workflows queued all at once (mins)
32      | 18.5                                                          | 17
64      | 16.6                                                          | 18
96      | 15.1                                                          | 16
128     | 13.8                                                          | N/A

We do get some conflicting results when we make this comparison. With 32 and 64 workers, the increasing rate test is actually slower than the queued reconciliation test. But if we increase to 96 workers, we can see that the increasing rate test results are faster. We were unable to compare with 128 workers as the Argo Workflows controller crashed when trying to run the queued reconciliation test. When investigating the cause of the crash, the logs have several messages like the following:

Waited for 6.185558715s due to client-side throttling, not priority and fairness, request: DELETE:https://10.100.0.1:443/api/v1/namespaces/argoworkflows1/pods/hello-world-57cfda8a-dc8b-4854-83a0-05785fb25e4b-3gwthk

These messages indicate that we should increase the Client QPS settings which we will evaluate in the next experiment.

Experiment 3: Client QPS Settings

For this experiment, we set the number of workflow workers back to the default of 32. We will then increase the QPS/Burst by increments of 10/10, from 20/30 to 50/60. We chose to only increase by 10/10 because any large increase past 50/60 did not yield any performance improvements. We believe that this is partly because we kept the workers at 32.

Initial Testing

Increasing Rate Test:

The QPS/Burst settings had a significant impact on the increasing rate test. By increasing the QPS/Burst from 20/30 to 30/40, we see ~50% improvement in max workflows/min from 270 to 420. When we increase the QPS/Burst from 30/40 to 40/50, we see another 28% improvement in max workflows/min from 420 to 540. When increasing from 40/50 to 50/60 there was only an additional 5% improvement. For 32 workers, increasing past 50/60 did not yield any significant improvements to the max workflows/min.

QPS/Burst | Max workflows/minute
20/30     | 270
30/40     | 420
40/50     | 540
50/60     | 570


When changing QPS/Burst, we also need to monitor the K8s API server. Looking at the K8s API server req/s, we see sustained 390 writes/sec and 85 reads/sec.


Queued Reconciliation Test:

Again, the QPS/Burst settings make a big difference in the queued reconciliation test when compared to just changing the workflow workers. Starting from the default settings of 20/30, we see decreasing reconcile times from 19 mins to 12 mins to 8 mins and finally to 6 mins when setting the QPS/Burst to 50/60. The peak average latency also decreased from 4.79 mins to 1.94 mins. We did note that there was a higher peak avg latency with 30/40 vs 20/30 but if you examine the graph you can see a steeper drop in latency accounting for the shorter reconcile time. Similar to the increasing rate test, increasing the QPS/Burst further did not yield any improvements.

QPS/Burst | Peak avg latency (mins) | Reconcile time (mins)
20/30     | 4.79                    | 19
30/40     | 5.66                    | 12
40/50     | 2.98                    | 8
50/60     | 1.94                    | 6


When looking at the K8S API server, we see peaks of up to 700 writes/sec and 200 reads/sec during the tests.


When compared to the workflow worker testing, you can see that increasing the QPS/Burst pushes the K8s API server harder and improves Argo Workflows' overall performance. We do see diminishing returns when increasing QPS/Burst past 50/60, even though the K8s API server appears to have plenty of capacity for additional load. For the next test, we will increase both the workflow workers and the QPS/Burst to see how far we can push Argo Workflows and the K8s API server.

Max Load Test

Increasing Rate Test:

Starting with 128 workers and a QPS/Burst of 60/70, we were able to push Argo Workflows to 810 workflows/min. Past that point, there were no improvements with more workers or increased QPS/Burst limits.


We can see increased K8s API server activity, with sustained 700 writes/sec and 160 reads/sec.


Queued Reconciliation Test:

We increased the number of workers to 128 and the QPS/Burst to 60/70 and observed a peak average latency of 54 secs and a reconciliation time of 5 mins. Increasing either the workers or the QPS/Burst did not improve these numbers.


Looking at the K8s API server, we saw peaks of 800 writes/sec and 190 reads/sec.


Observations from Experiment 3

One observation we made in the previous experiment with workflow workers is that the two different patterns of submitting workflows can be compared. We made that comparison again with the QPS/Burst tests and saw the following results:

QPS/Burst | Workers | Increasing rate test time to reconcile 5000 workflows (mins) | Reconcile time of 5000 workflows queued all at once (mins)
20/30     | 32      | 18.5                                                          | 19
30/40     | 32      | 11.9                                                          | 12
50/60     | 32      | 9.2                                                           | 8
60/70     | 32      | 8.7                                                           | 6
70/80     | 128     | 6.1                                                           | 5

When we combine the comparison from Experiment 2 with the data above, we can see a slight improvement in submitting all workflows together vs staggering them. We are not sure why this is the case and more experiments are required to understand this behavior.

It seems that we have hit a wall with 128 workers and a QPS/burst of 60/70 for a single Argo Workflows Controller. We will now evaluate Sharding and see if we can improve our performance from this point.

Experiment 4: Sharding

For this experiment, we will evaluate 1 shard, 2 shards, and 5 shards of the Argo Workflows controller with the default settings. We will then try for a maximum load test utilizing workflow workers, QPS/burst, and sharding to see the maximum performance on our current infrastructure.

Initial Testing

Increasing Rate Test:

Sharding the Argo Workflows controller has a linear impact on performance with the increasing rate test. By increasing the number of shards from 1 to 2, we see a 100% improvement in max workflows/min from 270 to 540. When we increase the shards from 2 to 5, we see an additional 150% improvement in max workflows/min from 540 to 1350.

Shards | Max workflows/min
1      | 270
2      | 540
5      | 1350

One thing to note is that the submission rate is increased in steps of 30 workflows/min per shard. This means the step between two rates is 2 shards × 30 = 60 workflows/min with 2 shards and 5 shards × 30 = 150 workflows/min with 5 shards. That is why, for 2 shards, when the controllers could no longer keep up at 600 workflows/min, we step down one rate to 600 - 60 = 540 workflows/min.


You can see a significant impact on the K8s API server with sustained 1400 writes/sec and 300 reads/sec.


Queued Reconciliation Test:

As shown in the Increasing Rate Test, sharding has a huge impact on performance for the queued reconciliation test. With 1 shard it takes 18 mins to reconcile 5000 workflows, while with 2 shards it takes 9 mins. With 5 shards the reconcile time is further reduced to 4 mins.

Shards | Peak avg latency (mins) | Reconcile time (mins)
1      | 5.43                    | 18
2      | 3.81                    | 9
5      | 1.42                    | 4


The impact on the K8s API server was not as significant when compared to previous experiments.

Max Load Test

Increasing Rate Test:

When increasing the workflow workers to 128, QPS/burst to 60/70 and shards to 5, the Argo Workflows controller is able to process up to 2100 workflows/min. Any higher than this seems to run into K8s API Priority and Fairness (APF) limits.


When looking at the K8s API server, we are seeing significant impact with peaks of 1500 writes/sec and 350 reads/sec.


When investigating why we are unable to push higher on the K8s API server, we see that APF limits are coming into effect by looking at the apiserver_flowcontrol_current_inqueue_requests. This metric shows the number of requests waiting in the APF flowcontrol queue.


Queued Reconciliation Test:

With the max load settings, we observed that the peak workqueue latency is only 20 seconds and the reconcile time is 2 minutes.


The impact on the K8s API server is actually less than in the previous max load queued reconciliation tests.


Observations from Experiment 4

As we did in previous experiments, we again make the comparison between the two different load patterns:

Shards              | Increasing rate test time to reconcile 5000 workflows (mins) | Reconcile time of 5000 workflows queued all at once (mins)
1                   | 18.5                                                          | 18
2                   | 9.2                                                           | 9
5                   | 3.7                                                           | 4
Max load (5 shards) | 2.3                                                           | 2

In general, it appears that submitting all workflows at once performs slightly better than submitting workflows at a steady rate. More experiments will need to be done to further investigate this behavior.

Conclusion

In this blog post we discussed our initial efforts in documenting and understanding the scaling characteristics of the Argo Workflows controller. Our findings show that the existing mechanisms for increasing workflow workers, increasing client and burst QPS settings and sharding the controller can help Argo Workflows scale better. Another interesting observation is that we saw differences in performance with how you submit your workflows. For the next set of experiments, we plan to evaluate more environmental variables and different types of workflows: multi-step and/or long running. Stay tuned for the report on our next round of experiments and reach out on the CNCF #argo-sig-scalability Slack channel to get help optimizing for your use-cases and scenarios.


CNOE - A Joint Effort to Share Internal Developer Platform Tools and Best Practices.

· 5 min read
Nima Kaviani

Adobe, Amazon Web Services, Autodesk, Salesforce, and Twilio have come together to launch an open source initiative for building internal developer platforms (IDPs). Cloud Native Operational Excellence (aka, CNOE, pronounced Kuh.no) is a joint effort to share developer tooling, thoughts, and patterns to help organizations make informed technology choices and resolve common pain points. CNOE will enable organizations to navigate tooling sprawl and technology churn by coordinating contributions, offering tools, and providing neutral and unbiased guidance on technology choices to deliver internal developer platforms.

Developer productivity is increasingly important for organizations to compete in today’s fast-paced marketplace. To increase productivity, many organizations are taking a platform engineering approach to build internal developer platforms that abstract away complexity and enable faster, more secure software delivery. These internal developer platforms are long-term strategic investments, and the choice of open source technologies and architectures used to build these platforms can greatly impact their long-term success and viability.

CNOE is a community for organizations passionate about evolving experiences in developer productivity and efficiency. Contributors to this community are sharing their open source developer platform tooling choices to bring awareness to the best practices that have helped their respective teams. With such awareness comes alignment and the ability to de-risk their technology choices over the long term.

The CNOE community will navigate their operational technology decisions together, coordinate contributions, and offer guidance on which Cloud Native Computing Foundation (CNCF) technologies to use to achieve cloud efficiencies. CNOE will aim to:

  • Create an open source first strategy for internal developer platform capabilities, prioritizing CNCF technologies.
  • Build community alignment on technology choices and best practices.
  • Elevate tools and practices that can benefit a wide range of organizations building their own internal developer platforms.
  • Build for the infrastructure and customize to developer needs, making the solutions and patterns flexible for adoption.
  • Provide artifacts about tools, patterns, and practices to be easily consumable by the community.

“The work of building secure, reliable, compliant, and regionalized software is becoming more and more complicated. Development teams need the right separation of concerns to build efficiently and move fast. Internal developer platforms enable just that. They abstract away complexity so a team can focus fully on their key goals. I’m excited to see the CNOE community share experiences, expand ideas beyond a single company’s viewpoint, and de-risk our technology strategies to build better together.” - Ben Cochran, VP Developer Enablement at Autodesk

"As a technology company, CNOE is an extension of our DNA, and open source is key to our platform. CNOE fosters collaboration within the industry, minimizes duplicated work, and emphasizes unique products. I'm eager to see our contributions to CNOE and others benefiting from it." - Chris Lyon, VP of Engineering Segment at Twilio.

"Open source software is a core component that many organizations leverage to power their internal developer platforms. Organizations often anchor on specific capabilities to power their developer platforms like Continuous Integration/Continuous Delivery, Infrastructure as Code, Service Mesh, Policy controls, Artifact management, and developer portals. As a result, they have been seeking a forum to share best practices and to share their findings on the tooling choices they have been using. I’m incredibly excited to see AWS contribute to CNOE and CNOE be the vehicle that creates industry alignment based on the intrinsic gravity of the tooling choices being made at scale.” - said Paul Roberts, Sr. Principal Solutions Architect at AWS.

“Adobe believes in the transformative power of open source software. We are excited to be a founding member of CNOE and to partner with other industry thought leaders to define and share our vision of a cloud native stack for rapidly building Internal Developer Platforms.” - Dave Weinstein, VP of Engineering at Adobe.

“Salesforce is deeply engaged in the Open Source community, which was integral in building Hyperforce, a reimagination of our trusted platform architecture for the public cloud. Salesforce is honored to serve as a launch partner for CNOE, further advancing the adoption of open source technologies and assuring companies of sound technology decisions and sustained support for years to come.” - Josh Meier, Hyperforce Lead Architect

With the launch of CNOE, members will contribute tooling, plugins, and reference implementations that facilitate building internal developer platforms. Members are also releasing a capability map that captures key open technologies and their relevance in building internal developer platforms across these organizations.

As we move forward, each member organization will continue to share their approach on adopting and composing the tooling and technologies recommended by the CNOE working group to deliver on their IDPs.

CNOE invites more companies to join us. To learn more about CNOE, visit https://cnoe.io, where we share extended details about patterns and practices we are developing. Explore options to get involved and contact us via the CNCF slack channel #cnoe-public.

Special thanks to the many people who helped with the launch, Andrew Lee, Omar Kahil, Ben Fields, Bryan Landes, Vikram Venkataraman, Rick Sostheim, Manabu McCloskey, Praseeda Sathaye, and Vara Bonthu from AWS, Rob Hilton (formerly AWS, now Google), Jesse Sanford, Greg Haynes, Mani Kandadai Venkatesh, Sara Mesing, and Brandon Leach from Autodesk, Jesse Adametz and Wes Medford from Twilio, Rohan Kapoor and Vikram Sethi from Adobe.

Member Announcements


Technology Capabilities


Cloud Native Operational Excellence (CNOE)

(pronounced Kuh.noo)

What is CNOE?

Enterprises that adopt OSS as the foundation of their cloud platforms face the challenge of choosing technologies that will support their business outcomes for 3-5 years. The cost of retooling and re-platforming for large organizations is high, which makes bets on specific technologies fundamental to their technology strategies. In order to de-risk these bets, enterprises take into consideration the investments of their peer organizations. The goal for the CNOE framework is to bring together a cohort of enterprises operating at the same scale so that they can navigate their operational technology decisions together, de-risk their tooling bets, coordinate contribution, and offer guidance to large enterprises on which CNCF technologies to use together to achieve the best cloud efficiencies.

Problem Statement

As software development processes have evolved, infrastructure and related tooling have become more and more fragmented and complex. Standalone tools (e.g., Spinnaker, Jenkins, CloudFormation) for operating in cloud-native environments are no longer effective for most customers on their own. While some of this is the nature of technology evolution and growth, a number of root causes are contributing to customers augmenting or replacing their existing tooling to address their larger-scale challenges.

  1. The current tooling “standard” is a moving target. The size and scope of the CNCF landscape, alongside the ever-increasing breadth of tools, create a paralysis of choice. Customers are forced to adopt a wide-ranging array of tools with minimal direction and implement them in environment-specific ways, leading to a lack of consensus between customers. This also contributes to significant supportability and maintainability problems within the communities that govern those tools.

  2. The definitions of traditional continuous integration and continuous delivery/deployment (CI/CD) have become blurry. Legacy systems have grown to be bloated, and contributions have fallen behind in favor of lighter, more modern tools (e.g., ArgoCD, Flux, Tekton) that focus on newer paradigms like GitOps.

  3. The advent and growth of declarative, centralized control planes for infrastructure (e.g. Kubernetes) creates an ecosystem that is fundamentally different from, and arguably not interoperable with, previous-generation tooling. However, many application environments still expect (and will continue to expect) to interface with “traditional” virtual machine or bare metal based infrastructure endpoints.

  4. Developers’ core workflows have remained more-or-less the same over the past years, focused more on understanding the intricacies of a language or framework and implementing them using appropriate versioning and collaborative tooling. Many of the abstractions in *-Ops today were designed to accommodate differences in infrastructure, leading to a discontinuity between developers and delivery.

  5. Fragmentation between software delivery and deployment methods has led to a multitude of languages, infrastructure as code platforms, templating engines, specs, and packaging systems. This creates an endless combination of non-portable software components that are difficult to reconcile into a singular application.

Tenets

The CNOE working group will operate based on the following tenets.

  1. Open source first: Use of open source technology is prioritized over proprietary technology for each of the technology verticals discussed later in the doc. This helps ensure alignment across all the participating members by allowing them to coordinate on collaborations while having the freedom to update and modify a given technology to their needs.

  2. Community driven: Decisions on the direction of the working group are driven by the community and its governing body. This involves the selection of technologies, level of commitment, and level of contribution.

  3. Tools and not Practices: CNOE offers suggestions on which tools to use and with what configurations. What practices a given company builds around and above those tools is out of scope for CNOE.

  4. Powered by Kubernetes, but not limited to orchestrating Kubernetes: The CNOE working group relies heavily on the success of the CNCF community to choose technologies that are deemed useful to the type of operations required by the community. As such, Kubernetes is considered the de facto environment to operate CNOE tooling. However, choosing Kubernetes as the operating environment does not require it to be the environment to orchestrate against. Using the right infrastructure as code tooling, the CNOE community can choose to orchestrate against any compute platform of their choice.

  5. Standardized to the infrastructure, customizable by the developers: CNOE aims at addressing the usability requirements of its stakeholders. While the requirements of the platform could be enforced by the security engineers and infrastructure operators, its usability needs to be guaranteed by platform operators and application developers.

  6. Built to be shared: All CNOE deliverables, including the reference architecture and the deployment packages, will be developed out in the open through collaboration among all participating members, with the goal of making them shareable and usable by the larger open source community of interest.

What CNOE is not

  1. not only a unified control plane, but building blocks for adopters to expand and extend the unified control plane

  2. not only a CI/CD tool but other components and capabilities that extend and enhance the integration and delivery of applications

  3. not new technologies or set of managed services, but a way to interact and integrate. There is still an expectation that companies will need to fund and operate the various open source tools used within the IDP

  4. not installers or proprietary packaging mechanisms; it will be fully open source, customizable, and available for anyone to use

  5. not responsible for operationalizing the toolchain. There is still an expectation that companies will need to fund and operate the various open source tools used within the IDP


Approach

CNOE takes a multitudinal and communal approach toward solving problems faced by DevOps teams. In order to address selection challenges within the fragmented and complex ecosystem of CNCF DevOps tooling, CNOE seeks community consensus on the categorical subdivision of delivery needs based on the size and scale of its users. This involves defining categories of tools deemed necessary for a successful DevOps strategy, as seen by the cohorts of users and based on the size of the company, the nature of the operation, and the type of workload. CNOE then endorses a set of tools in each category that when configured together, can deliver the top-of-the-line DevOps experience.

Pluggability and Extensibility

Splitting a DevOps delivery strategy into subcategories with logical boundaries requires CNOE to allow pluggability and extensibility for tools within each category. This means that CNOE needs to ensure and facilitate integration of tools from one category with tools from another category as part of its delivery pipeline. As a concrete example, assuming that users have the option to choose between Tekton or Argo Workflows for their CI and Weaveworks Flux or Argo CD for their CD, any combination of tools from the two categories should effectively work within the context of CNOE. This helps reduce fragmentation while providing options for adoption. On the other hand, a list of CNOE-endorsed tools that fit the defined logical DevOps boundaries is aimed at better right-tooling of the delivery pipelines. This in turn reduces the complexity of selecting the right tools for the job and enables CNOE users to get a compliant delivery pipeline up and running as quickly as possible.

Powered by but not limited to Kubernetes

As discussed earlier, CNOE aims at simplifying the selection, integration, and operation of DevOps tools available within the CNCF ecosystem. A question that may arise is whether CNOE assumes a strong dependency on Kubernetes. Our take is that, while modern CNCF tools require Kubernetes to run on, they do not have to orchestrate resources and deployments against Kubernetes. This means that while users of CNOE assume a dependency on Kubernetes for the operation of the CNOE tool set, their workload does not need to be tied to Kubernetes. Within this context, using CNOE to deploy to discrete cloud platforms such as AWS Elastic Container Service (ECS) or GCP Cloud Functions is totally fair game.

Building Patterns and Tooling

For a seamless transition into a CNOE-compliant delivery pipeline, CNOE will aim at delivering "packaging specifications", "templating mechanisms", as well as "deployer technologies", an example of which is enabled via the idpBuilder tool we have released. The combination of templates, specifications, and deployers allows for bundling and then unpacking of CNOE recommended tools into a user's DevOps environment. This enables teams to share and deliver components that are deemed to be the best tools for the job.

Modernizing a delivery pipeline according to CNOE guidelines then becomes the practice of devising a migration plan from the old set of tools used by an organization into the new set of tools endorsed by CNOE. This is another area where a community approach to endorsing, adhering to, and executing on CNOE-compliant delivery pipelines will be critical. For it to succeed, CNOE relies on commitments and contributions from its community members to develop and contribute migration plans and tools that empower transitioning from the legacy environments to the new environments.


Artifact Registries

The artifact registry allows for the packaged components endorsed by the CNOE community to be signed, accessible, and traceable for its users. By storing the list of components in an OCI registry or Git repository, the CNOE packaging framework will be able to deal with versioned and compatible artifacts that have already been tested and verified in working together. This also allows the combination of the registry and the packaging mechanism to undergo secure software supply chain (SSSC) best practices to further increase the level of confidence in leveraging these tools by the CNOE users.

  • Canonical location for durable long term artifact storage.
  • Catalog + metadata about artifacts. Used for discovery of artifacts.
  • Can be used in conjunction with Role Based Access Control (RBAC) to limit access to artifacts.
  • Should be versioned and is often immutable
  • Often used with static analysis tools to verify artifacts are free from known vulnerabilities.

Code Repository

Code repositories allow developers to work collaboratively on common codebases, often asynchronously. While git and other source control tooling can allow for decentralized collaboration, we usually choose to centralize the common workflows associated with code review and automation driven by git, aka “git-ops”. When appropriately hardened and durable, pull requests (and associated merges to protected branches) can be used as a “system of record” for change control and approval in regulatory environments.

  • Allows for developers to collaborate on code asynchronously
    • This includes peer reviews and change request approvals
  • Usually centralized even when using decentralized tooling like git for the purposes of building workflows
  • They can also be the mechanism used for peer reviews and change request approvals

Compute Platform

This is the platform runtime. It can also be thought of as a deployment target for the applications that make up the platform. It offers some formalized patterns for interoperability between platform capabilities.

Can offer similar discoverability and uniformity as that of service oriented architectures (SOA). Can also offer a common medium for data exchange between services, like an enterprise service bus (ESB).

Frequently Kubernetes is the compute and also the substrate for the foundation of platform capabilities.

Config Repository

Configuration repositories can be centralized or distributed, but should only have one source of truth for a fully qualified key-value pair. The values should not contain embedded secrets but often contain references to secrets that can be found within secret repositories. The data should be versioned and immutable to allow for point-in-time snapshots for things like rollbacks.

  • Usually key/value or other serialized structured data format
  • Often appended or tagged with meta-data about env specifics
  • Keys can be structured in hierarchical or graph format
  • Values should not contain secrets but can contain references
  • Values should be versioned and immutable
  • Keys should only have one current source of truth for their values

Continuous Delivery (CD)

Continuous delivery's ultimate goal is to get infrastructure and application resources into a state ready to receive production workloads.

GitOps is a new trend in continuous delivery where automation is put in place to ensure the actual state of the world matches the desired state. This is achieved by connecting the source of truth (usually a git repository holding definitions of application and infrastructure resources) to a reconciling controller that keeps the spun-up resources consistent with what is stored in Git. ArgoCD and FluxCD are two prominent implementations of these CD practices. While very similar in nature, ArgoCD, FluxCD, and the like can work in tandem and are not mutually exclusive.

Under the CD category, the CNOE community can help users evaluate which personas (e.g., operator, developer) would be the most likely beneficiaries of each category of tooling. It is also worth noting that while GitOps is the dominant CD strategy in the CNCF space, it does not need to be the only practice adopted by CNOE users. The pluggability aspects of CNOE should ensure that customers have enough freedom in choosing their alternative.

  • Automation to build, test and release software upon every successful merge to a mainline branch
  • Allows for fully automated deployments when “Continuous Deployment” is enabled
  • Facilitates testing that goes beyond simple unit or integration tests. Frequently used in conjunction with end to end (E2E) or functional tests.
  • Can be used in conjunction with safe production deployment methods like Blue/Green or Canary deployments
  • Can also make use of feature flags to allow for “soft” or “dark launches” of features and functionality not yet ready for broad consumption
  • Generally gets code in the hands of consumers faster, surfacing bugs quicker and shortening product feedback loops.

Deployment Targets

These are the runtime environments that product apps and services run on. This includes static content or data published for distribution.

  • Often abstractions hiding the details of underlying environments from the product developers.
    • Regions or localities are a good example of what one might want to mask.
  • Common deployment targets include:
    • Kubernetes
    • Lambda
    • Virtual Machines
    • Elastic Container Service
    • Static Content

Developer Portal

The CNOE cohort will work towards striking a balance on expectations across all its stakeholders. This basically means that the set of tooling put together under CNOE will have to be as useful to application developers as it is to other stakeholders. In order for this to be achieved, a big emphasis is put on offering the right developer productivity tool that serves as an overarching umbrella for including and presenting the underlying tooling in a user-friendly manner. Backstage is a popular open source tool supporting configurability and pluggability that can be utilized to achieve this level of developer productivity.

  • Software catalog of all components, systems and domains.
  • One-stop location to find all about the software we build (docs, source repository, dashboards, support location, owners, etc.)
  • API Documentation
  • Dependencies on other software
  • Documentation system using the docs-as-code approach. Docs are typically in Markdown, and stored in code repositories.
  • Software templates for creating new projects.
  • Onboarding automation for security and trust.

Identity and Access

In the context of a platform, identity and access is most frequently a service that can be used to wire up Authentication and Authorization in a common well understood manner. By offering Identity and Access management as a capability of the platform, we can avoid product applications from having to reinvent the wheel for such critical functionality.

This capability can differ greatly depending on the needs of applications and services that consume it, but generally it will allow for an application to delegate the login, or challenge for proof of identity to the platform. Then the application can utilize the results of that challenge process to use credentials presented to the user by the identity access process to access sensitive information or processes.

The technical aspects of how the Identity and Access service can be consumed by client apps should use rigorously tested standards. Often the Identity and Access service will allow client apps to bring their own sources of identity through a process of federation. This allows client apps to root their identity in their existing systems but still make use of the common Auth service offered by the platform.

Machine identity, and in particular the SPIFFE protocol, is a relatively new method that makes use of the trust built into workloads running in known-good environments as an authentication mechanism. This is considered more secure than the use of long-lived pre-shared secrets like those used by service users or API tokens.

  • Must provide authentication
  • May provide primitives or framework for authorization
  • Must be well understood and easy to reason about
  • Reduces duplication of effort through delegation
  • Can be tested independently and in conjunction with consumer applications
  • Identity can be federated
  • Machine Identity can use modern protocols like SPIFFE
  • Examples of Standard Protocols:
    • OAuth and OpenID Connect
    • SAML
    • Mutual TLS and pre-shared certificates
    • API tokens or Bearer Authentication

Infrastructure as Code (IaC)

Infrastructure as Code or IaC, builds upon the Infrastructure as a Service (IaaS) offerings from cloud providers and modern datacenter automation. It is the APIs and programmatic libraries utilized within software frameworks built specifically for managing the life cycles of cloud infrastructure. It frequently encapsulates the tooling and automation used to spin up infrastructure resources for a given application.

By masking away the inconsistency of underlying cloud provider APIs, IaC offers the ability to build common patterns across a mix of heterogeneous resources. It also allows platform teams to build higher-order resources that meet specific business needs (beyond the low-level APIs of the cloud providers). Furthermore, sane defaults and security and compliance concerns can be injected uniformly and made compulsory.

We are seeing two categories of OSS tools in use at large: those that are occasionally reconciled, like Terraform, Pulumi, and CDK, and continuously reconciled solutions like Crossplane or AWS Controllers for Kubernetes (ACK).

The CNOE cohort will have to decide on the ideal IaC tool that works in tandem with the rest of the delivery components, gains the overall community approval, and becomes the de facto service in use by the CNOE cohort.

  • Cohesive libraries, APIs and patterns for reconciling IaaS provider resources
  • Allows for higher order abstractions to be built
  • Can inject sane defaults and enforce security best practices
  • Can be continuously reconciled when used in conjunction with Kubernetes
  • Common implementations are
    • Terraform
    • CloudFormation
    • Pulumi
    • Crossplane

Observability

The overall well-being of the system is tracked via proper integration with state of the art observability tooling.

Building on the existing set of technologies available in the CNCF ecosystem, CNOE needs to work in tandem with open telemetry data collectors and allow its users to view and analyze collected data using technologies such as Prometheus and Grafana.


Packaging and Templating

Packaging and templating languages and frameworks are required to ensure the delivery of complete and functional sets of tools that target specific capabilities endorsed and usable by the CNOE community. While opinionated, the packages must remain extensible and configurable enough to meet the needs of users, while guiding them toward best practices when combining the tooling in the package. Some candidates for templating and packaging languages include the Open Component Model, the Kubernetes packaging tool (kpt), and the OCI distribution specification.

  • Packaged sets of tools and configuration endorsed and usable by the CNOE community
  • Opinionated and oriented toward producing best practices for the majority of use cases
  • Open and extensible, allowing for configurability of the targeted capabilities.

Secret Management

The life cycle and distribution of secrets must be managed safely and securely. Secrets Management aims to shift this important responsibility to the platform where it can be implemented and audited in one place rather than many.

Secrets Management works in conjunction with secrets repositories to securely source and deliver secrets on demand and just-in-time to applications and services.

  • Can be built with workflow-orchestration but must be treated with great care
  • Should have additional security scrutiny applied beyond other non-secret artifact delivery tooling
  • Often provides the ability to promote secrets between environments, distribute, roll and revoke secrets

Secret Repository

The storage of secrets should be encrypted. They may be encrypted with Hardware Security Modules (HSMs). They may be used in conjunction with other encryption and cryptographic solutions like Public Key Infrastructure (PKI). Secrets repositories may also offer the ability to generate, lease, rotate, and revoke certain types of secrets, like certificates.

  • Secure and durable
  • Usually key value pairs or similar structured data
  • Values must be encrypted
  • Keys and meta-data may not be encrypted
  • Must have canonical source of truth for a fully qualified key
  • May offer ability to generate/lease/rotate/revoke secret values such as certificates
  • Common tooling includes:
    • Hashicorp Vault
    • Cyberark Conjur
    • AWS Secrets Manager
    • Azure Key Vault
    • Google Secret Manager

Service Discovery

Service discovery is a capability that allows for the dynamic lookup or querying of a producer's interface/API details by consumers of that service. Frequently this is based on some sort of centralized key/value store or database called a Service Registry, but it can be distributed, as in the case of the Domain Name System (DNS). When distributed, care must be taken to handle inconsistency in the results of queries to the Service Registry.

Service discovery can be used in conjunction with configuration repositories, and secret repositories to allow for consumers to bootstrap themselves at startup or accept dynamic runtime changes to configuration.

  • Allows for dynamic lookup or querying of service information
  • Usually based on a database or registry
  • Can be decentralized but care must be taken to handle inconsistency
  • Often used in conjunction with config and secret repositories for app bootstrapping
  • Common Tooling:
    • DNS
    • Consul
    • ZooKeeper
    • etcd

Signing

Cryptographic signatures can also provide attribution and provenance data (lineage and chain of custody). When combined with attestations (meta-data with a specific predicate in regard to the material being signed), they can be used to build up “trust telemetry”, or verifiable signals about the material and how it was processed. These are foundational elements of a Secure Software Supply Chain.

  • Allows for verifying Consistency and Integrity of contents
  • Can also provide provenance and attribution
  • Can be combined with attestations to create "Trust Telemetry"
  • Foundational to Secure Software Supply Chain practices
  • Common tools to perform signing include:
    • Sigstore
    • PGP
    • PKCS #11

Validation

Platforms can make use of API specifications and code generation to create validators for client interactions and data exchange. Kubernetes does this with its type system and Open API Spec V3 (at the time of this writing). Proper validation ensures that clients of the platform fail quickly and loudly if their requests are malformed or inconsistent with the platform’s API schemas.

Kubernetes also offers “admission control” as a lifecycle hook on client requests, in addition to validation against type schemas. However, this type of ad hoc validation can be implemented in many phases or locations of the platform tooling. Admission control can also be a common substrate for injecting policy controls or building guardrails within the platform to meet security or regulatory requirements.

When paired with Cryptographic signing, verification of the signatures on configurations and artifacts (like container images) can be done with admission control. This allows for the enforcement of policy only allowing verifiably good materials into an environment.

  • Ensures API specifications are abided
  • Can leverage code generation with proper tooling
  • Kubernetes Admission Control can enable a common policy plane
  • Cryptographic signing can be used to enforce validation for things like binary authorization
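
A small illustration with kubectl (the deployment.yaml file is hypothetical): a server-side dry run sends the request through schema validation and admission webhooks without persisting anything, so malformed manifests fail quickly and loudly.

# Inspect the OpenAPI-derived schema the API server validates against.
kubectl explain deployment.spec.replicas

# Server-side dry run: runs validation and admission, persists nothing.
kubectl apply --dry-run=server -f deployment.yaml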
- + \ No newline at end of file diff --git a/docs/intro/capabilities/workflow-orchestration.html b/docs/intro/capabilities/workflow-orchestration.html index 71cf70ec..e398bb99 100644 --- a/docs/intro/capabilities/workflow-orchestration.html +++ b/docs/intro/capabilities/workflow-orchestration.html @@ -10,13 +10,13 @@ - +

Workflow Orchestration

This is the tooling that allows for explicit orchestration of tasks. Usually this involves the process of getting applications ready for delivery.

This can be things like defining the set of activities deemed necessary as part of CI, including but not limited to running tests (unit tests, smoke tests, integration tests, acceptance tests, etc), validations, verifications, and configuration changes to the deployment environments.

  • Frequently imperative definitions of steps to be completed
  • Can use DSL for describing state machines or graphs
  • Can enable side effects like notifications or manual interventions
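
For illustration, a minimal sketch of such a DSL using an Argo Workflows manifest (the step names and image are hypothetical, and the notification is just an echo):

kubectl create -f - <<'EOF'
apiVersion: argoproj.io/v1alpha1
kind: Workflow
metadata:
  generateName: ci-example-
spec:
  entrypoint: main
  templates:
    - name: main
      dag:
        tasks:
          - name: run-tests
            template: echo
            arguments:
              parameters:
                - name: msg
                  value: "running unit and integration tests"
          - name: notify
            dependencies: [run-tests]
            template: echo
            arguments:
              parameters:
                - name: msg
                  value: "tests passed, sending notification"
    - name: echo
      inputs:
        parameters:
          - name: msg
      container:
        image: alpine:3.20
        command: [sh, -c]
        args: ["echo {{inputs.parameters.msg}}"]
EOF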
- + \ No newline at end of file diff --git a/docs/intro/personas.html b/docs/intro/personas.html index ccfa3929..cac0c1ec 100644 --- a/docs/intro/personas.html +++ b/docs/intro/personas.html @@ -10,13 +10,13 @@ - +

Personas

Application Developers

Experts in writing business-logic-specific code to be consumed by “customers”. Familiar with traditional programming languages and frameworks with minimal interest in infrastructure components outside of the ones in-use for their applications.

Package Builders

Experts in stitching together multiple components into reusable blueprints and delivering those as a service. Package builders are likely to “wrap” multiple infrastructure and application parts into singular deployables that can be leveraged by application developers.

Infrastructure Operators

Experts in deployment and management of the infrastructure and platform components that provide the foundation to business applications. Familiar with infra-as-code primitives and automation/scripting languages, as well as the fundamental characteristics of network, storage, and database. Also likely to have experience with application orchestration platforms and their underlying functionality.

Information Security Engineers (ISE)

Experts in applying and enforcing security and compliance best practices. ISEs partner with package builders to approve production-ready packages to be used across the organization.

- + \ No newline at end of file diff --git a/docs/reference-implementation/configs/access-management.html b/docs/reference-implementation/configs/access-management.html index 339811d7..f9b6453b 100644 --- a/docs/reference-implementation/configs/access-management.html +++ b/docs/reference-implementation/configs/access-management.html @@ -10,7 +10,7 @@ - + @@ -18,7 +18,7 @@

Access Management

Keycloak

In the implementation, Keycloak is used as the identity provider. This instance is used to log in to UIs such as Backstage and Argo.

Although it is not configured to be an identity broker or a user federation provider, you can configure it to be one. For example, you can configure it to federate users from Active Directory. Keycloak supports a large number of identity providers to integrate with. Please refer to the documentation for more information.

Backstage and Kubernetes Authentication

The reference implementation uses the server side authentication pattern. Server side authentication is the pattern in which all users on Backstage share the same credential and access level when accessing resources in the cluster. For example, for accessing secret resources, the same service account token is used for a configured Kubernetes cluster regardless of the user requesting resources. This is not ideal for use cases where a Backstage instance is shared by multiple teams. For example, when tying infrastructure and application provisioning to Backstage, it is important to ensure only authorized persons can perform certain actions. For example, only admins should be able to delete a Kubernetes cluster in AWS.
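
For reference, a sketch of what the server side pattern typically looks like in Backstage's app-config (the cluster name, URL, and environment variable below are placeholders, not the reference implementation's actual values):

cat >> app-config.local.yaml <<'EOF'
kubernetes:
  serviceLocatorMethod:
    type: multiTenant
  clusterLocatorMethods:
    - type: config
      clusters:
        - name: my-cluster
          url: https://my-cluster.example.com
          authProvider: serviceAccount
          # one shared token is used for every Backstage user
          serviceAccountToken: ${K8S_SA_TOKEN}
EOF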

Backstage has the ability to enforce policies through the Permission Framework about who can invoke what API actions. Although it is not enabled for the implementation currently, we would like to enable this in the future. Expanding on Backstage's permission framework, the examples provided in the documentation require writing policies in TypeScript, and they need to be pulled into the Backstage application code. From a Kubernetes-centric platform perspective, it makes a lot of sense to leverage policy engines like Kyverno or OPA Gatekeeper if possible.

Client side authentication can be more finely tuned. Client side authentication means actions are performed using the user's credentials. This means even if a cluster is listed and configured for use in Backstage, as long as the logged in user does not have permissions for the cluster, performing actions on the cluster is denied. Currently this is not natively supported by Backstage for EKS clusters. This requires more complex configuration and support from the Backstage frontend plugin to properly pass user credentials to the cluster through the Kubernetes proxy in the Backstage backend.

- + \ No newline at end of file diff --git a/docs/reference-implementation/configs/control-plane.html b/docs/reference-implementation/configs/control-plane.html index 24197c73..f13f3c9d 100644 --- a/docs/reference-implementation/configs/control-plane.html +++ b/docs/reference-implementation/configs/control-plane.html @@ -10,7 +10,7 @@ - + @@ -19,7 +19,7 @@ Once a bucket is created through Crossplane, it continuously enforces these configuration options. Even if an end user updates configuration manually through API calls or the console, the changes are reverted back automatically because Kubernetes operators continuously work to sync external resources with the specifications stored in the cluster.

Kubernetes CD solutions like ArgoCD are a great companion for this. ArgoCD allows you to automatically sync resources from a Git repository with many flexible options. With ArgoCD and Crossplane, many application infrastructure needs can be condensed to a YAML file that developers can self-service through a developer portal.
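
For illustration, such a YAML file might look like the following sketch, assuming the Upbound AWS S3 provider is installed (the bucket name, region, and ProviderConfig name are hypothetical):

kubectl apply -f - <<'EOF'
apiVersion: s3.aws.upbound.io/v1beta1
kind: Bucket
metadata:
  name: my-team-artifacts
spec:
  forProvider:
    region: us-west-2
  providerConfigRef:
    name: default
EOF
# Crossplane keeps reconciling: manual changes made in the AWS console are reverted.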

This pattern is not limited to Crossplane. There are other Kubernetes controllers for Infrastructure as Code tooling. Take the Terraform controller for example. This controller allows you to run Terraform modules in Kubernetes and exposes commonly used Terraform features as fields in the CRD. In the Terraform module you can enforce your organizational requirements.

Whichever tool you choose to use, use of a policy engine such as Kyverno or OPA Gatekeeper is essential in securing your cluster and external resources. We will include more examples of this pattern in the future.

- + \ No newline at end of file diff --git a/docs/reference-implementation/configs/secrets.html b/docs/reference-implementation/configs/secrets.html index 4b627282..f42155a9 100644 --- a/docs/reference-implementation/configs/secrets.html +++ b/docs/reference-implementation/configs/secrets.html @@ -10,13 +10,13 @@ - +

Secret Management

External Secrets Operator

If your organization requires sensitive data to be stored in a secret store such as Vault and Secrets Manager, you may need a way to retrieve secrets from your secret store into your cluster. External Secrets Operator is a Kubernetes Operator that fetches secrets from external APIs and creates Kubernetes secrets.

The reference implementation uses this operator to sync secrets between the cluster and AWS Secrets Manager. Information such as generated user passwords, the Keycloak admin password, and database passwords are stored as entries in AWS Secrets Manager.

TLS Certificates

If you opted to use cert-manager to manage certificates for your endpoints, certificates and their private keys are stored as Kubernetes secrets. If this does not meet your security standard, you can store them in a secret store of your choice, then use External Secrets Operator to sync them. An example manifest for Secrets Manager would look something like this.

apiVersion: external-secrets.io/v1beta1
kind: ExternalSecret
metadata:
  name: backstage-prod-tls
  namespace: backstage
spec:
  refreshInterval: 12h
  secretStoreRef:
    name: keycloak
    kind: SecretStore
  target:
    name: backstage-prod-tls
    template:
      type: kubernetes.io/tls
      data:
        tls.crt: "{{ .public }}"
        tls.key: "{{ .private }}"
  data:
    - secretKey: private
      remoteRef:
        key: cnoe/tls/dev # path to the tls cert in Secrets Manager
        property: PRIVATE_KEY
    - secretKey: public
      remoteRef:
        key: cnoe/tls/dev
        property: CERT

When removing the reference implementation installation from your cluster, the uninstall script will back up the secrets to a local directory. This is to avoid re-issuing Let's Encrypt certificate for the same host because Let's Encrypt has a limit on how many times you can request certificates in a given time.

- + \ No newline at end of file diff --git a/docs/reference-implementation/installations/app-idp.html b/docs/reference-implementation/installations/app-idp.html index 260f858f..ae66cc13 100644 --- a/docs/reference-implementation/installations/app-idp.html +++ b/docs/reference-implementation/installations/app-idp.html @@ -12,7 +12,7 @@ - + @@ -24,7 +24,7 @@ You can get eksctl from this link.
  • If you don't have a public registered Route53 zone, register a Route53 domain (be sure to use Route53 as the DNS service for the domain). We strongly encourage creating a dedicated sub domain for this. If you'd rather manage DNS yourself, you can set enable_dns_management in the config file.
  • Get the host zone id and put it in the config file.
    aws route53 list-hosted-zones-by-name --dns-name <YOUR_DOMAIN_NAME> --query 'HostedZones[0].Id' --output text | cut -d'/' -f3
    # in the setups/config file, update the zone id.
    HOSTEDZONE_ID=ZO020111111
  • Update the setups/config file with your own values.
  • Run setups/install.sh and follow the prompts. See the section below about monitoring installation progress.
  • Once installation completes, navigate to backstage.<DOMAIN_NAME> and log in as user1. The password is available as a secret. You may need to wait for DNS propagation to complete to be able to log in. This may take ~10 minutes.
    kubectl get secrets -n keycloak keycloak-user-config -o go-template='{{range $k,$v := .data}}{{printf "%s: " $k}}{{if not $v}}{{$v}}{{else}}{{$v | base64decode}}{{end}}{{"\n"}}{{end}}'
  • Monitoring installation progress

    Components are installed as ArgoCD Applications. You can monitor installation progress by going to ArgoCD UI.

    # Get the admin password 
    kubectl -n argocd get secret argocd-initial-admin-secret -o jsonpath="{.data.password}" | base64 -d

    kubectl port-forward svc/argocd-server -n argocd 8081:80

    Go to http://localhost:8081 and login with the username admin and password obtained above. In the UI you can look at resources created, their logs, and events.

    If you installed it without automatic DNS configuration.

    If you set MANAGED_DNS=false, you are responsible for updating DNS records, thus external-dns is not installed. You have to set the following DNS records:

    • backstage.<DOMAIN_NAME>
    • keycloak.<DOMAIN_NAME>
    • argo.<DOMAIN_NAME>
    • argocd.<DOMAIN_NAME>

    Point these records to the value returned by the following command.

    k get svc -n ingress-nginx ingress-nginx-controller -o jsonpath='{.status.loadBalancer.ingress[0].hostname}'

    If you installed it without Cert Manager.

    If you set MANAGED_CERT=false, you are responsible for managing TLS certs, thus cert-manager is not installed. You must create TLS secrets accordingly.

    Run the following command to find where to create secrets.

    output=$(kubectl get ingress --all-namespaces -o json | jq -r '.items[] | "\(.metadata.namespace) \(.spec.rules[].host) \(.spec.tls[].secretName)"')
    echo -e "Namespace \t Hostname \t TLS Secret Name"
    echo -e "$output"

    Secret format should be something like:

    apiVersion: v1
    kind: Secret
    metadata:
      name: backstage.<DOMAIN>
      namespace: backstage
    data:
      tls.crt: <base64 encoded cert>
      tls.key: <base64 encoded key>
    type: kubernetes.io/tls
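
    Equivalently, assuming the certificate and key exist as local files (tls.crt and tls.key below are placeholders), a secret of this shape can be created with:

    kubectl create secret tls backstage.<DOMAIN> -n backstage --cert=tls.crt --key=tls.key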

    What was created?

    The following components are installed if you chose the full installation option.

    Name                          Version
    argo-workflows                v3.4.8
    argocd                        v2.7.6
    aws-load-balancer-controller  v2.5.3
    backstage                     v1.16.0
    cert-manager                  v1.12.2
    crossplane                    v1.12.2
    external-dns                  v0.13.5
    ingress-nginx                 v1.8.0
    keycloak                      v22.0.0
    external-secrets              v0.9.2

    Things created outside of the cluster

    If full installation is done, you should have these DNS entries available. They all point to the Network Load Balancer.

    • backstage.<DOMAIN_NAME>
    • argo.<DOMAIN_NAME>
    • keycloak.<DOMAIN_NAME>

    You can confirm these by querying a public DNS resolver.

    dig A backstage.<DOMAIN_NAME> @1.1.1.1

    kubectl get svc -n ingress-nginx

    A Network Load Balancer is also created. This is managed by the AWS Load Balancer Controller and points to the ingress-nginx pod. This pod is responsible for routing requests to the correct places. As a result, HTTPS endpoints are created with valid certificates.

    openssl s_client -showcerts -servername id.<DOMAIN_NAME> -connect id.<DOMAIN_NAME>:443 <<< "Q"
    curl https://backstage.<DOMAIN_NAME>

    How to access the Backstage instance?

    When you open a browser window and go to https://backstage.<DOMAIN_NAME>, you should be prompted to login. Two users are created during the installation process: user1 and user2. Their passwords are available in the keycloak namespace.

    k get secrets -n keycloak keycloak-user-config -o go-template='{{range $k,$v := .data}}{{printf "%s: " $k}}{{if not $v}}{{$v}}{{else}}{{$v | base64decode}}{{end}}{{"\n"}}{{end}}'

    Uninstall

    1. Run setups/uninstall.sh and follow the prompts.
    2. Remove GitHub app from your Organization by following these steps.
    3. Remove token from your GitHub Organization by following these steps.
    4. Remove the created GitHub Organization.
    Uninstall details

    Resources deleted

    Currently resources created by applications are not deleted. For example, if you have Spark Jobs running, they are not deleted and may block deletion of the spark-operator app.

    What can you do in Backstage?

    See this doc for demos!

    Possible issues

    Cert-manager

    • by default it uses http-01 challenge. If you'd prefer using dns-01, you can update the ingress files.
    • You may get events like Get "http://<DOMAIN>/.well-known/acme-challenge/09yldI6tVRvtWVPyMfwCwsYdOCEGGVWhmb1PWzXwhXI": dial tcp: lookup <DOMAIN> on 10.100.0.10:53: no such host. This is due to DNS propagation delay. It may take ~10 minutes.

    Troubleshooting

    See the troubleshooting doc for more information.

    Creation Order notes

    Click to expand

    Things created outside of the cluster with Keycloak SSO enabled.

    • Route53 records. Route53 hosted zones are not created. You must also register the domain if you want to be able to access it through public DNS. These records are managed by the external DNS controller.

    • AWS Network Load Balancer. This is just the entrance to the Kubernetes cluster. This points to the default installation of Ingress Nginx and is managed by AWS Load Balancer Controller.

    • TLS Certificates issued by Let's Encrypt. These are managed by cert-manager based on values in Ingress. They use the production issuer, which means we must be very careful with how many and how often we request certificates from them. The uninstall scripts back up certificates to the private directory to avoid re-issuing certificates.

    These resources are controlled by Kubernetes controllers and thus should be deleted using controllers.

    Keycloak SSO with DNS and TLS certificates

    If using Keycloak SSO with fully automated DNS and certificate management, the installation order must be:

    1. aws-load-balancer-controller
    2. ingress-nginx
    3. cert-manager
    4. external-dns
    5. keycloak
    6. The rest of the components

    Keycloak SSO with manual DNS and TLS Certificates

    If using Keycloak SSO but managing DNS records and certificates manually:

    1. aws-load-balancer-controller
    2. ingress-nginx
    3. The rest of the components, minus cert-manager and external-dns

    In this case, you can issue your own certs and provide them as TLS secrets as specified in the spec.tls[0].secretName field of Ingress objects. You can also let NLB or ALB terminate TLS instead using the LB controller. This is not covered currently, but possible.

    No SSO

    If no SSO, no particular installation order. Eventual consistency works.

    - + \ No newline at end of file diff --git a/docs/reference-implementation/installations/idpbuilder.html b/docs/reference-implementation/installations/idpbuilder.html index 4d69469d..be6b0aa0 100644 --- a/docs/reference-implementation/installations/idpbuilder.html +++ b/docs/reference-implementation/installations/idpbuilder.html @@ -10,13 +10,13 @@ - +

    idpBuilder on Local Machine

    About

    Spin up a complete internal developer platform using industry standard technologies like Kubernetes, ArgoCD, and Backstage with only Docker required as a dependency.

    This can be useful in several ways:

    • Create a single binary which can demonstrate an IDP reference implementation.
    • Use within CI to perform integration testing.
    • Use as a local development environment for IDP engineers.
    - + \ No newline at end of file diff --git a/docs/reference-implementation/installations/idpbuilder/how-it-works.html b/docs/reference-implementation/installations/idpbuilder/how-it-works.html index 91a41c21..6ae771b7 100644 --- a/docs/reference-implementation/installations/idpbuilder/how-it-works.html +++ b/docs/reference-implementation/installations/idpbuilder/how-it-works.html @@ -10,7 +10,7 @@ - + @@ -25,10 +25,15 @@ This allows us to resolve the domain name inside and outside cluster to the same endpoint.

    As described above, the default domain name, cnoe.localtest.me, resolves to a local loopback address such as 127.0.0.1. This works for accessing the ingress-nginx service from outside the cluster because the service port is exposed as NodePort on the local machine.

    This approach does not work for in-cluster traffic because the address resolves to local loopback interface. For example, if ArgoCD pod wants to access Gitea at gitea.cnoe.localtest.me, the address resolves to 127.0.0.1 which is the local loopback address within the node. -To ensure ArgoCD can talk to Gitea services, in-cluster DNS must be configured like so:

    rewrite name gitea.cnoe.localtest.me ingress-nginx-controller.ingress-nginx.svc.cluster.local

    This CoreDNS rewrite rule instructs CoreDNS to resolve requests made for gitea.cnoe.localtest.me using the address given by ingress-nginx-controller.ingress-nginx.svc.cluster.local

    Core Packages

    idpbuilder installs the following packages to the cluster.

    • ArgoCD is the GitOps solution to deploy manifests to Kubernetes clusters. In this project, a package is an ArgoCD application.
    • Gitea server is the in-cluster Git server that ArgoCD can be configured to sync resources from. You can sync from local file systems to this.
    • Ingress-nginx is used as a method to access in-cluster resources such as ArgoCD UI and Gitea UI.

    Once installed, idpbuilder passes control over these packages to ArgoCD by storing manifests in Gitea repositories then creating ArgoCD applications. From here on, ArgoCD manages them based on manifests checked into Git repositories.

    Getting Relevant Secrets

    The idpbuilder get secrets command retrieves the following:

    • ArgoCD initial admin password.
    • Gitea admin user credentials.
    • Any secrets labeled with cnoe.io/cli-secret=true.

    You can think of the command as executing the following kubectl commands:

    kubectl -n argocd get secret argocd-initial-admin-secret
    kubectl get secrets -n gitea gitea-admin-secret
    kubectl get secrets -A -l cnoe.io/cli-secret=true

    If you want to retrieve secrets for a package, you can use the -p flag. To get secrets for a package named gitea:

    idpbuilder get secrets -p gitea

    For the -p flag to work, you must label the secret with cnoe.io/package-name. +To ensure ArgoCD can talk to Gitea services, in-cluster DNS must be configured like so:

    rewrite name gitea.cnoe.localtest.me ingress-nginx-controller.ingress-nginx.svc.cluster.local

    This CoreDNS rewrite rule instructs CoreDNS to resolve requests made for gitea.cnoe.localtest.me using the address given by ingress-nginx-controller.ingress-nginx.svc.cluster.local

    Domain-based and Path-based routing

    idpbuilder supports two modes of routing requests to in-cluster resources: domain-based and path-based. +The behavior is configured with the --use-path-routing flag, which defaults to false.

    Domain-based routing

    This is the default behavior of idpbuilder. In this mode, services are exposed under their own domain names. +For example:

    • ArgoCD UI is accessed via https://argocd.cnoe.localtest.me
    • Gitea UI is accessed via https://gitea.cnoe.localtest.me

    This approach is generally cleaner and offers more flexible routing options because it requires less complex ingress configurations.

    Path-based routing

    When you use the --use-path-routing flag, idpbuilder configures all services under a single domain name, with routing based on path parameters. +For example:

    • ArgoCD UI is accessed via https://cnoe.localtest.me/argocd
    • Gitea UI is accessed via https://cnoe.localtest.me/gitea

    This is useful when you are constrained to using a single domain name and cannot use subdomains. +A good example is when using GitHub Codespaces. When forwarding ports in Codespaces, you are given a single domain name (like wild-broomstick-abc.github.dev) to reach all services running in your codespace. +In such situations, you cannot use subdomains (e.g., argocd.wild-broomstick-abc.github.dev would not work), making path-based routing the appropriate choice.

    Core Packages

    idpbuilder installs the following packages to the cluster.

    • ArgoCD is the GitOps solution to deploy manifests to Kubernetes clusters. In this project, a package is an ArgoCD application.
    • Gitea server is the in-cluster Git server that ArgoCD can be configured to sync resources from. You can sync from local file systems to this.
    • Ingress-nginx is used as a method to access in-cluster resources such as ArgoCD UI and Gitea UI.

    Once installed, idpbuilder passes control over these packages to ArgoCD by storing manifests in Gitea repositories then creating ArgoCD applications. From here on, ArgoCD manages them based on manifests checked into Git repositories.

    Getting Relevant Secrets

    The idpbuilder get secrets command retrieves the following:

    • ArgoCD initial admin password.
    • Gitea admin user credentials.
    • Any secrets labeled with cnoe.io/cli-secret=true.

    You can think of the command as executing the following kubectl commands:

    kubectl -n argocd get secret argocd-initial-admin-secret
    kubectl get secrets -n gitea gitea-admin-secret
    kubectl get secrets -A -l cnoe.io/cli-secret=true

    If you want to retrieve secrets for a package, you can use the -p flag. To get secrets for a package named gitea:

    idpbuilder get secrets -p gitea

    For the -p flag to work, you must label the secret with cnoe.io/package-name. For example, to make secret values available in a secret named my-secret for a package named foo:

    kubectl label secret my-secret "cnoe.io/package-name=foo" "cnoe.io/cli-secret=true"

    The secret will then be listed when issuing the idpbuilder get secrets command. -Alternatively, you can use the following command to retrieve the individual secret:

    idpbuilder get secrets -p foo
    - +Alternatively, you can use the following command to retrieve the individual secret:

    idpbuilder get secrets -p foo
    + \ No newline at end of file diff --git a/docs/reference-implementation/installations/idpbuilder/local-oci-registry.html b/docs/reference-implementation/installations/idpbuilder/local-oci-registry.html index b921ad28..0eead932 100644 --- a/docs/reference-implementation/installations/idpbuilder/local-oci-registry.html +++ b/docs/reference-implementation/installations/idpbuilder/local-oci-registry.html @@ -10,7 +10,7 @@ - + @@ -21,7 +21,7 @@ https://specs.opencontainers.org/distribution-spec/?v=v1.0.0#checking-if-content-exists-in-the-registry

    Pulling Images From Inside Idpbuilder K8s Cluster:

    Because we are using an NGINX Ingress and pushing our image from off cluster, Gitea and its OCI registry think all images pushed to it are prefixed with gitea.cnoe.localtest.me:8443.

    This is correct by the OCI spec standards. However, when you are on the cluster, that ingress is not available to you. You can use the service name of gitea, but gitea will not know what images are being asked for at the svc domain name. To work around this issue, we use containerd to rewrite those image names so that they can be referenced at the external url:
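
    As a rough sketch of the mechanism (illustrative values only, not idpbuilder's actual configuration), a Kind cluster config can patch containerd so that image references under the external registry name are resolved through a reachable endpoint:

    cat > kind-config.yaml <<'EOF'
    kind: Cluster
    apiVersion: kind.x-k8s.io/v1alpha4
    containerdConfigPatches:
      - |-
        [plugins."io.containerd.grpc.v1.cri".registry.mirrors."gitea.cnoe.localtest.me:8443"]
          endpoint = ["https://gitea.cnoe.localtest.me"]
    EOF
    kind create cluster --config kind-config.yaml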

    See the Kind config for how this is done.

    - + \ No newline at end of file diff --git a/docs/reference-implementation/installations/idpbuilder/override.html b/docs/reference-implementation/installations/idpbuilder/override.html index 5b495efb..5167107d 100644 --- a/docs/reference-implementation/installations/idpbuilder/override.html +++ b/docs/reference-implementation/installations/idpbuilder/override.html @@ -10,7 +10,7 @@ - + @@ -25,7 +25,7 @@ The specified file can contain multiple yaml documents. The format of this flag is <PACKAGE_NAME>:<PATH>. Where <PACKAGE_NAME> is one of argocd, nginx, and gitea. You can find the built-in manifests in your local Gitea repositories or in our source files:

    For example, if you'd like to override the ArgoCD ConfigMap, you can run idpbuilder like this:

    idpbuilder create -c argocd:/tmp/override.yaml

    The contents of /tmp/override.yaml is:

    apiVersion: v1
    data:
      application.resourceTrackingMethod: annotation
      resource.exclusions: |
        - kinds:
            - ProviderConfigUsage
          apiGroups:
            - "*"
    kind: ConfigMap
    metadata:
      labels:
        app.kubernetes.io/name: argocd-cm
        app.kubernetes.io/part-of: argocd
      name: argocd-cm

    The corresponding built-in manifest can be found here.

    This instructs idpbuilder to use the provided manifest for argocd-cm only. The built-in Kubernetes manifests are used for everything but argocd-cm.

    - + \ No newline at end of file diff --git a/docs/reference-implementation/installations/idpbuilder/quick-start.html b/docs/reference-implementation/installations/idpbuilder/quick-start.html index 2cc5c7f1..e37cfa30 100644 --- a/docs/reference-implementation/installations/idpbuilder/quick-start.html +++ b/docs/reference-implementation/installations/idpbuilder/quick-start.html @@ -10,14 +10,14 @@ - +

    Quick Start

    Prerequisites

    A container engine is needed locally, such as:

    Name             Supported  Remark
    Docker desktop   Yes
    Podman desktop   No         idpbuilder can create a cluster using podman rootful
    Finch            No

    Note: When using Podman, set the DOCKER_HOST env var so idpbuilder can talk to the engine (e.g. export DOCKER_HOST="unix:///var/run/docker.sock")

    Quick Start

    You can execute the following bash script to get started with a running version of the idpBuilder (inspect the script first if you have concerns):

    danger
    curl -fsSL https://raw.githubusercontent.com/cnoe-io/idpbuilder/main/hack/install.sh | bash

    Verify a successful installation by running the following command and inspecting the output for the right version:

    idpbuilder version

    Alternatively, you can run the following commands for a manual installation:

    version=$(curl -Ls -o /dev/null -w %{url_effective} https://github.com/cnoe-io/idpbuilder/releases/latest)
    version=${version##*/}
    curl -L -o ./idpbuilder.tar.gz "https://github.com/cnoe-io/idpbuilder/releases/download/${version}/idpbuilder-$(uname | awk '{print tolower($0)}')-$(uname -m | sed 's/x86_64/amd64/').tar.gz"
    tar xzf idpbuilder.tar.gz

    ./idpbuilder version
    # example output
    # idpbuilder 0.4.1 go1.21.5 linux/amd64

    Or, you can download the latest binary from the release page.

    Running in Codespaces

    You can run idpbuilder in Codespaces.

    Create a Codespaces instance.

    1. Wait for it to be ready. It may take several minutes.

    2. Get the latest release of idpbuilder:

       version=$(curl -Ls -o /dev/null -w %{url_effective} https://github.com/cnoe-io/idpbuilder/releases/latest)
      version=${version##*/}
      curl -L -o ./idpbuilder.tar.gz "https://github.com/cnoe-io/idpbuilder/releases/download/${version}/idpbuilder-$(uname | awk '{print tolower($0)}')-$(uname -m | sed 's/x86_64/amd64/').tar.gz"
      tar xzf idpbuilder.tar.gz
    3. Run idpbuilder:

       idpbuilder create --protocol http  \
      --host ${CODESPACE_NAME}-8080.${GITHUB_CODESPACES_PORT_FORWARDING_DOMAIN} \
      --port 8080 --use-path-routing
    4. Because Codespaces gives a single externally routable host name for an instance, idpbuilder must deploy with path-based routing. This means the ArgoCD and Gitea UI URLs are given by the following commands.

      • ArgoCD: echo https://${CODESPACE_NAME}-8080.${GITHUB_CODESPACES_PORT_FORWARDING_DOMAIN}/argocd
      • Gitea: echo https://${CODESPACE_NAME}-8080.${GITHUB_CODESPACES_PORT_FORWARDING_DOMAIN}/gitea

      Note that not all examples work with path based routing.

    - + \ No newline at end of file diff --git a/docs/reference-implementation/installations/idpbuilder/troubleshooting.html b/docs/reference-implementation/installations/idpbuilder/troubleshooting.html index 6f24b2e9..c0b7e53f 100644 --- a/docs/reference-implementation/installations/idpbuilder/troubleshooting.html +++ b/docs/reference-implementation/installations/idpbuilder/troubleshooting.html @@ -10,7 +10,7 @@ - + @@ -18,7 +18,7 @@

    Troubleshooting

    Podman support

    idpBuilder comes with experimental support for Podman. Although idpbuilder seems to work well with Podman, we currently do not have end-to-end tests for it, so support is not guaranteed.

    To get started with Podman on MacOS, run the following:

    # create a local Linux VM
    podman machine init
    podman machine start

    # KIND_EXPERIMENTAL_PROVIDER instructs Kind to use podman as its provider
    KIND_EXPERIMENTAL_PROVIDER=podman idpbuilder create

    Podman rootless

    As of podman 5.0.0, it defaults to rootless mode. idpbuilder's core packages do not require root privileges, however you may need to run it in rootful mode depending on your use cases. If you need rootful behaviours, run the following command:

    # verify if you are running rootful or rootless
    podman machine inspect | jq '.[0].Rootful'
    # https://docs.podman.io/en/stable/markdown/podman-machine-set.1.html
    podman machine set --rootful

    Missing Kernel modules

    In some environments such as RHEL 9, idpbuilder may fail with the following error message:

    Failed to create pod sandbox: rpc error: code = Unknown desc = failed to setup network for sandbox "": plugin type="portmap" failed (add): unable to create chain CNI-HOSTPORT-SETMARK: failed to list iptables chains: running [/usr/sbin/iptables -t nat -S --wait]: exit status 3: modprobe: ERROR: could not insert 'ip_tables': Operation not permitted
    iptables v1.8.9 (legacy): can't initialize iptables table `nat': Table does not exist (do you need to insmod?)

    You may need to enable the ip_table module.

    # check if ip_table is enabled
    lsmod | grep ip_table
    # if not, enable it.
    sudo modprobe ip_tables
    echo 'ip_tables' | sudo tee -a /etc/modules-load.d/ip_tables.conf
    # verify it's activated
    lsmod | grep ip_table

    Gitea OCI registry

    When using the Gitea OCI registry, you may run into issues where your client cannot pull or push images to the registry.

    podman pull gitea.cnoe.localtest.me:8443/giteaadmin/ubuntu:24.08
    Trying to pull gitea.cnoe.localtest.me:8443/giteaadmin/ubuntu:24.08...
    Error: initializing source docker://gitea.cnoe.localtest.me:8443/giteaadmin/ubuntu:24.08: reading manifest 24.08 in gitea.cnoe.localtest.me:8443/giteaadmin/ubuntu: manifest unknown

    You may need to tell your client to not verify TLS, because the TLS certificate is self-signed and generated every time a new cluster is created.

    # use the --tls-verify=0 flag if you are using podman
    podman pull gitea.cnoe.localtest.me:8443/giteaadmin/ubuntu:24.08 --tls-verify=0
    - + \ No newline at end of file diff --git a/docs/reference-implementation/installations/idpbuilder/usage.html b/docs/reference-implementation/installations/idpbuilder/usage.html index 0fabed1e..fb966d31 100644 --- a/docs/reference-implementation/installations/idpbuilder/usage.html +++ b/docs/reference-implementation/installations/idpbuilder/usage.html @@ -10,7 +10,7 @@ - + @@ -26,7 +26,7 @@ By default, idpbuilder exposes the ingress-nginx service on host port 8443 and Kubernetes Ingress objects are created for core packages. For example, an ingress object for Gitea looks something like this:

    apiVersion: networking.k8s.io/v1
    kind: Ingress
    spec:
      ingressClassName: nginx
      rules:
        - host: gitea.cnoe.localtest.me
          http:
            paths:
              - path: /
                backend:
                  service:
                    name: my-gitea-http

    With this configuration, nginx routes traffic to the Gitea service when HTTP requests are made for gitea.cnoe.localtest.me.

    Similarly, you can expose your own service by defining an ingress object. For example, to expose a service named my-service at my-service.cnoe.localtest.me, the ingress object may look something like this.

    apiVersion: networking.k8s.io/v1
    kind: Ingress
    metadata:
      name: my-service
    spec:
      ingressClassName: nginx
      rules:
        - host: my-service.cnoe.localtest.me
          http:
            paths:
              - backend:
                  service:
                    name: my-service
                    port:
                      number: 80
                path: /
                pathType: Prefix
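
    Once the Ingress above is applied, and assuming the default host port 8443 mentioned earlier, the service should be reachable from the host machine with something like:

    # -k: the local cluster serves a self-signed certificate (see the troubleshooting page).
    curl -k https://my-service.cnoe.localtest.me:8443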
    - + \ No newline at end of file diff --git a/docs/reference-implementation/integrations/generated.html b/docs/reference-implementation/integrations/generated.html index b9590c06..d0f0ad31 100644 --- a/docs/reference-implementation/integrations/generated.html +++ b/docs/reference-implementation/integrations/generated.html @@ -10,7 +10,7 @@ - + @@ -23,7 +23,7 @@ streamlines some common developer workflows. This involves tooling that helps with migrating developer workflows to your developer portal. This also involves adding verifications and extensions to developer workflows.

    - + \ No newline at end of file diff --git a/docs/reference-implementation/integrations/generated/crd-templating.html b/docs/reference-implementation/integrations/generated/crd-templating.html index 4dc22bb2..cd47c45a 100644 --- a/docs/reference-implementation/integrations/generated/crd-templating.html +++ b/docs/reference-implementation/integrations/generated/crd-templating.html @@ -10,7 +10,7 @@ - + @@ -47,7 +47,7 @@ Kubernetes:

    create

    Where deploying the resource will result in running the Backstage scaffolder and getting the resource deployed to a target cluster as configured in your template:

    deploy

    - + \ No newline at end of file diff --git a/docs/reference-implementation/integrations/generated/tf-templating.html b/docs/reference-implementation/integrations/generated/tf-templating.html index 1f7267de..5c43d7a6 100644 --- a/docs/reference-implementation/integrations/generated/tf-templating.html +++ b/docs/reference-implementation/integrations/generated/tf-templating.html @@ -10,13 +10,13 @@ - +

    Templating of Terraform Modules

    The CNOE CLI supports integration of Terraform modules into the developer portal.

    Template Generation

    To generate Backstage template input fields from Terraform modules, you can use the tf subcommand. Usage is shown below.

    Generate backstage templates by walking the given input directory, find TF modules,then create output file per module.
    If the templatePath and insertionPoint flags are set, generated objects are merged into the given template at given insertion point.
    Otherwise a yaml file with two keys are generated. The properties key contains the generated form input. The required key contains the TF variable names that do not have defaults.

    Usage:
    cnoe template tf [flags]

    Flags:
    -h, --help help for tf

    Global Flags:
    -c, --colllapse if set to true, items are rendered and collapsed as drop down items in a single specified template
    --depth uint32 depth from given directory to search for TF modules or CRDs (default 2)
    -i, --inputDir string input directory for CRDs and XRDs to be templatized
    -p, --insertAt string jq path within the template to insert backstage info (default ".spec.parameters[0]")
    -o, --outputDir string output directory for backstage templates to be stored in
    --raww templatePath prints the raw open API output without putting it into a template (ignoring templatePath and `insertAt`)
    -t, --templatePath string path to the template to be augmented with backstage info

    Example

    We can run the command against one of modules within the Data on EKS repository.

    git clone https://github.com/awslabs/data-on-eks.git /tmp/data-on-eks

    git clone https://github.com/cnoe-io/reference-implementation-aws.git /tmp/ref-impl

    cnoe template tf \
    -i /tmp/data-on-eks/analytics/terraform/spark-k8s-operator \
    -t /tmp/ref-impl/examples/template-generation/data-on-eks.yaml \
    -p '.spec.parameters[0].properties.tfVars' \
    -o .

    The -i flag specifies the input Terraform module directory. In this example, the content looks like this:

    ls /tmp/data-on-eks/analytics/terraform/spark-k8s-operator
    README.md data.tf karpenter-provisioners spark-team.tf
    addons.tf examples main.tf variables.tf
    amp.tf helm-values outputs.tf versions.tf
    cleanup.sh install.sh providers.tf vpc.tf

    The -t flag specifies the location of the partially configured template file. It may look something like this:

    apiVersion: scaffolder.backstage.io/v1beta3
    kind: Template
    spec:
      parameters:
        - title: Terraform config options
          properties:
            tfVars: # this field is to be generated.
              title: Terraform variables
              type: object
        - title: Configuration Options
          properties:
            name:
              title: name of this entry
              type: string
            namespace:
              title: namespace within the kubernetes cluster to deploy this
              type: string
              default: data-on-eks
            adminRoleName:
              title: Admin Role Name
              description: Name of the role to give the administrative rights on the EKS cluster.
              default: Admin
              type: string
            clusterName:
              title: Cluster to run
              description: The cluster to run this workflow in.
              type: string
              ui:field: KubernetesClusterPicker
            repoUrl: # need a place to store this entity information.
              title: Repository Location
              type: string
              ui:field: RepoUrlPicker
              ui:options:
                allowedHosts:
                  - github.com
    ...

    This template contains input fields (.spec.parameters[1]) that are common to all Data on EKS blueprints. For example, the name of the admin IAM role that will have Cluster Admin access is common to all EKS clusters. The only difference between templates is the Terraform configuration options field. We will populate this field with variables from a Terraform module.

    The -p flag specifies where you want to insert input field within the given template. In this case, we want to insert it at .spec.parameters[0].properties.tfVars.

    The -o flag specifies the output directory. In this case, we want it to output it to the current directory.

    Once the fields are generated and inserted, the template is ready to use. When rendered in Backstage, it should look something like this.

    The diff between the original template and generated template should look something like this:

    spec.parameters
    - one list entry removed:
    - title: "Terraform config options"
    │ properties:
    │ │ tfVars:
    │ │ │ type: object
    │ │ │ title: "Terraform variables"

    + one list entry added:
    - properties:
    │ │ tfVars:
    │ │ │ type: object
    │ │ │ title: "Terraform variables"
    │ │ │ properties:
    │ │ │ │ name:
    │ │ │ │ │ type: string
    │ │ │ │ │ default: spark-operator-doeks
    │ │ │ │ │ description: "Name of the VPC and EKS Cluster"
    │ │ │ │ eks_cluster_version:
    │ │ │ │ │ type: string
    │ │ │ │ │ default: 1.26
    │ │ │ │ │ description: "EKS Cluster version"
    │ │ │ │ enable_amazon_prometheus:
    │ │ │ │ │ type: boolean
    │ │ │ │ │ default: true
    │ │ │ │ │ description: "Enable AWS Managed Prometheus service"
    │ │ │ │ enable_vpc_endpoints:
    │ │ │ │ │ type: boolean
    │ │ │ │ │ default: false
    │ │ │ │ │ description: "Enable VPC Endpoints"
    │ │ │ │ enable_yunikorn:
    │ │ │ │ │ type: boolean
    │ │ │ │ │ default: true
    │ │ │ │ │ description: "Enable Apache YuniKorn Scheduler"
    │ │ │ │ region:
    │ │ │ │ │ type: string
    │ │ │ │ │ default: us-west-2
    │ │ │ │ │ description: Region
    │ │ │ │ vpc_cidr:
    │ │ │ │ │ type: string
    │ │ │ │ │ default: 10.1.0.0/16
    │ │ │ │ │ description: "VPC CIDR. This should be a valid private (RFC 1918) CIDR range"
    │ │ │ │ eks_data_plane_subnet_secondary_cidr:
    │ │ │ │ │ type: array
    │ │ │ │ │ description: "Secondary CIDR blocks. 32766 IPs per Subnet per Subnet/AZ for EKS Node and Pods"
    │ │ │ │ │ default:
    │ │ │ │ │ - 100.64.0.0/17
    │ │ │ │ │ - 100.64.128.0/17
    │ │ │ │ │ items:
    │ │ │ │ │ │ type: string
    │ │ │ │ private_subnets:
    │ │ │ │ │ type: array
    │ │ │ │ │ description: "Private Subnets CIDRs. 254 IPs per Subnet/AZ for Private NAT + NLB + Airflow + EC2 Jumphost etc."
    │ │ │ │ │ default:
    │ │ │ │ │ - 10.1.1.0/24
    │ │ │ │ │ - 10.1.2.0/24
    │ │ │ │ │ items:
    │ │ │ │ │ │ type: string
    │ │ │ │ public_subnets:
    │ │ │ │ │ type: array
    │ │ │ │ │ description: "Public Subnets CIDRs. 62 IPs per Subnet/AZ"
    │ │ │ │ │ default:
    │ │ │ │ │ - 10.1.0.0/26
    │ │ │ │ │ - 10.1.0.64/26
    │ │ │ │ │ items:
    │ │ │ │ │ │ type: string
    │ │ │ │ secondary_cidr_blocks:
    │ │ │ │ │ type: array
    │ │ │ │ │ description: "Secondary CIDR blocks to be attached to VPC"
    │ │ │ │ │ default:
    │ │ │ │ │ - 100.64.0.0/16
    │ │ │ │ │ items:
    │ │ │ │ │ │ type: string
    │ title: "Terraform config options"
    - + \ No newline at end of file diff --git a/docs/reference-implementation/integrations/localstack.html b/docs/reference-implementation/integrations/localstack.html index d4dba5b1..7a1b3376 100644 --- a/docs/reference-implementation/integrations/localstack.html +++ b/docs/reference-implementation/integrations/localstack.html @@ -10,13 +10,13 @@ - +

    Local Stack

    Enable the Integration

    Please use the below command to deploy an IDP reference implementation with an Argo application that adds Localstack, as well as integrating with Crossplane.

    idpbuilder create \
    --use-path-routing \
    --package https://github.com/cnoe-io/stacks//ref-implementation \
    --package https://github.com/cnoe-io/stacks//localstack-integration

    As you see above, this add-on to idpbuilder has a dependency on the reference implementation. This command primarily does the following:

    1. Installs the localstack helm chart as an Argo application.
    2. Adds the localstack Crossplane ProviderConfig, targeting LocalStack

    Once the custom package is installed, LocalStack can be used from the Backstage template app-with-aws-resources by changing the providerConfigName on the bucket configuration page from default to localstack.

    - + \ No newline at end of file diff --git a/docs/reference-implementation/integrations/reference-impl.html b/docs/reference-implementation/integrations/reference-impl.html index b4d5c350..c1d78079 100644 --- a/docs/reference-implementation/integrations/reference-impl.html +++ b/docs/reference-implementation/integrations/reference-impl.html @@ -10,7 +10,7 @@ - + @@ -35,7 +35,7 @@ This is done for convenience and demonstration purposes only. There are alternative actions that you can use. For example, you can create a PR to an existing repository, create a repository but not deploy them yet, etc.

  • If Backstage's pipelining and templating mechanisms is too simple, you can use more advanced workflow engines like Tekton or Argo Workflows. You can invoke them in Backstage templates, then track progress similar to how it was described above.

  • - + \ No newline at end of file diff --git a/docs/reference-implementation/integrations/terraform.html b/docs/reference-implementation/integrations/terraform.html index ad5e0f08..e5677390 100644 --- a/docs/reference-implementation/integrations/terraform.html +++ b/docs/reference-implementation/integrations/terraform.html @@ -10,13 +10,13 @@ - +

    Terraform Modules

    Enable the Integration

    Use the below command to deploy idpbuilder and make sure the Backstage Terraform integration Argo application is deployed as part of your setup.

    idpbuilder create \
    --use-path-routing \
    --package https://github.com/cnoe-io/stacks//ref-implementation \
    --package https://github.com/cnoe-io/stacks//terraform-integrations

    As you see above, this add-on to idpbuilder has a dependency on the reference implementation. This command primarily does the following:

    1. Installs the Source Controller from Flux CD to clone the Terraform modules to install
    2. Installs the Tofu Controller to pull Terraform values files and run terraform apply on the pulled modules

    Setup

    Follow the instructions on the Backstage Terraform Integrations repo to enable the modules in the Backstage environment.

    - + \ No newline at end of file diff --git a/docs/reference-implementation/integrations/terraform/s3-bucket.html b/docs/reference-implementation/integrations/terraform/s3-bucket.html index 7ab75157..516dbff7 100644 --- a/docs/reference-implementation/integrations/terraform/s3-bucket.html +++ b/docs/reference-implementation/integrations/terraform/s3-bucket.html @@ -10,13 +10,13 @@ - +

    Amazon S3 Bucket

    This pattern demonstrates the creation of an Amazon S3 bucket in an AWS region. You can download the respective Backstage templates from the s3 bucket folder under cnoe-io/backstage-terraform-integrations.

    Prerequisite

    You need to add AWS credentials before deploying this pattern.

    Deployment

    Once you are done with setting up backstage-terraform-integrations, navigate to Backstage and click on create in the left pane to view the list of available platform templates and click Choose on the Creates an Amazon S3 Bucket pattern as shown below:

    Backstage Template Console

    Next, populate the terraform variables for the pattern deployment as shown below and click Review:

    Backstage NVDIA Console

    Next, validate the entered variables in the below confirmation screen and click Create :

    Backstage NVDIA Terraform Vars

    Next, check on the steps of the Backstage template run as shown below and click Open In Catalog:

    Backstage Run

    Next, check on the below screen showing the created Backstage component and click View Source to navigate to the Gitea repository:

    Backstage Component

    Next, check on the Gitea repo of the created component as shown below:

    Gitea Console

    Next, navigate to the ArgoCD console and to the Argo App named backstage-terraform-s3-intg to view the below screen:

    ArgoCD Console

    Validation

    Next, let's validate the execution of the pattern by the tofu controller. Run the below command on your terminal to check on the terraforms.infra.contrib.fluxcd.io resource:

    > kubectl get terraforms.infra.contrib.fluxcd.io -A

    NAMESPACE NAME READY STATUS AGE
    flux-system aws-s3-bucket-backstage-terraform-s3-intg Unknown Reconciliation in progress 4m17s

    Next, let's check on the Kubernetes pod in the flux-system namespace which executes the terraform code:

    > kubectl get pods -n flux-system

    NAME READY STATUS RESTARTS AGE
    aws-s3-bucket-backstage-terraform-s3-intg-tf-runner 1/1 Running 0 3m22s
    notification-controller-5487f8c847-7w9dp 1/1 Running 0 17h
    source-controller-69bcb7cd85-92nhv 1/1 Running 0 17h
    tf-controller-7f8c8bbdfc-8rmvq 1/1 Running 0 17h

    Let's wait ~5 minutes for the terraform apply to be fully completed by the tofu controller, then navigate to the Amazon S3 console to view the created S3 bucket:

    AWS Console

    Delete Workflow

    Please follow these steps if you are looking to delete the s3-bucket-backstage-terraform-s3-intg component created using the Backstage Terraform integrations. The Terraform resources in this repo are configured to clean up the corresponding cloud resources. When the Argo CD application is deleted, the deletion hook for cloud resources kicks in (it takes a little bit of time though).

    1. In your ArgoCD console, navigate to the backstage-terraform-s3-intg application created for your component and delete it manually.
    2. In your Gitea console, navigate to the backstage-terraform-s3-intg repository for your component and delete it manually under settings.
    3. Finally, in your Backstage console, navigate to the s3-bucket-backstage-terraform-s3-intg component and click on unregister component to remove the deleted component.
    - + \ No newline at end of file diff --git a/docs/reference-implementation/integrations/terraform/serverless-microservice-pattern.html b/docs/reference-implementation/integrations/terraform/serverless-microservice-pattern.html index 75640b81..a7420c0e 100644 --- a/docs/reference-implementation/integrations/terraform/serverless-microservice-pattern.html +++ b/docs/reference-implementation/integrations/terraform/serverless-microservice-pattern.html @@ -10,13 +10,13 @@ - +

    Serverless Microservice

    This pattern demonstrates a Serverless Microservice built using Amazon API Gateway, AWS Lambda, and Amazon DynamoDB. You can download the respective Backstage templates from the serverless microservice folder under cnoe-io/backstage-terraform-integrations.

    Prerequisite

    You need to add AWS credentials before deploying this pattern.

    Deployment

    Navigate to Backstage, click on Create in the left pane to view the list of available platform templates, and click Choose on the Serverless Microservice pattern.

    Next, populate the Terraform variables for the pattern deployment as shown below and click on Review.

    Backstage

    Next, validate the entered variables in the below confirmation screen and click Create :

    Backstage

    Next, check on the steps of the Backstage template run as shown below and click Open In Catalog:

    Backstage

    Next, check on the below screen showing the created Backstage component and click View Source to navigate to the Gitea repository:

    Backstage

    Next, check on the Gitea repo of the created component as shown below:

    Backstage

    Next, navigate to the ArgoCD console and to the Argo App named todo to view the below screen:

    Backstage

    Validation

    Next, let's validate the execution of the pattern by the tofu controller. Run the below command on your terminal to check on the terraforms.infra.contrib.fluxcd.io resource:

    > kubectl get terraforms.infra.contrib.fluxcd.io -A

    NAMESPACE NAME READY STATUS AGE
    flux-system serverless-microservice-todo True No drift: main@sha1:549d0d82efea3b6a46807578cf0a8583f35a799c 156m

    Next, let's check on the Kubernetes pods in the flux-system namespace where the terraform controller runs:

    > kubectl get pods -n flux-system

    NAME READY STATUS RESTARTS AGE
    notification-controller-5487f8c847-95p4m 1/1 Running 0 168m
    source-controller-69bcb7cd85-st7ph 1/1 Running 0 168m
    tf-controller-7f8c8bbdfc-c5xw7 1/1 Running 0 167m

    Next, let's check on the logs of this tf-controller-7f8c8bbdfc-c5xw7 pod as shown below:

    > kubectl logs -n flux-system tf-controller-7f8c8bbdfc-c5xw7 |grep serverless-microservice-

    {"level":"info","ts":"2024-08-06T21:22:22.008Z","msg":">> Started Generation: 1","controller":"terraform","controllerGroup":"infra.contrib.fluxcd.io","controllerKind":"Terraform","Terraform":{"name":"serverless-microservice-todo","namespace":"flux-system"},"namespace":"flux-system","name":"serverless-microservice-todo","reconcileID":"d440d7f5-9f77-475b-8f84-c509e318bd07","reconciliation-loop-id":"39c5d3d1-0551-468b-ad5b-4cc20fec86ca","start-time":"2024-08-06T21:22:22.007Z"}
    {"level":"info","ts":"2024-08-06T21:22:22.050Z","msg":"getting source","controller":"terraform","controllerGroup":"infra.contrib.fluxcd.io","controllerKind":"Terraform","Terraform":{"name":"serverless-microservice-todo","namespace":"flux-system"},"namespace":"flux-system","name":"serverless-microservice-todo","reconcileID":"d440d7f5-9f77-475b-8f84-c509e318bd07","reconciliation-loop-id":"39c5d3d1-0551-468b-ad5b-4cc20fec86ca","start-time":"2024-08-06T21:22:22.007Z"}
    {"level":"info","ts":"2024-08-06T21:22:22.050Z","msg":"before lookup runner: checking ready condition","controller":"terraform","controllerGroup":"infra.contrib.fluxcd.io","controllerKind":"Terraform","Terraform":{"name":"serverless-microservice-todo","namespace":"flux-system"},"namespace":"flux-system","name":"serverless-microservice-todo","reconcileID":"d440d7f5-9f77-475b-8f84-c509e318bd07","reconciliation-loop-id":"39c5d3d1-0551-468b-ad5b-4cc20fec86ca","start-time":"2024-08-06T21:22:22.007Z","ready":"nil"}
    {"level":"info","ts":"2024-08-06T21:22:22.051Z","msg":"before lookup runner: updating status","controller":"terraform","controllerGroup":"infra.contrib.fluxcd.io","controllerKind":"Terraform","Terraform":{"name":"serverless-microservice-todo","namespace":"flux-system"},"namespace":"flux-system","name":"serverless-microservice-todo","reconcileID":"d440d7f5-9f77-475b-8f84-c509e318bd07","reconciliation-loop-id":"39c5d3d1-0551-468b-ad5b-4cc20fec86ca","start-time":"2024-08-06T21:22:22.007Z","ready":"nil"}
    {"level":"info","ts":"2024-08-06T21:22:22.067Z","msg":"before lookup runner: updated status","controller":"terraform","controllerGroup":"infra.contrib.fluxcd.io","controllerKind":"Terraform","Terraform":{"name":"serverless-microservice-todo","namespace":"flux-system"},"namespace":"flux-system","name":"serverless-microservice-todo","reconcileID":"d440d7f5-9f77-475b-8f84-c509e318bd07","reconciliation-loop-id":"39c5d3d1-0551-468b-ad5b-4cc20fec86ca","start-time":"2024-08-06T21:22:22.007Z","ready":"nil"}
    {"level":"info","ts":"2024-08-06T21:22:22.068Z","msg":"trigger namespace tls secret generation","controller":"terraform","controllerGroup":"infra.contrib.fluxcd.io","controllerKind":"Terraform","Terraform":{"name":"serverless-microservice-todo","namespace":"flux-system"},"namespace":"flux-system","name":"serverless-microservice-todo","reconcileID":"d440d7f5-9f77-475b-8f84-c509e318bd07","reconciliation-loop-id":"39c5d3d1-0551-468b-ad5b-4cc20fec86ca","start-time":"2024-08-06T21:22:22.007Z"}
    {"level":"info","ts":"2024-08-06T21:22:22.810Z","msg":"show runner pod state: ","controller":"terraform","controllerGroup":"infra.contrib.fluxcd.io","controllerKind":"Terraform","Terraform":{"name":"serverless-microservice-todo","namespace":"flux-system"},"namespace":"flux-system","name":"serverless-microservice-todo","reconcileID":"d440d7f5-9f77-475b-8f84-c509e318bd07","reconciliation-loop-id":"39c5d3d1-0551-468b-ad5b-4cc20fec86ca","start-time":"2024-08-06T21:22:22.007Z","name":"serverless-microservice-todo","state":"not-found"}
    {"level":"info","ts":"2024-08-06T21:22:37.895Z","msg":"runner is running","controller":"terraform","controllerGroup":"infra.contrib.fluxcd.io","controllerKind":"Terraform","Terraform":{"name":"serverless-microservice-todo","namespace":"flux-system"},"namespace":"flux-system","name":"serverless-microservice-todo","reconcileID":"d440d7f5-9f77-475b-8f84-c509e318bd07","reconciliation-loop-id":"39c5d3d1-0551-468b-ad5b-4cc20fec86ca","start-time":"2024-08-06T21:22:22.007Z"}
    {"level":"info","ts":"2024-08-06T21:22:37.895Z","msg":"setting up terraform","controller":"terraform","controllerGroup":"infra.contrib.fluxcd.io","controllerKind":"Terraform","Terraform":{"name":"serverless-microservice-todo","namespace":"flux-system"},"namespace":"flux-system","name":"serverless-microservice-todo","reconcileID":"d440d7f5-9f77-475b-8f84-c509e318bd07","reconciliation-loop-id":"39c5d3d1-0551-468b-ad5b-4cc20fec86ca","start-time":"2024-08-06T21:22:22.007Z"}
    {"level":"info","ts":"2024-08-06T21:22:37.949Z","msg":"write backend config: ok","controller":"terraform","controllerGroup":"infra.contrib.fluxcd.io","controllerKind":"Terraform","Terraform":{"name":"serverless-microservice-todo","namespace":"flux-system"},"namespace":"flux-system","name":"serverless-microservice-todo","reconcileID":"d440d7f5-9f77-475b-8f84-c509e318bd07","reconciliation-loop-id":"39c5d3d1-0551-468b-ad5b-4cc20fec86ca","start-time":"2024-08-06T21:22:22.007Z"}
    {"level":"info","ts":"2024-08-06T21:22:37.951Z","msg":"new terraform","controller":"terraform","controllerGroup":"infra.contrib.fluxcd.io","controllerKind":"Terraform","Terraform":{"name":"serverless-microservice-todo","namespace":"flux-system"},"namespace":"flux-system","name":"serverless-microservice-todo","reconcileID":"d440d7f5-9f77-475b-8f84-c509e318bd07","reconciliation-loop-id":"39c5d3d1-0551-468b-ad5b-4cc20fec86ca","start-time":"2024-08-06T21:22:22.007Z","workingDir":"/tmp/flux-system-serverless-microservice-todo/serverless-microservice"}
    {"level":"info","ts":"2024-08-06T21:22:37.961Z","msg":"generate vars from tf: ok","controller":"terraform","controllerGroup":"infra.contrib.fluxcd.io","controllerKind":"Terraform","Terraform":{"name":"serverless-microservice-todo","namespace":"flux-system"},"namespace":"flux-system","name":"serverless-microservice-todo","reconcileID":"d440d7f5-9f77-475b-8f84-c509e318bd07","reconciliation-loop-id":"39c5d3d1-0551-468b-ad5b-4cc20fec86ca","start-time":"2024-08-06T21:22:22.007Z"}
    {"level":"info","ts":"2024-08-06T21:22:37.961Z","msg":"generated var files from spec","controller":"terraform","controllerGroup":"infra.contrib.fluxcd.io","controllerKind":"Terraform","Terraform":{"name":"serverless-microservice-todo","namespace":"flux-system"},"namespace":"flux-system","name":"serverless-microservice-todo","reconcileID":"d440d7f5-9f77-475b-8f84-c509e318bd07","reconciliation-loop-id":"39c5d3d1-0551-468b-ad5b-4cc20fec86ca","start-time":"2024-08-06T21:22:22.007Z"}
    {"level":"info","ts":"2024-08-06T21:22:37.961Z","msg":"generate template: ok","controller":"terraform","controllerGroup":"infra.contrib.fluxcd.io","controllerKind":"Terraform","Terraform":{"name":"serverless-microservice-todo","namespace":"flux-system"},"namespace":"flux-system","name":"serverless-microservice-todo","reconcileID":"d440d7f5-9f77-475b-8f84-c509e318bd07","reconciliation-loop-id":"39c5d3d1-0551-468b-ad5b-4cc20fec86ca","start-time":"2024-08-06T21:22:22.007Z"}
    {"level":"info","ts":"2024-08-06T21:22:37.961Z","msg":"generated template","controller":"terraform","controllerGroup":"infra.contrib.fluxcd.io","controllerKind":"Terraform","Terraform":{"name":"serverless-microservice-todo","namespace":"flux-system"},"namespace":"flux-system","name":"serverless-microservice-todo","reconcileID":"d440d7f5-9f77-475b-8f84-c509e318bd07","reconciliation-loop-id":"39c5d3d1-0551-468b-ad5b-4cc20fec86ca","start-time":"2024-08-06T21:22:22.007Z"}
    {"level":"info","ts":"2024-08-06T21:22:44.907Z","msg":"init reply: ok","controller":"terraform","controllerGroup":"infra.contrib.fluxcd.io","controllerKind":"Terraform","Terraform":{"name":"serverless-microservice-todo","namespace":"flux-system"},"namespace":"flux-system","name":"serverless-microservice-todo","reconcileID":"d440d7f5-9f77-475b-8f84-c509e318bd07","reconciliation-loop-id":"39c5d3d1-0551-468b-ad5b-4cc20fec86ca","start-time":"2024-08-06T21:22:22.007Z"}
    {"level":"info","ts":"2024-08-06T21:22:44.907Z","msg":"tfexec initialized terraform","controller":"terraform","controllerGroup":"infra.contrib.fluxcd.io","controllerKind":"Terraform","Terraform":{"name":"serverless-microservice-todo","namespace":"flux-system"},"namespace":"flux-system","name":"serverless-microservice-todo","reconcileID":"d440d7f5-9f77-475b-8f84-c509e318bd07","reconciliation-loop-id":"39c5d3d1-0551-468b-ad5b-4cc20fec86ca","start-time":"2024-08-06T21:22:22.007Z"}
    {"level":"info","ts":"2024-08-06T21:22:44.911Z","msg":"workspace select reply: ok","controller":"terraform","controllerGroup":"infra.contrib.fluxcd.io","controllerKind":"Terraform","Terraform":{"name":"serverless-microservice-todo","namespace":"flux-system"},"namespace":"flux-system","name":"serverless-microservice-todo","reconcileID":"d440d7f5-9f77-475b-8f84-c509e318bd07","reconciliation-loop-id":"39c5d3d1-0551-468b-ad5b-4cc20fec86ca","start-time":"2024-08-06T21:22:22.007Z"}
    {"level":"info","ts":"2024-08-06T21:22:44.911Z","msg":"calling plan ...","controller":"terraform","controllerGroup":"infra.contrib.fluxcd.io","controllerKind":"Terraform","Terraform":{"name":"serverless-microservice-todo","namespace":"flux-system"},"namespace":"flux-system","name":"serverless-microservice-todo","reconcileID":"d440d7f5-9f77-475b-8f84-c509e318bd07","reconciliation-loop-id":"39c5d3d1-0551-468b-ad5b-4cc20fec86ca","start-time":"2024-08-06T21:22:22.007Z"}
    {"level":"info","ts":"2024-08-06T21:22:51.706Z","msg":"plan: ok, found drift: true","controller":"terraform","controllerGroup":"infra.contrib.fluxcd.io","controllerKind":"Terraform","Terraform":{"name":"serverless-microservice-todo","namespace":"flux-system"},"namespace":"flux-system","name":"serverless-microservice-todo","reconcileID":"d440d7f5-9f77-475b-8f84-c509e318bd07","reconciliation-loop-id":"39c5d3d1-0551-468b-ad5b-4cc20fec86ca","start-time":"2024-08-06T21:22:22.007Z"}
    {"level":"info","ts":"2024-08-06T21:22:51.725Z","msg":"save tfplan: ok","controller":"terraform","controllerGroup":"infra.contrib.fluxcd.io","controllerKind":"Terraform","Terraform":{"name":"serverless-microservice-todo","namespace":"flux-system"},"namespace":"flux-system","name":"serverless-microservice-todo","reconcileID":"d440d7f5-9f77-475b-8f84-c509e318

    Wait about three minutes for the tofu controller to complete the Terraform apply, then navigate to the API Gateway console and search for the API:

    (image: AWS Console)

    Then, copy the microservice endpoint from the Invoke URL property for testing:

    (image: AWS Console)

    Testing

    Please refer to the example requests to test the microservice.

    Clean up

    To clean up all the resources created, follow these steps:

    1. In your Argo CD console, navigate to the application created for your component and click Delete.
    2. In your Gitea console, navigate to the repository for your component and delete it manually under Settings.
    3. Finally, in your Backstage console, navigate to the component created and click Unregister Component.
    shows the exact error reporting that the CNOE CLI reports upon execution from within the terminal (see below):

    (image: verify)

    On successful verification, the verify step finishes with exit code 0 and allows the rest of the steps to proceed:

    (image: deploy)
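
    As a hedged illustration of this gating behavior, the sketch below orders a verification step before a deploy step in a software template, so the deploy only runs when verification exits with code 0. The action ids come from the CNOE scaffolder backend plugin documented later in this reference; the empty input blocks are placeholders, not the actions' confirmed schemas.

    # Sketch only: step ordering. If the verify step exits non-zero, the run
    # fails and the deploy step below never executes.
    spec:
      steps:
        - id: verify
          name: Verify dependencies
          action: cnoe:verify:dependency
          input: {}   # placeholder; supply your verifier inputs here
        - id: deploy
          name: Apply manifests
          action: cnoe:kubernetes:apply
          input: {}   # placeholder; runs only after the verify step succeeds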

    selector to choose workflows that carry the label.
  • argo-workflows/label-selector: Same as the above, except internal to the Argo machinery. This value takes precedence over the one above if both are defined (see the sketch below).
  • argo-workflows/cluster-name: Optional. Specifies the name of the Kubernetes cluster to retrieve information from. If missing, the Kubernetes context available to Backstage at runtime is used.
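
    For instance, an entity that should only surface the workflows carrying a given label could use the annotation described above. This is a sketch: the entity name and label value are illustrative, and a fuller Component example appears later on this page.

    apiVersion: backstage.io/v1alpha1
    kind: Component
    metadata:
      name: my-service                               # illustrative name
      annotations:
        backstage.io/kubernetes-namespace: argo
        # Only workflows carrying this label are shown for this entity.
        argo-workflows/label-selector: env=dev,my=label
    spec:
      type: service
      lifecycle: experimental
      owner: user1
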
    Authentication

    This plugin supports two methods of authentication.

    Through Argo API with Service Account Token

    This method uses a service account token to retrieve information from Argo API through the configured proxy endpoint.

    1. Create a service account and associated permissions. For this plugin to work, you need the list, get, and watch verbs. For example, create a file called sa.yaml with the following contents:
      apiVersion: v1
      kind: ServiceAccount
      metadata:
        name: backstage-argo-workflows-plugin
        namespace: argo
      ---
      # This is a long-lived token intended to be used by the backstage proxy.
      apiVersion: v1
      kind: Secret
      metadata:
        name: backstage-argo-workflows-plugin-token
        annotations:
          kubernetes.io/service-account.name: backstage-argo-workflows-plugin
        namespace: argo
      type: kubernetes.io/service-account-token
      ---
      apiVersion: rbac.authorization.k8s.io/v1
      kind: ClusterRole
      metadata:
        name: backstage-argo-workflows-plugin
      rules:
        - apiGroups: ["argoproj.io"]
          resources: ["workflows"]
          verbs: ["get", "watch", "list"]
      ---
      apiVersion: rbac.authorization.k8s.io/v1
      kind: ClusterRoleBinding
      metadata:
        name: backstage-argo-workflows-plugin
      roleRef:
        apiGroup: rbac.authorization.k8s.io
        kind: ClusterRole
        name: backstage-argo-workflows-plugin
      subjects:
        - kind: ServiceAccount
          name: backstage-argo-workflows-plugin
          namespace: argo
    2. Apply them to your cluster
      kubectl apply -f sa.yaml
    3. Configure the Backstage proxy. In this example, we instruct Backstage to get the token value from an environment variable called ARGO_WORKFLOWS_AUTH_TOKEN:
      proxy:
        "/argo-workflows/api":
          target: https://argo.a1.mccloman.people.aws.dev
          changeOrigin: true
          secure: true
          headers:
            Authorization:
              $env: ARGO_WORKFLOWS_AUTH_TOKEN
    4. Grab the token value and make it available as an environment variable for your Backstage backend:
      export ARGO_WORKFLOWS_AUTH_TOKEN="Bearer $(kubectl get secret -n argo backstage-argo-workflows-plugin-token -o=jsonpath='{.data.token}' | base64 --decode)"
      If this is running in Kubernetes, see this documentation and the sketch below.
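
      For instance, a hedged sketch of wiring the token into a Backstage backend Deployment is shown below; the Deployment, namespace, container, image, and variable names are illustrative, and the Secret must live in the same namespace as the Deployment (copy it from the argo namespace if needed). Also note the proxy header above expects a "Bearer " prefix, which still has to be added to the raw token.

      apiVersion: apps/v1
      kind: Deployment
      metadata:
        name: backstage            # illustrative
        namespace: backstage       # illustrative
      spec:
        replicas: 1
        selector:
          matchLabels:
            app: backstage
        template:
          metadata:
            labels:
              app: backstage
          spec:
            containers:
              - name: backstage
                image: ghcr.io/example/backstage:latest   # illustrative image
                env:
                  - name: ARGO_WORKFLOWS_RAW_TOKEN        # hypothetical variable name
                    valueFrom:
                      secretKeyRef:
                        # Must be in this Deployment's namespace; copy the Secret
                        # created in step 1 from "argo" if needed.
                        name: backstage-argo-workflows-plugin-token
                        key: token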

    See this documentation for more information on getting your token.

    Using configured Kubernetes API

    The plugin can use configured Kubernetes clusters to fetch resources instead of going through the Argo Workflows API. The entity must be annotated correctly for this to work.

    For example, for a Kubernetes cluster given in your app-config.yaml:

    kubernetes:
      serviceLocatorMethod:
        type: "multiTenant"
      clusterLocatorMethods:
        - type: "config"
          clusters:
            - url: https://abcd.gr7.us-west-2.eks.amazonaws.com:443
              name: my-cluster-1
              authProvider: "serviceAccount"
              serviceAccountToken: eyJh
              caData: LS0t

    For this configuration, the argo-workflows/cluster-name annotation value must be my-cluster-1:

    apiVersion: backstage.io/v1alpha1
    kind: Component
    metadata:
      name: backstage
      annotations:
        backstage.io/kubernetes-namespace: default
        backstage.io/kubernetes-label-selector: env=dev,my=label
        argo-workflows/cluster-name: my-cluster-1
    spec:
      type: service
      lifecycle: experimental
      owner: user1
      system: system1

    CNOE Scaffolder Backend Plugin

    Getting Started

    Add the plugin to your Backstage app:

    # From your Backstage root directory
    yarn add --cwd packages/backend @cnoe-io/scaffolder-actions-plugin
    # To be able to keep using the built-in actions.
    yarn add --cwd packages/backend @backstage/integration

    Append it to your existing actions in packages/backend/src/plugins/scaffolder.ts:

    import { CatalogClient } from '@backstage/catalog-client';
    import { createRouter, createBuiltinActions } from '@backstage/plugin-scaffolder-backend';
    import { ScmIntegrations } from '@backstage/integration';
    import { Router } from 'express';
    import type { PluginEnvironment } from '../types';
    import {
      createSanitizeResource,
      createVerifyDependency,
      createKubernetesApply,
    } from "@cnoe-io/scaffolder-actions";

    export default async function createPlugin(
      env: PluginEnvironment,
    ): Promise<Router> {
      const catalogClient = new CatalogClient({ discoveryApi: env.discovery });
      const integrations = ScmIntegrations.fromConfig(env.config);

      const builtInActions = createBuiltinActions({
        integrations,
        catalogClient,
        config: env.config,
        reader: env.reader,
      });

      const cnoeActions = [
        createSanitizeResource(),
        createVerifyDependency(),
        createKubernetesApply(env.config),
      ];

      const actions = [
        ...builtInActions,
        ...cnoeActions,
      ];

      return await createRouter({
        actions,
        catalogClient,
        logger: env.logger,
        config: env.config,
        database: env.database,
        reader: env.reader,
        identity: env.identity,
      });
    }

    Done! You can now use any of these actions in your software templates.

    apiVersion: scaffolder.backstage.io/v1beta3
    kind: Template
    metadata:
      name: hello-world-on-kubernetes
      title: Hello World on Kubernetes
    spec:
      steps:
        - id: sanitize-resource
          name: Sanitize Resource
          action: cnoe:utils:sanitize
          input:
            resource: ${{ serialize.output }}

    List of Actions

    Here is a list of the available actions.

    Action                    id                         Description
    createKubernetesApply     cnoe:kubernetes:apply      Apply Kubernetes manifests from a template
    createVerifyDependency    cnoe:verify:dependency     Verify resource dependencies for CNOE
    createSanitizeResource    cnoe:utils:sanitize        Sanitize resources (remove empty fields) before further processing

    For more detailed information about these actions, go to the /create/actions endpoint of your Backstage instance after installing them. If you are running locally, the endpoint should be http://localhost:3000/create/actions.
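
    To show how these actions chain together, here is a hedged sketch that sanitizes a serialized manifest and then applies it. The resource input matches the example above; the sanitized output property and the manifestObject input are illustrative guesses rather than the actions' confirmed schemas, so check /create/actions for the authoritative definitions.

    spec:
      steps:
        - id: sanitize-resource
          name: Sanitize Resource
          action: cnoe:utils:sanitize
          input:
            resource: ${{ serialize.output }}
        - id: apply-resource
          name: Apply Resource
          action: cnoe:kubernetes:apply
          input:
            # "sanitized" and "manifestObject" are hypothetical property names.
            manifestObject: ${{ steps['sanitize-resource'].output.sanitized }}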


    CNOE Scaffolder Frontend Plugin

    Kubernetes Cluster Picker

    Allows you to display and select Kubernetes clusters configured in your Backstage configuration.

    Optionally, you can extract the user's token to use against the selected cluster. Note that the target cluster and the Kubernetes configuration in Backstage must support this. This typically means the cluster has to accept an OIDC token and client-side authentication must be configured in Backstage.
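
    The cluster side of this is configured in Backstage's standard Kubernetes integration. The sketch below is an assumption about a typical setup rather than part of this plugin: a cluster entry that accepts OIDC tokens via an oidcTokenProvider, which must match a separately configured Backstage auth provider (the provider name here is illustrative).

    kubernetes:
      clusterLocatorMethods:
        - type: config
          clusters:
            - name: my-cluster-1
              url: https://abcd.gr7.us-west-2.eks.amazonaws.com:443
              authProvider: oidc
              oidcTokenProvider: keycloak-oidc   # name of the configured auth provider
              caData: LS0t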

    Configuration

    // in packages/app/src/App.tsx

    const routes = (
      ...
      <Route path="/create" element={<ScaffolderPage />}>
        <ScaffolderFieldExtensions>
          <KubernetesClusterPickerExtension />
        </ScaffolderFieldExtensions>
      </Route>
      ...
    )

    Usage

    The plugin adds KubernetesClusterPicker as an available UI field option.

    • requestUserCredentials: Optional. Requests the user's token for use with the target cluster.
    • allowedClusters: Optional. Specifies which clusters the user can use with this template.

    Example usage:

    # In your scaffolder template
    apiVersion: scaffolder.backstage.io/v1beta3
    kind: Template
    spec:
      parameters:
        - title: Enter some details
          properties:
            clusterName:
              title: Name of the cluster to deploy manifest into.
              type: string
              ui:field: KubernetesClusterPicker
              ui:options:
                allowedClusters:
                  - cluster-1
                  - cluster-2
                requestUserCredentials:
                  secretKey: MY_TOKEN
      steps:
        - id: fetch-base
          name: Fetch Base
          action: fetch:template
          input:
            url: ./templates
            values:
              token: ${{ secrets.MY_TOKEN }}

    In the above example, the user will be presented with a choice of two clusters, cluster-1 and cluster-2.

    It also specifies requestUserCredentials. This means the plugin will attempt to retrieve a user token for the cluster, then store it in the template secret field called MY_TOKEN.

    This token is then used in the next step by referencing the token value with ${{ secrets.MY_TOKEN }}.

    generic label selector used more widely by the Kubernetes plugin, which could pull other, less relevant data into your Backstage deployment as well. We recommend using apache-spark.cnoe.io/label-selector when using this plugin.

    Authentication

    This plugin uses the Kubernetes plugin for authentication.

    Using configured Kubernetes API

    The plugin uses configured Kubernetes clusters to fetch resources.

    For example, for a Kubernetes cluster given in your app-config.yaml:

    kubernetes:
      serviceLocatorMethod:
        type: "multiTenant"
      clusterLocatorMethods:
        - type: "config"
          clusters:
            - url: https://abcd.gr7.us-west-2.eks.amazonaws.com:443
              name: my-cluster-1
              authProvider: "serviceAccount"
              serviceAccountToken: eyJh
              caData: LS0t

    For this configuration, the apache-spark.cnoe.io/cluster-name annotation value must be my-cluster-1. If this is not specified, the first cluster in the list is selected.

    apiVersion: backstage.io/v1alpha1
    kind: Component
    metadata:
      name: backstage
      annotations:
        backstage.io/kubernetes-namespace: default
        apache-spark.cnoe.io/label-selector: env=dev,my=label
        apache-spark.cnoe.io/cluster-name: my-cluster-1
    spec:
      type: service
      lifecycle: experimental
      owner: user1
      system: system1
    shown below:

    apiVersion: backstage.io/v1alpha1
    kind: Component
    metadata:
      name: backstage
      annotations:
        terraform/s3-bucket: backstage-terraform-plugin
        terraform/s3-prefix: tfstates/
        terraform/local-filepath: /var/lib/tfstatefiles
    spec:
      type: service
      lifecycle: experimental
      owner: user1
      system: system1

    Update your Entity page. For example:

    // in packages/app/src/components/catalog/EntityPage.tsx
    import { TerraformPluginPage } from '@cnoe-io/plugin-terraform';
    ...
    const terraFormContent = (
      <TerraformPluginPage />
    );
    ...
    const websiteEntityPage = (
      <EntityLayout>
        ...
        <EntityLayout.Route path="/terraform" title="Terraform">
          {terraFormContent}
        </EntityLayout.Route>
      </EntityLayout>
      ...
    );

    Annotations

    As shown in the example above, the following annotations can go under annotations in the Backstage Component and will be recognized by this plugin.

    • One of the two annotations below is required:
      • terraform/s3-bucket: The S3 bucket where tfstate files are stored.
      • terraform/local-filepath: The local file system path where tfstate files are stored.
    • If storing tfstate files in S3, you can optionally define a prefix:
      • terraform/s3-prefix: Optional. An S3 prefix under which tfstate files are stored in the bucket.

    Note: The plugin only supports using one storage location at a time. If you define both an S3 storage location and a local file system path, the plugin will only use the S3 storage location.
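
    For example, a component that keeps its tfstate files on the local file system would carry only the local-filepath annotation. This is a sketch reusing the annotation names above; the component name and path are illustrative.

    apiVersion: backstage.io/v1alpha1
    kind: Component
    metadata:
      name: my-terraform-service      # illustrative
      annotations:
        # Only the local file system location is set, so the plugin reads
        # tfstate files from this path instead of S3.
        terraform/local-filepath: /var/lib/tfstatefiles
    spec:
      type: service
      lifecycle: experimental
      owner: user1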

    Configuration - Backend

    Create a new file at packages/backend/src/plugins/terraform.ts with the following contents.

    import { Router } from 'express';
    import { PluginEnvironment } from '../types';
    import { createRouter } from '@cnoe-io/plugin-terraform-backend';

    export default async function createPlugin(
      env: PluginEnvironment,
    ): Promise<Router> {
      return await createRouter({
        logger: env.logger,
        config: env.config,
      });
    }

    In packages/backend/src/index.ts, import the function created above and create an endpoint for the backend.

    import ...
    import terraform from './plugins/terraform';

    ...
    const appEnv = useHotMemoize(module, () => createEnv('app'));
    const terraformEnv = useHotMemoize(module, () => createEnv('terraform'));
    ...
    apiRouter.use('/search', await search(searchEnv));
    apiRouter.use('/terraform', await terraform(terraformEnv));
    ...

    Authentication

    AWS Credentials

    By default, the Terraform backend plugin relies on the default behavior of the AWS SDK for JavaScript to determine the AWS credentials it uses to authenticate with AWS APIs.

    The Terraform backend plugin that runs in your Backstage app searches for credentials in the following order:

    1. Environment variables (AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
    2. SSO credentials from the token cache
    3. Web identity token credentials (including running in an Amazon EKS cluster using IAM roles for service accounts)
    4. Shared credentials and config ini files (~/.aws/credentials, ~/.aws/config)
    5. Amazon Elastic Container Service (Amazon ECS) task metadata service
    6. Amazon Elastic Compute Cloud (Amazon EC2) instance metadata service

    We recommend that you don't hard-code long lived AWS credentials in your production Backstage application configuration. Hard-coding credentials is risky and might expose your access key ID and secret access key.

    Instead, we recommend that you use short lived AWS credentials for your production Backstage application by deploying it to Amazon ECS, Amazon Elastic Kubernetes Service (Amazon EKS), or Amazon EC2. For more information about deploying Backstage to Amazon EKS using a Helm chart or to Amazon ECS on AWS Fargate using the AWS Cloud Development Kit (CDK), see Deploying Backstage in the Backstage documentation.

    To use multiple AWS accounts with your Backstage app or to explicitly configure credentials for an AWS account, you can configure AWS accounts in your Backstage app's configuration. For example, to configure an AWS account to use with the Terraform backend plugin which requires using an IAM role to retrieve credentials, add the following to your Backstage app-config.yaml file.

    aws:
      accounts:
        - accountId: '111111111111'
          roleName: 'my-iam-role-name'

    For more account configuration examples, see the Backstage integration-aws-node package documentation.

    IAM permissions

    The Terraform backend plugin requires the AWS identity that it uses to have the following IAM permissions for getting tfstate files from S3 (a policy sketch follows the list):

    • s3:GetObject
    • s3:ListObjectsV2
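
    A hedged sketch of such a policy, written here as CloudFormation-style YAML, is shown below. The bucket name and prefix reuse the annotation example above; adjust them to your setup. Note that the ListObjectsV2 API call is authorized through the s3:ListBucket action on the bucket itself.

    PolicyDocument:
      Version: "2012-10-17"
      Statement:
        - Effect: Allow
          Action:
            - s3:GetObject
          Resource: arn:aws:s3:::backstage-terraform-plugin/tfstates/*
        - Effect: Allow
          Action:
            - s3:ListBucket
          Resource: arn:aws:s3:::backstage-terraform-plugin
          Condition:
            StringLike:
              "s3:prefix": "tfstates/*"
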
    community-driven patterns and best practices based on what is commonly deployed in production.

    (image: overview)

    For CNOE reference implementations, we will provide configurations, patterns, and practices with the following (growing) list of technologies.

    Capability                Technologies
    Code Repository           Git
    Config Repository         Git
    Artifact Registry         Container Registries
    Secret Repository         External Secrets (with Vault and KMS)
    Validations               CNOE Validators
    Secret Management         External Secrets
    Infra as Code             Terraform, Crossplane
    Continuous Delivery       Argo CD, Flux
    Continuous Integrations   Argo Workflows, Tekton
    Identity & Access         Keycloak
    Developer Portals         Backstage
    CHOOSE STACK

    BUILD STACK

    TEST STACK

    ADOPT STACK

    SCALE STACK

    Platform Architecture

    Deployment Targets (Cloud / On Prem / Edge)

    Application: Packaging/Templating, Code Repository, Config Repository, Artifact Registries, Secret Repository, Signing

    Operation: Developer Portal, Identity and Access, Infra as Code, Continuous Delivery, Workflow Orchestration, Service Discovery, Secret Management, Validation, Compute Platform, Observability

    Value Proposition

    CLOUD NATIVE

    CNOE is developed around open source cloud native projects that help companies build their internal developer tooling.

    COMMUNITY BEST PRACTICES

    CNOE relies on community consensus to select and configure the open source cloud native projects that make up its internal developer platform recommendations.

    MODULAR

    CNOE aims to let its users pick and choose the core technologies they want for their internal developer platform.

    MISSION

    CNOE aims to help platform engineers build their IDPs faster and more securely, with best practices built in.

    VISION

    CNOE strives to be the go-to framework that leading software companies use to build their cloud native internal developer platforms.

    Members

    In the News

    ► Add your talk!

    Markdown page example

    You don't need React to write simple standalone pages.
