docs: Initial user facing docs #276

Open · wants to merge 8 commits into base: main
12 changes: 11 additions & 1 deletion .github/workflows/ci.yaml
@@ -46,7 +46,7 @@ jobs:
- name: Download all Go modules
run: |
go mod download
- name: Check for tidyness of go.mod and go.sum
- name: Check for tidiness of go.mod and go.sum
run: |
go mod tidy
git diff --exit-code -- .
@@ -114,6 +114,16 @@ jobs:
- name: Compile all packages
run: make build

build-docs:
name: Build the documentation
runs-on: ubuntu-22.04
steps:
- name: Checkout code
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
- name: Build the documentation
run: |
make build-docs

test:
name: Run unit tests
if: ${{ needs.changes.outputs.code == 'true' }}
14 changes: 14 additions & 0 deletions Makefile
@@ -7,6 +7,10 @@ IMAGE_NAME_PRINCIPAL=argocd-agent-principal
IMAGE_PLATFORMS?=linux/amd64
IMAGE_TAG?=latest

# mkdocs related configuration
MKDOCS_DOCKER_IMAGE?=squidfunk/mkdocs-material:9
MKDOCS_RUN_ARGS?=

# Binary names
BIN_NAME_AGENT=argocd-agent-agent
BIN_NAME_PRINCIPAL=argocd-agent-principal
@@ -139,10 +143,20 @@ image-agent:
image-principal:
$(DOCKER_BIN) build -f Dockerfile.principal --platform $(IMAGE_PLATFORMS) -t $(IMAGE_REPOSITORY)/$(IMAGE_NAME_PRINCIPAL):$(IMAGE_TAG) .

.PHONY: push-images
push-images:
$(DOCKER_BIN) push $(IMAGE_REPOSITORY)/$(IMAGE_NAME_AGENT):$(IMAGE_TAG)
$(DOCKER_BIN) push $(IMAGE_REPOSITORY)/$(IMAGE_NAME_PRINCIPAL):$(IMAGE_TAG)

.PHONY: serve-docs
serve-docs:
${DOCKER_BIN} run ${MKDOCS_RUN_ARGS} --rm -it -p 8000:8000 -v ${current_dir}:/docs ${MKDOCS_DOCKER_IMAGE} serve -a 0.0.0.0:8000

.PHONY: build-docs
build-docs:
${DOCKER_BIN} run ${MKDOCS_RUN_ARGS} --rm -v ${current_dir}:/docs ${MKDOCS_DOCKER_IMAGE} build


.PHONY: help
help:
@echo "Not yet, sorry."
Binary file added docs/assets/01-architecture.png
Binary file added docs/assets/02-integration-autonomous.png
Binary file added docs/assets/02-integration-shared.png
25 changes: 25 additions & 0 deletions docs/assets/extra.css
@@ -0,0 +1,25 @@
.codehilite {
background-color: hsla(0,0%,92.5%,.5);
overflow: auto;
-webkit-overflow-scrolling: touch;
}

.codehilite pre {
background-color: transparent;
padding: .525rem .6rem;
}

@media only screen and (min-width: 76.25em) {
.md-main__inner {
max-width: none;
}
.md-sidebar--primary {
left: 0;
}
.md-sidebar--secondary {
right: 0;
margin-left: 0;
-webkit-transform: none;
transform: none;
}
}
Binary file added docs/assets/logo.png
13 changes: 13 additions & 0 deletions docs/concepts/agent-modes/autonomous.md
@@ -0,0 +1,13 @@
# Autonomous mode

## Overview of autonomous mode

In *autonomous mode*, the workload cluster is wholly responsible for maintaining its own configuration. In contrast to [managed mode](./managed.md), all configuration is first created on the workload cluster. The agent on the workload cluster observes the creation, modification and deletion of configuration and transmits these changes to the principal on the control plane cluster.

The principal will then create, update or delete this configuration on the control plane cluster. Users can use the Argo CD UI, CLI or API to inspect the status of the configuration, but they cannot modify or delete it.
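For illustration, the sketch below shows what such a locally created `Application` could look like on the workload cluster. The names, namespace and repository URL are examples only and are not prescribed by *argocd-agent*:

```yaml
# Illustrative example only: an Application created directly on the
# workload cluster in autonomous mode. Names, namespace and repo URL
# are hypothetical.
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  name: guestbook
  namespace: argocd
spec:
  project: default
  source:
    repoURL: https://github.com/example/guestbook.git
    targetRevision: main
    path: manifests
  destination:
    # Reconciliation happens on the local (workload) cluster.
    server: https://kubernetes.default.svc
    namespace: guestbook
```

The agent observes this resource and transmits it to the principal, which maintains a copy on the control plane cluster for inspection.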

## Architectural considerations

## Why choose this mode

## Why not choose this mode
7 changes: 7 additions & 0 deletions docs/concepts/agent-modes/index.md
@@ -0,0 +1,7 @@
# Agent modes

The main purpose of the [agent](../components-terminology.md#agent) and the [principal](../components-terminology.md#principal) components is to keep configuration in sync between the [workload clusters](../components-terminology.md#workload-cluster) and the [control plane cluster](../components-terminology.md#control-plane-cluster).

Each agent can operate in one of two distinct configuration modes: *managed* or *autonomous*. These modes define the general sync direction: From the workload cluster to the control plane cluster (*autonomous*), or from the control plane cluster to the workload cluster (*managed*).

Please refer to the sub-chapters [Managed mode](./managed.md) and [Autonomous mode](./autonomous.md) for detailed information, architectural considerations and constraints that will help you choose the mode most appropriate for your agents.
34 changes: 34 additions & 0 deletions docs/concepts/agent-modes/managed.md
@@ -0,0 +1,34 @@
# Managed mode

## Overview

In *managed mode*, the control plane cluster is responsible for maintaining the configuration of an agent. The agent receives all of its configuration (e.g. `Applications` and `AppProjects`) from the [principal](../components-terminology.md#principal).

For example, to create a new Argo CD `Application` on the workload cluster, it must be created on the control plane cluster. The principal will observe the creation of the new `Application` and emit a creation event that the agent on the workload cluster will pick up. The agent will then create the `Application` in its local cluster. From there, it will be picked up by the Argo CD *application controller* for reconciliation. The agent will observe any changes to the `Application`'s status field and transmit them to the principal, which merges them into the leading copy of the `Application`.
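For illustration, the sketch below shows an `Application` as it could be created on the control plane cluster in this mode. The namespace and the destination used to address the workload cluster are assumptions made for this example and depend on how principal and agent are set up:

```yaml
# Illustrative example only: an Application created on the control plane
# cluster in managed mode. The namespace and destination name are assumed
# values, not a convention mandated by argocd-agent.
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  name: guestbook
  # Assumption: one namespace per agent on the control plane cluster.
  namespace: agent-production
spec:
  project: default
  source:
    repoURL: https://github.com/example/guestbook.git
    targetRevision: main
    path: manifests
  destination:
    # Assumption: the destination addresses the workload cluster served
    # by the agent named "agent-production".
    name: agent-production
    namespace: guestbook
```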

Likewise, if an `Application` is to be deleted, it must be deleted on the control plane cluster. Once the principal observes the deletion, it will emit a deletion event that the agent on the workload cluster will pick up. The agent then deletes the `Application` from its local cluster and transmits the result back to the principal.

Similar procedures apply to modifications of an `Application` in this mode.

Changes to `Application` resources on the workload cluster that do not originate from the principal will be reverted.

## Architectural considerations

* The minimum requirement on any workload cluster in *managed* mode is to have an agent and the Argo CD *application-controller* installed.
* The *application-controller* can be configured to use the *repository-server* and *redis-server* on either the control plane cluster or the local workload cluster.
* The Argo CD *applicationset-controller* must be running on the control plane cluster if you intend to use `ApplicationSets`.
* If the Argo CD *application-controller* is configured to use the *redis-server* or the *repository-server* on the control plane cluster, the control plane cluster becomes a single point of failure (SPoF) for the workload cluster.

## Why choose this mode

* Provides the classical Argo CD experience
* Create and manage applications from the Argo CD UI, CLI or API
* Lowest footprint on the workload cluster
* Allows the use of ApplicationSet generators that span multiple clusters, such as the cluster or cluster-decision generators

## Why not choose this mode

* Very limited support for the app-of-apps pattern
* If the control plane cluster is compromised, workload clusters in managed mode are affected as well
* As noted [previously](#architectural-considerations), the control plane cluster might become a SPoF

57 changes: 57 additions & 0 deletions docs/concepts/architecture.md
@@ -0,0 +1,57 @@
# Architectural overview

This section of the documentation gives a broad overview of *argocd-agent*'s architecture, the terminology used, and how things fit together. In order to get started, and to understand the functionality and limitations of *argocd-agent*, it is important to become familiar with the architecture and the components that make up the project.

## Problem statement

In the classical Argo CD multi-cluster architecture, scaling out can become a tedious and difficult task. Usually, scaling out comes with a couple of generic challenges, as well as environment-specific ones.

**TODO**: Describe the problem in more detail here.

## Architectural diagram

The following diagram shows a very simplified overview of *argocd-agent*'s architecture. In this particular example, there are three [workload clusters](./components-terminology.md#workload-cluster) connected to a single [control plane cluster](./components-terminology.md#control-plane-cluster). The light blue boxes are existing Argo CD components and assets, while the light green boxes are components added by *argocd-agent*.

![Architectural overview](../assets/01-architecture.png)

In the context of the diagram, the term "Argo CD components" means one or more Argo CD workloads (such as the application controller, applicationset controller, repository server, etc.) depending on the concrete setup, and the term "configuration" means the configuration required to reconcile resources with Argo CD, e.g. `Applications`, `AppProjects`, etc. What this means exactly in each scenario is described in more detail [here TODO](TODO).

As can be seen in the diagram, there is no connection between the central control plane cluster and the workload clusters except for the components of *argocd-agent*, or more specifically, the connection from the workload cluster's *agent* to the control plane's *principal* component. Reconciliation happens locally on the [workload clusters](./components-terminology.md#workload-cluster), as (at least) an Argo CD *application controller* will be running on each of them.

## Scope and function


The agent-based architecture is a major shift from that classical architecture. It outsources some of the compute requirements to the workload clusters and synchronizes configuration between the control plane cluster and the workload clusters using a component on the control plane cluster, the [principal](./components-terminology.md#principal), and an [agent](./components-terminology.md#agent) on the workload cluster.

The connection between an [agent](./components-terminology.md#agent) and the [principal](./components-terminology.md#principal) is much cheaper to establish and maintain than the connection from an *application controller* to the Kubernetes API endpoint of a remote cluster, as it does not need to watch or cache every resource on the cluster. Instead, it focuses on Argo CD configuration such as `Applications`, `AppProjects`, repository configuration and the like. It is also much more resilient to bad or slow networks, connection drops, and high-latency transmissions. And last but not least, the [control plane cluster](./components-terminology.md#control-plane-cluster) no longer needs to maintain credentials for any of the workload clusters. Instead, the workload clusters authenticate to the central control plane.

*argocd-agent* is not designed to, nor does it intend to, replace any existing functionality in Argo CD. Its scope is to change the way Applications are deployed in a multi-cluster scenario, especially when more than a couple of clusters are involved. The project intends to require as few changes to Argo CD as possible, using any out-of-the-box Argo CD installation as the ultimate target.

Under the hood, *argocd-agent* uses a message-based protocol to establish a *bi-directional* exchange of messages. Bi-directional in this context means that both the [principal](./components-terminology.md#principal) and the [agents](./components-terminology.md#agent) can send and receive messages simultaneously using the same connection, which is established exclusively by the agents. As of today, the underlying transport is gRPC-based, but there are [plans](https://github.com/argoproj-labs/argocd-agent/issues/260) to make this extensible. The vision is that one could use a message bus implementation such as Kafka for the transport of messages.

## Design principles

The following describes the guiding design principles upon which *argocd-agent* is built. All enhancements and contributions should follow those principles.

**A permanent network connection is neither expected nor required**

It is understood that workload clusters can be everywhere: in your dark-fibre connected data centres; across different cloud providers, regions, and availability zones; in moving things such as cars, ships, or trains; or wherever else they are needed. Not all of these locations will have a permanent, reliable and low-latency network connection.

Thus, *argocd-agent* is designed around the assumption that the connection between [workload clusters](./components-terminology.md#workload-cluster) and the [control plane cluster](./components-terminology.md#control-plane-cluster) is not always available and that it might not be possible to keep up a stable, well-performing network connection between the components. The system will benefit from a stable, low-latency network connection, but it does not require one to function properly.

**Workload clusters are and will stay autonomous**

When the [agent](./components-terminology.md#agent) cannot communicate with the [principal](./components-terminology.md#principal) for whatever reason, the [workload cluster](./components-terminology.md#workload-cluster) will still be able to perform its operations (i.e. reconciliation) in an autonomous way, if set up correctly. Also, depending on the agent's mode of operation, cluster admins may still be able to perform configuration tasks (i.e. create, update and delete applications), but those changes will only take effect once the agent is connected again.

There are architectural variants in which a workload cluster will be dependent upon the availability of the control plane, for example when the workload cluster uses a repository server or Redis cache on the control plane. However, there will always be a variant where fully autonomous workload clusters are supported.

**The initiating component is always the agent, not the control plane**

Connections are established in one direction only: from the agent to the control plane. Neither the control plane nor the agents need to know exact details about the topology of the system, as long as the agents know which control plane to connect to. In some parts of this documentation, we mention something called a _bi-directional stream_. This refers to a gRPC mechanism in which both parties may transmit and receive data to and from their peer at any time, all while the connection is established in only one direction.

**Be lightweight by default but keep extensibility in mind**

*argocd-agent* should not impose any mandatory, heavy runtime dependencies or operational patterns. The hurdle of getting started should be as low as possible. By default, the project should stay free of requirements such as persistent storage or relational databases.

We are aware that at some point in time we may hit a scaling limit, especially when it comes to etcd and the Kubernetes API. Thus, major parts such as Application backends on the principal are designed to be pluggable, so users can contribute and use different kinds of backends according to their scalability requirements and operational preferences.

54 changes: 54 additions & 0 deletions docs/concepts/argocd-integration.md
@@ -0,0 +1,54 @@
# Integration with Argo CD

## Overview

*argocd-agent* does not replace Argo CD, but instead integrates with it. There are several ways of integrating Argo CD and *argocd-agent*, each with its own pros and cons. This chapter gives an overview of the integration patterns that most people will likely want to use.

In the diagrams below,

* light green boxes are parts of *argocd-agent*,
* light blue boxes are parts of Argo CD, and
* light red boxes are external systems and components.

A dotted outline indicates that a component's location depends on the [operational mode](./agent-modes/index.md) of the agent.

!!! warning "Choosing which integration pattern to use"
While it is possible to run agents with different operational modes connecting to the same control plane cluster, it is not (yet?) possible to have your workload clusters using different integration modes. Choosing which integration pattern to use is an architectural decision affecting the whole environment, spanning from the control plane cluster to each and every workload cluster. It will not be possible to switch between the two integration patterns without service interruption.

## Integration patterns

### Pattern 1: Lowest footprint workload clusters

This integration pattern requires some of the core components of Argo CD, specifically the *repository-server* and the *redis-server*, to be shared on the control plane, while the only component on the workload clusters will be the *application-controller*.

![Integration pattern 1: Low footprint spokes](../assets/02-integration-shared.png)

As can be seen, the only component installed and running on the *workload cluster* is the *application-controller* (and the *applicationset-controller*, in case the agent runs in autonomous mode), and the *application-controller* is talking to a *repository-server* and a *redis-server* on the control plane cluster.
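One possible way to wire this up is sketched below. It assumes that the workload cluster's Argo CD installation uses the standard `argocd-cmd-params-cm` ConfigMap to configure the *application-controller*; the hostnames and ports are placeholders only:

```yaml
# Illustrative sketch only: pointing the workload cluster's
# application-controller at shared services on the control plane cluster.
# Hostnames and ports are placeholders.
apiVersion: v1
kind: ConfigMap
metadata:
  name: argocd-cmd-params-cm
  namespace: argocd
  labels:
    app.kubernetes.io/part-of: argocd
data:
  # repository-server running on the control plane cluster
  repo.server: "repo.controlplane.example.com:8081"
  # redis-server running on the control plane cluster
  redis.server: "redis.controlplane.example.com:6379"
```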

**Advantages of this pattern**

* Lower compute requirements on the workload clusters, as some of the heavy lifting is done on the control plane
* Since the *repository-server* runs on the control plane cluster, the workload clusters don't need access to Git. They will only need to talk to the control plane cluster.
* Since more of the important state (such as, rendered manifests) is stored on the control plane cluster's *redis-server*, it is cheaper for the Argo CD API on the control plane cluster to actually access the state. However, it should be noted that most of the traffic to *redis-server* stems from the *application-controller* as well as the *repository-server*.

**Disadvantages of this pattern**

* The control plane cluster and its components become a single point of failure (SPoF) for the whole setup. If the workload cluster cannot reach the control plane cluster, or the components become unavailable, the *application-controller* on the workload clusters cannot render manifests anymore, or store important information in the cache. Reconciliation will stop working on the workload clusters.
* The network traffic flowing between the workload cluster and the control plane cluster increases, potentially significantly. This might become a bottleneck, or result in higher bills depending on how your traffic is charged.
* You will have to take steps to scale the *repository-server* and *redis-server* workloads on the central control plane, depending on how many clusters you have, how many applications are deployed to them, and how often they reconcile.
* You will have to manage additional ingress points on the central control plane, along with credentials for each.

### Pattern 2: Fully autonomous workload clusters

This integration pattern moves the Argo CD *repository-server* and *redis-server* components, in addition to the *application-controller*, to the workload clusters. This makes each workload cluster effectively an autonomous Argo CD installation, minus the configuration and observability aspects, which are provided on the central control plane.

![Integration pattern 2: Autonomous spokes](../assets/02-integration-autonomous.png)

**Advantages of this pattern**

* Workload clusters become truly autonomous in their operations, while only configuration and observability will be affected when the control plane cluster becomes unavailable or has problems. With agents also operating in [autonomous mode](./agent-modes/autonomous.md), only observability will be affected by an outage of the control plane cluster.
* (Much) less traffic has to flow between the workload clusters and the control plane cluster
* Scaling of all Argo CD workloads per-cluster becomes possible
* Single point of ingress in the control plane cluster (the principal)

**Disadvantages of this pattern**