diff --git a/.github/workflows/kind.yml b/.github/workflows/kind.yml
index aeff9f347bf..a517699525a 100644
--- a/.github/workflows/kind.yml
+++ b/.github/workflows/kind.yml
@@ -277,6 +277,69 @@ jobs:
           path: log.tar.gz
           retention-days: 30
 
+  test-e2e-ipam-feature-enabled:
+    name: E2e tests on a Kind cluster on Linux with FlexibleIPAM feature enabled
+    needs: [build-antrea-coverage-image]
+    runs-on: [ubuntu-latest-4-cores]
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          show-progress: false
+      - uses: actions/setup-go@v5
+        with:
+          go-version-file: 'go.mod'
+      - name: Download Antrea image from previous job
+        uses: actions/download-artifact@v4
+        with:
+          name: antrea-ubuntu-cov
+      - name: Load Antrea image
+        run: |
+          docker load -i antrea-ubuntu.tar
+      - name: Install Kind
+        run: |
+          KIND_VERSION=$(head -n1 ./ci/kind/version)
+          curl -Lo ./kind https://github.com/kubernetes-sigs/kind/releases/download/${KIND_VERSION}/kind-$(uname)-amd64
+          chmod +x ./kind
+          sudo mv kind /usr/local/bin
+      - name: Run FlexibleIPAM e2e tests
+        # We enable multicast because some FlexibleIPAM e2e tests require it.
+        run: |
+          mkdir log
+          mkdir test-ipam-e2e-coverage
+          ANTREA_LOG_DIR=$PWD/log ANTREA_COV_DIR=$PWD/test-ipam-e2e-coverage ./ci/kind/test-e2e-kind.sh \
+            --encap-mode noEncap \
+            --coverage \
+            --multicast \
+            --flexible-ipam
+      - name: Tar coverage files
+        run: tar -czf test-ipam-e2e-coverage.tar.gz test-ipam-e2e-coverage
+      - name: Upload coverage for test-ipam-e2e-coverage
+        uses: actions/upload-artifact@v4
+        with:
+          name: test-ipam-e2e-coverage
+          path: test-ipam-e2e-coverage.tar.gz
+          retention-days: 30
+      - name: Codecov
+        uses: codecov/codecov-action@v5
+        with:
+          token: ${{ secrets.CODECOV_TOKEN }}
+          files: '*.cov.out*'
+          disable_search: true
+          flags: kind-e2e-tests
+          name: test-ipam-e2e-coverage
+          directory: test-ipam-e2e-coverage
+          fail_ci_if_error: ${{ github.event_name == 'push' }}
+      - name: Tar log files
+        if: ${{ failure() }}
+        run: tar -czf log.tar.gz log
+      - name: Upload test log
+        uses: actions/upload-artifact@v4
+        if: ${{ failure() }}
+        with:
+          name: e2e-kind-ipam-features-enabled.tar.gz
+          path: log.tar.gz
+          retention-days: 30
+
   test-e2e-noencap:
     name: E2e tests on a Kind cluster on Linux (noEncap)
     needs: [build-antrea-coverage-image]
diff --git a/ci/kind/kind-setup.sh b/ci/kind/kind-setup.sh
index e57e79616e4..9cba5886756 100755
--- a/ci/kind/kind-setup.sh
+++ b/ci/kind/kind-setup.sh
@@ -289,6 +289,22 @@ function configure_vlan_subnets {
     docker_run_with_host_net iptables -t filter -A FORWARD -i $bridge_interface -o $vlan_interface -j ACCEPT
     docker_run_with_host_net iptables -t filter -A FORWARD -o $bridge_interface -i $vlan_interface -j ACCEPT
   done
+
+  if [[ $FLEXIBLE_IPAM == true ]]; then
+    docker_run_with_host_net ipset create excluded_subnets hash:net
+    docker_run_with_host_net ipset add excluded_subnets 192.168.241.0/24
+    docker_run_with_host_net ipset add excluded_subnets 192.168.242.0/24
+    docker_run_with_host_net ipset add excluded_subnets 192.168.240.0/24
+    docker_run_with_host_net ipset list excluded_subnets
+
+    # Bypass the default Docker SNAT rule for FlexibleIPAM traffic coming from the untagged subnet
+    # (192.168.240.0/24, the subnet of the Docker bridge network) and destined to the VLAN subnets (192.168.241.0/24, 192.168.242.0/24).
+    docker_run_with_host_net iptables -t nat -I POSTROUTING 1 ! -o $bridge_interface -s 192.168.240.0/24 -m set --match-set excluded_subnets dst -j RETURN
+
+    # With FlexibleIPAM, Antrea SNAT is disabled (noSNAT: true), so Pods don't have access to the external network by default (including regular / NodeIPAM Pods).
+    # Our e2e tests require external network access for regular Pods, so we need to add a custom SNAT rule.
+    docker_run_with_host_net iptables -t nat -A POSTROUTING ! -o $bridge_interface -s 10.244.0.0/16 -m set ! --match-set excluded_subnets dst -j MASQUERADE
+  fi
 }
 
 function delete_vlan_subnets {
@@ -307,17 +323,29 @@ function delete_vlan_subnets {
       docker_run_with_host_net ip link del $interface_name
     fi
   done
+
+  if [[ $FLEXIBLE_IPAM == true ]]; then
+    docker_run_with_host_net iptables -t nat -D POSTROUTING ! -o $bridge_interface -s 192.168.240.0/24 -m set --match-set excluded_subnets dst -j RETURN || true
+    docker_run_with_host_net iptables -t nat -D POSTROUTING ! -o $bridge_interface -s 10.244.0.0/16 -m set ! --match-set excluded_subnets dst -j MASQUERADE || true
+    docker_run_with_host_net ipset destroy excluded_subnets || true
+  fi
 }
 
-function delete_networks {
-  networks=$(docker network ls -f name=antrea --format '{{.Name}}')
-  networks="$(echo $networks)"
-  if [[ ! -z $networks ]]; then
+function delete_network_by_filter {
+  local networks=$(docker network ls -f name="$1" --format '{{.Name}}')
+  if [[ -n $networks ]]; then
     docker network rm $networks > /dev/null 2>&1
-    echo "deleted networks $networks"
+    echo "Deleted networks: $networks"
   fi
 }
 
+function delete_networks {
+  if [[ $FLEXIBLE_IPAM == true ]]; then
+    delete_network_by_filter "kind"
+  fi
+  delete_network_by_filter "antrea"
+}
+
 function load_images {
   echo "load images"
   set +e
@@ -700,7 +728,6 @@ if [[ $ACTION == "destroy" ]]; then
   exit
 fi
 
-kind_version=$(kind version | awk '{print $2}')
 kind_version=${kind_version:1} # strip leading 'v'
 function version_lt() { test "$(printf '%s\n' "$@" | sort -rV | head -n 1)" != "$1"; }
 
@@ -717,5 +744,10 @@ if [[ $ACTION == "create" ]]; then
     echoerr "Only one of '--subnets' and '--extra-networks' can be specified"
     exit 1
   fi
+
+  # Reserve IPs after 192.168.240.63 for e2e tests: Docker only assigns Node IPs from the 192.168.240.0/26 range.
+  if [[ $FLEXIBLE_IPAM == true ]]; then
+    docker network create -d bridge --subnet 192.168.240.0/24 --gateway 192.168.240.1 --ip-range 192.168.240.0/26 kind
+  fi
   create
 fi
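Note: the ordering of the two NAT rules above matters. The RETURN rule is inserted at position 1 so that it is evaluated before Docker's own MASQUERADE rules, while the custom MASQUERADE rule for the Pod CIDR is simply appended. A quick way to sanity-check the result on the Kind host is a sketch like the following, using the stock iptables/ipset CLIs directly rather than the script's docker_run_with_host_net helper:

    # The RETURN rule should appear first in POSTROUTING, ahead of Docker's MASQUERADE rules.
    sudo iptables -t nat -S POSTROUTING | head -n 5

    # The set should contain the untagged subnet (192.168.240.0/24) and both VLAN subnets.
    sudo ipset list excluded_subnets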
diff --git a/ci/kind/test-e2e-kind.sh b/ci/kind/test-e2e-kind.sh
index 1b9030c76df..264dbc97e03 100755
--- a/ci/kind/test-e2e-kind.sh
+++ b/ci/kind/test-e2e-kind.sh
@@ -89,6 +89,7 @@ setup_only=false
 cleanup_only=false
 test_only=false
 run=""
+flexible_ipam=false
 antrea_controller_image="antrea/antrea-controller-ubuntu"
 antrea_agent_image="antrea/antrea-agent-ubuntu"
 use_non_default_images=false
@@ -110,6 +111,10 @@ case $key in
     proxy_all=true
     shift
     ;;
+    --flexible-ipam)
+    flexible_ipam=true
+    shift
+    ;;
     --no-kube-proxy)
     no_kube_proxy=true
     shift
@@ -248,6 +253,9 @@ fi
 if $flow_visibility; then
     manifest_args="$manifest_args --feature-gates FlowExporter=true,L7FlowExporter=true --extra-helm-values-file $FLOW_VISIBILITY_HELM_VALUES"
 fi
+if $flexible_ipam; then
+    manifest_args="$manifest_args --flexible-ipam"
+fi
 
 COMMON_IMAGES_LIST=("registry.k8s.io/e2e-test-images/agnhost:2.40" \
                     "antrea/nginx:1.21.6-alpine" \
@@ -302,6 +310,10 @@ if $extra_vlan; then
     fi
 fi
 
+if $flexible_ipam; then
+    vlan_args="$vlan_args --vlan-subnets 11=192.168.241.1/24 --vlan-subnets 12=192.168.242.1/24"
+fi
+
 function setup_cluster {
   args=$1
 
@@ -328,7 +340,9 @@ function setup_cluster {
   if $bgp_policy; then
     args="$args --deploy-external-frr"
   fi
-
+  if $flexible_ipam; then
+    args="$args --flexible-ipam"
+  fi
   echo "creating test bed with args $args"
   eval "timeout 600 $TESTBED_CMD create kind $args"
 }
@@ -401,6 +415,11 @@ function run_test {
     EXTRA_ARGS="$EXTRA_ARGS --external-frr-cid $external_frr_cid --external-frr-ips $external_frr_ips"
   fi
 
+  if $flexible_ipam; then
+    EXTRA_ARGS="$EXTRA_ARGS --antrea-ipam"
+    timeout="100m"
+  fi
+
   go test -v -timeout=$timeout $RUN_OPT antrea.io/antrea/test/e2e $flow_visibility_args -provider=kind --logs-export-dir=$ANTREA_LOG_DIR $np_evaluation_flag --skip-cases=$skiplist $coverage_args $EXTRA_ARGS
 
   if $coverage; then
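Note: the new flag wires through the whole stack: --flexible-ipam adds --flexible-ipam to manifest generation and to testbed creation, forces the two VLAN subnets (11 and 12), and passes --antrea-ipam to the Go test run with the timeout raised to 100m. The CI job above therefore reduces to an invocation like this (mirroring the workflow step; a local run would look the same):

    mkdir -p log test-ipam-e2e-coverage
    ANTREA_LOG_DIR=$PWD/log ANTREA_COV_DIR=$PWD/test-ipam-e2e-coverage ./ci/kind/test-e2e-kind.sh \
      --encap-mode noEncap \
      --coverage \
      --multicast \
      --flexible-ipam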
diff --git a/test/e2e/antreapolicy_test.go b/test/e2e/antreapolicy_test.go
index 2f40cb43652..d34aec82745 100644
--- a/test/e2e/antreapolicy_test.go
+++ b/test/e2e/antreapolicy_test.go
@@ -5288,3 +5288,284 @@ func testAntreaClusterNetworkPolicyStats(t *testing.T, data *TestData) {
     }
     k8sUtils.Cleanup(namespaces)
 }
+
+// TestFQDNCacheMinTTL ensures stable FQDN access for applications that cache DNS resolutions,
+// even when FQDN-to-IP mappings change frequently and FQDN-based NetworkPolicies are in use.
+// It validates the functionality of the new minTTL configuration, which is meant for scenarios
+// where applications may cache DNS responses beyond the TTL defined in the original DNS response.
+// The minTTL value enforces that resolved IPs remain in datapath rules for as long as
+// applications might cache them, thereby preventing intermittent network connectivity issues
+// to the FQDN concerned. The actual test logic runs in testWithFQDNCacheMinTTL, which is called
+// by TestFQDNCacheMinTTL with two fqdnCacheMinTTL values, where 0 represents the default value
+// used when fqdnCacheMinTTL is unset.
+func TestFQDNCacheMinTTL(t *testing.T) {
+    skipIfAntreaPolicyDisabled(t)
+    skipIfHasWindowsNodes(t)
+    skipIfNotIPv4Cluster(t)
+    skipIfIPv6Cluster(t)
+    skipIfNotRequired(t, "mode-irrelevant")
+
+    t.Run("minTTLUnset", func(t *testing.T) { testWithFQDNCacheMinTTL(t, 0) })
+    t.Run("minTTL20s", func(t *testing.T) { testWithFQDNCacheMinTTL(t, 20) })
+}
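+
+// In antrea-agent.conf terms, the two subtests correspond to leaving fqdnCacheMinTTL unset
+// (the default, 0) versus setting it explicitly. The relevant config section would look
+// roughly like this (a sketch; key names assumed from the AgentConfig fields mutated in
+// configureFQDNPolicyEnforcement below):
+//
+//    dnsServerOverride: "<custom DNS Service ClusterIP>"
+//    fqdnCacheMinTTL: 20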
+
+func testWithFQDNCacheMinTTL(t *testing.T, fqdnCacheMinTTL int) {
+    const (
+        testFQDN = "fqdn-test-pod.lfx.test"
+        dnsPort  = 53
+        dnsTTL   = 5
+    )
+
+    data, err := setupTest(t)
+    if err != nil {
+        t.Fatalf("Error when setting up test: %v", err)
+    }
+    defer teardownTest(t, data)
+
+    // Create two agnhost Pods and get their IPv4 addresses. The IPs of these Pods will be mapped to the FQDN.
+    podCount := 2
+    agnhostPodIPs := make([]*PodIPs, podCount)
+    for i := 0; i < podCount; i++ {
+        agnhostPodIPs[i] = createHttpAgnhostPod(t, data)
+    }
+
+    // Get the IPv4 addresses of the agnhost Pods created.
+    agnhostPodOneIP, _ := agnhostPodIPs[0].AsStrings()
+    agnhostPodTwoIP, _ := agnhostPodIPs[1].AsStrings()
+
+    // Create the custom DNS Service and get its ClusterIP.
+    customDNSService, err := data.CreateServiceWithAnnotations("custom-dns-service", data.testNamespace, dnsPort,
+        dnsPort, v1.ProtocolUDP, map[string]string{"app": "custom-dns"}, false,
+        false, v1.ServiceTypeClusterIP, ptr.To[v1.IPFamily](v1.IPv4Protocol), map[string]string{})
+    require.NoError(t, err, "Error creating custom DNS Service")
+    dnsServiceIP := customDNSService.Spec.ClusterIP
+
+    // Create a ConfigMap for the custom DNS server, mapping the IP of agnhost Pod 1 to the FQDN.
+    configMap := &v1.ConfigMap{
+        ObjectMeta: metav1.ObjectMeta{
+            Name:      "custom-dns-config",
+            Namespace: data.testNamespace,
+        },
+        Data: createDNSConfig(t, map[string]string{agnhostPodOneIP: testFQDN}, dnsTTL),
+    }
+    customDNSConfigMap, err := data.CreateConfigMap(configMap)
+    require.NoError(t, err, "failed to create custom DNS ConfigMap")
+
+    createCustomDNSPod(t, data, configMap.Name)
+
+    // Set the custom DNS server IP address in the Antrea ConfigMap.
+    configureFQDNPolicyEnforcement(t, data, dnsServiceIP, fqdnCacheMinTTL)
+    defer configureFQDNPolicyEnforcement(t, data, "", 0) // reset after the test.
+
+    createPolicyForFQDNCacheMinTTL(t, data, testFQDN, "test-anp-fqdn", "custom-dns", "fqdn-cache-test")
+    require.NoError(t, NewPodBuilder(toolboxPodName, data.testNamespace, ToolboxImage).
+        WithLabels(map[string]string{"app": "fqdn-cache-test"}).
+        WithContainerName(toolboxContainerName).
+        WithCustomDNSConfig(&v1.PodDNSConfig{Nameservers: []string{dnsServiceIP}}).
+        Create(data))
+    require.NoError(t, data.podWaitForRunning(defaultTimeout, toolboxPodName, data.testNamespace))
+
+    // Get a timestamp before the Pod resolves the FQDN for the first time.
+    startCacheTime := time.Now()
+
+    curlFQDN := func(target string) (string, error) {
+        cmd := []string{"curl", target}
+        stdout, stderr, err := data.RunCommandFromPod(data.testNamespace, toolboxPodName, toolboxContainerName, cmd)
+        if err != nil {
+            return "", fmt.Errorf("error when running command '%s' on Pod '%s': %v, stdout: <%v>, stderr: <%v>",
+                strings.Join(cmd, " "), toolboxPodName, err, stdout, stderr)
+        }
+        return stdout, nil
+    }
+
+    assert.EventuallyWithT(t, func(t *assert.CollectT) {
+        _, err := curlFQDN(testFQDN)
+        assert.NoError(t, err)
+    }, 2*time.Second, 100*time.Millisecond, "failed to curl test FQDN: %s", testFQDN)
+
+    // Confirm that the FQDN resolves to the expected IP address and store it to simulate caching of this IP by the client Pod.
+    t.Logf("Resolving FQDN to simulate caching the current IP inside the toolbox Pod")
+    resolvedIP, err := data.runDNSQuery(toolboxPodName, toolboxContainerName, data.testNamespace, testFQDN, false, dnsServiceIP)
+    require.NoError(t, err, "failed to resolve FQDN to an IP from the toolbox Pod")
+    fqdnIP := resolvedIP.String()
+    require.Equalf(t, agnhostPodOneIP, fqdnIP, "Resolved IP does not match expected value")
+    t.Logf("Successfully received the expected IP %s for the test FQDN", fqdnIP)
+
+    // Update the IP address mapped to the FQDN in the custom DNS ConfigMap.
+    t.Logf("Updating host mapping in DNS server config to use new IP: %s", agnhostPodTwoIP)
+    customDNSConfigMap.Data = createDNSConfig(t, map[string]string{agnhostPodTwoIP: testFQDN}, dnsTTL)
+    require.NoError(t, data.UpdateConfigMap(customDNSConfigMap), "failed to update ConfigMap with new IP")
+    t.Logf("Successfully updated DNS ConfigMap with new IP: %s", agnhostPodTwoIP)
+
+    // Try to trigger an immediate refresh of the mounted ConfigMap by setting an annotation on the custom DNS
+    // server Pod. This is an attempt to bypass the kubelet sync period, which may be as long as the default
+    // sync interval (1 minute) plus the ConfigMap cache TTL.
+    // Ref: https://kubernetes.io/docs/tasks/configure-pod-container/configure-pod-configmap/#mounted-configmaps-are-updated-automatically
+    require.NoError(t, data.setPodAnnotation(data.testNamespace, "custom-dns-server", "test.antrea.io/random-value",
+        randSeq(8)), "failed to update custom DNS Pod annotation")
+
+    // Finally, verify that curling the previously cached IP does not fail after the DNS update, as long as
+    // fqdnCacheMinTTL is set. The wait time here should be slightly longer than the reload value specified
+    // in the custom DNS configuration.
+    t.Logf("Trying to curl the existing cached IP of the domain: %s", fqdnIP)
+
+    if fqdnCacheMinTTL == 0 {
+        // fqdnCacheMinTTL is unset, hence we expect the connection to eventually fail.
+        assert.EventuallyWithT(t, func(t *assert.CollectT) {
+            _, err := curlFQDN(fqdnIP)
+            assert.Error(t, err)
+        }, 10*time.Second, 1*time.Second)
+    } else {
+        // Calculate waitFor to determine the duration of the 'Never' assertion. It accounts for the time
+        // elapsed between the initial DNS request from the Pod (startCacheTime) and now, relative to
+        // fqdnCacheMinTTL, reduced by 1 second as a safety margin.
+        // For example, with fqdnCacheMinTTL=20 and ~5s elapsed since startCacheTime, waitFor = 20s - 5s - 1s = 14s.
+        safetyMargin := 1 * time.Second
+        waitFor := (time.Duration(fqdnCacheMinTTL)*time.Second - time.Since(startCacheTime)) - safetyMargin
+        require.GreaterOrEqual(t, waitFor, 5*time.Second)
+
+        // fqdnCacheMinTTL is set, hence we expect no error at least until fqdnCacheMinTTL expires.
+        assert.Never(t, func() bool {
+            _, err := curlFQDN(fqdnIP)
+            return err != nil
+        }, waitFor, 1*time.Second)
+    }
+}
+
+// configureFQDNPolicyEnforcement sets or resets the custom DNS server IP address and FQDNCacheMinTTL in the Antrea ConfigMap.
+func configureFQDNPolicyEnforcement(t *testing.T, data *TestData, dnsServiceIP string, fqdnCacheMinTTL int) {
+    agentChanges := func(config *agentconfig.AgentConfig) {
+        config.DNSServerOverride = dnsServiceIP
+        config.FQDNCacheMinTTL = fqdnCacheMinTTL
+    }
+    err := data.mutateAntreaConfigMap(nil, agentChanges, false, true)
+    require.NoError(t, err, "Error when setting up custom DNS server IP and FQDNCacheMinTTL in Antrea ConfigMap")
+    t.Logf("DNSServerOverride set to %q and FQDNCacheMinTTL set to %d in Antrea Agent config", dnsServiceIP, fqdnCacheMinTTL)
+}
+
+// createPolicyForFQDNCacheMinTTL creates an FQDN policy in the specified Namespace.
+func createPolicyForFQDNCacheMinTTL(t *testing.T, data *TestData, testFQDN string, fqdnPolicyName, customDNSLabelValue, fqdnPodSelectorLabelValue string) {
+    podSelectorLabel := map[string]string{
+        "app": fqdnPodSelectorLabelValue,
+    }
+    builder := &AntreaNetworkPolicySpecBuilder{}
+    builder = builder.SetName(data.testNamespace, fqdnPolicyName).
+        SetTier(defaultTierName).
+        SetPriority(1.0).
+        SetAppliedToGroup([]ANNPAppliedToSpec{{PodSelector: podSelectorLabel}})
+    builder.AddFQDNRule(testFQDN, ProtocolTCP, ptr.To[int32](80), nil, nil, "AllowForFQDN", nil,
+        crdv1beta1.RuleActionAllow)
+    builder.AddEgress(ProtocolUDP, ptr.To[int32](53), nil, nil, nil, nil,
+        nil, nil, nil, nil, map[string]string{"app": customDNSLabelValue},
+        nil, nil, nil, nil,
+        nil, nil, crdv1beta1.RuleActionAllow, "", "AllowDnsQueries")
+    builder.AddEgress(ProtocolTCP, nil, nil, nil, nil, nil,
+        nil, nil, nil, nil, nil,
+        nil, nil, nil, nil,
+        nil, nil, crdv1beta1.RuleActionReject, "", "DropAllRemainingTraffic")
+
+    annp, err := data.CreateOrUpdateANNP(builder.Get())
+    require.NoError(t, err, "error while deploying Antrea policy")
+    require.NoError(t, data.waitForANNPRealized(t, annp.Namespace, annp.Name, 30*time.Second))
+}
+
+// createHttpAgnhostPod creates an agnhost Pod that serves HTTP requests and returns the IPs of the Pod created.
+func createHttpAgnhostPod(t *testing.T, data *TestData) *PodIPs {
+    const (
+        agnhostPort          = 80
+        agnhostPodNamePrefix = "agnhost-"
+    )
+    podName := randName(agnhostPodNamePrefix)
+    args := []string{"netexec", "--http-port=" + strconv.Itoa(agnhostPort)}
+    ports := []v1.ContainerPort{
+        {
+            Name:          "http",
+            ContainerPort: agnhostPort,
+            Protocol:      v1.ProtocolTCP,
+        },
+    }
+
+    require.NoError(t, NewPodBuilder(podName, data.testNamespace, agnhostImage).
+        WithArgs(args).
+        WithPorts(ports).
+        WithLabels(map[string]string{"app": "agnhost"}).
+        Create(data))
+    podIPs, err := data.podWaitForIPs(defaultTimeout, podName, data.testNamespace)
+    require.NoError(t, err)
+    return podIPs
+}
+
+// createCustomDNSPod creates a CoreDNS Pod configured to use the custom DNS ConfigMap.
+func createCustomDNSPod(t *testing.T, data *TestData, configName string) {
+    volume := []v1.Volume{
+        {
+            Name: "config-volume",
+            VolumeSource: v1.VolumeSource{
+                ConfigMap: &v1.ConfigMapVolumeSource{
+                    LocalObjectReference: v1.LocalObjectReference{
+                        Name: configName,
+                    },
+                    Items: []v1.KeyToPath{
+                        {
+                            Key:  "Corefile",
+                            Path: "Corefile",
+                        },
+                    },
+                },
+            },
+        },
+    }
+
+    volumeMount := []v1.VolumeMount{
+        {
+            Name:      "config-volume",
+            MountPath: "/etc/coredns",
+        },
+    }
+
+    require.NoError(t, NewPodBuilder("custom-dns-server", data.testNamespace, "coredns/coredns:1.11.3").
+        WithLabels(map[string]string{"app": "custom-dns"}).
+        WithContainerName("coredns").
+        WithArgs([]string{"-conf", "/etc/coredns/Corefile"}).
+        AddVolume(volume).AddVolumeMount(volumeMount).
+        Create(data))
+    require.NoError(t, data.podWaitForRunning(defaultTimeout, "custom-dns-server", data.testNamespace))
+}
+
+// createDNSConfig generates a DNS configuration (Corefile) for the specified IP-to-FQDN mappings and TTL.
+func createDNSConfig(t *testing.T, hosts map[string]string, ttl int) map[string]string {
+    const coreFileTemplate = `lfx.test:53 {
+        errors
+        log
+        health
+        hosts {
+            {{ range $IP, $FQDN := .Hosts }}{{ $IP }} {{ $FQDN }}{{ end }}
+            no_reverse
+            pods verified
+            ttl {{ .TTL }}
+        }
+        loop
+        reload 2s
+    }`
+
+    data := struct {
+        Hosts map[string]string
+        TTL   int
+    }{
+        Hosts: hosts,
+        TTL:   ttl,
+    }
+
+    // Parse the template and generate the config data.
+    tmpl, err := template.New("configMapData").Parse(coreFileTemplate)
+    require.NoError(t, err, "error parsing config template")
+
+    var output bytes.Buffer
+    err = tmpl.Execute(&output, data)
+    require.NoError(t, err, "error executing config template")
+
+    configMapData := strings.TrimSpace(output.String())
+    configData := map[string]string{
+        "Corefile": configMapData,
+    }
+
+    return configData
+}
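Note: for reference, with the single mapping used by the test and dnsTTL = 5, createDNSConfig renders a Corefile along these lines (the IP shown is a hypothetical Pod IP):

    lfx.test:53 {
        errors
        log
        health
        hosts {
            10.244.1.10 fqdn-test-pod.lfx.test
            no_reverse
            pods verified
            ttl 5
        }
        loop
        reload 2s
    }

The 2s reload interval is what lets the test pick up the host-mapping change quickly once kubelet refreshes the mounted ConfigMap.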