diff --git a/README.md b/README.md index a6f2342c216..2511b456d0b 100644 --- a/README.md +++ b/README.md @@ -591,6 +591,17 @@ configured to operate in IPv6 mode. Prefix delegation is only supported on nitro --- +#### `ENABLE_NFTABLES` (v1.13.0+) + +Type: Boolean as a String + +Default: `false` + +VPC CNI uses `iptables-legacy` by default. Setting `ENABLE_NFTABLES` to `true` will update VPC CNI to use `iptables-nft`. + +**Note:** VPC CNI image contains `iptables-legacy` and `iptables-nft`. Switching between them is done via `update-alternatives`. It is *strongly* recommended that the iptables mode matches that which is used by the base OS and `kube-proxy`. +Switching modes while pods are running or rules are installed will not trigger reconciliation. It is recommended that rules are manually updated or nodes are drained and cordoned before updating. If reloading node, ensure that previous rules are not set to be persisted. + ### VPC CNI Feature Matrix IP Mode | Secondary IP Mode | Prefix Delegation | Security Groups Per Pod | WARM & MIN IP/Prefix Targets | External SNAT diff --git a/cmd/aws-vpc-cni-init/main.go b/cmd/aws-vpc-cni-init/main.go index e9d46814f12..bb5a7730924 100644 --- a/cmd/aws-vpc-cni-init/main.go +++ b/cmd/aws-vpc-cni-init/main.go @@ -17,9 +17,9 @@ package main import ( "os" + "github.com/aws/amazon-vpc-cni-k8s/pkg/procsyswrapper" "github.com/aws/amazon-vpc-cni-k8s/utils/cp" "github.com/aws/amazon-vpc-cni-k8s/utils/imds" - "github.com/aws/amazon-vpc-cni-k8s/utils/sysctl" "github.com/pkg/errors" log "github.com/sirupsen/logrus" "github.com/vishvananda/netlink" @@ -68,68 +68,68 @@ func getNodePrimaryIF() (string, error) { return primaryIF, nil } -func configureSystemParams(sysctlUtil sysctl.Interface, primaryIF string) error { +func configureSystemParams(procSys procsyswrapper.ProcSys, primaryIF string) error { var err error // Configure rp_filter in loose mode entry := "net/ipv4/conf/" + primaryIF + "/rp_filter" - err = sysctlUtil.Set(entry, 2) + err 
= procSys.Set(entry, "2") if err != nil { return errors.Wrapf(err, "Failed to set rp_filter for %s", primaryIF) } - val, _ := sysctlUtil.Get(entry) - log.Infof("Updated %s to %d", entry, val) + val, _ := procSys.Get(entry) + log.Infof("Updated %s to %s", entry, val) // Enable or disable TCP early demux based on environment variable // Note that older kernels may not support tcp_early_demux, so we must first check that it exists. entry = "net/ipv4/tcp_early_demux" - if _, err := sysctlUtil.Get(entry); err == nil { + if _, err := procSys.Get(entry); err == nil { disableIPv4EarlyDemux := getEnv(envDisableIPv4TcpEarlyDemux, "false") if disableIPv4EarlyDemux == "true" { - err = sysctlUtil.Set(entry, 0) + err = procSys.Set(entry, "0") if err != nil { return errors.Wrap(err, "Failed to disable tcp_early_demux") } } else { - err = sysctlUtil.Set(entry, 1) + err = procSys.Set(entry, "1") if err != nil { return errors.Wrap(err, "Failed to enable tcp_early_demux") } } - val, _ = sysctlUtil.Get(entry) - log.Infof("Updated %s to %d", entry, val) + val, _ = procSys.Get(entry) + log.Infof("Updated %s to %s", entry, val) } return nil } -func configureIPv6Settings(sysctlUtil sysctl.Interface, primaryIF string) error { +func configureIPv6Settings(procSys procsyswrapper.ProcSys, primaryIF string) error { var err error // Enable IPv6 when environment variable is set // Note that IPv6 is not disabled when environment variable is unset. This is omitted to preserve default host semantics. 
enableIPv6 := getEnv(envEnableIPv6, "false") if enableIPv6 == "true" { entry := "net/ipv6/conf/all/disable_ipv6" - err = sysctlUtil.Set(entry, 0) + err = procSys.Set(entry, "0") if err != nil { return errors.Wrap(err, "Failed to set disable_ipv6 to 0") } - val, _ := sysctlUtil.Get(entry) - log.Infof("Updated %s to %d", entry, val) + val, _ := procSys.Get(entry) + log.Infof("Updated %s to %s", entry, val) entry = "net/ipv6/conf/all/forwarding" - err = sysctlUtil.Set(entry, 1) + err = procSys.Set(entry, "1") if err != nil { return errors.Wrap(err, "Failed to enable ipv6 forwarding") } - val, _ = sysctlUtil.Get(entry) - log.Infof("Updated %s to %d", entry, val) + val, _ = procSys.Get(entry) + log.Infof("Updated %s to %s", entry, val) entry = "net/ipv6/conf/" + primaryIF + "/accept_ra" - err = sysctlUtil.Set(entry, 2) + err = procSys.Set(entry, "2") if err != nil { return errors.Wrap(err, "Failed to enable ipv6 accept_ra") } - val, _ = sysctlUtil.Get(entry) - log.Infof("Updated %s to %d", entry, val) + val, _ = procSys.Get(entry) + log.Infof("Updated %s to %s", entry, val) } return nil } @@ -166,14 +166,14 @@ func _main() int { } log.Infof("Found primaryIF %s", primaryIF) - sysctlUtil := sysctl.New() - err = configureSystemParams(sysctlUtil, primaryIF) + procSys := procsyswrapper.NewProcSys() + err = configureSystemParams(procSys, primaryIF) if err != nil { log.WithError(err).Errorf("Failed to configure system parameters") return 1 } - err = configureIPv6Settings(sysctlUtil, primaryIF) + err = configureIPv6Settings(procSys, primaryIF) if err != nil { log.WithError(err).Errorf("Failed to configure IPv6 settings") return 1 diff --git a/cmd/aws-vpc-cni/main.go b/cmd/aws-vpc-cni/main.go index 3c6777b4b2f..b5a16f12a9c 100644 --- a/cmd/aws-vpc-cni/main.go +++ b/cmd/aws-vpc-cni/main.go @@ -68,6 +68,7 @@ const ( defaultPluginLogLevel = "Debug" defaultEnableIPv6 = "false" defaultRandomizeSNAT = "prng" + defaultEnableNftables = "false" awsConflistFile = "/10-aws.conflist" 
vpcCniInitDonePath = "/vpc-cni-init/done" @@ -88,6 +89,7 @@ const ( envEnBandwidthPlugin = "ENABLE_BANDWIDTH_PLUGIN" envEnIPv6 = "ENABLE_IPv6" envRandomizeSNAT = "AWS_VPC_K8S_CNI_RANDOMIZESNAT" + envEnableNftables = "ENABLE_NFTABLES" ) func getEnv(env, defaultVal string) string { @@ -209,8 +211,6 @@ func getNodePrimaryV4Address() (string, error) { if hostIP != "" { return hostIP, nil } - - time.Sleep(1 * time.Second) } } @@ -324,6 +324,26 @@ func validateEnvVars() bool { return true } +// configureNftablesIfEnabled points the iptables and ip6tables alternatives at their nft variants when ENABLE_NFTABLES is "true"; otherwise the container keeps its default, iptables-legacy. It returns a wrapped error if either update-alternatives call fails. +func configureNftablesIfEnabled() error { + nftables := getEnv(envEnableNftables, defaultEnableNftables) + if nftables == "true" { + log.Infof("Updating iptables mode to nft") + // exec.Cmd sends output to the null device unless Stdout/Stderr are set, so forward both to make the log show the iptables mode being set + cmd := exec.Command("update-alternatives", "--set", "iptables", "/usr/sbin/iptables-nft") + cmd.Stdout, cmd.Stderr = os.Stdout, os.Stderr + if err := cmd.Run(); err != nil { + return errors.Wrap(err, "Failed to use iptables-nft") + } + cmd = exec.Command("update-alternatives", "--set", "ip6tables", "/usr/sbin/ip6tables-nft") + cmd.Stdout, cmd.Stderr = os.Stdout, os.Stderr + if err := cmd.Run(); err != nil { + return errors.Wrap(err, "Failed to use ip6tables-nft") + } + } + return nil +} + func main() { os.Exit(_main()) } @@ -334,11 +354,15 @@ func _main() int { return 1 } + if err := configureNftablesIfEnabled(); err != nil { + log.WithError(err).Error("Failed to enable nftables") + } + pluginBins := []string{"aws-cni", "egress-v4-cni"} hostCNIBinPath := getEnv(envHostCniBinPath, defaultHostCNIBinPath) err := cp.InstallBinaries(pluginBins, hostCNIBinPath) if err != nil { - log.WithError(err).Errorf("Failed to install CNI binaries") + log.WithError(err).Error("Failed to install CNI binaries") return 1 } diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index 03b36545c5b..b1bdc898d46 100644 --- a/docs/troubleshooting.md +++ 
b/docs/troubleshooting.md @@ -224,9 +224,10 @@ kubectl apply -f https://raw.githubusercontent.com/aws/amazon-vpc-cni-k8s/releas The [CNI image](../scripts/dockerfiles/Dockerfile.release) built for the `aws-node` manifest uses Amazon Linux 2 as the base image. Support for other Linux distributions (custom AMIs) is best-effort. Known issues with other Linux distributions are captured here: -- **iptables** - iptables is installed by default in `aws-node` container images. Newer distributions of RHEL (RHEL 8.x+), Ubuntu (Ubuntu 20.x+), etc. have moved to using `nftables`. This leads to issues such as [this](https://github.com/aws/amazon-vpc-cni-k8s/issues/1847) when running IPAMD. +- **iptables** + Prior to v1.13.0, the VPC CNI image only contained `iptables-legacy`. Newer distributions of RHEL (RHEL 8.x+), Ubuntu (Ubuntu 21.x+), etc. have moved to using `nftables`. This leads to issues such as [this](https://github.com/aws/amazon-vpc-cni-k8s/issues/1847) when running IPAMD. - To resolve this issue on distributions that use `nftables`, there are currently two options: + To resolve this issue in versions before v1.13.0, there are currently two options: 1. Uninstall `nftables` and install `iptables-legacy` in base distribution 2. Build a custom CNI image based on `nftables`, such as: ``` @@ -235,6 +236,8 @@ The [CNI image](../scripts/dockerfiles/Dockerfile.release) built for the `aws-no run cd /usr/sbin && rm iptables && ln -s xtables-nft-multi iptables ``` + In v1.13.0+, `iptables-legacy` and `iptables-nft` are present in the VPC CNI container image. Setting the `ENABLE_NFTABLES` environment variable to `true` instructs VPC CNI to use `iptables-nft`. By default, `iptables-legacy` is used. + ## cni-metrics-helper See the [cni-metrics-helper README](../cmd/cni-metrics-helper/README.md). 
diff --git a/utils/sysctl/sysctl.go b/utils/sysctl/sysctl.go deleted file mode 100644 index 234a611fd54..00000000000 --- a/utils/sysctl/sysctl.go +++ /dev/null @@ -1,49 +0,0 @@ -// Ref: https://github.com/kubernetes/kubernetes/blob/cb2ea4bf7c029e595f44ee62013c982626fb5bd4/staging/src/k8s.io/component-helpers/node/utils/sysctl/sysctl.go - -package sysctl - -import ( - "io/ioutil" - "path" - "strconv" - "strings" -) - -const ( - sysctlBase = "/proc/sys" -) - -// Interface is an injectable interface for running sysctl commands. -type Interface interface { - // Get returns the value for the specified sysctl setting - Get(sysctl string) (int, error) - // Set modifies the specified sysctl flag to the new value - Set(sysctl string, newVal int) error -} - -// New returns a new Interface for accessing sysctl -func New() Interface { - return &procSysctl{} -} - -// procSysctl implements Interface by reading and writing files under /proc/sys -type procSysctl struct { -} - -// Get returns the value for the specified sysctl setting -func (*procSysctl) Get(sysctl string) (int, error) { - data, err := ioutil.ReadFile(path.Join(sysctlBase, sysctl)) - if err != nil { - return -1, err - } - val, err := strconv.Atoi(strings.Trim(string(data), " \n")) - if err != nil { - return -1, err - } - return val, nil -} - -// Set modifies the specified sysctl flag to the new value -func (*procSysctl) Set(sysctl string, newVal int) error { - return ioutil.WriteFile(path.Join(sysctlBase, sysctl), []byte(strconv.Itoa(newVal)), 0640) -}