Syncing latest changes from devel for ceph-csi #208

Merged 13 commits (Nov 6, 2023)

16 changes: 15 additions & 1 deletion .github/pull_request_template.md
@@ -34,6 +34,20 @@ Fixes: #issue_number
List items that are not part of the PR and do not impact its
functionality, but are work items that can be taken up subsequently.

**Checklist:**

* [ ] **Commit Message Formatting**: Commit titles and messages follow
guidelines in the [developer
guide](https://github.com/ceph/ceph-csi/blob/devel/docs/development-guide.md#commit-messages).
* [ ] Reviewed the developer guide on [Submitting a Pull
Request](https://github.com/ceph/ceph-csi/blob/devel/docs/development-guide.md#development-workflow)
* [ ] [Pending release
notes](https://github.com/ceph/ceph-csi/blob/devel/PendingReleaseNotes.md)
updated with breaking and/or notable changes for the next major release.
* [ ] Documentation has been updated, if necessary.
* [ ] Unit tests have been added, if necessary.
* [ ] Integration tests have been added, if necessary.

---

<details>
@@ -42,7 +56,7 @@ functionality, but are work items that can be taken up subsequently.
These commands are normally not required, but in case of issues, leave any of
the following bot commands in an otherwise empty comment in this PR:

- `/retest ci/centos/<job-name>`: retest the `<job-name>` after unrelated
* `/retest ci/centos/<job-name>`: retest the `<job-name>` after unrelated
failure (please report the failure too!)

</details>
4 changes: 1 addition & 3 deletions Makefile
@@ -47,9 +47,7 @@ endif
GO_PROJECT=github.com/ceph/ceph-csi

CEPH_VERSION ?= $(shell . $(CURDIR)/build.env ; echo $${CEPH_VERSION})
# TODO: ceph_preview tag may be removed with go-ceph 0.17.0
# TODO: ceph_ci_untested is added for subvolume metadata (go-ceph#691) and snapshot metadata management (go-ceph#698)
GO_TAGS_LIST ?= $(CEPH_VERSION) ceph_preview ceph_ci_untested ceph_pre_quincy
GO_TAGS_LIST ?= $(CEPH_VERSION)

# go build flags
LDFLAGS ?=
5 changes: 5 additions & 0 deletions PendingReleaseNotes.md
@@ -0,0 +1,5 @@
# v3.10 Pending Release Notes

## Breaking Changes

## Features
14 changes: 3 additions & 11 deletions cmd/cephcsi.go
@@ -106,8 +106,8 @@ func init() {
"",
"Comma separated string of mount options accepted by ceph-fuse mounter")

// liveness/grpc metrics related flags
flag.IntVar(&conf.MetricsPort, "metricsport", 8080, "TCP port for liveness/grpc metrics requests")
// liveness/profile metrics related flags
flag.IntVar(&conf.MetricsPort, "metricsport", 8080, "TCP port for liveness/profile metrics requests")
flag.StringVar(
&conf.MetricsPath,
"metricspath",
@@ -116,14 +116,6 @@ func init() {
flag.DurationVar(&conf.PollTime, "polltime", time.Second*pollTime, "time interval in seconds between each poll")
flag.DurationVar(&conf.PoolTimeout, "timeout", time.Second*probeTimeout, "probe timeout in seconds")

flag.BoolVar(&conf.EnableGRPCMetrics, "enablegrpcmetrics", false, "[DEPRECATED] enable grpc metrics")
flag.StringVar(
&conf.HistogramOption,
"histogramoption",
"0.5,2,6",
"[DEPRECATED] Histogram option for grpc metrics, should be comma separated value, "+
"ex:= 0.5,2,6 where start=0.5 factor=2, count=6")

flag.UintVar(
&conf.RbdHardMaxCloneDepth,
"rbdhardmaxclonedepth",
@@ -210,7 +202,7 @@ func main() {

setPIDLimit(&conf)

if conf.EnableGRPCMetrics || conf.Vtype == livenessType {
if conf.EnableProfiling || conf.Vtype == livenessType {
// validate metrics endpoint
conf.MetricsIP = os.Getenv("POD_IP")

10 changes: 5 additions & 5 deletions deploy.sh
@@ -14,10 +14,10 @@ GITHUB_EMAIL=${GITHUB_EMAIL:-"[email protected]"}

# Build and push images. Steps as below:
# 1. get base image from ./build.env (BASE_IMAGE=ceph/ceph:v14.2)
# 2. parse manifest to get image digest per arch (sha256:XXX, sha256:YYY)
# 3. patch Dockerfile with amd64 base image (FROM ceph/ceph:v14.2@sha256:XXX)
# 2. parse manifest to get image digest per arch (sha256:XYZ, sha256:ZYX)
# 3. patch Dockerfile with amd64 base image (FROM ceph/ceph:v14.2@sha256:XYZ)
# 4. build and push amd64 image
# 5. patch Dockerfile with arm64 base image (FROM ceph/ceph:v14.2@sha256:YYY)
# 5. patch Dockerfile with arm64 base image (FROM ceph/ceph:v14.2@sha256:ZYX)
# 6. build and push arm64 image
build_push_images() {
# "docker manifest" requires experimental feature enabled
@@ -29,11 +29,11 @@ build_push_images() {
# get image digest per architecture
# {
# "arch": "amd64",
# "digest": "sha256:XXX"
# "digest": "sha256:XYZ"
# }
# {
# "arch": "arm64",
# "digest": "sha256:YYY"
# "digest": "sha256:ZYX"
# }
manifests=$(docker manifest inspect "${baseimg}" | jq '.manifests[] | {arch: .platform.architecture, digest: .digest}')
# qemu-user-static enables running containers of other architectures through QEMU emulation
2 changes: 0 additions & 2 deletions docs/deploy-cephfs.md
@@ -40,11 +40,9 @@ make image-cephcsi
| `--pidlimit` | _0_ | Configure the PID limit in cgroups. The container runtime can restrict the number of processes/tasks which can cause problems while provisioning (or deleting) a large number of volumes. A value of `-1` configures the limit to the maximum, `0` does not configure limits at all. |
| `--metricsport` | `8080` | TCP port for liveness metrics requests |
| `--metricspath` | `/metrics` | Path of prometheus endpoint where metrics will be available |
| `--enablegrpcmetrics` | `false` | [Deprecated] Enable grpc metrics collection and start prometheus server |
| `--polltime` | `60s` | Time interval in between each poll |
| `--timeout` | `3s` | Probe timeout in seconds |
| `--clustername` | _empty_ | Cluster name to set on subvolume |
| `--histogramoption` | `0.5,2,6` | [Deprecated] Histogram option for grpc metrics, should be comma separated value (ex:= "0.5,2,6" where start=0.5 factor=2, count=6) |
| `--forcecephkernelclient` | `false` | Force enabling Ceph Kernel clients for mounting on kernels < 4.17 |
| `--kernelmountoptions` | _empty_ | Comma separated string of mount options accepted by cephfs kernel mounter |
| `--fusemountoptions` | _empty_ | Comma separated string of mount options accepted by ceph-fuse mounter |
2 changes: 0 additions & 2 deletions docs/deploy-rbd.md
@@ -37,11 +37,9 @@ make image-cephcsi
| `--pidlimit` | _0_ | Configure the PID limit in cgroups. The container runtime can restrict the number of processes/tasks which can cause problems while provisioning (or deleting) a large number of volumes. A value of `-1` configures the limit to the maximum, `0` does not configure limits at all. |
| `--metricsport` | `8080` | TCP port for liveness metrics requests |
| `--metricspath` | `"/metrics"` | Path of prometheus endpoint where metrics will be available |
| `--enablegrpcmetrics` | `false` | [Deprecated] Enable grpc metrics collection and start prometheus server |
| `--polltime` | `"60s"` | Time interval in between each poll |
| `--timeout` | `"3s"` | Probe timeout in seconds |
| `--clustername` | _empty_ | Cluster name to set on RBD image |
| `--histogramoption` | `0.5,2,6` | [Deprecated] Histogram option for grpc metrics, should be comma separated value (ex:= "0.5,2,6" where start=0.5 factor=2, count=6) |
| `--domainlabels` | _empty_ | Kubernetes node labels to use as CSI domain labels for topology aware provisioning, should be a comma separated value (ex:= "failure-domain/region,failure-domain/zone") |
| `--rbdhardmaxclonedepth` | `8` | Hard limit for maximum number of nested volume clones that are taken before a flatten occurs |
| `--rbdsoftmaxclonedepth` | `4` | Soft limit for maximum number of nested volume clones that are taken before a flatten occurs |
25 changes: 25 additions & 0 deletions docs/design/proposals/volume-condition.md
@@ -0,0 +1,25 @@
# Support for CSI `VolumeCondition` aka Volume Health Checker

## health-checker API

The Manager for health-checking is implemented under `internal/health-checker`.
It can start a checking process for a given path, report the (un)healthy
state, and stop the checking process once the volume is no longer needed.

The Manager is responsible for creating a suitable checker for the requested
path. If the path is a block-device, the BlockChecker should be created. For a
filesystem path (directory), the FileChecker is appropriate.
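
For illustration, a minimal Go sketch of the Manager this proposal describes;
the method names and signatures are assumptions made for clarity, not the
actual ceph-csi API:

```go
// Package healthchecker sketches the health-checker Manager from this
// proposal; names and signatures are illustrative assumptions only.
package healthchecker

// Manager tracks one checker per staged volume path.
type Manager interface {
	// StartChecker creates a suitable checker for path (a FileChecker
	// for directories, a BlockChecker for block devices) and starts it.
	StartChecker(volumeID, path string) error

	// IsHealthy reports the most recent result for the volume at path.
	IsHealthy(volumeID, path string) (bool, error)

	// StopChecker stops the checking process once the volume is no
	// longer needed.
	StopChecker(volumeID, path string)
}
```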

## CephFS

The health-checker writes to the file `csi-volume-condition.ts` in the root of
the volume. This file contains a JSON formatted timestamp.
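
A hedged sketch of how a FileChecker might maintain that file, assuming the
timestamp is simply a JSON-encoded `time.Time`; the helper names are
hypothetical:

```go
package healthchecker

import (
	"encoding/json"
	"os"
	"path/filepath"
	"time"
)

const tsFile = "csi-volume-condition.ts"

// writeTimestamp records the current time in the root of the volume;
// succeeding periodically implies the volume is writable and healthy.
func writeTimestamp(volumeRoot string) error {
	data, err := json.Marshal(time.Now())
	if err != nil {
		return err
	}

	return os.WriteFile(filepath.Join(volumeRoot, tsFile), data, 0o600)
}

// readTimestamp returns the last recorded check time; failing to read or
// parse the file suggests the volume is unhealthy.
func readTimestamp(volumeRoot string) (time.Time, error) {
	var ts time.Time

	data, err := os.ReadFile(filepath.Join(volumeRoot, tsFile))
	if err != nil {
		return ts, err
	}

	return ts, json.Unmarshal(data, &ts)
}
```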

A new `data` directory is introduced for newly created volumes. During the
`NodeStageVolume` call the root of the volume is mounted, and the `data`
directory is bind-mounted inside the container when `NodePublishVolume` is
called.

The `data` directory makes it possible to place Ceph-CSI internal files in the
root of the volume without giving the user/application access to them.
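
A rough sketch of the publish step under these assumptions; real ceph-csi code
goes through its mounter abstractions, so `publishDataDir` is a hypothetical
helper shown with a raw bind mount:

```go
package healthchecker

import (
	"path/filepath"

	"golang.org/x/sys/unix"
)

// publishDataDir bind-mounts only the volume's data/ subdirectory (created
// during staging) into the container's target path on NodePublishVolume,
// keeping internal files such as csi-volume-condition.ts out of the
// application's view.
func publishDataDir(stagingPath, targetPath string) error {
	return unix.Mount(filepath.Join(stagingPath, "data"), targetPath, "", unix.MS_BIND, "")
}
```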
17 changes: 13 additions & 4 deletions e2e/node.go
@@ -26,29 +26,38 @@ import (
"k8s.io/client-go/kubernetes"
"k8s.io/kubernetes/test/e2e/framework"
e2enode "k8s.io/kubernetes/test/e2e/framework/node"
testutils "k8s.io/kubernetes/test/utils"
)

func createNodeLabel(f *framework.Framework, labelKey, labelValue string) error {
func addLabelsToNodes(f *framework.Framework, labels map[string]string) error {
// NOTE: This makes all nodes (in a multi-node setup) in the test take
// the same label values, which is fine for the test
nodes, err := f.ClientSet.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{})
if err != nil {
return fmt.Errorf("failed to list node: %w", err)
}
for i := range nodes.Items {
e2enode.AddOrUpdateLabelOnNode(f.ClientSet, nodes.Items[i].Name, labelKey, labelValue)
if err := testutils.AddLabelsToNode(f.ClientSet, nodes.Items[i].Name, labels); err != nil {
return fmt.Errorf("failed to add labels to node: %w", err)
}
}

return nil
}

func deleteNodeLabel(c kubernetes.Interface, labelKey string) error {
func deleteNodeLabels(c kubernetes.Interface, labelKeys []string) error {
nodes, err := c.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{})
if err != nil {
return fmt.Errorf("failed to list node: %w", err)
}
for i := range nodes.Items {
e2enode.RemoveLabelOffNode(c, nodes.Items[i].Name, labelKey)
if err := testutils.RemoveLabelOffNode(c, nodes.Items[i].Name, labelKeys); err != nil {
return fmt.Errorf("failed to remove label off node: %w", err)
}

if err := testutils.VerifyLabelsRemoved(c, nodes.Items[i].Name, labelKeys); err != nil {
return fmt.Errorf("failed to verify label removed from node: %w", err)
}
}

return nil
54 changes: 16 additions & 38 deletions e2e/rbd.go
@@ -275,21 +275,14 @@ var _ = Describe("RBD", func() {
}
c = f.ClientSet
if deployRBD {
err := createNodeLabel(f, nodeRegionLabel, regionValue)
if err != nil {
framework.Failf("failed to create node label: %v", err)
}
err = createNodeLabel(f, nodeZoneLabel, zoneValue)
if err != nil {
framework.Failf("failed to create node label: %v", err)
}
err = createNodeLabel(f, crushLocationRegionLabel, crushLocationRegionValue)
if err != nil {
framework.Failf("failed to create node label: %v", err)
}
err = createNodeLabel(f, crushLocationZoneLabel, crushLocationZoneValue)
err := addLabelsToNodes(f, map[string]string{
nodeRegionLabel: regionValue,
nodeZoneLabel: zoneValue,
crushLocationRegionLabel: crushLocationRegionValue,
crushLocationZoneLabel: crushLocationZoneValue,
})
if err != nil {
framework.Failf("failed to create node label: %v", err)
framework.Failf("failed to add node labels: %v", err)
}
if cephCSINamespace != defaultNs {
err = createNamespace(c, cephCSINamespace)
@@ -408,31 +401,16 @@ var _ = Describe("RBD", func() {
}
}
}
err = deleteNodeLabel(c, nodeRegionLabel)
if err != nil {
framework.Failf("failed to delete node label: %v", err)
}
err = deleteNodeLabel(c, nodeZoneLabel)
if err != nil {
framework.Failf("failed to delete node label: %v", err)
}
// Remove the CSI labels that get added
err = deleteNodeLabel(c, nodeCSIRegionLabel)
if err != nil {
framework.Failf("failed to delete node label: %v", err)
}
err = deleteNodeLabel(c, nodeCSIZoneLabel)
if err != nil {
framework.Failf("failed to delete node label: %v", err)
}
// Remove the CRUSH Location labels
err = deleteNodeLabel(c, crushLocationRegionLabel)
if err != nil {
framework.Failf("failed to delete node label: %v", err)
}
err = deleteNodeLabel(c, crushLocationZoneLabel)
err = deleteNodeLabels(c, []string{
nodeRegionLabel,
nodeZoneLabel,
nodeCSIRegionLabel,
nodeCSIZoneLabel,
crushLocationRegionLabel,
crushLocationZoneLabel,
})
if err != nil {
framework.Failf("failed to delete node label: %v", err)
framework.Failf("failed to delete node labels: %v", err)
}
})

23 changes: 10 additions & 13 deletions e2e/upgrade-rbd.go
@@ -107,14 +107,12 @@ var _ = Describe("RBD Upgrade Testing", func() {
if err != nil {
framework.Failf("failed to create snapshotclass: %v", err)
}

err = createNodeLabel(f, nodeRegionLabel, regionValue)
if err != nil {
framework.Failf("failed to create node label: %v", err)
}
err = createNodeLabel(f, nodeZoneLabel, zoneValue)
err = addLabelsToNodes(f, map[string]string{
nodeRegionLabel: regionValue,
nodeZoneLabel: zoneValue,
})
if err != nil {
framework.Failf("failed to create node label: %v", err)
framework.Failf("failed to add node labels: %v", err)
}
})
AfterEach(func() {
@@ -167,13 +165,12 @@ var _ = Describe("RBD Upgrade Testing", func() {
}
}
}
err = deleteNodeLabel(c, nodeRegionLabel)
if err != nil {
framework.Failf("failed to delete node label: %v", err)
}
err = deleteNodeLabel(c, nodeZoneLabel)
err = deleteNodeLabels(c, []string{
nodeRegionLabel,
nodeZoneLabel,
})
if err != nil {
framework.Failf("failed to delete node label: %v", err)
framework.Failf("failed to delete node labels: %v", err)
}
})

1 change: 0 additions & 1 deletion go.mod
@@ -7,7 +7,6 @@ require (
github.com/aws/aws-sdk-go v1.46.4
github.com/aws/aws-sdk-go-v2/service/sts v1.23.2
github.com/ceph/ceph-csi/api v0.0.0-00010101000000-000000000000
// TODO: API for managing subvolume metadata and snapshot metadata requires `ceph_ci_untested` build-tag
github.com/ceph/go-ceph v0.24.0
github.com/container-storage-interface/spec v1.9.0
github.com/csi-addons/replication-lib-utils v0.2.0
13 changes: 5 additions & 8 deletions internal/cephfs/driver.go
@@ -25,6 +25,7 @@ import (
casceph "github.com/ceph/ceph-csi/internal/csi-addons/cephfs"
csiaddons "github.com/ceph/ceph-csi/internal/csi-addons/server"
csicommon "github.com/ceph/ceph-csi/internal/csi-common"
hc "github.com/ceph/ceph-csi/internal/health-checker"
"github.com/ceph/ceph-csi/internal/journal"
"github.com/ceph/ceph-csi/internal/util"
"github.com/ceph/ceph-csi/internal/util/log"
@@ -82,6 +83,7 @@ func NewNodeServer(
VolumeLocks: util.NewVolumeLocks(),
kernelMountOptions: kernelMountOptions,
fuseMountOptions: fuseMountOptions,
healthChecker: hc.NewHealthCheckManager(),
}
}

@@ -167,15 +169,10 @@ func (fs *Driver) Run(conf *util.Config) {
// passing nil for replication server as cephFS does not support mirroring.
RS: nil,
}
server.Start(conf.Endpoint, conf.HistogramOption, srv, conf.EnableGRPCMetrics)
if conf.EnableGRPCMetrics {
log.WarningLogMsg("EnableGRPCMetrics is deprecated")
go util.StartMetricsServer(conf)
}
server.Start(conf.Endpoint, srv)

if conf.EnableProfiling {
if !conf.EnableGRPCMetrics {
go util.StartMetricsServer(conf)
}
go util.StartMetricsServer(conf)
log.DebugLogMsg("Registering profiling handler")
go util.EnableProfiling()
}