diff --git a/.github/workflows/q2q-candidate-upgrade.yml b/.github/workflows/q2q-candidate-upgrade.yml index 9963d7ec..fbf00651 100644 --- a/.github/workflows/q2q-candidate-upgrade.yml +++ b/.github/workflows/q2q-candidate-upgrade.yml @@ -58,5 +58,8 @@ jobs: - name: Wait until 3 OSDs are up run: ~/actionutils.sh headexec wait_for_osds 3 + - name: Verify config + run: ~/actionutils.sh test_ceph_conf + - name: Exercise RGW again run: ~/actionutils.sh headexec testrgw diff --git a/.github/workflows/q2r-candidate-upgrade.yaml b/.github/workflows/q2r-candidate-upgrade.yaml index c06b67da..d2a08c57 100644 --- a/.github/workflows/q2r-candidate-upgrade.yaml +++ b/.github/workflows/q2r-candidate-upgrade.yaml @@ -58,5 +58,8 @@ jobs: - name: Wait until 3 OSDs are up run: ~/actionutils.sh headexec wait_for_osds 3 + - name: Verify config + run: ~/actionutils.sh test_ceph_conf + - name: Exercise RGW again run: ~/actionutils.sh headexec testrgw diff --git a/.github/workflows/r2r-candidate-upgrade.yaml b/.github/workflows/r2r-candidate-upgrade.yaml index f00f9c3e..c1f94b1c 100644 --- a/.github/workflows/r2r-candidate-upgrade.yaml +++ b/.github/workflows/r2r-candidate-upgrade.yaml @@ -58,5 +58,8 @@ jobs: - name: Wait until 3 OSDs are up run: ~/actionutils.sh headexec wait_for_osds 3 + - name: Verify config + run: ~/actionutils.sh test_ceph_conf + - name: Exercise RGW again run: ~/actionutils.sh headexec testrgw diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 6cbbdd32..dd0205d2 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -255,6 +255,9 @@ jobs: - name: Setup cluster run: ~/actionutils.sh cluster_nodes custom + - name: Verify config + run: ~/actionutils.sh test_ceph_conf + - name: Add 2 OSDs run: | for c in node-wrk1 node-wrk2 ; do diff --git a/microceph/ceph/config.go b/microceph/ceph/config.go index 82859024..bbda5ef2 100644 --- a/microceph/ceph/config.go +++ b/microceph/ceph/config.go @@ -6,6 +6,7 @@ import ( 
"encoding/json" "fmt" "github.com/canonical/microceph/microceph/interfaces" + "net" "os" "path/filepath" "strings" @@ -162,30 +163,68 @@ func ListConfigs() (types.Configs, error) { return configs, nil } -// updates the ceph config file. -func UpdateConfig(s interfaces.StateInterface) error { - confPath := filepath.Join(os.Getenv("SNAP_DATA"), "conf") - runPath := filepath.Join(os.Getenv("SNAP_DATA"), "run") - - // Get the configuration and servers. - var err error - var configItems []database.ConfigItem +// ensureBackwardCompat ensures that all config items are set in the database. +// this is a backward-compat shim to accomodate older versions of microceph +// which will ensure that the public_network and mon.host. are set +// in the database +func ensureBackwardCompat(s interfaces.StateInterface) error { + config, err := getConfigDb(s) + if err != nil { + return fmt.Errorf("failed to get config from db: %w", err) + } - err = s.ClusterState().Database.Transaction(s.ClusterState().Context, func(ctx context.Context, tx *sql.Tx) error { - configItems, err = database.GetConfigItems(ctx, tx) + // do we have a public_network configured? + pubNet := config["public_network"] + _, _, err = net.ParseCIDR(pubNet) + if err != nil { + // get public network from default address + pubNet, err = common.Network.FindNetworkAddress(s.ClusterState().Address().Hostname()) if err != nil { - return err + return fmt.Errorf("failed to locate public network: %w", err) } + // update the database + err = s.ClusterState().Database.Transaction(s.ClusterState().Context, func(ctx context.Context, tx *sql.Tx) error { + _, err = database.CreateConfigItem(ctx, tx, database.ConfigItem{Key: "public_network", Value: pubNet}) + if err != nil { + return fmt.Errorf("failed to record public_network: %w", err) + } + return nil + }) + } - return nil - }) + // do we have a mon host recorded? 
+ k := fmt.Sprintf("mon.host.%s", s.ClusterState().Name()) + _, ok := config[k] + if !ok { + // key not found, update the database + err = s.ClusterState().Database.Transaction(s.ClusterState().Context, func(ctx context.Context, tx *sql.Tx) error { + _, err = database.CreateConfigItem( + ctx, + tx, + database.ConfigItem{Key: k, Value: s.ClusterState().Address().Hostname()}, + ) + if err != nil { + return fmt.Errorf("failed to record mon host: %w", err) + } + return nil + }) + } + return nil +} + +// UpdateConfig updates the ceph.conf file with the current configuration. +func UpdateConfig(s interfaces.StateInterface) error { + confPath := filepath.Join(os.Getenv("SNAP_DATA"), "conf") + runPath := filepath.Join(os.Getenv("SNAP_DATA"), "run") + + err := ensureBackwardCompat(s) if err != nil { - return err + return fmt.Errorf("failed to ensure backward compat: %w", err) } - config := map[string]string{} - for _, item := range configItems { - config[item.Key] = item.Value + config, err := getConfigDb(s) + if err != nil { + return fmt.Errorf("failed to get config db: %w", err) } // REF: https://docs.ceph.com/en/quincy/rados/configuration/network-config-ref/#ceph-daemons @@ -199,6 +238,7 @@ func UpdateConfig(s interfaces.StateInterface) error { if err != nil { return fmt.Errorf("failed to locate IP on public network %s: %w", config["public_network"], err) } + clientConfig, err := GetClientConfigForHost(s, s.ClusterState().Name()) if err != nil { logger.Errorf("Failed to pull Client Configurations: %v", err) @@ -225,6 +265,7 @@ func UpdateConfig(s interfaces.StateInterface) error { if err != nil { return fmt.Errorf("couldn't render ceph.conf: %w", err) } + logger.Debugf("updated ceph.conf: %v", conf.GetPath()) // Generate ceph.client.admin.keyring keyring := newCephKeyring(confPath, "ceph.keyring") @@ -242,6 +283,30 @@ func UpdateConfig(s interfaces.StateInterface) error { return nil } +// getConfigDb retrieves the configuration from the database. 
+func getConfigDb(s interfaces.StateInterface) (map[string]string, error) {
+	var err error // declared up front: the transaction closure below assigns to it
+	var configItems []database.ConfigItem
+
+	err = s.ClusterState().Database.Transaction(s.ClusterState().Context, func(ctx context.Context, tx *sql.Tx) error {
+		configItems, err = database.GetConfigItems(ctx, tx) // writes the outer configItems and err
+		if err != nil {
+			return err
+		}
+
+		return nil
+	})
+	if err != nil {
+		return nil, err
+	}
+
+	config := map[string]string{} // key -> value view of the fetched items
+	for _, item := range configItems {
+		config[item.Key] = item.Value
+	}
+	return config, nil
+}
+
 // getMonitorAddresses scans a provided config key/value map and returns a list of mon hosts found.
 func getMonitorAddresses(configs map[string]string) []string {
 	monHosts := []string{}
diff --git a/microceph/ceph/start.go b/microceph/ceph/start.go
index aac8ec6b..6af259c6 100644
--- a/microceph/ceph/start.go
+++ b/microceph/ceph/start.go
@@ -3,6 +3,7 @@ package ceph
 import (
 	"context"
 	"database/sql"
+	"github.com/canonical/lxd/shared/logger"
 	"github.com/canonical/microceph/microceph/interfaces"
 	"reflect"
 	"time"
@@ -19,6 +20,7 @@ func Start(s interfaces.StateInterface) error {
 		for {
 			// Check that the database is ready.
 			if !s.ClusterState().Database.IsOpen() {
+				logger.Debug("start: database not ready, waiting...")
 				time.Sleep(10 * time.Second)
 				continue
 			}
@@ -39,26 +41,28 @@ func Start(s interfaces.StateInterface) error {
 				return nil
 			})
 			if err != nil {
+				logger.Warnf("start: failed to fetch monitors, retrying: %v", err)
 				time.Sleep(10 * time.Second)
 				continue
 			}
 
 			// Compare to the previous list.
if reflect.DeepEqual(oldMonitors, monitors) { + logger.Debugf("start: monitors unchanged, sleeping: %v", monitors) time.Sleep(time.Minute) continue } err = UpdateConfig(s) if err != nil { + logger.Errorf("start: failed to update config, retrying: %v", err) time.Sleep(10 * time.Second) continue } - + logger.Debug("start: updated config, sleeping") oldMonitors = monitors time.Sleep(time.Minute) } - }() return nil diff --git a/tests/scripts/actionutils.sh b/tests/scripts/actionutils.sh index 78302f88..b08c8bb9 100755 --- a/tests/scripts/actionutils.sh +++ b/tests/scripts/actionutils.sh @@ -388,6 +388,28 @@ function test_migration() { return -1 } +function test_ceph_conf() { + for n in $( lxc ls -c n --format csv ); do + + # Test: configured rundir must be current + current=$( realpath /var/snap/microceph/current ) + rundir=$( cat /var/snap/microceph/current/conf/ceph.conf | awk '/run dir/{ print $4 }' ) + p=$( dirname $rundir ) + if [ $p != $current ]; then + echo "Error: snap data dir $current, configured run dir: $rundir" + cat /var/snap/microceph/current/conf/ceph.conf + exit -1 + fi + + # Test: must contain public_network + if ! grep -q public_net /var/snap/microceph/current/conf/ceph.conf ; then + echo "Error: didn't find public_net in ceph.conf" + cat /var/snap/microceph/current/conf/ceph.conf + exit -1 + fi + done +} + function headexec() { local run="${1?missing}" shift