Skip to content

Commit

Permalink
Fix: ensure backward compat config
Browse files Browse the repository at this point in the history
Fixes: canonical#318

Ensure we populate the config key-value db with the required values if they
are not present. Newer microceph versions set these values on bootstrap;
older releases might be missing them.

Also add some testing for ceph.conf

Signed-off-by: Peter Sabaini <[email protected]>
  • Loading branch information
sabaini committed Feb 28, 2024
1 parent b959197 commit 32af31e
Show file tree
Hide file tree
Showing 7 changed files with 122 additions and 19 deletions.
3 changes: 3 additions & 0 deletions .github/workflows/q2q-candidate-upgrade.yml
Original file line number Diff line number Diff line change
Expand Up @@ -58,5 +58,8 @@ jobs:
- name: Wait until 3 OSDs are up
run: ~/actionutils.sh headexec wait_for_osds 3

- name: Verify config
run: ~/actionutils.sh test_ceph_conf

- name: Exercise RGW again
run: ~/actionutils.sh headexec testrgw
3 changes: 3 additions & 0 deletions .github/workflows/q2r-candidate-upgrade.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -58,5 +58,8 @@ jobs:
- name: Wait until 3 OSDs are up
run: ~/actionutils.sh headexec wait_for_osds 3

- name: Verify config
run: ~/actionutils.sh test_ceph_conf

- name: Exercise RGW again
run: ~/actionutils.sh headexec testrgw
3 changes: 3 additions & 0 deletions .github/workflows/r2r-candidate-upgrade.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -58,5 +58,8 @@ jobs:
- name: Wait until 3 OSDs are up
run: ~/actionutils.sh headexec wait_for_osds 3

- name: Verify config
run: ~/actionutils.sh test_ceph_conf

- name: Exercise RGW again
run: ~/actionutils.sh headexec testrgw
3 changes: 3 additions & 0 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,9 @@ jobs:
- name: Setup cluster
run: ~/actionutils.sh cluster_nodes custom

- name: Verify config
run: ~/actionutils.sh test_ceph_conf

- name: Add 2 OSDs
run: |
for c in node-wrk1 node-wrk2 ; do
Expand Down
99 changes: 82 additions & 17 deletions microceph/ceph/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"encoding/json"
"fmt"
"github.com/canonical/microceph/microceph/interfaces"
"net"
"os"
"path/filepath"
"strings"
Expand Down Expand Up @@ -162,30 +163,68 @@ func ListConfigs() (types.Configs, error) {
return configs, nil
}

// updates the ceph config file.
func UpdateConfig(s interfaces.StateInterface) error {
confPath := filepath.Join(os.Getenv("SNAP_DATA"), "conf")
runPath := filepath.Join(os.Getenv("SNAP_DATA"), "run")

// Get the configuration and servers.
var err error
var configItems []database.ConfigItem
// ensureBackwardCompat ensures that all required config items are set in the database.
// This is a backward-compat shim to accommodate older versions of microceph:
// it makes sure that public_network and mon.host.<cluster_name> are recorded
// in the database, creating them when they are missing (or, for
// public_network, when the stored value is not a valid CIDR).
//
// It returns an error if the config cannot be read, if the public network
// cannot be determined, or if writing either item to the database fails.
func ensureBackwardCompat(s interfaces.StateInterface) error {
	config, err := getConfigDb(s)
	if err != nil {
		return fmt.Errorf("failed to get config from db: %w", err)
	}

	// do we have a public_network configured?
	pubNet := config["public_network"]
	_, _, err = net.ParseCIDR(pubNet)
	if err != nil {
		// get public network from default address
		pubNet, err = common.Network.FindNetworkAddress(s.ClusterState().Address().Hostname())
		if err != nil {
			return fmt.Errorf("failed to locate public network: %w", err)
		}
		// update the database; the transaction result must be checked, otherwise
		// a failed write would go unnoticed and the key would stay missing.
		err = s.ClusterState().Database.Transaction(s.ClusterState().Context, func(ctx context.Context, tx *sql.Tx) error {
			_, err := database.CreateConfigItem(ctx, tx, database.ConfigItem{Key: "public_network", Value: pubNet})
			return err
		})
		if err != nil {
			return fmt.Errorf("failed to record public_network: %w", err)
		}
	}

	// do we have a mon host recorded?
	k := fmt.Sprintf("mon.host.%s", s.ClusterState().Name())
	if _, ok := config[k]; ok {
		return nil
	}

	// key not found, update the database
	err = s.ClusterState().Database.Transaction(s.ClusterState().Context, func(ctx context.Context, tx *sql.Tx) error {
		_, err := database.CreateConfigItem(
			ctx,
			tx,
			database.ConfigItem{Key: k, Value: s.ClusterState().Address().Hostname()},
		)
		return err
	})
	if err != nil {
		return fmt.Errorf("failed to record mon host: %w", err)
	}
	return nil
}

// UpdateConfig updates the ceph.conf file with the current configuration.
func UpdateConfig(s interfaces.StateInterface) error {
confPath := filepath.Join(os.Getenv("SNAP_DATA"), "conf")
runPath := filepath.Join(os.Getenv("SNAP_DATA"), "run")

err := ensureBackwardCompat(s)
if err != nil {
return err
return fmt.Errorf("failed to ensure backward compat: %w", err)
}

config := map[string]string{}
for _, item := range configItems {
config[item.Key] = item.Value
config, err := getConfigDb(s)
if err != nil {
return fmt.Errorf("failed to get config db: %w", err)
}

// REF: https://docs.ceph.com/en/quincy/rados/configuration/network-config-ref/#ceph-daemons
Expand All @@ -199,6 +238,7 @@ func UpdateConfig(s interfaces.StateInterface) error {
if err != nil {
return fmt.Errorf("failed to locate IP on public network %s: %w", config["public_network"], err)
}

clientConfig, err := GetClientConfigForHost(s, s.ClusterState().Name())
if err != nil {
logger.Errorf("Failed to pull Client Configurations: %v", err)
Expand All @@ -225,6 +265,7 @@ func UpdateConfig(s interfaces.StateInterface) error {
if err != nil {
return fmt.Errorf("couldn't render ceph.conf: %w", err)
}
logger.Debugf("updated ceph.conf: %v", conf.GetPath())

// Generate ceph.client.admin.keyring
keyring := newCephKeyring(confPath, "ceph.keyring")
Expand All @@ -242,6 +283,30 @@ func UpdateConfig(s interfaces.StateInterface) error {
return nil
}

// getConfigDb reads every config item from the cluster database within a
// single transaction and returns them as a key/value map. If the transaction
// or the query fails, the error is returned and the map is nil.
func getConfigDb(s interfaces.StateInterface) (map[string]string, error) {
	var items []database.ConfigItem

	txErr := s.ClusterState().Database.Transaction(s.ClusterState().Context, func(ctx context.Context, tx *sql.Tx) error {
		fetched, err := database.GetConfigItems(ctx, tx)
		if err != nil {
			return err
		}
		items = fetched
		return nil
	})
	if txErr != nil {
		return nil, txErr
	}

	// Flatten the rows into a lookup table keyed by config key.
	result := make(map[string]string, len(items))
	for i := range items {
		result[items[i].Key] = items[i].Value
	}
	return result, nil
}

// getMonitorAddresses scans a provided config key/value map and returns a list of mon hosts found.
func getMonitorAddresses(configs map[string]string) []string {
monHosts := []string{}
Expand Down
8 changes: 6 additions & 2 deletions microceph/ceph/start.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package ceph
import (
"context"
"database/sql"
"github.com/canonical/lxd/shared/logger"
"github.com/canonical/microceph/microceph/interfaces"
"reflect"
"time"
Expand All @@ -19,6 +20,7 @@ func Start(s interfaces.StateInterface) error {
for {
// Check that the database is ready.
if !s.ClusterState().Database.IsOpen() {
logger.Debug("start: database not ready, waiting...")
time.Sleep(10 * time.Second)
continue
}
Expand All @@ -39,26 +41,28 @@ func Start(s interfaces.StateInterface) error {
return nil
})
if err != nil {
logger.Warnf("start: failed to fetch monitors, retrying: %v", err)
time.Sleep(10 * time.Second)
continue
}

// Compare to the previous list.
if reflect.DeepEqual(oldMonitors, monitors) {
logger.Debugf("start: monitors unchanged, sleeping: %v", monitors)
time.Sleep(time.Minute)
continue
}

err = UpdateConfig(s)
if err != nil {
logger.Errorf("start: failed to update config, retrying: %v", err)
time.Sleep(10 * time.Second)
continue
}

logger.Debug("start: updated config, sleeping")
oldMonitors = monitors
time.Sleep(time.Minute)
}

}()

return nil
Expand Down
22 changes: 22 additions & 0 deletions tests/scripts/actionutils.sh
Original file line number Diff line number Diff line change
Expand Up @@ -388,6 +388,28 @@ function test_migration() {
return -1
}

# test_ceph_conf verifies the rendered ceph.conf on every LXD container
# reported by `lxc ls`. For each node it checks that:
#   1. the configured "run dir" lives under the current snap data dir, and
#   2. the file contains a public_network setting.
# The checks are executed inside each container via `lxc exec`, so the
# container's own ceph.conf is inspected rather than the host's.
# On the first failing node the offending ceph.conf is printed and the
# script exits with a non-zero status.
function test_ceph_conf() {
    local n
    for n in $( lxc ls -c n --format csv ); do
        echo "checking ceph.conf on node $n"
        # Quoted heredoc delimiter: nothing is expanded on the host, the
        # script is interpreted verbatim by sh inside the container.
        lxc exec "$n" -- sh -s <<'EOF' || exit -1
conf=/var/snap/microceph/current/conf/ceph.conf

# Test: configured rundir must be current
current=$( realpath /var/snap/microceph/current )
rundir=$( awk '/run dir/{ print $4 }' "$conf" )
p=$( dirname "$rundir" )
if [ "$p" != "$current" ]; then
    echo "Error: snap data dir $current, configured run dir: $rundir"
    cat "$conf"
    exit 1
fi

# Test: must contain public_network
if ! grep -q public_net "$conf" ; then
    echo "Error: didn't find public_net in ceph.conf"
    cat "$conf"
    exit 1
fi
EOF
    done
}

function headexec() {
local run="${1?missing}"
shift
Expand Down

0 comments on commit 32af31e

Please sign in to comment.