feat: Add core watcher module #397

Closed · wants to merge 4 commits
4 changes: 4 additions & 0 deletions sunbeam-python/pyproject.toml
@@ -98,3 +98,7 @@ min-file-size = 1

[tool.ruff.lint.mccabe]
max-complexity = 15

[[tool.mypy.overrides]]
module = ["watcherclient.*", "timeout_decorator"]

Review comment (Collaborator): timeout_decorator is not needed here.

follow_untyped_imports = true
9 changes: 5 additions & 4 deletions sunbeam-python/sunbeam/core/openstack_api.py
@@ -43,16 +43,17 @@ def get_admin_connection(jhelper: JujuHelper) -> openstack.connection.Connection


 def guests_on_hypervisor(
-    hypervisor_name: str, jhelper: JujuHelper
+    hypervisor_name: str, conn: openstack.connection.Connection
 ) -> list[openstack.compute.v2.server.Server]:
     """Return a list of guests that run on the given hypervisor.
 
     :param hypervisor_name: Name of hypervisor
-    :param jhelper: Juju helpers for retrieving admin credentials
+    :param conn: Admin connection
     :raises: openstack.exceptions.SDKException
     """
-    conn = get_admin_connection(jhelper)
-    return list(conn.compute.servers(all_projects=True, host=hypervisor_name))
+    return list(
+        conn.compute.servers(all_projects=True, hypervisor_hostname=hypervisor_name)
+    )
 
 
 def remove_compute_service(
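
Since guests_on_hypervisor no longer builds its own connection, callers now create the admin connection once and pass it in. A minimal caller sketch, assuming an existing JujuHelper instance and an illustrative hypervisor name (not part of this PR):

from sunbeam.core.juju import JujuHelper
from sunbeam.core.openstack_api import get_admin_connection, guests_on_hypervisor


def list_guest_names(jhelper: JujuHelper, hypervisor_name: str) -> list[str]:
    # Build the admin connection once; it can be reused for further API calls.
    conn = get_admin_connection(jhelper=jhelper)
    # Pass the connection explicitly instead of a JujuHelper.
    servers = guests_on_hypervisor(hypervisor_name=hypervisor_name, conn=conn)
    return [server.name for server in servers]
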
213 changes: 213 additions & 0 deletions sunbeam-python/sunbeam/core/watcher.py
@@ -0,0 +1,213 @@
# Copyright (c) 2024 Canonical Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
from typing import Any

import tenacity
from watcherclient import v1 as watcher
from watcherclient.common.apiclient.exceptions import NotFound
from watcherclient.v1 import client as watcher_client

from sunbeam.core.common import SunbeamException, read_config
from sunbeam.core.deployment import Deployment
from sunbeam.core.juju import JujuHelper
from sunbeam.core.openstack_api import get_admin_connection
from sunbeam.steps.openstack import REGION_CONFIG_KEY

LOG = logging.getLogger(__name__)

# Timeout in seconds while waiting for the watcher resource to reach the target state.
WAIT_TIMEOUT = 60 * 3
# Sleep interval (in seconds) between querying watcher resources.
WAIT_SLEEP_INTERVAL = 5
ENABLE_MAINTENANCE_AUDIT_TEMPLATE_NAME = "Sunbeam Cluster Maintaining Template"
ENABLE_MAINTENANCE_STRATEGY_NAME = "host_maintenance"
ENABLE_MAINTENANCE_GOAL_NAME = "cluster_maintaining"

WORKLOAD_BALANCING_GOAL_NAME = "workload_balancing"
WORKLOAD_BALANCING_STRATEGY_NAME = "workload_stabilization"
WORKLOAD_BALANCING_AUDIT_TEMPLATE_NAME = "Sunbeam Cluster Workload Balancing Template"


def get_watcher_client(deployment: Deployment) -> watcher_client.Client:
region = read_config(deployment.get_client(), REGION_CONFIG_KEY)["region"]
conn = get_admin_connection(
jhelper=JujuHelper(deployment.get_connected_controller())
)

watcher_endpoint = conn.session.get_endpoint(
service_type="infra-optim",
region_name=region,
)
return watcher_client.Client(session=conn.session, endpoint=watcher_endpoint)


def _create_host_maintenance_audit_template(
client: watcher_client.Client,
) -> watcher.AuditTemplate:
template = client.audit_template.create(
name=ENABLE_MAINTENANCE_AUDIT_TEMPLATE_NAME,
description="Audit template for cluster maintaining",
goal=ENABLE_MAINTENANCE_GOAL_NAME,
strategy=ENABLE_MAINTENANCE_STRATEGY_NAME,
)
return template


def _create_workload_balancing_audit_template(
client: watcher_client.Client,
) -> watcher.AuditTemplate:
template = client.audit_template.create(
name=WORKLOAD_BALANCING_AUDIT_TEMPLATE_NAME,
description="Audit template for workload balancing",
goal=WORKLOAD_BALANCING_GOAL_NAME,
strategy=WORKLOAD_BALANCING_STRATEGY_NAME,
)
return template


def get_enable_maintenance_audit_template(
client: watcher_client.Client,
) -> watcher.AuditTemplate:
try:
template = client.audit_template.get(ENABLE_MAINTENANCE_AUDIT_TEMPLATE_NAME)
except NotFound:
template = _create_host_maintenance_audit_template(client=client)
return template


def get_workload_balancing_audit_template(
client: watcher_client.Client,
) -> watcher.AuditTemplate:
try:
template = client.audit_template.get(WORKLOAD_BALANCING_AUDIT_TEMPLATE_NAME)
except NotFound:
template = _create_workload_balancing_audit_template(client=client)
return template


@tenacity.retry(
reraise=True,
stop=tenacity.stop_after_delay(WAIT_TIMEOUT),

Review comment (Collaborator): As discussed, WAIT_TIMEOUT should adjust based on the number of actions the plan has to perform; otherwise the user might end up running the sunbeam maintenance command multiple times when there are more instances to migrate.

wait=tenacity.wait_fixed(WAIT_SLEEP_INTERVAL),
)
def _wait_resource_in_target_state(
client: watcher_client.Client,
resource_name: str,
resource_uuid: str,
states: list[str] = ["SUCCEEDED", "FAILED"],
) -> watcher.Audit:
src = getattr(client, resource_name).get(resource_uuid)
if src.state not in states:
raise SunbeamException(f"{resource_name} {resource_uuid} not in target state")
return src
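
On the review note above about WAIT_TIMEOUT: a minimal sketch of one way to scale the deadline with the amount of pending work, reusing the module's constants and assuming a hypothetical PER_ACTION_TIMEOUT and a caller that knows the expected number of actions (a sketch, not part of this PR):

# Assumed per-action allowance, in seconds (illustrative value).
PER_ACTION_TIMEOUT = 60


def _wait_resource_with_scaled_timeout(
    client: watcher_client.Client,
    resource_name: str,
    resource_uuid: str,
    expected_actions: int,
    states: tuple[str, ...] = ("SUCCEEDED", "FAILED"),
) -> watcher.Audit:
    # Grow the overall deadline with the number of actions to execute.
    timeout = WAIT_TIMEOUT + expected_actions * PER_ACTION_TIMEOUT
    for attempt in tenacity.Retrying(
        reraise=True,
        stop=tenacity.stop_after_delay(timeout),
        wait=tenacity.wait_fixed(WAIT_SLEEP_INTERVAL),
    ):
        with attempt:
            src = getattr(client, resource_name).get(resource_uuid)
            if src.state not in states:
                raise SunbeamException(
                    f"{resource_name} {resource_uuid} not in target state"
                )
            return src
    # Unreachable in practice (reraise=True re-raises on exhaustion); keeps the
    # return type total.
    raise SunbeamException(f"{resource_name} {resource_uuid} not in target state")
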


def create_audit(
client: watcher_client.Client,
template: watcher.AuditTemplate,
audit_type: str = "ONESHOT",
parameters: dict[str, Any] = {},
) -> watcher.Audit:
audit = client.audit.create(
audit_template_uuid=template.uuid,
audit_type=audit_type,
parameters=parameters,
)
audit_details = _wait_resource_in_target_state(
client=client,
resource_name="audit",
resource_uuid=audit.uuid,
)

    if audit_details.state == "SUCCEEDED":
        LOG.debug(f"Created Watcher audit {audit.uuid} successfully")
    else:
        LOG.debug(f"Creating Watcher audit {audit.uuid} failed")
        raise SunbeamException(
            f"Failed to create Watcher audit from template: {template.name}"
        )

_check_audit_plans_recommended(client=client, audit=audit)
return audit


def _check_audit_plans_recommended(client: watcher_client.Client, audit: watcher.Audit):
action_plans = client.action_plan.list(audit=audit.uuid)
    # Verify that every action plan's state is RECOMMENDED.
    # If no action was generated, the action plan state will be
    # SUCCEEDED from the beginning.
    if not all(plan.state in ["RECOMMENDED", "SUCCEEDED"] for plan in action_plans):
        raise SunbeamException(
            f"Not all action plans for audit {audit.uuid} are RECOMMENDED"
        )


def get_actions(
client: watcher_client.Client, audit: watcher.Audit
) -> list[watcher.Action]:
"""Get list of actions by audit."""
return client.action.list(audit=audit.uuid, detail=True)


def exec_audit(client: watcher_client.Client, audit: watcher.Audit):
"""Run audit's action plans."""
action_plans = client.action_plan.list(audit=audit.uuid)
for action_plan in action_plans:
_exec_plan(client=client, action_plan=action_plan)
LOG.info(f"All Action plan for Audit {audit.uuid} execution successfully")


def _exec_plan(client: watcher_client.Client, action_plan: watcher.ActionPlan):
"""Run action plan."""
if action_plan.state == "SUCCEEDED":
LOG.debug(f"action plan {action_plan.uuid} state is SUCCEEDED, skip execution")
return
client.action_plan.start(action_plan_id=action_plan.uuid)

action_plan_details = _wait_resource_in_target_state(
client=client,
resource_name="action_plan",
resource_uuid=action_plan.uuid,
)

if action_plan_details.state == "SUCCEEDED":
LOG.debug(f"Action plan {action_plan.uuid} execution successfully")
else:
LOG.debug(f"Action plan {action_plan.uuid} execution failed")

# Even if an action fails, the action plan can still be in the SUCCEEDED state.
# To handle this, we check if there are any failed actions at this point.
_raise_on_failed_action(client=client, action_plan=action_plan)


def _raise_on_failed_action(
client: watcher_client.Client, action_plan: watcher.ActionPlan
):
"""Raise exception on failed action."""
actions = client.action.list(action_plan=action_plan.uuid, detail=True)
info = {}
for action in actions:
if not action.state == "FAILED":
continue
info[action.uuid] = {
"action": action.action_type,
"updated-at": action.updated_at,
"description": action.description,
"input_parameters": action.input_parameters,
}
if len(info) > 0:
raise SunbeamException(f"Actions in FAILED state. {info}")
1 change: 1 addition & 0 deletions sunbeam-python/test-requirements.txt
@@ -29,3 +29,4 @@ croniter

ruff
mypy
python-watcherclient
8 changes: 5 additions & 3 deletions sunbeam-python/tests/unit/sunbeam/core/test_openstack_api.py
@@ -75,12 +75,14 @@ def test_get_admin_connection(self, retrieve_admin_credentials, os_connect):
         project_domain_name=FAKE_CREDS.get("OS_PROJECT_DOMAIN_NAME"),
     )
 
-    def test_guests_on_hypervisor(self, get_admin_connection):
+    def test_guests_on_hypervisor(self):
         conn = Mock()
-        get_admin_connection.return_value = conn
         conn.compute.servers.return_value = [1]
-        assert sunbeam.core.openstack_api.guests_on_hypervisor("hyper1", None) == [1]
-        conn.compute.servers.assert_called_once_with(all_projects=True, host="hyper1")
+        assert sunbeam.core.openstack_api.guests_on_hypervisor("hyper1", conn) == [1]
+        conn.compute.servers.assert_called_once_with(
+            all_projects=True, hypervisor_hostname="hyper1"
+        )
 
     def test_remove_compute_service(self):
         service1 = Mock(binary="nova-compute", host="hyper1")