From ed68222e2e68540aedce553b0532690c4dcbd6d3 Mon Sep 17 00:00:00 2001 From: Jacob Callahan Date: Tue, 19 Nov 2024 16:21:37 -0500 Subject: [PATCH] Add initial attempts to automatically clean up dangling hosts Now, when workflows fail, Broker will attempt to find a dangling host and check it in if found. --- broker/providers/ansible_tower.py | 32 +++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/broker/providers/ansible_tower.py b/broker/providers/ansible_tower.py index 6684a2e..056c0ee 100644 --- a/broker/providers/ansible_tower.py +++ b/broker/providers/ansible_tower.py @@ -168,7 +168,7 @@ def __init__(self, **kwargs): # Init the class itself config = kwargs.get("config") root = kwargs.get("root") - self._v2, self.username = get_awxkit_and_uname( + self._v2, self.uname = get_awxkit_and_uname( config=config, root=root, url=self.url, @@ -374,6 +374,33 @@ def _get_failure_messages(self, workflow): else: return failure_messages + def _try_get_dangling_hosts(self, failed_workflow): + """Get one or more hosts that may have been left behind by a failed workflow.""" + hosts = [] + for node in failed_workflow.get_related("workflow_nodes").results: + if not (job_fields := node.summary_fields.get("job", {})) or job_fields.get( + "failed" + ): # skip jobs with no summary fields and failed jobs + continue + if jobs := self._v2.jobs.get(id=job_fields["id"]).results: + if vm_name := jobs[0].artifacts.get("vm_name"): + hosts.append(vm_name) + return list(set(hosts)) + + def _try_checkin_dangling_host(self, job): + """Attempt to check in dangling hosts associated with the given job.""" + dangling_hosts = self._try_get_dangling_hosts(job) + if not dangling_hosts: + logger.debug("No dangling hosts found for the failed job.") + return + for dangling_host in dangling_hosts: + logger.info(f"Found dangling host: {dangling_host}. 
Attempting to check in.") + try: + self.release(dangling_host) + logger.debug(f"Successfully checked in dangling host: {dangling_host}") + except exceptions.BrokerError: + logger.warning(f"Failed to check in dangling host: {dangling_host}") + def _compile_host_info(self, host): try: host_facts = host.related.ansible_facts.get() @@ -607,6 +634,7 @@ def execute(self, **kwargs): # noqa: PLR0912,PLR0915 - Possible TODO refactor "URL": job_ui_url, } helpers.emit(message_data) + self._try_checkin_dangling_host(job) raise JobExecutionError(message_data=message_data["Reason(s)"]) if strategy := kwargs.pop("artifacts", None): return self._merge_artifacts(job, strategy=strategy) @@ -614,7 +642,7 @@ def execute(self, **kwargs): # noqa: PLR0912,PLR0915 - Possible TODO refactor def get_inventory(self, user=None): """Compile a list of hosts based on any inventory a user's name is mentioned.""" - user = user or self.username + user = user or self.uname invs = [ inv for inv in self._v2.inventory.get(page_size=200).results