Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add option --ignore-pools to exclude vms in pools from backup c… #60

Merged
merged 1 commit into from
Jun 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 5 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -102,9 +102,9 @@ The ``icinga2`` folder contains the command definition and service examples for

```
usage: check_pve.py [-h] -e API_ENDPOINT [--api-port API_PORT] -u API_USER (-p API_PASSWORD | -t API_TOKEN) [-k] -m
{cluster,version,cpu,memory,swap,storage,io_wait,io-wait,updates,services,subscription,vm,vm_status,vm-status,replication,disk-health,ceph-health,zfs-health,zfs-fragmentation,backup} [-n NODE] [--name NAME]
[--vmid VMID] [--expected-vm-status {running,stopped,paused}] [--ignore-vm-status] [--ignore-service NAME] [--ignore-disk NAME] [-w THRESHOLD_WARNING] [-c THRESHOLD_CRITICAL] [-M] [-V MIN_VERSION]
[--unit {GB,MB,KB,GiB,MiB,KiB,B}]
{cluster,version,cpu,memory,swap,storage,io_wait,io-wait,updates,services,subscription,vm,vm_status,vm-status,replication,disk-health,ceph-health,zfs-health,zfs-fragmentation,backup}
[-n NODE] [--name NAME] [--vmid VMID] [--expected-vm-status {running,stopped,paused}] [--ignore-vm-status] [--ignore-service NAME] [--ignore-disk NAME] [--ignore-pools NAME]
[-w THRESHOLD_WARNING] [-c THRESHOLD_CRITICAL] [-M] [-V MIN_VERSION] [--unit {GB,MB,KB,GiB,MiB,KiB,B}]

Check command for PVE hosts via API

Expand Down Expand Up @@ -135,6 +135,7 @@ Check Options:
--ignore-service NAME
Ignore service NAME in checks
--ignore-disk NAME Ignore disk NAME in health check
--ignore-pools NAME Ignore vms and containers in pool(s) NAME in checks
-w THRESHOLD_WARNING, --warning THRESHOLD_WARNING
Warning threshold for check value. Multiple thresholds with name:value,name:value
-c THRESHOLD_CRITICAL, --critical THRESHOLD_CRITICAL
Expand Down Expand Up @@ -258,7 +259,7 @@ WARNING - Ceph Cluster is in warning state

**Check ZFS pool health**
```
./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m zfs-health -n pve
./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m zfs-health -n pve
OK - All ZFS pools are healthy
```

Expand Down
68 changes: 61 additions & 7 deletions check_pve.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@

import re
import sys
from typing import Callable, Dict, Optional, Union
from typing import Callable, Dict, Optional, Union, List

try:
import argparse
Expand Down Expand Up @@ -126,6 +126,16 @@ def threshold_type(arg: str) -> Dict[str, "CheckThreshold"]:
return thresholds


class RequestError(Exception):
    """Error type for failures related to a request.

    The error text is kept in ``message`` and the accompanying numeric
    code supplied by the raiser is kept in ``rc``.
    """

    def __init__(self, message: str, rc: int) -> None:
        # Hand the text to the base class so ``str(exc)`` yields the message.
        super().__init__(message)

        self.message = message
        self.rc = rc


class CheckPVE:
"""Check command for Proxmox VE."""

Expand Down Expand Up @@ -209,6 +219,9 @@ def request(self, url: str, method: str = "get", **kwargs: Dict) -> Union[Dict,
else:
message += f"HTTP error code was {response.status_code}"

if kwargs.get("raise_error", False):
raise RequestError(message, response.status_code)

self.output(CheckState.UNKNOWN, message)

def get_ticket(self) -> str:
Expand Down Expand Up @@ -664,6 +677,26 @@ def check_version(self) -> None:
f"Your PVE instance version '{data['version']}' ({data['repoid']}) is up to date"
)

def _get_pool_members(self, pool: str) -> List[int]:
    """Get a list of vmids, which are members of a given resource pool.

    Pool members that carry no ``vmid`` key (a pool may also contain
    non-guest members such as storages) are skipped instead of raising
    a ``KeyError``.

    NOTE: The request needs the Pool.Audit permission!

    :param pool: name of the resource pool to query
    :return: vmids of the guests in the pool; an empty list if the
        request failed (a hint is printed in that case)
    """
    try:
        url = self.get_url(f"pools/{pool}")
        pool_data = self.request(url, raise_error=True)
    except RequestError:
        # Best effort: a pool that cannot be resolved ignores nothing.
        print(
            f"Unable to fetch members of pool '{pool}'. "
            "Check if the name is correct and the role has the 'Pool.Audit' permission"
        )
        return []

    # Use a distinct loop name so the ``pool`` parameter is never shadowed.
    return [member["vmid"] for member in pool_data.get("members", []) if "vmid" in member]

def check_vzdump_backup(self, name: Optional[str] = None) -> None:
"""Check for failed vzdump backup jobs."""
tasks_url = self.get_url("cluster/tasks")
Expand Down Expand Up @@ -696,13 +729,25 @@ def check_vzdump_backup(self, name: Optional[str] = None) -> None:

nbu_url = self.get_url("cluster/backup-info/not-backed-up")
not_backed_up = self.request(nbu_url)

if len(not_backed_up) > 0:
guest_ids = " ".join([str(guest["vmid"]) for guest in not_backed_up])
if self.check_result not in [CheckState.CRITICAL, CheckState.UNKNOWN]:
self.check_result = CheckState.WARNING
self.check_message += (
f"\nThere are guests not covered by any backup schedule: {guest_ids}"
)
guest_ids = []

for guest in not_backed_up:
guest_ids.append(str(guest["vmid"]))

ignored_vmids = []
for pool in self.options.ignore_pools:
ignored_vmids += map(str, self._get_pool_members(pool))

remaining_not_backed_up = sorted(list(set(guest_ids) - set(ignored_vmids)))
if len(remaining_not_backed_up) > 0:
if self.check_result not in [CheckState.CRITICAL, CheckState.UNKNOWN]:
self.check_result = CheckState.WARNING
self.check_message += (
"\nThere are unignored guests not covered by any backup schedule: "
+ ", ".join(remaining_not_backed_up)
)

def check_memory(self) -> None:
"""Check memory usage of Proxmox VE node."""
Expand Down Expand Up @@ -999,6 +1044,15 @@ def parse_args(self) -> None:
default=[],
)

check_opts.add_argument(
"--ignore-pools",
dest="ignore_pools",
action="append",
metavar="NAME",
help="Ignore vms and containers in pool(s) NAME in checks",
default=[],
)

check_opts.add_argument(
"-w",
"--warning",
Expand Down