From f8013f066531e4f5d59f7dfbc9b64976a6bc128f Mon Sep 17 00:00:00 2001 From: jamesbeedy Date: Thu, 2 Mar 2023 01:54:06 +0000 Subject: [PATCH 1/3] add ubuntu installer --- config.yaml | 2 +- src/charm.py | 14 +++- src/nvidia_ops_manager.py | 158 ++++++++++++++++++++++++++++++++------ 3 files changed, 147 insertions(+), 27 deletions(-) diff --git a/config.yaml b/config.yaml index a6d22cb..cd71b55 100644 --- a/config.yaml +++ b/config.yaml @@ -3,4 +3,4 @@ options: type: string default: "nvidia-driver-latest-dkms" description: | - Driver package to be installed. \ No newline at end of file + Driver package to be installed (centos only). diff --git a/src/charm.py b/src/charm.py index 266a9a9..bd0f364 100755 --- a/src/charm.py +++ b/src/charm.py @@ -2,7 +2,12 @@ """Nvidia Operator Charm.""" import logging -from nvidia_ops_manager import NvidiaDriverOpsError, NvidiaOpsManager +from nvidia_ops_manager import ( + NvidiaDriverOpsError, + NvidiaOpsManagerCentos, + NvidiaOpsManagerUbuntu, + os_release, +) from ops.charm import CharmBase from ops.main import main from ops.model import ActiveStatus, BlockedStatus, WaitingStatus @@ -17,9 +22,10 @@ def __init__(self, *args): """Initialize the charm.""" super().__init__(*args) - driver_package = self.config.get("driver-package") - - self._nvidia_ops_manager = NvidiaOpsManager(driver_package) + if os_release()["ID"] == "ubuntu": + self._nvidia_ops_manager = NvidiaOpsManagerUbuntu() + else: + self._nvidia_ops_manager = NvidiaOpsManagerCentos(self.config.get("driver-package")) event_handler_bindings = { self.on.install: self._on_install, diff --git a/src/nvidia_ops_manager.py b/src/nvidia_ops_manager.py index 01d73e9..850e70f 100644 --- a/src/nvidia_ops_manager.py +++ b/src/nvidia_ops_manager.py @@ -1,11 +1,19 @@ #!/usr/bin/env python3 """Nvidia driver install, remove and return version.""" -import requests - +import tempfile from pathlib import Path from subprocess import CalledProcessError, check_output, run from typing import List 
+import requests + + +def os_release(): + """Return /etc/os-release as a dict.""" + os_release_data = Path("/etc/os-release").read_text() + os_release_list = [item.split("=") for item in os_release_data.strip().split("\n")] + return {k: v for k, v in os_release_list} + class NvidiaDriverOpsError(Exception): """Error raised for nvidia driver installation errors.""" @@ -15,8 +23,132 @@ def __init__(self, message: str): super().__init__(self.message) -class NvidiaOpsManager: - """NvidiaOpsManager.""" +class NvidiaOpsManagerBase: + """NvidiaOpsManagerBase.""" + + def __init__(self): + pass + + @property + def _arch(self) -> str: + """Return the system architecture.""" + try: + arch = check_output(["/bin/arch"]) + except CalledProcessError: + raise NvidiaDriverOpsError("Error detecting system architecture.") + return arch.decode().strip() + + @property + def _uname_r(self) -> str: + """Return the kernel version.""" + try: + kernel_version = check_output(["/usr/bin/uname", "-r"]) + except CalledProcessError: + raise NvidiaDriverOpsError("Error detecting kernel version.") + return kernel_version.decode().strip() + + def install(self) -> None: + """Install nvidia-drivers here.""" + raise Exception("Inheriting object needs to define this method.") + + def remove(self) -> None: + """Remove nvidia-drivers here.""" + raise Exception("Inheriting object needs to define this method.") + + def version(self) -> None: + """Return the cuda-drivers version.""" + raise Exception("Inheriting object needs to define this method.") + + +class NvidiaOpsManagerUbuntu(NvidiaOpsManagerBase): + """NvidiaOpsManager for Ubuntu.""" + + OS_RELEASE = os_release() + + def __init__(self): + self._id = self.OS_RELEASE["ID"] + self._os = self.OS_RELEASE["VERSION_ID"].strip(".") + self._distribution = f"{self._id}{self._os}" + self._cuda_keyring_url = ( + "https://developer.download.nvidia.com/compute/cuda/" + f"repos/{self._distribuition}/{self._arch}/cuda-keyring_1.0-1_all.deb" + ) + 
self._cuda_sources_list = Path( + f"/etc/apt/sources.list.d/cuda-{self._distribution}-{self._arch}.list" + ) + + def _install_kernel_headers(self) -> None: + """Install the kernel headers.""" + try: + run(["apt-get", "install", "-y", f"linux-headers-{self._uname_r}"]) + except CalledProcessError: + raise NvidiaDriverOpsError("Error installing kernel headers.") + + def _install_cuda_keyring(self) -> None: + """Install the cuda keyring .deb.""" + try: + r = requests.get(self._cuda_keyring_url) + except requests.exceptions.HTTPError: + raise NvidiaDriverOpsError( + f"Error downloading cuda keyring from {self._cuda_keyring_url}" + ) + + with tempfile.TemporaryDirectory() as tmpdir: + cuda_keyring_deb = f"{tmpdir}/cuda_keyring.deb" + Path(cuda_keyring_deb).write_bytes(r.content) + try: + run(["dpkg", "-i", cuda_keyring_deb]) + except CalledProcessError: + raise NvidiaDriverOpsError("Error installing cuda keyring .deb.") + try: + run(["apt-get", "update"]) + except CalledProcessError: + raise NvidiaDriverOpsError("Error running `apt-get update`.") + + def _install_cuda_drivers(self) -> None: + """Install the cuda drivers.""" + try: + run(["apt-get", "install", "-y", "cuda-drivers"]) + except CalledProcessError: + raise NvidiaDriverOpsError("Error installing cuda drivers.") + + def install(self) -> None: + """Install Nvidia drivers on Ubuntu.""" + self._install_kernel_headers() + self._install_cuda_keyring() + self._install_cuda_drivers() + + def remove(self) -> None: + """Remove cuda drivers from the os.""" + try: + run(["apt-get", "-y", "remove", "--purge", "cuda-drivers"]) + except CalledProcessError: + raise NvidiaDriverOpsError("Error removing cuda-drivers.") + + self._cuda_sources_list.unlink() + + try: + run(["apt-get", "update"]) + except CalledProcessError: + raise NvidiaDriverOpsError("Error running `apt-get update`.") + + def version(self) -> str: + """Return the cuda-drivers package version.""" + try: + p = check_output(["apt-cache", "policy", "cuda-drivers"]) 
+ except CalledProcessError: + raise NvidiaDriverOpsError("Error running `apt-cache policy cuda-drivers.") + + for line in p.decode().strip().split("\n"): + if "Installed" in line: + version = line.split("Installed: ")[1] + else: + raise NvidiaDriverOpsError("Error locating cuda-drivers package version.") + return version + + +class NvidiaOpsManagerCentos(NvidiaOpsManagerBase): + """NvidiaOpsManager for Centos7.""" def __init__(self, driver_package): """Initialize class level variables.""" @@ -42,24 +174,6 @@ def __init__(self, driver_package): self.NVIDIA_DRIVER_PACKAGE = driver_package self.NVIDIA_DRIVER_REPO_FILEPATH = Path("/etc/yum.repos.d/cuda-rhel7.repo") - @property - def _arch(self) -> str: - """Return the system architecture.""" - try: - arch = check_output(["/bin/arch"]) - except CalledProcessError: - raise NvidiaDriverOpsError("Error detecting system architecture.") - return arch.decode().strip() - - @property - def _uname_r(self) -> str: - """Return the kernel version.""" - try: - kernel_version = check_output(["/usr/bin/uname", "-r"]) - except CalledProcessError: - raise NvidiaDriverOpsError("Error detecting kernel version.") - return kernel_version.decode().strip() - @property def _nvidia_developer_repo(self) -> str: """Generate and return the Nvidia developer repo url.""" From 5b1343d5389737d9ca371d55385c8fe95d1b1105 Mon Sep 17 00:00:00 2001 From: jamesbeedy Date: Thu, 2 Mar 2023 02:13:14 +0000 Subject: [PATCH 2/3] fix typo --- charmcraft.yaml | 6 ++++++ metadata.yaml | 2 ++ src/nvidia_ops_manager.py | 2 +- 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/charmcraft.yaml b/charmcraft.yaml index 55815ff..ca20899 100644 --- a/charmcraft.yaml +++ b/charmcraft.yaml @@ -4,6 +4,12 @@ bases: - name: ubuntu channel: "22.04" run-on: + - name: ubuntu + channel: "20.04" + architectures: [amd64] + - name: ubuntu + channel: "22.04" + architectures: [amd64] - name: centos channel: "7" architectures: [amd64] diff --git a/metadata.yaml b/metadata.yaml 
index 99024c0..2d63edd 100644 --- a/metadata.yaml +++ b/metadata.yaml @@ -20,6 +20,8 @@ tags: - nvidia series: + - jammy + - focal - centos7 requires: diff --git a/src/nvidia_ops_manager.py b/src/nvidia_ops_manager.py index 850e70f..9603a06 100644 --- a/src/nvidia_ops_manager.py +++ b/src/nvidia_ops_manager.py @@ -71,7 +71,7 @@ def __init__(self): self._distribution = f"{self._id}{self._os}" self._cuda_keyring_url = ( "https://developer.download.nvidia.com/compute/cuda/" - f"repos/{self._distribuition}/{self._arch}/cuda-keyring_1.0-1_all.deb" + f"repos/{self._distribution}/{self._arch}/cuda-keyring_1.0-1_all.deb" ) self._cuda_sources_list = Path( f"/etc/apt/sources.list.d/cuda-{self._distribution}-{self._arch}.list" From e7c846e9f620c338f10a847f36e3fb948076b940 Mon Sep 17 00:00:00 2001 From: jamesbeedy Date: Thu, 2 Mar 2023 03:09:13 +0000 Subject: [PATCH 3/3] fix typos and add github publish to charmhub Add .github/ with issue and PR templates and publish to charmhub ci workflows. --- .github/ISSUE_TEMPLATE/bug_report.yaml | 57 +++++++++++++++++++ .../ISSUE_TEMPLATE/enhancement_proposal.yaml | 17 ++++++ .github/PULL_REQUEST_TEMPLATE.md | 23 ++++++++ .github/workflows/ci.yaml | 41 +++++++++++++ .github/workflows/release.yaml | 44 ++++++++++++++ src/nvidia_ops_manager.py | 21 +++---- 6 files changed, 193 insertions(+), 10 deletions(-) create mode 100644 .github/ISSUE_TEMPLATE/bug_report.yaml create mode 100644 .github/ISSUE_TEMPLATE/enhancement_proposal.yaml create mode 100644 .github/PULL_REQUEST_TEMPLATE.md create mode 100644 .github/workflows/ci.yaml create mode 100644 .github/workflows/release.yaml diff --git a/.github/ISSUE_TEMPLATE/bug_report.yaml b/.github/ISSUE_TEMPLATE/bug_report.yaml new file mode 100644 index 0000000..be847d5 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.yaml @@ -0,0 +1,57 @@ +name: Bug Report +description: File a bug report +labels: ["Type: Bug", "Status: Triage"] +body: + - type: markdown + attributes: + value: > + Thanks for 
taking the time to fill out this bug report! Before submitting your issue, please make + sure you are using the latest version of the charm. If not, please switch to the latest version of this charm + before posting your report to make sure it's not already solved. + - type: textarea + id: bug-description + attributes: + label: Bug Description + description: > + Provide a description of the issue you are facing. If applicable, add screenshots to help explain the problem. + validations: + required: true + - type: textarea + id: reproduction + attributes: + label: To Reproduce + description: > + Please provide step-by-step instructions on how to reproduce the behavior. + placeholder: | + 1. `juju deploy ...` + 2. `juju relate ...` + 3. `juju status --relations` + validations: + required: true + - type: textarea + id: environment + attributes: + label: Environment + description: > + We need to know a bit more about the context in which you run the charm. + - Are you running Juju locally, on lxd, in multipass or on some other platform? + - What track and channel you deployed the charm from (e.g. `latest/edge` or similar). + - Version of any applicable components, like the juju snap, the model controller, lxd, microk8s, and/or multipass. + validations: + required: true + - type: textarea + id: logs + attributes: + label: Relevant log output + description: > + Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks. + Fetch the logs using `juju debug-log --replay`. Additional details on how to retrieve logs are available in the juju + documentation at https://juju.is/docs/olm/juju-logs. 
+ render: shell + validations: + required: true + - type: textarea + id: additional-context + attributes: + label: Additional context + diff --git a/.github/ISSUE_TEMPLATE/enhancement_proposal.yaml b/.github/ISSUE_TEMPLATE/enhancement_proposal.yaml new file mode 100644 index 0000000..9587b38 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/enhancement_proposal.yaml @@ -0,0 +1,17 @@ +name: Enhancement Proposal +description: File an enhancement proposal +labels: ["Type: Enhancement", "Status: Triage"] +body: + - type: markdown + attributes: + value: > + Thank you for taking the time to fill out this enhancement proposal! Before submitting your proposal, please + make sure there isn't a pre-existing similar proposal. If there is, please join that discussion instead. + - type: textarea + id: enhancement-proposal + attributes: + label: Enhancement Proposal + description: > + Describe the enhancement you would like to see in as much detail as needed. + validations: + required: true diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 0000000..fbc730e --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,23 @@ +## Description + +> Provide a description of the purpose of this pull request, as well as its +motivation and context. Is it a new feature? A bug fix? Does it address an existing issue? + +## How was the code tested? + +> Describe the conditions under which the code has been tested. +> * Did you run the defined integration and unit tests under `tests/`? +> * Did you write new tests? Where are they located in the repository? +> * Which undercloud did you use to perform the tests? LXD, vSphere, AWS, etc. +> * What operating system did you test the charms on? Ubuntu 22.04, Ubuntu 20.04, CentOS 7, etc. + +## Related issues and/or tasks + +> Link any related issues or project board tasks to this pull request. + +## Checklist + +- [ ] I am the author of these changes, or I have the rights to submit them. 
+- [ ] I have added the relevant changes to the README and/or documentation. +- [ ] I have self reviewed my own code. +- [ ] All requested changes and/or review comments have been resolved. diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml new file mode 100644 index 0000000..9823807 --- /dev/null +++ b/.github/workflows/ci.yaml @@ -0,0 +1,41 @@ +# Copyright 2023 Canonical Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: nvidia charm tests +on: + workflow_call: + pull_request: + +jobs: + inclusive-naming-check: + name: Inclusive naming check + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v3 + - name: Run tests + uses: get-woke/woke-action@v0 + with: + fail-on-error: true + + lint: + name: Lint + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v3 + - name: Install dependencies + run: python3 -m pip install tox + - name: Run linters + run: tox -e lint diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml new file mode 100644 index 0000000..828124a --- /dev/null +++ b/.github/workflows/release.yaml @@ -0,0 +1,44 @@ +# Copyright 2023 Canonical Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: Release to latest/edge + +on: + push: + branches: + - main + +jobs: + ci-tests: + uses: ./.github/workflows/ci.yaml + + release-to-charmhub: + name: Release to CharmHub + needs: + - ci-tests + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v3 + - name: Create version file + run: git describe --tags --always --dirty > version + - name: Select charmhub channel + uses: canonical/charming-actions/channel@2.2.0 + id: channel + - name: Upload charm to charmhub + uses: canonical/charming-actions/upload-charm@2.2.0 + with: + credentials: "${{ secrets.CHARMCRAFT_AUTH }}" + github-token: "${{ secrets.GITHUB_TOKEN }}" + channel: "${{ steps.channel.outputs.name }}" diff --git a/src/nvidia_ops_manager.py b/src/nvidia_ops_manager.py index 9603a06..4290af8 100644 --- a/src/nvidia_ops_manager.py +++ b/src/nvidia_ops_manager.py @@ -12,7 +12,7 @@ def os_release(): """Return /etc/os-release as a dict.""" os_release_data = Path("/etc/os-release").read_text() os_release_list = [item.split("=") for item in os_release_data.strip().split("\n")] - return {k: v for k, v in os_release_list} + return {k: v.strip('"') for k, v in os_release_list} class NvidiaDriverOpsError(Exception): @@ -67,8 +67,8 @@ class NvidiaOpsManagerUbuntu(NvidiaOpsManagerBase): def __init__(self): self._id = self.OS_RELEASE["ID"] - self._os = self.OS_RELEASE["VERSION_ID"].strip(".") - self._distribution = f"{self._id}{self._os}" + self._version_id = self.OS_RELEASE["VERSION_ID"].replace(".", "") + self._distribution = f"{self._id}{self._version_id}" self._cuda_keyring_url 
= ( "https://developer.download.nvidia.com/compute/cuda/" f"repos/{self._distribution}/{self._arch}/cuda-keyring_1.0-1_all.deb" @@ -94,16 +94,16 @@ def _install_cuda_keyring(self) -> None: ) with tempfile.TemporaryDirectory() as tmpdir: - cuda_keyring_deb = f"{tmpdir}/cuda_keyring.deb" + cuda_keyring_deb = f"{tmpdir}/cuda-keyring.deb" Path(cuda_keyring_deb).write_bytes(r.content) try: run(["dpkg", "-i", cuda_keyring_deb]) except CalledProcessError: raise NvidiaDriverOpsError("Error installing cuda keyring .deb.") - try: - run(["apt-get", "update"]) - except CalledProcessError: - raise NvidiaDriverOpsError("Error running `apt-get update`.") + try: + run(["apt-get", "update"]) + except CalledProcessError: + raise NvidiaDriverOpsError("Error running `apt-get update`.") def _install_cuda_drivers(self) -> None: """Install the cuda drivers.""" @@ -139,11 +139,12 @@ def version(self) -> str: except CalledProcessError: raise NvidiaDriverOpsError("Error running `apt-cache policy cuda-drivers.") + version = "" for line in p.decode().strip().split("\n"): if "Installed" in line: version = line.split("Installed: ")[1] - else: - raise NvidiaDriverOpsError("Error locating cuda-drivers package version.") + if not version: + raise NvidiaDriverOpsError("Error locating cuda-drivers package version.") return version