From fd7458f88bdca051135872068f84e531537a3ad8 Mon Sep 17 00:00:00 2001 From: Arne Welzel Date: Thu, 28 Sep 2023 14:22:23 +0200 Subject: [PATCH] Add and populate machines table. This should help to track changes to the EC2 instance. This also moves over to use SQLAlchemy in storage.py rather than plain sqlite3. --- ...928_1204-311be9937d3e_add_machine_table.py | 42 ++++++++++ ci-based/docker-compose.yml | 5 +- ci-based/tests/test_app.py | 64 +++++++++------ ci-based/tests/test_storage.py | 68 ++++++++++++++++ ci-based/zeek_benchmarker/app.py | 7 ++ ci-based/zeek_benchmarker/machine.py | 79 +++++++++++++++++++ ci-based/zeek_benchmarker/models.py | 51 ++++++++++++ ci-based/zeek_benchmarker/storage.py | 75 ++++++++++++++++-- ci-based/zeek_benchmarker/testing.py | 3 + 9 files changed, 362 insertions(+), 32 deletions(-) create mode 100644 ci-based/alembic/versions/20230928_1204-311be9937d3e_add_machine_table.py create mode 100644 ci-based/zeek_benchmarker/machine.py create mode 100644 ci-based/zeek_benchmarker/models.py diff --git a/ci-based/alembic/versions/20230928_1204-311be9937d3e_add_machine_table.py b/ci-based/alembic/versions/20230928_1204-311be9937d3e_add_machine_table.py new file mode 100644 index 0000000..62813ad --- /dev/null +++ b/ci-based/alembic/versions/20230928_1204-311be9937d3e_add_machine_table.py @@ -0,0 +1,42 @@ +"""add machine table + +Revision ID: 311be9937d3e +Revises: 1c1d18482b62 +Create Date: 2023-09-28 12:04:49.205816 + +""" +from collections.abc import Sequence + +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. +revision: str = "311be9937d3e" +down_revision: str | None = "1c1d18482b62" +branch_labels: str | Sequence[str] | None = None +depends_on: str | Sequence[str] | None = None + + +def upgrade() -> None: + op.create_table( + "machines", + sa.Column("id", sa.Integer, primary_key=True), + sa.Column( + "created_at", sa.DateTime, server_default=sa.text("(STRFTIME('%s'))") + ), + sa.Column("dmi_sys_vendor", sa.Text), + sa.Column("dmi_product_uuid", sa.Text), + sa.Column("dmi_product_serial", sa.Text), + sa.Column("dmi_board_asset_tag", sa.Text), + sa.Column("os", sa.Text), + sa.Column("architecture", sa.Text), + sa.Column("cpu_model", sa.Text), + sa.Column("mem_total_bytes", sa.Integer), + ) + + op.add_column("jobs", sa.Column("machine_id", sa.Integer)) + + +def downgrade() -> None: + op.drop_column("jobs", "machine_id") + op.drop_table("machines") diff --git a/ci-based/docker-compose.yml b/ci-based/docker-compose.yml index 9a8a4ad..371ddd4 100644 --- a/ci-based/docker-compose.yml +++ b/ci-based/docker-compose.yml @@ -34,8 +34,9 @@ services: # also come from the environment. - ./config.yml:/app/config.yml - # Let the API record information about - # jobs in the database. + # Allow the API access to the host machine's machine-id. + - /etc/machine-id:/etc/machine-id + - ./persistent:/app/persistent working_dir: /app entrypoint: diff --git a/ci-based/tests/test_app.py b/ci-based/tests/test_app.py index cf9a841..cc3e32f 100644 --- a/ci-based/tests/test_app.py +++ b/ci-based/tests/test_app.py @@ -1,14 +1,19 @@ import datetime import hmac -import sqlite3 import time import unittest from unittest import mock from zeek_benchmarker.app import create_app, is_valid_branch_name +from zeek_benchmarker.models import Job, Machine from zeek_benchmarker.testing import TestWithDatabase +def test_machine(): + return Machine(dmi_product_uuid="ec2abcdef-1234", os="Linux") + + +@mock.patch("zeek_benchmarker.machine.get_machine", new_callable=lambda: test_machine) @mock.patch("zeek_benchmarker.app.enqueue_job") class TestApi(TestWithDatabase): def setUp(self): @@ -45,7 +50,7 @@ def hmac_digest(self, path, timestamp, build_hash): hmac_msg = f"{path:s}-{timestamp:d}-{build_hash:s}\n".encode() return hmac.new(self._test_hmac_key, hmac_msg, "sha256").hexdigest() - def test_zeek_good(self, enqueue_job_mock): + def test_zeek_good(self, enqueue_job_mock, get_machine_mock): enqueue_job_mock.return_value = self.enqueue_job_result_mock r = self._test_client.post( @@ -67,7 +72,17 @@ def test_zeek_good(self, enqueue_job_mock): self._test_build_hash, enqueue_job_mock.call_args[0][1]["build_hash"] ) - def test_zeek_good__more(self, enqueue_job_mock): + # Query the databse for the stored Job and Machine and + # ensure they are connected. + with self.storage.Session() as session: + jobs = session.query(Job).all() + machine = session.query(Machine).first() + + self.assertEqual(1, len(jobs)) + self.assertEqual("test-job-id", jobs[0].id) + self.assertEqual(machine.id, jobs[0].machine_id) + + def test_zeek_good__more(self, enqueue_job_mock, get_machine_mock): enqueue_job_mock.return_value = self.enqueue_job_result_mock r = self._test_client.post( @@ -98,25 +113,26 @@ def test_zeek_good__more(self, enqueue_job_mock): self._test_build_hash, enqueue_job_mock.call_args[0][1]["build_hash"] ) - with sqlite3.connect(self.database_file.name) as conn: - conn.row_factory = sqlite3.Row - rows = list(conn.execute("select * from jobs")) - self.assertEqual(1, len(rows)) - row = rows[0] - self.assertEqual("zeek", row["kind"]) - self.assertEqual(self._test_build_hash, row["build_hash"]) - self.assertEqual("f572d396fae9206628714fb2ce00f72e94f2258f", row["sha"]) - self.assertEqual("test-branch", row["branch"]) - self.assertEqual("test-owner", row["cirrus_repo_owner"]) - self.assertEqual("test-name", row["cirrus_repo_name"]) - self.assertEqual(123, row["cirrus_task_id"]) - self.assertEqual("test-task-name", row["cirrus_task_name"]) - self.assertEqual(9, row["cirrus_build_id"]) - self.assertEqual(456, row["cirrus_pr"]) - self.assertEqual(789, row["github_check_suite_id"]) - self.assertEqual("6.1.0-dev.123", row["repo_version"]) - - def test_zeek_bad_build_url(self, enqueue_job_mock): + # Query the databse for the stored Job and Machine and + # ensure they are connected. + with self.storage.Session() as session: + jobs = session.query(Job).all() + self.assertEqual(1, len(jobs)) + job = jobs[0] + self.assertEqual("zeek", job.kind) + self.assertEqual(self._test_build_hash, job.build_hash) + self.assertEqual("f572d396fae9206628714fb2ce00f72e94f2258f", job.sha) + self.assertEqual("test-branch", job.branch) + self.assertEqual("test-owner", job.cirrus_repo_owner) + self.assertEqual("test-name", job.cirrus_repo_name) + self.assertEqual(123, job.cirrus_task_id) + self.assertEqual("test-task-name", job.cirrus_task_name) + self.assertEqual(9, job.cirrus_build_id) + self.assertEqual(456, job.cirrus_pr) + self.assertEqual(789, job.github_check_suite_id) + self.assertEqual("6.1.0-dev.123", job.repo_version) + + def test_zeek_bad_build_url(self, enqueue_job_mock, get_machine_mock): enqueue_job_mock.return_value = self.enqueue_job_result_mock r = self._test_client.post( @@ -136,7 +152,7 @@ def test_zeek_bad_build_url(self, enqueue_job_mock): enqueue_job_mock.assert_not_called() - def test_zeek_bad_hmac_digest(self, enqueue_job_mock): + def test_zeek_bad_hmac_digest(self, enqueue_job_mock, get_machine_mock): enqueue_job_mock.return_value = self.enqueue_job_result_mock r = self._test_client.post( @@ -156,7 +172,7 @@ def test_zeek_bad_hmac_digest(self, enqueue_job_mock): enqueue_job_mock.assert_not_called() - def test_zeek_bad_branch(self, enqueue_job_mock): + def test_zeek_bad_branch(self, enqueue_job_mock, get_machine_mock): enqueue_job_mock.return_value = self.enqueue_job_result_mock r = self._test_client.post( diff --git a/ci-based/tests/test_storage.py b/ci-based/tests/test_storage.py index e7b6f78..1f69e68 100644 --- a/ci-based/tests/test_storage.py +++ b/ci-based/tests/test_storage.py @@ -4,6 +4,7 @@ import sqlite3 from zeek_benchmarker import storage, testing +from zeek_benchmarker.models import Machine from zeek_benchmarker.tasks import ZeekJob, ZeekTest, ZeekTestResult @@ -23,6 +24,58 @@ def setUp(self): self.store = storage.Storage(self.database_file.name) + def make_test_machine(self, **kwargs): + m = Machine( + dmi_sys_vendor="LENOVO", + dmi_product_uuid="test-product-uuid", + dmi_product_serial="test-serial", + dmi_board_asset_tag="test-asset-tag", + os="Linux", + architecture="x86_64", + cpu_model="test-cpu-model", + mem_total_bytes=16458768384, + ) + for k, v in kwargs.items(): + setattr(m, k, v) + return m + + def test_store_job(self): + """ + Store a job in the database. + """ + self.store.store_job( + job_id="test-job-id", + kind="zeek", + machine_id=421234, + req_vals={ + "build_url": "test-build-url", + "build_hash": "test-build-hash", + "sha": "test-sha", + "commit": "test-sha", + "branch": "test-branch", + "original_branch": "test-original-branch", + "cirrus_repo_owner": "test-cirrus-repo-owner", + "cirrus_repo_name": "test-cirrus-repo-name", + "cirrus_task_id": "test-cirrus-task-id", + "cirrus_task_name": "test-cirrus-task-name", + "cirrus_build_id": "test-cirrus-build-id", + "cirrus_pr": "1111", + "github_check_suite_id": "22334455", + "repo_version": "test-repo-version", + }, + ) + + with sqlite3.connect(self.database_file.name) as conn: + conn.row_factory = sqlite3.Row + rows = list(conn.execute("select * from jobs")) + + # Just some smoke-checking + self.assertEqual(1, len(rows)) + self.assertEqual(rows[0]["machine_id"], 421234) + self.assertEqual(rows[0]["id"], "test-job-id") + self.assertEqual(rows[0]["cirrus_pr"], 1111) + self.assertEqual(rows[0]["github_check_suite_id"], 22334455) + def test_store_zeek_result(self): """ Store a result in the database. @@ -66,3 +119,18 @@ def test_store_zeek_error(self): self.assertEqual(rows[0]["test_run"], 3) self.assertFalse(rows[0]["success"]) self.assertEqual(rows[0]["error"], "Something broke") + + def test_get_or_create_machine(self): + m1 = self.store.get_or_create_machine(self.make_test_machine()) + m2 = self.store.get_or_create_machine(self.make_test_machine()) + self.assertEqual(m1.id, m2.id) + + m3 = self.store.get_or_create_machine( + self.make_test_machine(mem_total_bytes=12345678) + ) + m4 = self.store.get_or_create_machine( + self.make_test_machine(mem_total_bytes=12345678) + ) + self.assertEqual(m3.id, m4.id) + + self.assertNotEqual(m1.id, m3.id) diff --git a/ci-based/zeek_benchmarker/app.py b/ci-based/zeek_benchmarker/app.py index 6eeedd3..adf1aeb 100644 --- a/ci-based/zeek_benchmarker/app.py +++ b/ci-based/zeek_benchmarker/app.py @@ -5,6 +5,7 @@ import redis import rq +import zeek_benchmarker.machine import zeek_benchmarker.tasks from flask import Flask, current_app, jsonify, request from werkzeug.exceptions import BadRequest, Forbidden @@ -187,9 +188,15 @@ def zeek(): # Store information about this job, too. store = storage.Storage(app.config["DATABASE_FILE"]) + + # Store the machine information with the job. The assumption + # here is that the system serving the API is also executing + # the job. Otherwise this would need to move into tasks.py. + machine = store.get_or_create_machine(zeek_benchmarker.machine.get_machine()) store.store_job( job_id=job.id, kind="zeek", + machine_id=machine.id, req_vals=req_vals, ) diff --git a/ci-based/zeek_benchmarker/machine.py b/ci-based/zeek_benchmarker/machine.py new file mode 100644 index 0000000..37e55f9 --- /dev/null +++ b/ci-based/zeek_benchmarker/machine.py @@ -0,0 +1,79 @@ +""" +Collect information about the running machine. + +Requires access to /sys for some of the DMI information +""" +import logging +import platform +from pathlib import Path + +from . import models + +logger = logging.getLogger(__name__) + + +def read_sys_dmi_id_file( + name: str, base_path: Path = Path("/sys/devices/virtual/dmi/id") +) -> str: + """ + Read /sys/devices/virtual/dmi/id/{name} and return the content, stripped. + """ + try: + return (base_path / name).read_text().strip() + except FileNotFoundError as e: + logger.warning("Could not open %s: %s", base_path / name, e) + return "" + + +def get_cpu_model(path: Path = Path("/proc/cpuinfo")): + """ + Seems platform.processor() isn't working well. + + Parse the first model name line out of /proc/cpuinfo + + model : 142 + model name : Intel(R) Core(TM) i7-8565U CPU @ 1.80GHz + stepping : 12 + """ + with path.open() as f: + for line in f: + if line.startswith("model name"): + return line.split(":", 1)[1].strip() + + return "" + + +def get_mem_total_bytes(path: Path = Path("/proc/meminfo")): + """ + Parse /proc/meminfo for the MemTotal: line. + + MemTotal: 16073016 kB + MemFree: 4273580 kB + + """ + with path.open() as f: + for line in f: + if line.startswith("MemTotal:"): + value = line.split(":", 1)[1] + kb, suffix = value.strip().split(" ", 1) + if suffix != "kB" or not kb.isnumeric(): + raise Exception(f"Unexpected value ${kb} / {suffix}") + return int(kb) * 1024 + + return 0 + + +def get_machine() -> models.Machine: + """ + Collect information for this system/machine. + """ + kwargs = {} + for k in ["sys_vendor", "product_uuid", "product_serial", "board_asset_tag"]: + kwargs[f"dmi_{k}"] = read_sys_dmi_id_file(k) + + kwargs["os"] = platform.system() + kwargs["architecture"] = platform.machine() + kwargs["cpu_model"] = get_cpu_model() + kwargs["mem_total_bytes"] = get_mem_total_bytes() + + return models.Machine(**kwargs) diff --git a/ci-based/zeek_benchmarker/models.py b/ci-based/zeek_benchmarker/models.py new file mode 100644 index 0000000..796ebfd --- /dev/null +++ b/ci-based/zeek_benchmarker/models.py @@ -0,0 +1,51 @@ +from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column + + +class Base(DeclarativeBase): + def __repr__(self): + """ + Show column values when debugging. + """ + fields_str = ", ".join( + [f"{c.name}={getattr(self, c.name)!r}" for c in self.__mapper__.columns] + ) + + return f"{self.__class__.__name__}({fields_str})" + + +class Machine(Base): + __tablename__ = "machines" + + id: Mapped[int] = mapped_column(primary_key=True) + dmi_sys_vendor: Mapped[str] + dmi_product_uuid: Mapped[str] + dmi_product_serial: Mapped[str] + dmi_board_asset_tag: Mapped[str] + os: Mapped[str] + architecture: Mapped[str] + cpu_model: Mapped[str] + mem_total_bytes: Mapped[int] + + +class Job(Base): + """ + The Jobs table - currently in use by tests. + """ + + __tablename__ = "jobs" + id: Mapped[str] = mapped_column(primary_key=True) + kind: Mapped[str] + build_url: Mapped[str] + build_hash: Mapped[str] + sha: Mapped[str] + branch: Mapped[str] + original_branch: Mapped[str] + cirrus_repo_owner: Mapped[str] + cirrus_repo_name: Mapped[str] + cirrus_task_id: Mapped[str] + cirrus_task_name: Mapped[str] + cirrus_build_id: Mapped[str] + cirrus_pr: Mapped[str] + github_check_suite_id: Mapped[str] + repo_version: Mapped[str] + machine_id: Mapped[int] diff --git a/ci-based/zeek_benchmarker/storage.py b/ci-based/zeek_benchmarker/storage.py index 5b90a7c..9e7ed79 100644 --- a/ci-based/zeek_benchmarker/storage.py +++ b/ci-based/zeek_benchmarker/storage.py @@ -5,14 +5,33 @@ """ import sqlite3 -from . import config +import sqlalchemy as sa + +from . import config, models + + +def get_engine(url: str) -> sa.Engine: + """ + Get an SQLAlchemy engine: + """ + if "://" not in url: + url = f"sqlite:///{url}" + return sa.create_engine(url) class Storage: def __init__(self, filename): self._filename = filename + self.Session = sa.orm.sessionmaker(get_engine(self._filename)) - def store_job(self, *, job_id: str, kind: str, req_vals: dict[any, any]): + def store_job( + self, + *, + job_id: str, + kind: str, + machine_id: int, + req_vals: dict[str, any], + ): with sqlite3.connect(self._filename) as conn: c = conn.cursor() sql = """INSERT INTO jobs ( @@ -30,7 +49,8 @@ def store_job(self, *, job_id: str, kind: str, req_vals: dict[any, any]): cirrus_build_id, cirrus_pr, github_check_suite_id, - repo_version + repo_version, + machine_id ) VALUES ( :id, :kind, @@ -46,12 +66,14 @@ def store_job(self, *, job_id: str, kind: str, req_vals: dict[any, any]): :cirrus_build_id, :cirrus_pr, :github_check_suite_id, - :repo_version + :repo_version, + :machine_id )""" data = req_vals.copy() data["id"] = job_id data["sha"] = req_vals["commit"] data["kind"] = kind + data["machine_id"] = machine_id c.execute(sql, data) def store_zeek_result( @@ -139,9 +161,50 @@ def store_zeek_error( } c.execute(sql, data) + def get_or_create_machine(self, m: models.Machine): + """ + Find an entry in table machine with all the same attributes and + return it, or insert the new information and return that entry. + + There might be a better way, but we do want to check that all + the columns are the same. The following is a bit more generic + in case we need that: https://stackoverflow.com/a/6078058 + """ + with self.Session() as session: + # Allow access to the returned objects after returning + # from this function. + session.expire_on_commit = False + + query = session.query(models.Machine).where( + models.Machine.dmi_sys_vendor == m.dmi_sys_vendor, + models.Machine.dmi_product_uuid == m.dmi_product_uuid, + models.Machine.dmi_product_serial == m.dmi_product_serial, + models.Machine.dmi_board_asset_tag == m.dmi_board_asset_tag, + models.Machine.os == m.os, + models.Machine.architecture == m.architecture, + models.Machine.cpu_model == m.cpu_model, + models.Machine.mem_total_bytes == m.mem_total_bytes, + ) + + r = query.scalar() + if r: + return r + + session.add(m) + session.commit() + session.expunge(m) + return m + + +_storage = None + def get() -> Storage: """ - Get a Storage handle. + Get a singleton Storage handle. """ - return Storage(config.get()["DATABASE_FILE"]) + global _storage + if _storage is None: + _storage = Storage(config.get()["DATABASE_FILE"]) + + return _storage diff --git a/ci-based/zeek_benchmarker/testing.py b/ci-based/zeek_benchmarker/testing.py index c936e23..a00cb8b 100644 --- a/ci-based/zeek_benchmarker/testing.py +++ b/ci-based/zeek_benchmarker/testing.py @@ -5,6 +5,7 @@ import alembic.command import alembic.config +from zeek_benchmarker import storage class TestWithDatabase(unittest.TestCase): @@ -22,5 +23,7 @@ def setUp(self): ) alembic.command.upgrade(alembic_config, "head") + self.storage = storage.Storage(self.database_file.name) + def tearDown(self): os.unlink(self.database_file.name)