From fd7458f88bdca051135872068f84e531537a3ad8 Mon Sep 17 00:00:00 2001
From: Arne Welzel <arne.welzel@corelight.com>
Date: Thu, 28 Sep 2023 14:22:23 +0200
Subject: [PATCH] Add and populate machines table.

This should help to track changes to the EC2 instance.

This also moves over to use SQLAlchemy in storage.py rather than
plain sqlite3.
---
 ...928_1204-311be9937d3e_add_machine_table.py | 42 ++++++++++
 ci-based/docker-compose.yml                   |  5 +-
 ci-based/tests/test_app.py                    | 64 +++++++++------
 ci-based/tests/test_storage.py                | 68 ++++++++++++++++
 ci-based/zeek_benchmarker/app.py              |  7 ++
 ci-based/zeek_benchmarker/machine.py          | 79 +++++++++++++++++++
 ci-based/zeek_benchmarker/models.py           | 51 ++++++++++++
 ci-based/zeek_benchmarker/storage.py          | 75 ++++++++++++++++--
 ci-based/zeek_benchmarker/testing.py          |  3 +
 9 files changed, 362 insertions(+), 32 deletions(-)
 create mode 100644 ci-based/alembic/versions/20230928_1204-311be9937d3e_add_machine_table.py
 create mode 100644 ci-based/zeek_benchmarker/machine.py
 create mode 100644 ci-based/zeek_benchmarker/models.py

diff --git a/ci-based/alembic/versions/20230928_1204-311be9937d3e_add_machine_table.py b/ci-based/alembic/versions/20230928_1204-311be9937d3e_add_machine_table.py
new file mode 100644
index 0000000..62813ad
--- /dev/null
+++ b/ci-based/alembic/versions/20230928_1204-311be9937d3e_add_machine_table.py
@@ -0,0 +1,42 @@
+"""add machine table
+
+Revision ID: 311be9937d3e
+Revises: 1c1d18482b62
+Create Date: 2023-09-28 12:04:49.205816
+
+"""
+from collections.abc import Sequence
+
+import sqlalchemy as sa
+from alembic import op
+
+# revision identifiers, used by Alembic.
+revision: str = "311be9937d3e"
+down_revision: str | None = "1c1d18482b62"
+branch_labels: str | Sequence[str] | None = None
+depends_on: str | Sequence[str] | None = None
+
+
+def upgrade() -> None:
+    op.create_table(
+        "machines",
+        sa.Column("id", sa.Integer, primary_key=True),
+        sa.Column(
+            "created_at", sa.DateTime, server_default=sa.text("(STRFTIME('%s'))")
+        ),
+        sa.Column("dmi_sys_vendor", sa.Text),
+        sa.Column("dmi_product_uuid", sa.Text),
+        sa.Column("dmi_product_serial", sa.Text),
+        sa.Column("dmi_board_asset_tag", sa.Text),
+        sa.Column("os", sa.Text),
+        sa.Column("architecture", sa.Text),
+        sa.Column("cpu_model", sa.Text),
+        sa.Column("mem_total_bytes", sa.Integer),
+    )
+
+    op.add_column("jobs", sa.Column("machine_id", sa.Integer))
+
+
+def downgrade() -> None:
+    op.drop_column("jobs", "machine_id")
+    op.drop_table("machines")
diff --git a/ci-based/docker-compose.yml b/ci-based/docker-compose.yml
index 9a8a4ad..371ddd4 100644
--- a/ci-based/docker-compose.yml
+++ b/ci-based/docker-compose.yml
@@ -34,8 +34,9 @@ services:
       # also come from the environment.
       - ./config.yml:/app/config.yml
 
-      # Let the API record information about
-      # jobs in the database.
+      # Allow the API access to the host machine's machine-id.
+      - /etc/machine-id:/etc/machine-id
+
       - ./persistent:/app/persistent
     working_dir: /app
     entrypoint:
diff --git a/ci-based/tests/test_app.py b/ci-based/tests/test_app.py
index cf9a841..cc3e32f 100644
--- a/ci-based/tests/test_app.py
+++ b/ci-based/tests/test_app.py
@@ -1,14 +1,19 @@
 import datetime
 import hmac
-import sqlite3
 import time
 import unittest
 from unittest import mock
 
 from zeek_benchmarker.app import create_app, is_valid_branch_name
+from zeek_benchmarker.models import Job, Machine
 from zeek_benchmarker.testing import TestWithDatabase
 
 
+def test_machine():
+    return Machine(dmi_product_uuid="ec2abcdef-1234", os="Linux")
+
+
+@mock.patch("zeek_benchmarker.machine.get_machine", new_callable=lambda: test_machine)
 @mock.patch("zeek_benchmarker.app.enqueue_job")
 class TestApi(TestWithDatabase):
     def setUp(self):
@@ -45,7 +50,7 @@ def hmac_digest(self, path, timestamp, build_hash):
         hmac_msg = f"{path:s}-{timestamp:d}-{build_hash:s}\n".encode()
         return hmac.new(self._test_hmac_key, hmac_msg, "sha256").hexdigest()
 
-    def test_zeek_good(self, enqueue_job_mock):
+    def test_zeek_good(self, enqueue_job_mock, get_machine_mock):
         enqueue_job_mock.return_value = self.enqueue_job_result_mock
 
         r = self._test_client.post(
@@ -67,7 +72,17 @@ def test_zeek_good(self, enqueue_job_mock):
             self._test_build_hash, enqueue_job_mock.call_args[0][1]["build_hash"]
         )
 
-    def test_zeek_good__more(self, enqueue_job_mock):
+        # Query the databse for the stored Job and Machine and
+        # ensure they are connected.
+        with self.storage.Session() as session:
+            jobs = session.query(Job).all()
+            machine = session.query(Machine).first()
+
+        self.assertEqual(1, len(jobs))
+        self.assertEqual("test-job-id", jobs[0].id)
+        self.assertEqual(machine.id, jobs[0].machine_id)
+
+    def test_zeek_good__more(self, enqueue_job_mock, get_machine_mock):
         enqueue_job_mock.return_value = self.enqueue_job_result_mock
 
         r = self._test_client.post(
@@ -98,25 +113,26 @@ def test_zeek_good__more(self, enqueue_job_mock):
             self._test_build_hash, enqueue_job_mock.call_args[0][1]["build_hash"]
         )
 
-        with sqlite3.connect(self.database_file.name) as conn:
-            conn.row_factory = sqlite3.Row
-            rows = list(conn.execute("select * from jobs"))
-            self.assertEqual(1, len(rows))
-            row = rows[0]
-            self.assertEqual("zeek", row["kind"])
-            self.assertEqual(self._test_build_hash, row["build_hash"])
-            self.assertEqual("f572d396fae9206628714fb2ce00f72e94f2258f", row["sha"])
-            self.assertEqual("test-branch", row["branch"])
-            self.assertEqual("test-owner", row["cirrus_repo_owner"])
-            self.assertEqual("test-name", row["cirrus_repo_name"])
-            self.assertEqual(123, row["cirrus_task_id"])
-            self.assertEqual("test-task-name", row["cirrus_task_name"])
-            self.assertEqual(9, row["cirrus_build_id"])
-            self.assertEqual(456, row["cirrus_pr"])
-            self.assertEqual(789, row["github_check_suite_id"])
-            self.assertEqual("6.1.0-dev.123", row["repo_version"])
-
-    def test_zeek_bad_build_url(self, enqueue_job_mock):
+        # Query the databse for the stored Job and Machine and
+        # ensure they are connected.
+        with self.storage.Session() as session:
+            jobs = session.query(Job).all()
+            self.assertEqual(1, len(jobs))
+            job = jobs[0]
+            self.assertEqual("zeek", job.kind)
+            self.assertEqual(self._test_build_hash, job.build_hash)
+            self.assertEqual("f572d396fae9206628714fb2ce00f72e94f2258f", job.sha)
+            self.assertEqual("test-branch", job.branch)
+            self.assertEqual("test-owner", job.cirrus_repo_owner)
+            self.assertEqual("test-name", job.cirrus_repo_name)
+            self.assertEqual(123, job.cirrus_task_id)
+            self.assertEqual("test-task-name", job.cirrus_task_name)
+            self.assertEqual(9, job.cirrus_build_id)
+            self.assertEqual(456, job.cirrus_pr)
+            self.assertEqual(789, job.github_check_suite_id)
+            self.assertEqual("6.1.0-dev.123", job.repo_version)
+
+    def test_zeek_bad_build_url(self, enqueue_job_mock, get_machine_mock):
         enqueue_job_mock.return_value = self.enqueue_job_result_mock
 
         r = self._test_client.post(
@@ -136,7 +152,7 @@ def test_zeek_bad_build_url(self, enqueue_job_mock):
 
         enqueue_job_mock.assert_not_called()
 
-    def test_zeek_bad_hmac_digest(self, enqueue_job_mock):
+    def test_zeek_bad_hmac_digest(self, enqueue_job_mock, get_machine_mock):
         enqueue_job_mock.return_value = self.enqueue_job_result_mock
 
         r = self._test_client.post(
@@ -156,7 +172,7 @@ def test_zeek_bad_hmac_digest(self, enqueue_job_mock):
 
         enqueue_job_mock.assert_not_called()
 
-    def test_zeek_bad_branch(self, enqueue_job_mock):
+    def test_zeek_bad_branch(self, enqueue_job_mock, get_machine_mock):
         enqueue_job_mock.return_value = self.enqueue_job_result_mock
 
         r = self._test_client.post(
diff --git a/ci-based/tests/test_storage.py b/ci-based/tests/test_storage.py
index e7b6f78..1f69e68 100644
--- a/ci-based/tests/test_storage.py
+++ b/ci-based/tests/test_storage.py
@@ -4,6 +4,7 @@
 import sqlite3
 
 from zeek_benchmarker import storage, testing
+from zeek_benchmarker.models import Machine
 from zeek_benchmarker.tasks import ZeekJob, ZeekTest, ZeekTestResult
 
 
@@ -23,6 +24,58 @@ def setUp(self):
 
         self.store = storage.Storage(self.database_file.name)
 
+    def make_test_machine(self, **kwargs):
+        m = Machine(
+            dmi_sys_vendor="LENOVO",
+            dmi_product_uuid="test-product-uuid",
+            dmi_product_serial="test-serial",
+            dmi_board_asset_tag="test-asset-tag",
+            os="Linux",
+            architecture="x86_64",
+            cpu_model="test-cpu-model",
+            mem_total_bytes=16458768384,
+        )
+        for k, v in kwargs.items():
+            setattr(m, k, v)
+        return m
+
+    def test_store_job(self):
+        """
+        Store a job in the database.
+        """
+        self.store.store_job(
+            job_id="test-job-id",
+            kind="zeek",
+            machine_id=421234,
+            req_vals={
+                "build_url": "test-build-url",
+                "build_hash": "test-build-hash",
+                "sha": "test-sha",
+                "commit": "test-sha",
+                "branch": "test-branch",
+                "original_branch": "test-original-branch",
+                "cirrus_repo_owner": "test-cirrus-repo-owner",
+                "cirrus_repo_name": "test-cirrus-repo-name",
+                "cirrus_task_id": "test-cirrus-task-id",
+                "cirrus_task_name": "test-cirrus-task-name",
+                "cirrus_build_id": "test-cirrus-build-id",
+                "cirrus_pr": "1111",
+                "github_check_suite_id": "22334455",
+                "repo_version": "test-repo-version",
+            },
+        )
+
+        with sqlite3.connect(self.database_file.name) as conn:
+            conn.row_factory = sqlite3.Row
+            rows = list(conn.execute("select * from jobs"))
+
+        # Just some smoke-checking
+        self.assertEqual(1, len(rows))
+        self.assertEqual(rows[0]["machine_id"], 421234)
+        self.assertEqual(rows[0]["id"], "test-job-id")
+        self.assertEqual(rows[0]["cirrus_pr"], 1111)
+        self.assertEqual(rows[0]["github_check_suite_id"], 22334455)
+
     def test_store_zeek_result(self):
         """
         Store a result in the database.
@@ -66,3 +119,18 @@ def test_store_zeek_error(self):
             self.assertEqual(rows[0]["test_run"], 3)
             self.assertFalse(rows[0]["success"])
             self.assertEqual(rows[0]["error"], "Something broke")
+
+    def test_get_or_create_machine(self):
+        m1 = self.store.get_or_create_machine(self.make_test_machine())
+        m2 = self.store.get_or_create_machine(self.make_test_machine())
+        self.assertEqual(m1.id, m2.id)
+
+        m3 = self.store.get_or_create_machine(
+            self.make_test_machine(mem_total_bytes=12345678)
+        )
+        m4 = self.store.get_or_create_machine(
+            self.make_test_machine(mem_total_bytes=12345678)
+        )
+        self.assertEqual(m3.id, m4.id)
+
+        self.assertNotEqual(m1.id, m3.id)
diff --git a/ci-based/zeek_benchmarker/app.py b/ci-based/zeek_benchmarker/app.py
index 6eeedd3..adf1aeb 100644
--- a/ci-based/zeek_benchmarker/app.py
+++ b/ci-based/zeek_benchmarker/app.py
@@ -5,6 +5,7 @@
 
 import redis
 import rq
+import zeek_benchmarker.machine
 import zeek_benchmarker.tasks
 from flask import Flask, current_app, jsonify, request
 from werkzeug.exceptions import BadRequest, Forbidden
@@ -187,9 +188,15 @@ def zeek():
 
         # Store information about this job, too.
         store = storage.Storage(app.config["DATABASE_FILE"])
+
+        # Store the machine information with the job. The assumption
+        # here is that the system serving the API is also executing
+        # the job. Otherwise this would need to move into tasks.py.
+        machine = store.get_or_create_machine(zeek_benchmarker.machine.get_machine())
         store.store_job(
             job_id=job.id,
             kind="zeek",
+            machine_id=machine.id,
             req_vals=req_vals,
         )
 
diff --git a/ci-based/zeek_benchmarker/machine.py b/ci-based/zeek_benchmarker/machine.py
new file mode 100644
index 0000000..37e55f9
--- /dev/null
+++ b/ci-based/zeek_benchmarker/machine.py
@@ -0,0 +1,79 @@
+"""
+Collect information about the running machine.
+
+Requires access to /sys for some of the DMI information
+"""
+import logging
+import platform
+from pathlib import Path
+
+from . import models
+
+logger = logging.getLogger(__name__)
+
+
+def read_sys_dmi_id_file(
+    name: str, base_path: Path = Path("/sys/devices/virtual/dmi/id")
+) -> str:
+    """
+    Read /sys/devices/virtual/dmi/id/{name} and return the content, stripped.
+    """
+    try:
+        return (base_path / name).read_text().strip()
+    except FileNotFoundError as e:
+        logger.warning("Could not open %s: %s", base_path / name, e)
+        return ""
+
+
+def get_cpu_model(path: Path = Path("/proc/cpuinfo")):
+    """
+    Seems platform.processor() isn't working well.
+
+    Parse the first model name line out of /proc/cpuinfo
+
+        model           : 142
+        model name      : Intel(R) Core(TM) i7-8565U CPU @ 1.80GHz
+        stepping        : 12
+    """
+    with path.open() as f:
+        for line in f:
+            if line.startswith("model name"):
+                return line.split(":", 1)[1].strip()
+
+    return ""
+
+
+def get_mem_total_bytes(path: Path = Path("/proc/meminfo")):
+    """
+    Parse /proc/meminfo for the MemTotal: line.
+
+        MemTotal:       16073016 kB
+        MemFree:         4273580 kB
+
+    """
+    with path.open() as f:
+        for line in f:
+            if line.startswith("MemTotal:"):
+                value = line.split(":", 1)[1]
+                kb, suffix = value.strip().split(" ", 1)
+                if suffix != "kB" or not kb.isnumeric():
+                    raise Exception(f"Unexpected value ${kb} / {suffix}")
+                return int(kb) * 1024
+
+    return 0
+
+
+def get_machine() -> models.Machine:
+    """
+    Collect information for this system/machine.
+    """
+    kwargs = {}
+    for k in ["sys_vendor", "product_uuid", "product_serial", "board_asset_tag"]:
+        kwargs[f"dmi_{k}"] = read_sys_dmi_id_file(k)
+
+    kwargs["os"] = platform.system()
+    kwargs["architecture"] = platform.machine()
+    kwargs["cpu_model"] = get_cpu_model()
+    kwargs["mem_total_bytes"] = get_mem_total_bytes()
+
+    return models.Machine(**kwargs)
diff --git a/ci-based/zeek_benchmarker/models.py b/ci-based/zeek_benchmarker/models.py
new file mode 100644
index 0000000..796ebfd
--- /dev/null
+++ b/ci-based/zeek_benchmarker/models.py
@@ -0,0 +1,51 @@
+from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column
+
+
+class Base(DeclarativeBase):
+    def __repr__(self):
+        """
+        Show column values when debugging.
+        """
+        fields_str = ", ".join(
+            [f"{c.name}={getattr(self, c.name)!r}" for c in self.__mapper__.columns]
+        )
+
+        return f"{self.__class__.__name__}({fields_str})"
+
+
+class Machine(Base):
+    __tablename__ = "machines"
+
+    id: Mapped[int] = mapped_column(primary_key=True)
+    dmi_sys_vendor: Mapped[str]
+    dmi_product_uuid: Mapped[str]
+    dmi_product_serial: Mapped[str]
+    dmi_board_asset_tag: Mapped[str]
+    os: Mapped[str]
+    architecture: Mapped[str]
+    cpu_model: Mapped[str]
+    mem_total_bytes: Mapped[int]
+
+
+class Job(Base):
+    """
+    The Jobs table - currently in use by tests.
+    """
+
+    __tablename__ = "jobs"
+    id: Mapped[str] = mapped_column(primary_key=True)
+    kind: Mapped[str]
+    build_url: Mapped[str]
+    build_hash: Mapped[str]
+    sha: Mapped[str]
+    branch: Mapped[str]
+    original_branch: Mapped[str]
+    cirrus_repo_owner: Mapped[str]
+    cirrus_repo_name: Mapped[str]
+    cirrus_task_id: Mapped[str]
+    cirrus_task_name: Mapped[str]
+    cirrus_build_id: Mapped[str]
+    cirrus_pr: Mapped[str]
+    github_check_suite_id: Mapped[str]
+    repo_version: Mapped[str]
+    machine_id: Mapped[int]
diff --git a/ci-based/zeek_benchmarker/storage.py b/ci-based/zeek_benchmarker/storage.py
index 5b90a7c..9e7ed79 100644
--- a/ci-based/zeek_benchmarker/storage.py
+++ b/ci-based/zeek_benchmarker/storage.py
@@ -5,14 +5,33 @@
 """
 import sqlite3
 
-from . import config
+import sqlalchemy as sa
+
+from . import config, models
+
+
+def get_engine(url: str) -> sa.Engine:
+    """
+    Get an SQLAlchemy engine:
+    """
+    if "://" not in url:
+        url = f"sqlite:///{url}"
+    return sa.create_engine(url)
 
 
 class Storage:
     def __init__(self, filename):
         self._filename = filename
+        self.Session = sa.orm.sessionmaker(get_engine(self._filename))
 
-    def store_job(self, *, job_id: str, kind: str, req_vals: dict[any, any]):
+    def store_job(
+        self,
+        *,
+        job_id: str,
+        kind: str,
+        machine_id: int,
+        req_vals: dict[str, any],
+    ):
         with sqlite3.connect(self._filename) as conn:
             c = conn.cursor()
             sql = """INSERT INTO jobs (
@@ -30,7 +49,8 @@ def store_job(self, *, job_id: str, kind: str, req_vals: dict[any, any]):
                          cirrus_build_id,
                          cirrus_pr,
                          github_check_suite_id,
-                         repo_version
+                         repo_version,
+                         machine_id
                     ) VALUES (
                         :id,
                         :kind,
@@ -46,12 +66,14 @@ def store_job(self, *, job_id: str, kind: str, req_vals: dict[any, any]):
                         :cirrus_build_id,
                         :cirrus_pr,
                         :github_check_suite_id,
-                        :repo_version
+                        :repo_version,
+                        :machine_id
                     )"""
             data = req_vals.copy()
             data["id"] = job_id
             data["sha"] = req_vals["commit"]
             data["kind"] = kind
+            data["machine_id"] = machine_id
             c.execute(sql, data)
 
     def store_zeek_result(
@@ -139,9 +161,50 @@ def store_zeek_error(
             }
             c.execute(sql, data)
 
+    def get_or_create_machine(self, m: models.Machine):
+        """
+        Find an entry in table machine with all the same attributes and
+        return it, or insert the new information and return that entry.
+
+        There might be a better way, but we do want to check that all
+        the columns are the same. The following is a bit more generic
+        in case we need that: https://stackoverflow.com/a/6078058
+        """
+        with self.Session() as session:
+            # Allow access to the returned objects after returning
+            # from this function.
+            session.expire_on_commit = False
+
+            query = session.query(models.Machine).where(
+                models.Machine.dmi_sys_vendor == m.dmi_sys_vendor,
+                models.Machine.dmi_product_uuid == m.dmi_product_uuid,
+                models.Machine.dmi_product_serial == m.dmi_product_serial,
+                models.Machine.dmi_board_asset_tag == m.dmi_board_asset_tag,
+                models.Machine.os == m.os,
+                models.Machine.architecture == m.architecture,
+                models.Machine.cpu_model == m.cpu_model,
+                models.Machine.mem_total_bytes == m.mem_total_bytes,
+            )
+
+            r = query.scalar()
+            if r:
+                return r
+
+            session.add(m)
+            session.commit()
+            session.expunge(m)
+            return m
+
+
+_storage = None
+
 
 def get() -> Storage:
     """
-    Get a Storage handle.
+    Get a singleton Storage handle.
     """
-    return Storage(config.get()["DATABASE_FILE"])
+    global _storage
+    if _storage is None:
+        _storage = Storage(config.get()["DATABASE_FILE"])
+
+    return _storage
diff --git a/ci-based/zeek_benchmarker/testing.py b/ci-based/zeek_benchmarker/testing.py
index c936e23..a00cb8b 100644
--- a/ci-based/zeek_benchmarker/testing.py
+++ b/ci-based/zeek_benchmarker/testing.py
@@ -5,6 +5,7 @@
 
 import alembic.command
 import alembic.config
+from zeek_benchmarker import storage
 
 
 class TestWithDatabase(unittest.TestCase):
@@ -22,5 +23,7 @@ def setUp(self):
         )
         alembic.command.upgrade(alembic_config, "head")
 
+        self.storage = storage.Storage(self.database_file.name)
+
     def tearDown(self):
         os.unlink(self.database_file.name)