From 32a104bddfca9b9921ffe2c92673df1e7768c39f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ale=C5=A1=20Mat=C4=9Bj?= Date: Tue, 6 Aug 2024 13:36:17 +0200 Subject: [PATCH 1/3] Fix up some comments in addCountmeFlag() The buckets aren't really an array that's indexed in the code, they're just sequential numbers for the URL flag. Also clarify why we're using "this window" instead of "the current position of the sliding window" in the comments. Backport from dnf4: https://github.com/rpm-software-management/libdnf/commit/cc95edd15b8a4fc4c381c85735e2f14a1dc0852e --- libdnf5/repo/repo_downloader.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/libdnf5/repo/repo_downloader.cpp b/libdnf5/repo/repo_downloader.cpp index 7d84fad5b..a04d2ff1b 100644 --- a/libdnf5/repo/repo_downloader.cpp +++ b/libdnf5/repo/repo_downloader.cpp @@ -682,6 +682,9 @@ const std::array COUNTME_BUCKETS = {{2, 5, 25}}; /// This is to align the time window with an absolute point in time rather /// than the last counting event (which could facilitate tracking across /// multiple such events). +/// +/// In the below comments, the window's current position will be referred to +/// as "this window" for brevity. 
void RepoDownloader::add_countme_flag(LibrepoHandle & handle) { auto & logger = *base->get_logger(); @@ -753,7 +756,7 @@ void RepoDownloader::add_countme_flag(LibrepoHandle & handle) { for (i = 0; i < COUNTME_BUCKETS.size(); ++i) if (step < COUNTME_BUCKETS[i]) break; - uint32_t bucket = i + 1; // Buckets are indexed from 1 + uint32_t bucket = i + 1; // Buckets are numbered from 1 // Set the flag std::string flag = "countme=" + std::to_string(bucket); From fa99d56edc1421ce9caaeafe735bc22934c585a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ale=C5=A1=20Mat=C4=9Bj?= Date: Mon, 5 Aug 2024 14:15:37 +0200 Subject: [PATCH 2/3] Fix countme bucket calculation Actually use the system's installation time (if known) as the reference point, instead of the first-ever countme event recorded for the given repo. This is what the dnf5.conf(5) man page always said about the countme option, the code just never lived up to that. This makes bucket calculation more accurate: 1. System upgrades will no longer reset the bucket to 1 (this used to be the case due to a new persistdir being created whenever $releasever changed). 2. Systems that only reach out to the repos after an initial time period after being installed will no longer appear younger than they really are. 3. Prebuilt OS images that happen to include countme cookies created at build time will no longer cause all the instances spawned from those images (physical machines, VMs or containers) to appear older than they really are. Use the machine-id(5) file's mtime to infer the installation time. This file is semantically tied to the system's lifetime since it's typically populated at installation time or during the first boot by an installer tool or init system, respectively, and remains unchanged. 
The fact that it's a well-defined file with clear semantics ensures that OS images won't accidentally include a prepopulated version of this file with a timestamp corresponding to the image build, unlike our own cookie files (see point 3 above). In some cases, such as in OCI containers without an init system running, the machine-id file may be missing or empty, even though the system is still used long-term. To cover those, keep the original, relative epoch as a fallback method. System upgrades aren't really a thing for such systems so the above point 1 doesn't apply here. Some containers, such as those created by toolbox(1), may also choose to bind-mount the host's machine-id file, thus falling into the same bucket as their host. Conveniently, that's what we want, since the purpose of such containers is to blend with the host as much as possible. Backport from dnf4: https://github.com/rpm-software-management/libdnf/commit/cc95edd15b8a4fc4c381c85735e2f14a1dc0852e --- libdnf5/repo/repo_downloader.cpp | 35 +++++++++++++++++++++++++++++++- libdnf5/repo/repo_downloader.hpp | 1 + 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/libdnf5/repo/repo_downloader.cpp b/libdnf5/repo/repo_downloader.cpp index a04d2ff1b..b142aabbe 100644 --- a/libdnf5/repo/repo_downloader.cpp +++ b/libdnf5/repo/repo_downloader.cpp @@ -31,6 +31,7 @@ along with libdnf. If not, see . 
#include #include #include +#include #include #include @@ -714,7 +715,7 @@ void RepoDownloader::add_countme_flag(LibrepoHandle & handle) { file_path /= COUNTME_COOKIE; int ver = COUNTME_VERSION; // file format version (for future use) - time_t epoch = 0; // position of first-ever counted window + time_t epoch = 0; // position of first observed window time_t win = COUNTME_OFFSET; // position of last counted window int budget = -1; // budget for this window (-1 = generate) // TODO(lukash) ideally replace with utils::fs::File (via adding scanf() support?), @@ -746,8 +747,15 @@ void RepoDownloader::add_countme_flag(LibrepoHandle & handle) { // Compute the position of this window win = now - (delta % COUNTME_WINDOW); + + // Compute the epoch from this system's epoch or, if unknown, declare + // this window as the epoch (unless stored in the cookie previously). + time_t sysepoch = get_system_epoch(); + if (sysepoch) + epoch = sysepoch - ((sysepoch - COUNTME_OFFSET) % COUNTME_WINDOW); if (!epoch) epoch = win; + // Window step (0 at epoch) int64_t step = (win - epoch) / COUNTME_WINDOW; @@ -784,6 +792,31 @@ std::set RepoDownloader::get_optional_metadata() const { } +/* Returns this system's installation time ("epoch") as a UNIX timestamp. + * + * Uses the machine-id(5) file's mtime as a good-enough source of truth. This + * file is typically tied to the system's installation or first boot where it's + * populated by an installer tool or init system, respectively, and is never + * changed afterwards. + * + * Some systems, such as containers that don't run an init system, may have the + * file missing, empty or uninitialized, in which case this function returns 0. 
+ */ +time_t RepoDownloader::get_system_epoch() const { + std::string filename = "/etc/machine-id"; + std::string id; + struct stat st; + + if (stat(filename.c_str(), &st) != 0 || !st.st_size) + return 0; + std::ifstream(filename) >> id; + if (id == "uninitialized") + return 0; + + return st.st_mtime; +} + + //void Downloader::download_url(ConfigMain * cfg, const char * url, int fd) { // std::unique_ptr lr_handle(new_remote_handle(*cfg)); // GError * err_p{nullptr}; diff --git a/libdnf5/repo/repo_downloader.hpp b/libdnf5/repo/repo_downloader.hpp index f6adcf7ef..1c14b272b 100644 --- a/libdnf5/repo/repo_downloader.hpp +++ b/libdnf5/repo/repo_downloader.hpp @@ -96,6 +96,7 @@ class RepoDownloader { std::string get_persistdir() const; void add_countme_flag(LibrepoHandle & handle); + time_t get_system_epoch() const; std::set get_optional_metadata() const; From 39c193281219666f03f3f315c6f88e75a13dcef0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ale=C5=A1=20Mat=C4=9Bj?= Date: Tue, 6 Aug 2024 14:00:06 +0200 Subject: [PATCH 3/3] Update the man page entry for the countme option Make it a bit more explanatory, format the age buckets as a table and reflect the changes from the last update. Backport from dnf4: https://github.com/rpm-software-management/dnf/commit/a8c77bb0d8fe58e48b3b22e21e2cd495d1d6ec15 --- doc/dnf5.conf.5.rst | 61 ++++++++++++++++++++++++++++++++------------- 1 file changed, 44 insertions(+), 17 deletions(-) diff --git a/doc/dnf5.conf.5.rst b/doc/dnf5.conf.5.rst index 2a72e4cad..347b3cd10 100644 --- a/doc/dnf5.conf.5.rst +++ b/doc/dnf5.conf.5.rst @@ -636,23 +636,50 @@ configuration. ``countme`` :ref:`boolean ` - Determines whether a special flag should be added to a single, randomly - chosen metalink/mirrorlist query each week. 
- This allows the repository owner to estimate the number of systems - consuming it, by counting such queries over a week's time, which is much - more accurate than just counting unique IP addresses (which is subject to - both overcounting and undercounting due to short DHCP leases and NAT, - respectively). - - The flag is a simple "countme=N" parameter appended to the metalink and - mirrorlist URL, where N is an integer representing the "longevity" bucket - this system belongs to. - The following 4 buckets are defined, based on how many full weeks have - passed since the beginning of the week when this system was installed: 1 = - first week, 2 = first month (2-4 weeks), 3 = six months (5-24 weeks) and 4 - = more than six months (> 24 weeks). - This information is meant to help distinguish short-lived installs from - long-term ones, and to gather other statistics about system lifecycle. + When enabled, one (and only one) HTTP GET request for the metalink file + will be selected at random every week to carry a special URL flag. + + This flag allows the repository provider to estimate the number of systems + consuming the repository, by counting such requests over a week's time. + This method is more accurate than just counting unique IP addresses (which + is subject to both overcounting and undercounting due to short DHCP leases + and NAT, respectively). + + This is *not* an out-of-band HTTP request made for this purpose alone. + Only requests initiated by DNF during normal operation, such as to check + for metadata updates, can get this flag. + + The flag is a simple "countme=N" parameter appended to the metalink URL + where N is an integer representing the age "bucket" this system belongs to. 
+ Four buckets are defined, based on how many full weeks have passed since + the installation of a system: + + ====== =============================== + bucket system age + ====== =============================== + 1 first week + 2 first month (2 - 4 weeks) + 3 first 6 months (5 - 24 weeks) + 4 more than 6 months (> 24 weeks) + ====== =============================== + + This number is meant to help distinguish short-lived (throwaway) machines + from long-term installs and get a better picture of how systems are used + over time. + + To determine a system's installation time ("epoch"), the ``machine-id(5)`` + file's modification time is used as the single source of truth. This file + is semantically tied to the system's lifetime as it's typically populated + at installation time or during the first boot by an installer tool or init + system (such as ``systemd(1)``), respectively, and remains unchanged. + + If the file is missing, empty or still marked as ``uninitialized`` (such as + in containers), the time of the very first request made using the expanded + metalink URL (i.e. with any repository variables such as ``$releasever`` + substituted) that carried the flag is declared as the epoch. + + If no metalink URL is defined for this repository but a mirrorlist URL is, + the latter is used for this purpose instead. Default: ``False``.