From 585a332528ec3face77cccbd74dbb1f33c10331e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Bompard?= Date: Tue, 14 Jan 2025 12:19:51 +0100 Subject: [PATCH 1/3] Add unit tests for the crawler connectors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Aurélien Bompard --- tests/test_crawler.py | 76 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) diff --git a/tests/test_crawler.py b/tests/test_crawler.py index 745baa5e0..df5a41359 100644 --- a/tests/test_crawler.py +++ b/tests/test_crawler.py @@ -16,12 +16,46 @@ """ import os +from unittest.mock import Mock +import pytest + +from mirrormanager2 import default_config +from mirrormanager2.crawler.connection_pool import ConnectionPool +from mirrormanager2.lib import model from mirrormanager2.lib.sync import run_rsync FOLDER = os.path.dirname(os.path.abspath(__file__)) +@pytest.fixture() +def config(): + config = dict() + for key in dir(default_config): + if key.isupper(): + config[key] = getattr(default_config, key) + return config + + +@pytest.fixture() +def dir_obj(db): + """Test scanning empty directories.""" + directory = model.Directory( + name="pub/fedora/linux/releases/20", + readable=True, + ) + db.add(directory) + db.commit() + return directory + + +@pytest.fixture() +def dir_obj_with_files(db, dir_obj): + dir_obj.files = {"does-not-exist": {"size": 1, "stat": 1}} + db.commit() + return dir_obj + + def test_run_rsync(): """Test the run_rsync function""" @@ -82,3 +116,45 @@ def test_run_rsync(): # Check that non-excluded files are still included assert "fedora/linux/development/22/" in output + + +def test_scan_rsync(db, dir_obj_with_files, config): + """Test scanning directories with missing files.""" + connection_pool = ConnectionPool(config) + connector = connection_pool.get(f"rsync://{FOLDER}/../testdata/") + dir_url = f"rsync:///{FOLDER}/../testdata/pub/fedora/linux" + scan_result = { + f"{dir_url}/{filename}": fileinfo for filename, fileinfo in dir_obj_with_files.files.items() + } + for fileinfo in scan_result.values(): + fileinfo["mode"] = "f" + connector._scan_result = scan_result + result = connector.check_dir(dir_url, dir_obj_with_files) + assert result is True + + +def test_scan_http(db, dir_obj_with_files): + """Test scanning directories with http""" + connection_pool = ConnectionPool({}) + connector = connection_pool.get("http://localhost/testdata/") + mocked_connection = object() + connector.get_connection = Mock(return_value=mocked_connection) + connector._check_file = Mock(return_value=True) + dir_url = "http://localhost/testdata/pub/fedora/linux" + result = connector.check_dir(dir_url, dir_obj_with_files) + assert result is True + connector.get_connection.assert_called_once() + connector._check_file.assert_called_once_with( + mocked_connection, f"{dir_url}/does-not-exist", {"size": 1, "stat": 1}, True + ) + + +def test_scan_ftp(db, dir_obj_with_files): + """Test scanning directories with ftp""" + connection_pool = ConnectionPool({}) + connector = connection_pool.get("ftp://localhost/testdata/") + connector.get_ftp_dir = Mock(return_value=dir_obj_with_files.files) + dir_url = "http://localhost/testdata/pub/fedora/linux" + result = connector.check_dir(dir_url, dir_obj_with_files) + assert result is True + connector.get_ftp_dir.assert_called_once_with(dir_url, True) From 962a94b4358154879b6e0d2bcf2e8349fadc897a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Bompard?= Date: Tue, 14 Jan 2025 12:21:16 +0100 Subject: [PATCH 2/3] Crawler connectors: add unit tests for missing files MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Aurélien Bompard --- mirrormanager2/crawler/ftp_connector.py | 2 ++ tests/test_crawler.py | 37 +++++++++++++++++++++++++ 2 files changed, 39 insertions(+) diff --git a/mirrormanager2/crawler/ftp_connector.py b/mirrormanager2/crawler/ftp_connector.py index 5cf85638f..0bb8f772e 100755 --- a/mirrormanager2/crawler/ftp_connector.py +++ b/mirrormanager2/crawler/ftp_connector.py @@ -115,6 +115,8 @@ def _check_dir(self, url, directory): with mmlib.instance_attribute(directory, "files") as files: # Getting Directory.files is a bit expensive, involves json decoding for filename in files: + if filename not in results: + return False # Missing file, we don't need to go over other files status = self._check_file(results[filename], files[filename]) if not status: # Shortcut: we don't need to go over other files diff --git a/tests/test_crawler.py b/tests/test_crawler.py index df5a41359..5e6540b96 100644 --- a/tests/test_crawler.py +++ b/tests/test_crawler.py @@ -133,6 +133,16 @@ def test_scan_rsync(db, dir_obj_with_files, config): assert result is True +def test_scan_missing_files_rsync(db, dir_obj_with_files, config): + """Test scanning directories with missing files.""" + connection_pool = ConnectionPool(config) + connector = connection_pool.get(f"rsync://{FOLDER}/../testdata/") + dir_url = f"rsync:///{FOLDER}/../testdata/pub/fedora/linux" + connector._scan_result = connector._run(dir_url) + result = connector.check_dir(dir_url, dir_obj_with_files) + assert result is False + + def test_scan_http(db, dir_obj_with_files): """Test scanning directories with http""" connection_pool = ConnectionPool({}) @@ -149,6 +159,22 @@ def test_scan_http(db, dir_obj_with_files): ) +def test_scan_missing_files_http(db, dir_obj_with_files): + """Test scanning empty directories with http""" + connection_pool = ConnectionPool({}) + connector = connection_pool.get("http://localhost/testdata/") + mocked_connection = object() + connector.get_connection = Mock(return_value=mocked_connection) + connector._check_file = Mock(return_value=False) + dir_url = "http://localhost/testdata/pub/fedora/linux" + result = connector.check_dir(dir_url, dir_obj_with_files) + assert result is False + connector.get_connection.assert_called_once() + connector._check_file.assert_called_once_with( + mocked_connection, f"{dir_url}/does-not-exist", {"size": 1, "stat": 1}, True + ) + + def test_scan_ftp(db, dir_obj_with_files): """Test scanning directories with ftp""" connection_pool = ConnectionPool({}) @@ -158,3 +184,14 @@ def test_scan_ftp(db, dir_obj_with_files): result = connector.check_dir(dir_url, dir_obj_with_files) assert result is True connector.get_ftp_dir.assert_called_once_with(dir_url, True) + + +def test_scan_missing_files_ftp(db, dir_obj_with_files): + """Test scanning empty directories with ftp""" + connection_pool = ConnectionPool({}) + connector = connection_pool.get("ftp://localhost/testdata/") + connector.get_ftp_dir = Mock(return_value={}) + dir_url = "http://localhost/testdata/pub/fedora/linux" + result = connector.check_dir(dir_url, dir_obj_with_files) + assert result is False + connector.get_ftp_dir.assert_called_once_with(dir_url, True) From 759c0976f9865c2d7fbd1d34be0a0a1f7ddda1f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Bompard?= Date: Tue, 14 Jan 2025 12:21:28 +0100 Subject: [PATCH 3/3] Crawler connectors: consider empty directories up-to-date MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes: #437 Signed-off-by: Aurélien Bompard --- mirrormanager2/crawler/ftp_connector.py | 2 ++ mirrormanager2/crawler/http_connector.py | 2 ++ mirrormanager2/crawler/rsync_connector.py | 2 ++ mirrormanager2/lib/__init__.py | 1 - tests/test_crawler.py | 33 +++++++++++++++++++++++ 5 files changed, 39 insertions(+), 1 deletion(-) diff --git a/mirrormanager2/crawler/ftp_connector.py b/mirrormanager2/crawler/ftp_connector.py index 0bb8f772e..bb6775a00 100755 --- a/mirrormanager2/crawler/ftp_connector.py +++ b/mirrormanager2/crawler/ftp_connector.py @@ -114,6 +114,8 @@ def _check_dir(self, url, directory): with mmlib.instance_attribute(directory, "files") as files: # Getting Directory.files is a bit expensive, involves json decoding + # files can be None in case of empty directories + files = files or [] for filename in files: if filename not in results: return False # Missing file, we don't need to go over other files diff --git a/mirrormanager2/crawler/http_connector.py b/mirrormanager2/crawler/http_connector.py index eb7af71bd..b6d276942 100755 --- a/mirrormanager2/crawler/http_connector.py +++ b/mirrormanager2/crawler/http_connector.py @@ -89,6 +89,8 @@ def _check_dir(self, url, directory): return None with mmlib.instance_attribute(directory, "files") as files: # Getting Directory.files is a bit expensive, involves json decoding + # files can be None in case of empty directories + files = files or [] for filename in files: file_url = f"{url}/{filename}" exists = self._check_file(conn, file_url, files[filename], directory.readable) diff --git a/mirrormanager2/crawler/rsync_connector.py b/mirrormanager2/crawler/rsync_connector.py index d5edaac19..ab087d7c2 100755 --- a/mirrormanager2/crawler/rsync_connector.py +++ b/mirrormanager2/crawler/rsync_connector.py @@ -78,6 +78,8 @@ def _check_file(self, current_file_info, db_file_info): def _check_dir(self, dirname, directory): with mmlib.instance_attribute(directory, "files") as files: # Getting Directory.files is a bit expensive, involves json decoding + # files can be None in case of empty directories + files = files or [] for filename in sorted(files): if len(dirname) == 0: key = filename diff --git a/mirrormanager2/lib/__init__.py b/mirrormanager2/lib/__init__.py index da8b82432..a3038fb6d 100644 --- a/mirrormanager2/lib/__init__.py +++ b/mirrormanager2/lib/__init__.py @@ -772,7 +772,6 @@ def _get_directories_by_category_query(category, only_repodata=False): .where( model.Category.id == category.id, model.Directory.readable.is_(True), - model.Directory.files.is_not(None), ) ) if only_repodata: diff --git a/tests/test_crawler.py b/tests/test_crawler.py index 5e6540b96..258f5e34b 100644 --- a/tests/test_crawler.py +++ b/tests/test_crawler.py @@ -143,6 +143,15 @@ def test_scan_missing_files_rsync(db, dir_obj_with_files, config): assert result is False +def test_scan_empty_directory_rsync(db, dir_obj): + """Test scanning empty directories with rsync""" + connection_pool = ConnectionPool({}) + connector = connection_pool.get(f"rsync://{FOLDER}/../testdata/") + dir_url = f"rsync://{FOLDER}/../testdata/pub/fedora/linux" + result = connector.check_dir(dir_url, dir_obj) + assert result is True + + def test_scan_http(db, dir_obj_with_files): """Test scanning directories with http""" connection_pool = ConnectionPool({}) @@ -175,6 +184,19 @@ def test_scan_missing_files_http(db, dir_obj_with_files): ) +def test_scan_empty_directory_http(db, dir_obj): + """Test scanning empty directories with http""" + connection_pool = ConnectionPool({}) + connector = connection_pool.get("http://localhost/testdata/") + connector.get_connection = Mock() + connector._check_file = Mock(return_value=True) + dir_url = "http://localhost/testdata/pub/fedora/linux" + result = connector.check_dir(dir_url, dir_obj) + assert result is True + connector.get_connection.assert_called_once() + connector._check_file.assert_not_called() + + def test_scan_ftp(db, dir_obj_with_files): """Test scanning directories with ftp""" connection_pool = ConnectionPool({}) @@ -195,3 +217,14 @@ def test_scan_missing_files_ftp(db, dir_obj_with_files): result = connector.check_dir(dir_url, dir_obj_with_files) assert result is False connector.get_ftp_dir.assert_called_once_with(dir_url, True) + + +def test_scan_empty_directory_ftp(db, dir_obj): + """Test scanning empty directories with ftp""" + connection_pool = ConnectionPool({}) + connector = connection_pool.get("ftp://localhost/testdata/") + connector.get_ftp_dir = Mock(return_value={}) + dir_url = "ftp://localhost/testdata/pub/fedora/linux" + result = connector.check_dir(dir_url, dir_obj) + assert result is True + connector.get_ftp_dir.assert_called_once_with(dir_url, True)