Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Crawler: consider empty directories as up-to-date #438

Merged
merged 3 commits into from
Jan 14, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions mirrormanager2/crawler/ftp_connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,11 @@ def _check_dir(self, url, directory):

with mmlib.instance_attribute(directory, "files") as files:
# Getting Directory.files is a bit expensive, involves json decoding
# files can be None in case of empty directories
files = files or []
for filename in files:
if filename not in results:
return False # Missing file, we don't need to go over other files
status = self._check_file(results[filename], files[filename])
if not status:
# Shortcut: we don't need to go over other files
Expand Down
2 changes: 2 additions & 0 deletions mirrormanager2/crawler/http_connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,8 @@ def _check_dir(self, url, directory):
return None
with mmlib.instance_attribute(directory, "files") as files:
# Getting Directory.files is a bit expensive, involves json decoding
# files can be None in case of empty directories
files = files or []
for filename in files:
file_url = f"{url}/{filename}"
exists = self._check_file(conn, file_url, files[filename], directory.readable)
Expand Down
2 changes: 2 additions & 0 deletions mirrormanager2/crawler/rsync_connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,8 @@ def _check_file(self, current_file_info, db_file_info):
def _check_dir(self, dirname, directory):
with mmlib.instance_attribute(directory, "files") as files:
# Getting Directory.files is a bit expensive, involves json decoding
# files can be None in case of empty directories
files = files or []
for filename in sorted(files):
if len(dirname) == 0:
key = filename
Expand Down
1 change: 0 additions & 1 deletion mirrormanager2/lib/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -772,7 +772,6 @@ def _get_directories_by_category_query(category, only_repodata=False):
.where(
model.Category.id == category.id,
model.Directory.readable.is_(True),
model.Directory.files.is_not(None),
)
)
if only_repodata:
Expand Down
146 changes: 146 additions & 0 deletions tests/test_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,46 @@
"""

import os
from unittest.mock import Mock

import pytest

from mirrormanager2 import default_config
from mirrormanager2.crawler.connection_pool import ConnectionPool
from mirrormanager2.lib import model
from mirrormanager2.lib.sync import run_rsync

FOLDER = os.path.dirname(os.path.abspath(__file__))


@pytest.fixture()
def config():
    """Return the upper-case attributes of ``default_config`` as a plain dict."""
    return {
        name: getattr(default_config, name)
        for name in dir(default_config)
        if name.isupper()
    }


@pytest.fixture()
def dir_obj(db):
    """Add a readable directory with no ``files`` set to the database and return it."""
    empty_dir = model.Directory(name="pub/fedora/linux/releases/20", readable=True)
    db.add(empty_dir)
    db.commit()
    return empty_dir


@pytest.fixture()
def dir_obj_with_files(db, dir_obj):
    """Return ``dir_obj`` after recording a single file entry on it."""
    file_records = {"does-not-exist": {"size": 1, "stat": 1}}
    dir_obj.files = file_records
    db.commit()
    return dir_obj


def test_run_rsync():
"""Test the run_rsync function"""

Expand Down Expand Up @@ -82,3 +116,115 @@ def test_run_rsync():

# Check that non-excluded files are still included
assert "fedora/linux/development/22/" in output


def test_scan_rsync(db, dir_obj_with_files, config):
    """Test scanning a directory whose files are all in the rsync scan result."""
    pool = ConnectionPool(config)
    connector = pool.get(f"rsync://{FOLDER}/../testdata/")
    dir_url = f"rsync:///{FOLDER}/../testdata/pub/fedora/linux"
    # Build the fake scan result from the database records, keyed by
    # "<dir_url>/<filename>".
    scan_result = {}
    for filename, fileinfo in dir_obj_with_files.files.items():
        scan_result[f"{dir_url}/{filename}"] = fileinfo
    # The entries are the very same dict objects as the database records,
    # so the mode is set on those too.
    for entry in scan_result.values():
        entry["mode"] = "f"
    connector._scan_result = scan_result
    assert connector.check_dir(dir_url, dir_obj_with_files) is True


def test_scan_missing_files_rsync(db, dir_obj_with_files, config):
    """Test that an rsync scan lacking the recorded file fails the check."""
    pool = ConnectionPool(config)
    connector = pool.get(f"rsync://{FOLDER}/../testdata/")
    dir_url = f"rsync:///{FOLDER}/../testdata/pub/fedora/linux"
    # Use a real scan of the test data instead of a hand-made result.
    connector._scan_result = connector._run(dir_url)
    assert connector.check_dir(dir_url, dir_obj_with_files) is False


def test_scan_empty_directory_rsync(db, dir_obj):
    """Test that a directory without files is reported as up-to-date with rsync."""
    connector = ConnectionPool({}).get(f"rsync://{FOLDER}/../testdata/")
    dir_url = f"rsync://{FOLDER}/../testdata/pub/fedora/linux"
    assert connector.check_dir(dir_url, dir_obj) is True


def test_scan_http(db, dir_obj_with_files):
    """Test scanning directories with http when the file check succeeds."""
    connector = ConnectionPool({}).get("http://localhost/testdata/")
    fake_conn = object()
    connector.get_connection = Mock(return_value=fake_conn)
    connector._check_file = Mock(return_value=True)
    dir_url = "http://localhost/testdata/pub/fedora/linux"

    assert connector.check_dir(dir_url, dir_obj_with_files) is True

    connector.get_connection.assert_called_once()
    # The single recorded file must have been checked exactly once.
    expected_url = f"{dir_url}/does-not-exist"
    connector._check_file.assert_called_once_with(
        fake_conn, expected_url, {"size": 1, "stat": 1}, True
    )


def test_scan_missing_files_http(db, dir_obj_with_files):
    """Test scanning directories with http when the file check fails."""
    connector = ConnectionPool({}).get("http://localhost/testdata/")
    fake_conn = object()
    connector.get_connection = Mock(return_value=fake_conn)
    # A falsy file check makes the whole directory check fail.
    connector._check_file = Mock(return_value=False)
    dir_url = "http://localhost/testdata/pub/fedora/linux"

    assert connector.check_dir(dir_url, dir_obj_with_files) is False

    connector.get_connection.assert_called_once()
    expected_url = f"{dir_url}/does-not-exist"
    connector._check_file.assert_called_once_with(
        fake_conn, expected_url, {"size": 1, "stat": 1}, True
    )


def test_scan_empty_directory_http(db, dir_obj):
    """Test that a directory without files is reported as up-to-date with http."""
    connector = ConnectionPool({}).get("http://localhost/testdata/")
    connector.get_connection = Mock()
    connector._check_file = Mock(return_value=True)
    dir_url = "http://localhost/testdata/pub/fedora/linux"

    assert connector.check_dir(dir_url, dir_obj) is True

    connector.get_connection.assert_called_once()
    # With no recorded files there is nothing to check individually.
    connector._check_file.assert_not_called()


def test_scan_ftp(db, dir_obj_with_files):
    """Test scanning directories with ftp when the listing has every file."""
    connection_pool = ConnectionPool({})
    connector = connection_pool.get("ftp://localhost/testdata/")
    # The mocked listing is the database record itself, so nothing is missing.
    connector.get_ftp_dir = Mock(return_value=dir_obj_with_files.files)
    # Use an ftp:// URL, matching the connector and the other ftp test.
    dir_url = "ftp://localhost/testdata/pub/fedora/linux"
    result = connector.check_dir(dir_url, dir_obj_with_files)
    assert result is True
    connector.get_ftp_dir.assert_called_once_with(dir_url, True)


def test_scan_missing_files_ftp(db, dir_obj_with_files):
    """Test scanning directories with ftp when the listing lacks the file."""
    connection_pool = ConnectionPool({})
    connector = connection_pool.get("ftp://localhost/testdata/")
    # An empty listing means the recorded file is missing on the mirror.
    connector.get_ftp_dir = Mock(return_value={})
    # Use an ftp:// URL, matching the connector and the other ftp test.
    dir_url = "ftp://localhost/testdata/pub/fedora/linux"
    result = connector.check_dir(dir_url, dir_obj_with_files)
    assert result is False
    connector.get_ftp_dir.assert_called_once_with(dir_url, True)


def test_scan_empty_directory_ftp(db, dir_obj):
    """Test that a directory without files is reported as up-to-date with ftp."""
    connector = ConnectionPool({}).get("ftp://localhost/testdata/")
    connector.get_ftp_dir = Mock(return_value={})
    dir_url = "ftp://localhost/testdata/pub/fedora/linux"

    assert connector.check_dir(dir_url, dir_obj) is True

    connector.get_ftp_dir.assert_called_once_with(dir_url, True)
Loading