Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

improve portability of reproducible tarballs by replacing external tar command with tarfile module #4660

Open
wants to merge 17 commits into
base: 5.0.x
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
ec1ede9
use more portable --date argument for touch command used in reproduci…
lexming Sep 27, 2024
e7f3bbd
stop reproducible tarball generation command on any failure in the pipe
lexming Sep 27, 2024
ca09f4e
move command to make reproducible archives to its own generator metho…
lexming Sep 27, 2024
d0a55ba
replace harcoded pattern for reproducible archives command for call t…
lexming Sep 27, 2024
d7195c7
use tarfile module instead of executing external shell commands to cr…
lexming Oct 7, 2024
87b733a
add required flag to filetools.find_extension() method
lexming Oct 8, 2024
dd28095
add support for extended_dry_run mode to filetools.make_archive()
lexming Oct 8, 2024
a37af5a
add unit test for filetools.make_archive()
lexming Oct 8, 2024
f2296de
make test for github_get_source_tarball_from_git compatible with make…
lexming Oct 8, 2024
980f618
improve reliability of bit-wise operations setting file mode in repro…
lexming Oct 8, 2024
ddb9cae
Merge branch '5.0.x' into reprod-tarballs-mac
lexming Nov 5, 2024
a26c71e
set reproducible flag of make_archives from a specific variable
lexming Nov 5, 2024
3936a6e
simplify logic in EasyBlock.get_checksum_for and improve its logging
lexming Nov 5, 2024
6188cc7
ignore checksums of sources from git repos prior to Python 3.9
lexming Nov 5, 2024
ee772f7
only run checksum assertions in test_make_archive on Python 3.9+
lexming Nov 5, 2024
27ea1d0
add test_fetch_sources_git to easyblock suite
lexming Nov 5, 2024
ad47cac
push deprecation of cheksum check of git repo with Python < 3.9 to Ea…
lexming Nov 10, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 34 additions & 18 deletions easybuild/framework/easyblock.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@
import os
import re
import stat
import sys
import tempfile
import time
import traceback
Expand Down Expand Up @@ -358,34 +359,49 @@ def get_checksum_for(self, checksums, filename=None, index=None):
:param filename: name of the file to obtain checksum for
:param index: index of file in list
"""
checksum = None

# sometimes, filename are specified as a dict
chksum_input = filename
chksum_input_git = None
# if filename is provided as dict, take 'filename' key
if isinstance(filename, dict):
filename = filename['filename']
chksum_input = filename['filename']
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@lexming Should we be more careful here, and also use filename.get('filename', None) to avoid crashing if the filename key is not set?

chksum_input_git = filename.get('git_config', None)
# early return if no filename given
if chksum_input is None:
self.log.debug("Cannot get checksum without a file name")
return None

if sys.version_info[0] >= 3 and sys.version_info[1] < 9:
# ignore any checksum for given filename due to changes in python/cpython#90021
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
# ignore any checksum for given filename due to changes in python/cpython#90021
# ignore any checksum for given filename due to changes in https://github.com/python/cpython/issues/90021

# checksums of tarballs made by EB of git repos cannot be reliably checked prior to Python 3.9
if chksum_input_git is not None:
self.log.deprecated(
"Reproducible tarballs of git repos are only supported in Python 3.9+. "
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
"Reproducible tarballs of git repos are only supported in Python 3.9+. "
"Reproducible tarballs of Git repos are only possible when using Python 3.9+ to run EasyBuild. "

f"Checksum of {chksum_input} cannot be verified.",
'6.0'
)
return None

checksum = None
# if checksums are provided as a dict, lookup by source filename as key
if isinstance(checksums, dict):
if filename is not None and filename in checksums:
checksum = checksums[filename]
else:
checksum = None
elif isinstance(checksums, (list, tuple)):
if index is not None and index < len(checksums) and (index >= 0 or abs(index) <= len(checksums)):
try:
checksum = checksums[chksum_input]
except KeyError:
self.log.debug("Checksum not found for file: %s", chksum_input)
elif isinstance(checksums, (list, tuple)) and index is not None:
try:
checksum = checksums[index]
else:
checksum = None
elif checksums is None:
checksum = None
else:
except IndexError:
self.log.debug("Checksum not found for index list: %s", index)
elif checksums is not None:
raise EasyBuildError("Invalid type for checksums (%s), should be dict, list, tuple or None.",
type(checksums))

if checksum is None or build_option("checksum_priority") == CHECKSUM_PRIORITY_JSON:
json_checksums = self.get_checksums_from_json()
return json_checksums.get(filename, None)
else:
return checksum
return json_checksums.get(chksum_input, None)

return checksum

def get_checksums_from_json(self, always_read=False):
"""
Expand Down
113 changes: 82 additions & 31 deletions easybuild/tools/filetools.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,13 +48,15 @@
import inspect
import itertools
import os
import pathlib
import platform
import re
import shutil
import signal
import stat
import ssl
import sys
import tarfile
import tempfile
import time
import zlib
Expand Down Expand Up @@ -1402,18 +1404,20 @@ def get_local_dirs_purged():
return new_dir


def find_extension(filename):
def find_extension(filename, required=True):
"""Find best match for filename extension."""
# sort by length, so longest file extensions get preference
suffixes = sorted(EXTRACT_CMDS.keys(), key=len, reverse=True)
pat = r'(?P<ext>%s)$' % '|'.join([s.replace('.', '\\.') for s in suffixes])
res = re.search(pat, filename, flags=re.IGNORECASE)

if res:
ext = res.group('ext')
else:
return res.group('ext')

if required:
raise EasyBuildError("%s has unknown file extension", filename)

return ext
return None


def extract_cmd(filepath, overwrite=False):
Expand Down Expand Up @@ -2644,7 +2648,7 @@ def get_source_tarball_from_git(filename, target_dir, git_config):
"""
Downloads a git repository, at a specific tag or commit, recursively or not, and make an archive with it

:param filename: name of the archive to save the code to (must be .tar.gz)
:param filename: name of the archive to save the code to (must be extensionless)
:param target_dir: target directory where to save the archive to
:param git_config: dictionary containing url, repo_name, recursive, and one of tag or commit
"""
Expand Down Expand Up @@ -2680,8 +2684,10 @@ def get_source_tarball_from_git(filename, target_dir, git_config):
if not url:
raise EasyBuildError("url not specified in git_config parameter")

if not filename.endswith('.tar.gz'):
raise EasyBuildError("git_config currently only supports filename ending in .tar.gz")
file_ext = find_extension(filename, required=False)
if file_ext:
print_warning(f"Ignoring extension of filename '{filename}' set in git_config parameter")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Isn't this going to require cleaning up a whole bunch of easyconfigs?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Rather than printing a warning, we should enforce the use of .tar.xz as extension here (to "ease" the transition, in some way)

filename = filename[:-len(file_ext)]

# prepare target directory and clone repository
mkdir(target_dir, parents=True)
Expand Down Expand Up @@ -2768,37 +2774,82 @@ def get_source_tarball_from_git(filename, target_dir, git_config):
run_shell_cmd(cmd, work_dir=work_dir, hidden=True, verbose_dry_run=True)

# Create archive
archive_path = os.path.join(target_dir, filename)

if keep_git_dir:
# create archive of git repo including .git directory
tar_cmd = ['tar', 'cfvz', archive_path, repo_name]
else:
# create reproducible archive
# see https://reproducible-builds.org/docs/archives/
tar_cmd = [
# print names of all files and folders excluding .git directory
'find', repo_name, '-name ".git"', '-prune', '-o', '-print0',
# reset access and modification timestamps to epoch 0 (equivalent to --mtime in GNU tar)
'-exec', 'touch', '--date=@0', '{}', r'\;',
# reset file permissions of cloned repo (equivalent to --mode in GNU tar)
'-exec', 'chmod', '"go+u,go-w"', '{}', r'\;', '|',
# sort file list (equivalent to --sort in GNU tar)
'LC_ALL=C', 'sort', '--zero-terminated', '|',
# create tarball in GNU format with ownership and permissions reset
'tar', '--create', '--no-recursion', '--owner=0', '--group=0', '--numeric-owner',
'--format=gnu', '--null', '--files-from', '-', '|',
# compress tarball with gzip without original file name and timestamp
'gzip', '--no-name', '>', archive_path
]
run_shell_cmd(' '.join(tar_cmd), work_dir=tmpdir, hidden=True, verbose_dry_run=True)
repo_path = os.path.join(tmpdir, repo_name)
reproducible = not keep_git_dir # presence of .git directory renders repo unreproducible
archive_path = make_archive(repo_path, archive_name=filename, archive_dir=target_dir, reproducible=reproducible)

# cleanup (repo_name dir does not exist in dry run mode)
remove(tmpdir)

return archive_path


def make_archive(dir_path, archive_name=None, archive_dir=None, reproducible=False):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would rename this to make_tar_xz, since we're creating a very specific archive (a tarball), compressed with a hardcoded compression algorithms (XZ).

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Or maybe even make_reproducible_tar_xz

"""
Create a compressed tar archive in XZ format.

:dir_path: string with path to directory to be archived
:archive_name: string with extensionless filename of archive
:archive_dir: string with path to directory to place the archive
:reproducuble: make a tarball that is reproducible accross systems
see https://reproducible-builds.org/docs/archives/

Archive is compressed with LZMA into a .xz because that is compatible with
a reproducible archive. Other formats like .gz are not reproducible due to
arbitrary strings and timestamps getting added into their metadata.
Comment on lines +2798 to +2799
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, this is new to me, on what source is this based, do we have a reference we can point to?

"""
def reproducible_filter(tarinfo):
"Filter out system-dependent data from tarball"
# contents of '.git' subdir are inherently system dependent
if "/.git/" in tarinfo.name or tarinfo.name.endswith("/.git"):
return None
# set timestamp to epoch 0
tarinfo.mtime = 0
# reset file permissions by applying go+u,go-w
user_mode = tarinfo.mode & stat.S_IRWXU
group_mode = (user_mode >> 3) & ~stat.S_IWGRP # user mode without write
other_mode = group_mode >> 3 # user mode without write
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
other_mode = group_mode >> 3 # user mode without write
other_mode = group_mode >> 3 # same as group mode

tarinfo.mode = (tarinfo.mode & ~0o77) | group_mode | other_mode
# reset ownership numeric UID/GID 0
tarinfo.uid = tarinfo.gid = 0
tarinfo.uname = tarinfo.gname = ""
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, why does empty user/group name make sense, what does this imply?

return tarinfo

if archive_name is None:
archive_name = os.path.basename(dir_path)

archive_ext = ".tar.xz"
archive_filename = archive_name + archive_ext
archive_path = archive_filename if archive_dir is None else os.path.join(archive_dir, archive_filename)

if build_option('extended_dry_run'):
# early return in dry run mode
dry_run_msg("Archiving '%s' into '%s'...", dir_path, archive_path)
return archive_path

# TODO: replace with TarFile.add(recursive=True) when support for Python 3.6 drops
# since Python v3.7 tarfile automatically orders the list of files added to the archive
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we have a reference for this?

dir_files = [dir_path]
# pathlib's glob includes hidden files
dir_files.extend([str(filepath) for filepath in pathlib.Path(dir_path).glob("**/*")])
dir_files.sort() # independent of locale

dir_path_prefix = os.path.dirname(dir_path)
archive_filter = reproducible_filter if reproducible else None

_log.info("Archiving '%s' into '%s'...", dir_path, archive_path)
with tarfile.open(archive_path, "w:xz", format=tarfile.GNU_FORMAT, encoding="utf-8", preset=6) as archive:
for filepath in dir_files:
# archive with target directory in its top level, remove any prefix in path
file_name = os.path.relpath(filepath, start=dir_path_prefix)
archive.add(filepath, arcname=file_name, recursive=False, filter=archive_filter)
_log.debug("File/folder added to archive '%s': %s", archive_filename, filepath)

_log.info("Archive '%s' created successfully", archive_filename)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
_log.info("Archive '%s' created successfully", archive_filename)
_log.info(f"Archive '{archive_path}' created successfully")


return archive_path


def move_file(path, target_path, force_in_dry_run=False):
"""
Move a file from path to target_path
Expand Down
35 changes: 35 additions & 0 deletions test/framework/easyblock.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
import sys
import tempfile
from inspect import cleandoc
from test.framework.github import requires_github_access
from test.framework.utilities import EnhancedTestCase, TestLoaderFiltered, init_config
from unittest import TextTestRunner

Expand Down Expand Up @@ -1618,6 +1619,40 @@ def test_fetch_sources(self):
error_pattern = "Found one or more unexpected keys in 'sources' specification: {'nosuchkey': 'foobar'}"
self.assertErrorRegex(EasyBuildError, error_pattern, eb.fetch_sources, sources, checksums=[])

@requires_github_access()
def test_fetch_sources_git(self):
"""Test fetch_sources method from git repo."""
testdir = os.path.abspath(os.path.dirname(__file__))
ec = process_easyconfig(os.path.join(testdir, 'easyconfigs', 'test_ecs', 't', 'toy', 'toy-0.0.eb'))[0]
eb = get_easyblock_instance(ec)
eb.src = []
sources = [
{
'filename': 'testrepository.tar.xz',
'git_config': {
'repo_name': 'testrepository',
'url': 'https://github.com/easybuilders',
'tag': 'branch_tag_for_test',
}
}
]
checksums = ["00000000"]
with self.mocked_stdout_stderr():
eb.fetch_sources(sources, checksums=checksums)

self.assertEqual(len(eb.src), 1)
self.assertEqual(eb.src[0]['name'], "testrepository.tar.xz")
self.assertExists(eb.src[0]['path'])
self.assertEqual(eb.src[0]['cmd'], None)

reference_checksum = "00000000"
if sys.version_info[0] >= 3 and sys.version_info[1] < 9:
# checksums of tarballs made by EB cannot be reliably checked prior to Python 3.9
# due to changes introduced in python/cpython#90021
reference_checksum = None

self.assertEqual(eb.src[0]['checksum'], reference_checksum)

def test_download_instructions(self):
"""Test use of download_instructions easyconfig parameter."""

Expand Down
Loading
Loading