-
Notifications
You must be signed in to change notification settings - Fork 203
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
improve portability of reproducible tarballs by replacing external tar
command with tarfile
module
#4660
base: 5.0.x
Are you sure you want to change the base?
improve portability of reproducible tarballs by replacing external tar
command with tarfile
module
#4660
Changes from all commits
ec1ede9
e7f3bbd
ca09f4e
d0a55ba
d7195c7
87b733a
dd28095
a37af5a
f2296de
980f618
ddb9cae
a26c71e
3936a6e
6188cc7
ee772f7
27ea1d0
ad47cac
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -49,6 +49,7 @@ | |||||
import os | ||||||
import re | ||||||
import stat | ||||||
import sys | ||||||
import tempfile | ||||||
import time | ||||||
import traceback | ||||||
|
@@ -358,34 +359,49 @@ def get_checksum_for(self, checksums, filename=None, index=None): | |||||
:param filename: name of the file to obtain checksum for | ||||||
:param index: index of file in list | ||||||
""" | ||||||
checksum = None | ||||||
|
||||||
# sometimes, filename are specified as a dict | ||||||
chksum_input = filename | ||||||
chksum_input_git = None | ||||||
# if filename is provided as dict, take 'filename' key | ||||||
if isinstance(filename, dict): | ||||||
filename = filename['filename'] | ||||||
chksum_input = filename['filename'] | ||||||
chksum_input_git = filename.get('git_config', None) | ||||||
# early return if no filename given | ||||||
if chksum_input is None: | ||||||
self.log.debug("Cannot get checksum without a file name") | ||||||
return None | ||||||
|
||||||
if sys.version_info[0] >= 3 and sys.version_info[1] < 9: | ||||||
# ignore any checksum for given filename due to changes in python/cpython#90021 | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
# checksums of tarballs made by EB of git repos cannot be reliably checked prior to Python 3.9 | ||||||
if chksum_input_git is not None: | ||||||
self.log.deprecated( | ||||||
"Reproducible tarballs of git repos are only supported in Python 3.9+. " | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
f"Checksum of {chksum_input} cannot be verified.", | ||||||
'6.0' | ||||||
) | ||||||
return None | ||||||
|
||||||
checksum = None | ||||||
# if checksums are provided as a dict, lookup by source filename as key | ||||||
if isinstance(checksums, dict): | ||||||
if filename is not None and filename in checksums: | ||||||
checksum = checksums[filename] | ||||||
else: | ||||||
checksum = None | ||||||
elif isinstance(checksums, (list, tuple)): | ||||||
if index is not None and index < len(checksums) and (index >= 0 or abs(index) <= len(checksums)): | ||||||
try: | ||||||
checksum = checksums[chksum_input] | ||||||
except KeyError: | ||||||
self.log.debug("Checksum not found for file: %s", chksum_input) | ||||||
elif isinstance(checksums, (list, tuple)) and index is not None: | ||||||
try: | ||||||
checksum = checksums[index] | ||||||
else: | ||||||
checksum = None | ||||||
elif checksums is None: | ||||||
checksum = None | ||||||
else: | ||||||
except IndexError: | ||||||
self.log.debug("Checksum not found for index list: %s", index) | ||||||
elif checksums is not None: | ||||||
raise EasyBuildError("Invalid type for checksums (%s), should be dict, list, tuple or None.", | ||||||
type(checksums)) | ||||||
|
||||||
if checksum is None or build_option("checksum_priority") == CHECKSUM_PRIORITY_JSON: | ||||||
json_checksums = self.get_checksums_from_json() | ||||||
return json_checksums.get(filename, None) | ||||||
else: | ||||||
return checksum | ||||||
return json_checksums.get(chksum_input, None) | ||||||
|
||||||
return checksum | ||||||
|
||||||
def get_checksums_from_json(self, always_read=False): | ||||||
""" | ||||||
|
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -48,13 +48,15 @@ | |||||
import inspect | ||||||
import itertools | ||||||
import os | ||||||
import pathlib | ||||||
import platform | ||||||
import re | ||||||
import shutil | ||||||
import signal | ||||||
import stat | ||||||
import ssl | ||||||
import sys | ||||||
import tarfile | ||||||
import tempfile | ||||||
import time | ||||||
import zlib | ||||||
|
@@ -1402,18 +1404,20 @@ def get_local_dirs_purged(): | |||||
return new_dir | ||||||
|
||||||
|
||||||
def find_extension(filename): | ||||||
def find_extension(filename, required=True): | ||||||
"""Find best match for filename extension.""" | ||||||
# sort by length, so longest file extensions get preference | ||||||
suffixes = sorted(EXTRACT_CMDS.keys(), key=len, reverse=True) | ||||||
pat = r'(?P<ext>%s)$' % '|'.join([s.replace('.', '\\.') for s in suffixes]) | ||||||
res = re.search(pat, filename, flags=re.IGNORECASE) | ||||||
|
||||||
if res: | ||||||
ext = res.group('ext') | ||||||
else: | ||||||
return res.group('ext') | ||||||
|
||||||
if required: | ||||||
raise EasyBuildError("%s has unknown file extension", filename) | ||||||
|
||||||
return ext | ||||||
return None | ||||||
|
||||||
|
||||||
def extract_cmd(filepath, overwrite=False): | ||||||
|
@@ -2644,7 +2648,7 @@ def get_source_tarball_from_git(filename, target_dir, git_config): | |||||
""" | ||||||
Downloads a git repository, at a specific tag or commit, recursively or not, and make an archive with it | ||||||
|
||||||
:param filename: name of the archive to save the code to (must be .tar.gz) | ||||||
:param filename: name of the archive to save the code to (must be extensionless) | ||||||
:param target_dir: target directory where to save the archive to | ||||||
:param git_config: dictionary containing url, repo_name, recursive, and one of tag or commit | ||||||
""" | ||||||
|
@@ -2680,8 +2684,10 @@ def get_source_tarball_from_git(filename, target_dir, git_config): | |||||
if not url: | ||||||
raise EasyBuildError("url not specified in git_config parameter") | ||||||
|
||||||
if not filename.endswith('.tar.gz'): | ||||||
raise EasyBuildError("git_config currently only supports filename ending in .tar.gz") | ||||||
file_ext = find_extension(filename, required=False) | ||||||
if file_ext: | ||||||
print_warning(f"Ignoring extension of filename '{filename}' set in git_config parameter") | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Isn't this going to require cleaning up a whole bunch of easyconfigs? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Rather than printing a warning, we should enforce the use of |
||||||
filename = filename[:-len(file_ext)] | ||||||
|
||||||
# prepare target directory and clone repository | ||||||
mkdir(target_dir, parents=True) | ||||||
|
@@ -2768,37 +2774,82 @@ def get_source_tarball_from_git(filename, target_dir, git_config): | |||||
run_shell_cmd(cmd, work_dir=work_dir, hidden=True, verbose_dry_run=True) | ||||||
|
||||||
# Create archive | ||||||
archive_path = os.path.join(target_dir, filename) | ||||||
|
||||||
if keep_git_dir: | ||||||
# create archive of git repo including .git directory | ||||||
tar_cmd = ['tar', 'cfvz', archive_path, repo_name] | ||||||
else: | ||||||
# create reproducible archive | ||||||
# see https://reproducible-builds.org/docs/archives/ | ||||||
tar_cmd = [ | ||||||
# print names of all files and folders excluding .git directory | ||||||
'find', repo_name, '-name ".git"', '-prune', '-o', '-print0', | ||||||
# reset access and modification timestamps to epoch 0 (equivalent to --mtime in GNU tar) | ||||||
'-exec', 'touch', '--date=@0', '{}', r'\;', | ||||||
# reset file permissions of cloned repo (equivalent to --mode in GNU tar) | ||||||
'-exec', 'chmod', '"go+u,go-w"', '{}', r'\;', '|', | ||||||
# sort file list (equivalent to --sort in GNU tar) | ||||||
'LC_ALL=C', 'sort', '--zero-terminated', '|', | ||||||
# create tarball in GNU format with ownership and permissions reset | ||||||
'tar', '--create', '--no-recursion', '--owner=0', '--group=0', '--numeric-owner', | ||||||
'--format=gnu', '--null', '--files-from', '-', '|', | ||||||
# compress tarball with gzip without original file name and timestamp | ||||||
'gzip', '--no-name', '>', archive_path | ||||||
] | ||||||
run_shell_cmd(' '.join(tar_cmd), work_dir=tmpdir, hidden=True, verbose_dry_run=True) | ||||||
repo_path = os.path.join(tmpdir, repo_name) | ||||||
reproducible = not keep_git_dir # presence of .git directory renders repo unreproducible | ||||||
archive_path = make_archive(repo_path, archive_name=filename, archive_dir=target_dir, reproducible=reproducible) | ||||||
|
||||||
# cleanup (repo_name dir does not exist in dry run mode) | ||||||
remove(tmpdir) | ||||||
|
||||||
return archive_path | ||||||
|
||||||
|
||||||
def make_archive(dir_path, archive_name=None, archive_dir=None, reproducible=False): | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would rename this to There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Or maybe even |
||||||
""" | ||||||
Create a compressed tar archive in XZ format. | ||||||
|
||||||
:dir_path: string with path to directory to be archived | ||||||
:archive_name: string with extensionless filename of archive | ||||||
:archive_dir: string with path to directory to place the archive | ||||||
:reproducuble: make a tarball that is reproducible accross systems | ||||||
see https://reproducible-builds.org/docs/archives/ | ||||||
|
||||||
Archive is compressed with LZMA into a .xz because that is compatible with | ||||||
a reproducible archive. Other formats like .gz are not reproducible due to | ||||||
arbitrary strings and timestamps getting added into their metadata. | ||||||
Comment on lines
+2798
to
+2799
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hmm, this is new to me, on what source is this based, do we have a reference we can point to? |
||||||
""" | ||||||
def reproducible_filter(tarinfo): | ||||||
"Filter out system-dependent data from tarball" | ||||||
# contents of '.git' subdir are inherently system dependent | ||||||
if "/.git/" in tarinfo.name or tarinfo.name.endswith("/.git"): | ||||||
return None | ||||||
# set timestamp to epoch 0 | ||||||
tarinfo.mtime = 0 | ||||||
# reset file permissions by applying go+u,go-w | ||||||
user_mode = tarinfo.mode & stat.S_IRWXU | ||||||
group_mode = (user_mode >> 3) & ~stat.S_IWGRP # user mode without write | ||||||
other_mode = group_mode >> 3 # user mode without write | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
tarinfo.mode = (tarinfo.mode & ~0o77) | group_mode | other_mode | ||||||
# reset ownership numeric UID/GID 0 | ||||||
tarinfo.uid = tarinfo.gid = 0 | ||||||
tarinfo.uname = tarinfo.gname = "" | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hmm, why does empty user/group name make sense, what does this imply? |
||||||
return tarinfo | ||||||
|
||||||
if archive_name is None: | ||||||
archive_name = os.path.basename(dir_path) | ||||||
|
||||||
archive_ext = ".tar.xz" | ||||||
archive_filename = archive_name + archive_ext | ||||||
archive_path = archive_filename if archive_dir is None else os.path.join(archive_dir, archive_filename) | ||||||
|
||||||
if build_option('extended_dry_run'): | ||||||
# early return in dry run mode | ||||||
dry_run_msg("Archiving '%s' into '%s'...", dir_path, archive_path) | ||||||
return archive_path | ||||||
|
||||||
# TODO: replace with TarFile.add(recursive=True) when support for Python 3.6 drops | ||||||
# since Python v3.7 tarfile automatically orders the list of files added to the archive | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we have a reference for this? |
||||||
dir_files = [dir_path] | ||||||
# pathlib's glob includes hidden files | ||||||
dir_files.extend([str(filepath) for filepath in pathlib.Path(dir_path).glob("**/*")]) | ||||||
dir_files.sort() # independent of locale | ||||||
|
||||||
dir_path_prefix = os.path.dirname(dir_path) | ||||||
archive_filter = reproducible_filter if reproducible else None | ||||||
|
||||||
_log.info("Archiving '%s' into '%s'...", dir_path, archive_path) | ||||||
with tarfile.open(archive_path, "w:xz", format=tarfile.GNU_FORMAT, encoding="utf-8", preset=6) as archive: | ||||||
for filepath in dir_files: | ||||||
# archive with target directory in its top level, remove any prefix in path | ||||||
file_name = os.path.relpath(filepath, start=dir_path_prefix) | ||||||
archive.add(filepath, arcname=file_name, recursive=False, filter=archive_filter) | ||||||
_log.debug("File/folder added to archive '%s': %s", archive_filename, filepath) | ||||||
|
||||||
_log.info("Archive '%s' created successfully", archive_filename) | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
|
||||||
return archive_path | ||||||
|
||||||
|
||||||
def move_file(path, target_path, force_in_dry_run=False): | ||||||
""" | ||||||
Move a file from path to target_path | ||||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@lexming Should we be more careful here, and also use
filename.get('filename', None)
to avoid crashing if thefilename
key is not set?