From 6bd93b5d8b9ad63fad6c139b412edf9f62c7629c Mon Sep 17 00:00:00 2001 From: Alexander Grund Date: Mon, 14 Oct 2024 12:42:59 +0200 Subject: [PATCH 1/2] avoid name clash in cargo crates downloaded from git repositories Crates referenced by their git revision have a version but might have different revisions. The source filename ignored the revision such that sources from different revisions were considered the same file which either fails the checksum verification or the build. Append the revision if specified to disambiguate them. --- easybuild/easyblocks/generic/cargo.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/easybuild/easyblocks/generic/cargo.py b/easybuild/easyblocks/generic/cargo.py index d03407862d..422d74f7a7 100644 --- a/easybuild/easyblocks/generic/cargo.py +++ b/easybuild/easyblocks/generic/cargo.py @@ -81,12 +81,15 @@ def extra_options(extra_vars=None): return extra_vars @staticmethod - def crate_src_filename(pkg_name, pkg_version, *args): - """Crate tarball filename based on package name and version""" - return "{0}-{1}.tar.gz".format(pkg_name, pkg_version) + def crate_src_filename(pkg_name, pkg_version, _=None, rev=None): + """Crate tarball filename based on package name, version and optionally git revision""" + parts = [pkg_name, pkg_version] + if rev is not None: + parts.append(rev) + return '-'.join(parts) + ".tar.gz" @staticmethod - def crate_download_filename(pkg_name, pkg_version, *args): + def crate_download_filename(pkg_name, pkg_version): """Crate download filename based on package name and version""" return "{0}/{1}/download".format(pkg_name, pkg_version) @@ -148,7 +151,7 @@ def __init__(self, *args, **kwargs): repo_name = repo_name[:-4] sources.append({ 'git_config': {'url': url, 'repo_name': repo_name, 'commit': rev}, - 'filename': self.crate_src_filename(crate, version), + 'filename': self.crate_src_filename(crate, version, rev=rev), }) # copy EasyConfig instance before we make changes to it From b88ab292466122dd5835c49a9bf8eecd1594a0f9 Mon Sep 17 00:00:00 2001 From: Alexander Grund Date: Mon, 14 Oct 2024 13:06:23 +0200 Subject: [PATCH 2/2] Handle cargo workspaces in git repositories Crates downloaded from crates.io can just be put into the vendored-sources directory. However a referenced git repository might contain multiple crates in a so-called "workspace". If we put such a workspace into the vendor folder, cargo fails with > found a virtual manifest at [...]Cargo.toml instead of a package manifest This solution uses a separate folder for sources downloaded from git repositories and checks them for workspaces. If they are "regular", i.e. don't have a workspace, they are put to the vendored sources from crates.io. Otherwise they are left in the git-vendor directory. Additionally refine the config.toml entry for sources vendored from git. This allows to override a specific git URL and revision with the path to the extracted repository. Otherwise we would need entries for each crate in workspaces but then cargo tries to verify the revision and fails with > Unable to update https://github.com/[...]?rev=[...] > Caused by: can't checkout from 'https://github.com/[...]': you are in the offline mode (--offline) --- easybuild/easyblocks/generic/cargo.py | 204 ++++++++++++++++++++------ 1 file changed, 158 insertions(+), 46 deletions(-) diff --git a/easybuild/easyblocks/generic/cargo.py b/easybuild/easyblocks/generic/cargo.py index 422d74f7a7..b4e8316932 100644 --- a/easybuild/easyblocks/generic/cargo.py +++ b/easybuild/easyblocks/generic/cargo.py @@ -27,6 +27,7 @@ @author: Mikael Oehman (Chalmers University of Technology) @author: Alex Domingo (Vrije Universiteit Brussel) +@author: Alexander Grund (TU Dresden) """ import os @@ -37,10 +38,10 @@ from easybuild.tools.build_log import EasyBuildError, print_warning from easybuild.framework.easyconfig import CUSTOM from easybuild.framework.extensioneasyblock import ExtensionEasyBlock -from easybuild.tools.filetools import extract_file, change_dir +from easybuild.tools.filetools import extract_file from easybuild.tools.run import run_cmd from easybuild.tools.config import build_option -from easybuild.tools.filetools import compute_checksum, mkdir, write_file +from easybuild.tools.filetools import compute_checksum, mkdir, move_file, read_file, write_file, CHECKSUM_TYPE_SHA256 from easybuild.tools.toolchain.compiler import OPTARCH_GENERIC CRATESIO_SOURCE = "https://crates.io/api/v1/crates" @@ -54,14 +55,74 @@ """ -CONFIG_TOML_PATCH_GIT = """ -[patch."{repo}"] -{crates} +CONFIG_TOML_SOURCE_GIT = """ +[source."{url}?rev={rev}"] +git = "{url}" +rev = "{rev}" +replace-with = "vendored-sources" + """ -CONFIG_TOML_PATCH_GIT_CRATES = """{0} = {{ path = "{1}" }} + +CONFIG_TOML_SOURCE_GIT_WORKSPACE = """ +[source."real-{url}?rev={rev}"] +directory = "{workspace_dir}" + +[source."{url}?rev={rev}"] +git = "{url}" +rev = "{rev}" +replace-with = "real-{url}?rev={rev}" + """ -CARGO_CHECKSUM_JSON = '{{"files": {{}}, "package": "{chksum}"}}' +CARGO_CHECKSUM_JSON = '{{"files": {{}}, "package": "{checksum}"}}' + + +def get_workspace_members(crate_dir): + """Find all members of a cargo workspace in crate_dir. + + (Minimally) parse the Cargo.toml file. + If it is a workspace return all members (subfolder names). + Otherwise return None. + """ + cargo_toml = os.path.join(crate_dir, 'Cargo.toml') + + # We are looking for this: + # [workspace] + # members = [ + # "reqwest-middleware", + # "reqwest-tracing", + # "reqwest-retry", + # ] + + lines = [line.strip() for line in read_file(cargo_toml).splitlines()] + try: + start_idx = lines.index('[workspace]') + except ValueError: + return None + # Find "members = [" and concatenate the value, stop at end of section or file + member_str = None + for line in lines[start_idx + 1:]: + if line.startswith('#'): + continue # Skip comments + if re.match(r'\[\w+\]', line): + break + if member_str is None: + m = re.match(r'members\s+=\s+\[', line) + if m: + member_str = line[m.end():] + elif line.endswith(']'): + member_str += line[:-1].strip() + break + else: + member_str += line + # Split at commas after removing possibly trailing ones and remove the quotes + members = [member.strip().strip('"') for member in member_str.rstrip(',').split(',')] + # Sanity check that we didn't pick up anything unexpected + invalid_members = [member for member in members if not re.match(r'(\w|-)+', member)] + if invalid_members: + raise EasyBuildError('Failed to parse %s: Found seemingly invalid members: %s', + cargo_toml, ', '.join(invalid_members)) + return [os.path.join(crate_dir, m) for m in members] class Cargo(ExtensionEasyBlock): @@ -125,7 +186,6 @@ def __init__(self, *args, **kwargs): """Constructor for Cargo easyblock.""" super(Cargo, self).__init__(*args, **kwargs) self.cargo_home = os.path.join(self.builddir, '.cargo') - self.vendor_dir = os.path.join(self.builddir, 'easybuild_vendor') env.setvar('CARGO_HOME', self.cargo_home) env.setvar('RUSTC', 'rustc') env.setvar('RUSTDOC', 'rustdoc') @@ -168,67 +228,119 @@ def extract_step(self): """ Unpack the source files and populate them with required .cargo-checksum.json if offline """ - mkdir(self.vendor_dir) + vendor_dir = os.path.join(self.builddir, 'easybuild_vendor') + mkdir(vendor_dir) + # Sources from git repositories might contain multiple crates/folders in a so-called "workspace". + # If we put such a workspace into the vendor folder, cargo fails with + # "found a virtual manifest at [...]Cargo.toml instead of a package manifest". + # Hence we put those in a separate folder and only move "regular" crates into the vendor folder. + git_vendor_dir = os.path.join(self.builddir, 'easybuild_vendor_git') + mkdir(git_vendor_dir) vendor_crates = {self.crate_src_filename(*crate): crate for crate in self.crates} - git_sources = {crate[2]: [] for crate in self.crates if len(crate) == 4} + # Track git sources for building the cargo config and avoiding duplicated folders + git_sources = {} for src in self.src: - extraction_dir = self.builddir + # Check for git crates, `git_key` will be set to a true-ish value for those + try: + crate_name, _, git_repo, rev = vendor_crates[src['name']] + except (ValueError, KeyError): + git_key = None + else: + git_key = (git_repo, rev) + self.log.debug("Sources of %s(%s) belong to git repo: %s rev %s", + crate_name, src['name'], git_repo, rev) + # Do a sanity check that sources for the same repo and revision are the same + try: + previous_source = git_sources[git_key] + except KeyError: + git_sources[git_key] = src + else: + previous_checksum = previous_source['checksum'] + current_checksum = src['checksum'] + if previous_checksum and current_checksum and previous_checksum != current_checksum: + raise EasyBuildError("Sources for the same git repository need to be identical." + "Mismatch found for %s rev %s in %s vs %s", + git_repo, rev, previous_source['name'], src['name']) + self.log.info("Source %s already extracted to %s by %s. Skipping extraction.", + src['name'], previous_source['finalpath'], previous_source['name']) + src['finalpath'] = previous_source['finalpath'] + continue + + is_vendor_crate = src['name'] in vendor_crates # Extract dependency crates into vendor subdirectory, separate from sources of main package - if src['name'] in vendor_crates: - extraction_dir = self.vendor_dir + if is_vendor_crate: + extraction_dir = git_vendor_dir if git_key else vendor_dir + else: + extraction_dir = self.builddir self.log.info("Unpacking source of %s", src['name']) existing_dirs = set(os.listdir(extraction_dir)) - crate_dir = None src_dir = extract_file(src['path'], extraction_dir, cmd=src['cmd'], extra_options=self.cfg['unpack_options'], change_into_dir=False) new_extracted_dirs = set(os.listdir(extraction_dir)) - existing_dirs - if len(new_extracted_dirs) == 1: - # Expected crate tarball with 1 folder - crate_dir = new_extracted_dirs.pop() - src_dir = os.path.join(extraction_dir, crate_dir) - self.log.debug("Unpacked sources of %s into: %s", src['name'], src_dir) - elif len(new_extracted_dirs) == 0: + if len(new_extracted_dirs) == 0: # Extraction went wrong raise EasyBuildError("Unpacking sources of '%s' failed", src['name']) + # Expected crate tarball with 1 folder # TODO: properly handle case with multiple extracted folders # this is currently in a grey area, might still be used by cargo + if len(new_extracted_dirs) == 1: + src_dir = os.path.join(extraction_dir, new_extracted_dirs.pop()) + self.log.debug("Unpacked sources of %s into: %s", src['name'], src_dir) - change_dir(src_dir) - self.src[self.src.index(src)]['finalpath'] = src_dir - - if self.cfg['offline'] and crate_dir: - # Create checksum file for extracted sources required by vendored crates.io sources - self.log.info('creating .cargo-checksums.json file for : %s', crate_dir) - chksum = compute_checksum(src['path'], checksum_type='sha256') - chkfile = os.path.join(extraction_dir, crate_dir, '.cargo-checksum.json') - write_file(chkfile, CARGO_CHECKSUM_JSON.format(chksum=chksum)) - # Add path to extracted sources for any crate from a git repo - try: - crate_name, _, crate_repo, _ = vendor_crates[src['name']] - except (ValueError, KeyError): - pass - else: - self.log.debug("Sources of %s belong to git repo: %s", src['name'], crate_repo) - git_src_dir = (crate_name, src_dir) - git_sources[crate_repo].append(git_src_dir) + if is_vendor_crate and self.cfg['offline']: + # Create checksum file for extracted sources required by vendored crates + + # By default there is only a single crate + crate_dirs = [src_dir] + # For git sources determine the folders that contain crates by taking workspaces into account + if git_key: + member_dirs = get_workspace_members(src_dir) + if member_dirs: + crate_dirs = member_dirs + + try: + checksum = src[CHECKSUM_TYPE_SHA256] + except KeyError: + checksum = compute_checksum(src['path'], checksum_type=CHECKSUM_TYPE_SHA256) + for crate_dir in crate_dirs: + self.log.info('creating .cargo-checksums.json file for %s', os.path.basename(crate_dir)) + chkfile = os.path.join(src_dir, crate_dir, '.cargo-checksum.json') + write_file(chkfile, CARGO_CHECKSUM_JSON.format(checksum=checksum)) + # Move non-workspace git crates to the vendor folder + if git_key and member_dirs is None: + src_dir = os.path.join(vendor_dir, os.path.basename(crate_dirs[0])) + move_file(crate_dirs[0], src_dir) + + src['finalpath'] = src_dir if self.cfg['offline']: self.log.info("Setting vendored crates dir for offline operation") config_toml = os.path.join(self.cargo_home, 'config.toml') # Replace crates-io with vendored sources using build dir wide toml file in CARGO_HOME - # because the rust source subdirectories might differ with python packages self.log.debug("Writting config.toml entry for vendored crates from crate.io") - write_file(config_toml, CONFIG_TOML_SOURCE_VENDOR.format(vendor_dir=self.vendor_dir), append=True) - - # also vendor sources from other git sources (could be many crates for one git source) - for git_repo, repo_crates in git_sources.items(): - self.log.debug("Writting config.toml entry for git repo: %s", git_repo) - config_crates = ''.join([CONFIG_TOML_PATCH_GIT_CRATES.format(*crate) for crate in repo_crates]) - write_file(config_toml, CONFIG_TOML_PATCH_GIT.format(repo=git_repo, crates=config_crates), append=True) + write_file(config_toml, CONFIG_TOML_SOURCE_VENDOR.format(vendor_dir=vendor_dir), append=True) + + # Tell cargo about the vendored git sources to avoid it failing with: + # Unable to update https://github.com/[...] + # can't checkout from 'https://github.com/[...]]': you are in the offline mode (--offline) + for (git_repo, rev), src in git_sources.items(): + self.log.debug("Writting config.toml entry for git repo: %s rev %s", git_repo, rev) + src_dir = src['finalpath'] + if os.path.dirname(src_dir) == vendor_dir: + # Non-workspace sources are in vendor_dir + write_file(config_toml, + CONFIG_TOML_SOURCE_GIT.format(url=git_repo, rev=rev), + append=True) + else: + # Workspace sources stay in their own separate folder. + # We cannot have a `directory = ""` entry where a folder containing a workspace is inside + write_file(config_toml, + CONFIG_TOML_SOURCE_GIT_WORKSPACE.format(url=git_repo, rev=rev, workspace_dir=src_dir), + append=True) # Use environment variable since it would also be passed along to builds triggered via python packages env.setvar('CARGO_NET_OFFLINE', 'true')