diff --git a/README.md b/README.md index 0623b5f8..dbb7f172 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ Tooling for connecting GitLab, pipelines, and SODAR at CUBI. Prerequisites when using conda: ```bash -$ conda create -n cubi-tk python=3.7 +$ conda create -n cubi-tk python=3.10 $ conda activate cubi-tk ``` diff --git a/cubi_tk/irods/check.py b/cubi_tk/irods/check.py index be1552f7..05305367 100644 --- a/cubi_tk/irods/check.py +++ b/cubi_tk/irods/check.py @@ -9,7 +9,10 @@ import typing from irods.collection import iRODSCollection +from irods.column import Like from irods.data_object import iRODSDataObject +from irods.models import Collection as CollectionModel +from irods.models import DataObject as DataObjectModel from irods.session import iRODSSession from logzero import logger import tqdm @@ -103,16 +106,32 @@ def get_irods_error(cls, e: Exception): es = str(e) return es if es != "None" else e.__class__.__name__ - def get_data_objs(self, root_coll: iRODSCollection): + def get_data_objs( + self, root_coll: iRODSCollection + ) -> typing.Dict[ + str, typing.Union[typing.Dict[str, iRODSDataObject], typing.List[iRODSDataObject]] + ]: """Get data objects recursively under the given iRODS path.""" data_objs = dict(files=[], checksums={}) ignore_schemes = [k.lower() for k in HASH_SCHEMES if k != self.args.hash_scheme.upper()] - for res in root_coll.walk(): - for obj in res[2]: - if obj.path.endswith("." + self.args.hash_scheme.lower()): - data_objs["checksums"][obj.path] = obj - elif obj.path.split(".")[-1] not in ignore_schemes: - data_objs["files"].append(obj) + irods_sess = root_coll.manager.sess + + query = irods_sess.query(DataObjectModel, CollectionModel).filter( + Like(CollectionModel.name, f"{root_coll.path}%") + ) + + for res in query: + # If the 'res' dict is not split into Colllection&Object the resulting iRODSDataObject is not fully functional, likely because a name/path/... attribute is overwritten somewhere + coll_res = {k: v for k, v in res.items() if k.icat_id >= 500} + obj_res = {k: v for k, v in res.items() if k.icat_id < 500} + coll = iRODSCollection(root_coll.manager, coll_res) + obj = iRODSDataObject(irods_sess.data_objects, parent=coll, results=[obj_res]) + + if obj.path.endswith("." 
+ self.args.hash_scheme.lower()): + data_objs["checksums"][obj.path] = obj + elif obj.path.split(".")[-1] not in ignore_schemes: + data_objs["files"].append(obj) + return data_objs def check_args(self, _args): diff --git a/cubi_tk/snappy/check_remote.py b/cubi_tk/snappy/check_remote.py index e610be8f..35dc6c67 100644 --- a/cubi_tk/snappy/check_remote.py +++ b/cubi_tk/snappy/check_remote.py @@ -258,7 +258,7 @@ def coordinate_run(self, check_name): # Restrict dictionary to directories associated with step subset_remote_files_dict = {} for key in self.remote_files_dict: - if all((check_name in dat.irods_path for dat in self.remote_files_dict[key])): + if all((check_name in dat.path for dat in self.remote_files_dict[key])): subset_remote_files_dict[key] = self.remote_files_dict.get(key) # Parse local files - remove library reference @@ -333,7 +333,7 @@ def compare_md5_files(remote_dict, in_both_set): # Compare to remote MD5 for irods_dat in remote_dict.get(original_file_name): if local_md5 != irods_dat.FILE_MD5SUM: - different_md5_list.append((md5_file.replace(".md5", ""), irods_dat.irods_path)) + different_md5_list.append((md5_file.replace(".md5", ""), irods_dat.path)) else: same_md5_list.append(md5_file.replace(".md5", "")) # BONUS - check remote replicas @@ -345,7 +345,7 @@ def compare_md5_files(remote_dict, in_both_set): ): logger.error( f"iRODS metadata checksum not consistent with checksum file...\n" - f"File: {irods_dat.irods_path}\n" + f"File: {irods_dat.path}\n" f"File checksum: {irods_dat.FILE_MD5SUM}\n" f"Metadata checksum: {', '.join(irods_dat.REPLICAS_MD5SUM)}\n" ) @@ -393,7 +393,7 @@ def compare_local_and_remote_files(local_dict, remote_dict): # Only remote for file_ in all_remote_files_set - all_local_files_set: for irods_dat in remote_dict[file_]: - only_remote_set.add(irods_dat.irods_path) + only_remote_set.add(irods_dat.path) # Only local for file_ in all_local_files_set - all_remote_files_set: @@ -419,7 +419,7 @@ def report_multiple_file_versions_in_sodar(remote_dict): inner_dict = {} for file_, irods_list in remote_dict.items(): if len(irods_list) > 1: - inner_dict[file_] = [dat.irods_path for dat in irods_list] + inner_dict[file_] = [dat.path for dat in irods_list] # Format and display information - if any if len(inner_dict) > 0: pairs_str = "" diff --git a/cubi_tk/snappy/common.py b/cubi_tk/snappy/common.py index 245ad594..eed2380d 100644 --- a/cubi_tk/snappy/common.py +++ b/cubi_tk/snappy/common.py @@ -1,5 +1,4 @@ """Common functionality for SNAPPY.""" - import pathlib import typing @@ -8,26 +7,6 @@ from logzero import logger import yaml -#: Dependencies between the SNAPPY steps. 
-DEPENDENCIES: typing.Dict[str, typing.Tuple[str, ...]] = { - "ngs_mapping": (), - "roh_calling": ("variant_calling",), - "variant_calling": ("ngs_mapping",), - "variant_export": ("variant_calling",), - "variant_export_external": (), - "targeted_seq_cnv_calling": ("ngs_mapping",), - "targeted_seq_cnv_annotation": ("targeted_seq_cnv_calling",), - "targeted_seq_cnv_export": ("targeted_seq_cnv_annotation",), - "wgs_sv_calling": ("ngs_mapping",), - "wgs_sv_annotation": ("wgs_sv_calling",), - "wgs_sv_export": ("wgs_sv_annotation",), - "wgs_sv_export_external": (), - "wgs_cnv_calling": ("ngs_mapping",), - "wgs_cnv_annotation": ("wgs_cnv_calling",), - "wgs_cnv_export": ("wgs_cnv_annotation",), - "wgs_cnv_export_external": (), -} - class CouldNotFindPipelineRoot(Exception): """Raised when ``.snappy_pipeline`` could not be found.""" @@ -37,6 +16,22 @@ class CouldNotFindBioMedSheet(Exception): """Raised when BioMedSheet could not be found in configuration file.""" +def load_sheet_tsv(path_tsv, tsv_shortcut="germline"): + """Load sample sheet. + + :param path_tsv: Path to sample sheet TSV file. + :type path_tsv: pathlib.Path + + :param tsv_shortcut: Sample sheet type. Default: 'germline'. + :type tsv_shortcut: str + + :return: Returns Sheet model. + """ + load_tsv = getattr(io_tsv, "read_%s_tsv_sheet" % tsv_shortcut) + with open(path_tsv, "rt") as f: + return load_tsv(f, naming_scheme=NAMING_ONLY_SECONDARY_ID) + + def find_snappy_root_dir( start_path: typing.Union[str, pathlib.Path], more_markers: typing.Iterable[str] = () ): @@ -63,22 +58,6 @@ def find_snappy_root_dir( raise CouldNotFindPipelineRoot() -def load_sheet_tsv(path_tsv, tsv_shortcut="germline"): - """Load sample sheet. - - :param path_tsv: Path to sample sheet TSV file. - :type path_tsv: pathlib.Path - - :param tsv_shortcut: Sample sheet type. Default: 'germline'. - :type tsv_shortcut: str - - :return: Returns Sheet model. - """ - load_tsv = getattr(io_tsv, "read_%s_tsv_sheet" % tsv_shortcut) - with open(path_tsv, "rt") as f: - return load_tsv(f, naming_scheme=NAMING_ONLY_SECONDARY_ID) - - def get_biomedsheet_path(start_path, uuid): """Get biomedsheet path, i.e., sample sheet. diff --git a/cubi_tk/snappy/kickoff.py b/cubi_tk/snappy/kickoff.py index 505c65fd..045dd1ca 100644 --- a/cubi_tk/snappy/kickoff.py +++ b/cubi_tk/snappy/kickoff.py @@ -12,6 +12,26 @@ from cubi_tk.exceptions import ParseOutputException from . import common +from .snappy_workflows import SnappyWorkflowManager + + +class SnappyMissingPackageException(Exception): + def __str__(self): + return "snappy-pipeline is not installed. This function will not work." + + +class SnappyMissingDependencyException(Exception): + """Raised if dependencies of steps do not exist in the current workflow.""" + + def __init__( + self, step_name: str, step_dependencies: typing.List[str], existing_steps: typing.List[str] + ): + self.step_name = step_name + self.step_dependencies = step_dependencies + self.existing_steps = existing_steps + + def __str__(self): + return f"{self.step_name} requires {self.step_dependencies}, but only {self.existing_steps} exist in workflow directory." def run( @@ -19,39 +39,56 @@ def run( ) -> typing.Optional[int]: logger.info("Try to find SNAPPY pipeline directory...") try: - path = common.find_snappy_root_dir(args.path or os.getcwd(), common.DEPENDENCIES.keys()) + path = common.find_snappy_root_dir(args.path or os.getcwd()) except common.CouldNotFindPipelineRoot: return 1 - # TODO: this assumes standard naming which is a limitation... 
- logger.info("Looking for pipeline directories (assuming standard naming)...") + logger.info("Looking for pipeline directories (needs to contain snappy config.yaml)...") logger.debug("Looking in %s", path) - step_set = {name for name in common.DEPENDENCIES if (path / name).exists()} + + manager = SnappyWorkflowManager.from_snappy() + + if manager is None: + raise SnappyMissingPackageException + + step_dependencies = {} + folder_steps = manager.get_snappy_step_directories(path) + for step_name, step_path in folder_steps.items(): + dependencies = manager.get_workflow_step_dependencies(step_path) + if not all(dep in folder_steps for dep in dependencies): + raise SnappyMissingDependencyException( + step_name, dependencies, list(folder_steps.keys()) + ) + + step_dependencies[step_name] = dependencies + steps: typing.List[str] = [] - for names in toposort({k: set(v) for k, v in common.DEPENDENCIES.items()}): - steps += [name for name in names if name in step_set] + for names in toposort({k: set(v) for k, v in step_dependencies.items()}): + steps += names logger.info("Will run the steps: %s", ", ".join(steps)) logger.info("Submitting with sbatch...") jids: typing.Dict[str, str] = {} + for step in steps: - path_cache = path / step / ".snappy_path_cache" + step_path = folder_steps[step] + path_cache = step_path / ".snappy_path_cache" if step == "ngs_mapping" and path_cache.exists(): age_cache = time.time() - path_cache.stat().st_mtime max_age = 24 * 60 * 60 # 1d if age_cache > max_age: logger.info("Cache older than %d - purging", max_age) path_cache.unlink() - dep_jids = [jids[dep] for dep in common.DEPENDENCIES[step] if dep in jids] + dep_jids = [jids[dep] for dep in step_dependencies[step] if dep in jids] cmd = ["sbatch"] if dep_jids: cmd += ["--dependency", "afterok:%s" % ":".join(map(str, dep_jids))] cmd += ["pipeline_job.sh"] - logger.info("Submitting step %s: %s", step, " ".join(cmd)) + logger.info("Submitting step %s (./%s): %s", step, step_path.name, " ".join(cmd)) if args.dry_run: jid = "<%s>" % step else: - stdout_raw = subprocess.check_output(cmd, cwd=str(path / step), timeout=args.timeout) + stdout_raw = subprocess.check_output(cmd, cwd=str(step_path), timeout=args.timeout) stdout = stdout_raw.decode("utf-8") if not stdout.startswith("Submitted batch job "): raise ParseOutputException("Did not understand sbatch output: %s" % stdout) @@ -75,7 +112,10 @@ def setup_argparse(parser: argparse.ArgumentParser) -> None: ) parser.add_argument( - "--timeout", default=10, type=int, help="Number of seconds to wait for commands." + "--timeout", + default=10, + type=int, + help="Number of seconds to wait for commands.", ) parser.add_argument( diff --git a/cubi_tk/snappy/pull_data_common.py b/cubi_tk/snappy/pull_data_common.py index 872bbc6c..049141bb 100644 --- a/cubi_tk/snappy/pull_data_common.py +++ b/cubi_tk/snappy/pull_data_common.py @@ -1,4 +1,5 @@ from datetime import datetime +import os from pathlib import Path from types import SimpleNamespace @@ -30,7 +31,7 @@ def filter_irods_collection(self, identifiers, remote_files_dict, file_type): :type identifiers: list :param remote_files_dict: Dictionary with iRODS collection information. Key: file name as string (e.g., - 'P001-N1-DNA1-WES1.vcf.gz'); Value: iRODS data (``IrodsDataObject``). + 'P001-N1-DNA1-WES1.vcf.gz'); Value: iRODS data (``iRODSDataObject``). :type remote_files_dict: dict :param file_type: File type, example: 'bam' or 'vcf'. 
@@ -52,7 +53,7 @@ def filter_irods_collection(self, identifiers, remote_files_dict, file_type): # Note: if a file with the same name is present in both assay and in a common file, it will be ignored. in_common_links = False for irods_obj in value: - in_common_links = self._irods_path_in_common_links(irods_obj.irods_path) + in_common_links = self._irods_path_in_common_links(irods_obj.path) if in_common_links: break @@ -93,7 +94,9 @@ def get_assay_uuid(self, sodar_url, sodar_api_token, project_uuid): :return: Returns assay UUID. """ investigation = api.samplesheet.retrieve( - sodar_url=sodar_url, sodar_api_token=sodar_api_token, project_uuid=project_uuid + sodar_url=sodar_url, + sodar_api_token=sodar_api_token, + project_uuid=project_uuid, ) for study in investigation.studies.values(): for _assay_uuid in study.assays: @@ -123,11 +126,14 @@ def get_irods_files(self, irods_local_path_pairs, force_overwrite=False): file_name = pair[0].split("/")[-1] irods_path = pair[0] local_out_path = pair[1] - logger.info(f"Retrieving '{file_name}' from: {irods_path}") # Create output directory if necessary Path(local_out_path).parent.mkdir(parents=True, exist_ok=True) # Get file - irods_sessions[0].data_objects.get(irods_path, local_out_path, **kw_options) + if os.path.exists(local_out_path) and not force_overwrite: + logger.info(f"{file_name} already exists. Force_overwrite to re-download.") + else: + logger.info(f"Retrieving '{file_name}' from: {irods_path}") + irods_sessions[0].data_objects.get(irods_path, local_out_path, **kw_options) except OVERWRITE_WITHOUT_FORCE_FLAG: logger.error( @@ -169,7 +175,7 @@ def sort_irods_object_by_date_in_path(self, irods_obj_list): /sodarZone/projects/..//.../assay_//...//... :param irods_obj_list: List of iRODS objects derived from collection in SODAR. - :type irods_obj_list: List[IrodsDataObject] + :type irods_obj_list: List[iRODSDataObject] :return: Returns inputted list sorted from latest to earliest iRODS object. """ @@ -178,7 +184,7 @@ def sort_irods_object_by_date_in_path(self, irods_obj_list): return irods_obj_list return sorted( irods_obj_list, - key=lambda irods_obj: self._find_date_in_path(irods_obj.irods_path), + key=lambda irods_obj: self._find_date_in_path(irods_obj.path), reverse=True, ) diff --git a/cubi_tk/snappy/pull_processed_data.py b/cubi_tk/snappy/pull_processed_data.py index 54d08078..a2620486 100644 --- a/cubi_tk/snappy/pull_processed_data.py +++ b/cubi_tk/snappy/pull_processed_data.py @@ -298,7 +298,7 @@ def pair_ipath_with_outdir(self, remote_files_dict, output_dir, assay_uuid, retr """Pair iRODS path with local output directory :param remote_files_dict: Dictionary with iRODS collection information. Key: file name as string (e.g., - 'P001-N1-DNA1-WES1'); Value: iRODS data (``IrodsDataObject``). + 'P001-N1-DNA1-WES1'); Value: iRODS data (``iRODSDataObject``). :type remote_files_dict: dict :param output_dir: Output directory path. 
@@ -328,19 +328,18 @@ def pair_ipath_with_outdir(self, remote_files_dict, output_dir, assay_uuid, retr # /sodarZone/projects/..//sample_data/study_/assay_/ try: irods_dir_structure = os.path.dirname( - str(irods_obj.irods_path).split(f"assay_{assay_uuid}/")[1] + str(irods_obj.path).split(f"assay_{assay_uuid}/")[1] ) - _out_path = os.path.join(output_dir, irods_dir_structure, irods_obj.file_name) + _out_path = os.path.join(output_dir, irods_dir_structure, irods_obj.name) except IndexError: logger.warning( f"Provided Assay UUID '{assay_uuid}' is not present in SODAR path, " f"hence directory structure won't be preserved.\n" f"All files will be stored in root of output directory: {output_list}" ) - _out_path = os.path.join(output_dir, irods_obj.file_name) + _out_path = os.path.join(output_dir, irods_obj.name) # Update output - output_list.append((irods_obj.irods_path, _out_path)) - output_list.append((irods_obj.irods_path + ".md5", _out_path + ".md5")) + output_list.append((irods_obj.path, _out_path)) return output_list diff --git a/cubi_tk/snappy/pull_raw_data.py b/cubi_tk/snappy/pull_raw_data.py index 9a9602af..a70e52a7 100644 --- a/cubi_tk/snappy/pull_raw_data.py +++ b/cubi_tk/snappy/pull_raw_data.py @@ -255,7 +255,7 @@ def filter_irods_collection_by_library_name_in_path( :type identifiers: list :param remote_files_dict: Dictionary with iRODS collection information. Key: file name as string (e.g., - 'P001-N1-DNA1-WES1.vcf.gz'); Value: iRODS data (``IrodsDataObject``). + 'P001-N1-DNA1-WES1.vcf.gz'); Value: iRODS data (``iRODSDataObject``). :type remote_files_dict: dict :param file_type: File type, example: 'fastq'. @@ -280,9 +280,9 @@ def filter_irods_collection_by_library_name_in_path( in_common_links = False for irods_obj in value: # Piggyback loop for dir check - _irods_path_list.append(irods_obj.irods_path) + _irods_path_list.append(irods_obj.path) # Actual check - if self._irods_path_in_common_links(irods_obj.irods_path): + if self._irods_path_in_common_links(irods_obj.path): in_common_links = True break @@ -302,7 +302,7 @@ def pair_ipath_with_outdir(library_to_irods_dict, identifiers_tuples, assay_uuid """Pair iRODS path with local output directory :param library_to_irods_dict: Dictionary with iRODS collection information by sample. Key: sample name as - string (e.g., 'P001'); Value: iRODS data (``IrodsDataObject``). + string (e.g., 'P001'); Value: iRODS data (``iRODSDataObject``). :type library_to_irods_dict: dict :param identifiers_tuples: List of tuples (sample name, folder name) as defined in the sample sheet. 
@@ -332,10 +332,10 @@ def pair_ipath_with_outdir(library_to_irods_dict, identifiers_tuples, assay_uuid # /sodarZone/projects/..//sample_data/study_/assay_/ try: irods_dir_structure = os.path.dirname( - str(irods_obj.irods_path).split(f"assay_{assay_uuid}/")[1] + str(irods_obj.path).split(f"assay_{assay_uuid}/")[1] ) _out_path = os.path.join( - output_dir, folder_name, irods_dir_structure, irods_obj.file_name + output_dir, folder_name, irods_dir_structure, irods_obj.name ) except IndexError: logger.warning( @@ -343,10 +343,9 @@ def pair_ipath_with_outdir(library_to_irods_dict, identifiers_tuples, assay_uuid f"hence directory structure won't be preserved.\n" f"All files will be stored in root of output directory: {output_list}" ) - _out_path = os.path.join(output_dir, folder_name, irods_obj.file_name) + _out_path = os.path.join(output_dir, folder_name, irods_obj.name) # Update output - output_list.append((irods_obj.irods_path, _out_path)) - output_list.append((irods_obj.irods_path + ".md5", _out_path + ".md5")) + output_list.append((irods_obj.path, _out_path)) return output_list @@ -358,7 +357,7 @@ def get_library_to_irods_dict(identifiers, remote_files_dict): :type identifiers: list :param remote_files_dict: Dictionary with iRODS collection information. Key: file name as string (e.g., - 'P001_R1_001.fastq.gz'); Value: iRODS data (``IrodsDataObject``). + 'P001_R1_001.fastq.gz'); Value: iRODS data (``iRODSDataObject``). :type remote_files_dict: dict :return: Returns dictionary: Key: identifier (sample name [str]); Value: list of iRODS objects. diff --git a/cubi_tk/snappy/pull_sheets.py b/cubi_tk/snappy/pull_sheets.py index 16b610b1..2e5d546a 100644 --- a/cubi_tk/snappy/pull_sheets.py +++ b/cubi_tk/snappy/pull_sheets.py @@ -205,6 +205,8 @@ def __init__(self): self.sources = {} #: Sample by sample name. self.samples = {} + #: The previous process. + self.prev_process = None def on_visit_material(self, material, node_path, study=None, assay=None): super().on_visit_material(material, node_path, study, assay) @@ -230,7 +232,10 @@ def on_visit_material(self, material, node_path, study=None, assay=None): affected=affected.value[0] if affected else None, sample_name=sample.name, ) - elif material.type == "Library Name": + elif material.type == "Library Name" or ( + material.type == "Extract Name" + and self.prev_process.protocol_ref.startswith("Library construction") + ): library = material sample = material_path[0] if library.name.split("-")[-1].startswith("WGS"): @@ -260,6 +265,7 @@ def on_visit_material(self, material, node_path, study=None, assay=None): def on_visit_process(self, process, node_path, study=None, assay=None): super().on_visit_node(process, study, assay) + self.prev_process = process material_path = [x for x in node_path if hasattr(x, "type")] sample = material_path[0] if process.protocol_ref.startswith("Nucleic acid sequencing"): diff --git a/cubi_tk/snappy/retrieve_irods_collection.py b/cubi_tk/snappy/retrieve_irods_collection.py index 98f80a95..57afed86 100644 --- a/cubi_tk/snappy/retrieve_irods_collection.py +++ b/cubi_tk/snappy/retrieve_irods_collection.py @@ -1,28 +1,18 @@ """Contains classes and methods used to retrieve iRODS collections from SODAR. """ from collections import defaultdict -import re +import typing -import attr +from irods.data_object import iRODSDataObject from logzero import logger from sodar_cli import api -from ..irods.check import HASH_SCHEMES, IrodsCheckCommand +from ..irods.check import IrodsCheckCommand #: Default hash scheme. 
Although iRODS provides alternatives, the whole of `snappy` pipeline uses MD5. DEFAULT_HASH_SCHEME = "MD5" -@attr.s(frozen=True, auto_attribs=True) -class IrodsDataObject: - """iRODS data object - simplified version of data provided in iRODS Collections.""" - - file_name: str - irods_path: str - file_md5sum: str - replicas_md5sum: list - - class RetrieveIrodsCollection(IrodsCheckCommand): """Class retrieves iRODS Collection associated with Assay""" @@ -47,19 +37,17 @@ def __init__(self, args, sodar_url, sodar_api_token, assay_uuid, project_uuid): self.assay_uuid = assay_uuid self.project_uuid = project_uuid - def perform(self): - """Perform class routines. - - :return: Returns iRODS collection represented as dictionary: key: file name as string (e.g., - 'P001-N1-DNA1-WES1'); value: iRODS data (``IrodsDataObject``). - """ + def perform(self) -> typing.Dict[str, typing.List[iRODSDataObject]]: + """Perform class routines.""" logger.info("Starting remote files search ...") # Get assay iRODS path assay_path = self.get_assay_irods_path(assay_uuid=self.assay_uuid) # Get iRODS collection - irods_collection_dict = self.retrieve_irods_data_objects(irods_path=assay_path) + irods_collection_dict = {} + if assay_path: + irods_collection_dict = self.retrieve_irods_data_objects(irods_path=assay_path) logger.info("... done with remote files search.") return irods_collection_dict @@ -110,60 +98,50 @@ def multi_assay_warning(assays): f"All available UUIDs:\n{multi_assay_str}" ) - def retrieve_irods_data_objects(self, irods_path): + def retrieve_irods_data_objects( + self, irods_path: str + ) -> typing.Dict[str, typing.List[iRODSDataObject]]: """Retrieve data objects from iRODS. :param irods_path: iRODS path. - :type irods_path: str :return: Returns dictionary representation of iRODS collection information. Key: File name in iRODS (str); - Value: list of IrodsDataObject (attributes: 'file_name', 'irods_path', 'file_md5sum', 'replicas_md5sum'). + Value: list of iRODSDataObject (native python-irodsclient object). """ + # Connect to iRODS - with self._get_irods_sessions() as irods_sessions: + with self._get_irods_sessions(1) as irods_sessions: try: root_coll = irods_sessions[0].collections.get(irods_path) - s_char = "s" if len(irods_sessions) != 1 else "" - logger.info(f"{len(irods_sessions)} iRODS connection{s_char} initialized") + + # Get files and run checks + logger.info("Querying for data objects") + + if root_coll is not None: + irods_data_objs = self.get_data_objs(root_coll) + irods_obj_dict = self.parse_irods_collection(irods_data_objs) + return irods_obj_dict + except Exception as e: logger.error("Failed to retrieve iRODS path: %s", self.get_irods_error(e)) raise - # Get files and run checks - logger.info("Querying for data objects") - irods_collection = self.get_data_objs(root_coll) - return self.parse_irods_collection(irods_collection=irods_collection) + return {} @staticmethod - def parse_irods_collection(irods_collection): + def parse_irods_collection(irods_data_objs) -> typing.Dict[str, typing.List[iRODSDataObject]]: """Parse iRODS collection :param irods_collection: iRODS collection. :type irods_collection: dict :return: Returns dictionary representation of iRODS collection information. Key: File name in iRODS (str); - Value: list of IrodsDataObject (attributes: 'file_name', 'irods_path', 'file_md5sum', 'replicas_md5sum'). + Value: list of iRODSDataObject (native python-irodsclient object). 
""" # Initialise variables output_dict = defaultdict(list) - checksums = irods_collection["checksums"] - - # Extract relevant info from iRODS collection: file and replicates MD5SUM - for data_obj in irods_collection["files"]: - chk_obj = checksums.get(data_obj.path + "." + DEFAULT_HASH_SCHEME.lower()) - if not chk_obj: - logger.error(f"No checksum file for: {data_obj.path}") - continue - with chk_obj.open("r") as f: - file_sum = re.search( - HASH_SCHEMES[DEFAULT_HASH_SCHEME]["regex"], f.read().decode("utf-8") - ).group(0) - output_dict[data_obj.name].append( - IrodsDataObject( - file_name=data_obj.name, - irods_path=data_obj.path, - file_md5sum=file_sum, - replicas_md5sum=[replica.checksum for replica in data_obj.replicas], - ) - ) + + for obj in irods_data_objs["files"]: + output_dict[obj.name].append(obj) + return output_dict diff --git a/cubi_tk/snappy/snappy_workflows.py b/cubi_tk/snappy/snappy_workflows.py new file mode 100644 index 00000000..8b71c276 --- /dev/null +++ b/cubi_tk/snappy/snappy_workflows.py @@ -0,0 +1,156 @@ +from collections import defaultdict +import importlib +import pathlib +import re +import typing + +from attrs import define +from logzero import logger +import yaml + + +def get_workflow_snakefile_object_name( + snakefile_path: typing.Union[str, pathlib.Path] +) -> typing.Optional[str]: + """Find the Workflow implementation object name. + + :param snakefile_path: Path to snakefile for workflow. + + :return: str Name of the implementation class or None if nothing as been found. + """ + + with open(str(snakefile_path)) as f: + if m := re.search(r"wf\s*=\s*(\w+)\(", f.read()): + module_name = m.group(1) + return module_name + return None + + +class DummyWorkflow: + """Dummy workflow that does nothing.""" + + def __init__(self): + self.globals = defaultdict(str) + + def __getattr__(self, _): + return self._catchall + + def _catchall(self, *_, **__): + pass + + +@define +class SnappyWorkflowManager: + _expand_ref: typing.Callable + _snakefile_path: typing.Callable + _step_to_module: typing.Dict[str, typing.Any] + + @classmethod + def from_snappy(cls) -> typing.Optional["SnappyWorkflowManager"]: + try: + from snappy_pipeline import expand_ref + from snappy_pipeline.apps.snappy_snake import STEP_TO_MODULE + from snappy_pipeline.base import snakefile_path + except ImportError: + logger.warn( + "snappy_pipeline is not available. snappy pipeline related functions will not work properly." + ) + return None + + return cls( + expand_ref=expand_ref, step_to_module=STEP_TO_MODULE, snakefile_path=snakefile_path + ) + + def _load_workflow_step_configuration( + self, workflow_step_path: typing.Union[str, pathlib.Path] + ) -> tuple: + """Load snappy configuration and resolve any refs. + + :param workflow_step_path: Path to snappy config yaml. + :type workflow_step_path: str, pathlib.Path + + :return: Tuple of config, lookup paths and configuration paths. If no config is found, a tuple of None is returned. 
+ """ + + config_path = pathlib.Path(workflow_step_path) / "config.yaml" + + if not config_path.exists(): + return (None, None, None) + + with open(str(config_path)) as stream: + config_data = yaml.safe_load(stream) + + config, lookup_paths, config_paths = self._expand_ref( + str(config_path), + config_data, + [ + str(workflow_step_path), + str(pathlib.Path(workflow_step_path).parent / ".snappy_pipeline"), + ], + ) + return config, lookup_paths, config_paths + + def _get_workflow_name( + self, workflow_path: typing.Union[str, pathlib.Path] + ) -> typing.Optional[str]: + """Get the name of the workflow in the directory. This will parse the contained config.yaml.""" + + config, _, _ = self._load_workflow_step_configuration(workflow_path) + if config is not None and "pipeline_step" in config: + return config["pipeline_step"].get("name", None) + + def get_snappy_step_directories( + self, snappy_root_dir: typing.Union[str, pathlib.Path] + ) -> typing.Dict[str, pathlib.Path]: + """Get a dictionary of snappy workflow step names and their associated path. + + :param snappy_root_dir: Path to the snappy root directory, also containing .snappy_pipeline + + :return: Dict of workflow step name to workflow step path. + """ + folder_steps = { + name: path + for name, path in [ + (self._get_workflow_name(p), p) for p in pathlib.Path(snappy_root_dir).iterdir() + ] + if name in self._step_to_module + } + + return folder_steps + + def get_workflow_step_dependencies( + self, workflow_step_path: typing.Union[str, pathlib.Path] + ) -> typing.List[str]: + """Find workflow dependencies for the given workflow step. + :param workflow_step_path: Path to the workflow step. + + :return: List of dependencies for the given workflow step. + """ + workflow_step_path = pathlib.Path(workflow_step_path) + + config, lookup_paths, config_paths = self._load_workflow_step_configuration( + workflow_step_path + ) + if config is None: + raise RuntimeError( + f"Could not load workflow step confiuration for {workflow_step_path}" + ) + + step_name = config["pipeline_step"]["name"] + + module_snakefile = self._snakefile_path(step_name) + + # get the name of the workflow step class name + obj_name = get_workflow_snakefile_object_name(module_snakefile) + if obj_name is None: + raise RuntimeError(f"Could not find workflow object for {step_name}") + + workflow_module = importlib.import_module(f".{step_name}", "snappy_pipeline.workflows") + workflow_class = getattr(workflow_module, obj_name) + assert workflow_class.name == step_name + + workflow_object = workflow_class( + DummyWorkflow(), config, lookup_paths, config_paths, str(workflow_step_path) + ) + dependencies = workflow_object.sub_workflows.keys() + return list(dependencies) diff --git a/cubi_tk/snappy/varfish_upload.py b/cubi_tk/snappy/varfish_upload.py index 4fecd889..d1b9d7ce 100644 --- a/cubi_tk/snappy/varfish_upload.py +++ b/cubi_tk/snappy/varfish_upload.py @@ -38,6 +38,7 @@ "bwa.gcnv", "bwa.popdel", "bwa.xhmm", + "bwa_mem2.mehari_", "bwa_mem2.varfish_", "minimap2.sniffles2", "write_pedigree", diff --git a/cubi_tk/sodar/check_remote.py b/cubi_tk/sodar/check_remote.py index 1410154c..e926ffeb 100644 --- a/cubi_tk/sodar/check_remote.py +++ b/cubi_tk/sodar/check_remote.py @@ -116,7 +116,7 @@ def __init__( :param local_files_dict: Dictionary with local directories as keys and list of FileDataObject as values. :type local_files_dict: dict - :param remote_files_dict: Dictionary with remote filenames as keys and list of IrodsDataObject as values. 
+ :param remote_files_dict: Dictionary with remote filenames as keys and list of iRODSDataObject as values. :type remote_files_dict: dict :param filenames_only: Flag to indicate if md5 sums should not be used for comparison @@ -165,7 +165,7 @@ def compare_local_and_remote_files( :param local_dict: Dictionary with local directories as keys and list of FileDataObject as values. :type local_dict: dict - :param remote_dict: Dictionary with remote filenames as keys and list of IrodsDataObject as values. + :param remote_dict: Dictionary with remote filenames as keys and list of iRODSDataObject as values. :type remote_dict: dict :param filenames_only: Flag to indicate if md5 sums should not be used for comparison @@ -179,10 +179,10 @@ def compare_local_and_remote_files( """ def filedata_from_irodsdata(obj): - # Helper Function to convert IrodsDataObject (non-hashable) to FileDataObject, also making path relative - p = Path(obj.irods_path).parent + # Helper Function to convert iRodsDataObject (non-hashable) to FileDataObject, also making path relative + p = Path(obj.path).parent p = p.relative_to(irods_basepath) if irods_basepath else p - return FileDataObject(obj.file_name, str(p), obj.file_md5sum) + return FileDataObject(obj.name, str(p), obj.checksum) # The dictionaries will contain double information on the file path (both as keys & in the objects) # For collecting info in itself sets would be easier, however grouping by folder makes it easier to @@ -223,7 +223,7 @@ def filedata_from_irodsdata(obj): # From the file with matching names subselect those with same md5 md5_matches = { - filedata_from_irodsdata(f) for f in remote_files if f.file_md5sum == md5 + filedata_from_irodsdata(f) for f in remote_files if f.checksum == md5 } if md5_matches: remote_unmatched -= md5_matches diff --git a/docs_manual/installation.rst b/docs_manual/installation.rst index 6e296e24..1fbc3a61 100644 --- a/docs_manual/installation.rst +++ b/docs_manual/installation.rst @@ -8,7 +8,7 @@ Prerequisites when using conda: .. 
code-block:: bash - $ conda create -n cubi-tk python=3.7 + $ conda create -n cubi-tk python=3.10 $ conda activate cubi-tk Clone CUBI-TK and install: diff --git a/requirements/test.txt b/requirements/test.txt index 691fbdaa..01000c51 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -39,3 +39,6 @@ bandit pyflakes setuptools ==65.6.3 + +# needed for testing snappy workflow methods +snappy-pipeline @ git+https://github.com/bihealth/snappy-pipeline diff --git a/tests/helpers.py b/tests/helpers.py new file mode 100644 index 00000000..08d3e68a --- /dev/null +++ b/tests/helpers.py @@ -0,0 +1,54 @@ +import pathlib +from typing import List + +from irods.collection import iRODSCollection +from irods.data_object import iRODSDataObject +from irods.models import Collection, DataObject + + +class iRODSDataObjectEq(iRODSDataObject): + name: str + path: str + checksum: str + + def __eq__(self, other): + return ( + self.name == other.name and self.path == other.path and self.checksum == other.checksum + ) + + +def createIrodsDataObject( + file_name: str, irods_path: str, file_md5sum: str, replicas_md5sum: List[str] +): + """Create iRODSDataObject from parameters.""" + + parent = pathlib.Path(irods_path).parent + collection_data = { + Collection.id: 0, + Collection.name: str(parent), + Collection.create_time: None, + Collection.modify_time: None, + Collection.inheritance: None, + Collection.owner_name: None, + Collection.owner_zone: None, + } + collection = iRODSCollection(None, result=collection_data) + + data_object_datas = [] + for i, rep_md5sum in enumerate(replicas_md5sum): + data_object_datas.append( + { + DataObject.id: 0, + DataObject.name: file_name, + DataObject.replica_number: i, + DataObject.replica_status: None, + DataObject.resource_name: None, + DataObject.path: irods_path, + DataObject.resc_hier: None, + DataObject.checksum: rep_md5sum, + DataObject.size: 0, + DataObject.comments: "", + } + ) + obj = iRODSDataObjectEq(None, parent=collection, results=data_object_datas) + return obj diff --git a/tests/hide_modules.py b/tests/hide_modules.py new file mode 100644 index 00000000..ad04a415 --- /dev/null +++ b/tests/hide_modules.py @@ -0,0 +1,110 @@ +"""Hide modules from import + + ModuleHider - import finder hook and context manager + hide_modules - decorator using ModuleHider +""" + +# Copyright (c) 2019 Rory Yorke +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. 
+ +try: + import importlib.abc + + # py>=3.3 has MetaPathFinder + _ModuleHiderBase = getattr(importlib.abc, "MetaPathFinder", importlib.abc.Finder) +except ImportError: + # py2 + _ModuleHiderBase = object + + +class ModuleHider(_ModuleHiderBase): + """Import finder hook to hide specified modules + ModuleHider(hidden_modules) -> instance + hidden_modules is a list of strings naming modules to hide. + """ + + def __init__(self, hidden): + self.hidden = hidden + self.hidden_modules = {} + + # python <=3.3 + def find_module(self, fullname, path=None): + return self.find_spec(fullname, path) + + # python >=3.4 + def find_spec(self, fullname, path, target=None): + if fullname in self.hidden: + raise ImportError("No module named {}".format(fullname)) + + def hide(self): + "Starting hiding modules" + import sys + + if self in sys.meta_path: + raise RuntimeError("Already hiding modules") + # must be first to override standard finders + sys.meta_path.insert(0, self) + # remove hidden modules to force reload + for m in self.hidden: + if m in sys.modules: + self.hidden_modules[m] = sys.modules[m] + del sys.modules[m] + + def unhide(self): + "Unhide modules" + import sys + + sys.meta_path.remove(self) + sys.modules.update(self.hidden_modules) + self.hidden_modules.clear() + + def __enter__(self): + self.hide() + + def __exit__(self, exc_type, exc_val, exc_tb): + self.unhide() + + # there's much point in __del__: sys.meta_path will keep a + # reference to an object on which .unhide() is not called, so + # refcount will only go to 0 if the object is removed from + # sys.meta_path somehow (in which case deletion doesn't + # matter), or when Python exits (ditto) + + +def hide_modules(hidden): + """hide_modules(hidden_modules) -> decorator + + When decorated function is called, the specified list of modules + will be hidden; once the function exits, the modules will be + unhidden. + """ + + def applydec(f): + def decf(*args, **kwargs): + with ModuleHider(hidden): + f(*args, **kwargs) + + # carry across name so that nose still finds the test + decf.__name__ = f.__name__ + # and carry across doc for test descriptions (etc.) 
+ decf.__doc__ = f.__doc__ + return decf + + return applydec diff --git a/tests/test_snappy_check_remote.py b/tests/test_snappy_check_remote.py index 6dea3c09..4ff1355d 100644 --- a/tests/test_snappy_check_remote.py +++ b/tests/test_snappy_check_remote.py @@ -4,7 +4,11 @@ import pytest from cubi_tk.snappy.check_remote import Checker, FindLocalFiles, FindLocalRawdataFiles -from cubi_tk.snappy.retrieve_irods_collection import IrodsDataObject + +from .helpers import createIrodsDataObject + +# from cubi_tk.snappy.retrieve_irods_collection import IrodsDataObject + # Tests FindLocalFiles ================================================================================================= @@ -87,7 +91,7 @@ def test_compare_local_and_remote_files(): replicas_md5sum = [file_md5sum] * 3 in_remote_dict = { "bwa.P001-N1-DNA1-WES1.bam": [ - IrodsDataObject( + createIrodsDataObject( file_name="bwa.P001-N1-DNA1-WES1.bam", irods_path="/sodar_path/bwa.P001-N1-DNA1-WES1.bam", file_md5sum=file_md5sum, @@ -95,7 +99,7 @@ def test_compare_local_and_remote_files(): ) ], "bwa.P001-N1-DNA1-WES1.bam.bai": [ - IrodsDataObject( + createIrodsDataObject( file_name="bwa.P001-N1-DNA1-WES1.bam.bai", irods_path="/sodar_path/bwa.P001-N1-DNA1-WES1.bam.bai", file_md5sum=file_md5sum, @@ -103,7 +107,7 @@ def test_compare_local_and_remote_files(): ) ], "bwa.P002-N1-DNA1-WES1.bam": [ - IrodsDataObject( + createIrodsDataObject( file_name="bwa.P002-N1-DNA1-WES1.bam", irods_path="/sodar_path/bwa.P002-N1-DNA1-WES1.bam", file_md5sum=file_md5sum, @@ -111,7 +115,7 @@ def test_compare_local_and_remote_files(): ) ], "bwa.P002-N1-DNA1-WES1.bam.bai": [ - IrodsDataObject( + createIrodsDataObject( file_name="bwa.P002-N1-DNA1-WES1.bam.bai", irods_path="/sodar_path/bwa.P002-N1-DNA1-WES1.bam.bai", file_md5sum=file_md5sum, @@ -165,7 +169,7 @@ def test_compare_local_and_remote_files(): # Update input and expected results extra_remote_files_dict = { "bwa.P001-N1-DNA1-WES1.conda_info.txt": [ - IrodsDataObject( + createIrodsDataObject( file_name="bwa.P001-N1-DNA1-WES1.conda_info.txt", irods_path="/sodar_path/bwa.P001-N1-DNA1-WES1.conda_info.txt", file_md5sum=file_md5sum, diff --git a/tests/test_snappy_pull_data_common.py b/tests/test_snappy_pull_data_common.py index 25a224b0..7e005c48 100644 --- a/tests/test_snappy_pull_data_common.py +++ b/tests/test_snappy_pull_data_common.py @@ -4,7 +4,8 @@ import pytest from cubi_tk.snappy.pull_data_common import PullDataCommon -from cubi_tk.snappy.retrieve_irods_collection import IrodsDataObject + +from .helpers import createIrodsDataObject as IrodsDataObject # Empty file MD5 checksum FILE_MD5SUM = "d41d8cd98f00b204e9800998ecf8427e" @@ -99,7 +100,7 @@ def test_pull_data_common_sort_irods_object_by_date_in_path(irods_objects_list): expected = ("2038-01-19", "2000-01-01", "1999-09-09") actual = raw_data_class.sort_irods_object_by_date_in_path(irods_obj_list=irods_objects_list) for count, irods_obj in enumerate(actual): - assert expected[count] in irods_obj.irods_path + assert expected[count] in irods_obj.path def test_pull_data_common_sort_irods_object_by_date_in_path_mixed(irods_objects_list_format_mixed): @@ -110,7 +111,7 @@ def test_pull_data_common_sort_irods_object_by_date_in_path_mixed(irods_objects_ irods_obj_list=irods_objects_list_format_mixed ) for count, irods_obj in enumerate(actual): - assert expected[count] in irods_obj.irods_path + assert expected[count] in irods_obj.path def test_pull_data_common_sort_irods_object_by_date_in_path_missing_date( diff --git a/tests/test_snappy_pull_processed_data.py 
b/tests/test_snappy_pull_processed_data.py index 5c419d56..de4045ef 100644 --- a/tests/test_snappy_pull_processed_data.py +++ b/tests/test_snappy_pull_processed_data.py @@ -5,7 +5,8 @@ from cubi_tk.__main__ import setup_argparse from cubi_tk.snappy.pull_processed_data import PullProcessedDataCommand -from cubi_tk.snappy.retrieve_irods_collection import IrodsDataObject + +from .helpers import createIrodsDataObject as IrodsDataObject # Empty file MD5 checksum FILE_MD5SUM = "d41d8cd98f00b204e9800998ecf8427e" @@ -480,19 +481,15 @@ def test_pull_processed_data_pair_ipath_with_outdir_bam(pull_processed_data, rem "out_dir/P00{i}-N1-DNA1-WES1/1999-09-09/ngs_mapping/bwa.P00{i}-N1-DNA1-WES1.{ext}" ) irods_files_list = [ - irods_path.format(i=i, ext=ext) - for i in (1, 2) - for ext in ("bam", "bam.bai", "bam.md5", "bam.bai.md5") + irods_path.format(i=i, ext=ext) for i in (1, 2) for ext in ("bam", "bam.bai") ] correct_uuid_output_dir_list = [ - full_out_dir.format(i=i, ext=ext) - for i in (1, 2) - for ext in ("bam", "bam.bai", "bam.md5", "bam.bai.md5") + full_out_dir.format(i=i, ext=ext) for i in (1, 2) for ext in ("bam", "bam.bai") ] wrong_uuid_output_dir_list = [ "out_dir/bwa.P00{i}-N1-DNA1-WES1.{ext}".format(i=i, ext=ext) for i in (1, 2) - for ext in ("bam", "bam.bai", "bam.md5", "bam.bai.md5") + for ext in ("bam", "bam.bai") ] correct_uuid_expected = [] for _irods_path, _out_path in zip(irods_files_list, correct_uuid_output_dir_list): @@ -532,13 +529,13 @@ def test_pull_processed_data_pair_ipath_with_outdir_bam_retrieve_all( irods_path.format(i=i, date=date, ext=ext) for i in (1, 2) for date in ("1999-09-09", "1975-01-04") - for ext in ("bam", "bam.bai", "bam.md5", "bam.bai.md5") + for ext in ("bam", "bam.bai") ] correct_uuid_output_dir_list = [ full_out_dir.format(i=i, date=date, ext=ext) for i in (1, 2) for date in ("1999-09-09", "1975-01-04") - for ext in ("bam", "bam.bai", "bam.md5", "bam.bai.md5") + for ext in ("bam", "bam.bai") ] correct_uuid_expected = [] for _irods_path, _out_path in zip(irods_files_list, correct_uuid_output_dir_list): @@ -570,19 +567,15 @@ def test_pull_processed_data_pair_ipath_with_outdir_vcf(pull_processed_data, rem "out_dir/P00{i}-N1-DNA1-WES1/1999-09-09/variant_calling/bwa.P00{i}-N1-DNA1-WES1.{ext}" ) irods_files_list = [ - irods_path.format(i=i, ext=ext) - for i in (1, 2) - for ext in ("vcf.gz", "vcf.gz.tbi", "vcf.gz.md5", "vcf.gz.tbi.md5") + irods_path.format(i=i, ext=ext) for i in (1, 2) for ext in ("vcf.gz", "vcf.gz.tbi") ] correct_uuid_output_dir_list = [ - full_out_dir.format(i=i, ext=ext) - for i in (1, 2) - for ext in ("vcf.gz", "vcf.gz.tbi", "vcf.gz.md5", "vcf.gz.tbi.md5") + full_out_dir.format(i=i, ext=ext) for i in (1, 2) for ext in ("vcf.gz", "vcf.gz.tbi") ] wrong_uuid_output_dir_list = [ "out_dir/bwa.P00{i}-N1-DNA1-WES1.{ext}".format(i=i, ext=ext) for i in (1, 2) - for ext in ("vcf.gz", "vcf.gz.tbi", "vcf.gz.md5", "vcf.gz.tbi.md5") + for ext in ("vcf.gz", "vcf.gz.tbi") ] correct_uuid_expected = [] for _irods_path, _out_path in zip(irods_files_list, correct_uuid_output_dir_list): diff --git a/tests/test_snappy_pull_raw_data.py b/tests/test_snappy_pull_raw_data.py index f3bf54f9..62427350 100644 --- a/tests/test_snappy_pull_raw_data.py +++ b/tests/test_snappy_pull_raw_data.py @@ -5,7 +5,8 @@ from cubi_tk.__main__ import setup_argparse from cubi_tk.snappy.pull_raw_data import Config, PullRawDataCommand -from cubi_tk.snappy.retrieve_irods_collection import IrodsDataObject + +from .helpers import createIrodsDataObject as IrodsDataObject # Empty file MD5 
checksum FILE_MD5SUM = "d41d8cd98f00b204e9800998ecf8427e" @@ -259,7 +260,7 @@ def test_pull_raw_data_get_library_to_irods_dict(pull_raw_data, remote_files_fas identifiers=samples_list, remote_files_dict=remote_files_fastq ) for id_ in samples_list: - assert all([str(irods.file_name).startswith(id_) for irods in actual.get(id_)]) + assert all([str(irods.name).startswith(id_) for irods in actual.get(id_)]) def test_pull_raw_data_pair_ipath_with_folder_name(pull_raw_data, sample_to_irods_dict): @@ -280,19 +281,19 @@ def test_pull_raw_data_pair_ipath_with_folder_name(pull_raw_data, sample_to_irod irods_path.format(i=i, r=r, ext=ext) for i in (1, 2) for r in (1, 2) - for ext in ("fastq.gz", "fastq.gz.md5") + for ext in ("fastq.gz",) ] correct_uuid_output_dir_list = [ full_out_dir.format(i=i, r=r, ext=ext) for i in (1, 2) for r in (1, 2) - for ext in ("fastq.gz", "fastq.gz.md5") + for ext in ("fastq.gz",) ] wrong_uuid_output_dir_list = [ "out_dir/P00{i}/P00{i}_R{r}_001.{ext}".format(i=i, r=r, ext=ext) for i in (1, 2) for r in (1, 2) - for ext in ("fastq.gz", "fastq.gz.md5") + for ext in ("fastq.gz",) ] correct_uuid_expected = [] for _irods_path, _out_path in zip(irods_files_list, correct_uuid_output_dir_list): diff --git a/tests/test_snappy_workflows.py b/tests/test_snappy_workflows.py new file mode 100644 index 00000000..e512ae6b --- /dev/null +++ b/tests/test_snappy_workflows.py @@ -0,0 +1,19 @@ +"""Tests for ``cubi_tk.snappy.snappy_workflows``. +""" + +from cubi_tk.snappy.snappy_workflows import SnappyWorkflowManager + +from .hide_modules import hide_modules + + +@hide_modules(["snappy_pipeline"]) +def test_could_not_import_module(): + manager = SnappyWorkflowManager.from_snappy() + assert manager is None + + +def test_could_import_module(): + manager = SnappyWorkflowManager.from_snappy() + assert manager is not None + assert callable(manager._expand_ref) + assert len(manager._step_to_module.keys()) > 0 diff --git a/tests/test_sodar_check_remote.py b/tests/test_sodar_check_remote.py index 57c43aba..7270dd6f 100644 --- a/tests/test_sodar_check_remote.py +++ b/tests/test_sodar_check_remote.py @@ -2,13 +2,14 @@ import pathlib -from cubi_tk.snappy.retrieve_irods_collection import IrodsDataObject from cubi_tk.sodar.check_remote import ( FileComparisonChecker, FileDataObject, FindLocalMD5Files, ) +from .helpers import createIrodsDataObject as IrodsDataObject + def test_findlocalmd5_run(): # Run 3 simple & 1 combined test @@ -73,20 +74,31 @@ def test_filecomparisoncheck_compare_local_and_remote_files(): remote_dict = { "test1.txt": [ IrodsDataObject( - "test1.txt", "test1/test1.txt", "fa029a7f2a3ca5a03fe682d3b77c7f0d", 3 * [None] + "test1.txt", + "/test1/test1.txt", + "fa029a7f2a3ca5a03fe682d3b77c7f0d", + 3 * ["fa029a7f2a3ca5a03fe682d3b77c7f0d"], ) ], "test2.txt": [ IrodsDataObject( - "test2.txt", "test2/test2.txt", "856babf68edfd13e2fd019df330e11c5", 3 * [None] + "test2.txt", + "/test2/test2.txt", + "856babf68edfd13e2fd019df330e11c5", + 3 * ["856babf68edfd13e2fd019df330e11c5"], ) ], "test3.txt": [ IrodsDataObject( - "test3.txt", "test3/test3.txt", "0f034ea35fde3fca41d71cbcb13ee659", 3 * [None] + "test3.txt", + "/test3/test3.txt", + "0f034ea35fde3fca41d71cbcb13ee659", + 3 * ["0f034ea35fde3fca41d71cbcb13ee659"], ) ], - "test5.txt": [IrodsDataObject("test5.txt", "test5/test5.txt", "abcdefgh", 3 * [None])], + "test5.txt": [ + IrodsDataObject("test5.txt", "/test5/test5.txt", "abcdefgh", 3 * ["abcdefgh"]) + ], } # Setup (local) FileDataObjects: test1 = FileDataObject( @@ -109,8 +121,8 @@ def 
test_filecomparisoncheck_compare_local_and_remote_files(): expected_both = {test_dir_path / "test1": [test1], test_dir_path / "test2": [test2]} expected_local = {test_dir_path / "test3": [test3], test_dir_path / "test4": [test4]} expected_remote = { - "test3": [FileDataObject("test3.txt", "test3", "0f034ea35fde3fca41d71cbcb13ee659")], - "test5": [FileDataObject("test5.txt", "test5", "abcdefgh")], + "/test3": [FileDataObject("test3.txt", "/test3", "0f034ea35fde3fca41d71cbcb13ee659")], + "/test5": [FileDataObject("test5.txt", "/test5", "abcdefgh")], } actual_both, actual_local, actual_remote = FileComparisonChecker.compare_local_and_remote_files( local_dict, remote_dict @@ -121,7 +133,7 @@ def test_filecomparisoncheck_compare_local_and_remote_files(): # Check matching by filename only (test3 moves to expected_both) expected_both.update({test_dir_path / "test3": [test3]}) expected_local.pop(test_dir_path / "test3") - expected_remote.pop("test3") + expected_remote.pop("/test3") actual_both, actual_local, actual_remote = FileComparisonChecker.compare_local_and_remote_files( local_dict, remote_dict, filenames_only=True )
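
The kickoff changes above drop the hard-coded `DEPENDENCIES` table from `cubi_tk/snappy/common.py` and instead discover step folders and their dependencies at runtime through the new `SnappyWorkflowManager`. A minimal sketch of that flow, using the same calls as `kickoff.py` (the `./snappy-project` path is a placeholder, and `snappy-pipeline` must be importable for `from_snappy()` to return a manager):

```python
import pathlib

from toposort import toposort

from cubi_tk.snappy.snappy_workflows import SnappyWorkflowManager

# Placeholder path; any directory containing a .snappy_pipeline marker works.
project_root = pathlib.Path("./snappy-project")

manager = SnappyWorkflowManager.from_snappy()
if manager is None:
    raise RuntimeError("snappy-pipeline is not installed; dependency discovery is unavailable.")

# Map each step name to the dependencies declared by its config.yaml,
# replacing the removed static DEPENDENCIES dict.
folder_steps = manager.get_snappy_step_directories(project_root)
step_dependencies = {
    name: manager.get_workflow_step_dependencies(path) for name, path in folder_steps.items()
}

# Topological order in which `cubi-tk snappy kickoff` submits the steps via sbatch.
submission_order = [
    name
    for group in toposort({k: set(v) for k, v in step_dependencies.items()})
    for name in group
]
print(submission_order)
```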