diff --git a/config/organise_config/organise_project.yml b/config/organise_config/organise_project.yml
new file mode 100644
index 0000000..2e4a39e
--- /dev/null
+++ b/config/organise_config/organise_project.yml
@@ -0,0 +1,31 @@
+# Analysis results, readme and runfolder/runfolders.
+#Input: PROJECTID
+variables:
+  inputkey: "projectid"
+  rootpath: "/proj/ngi2016001/nobackup/NGI"
+  runfolderpath: "/proj/ngi2016001/incoming"
+  analysispath: "{rootpath}/ANALYSIS/{inputkey}"
+  deliverypath: "{rootpath}/DELIVERY/{inputkey}"
+files_to_organise:
+- source: "{analysispath}/results"
+  destination: "{deliverypath}/results"
+  options:
+    required: True
+    link_type: "softlink"
+- source: "{rootpath}/DELIVERY/softlinks/DELIVERY.README.RNASEQ.md"
+  destination: "{deliverypath}/DELIVERY.README.RNASEQ.md"
+  options:
+    required: True
+    link_type: "copy"
+- source: "{runfolderpath}/*/Projects/{inputkey}/*"
+  destination: "{deliverypath}/fastq/{runfoldername}/{samplename}/"
+  options:
+    required: True
+    link_type: "softlink"
+    filter: "{runfolderpath}/(?P<runfoldername>[^/]+)/Projects/{inputkey}/(?P=runfoldername)/(?P<samplename>[^/]+)/"
+- source: "{runfolderpath}/*/Projects/{inputkey}/*"
+  destination: "{deliverypath}/fastq/{runfoldername}/"
+  options:
+    required: True
+    link_type: "softlink"
+    filter: "{runfolderpath}/(?P<runfoldername>[^/]+)/Projects/{inputkey}/(?P=runfoldername)/(?P=runfoldername)_{inputkey}_multiqc_report[\\w.-]+"
diff --git a/config/organise_config/organise_runfolder.yml b/config/organise_config/organise_runfolder.yml
index 78c1cb2..306c45a 100644
--- a/config/organise_config/organise_runfolder.yml
+++ b/config/organise_config/organise_runfolder.yml
@@ -2,29 +2,31 @@
 #Input: runfolder_name
 #Currently named runfolderpath and runfolder to harmonize with rnaseq config by Monika, but for specific runfolder delivery, technically, these could be "merged" to one.
-runfolderpath: /proj/ngi2016001/incoming
-runfolder: /
-organised: /Projects/
+variables:
+  inputkey: "runfolder_name"
+  runfolderpath: "/proj/ngi2016001/incoming"
+  runfolder: "{runfolderpath}/{inputkey}"
+  organised: "{runfolder}/Projects"
 
 #current path is /proj/ngi2016001/incoming//Projects///
 #The following is based on that we are to keep the same directory structure as above.
 
-files_to_organize:
-
-  #The fastq files
-  - source: /Unaligned/*
-    destination: /(?P=projectid)//Sample_(?P=samplename)/
-    options:
-      required: True
-      symlink: True
-      regexp: (?P<projectid>[\w-]+)/Sample_(?P<samplename>[\w-]+)/(?P=samplename)_S(?P<samplenumber>\d+)_L(?P<lane>\d+)_R(?P<read>\d)_001.fastq.gz
-
-  #The MultiQC files
-  - source: /seqreports/project/*
-    destination: /(?P=projectid)//
-    options:
-      required: True
-      symlink: True
-      regexp: (?P<projectid>[\w-]+)/(?P<runfoldername>\w+)_(?P=projectid)_multiqc_report[\w.-]+
+files_to_organise:
+
+#The fastq files
+- source: "{runfolder}/Unaligned/*"
+  destination: "{organised}/{projectid}/{inputkey}/Sample_{samplename}/"
+  options:
+    required: True
+    link_type: "softlink"
+    filter: "(?P<projectid>[\\w-]+)/Sample_(?P<samplename>[\\w-]+)/(?P=samplename)_S(?P<samplenumber>\\d+)_L(?P<lane>\\d+)_R(?P<read>\\d)_001.fastq.gz"
+
+#The MultiQC files
+- source: "{runfolder}/seqreports/projects/*"
+  destination: "{organised}/{projectid}/{inputkey}/"
+  options:
+    required: True
+    link_type: "softlink"
+    filter: "(?P<projectid>[\\w-]+)/{inputkey}_(?P=projectid)_multiqc_report[\\w.-]+"
diff --git a/delivery/exceptions.py b/delivery/exceptions.py
index 3c95e84..0cb6fdc 100644
--- a/delivery/exceptions.py
+++ b/delivery/exceptions.py
@@ -59,13 +59,6 @@ class ProjectAlreadyDeliveredException(Exception):
     pass
 
 
-class ProjectAlreadyOrganisedException(Exception):
-    """
-    Should be raised when a project has already been organised.
-    """
-    pass
-
-
 class FileNameParsingException(Exception):
     pass
 
diff --git a/delivery/handlers/organise_handlers.py b/delivery/handlers/organise_handlers.py
index 1e05014..ca3aaf0 100644
--- a/delivery/handlers/organise_handlers.py
+++ b/delivery/handlers/organise_handlers.py
@@ -3,7 +3,7 @@
 from arteria.web.handlers import BaseRestHandler
 
 from delivery.exceptions import ProjectsDirNotfoundException, ChecksumFileNotFoundException, FileNameParsingException, \
-    SamplesheetNotFoundException, ProjectReportNotFoundException, ProjectAlreadyOrganisedException
+    SamplesheetNotFoundException, ProjectReportNotFoundException
 from delivery.handlers import OK, NOT_FOUND, INTERNAL_SERVER_ERROR, FORBIDDEN
 
 log = logging.getLogger(__name__)
@@ -72,12 +72,14 @@ def post(self, runfolder_id):
         except (ProjectsDirNotfoundException,
                 ChecksumFileNotFoundException,
                 SamplesheetNotFoundException,
-                ProjectReportNotFoundException) as e:
+                ProjectReportNotFoundException,
+                FileNotFoundError) as e:
             log.error(str(e), exc_info=e)
             self.set_status(NOT_FOUND, reason=str(e))
-        except ProjectAlreadyOrganisedException as e:
+        except PermissionError as e:
             log.error(str(e), exc_info=e)
             self.set_status(FORBIDDEN, reason=str(e))
-        except FileNameParsingException as e:
+        except (FileNameParsingException,
+                RuntimeError) as e:
             log.error(str(e), exc_info=e)
             self.set_status(INTERNAL_SERVER_ERROR, reason=str(e))
diff --git a/delivery/services/file_system_service.py b/delivery/services/file_system_service.py
index 86bb4a6..981cc80 100644
--- a/delivery/services/file_system_service.py
+++ b/delivery/services/file_system_service.py
@@ -1,6 +1,10 @@
-import os
+import glob
 import logging
+import os
+import pathlib
+import shutil
+from contextlib import contextmanager
 
 
 log = logging.getLogger(__name__)
 
@@ -11,6 +15,19 @@ class FileSystemService(object):
     easily be mocked out in testing.
""" + @staticmethod + @contextmanager + def change_directory(new_dir): + """ + A context manager for changing directory and changing back when leaving the context + """ + current_dir = os.getcwd() + try: + os.chdir(new_dir) + yield + finally: + os.chdir(current_dir) + @staticmethod def list_directories(base_path): """ @@ -81,6 +98,14 @@ def abspath(path): """ return os.path.abspath(path) + def create_parent_dirs(self, child_path): + """ + Create the parent directory structure for a child path + :param child_path: path to child + :return: None + """ + self.makedirs(self.dirname(child_path), exist_ok=True) + def symlink(self, source, link_name): """ Shadows os.symlink @@ -88,8 +113,31 @@ def symlink(self, source, link_name): :param link_name: the name of the link to create :return: None """ - self.makedirs(self.dirname(link_name), exist_ok=True) - return os.symlink(source, link_name) + self.create_parent_dirs(link_name) + return pathlib.Path(link_name).symlink_to(source) + + def hardlink(self, source, link_name): + """ + Shadows os.symlink + :param source: of link + :param link_name: the name of the link to create + :return: None + """ + self.create_parent_dirs(link_name) + return pathlib.Path(link_name).hardlink_to(source) + + def copy(self, source, dest): + """ + Shadows shutil.copyfile + :param source: + :param dest: + :return: None + """ + self.create_parent_dirs(dest) + try: + return shutil.copyfile(source, dest) + except IsADirectoryError: + return shutil.copytree(source, dest, symlinks=True) @staticmethod def mkdir(path): @@ -124,3 +172,10 @@ def rename(src, dst): @staticmethod def relpath(path, start): return os.path.relpath(path, start) + + @staticmethod + def glob(pattern, root_dir=None): + if root_dir: + with FileSystemService.change_directory(root_dir): + return FileSystemService.glob(pattern, root_dir=None) + return glob.glob(pattern, recursive=True) diff --git a/delivery/services/organise_service.py b/delivery/services/organise_service.py index 6f874b7..5049165 100644 --- a/delivery/services/organise_service.py +++ b/delivery/services/organise_service.py @@ -1,9 +1,12 @@ import logging import os +import pathlib import time - -from delivery.exceptions import ProjectAlreadyOrganisedException +import sys +import yaml +import json +import re from delivery.models.project import RunfolderProject from delivery.models.runfolder import Runfolder, RunfolderFile @@ -12,16 +15,20 @@ log = logging.getLogger(__name__) - class OrganiseService(object): """ - Starting in this context means organising a runfolder in preparation for a delivery. Each project on the runfolder - will be organised into its own separate directory. Sequence and report files will be symlinked from their original + Starting in this context means organising a runfolder in preparation + for a delivery. Each project on the runfolder + will be organised into its own separate directory. + Sequence and report files will be symlinked from their original location. This service handles that in a synchronous way. """ - - def __init__(self, runfolder_service, file_system_service=FileSystemService()): + def __init__( + self, + runfolder_service, + file_system_service=FileSystemService() + ): """ Instantiate a new OrganiseService :param runfolder_service: an instance of a RunfolderService @@ -32,30 +39,45 @@ def __init__(self, runfolder_service, file_system_service=FileSystemService()): def organise_runfolder(self, runfolder_id, lanes, projects, force): """ - Organise a runfolder in preparation for delivery. 
This will create separate subdirectories for each of the - projects and symlink all files belonging to the project to be delivered under this directory. + Organise a runfolder in preparation for delivery. + This will create separate subdirectories for each of the + projects and symlink all files belonging to the project + to be delivered under this directory. :param runfolder_id: the name of the runfolder to be organised - :param lanes: if not None, only samples on any of the specified lanes will be organised - :param projects: if not None, only projects in this list will be organised - :param force: if True, a previously organised project will be renamed with a unique suffix - :raises ProjectAlreadyOrganisedException: if a project has already been organised and force is False + :param lanes: if not None, only samples on any of the + specified lanes will be organised + :param projects: if not None, only projects in this + list will be organised + :param force: if True, a previously organised project will be + renamed with a unique suffix + :raises PermissionError: if a project has already been organised + and force is False :return: a Runfolder instance representing the runfolder after organisation """ # retrieve a runfolder object and project objects to be organised runfolder = self.runfolder_service.find_runfolder(runfolder_id) projects_on_runfolder = list( - self.runfolder_service.find_projects_on_runfolder(runfolder, only_these_projects=projects)) + self.runfolder_service.find_projects_on_runfolder( + runfolder, + only_these_projects=projects)) # handle previously organised projects organised_projects_path = os.path.join(runfolder.path, "Projects") for project in projects_on_runfolder: - self.check_previously_organised_project(project, organised_projects_path, force) + self.check_previously_organised_project( + project, + organised_projects_path, + force) # organise the projects and return a new Runfolder instance organised_projects = [] for project in projects_on_runfolder: - organised_projects.append(self.organise_project(runfolder, project, organised_projects_path, lanes)) + organised_projects.append(self.organise_project( + runfolder, + project, + organised_projects_path, + lanes)) return Runfolder( runfolder.name, @@ -63,12 +85,19 @@ def organise_runfolder(self, runfolder_id, lanes, projects, force): projects=organised_projects, checksums=runfolder.checksums) - def check_previously_organised_project(self, project, organised_projects_path, force): - organised_project_path = os.path.join(organised_projects_path, project.name) + def check_previously_organised_project( + self, + project, + organised_projects_path, + force + ): + organised_project_path = os.path.join( + organised_projects_path, + project.name) if self.file_system_service.exists(organised_project_path): msg = "Organised project path '{}' already exists".format(organised_project_path) if not force: - raise ProjectAlreadyOrganisedException(msg) + raise PermissionError(msg) organised_projects_backup_path = "{}.bak".format(organised_projects_path) backup_path = os.path.join( organised_projects_backup_path, @@ -79,22 +108,48 @@ def check_previously_organised_project(self, project, organised_projects_path, f self.file_system_service.mkdir(organised_projects_backup_path) self.file_system_service.rename(organised_project_path, backup_path) - def organise_project(self, runfolder, project, organised_projects_path, lanes): + def organise_project( + self, + runfolder, + project, + organised_projects_path, + lanes + ): """ Organise 
a project on a runfolder into its own directory and into a standard structure. If the project has - already been organised, a ProjectAlreadyOrganisedException will be raised, unless force is True. If force is + already been organised, a PermissionError will be raised, unless force is True. If force is True, the existing project path will be renamed with a unique suffix. :param runfolder: a Runfolder instance representing the runfolder on which the project belongs :param project: a Project instance representing the project to be organised :param lanes: if not None, only samples on any of the specified lanes will be organised :param force: if True, a previously organised project will be renamed with a unique suffix - :raises ProjectAlreadyOrganisedException: if project has already been organised and force is False + + Organise a project on a runfolder into its own directory and + into a standard structure. If the project has + already been organised, a PermissionError will + be raised, unless force is True. If force is + True, the existing project path will be renamed with a unique suffix. + + :param runfolder: a Runfolder instance representing the runfolder + on which the project belongs + :param project: a Project instance representing the project + to be organised + :param lanes: if not None, only samples on any of the + specified lanes will be organised + :param force: if True, a previously organised project will be + renamed with a unique suffix + :raises PermissionError: if project has already been organised and + force is False :return: a Project instance representing the project after organisation """ # symlink the samples - organised_project_path = os.path.join(organised_projects_path, project.name) - organised_project_runfolder_path = os.path.join(organised_project_path, runfolder.name) + organised_project_path = os.path.join( + organised_projects_path, + project.name) + organised_project_runfolder_path = os.path.join( + organised_project_path, + runfolder.name) organised_samples = [] for sample in project.samples: organised_samples.append( @@ -105,7 +160,8 @@ def organise_project(self, runfolder, project, organised_projects_path, lanes): # symlink the project files organised_project_files = [] if project.project_files: - project_file_base = self.file_system_service.dirname(project.project_files[0].file_path) + project_file_base = self.file_system_service.dirname( + project.project_files[0].file_path) for project_file in project.project_files: organised_project_files.append( self.organise_project_file( @@ -128,12 +184,18 @@ def organise_project(self, runfolder, project, organised_projects_path, lanes): return organised_project - def organise_project_file(self, project_file, organised_project_path, project_file_base=None): + def organise_project_file( + self, + project_file, + organised_project_path, + project_file_base=None): """ Find and symlink the project report to the organised project directory. 
- :param project: a Project instance representing the project before organisation - :param organised_project: a Project instance representing the project after organisation + :param project: a Project instance representing the + project before organisation + :param organised_project: a Project instance representing the + project after organisation """ project_file_base = project_file_base or self.file_system_service.dirname(project_file.file_path) @@ -150,26 +212,41 @@ def organise_project_file(self, project_file, organised_project_path, project_fi self.file_system_service.symlink(link_path, link_name) return RunfolderFile(link_name, file_checksum=project_file.checksum) - def organise_sample(self, sample, organised_project_path, lanes): + def organise_sample( + self, + sample, + organised_project_path, + lanes): """ - Organise a sample into its own directory under the corresponding project directory. Samples can be excluded - from organisation based on which lane they were run on. The sample directory will be named identically to the - sample id field. This may be different from the sample name field which is used as a prefix in the file name - for the sample files. This is the same behavior as e.g. bcl2fastq uses for sample id and sample name. + Organise a sample into its own directory under the + corresponding project directory. Samples can be excluded + from organisation based on which lane they were run on. + The sample directory will be named identically to the + sample id field. This may be different from the + sample name field which is used as a prefix in the file name + for the sample files. This is the same behavior + as e.g. bcl2fastq uses for sample id and sample name. :param sample: a Sample instance representing the sample to be organised - :param organised_project_path: the path to the organised project directory under which to place the sample - :param lanes: if not None, only samples run on the any of the specified lanes will be organised + :param organised_project_path: the path to the organised + project directory under which to place the sample + :param lanes: if not None, only samples run on the any of the + specified lanes will be organised :return: a new Sample instance representing the sample after organisation """ # symlink each sample in its own directory - organised_sample_path = os.path.join(organised_project_path, sample.sample_id) + organised_sample_path = os.path.join( + organised_project_path, + sample.sample_id) # symlink the sample files using relative paths organised_sample_files = [] for sample_file in sample.sample_files: - organised_sample_files.append(self.organise_sample_file(sample_file, organised_sample_path, lanes)) + organised_sample_files.append(self.organise_sample_file( + sample_file, + organised_sample_path, + lanes)) # clean up the list of sample files by removing None elements organised_sample_files = list(filter(None, organised_sample_files)) @@ -180,24 +257,39 @@ def organise_sample(self, sample, organised_project_path, lanes): sample_id=sample.sample_id, sample_files=organised_sample_files) - def organise_sample_file(self, sample_file, organised_sample_path, lanes): + def organise_sample_file( + self, + sample_file, + organised_sample_path, + lanes): """ - Organise a sample file by creating a relative symlink in the supplied directory, pointing back to the supplied - SampleFile's original file path. The Sample file can be excluded from organisation based on the lane it was - derived from. 
The supplied directory will be created if it doesn't exist. + Organise a sample file by creating a relative symlink + in the supplied directory, pointing back to the supplied + SampleFile's original file path. + The Sample file can be excluded from organisation based on + the lane it was derived from. + The supplied directory will be created if it doesn't exist. - :param sample_file: a SampleFile instance representing the sample file to be organised - :param organised_sample_path: the path to the organised sample directory under which to place the symlink - :param lanes: if not None, only sample files derived from any of the specified lanes will be organised - :return: a new SampleFile instance representing the sample file after organisation + :param sample_file: a SampleFile instance representing the + sample file to be organised + :param organised_sample_path: the path to the + organised sample directory under which to place the symlink + :param lanes: if not None, only sample files + derived from any of the specified lanes will be organised + :return: a new SampleFile instance representing the + sample file after organisation """ - # skip if the sample file data is derived from a lane that shouldn't be included + # skip if the sample file data is derived from + # a lane that shouldn't be included if lanes and sample_file.lane_no not in lanes: return None - # create the symlink in the supplied directory and relative to the file's original location + # create the symlink in the supplied directory and + # relative to the file's original location link_name = os.path.join(organised_sample_path, sample_file.file_name) - relative_path = self.file_system_service.relpath(sample_file.file_path, organised_sample_path) + relative_path = self.file_system_service.relpath( + sample_file.file_path, + organised_sample_path) self.file_system_service.symlink(relative_path, link_name) return SampleFile( link_name, @@ -207,3 +299,287 @@ def organise_sample_file(self, sample_file, organised_sample_path, lanes): read_no=sample_file.read_no, is_index=sample_file.is_index, checksum=sample_file.checksum) + + def _determine_organise_operation(self, link_type=None): + """ + Determine the organisation operation from the config. If link_type is None, the default + will be to copy. Raises a RuntimeError if link_type is neither None or one of "softlink" + or "copy". + :param link_type: None or one of "softlink" or "copy" + :return: the function reference for the organisation operation to use + :raise: RuntimeError if link_type is not recognized + """ + ops = { + "softlink": self.file_system_service.symlink, + "copy": self.file_system_service.copy + } + try: + return ops[link_type or "copy"] + except KeyError: + raise RuntimeError( + f"{link_type} is not a recognized operation") + + def _configure_organisation_entry(self, entry): + + src_path = pathlib.Path(entry[0]) + dst_path = pathlib.Path(entry[1]) + options = entry[2] + + # check explicitly if source exists since hard linking would throw an exception but + # soft links will not + required = options.get("required", False) + if not src_path.exists(): + if required: + raise FileNotFoundError(f"{src_path} does not exist") + return None + + # ensure that the destination path does not already exist + if dst_path.exists(): + raise PermissionError(f"{dst_path} already exists") + + # determine what operation should be used, i.e. 
softlink or copy (default) + organise_op = self._determine_organise_operation( + link_type=options.get("link_type")) + + return organise_op, src_path, dst_path + + def organise_with_config(self, config_yaml_file, input_value): + """ + Organise files for delivery according to a supplied config file in YAML format. + + This will parse the config and symlink files accordingly. + + :param config_yaml_file: + :param input_value: + :return: a list of paths to organised files + :raise: FileNotFoundError, PermissionError, RuntimeError + """ + + # use the config parser to resolve into source - destination entries + parsed_config_entries = self.parse_yaml_config(config_yaml_file, input_value) + log.debug(f"parsed yaml config and received {len(parsed_config_entries)} entries") + + # do a first round to check status of source and destination, basically in order to avoid + # creating half-finished organised structures. Since non-existing, non-required files + # return None, filter those out + organised_paths = [] + try: + operations = list( + filter( + lambda op: op is not None, + map( + lambda entry: self._configure_organisation_entry(entry), + parsed_config_entries))) + for operation in operations: + operation[0](operation[1], operation[2]) + organised_paths.append(operation[2]) + except (RuntimeError, + FileNotFoundError, + PermissionError) as ex: + log.debug(str(ex)) + raise + except Exception as ex: + log.debug(ex) + raise RuntimeError(ex) + + return organised_paths + + def get_paths_matching_glob_path(self, glob_path, root_dir=None): + """ + Search the file system using a path with (or without) globs and return a list yielding + the matching paths as strings. If glob_path is relative, it will be evaluated relative to + root_dir (or os.getcwd() if root_dir is None). + + :param glob_path: the glob path to match, can be absolute or relative in combination with + root_dir + :param root_dir: (optional) if the glob_path is relative, it will be evaluated relative to + this root_dir + :return: an iterator yielding the matching paths as strings + """ + return self.file_system_service.glob(glob_path, root_dir=root_dir) + + @staticmethod + def load_yaml_config(config_file_path): + """ + Open and read yaml configuration file for + organising runfolders or projects. + :param config_file_path: /path/to/config.yaml + :return: Config as dict. + """ + with open(config_file_path) as f: + config_as_dict = yaml.load(f, Loader=yaml.SafeLoader) + + return config_as_dict + + def parse_yaml_config(self, config_file_path, input_value): + """ + Read a configuration file for organizing a runfolder + or analyzed project prior to delivery. + This function will do string substitutions based on the input, + keys in the configuration file and results from resolved + regular expressions (based on patterns from the configuration file). + + :param config_file_path: Full path to a + configuration file in yaml-format. + :param input_value: A runfolder name or a + project name (for analysed projects). + :return: A list with one element per link/copy operation to be + performed during organisation. + Each element consist of a tuple with 3 values + ("path/to/source/file", + "path/to/destination/file", + "options as dictionary"). + """ + + config = self.load_yaml_config(config_file_path) + + variables_dict = config["variables"] + variables_dict["inputkey"] = input_value + + config_as_list = [] + + for key in variables_dict.keys(): + # Perform string substitutions on variable + # values based on variable keys. 
+            variables_dict[key] = self.sub_values_with_dict(
+                variables_dict[key],
+                variables_dict)
+
+        for file_element in config["files_to_organise"]:
+            # for each source-destination pair,
+            # under "files_to_organise", given in the configuration file
+            config_as_list = config_as_list + self.process_file_to_organise(
+                file_element,
+                variables_dict)
+
+        return config_as_list
+
+    def process_file_to_organise(self, file_element, variables_dict):
+        """
+        Function to handle a separate element of "files_to_organise"
+        from the configuration file.
+        Each element has one source, one destination and options available.
+        This function will resolve the source and destination into
+        several source files and destination files if applicable.
+        Unknowns in the source and destination path will be
+        resolved by substitutions or by calling resolve_config_regexp().
+        :param file_element: A dictionary with keys: source, destination, options
+            corresponding to one element of the
+            "files_to_organise" section in the configuration file.
+        :param variables_dict: A dictionary corresponding to the
+            section "variables" from the configuration file.
+        :return: A list with one element per selected file under source.
+            Each element is a tuple
+            (/source/path/, /destination/path/, {"options": "as dict"})
+        """
+
+        # Get the filter pattern given in the configuration file and
+        # substitute available values.
+        # If filter isn't defined in the configuration file, use an
+        # empty string as pattern to return all files under source.
+        config_options = file_element["options"]
+        [
+            source_config_path,
+            destination_config_path,
+            filter_config_pattern
+        ] = [
+            OrganiseService.sub_values_with_dict(
+                cfgstr,
+                variables_dict
+            ) for cfgstr in (
+                file_element["source"],
+                file_element["destination"],
+                config_options.get("filter", "")
+            )
+        ]
+        file_to_organise_list = []
+
+        # Get a list from the filesystem with all files under "source"
+        source_file_list = self.get_paths_matching_glob_path(source_config_path)
+        filtered_source_list = OrganiseService.resolve_config_regexp(
+            filter_config_pattern,
+            source_file_list)
+        for source_file_dict in filtered_source_list:
+            # substitute unknowns in the destination path based on
+            # variables resolved from the source file path
+            source_file_path = source_file_dict["source_file"]
+            destination_file = os.path.join(
+                OrganiseService.sub_values_with_dict(
+                    destination_config_path,
+                    source_file_dict),
+                os.path.basename(
+                    source_file_path
+                )
+            )
+            file_to_organise_list.append(
+                (
+                    source_file_path,
+                    destination_file,
+                    config_options
+                )
+            )
+
+        return file_to_organise_list
+
+    @staticmethod
+    def sub_values_with_dict(value, sub_dict):
+        """
+        Function to substitute values in a string based on keys from a dictionary.
+        :param value: A string with values to substitute,
+            e.g. "/path/to/{sub}/file"
+        :param sub_dict: A dictionary with one or several keys
+            that should be substituted, e.g. {"sub": "dir1"}.
+        :return: Input string with one or more parts substituted,
+            e.g. "/path/to/dir1/file".
+            If one or several keys can't be found in the dictionary, the key
+            itself will be used as the default value,
+            e.g. "/path/to/{sub1}/{sub2}/file" and {"sub1": "dir1"} as
+            input will return "/path/to/dir1/{sub2}/file".
+        """
+
+        return value.format_map(Default(sub_dict))
+
+    @staticmethod
+    def resolve_config_regexp(regexp_pattern, list_of_source_files):
+        """
+        This function will parse through a list of files/paths and
+        select all files/paths matching
+        the given regexp pattern.
+ + :param regexp_pattern: A regular expression as specified in + the configuration option "filter". + :param list_of_source_files: A list of all files under the + source specified in the configuration file, + as listed from the filesystem. + :return: A list where each element is a dictionary containing + keys specified by the input regexp pattern. + The number of elements in the list is decided by the number of + matches the re.search() returns. To each + list element, i.e. to each dictionary, the actual match + (the source file) is also added. If there is no + regexp pattern given as input the returned dictionary will + only have on key and value. + {"source_file": "/path/to/source/file"} + """ + source_file_list = [] + for file in list_of_source_files: + source_file_dict = { + "source_file": file + } + + search_re = re.search(regexp_pattern, file) + if search_re: + # If there is pattern submitted we save resolved + # variables in a dictionary + source_file_dict.update( + search_re.groupdict() + ) + + source_file_list.append(source_file_dict) + + return source_file_list + + +class Default(dict): + def __missing__(self, key): + return f"{{{key}}}" diff --git a/tests/unit_tests/services/test_file_system_service.py b/tests/unit_tests/services/test_file_system_service.py index 766b6be..5281aa9 100644 --- a/tests/unit_tests/services/test_file_system_service.py +++ b/tests/unit_tests/services/test_file_system_service.py @@ -1,4 +1,5 @@ - +import os.path +import pathlib import shutil import tempfile import unittest @@ -8,13 +9,22 @@ class TestFileSystemService(unittest.TestCase): + @staticmethod + def _tempdirs(dir, n): + return [tempfile.mkdtemp(dir=dir) for i in range(n)] + @staticmethod def _tempfiles(dir, n): - return [tempfile.mkstemp(dir=dir)[1] for i in range(n)] + files = [] + for i in range(n): + files.append(tempfile.mkstemp(dir=dir)[1]) + pathlib.Path(files[-1]).write_text("\n".join([str(i), files[-1]])) + return files @staticmethod - def _tempdirs(dir, n): - return [tempfile.mkdtemp(dir=dir) for i in range(n)] + def _content_equal(file_a, file_b): + with open(file_a, "rb") as fa, open(file_b, "rb") as fb: + return fa.read() == fb.read() def setUp(self): self.rootdir = tempfile.mkdtemp() @@ -25,6 +35,7 @@ def setUp(self): self.files.extend(self._tempfiles(self.rootdir, 3)) self.files.extend(self._tempfiles(self.dirs[0], 3)) self.files.extend(self._tempfiles(self.dirs[-1], 3)) + self.service = FileSystemService() def tearDown(self): shutil.rmtree(self.rootdir) @@ -32,5 +43,225 @@ def tearDown(self): def test_list_files_recursively(self): self.assertListEqual( sorted(self.files), - sorted(list(FileSystemService().list_files_recursively(self.rootdir))) + sorted(list(self.service.list_files_recursively(self.rootdir))) + ) + + def test_create_parent_dirs(self): + child_path = pathlib.Path(self.rootdir, "path", "to", "child", "file") + self.service.create_parent_dirs(child_path) + self.assertTrue(child_path.parent.exists()) + + def test_copy(self): + source_lines = ["abc", "def", "ghi", "jkl", "mno", "pqr", "stu", "vwx", "yz!"] + source_file = self.files[-1] + dest_file = self.files[0] + with open(source_file, "w") as fh: + fh.writelines(source_lines) + + self.service.copy(source_file, dest_file) + self.assertTrue(self._content_equal(source_file, dest_file)) + self.assertFalse(os.path.samefile(source_file, dest_file)) + + def test_copy_dir(self): + + def _list_dirs_and_files(rdir): + paths = [] + for root, subdirs, files in os.walk(rdir): + r = os.path.relpath(root, rdir) + 
paths.extend([os.path.join(r, p) for p in subdirs + files]) + return sorted(paths) + + with tempfile.TemporaryDirectory() as dest_root: + dest = os.path.join(dest_root, os.path.basename(self.rootdir)) + self.service.copy(self.rootdir, dest) + + expected = _list_dirs_and_files(self.rootdir) + observed = _list_dirs_and_files(dest) + + self.assertListEqual(observed, expected) + + # assert that the files are identical + for f in filter( + lambda x: x.is_file(), + map( + lambda p: pathlib.Path(self.rootdir) / p, + expected + ) + ): + self.assertEqual( + (pathlib.Path(dest) / f.relative_to(self.rootdir)).read_text(), + f.read_text() + ) + + def test_change_directory(self): + start_dir = pathlib.Path.cwd() + target_dir = pathlib.Path(self.rootdir) + + os.chdir(start_dir) + # assert that we have moved to the user's home directory + self.assertTrue(start_dir.samefile(os.getcwd())) + # assert that the context manager changes to the target directory + with self.service.change_directory(target_dir): + self.assertTrue(target_dir.samefile(os.getcwd())) + # assert that we have moved back to the user's home directory when leaving the context + self.assertTrue(start_dir.samefile(os.getcwd())) + + def _link_helper(self, fn): + source_file = self.files[-1] + dest_file = self.files[0] + + # assert that an existing destination throws an exception + with self.assertRaises(FileExistsError): + self.service.symlink(source_file, dest_file) + + os.unlink(dest_file) + fn(source_file, dest_file) + + self.assertTrue(os.path.samefile(source_file, dest_file)) + + return source_file, dest_file + + def test_hardlink(self): + src, dst = self._link_helper(self.service.hardlink) + self.assertFalse(pathlib.Path(dst).is_symlink()) + + def test_symlink(self): + src, dst = self._link_helper(self.service.symlink) + self.assertTrue(pathlib.Path(dst).is_symlink()) + + def _create_disk_structure(self): + dirs = [ + "dir_AA", + "dir_AB", + "dir_BB", + ] + subdirs = [ + "subdir_01", + "subdir_11", + "subdir_10", + ] + files = [ + "file_XX.txt", + "file_XY.txt.gz", + "ZZ_file_YY.txt", + ] + paths = [] + + def _create_file_path(p): + self.service.create_parent_dirs(str(p)) + p.touch() + paths.append(p.relative_to(self.rootdir)) + + # create the file structure + for f in files: + r = pathlib.Path(self.rootdir) + _create_file_path(r / f) + for d in dirs: + d = r / d + _create_file_path(d / f) + for s in subdirs: + s = d / s + _create_file_path(s / f) + + return paths + + def _absolute_glob_helper(self, pattern, expected_paths): + self._glob_helper(pattern, expected_paths, root_dir="", relative_dir=self.rootdir) + + def _relative_glob_helper(self, pattern, expected_paths): + self._glob_helper(pattern, expected_paths, root_dir=self.rootdir, relative_dir="") + + def _glob_helper(self, pattern, expected_paths, root_dir, relative_dir): + obs = sorted([ + str(pathlib.Path(p)) + for p in self.service.glob(os.path.join(relative_dir, pattern), root_dir=root_dir) + ]) + exp = sorted([ + str((pathlib.Path(relative_dir) / p)) + for p in expected_paths + ]) + self.assertListEqual(obs, exp) + + def test_rootdir_file_glob(self): + paths = self._create_disk_structure() + pattern = "*.txt" + expected_paths = list( + filter( + lambda p: p.parent.name == "" and p.suffix == ".txt", + paths)) + self._absolute_glob_helper(pattern, expected_paths) + self._relative_glob_helper(pattern, expected_paths) + + def test_subdir_file_glob(self): + paths = self._create_disk_structure() + pattern = "*/*.txt" + expected_paths = list( + filter( + lambda p: 
p.parent.name.startswith("dir_") and p.suffix == ".txt", + paths)) + self._absolute_glob_helper(pattern, expected_paths) + self._relative_glob_helper(pattern, expected_paths) + + def test_anydir_file_glob(self): + paths = self._create_disk_structure() + pattern = "**/*.txt" + expected_paths = list( + filter( + lambda p: p.suffix == ".txt", + paths)) + self._absolute_glob_helper(pattern, expected_paths) + self._relative_glob_helper(pattern, expected_paths) + + def test_dirsuffix_anydir_file_glob(self): + paths = self._create_disk_structure() + pattern = "*B/**/*.gz" + expected_paths = list( + filter( + lambda p: p.parts[0].endswith("B") and p.suffix == ".gz", + paths)) + self._absolute_glob_helper(pattern, expected_paths) + self._relative_glob_helper(pattern, expected_paths) + + def test_dirsuffix_anyfile_glob(self): + paths = self._create_disk_structure() + pattern = "**/*10/*" + expected_paths = list( + filter( + lambda p: p.parent.name.endswith("10"), + paths)) + self._absolute_glob_helper(pattern, expected_paths) + self._relative_glob_helper(pattern, expected_paths) + + def test_rootsuffix_anyfile_glob(self): + paths = self._create_disk_structure() + pattern = "*10/*" + expected_paths = list( + filter( + lambda p: len(p.parts) > 1 and p.parts[0].endswith("10"), + paths)) + self._absolute_glob_helper(pattern, expected_paths) + self._relative_glob_helper(pattern, expected_paths) + + def test_noglob_file_glob(self): + paths = self._create_disk_structure() + pattern = paths[0].name + expected_paths = [paths[0].name] + self._absolute_glob_helper(pattern, expected_paths) + self._relative_glob_helper(pattern, expected_paths) + + def test_anyfile_glob(self): + paths = self._create_disk_structure() + pattern = "dir_*/**" + expected_paths = list( + filter( + lambda p: len(p.parts) > 1 and p.parts[0].startswith("dir_"), + paths)) + expected_paths.extend( + list( + set( + [d for p in paths for d in p.parents if d.name] + ) + ) ) + self._absolute_glob_helper(pattern, expected_paths) + self._relative_glob_helper(pattern, expected_paths) diff --git a/tests/unit_tests/services/test_organise_service.py b/tests/unit_tests/services/test_organise_service.py index 3be72a0..6900e19 100644 --- a/tests/unit_tests/services/test_organise_service.py +++ b/tests/unit_tests/services/test_organise_service.py @@ -1,10 +1,12 @@ +import pathlib +import tempfile + import mock import os import unittest +import uuid -from delivery.exceptions import ProjectAlreadyOrganisedException from delivery.models.runfolder import RunfolderFile -from delivery.models.sample import Sample from delivery.repositories.project_repository import GeneralProjectRepository from delivery.repositories.sample_repository import RunfolderProjectBasedSampleRepository from delivery.services.file_system_service import FileSystemService @@ -27,7 +29,8 @@ def setUp(self): self.file_system_service = mock.MagicMock(spec=FileSystemService) self.runfolder_service = mock.MagicMock(spec=RunfolderService) self.project_repository = mock.MagicMock(spec=GeneralProjectRepository) - self.sample_repository = mock.MagicMock(spec=RunfolderProjectBasedSampleRepository) + self.sample_repository = mock.MagicMock( + spec=RunfolderProjectBasedSampleRepository) self.organise_service = OrganiseService( self.runfolder_service, file_system_service=self.file_system_service) @@ -36,12 +39,19 @@ def test_organise_runfolder(self): self.runfolder_service.find_runfolder.return_value = self.runfolder self.runfolder_service.find_projects_on_runfolder.side_effect = [[self.project]] 
self.file_system_service.exists.return_value = False - with mock.patch.object(self.organise_service, "organise_project", autospec=True) as organise_project_mock: + with mock.patch.object( + self.organise_service, + "organise_project", + autospec=True) as organise_project_mock: runfolder_id = self.runfolder.name lanes = [1, 2, 3] projects = ["a", "b", "c"] force = False - self.organise_service.organise_runfolder(runfolder_id, lanes, projects, force) + self.organise_service.organise_runfolder( + runfolder_id, + lanes, + projects, + force) organise_project_mock.assert_called_once_with( self.runfolder, self.project, @@ -64,7 +74,7 @@ def test_check_previously_organised_project(self): # previously organised and not forced self.file_system_service.exists.return_value = True self.assertRaises( - ProjectAlreadyOrganisedException, + PermissionError, self.organise_service.check_previously_organised_project, self.project, organised_projects_path, @@ -79,7 +89,10 @@ def test_check_previously_organised_project(self): def test_organise_runfolder_already_organised(self): self.runfolder_service.find_runfolder.return_value = self.runfolder self.file_system_service.exists.return_value = True - with mock.patch.object(self.organise_service, "organise_project", autospec=True) as organise_project_mock: + with mock.patch.object( + self.organise_service, + "organise_project", + autospec=True) as organise_project_mock: expected_organised_project = "this-is-an-organised-project" organise_project_mock.return_value = expected_organised_project self.runfolder_service.find_projects_on_runfolder.side_effect = [[self.project], [self.project]] @@ -87,7 +100,7 @@ def test_organise_runfolder_already_organised(self): # without force self.assertRaises( - ProjectAlreadyOrganisedException, + PermissionError, self.organise_service.organise_runfolder, runfolder_id, [], [], False) @@ -106,7 +119,11 @@ def test_organise_project(self): self.file_system_service.dirname.side_effect = os.path.dirname lanes = [1, 2, 3] organised_projects_path = os.path.join(self.project.runfolder_path, "Projects") - self.organise_service.organise_project(self.runfolder, self.project, organised_projects_path, lanes) + self.organise_service.organise_project( + self.runfolder, + self.project, + organised_projects_path, + lanes) organise_sample_mock.assert_has_calls([ mock.call( sample, @@ -116,8 +133,12 @@ def test_organise_project(self): organise_project_file_mock.assert_has_calls([ mock.call( project_file, - os.path.join(organised_projects_path, self.project.name, self.project.runfolder_name), - project_file_base=os.path.dirname(self.project.project_files[0].file_path) + os.path.join( + organised_projects_path, + self.project.name, + self.project.runfolder_name), + project_file_base=os.path.dirname( + self.project.project_files[0].file_path) ) for project_file in self.project.project_files]) @@ -126,7 +147,10 @@ def test_organise_sample(self): self.file_system_service.relpath.side_effect = os.path.relpath self.file_system_service.dirname.side_effect = os.path.dirname for sample in self.project.samples: - organised_sample = self.organise_service.organise_sample(sample, self.organised_project_path, []) + organised_sample = self.organise_service.organise_sample( + sample, + self.organised_project_path, + []) sample_file_dir = os.path.relpath( os.path.dirname( sample.sample_files[0].file_path), @@ -134,19 +158,26 @@ def test_organise_sample(self): relative_path = os.path.join("..", "..", "..", "..", sample_file_dir) 
self.file_system_service.symlink.assert_has_calls([ mock.call( - os.path.join(relative_path, os.path.basename(sample_file.file_path)), + os.path.join(relative_path, os.path.basename( + sample_file.file_path)), sample_file.file_path) for sample_file in organised_sample.sample_files]) def test_organise_sample_exclude_by_lane(self): # all sample lanes are excluded for sample in self.project.samples: - organised_sample = self.organise_service.organise_sample(sample, self.organised_project_path, [0]) + organised_sample = self.organise_service.organise_sample( + sample, + self.organised_project_path, + [0]) self.assertListEqual([], organised_sample.sample_files) # a specific sample lane is excluded for sample in self.project.samples: - organised_sample = self.organise_service.organise_sample(sample, self.organised_project_path, [2, 3]) + organised_sample = self.organise_service.organise_sample( + sample, + self.organised_project_path, + [2, 3]) self.assertListEqual( list(map(lambda x: x.file_name, filter(lambda f: f.lane_no in [2, 3], sample.sample_files))), list(map(lambda x: x.file_name, organised_sample.sample_files))) @@ -184,7 +215,13 @@ def test_organise_sample_file(self): os.path.dirname(sample_file.file_path)), os.path.basename(sample_file.file_path)), expected_link_path) - for attr in ("file_name", "sample_name", "sample_index", "lane_no", "read_no", "is_index", "checksum"): + for attr in ("file_name", + "sample_name", + "sample_index", + "lane_no", + "read_no", + "is_index", + "checksum"): self.assertEqual( getattr(sample_file, attr), getattr(organised_sample_file, attr)) @@ -198,7 +235,9 @@ def test_organise_project_file(self): project_file_base, project_file), file_checksum="checksum-for-{}".format(project_file)) - for project_file in ("a-report-file", os.path.join("report-dir", "another-report-file"))] + for project_file in ("a-report-file", os.path.join( + "report-dir", + "another-report-file"))] self.file_system_service.relpath.side_effect = os.path.relpath self.file_system_service.dirname.side_effect = os.path.dirname for project_file in project_files: @@ -217,3 +256,448 @@ def test_organise_project_file(self): mock.call( os.path.join("..", "..", "..", "foo", "report-dir", "another-report-file"), os.path.join(organised_project_path, "report-dir", "another-report-file"))]) + + def test__determine_organise_operation(self): + ops = ["softlink", "copy"] + self.file_system_service.symlink.return_value = ops[0] + self.file_system_service.copy.return_value = ops[1] + + for op in ops: + fn = self.organise_service._determine_organise_operation(link_type=op) + self.assertEqual(op, fn()) + + def test__determine_organise_operation_default(self): + # assert copy is the default + op = "copy" + self.file_system_service.copy.return_value = op + self.assertEqual( + self.organise_service._determine_organise_operation()(), + op + ) + + def test__determine_organise_operation_illegal(self): + # assert unrecognized operation throws exception + with self.assertRaises(RuntimeError): + self.organise_service._determine_organise_operation( + link_type="not-a-recognized-link-type") + + def test__configure_organisation_entry(self): + fn_name = "softlink" + self.file_system_service.symlink.return_value = fn_name + + with tempfile.TemporaryDirectory() as dir: + entry = ( + pathlib.Path(dir, "source"), + pathlib.Path(dir, "dest"), + { + "required": True, + "link_type": fn_name + } + ) + entry[0].touch() + + fn, src, dst = self.organise_service._configure_organisation_entry(entry) + self.assertEqual(fn(), 
fn_name) + self.assertEqual(src, entry[0]) + self.assertEqual(dst, entry[1]) + + def test__configure_organisation_entry_existing(self): + # an existing destination file should raise an exception + with tempfile.TemporaryDirectory() as dir: + entry = ( + pathlib.Path(dir, "source"), + pathlib.Path(dir, "dest"), + {} + ) + entry[0].touch() + entry[1].touch() + with self.assertRaises(PermissionError): + self.organise_service._configure_organisation_entry(entry) + + def test__configure_organisation_entry_missing_required(self): + # a missing required file should raise an exception + entry = ( + "non-existing-source-file", + "non-existing-destination-file", + { + "required": True + } + ) + with self.assertRaises(FileNotFoundError): + self.organise_service._configure_organisation_entry(entry) + + def test__configure_organisation_entry_missing_nonrequired(self): + # a missing required file should raise an exception + entry = ( + "non-existing-source-file", + "non-existing-destination-file", + { + "required": False + } + ) + self.assertIsNone( + self.organise_service._configure_organisation_entry(entry)) + + def test_organise_with_config_file_not_found(self): + with mock.patch.object(self.organise_service, "parse_yaml_config") as parser_mock: + cfg = ( + "not-existing-source-file", + "some-destination-file", + { + "required": True + } + ) + parser_mock.return_value = [cfg] + with self.assertRaises(FileNotFoundError): + self.organise_service.organise_with_config( + "this-would-be-a-yaml-file", + "this-is-a-path-to-runfolder-or-project") + + def test_organise_with_config_illegal(self): + with mock.patch.object(self.organise_service, "parse_yaml_config") as parser_mock: + cfg = [("not-a", "valid config")] + parser_mock.return_value = cfg + with self.assertRaises(RuntimeError): + self.organise_service.organise_with_config( + "this-would-be-a-yaml-file", + "this-is-a-path-to-runfolder-or-project") + + def test_organise_with_config_illegal_link(self): + with tempfile.TemporaryDirectory() as tdir, \ + mock.patch.object(self.organise_service, "parse_yaml_config") as parser_mock: + cfg = ( + pathlib.Path(tdir, "source_file"), + pathlib.Path(tdir, "destination_file"), + { + "required": True, + "link_type": "not-a-valid-link-type" + } + ) + parser_mock.return_value = [cfg] + cfg[0].touch() + with self.assertRaises(RuntimeError): + self.organise_service.organise_with_config( + "this-would-be-a-yaml-file", + "this-is-a-path-to-runfolder-or-project") + + def test_organise_with_config_destination_exists(self): + with tempfile.TemporaryDirectory() as tdir, \ + mock.patch.object(self.organise_service, "parse_yaml_config") as parser_mock: + cfg = (tdir, tdir, {}) + parser_mock.return_value = [cfg] + with self.assertRaises(PermissionError): + self.organise_service.organise_with_config( + "this-would-be-a-yaml-file", + "this-is-a-path-to-runfolder-or-project") + + def test_organise_with_config_oserror(self): + with tempfile.TemporaryDirectory() as tdir, \ + mock.patch.object(self.organise_service, "parse_yaml_config") as parser_mock: + cfg = ( + pathlib.Path(tdir, "source_file"), + pathlib.Path(tdir, "destination_file"), + { + "required": True, + "link_type": "copy" + } + ) + parser_mock.return_value = [cfg] + cfg[0].touch() + with self.assertRaises(RuntimeError): + self.organise_service.file_system_service.copy.side_effect = OSError( + "just-a-mocked-exception") + self.organise_service.organise_with_config( + "this-would-be-a-yaml-file", + "this-is-a-path-to-runfolder-or-project") + + def 
test_parse_yaml_config_project(self): + config_file_path = "config/organise_config/organise_project.yml" + input_value = "ABC-123" + + with mock.patch.object( + self.organise_service, + "load_yaml_config" + ) as cfg_mock, mock.patch.object( + self.organise_service, + "get_paths_matching_glob_path" + ) as glob_mock: + + cfg_mock.return_value = { + "variables": { + "inputkey": "projectid", + "rootpath": "/proj/ngi2016001/nobackup/NGI", + "runfolderpath": "/proj/ngi2016001/incoming", + "analysispath": "{rootpath}/ANALYSIS/{inputkey}", + "deliverypath": "{rootpath}/DELIVERY/{inputkey}" + }, + "files_to_organise": [ + { + "source": "{analysispath}/results", + "destination": "{deliverypath}", + "options": { + "required": True, + "link_type": "softlink" + } + } + ] + } + + def _glob_mirror(src): + return [src] + + glob_mock.side_effect = _glob_mirror + + expected_cfg = [ + ( + f"/proj/ngi2016001/nobackup/NGI/ANALYSIS/{input_value}/results", + f"/proj/ngi2016001/nobackup/NGI/DELIVERY/{input_value}/results", + { + "required": True, + "link_type": "softlink" + } + ) + ] + self.assertEqual( + expected_cfg, + self.organise_service.parse_yaml_config( + config_file_path, + input_value) + ) + + def test_parse_yaml_config_empty_source(self): + ''' + If the input (i.e. the list of files) is empty + the output will be empty in this case since + we have a "filter" option for all files_to_organise in + the given configuration file. + ''' + config_file_path = "config/organise_config/organise_runfolder.yml" + input_value = "200624_A00834_0183_BHMTFYDRXX" + with mock.patch.object(self.organise_service, "get_paths_matching_glob_path") as glob_mock: + glob_mock.return_value = [] + self.assertEqual( + self.organise_service.parse_yaml_config( + config_file_path, + input_value + ), + [] + ) + + def test_parse_yaml_config_runfolder_fastq(self): + config_file_path = "/path/to/config.yaml" + input_value = "200624_A00834_0183_BHMTFYDRXX" + + with mock.patch.object( + self.organise_service, + "load_yaml_config" + ) as cfg_mock, mock.patch.object( + self.organise_service, + "get_paths_matching_glob_path" + ) as glob_mock: + cfg_mock.return_value = { + "variables": { + "inputkey": "runfolder_name", + "runfolderpath": "/proj/ngi2016001/incoming", + "runfolder": "{runfolderpath}/{inputkey}", + "organised": "{runfolder}/Projects" + }, + "files_to_organise": [ + { + "source": "{runfolder}/Unaligned/**", + "destination": "{organised}/{projectid}/{inputkey}/Sample_{samplename}/", + "options": { + "required": True, + "link_type": "softlink", + "filter": "(?P[\\w-]+)/Sample_(?P[\\w-]+)/" + "(?P=samplename)_S(?P\\d+)_L(?P\\d+)_" + "R(?P\\d)_001.fastq.gz" + } + } + ] + } + + glob_mock.return_value = [ + "/proj/ngi2016001/incoming/200624_A00834_0183_BHMTFYDRXX/Unaligned/" + "AB-1234/Sample_AB-1234-14092/AB-1234-14092_S35_L001_R1_001.fastq.gz", + "/proj/ngi2016001/incoming/200624_A00834_0183_BHMTFYDRXX/Unaligned/" + "AB-1234/Sample_AB-1234-14597/AB-1234-14597_S35_L001_R1_001.fastq.gz", + "/proj/ngi2016001/incoming/200624_A00834_0183_BHMTFYDRXX/Unaligned/" + "CD-5678/Sample_CD-5678-1/CD-5678-1_S89_L001_R1_001.fastq.gz" + ] + + expected_output = [ + ( + "/proj/ngi2016001/incoming/200624_A00834_0183_BHMTFYDRXX/Unaligned/" + "AB-1234/Sample_AB-1234-14092/AB-1234-14092_S35_L001_R1_001.fastq.gz", + "/proj/ngi2016001/incoming/200624_A00834_0183_BHMTFYDRXX/Projects/" + "AB-1234/200624_A00834_0183_BHMTFYDRXX/Sample_AB-1234-14092/" + "AB-1234-14092_S35_L001_R1_001.fastq.gz", + { + "required": True, + "link_type": "softlink", + "filter": 
"(?P[\\w-]+)/Sample_(?P[\\w-]+)/" + "(?P=samplename)_S(?P\\d+)_L(?P\\d+)_" + "R(?P\\d)_001.fastq.gz" + } + ), + ( + "/proj/ngi2016001/incoming/200624_A00834_0183_BHMTFYDRXX/Unaligned/" + "AB-1234/Sample_AB-1234-14597/AB-1234-14597_S35_L001_R1_001.fastq.gz", + "/proj/ngi2016001/incoming/200624_A00834_0183_BHMTFYDRXX/Projects/" + "AB-1234/200624_A00834_0183_BHMTFYDRXX/Sample_AB-1234-14597/" + "AB-1234-14597_S35_L001_R1_001.fastq.gz", + { + "required": True, + "link_type": "softlink", + "filter": "(?P[\\w-]+)/Sample_(?P[\\w-]+)/" + "(?P=samplename)_S(?P\\d+)_L(?P\\d+)_" + "R(?P\\d)_001.fastq.gz" + } + ), + ( + "/proj/ngi2016001/incoming/200624_A00834_0183_BHMTFYDRXX/Unaligned/" + "CD-5678/Sample_CD-5678-1/CD-5678-1_S89_L001_R1_001.fastq.gz", + "/proj/ngi2016001/incoming/200624_A00834_0183_BHMTFYDRXX/Projects/" + "CD-5678/200624_A00834_0183_BHMTFYDRXX/Sample_CD-5678-1/" + "CD-5678-1_S89_L001_R1_001.fastq.gz", + { + "required": True, + "link_type": "softlink", + "filter": "(?P[\\w-]+)/Sample_(?P[\\w-]+)/" + "(?P=samplename)_S(?P\\d+)_L(?P\\d+)_" + "R(?P\\d)_001.fastq.gz" + } + ) + ] + + self.assertEqual( + self.organise_service.parse_yaml_config( + config_file_path, + input_value), + expected_output + ) + + def test_parse_yaml_config_runfolder_reports(self): + config_file_path = "/path/to/config.yaml" + input_value = "200624_A00834_0183_BHMTFYDRXX" + + with mock.patch.object( + self.organise_service, + "load_yaml_config" + ) as cfg_mock, mock.patch.object( + self.organise_service, + "get_paths_matching_glob_path" + ) as glob_mock: + cfg_mock.return_value = { + "variables": { + "inputkey": "runfolder_name", + "runfolderpath": "/proj/ngi2016001/incoming", + "runfolder": "{runfolderpath}/{inputkey}", + "organised": "{runfolder}/Projects" + }, + "files_to_organise": [ + { + "source": "{runfolder}/seqreports/projects/**", + "destination": "{organised}/{projectid}/{inputkey}/", + "options": { + "required": True, + "link_type": "softlink", + "filter": "(?P[\\w-]+)/{inputkey}_" + "(?P=projectid)_multiqc_report[\\w.-]+" + } + } + ] + } + + glob_mock.return_value = [ + "/proj/ngi2016001/incoming/200624_A00834_0183_BHMTFYDRXX/seqreports/projects/" + "AB-1234/200624_A00834_0183_BHMTFYDRXX_AB-1234_multiqc_report.html", + "/proj/ngi2016001/incoming/200624_A00834_0183_BHMTFYDRXX/seqreports/projects/" + "CD-5678/200624_A00834_0183_BHMTFYDRXX_CD-5678_multiqc_report.html" + ] + + expected_output = [ + ( + "/proj/ngi2016001/incoming/200624_A00834_0183_BHMTFYDRXX/seqreports/projects/" + "AB-1234/200624_A00834_0183_BHMTFYDRXX_AB-1234_multiqc_report.html", + "/proj/ngi2016001/incoming/200624_A00834_0183_BHMTFYDRXX/Projects/" + "AB-1234/200624_A00834_0183_BHMTFYDRXX/" + "200624_A00834_0183_BHMTFYDRXX_AB-1234_multiqc_report.html", + { + "required": True, + "link_type": "softlink", + "filter": "(?P[\\w-]+)/{inputkey}_(?P=projectid)_" + "multiqc_report[\\w.-]+" + } + ), + ( + "/proj/ngi2016001/incoming/200624_A00834_0183_BHMTFYDRXX/seqreports/projects/" + "CD-5678/200624_A00834_0183_BHMTFYDRXX_CD-5678_multiqc_report.html", + "/proj/ngi2016001/incoming/200624_A00834_0183_BHMTFYDRXX/Projects/" + "CD-5678/200624_A00834_0183_BHMTFYDRXX/" + "200624_A00834_0183_BHMTFYDRXX_CD-5678_multiqc_report.html", + { + "required": True, + "link_type": "softlink", + "filter": "(?P[\\w-]+)/{inputkey}_(?P=projectid)_" + "multiqc_report[\\w.-]+" + } + ) + ] + + self.assertEqual( + self.organise_service.parse_yaml_config( + config_file_path, + input_value), + expected_output + ) + + def test_parse_yaml_config_no_filter(self): + 
config_file_path = "/path/to/config.yaml" + input_value = "AB-1234" + + with mock.patch.object( + self.organise_service, + "load_yaml_config" + ) as cfg_mock, mock.patch.object( + self.organise_service, + "get_paths_matching_glob_path" + ) as glob_mock: + cfg_mock.return_value = { + "variables": { + "inputkey": "projectid", + "rootpath": "/proj/ngi2016001/nobackup/NGI", + "runfolderpath": "/proj/ngi2016001/incoming", + "analysispath": "{rootpath}/ANALYSIS/{inputkey}", + "deliverypath": "{rootpath}/DELIVERY/{inputkey}" + }, + "files_to_organise": [ + { + "source": "{analysispath}/results", + "destination": "{deliverypath}", + "options": { + "required": True, + "link_type": "softlink" + } + } + ] + } + glob_mock.return_value = [ + "/proj/ngi2016001/nobackup/NGI/ANALYSIS/AB-1234/results" + ] + + expected_output = [( + "/proj/ngi2016001/nobackup/NGI/ANALYSIS/AB-1234/results", + "/proj/ngi2016001/nobackup/NGI/DELIVERY/AB-1234/results", + { + "required": True, + "link_type": "softlink" + } + )] + + self.assertEqual( + self.organise_service.parse_yaml_config( + config_file_path, + input_value), + expected_output + )
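Note on variable resolution in these configs: parse_yaml_config() substitutes the {placeholder} values with str.format_map() and the Default dict subclass, whose __missing__ hook returns the placeholder itself, so keys that are only known later (for example regex group values such as {runfoldername}) pass through unresolved until a second substitution round. A minimal sketch of that behaviour, using values from organise_project.yml and the project id from the unit tests:

    class Default(dict):
        def __missing__(self, key):
            # unresolved placeholders are kept verbatim, e.g. "{runfoldername}"
            return f"{{{key}}}"

    variables = {
        "inputkey": "ABC-123",
        "rootpath": "/proj/ngi2016001/nobackup/NGI",
        "analysispath": "{rootpath}/ANALYSIS/{inputkey}",
        "deliverypath": "{rootpath}/DELIVERY/{inputkey}",
    }

    # resolve the variables against each other, as parse_yaml_config() does
    for key in variables:
        variables[key] = variables[key].format_map(Default(variables))

    print(variables["deliverypath"])
    # /proj/ngi2016001/nobackup/NGI/DELIVERY/ABC-123

    # keys that are not in the dictionary survive for a later substitution round
    print("{deliverypath}/fastq/{runfoldername}/".format_map(Default(variables)))
    # /proj/ngi2016001/nobackup/NGI/DELIVERY/ABC-123/fastq/{runfoldername}/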
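The "filter" option ties a regular expression with named groups to the destination template: every path returned by the source glob is run through re.search(), and the match's groupdict() feeds a second substitution pass over the destination before the file's basename is appended. A sketch of that flow for the fastq entry of organise_runfolder.yml, using one of the paths from the unit tests; only projectid and samplename are referenced by the destination, the names of the purely numeric groups are illustrative:

    import os
    import re

    destination = "{organised}/{projectid}/{inputkey}/Sample_{samplename}/"
    fastq_filter = (
        r"(?P<projectid>[\w-]+)/Sample_(?P<samplename>[\w-]+)/"
        r"(?P=samplename)_S(?P<samplenumber>\d+)_L(?P<lane>\d+)_R(?P<read>\d)_001.fastq.gz"
    )
    source_file = (
        "/proj/ngi2016001/incoming/200624_A00834_0183_BHMTFYDRXX/Unaligned/"
        "AB-1234/Sample_AB-1234-14092/AB-1234-14092_S35_L001_R1_001.fastq.gz"
    )

    substitutions = {
        "organised": "/proj/ngi2016001/incoming/200624_A00834_0183_BHMTFYDRXX/Projects",
        "inputkey": "200624_A00834_0183_BHMTFYDRXX",
        **re.search(fastq_filter, source_file).groupdict(),
    }
    print(os.path.join(destination.format_map(substitutions),
                       os.path.basename(source_file)))
    # /proj/ngi2016001/incoming/200624_A00834_0183_BHMTFYDRXX/Projects/AB-1234/
    #   200624_A00834_0183_BHMTFYDRXX/Sample_AB-1234-14092/AB-1234-14092_S35_L001_R1_001.fastq.gz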
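organise_with_config() resolves and validates every (source, destination, options) entry before performing any link or copy, so a missing required source (FileNotFoundError, reported as 404 by the handler), an already existing destination (PermissionError, 403) or an unknown link_type (RuntimeError, 500) aborts the request before anything is half-organised. A self-contained sketch of driving it against a scratch directory; the YAML mirrors the config used in the unit tests, and the service is constructed without a runfolder service since this code path never touches it:

    import pathlib
    import tempfile

    from delivery.services.organise_service import OrganiseService

    config_template = """\
    variables:
      inputkey: "projectid"
      rootpath: "{root}"
      analysispath: "{{rootpath}}/ANALYSIS/{{inputkey}}"
      deliverypath: "{{rootpath}}/DELIVERY/{{inputkey}}"
    files_to_organise:
    - source: "{{analysispath}}/results"
      destination: "{{deliverypath}}"
      options:
        required: True
        link_type: "softlink"
    """

    with tempfile.TemporaryDirectory() as root:
        # a pretend analysis result directory for project ABC-123
        pathlib.Path(root, "ANALYSIS", "ABC-123", "results").mkdir(parents=True)
        config_file = pathlib.Path(root, "organise_project.yml")
        config_file.write_text(config_template.format(root=root))

        service = OrganiseService(runfolder_service=None)
        organised = service.organise_with_config(str(config_file), "ABC-123")
        print(organised)
        # [PosixPath('<root>/DELIVERY/ABC-123/results')], now a softlink to ANALYSIS/ABC-123/results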
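FileSystemService.glob() takes a root_dir argument even though glob.glob() only grew native root_dir support in Python 3.10; it emulates it by temporarily changing directory with the change_directory context manager and always enables recursive "**" matching. A small usage sketch; the runfolder path and pattern below are examples only:

    from delivery.services.file_system_service import FileSystemService

    service = FileSystemService()
    runfolder = "/proj/ngi2016001/incoming/200624_A00834_0183_BHMTFYDRXX"

    # absolute pattern: evaluated as-is
    absolute_hits = service.glob(runfolder + "/Unaligned/*/Sample_*/*_001.fastq.gz")

    # relative pattern: evaluated inside root_dir, results come back relative to it
    relative_hits = service.glob("Unaligned/*/Sample_*/*_001.fastq.gz", root_dir=runfolder)
    # e.g. ['Unaligned/AB-1234/Sample_AB-1234-14092/AB-1234-14092_S35_L001_R1_001.fastq.gz']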
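The link helpers now go through pathlib: the method is invoked on the link path and the existing file is passed as the argument, and Path.hardlink_to() is only available from Python 3.10 (its predecessor Path.link_to() had the arguments the other way around), so the service effectively assumes a 3.10+ interpreter. A throw-away illustration of the call direction:

    import pathlib
    import tempfile

    with tempfile.TemporaryDirectory() as tdir:
        target = pathlib.Path(tdir, "data.txt")
        target.write_text("payload")

        link = pathlib.Path(tdir, "data.softlink")
        link.symlink_to(target)            # link is created, target must already exist
        assert link.is_symlink() and link.read_text() == "payload"

        hard = pathlib.Path(tdir, "data.hardlink")
        hard.hardlink_to(target)           # Python >= 3.10
        assert not hard.is_symlink() and hard.read_text() == "payload"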
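Since FileSystemService.glob() always passes recursive=True, a "**" component matches zero or more directory levels, which is why a pattern such as "dir_*/**" in test_anyfile_glob is expected to return the intermediate directories as well as the files beneath them. A scratch-tree sketch of that behaviour with the plain glob module:

    import glob
    import os
    import pathlib
    import tempfile

    with tempfile.TemporaryDirectory() as root:
        pathlib.Path(root, "dir_AA", "subdir_01").mkdir(parents=True)
        pathlib.Path(root, "dir_AA", "file_XX.txt").touch()
        pathlib.Path(root, "dir_AA", "subdir_01", "file_XY.txt").touch()

        matches = glob.glob(os.path.join(root, "dir_*", "**"), recursive=True)
        print(sorted(os.path.relpath(m, root) for m in matches))
        # ['dir_AA', 'dir_AA/file_XX.txt', 'dir_AA/subdir_01', 'dir_AA/subdir_01/file_XY.txt']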