From 78ae06ab753784c62a5c6fc050fb80ef44001fb0 Mon Sep 17 00:00:00 2001
From: Nicolai-vKuegelgen
Date: Thu, 9 Nov 2023 17:44:05 +0100
Subject: [PATCH] Changes based on Code-Review & feedback

---
 cubi_tk/snappy/itransfer_common.py |  22 +++-
 cubi_tk/sodar/ingest_fastq.py      | 150 ++++++++++++++++++-----
 tests/test_sodar_ingest_fastq.py   |  17 ++++
 3 files changed, 127 insertions(+), 62 deletions(-)

diff --git a/cubi_tk/snappy/itransfer_common.py b/cubi_tk/snappy/itransfer_common.py
index bb26df70..81ddd6ae 100644
--- a/cubi_tk/snappy/itransfer_common.py
+++ b/cubi_tk/snappy/itransfer_common.py
@@ -190,15 +190,21 @@ def check_args(self, args):
         toml_config = load_toml_config(args)
         if not args.sodar_url:
-            if toml_config:
-                args.sodar_url = toml_config.get("global", {}).get("sodar_server_url")
-            else:
+            if not toml_config:
+                logger.error("SODAR URL not found in config files. Please specify on command line.")
+                res = 1
+            args.sodar_url = (toml_config or {}).get("global", {}).get("sodar_server_url")
+            if not args.sodar_url:
                 logger.error("SODAR URL not found in config files. Please specify on command line.")
                 res = 1
         if not args.sodar_api_token:
-            if toml_config:
-                args.sodar_api_token = toml_config.get("global", {}).get("sodar_api_token")
-            else:
+            if not toml_config:
+                logger.error(
+                    "SODAR API token not found in config files. Please specify on command line."
+                )
+                res = 1
+            args.sodar_api_token = (toml_config or {}).get("global", {}).get("sodar_api_token")
+            if not args.sodar_api_token:
                 logger.error(
                     "SODAR API token not found in config files. Please specify on command line."
                 )
                 res = 1
@@ -454,8 +460,8 @@ def get_landing_zone_by_uuid(self, lz_uuid):
 
     def get_landing_zone_uuid_by_path(self, lz_irods_path, project_uuid, assay_uuid=None):
         """
-        :param lz_path: Landing zone path.
-        :type lz_path: str
+        :param lz_irods_path: Landing zone path.
+        :type lz_irods_path: str
 
         :param project_uuid: Project UUID.
         :type project_uuid: str
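
Note on the check_args() hunks above: the same lookup-with-fallback is implemented
twice, once for the URL and once for the API token. A minimal standalone sketch of
the pattern, assuming (as in cubi_tk.common) that load_toml_config() returns either
a dict or None; the helper name resolve_setting is made up for illustration and is
not part of the patch:

    from typing import Optional

    def resolve_setting(
        cli_value: Optional[str], toml_config: Optional[dict], key: str
    ) -> Optional[str]:
        """Prefer the command-line value, then fall back to the TOML config."""
        if cli_value:
            return cli_value
        # Guard against load_toml_config() returning None when no config file exists.
        return (toml_config or {}).get("global", {}).get(key)

    # Returns None if neither source defines the value; the caller then logs
    # an error and sets res = 1, as the hunks above do.
    config = {"global": {"sodar_server_url": "https://sodar.example.org"}}
    assert resolve_setting(None, config, "sodar_server_url") == "https://sodar.example.org"
    assert resolve_setting(None, None, "sodar_api_token") is None
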
diff --git a/cubi_tk/sodar/ingest_fastq.py b/cubi_tk/sodar/ingest_fastq.py
index 34b51498..43efe0ff 100644
--- a/cubi_tk/sodar/ingest_fastq.py
+++ b/cubi_tk/sodar/ingest_fastq.py
@@ -18,7 +18,7 @@
 import tqdm
 
 from ..common import check_irods_icommands, load_toml_config, sizeof_fmt
-from ..exceptions import MissingFileException, ParameterException
+from ..exceptions import MissingFileException, ParameterException, UserCanceledException
 from ..snappy.itransfer_common import (
     SnappyItransferCommandBase,
     TransferJob,
@@ -98,7 +98,7 @@ def setup_argparse(cls, parser: argparse.ArgumentParser) -> None:
             "--remote-dir-pattern",
             default=DEFAULT_DEST_PATTERN,
             help=f"Pattern to use for constructing remote pattern, default: {DEFAULT_DEST_PATTERN}. "
-            "'collection_name' is the target irods collection and will be filled with the (-m regex modified) "
+            "'collection_name' is the target iRODS collection and will be filled with the (-m regex modified) "
            "'sample' unless --match-column is not used to fill it from the assay table. Any capture group of the "
             "src-regex ('sample', 'lane', ...) can be used along with 'date' and 'filename'.",
         )
@@ -107,7 +107,8 @@ def setup_argparse(cls, parser: argparse.ArgumentParser) -> None:
             default=None,
             help="Alternative assay column against which the {sample} from the src-regex should be matched, "
             "in order to determine collections based on the assay table (e.g. last material or collection-column). "
-            "If not set it is assumed that {sample} matches the irods collections directly.",
+            "If not set, it is assumed that {sample} matches the iRODS collections directly. If it matches multiple "
+            "columns, the last one can be used.",
         )
         parser.add_argument(
             "-m",
@@ -118,7 +119,7 @@ def setup_argparse(cls, parser: argparse.ArgumentParser) -> None:
             default=[],
             type=str,
             help="Substitutions applied to the extracted sample name, "
-            "which is used to determine iRods collections."
+            "which is used to determine iRODS collections. "
             "Can be used to change extracted string to correct collections names "
             "or to match the values of '--match-column'."
             "Use pythons regex syntax of 're.sub' package. "
@@ -133,8 +134,9 @@ def setup_argparse(cls, parser: argparse.ArgumentParser) -> None:
         parser.add_argument(
             "--collection-column",
             default=None,
-            help="Assay column from that matchs irods collection names. "
-            "If not set, the last material column will be used.",
+            help="Assay column that matches iRODS collection names. "
+            "If not set, the last material column will be used. If it matches multiple "
+            "columns, the last one can be used.",
         )
         parser.add_argument(
             "--tmp",
@@ -159,15 +161,21 @@ def check_args(self, args):
         toml_config = load_toml_config(args)
         if not args.sodar_url:
-            if toml_config:
-                args.sodar_url = toml_config.get("global", {}).get("sodar_server_url")
-            else:
+            if not toml_config:
+                logger.error("SODAR URL not found in config files. Please specify on command line.")
+                res = 1
+            args.sodar_url = (toml_config or {}).get("global", {}).get("sodar_server_url")
+            if not args.sodar_url:
                 logger.error("SODAR URL not found in config files. Please specify on command line.")
                 res = 1
         if not args.sodar_api_token:
-            if toml_config:
-                args.sodar_api_token = toml_config.get("global", {}).get("sodar_api_token")
-            else:
+            if not toml_config:
+                logger.error(
+                    "SODAR API token not found in config files. Please specify on command line."
+                )
+                res = 1
+            args.sodar_api_token = (toml_config or {}).get("global", {}).get("sodar_api_token")
+            if not args.sodar_api_token:
                 logger.error(
                     "SODAR API token not found in config files. Please specify on command line."
                 )
                 res = 1
@@ -180,7 +188,7 @@ def get_project_uuid(self, lz_uuid):
         :param lz_uuid: Landing zone UUID.
         :type lz_uuid: str
 
-        :return: Returns Sodar UUID of corresponding project.
+        :return: Returns SODAR UUID of corresponding project.
         """
         from sodar_cli.api import landingzone
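
To illustrate the -m/--sample-collection-mapping option documented above: each
MATCH REPL pair is applied with re.sub to the sample name extracted by the
src-regex. The patterns and sample names below are invented for illustration and
are not part of the patch:

    import re

    # Hypothetical MATCH/REPL pairs as they could be passed via -m (made-up values).
    substitutions = [
        (r"-N1-DNA1$", ""),      # strip a trailing library suffix
        (r"^P(\d+)_", r"P\1-"),  # rewrite a project prefix
    ]

    sample = "P1234_Donor1-N1-DNA1"
    for match, repl in substitutions:
        sample = re.sub(match, repl, sample)

    # The result is used as the iRODS collection name, or is matched against
    # the --match-column values of the assay table.
    assert sample == "P1234-Donor1"
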
@@ -202,35 +210,77 @@ def get_match_to_collection_mapping(
         """Return a dict that matches all values from a specific `ìn_column` of the assay table
         to a corresponding `out_column` (default if not defined: last Material column)."""
 
-        # This part is only needed to get `assay.file_name`
-        # -> could be removed if we can get around that
-        investigation = api.samplesheet.retrieve(
-            sodar_url=self.args.sodar_url,
-            sodar_api_token=self.args.sodar_api_token,
-            project_uuid=project_uuid,
-        )
-        assay = None
-        for study in investigation.studies.values():
-            for assay_uuid in study.assays.keys():
-                if (self.args.assay is None) and (assay is None):
-                    assay = study.assays[assay_uuid]
-                if (self.args.assay is not None) and (self.args.assay == assay_uuid):
-                    assay = study.assays[assay_uuid]
-                    logger.info("Using irods path of assay %s: %s", assay_uuid, assay.irods_path)
-                break
-
         isa_dict = api.samplesheet.export(
             sodar_url=self.args.sodar_url,
             sodar_api_token=self.args.sodar_api_token,
             project_uuid=project_uuid,
         )
+        if len(isa_dict["assays"]) > 1:
+            if not self.args.assay:
+                msg = "Multiple assays found in investigation, please specify which one to use with --assay."
+                logger.error(msg)
+                raise ParameterException(msg)
+
+            investigation = api.samplesheet.retrieve(
+                sodar_url=self.args.sodar_url,
+                sodar_api_token=self.args.sodar_api_token,
+                project_uuid=project_uuid,
+            )
+            for study in investigation.studies.values():
+                for assay_uuid in study.assays.keys():
+                    if self.args.assay == assay_uuid:
+                        assay_file_name = study.assays[assay_uuid].file_name
+                        break
+                # The first break can only exit the inner loop
+                else:
+                    continue
+                break
+            else:
+                msg = f"Assay with UUID {self.args.assay} not found in investigation."
+                logger.error(msg)
+                raise ParameterException(msg)
+        else:
+            assay_file_name = list(isa_dict["assays"].keys())[0]
 
-        assay_tsv = isa_dict["assays"][assay.file_name]["tsv"]
+        assay_tsv = isa_dict["assays"][assay_file_name]["tsv"]
         assay_header, *assay_lines = assay_tsv.rstrip("\n").split("\n")
         assay_header = assay_header.split("\t")
         assay_lines = map(lambda x: x.split("\t"), assay_lines)
 
+        def check_col_index(column_index):
+            if not column_index:
+                msg = "Could not identify any column in the assay sheet matching provided data. Please review input: --match-column={0}".format(
+                    in_column
+                )
+                logger.error(msg)
+                raise ParameterException(msg)
+            elif len(column_index) > 1:
+                column_index = max(column_index)
+                if self.args.yes:
+                    logger.info(
+                        "Multiple columns in the assay sheet match the provided column name ({}), using the last one.".format(
+                            assay_header[column_index]
+                        )
+                    )
+                elif (
+                    input(
+                        "Multiple columns in the assay sheet match the provided column name ({}), use the last one? [yN] ".format(
+                            assay_header[column_index]
+                        )
+                    )
+                    .lower()
+                    .startswith("y")
+                ):
+                    pass
+                else:
+                    msg = "Cannot continue without a defined match-column. Aborting..."
+                    logger.info(msg)
+                    raise UserCanceledException(msg)
+            else:
+                column_index = column_index[0]
+            return column_index
+
-        # Never match these assay cols
+        # Never match these (hidden) assay columns
         ignore_cols = (
             "Performer",
             "Date",
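
The nested loop in the hunk above relies on Python's for/else clause to propagate a
break from the inner loop to the outer one: the else of a for runs only when that
loop finished without a break. The control flow in isolation, with made-up
study/assay data:

    wanted = "assay-2"
    studies = {"study-1": ["assay-1"], "study-2": ["assay-2", "assay-3"]}

    for study, assays in studies.items():
        for assay_uuid in assays:
            if assay_uuid == wanted:
                found_in = study
                break
        else:
            continue  # inner loop finished without break, keep searching
        break  # inner loop broke, so break the outer loop too
    else:
        raise ValueError(f"Assay with UUID {wanted} not found.")

    assert found_in == "study-2"
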
@@ -247,35 +297,27 @@ def get_match_to_collection_mapping(
             and in_column.lower()
             in re.sub("(Parameter Value|Comment|Characteristics)", "", head).lower()
         ]
-        if not in_column_index or len(in_column_index) > 1:
-            msg = "Could not identify a valid unique column of the assay sheet matching provided data. Please review input: --match-column={0}".format(
-                in_column
-            )
-            logger.error(msg)
-            raise ParameterException(msg)
-        in_column_index = in_column_index[0]
+        in_column_index = check_col_index(in_column_index)
 
         if out_column is None:
             # Get index of last material column that is not 'Raw Data File'
-            out_column_index = max(
-                [
-                    i
-                    for i, head in enumerate(assay_header)
-                    if head not in ignore_cols
-                    and not re.match("Raw Data File|Parameter Value|Comment|Characteristics", head)
-                ]
-            )
+            materials = (
+                "Extract Name",
+                "Labeled Extract Name",
+                "Library Name",
+                "Sample Name",
+                "Source Name",
+            )
+            out_column_index = max([i for i, head in enumerate(assay_header) if head in materials])
         else:
             out_column_index = [
-                i for i, head in enumerate(assay_header) if re.match(out_column, head)
+                i
+                for i, head in enumerate(assay_header)
+                if head not in ignore_cols
+                and out_column.lower()
+                in re.sub("(Parameter Value|Comment|Characteristics)", "", head).lower()
             ]
-            if not out_column_index or len(out_column_index) > 1:
-                msg = "Could not identify a valid unique column of the assay sheet matching provided data. Please review input: --collection-column={0}".format(
-                    out_column
-                )
-                logger.error(msg)
-                raise ParameterException(msg)
-            out_column_index = out_column_index[0]
+            out_column_index = check_col_index(out_column_index)
 
         return {line[in_column_index]: line[out_column_index] for line in assay_lines}
diff --git a/tests/test_sodar_ingest_fastq.py b/tests/test_sodar_ingest_fastq.py
index cb5dd3e8..6cdbbb6b 100644
--- a/tests/test_sodar_ingest_fastq.py
+++ b/tests/test_sodar_ingest_fastq.py
@@ -5,6 +5,7 @@
 
 import json
 import os
+import re
 from unittest import mock
 
 from pyfakefs import fake_filesystem, fake_pathlib
@@ -40,6 +41,22 @@ def test_run_sodar_ingest_fastq_nothing(capsys):
     assert res.err
 
 
+def test_run_sodar_ingest_fastq_src_regex():
+    from cubi_tk.sodar.ingest_fastq import DEFAULT_SRC_REGEX
+
+    # Collection of example filenames and the expected {sample} value the regex should capture
+    test_filenames = {
+        "Sample1-N1-RNA1-RNA_seq1.fastq.gz": "Sample1-N1-RNA1-RNA_seq1",
+        "P1234_Samplename_S14_L006_R2_001.fastq.gz": "P1234_Samplename",
+        "P1234_Samplename2_R1.fastq.gz": "P1234_Samplename2",
+    }
+
+    for test_filename, expected_sample in test_filenames.items():
+        res = re.match(DEFAULT_SRC_REGEX, test_filename)
+        assert res is not None
+        assert res.groupdict()["sample"] == expected_sample
+
+
 def test_run_sodar_ingest_fastq_smoke_test(mocker, requests_mock):
     # --- setup arguments
     irods_path = "/irods/dest"
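
For context on what get_match_to_collection_mapping() ultimately returns: a dict
from each --match-column value to the corresponding collection-column value. A toy
example with an invented two-row assay table (the real function pulls the TSV from
the SODAR samplesheet export):

    assay_tsv = (
        "Sample Name\tExtract Name\tRaw Data File\n"
        "Donor1\tDonor1-RNA1\tdonor1.fastq.gz\n"
        "Donor2\tDonor2-RNA1\tdonor2.fastq.gz\n"
    )
    header, *lines = assay_tsv.rstrip("\n").split("\n")
    header = header.split("\t")
    lines = [line.split("\t") for line in lines]

    in_column_index = header.index("Sample Name")    # column given via --match-column
    out_column_index = header.index("Extract Name")  # last material column

    mapping = {line[in_column_index]: line[out_column_index] for line in lines}
    assert mapping == {"Donor1": "Donor1-RNA1", "Donor2": "Donor2-RNA1"}
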