
Commit

Merge branch 'main' into ingest
sellth committed Oct 9, 2023
2 parents 96a93e7 + e9c49b6 · commit 25baf0b
Showing 86 changed files with 644 additions and 2,080 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/main.yml
@@ -66,7 +66,7 @@ jobs:
           lfs: true
 
       - name: Install mamba
-        run: conda install -y mamba==0.27.0
+        run: conda install -y mamba
 
       - name: Prepare environment.yaml file
         run: >
2 changes: 1 addition & 1 deletion README.md
@@ -12,7 +12,7 @@ Tooling for connecting GitLab, pipelines, and SODAR at CUBI.
 Prerequisites when using conda:
 
 ```bash
-$ conda create -n cubi-tk python=3.7
+$ conda create -n cubi-tk python=3.10
 $ conda activate cubi-tk
 ```

2 changes: 1 addition & 1 deletion cubi_tk/archive/readme.py
@@ -12,11 +12,11 @@
 
 import attr
 from cookiecutter.main import cookiecutter
+from cubi_isa_templates import IsaTabTemplate
 from logzero import logger
 
 from . import common
 from ..common import execute_shell_commands
-from ..isa_tpl import IsaTabTemplate
 
 _TEMPLATE_DIR = os.path.join(os.path.dirname(__file__), "templates")

4 changes: 3 additions & 1 deletion cubi_tk/common.py
@@ -158,9 +158,11 @@ def yield_files_recursively(path, print_=False, file=sys.stderr):
     """Recursively yield below path to ``file`` in sorted order, print optionally"""
     while len(path) > 1 and path[-1] == "/":  # trim trailing slashes
         path = path[:-1]  # pragma: no cover
-    paths = glob.glob(os.path.join(path, "**"))
+    paths = glob.glob(os.path.join(path, "**"), recursive=True)
     for p in sorted(paths):
         p = p[len(path) + 1 :]
+        if not p:
+            continue
         if print_:
             print(p, file=file)  # pragma: no cover
         yield p
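
A note on the two additions above: without `recursive=True`, the `**` pattern expands like `*` and only matches a single directory level, so the function did not actually recurse. With `recursive=True`, `**` also matches the starting directory itself, which strips down to an empty string once the path prefix is removed, hence the new `if not p` guard. A minimal standalone sketch of the fixed traversal (the directory path is hypothetical):

```python
import glob
import os

path = "/tmp/example_tree"  # hypothetical directory, for illustration only

# "**" with recursive=True matches the directory itself plus everything below it;
# without recursive=True it would expand only one level, like "*".
paths = glob.glob(os.path.join(path, "**"), recursive=True)
for p in sorted(paths):
    p = p[len(path) + 1 :]
    if not p:  # the match for the root directory strips to "", so skip it
        continue
    print(p)
```
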
33 changes: 26 additions & 7 deletions cubi_tk/irods/check.py
@@ -9,7 +9,10 @@
 import typing
 
 from irods.collection import iRODSCollection
+from irods.column import Like
 from irods.data_object import iRODSDataObject
+from irods.models import Collection as CollectionModel
+from irods.models import DataObject as DataObjectModel
 from irods.session import iRODSSession
 from logzero import logger
 import tqdm
@@ -103,16 +106,32 @@ def get_irods_error(cls, e: Exception):
         es = str(e)
         return es if es != "None" else e.__class__.__name__
 
-    def get_data_objs(self, root_coll: iRODSCollection):
+    def get_data_objs(
+        self, root_coll: iRODSCollection
+    ) -> typing.Dict[
+        str, typing.Union[typing.Dict[str, iRODSDataObject], typing.List[iRODSDataObject]]
+    ]:
         """Get data objects recursively under the given iRODS path."""
         data_objs = dict(files=[], checksums={})
         ignore_schemes = [k.lower() for k in HASH_SCHEMES if k != self.args.hash_scheme.upper()]
-        for res in root_coll.walk():
-            for obj in res[2]:
-                if obj.path.endswith("." + self.args.hash_scheme.lower()):
-                    data_objs["checksums"][obj.path] = obj
-                elif obj.path.split(".")[-1] not in ignore_schemes:
-                    data_objs["files"].append(obj)
+        irods_sess = root_coll.manager.sess
+
+        query = irods_sess.query(DataObjectModel, CollectionModel).filter(
+            Like(CollectionModel.name, f"{root_coll.path}%")
+        )
+
+        for res in query:
+            # If the 'res' dict is not split into Collection & Object, the resulting iRODSDataObject is not fully functional, likely because a name/path/... attribute is overwritten somewhere
+            coll_res = {k: v for k, v in res.items() if k.icat_id >= 500}
+            obj_res = {k: v for k, v in res.items() if k.icat_id < 500}
+            coll = iRODSCollection(root_coll.manager, coll_res)
+            obj = iRODSDataObject(irods_sess.data_objects, parent=coll, results=[obj_res])
+
+            if obj.path.endswith("." + self.args.hash_scheme.lower()):
+                data_objs["checksums"][obj.path] = obj
+            elif obj.path.split(".")[-1] not in ignore_schemes:
+                data_objs["files"].append(obj)
+
         return data_objs
 
     def check_args(self, _args):
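
The rewrite above replaces the per-collection `root_coll.walk()` traversal with a single GenQuery over the `DataObject` and `Collection` models, so all objects below the root collection are fetched in one request instead of one query per subcollection. A rough sketch of that query pattern with python-irodsclient, outside cubi-tk (host, credentials, and paths are placeholders, not values from this repository):

```python
from irods.column import Like
from irods.models import Collection as CollectionModel
from irods.models import DataObject as DataObjectModel
from irods.session import iRODSSession

# Placeholder connection settings; replace with a real zone and account.
with iRODSSession(
    host="irods.example.org", port=1247, user="alice", password="secret", zone="exampleZone"
) as session:
    root_path = "/exampleZone/home/alice/project"
    # One query across both models; each row carries Collection and DataObject columns.
    query = session.query(DataObjectModel, CollectionModel).filter(
        Like(CollectionModel.name, f"{root_path}%")
    )
    for row in query:
        # Collection name plus object name gives the full logical path.
        print(f"{row[CollectionModel.name]}/{row[DataObjectModel.name]}")
```

Splitting each row by `icat_id` (collection columns at 500 and above, data-object columns below) is what lets `get_data_objs` rebuild usable `iRODSCollection` and `iRODSDataObject` instances from the raw rows.
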
169 changes: 37 additions & 132 deletions cubi_tk/isa_tpl/__init__.py
@@ -18,152 +18,37 @@
Available Templates
-------------------
The `Cookiecutter`_ directories are located in this module's directory. Currently available templates are:
- ``isatab-generic``
- ``isatab-germline``
- ``isatab-microarray``
- ``isatab-ms_meta_biocrates``
- ``isatab-single_cell_rnaseq``
- ``isatab-bulk_rnaseq``
- ``isatab-tumor_normal_dna``
- ``isatab-tumor_normal_triplets``
- ``isatab-stem_cell_core_bulk``
- ``isatab-stem_cell_core_sc``
These have been moved to a separate repository: see `cubi-isa-templates`_.
Adding Templates
----------------
Adding templates consists of the following steps:
1. Add a new template directory below ``cubi_tk/isa_tpl``.
2. Register it appending a ``IsaTabTemplate`` object to ``_TEMPLATES`` in ``cubi_tk.isa_tpl``.
3. Add it to the list above in the docstring.
The easiest way to start out is to copy an existing cookiecutter template and registration.
See `cubi-isa-templates`_.
More Information
----------------
Also see ``cubi-tk isa-tpl`` CLI documentation and ``cubi-tk isa-tab --help`` for more information.
.. _Cookiecutter: https://cookiecutter.readthedocs.io/
.. _cubi-isa-templates: https://github.com/bihealth/cubi-isa-templates
"""

import argparse
from functools import partial
import json
import os
from pathlib import Path
import typing
import shutil
import warnings

import attr
import altamisa
from cookiecutter.main import cookiecutter
from cubi_isa_templates import TEMPLATES
from logzero import logger
from toolz import curry

from ..common import run_nocmd, yield_files_recursively


@attr.s(frozen=True, auto_attribs=True)
class IsaTabTemplate:
"""Information regarding an ISA-tab template."""

#: Name of the ISA-tab template.
name: str

#: Path to template directory.
path: str

#: Configuration loaded from ``cookiecutter.json``.
configuration: typing.Dict[str, typing.Any]

#: Optional description string.
description: typing.Optional[str] = None


#: Base directory to this file.
_BASE_DIR = os.path.dirname(__file__)


def load_variables(template_name, extra=None):
"""Load variables given the template name."""
extra = extra or {}
config_path = os.path.join(_BASE_DIR, template_name, "cookiecutter.json")
with open(config_path, "rt") as inputf:
result = json.load(inputf)
result.update(extra)
return result


#: Known ISA-tab templates (internal, mapping generated below).
_TEMPLATES = (
IsaTabTemplate(
name="single_cell_rnaseq",
path=os.path.join(_BASE_DIR, "isatab-single_cell_rnaseq"),
description="single cell RNA sequencing ISA-tab template",
configuration=load_variables("isatab-single_cell_rnaseq"),
),
IsaTabTemplate(
name="bulk_rnaseq",
path=os.path.join(_BASE_DIR, "isatab-bulk_rnaseq"),
description="bulk RNA sequencing ISA-tab template",
configuration=load_variables("isatab-generic"),
),
IsaTabTemplate(
name="tumor_normal_dna",
path=os.path.join(_BASE_DIR, "isatab-tumor_normal_dna"),
description="Tumor-Normal DNA sequencing ISA-tab template",
configuration=load_variables("isatab-tumor_normal_dna", {"is_triplet": False}),
),
IsaTabTemplate(
name="tumor_normal_triplets",
path=os.path.join(_BASE_DIR, "isatab-tumor_normal_triplets"),
description="Tumor-Normal DNA+RNA sequencing ISA-tab template",
configuration=load_variables("isatab-tumor_normal_triplets", {"is_triplet": True}),
),
IsaTabTemplate(
name="germline",
path=os.path.join(_BASE_DIR, "isatab-germline"),
description="germline DNA sequencing ISA-tab template",
configuration=load_variables("isatab-germline"),
),
IsaTabTemplate(
name="generic",
path=os.path.join(_BASE_DIR, "isatab-generic"),
description="generic RNA sequencing ISA-tab template",
configuration=load_variables("isatab-generic"),
),
IsaTabTemplate(
name="microarray",
path=os.path.join(_BASE_DIR, "isatab-microarray"),
description="microarray ISA-tab template",
configuration=load_variables("isatab-microarray"),
),
IsaTabTemplate(
name="ms_meta_biocrates",
path=os.path.join(_BASE_DIR, "isatab-ms_meta_biocrates"),
description="MS Metabolomics Biocrates kit ISA-tab template",
configuration=load_variables("isatab-ms_meta_biocrates"),
),
IsaTabTemplate(
name="stem_cell_core_bulk",
path=os.path.join(_BASE_DIR, "isatab-stem_cell_core_bulk"),
description="Bulk RNA sequencing ISA-tab template from hiPSC for stem cell core projects",
configuration=load_variables("isatab-stem_cell_core_bulk"),
),
IsaTabTemplate(
name="stem_cell_core_sc",
path=os.path.join(_BASE_DIR, "isatab-stem_cell_core_sc"),
description="Single cell RNA sequencing ISA-tab template from hiPSC for stem cell core projects",
configuration=load_variables("isatab-stem_cell_core_sc"),
),
)

#: Known ISA-tab templates.
TEMPLATES = {tpl.name: tpl for tpl in _TEMPLATES}


@curry
def run_cookiecutter(tpl, args, _parser=None, _subparser=None, no_input=False):
"""Run cookiecutter, ``tpl`` will be bound with ``toolz.curry``."""
@@ -172,16 +57,13 @@ def run_cookiecutter(tpl, args, _parser=None, _subparser=None, no_input=False):
         if getattr(args, "var_%s" % name, None) is not None:
             extra_context[name] = getattr(args, "var_%s" % name)
 
-    logger.info(tpl.configuration)
-    logger.info(args)
-
-    output_dir = os.path.realpath(args.output_dir)
-    output_base = os.path.dirname(output_dir)
-    extra_context["__output_dir"] = os.path.basename(output_dir)
+    if args.verbose:
+        logger.info(tpl.configuration)
+        logger.info(args)
 
-    # FIXME: better solution? (added because args.var_is_triplet is None)
-    if "is_triplet" in tpl.configuration:
-        extra_context["is_triplet"] = tpl.configuration["is_triplet"]
+    output_dir = Path(args.output_dir).resolve()
+    output_base = output_dir.parent
+    extra_context["__output_dir"] = Path(output_dir).name
 
     logger.info("Start running cookiecutter")
     logger.info(" template path: %s", tpl.path)
@@ -255,4 +137,27 @@ def run(args, parser, subparser): # pragma: nocover
     if not args.tpl:  # pragma: nocover
         return run_nocmd(args, parser, subparser)
     else:
-        return args.isa_tpl_cmd(args, parser, subparser)
+        status = args.isa_tpl_cmd(args, parser, subparser)
+
+        # output validation
+        if not status:
+            logger.info("Running AltamISA validator:")
+            i_files = Path(args.output_dir).rglob("i_*")
+            args.show_duplicate_warnings = False
+            warnings.filterwarnings(
+                "error", category=altamisa.exceptions.CriticalIsaValidationWarning
+            )
+            for i in i_files:
+                try:
+                    with i.open() as i_file:
+                        args.input_investigation_file = i_file
+                        altamisa.apps.isatab_validate.run(args)
+                except (
+                    altamisa.exceptions.ParseIsatabException,
+                    altamisa.exceptions.CriticalIsaValidationWarning,
+                ):
+                    shutil.rmtree(args.output_dir)
+                    raise
+            return 0
+        else:
+            return status
17 changes: 0 additions & 17 deletions cubi_tk/isa_tpl/isatab-bulk_rnaseq/cookiecutter.json

This file was deleted.


