taxprofiler · Midnighter · Sep 10, 2023 · Sep 2, 2023 · Sep 2, 2023 · Sep 2, 2023
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+### Added
+
+-   Added an option `--ignore-errors` to the `taxpasta merge` command. This allows
+    ignoring individual profiles that cause errors, like empty profiles (#136).
+
+### Changed
+
+-   Created a special error message for empty profiles, making the cause of the error
+    much clearer (#136).
+-   Internal restructuring of input validation and transformation services (#136).
+
 ## [0.5.0] - (2023-08-24)
 
 ### Added

diff --git a/src/taxpasta/application/__init__.py b/src/taxpasta/application/__init__.py
@@ -16,5 +16,6 @@
 # limitations under the License.
 
 
-from .sample_merging_application import SampleMergingApplication
+from .add_tax_info_command import AddTaxInfoCommand
+from .sample_handling_application import SampleHandlingApplication
 from .consensus_application import ConsensusApplication
diff --git a/src/taxpasta/application/add_tax_info_command.py b/src/taxpasta/application/add_tax_info_command.py
@@ -0,0 +1,105 @@
+# Copyright (c) 2023 Moritz E. Beber
+# Copyright (c) 2023 Maxime Borry
+# Copyright (c) 2023 James A. Fellows Yates
+# Copyright (c) 2023 Sofia Stamouli.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+"""Provide a command object for adding taxonomy information."""
+
+
+from dataclasses import dataclass
+from typing import Optional
+
+from taxpasta.domain.model import Sample
+from taxpasta.domain.service import TaxonomyService
+
+
+@dataclass(frozen=True)
+class AddTaxInfoCommand:
+    """Define a command object for adding taxonomy information."""
+
+    taxonomy_service: Optional[TaxonomyService] = None
+    summarise_at: Optional[str] = None
+    add_name: bool = False
+    add_rank: bool = False
+    add_lineage: bool = False
+    add_id_lineage: bool = False
+    add_rank_lineage: bool = False
+
+    def execute(self, sample: Sample) -> Sample:
+        """Execute the command to add taxonomy information."""
+        if self.taxonomy_service is None:
+            return sample
+        # The order of the following conditions is chosen specifically to yield a
+        # pleasant final output format.
+        result = sample
+        if self.add_rank_lineage:
+            result = Sample(
+                name=result.name,
+                profile=self.taxonomy_service.add_rank_lineage(result.profile),
+            )
+        if self.add_id_lineage:
+            result = Sample(
+                name=result.name,
+                profile=self.taxonomy_service.add_identifier_lineage(result.profile),
+            )
+        if self.add_lineage:
+            result = Sample(
+                name=result.name,
+                profile=self.taxonomy_service.add_name_lineage(result.profile),
+            )
+        if self.add_rank:
+            result = Sample(
+                name=result.name, profile=self.taxonomy_service.add_rank(result.profile)
+            )
+        if self.add_name:
+            result = Sample(
+                name=result.name, profile=self.taxonomy_service.add_name(result.profile)
+            )
+        return result
+
+    def __post_init__(self) -> None:
+        """Perform post initialization validation."""
+        no_taxonomy = self.taxonomy_service is None
+        if self.summarise_at is not None and no_taxonomy:
+            raise ValueError(
+                "The summarising feature '--summarise-at' requires a taxonomy. Please "
+                "provide one using the option '--taxonomy'."
+            )
+        if self.add_name and no_taxonomy:
+            raise ValueError(
+                "The '--add-name' option requires a taxonomy. Please "
+                "provide one using the option '--taxonomy'."
+            )
+        if self.add_rank and no_taxonomy:
+            raise ValueError(
+                "The '--add-rank' option requires a taxonomy. Please "
+                "provide one using the option '--taxonomy'."
+            )
+        if self.add_lineage and no_taxonomy:
+            raise ValueError(
+                "The '--add-lineage' option requires a taxonomy. Please "
+                "provide one using the option '--taxonomy'."
+            )
+        if self.add_id_lineage and no_taxonomy:
+            raise ValueError(
+                "The '--add-id-lineage' option requires a taxonomy. Please "
+                "provide one using the option '--taxonomy'."
+            )
+        if self.add_rank_lineage and no_taxonomy:
+            raise ValueError(
+                "The '--add-rank-lineage' option requires a taxonomy. Please "
+                "provide one using the option '--taxonomy'."
+            )
diff --git a/...application/sample_merging_application.py → ...pplication/sample_handling_application.py b/...application/sample_merging_application.py → ...pplication/sample_handling_application.py
@@ -16,23 +16,20 @@
 # limitations under the License.
 
 
-"""Provide a sample merging application."""
+"""Provide a sample handling application."""
 
 
 from __future__ import annotations
 
 import logging
 from pathlib import Path
-from typing import Iterable, List, Optional, Tuple, Type
+from typing import Iterable, Optional, Type
 
 from pandera.errors import SchemaErrors
 from pandera.typing import DataFrame
 
 from taxpasta.application.error import StandardisationError
-from taxpasta.application.service.profile_reader import ProfileReader
-from taxpasta.application.service.profile_standardisation_service import (
-    ProfileStandardisationService,
-)
+from taxpasta.application.service import ProfileReader, ProfileStandardisationService
 from taxpasta.domain.model import (
     Sample,
     StandardProfile,
@@ -45,8 +42,8 @@
 logger = logging.getLogger(__name__)
 
 
-class SampleMergingApplication:
-    """Define a sample merging application."""
+class SampleHandlingApplication:
+    """Define the sample handling application."""
 
     def __init__(
         self,
@@ -57,7 +54,7 @@ def __init__(
         **kwargs: dict,
     ):
         """
-        Initialize the application for a particular taxonomic profiler.
+        Initialize the sample handling application.
 
         Args:
             profile_reader: A profile reader for a specific taxonomic profile format.
@@ -72,37 +69,66 @@ def __init__(
         super().__init__(**kwargs)
         self.reader = profile_reader
         self.standardiser = profile_standardiser
-        self.taxonomy = taxonomy_service
+        self.taxonomy_service = taxonomy_service
+
+    def etl_sample(self, name: str, profile: Path) -> Sample:
+        """
+        Extract, transform, and load a profile into a sample.
+
+        Args:
+            name: A name for the sample.
+            profile: The path to a taxonomic profile.
+
+        Returns:
+            A sample.
+
+        Raises:
+            StandardisationError: If the given profile does not match the validation
+                schema.
 
-    def run(
+        """
+        try:
+            result = self.standardiser.transform(self.reader.read(profile))
+        except SchemaErrors as errors:
+            if errors.data.empty:
+                raise StandardisationError(
+                    sample=name, profile=profile, message="Profile is empty."
+                ) from errors
+            else:
+                raise StandardisationError(
+                    sample=name, profile=profile, message=str(errors.failure_cases)
+                ) from errors
+        except ValueError as error:
+            raise StandardisationError(
+                sample=name, profile=profile, message=str(error)
+            ) from error
+
+        return Sample(name=name, profile=result)
+
+    def summarise_sample(self, sample: Sample, rank: str) -> Sample:
+        """Summarise a sample at a higher taxonomic rank."""
+        assert self.taxonomy_service is not None  # nosec assert_used
+        return Sample(
+            name=sample.name,
+            profile=self.taxonomy_service.summarise_at(sample.profile, rank),
+        )
+
+    def merge_samples(
         self,
-        profiles: Iterable[Tuple[str, Path]],
+        samples: Iterable[Sample],
         wide_format: bool,
-        summarise_at: Optional[str] = None,
-        ignore_error: bool = False,
     ) -> DataFrame[WideObservationTable] | DataFrame[TidyObservationTable]:
         """
-        Extract and transform profiles into samples, then merge them.
+        Merge two or more samples into a single table.
 
         Args:
-            profiles: Pairs of name and profile path.
+            samples: Two or more samples.
             wide_format: Whether to create wide or (tidy) long format output.
-            summarise_at: The taxonomic rank at which to summarise abundance if any.
-            ignore_error: Whether to ignore profiles that contain errors.
 
         Returns:
             A single table containing all samples in the desired format.
 
-        Raises:
-            StandardisationError: If any of the given profiles does not match the
-                validation schema.  # noqa: DAR402
-
         """
-        samples = self._etl_samples(profiles, ignore_error)
-
-        if summarise_at is not None:
-            samples = self._summarise_samples(samples, summarise_at, ignore_error)
-
         if wide_format:
             wide_table = SampleMergingService.merge_wide(samples)
             # If any profile did not have all the same taxonomy IDs as the combined
@@ -120,56 +146,3 @@ def run(
             return wide_table
         else:
             return SampleMergingService.merge_long(samples)
-
-    def _etl_samples(
-        self, profiles: Iterable[Tuple[str, Path]], ignore_error: bool
-    ) -> List[Sample]:
-        """Extract, transform, and load profiles into samples."""
-        result = []
-        for name, profile in profiles:
-            try:
-                result.append(
-                    Sample(
-                        name=name,
-                        profile=self.standardiser.transform(self.reader.read(profile)),
-                    )
-                )
-            except SchemaErrors as errors:
-                if ignore_error:
-                    logger.error("Sample %s: %s", name, str(errors))
-                    continue
-                else:
-                    raise StandardisationError(
-                        sample=name, profile=profile, message=str(errors.failure_cases)
-                    ) from errors
-            except ValueError as error:
-                if ignore_error:
-                    logger.error("Sample %s: %s", name, str(error))
-                    continue
-                else:
-                    raise StandardisationError(
-                        sample=name, profile=profile, message=str(error)
-                    ) from error
-        return result
-
-    def _summarise_samples(
-        self, samples: List[Sample], rank: str, ignore_error: bool
-    ) -> List[Sample]:
-        """Summarise samples at a given taxonomic rank."""
-        assert self.taxonomy is not None  # nosec assert_used
-        result = []
-        for sample in samples:
-            try:
-                result.append(
-                    Sample(
-                        name=sample.name,
-                        profile=self.taxonomy.summarise_at(sample.profile, rank),
-                    )
-                )
-            except ValueError as error:
-                if ignore_error:
-                    logger.error("Sample %s: %s", sample.name, str(error))
-                    continue
-                else:
-                    raise
-        return result
diff --git a/src/taxpasta/domain/model/tidy_observation_table.py b/src/taxpasta/domain/model/tidy_observation_table.py
@@ -19,6 +19,8 @@
 """Provide a description of a tidy observation table."""
 
 
+from typing import Optional
+
 import numpy as np
 import pandas as pd
 import pandera as pa
@@ -29,6 +31,11 @@ class TidyObservationTable(pa.DataFrameModel):
     """Define the tidy observation table."""
 
     taxonomy_id: Series[pd.CategoricalDtype] = pa.Field()
+    name: Optional[Series[pd.CategoricalDtype]] = pa.Field()
+    rank: Optional[Series[pd.CategoricalDtype]] = pa.Field()
+    lineage: Optional[Series[pd.CategoricalDtype]] = pa.Field()
+    id_lineage: Optional[Series[pd.CategoricalDtype]] = pa.Field()
+    rank_lineage: Optional[Series[pd.CategoricalDtype]] = pa.Field()
     count: Series[np.int64] = pa.Field(ge=0)
     sample: Series[pd.CategoricalDtype] = pa.Field()
 

diff --git a/src/taxpasta/domain/model/wide_observation_table.py b/src/taxpasta/domain/model/wide_observation_table.py
@@ -19,6 +19,8 @@
 """Provide a description of an observation matrix."""
 
 
+from typing import Optional
+
 import numpy as np
 import pandas as pd
 import pandera as pa
@@ -29,9 +31,16 @@ class WideObservationTable(pa.DataFrameModel):
     """Define the observation matrix."""
 
     taxonomy_id: Series[pd.CategoricalDtype] = pa.Field()
-    # This field uses a regex to match all columns that are not `taxonomy_id`.
+    name: Optional[Series[pd.CategoricalDtype]] = pa.Field()
+    rank: Optional[Series[pd.CategoricalDtype]] = pa.Field()
+    lineage: Optional[Series[pd.CategoricalDtype]] = pa.Field()
+    id_lineage: Optional[Series[pd.CategoricalDtype]] = pa.Field()
+    rank_lineage: Optional[Series[pd.CategoricalDtype]] = pa.Field()
+    # This field uses a regex to match all columns that are not one of the above.
     any_samples: Series[np.int64] = pa.Field(
-        ge=0, alias="^(?!taxonomy_id$).*", regex=True
+        ge=0,
+        alias="^(?!(taxonomy_id|name|rank|lineage|id_lineage|rank_lineage)$).*",
+        regex=True,
     )
 
     class Config:

diff --git a/src/taxpasta/infrastructure/application/__init__.py b/src/taxpasta/infrastructure/application/__init__.py
@@ -72,7 +72,6 @@
     MotusProfileStandardisationService,
 )
 
-from .sample_etl_application import SampleETLApplication
 from .standard_profile_file_format import StandardProfileFileFormat
 from .table_reader_file_format import TableReaderFileFormat
 from .tidy_observation_table_file_format import TidyObservationTableFileFormat