Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow ignoring profile errors when merging, message on empty profile #136

Merged
merged 8 commits into from
Sep 10, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

### Added

- Added an option `--ignore-errors` to the `taxpasta merge` command. This allows
ignoring individual profiles that cause errors, like empty profiles (#136).

### Changed

- Created a special error message for empty profiles, making the cause of the error
much clearer (#136).
- Internal restructuring of input validation and transformation services (#136).

## [0.5.0] - (2023-08-24)

### Added
Expand Down
3 changes: 2 additions & 1 deletion src/taxpasta/application/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,5 +16,6 @@
# limitations under the License.


from .sample_merging_application import SampleMergingApplication
from .add_tax_info_command import AddTaxInfoCommand
from .sample_handling_application import SampleHandlingApplication
from .consensus_application import ConsensusApplication
105 changes: 105 additions & 0 deletions src/taxpasta/application/add_tax_info_command.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
# Copyright (c) 2023 Moritz E. Beber
# Copyright (c) 2023 Maxime Borry
# Copyright (c) 2023 James A. Fellows Yates
# Copyright (c) 2023 Sofia Stamouli.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


"""Provide a command object for adding taxonomy information."""


from dataclasses import dataclass
from typing import Optional

from taxpasta.domain.model import Sample
from taxpasta.domain.service import TaxonomyService


@dataclass(frozen=True)
class AddTaxInfoCommand:
    """
    Define a command object for adding taxonomy information.

    The command bundles an optional taxonomy service with flags that select which
    pieces of taxonomy information get added to a sample's profile. Upon creation,
    the command validates that a taxonomy service is present whenever any option
    that depends on it has been requested.

    """

    # The service that provides all taxonomy information; without it, `execute`
    # returns samples unchanged.
    taxonomy_service: Optional[TaxonomyService] = None
    # Only validated by this command; `execute` does not read it.
    summarise_at: Optional[str] = None
    # Each flag enables the correspondingly named step in `execute`.
    add_name: bool = False
    add_rank: bool = False
    add_lineage: bool = False
    add_id_lineage: bool = False
    add_rank_lineage: bool = False

    def execute(self, sample: Sample) -> Sample:
        """
        Execute the command to add taxonomy information.

        Args:
            sample: The sample whose profile should be extended.

        Returns:
            The given sample itself if no taxonomy service is configured; otherwise
            a sample of the same name whose profile was passed through the
            taxonomy service once for every enabled option.

        """
        service = self.taxonomy_service
        if service is None:
            return sample
        # The order of the following conditions is chosen specifically to yield a
        # pleasant final output format.
        result = sample
        if self.add_rank_lineage:
            result = Sample(
                name=result.name,
                profile=service.add_rank_lineage(result.profile),
            )
        if self.add_id_lineage:
            result = Sample(
                name=result.name,
                profile=service.add_identifier_lineage(result.profile),
            )
        if self.add_lineage:
            result = Sample(
                name=result.name,
                profile=service.add_name_lineage(result.profile),
            )
        if self.add_rank:
            result = Sample(
                name=result.name, profile=service.add_rank(result.profile)
            )
        if self.add_name:
            result = Sample(
                name=result.name, profile=service.add_name(result.profile)
            )
        return result

    def __post_init__(self) -> None:
        """
        Perform post initialization validation.

        Raises:
            ValueError: If no taxonomy service was given but summarising or any of
                the `add_*` options was requested. When several options are
                affected, the first one in the order checked below is reported.

        """
        if self.taxonomy_service is not None:
            # With a taxonomy available every option combination is valid.
            return
        if self.summarise_at is not None:
            raise ValueError(
                "The summarising feature '--summarise-at' requires a taxonomy. Please "
                "provide one using the option '--taxonomy'."
            )
        # Pairs of (flag value, command line option named in the error message);
        # the order determines which option is reported first.
        dependent_options = (
            (self.add_name, "--add-name"),
            (self.add_rank, "--add-rank"),
            (self.add_lineage, "--add-lineage"),
            (self.add_id_lineage, "--add-id-lineage"),
            (self.add_rank_lineage, "--add-rank-lineage"),
        )
        for is_requested, option in dependent_options:
            if is_requested:
                raise ValueError(
                    f"The '{option}' option requires a taxonomy. Please "
                    "provide one using the option '--taxonomy'."
                )
Original file line number Diff line number Diff line change
Expand Up @@ -16,23 +16,20 @@
# limitations under the License.


"""Provide a sample merging application."""
"""Provide a sample handling application."""


from __future__ import annotations

import logging
from pathlib import Path
from typing import Iterable, List, Optional, Tuple, Type
from typing import Iterable, Optional, Type

from pandera.errors import SchemaErrors
from pandera.typing import DataFrame

from taxpasta.application.error import StandardisationError
from taxpasta.application.service.profile_reader import ProfileReader
from taxpasta.application.service.profile_standardisation_service import (
ProfileStandardisationService,
)
from taxpasta.application.service import ProfileReader, ProfileStandardisationService
from taxpasta.domain.model import (
Sample,
StandardProfile,
Expand All @@ -45,8 +42,8 @@
logger = logging.getLogger(__name__)


class SampleMergingApplication:
"""Define a sample merging application."""
class SampleHandlingApplication:
"""Define the sample handling application."""

def __init__(
self,
Expand All @@ -57,7 +54,7 @@ def __init__(
**kwargs: dict,
):
"""
Initialize the application for a particular taxonomic profiler.
Initialize the sample handling application.

Args:
profile_reader: A profile reader for a specific taxonomic profile format.
Expand All @@ -72,37 +69,66 @@ def __init__(
super().__init__(**kwargs)
self.reader = profile_reader
self.standardiser = profile_standardiser
self.taxonomy = taxonomy_service
self.taxonomy_service = taxonomy_service

def etl_sample(self, name: str, profile: Path) -> Sample:
"""
Extract, transform, and load a profile into a sample.

Args:
name: A name for the sample.
profile: The path to a taxonomic profile.

Returns:
A sample.

Raises:
StandardisationError: If the given profile does not match the validation
schema.

def run(
"""
try:
result = self.standardiser.transform(self.reader.read(profile))
except SchemaErrors as errors:
if errors.data.empty:
raise StandardisationError(
sample=name, profile=profile, message="Profile is empty."
) from errors
else:
raise StandardisationError(
sample=name, profile=profile, message=str(errors.failure_cases)
) from errors
except ValueError as error:
raise StandardisationError(
sample=name, profile=profile, message=str(error)
) from error

return Sample(name=name, profile=result)

def summarise_sample(self, sample: Sample, rank: str) -> Sample:
    """
    Summarise a sample at a higher taxonomic rank.

    Args:
        sample: The sample whose profile should be summarised.
        rank: The taxonomic rank that is passed on to the taxonomy service.

    Returns:
        A sample of the same name carrying the summarised profile.

    """
    service = self.taxonomy_service
    # Summarising is only possible when a taxonomy service was configured.
    assert service is not None  # nosec assert_used
    summarised_profile = service.summarise_at(sample.profile, rank)
    return Sample(name=sample.name, profile=summarised_profile)

def merge_samples(
self,
profiles: Iterable[Tuple[str, Path]],
samples: Iterable[Sample],
wide_format: bool,
summarise_at: Optional[str] = None,
ignore_error: bool = False,
) -> DataFrame[WideObservationTable] | DataFrame[TidyObservationTable]:
"""
Extract and transform profiles into samples, then merge them.
Merge two or more samples into a single table.

Args:
profiles: Pairs of name and profile path.
samples: Two or more samples.
wide_format: Whether to create wide or (tidy) long format output.
summarise_at: The taxonomic rank at which to summarise abundance if any.
ignore_error: Whether to ignore profiles that contain errors.

Returns:
A single table containing all samples in the desired format.

Raises:
StandardisationError: If any of the given profiles does not match the
validation schema. # noqa: DAR402

"""
samples = self._etl_samples(profiles, ignore_error)

if summarise_at is not None:
samples = self._summarise_samples(samples, summarise_at, ignore_error)

if wide_format:
wide_table = SampleMergingService.merge_wide(samples)
# If any profile did not have all the same taxonomy IDs as the combined
Expand All @@ -120,56 +146,3 @@ def run(
return wide_table
else:
return SampleMergingService.merge_long(samples)

def _etl_samples(
    self, profiles: Iterable[Tuple[str, Path]], ignore_error: bool
) -> List[Sample]:
    """
    Extract, transform, and load profiles into samples.

    Each profile is read and standardised in turn. A profile that fails is either
    logged and left out of the result (when `ignore_error` is set) or reported by
    raising a `StandardisationError` that is chained to the original exception.

    """
    samples: List[Sample] = []
    for name, profile in profiles:
        try:
            standardised = self.standardiser.transform(self.reader.read(profile))
            loaded = Sample(name=name, profile=standardised)
        except SchemaErrors as errors:
            if not ignore_error:
                raise StandardisationError(
                    sample=name, profile=profile, message=str(errors.failure_cases)
                ) from errors
            logger.error("Sample %s: %s", name, str(errors))
        except ValueError as error:
            if not ignore_error:
                raise StandardisationError(
                    sample=name, profile=profile, message=str(error)
                ) from error
            logger.error("Sample %s: %s", name, str(error))
        else:
            # Only profiles that passed reading and standardisation are kept.
            samples.append(loaded)
    return samples

def _summarise_samples(
    self, samples: List[Sample], rank: str, ignore_error: bool
) -> List[Sample]:
    """
    Summarise samples at a given taxonomic rank.

    A sample whose summarisation raises a `ValueError` is either logged and left
    out of the result (when `ignore_error` is set) or the error is re-raised as is.

    """
    assert self.taxonomy is not None  # nosec assert_used
    summarised: List[Sample] = []
    for sample in samples:
        try:
            profile = self.taxonomy.summarise_at(sample.profile, rank)
            summary = Sample(name=sample.name, profile=profile)
        except ValueError as error:
            if not ignore_error:
                raise
            logger.error("Sample %s: %s", sample.name, str(error))
        else:
            # Only successfully summarised samples are kept.
            summarised.append(summary)
    return summarised
7 changes: 7 additions & 0 deletions src/taxpasta/domain/model/tidy_observation_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@
"""Provide a description of a tidy observation table."""


from typing import Optional

import numpy as np
import pandas as pd
import pandera as pa
Expand All @@ -29,6 +31,11 @@ class TidyObservationTable(pa.DataFrameModel):
"""Define the tidy observation table."""

taxonomy_id: Series[pd.CategoricalDtype] = pa.Field()
name: Optional[Series[pd.CategoricalDtype]] = pa.Field()
rank: Optional[Series[pd.CategoricalDtype]] = pa.Field()
lineage: Optional[Series[pd.CategoricalDtype]] = pa.Field()
id_lineage: Optional[Series[pd.CategoricalDtype]] = pa.Field()
rank_lineage: Optional[Series[pd.CategoricalDtype]] = pa.Field()
count: Series[np.int64] = pa.Field(ge=0)
sample: Series[pd.CategoricalDtype] = pa.Field()

Expand Down
13 changes: 11 additions & 2 deletions src/taxpasta/domain/model/wide_observation_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@
"""Provide a description of an observation matrix."""


from typing import Optional

import numpy as np
import pandas as pd
import pandera as pa
Expand All @@ -29,9 +31,16 @@ class WideObservationTable(pa.DataFrameModel):
"""Define the observation matrix."""

taxonomy_id: Series[pd.CategoricalDtype] = pa.Field()
# This field uses a regex to match all columns that are not `taxonomy_id`.
name: Optional[Series[pd.CategoricalDtype]] = pa.Field()
rank: Optional[Series[pd.CategoricalDtype]] = pa.Field()
lineage: Optional[Series[pd.CategoricalDtype]] = pa.Field()
id_lineage: Optional[Series[pd.CategoricalDtype]] = pa.Field()
rank_lineage: Optional[Series[pd.CategoricalDtype]] = pa.Field()
# This field uses a regex to match all columns that are not one of the above.
any_samples: Series[np.int64] = pa.Field(
ge=0, alias="^(?!taxonomy_id$).*", regex=True
ge=0,
alias="^(?!(taxonomy_id|name|rank|lineage|id_lineage|rank_lineage)$).*",
regex=True,
)

class Config:
Expand Down
1 change: 0 additions & 1 deletion src/taxpasta/infrastructure/application/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,6 @@
MotusProfileStandardisationService,
)

from .sample_etl_application import SampleETLApplication
from .standard_profile_file_format import StandardProfileFileFormat
from .table_reader_file_format import TableReaderFileFormat
from .tidy_observation_table_file_format import TidyObservationTableFileFormat
Expand Down
Loading