From 3d7336acdaae5c6ed8cd2f0b23b4164f270c0116 Mon Sep 17 00:00:00 2001 From: Keith James Date: Thu, 4 Jan 2024 16:01:45 +0000 Subject: [PATCH] Fix Illumina flowcells query to use the entity_type column The query to find Illumina flowcells in the MLWH by component was using tag index to try to identify controls. This is not a reliable method; entity_type should be used instead. The set of valid entity types is defined in the metadata of the MLWH schema. --- src/npg_irods/illumina.py | 67 +++++++++++++++++++++------------------ tests/conftest.py | 61 ++++++++++++++++++++++++++++++----- 2 files changed, 89 insertions(+), 39 deletions(-) diff --git a/src/npg_irods/illumina.py b/src/npg_irods/illumina.py index 401d3cf..2067b9e 100644 --- a/src/npg_irods/illumina.py +++ b/src/npg_irods/illumina.py @@ -26,6 +26,7 @@ from typing import Iterator, Optional, Type from partisan.irods import AVU, Collection, DataObject +from partisan.metadata import AsValueEnum from sqlalchemy import asc from sqlalchemy.orm import Session from structlog import get_logger @@ -57,11 +58,17 @@ class TagIndex(Enum): Rather, it is a bin for reads that cannot be associated with any of the candidate tags in a pool after sequencing.""" - CONTROL_198 = 198 - CONTROL_888 = 888 - """Tag index 888 is conventionally used to indicate a control sample e.g. Phi X - that has been added to a pool.""" +@unique +class EntityType(AsValueEnum): + """The type of sequenced material applied to a flowcell. This related to the + entity_type column in the MLWH. The values are defined in the MLWH schema + metadata.""" + + LIBRARY = "library" + LIBRARY_CONTROL = "library_control" + LIBRARY_INDEXED = "library_indexed" + LIBRARY_INDEXED_SPIKE = "library_indexed_spike" @dataclass(order=True) @@ -141,7 +148,7 @@ def __repr__(self): if self.tag_index is not None: rep[SeqConcept.TAG_INDEX.value] = self.tag_index if self.subset is not None: - rep[SeqConcept.SUBSET.value] = self.subset + rep[SeqConcept.SUBSET.value] = self.subset.value return json.dumps(rep, sort_keys=True, separators=(",", ":")) @@ -202,25 +209,29 @@ def empty_acl(*args): return [] if requires_full_metadata(item): - log.info("Requires full metadata", path=item) + log.debug("Requires full metadata", path=item) sample_fn, study_fn = make_sample_metadata, make_study_metadata else: - log.info("Requires reduced metadata", path=item) + log.debug("Requires reduced metadata", path=item) sample_fn, study_fn = make_reduced_sample_metadata, make_reduced_study_metadata if requires_managed_access(item): - log.info("Requires managed access", path=item) + log.debug("Requires managed access", path=item) acl_fn = make_sample_acl else: - log.info("Does not require managed access", path=item) + log.debug("Does not require managed access", path=item) acl_fn = empty_acl # Each component may be associated with multiple flowcells components = find_associated_components(item) + log.debug("Found associated components", path=item, comp=components) + for c in components: - for fc in find_flowcells_by_component( + flowcells = find_flowcells_by_component( mlwh_session, c, include_controls=include_controls - ): + ) + log.debug("Found associated flowcells", path=item, flowcells=flowcells, comp=c) + for fc in flowcells: secondary_metadata.extend(sample_fn(fc.sample)) secondary_metadata.extend(study_fn(fc.study)) acl.extend(acl_fn(fc.sample, fc.study, zone=zone)) @@ -467,31 +478,25 @@ def find_flowcells_by_component( sess.query(IseqFlowcell) .distinct() .join(IseqFlowcell.iseq_product_metrics) - .filter(IseqProductMetrics.id_run == component.id_run) + .filter( + IseqProductMetrics.id_run == component.id_run, + IseqFlowcell.position == component.position, + ) ) - if component.position is not None: - query = query.filter(IseqProductMetrics.position == component.position) + if not include_controls: + query = query.filter( + IseqFlowcell.entity_type.notin_( + [ + EntityType.LIBRARY_CONTROL.value, + EntityType.LIBRARY_INDEXED_SPIKE.value, + ] + ) + ) match component.tag_index: - case TagIndex.CONTROL_198.value | TagIndex.CONTROL_888.value: - if not include_controls: - query = query.filter( - IseqProductMetrics.tag_index.notin_( - [TagIndex.CONTROL_198.value, TagIndex.CONTROL_888.value] - ) - ) - - query = query.filter(IseqProductMetrics.tag_index == component.tag_index) case TagIndex.BIN.value: - if not include_controls: - query = query.filter( - IseqProductMetrics.tag_index.notin_( - [TagIndex.CONTROL_198.value, TagIndex.CONTROL_888.value] - ) - ) - - query = query.filter(IseqProductMetrics.tag_index.is_not(None)) + query = query.filter(IseqProductMetrics.tag_index.isnot(None)) case int(): query = query.filter(IseqProductMetrics.tag_index == component.tag_index) case None: diff --git a/tests/conftest.py b/tests/conftest.py index 4986616..50b4219 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -57,6 +57,7 @@ Sample, Study, ) +from npg_irods.illumina import EntityType from npg_irods.metadata import illumina, ont from npg_irods.metadata.common import DataFile, SeqConcept from npg_irods.metadata.lims import TrackedSample @@ -413,34 +414,78 @@ def initialize_mlwh_illumina_synthetic(session: Session): sample_info = [ # Not multiplexed - {"study": study_a, "sample": sample1, "position": 1, "tag_index": None}, + { + "study": study_a, + "sample": sample1, + "position": 1, + "tag_index": None, + "entity_type": EntityType.LIBRARY.value, + }, # Multiplexed, samples from the same study - {"study": study_a, "sample": sample1, "position": 1, "tag_index": 1}, - {"study": study_a, "sample": sample2, "position": 1, "tag_index": 2}, - {"study": study_a, "sample": sample1, "position": 2, "tag_index": 1}, - {"study": study_a, "sample": sample2, "position": 2, "tag_index": 2}, + { + "study": study_a, + "sample": sample1, + "position": 1, + "tag_index": 1, + "entity_type": EntityType.LIBRARY_INDEXED.value, + }, + { + "study": study_a, + "sample": sample2, + "position": 1, + "tag_index": 2, + "entity_type": EntityType.LIBRARY_INDEXED.value, + }, + { + "study": study_a, + "sample": sample1, + "position": 2, + "tag_index": 1, + "entity_type": EntityType.LIBRARY_INDEXED.value, + }, + { + "study": study_a, + "sample": sample2, + "position": 2, + "tag_index": 2, + "entity_type": EntityType.LIBRARY_INDEXED.value, + }, # Multiplexed, samples from different studies - {"study": study_a, "sample": sample1, "position": 2, "tag_index": 1}, - {"study": study_b, "sample": sample3, "position": 2, "tag_index": 2}, + { + "study": study_a, + "sample": sample1, + "position": 2, + "tag_index": 1, + "entity_type": EntityType.LIBRARY_INDEXED.value, + }, + { + "study": study_b, + "sample": sample3, + "position": 2, + "tag_index": 2, + "entity_type": EntityType.LIBRARY_INDEXED.value, + }, # Phi X { "study": control_study, "sample": control_sample, "position": 1, "tag_index": 888, + "entity_type": EntityType.LIBRARY_INDEXED_SPIKE.value, }, { "study": control_study, "sample": control_sample, "position": 2, "tag_index": 888, + "entity_type": EntityType.LIBRARY_INDEXED_SPIKE.value, }, ] flowcells = [ IseqFlowcell( entity_id_lims=f"ENTITY_01", - entity_type=f"ENTITY_TYPE_01", + entity_type=info["entity_type"], id_flowcell_lims=f"FLOWCELL{i}", id_lims="LIMS_01", id_pool_lims=f"POOL_01",