From 3d7336acdaae5c6ed8cd2f0b23b4164f270c0116 Mon Sep 17 00:00:00 2001
From: Keith James <kdj@sanger.ac.uk>
Date: Thu, 4 Jan 2024 16:01:45 +0000
Subject: [PATCH] Fix Illumina flowcells query to use the entity_type column

The query to find Illumina flowcells in the MLWH by component was
using tag index to try to identify controls. This is not a reliable
method; entity_type should be used instead. The set of valid entity
types is defined in the metadata of the MLWH schema.
---
 src/npg_irods/illumina.py | 67 +++++++++++++++++++++------------------
 tests/conftest.py         | 61 ++++++++++++++++++++++++++++++-----
 2 files changed, 89 insertions(+), 39 deletions(-)

diff --git a/src/npg_irods/illumina.py b/src/npg_irods/illumina.py
index 401d3cf..2067b9e 100644
--- a/src/npg_irods/illumina.py
+++ b/src/npg_irods/illumina.py
@@ -26,6 +26,7 @@
 from typing import Iterator, Optional, Type
 
 from partisan.irods import AVU, Collection, DataObject
+from partisan.metadata import AsValueEnum
 from sqlalchemy import asc
 from sqlalchemy.orm import Session
 from structlog import get_logger
@@ -57,11 +58,17 @@ class TagIndex(Enum):
     Rather, it is a bin for reads that cannot be associated with any of the candidate
     tags in a pool after sequencing."""
 
-    CONTROL_198 = 198
 
-    CONTROL_888 = 888
-    """Tag index 888 is conventionally used to indicate a control sample e.g. Phi X
-    that has been added to a pool."""
+@unique
+class EntityType(AsValueEnum):
+    """The type of sequenced material applied to a flowcell. This related to the
+    entity_type column in the MLWH. The values are defined in the MLWH schema
+    metadata."""
+
+    LIBRARY = "library"
+    LIBRARY_CONTROL = "library_control"
+    LIBRARY_INDEXED = "library_indexed"
+    LIBRARY_INDEXED_SPIKE = "library_indexed_spike"
 
 
 @dataclass(order=True)
@@ -141,7 +148,7 @@ def __repr__(self):
         if self.tag_index is not None:
             rep[SeqConcept.TAG_INDEX.value] = self.tag_index
         if self.subset is not None:
-            rep[SeqConcept.SUBSET.value] = self.subset
+            rep[SeqConcept.SUBSET.value] = self.subset.value
 
         return json.dumps(rep, sort_keys=True, separators=(",", ":"))
 
@@ -202,25 +209,29 @@ def empty_acl(*args):
         return []
 
     if requires_full_metadata(item):
-        log.info("Requires full metadata", path=item)
+        log.debug("Requires full metadata", path=item)
         sample_fn, study_fn = make_sample_metadata, make_study_metadata
     else:
-        log.info("Requires reduced metadata", path=item)
+        log.debug("Requires reduced metadata", path=item)
         sample_fn, study_fn = make_reduced_sample_metadata, make_reduced_study_metadata
 
     if requires_managed_access(item):
-        log.info("Requires managed access", path=item)
+        log.debug("Requires managed access", path=item)
         acl_fn = make_sample_acl
     else:
-        log.info("Does not require managed access", path=item)
+        log.debug("Does not require managed access", path=item)
         acl_fn = empty_acl
 
     # Each component may be associated with multiple flowcells
     components = find_associated_components(item)
+    log.debug("Found associated components", path=item, comp=components)
+
     for c in components:
-        for fc in find_flowcells_by_component(
+        flowcells = find_flowcells_by_component(
             mlwh_session, c, include_controls=include_controls
-        ):
+        )
+        log.debug("Found associated flowcells", path=item, flowcells=flowcells, comp=c)
+        for fc in flowcells:
             secondary_metadata.extend(sample_fn(fc.sample))
             secondary_metadata.extend(study_fn(fc.study))
             acl.extend(acl_fn(fc.sample, fc.study, zone=zone))
@@ -467,31 +478,25 @@ def find_flowcells_by_component(
         sess.query(IseqFlowcell)
         .distinct()
         .join(IseqFlowcell.iseq_product_metrics)
-        .filter(IseqProductMetrics.id_run == component.id_run)
+        .filter(
+            IseqProductMetrics.id_run == component.id_run,
+            IseqFlowcell.position == component.position,
+        )
     )
 
-    if component.position is not None:
-        query = query.filter(IseqProductMetrics.position == component.position)
+    if not include_controls:
+        query = query.filter(
+            IseqFlowcell.entity_type.notin_(
+                [
+                    EntityType.LIBRARY_CONTROL.value,
+                    EntityType.LIBRARY_INDEXED_SPIKE.value,
+                ]
+            )
+        )
 
     match component.tag_index:
-        case TagIndex.CONTROL_198.value | TagIndex.CONTROL_888.value:
-            if not include_controls:
-                query = query.filter(
-                    IseqProductMetrics.tag_index.notin_(
-                        [TagIndex.CONTROL_198.value, TagIndex.CONTROL_888.value]
-                    )
-                )
-
-            query = query.filter(IseqProductMetrics.tag_index == component.tag_index)
         case TagIndex.BIN.value:
-            if not include_controls:
-                query = query.filter(
-                    IseqProductMetrics.tag_index.notin_(
-                        [TagIndex.CONTROL_198.value, TagIndex.CONTROL_888.value]
-                    )
-                )
-
-            query = query.filter(IseqProductMetrics.tag_index.is_not(None))
+            query = query.filter(IseqProductMetrics.tag_index.isnot(None))
         case int():
             query = query.filter(IseqProductMetrics.tag_index == component.tag_index)
         case None:
diff --git a/tests/conftest.py b/tests/conftest.py
index 4986616..50b4219 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -57,6 +57,7 @@
     Sample,
     Study,
 )
+from npg_irods.illumina import EntityType
 from npg_irods.metadata import illumina, ont
 from npg_irods.metadata.common import DataFile, SeqConcept
 from npg_irods.metadata.lims import TrackedSample
@@ -413,34 +414,78 @@ def initialize_mlwh_illumina_synthetic(session: Session):
 
     sample_info = [
         # Not multiplexed
-        {"study": study_a, "sample": sample1, "position": 1, "tag_index": None},
+        {
+            "study": study_a,
+            "sample": sample1,
+            "position": 1,
+            "tag_index": None,
+            "entity_type": EntityType.LIBRARY.value,
+        },
         # Multiplexed, samples from the same study
-        {"study": study_a, "sample": sample1, "position": 1, "tag_index": 1},
-        {"study": study_a, "sample": sample2, "position": 1, "tag_index": 2},
-        {"study": study_a, "sample": sample1, "position": 2, "tag_index": 1},
-        {"study": study_a, "sample": sample2, "position": 2, "tag_index": 2},
+        {
+            "study": study_a,
+            "sample": sample1,
+            "position": 1,
+            "tag_index": 1,
+            "entity_type": EntityType.LIBRARY_INDEXED.value,
+        },
+        {
+            "study": study_a,
+            "sample": sample2,
+            "position": 1,
+            "tag_index": 2,
+            "entity_type": EntityType.LIBRARY_INDEXED.value,
+        },
+        {
+            "study": study_a,
+            "sample": sample1,
+            "position": 2,
+            "tag_index": 1,
+            "entity_type": EntityType.LIBRARY_INDEXED.value,
+        },
+        {
+            "study": study_a,
+            "sample": sample2,
+            "position": 2,
+            "tag_index": 2,
+            "entity_type": EntityType.LIBRARY_INDEXED.value,
+        },
         # Multiplexed, samples from different studies
-        {"study": study_a, "sample": sample1, "position": 2, "tag_index": 1},
-        {"study": study_b, "sample": sample3, "position": 2, "tag_index": 2},
+        {
+            "study": study_a,
+            "sample": sample1,
+            "position": 2,
+            "tag_index": 1,
+            "entity_type": EntityType.LIBRARY_INDEXED.value,
+        },
+        {
+            "study": study_b,
+            "sample": sample3,
+            "position": 2,
+            "tag_index": 2,
+            "entity_type": EntityType.LIBRARY_INDEXED.value,
+        },
         # Phi X
         {
             "study": control_study,
             "sample": control_sample,
             "position": 1,
             "tag_index": 888,
+            "entity_type": EntityType.LIBRARY_INDEXED_SPIKE.value,
         },
         {
             "study": control_study,
             "sample": control_sample,
             "position": 2,
             "tag_index": 888,
+            "entity_type": EntityType.LIBRARY_INDEXED_SPIKE.value,
         },
     ]
 
     flowcells = [
         IseqFlowcell(
             entity_id_lims=f"ENTITY_01",
-            entity_type=f"ENTITY_TYPE_01",
+            entity_type=info["entity_type"],
             id_flowcell_lims=f"FLOWCELL{i}",
             id_lims="LIMS_01",
             id_pool_lims=f"POOL_01",