diff --git a/src/azul/indexer/aggregate.py b/src/azul/indexer/aggregate.py index 563972092a..27db90676d 100644 --- a/src/azul/indexer/aggregate.py +++ b/src/azul/indexer/aggregate.py @@ -33,6 +33,12 @@ Entities = JSONs +class TooManyEntities(Exception): + + def __init__(self, max_size: int): + super().__init__(max_size) + + class Accumulator(metaclass=ABCMeta): """ Accumulates multiple values into a single value, not necessarily of the same @@ -92,7 +98,12 @@ class SetAccumulator(Accumulator): smallest values, it returns a sorted list of the first N distinct values. """ - def __init__(self, max_size=None, key=None) -> None: + def __init__(self, + max_size=None, + key=None, + *, + raise_on_overflow: bool = False + ) -> None: """ :param max_size: the maximum number of elements to retain @@ -101,15 +112,50 @@ def __init__(self, max_size=None, key=None) -> None: be used. With that default key, if any None values were placed in the accumulator, the first element, and only the first element of the returned list will be None. + + :param raise_on_overflow: If true, raise TooManyEntities if the size of + the accumulated set would exceed max_size. """ super().__init__() self.value = set() self.max_size = max_size self.key = none_safe_key(none_last=True) if key is None else key + self.raise_on_overflow = raise_on_overflow def accumulate(self, value) -> bool: """ :return: True, if the given value was incorporated into the set + + >>> s = SetAccumulator(max_size=3) + >>> s.accumulate(1) + True + + >>> s.accumulate(1) + False + + >>> s.accumulate(2) + True + + >>> s.accumulate([1, 2, 3]) + True + + >>> s.accumulate([2, 3]) + False + + >>> s.accumulate(4) + False + + >>> s.get() + [1, 2, 3] + + >>> s = SetAccumulator(max_size=3, raise_on_overflow=True) + >>> s.accumulate([1, 2, 3]) + True + + >>> s.accumulate(4) + Traceback (most recent call last): + ... + azul.indexer.aggregate.TooManyEntities: 3 """ if self.max_size is None or len(self.value) < self.max_size: before = len(self.value) @@ -126,6 +172,8 @@ def accumulate(self, value) -> bool: return False else: assert False + elif self.raise_on_overflow and value not in self.value: + raise TooManyEntities(self.max_size) else: return False diff --git a/src/azul/plugins/__init__.py b/src/azul/plugins/__init__.py index 2fab31a4ed..91f63b213e 100644 --- a/src/azul/plugins/__init__.py +++ b/src/azul/plugins/__init__.py @@ -415,10 +415,15 @@ def special_fields(self) -> SpecialFields: @abstractmethod def hot_entity_types(self) -> AbstractSet[str]: """ - The types of entities that do not explicitly track their hubs in replica - documents in order to avoid a large list of hub references in the - replica document, and to avoid contention when updating that list during - indexing. + The types of inner entities that do not explicitly track their hubs in + replica documents in order to avoid a large list of hub references in + the replica document, and to avoid contention when updating that list + during indexing. + + There is a brittle coupling between this method and the implementation + of the transformers, where the replicas are emitted. This is because + here we use the transformed types (e.g. "donors") and the latter uses + the untransformed types (e.g. "donor_organism"). """ raise NotImplementedError diff --git a/src/azul/plugins/metadata/anvil/indexer/transform.py b/src/azul/plugins/metadata/anvil/indexer/transform.py index 858899bd59..fce1199bc5 100644 --- a/src/azul/plugins/metadata/anvil/indexer/transform.py +++ b/src/azul/plugins/metadata/anvil/indexer/transform.py @@ -631,10 +631,7 @@ def _transform(self, for linked_entity in linked: yield self._replica( linked_entity, - # Datasets are linked to every file in their snapshot, - # making an explicit list of hub IDs for the dataset both - # redundant and impractically large. Therefore, we leave the - # hub IDs field empty for datasets and rely on the tenet - # that every file is an implicit hub of its parent dataset. + # There is a brittle coupling between here and + # :meth:`MetadataPlugin.hot_entity_types`. file_hub=None if linked_entity.entity_type == 'anvil_dataset' else entity.entity_id, ) diff --git a/src/azul/plugins/metadata/hca/__init__.py b/src/azul/plugins/metadata/hca/__init__.py index 85f4ce1c7b..679cfa058e 100644 --- a/src/azul/plugins/metadata/hca/__init__.py +++ b/src/azul/plugins/metadata/hca/__init__.py @@ -287,7 +287,14 @@ def special_fields(self) -> SpecialFields: @property def hot_entity_types(self) -> AbstractSet[str]: - return {'projects'} + return { + 'projects', + 'donors', + 'analysis_protocols', + 'imaging_protocols', + 'library_preparation_protocols', + 'sequencing_protocols' + } @property def facets(self) -> Sequence[str]: diff --git a/src/azul/plugins/metadata/hca/indexer/aggregate.py b/src/azul/plugins/metadata/hca/indexer/aggregate.py index 06ff91527f..7c0a6434bf 100644 --- a/src/azul/plugins/metadata/hca/indexer/aggregate.py +++ b/src/azul/plugins/metadata/hca/indexer/aggregate.py @@ -162,6 +162,8 @@ def _accumulator(self, field) -> Accumulator | None: none_safe_itemgetter('value', 'unit'))) elif field == 'donor_count': return UniqueValueCountAccumulator() + elif field == 'document_id': + return SetAccumulator(max_size=100, raise_on_overflow=True) else: return super()._accumulator(field) @@ -197,6 +199,8 @@ class ProtocolAggregator(SimpleAggregator): def _accumulator(self, field) -> Accumulator | None: if field == 'assay_type': return FrequencySetAccumulator(max_size=100) + elif field == 'document_id': + return SetAccumulator(max_size=100, raise_on_overflow=True) else: return super()._accumulator(field) diff --git a/src/azul/plugins/metadata/hca/indexer/transform.py b/src/azul/plugins/metadata/hca/indexer/transform.py index 00380d8564..be98427484 100644 --- a/src/azul/plugins/metadata/hca/indexer/transform.py +++ b/src/azul/plugins/metadata/hca/indexer/transform.py @@ -12,6 +12,9 @@ from enum import ( Enum, ) +from itertools import ( + chain, +) import logging import re from typing import ( @@ -1472,15 +1475,25 @@ def _transform(self, file_id = file.ref.entity_id yield self._contribution(contents, file_id) if config.enable_replicas: - yield self._replica(self.api_bundle.ref, file_hub=file_id) - # Projects are linked to every file in their snapshot, - # making an explicit list of hub IDs for the project both - # redundant and impractically large. Therefore, we leave the - # hub IDs field empty for projects and rely on the tenet - # that every file is an implicit hub of its parent project. - yield self._replica(self._api_project.ref, file_hub=None) - for linked_entity in visitor.entities: - yield self._replica(linked_entity, file_hub=file_id) + + def is_hot(entity_type: EntityType) -> bool: + # There is a brittle coupling between this function and + # :meth:`MetadataPlugin.hot_entity_types`. + if entity_type == 'donor_organism': + return True + elif entity_type == 'project': + return True + elif entity_type.endswith('_protocol'): + return True + else: + return False + + for ref in chain( + [self.api_bundle.ref, self._api_project.ref], + visitor.entities + ): + yield self._replica(ref, + file_hub=None if is_hot(ref.entity_type) else file_id) def matrix_stratification_values(self, file: api.File) -> JSON: """ diff --git a/test/indexer/data/aaa96233-bf27-44c7-82df-b4dc15ad4d9d.2018-11-02T11:33:44.698028Z.results.json b/test/indexer/data/aaa96233-bf27-44c7-82df-b4dc15ad4d9d.2018-11-02T11:33:44.698028Z.results.json index d7b65274b4..ada1e68d39 100644 --- a/test/indexer/data/aaa96233-bf27-44c7-82df-b4dc15ad4d9d.2018-11-02T11:33:44.698028Z.results.json +++ b/test/indexer/data/aaa96233-bf27-44c7-82df-b4dc15ad4d9d.2018-11-02T11:33:44.698028Z.results.json @@ -3748,10 +3748,7 @@ "sex": "female" }, "entity_id": "7b07b9d0-cc0e-4098-9f64-f4a569f7d746", - "hub_ids": [ - "0c5ac7c0-817e-40d4-b1b1-34c3d5cfecdb", - "70d1af4a-82c8-478a-8960-e9028b3616ca" - ], + "hub_ids": [], "replica_type": "donor_organism" }, "_type": "_doc" @@ -3791,10 +3788,7 @@ "strand": "unstranded" }, "entity_id": "9c32cf70-3ed7-4720-badc-5ee71e8a38af", - "hub_ids": [ - "0c5ac7c0-817e-40d4-b1b1-34c3d5cfecdb", - "70d1af4a-82c8-478a-8960-e9028b3616ca" - ], + "hub_ids": [], "replica_type": "library_preparation_protocol" }, "_type": "_doc" @@ -3828,10 +3822,7 @@ } }, "entity_id": "61e629ed-0135-4492-ac8a-5c4ab3ccca8a", - "hub_ids": [ - "0c5ac7c0-817e-40d4-b1b1-34c3d5cfecdb", - "70d1af4a-82c8-478a-8960-e9028b3616ca" - ], + "hub_ids": [], "replica_type": "sequencing_protocol" }, "_type": "_doc"