Skip to content

Commit

Permalink
[r] Fix: No replicas for donors in HCA (#6582)
Browse files Browse the repository at this point in the history
  • Loading branch information
nadove-ucsc committed Oct 2, 2024
1 parent ab515f5 commit 6b9d422
Show file tree
Hide file tree
Showing 11 changed files with 967 additions and 143 deletions.
11 changes: 8 additions & 3 deletions src/azul/indexer/index_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -302,16 +302,21 @@ def transform(self,
log.info('Transforming %i entities in partition %s of bundle %s, version %s.',
num_entities, partition, bundle.uuid, bundle.version)
contributions = []
replicas = []
replicas_by_coords = {}
for transformer in transformers:
for document in transformer.transform(partition):
if isinstance(document, Contribution):
contributions.append(document)
elif isinstance(document, Replica):
replicas.append(document)
try:
dup = replicas_by_coords[document.coordinates]
except KeyError:
replicas_by_coords[document.coordinates] = document
else:
dup.hub_ids.extend(document.hub_ids)
else:
assert False, document
return contributions, replicas
return contributions, list(replicas_by_coords.values())

def create_indices(self, catalog: CatalogName):
es_client = ESClientFactory.get()
Expand Down
10 changes: 8 additions & 2 deletions src/azul/indexer/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@

import attr

from azul.collections import (
alist,
)
from azul.indexer import (
Bundle,
BundleFQID,
Expand Down Expand Up @@ -129,7 +132,8 @@ def _contribution(self,

def _replica(self,
entity: EntityReference,
hub_ids: list[EntityID]
*,
file_hub: EntityID | None,
) -> Replica:
replica_type, contents = self._replicate(entity)
coordinates = ReplicaCoordinates(content_hash=json_hash(contents).hexdigest(),
Expand All @@ -138,7 +142,9 @@ def _replica(self,
version=None,
replica_type=replica_type,
contents=contents,
hub_ids=hub_ids)
# The other hubs will be added when the indexer
# consolidates duplicate replicas.
hub_ids=alist(file_hub))

@classmethod
@abstractmethod
Expand Down
2 changes: 0 additions & 2 deletions src/azul/plugins/metadata/anvil/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@
BiosampleTransformer,
BundleTransformer,
DatasetTransformer,
DiagnosisTransformer,
DonorTransformer,
FileTransformer,
)
Expand Down Expand Up @@ -96,7 +95,6 @@ def transformer_types(self) -> Iterable[type[BaseTransformer]]:
BiosampleTransformer,
BundleTransformer,
DatasetTransformer,
DiagnosisTransformer,
DonorTransformer,
FileTransformer,
)
Expand Down
94 changes: 33 additions & 61 deletions src/azul/plugins/metadata/anvil/indexer/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,12 @@ class LinkedEntities:
def __getitem__(self, item: EntityType) -> set[EntityReference]:
    """
    Return the union of the ancestors and the descendants stored under
    the entity type `item`.
    """
    upstream = self.ancestors[item]
    downstream = self.descendants[item]
    return upstream | downstream

def __iter__(self) -> Iterable[EntityReference]:
    """
    Yield every linked entity: first the members of each collection in
    `self.ancestors`, then the members of each collection in
    `self.descendants`. No de-duplication is done here, so an entity
    present in more than one collection is yielded once per occurrence.
    """
    yield from chain.from_iterable(self.ancestors.values())
    yield from chain.from_iterable(self.descendants.values())

@classmethod
def from_links(cls,
origin: EntityReference,
Expand Down Expand Up @@ -449,8 +455,8 @@ def _complete_dataset_keys(cls) -> AbstractSet[str]:

class SingletonTransformer(BaseTransformer, metaclass=ABCMeta):

def _contents(self) -> MutableJSON:
return dict(
def _transform(self, entity: EntityReference) -> Iterable[Contribution]:
contents = dict(
activities=self._entities(self._activity, chain.from_iterable(
self._entities_by_type[activity_type]
for activity_type in self._activity_polymorphic_types
Expand All @@ -461,6 +467,7 @@ def _contents(self) -> MutableJSON:
donors=self._entities(self._donor, self._entities_by_type['donor']),
files=self._entities(self._file, self._entities_by_type['file'])
)
yield self._contribution(contents, entity.entity_id)

@classmethod
def field_types(cls) -> FieldTypes:
Expand All @@ -479,8 +486,11 @@ def _duos_types(cls) -> FieldTypes:
def _duos(self, dataset: EntityReference) -> MutableJSON:
    """
    Build the JSON for `dataset` by passing it to `self._entity` together
    with the field types returned by `self._duos_types()`.
    """
    duos_field_types = self._duos_types()
    return self._entity(dataset, duos_field_types)

def _is_duos(self, dataset: EntityReference) -> bool:
    """
    Tell whether the bundle's entry for `dataset` contains a
    'description' member. `_dataset` uses this to decide whether to
    represent the dataset via `_duos`.
    """
    dataset_entry = self.bundle.entities[dataset]
    return 'description' in dataset_entry

def _dataset(self, dataset: EntityReference) -> MutableJSON:
if 'description' in self.bundle.entities[dataset]:
if self._is_duos(dataset):
return self._duos(dataset)
else:
return super()._dataset(dataset)
Expand All @@ -499,23 +509,17 @@ class ActivityTransformer(BaseTransformer):
def entity_type(cls) -> str:
return 'activities'

def _transform(self,
entity: EntityReference
) -> Iterable[Contribution | Replica]:
def _transform(self, entity: EntityReference) -> Iterable[Contribution]:
linked = self._linked_entities(entity)
files = linked['file']
contents = dict(
activities=[self._activity(entity)],
biosamples=self._entities(self._biosample, linked['biosample']),
datasets=[self._dataset(self._only_dataset())],
diagnoses=self._entities(self._diagnosis, linked['diagnosis']),
donors=self._entities(self._donor, linked['donor']),
files=self._entities(self._file, files),
files=self._entities(self._file, linked['file'])
)
yield self._contribution(contents, entity.entity_id)
if config.enable_replicas:
hub_ids = [f.entity_id for f in files]
yield self._replica(entity, hub_ids)


class BiosampleTransformer(BaseTransformer):
Expand All @@ -524,11 +528,8 @@ class BiosampleTransformer(BaseTransformer):
def entity_type(cls) -> str:
return 'biosamples'

def _transform(self,
entity: EntityReference
) -> Iterable[Contribution | Replica]:
def _transform(self, entity: EntityReference) -> Iterable[Contribution]:
linked = self._linked_entities(entity)
files = linked['file']
contents = dict(
activities=self._entities(self._activity, chain.from_iterable(
linked[activity_type]
Expand All @@ -538,25 +539,9 @@ def _transform(self,
datasets=[self._dataset(self._only_dataset())],
diagnoses=self._entities(self._diagnosis, linked['diagnosis']),
donors=self._entities(self._donor, linked['donor']),
files=self._entities(self._file, files),
files=self._entities(self._file, linked['file']),
)
yield self._contribution(contents, entity.entity_id)
if config.enable_replicas:
hub_ids = [f.entity_id for f in files]
yield self._replica(entity, hub_ids)


class DiagnosisTransformer(BaseTransformer):

def _transform(self, entity: EntityReference) -> Iterable[Replica]:
if config.enable_replicas:
files = self._linked_entities(entity)['file']
hub_ids = [f.entity_id for f in files]
yield self._replica(entity, hub_ids)

@classmethod
def entity_type(cls) -> EntityType:
return 'diagnoses'


class BundleTransformer(SingletonTransformer):
Expand All @@ -569,10 +554,6 @@ def _singleton(self) -> EntityReference:
return EntityReference(entity_type='bundle',
entity_id=self.bundle.uuid)

def _transform(self, entity: EntityReference) -> Iterable[Contribution]:
contents = self._contents()
yield self._contribution(contents, entity.entity_id)


class DatasetTransformer(SingletonTransformer):

Expand All @@ -586,18 +567,9 @@ def _singleton(self) -> EntityReference:
def _transform(self,
entity: EntityReference
) -> Iterable[Contribution | Replica]:
contents = self._contents()
yield self._contribution(contents, entity.entity_id)
if config.enable_replicas:
# Every file in a snapshot is linked to that snapshot's singular
# dataset, making an explicit list of hub IDs for the dataset both
# redundant and impractically large (we observe that for large
# snapshots, trying to track this many files in a single data structure
# causes a prohibitively high rate of conflicts during replica updates).
# Therefore, we leave the hub IDs field empty for datasets and rely on
# the tenet that every file is an implicit hub of its parent dataset.
hub_ids = []
yield self._replica(entity, hub_ids)
yield from super()._transform(entity)
if self._is_duos(entity):
yield self._replica(entity, file_hub=None)


class DonorTransformer(BaseTransformer):
Expand All @@ -606,11 +578,8 @@ class DonorTransformer(BaseTransformer):
def entity_type(cls) -> str:
return 'donors'

def _transform(self,
entity: EntityReference
) -> Iterable[Contribution | Replica]:
def _transform(self, entity: EntityReference) -> Iterable[Contribution]:
linked = self._linked_entities(entity)
files = linked['file']
contents = dict(
activities=self._entities(self._activity, chain.from_iterable(
linked[activity_type]
Expand All @@ -620,12 +589,9 @@ def _transform(self,
datasets=[self._dataset(self._only_dataset())],
diagnoses=self._entities(self._diagnosis, linked['diagnosis']),
donors=[self._donor(entity)],
files=self._entities(self._file, files),
files=self._entities(self._file, linked['file']),
)
yield self._contribution(contents, entity.entity_id)
if config.enable_replicas:
hub_ids = [f.entity_id for f in files]
yield self._replica(entity, hub_ids)


class FileTransformer(BaseTransformer):
Expand All @@ -651,8 +617,14 @@ def _transform(self,
)
yield self._contribution(contents, entity.entity_id)
if config.enable_replicas:
# The result of the link traversal does not include the starting entity,
# so without this step the file itself wouldn't be included in its hubs
files = (entity, *linked['file'])
hub_ids = [f.entity_id for f in files]
yield self._replica(entity, hub_ids)
yield self._replica(entity, file_hub=entity.entity_id)
for linked_entity in linked:
yield self._replica(
linked_entity,
# Datasets are linked to every file in their snapshot,
# making an explicit list of hub IDs for the dataset both
# redundant and impractically large. Therefore, we leave the
# hub IDs field empty for datasets and rely on the tenet
# that every file is an implicit hub of its parent dataset.
file_hub=None if linked_entity.entity_type == 'dataset' else entity.entity_id,
)
Loading

0 comments on commit 6b9d422

Please sign in to comment.