fixup! Fix: No replicas for donors in HCA (#6582)

nadove-ucsc committed Sep 28, 2024
1 parent 7d00561 commit e035899
Showing 3 changed files with 27 additions and 21 deletions.
10 changes: 8 additions & 2 deletions src/azul/indexer/transform.py
@@ -11,6 +11,9 @@

 import attr

+from azul.collections import (
+    alist,
+)
 from azul.indexer import (
     Bundle,
     BundleFQID,
@@ -127,7 +130,8 @@ def _contribution(self,

     def _replica(self,
                  entity: EntityReference,
-                 hub_ids: list[EntityID]
+                 *,
+                 file_hub: EntityID | None,
                  ) -> Replica:
         replica_type, contents = self._replicate(entity)
         coordinates = ReplicaCoordinates(content_hash=json_hash(contents).hexdigest(),
@@ -136,7 +140,9 @@ def _replica(self,
                        version=None,
                        replica_type=replica_type,
                        contents=contents,
-                       hub_ids=hub_ids)
+                       # The other hubs will be added when the indexer
+                       # consolidates duplicate replicas.
+                       hub_ids=alist(file_hub))

     @classmethod
     @abstractmethod
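
The effect of this change is that a transformer now passes at most one file hub per replica instead of building the hub_ids list itself. A minimal sketch of the intended call pattern, assuming that azul.collections.alist collects its non-None arguments into a list (the stand-in below is for illustration only, not the actual implementation):

    def alist(*args):
        # Stand-in for azul.collections.alist (assumed behavior): keep only
        # the arguments that are not None.
        return [arg for arg in args if arg is not None]

    # A file passes its own entity ID as the single file hub ...
    assert alist('file-entity-id') == ['file-entity-id']
    # ... while dataset/project replicas pass file_hub=None and therefore
    # start out with an empty hub_ids list.
    assert alist(None) == []
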
20 changes: 11 additions & 9 deletions src/azul/plugins/metadata/anvil/indexer/transform.py
@@ -569,7 +569,7 @@ def _transform(self,
                    ) -> Iterable[Contribution | Replica]:
         yield from super()._transform(entity)
         if self._is_duos(entity):
-            yield self._replica(entity, [])
+            yield self._replica(entity, file_hub=None)


 class DonorTransformer(BaseTransformer):
@@ -617,12 +617,14 @@ def _transform(self,
         )
         yield self._contribution(contents, entity.entity_id)
         if config.enable_replicas:
-            # The other hubs will be added when the indexer consolidates duplicate replicas.
-            yield self._replica(entity, [entity.entity_id])
+            yield self._replica(entity, file_hub=entity.entity_id)
             for linked_entity in linked:
-                # Datasets are linked to every file in their snapshot, making an explicit list
-                # of hub IDs for the dataset both redundant and impractically large. Therefore,
-                # we leave the hub IDs field empty for datasets and rely on the tenet that every
-                # file is an implicit hub of its parent dataset.
-                yield self._replica(linked_entity,
-                                    hub_ids=[] if linked_entity.entity_type == 'dataset' else [entity.entity_id])
+                yield self._replica(
+                    linked_entity,
+                    # Datasets are linked to every file in their snapshot,
+                    # making an explicit list of hub IDs for the dataset both
+                    # redundant and impractically large. Therefore, we leave the
+                    # hub IDs field empty for datasets and rely on the tenet
+                    # that every file is an implicit hub of its parent dataset.
+                    file_hub=None if linked_entity.entity_type == 'dataset' else entity.entity_id,
+                )
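
The conditional keyword argument above replaces the old list-building expression. A hypothetical helper (not part of the codebase) that isolates the same decision, under the tenet that every file is an implicit hub of its parent dataset:

    def file_hub_for(linked_entity_type: str, file_id: str) -> str | None:
        # Hypothetical helper for illustration: datasets get no explicit
        # file hub because every file is an implicit hub of its dataset.
        return None if linked_entity_type == 'dataset' else file_id

    assert file_hub_for('dataset', 'some-file-id') is None
    assert file_hub_for('donor', 'some-file-id') == 'some-file-id'
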
18 changes: 8 additions & 10 deletions src/azul/plugins/metadata/hca/indexer/transform.py
@@ -1472,17 +1472,15 @@ def _transform(self,
             file_id = file.ref.entity_id
             yield self._contribution(contents, file_id)
             if config.enable_replicas:
-                yield self._replica(self.api_bundle.ref,
-                                    # The other hubs will be added when the indexer consolidates duplicate replicas.
-                                    [file_id])
-                yield self._replica(one(self.api_bundle.projects.values()).ref,
-                                    # Projects are linked to every file in their snapshot, making an explicit list
-                                    # of hub IDs for the project both redundant and impractically large. Therefore,
-                                    # we leave the hub IDs field empty for projects and rely on the tenet that every
-                                    # file is an implicit hub of its parent project.
-                                    [])
+                yield self._replica(self.api_bundle.ref, file_hub=file_id)
+                # Projects are linked to every file in their snapshot,
+                # making an explicit list of hub IDs for the project both
+                # redundant and impractically large. Therefore, we leave the
+                # hub IDs field empty for projects and rely on the tenet
+                # that every file is an implicit hub of its parent project.
+                yield self._replica(self._api_project.ref, file_hub=None)
                 for linked_entity in visitor.entities:
-                    yield self._replica(linked_entity, [file_id])
+                    yield self._replica(linked_entity, file_hub=file_id)

     def matrix_stratification_values(self, file: api.File) -> JSON:
         """
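
Both plugins rely on the comment introduced in src/azul/indexer/transform.py: each replica initially carries at most one hub ID, and the remaining hubs appear when the indexer consolidates duplicate replicas. A toy sketch of that consolidation step, assuming duplicates are identified by their coordinates (this is not the indexer's actual code; names are illustrative):

    from collections import defaultdict

    def consolidate(replicas: list[tuple[str, list[str]]]) -> dict[str, list[str]]:
        # Merge the hub_ids of replicas that share the same coordinates.
        merged: dict[str, list[str]] = defaultdict(list)
        for coordinates, hub_ids in replicas:
            merged[coordinates].extend(hub_ids)
        return dict(merged)

    # Two files in the same bundle each emit a replica of the same donor,
    # contributing one file hub apiece; consolidation merges the hubs.
    replicas = [('donor/123', ['file-A']), ('donor/123', ['file-B'])]
    assert consolidate(replicas) == {'donor/123': ['file-A', 'file-B']}
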
