From 4e99857957ef68b461c8abc747209105d8e0c480 Mon Sep 17 00:00:00 2001 From: Noa Aviel Dove Date: Tue, 22 Oct 2024 16:38:11 -0700 Subject: [PATCH 01/32] Clarify and enforce bundle FQID equality semantics (#6671) --- src/azul/indexer/__init__.py | 130 +++++++++++++++--- src/azul/indexer/document.py | 17 --- src/azul/indexer/transform.py | 2 +- .../plugins/repository/tdr_anvil/__init__.py | 2 +- test/indexer/test_indexer.py | 8 +- test/indexer/test_projects.py | 2 +- 6 files changed, 119 insertions(+), 42 deletions(-) diff --git a/src/azul/indexer/__init__.py b/src/azul/indexer/__init__.py index 5d17349c0b..6e673d9ec1 100644 --- a/src/azul/indexer/__init__.py +++ b/src/azul/indexer/__init__.py @@ -2,6 +2,9 @@ ABCMeta, abstractmethod, ) +from functools import ( + total_ordering, +) from itertools import ( product, ) @@ -17,6 +20,7 @@ Self, TypeVar, TypedDict, + final, ) import attrs @@ -45,21 +49,115 @@ BundleVersion = str -@attrs.frozen(kw_only=True, order=True) -class BundleFQID(SupportsLessAndGreaterThan): +# PyCharm can't handle mixing `attrs` with `total_ordering` and falsely claims +# that comparison operators besides `__lt__` are not defined. +# noinspection PyDataclass +@attrs.frozen(kw_only=True, eq=False) +@total_ordering +class BundleFQID: """ - >>> list(sorted([ - ... BundleFQID(uuid='d', version='e'), - ... BundleFQID(uuid='a', version='c'), - ... BundleFQID(uuid='a', version='b'), - ... ])) - ... # doctest: +NORMALIZE_WHITESPACE - [BundleFQID(uuid='a', version='b'), - BundleFQID(uuid='a', version='c'), - BundleFQID(uuid='d', version='e')] + A fully qualified bundle identifier. The attributes defined in this class + must always be sufficient to decide whether two instances of this class or + its subclasses identify the same bundle or not. Subclasses may define + additional attributes to help describe the bundle, but they are forbidden + from using these attributes in the implementations of their `__eq__` or + `__hash__` methods, either explicitly or in code generated by `attrs`. """ - uuid: BundleUUID = attrs.field(order=str.lower) - version: BundleVersion = attrs.field(order=str.lower) + uuid: BundleUUID + version: BundleVersion + + def _nucleus(self) -> tuple[str, str]: + return self.uuid.lower(), self.version.lower() + + # We can't use attrs' generated implementation because it always + # considers operands with different types to be unequal, regardless of + # their inheritance relationships or how their attributes are annotated + # (e.g. specifying `eq=False` has no effect). We want instances of + # all subclasses to compare equal as long as `uuid` and `version` are + # equal. For the same reason, we can't use `typing.Self` in the signature + # because it would constrain the RHS to instances of subclasses of the LHS. + @final + def __eq__(self, other: 'BundleFQID') -> bool: + """ + >>> b1 = BundleFQID(uuid='a', version='b') + >>> b2 = BundleFQID(uuid='a', version='b') + >>> b1 == b2 + True + + >>> s1 = SourceRef(id='x', spec=SimpleSourceSpec.parse('y:/0')) + >>> sb1 = SourcedBundleFQID(uuid='a', version='b', source=s1) + >>> sb2 = SourcedBundleFQID(uuid='a', version='b', source=s1) + >>> sb1 == sb2 + True + + >>> b1 == sb1 + True + + >>> s2 = SourceRef(id='w', spec=SimpleSourceSpec.parse('z:/0')) + >>> sb3 = SourcedBundleFQID(uuid='a', version='b', source=s2) + >>> b1 == sb3 + True + + >>> sb1 == sb3 + ... # doctest: +NORMALIZE_WHITESPACE + Traceback (most recent call last): + ... 
+ AssertionError: (('a', 'b'), + SourceRef(id='x', spec=SimpleSourceSpec(prefix=Prefix(common='', partition=0), name='y')), + SourceRef(id='w', spec=SimpleSourceSpec(prefix=Prefix(common='', partition=0), name='z'))) + """ + same_bundle = self._nucleus() == other._nucleus() + if ( + same_bundle + and isinstance(self, SourcedBundleFQID) + and isinstance(other, SourcedBundleFQID) + ): + assert self.source == other.source, (self._nucleus(), self.source, other.source) + return same_bundle + + @final + def __hash__(self) -> int: + return hash(self._nucleus()) + + def __init_subclass__(cls, **kwargs): + """ + >>> @attrs.frozen(kw_only=True) + ... class FooBundleFQID(SourcedBundleFQID): + ... foo: str + Traceback (most recent call last): + ... + AssertionError: + + >>> @attrs.frozen(kw_only=True, eq=False) + ... class FooBundleFQID(SourcedBundleFQID): + ... foo: str + """ + super().__init_subclass__(**kwargs) + assert cls.__eq__ is BundleFQID.__eq__, cls + assert cls.__hash__ is BundleFQID.__hash__, cls + + # attrs doesn't allow `order=True` when `eq=False` + def __lt__(self, other: 'BundleFQID') -> bool: + """ + >>> aa = BundleFQID(uuid='a', version='a') + >>> ab = BundleFQID(uuid='a', version='b') + >>> ba = BundleFQID(uuid='b', version='a') + >>> aa < ab < ba + True + + >>> ba > ab > aa + True + + >>> aa <= ab <= ba + True + + >>> ba >= ab >= aa + True + + >>> aa != ab != ba + True + """ + return self._nucleus() < other._nucleus() def to_json(self) -> MutableJSON: return attrs.asdict(self, recurse=False) @@ -462,7 +560,7 @@ class SourcedBundleFQIDJSON(BundleFQIDJSON): BUNDLE_FQID = TypeVar('BUNDLE_FQID', bound='SourcedBundleFQID') -@attrs.frozen(kw_only=True, order=True) +@attrs.frozen(kw_only=True, eq=False) class SourcedBundleFQID(BundleFQID, Generic[SOURCE_REF]): """ >>> spec = SimpleSourceSpec(name='', prefix=(Prefix(partition=0))) @@ -493,10 +591,6 @@ def from_json(cls, json: SourcedBundleFQIDJSON) -> 'SourcedBundleFQID': source = cls.source_ref_cls().from_json(json.pop('source')) return cls(source=source, **json) - def upcast(self) -> BundleFQID: - return BundleFQID(uuid=self.uuid, - version=self.version) - def to_json(self) -> SourcedBundleFQIDJSON: return dict(super().to_json(), source=self.source.to_json()) diff --git a/src/azul/indexer/document.py b/src/azul/indexer/document.py index 5e8a0cd64d..f082193e6d 100644 --- a/src/azul/indexer/document.py +++ b/src/azul/indexer/document.py @@ -519,23 +519,6 @@ class ContributionCoordinates(DocumentCoordinates[E], Generic[E]): deleted: bool - def __attrs_post_init__(self): - # If we were to allow instances of subclasses, we'd risk breaking - # equality and hashing semantics. It is impossible to correctly - # implement the transitivity property of equality between instances of - # type and subtype without ignoring the additional attributes added by - # the subtype. Consider types T and S where S is a subtype of T. - # Transitivity requires that s1 == s2 for any two instances s1 and s2 - # of S for which s1 == t and s2 == t, where t is any instance of T. The - # subtype instances s1 and s2 can only be equal to t if they match in - # all attributes that T defines. The additional attributes introduced - # by S must be ignored, even when comparing s1 and s2, otherwise s1 and - # s2 might turn out to be unequal. In this particular case that is not - # desirable: we do want to be able to compare instances of subtypes of - # BundleFQID without ignoring any of their attributes. 
- concrete_type = type(self.bundle) - assert concrete_type is BundleFQID, concrete_type - @property def document_id(self) -> str: return '_'.join(( diff --git a/src/azul/indexer/transform.py b/src/azul/indexer/transform.py index 4f6efb7f50..1e49c5746e 100644 --- a/src/azul/indexer/transform.py +++ b/src/azul/indexer/transform.py @@ -123,7 +123,7 @@ def _contribution(self, ) -> Contribution: entity = EntityReference(entity_type=self.entity_type(), entity_id=entity_id) coordinates = ContributionCoordinates(entity=entity, - bundle=self.bundle.fqid.upcast(), + bundle=self.bundle.fqid, deleted=self.deleted) return Contribution(coordinates=coordinates, version=None, diff --git a/src/azul/plugins/repository/tdr_anvil/__init__.py b/src/azul/plugins/repository/tdr_anvil/__init__.py index 530850366d..a0162d2c16 100644 --- a/src/azul/plugins/repository/tdr_anvil/__init__.py +++ b/src/azul/plugins/repository/tdr_anvil/__init__.py @@ -119,7 +119,7 @@ class TDRAnvilBundleFQIDJSON(SourcedBundleFQIDJSON): table_name: str -@attrs.frozen(kw_only=True) +@attrs.frozen(kw_only=True, eq=False) class TDRAnvilBundleFQID(TDRBundleFQID): table_name: BundleType = attrs.field(converter=BundleType) diff --git a/test/indexer/test_indexer.py b/test/indexer/test_indexer.py index 2fbf8206ba..3b24b211cf 100644 --- a/test/indexer/test_indexer.py +++ b/test/indexer/test_indexer.py @@ -288,7 +288,7 @@ def test_deletion(self): self.assertNotEqual(doc.contents, {}) elif doc_type is DocumentType.contribution: doc = Contribution.from_index(field_types, hit) - self.assertEqual(bundle_fqid.upcast(), doc.coordinates.bundle) + self.assertEqual(bundle_fqid, doc.coordinates.bundle) self.assertFalse(doc.coordinates.deleted) elif doc_type is DocumentType.replica: pass @@ -315,7 +315,7 @@ def test_deletion(self): if doc_type is DocumentType.contribution: doc = Contribution.from_index(field_types, hit) docs_by_entity[doc.entity].append(doc) - self.assertEqual(bundle_fqid.upcast(), doc.coordinates.bundle) + self.assertEqual(bundle_fqid, doc.coordinates.bundle) else: # Since there is only one bundle and it was deleted, # nothing should be aggregated @@ -388,7 +388,7 @@ def test_duplicate_notification(self): coordinates = [ ContributionCoordinates( entity=entity, - bundle=bundle.fqid.upcast(), + bundle=bundle.fqid, deleted=False ).with_catalog(self.catalog) ] @@ -553,7 +553,7 @@ def test_multi_entity_contributing_bundles(self): entity_id=old_file_uuid, entity_type='files') deletion = ContributionCoordinates(entity=entity, - bundle=bundle_fqid.upcast(), + bundle=bundle_fqid, deleted=True) index_name, document_id = deletion.index_name, deletion.document_id hits = [ diff --git a/test/indexer/test_projects.py b/test/indexer/test_projects.py index c96cb4e490..d76dc52980 100644 --- a/test/indexer/test_projects.py +++ b/test/indexer/test_projects.py @@ -89,7 +89,7 @@ def test_no_duplicate_files_in_specimen(self): coordinates = AggregateCoordinates(entity=entity) else: coordinates = ContributionCoordinates(entity=entity, - bundle=bundle_fqid.upcast(), + bundle=bundle_fqid, deleted=False) result = self.es_client.get(index=coordinates.index_name, id=coordinates.document_id) From 7cb40d2ac822833ff32fbc61081dddfd709fa237 Mon Sep 17 00:00:00 2001 From: Noa Aviel Dove Date: Tue, 22 Oct 2024 16:22:37 -0700 Subject: [PATCH 02/32] Use `SourcedBundleFQID` in `catalog_complete` subtest --- test/integration_test.py | 26 ++++---------------------- 1 file changed, 4 insertions(+), 22 deletions(-) diff --git a/test/integration_test.py b/test/integration_test.py 
index 3d3b635abd..815787bc28 100644 --- a/test/integration_test.py +++ b/test/integration_test.py @@ -120,7 +120,6 @@ SourceJSON, SourceRef, SourcedBundleFQID, - SourcedBundleFQIDJSON, ) from azul.indexer.document import ( EntityReference, @@ -154,7 +153,6 @@ from azul.plugins.repository.tdr_anvil import ( BundleType, TDRAnvilBundleFQID, - TDRAnvilBundleFQIDJSON, ) from azul.portal_service import ( PortalService, @@ -1272,26 +1270,10 @@ def _get_indexed_bundles(self, source, bundle = one(hit['sources']), one(hit['bundles']) source = SourceJSON(id=source[special_fields.source_id], spec=source[special_fields.source_spec]) - bundle_fqid = SourcedBundleFQIDJSON(uuid=bundle[special_fields.bundle_uuid], - version=bundle[special_fields.bundle_version], - source=source) - if config.is_anvil_enabled(catalog): - # Every primary bundle contains 1 or more biosamples, 1 dataset, - # and 0 or more other entities. Biosamples only occur in primary - # bundles. - if len(hit['biosamples']) > 0: - table_name = BundleType.primary - # Supplementary bundles contain only 1 file and 1 dataset. - elif len(hit['files']) > 0: - table_name = BundleType.supplementary - # DUOS bundles contain only 1 dataset. - elif len(hit['datasets']) > 0: - table_name = BundleType.duos - else: - assert False, hit - bundle_fqid = cast(TDRAnvilBundleFQIDJSON, bundle_fqid) - bundle_fqid['table_name'] = table_name.value - bundle_fqid = self.repository_plugin(catalog).resolve_bundle(bundle_fqid) + source = self.repository_plugin(catalog).source_from_json(source) + bundle_fqid = SourcedBundleFQID(uuid=bundle[special_fields.bundle_uuid], + version=bundle[special_fields.bundle_version], + source=source) indexed_fqids.add(bundle_fqid) return indexed_fqids From cad323c4c348c9558dd298ebd05425d9bdfe9648 Mon Sep 17 00:00:00 2001 From: Noa Aviel Dove Date: Tue, 22 Oct 2024 16:19:08 -0700 Subject: [PATCH 03/32] Rename, relocate, and document method --- scripts/can_bundle.py | 2 +- src/azul/indexer/index_service.py | 2 +- src/azul/plugins/__init__.py | 10 +++++++--- src/azul/plugins/repository/tdr_anvil/__init__.py | 4 ++-- test/indexer/test_indexer_controller.py | 4 ++-- 5 files changed, 13 insertions(+), 9 deletions(-) diff --git a/scripts/can_bundle.py b/scripts/can_bundle.py index 9314e10ff5..e58da02afa 100644 --- a/scripts/can_bundle.py +++ b/scripts/can_bundle.py @@ -93,7 +93,7 @@ def fetch_bundle(source: str, bundle_uuid: str, bundle_version: str) -> Bundle: fqid = SourcedBundleFQIDJSON(source=source_ref.to_json(), uuid=bundle_uuid, version=bundle_version) - fqid = plugin.resolve_bundle(fqid) + fqid = plugin.bundle_fqid_from_json(fqid) bundle = plugin.fetch_bundle(fqid) log.info('Fetched bundle %r version %r from catalog %r.', fqid.uuid, fqid.version, catalog) diff --git a/src/azul/indexer/index_service.py b/src/azul/indexer/index_service.py index fbf0493572..25a3c6d40f 100644 --- a/src/azul/indexer/index_service.py +++ b/src/azul/indexer/index_service.py @@ -197,7 +197,7 @@ def fetch_bundle(self, bundle_fqid: SourcedBundleFQIDJSON ) -> Bundle: plugin = self.repository_plugin(catalog) - bundle_fqid = plugin.resolve_bundle(bundle_fqid) + bundle_fqid = plugin.bundle_fqid_from_json(bundle_fqid) return plugin.fetch_bundle(bundle_fqid) def index(self, catalog: CatalogName, bundle: Bundle) -> None: diff --git a/src/azul/plugins/__init__.py b/src/azul/plugins/__init__.py index e6f9bef7a3..361ed18560 100644 --- a/src/azul/plugins/__init__.py +++ b/src/azul/plugins/__init__.py @@ -583,6 +583,13 @@ def _bundle_fqid_cls(self) -> type[BUNDLE_FQID]: 
bundle_cls, spec_cls, ref_cls, fqid_cls = self._generic_params return fqid_cls + def bundle_fqid_from_json(self, fqid: SourcedBundleFQIDJSON) -> BUNDLE_FQID: + """ + Instantiate a :class:`SourcedBundleFQID` from its JSON representation. + The expected input matches the output format of `SourcedBundleFQID.to_json`. + """ + return self._bundle_fqid_cls.from_json(fqid) + @property def _bundle_cls(self) -> type[BUNDLE]: bundle_cls, spec_cls, ref_cls, fqid_cls = self._generic_params @@ -607,9 +614,6 @@ def _lookup_source_id(self, spec: SOURCE_SPEC) -> str: """ raise NotImplementedError - def resolve_bundle(self, fqid: SourcedBundleFQIDJSON) -> BUNDLE_FQID: - return self._bundle_fqid_cls.from_json(fqid) - @abstractmethod def _count_subgraphs(self, source: SOURCE_SPEC) -> int: """ diff --git a/src/azul/plugins/repository/tdr_anvil/__init__.py b/src/azul/plugins/repository/tdr_anvil/__init__.py index a0162d2c16..e5de2a9ddc 100644 --- a/src/azul/plugins/repository/tdr_anvil/__init__.py +++ b/src/azul/plugins/repository/tdr_anvil/__init__.py @@ -233,7 +233,7 @@ def _list_bundles(self, )) return bundles - def resolve_bundle(self, fqid: SourcedBundleFQIDJSON) -> TDRAnvilBundleFQID: + def bundle_fqid_from_json(self, fqid: SourcedBundleFQIDJSON) -> TDRAnvilBundleFQID: if 'table_name' not in fqid: # Resolution of bundles without the table name is expensive, so we # only support it during canning. @@ -251,7 +251,7 @@ def resolve_bundle(self, fqid: SourcedBundleFQIDJSON) -> TDRAnvilBundleFQID: for bundle_type in BundleType ))) fqid = {**fqid, **one(rows)} - return super().resolve_bundle(fqid) + return super().bundle_fqid_from_json(fqid) def _emulate_bundle(self, bundle_fqid: TDRAnvilBundleFQID) -> TDRAnvilBundle: if bundle_fqid.table_name is BundleType.primary: diff --git a/test/indexer/test_indexer_controller.py b/test/indexer/test_indexer_controller.py index 0aea488842..7cd3d0aca1 100644 --- a/test/indexer/test_indexer_controller.py +++ b/test/indexer/test_indexer_controller.py @@ -232,7 +232,7 @@ def test_contribute_and_aggregate(self): notified_fqids = list(map(self._fqid_from_notification, notifications)) notified_bundles = [bundles[fqid] for fqid in notified_fqids] mock_plugin.fetch_bundle.side_effect = notified_bundles - mock_plugin.resolve_bundle.side_effect = DSSBundleFQID.from_json + mock_plugin.bundle_fqid_from_json.side_effect = DSSBundleFQID.from_json mock_plugin.sources = [source] with patch.object(IndexService, 'repository_plugin', return_value=mock_plugin): with patch.object(BundlePartition, 'max_partition_size', 4): @@ -241,7 +241,7 @@ def test_contribute_and_aggregate(self): # Assert plugin calls by controller expected_calls = [call(fqid.to_json()) for fqid in notified_fqids] - self.assertEqual(expected_calls, mock_plugin.resolve_bundle.mock_calls) + self.assertEqual(expected_calls, mock_plugin.bundle_fqid_from_json.mock_calls) expected_calls = list(map(call, notified_fqids)) self.assertEqual(expected_calls, mock_plugin.fetch_bundle.mock_calls) From 644377fd0cd4b80f92eb85f4c4e5c74ef2d5f81b Mon Sep 17 00:00:00 2001 From: Noa Aviel Dove Date: Wed, 30 Oct 2024 17:17:08 -0700 Subject: [PATCH 04/32] Fix: Inaccurate module-level docstring in can_bundle script (#6669) --- scripts/can_bundle.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/can_bundle.py b/scripts/can_bundle.py index e58da02afa..fd98579dfb 100644 --- a/scripts/can_bundle.py +++ b/scripts/can_bundle.py @@ -1,6 +1,6 @@ """ -Download manifest and metadata for a given bundle from the given repository 
-source and store them as separate JSON files in the index test data directory. +Download the contents of a given bundle from the given repository source and +store it as a single JSON file. Note: silently overwrites the destination file. """ From 8da52fbbfa0c8104c4c6c9afedd98350ff689a58 Mon Sep 17 00:00:00 2001 From: Noa Aviel Dove Date: Wed, 30 Oct 2024 17:17:38 -0700 Subject: [PATCH 05/32] Fix: Faulty/brittle default value for source parameter in can_bundle script (#6669) --- scripts/can_bundle.py | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/scripts/can_bundle.py b/scripts/can_bundle.py index fd98579dfb..c6ec386214 100644 --- a/scripts/can_bundle.py +++ b/scripts/can_bundle.py @@ -15,10 +15,6 @@ import sys import uuid -from more_itertools import ( - one, -) - from azul import ( cache, config, @@ -52,15 +48,8 @@ def main(argv): parser = argparse.ArgumentParser(description=__doc__, formatter_class=AzulArgumentHelpFormatter) - default_catalog = config.default_catalog - plugin_cls = RepositoryPlugin.load(default_catalog) - plugin = plugin_cls.create(default_catalog) - if len(plugin.sources) == 1: - source_arg = {'default': str(one(plugin.sources))} - else: - source_arg = {'required': True} parser.add_argument('--source', '-s', - **source_arg, + required=True, help='The repository source containing the bundle') parser.add_argument('--uuid', '-b', required=True, From 2b0d7d8a050c38867cc3c47f2be1570566904ddc Mon Sep 17 00:00:00 2001 From: Noa Aviel Dove Date: Wed, 30 Oct 2024 17:18:11 -0700 Subject: [PATCH 06/32] Fix: Incorrect help message for version parameter in can_bundle script (#6669) --- scripts/can_bundle.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/can_bundle.py b/scripts/can_bundle.py index c6ec386214..8dc3956de8 100644 --- a/scripts/can_bundle.py +++ b/scripts/can_bundle.py @@ -55,7 +55,7 @@ def main(argv): required=True, help='The UUID of the bundle to can.') parser.add_argument('--version', '-v', - help='The version of the bundle to can (default: the latest version).') + help='The version of the bundle to can. Required for HCA, ignored for AnVIL.') parser.add_argument('--output-dir', '-O', default=os.path.join(config.project_root, 'test', 'indexer', 'data'), help='The path to the output directory (default: %(default)s).') From f0be49d99c8bb38d8ce0da15051f5d9d5c8420cb Mon Sep 17 00:00:00 2001 From: Noa Aviel Dove Date: Fri, 18 Oct 2024 17:10:17 -0700 Subject: [PATCH 07/32] can_bundle script requires explicit table name for AnVIL sources (#6669) --- scripts/can_bundle.py | 23 ++++++++++++++----- .../plugins/repository/tdr_anvil/__init__.py | 20 ---------------- test/integration_test.py | 20 ++++++++-------- 3 files changed, 26 insertions(+), 37 deletions(-) diff --git a/scripts/can_bundle.py b/scripts/can_bundle.py index 8dc3956de8..9e508ce207 100644 --- a/scripts/can_bundle.py +++ b/scripts/can_bundle.py @@ -1,6 +1,8 @@ """ Download the contents of a given bundle from the given repository source and -store it as a single JSON file. +store it as a single JSON file. Users are expected to be familiar with the +structure of the bundle FQIDs for the given source and provide the appropriate +attributes. Note: silently overwrites the destination file. """ @@ -41,6 +43,7 @@ from azul.types import ( AnyJSON, AnyMutableJSON, + JSON, ) log = logging.getLogger(__name__) @@ -56,19 +59,29 @@ def main(argv): help='The UUID of the bundle to can.') parser.add_argument('--version', '-v', help='The version of the bundle to can. 
Required for HCA, ignored for AnVIL.') + parser.add_argument('--table-name', + help='The BigQuery table of the bundle to can. Only applicable for AnVIL.') parser.add_argument('--output-dir', '-O', default=os.path.join(config.project_root, 'test', 'indexer', 'data'), help='The path to the output directory (default: %(default)s).') parser.add_argument('--redaction-key', '-K', help='Provide a key to redact confidential or sensitive information from the output files') args = parser.parse_args(argv) - bundle = fetch_bundle(args.source, args.uuid, args.version) + fqid_fields = parse_fqid_fields(args) + bundle = fetch_bundle(args.source, fqid_fields) if args.redaction_key: redact_bundle(bundle, args.redaction_key.encode()) save_bundle(bundle, args.output_dir) -def fetch_bundle(source: str, bundle_uuid: str, bundle_version: str) -> Bundle: +def parse_fqid_fields(args: argparse.Namespace) -> JSON: + fields = {'uuid': args.uuid, 'version': args.version} + if args.table_name is not None: + fields['table_name'] = args.table_name + return fields + + +def fetch_bundle(source: str, fqid_args: JSON) -> Bundle: for catalog in config.catalogs: plugin = plugin_for(catalog) try: @@ -79,9 +92,7 @@ def fetch_bundle(source: str, bundle_uuid: str, bundle_version: str) -> Bundle: log.debug('Searching for %r in catalog %r', source, catalog) for plugin_source_spec in plugin.sources: if source_ref.spec.eq_ignoring_prefix(plugin_source_spec): - fqid = SourcedBundleFQIDJSON(source=source_ref.to_json(), - uuid=bundle_uuid, - version=bundle_version) + fqid = SourcedBundleFQIDJSON(source=source_ref.to_json(), **fqid_args) fqid = plugin.bundle_fqid_from_json(fqid) bundle = plugin.fetch_bundle(fqid) log.info('Fetched bundle %r version %r from catalog %r.', diff --git a/src/azul/plugins/repository/tdr_anvil/__init__.py b/src/azul/plugins/repository/tdr_anvil/__init__.py index e5de2a9ddc..3d7afc2b9c 100644 --- a/src/azul/plugins/repository/tdr_anvil/__init__.py +++ b/src/azul/plugins/repository/tdr_anvil/__init__.py @@ -233,26 +233,6 @@ def _list_bundles(self, )) return bundles - def bundle_fqid_from_json(self, fqid: SourcedBundleFQIDJSON) -> TDRAnvilBundleFQID: - if 'table_name' not in fqid: - # Resolution of bundles without the table name is expensive, so we - # only support it during canning. 
- assert not config.is_in_lambda, ('Bundle FQID lacks table name', fqid) - source = self.source_from_json(fqid['source']) - entity_id = uuids.change_version(fqid['uuid'], - self.bundle_uuid_version, - self.datarepo_row_uuid_version) - rows = self._run_sql(' UNION ALL '.join(( - f''' - SELECT {bundle_type.value!r} AS table_name - FROM {backtick(self._full_table_name(source.spec, bundle_type.value))} - WHERE datarepo_row_id = {entity_id!r} - ''' - for bundle_type in BundleType - ))) - fqid = {**fqid, **one(rows)} - return super().bundle_fqid_from_json(fqid) - def _emulate_bundle(self, bundle_fqid: TDRAnvilBundleFQID) -> TDRAnvilBundle: if bundle_fqid.table_name is BundleType.primary: log.info('Bundle %r is a primary bundle', bundle_fqid.uuid) diff --git a/test/integration_test.py b/test/integration_test.py index 815787bc28..773df15328 100644 --- a/test/integration_test.py +++ b/test/integration_test.py @@ -1872,10 +1872,7 @@ def _test_catalog(self, catalog: config.Catalog): fqid = self.bundle_fqid(catalog.name) log.info('Canning bundle %r from catalog %r', fqid, catalog.name) with tempfile.TemporaryDirectory() as d: - self._can_bundle(source=str(fqid.source.spec), - uuid=fqid.uuid, - version=fqid.version, - output_dir=d) + self._can_bundle(fqid, output_dir=d) generated_file = one(os.listdir(d)) with open(os.path.join(d, generated_file)) as f: bundle_json = json.load(f) @@ -1952,16 +1949,17 @@ def bundle_fqid(self, catalog: CatalogName) -> SourcedBundleFQID: return self.random.choice(sorted(bundle_fqids)) def _can_bundle(self, - source: str, - uuid: str, - version: str, + fqid: SourcedBundleFQID, output_dir: str ) -> None: args = [ - '--source', source, - '--uuid', uuid, - '--version', version, - '--output-dir', output_dir + '--uuid', fqid.uuid, + '--version', fqid.version, + '--source', str(fqid.source.spec), + *([ + '--table-name', fqid.table_name.value, + ] if isinstance(fqid, TDRAnvilBundleFQID) else []), + '--output-dir', output_dir, ] return self._can_bundle_main(args) From 85b36eadec5120027b8a8d16ae34cc4a5753ccb2 Mon Sep 17 00:00:00 2001 From: Hannes Schmidt Date: Fri, 1 Nov 2024 23:46:18 -0700 Subject: [PATCH 08/32] Fix formatting --- test/integration_test.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/test/integration_test.py b/test/integration_test.py index 773df15328..67eba232d9 100644 --- a/test/integration_test.py +++ b/test/integration_test.py @@ -1956,9 +1956,11 @@ def _can_bundle(self, '--uuid', fqid.uuid, '--version', fqid.version, '--source', str(fqid.source.spec), - *([ - '--table-name', fqid.table_name.value, - ] if isinstance(fqid, TDRAnvilBundleFQID) else []), + *( + ['--table-name', fqid.table_name.value] + if isinstance(fqid, TDRAnvilBundleFQID) else + [] + ), '--output-dir', output_dir, ] return self._can_bundle_main(args) From d53b1593e110e46f110ac02cd673cc46e0e4634b Mon Sep 17 00:00:00 2001 From: Hannes Schmidt Date: Fri, 1 Nov 2024 23:49:58 -0700 Subject: [PATCH 09/32] Fix missing parameter in inactive code This fix is untested since it touches code that is disabled (conditional upon DSS direct access). 
--- test/integration_test.py | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/test/integration_test.py b/test/integration_test.py index 67eba232d9..44dd367268 100644 --- a/test/integration_test.py +++ b/test/integration_test.py @@ -119,6 +119,7 @@ from azul.indexer import ( SourceJSON, SourceRef, + SourceSpec, SourcedBundleFQID, ) from azul.indexer.document import ( @@ -699,7 +700,7 @@ def _get_one_outer_file(self, catalog: CatalogName) -> JSON: self.fail('No files found') return one(hits) - def _source_spec(self, catalog: CatalogName, entity: JSON) -> TDRSourceSpec: + def _source_spec(self, catalog: CatalogName, entity: JSON) -> SourceSpec: if config.is_hca_enabled(catalog): field = 'sourceSpec' elif config.is_anvil_enabled(catalog): @@ -753,9 +754,10 @@ def _uuid_column_name(self, catalog: CatalogName) -> str: def _test_dos_and_drs(self, catalog: CatalogName): if config.is_dss_enabled(catalog) and config.dss_direct_access: - _, file = self._get_one_inner_file(catalog) - self._test_dos(catalog, file) - self._test_drs(catalog, file) + outer_file, inner_file = self._get_one_inner_file(catalog) + source = self._source_spec(catalog, outer_file) + self._test_dos(catalog, inner_file) + self._test_drs(catalog, source, inner_file) @property def _service_account_credentials(self) -> ContextManager: @@ -1138,13 +1140,14 @@ def _validate_file_content(self, content: ReadableFileObject, file: FileInnerEnt def _validate_file_response(self, response: urllib3.HTTPResponse, - source: TDRSourceSpec, + source: SourceSpec, file: FileInnerEntity): """ Note: The response object must have been obtained with stream=True """ try: - if source.name == 'ANVIL_1000G_2019_Dev_20230609_ANV5_202306121732': + special = 'ANVIL_1000G_2019_Dev_20230609_ANV5_202306121732' + if isinstance(source, TDRSourceSpec) and source.name == special: # All files in this snapshot were truncated to zero bytes by the # Broad to save costs. The metadata is not a reliable indication # of these files' actual size. 
@@ -1154,7 +1157,11 @@ def _validate_file_response(self, finally: response.close() - def _test_drs(self, catalog: CatalogName, file: FileInnerEntity): + def _test_drs(self, + catalog: CatalogName, + source: SourceSpec, + file: FileInnerEntity + ) -> None: repository_plugin = self.azul_client.repository_plugin(catalog) drs = repository_plugin.drs_client() for access_method in AccessMethod: @@ -1165,7 +1172,7 @@ def _test_drs(self, catalog: CatalogName, file: FileInnerEntity): self.assertIsNone(access.headers) if access.method is AccessMethod.https: response = self._get_url(GET, furl(access.url), stream=True) - self._validate_file_response(response, file) + self._validate_file_response(response, source, file) elif access.method is AccessMethod.gs: content = self._get_gs_url_content(furl(access.url), size=self.num_fastq_bytes) self._validate_file_content(content, file) From d206d2f63f6be68c4affdfa92098418e32ddce5c Mon Sep 17 00:00:00 2001 From: Noa Aviel Dove Date: Tue, 22 Oct 2024 16:24:36 -0700 Subject: [PATCH 10/32] Eliminate old-style type annotations in integration tests --- test/integration_test.py | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/test/integration_test.py b/test/integration_test.py index 44dd367268..f416724f15 100644 --- a/test/integration_test.py +++ b/test/integration_test.py @@ -6,7 +6,6 @@ Iterator, Mapping, Sequence, - Set, ) from concurrent.futures.thread import ( ThreadPoolExecutor, @@ -43,7 +42,6 @@ Callable, ContextManager, IO, - Optional, Protocol, TypedDict, cast, @@ -802,8 +800,8 @@ def _check_endpoint(self, method: str, path: str, *, - args: Optional[Mapping[str, Any]] = None, - endpoint: Optional[furl] = None, + args: Mapping[str, Any] | None = None, + endpoint: furl | None = None, fetch: bool = False ) -> bytes: if endpoint is None: @@ -1207,7 +1205,7 @@ def _test_dos(self, catalog: CatalogName, file: FileInnerEntity): def _get_gs_url_content(self, url: furl, - size: Optional[int] = None + size: int | None = None ) -> BytesIO: self.assertEquals('gs', url.scheme) path = os.environ['GOOGLE_APPLICATION_CREDENTIALS'] @@ -1268,7 +1266,7 @@ def update(source: SourceRef, def _get_indexed_bundles(self, catalog: CatalogName, - filters: Optional[JSON] = None + filters: JSON | None = None ) -> set[SourcedBundleFQID]: indexed_fqids = set() hits = self._get_entities(catalog, 'bundles', filters) @@ -1286,7 +1284,7 @@ def _get_indexed_bundles(self, def _assert_catalog_complete(self, catalog: CatalogName, - bundle_fqids: Set[SourcedBundleFQID] + bundle_fqids: set[SourcedBundleFQID] ) -> None: with self.subTest('catalog_complete', catalog=catalog): expected_fqids = bundle_fqids @@ -1345,7 +1343,7 @@ def _assert_catalog_empty(self, catalog: CatalogName): def _get_entities(self, catalog: CatalogName, entity_type: EntityType, - filters: Optional[JSON] = None + filters: JSON | None = None ) -> MutableJSONs: entities = [] size = 100 @@ -1377,7 +1375,7 @@ def _assert_indices_exist(self, catalog: CatalogName): def _test_managed_access(self, catalog: CatalogName, - bundle_fqids: Set[SourcedBundleFQID] + bundle_fqids: set[SourcedBundleFQID] ) -> None: with self.subTest('managed_access', catalog=catalog): indexed_source_ids = {fqid.source.id for fqid in bundle_fqids} @@ -1408,8 +1406,8 @@ def _test_managed_access(self, def _test_managed_access_repository_sources(self, catalog: CatalogName, - indexed_source_ids: Set[str], - managed_access_source_ids: Set[str] + indexed_source_ids: set[str], + managed_access_source_ids: set[str] ) -> set[str]: 
""" Test the managed access controls for the /repository/sources endpoint @@ -1441,7 +1439,7 @@ def list_source_ids() -> set[str]: def _test_managed_access_indices(self, catalog: CatalogName, - managed_access_source_ids: Set[str] + managed_access_source_ids: set[str] ) -> JSONs: """ Test the managed-access controls for the /index/bundles and From 37cdfde55802d76c3926f3105d42c798605f5fab Mon Sep 17 00:00:00 2001 From: Noa Aviel Dove Date: Mon, 21 Oct 2024 16:30:15 -0700 Subject: [PATCH 11/32] Refactor SourceRef prefix assignment --- src/azul/indexer/__init__.py | 3 +++ src/azul/plugins/__init__.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/azul/indexer/__init__.py b/src/azul/indexer/__init__.py index 6e673d9ec1..f8228cc835 100644 --- a/src/azul/indexer/__init__.py +++ b/src/azul/indexer/__init__.py @@ -552,6 +552,9 @@ def spec_cls(cls) -> type[SourceSpec]: spec_cls, ref_cls = get_generic_type_params(cls, SourceSpec, SourceRef) return spec_cls + def with_prefix(self, prefix: Prefix) -> Self: + return attrs.evolve(self, spec=attrs.evolve(self.spec, prefix=prefix)) + class SourcedBundleFQIDJSON(BundleFQIDJSON): source: SourceJSON diff --git a/src/azul/plugins/__init__.py b/src/azul/plugins/__init__.py index 361ed18560..893499fb2b 100644 --- a/src/azul/plugins/__init__.py +++ b/src/azul/plugins/__init__.py @@ -640,7 +640,7 @@ def partition_source(self, prefix = Prefix.for_main_deployment(count) else: prefix = Prefix.for_lesser_deployment(count) - source = attr.evolve(source, spec=attr.evolve(source.spec, prefix=prefix)) + source = source.with_prefix(prefix) return source @abstractmethod From 3998aba6e1c2a8e046fd52532b28e2536ce70e7d Mon Sep 17 00:00:00 2001 From: Noa Aviel Dove Date: Thu, 24 Oct 2024 23:44:59 -0700 Subject: [PATCH 12/32] Fix: Indexing integration test is complicated and inefficient (#6676) --- test/integration_test.py | 252 +++++++++++++++------------------------ 1 file changed, 95 insertions(+), 157 deletions(-) diff --git a/test/integration_test.py b/test/integration_test.py index f416724f15..005f8d3467 100644 --- a/test/integration_test.py +++ b/test/integration_test.py @@ -3,7 +3,6 @@ ) from collections.abc import ( Iterable, - Iterator, Mapping, Sequence, ) @@ -19,10 +18,8 @@ BytesIO, TextIOWrapper, ) -import itertools from itertools import ( count, - starmap, ) import json import os @@ -78,6 +75,7 @@ first, grouper, one, + only, ) from openapi_spec_validator import ( validate_spec, @@ -105,6 +103,9 @@ from azul.chalice import ( AzulChaliceApp, ) +from azul.collections import ( + alist, +) from azul.drs import ( AccessMethod, ) @@ -115,6 +116,7 @@ http_client, ) from azul.indexer import ( + Prefix, SourceJSON, SourceRef, SourceSpec, @@ -150,7 +152,6 @@ Link, ) from azul.plugins.repository.tdr_anvil import ( - BundleType, TDRAnvilBundleFQID, ) from azul.portal_service import ( @@ -284,73 +285,34 @@ def managed_access_sources_by_catalog(self managed_access_sources[catalog].add(ref) return managed_access_sources - def _list_partitions(self, - catalog: CatalogName, - *, - min_bundles: int, - public_1st: bool - ) -> Iterator[tuple[SourceRef, str, list[SourcedBundleFQID]]]: - """ - Iterate through the sources in the given catalog and yield partitions of - bundle FQIDs until a desired minimum number of bundles are found. For - each emitted source, every partition is included, even if it's empty. 
- """ - total_bundles = 0 - sources = sorted(config.sources(catalog)) - self.random.shuffle(sources) - if public_1st: - managed_access_sources = frozenset( + def _choose_source(self, + catalog: CatalogName, + *, + public: bool | None = None + ) -> SourceRef | None: + plugin = self.repository_plugin(catalog) + sources = set(config.sources(catalog)) + if public is not None: + ma_sources = { str(source.spec) + # This would raise a KeyError during the can bundle script test + # due to it using a mock catalog, so we only evaluate it when + # it's actually needed for source in self.managed_access_sources_by_catalog[catalog] - ) - index = first( - i - for i, source in enumerate(sources) - if source not in managed_access_sources - ) - sources[0], sources[index] = sources[index], sources[0] - plugin = self.azul_client.repository_plugin(catalog) - # This iteration prefers sources occurring first, so we shuffle them - # above to neutralize the bias. - for source in sources: - source = plugin.resolve_source(source) - source = plugin.partition_source(catalog, source) - for prefix in source.spec.prefix.partition_prefixes(): - new_fqids = self.azul_client.list_bundles(catalog, source, prefix) - total_bundles += len(new_fqids) - yield source, prefix, new_fqids - # We postpone this check until after we've yielded all partitions in - # the current source to ensure test coverage for handling multiple - # partitions per source - if total_bundles >= min_bundles: - break + } + self.assertIsSubset(ma_sources, sources) + if public is True: + sources -= ma_sources + elif public is False: + sources &= ma_sources + else: + assert False, public + if len(sources) == 0: + assert public is False, 'Every catalog should contain a public source' + return None else: - log.warning('Checked all sources and found only %d bundles instead of the ' - 'expected minimum %d', total_bundles, min_bundles) - - def _list_managed_access_bundles(self, - catalog: CatalogName - ) -> Iterator[tuple[SourceRef, str, list[SourcedBundleFQID]]]: - sources = self.azul_client.catalog_sources(catalog) - # We need at least one managed_access bundle per IT. To index them with - # remote_reindex and avoid collateral bundles, we use as specific a - # prefix as possible. 
- for source in self.managed_access_sources_by_catalog[catalog]: - assert str(source.spec) in sources - source = self.repository_plugin(catalog).partition_source(catalog, source) - bundle_fqids = sorted( - bundle_fqid - for bundle_fqid in self.azul_client.list_bundles(catalog, source, prefix='') - if not ( - # DUOS bundles are too sparse to fulfill the managed access tests - config.is_anvil_enabled(catalog) - and cast(TDRAnvilBundleFQID, bundle_fqid).table_name is BundleType.duos - ) - ) - bundle_fqid = self.random.choice(bundle_fqids) - prefix = bundle_fqid.uuid[:8] - new_fqids = self.azul_client.list_bundles(catalog, source, prefix) - yield source, prefix, new_fqids + source = self.random.choice(sorted(sources)) + return plugin.resolve_source(source) class IndexingIntegrationTest(IntegrationTestCase, AlwaysTearDownTestCase): @@ -426,6 +388,8 @@ class Catalog: name: CatalogName bundles: set[SourcedBundleFQID] notifications: list[JSON] + public_source: SourceRef + ma_source: SourceRef | None def _wait_for_indexer(): self.azul_client.wait_for_indexer() @@ -442,12 +406,23 @@ def _wait_for_indexer(): catalogs: list[Catalog] = [] for catalog in config.integration_test_catalogs: if index: - notifications, fqids = self._prepare_notifications(catalog) + public_source = self._choose_source(catalog, public=True) + ma_source = self._choose_source(catalog, public=False) + notifications, fqids = self._prepare_notifications(catalog, + sources=alist(public_source, ma_source)) else: - notifications, fqids = [], set() + with self._service_account_credentials: + fqids = self._get_indexed_bundles(catalog) + indexed_sources = {fqid.source for fqid in fqids} + ma_sources = self.managed_access_sources_by_catalog[catalog] + public_source = one(s for s in indexed_sources if s not in ma_sources) + ma_source = only(s for s in indexed_sources if s in ma_sources) + notifications = [] catalogs.append(Catalog(name=catalog, bundles=fqids, - notifications=notifications)) + notifications=notifications, + public_source=public_source, + ma_source=ma_source)) if index: for catalog in catalogs: @@ -463,12 +438,9 @@ def _wait_for_indexer(): self._test_manifest_tagging_race(catalog.name) self._test_dos_and_drs(catalog.name) self._test_repository_files(catalog.name) - if index: - bundle_fqids = catalog.bundles - else: - with self._service_account_credentials: - bundle_fqids = self._get_indexed_bundles(catalog.name) - self._test_managed_access(catalog=catalog.name, bundle_fqids=bundle_fqids) + self._test_managed_access(catalog=catalog.name, + public_source=catalog.public_source, + ma_source=catalog.ma_source) if index and delete: # FIXME: Test delete notifications @@ -1226,33 +1198,21 @@ def _validate_fastq_content(self, content: ReadableFileObject): self.assertTrue(lines[2].startswith(b'+')) def _prepare_notifications(self, - catalog: CatalogName + catalog: CatalogName, + sources: Iterable[SourceRef] ) -> tuple[JSONs, set[SourcedBundleFQID]]: - bundle_fqids: set[SourcedBundleFQID] = set() + plugin = self.repository_plugin(catalog) + bundle_fqids = set() notifications = [] - - def update(source: SourceRef, - prefix: str, - partition_bundle_fqids: Iterable[SourcedBundleFQID]): - bundle_fqids.update(partition_bundle_fqids) - notifications.append(self.azul_client.reindex_message(catalog, - source, - prefix)) - - list(starmap(update, self._list_managed_access_bundles(catalog))) - num_bundles = max(self.min_bundles - len(bundle_fqids), 1) - log.info('Selected %d bundles to satisfy managed access coverage; ' - 'selecting at least %d 
more', len(bundle_fqids), num_bundles) - # _list_partitions selects both public and managed access sources at random. - # If we don't index at least one public source, every request would need - # service account credentials and we couldn't compare the responses for - # public and managed access data. `public_1st` ensures that at least - # one of the sources will be public because sources are indexed starting - # with the first one yielded by the iteration. - list(starmap(update, self._list_partitions(catalog, - min_bundles=num_bundles, - public_1st=True))) - + for source in sources: + source = plugin.partition_source(catalog, source) + # Some partitions may be empty, but we include them anyway to + # ensure test coverage for handling multiple partitions per source + for partition_prefix in source.spec.prefix.partition_prefixes(): + bundle_fqids.update(self.azul_client.list_bundles(catalog, source, partition_prefix)) + notifications.append(self.azul_client.reindex_message(catalog, + source, + partition_prefix)) # Index some bundles again to test that we handle duplicate additions. # Note: random.choices() may pick the same element multiple times so # some notifications may end up being sent three or more times. @@ -1375,40 +1335,34 @@ def _assert_indices_exist(self, catalog: CatalogName): def _test_managed_access(self, catalog: CatalogName, - bundle_fqids: set[SourcedBundleFQID] + public_source: SourceRef, + ma_source: SourceRef | None, ) -> None: with self.subTest('managed_access', catalog=catalog): - indexed_source_ids = {fqid.source.id for fqid in bundle_fqids} - managed_access_sources = self.managed_access_sources_by_catalog[catalog] - managed_access_source_ids = {source.id for source in managed_access_sources} - self.assertIsSubset(managed_access_source_ids, indexed_source_ids) - - if not managed_access_sources: + if ma_source is None: if config.deployment_stage in ('dev', 'sandbox'): # There should always be at least one managed-access source # indexed and tested on the default catalog for these deployments self.assertNotEqual(catalog, config.it_catalog_for(config.default_catalog)) self.skipTest(f'No managed access sources found in catalog {catalog!r}') - with self.subTest('managed_access_indices', catalog=catalog): - self._test_managed_access_indices(catalog, managed_access_source_ids) + self._test_managed_access_indices(catalog, public_source, ma_source) with self.subTest('managed_access_repository_files', catalog=catalog): - files = self._test_managed_access_repository_files(catalog, managed_access_source_ids) + files = self._test_managed_access_repository_files(catalog, ma_source) with self.subTest('managed_access_summary', catalog=catalog): self._test_managed_access_summary(catalog, files) with self.subTest('managed_access_repository_sources', catalog=catalog): - public_source_ids = self._test_managed_access_repository_sources(catalog, - indexed_source_ids, - managed_access_source_ids) - with self.subTest('managed_access_manifest', catalog=catalog): - source_id = self.random.choice(sorted(public_source_ids & indexed_source_ids)) - self._test_managed_access_manifest(catalog, files, source_id) + self._test_managed_access_repository_sources(catalog, + public_source, + ma_source) + with self.subTest('managed_access_manifest', catalog=catalog): + self._test_managed_access_manifest(catalog, files, public_source) def _test_managed_access_repository_sources(self, catalog: CatalogName, - indexed_source_ids: set[str], - managed_access_source_ids: set[str] - ) -> set[str]: + public_source: 
SourceRef, + ma_source: SourceRef + ) -> None: """ Test the managed access controls for the /repository/sources endpoint :return: the set of public sources @@ -1421,11 +1375,14 @@ def list_source_ids() -> set[str]: return {source['sourceId'] for source in cast(JSONs, response['sources'])} with self._service_account_credentials: - self.assertIsSubset(indexed_source_ids, list_source_ids()) + self.assertIsSubset({public_source.id, ma_source.id}, list_source_ids()) with self._public_service_account_credentials: public_source_ids = list_source_ids() + self.assertIn(public_source.id, public_source_ids) + self.assertNotIn(ma_source.id, public_source_ids) with self._unregistered_service_account_credentials: self.assertEqual(public_source_ids, list_source_ids()) + self.assertEqual(public_source_ids, list_source_ids()) invalid_auth = OAuth2('foo') with self.assertRaises(UnauthorizedError): TDRClient.for_registered_user(invalid_auth) @@ -1433,13 +1390,11 @@ def list_source_ids() -> set[str]: invalid_client = OAuth2Client(credentials_provider=invalid_provider) with self._authorization_context(invalid_client): self.assertEqual(401, self._get_url_unchecked(GET, url).status) - self.assertEqual(set(), list_source_ids() & managed_access_source_ids) - self.assertEqual(public_source_ids, list_source_ids()) - return public_source_ids def _test_managed_access_indices(self, catalog: CatalogName, - managed_access_source_ids: set[str] + public_source: SourceRef, + ma_source: SourceRef ) -> JSONs: """ Test the managed-access controls for the /index/bundles and @@ -1449,11 +1404,6 @@ def _test_managed_access_indices(self, """ special_fields = self.metadata_plugin(catalog).special_fields - - def source_id_from_hit(hit: JSON) -> str: - sources: JSONs = hit['sources'] - return one(sources)[special_fields.source_id] - bundle_type = self._bundle_type(catalog) project_type = self._project_type(catalog) @@ -1466,31 +1416,22 @@ def source_id_from_hit(hit: JSON) -> str: hits = self._get_entities(catalog, project_type, filters=filters) if accessible is None: unfiltered_hits = hits - accessible_sources, inaccessible_sources = set(), set() for hit in hits: - source_id = source_id_from_hit(hit) - source_accessible = source_id not in managed_access_source_ids + source_id = one(hit['sources'])[special_fields.source_id] + source_accessible = {public_source.id: True, ma_source.id: False}[source_id] hit_accessible = one(hit[project_type])[special_fields.accessible] self.assertEqual(source_accessible, hit_accessible, hit['entryId']) if accessible is not None: self.assertEqual(accessible, hit_accessible) - if source_accessible: - accessible_sources.add(source_id) - else: - inaccessible_sources.add(source_id) - self.assertIsDisjoint(accessible_sources, inaccessible_sources) - self.assertIsDisjoint(managed_access_source_ids, accessible_sources) - self.assertEqual(set() if accessible else managed_access_source_ids, - inaccessible_sources) self.assertIsNotNone(unfiltered_hits, 'Cannot recover from subtest failure') bundle_fqids = self._get_indexed_bundles(catalog) hit_source_ids = {fqid.source.id for fqid in bundle_fqids} - self.assertEqual(set(), hit_source_ids & managed_access_source_ids) + self.assertEqual(hit_source_ids, {public_source.id}) source_filter = { special_fields.source_id: { - 'is': list(managed_access_source_ids) + 'is': [ma_source.id] } } params = { @@ -1499,18 +1440,18 @@ def source_id_from_hit(hit: JSON) -> str: } url = config.service_endpoint.set(path=('index', bundle_type), args=params) response = 
self._get_url_unchecked(GET, url) - self.assertEqual(403 if managed_access_source_ids else 200, response.status) + self.assertEqual(403, response.status) with self._service_account_credentials: bundle_fqids = self._get_indexed_bundles(catalog, filters=source_filter) hit_source_ids = {fqid.source.id for fqid in bundle_fqids} - self.assertEqual(managed_access_source_ids, hit_source_ids) + self.assertEqual({ma_source.id}, hit_source_ids) return unfiltered_hits def _test_managed_access_repository_files(self, catalog: CatalogName, - managed_access_source_ids: set[str] + ma_source: SourceRef ) -> JSONs: """ Test the managed access controls for the /repository/files endpoint @@ -1520,7 +1461,7 @@ def _test_managed_access_repository_files(self, with self._service_account_credentials: files = self._get_entities(catalog, 'files', filters={ special_fields.source_id: { - 'is': list(managed_access_source_ids) + 'is': [ma_source.id] } }) managed_access_file_urls = { @@ -1557,7 +1498,7 @@ def _get_summary_file_count() -> int: def _test_managed_access_manifest(self, catalog: CatalogName, files: JSONs, - source_id: str + public_source: SourceRef ) -> None: """ Test the managed access controls for the /manifest/files endpoint and @@ -1579,7 +1520,7 @@ def bundle_uuids(hit: JSON) -> set[str]: for file in files if len(file['sources']) == 1 )) - filters = {special_fields.source_id: {'is': [source_id]}} + filters = {special_fields.source_id: {'is': [public_source.id]}} params = {'size': 1, 'catalog': catalog, 'filters': json.dumps(filters)} files_url = furl(url=endpoint, path='index/files', args=params) response = self._get_url_json(GET, files_url) @@ -1944,13 +1885,10 @@ def test_can_bundle_canned_repository(self): self._test_catalog(mock_catalog) def bundle_fqid(self, catalog: CatalogName) -> SourcedBundleFQID: - # Skip through empty partitions - bundle_fqids = itertools.chain.from_iterable( - bundle_fqids - for _, _, bundle_fqids in self._list_partitions(catalog, - min_bundles=1, - public_1st=False) - ) + source = self._choose_source(catalog) + # The plugin will raise an exception if the source lacks a prefix + source = source.with_prefix(Prefix.of_everything) + bundle_fqids = self.repository_plugin(catalog).list_bundles(source, '') return self.random.choice(sorted(bundle_fqids)) def _can_bundle(self, From f841bf726e6fcfb622f6eb1a5b40103ff5e5d143 Mon Sep 17 00:00:00 2001 From: Noa Aviel Dove Date: Tue, 22 Oct 2024 18:59:21 -0700 Subject: [PATCH 13/32] Fix docstring formatting --- src/azul/indexer/document.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/src/azul/indexer/document.py b/src/azul/indexer/document.py index f082193e6d..fe5430967e 100644 --- a/src/azul/indexer/document.py +++ b/src/azul/indexer/document.py @@ -504,13 +504,12 @@ class ContributionCoordinates(DocumentCoordinates[E], Generic[E]): subgraph ("bundle") and represent either the addition of metadata to an entity or the removal of metadata from an entity. - Contributions produced by - transformers don't specify a catalog, the catalog is supplied when the - contributions are written to the index and it is guaranteed to be the same - for all contributions produced in response to one notification. When - contributions are read back during aggregation, they specify a catalog, the - catalog they were read from. Because of that duality this class has to be - generic in E, the type of EntityReference. + Contributions produced by transformers don't specify a catalog. 
The catalog + is supplied when the contributions are written to the index and it is + guaranteed to be the same for all contributions produced in response to one + notification. When contributions are read back during aggregation, they + specify a catalog, the catalog they were read from. Because of that duality + this class has to be generic in E, the type of EntityReference. """ doc_type: ClassVar[DocumentType] = DocumentType.contribution From 58a671a78812429c4161d3c86a8cebc8d5ff5695 Mon Sep 17 00:00:00 2001 From: Noa Aviel Dove Date: Fri, 4 Oct 2024 17:47:35 -0700 Subject: [PATCH 14/32] Populate empty/missing fields in canned tables --- ...09d-46a4-b845-7584df39263b.tables.tdr.json | 20 +++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/test/indexer/data/6c87f0e1-509d-46a4-b845-7584df39263b.tables.tdr.json b/test/indexer/data/6c87f0e1-509d-46a4-b845-7584df39263b.tables.tdr.json index 422b461de6..c1783920e2 100644 --- a/test/indexer/data/6c87f0e1-509d-46a4-b845-7584df39263b.tables.tdr.json +++ b/test/indexer/data/6c87f0e1-509d-46a4-b845-7584df39263b.tables.tdr.json @@ -3,8 +3,9 @@ "anvil_activity": { "rows": [ { - "activity_id": "", - "activity_type": "", + "activity_id": "95aa5a62-0c42-48c7-aa70-e9987cfd9824", + "activity_type": "other", + "datarepo_row_id": "85342e55-8897-481e-a2c3-b7344071a72c", "generated_file_id": [], "source_datarepo_row_ids": [], "used_biosample_id": [], @@ -15,9 +16,10 @@ "anvil_alignmentactivity": { "rows": [ { - "activity_type": "", - "alignmentactivity_id": "", + "activity_type": "alignment", + "alignmentactivity_id": "bb22e6de-7704-4435-8684-a34b1e4ff188", "data_modality": [], + "datarepo_row_id": "a6f3f829-dec6-4be8-9d53-2c963f9206fc", "generated_file_id": [], "reference_assembly": [], "source_datarepo_row_ids": [], @@ -28,11 +30,12 @@ "anvil_assayactivity": { "rows": [ { - "activity_type": "", + "activity_type": "assay", "antibody_id": [], "assay_type": "", - "assayactivity_id": "", + "assayactivity_id": "d9150ab0-8e4a-46ab-9501-4b91c6f548e1", "data_modality": [], + "datarepo_row_id": "9258bb9f-1e2b-4139-9226-3588154a4632", "generated_file_id": [], "source_datarepo_row_ids": [], "used_biosample_id": [] @@ -241,13 +244,14 @@ "anvil_variantcallingactivity": { "rows": [ { - "activity_type": "", + "activity_type": "variant calling", "data_modality": [], + "datarepo_row_id": "c826d5b0-5e86-432b-ab53-36eced42ece0", "generated_file_id": [], "reference_assembly": [], "source_datarepo_row_ids": [], "used_file_id": [], - "variantcallingactivity_id": "" + "variantcallingactivity_id": "8db89e5e-1c8a-487e-b0c1-b26218ac6b7b" } ] } From 6c1c601a45adc95a018e8ed2f94f5f47d8d88473 Mon Sep 17 00:00:00 2001 From: Noa Aviel Dove Date: Fri, 4 Oct 2024 18:02:23 -0700 Subject: [PATCH 15/32] Replace `+` with f-string --- src/azul/uuids.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/azul/uuids.py b/src/azul/uuids.py index 194217e9e5..ad1017ad1d 100644 --- a/src/azul/uuids.py +++ b/src/azul/uuids.py @@ -124,7 +124,7 @@ def change_version(uuid: str, old_version: int, new_version: int) -> str: prefix, version, suffix = uuid[:14], uuid[14], uuid[15:] version = int(version, 16) assert version == old_version, (uuid, version, old_version) - uuid = prefix + hex(new_version)[2:] + suffix + uuid = f'{prefix}{new_version:x}{suffix}' assert UUID(uuid).version == new_version, (uuid, old_version) if new_version in (1, 3, 4, 5): validate_uuid(uuid) From 96747c3947fe56e2db6bd4bc3130e651ae2e88bc Mon Sep 17 00:00:00 2001 From: Noa Aviel Dove 
Date: Fri, 25 Oct 2024 16:18:53 -0700 Subject: [PATCH 16/32] Make bundle-counting method public --- scripts/post_deploy_tdr.py | 2 +- src/azul/plugins/__init__.py | 7 ++++--- src/azul/plugins/repository/canned/__init__.py | 11 +++++++++-- src/azul/plugins/repository/dss/__init__.py | 2 +- src/azul/plugins/repository/tdr_anvil/__init__.py | 6 ++++-- src/azul/plugins/repository/tdr_hca/__init__.py | 13 ++++++++----- 6 files changed, 27 insertions(+), 14 deletions(-) diff --git a/scripts/post_deploy_tdr.py b/scripts/post_deploy_tdr.py index 98c680f312..40411592fe 100644 --- a/scripts/post_deploy_tdr.py +++ b/scripts/post_deploy_tdr.py @@ -102,7 +102,7 @@ def verify_source(self, require(prefix.common == '', source_spec) self.tdr.check_bigquery_access(source_spec) else: - subgraph_count = len(plugin.list_bundles(ref, prefix.common)) + subgraph_count = plugin.count_bundles(ref.spec) require(subgraph_count > 0, 'Common prefix is too long', ref.spec) require(subgraph_count <= 512, 'Common prefix is too short', ref.spec) diff --git a/src/azul/plugins/__init__.py b/src/azul/plugins/__init__.py index 893499fb2b..c7dffb65e6 100644 --- a/src/azul/plugins/__init__.py +++ b/src/azul/plugins/__init__.py @@ -615,9 +615,10 @@ def _lookup_source_id(self, spec: SOURCE_SPEC) -> str: raise NotImplementedError @abstractmethod - def _count_subgraphs(self, source: SOURCE_SPEC) -> int: + def count_bundles(self, source: SOURCE_SPEC) -> int: """ - The total number of subgraphs in the given source, ignoring its prefix. + The total number of subgraphs in the given source. The source's prefix + may be None. """ raise NotImplementedError @@ -631,7 +632,7 @@ def partition_source(self, should be appropriate for indexing in the given catalog. """ if source.spec.prefix is None: - count = self._count_subgraphs(source.spec) + count = self.count_bundles(source.spec) is_main = config.deployment.is_main is_it = catalog in config.integration_test_catalogs # We use the "lesser" heuristic during IT to avoid indexing an diff --git a/src/azul/plugins/repository/canned/__init__.py b/src/azul/plugins/repository/canned/__init__.py index 3c2705fdf5..ea4c35f9ef 100644 --- a/src/azul/plugins/repository/canned/__init__.py +++ b/src/azul/plugins/repository/canned/__init__.py @@ -26,6 +26,9 @@ from furl import ( furl, ) +from more_itertools import ( + ilen, +) from azul import ( CatalogName, @@ -163,9 +166,13 @@ def staging_area(self, url: str) -> StagingArea: ref) return factory.load_staging_area(path) - def _count_subgraphs(self, source: SOURCE_SPEC) -> int: + def count_bundles(self, source: SOURCE_SPEC) -> int: staging_area = self.staging_area(source.spec.name) - return len(staging_area.links) + return ilen( + links_id + for links_id in staging_area.links + if source.prefix is None or links_id.startswith(source.prefix.common) + ) def list_bundles(self, source: CannedSourceRef, diff --git a/src/azul/plugins/repository/dss/__init__.py b/src/azul/plugins/repository/dss/__init__.py index 96a6da973a..4e5809f4a6 100644 --- a/src/azul/plugins/repository/dss/__init__.py +++ b/src/azul/plugins/repository/dss/__init__.py @@ -118,7 +118,7 @@ def sources(self) -> AbstractSet[SimpleSourceSpec]: def _lookup_source_id(self, spec: SimpleSourceSpec) -> str: return DSSSourceRef.id_from_spec(spec) - def _count_subgraphs(self, source: SimpleSourceSpec) -> NoReturn: + def count_bundles(self, source: SimpleSourceSpec) -> NoReturn: assert False, 'DSS is EOL' def list_sources(self, diff --git a/src/azul/plugins/repository/tdr_anvil/__init__.py 
b/src/azul/plugins/repository/tdr_anvil/__init__.py index 3d7afc2b9c..13449cd52c 100644 --- a/src/azul/plugins/repository/tdr_anvil/__init__.py +++ b/src/azul/plugins/repository/tdr_anvil/__init__.py @@ -169,14 +169,16 @@ def _version(self): datarepo_row_uuid_version = 4 bundle_uuid_version = 10 - def _count_subgraphs(self, source: TDRSourceSpec) -> int: + def count_bundles(self, source: TDRSourceSpec) -> int: + prefix = '' if source.prefix is None else source.prefix.common rows = self._run_sql(f''' SELECT COUNT(*) AS count FROM {backtick(self._full_table_name(source, BundleType.primary.value))} + WHERE STARTS_WITH(datarepo_row_id, {prefix!r}) UNION ALL SELECT COUNT(*) AS count FROM {backtick(self._full_table_name(source, BundleType.supplementary.value))} - WHERE is_supplementary + WHERE is_supplementary AND STARTS_WITH(datarepo_row_id, {prefix!r}) ''') return sum(row['count'] for row in rows) diff --git a/src/azul/plugins/repository/tdr_hca/__init__.py b/src/azul/plugins/repository/tdr_hca/__init__.py index 320b0a4f1b..2f10c564b4 100644 --- a/src/azul/plugins/repository/tdr_hca/__init__.py +++ b/src/azul/plugins/repository/tdr_hca/__init__.py @@ -278,11 +278,14 @@ def _parse_drs_uri(self, class Plugin(TDRPlugin[TDRHCABundle, TDRSourceSpec, TDRSourceRef, TDRBundleFQID]): - def _count_subgraphs(self, source: TDRSourceSpec) -> int: - rows = self._run_sql(f''' - SELECT COUNT(*) AS count - FROM {backtick(self._full_table_name(source, 'links'))} - ''') + def count_bundles(self, source: TDRSourceSpec) -> int: + prefix = '' if source.prefix is None else source.prefix.common + query = f''' + SELECT COUNT(*) AS count + FROM {backtick(self._full_table_name(source, 'links'))} + WHERE STARTS_WITH(datarepo_row_id, {prefix!r}) + ''' + rows = self._run_sql(query) return one(rows)['count'] def _list_bundles(self, From ec2e7a2fd6a637dadb818a25b44d5a9c3fccc1ee Mon Sep 17 00:00:00 2001 From: Noa Aviel Dove Date: Thu, 31 Oct 2024 02:58:15 -0700 Subject: [PATCH 17/32] Refactor list_bundles methods --- src/azul/azulclient.py | 6 +++++- .../plugins/repository/canned/__init__.py | 21 +++++++------------ src/azul/plugins/repository/tdr.py | 18 ---------------- .../plugins/repository/tdr_anvil/__init__.py | 9 ++++---- .../plugins/repository/tdr_hca/__init__.py | 9 ++++---- 5 files changed, 22 insertions(+), 41 deletions(-) diff --git a/src/azul/azulclient.py b/src/azul/azulclient.py index 67a90a1f62..5917dfc533 100644 --- a/src/azul/azulclient.py +++ b/src/azul/azulclient.py @@ -225,7 +225,11 @@ def list_bundles(self, source = plugin.resolve_source(source) else: assert isinstance(source, SourceRef), source - return plugin.list_bundles(source, prefix) + log.info('Listing bundles with prefix %r in source %r.', prefix, source) + bundle_fqids = plugin.list_bundles(source, prefix) + log.info('There are %i bundle(s) with prefix %r in source %r.', + len(bundle_fqids), prefix, source) + return bundle_fqids @property def sqs(self): diff --git a/src/azul/plugins/repository/canned/__init__.py b/src/azul/plugins/repository/canned/__init__.py index ea4c35f9ef..be4257c134 100644 --- a/src/azul/plugins/repository/canned/__init__.py +++ b/src/azul/plugins/repository/canned/__init__.py @@ -64,9 +64,6 @@ from azul.types import ( JSON, ) -from azul.uuids import ( - validate_uuid_prefix, -) from humancellatlas.data.metadata.helpers.staging_area import ( CannedStagingAreaFactory, StagingArea, @@ -179,18 +176,14 @@ def list_bundles(self, prefix: str ) -> list[CannedBundleFQID]: self._assert_source(source) - validate_uuid_prefix(prefix) 
- log.info('Listing bundles with prefix %r in source %r.', prefix, source) - bundle_fqids = [] staging_area = self.staging_area(source.spec.name) - for link in staging_area.links.values(): - if link.uuid.startswith(prefix): - bundle_fqids.append(CannedBundleFQID(source=source, - uuid=link.uuid, - version=link.version)) - log.info('There are %i bundle(s) with prefix %r in source %r.', - len(bundle_fqids), prefix, source) - return bundle_fqids + return [ + CannedBundleFQID(source=source, + uuid=link.uuid, + version=link.version) + for link in staging_area.links.values() + if link.uuid.startswith(prefix) + ] def fetch_bundle(self, bundle_fqid: CannedBundleFQID) -> CannedBundle: self._assert_source(bundle_fqid.source) diff --git a/src/azul/plugins/repository/tdr.py b/src/azul/plugins/repository/tdr.py index a45cf96483..0ca79cd786 100644 --- a/src/azul/plugins/repository/tdr.py +++ b/src/azul/plugins/repository/tdr.py @@ -191,17 +191,6 @@ def _drs_client(cls, def _lookup_source_id(self, spec: TDRSourceSpec) -> str: return self.tdr.lookup_source(spec) - def list_bundles(self, - source: TDRSourceRef, - prefix: str - ) -> list[TDRBundleFQID]: - self._assert_source(source) - log.info('Listing bundles with prefix %r in source %r.', prefix, source) - bundle_fqids = self._list_bundles(source, prefix) - log.info('There are %i bundle(s) with prefix %r in source %r.', - len(bundle_fqids), prefix, source) - return bundle_fqids - def fetch_bundle(self, bundle_fqid: TDRBundleFQID) -> TDR_BUNDLE: self._assert_source(bundle_fqid.source) now = time.time() @@ -223,13 +212,6 @@ def _run_sql(self, query): def _full_table_name(self, source: TDRSourceSpec, table_name: str) -> str: return source.qualify_table(table_name) - @abstractmethod - def _list_bundles(self, - source: TDRSourceRef, - prefix: str - ) -> list[TDRBundleFQID]: - raise NotImplementedError - @abstractmethod def _emulate_bundle(self, bundle_fqid: TDRBundleFQID) -> TDR_BUNDLE: raise NotImplementedError diff --git a/src/azul/plugins/repository/tdr_anvil/__init__.py b/src/azul/plugins/repository/tdr_anvil/__init__.py index 13449cd52c..a02af77958 100644 --- a/src/azul/plugins/repository/tdr_anvil/__init__.py +++ b/src/azul/plugins/repository/tdr_anvil/__init__.py @@ -182,10 +182,11 @@ def count_bundles(self, source: TDRSourceSpec) -> int: ''') return sum(row['count'] for row in rows) - def _list_bundles(self, - source: TDRSourceRef, - prefix: str - ) -> list[TDRAnvilBundleFQID]: + def list_bundles(self, + source: TDRSourceRef, + prefix: str + ) -> list[TDRAnvilBundleFQID]: + self._assert_source(source) spec = source.spec primary = BundleType.primary.value supplementary = BundleType.supplementary.value diff --git a/src/azul/plugins/repository/tdr_hca/__init__.py b/src/azul/plugins/repository/tdr_hca/__init__.py index 2f10c564b4..6114255287 100644 --- a/src/azul/plugins/repository/tdr_hca/__init__.py +++ b/src/azul/plugins/repository/tdr_hca/__init__.py @@ -288,10 +288,11 @@ def count_bundles(self, source: TDRSourceSpec) -> int: rows = self._run_sql(query) return one(rows)['count'] - def _list_bundles(self, - source: TDRSourceRef, - prefix: str - ) -> list[TDRBundleFQID]: + def list_bundles(self, + source: TDRSourceRef, + prefix: str + ) -> list[TDRBundleFQID]: + self._assert_source(source) current_bundles = self._query_unique_sorted(f''' SELECT links_id, version FROM {backtick(self._full_table_name(source.spec, 'links'))} From d9280cc53db95c70d9d7ebc15c56f01fc17b2fec Mon Sep 17 00:00:00 2001 From: Noa Aviel Dove Date: Mon, 28 Oct 2024 18:54:15 -0700 
Subject: [PATCH 18/32] Improve BigQuery logging --- src/azul/plugins/repository/tdr_anvil/__init__.py | 3 ++- src/azul/plugins/repository/tdr_hca/__init__.py | 3 ++- src/azul/terra.py | 8 ++++++-- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/src/azul/plugins/repository/tdr_anvil/__init__.py b/src/azul/plugins/repository/tdr_anvil/__init__.py index a02af77958..20e9f8a872 100644 --- a/src/azul/plugins/repository/tdr_anvil/__init__.py +++ b/src/azul/plugins/repository/tdr_anvil/__init__.py @@ -660,7 +660,8 @@ def convert_column(value): log.debug('Retrieved %i entities of type %r', len(rows), entity_type) missing = keys - {row[pk_column] for row in rows} require(not missing, - f'Required entities not found in {table_name}: {missing}') + f'Found only {len(rows)} out of {len(keys)} expected rows in {table_name}. ' + f'Missing entities: {missing}') return rows else: return [] diff --git a/src/azul/plugins/repository/tdr_hca/__init__.py b/src/azul/plugins/repository/tdr_hca/__init__.py index 6114255287..e022196832 100644 --- a/src/azul/plugins/repository/tdr_hca/__init__.py +++ b/src/azul/plugins/repository/tdr_hca/__init__.py @@ -470,7 +470,8 @@ def _retrieve_entities(self, log.debug('Retrieved %i entities of type %r', len(rows), entity_type) missing = expected - {row[pk_column] for row in rows} require(not missing, - f'Required entities not found in {table_name}: {missing}') + f'Found only {len(rows)} out of {len(entity_ids)} expected rows in {table_name}. ' + f'Missing entities: {missing}') return rows def _in(self, diff --git a/src/azul/terra.py b/src/azul/terra.py index a468c382e3..5e14fa3c50 100644 --- a/src/azul/terra.py +++ b/src/azul/terra.py @@ -43,6 +43,9 @@ QueryJobConfig, QueryPriority, ) +from google.cloud.bigquery.table import ( + RowIterator, +) from more_itertools import ( one, ) @@ -504,13 +507,13 @@ def run_sql(self, query: str) -> BigQueryRows: else: assert False if log.isEnabledFor(logging.DEBUG): - log.debug('Job info: %s', json.dumps(self._job_info(job))) + log.debug('Job info: %s', json.dumps(self._job_info(job, result))) return result def _trunc_query(self, query: str) -> str: return trunc_ellipses(query, 2048) - def _job_info(self, job: QueryJob) -> JSON: + def _job_info(self, job: QueryJob, result: RowIterator) -> JSON: # noinspection PyProtectedMember stats = job._properties['statistics']['query'] if config.debug < 2: @@ -518,6 +521,7 @@ def _job_info(self, job: QueryJob) -> JSON: stats = {k: v for k, v in stats.items() if k not in ignore} return { 'job_id': job.job_id, + 'total_rows': result.total_rows, 'stats': stats, 'query': self._trunc_query(job.query) } From c8a084fca68ba4bd35d152ce086b9d38a88bbd49 Mon Sep 17 00:00:00 2001 From: Noa Aviel Dove Date: Sun, 3 Nov 2024 15:57:15 -0800 Subject: [PATCH 19/32] [r] Emit replicas for AnVIL entities not linked to files --- .../metadata/anvil/indexer/transform.py | 23 +++++++++++++------ test/indexer/test_anvil.py | 4 +--- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/src/azul/plugins/metadata/anvil/indexer/transform.py b/src/azul/plugins/metadata/anvil/indexer/transform.py index 858899bd59..ca181b789d 100644 --- a/src/azul/plugins/metadata/anvil/indexer/transform.py +++ b/src/azul/plugins/metadata/anvil/indexer/transform.py @@ -564,6 +564,22 @@ def _singleton(self) -> EntityReference: return EntityReference(entity_type='bundle', entity_id=self.bundle.uuid) + def transform(self, + partition: BundlePartition + ) -> Iterable[Contribution | Replica]: + yield from super().transform(partition) + 
if config.enable_replicas: + # Replicas are only emitted by the file transformer for entities + # that are linked to at least one file. This excludes a small number + # of linked entities, usually from primary bundles that don't + # include any files. Some of the replicas we emit here will be + # redundant with those emitted by the file transformer, but these + # will be coalesced by the index service before they are written to + # ElasticSearch. + for entity in self.bundle.entities: + if partition.contains(UUID(entity.entity_id)): + yield self._replica(entity, file_hub=None) + class DatasetTransformer(SingletonTransformer): @@ -574,13 +590,6 @@ def entity_type(cls) -> str: def _singleton(self) -> EntityReference: return self._only_dataset() - def _transform(self, - entity: EntityReference - ) -> Iterable[Contribution | Replica]: - yield from super()._transform(entity) - if self._is_duos(entity): - yield self._replica(entity, file_hub=None) - class DonorTransformer(BaseTransformer): diff --git a/test/indexer/test_anvil.py b/test/indexer/test_anvil.py index 263f833d1f..0b9f56b287 100644 --- a/test/indexer/test_anvil.py +++ b/test/indexer/test_anvil.py @@ -233,7 +233,5 @@ def test_dataset_description(self): self.assertDictEqual(doc_counts, { DocumentType.aggregate: 1, DocumentType.contribution: 2, - # No replica is emitted for the primary dataset because we dropped - # the files (hubs) from its bundle above - **({DocumentType.replica: 1} if config.enable_replicas else {}) + **({DocumentType.replica: 2} if config.enable_replicas else {}) }) From 104a73bde6f7b469f19eae20f0969f5c348ab37c Mon Sep 17 00:00:00 2001 From: Noa Aviel Dove Date: Sun, 3 Nov 2024 15:57:37 -0800 Subject: [PATCH 20/32] [r] Index orphaned replicas (#6626) --- scripts/can_bundle.py | 8 + scripts/post_deploy_tdr.py | 6 +- src/azul/indexer/index_service.py | 3 +- src/azul/plugins/metadata/anvil/bundle.py | 27 +- .../metadata/anvil/indexer/transform.py | 29 +- .../plugins/repository/tdr_anvil/__init__.py | 426 ++++++++++++------ src/azul/terra.py | 9 + terraform/authentication.tf.json.template.py | 1 + ...2783-aeb6-afea-e022897f4dcf.tdr.anvil.json | 27 +- ...09d-46a4-b845-7584df39263b.tables.tdr.json | 16 + ...e274-affe-aabc-eb3db63ad068.tdr.anvil.json | 3 +- ...069b-a90e-b588-eaf8d1aeeac9.tdr.anvil.json | 44 ++ ...a9c-a3b9-bc91-eb10a428374a.tdr.anvil.json} | 44 +- test/indexer/test_anvil.py | 57 ++- test/indexer/test_tdr.py | 4 +- test/integration_test.py | 24 +- test/service/test_manifest.py | 6 +- 17 files changed, 562 insertions(+), 172 deletions(-) create mode 100644 test/indexer/data/9a135c9a-069b-a90e-b588-eaf8d1aeeac9.tdr.anvil.json rename test/indexer/data/{6b0f6c0f-5d80-a242-accb-840921351cd5.tdr.anvil.json => c67e7adb-1a9c-a3b9-bc91-eb10a428374a.tdr.anvil.json} (51%) diff --git a/scripts/can_bundle.py b/scripts/can_bundle.py index 9e508ce207..a2021b27c1 100644 --- a/scripts/can_bundle.py +++ b/scripts/can_bundle.py @@ -61,6 +61,9 @@ def main(argv): help='The version of the bundle to can. Required for HCA, ignored for AnVIL.') parser.add_argument('--table-name', help='The BigQuery table of the bundle to can. Only applicable for AnVIL.') + parser.add_argument('--batch-prefix', + help='The batch prefix of the bundle to can. Only applicable for AnVIL. 
' + 'Use "null" for non-batched bundle formats.') parser.add_argument('--output-dir', '-O', default=os.path.join(config.project_root, 'test', 'indexer', 'data'), help='The path to the output directory (default: %(default)s).') @@ -78,6 +81,11 @@ def parse_fqid_fields(args: argparse.Namespace) -> JSON: fields = {'uuid': args.uuid, 'version': args.version} if args.table_name is not None: fields['table_name'] = args.table_name + batch_prefix = args.batch_prefix + if batch_prefix is not None: + if batch_prefix == 'null': + batch_prefix = None + fields['batch_prefix'] = batch_prefix return fields diff --git a/scripts/post_deploy_tdr.py b/scripts/post_deploy_tdr.py index 40411592fe..a6566345ea 100644 --- a/scripts/post_deploy_tdr.py +++ b/scripts/post_deploy_tdr.py @@ -96,12 +96,12 @@ def verify_source(self, plugin = self.repository_plugin(catalog) ref = plugin.resolve_source(str(source_spec)) log.info('TDR client is authorized for API access to %s.', source_spec) - ref = plugin.partition_source(catalog, ref) - prefix = ref.spec.prefix if config.deployment.is_main: - require(prefix.common == '', source_spec) + if source_spec.prefix is not None: + require(source_spec.prefix.common == '', source_spec) self.tdr.check_bigquery_access(source_spec) else: + ref = plugin.partition_source(catalog, ref) subgraph_count = plugin.count_bundles(ref.spec) require(subgraph_count > 0, 'Common prefix is too long', ref.spec) require(subgraph_count <= 512, 'Common prefix is too short', ref.spec) diff --git a/src/azul/indexer/index_service.py b/src/azul/indexer/index_service.py index 25a3c6d40f..20c28a7beb 100644 --- a/src/azul/indexer/index_service.py +++ b/src/azul/indexer/index_service.py @@ -212,7 +212,8 @@ def index(self, catalog: CatalogName, bundle: Bundle) -> None: for contributions, replicas in transforms: tallies.update(self.contribute(catalog, contributions)) self.replicate(catalog, replicas) - self.aggregate(tallies) + if tallies: + self.aggregate(tallies) def delete(self, catalog: CatalogName, bundle: Bundle) -> None: """ diff --git a/src/azul/plugins/metadata/anvil/bundle.py b/src/azul/plugins/metadata/anvil/bundle.py index 8df46cb8c5..7e6e064f8a 100644 --- a/src/azul/plugins/metadata/anvil/bundle.py +++ b/src/azul/plugins/metadata/anvil/bundle.py @@ -122,6 +122,7 @@ class AnvilBundle(Bundle[BUNDLE_FQID], ABC): # the former to the latter during transformation. 
entities: dict[EntityReference, MutableJSON] = attrs.field(factory=dict) links: set[EntityLink] = attrs.field(factory=set) + orphans: dict[EntityReference, MutableJSON] = attrs.field(factory=dict) def reject_joiner(self, catalog: CatalogName): # FIXME: Optimize joiner rejection and re-enable it for AnVIL @@ -129,21 +130,29 @@ def reject_joiner(self, catalog: CatalogName): pass def to_json(self) -> MutableJSON: - return { - 'entities': { + def serialize_entities(entities): + return { str(entity_ref): entity - for entity_ref, entity in sorted(self.entities.items()) - }, + for entity_ref, entity in sorted(entities.items()) + } + + return { + 'entities': serialize_entities(self.entities), + 'orphans': serialize_entities(self.orphans), 'links': [link.to_json() for link in sorted(self.links)] } @classmethod def from_json(cls, fqid: BUNDLE_FQID, json_: JSON) -> Self: + def deserialize_entities(json_entities): + return { + EntityReference.parse(entity_ref): entity + for entity_ref, entity in json_entities.items() + } + return cls( fqid=fqid, - entities={ - EntityReference.parse(entity_ref): entity - for entity_ref, entity in json_['entities'].items() - }, - links=set(map(EntityLink.from_json, json_['links'])) + entities=deserialize_entities(json_['entities']), + links=set(map(EntityLink.from_json, json_['links'])), + orphans=deserialize_entities(json_['orphans']) ) diff --git a/src/azul/plugins/metadata/anvil/indexer/transform.py b/src/azul/plugins/metadata/anvil/indexer/transform.py index ca181b789d..395d405675 100644 --- a/src/azul/plugins/metadata/anvil/indexer/transform.py +++ b/src/azul/plugins/metadata/anvil/indexer/transform.py @@ -169,6 +169,9 @@ def aggregator(cls, entity_type) -> EntityAggregator: assert False, entity_type def estimate(self, partition: BundlePartition) -> int: + # Orphans are not considered when deciding whether to partition the + # bundle, but if the bundle is partitioned then each orphan will be + # replicated in a single partition return sum(map(partial(self._contains, partition), self.bundle.entities)) def transform(self, @@ -188,7 +191,8 @@ def _transform(self, raise NotImplementedError def _replicate(self, entity: EntityReference) -> tuple[str, JSON]: - return entity.entity_type, self.bundle.entities[entity] + content = ChainMap(self.bundle.entities, self.bundle.orphans)[entity] + return entity.entity_type, content def _convert_entity_type(self, entity_type: str) -> str: assert entity_type == 'bundle' or entity_type.startswith('anvil_'), entity_type @@ -406,7 +410,10 @@ def _file(self, file: EntityReference) -> MutableJSON: uuid=file.entity_id) def _only_dataset(self) -> EntityReference: - return one(self._entities_by_type['anvil_dataset']) + try: + return one(self._entities_by_type['anvil_dataset']) + except ValueError: + return one(o for o in self.bundle.orphans if o.entity_type == 'anvil_dataset') @cached_property def _activity_polymorphic_types(self) -> AbstractSet[str]: @@ -506,7 +513,9 @@ def _dataset(self, dataset: EntityReference) -> MutableJSON: return super()._dataset(dataset) def _list_entities(self) -> Iterable[EntityReference]: - yield self._singleton() + # Suppress contributions for bundles that only contain orphans + if self.bundle.entities: + yield self._singleton() @abstractmethod def _singleton(self) -> EntityReference: @@ -570,13 +579,13 @@ def transform(self, yield from super().transform(partition) if config.enable_replicas: # Replicas are only emitted by the file transformer for entities - # that are linked to at least one file. 
This excludes a small number - # of linked entities, usually from primary bundles that don't - # include any files. Some of the replicas we emit here will be - # redundant with those emitted by the file transformer, but these - # will be coalesced by the index service before they are written to - # ElasticSearch. - for entity in self.bundle.entities: + # that are linked to at least one file. This excludes all orphans, + # and a small number of linked entities, usually from primary + # bundles don't include any files. Some of the replicas we emit here + # will be redundant with those emitted by the file transformer, but + # these will be coalesced by the index service before they are + # written to ElasticSearch. + for entity in chain(self.bundle.orphans, self.bundle.entities): if partition.contains(UUID(entity.entity_id)): yield self._replica(entity, file_hub=None) diff --git a/src/azul/plugins/repository/tdr_anvil/__init__.py b/src/azul/plugins/repository/tdr_anvil/__init__.py index 20e9f8a872..e841615931 100644 --- a/src/azul/plugins/repository/tdr_anvil/__init__.py +++ b/src/azul/plugins/repository/tdr_anvil/__init__.py @@ -2,6 +2,7 @@ from enum import ( Enum, ) +import itertools import logging from operator import ( itemgetter, @@ -10,7 +11,10 @@ AbstractSet, Callable, Iterable, + Self, + cast, ) +import uuid import attrs from more_itertools import ( @@ -24,12 +28,14 @@ uuids, ) from azul.bigquery import ( + BigQueryRow, backtick, ) from azul.drs import ( DRSURI, ) from azul.indexer import ( + Prefix, SourcedBundleFQIDJSON, ) from azul.indexer.document import ( @@ -74,58 +80,88 @@ class BundleType(Enum): """ - AnVIL snapshots have no inherent notion of a "bundle". When indexing these - snapshots, we dynamically construct bundles by selecting individual entities - and following their foreign keys to discover associated entities. The - initial entity from which this graph traversal begins is termed the - "bundle entity", and its FQID serves as the basis for the bundle FQID. Each - member of this enumeration represents a strategy for selecting bundle - entities. - - Our primary such strategy is to use every biosample in a given snapshot as a - bundle entity. Biosamples were chosen for this role based on a desirable - balance between the size and number of the resulting bundles as well as the - degree of overlap between them. The implementation of the graph traversal is - tightly coupled to this choice, and switching to a different entity type - would require re-implementing much of the Plugin code. Primary bundles - consist of at least one biosample (the bundle entity), exactly one dataset, - and zero or more other entities of assorted types. - - Some snapshots include file entities that lack any foreign keys that - associate the file with any other entity. To ensure that these "orphaned" - files are indexed, they are also used as bundle entities. As with primary - bundles, the creation of these supplementary bundles depends on a - specifically tailored traversal implementation. Supplementary bundles always - consist of exactly two entities: one file (the bundle entity) and one - dataset. - - The `dataset.description` field is unusual in that it is not stored in - BigQuery and must be retrieved via Terra's DUOS API. There is only one - dataset per snapshot, which is referenced in all primary and supplementary - bundles. 
Therefore, only one request to DUOS per *snapshot* is necessary,
-    but if `description` is retrieved at the same time
-    as the other dataset fields, we will make one request per *bundle* instead, potentially
-    overloading the DUOS service. Our solution is to retrieve `description` only
-    in a dedicated bundle format, once per snapshot, and merge it with the other
-    dataset fields during aggregation. This bundle contains only a single
-    dataset entity with only the `description` field populated.
+    AnVIL snapshots have no inherent notion of a "bundle". During indexing, we
+    dynamically construct bundles by querying each table in the snapshot. This
+    class enumerates the tables that require special strategies for listing and
+    fetching their bundles.
+
+    Primary bundles are defined by a biosample entity, termed the bundle entity.
+    Each primary bundle includes all of the bundle entity's descendants and all of
+    those entities' ancestors, which are discovered by iteratively
+    following foreign keys. Biosamples were chosen for this role based on a
+    desirable balance between the size and number of the resulting bundles as
+    well as the degree of overlap between them. The implementation of the graph
+    traversal is tightly coupled to this choice, and switching to a different
+    entity type would require re-implementing much of the Plugin code. Primary
+    bundles consist of at least one biosample (the bundle entity), exactly one
+    dataset, and zero or more other entities of assorted types. Primary bundles
+    never contain orphans because they are bijective to rows in the biosample
+    table.
+
+    Supplementary bundles consist of batches of file entities, which may include
+    supplementary files, i.e. files that lack any foreign keys that associate
+    them with any other entity. Non-supplementary files in the bundle are
+    classified as orphans. The bundle also includes a dataset entity linked to
+    the supplementary files.
+
+    DUOS bundles consist of a single dataset entity. This "entity" includes only
+    the dataset description retrieved from DUOS, while a copy of the BigQuery
+    row for this dataset is also included as an orphan. We chose this design
+    because there is only one dataset per snapshot, which is referenced in all
+    primary and supplementary bundles. Therefore, only one request to DUOS per
+    *snapshot* is necessary, but if `description` is retrieved at the same time
+    as the other dataset fields, we will make one request per *bundle* instead,
+    potentially overloading the DUOS service. Our solution is to retrieve
+    `description` only in a dedicated bundle format, once per snapshot, and
+    merge it with the other dataset fields during aggregation.
+
+    All other bundles are replica bundles. Replica bundles consist of a batch of
+    rows from an arbitrary BigQuery table, which may or may not be described by
+    the AnVIL schema. Replica bundles only include orphans and have no links.
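The batches referred to above are delimited purely by row ID: a batch consists of every row whose `datarepo_row_id` starts with the batch's hexadecimal prefix, regardless of any other column, and `_batch_tables` further down picks a prefix length whose average batch size lands as close to 256 rows as possible. A minimal sketch of the grouping step, assuming plain string row IDs and a fixed prefix length; the helper name is illustrative and not part of the plugin:

    from collections import defaultdict


    def batch_rows(row_ids: list[str], prefix_length: int) -> dict[str, list[str]]:
        # Group rows by the leading characters of their datarepo_row_id;
        # each key is a batch prefix, each value is that batch's rows
        batches = defaultdict(list)
        for row_id in row_ids:
            batches[row_id[:prefix_length]].append(row_id)
        return dict(batches)


    row_ids = [
        '9687b86d-a2ae-a083-b910-a16bcbef1ba4',
        '9db5952c-c454-49d9-8a62-5abb026701c0',
        '28ed0f3a-157b-417b-a05a-48f57f9d3a34',
    ]
    # With a one-character prefix, the first two rows fall into the same batch
    assert batch_rows(row_ids, 1) == {'9': row_ids[:2], '2': row_ids[2:]}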
""" primary = 'anvil_biosample' supplementary = 'anvil_file' duos = 'anvil_dataset' + def is_batched(self: Self | str) -> bool: + """ + >>> BundleType.primary.is_batched() + False + + >>> BundleType.is_batched('anvil_activity') + True + """ + if isinstance(self, str): + try: + self = BundleType(self) + except ValueError: + return True + return self not in (BundleType.primary, BundleType.duos) + class TDRAnvilBundleFQIDJSON(SourcedBundleFQIDJSON): table_name: str + batch_prefix: str | None @attrs.frozen(kw_only=True, eq=False) class TDRAnvilBundleFQID(TDRBundleFQID): - table_name: BundleType = attrs.field(converter=BundleType) + table_name: str + batch_prefix: str | None def to_json(self) -> TDRAnvilBundleFQIDJSON: - return dict(super().to_json(), - table_name=self.table_name.value) + return cast(TDRAnvilBundleFQIDJSON, super().to_json()) + + def __attrs_post_init__(self): + should_be_batched = BundleType.is_batched(self.table_name) + is_batched = self.is_batched + assert is_batched == should_be_batched, self + if is_batched: + assert len(self.batch_prefix) <= 8, self + + @property + def is_batched(self) -> bool: + return self.batch_prefix is not None class TDRAnvilBundle(AnvilBundle[TDRAnvilBundleFQID], TDRBundle): @@ -137,9 +173,14 @@ def canning_qualifier(cls) -> str: def add_entity(self, entity: EntityReference, version: str, - row: MutableJSON + row: MutableJSON, + *, + is_orphan: bool = False ) -> None: - assert entity not in self.entities, entity + target = self.orphans if is_orphan else self.entities + # In DUOS bundles, the dataset is represented as both as entity and an + # orphan + assert entity not in target, entity metadata = dict(row, version=version) if entity.entity_type == 'anvil_file': @@ -149,7 +190,7 @@ def add_entity(self, metadata.update(drs_uri=drs_uri, sha256='', crc32='') - self.entities[entity] = metadata + target[entity] = metadata def add_links(self, links: Iterable[EntityLink]): self.links.update(links) @@ -167,90 +208,186 @@ def _version(self): tzinfo=datetime.timezone.utc)) datarepo_row_uuid_version = 4 + batch_uuid_version = 5 bundle_uuid_version = 10 + def _batch_uuid(self, + source: TDRSourceSpec, + table_name: str, + batch_prefix: str + ) -> str: + namespace = uuid.UUID('b8b3ac80-e035-4904-8b02-2d04f9e9a369') + batch_uuid = uuid.uuid5(namespace, f'{source}:{table_name}:{batch_prefix}') + return change_version(str(batch_uuid), + self.batch_uuid_version, + self.bundle_uuid_version) + def count_bundles(self, source: TDRSourceSpec) -> int: prefix = '' if source.prefix is None else source.prefix.common - rows = self._run_sql(f''' + primary_count = one(self._run_sql(f''' SELECT COUNT(*) AS count FROM {backtick(self._full_table_name(source, BundleType.primary.value))} WHERE STARTS_WITH(datarepo_row_id, {prefix!r}) - UNION ALL + '''))['count'] + duos_count = 0 if config.duos_service_url is None else one(self._run_sql(f''' SELECT COUNT(*) AS count - FROM {backtick(self._full_table_name(source, BundleType.supplementary.value))} - WHERE is_supplementary AND STARTS_WITH(datarepo_row_id, {prefix!r}) - ''') - return sum(row['count'] for row in rows) + FROM {backtick(self._full_table_name(source, BundleType.duos.value))} + WHERE STARTS_WITH(datarepo_row_id, {prefix!r}) + '''))['count'] + sizes_by_table = self._batch_tables(source, prefix) + batched_count = sum(batch_size for (_, batch_size) in sizes_by_table.values()) + return primary_count + duos_count + batched_count def list_bundles(self, source: TDRSourceRef, prefix: str ) -> list[TDRAnvilBundleFQID]: 
self._assert_source(source) - spec = source.spec - primary = BundleType.primary.value - supplementary = BundleType.supplementary.value - duos = BundleType.duos.value - rows = list(self._run_sql(f''' - SELECT datarepo_row_id, {primary!r} AS table_name - FROM {backtick(self._full_table_name(spec, primary))} - WHERE STARTS_WITH(datarepo_row_id, '{prefix}') - UNION ALL - SELECT datarepo_row_id, {supplementary!r} AS table_name - FROM {backtick(self._full_table_name(spec, supplementary))} AS supp - WHERE supp.is_supplementary AND STARTS_WITH(datarepo_row_id, '{prefix}') - ''' + ( - '' - if config.duos_service_url is None else - f''' - UNION ALL - SELECT datarepo_row_id, {duos!r} AS table_name - FROM {backtick(self._full_table_name(spec, duos))} - ''' - ))) bundles = [] - duos_count = 0 - for row in rows: - # Reversibly tweak the entity UUID to prevent - # collisions between entity IDs and bundle IDs - bundle_uuid = uuids.change_version(row['datarepo_row_id'], - self.datarepo_row_uuid_version, - self.bundle_uuid_version) - # We intentionally omit the WHERE clause for datasets so that we can - # verify our assumption that each snapshot only contains rows for a - # single dataset. This verification is performed independently and - # concurrently for every partition, but only one partition actually - # emits the bundle. - if row['table_name'] == duos: - require(0 == duos_count) - duos_count += 1 - # Ensure that one partition will always contain the DUOS bundle - # regardless of the choice of common prefix - if not bundle_uuid.startswith(prefix): - continue + spec = source.spec + if config.duos_service_url is not None: + row = one(self._run_sql(f''' + SELECT datarepo_row_id + FROM {backtick(self._full_table_name(spec, BundleType.duos.value))} + ''')) + dataset_row_id = row['datarepo_row_id'] + # We intentionally omit the WHERE clause for datasets in order + # to verify our assumption that each snapshot only contains rows + # for a single dataset. This verification is performed + # independently and concurrently for every partition, but only + # one partition actually emits the bundle. 
+ if dataset_row_id.startswith(prefix): + bundle_uuid = change_version(dataset_row_id, + self.datarepo_row_uuid_version, + self.bundle_uuid_version) + bundles.append(TDRAnvilBundleFQID( + uuid=bundle_uuid, + version=self._version, + source=source, + table_name=BundleType.duos.value, + batch_prefix=None, + )) + for row in self._run_sql(f''' + SELECT datarepo_row_id + FROM {backtick(self._full_table_name(spec, BundleType.primary.value))} + WHERE STARTS_WITH(datarepo_row_id, {prefix!r}) + '''): + bundle_uuid = change_version(row['datarepo_row_id'], + self.datarepo_row_uuid_version, + self.bundle_uuid_version) bundles.append(TDRAnvilBundleFQID( - source=source, uuid=bundle_uuid, version=self._version, - table_name=BundleType(row['table_name']) + source=source, + table_name=BundleType.primary.value, + batch_prefix=None, )) + prefix_lengths_by_table = self._batch_tables(source.spec, prefix) + for table_name, (batch_prefix_length, _) in prefix_lengths_by_table.items(): + batch_prefixes = Prefix(common=prefix, + partition=batch_prefix_length - len(prefix)).partition_prefixes() + for batch_prefix in batch_prefixes: + bundle_uuid = self._batch_uuid(spec, table_name, batch_prefix) + bundles.append(TDRAnvilBundleFQID(uuid=bundle_uuid, + version=self._version, + source=source, + table_name=table_name, + batch_prefix=batch_prefix)) return bundles def _emulate_bundle(self, bundle_fqid: TDRAnvilBundleFQID) -> TDRAnvilBundle: - if bundle_fqid.table_name is BundleType.primary: + if bundle_fqid.table_name == BundleType.primary.value: log.info('Bundle %r is a primary bundle', bundle_fqid.uuid) return self._primary_bundle(bundle_fqid) - elif bundle_fqid.table_name is BundleType.supplementary: + elif bundle_fqid.table_name == BundleType.supplementary.value: log.info('Bundle %r is a supplementary bundle', bundle_fqid.uuid) return self._supplementary_bundle(bundle_fqid) - elif bundle_fqid.table_name is BundleType.duos: + elif bundle_fqid.table_name == BundleType.duos.value: assert config.duos_service_url is not None, bundle_fqid log.info('Bundle %r is a DUOS bundle', bundle_fqid.uuid) return self._duos_bundle(bundle_fqid) else: - assert False, bundle_fqid.table_name + log.info('Bundle %r is a replica bundle', bundle_fqid.uuid) + return self._replica_bundle(bundle_fqid) + + def _batch_tables(self, + source: TDRSourceSpec, + prefix: str, + ) -> dict[str, tuple[int, int]]: + """ + Find a batch prefix length that yields as close to 256 rows per batch + as possible for each table within the specified partition. The result's + keys are table names and its values are tuples where the first element + is the prefix length (*including* the partition prefix) and the second + element is the resulting number of batches. Tables are only included in + the result if they are non-empty and are used to produce batched bundle + formats (i.e. replica and supplementary). + + Because the partitions of a table do not contain exactly the same number + of bundles, calculating the batch size statistics for the entire table + at once produces a different result than performing the same calculation + for any individual partition. We expect the inconsistencies to average + out across partitions so that `count_bundles` and `list_bundles` give + consistent results as long the partition size is substantially larger + than the batch size. + + This method relies on BigQuery's `AVG` function, which is + nondeterministic for floating-point return values. The probability that + this affects this method's return value is very small, but nonzero. 
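For intuition, the same selection heuristic can be written in plain Python, assuming the row IDs are available in memory and ignoring the partition prefix; `choose_prefix_length` and the randomly generated IDs below are illustrative only, not part of the plugin:

    from collections import Counter
    from uuid import uuid4


    def choose_prefix_length(row_ids: list[str],
                             target_size: int = 256,
                             max_length: int = 4
                             ) -> int:
        # Try each candidate prefix length, batch the rows by the leading
        # characters of their datarepo_row_id, and keep the length whose
        # average batch size is closest to the target. This mirrors the
        # GROUP BY ROLLUP / AVG query in the method body below.
        def average_batch_size(length: int) -> float:
            batches = Counter(row_id[:length] for row_id in row_ids)
            return len(row_ids) / len(batches)

        return min(range(1, max_length + 1),
                   key=lambda length: abs(target_size - average_batch_size(length)))


    # With 4096 random row IDs, a one-character hexadecimal prefix yields
    # sixteen batches of roughly 256 rows each, so a length of 1 is chosen
    assert choose_prefix_length([str(uuid4()) for _ in range(4096)]) == 1

BigQuery's documentation describes the AVG nondeterminism mentioned above: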
+ https://cloud.google.com/bigquery/docs/reference/standard-sql/aggregate_functions#avg + """ + max_length = 4 + + def repeat(fmt): + return ', '.join(fmt.format(i=i) for i in range(1, max_length + 1)) + + target_size = 256 + prefix_len = len(prefix) + table_names = self.tdr.list_tables(source) + # This table is present in all snapshots. It is large and contains no + # useful metadata, so we skip indexing replicas from it. + table_names.discard('datarepo_row_ids') + table_names = sorted(filter(BundleType.is_batched, table_names)) + log.info('Calculating batch prefix lengths for partition %r of %d tables ' + 'in source %s', prefix, len(table_names), source) + # The extraneous outer 'SELECT *' works around a bug in BigQuery emulator + query = ' UNION ALL '.join(f'''( + SELECT * FROM ( + SELECT + {table_name!r} AS table_name, + {prefix_len} + LENGTH(CONCAT( + {repeat('IFNULL(p{i}, "")')} + )) AS batch_prefix_length, + AVG(num_rows) AS average_batch_size, + COUNT(*) AS num_batches + FROM ( + SELECT + {repeat(f'SUBSTR(datarepo_row_id, {prefix_len} + {{i}}, 1) AS p{{i}}')}, + COUNT(*) AS num_rows + FROM {backtick(self._full_table_name(source, table_name))} + WHERE STARTS_WITH(datarepo_row_id, {prefix!r}) + GROUP BY ROLLUP ({repeat('p{i}')}) + ) + GROUP BY batch_prefix_length + ORDER BY ABS({target_size} - average_batch_size) + LIMIT 1 + ) + )''' for table_name in table_names) + + def result(row): + table_name = row['table_name'] + prefix_length = row['batch_prefix_length'] + average_size = row['average_batch_size'] + num_batches = row['num_batches'] + log.info('Selected batch prefix length %d for table %r (average ' + 'batch size %.1f, num batches %d)', + prefix_length, table_name, average_size, num_batches) + return table_name, (prefix_length, num_batches) + + return dict(map(result, self._run_sql(query))) def _primary_bundle(self, bundle_fqid: TDRAnvilBundleFQID) -> TDRAnvilBundle: + assert not bundle_fqid.is_batched, bundle_fqid source = bundle_fqid.source bundle_entity = self._bundle_entity(bundle_fqid) @@ -300,56 +437,80 @@ def _primary_bundle(self, bundle_fqid: TDRAnvilBundleFQID) -> TDRAnvilBundle: return result def _supplementary_bundle(self, bundle_fqid: TDRAnvilBundleFQID) -> TDRAnvilBundle: - entity_id = uuids.change_version(bundle_fqid.uuid, - self.bundle_uuid_version, - self.datarepo_row_uuid_version) + assert bundle_fqid.is_batched, bundle_fqid source = bundle_fqid.source.spec - table_name = bundle_fqid.table_name.value result = TDRAnvilBundle(fqid=bundle_fqid) - columns = self._columns(table_name) - bundle_entity = dict(one(self._run_sql(f''' - SELECT {', '.join(sorted(columns))} - FROM {backtick(self._full_table_name(source, table_name))} - WHERE datarepo_row_id = '{entity_id}' - '''))) - dataset_table = 'anvil_dataset' - columns = self._columns(dataset_table) - linked_entity = dict(one(self._run_sql(f''' - SELECT {', '.join(sorted(columns))} - FROM {backtick(self._full_table_name(source, dataset_table))} - '''))) - link_args = {} - for entity_type, row, arg in [ - ('anvil_file', bundle_entity, 'outputs'), - (dataset_table, linked_entity, 'inputs') - ]: - entity_ref = EntityReference(entity_type=entity_type, entity_id=row['datarepo_row_id']) - result.add_entity(entity_ref, self._version, row) - link_args[arg] = {entity_ref} - result.add_links({EntityLink(**link_args)}) + linked_file_refs = set() + for file_ref, file_row in self._get_batch(bundle_fqid): + is_supplementary = file_row['is_supplementary'] + result.add_entity(file_ref, + self._version, + dict(file_row), + 
is_orphan=not is_supplementary) + if is_supplementary: + linked_file_refs.add(file_ref) + dataset_ref, dataset_row = self._get_dataset(source) + result.add_entity(dataset_ref, self._version, dict(dataset_row)) + result.add_links([EntityLink(inputs={dataset_ref}, outputs=linked_file_refs)]) return result def _duos_bundle(self, bundle_fqid: TDRAnvilBundleFQID) -> TDRAnvilBundle: + assert not bundle_fqid.is_batched, bundle_fqid duos_info = self.tdr.get_duos(bundle_fqid.source) description = None if duos_info is None else duos_info.get('studyDescription') - entity_id = change_version(bundle_fqid.uuid, - self.bundle_uuid_version, - self.datarepo_row_uuid_version) - entity = EntityReference(entity_type='anvil_dataset', - entity_id=entity_id) + ref, row = self._get_dataset(bundle_fqid.source.spec) + expected_entity_id = change_version(bundle_fqid.uuid, + self.bundle_uuid_version, + self.datarepo_row_uuid_version) + assert ref.entity_id == expected_entity_id, (ref, bundle_fqid) bundle = TDRAnvilBundle(fqid=bundle_fqid) - bundle.add_entity(entity=entity, - version=self._version, - row={'description': description}) + bundle.add_entity(ref, self._version, {'description': description}) + # Classify as orphan to suppress the emission of a contribution + bundle.add_entity(ref, self._version, dict(row), is_orphan=True) return bundle + def _replica_bundle(self, bundle_fqid: TDRAnvilBundleFQID) -> TDRAnvilBundle: + assert bundle_fqid.is_batched, bundle_fqid + source = bundle_fqid.source.spec + result = TDRAnvilBundle(fqid=bundle_fqid) + batch = self._get_batch(bundle_fqid) + dataset = self._get_dataset(source) + for (ref, row) in itertools.chain([dataset], batch): + result.add_entity(ref, self._version, dict(row), is_orphan=True) + return result + + def _get_dataset(self, source: TDRSourceSpec) -> tuple[EntityReference, BigQueryRow]: + table_name = 'anvil_dataset' + columns = self._columns(table_name) + row = one(self._run_sql(f''' + SELECT {', '.join(sorted(columns))} + FROM {backtick(self._full_table_name(source, table_name))} + ''')) + ref = EntityReference(entity_type=table_name, entity_id=row['datarepo_row_id']) + return ref, row + + def _get_batch(self, + bundle_fqid: TDRAnvilBundleFQID + ) -> Iterable[tuple[EntityReference, BigQueryRow]]: + source = bundle_fqid.source.spec + batch_prefix = bundle_fqid.batch_prefix + table_name = bundle_fqid.table_name + columns = self._columns(table_name) + for row in self._run_sql(f''' + SELECT {', '.join(sorted(columns))} + FROM {backtick(self._full_table_name(source, table_name))} + WHERE STARTS_WITH(datarepo_row_id, {batch_prefix!r}) + '''): + ref = EntityReference(entity_type=table_name, entity_id=row['datarepo_row_id']) + yield ref, row + def _bundle_entity(self, bundle_fqid: TDRAnvilBundleFQID) -> KeyReference: source = bundle_fqid.source bundle_uuid = bundle_fqid.uuid entity_id = uuids.change_version(bundle_uuid, self.bundle_uuid_version, self.datarepo_row_uuid_version) - table_name = bundle_fqid.table_name.value + table_name = bundle_fqid.table_name pk_column = table_name.removeprefix('anvil_') + '_id' bundle_entity = one(self._run_sql(f''' SELECT {pk_column} @@ -672,6 +833,11 @@ def convert_column(value): } def _columns(self, table_name: str) -> set[str]: - columns = set(self._schema_columns[table_name]) - columns.add('datarepo_row_id') - return columns + try: + columns = self._schema_columns[table_name] + except KeyError: + return {'*'} + else: + columns = set(columns) + columns.add('datarepo_row_id') + return columns diff --git a/src/azul/terra.py 
b/src/azul/terra.py index 5e14fa3c50..e5046850a7 100644 --- a/src/azul/terra.py +++ b/src/azul/terra.py @@ -39,6 +39,7 @@ bigquery, ) from google.cloud.bigquery import ( + DatasetReference, QueryJob, QueryJobConfig, QueryPriority, @@ -510,6 +511,14 @@ def run_sql(self, query: str) -> BigQueryRows: log.debug('Job info: %s', json.dumps(self._job_info(job, result))) return result + def list_tables(self, source: TDRSourceSpec) -> set[str]: + bigquery = self._bigquery(self.credentials.project_id) + ref = DatasetReference(project=source.subdomain, dataset_id=source.name) + return { + table.to_api_repr()['tableReference']['tableId'] + for table in bigquery.list_tables(ref) + } + def _trunc_query(self, query: str) -> str: return trunc_ellipses(query, 2048) diff --git a/terraform/authentication.tf.json.template.py b/terraform/authentication.tf.json.template.py index 782c1581b9..cf291378b5 100644 --- a/terraform/authentication.tf.json.template.py +++ b/terraform/authentication.tf.json.template.py @@ -60,6 +60,7 @@ "title": f"azul_{config.deployment_stage}", "permissions": [ "bigquery.jobs.create", + "bigquery.tables.list", *[ f'bigquery.{resource}.{action}' for resource in ('capacityCommitments', 'reservations') diff --git a/test/indexer/data/2370f948-2783-aeb6-afea-e022897f4dcf.tdr.anvil.json b/test/indexer/data/2370f948-2783-aeb6-afea-e022897f4dcf.tdr.anvil.json index 7da9d5f3cd..9859b200f1 100644 --- a/test/indexer/data/2370f948-2783-aeb6-afea-e022897f4dcf.tdr.anvil.json +++ b/test/indexer/data/2370f948-2783-aeb6-afea-e022897f4dcf.tdr.anvil.json @@ -5,5 +5,30 @@ "version": "2022-06-01T00:00:00.000000Z" } }, - "links": [] + "links": [], + "orphans": { + "anvil_dataset/2370f948-2783-4eb6-afea-e022897f4dcf": { + "consent_group": [ + "DS-BDIS" + ], + "data_modality": [], + "data_use_permission": [ + "DS-BDIS" + ], + "datarepo_row_id": "2370f948-2783-4eb6-afea-e022897f4dcf", + "dataset_id": "52ee7665-7033-63f2-a8d9-ce8e32666739", + "owner": [ + "Debbie Nickerson" + ], + "principal_investigator": [], + "registered_identifier": [ + "phs000693" + ], + "source_datarepo_row_ids": [ + "workspace_attributes:7a22b629-9d81-4e4d-9297-f9e44ed760bc" + ], + "title": "ANVIL_CMG_UWASH_DS_BDIS", + "version": "2022-06-01T00:00:00.000000Z" + } + } } \ No newline at end of file diff --git a/test/indexer/data/6c87f0e1-509d-46a4-b845-7584df39263b.tables.tdr.json b/test/indexer/data/6c87f0e1-509d-46a4-b845-7584df39263b.tables.tdr.json index c1783920e2..8a119af5c6 100644 --- a/test/indexer/data/6c87f0e1-509d-46a4-b845-7584df39263b.tables.tdr.json +++ b/test/indexer/data/6c87f0e1-509d-46a4-b845-7584df39263b.tables.tdr.json @@ -254,6 +254,22 @@ "variantcallingactivity_id": "8db89e5e-1c8a-487e-b0c1-b26218ac6b7b" } ] + }, + "non_schema_orphan_table": { + "rows": [ + { + "datarepo_row_id": "9687b86d-a2ae-a083-b910-a16bcbef1ba4", + "non_schema_column": "spam" + }, + { + "datarepo_row_id": "28ed0f3a-157b-417b-a05a-48f57f9d3a34", + "non_schema_column": "eggs" + }, + { + "datarepo_row_id": "9db5952c-c454-49d9-8a62-5abb026701c0", + "non_schema_column": "baked beans" + } + ] } } } \ No newline at end of file diff --git a/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.tdr.anvil.json b/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.tdr.anvil.json index 9c461ee994..ec875d5fa8 100644 --- a/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.tdr.anvil.json +++ b/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.tdr.anvil.json @@ -231,5 +231,6 @@ "anvil_biosample/826dea02-e274-4ffe-aabc-eb3db63ad068" ] } - ] + ], + 
"orphans": {} } diff --git a/test/indexer/data/9a135c9a-069b-a90e-b588-eaf8d1aeeac9.tdr.anvil.json b/test/indexer/data/9a135c9a-069b-a90e-b588-eaf8d1aeeac9.tdr.anvil.json new file mode 100644 index 0000000000..a9ff2d7af3 --- /dev/null +++ b/test/indexer/data/9a135c9a-069b-a90e-b588-eaf8d1aeeac9.tdr.anvil.json @@ -0,0 +1,44 @@ +{ + "entities": {}, + "links": {}, + "orphans": { + "non_schema_orphan_table/9687b86d-a2ae-a083-b910-a16bcbef1ba4": { + "datarepo_row_id": "9687b86d-a2ae-a083-b910-a16bcbef1ba4", + "non_schema_column": "spam", + "version": "2022-06-01T00:00:00.000000Z" + }, + "non_schema_orphan_table/28ed0f3a-157b-417b-a05a-48f57f9d3a34": { + "datarepo_row_id": "28ed0f3a-157b-417b-a05a-48f57f9d3a34", + "non_schema_column": "eggs", + "version": "2022-06-01T00:00:00.000000Z" + }, + "non_schema_orphan_table/9db5952c-c454-49d9-8a62-5abb026701c0": { + "datarepo_row_id": "9db5952c-c454-49d9-8a62-5abb026701c0", + "non_schema_column": "baked beans", + "version": "2022-06-01T00:00:00.000000Z" + }, + "anvil_dataset/2370f948-2783-4eb6-afea-e022897f4dcf": { + "consent_group": [ + "DS-BDIS" + ], + "data_modality": [], + "data_use_permission": [ + "DS-BDIS" + ], + "datarepo_row_id": "2370f948-2783-4eb6-afea-e022897f4dcf", + "dataset_id": "52ee7665-7033-63f2-a8d9-ce8e32666739", + "owner": [ + "Debbie Nickerson" + ], + "principal_investigator": [], + "registered_identifier": [ + "phs000693" + ], + "source_datarepo_row_ids": [ + "workspace_attributes:7a22b629-9d81-4e4d-9297-f9e44ed760bc" + ], + "title": "ANVIL_CMG_UWASH_DS_BDIS", + "version": "2022-06-01T00:00:00.000000Z" + } + } +} \ No newline at end of file diff --git a/test/indexer/data/6b0f6c0f-5d80-a242-accb-840921351cd5.tdr.anvil.json b/test/indexer/data/c67e7adb-1a9c-a3b9-bc91-eb10a428374a.tdr.anvil.json similarity index 51% rename from test/indexer/data/6b0f6c0f-5d80-a242-accb-840921351cd5.tdr.anvil.json rename to test/indexer/data/c67e7adb-1a9c-a3b9-bc91-eb10a428374a.tdr.anvil.json index e689fbe3f7..dd3946fbcc 100644 --- a/test/indexer/data/6b0f6c0f-5d80-a242-accb-840921351cd5.tdr.anvil.json +++ b/test/indexer/data/c67e7adb-1a9c-a3b9-bc91-eb10a428374a.tdr.anvil.json @@ -27,8 +27,8 @@ "data_modality": [], "datarepo_row_id": "6b0f6c0f-5d80-4242-accb-840921351cd5", "file_format": ".txt", - "file_id": "1fab11f5-7eab-4318-9a58-68d8d06e0715", "file_md5sum": "S/GBrRjzZAQYqh3rdiPYzA==", + "file_id": "1fab11f5-7eab-4318-9a58-68d8d06e0715", "file_name": "CCDG_13607_B01_GRM_WGS_2019-02-19_chr15.recalibrated_variants.annotated.coding.txt", "file_ref": "drs://mock_tdr.lan/v1_6c87f0e1-509d-46a4-b845-7584df39263b_1fab11f5-7eab-4318-9a58-68d8d06e0715", "file_size": 15079345, @@ -53,5 +53,45 @@ "anvil_file/6b0f6c0f-5d80-4242-accb-840921351cd5" ] } - ] + ], + "orphans": { + "anvil_file/15b76f9c-6b46-433f-851d-34e89f1b9ba6": { + "data_modality": [], + "datarepo_row_id": "15b76f9c-6b46-433f-851d-34e89f1b9ba6", + "file_format": ".vcf.gz", + "file_id": "1e269f04-4347-4188-b060-1dcc69e71d67", + "file_md5sum": "vuxgbuCqKZ/fkT9CWTFmIg==", + "file_name": "307500.merged.matefixed.sorted.markeddups.recal.g.vcf.gz", + "file_ref": "drs://mock_tdr.lan/v1_6c87f0e1-509d-46a4-b845-7584df39263b_1e269f04-4347-4188-b060-1dcc69e71d67", + "file_size": 213021639, + "is_supplementary": false, + "reference_assembly": [], + "source_datarepo_row_ids": [ + "file_inventory:81d16471-97ac-48fe-99a0-73d9ec62c2c0" + ], + "version": "2022-06-01T00:00:00.000000Z", + "drs_uri": "drs://mock_tdr.lan/v1_6c87f0e1-509d-46a4-b845-7584df39263b_1e269f04-4347-4188-b060-1dcc69e71d67", + "sha256": "", + 
"crc32": "" + }, + "anvil_file/3b17377b-16b1-431c-9967-e5d01fc5923f": { + "data_modality": [], + "datarepo_row_id": "3b17377b-16b1-431c-9967-e5d01fc5923f", + "file_format": ".bam", + "file_id": "8b722e88-8103-49c1-b351-e64fa7c6ab37", + "file_md5sum": "fNn9e1SovzgOROk3BvH6LQ==", + "file_name": "307500.merged.matefixed.sorted.markeddups.recal.bam", + "file_ref": "drs://mock_tdr.lan/v1_6c87f0e1-509d-46a4-b845-7584df39263b_8b722e88-8103-49c1-b351-e64fa7c6ab37", + "file_size": 3306845592, + "is_supplementary": false, + "reference_assembly": [], + "source_datarepo_row_ids": [ + "file_inventory:9658d94a-511d-4b49-82c3-d0cb07e0cff2" + ], + "version": "2022-06-01T00:00:00.000000Z", + "drs_uri": "drs://mock_tdr.lan/v1_6c87f0e1-509d-46a4-b845-7584df39263b_8b722e88-8103-49c1-b351-e64fa7c6ab37", + "sha256": "", + "crc32": "" + } + } } \ No newline at end of file diff --git a/test/indexer/test_anvil.py b/test/indexer/test_anvil.py index 0b9f56b287..a01b0d6b13 100644 --- a/test/indexer/test_anvil.py +++ b/test/indexer/test_anvil.py @@ -105,13 +105,14 @@ def bundle_fqid(cls, *, uuid, version=None, - table_name=BundleType.primary + table_name=BundleType.primary.value, ) -> TDRAnvilBundleFQID: assert version is None, 'All AnVIL bundles should use the same version' return TDRAnvilBundleFQID(source=cls.source, uuid=uuid, version=cls.version, - table_name=table_name) + table_name=table_name, + batch_prefix='' if BundleType.is_batched(table_name) else None) @classmethod def primary_bundle(cls) -> TDRAnvilBundleFQID: @@ -119,13 +120,18 @@ def primary_bundle(cls) -> TDRAnvilBundleFQID: @classmethod def supplementary_bundle(cls) -> TDRAnvilBundleFQID: - return cls.bundle_fqid(uuid='6b0f6c0f-5d80-a242-accb-840921351cd5', - table_name=BundleType.supplementary) + return cls.bundle_fqid(uuid='c67e7adb-1a9c-a3b9-bc91-eb10a428374a', + table_name=BundleType.supplementary.value) @classmethod def duos_bundle(cls) -> TDRAnvilBundleFQID: return cls.bundle_fqid(uuid='2370f948-2783-aeb6-afea-e022897f4dcf', - table_name=BundleType.duos) + table_name=BundleType.duos.value) + + @classmethod + def replica_bundle(cls) -> TDRAnvilBundleFQID: + return cls.bundle_fqid(uuid='9a135c9a-069b-a90e-b588-eaf8d1aeeac9', + table_name='non_schema_orphan_table') class TestAnvilIndexer(AnvilIndexerTestCase, @@ -166,15 +172,33 @@ def test_indexing(self): def test_list_and_fetch_bundles(self): source_ref = self.source self._make_mock_tdr_tables(source_ref) - expected_bundle_fqids = sorted([ + canned_bundle_fqids = [ self.primary_bundle(), self.supplementary_bundle(), - self.duos_bundle() + self.duos_bundle(), + self.replica_bundle(), + ] + expected_bundle_fqids = sorted(canned_bundle_fqids + [ + # Replica bundles for the AnVIL schema tables, which we don't can + self.bundle_fqid(uuid='cd512d30-7d5f-af35-88f0-efaadabfc17a', + table_name='anvil_activity'), + self.bundle_fqid(uuid='2fd53d37-b8a3-a797-8477-4056bf2473bb', + table_name='anvil_alignmentactivity'), + self.bundle_fqid(uuid='a52d7f0a-25c3-a4cf-af95-0fd8ecec8127', + table_name='anvil_assayactivity'), + self.bundle_fqid(uuid='45e532bb-5af7-a977-8939-24933c154380', + table_name='anvil_diagnosis'), + self.bundle_fqid(uuid='8cc4686a-76cd-a411-a11c-ec3a9fa1d417', + table_name='anvil_donor'), + self.bundle_fqid(uuid='88092968-d1ec-ad7e-9112-4b38bd0df4c2', + table_name='anvil_sequencingactivity'), + self.bundle_fqid(uuid='976fc781-4a33-a062-a67d-2c068e4d5e72', + table_name='anvil_variantcallingactivity') ]) plugin = self.plugin_for_source_spec(source_ref.spec) bundle_fqids = 
sorted(plugin.list_bundles(source_ref, '')) self.assertEqual(expected_bundle_fqids, bundle_fqids) - for bundle_fqid in bundle_fqids: + for bundle_fqid in canned_bundle_fqids: with self.subTest(bundle_fqid=bundle_fqid): canned_bundle = self._load_canned_bundle(bundle_fqid) assert isinstance(canned_bundle, TDRAnvilBundle) @@ -183,6 +207,7 @@ def test_list_and_fetch_bundles(self): self.assertEqual(canned_bundle.fqid, bundle.fqid) self.assertEqual(canned_bundle.entities, bundle.entities) self.assertEqual(canned_bundle.links, bundle.links) + self.assertEqual(canned_bundle.orphans, bundle.orphans) class TestAnvilIndexerWithIndexesSetUp(AnvilIndexerTestCase): @@ -235,3 +260,19 @@ def test_dataset_description(self): DocumentType.contribution: 2, **({DocumentType.replica: 2} if config.enable_replicas else {}) }) + + def test_orphans(self): + bundle = self._index_canned_bundle(self.replica_bundle()) + assert isinstance(bundle, TDRAnvilBundle) + expected = bundle.orphans if config.enable_replicas else {} + actual = {} + hits = self._get_all_hits() + for hit in hits: + qualifier, doc_type = self._parse_index_name(hit) + self.assertEqual(DocumentType.replica, doc_type) + source = hit['_source'] + self.assertEqual(source['hub_ids'], []) + ref = EntityReference(entity_type=source['replica_type'], + entity_id=source['entity_id']) + actual[ref] = source['contents'] + self.assertEqual(expected, actual) diff --git a/test/indexer/test_tdr.py b/test/indexer/test_tdr.py index 62aab9dfe5..3e59985cbb 100644 --- a/test/indexer/test_tdr.py +++ b/test/indexer/test_tdr.py @@ -197,8 +197,7 @@ def setUpClass(cls): '--dataset=' + cls.source.spec.name ]) - def _make_mock_tdr_tables(self, - source: TDRSourceRef) -> None: + def _make_mock_tdr_tables(self, source: TDRSourceRef) -> JSON: tables = self._load_canned_file_version(uuid=source.id, version=None, extension='tables.tdr')['tables'] @@ -206,6 +205,7 @@ def _make_mock_tdr_tables(self, self._make_mock_entity_table(source.spec, table_name, table_rows['rows']) + return tables def _make_mock_entity_table(self, source: TDRSourceSpec, diff --git a/test/integration_test.py b/test/integration_test.py index 005f8d3467..bfbb34e133 100644 --- a/test/integration_test.py +++ b/test/integration_test.py @@ -152,6 +152,7 @@ Link, ) from azul.plugins.repository.tdr_anvil import ( + BundleType, TDRAnvilBundleFQID, ) from azul.portal_service import ( @@ -1248,7 +1249,21 @@ def _assert_catalog_complete(self, ) -> None: with self.subTest('catalog_complete', catalog=catalog): expected_fqids = bundle_fqids - if not config.is_anvil_enabled(catalog): + if config.is_anvil_enabled(catalog): + # Replica bundles do not add contributions to the index and + # therefore do not appear anywhere in the service response + replica_fqids = { + bundle_fqid + for bundle_fqid in expected_fqids + if cast(TDRAnvilBundleFQID, bundle_fqid).table_name not in ( + BundleType.primary.value, + BundleType.supplementary.value, + BundleType.duos.value, + ) + } + expected_fqids -= replica_fqids + log.info('Ignoring replica bundles %r', replica_fqids) + else: expected_fqids = set(self.azul_client.filter_obsolete_bundle_versions(expected_fqids)) obsolete_fqids = bundle_fqids - expected_fqids if obsolete_fqids: @@ -1843,7 +1858,7 @@ def _test_catalog(self, catalog: config.Catalog): } self.assertIsSubset(set(stitched), metadata_ids) elif metadata_plugin_name == 'anvil': - self.assertEqual({'entities', 'links'}, bundle_json.keys()) + self.assertEqual({'entities', 'links', 'orphans'}, bundle_json.keys()) entities, links = 
bundle_json['entities'], bundle_json['links'] self.assertIsInstance(entities, dict) self.assertIsInstance(links, list) @@ -1900,7 +1915,10 @@ def _can_bundle(self, '--version', fqid.version, '--source', str(fqid.source.spec), *( - ['--table-name', fqid.table_name.value] + [ + '--table-name', fqid.table_name, + '--batch-prefix', 'null' if fqid.batch_prefix is None else fqid.batch_prefix, + ] if isinstance(fqid, TDRAnvilBundleFQID) else [] ), diff --git a/test/service/test_manifest.py b/test/service/test_manifest.py index f57b88d012..5773e10442 100644 --- a/test/service/test_manifest.py +++ b/test/service/test_manifest.py @@ -1705,9 +1705,11 @@ def bundles(cls) -> list[SourcedBundleFQID]: return [ cls.bundle_fqid(uuid='2370f948-2783-aeb6-afea-e022897f4dcf', version=cls.version), - cls.bundle_fqid(uuid='6b0f6c0f-5d80-a242-accb-840921351cd5', + cls.bundle_fqid(uuid='c67e7adb-1a9c-a3b9-bc91-eb10a428374a', version=cls.version), cls.bundle_fqid(uuid='826dea02-e274-affe-aabc-eb3db63ad068', + version=cls.version), + cls.bundle_fqid(uuid='9a135c9a-069b-a90e-b588-eaf8d1aeeac9', version=cls.version) ] @@ -1717,7 +1719,7 @@ def test_compact_manifest(self): expected = [ ( 'bundles.bundle_uuid', - '6b0f6c0f-5d80-a242-accb-840921351cd5', + 'c67e7adb-1a9c-a3b9-bc91-eb10a428374a', '826dea02-e274-affe-aabc-eb3db63ad068', '826dea02-e274-affe-aabc-eb3db63ad068' ), From fc10972b809768b675d27e92b85cb78cbcdcb167 Mon Sep 17 00:00:00 2001 From: Noa Aviel Dove Date: Wed, 23 Oct 2024 02:53:47 -0700 Subject: [PATCH 21/32] Add FIXME (#6647) --- test/integration_test.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/integration_test.py b/test/integration_test.py index bfbb34e133..c9a79f0a4e 100644 --- a/test/integration_test.py +++ b/test/integration_test.py @@ -1252,6 +1252,8 @@ def _assert_catalog_complete(self, if config.is_anvil_enabled(catalog): # Replica bundles do not add contributions to the index and # therefore do not appear anywhere in the service response + # FIXME: Integration test does not assert that replica bundles are indexed + # https://github.com/DataBiosphere/azul/issues/6647 replica_fqids = { bundle_fqid for bundle_fqid in expected_fqids From 9b6cf31d7b1364ca7f8b3ab28669188c4b1e84dc Mon Sep 17 00:00:00 2001 From: Noa Aviel Dove Date: Thu, 26 Sep 2024 18:49:03 -0700 Subject: [PATCH 22/32] [r] Include parent dataset/project in hub IDs (#6626) --- src/azul/indexer/transform.py | 3 +- .../metadata/anvil/indexer/transform.py | 7 +++-- .../plugins/metadata/hca/indexer/transform.py | 8 +++-- src/azul/service/manifest_service.py | 9 +----- ...2-e274-affe-aabc-eb3db63ad068.results.json | 16 ++++++++-- ...d.2018-11-02T11:33:44.698028Z.results.json | 31 +++++++++++++------ test/indexer/test_anvil.py | 7 ++++- 7 files changed, 53 insertions(+), 28 deletions(-) diff --git a/src/azul/indexer/transform.py b/src/azul/indexer/transform.py index 1e49c5746e..4a2f2c8dc9 100644 --- a/src/azul/indexer/transform.py +++ b/src/azul/indexer/transform.py @@ -133,6 +133,7 @@ def _contribution(self, def _replica(self, entity: EntityReference, *, + root_hub: EntityID, file_hub: EntityID | None, ) -> Replica: replica_type, contents = self._replicate(entity) @@ -144,7 +145,7 @@ def _replica(self, contents=contents, # The other hubs will be added when the indexer # consolidates duplicate replicas. 
- hub_ids=alist(file_hub)) + hub_ids=alist(file_hub, root_hub)) @classmethod @abstractmethod diff --git a/src/azul/plugins/metadata/anvil/indexer/transform.py b/src/azul/plugins/metadata/anvil/indexer/transform.py index 395d405675..9e2f4f22ed 100644 --- a/src/azul/plugins/metadata/anvil/indexer/transform.py +++ b/src/azul/plugins/metadata/anvil/indexer/transform.py @@ -585,9 +585,10 @@ def transform(self, # will be redundant with those emitted by the file transformer, but # these will be coalesced by the index service before they are # written to ElasticSearch. + dataset = self._only_dataset() for entity in chain(self.bundle.orphans, self.bundle.entities): if partition.contains(UUID(entity.entity_id)): - yield self._replica(entity, file_hub=None) + yield self._replica(entity, file_hub=None, root_hub=dataset.entity_id) class DatasetTransformer(SingletonTransformer): @@ -632,6 +633,7 @@ def _transform(self, entity: EntityReference ) -> Iterable[Contribution | Replica]: linked = self._linked_entities(entity) + dataset = self._only_dataset() contents = dict( activities=self._entities(self._activity, chain.from_iterable( linked[activity_type] @@ -645,7 +647,7 @@ def _transform(self, ) yield self._contribution(contents, entity.entity_id) if config.enable_replicas: - yield self._replica(entity, file_hub=entity.entity_id) + yield self._replica(entity, file_hub=entity.entity_id, root_hub=dataset.entity_id) for linked_entity in linked: yield self._replica( linked_entity, @@ -655,4 +657,5 @@ def _transform(self, # hub IDs field empty for datasets and rely on the tenet # that every file is an implicit hub of its parent dataset. file_hub=None if linked_entity.entity_type == 'anvil_dataset' else entity.entity_id, + root_hub=dataset.entity_id ) diff --git a/src/azul/plugins/metadata/hca/indexer/transform.py b/src/azul/plugins/metadata/hca/indexer/transform.py index 00380d8564..174d43878e 100644 --- a/src/azul/plugins/metadata/hca/indexer/transform.py +++ b/src/azul/plugins/metadata/hca/indexer/transform.py @@ -1470,17 +1470,19 @@ def _transform(self, for entity_type, values in additional_contents.items(): contents[entity_type].extend(values) file_id = file.ref.entity_id + project_ref = self._api_project.ref + project_id = project_ref.entity_id yield self._contribution(contents, file_id) if config.enable_replicas: - yield self._replica(self.api_bundle.ref, file_hub=file_id) + yield self._replica(self.api_bundle.ref, file_hub=file_id, root_hub=project_id) # Projects are linked to every file in their snapshot, # making an explicit list of hub IDs for the project both # redundant and impractically large. Therefore, we leave the # hub IDs field empty for projects and rely on the tenet # that every file is an implicit hub of its parent project. 
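As an illustration of the new hub ID assembly, a sketch assuming `alist` keeps only its non-`None` arguments (the `hub_ids` helper below is hypothetical, not code from the patch):

    # hub_ids = alist(file_hub, root_hub), keeping only the non-None arguments
    def hub_ids(file_hub, root_hub):
        return [hub for hub in (file_hub, root_hub) if hub is not None]

    hub_ids('file_id', 'dataset_id')   # file replica             -> ['file_id', 'dataset_id']
    hub_ids(None, 'dataset_id')        # dataset/project replica  -> ['dataset_id']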
- yield self._replica(self._api_project.ref, file_hub=None) + yield self._replica(project_ref, file_hub=None, root_hub=project_id) for linked_entity in visitor.entities: - yield self._replica(linked_entity, file_hub=file_id) + yield self._replica(linked_entity, file_hub=file_id, root_hub=project_id) def matrix_stratification_values(self, file: api.File) -> JSON: """ diff --git a/src/azul/service/manifest_service.py b/src/azul/service/manifest_service.py index c82222fda2..04cd7997b5 100644 --- a/src/azul/service/manifest_service.py +++ b/src/azul/service/manifest_service.py @@ -2037,14 +2037,7 @@ def _all_replicas(self) -> Iterable[JSON]: if replica_id not in emitted_replica_ids: num_new_replicas += 1 yield replica.to_dict() - # Note that this will be zero for replicas that use implicit - # hubs, in which case there are actually many hubs - explicit_hub_count = len(replica.hub_ids) - # We don't have to track the IDs of replicas with only one - # hub, since we know that there are no other hubs that could - # cause their re-emission. - if explicit_hub_count != 1: - emitted_replica_ids.add(replica_id) + emitted_replica_ids.add(replica_id) log.info('Found %d replicas (%d already emitted) from page of %d hubs', num_replicas, num_replicas - num_new_replicas, len(page)) diff --git a/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json b/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json index 5d3fd82432..2e09718c18 100644 --- a/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json +++ b/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json @@ -437,7 +437,8 @@ "entity_id": "1509ef40-d1ba-440d-b298-16b7c173dcd4", "replica_type": "anvil_sequencingactivity", "hub_ids": [ - "15b76f9c-6b46-433f-851d-34e89f1b9ba6" + "15b76f9c-6b46-433f-851d-34e89f1b9ba6", + "2370f948-2783-4eb6-afea-e022897f4dcf" ], "contents": { "activity_type": "Sequencing", @@ -871,7 +872,8 @@ "entity_id": "15b76f9c-6b46-433f-851d-34e89f1b9ba6", "replica_type": "anvil_file", "hub_ids": [ - "15b76f9c-6b46-433f-851d-34e89f1b9ba6" + "15b76f9c-6b46-433f-851d-34e89f1b9ba6", + "2370f948-2783-4eb6-afea-e022897f4dcf" ], "contents": { "data_modality": [], @@ -904,6 +906,7 @@ "replica_type": "anvil_diagnosis", "hub_ids": [ "15b76f9c-6b46-433f-851d-34e89f1b9ba6", + "2370f948-2783-4eb6-afea-e022897f4dcf", "3b17377b-16b1-431c-9967-e5d01fc5923f" ], "contents": { @@ -1477,7 +1480,9 @@ "_source": { "entity_id": "2370f948-2783-4eb6-afea-e022897f4dcf", "replica_type": "anvil_dataset", - "hub_ids": [], + "hub_ids": [ + "2370f948-2783-4eb6-afea-e022897f4dcf" + ], "contents": { "consent_group": [ "DS-BDIS" @@ -1748,6 +1753,7 @@ }, "replica_type": "anvil_file", "hub_ids": [ + "2370f948-2783-4eb6-afea-e022897f4dcf", "3b17377b-16b1-431c-9967-e5d01fc5923f" ] } @@ -2190,6 +2196,7 @@ "entity_id": "816e364e-1193-4e5b-a91a-14e4b009157c", "replica_type": "anvil_sequencingactivity", "hub_ids": [ + "2370f948-2783-4eb6-afea-e022897f4dcf", "3b17377b-16b1-431c-9967-e5d01fc5923f" ], "contents": { @@ -2939,6 +2946,7 @@ "replica_type": "anvil_biosample", "hub_ids": [ "15b76f9c-6b46-433f-851d-34e89f1b9ba6", + "2370f948-2783-4eb6-afea-e022897f4dcf", "3b17377b-16b1-431c-9967-e5d01fc5923f" ], "contents": { @@ -3519,6 +3527,7 @@ "replica_type": "anvil_diagnosis", "hub_ids": [ "15b76f9c-6b46-433f-851d-34e89f1b9ba6", + "2370f948-2783-4eb6-afea-e022897f4dcf", "3b17377b-16b1-431c-9967-e5d01fc5923f" ], "contents": { @@ -4107,6 +4116,7 @@ "replica_type": "anvil_donor", "hub_ids": [ "15b76f9c-6b46-433f-851d-34e89f1b9ba6", + 
"2370f948-2783-4eb6-afea-e022897f4dcf", "3b17377b-16b1-431c-9967-e5d01fc5923f" ] } diff --git a/test/indexer/data/aaa96233-bf27-44c7-82df-b4dc15ad4d9d.2018-11-02T11:33:44.698028Z.results.json b/test/indexer/data/aaa96233-bf27-44c7-82df-b4dc15ad4d9d.2018-11-02T11:33:44.698028Z.results.json index d7b65274b4..24aab5fdc7 100644 --- a/test/indexer/data/aaa96233-bf27-44c7-82df-b4dc15ad4d9d.2018-11-02T11:33:44.698028Z.results.json +++ b/test/indexer/data/aaa96233-bf27-44c7-82df-b4dc15ad4d9d.2018-11-02T11:33:44.698028Z.results.json @@ -70,7 +70,8 @@ "entity_id": "aaa96233-bf27-44c7-82df-b4dc15ad4d9d", "hub_ids": [ "0c5ac7c0-817e-40d4-b1b1-34c3d5cfecdb", - "70d1af4a-82c8-478a-8960-e9028b3616ca" + "70d1af4a-82c8-478a-8960-e9028b3616ca", + "e8642221-4c2c-4fd7-b926-a68bce363c88" ], "replica_type": "links" } @@ -3485,7 +3486,8 @@ "entity_id": "0c5ac7c0-817e-40d4-b1b1-34c3d5cfecdb", "replica_type": "sequence_file", "hub_ids": [ - "0c5ac7c0-817e-40d4-b1b1-34c3d5cfecdb" + "0c5ac7c0-817e-40d4-b1b1-34c3d5cfecdb", + "e8642221-4c2c-4fd7-b926-a68bce363c88" ] }, "_type": "_doc" @@ -3524,7 +3526,8 @@ "replica_type": "cell_suspension", "hub_ids": [ "0c5ac7c0-817e-40d4-b1b1-34c3d5cfecdb", - "70d1af4a-82c8-478a-8960-e9028b3616ca" + "70d1af4a-82c8-478a-8960-e9028b3616ca", + "e8642221-4c2c-4fd7-b926-a68bce363c88" ] }, "_type": "_doc" @@ -3555,7 +3558,8 @@ "entity_id": "70d1af4a-82c8-478a-8960-e9028b3616ca", "replica_type": "sequence_file", "hub_ids": [ - "70d1af4a-82c8-478a-8960-e9028b3616ca" + "70d1af4a-82c8-478a-8960-e9028b3616ca", + "e8642221-4c2c-4fd7-b926-a68bce363c88" ] }, "_type": "_doc" @@ -3608,7 +3612,8 @@ "replica_type": "specimen_from_organism", "hub_ids": [ "0c5ac7c0-817e-40d4-b1b1-34c3d5cfecdb", - "70d1af4a-82c8-478a-8960-e9028b3616ca" + "70d1af4a-82c8-478a-8960-e9028b3616ca", + "e8642221-4c2c-4fd7-b926-a68bce363c88" ] }, "_type": "_doc" @@ -3688,7 +3693,9 @@ }, "entity_id": "e8642221-4c2c-4fd7-b926-a68bce363c88", "replica_type": "project", - "hub_ids": [] + "hub_ids": [ + "e8642221-4c2c-4fd7-b926-a68bce363c88" + ] }, "_type": "_doc" }, @@ -3750,7 +3757,8 @@ "entity_id": "7b07b9d0-cc0e-4098-9f64-f4a569f7d746", "hub_ids": [ "0c5ac7c0-817e-40d4-b1b1-34c3d5cfecdb", - "70d1af4a-82c8-478a-8960-e9028b3616ca" + "70d1af4a-82c8-478a-8960-e9028b3616ca", + "e8642221-4c2c-4fd7-b926-a68bce363c88" ], "replica_type": "donor_organism" }, @@ -3793,7 +3801,8 @@ "entity_id": "9c32cf70-3ed7-4720-badc-5ee71e8a38af", "hub_ids": [ "0c5ac7c0-817e-40d4-b1b1-34c3d5cfecdb", - "70d1af4a-82c8-478a-8960-e9028b3616ca" + "70d1af4a-82c8-478a-8960-e9028b3616ca", + "e8642221-4c2c-4fd7-b926-a68bce363c88" ], "replica_type": "library_preparation_protocol" }, @@ -3830,7 +3839,8 @@ "entity_id": "61e629ed-0135-4492-ac8a-5c4ab3ccca8a", "hub_ids": [ "0c5ac7c0-817e-40d4-b1b1-34c3d5cfecdb", - "70d1af4a-82c8-478a-8960-e9028b3616ca" + "70d1af4a-82c8-478a-8960-e9028b3616ca", + "e8642221-4c2c-4fd7-b926-a68bce363c88" ], "replica_type": "sequencing_protocol" }, @@ -3856,7 +3866,8 @@ "entity_id": "771ddaf6-3a4f-4314-97fe-6294ff8e25a4", "hub_ids": [ "0c5ac7c0-817e-40d4-b1b1-34c3d5cfecdb", - "70d1af4a-82c8-478a-8960-e9028b3616ca" + "70d1af4a-82c8-478a-8960-e9028b3616ca", + "e8642221-4c2c-4fd7-b926-a68bce363c88" ], "replica_type": "process" }, diff --git a/test/indexer/test_anvil.py b/test/indexer/test_anvil.py index a01b0d6b13..1cbc52ba0e 100644 --- a/test/indexer/test_anvil.py +++ b/test/indexer/test_anvil.py @@ -264,6 +264,11 @@ def test_dataset_description(self): def test_orphans(self): bundle = self._index_canned_bundle(self.replica_bundle()) assert 
isinstance(bundle, TDRAnvilBundle) + dataset_entity_id = one( + ref.entity_id + for ref in bundle.orphans + if ref.entity_type == 'anvil_dataset' + ) expected = bundle.orphans if config.enable_replicas else {} actual = {} hits = self._get_all_hits() @@ -271,7 +276,7 @@ def test_orphans(self): qualifier, doc_type = self._parse_index_name(hit) self.assertEqual(DocumentType.replica, doc_type) source = hit['_source'] - self.assertEqual(source['hub_ids'], []) + self.assertEqual(source['hub_ids'], [dataset_entity_id]) ref = EntityReference(entity_type=source['replica_type'], entity_id=source['entity_id']) actual[ref] = source['contents'] From 5909b1f8d33a0bc9030774d2bac56ab5b729e90b Mon Sep 17 00:00:00 2001 From: Noa Aviel Dove Date: Wed, 9 Oct 2024 18:02:09 -0700 Subject: [PATCH 23/32] Refactor AnVIL JSONL manifest test --- test/service/test_manifest.py | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/test/service/test_manifest.py b/test/service/test_manifest.py index 5773e10442..8780b3e304 100644 --- a/test/service/test_manifest.py +++ b/test/service/test_manifest.py @@ -28,6 +28,7 @@ TemporaryDirectory, ) from typing import ( + Iterable, Optional, ) from unittest.mock import ( @@ -2087,18 +2088,24 @@ def test_compact_manifest(self): self._assert_tsv(expected, response) def test_verbatim_jsonl_manifest(self): + linked_entities_by_hash = {} + + def unique_rows(entities: Mapping[EntityReference, JSON] + ) -> Iterable[tuple[str, JSON]]: + for ref, contents in entities.items(): + yield json_hash(contents).digest(), { + 'type': ref.entity_type, + 'value': contents + } + + for bundle_fqid in self.bundles(): + bundle = self._load_canned_bundle(bundle_fqid) + linked_entities_by_hash.update(unique_rows(bundle.entities)) + response = self._get_manifest(ManifestFormat.verbatim_jsonl, filters={}) self.assertEqual(200, response.status_code) - expected = { - # Consolidate entities with the same replica (i.e. datasets) - json_hash(entity).digest(): { - 'type': entity_ref.entity_type, - 'value': entity, - } - for bundle in self.bundles() - for entity_ref, entity in self._load_canned_bundle(bundle).entities.items() - }.values() - self._assert_jsonl(list(expected), response) + expected_rows = list(linked_entities_by_hash.values()) + self._assert_jsonl(expected_rows, response) def test_verbatim_pfb_manifest(self): response = self._get_manifest(ManifestFormat.verbatim_pfb, filters={}) From 1c64ffe18583985d00f6b333b85ebbd4360d34fa Mon Sep 17 00:00:00 2001 From: Noa Aviel Dove Date: Thu, 31 Oct 2024 15:59:55 -0700 Subject: [PATCH 24/32] Eliminate lazy iterator in AnVIL verbatim manifest --- src/azul/plugins/__init__.py | 15 ++++++--------- src/azul/plugins/metadata/anvil/__init__.py | 6 +++--- src/azul/service/manifest_service.py | 4 ++-- 3 files changed, 11 insertions(+), 14 deletions(-) diff --git a/src/azul/plugins/__init__.py b/src/azul/plugins/__init__.py index c7dffb65e6..5addbe3040 100644 --- a/src/azul/plugins/__init__.py +++ b/src/azul/plugins/__init__.py @@ -433,8 +433,8 @@ def manifest_config(self) -> ManifestConfig: raise NotImplementedError def verbatim_pfb_schema(self, - replicas: Iterable[JSON] - ) -> tuple[Iterable[JSON], Sequence[str], JSON]: + replicas: list[JSON] + ) -> tuple[Sequence[str], JSON]: """ Generate a PFB schema for the verbatim manifest. 
The default, metadata-agnostic implementation loads all replica documents into memory @@ -445,17 +445,14 @@ def verbatim_pfb_schema(self, :param replicas: The replica documents to be described by the PFB schema - :return: a triple of - 1. the same set of replicas passed to this method - 2. the set of entity types defined by the PFB schema - 3. a PFB schema describing the provided replicas + :return: a tuple of + 1. the set of entity types defined by the PFB schema + 2. a PFB schema describing the provided replicas """ from azul.service import ( avro_pfb, ) - replicas = list(replicas) - replica_types, pfb_schema = avro_pfb.pfb_schema_from_replicas(replicas) - return replicas, replica_types, pfb_schema + return avro_pfb.pfb_schema_from_replicas(replicas) @abstractmethod def document_slice(self, entity_type: str) -> DocumentSlice | None: diff --git a/src/azul/plugins/metadata/anvil/__init__.py b/src/azul/plugins/metadata/anvil/__init__.py index b3fe445326..fc992d1a61 100644 --- a/src/azul/plugins/metadata/anvil/__init__.py +++ b/src/azul/plugins/metadata/anvil/__init__.py @@ -327,8 +327,8 @@ def recurse(mapping: MetadataPlugin._FieldMapping, path: FieldPath): return result def verbatim_pfb_schema(self, - replicas: Iterable[JSON] - ) -> tuple[Iterable[JSON], Sequence[str], JSON]: + replicas: list[JSON] + ) -> tuple[Sequence[str], JSON]: entity_schemas = [] entity_types = [] for table_schema in sorted(anvil_schema['tables'], key=itemgetter('name')): @@ -369,7 +369,7 @@ def verbatim_pfb_schema(self, 'type': 'record', 'fields': field_schemas }) - return replicas, entity_types, avro_pfb_schema(entity_schemas) + return entity_types, avro_pfb_schema(entity_schemas) def _pfb_schema_from_anvil_column(self, *, diff --git a/src/azul/service/manifest_service.py b/src/azul/service/manifest_service.py index 04cd7997b5..24966ed6a7 100644 --- a/src/azul/service/manifest_service.py +++ b/src/azul/service/manifest_service.py @@ -2100,8 +2100,8 @@ def format(cls) -> ManifestFormat: def create_file(self) -> tuple[str, Optional[str]]: plugin = self.service.metadata_plugin(self.catalog) - replicas = self._all_replicas() - replicas, replica_types, pfb_schema = plugin.verbatim_pfb_schema(replicas) + replicas = list(self._all_replicas()) + replica_types, pfb_schema = plugin.verbatim_pfb_schema(replicas) pfb_metadata_entity = avro_pfb.pfb_metadata_entity(replica_types, links=False) def pfb_entities(): From 0688af09c6c8fc9f645dc2319986d747ab2a40de Mon Sep 17 00:00:00 2001 From: Noa Aviel Dove Date: Thu, 31 Oct 2024 16:19:25 -0700 Subject: [PATCH 25/32] Refactor replica schema composition --- src/azul/plugins/__init__.py | 2 +- src/azul/plugins/metadata/anvil/__init__.py | 16 +++++++--------- src/azul/service/avro_pfb.py | 10 +++------- src/azul/service/manifest_service.py | 5 ++++- 4 files changed, 15 insertions(+), 18 deletions(-) diff --git a/src/azul/plugins/__init__.py b/src/azul/plugins/__init__.py index 5addbe3040..21c937f387 100644 --- a/src/azul/plugins/__init__.py +++ b/src/azul/plugins/__init__.py @@ -434,7 +434,7 @@ def manifest_config(self) -> ManifestConfig: def verbatim_pfb_schema(self, replicas: list[JSON] - ) -> tuple[Sequence[str], JSON]: + ) -> list[JSON]: """ Generate a PFB schema for the verbatim manifest. 
The default, metadata-agnostic implementation loads all replica documents into memory diff --git a/src/azul/plugins/metadata/anvil/__init__.py b/src/azul/plugins/metadata/anvil/__init__.py index fc992d1a61..921bd0160b 100644 --- a/src/azul/plugins/metadata/anvil/__init__.py +++ b/src/azul/plugins/metadata/anvil/__init__.py @@ -53,9 +53,6 @@ AnvilSearchResponseStage, AnvilSummaryResponseStage, ) -from azul.service.avro_pfb import ( - avro_pfb_schema, -) from azul.service.manifest_service import ( ManifestFormat, ) @@ -328,15 +325,16 @@ def recurse(mapping: MetadataPlugin._FieldMapping, path: FieldPath): def verbatim_pfb_schema(self, replicas: list[JSON] - ) -> tuple[Sequence[str], JSON]: + ) -> list[JSON]: + table_schemas_by_name = { + schema['name']: schema + for schema in anvil_schema['tables'] + } entity_schemas = [] - entity_types = [] - for table_schema in sorted(anvil_schema['tables'], key=itemgetter('name')): - table_name = table_schema['name'] + for table_name, table_schema in table_schemas_by_name.items(): # FIXME: Improve handling of DUOS replicas # https://github.com/DataBiosphere/azul/issues/6139 is_duos_type = table_name == 'anvil_dataset' - entity_types.append(table_name) field_schemas = [ self._pfb_schema_from_anvil_column(table_name=table_name, column_name='datarepo_row_id', @@ -369,7 +367,7 @@ def verbatim_pfb_schema(self, 'type': 'record', 'fields': field_schemas }) - return entity_types, avro_pfb_schema(entity_schemas) + return entity_schemas def _pfb_schema_from_anvil_column(self, *, diff --git a/src/azul/service/avro_pfb.py b/src/azul/service/avro_pfb.py index d88be244de..eb1f3b7fbe 100644 --- a/src/azul/service/avro_pfb.py +++ b/src/azul/service/avro_pfb.py @@ -17,7 +17,6 @@ ClassVar, MutableSet, Self, - Sequence, ) from uuid import ( UUID, @@ -315,18 +314,15 @@ def pfb_schema_from_field_types(field_types: FieldTypes) -> JSON: return avro_pfb_schema(entity_schemas) -def pfb_schema_from_replicas(replicas: Iterable[JSON] - ) -> tuple[Sequence[str], JSON]: - schemas_by_replica_type = {} +def pfb_schema_from_replicas(replicas: Iterable[JSON]) -> list[JSON]: + schemas_by_replica_type: dict[str, MutableJSON] = {} for replica in replicas: replica_type, replica_contents = replica['replica_type'], replica['contents'] _update_replica_schema(schema=schemas_by_replica_type, path=(replica_type,), key=replica_type, value=replica_contents) - schemas_by_replica_type = sorted(schemas_by_replica_type.items()) - keys, values = zip(*schemas_by_replica_type) - return keys, avro_pfb_schema(values) + return list(schemas_by_replica_type.values()) def avro_pfb_schema(azul_avro_schema: Iterable[JSON]) -> JSON: diff --git a/src/azul/service/manifest_service.py b/src/azul/service/manifest_service.py index 24966ed6a7..596c8f4876 100644 --- a/src/azul/service/manifest_service.py +++ b/src/azul/service/manifest_service.py @@ -2101,7 +2101,10 @@ def format(cls) -> ManifestFormat: def create_file(self) -> tuple[str, Optional[str]]: plugin = self.service.metadata_plugin(self.catalog) replicas = list(self._all_replicas()) - replica_types, pfb_schema = plugin.verbatim_pfb_schema(replicas) + replica_schemas = plugin.verbatim_pfb_schema(replicas) + replica_schemas.sort(key=itemgetter('name')) + replica_types = [s['name'] for s in replica_schemas] + pfb_schema = avro_pfb.avro_pfb_schema(replica_schemas) pfb_metadata_entity = avro_pfb.pfb_metadata_entity(replica_types, links=False) def pfb_entities(): From 45b7bf9f3a59c1945e868c5869576226fa7fdc43 Mon Sep 17 00:00:00 2001 From: Noa Aviel Dove Date: Wed, 9 Oct 
2024 18:03:11 -0700 Subject: [PATCH 26/32] Include orphans in manifest when filtering by only project/dataset (#6619) --- src/azul/indexer/document_service.py | 3 +++ src/azul/plugins/__init__.py | 1 + src/azul/plugins/metadata/anvil/__init__.py | 12 +++++++++-- .../plugins/metadata/anvil/service/filter.py | 2 +- src/azul/plugins/metadata/hca/__init__.py | 3 ++- .../plugins/metadata/hca/service/filter.py | 2 +- src/azul/service/manifest_service.py | 12 +++++++++-- src/azul/service/repository_service.py | 3 +++ test/service/test_filters.py | 3 ++- test/service/test_manifest.py | 21 ++++++++++++++----- test/service/test_request_builder.py | 3 ++- 11 files changed, 51 insertions(+), 14 deletions(-) diff --git a/src/azul/indexer/document_service.py b/src/azul/indexer/document_service.py index 3f0e7e0ef5..f290cf6a31 100644 --- a/src/azul/indexer/document_service.py +++ b/src/azul/indexer/document_service.py @@ -122,3 +122,6 @@ def translate_fields(self, self.field_types(catalog), forward=forward, allowed_paths=allowed_paths) + + def always_limit_access(self) -> bool: + return True diff --git a/src/azul/plugins/__init__.py b/src/azul/plugins/__init__.py index 21c937f387..8e28e8dfd0 100644 --- a/src/azul/plugins/__init__.py +++ b/src/azul/plugins/__init__.py @@ -132,6 +132,7 @@ class SpecialFields: source_spec: FieldName bundle_uuid: FieldName bundle_version: FieldName + implicit_hub_id: FieldName class ManifestFormat(Enum): diff --git a/src/azul/plugins/metadata/anvil/__init__.py b/src/azul/plugins/metadata/anvil/__init__.py index 921bd0160b..ab4c55441c 100644 --- a/src/azul/plugins/metadata/anvil/__init__.py +++ b/src/azul/plugins/metadata/anvil/__init__.py @@ -243,7 +243,8 @@ def special_fields(self) -> SpecialFields: return SpecialFields(source_id='source_id', source_spec='source_spec', bundle_uuid='bundle_uuid', - bundle_version='bundle_version') + bundle_version='bundle_version', + implicit_hub_id='datasets.dataset_id') @property def implicit_hub_type(self) -> str: @@ -330,7 +331,14 @@ def verbatim_pfb_schema(self, schema['name']: schema for schema in anvil_schema['tables'] } - entity_schemas = [] + non_schema_replicas = [ + r for r in replicas + if r['replica_type'] not in table_schemas_by_name + ] + # For tables not described by the AnVIL schema, fall back to building + # their PFB schema dynamically from the shapes of the replicas + entity_schemas = super().verbatim_pfb_schema(non_schema_replicas) + # For the rest, use the AnVIL schema as the basis of the PFB schema for table_name, table_schema in table_schemas_by_name.items(): # FIXME: Improve handling of DUOS replicas # https://github.com/DataBiosphere/azul/issues/6139 diff --git a/src/azul/plugins/metadata/anvil/service/filter.py b/src/azul/plugins/metadata/anvil/service/filter.py index ac49c2c7fb..b0274054f4 100644 --- a/src/azul/plugins/metadata/anvil/service/filter.py +++ b/src/azul/plugins/metadata/anvil/service/filter.py @@ -6,4 +6,4 @@ class AnvilFilterStage(FilterStage): def _limit_access(self) -> bool: - return self.entity_type != 'datasets' + return self.service.always_limit_access() or self.entity_type != 'datasets' diff --git a/src/azul/plugins/metadata/hca/__init__.py b/src/azul/plugins/metadata/hca/__init__.py index 0286a2b489..46a8b04988 100644 --- a/src/azul/plugins/metadata/hca/__init__.py +++ b/src/azul/plugins/metadata/hca/__init__.py @@ -282,7 +282,8 @@ def special_fields(self) -> SpecialFields: return SpecialFields(source_id='sourceId', source_spec='sourceSpec', bundle_uuid='bundleUuid', - 
bundle_version='bundleVersion') + bundle_version='bundleVersion', + implicit_hub_id='projectId') @property def implicit_hub_type(self) -> str: diff --git a/src/azul/plugins/metadata/hca/service/filter.py b/src/azul/plugins/metadata/hca/service/filter.py index 86374979b4..2581b707c2 100644 --- a/src/azul/plugins/metadata/hca/service/filter.py +++ b/src/azul/plugins/metadata/hca/service/filter.py @@ -6,4 +6,4 @@ class HCAFilterStage(FilterStage): def _limit_access(self) -> bool: - return self.entity_type != 'projects' + return self.service.always_limit_access() or self.entity_type != 'projects' diff --git a/src/azul/service/manifest_service.py b/src/azul/service/manifest_service.py index 596c8f4876..f695a7659f 100644 --- a/src/azul/service/manifest_service.py +++ b/src/azul/service/manifest_service.py @@ -1985,7 +1985,7 @@ class VerbatimManifestGenerator(FileBasedManifestGenerator, metaclass=ABCMeta): @property def entity_type(self) -> str: - return 'files' + return self.implicit_hub_type if self.include_orphans else 'files' @property def included_fields(self) -> list[FieldPath]: @@ -2001,6 +2001,11 @@ def included_fields(self) -> list[FieldPath]: def implicit_hub_type(self) -> str: return self.service.metadata_plugin(self.catalog).implicit_hub_type + @property + def include_orphans(self) -> bool: + special_fields = self.service.metadata_plugin(self.catalog).special_fields + return self.filters.explicit.keys() == {special_fields.implicit_hub_id} + @attrs.frozen(kw_only=True) class ReplicaKeys: """ @@ -2019,8 +2024,11 @@ def _replica_keys(self) -> Iterable[ReplicaKeys]: hub_type = self.implicit_hub_type request = self._create_request() for hit in request.scan(): + replica_id = one(hit['contents'][hub_type])['document_id'] + if self.entity_type != hub_type: + replica_id = one(replica_id) yield self.ReplicaKeys(hub_id=hit['entity_id'], - replica_id=one(one(hit['contents'][hub_type])['document_id'])) + replica_id=replica_id) def _all_replicas(self) -> Iterable[JSON]: emitted_replica_ids = set() diff --git a/src/azul/service/repository_service.py b/src/azul/service/repository_service.py index 0ea5889f3a..43d000dfac 100644 --- a/src/azul/service/repository_service.py +++ b/src/azul/service/repository_service.py @@ -374,3 +374,6 @@ def _hit_to_doc(hit: Hit) -> JSON: if file_version is not None: assert file_version == file['version'] return file + + def always_limit_access(self) -> bool: + return False diff --git a/test/service/test_filters.py b/test/service/test_filters.py index 8eaf55126a..b782a5671a 100644 --- a/test/service/test_filters.py +++ b/test/service/test_filters.py @@ -33,7 +33,8 @@ class TestFilterReification(AzulTestCase): source_id='sourceId', source_spec=MagicMock(), bundle_uuid=MagicMock(), - bundle_version=MagicMock() + bundle_version=MagicMock(), + implicit_hub_id=MagicMock() ) @property diff --git a/test/service/test_manifest.py b/test/service/test_manifest.py index 8780b3e304..5ae6063d83 100644 --- a/test/service/test_manifest.py +++ b/test/service/test_manifest.py @@ -2089,6 +2089,7 @@ def test_compact_manifest(self): def test_verbatim_jsonl_manifest(self): linked_entities_by_hash = {} + all_entities_by_hash = {} def unique_rows(entities: Mapping[EntityReference, JSON] ) -> Iterable[tuple[str, JSON]]: @@ -2101,11 +2102,21 @@ def unique_rows(entities: Mapping[EntityReference, JSON] for bundle_fqid in self.bundles(): bundle = self._load_canned_bundle(bundle_fqid) linked_entities_by_hash.update(unique_rows(bundle.entities)) - - response = 
self._get_manifest(ManifestFormat.verbatim_jsonl, filters={}) - self.assertEqual(200, response.status_code) - expected_rows = list(linked_entities_by_hash.values()) - self._assert_jsonl(expected_rows, response) + all_entities_by_hash.update(unique_rows(bundle.orphans)) + all_entities_by_hash.update(linked_entities_by_hash) + + for filters, expect_orphans in [ + ({}, False), + ({'datasets.title': {'is': ['ANVIL_CMG_UWASH_DS_BDIS']}}, False), + # Orphans should be included only when filtering by dataset ID + ({'datasets.dataset_id': {'is': ['52ee7665-7033-63f2-a8d9-ce8e32666739']}}, True) + ]: + with self.subTest(filters=filters): + response = self._get_manifest(ManifestFormat.verbatim_jsonl, filters=filters) + self.assertEqual(200, response.status_code) + expected_rows = list((all_entities_by_hash if expect_orphans + else linked_entities_by_hash).values()) + self._assert_jsonl(expected_rows, response) def test_verbatim_pfb_manifest(self): response = self._get_manifest(ManifestFormat.verbatim_pfb, filters={}) diff --git a/test/service/test_request_builder.py b/test/service/test_request_builder.py index 218f964313..5e89aab8e6 100644 --- a/test/service/test_request_builder.py +++ b/test/service/test_request_builder.py @@ -65,7 +65,8 @@ def special_fields(self) -> SpecialFields: return SpecialFields(source_id='sourceId', source_spec='sourceSpec', bundle_uuid='bundleUuid', - bundle_version='bundleVersion') + bundle_version='bundleVersion', + implicit_hub_id='projectId') @property def _field_mapping(self) -> MetadataPlugin._FieldMapping: From 039dc984614ea15d3935634c5c468884f1c69f15 Mon Sep 17 00:00:00 2001 From: Hannes Schmidt Date: Sat, 9 Nov 2024 11:39:37 -0800 Subject: [PATCH 27/32] Document SpecialFields and friends --- src/azul/plugins/__init__.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/azul/plugins/__init__.py b/src/azul/plugins/__init__.py index 8e28e8dfd0..1f9c19da54 100644 --- a/src/azul/plugins/__init__.py +++ b/src/azul/plugins/__init__.py @@ -127,6 +127,19 @@ def order(self) -> str: @attr.s(auto_attribs=True, frozen=True, kw_only=True) class SpecialFields: + """ + Azul defines a number of fields in each /index/{entity_type} response that + are synthetic (not directly taken from the metadata) and/or are used + internally. Their naming is inconsistent between metadata plugin + implementations. This class encapsulates the naming of these fields so that + we don't need to litter the source with strings literals and conditionals. + + It is an incomplete abstraction in that it does not express the name of the + inner entity the field is a property of in the /index/{entity_type} + response. In that way, the values of the attributes of instances of this + class are more akin to a facet name, rather than a field name. However, not + every field represented here is actually a facet. + """ accessible: ClassVar[FieldName] = 'accessible' source_id: FieldName source_spec: FieldName @@ -410,6 +423,9 @@ def _field_mapping(self) -> _FieldMapping: @property @abstractmethod def special_fields(self) -> SpecialFields: + """ + See :py:class:`SpecialFields`. 
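For orientation, a sketch of the concrete values the two plugins in this series assign, and of the one consumer of the new `implicit_hub_id` field (paraphrased from the hunks above, not code from the patch):

    # AnVIL plugin: implicit_hub_id == 'datasets.dataset_id'  (bundle_uuid == 'bundle_uuid')
    # HCA plugin:   implicit_hub_id == 'projectId'            (bundle_uuid == 'bundleUuid')
    #
    # The verbatim manifest generator includes orphans only when the explicit
    # filters name nothing but the implicit hub field, roughly:
    include_orphans = filters.explicit.keys() == {plugin.special_fields.implicit_hub_id}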
+ """ raise NotImplementedError @property From fb648bab7e5f60d2cee27d507a1f8681a0a694ca Mon Sep 17 00:00:00 2001 From: Hannes Schmidt Date: Sat, 9 Nov 2024 11:40:56 -0800 Subject: [PATCH 28/32] Refactor conditional weakening of access enforcement --- src/azul/indexer/document_service.py | 11 ++++++++--- src/azul/plugins/metadata/anvil/service/filter.py | 3 ++- src/azul/plugins/metadata/hca/service/filter.py | 3 ++- src/azul/service/elasticsearch_service.py | 8 ++++++-- src/azul/service/repository_service.py | 1 + 5 files changed, 19 insertions(+), 7 deletions(-) diff --git a/src/azul/indexer/document_service.py b/src/azul/indexer/document_service.py index f290cf6a31..0e9bb11dec 100644 --- a/src/azul/indexer/document_service.py +++ b/src/azul/indexer/document_service.py @@ -49,6 +49,14 @@ def metadata_plugin(self, catalog: CatalogName) -> MetadataPlugin: def aggregate_class(self, catalog: CatalogName) -> Type[Aggregate]: return self.metadata_plugin(catalog).aggregate_class() + @property + def always_limit_access(self) -> bool: + """ + True if access restrictions are enforced unconditionally. False, if the + filter stage is allowed to weaken them, e.g., based on the entity type. + """ + return True + def transformer_types(self, catalog: CatalogName ) -> Iterable[Type[Transformer]]: @@ -122,6 +130,3 @@ def translate_fields(self, self.field_types(catalog), forward=forward, allowed_paths=allowed_paths) - - def always_limit_access(self) -> bool: - return True diff --git a/src/azul/plugins/metadata/anvil/service/filter.py b/src/azul/plugins/metadata/anvil/service/filter.py index b0274054f4..0b21e2c62b 100644 --- a/src/azul/plugins/metadata/anvil/service/filter.py +++ b/src/azul/plugins/metadata/anvil/service/filter.py @@ -5,5 +5,6 @@ class AnvilFilterStage(FilterStage): + @property def _limit_access(self) -> bool: - return self.service.always_limit_access() or self.entity_type != 'datasets' + return self.entity_type != 'datasets' diff --git a/src/azul/plugins/metadata/hca/service/filter.py b/src/azul/plugins/metadata/hca/service/filter.py index 2581b707c2..c2ba76ab88 100644 --- a/src/azul/plugins/metadata/hca/service/filter.py +++ b/src/azul/plugins/metadata/hca/service/filter.py @@ -5,5 +5,6 @@ class HCAFilterStage(FilterStage): + @property def _limit_access(self) -> bool: - return self.service.always_limit_access() or self.entity_type != 'projects' + return self.entity_type != 'projects' diff --git a/src/azul/service/elasticsearch_service.py b/src/azul/service/elasticsearch_service.py index 2b6b81631c..177a16c87b 100644 --- a/src/azul/service/elasticsearch_service.py +++ b/src/azul/service/elasticsearch_service.py @@ -192,14 +192,18 @@ def process_response(self, response: Response) -> Response: @cached_property def prepared_filters(self) -> TranslatedFilters: - filters_json = self.filters.reify(self.plugin, limit_access=self._limit_access()) + limit_access = self.service.always_limit_access or self._limit_access + filters_json = self.filters.reify(self.plugin, limit_access=limit_access) return self._translate_filters(filters_json) + @property @abstractmethod def _limit_access(self) -> bool: """ Whether to enforce the managed access controls during filter - reification. + reification, provided that the service allows such conditional + enforcement of access. If it doesn't, the return value should be + ignored, and access must be enforced unconditionally. 
""" raise NotImplementedError diff --git a/src/azul/service/repository_service.py b/src/azul/service/repository_service.py index 43d000dfac..e3257278b6 100644 --- a/src/azul/service/repository_service.py +++ b/src/azul/service/repository_service.py @@ -375,5 +375,6 @@ def _hit_to_doc(hit: Hit) -> JSON: assert file_version == file['version'] return file + @property def always_limit_access(self) -> bool: return False From dfb41a228ef60b0f3d8ceb968dc94808edfa542b Mon Sep 17 00:00:00 2001 From: Hannes Schmidt Date: Sat, 9 Nov 2024 12:36:00 -0800 Subject: [PATCH 29/32] Refactor verbatim JSONL unit test --- test/service/test_manifest.py | 45 +++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/test/service/test_manifest.py b/test/service/test_manifest.py index 5ae6063d83..dac94047db 100644 --- a/test/service/test_manifest.py +++ b/test/service/test_manifest.py @@ -28,7 +28,6 @@ TemporaryDirectory, ) from typing import ( - Iterable, Optional, ) from unittest.mock import ( @@ -2088,35 +2087,39 @@ def test_compact_manifest(self): self._assert_tsv(expected, response) def test_verbatim_jsonl_manifest(self): - linked_entities_by_hash = {} - all_entities_by_hash = {} + all_entities, linked_entities = self._canned_entities() + cases = [ + ({}, False), + ({'datasets.title': {'is': ['ANVIL_CMG_UWASH_DS_BDIS']}}, False), + # Orphans should be included only when filtering by dataset ID + ({'datasets.dataset_id': {'is': ['52ee7665-7033-63f2-a8d9-ce8e32666739']}}, True) + ] + for filters, expect_orphans in cases: + with self.subTest(filters=filters): + response = self._get_manifest(ManifestFormat.verbatim_jsonl, filters=filters) + self.assertEqual(200, response.status_code) + expected_rows = list(all_entities if expect_orphans else linked_entities) + self._assert_jsonl(expected_rows, response) + + def _canned_entities(self): - def unique_rows(entities: Mapping[EntityReference, JSON] - ) -> Iterable[tuple[str, JSON]]: - for ref, contents in entities.items(): - yield json_hash(contents).digest(), { + def hash_entities(entities: dict[EntityReference, JSON]) -> dict[str, JSON]: + return { + json_hash(contents).digest(): { 'type': ref.entity_type, 'value': contents } + for ref, contents in entities.items() + } + linked_entities_by_hash, all_entities_by_hash = {}, {} for bundle_fqid in self.bundles(): bundle = self._load_canned_bundle(bundle_fqid) - linked_entities_by_hash.update(unique_rows(bundle.entities)) - all_entities_by_hash.update(unique_rows(bundle.orphans)) + linked_entities_by_hash.update(hash_entities(bundle.entities)) + all_entities_by_hash.update(hash_entities(bundle.orphans)) all_entities_by_hash.update(linked_entities_by_hash) - for filters, expect_orphans in [ - ({}, False), - ({'datasets.title': {'is': ['ANVIL_CMG_UWASH_DS_BDIS']}}, False), - # Orphans should be included only when filtering by dataset ID - ({'datasets.dataset_id': {'is': ['52ee7665-7033-63f2-a8d9-ce8e32666739']}}, True) - ]: - with self.subTest(filters=filters): - response = self._get_manifest(ManifestFormat.verbatim_jsonl, filters=filters) - self.assertEqual(200, response.status_code) - expected_rows = list((all_entities_by_hash if expect_orphans - else linked_entities_by_hash).values()) - self._assert_jsonl(expected_rows, response) + return all_entities_by_hash.values(), linked_entities_by_hash.values() def test_verbatim_pfb_manifest(self): response = self._get_manifest(ManifestFormat.verbatim_pfb, filters={}) From df021287f35c52e0dc62c30ee77740dc66799314 Mon Sep 17 00:00:00 2001 
From: Daniel Sotirhos Date: Tue, 12 Nov 2024 10:11:35 -0800 Subject: [PATCH 30/32] Fix: Inspector findings missing from report (#6692) --- .github/PULL_REQUEST_TEMPLATE/upgrade.md | 2 +- .github/pull_request_template.md.template.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/PULL_REQUEST_TEMPLATE/upgrade.md b/.github/PULL_REQUEST_TEMPLATE/upgrade.md index 117b82d285..136e5b8373 100644 --- a/.github/PULL_REQUEST_TEMPLATE/upgrade.md +++ b/.github/PULL_REQUEST_TEMPLATE/upgrade.md @@ -107,7 +107,7 @@ Connected issue: #0000 ### Operator -- [ ] At least one hour has passed since `anvildev.shared` was last deployed +- [ ] At least 24 hours have passed since `anvildev.shared` was last deployed - [ ] Ran `script/export_inspector_findings.py` against `anvildev`, imported results to [Google Sheet](https://docs.google.com/spreadsheets/d/1RWF7g5wRKWPGovLw4jpJGX_XMi8aWLXLOvvE5rxqgH8) and posted screenshot of relevant1 findings as a comment on the connected issue. - [ ] Propagated the `deploy:shared`, `deploy:gitlab`, `deploy:runner` and `backup:gitlab` labels to the next promotion PRs or this PR carries none of these labels - [ ] Propagated any specific instructions related to the `deploy:shared`, `deploy:gitlab`, `deploy:runner` and `backup:gitlab` labels, from the description of this PR to that of the next promotion PRs or this PR carries none of these labels diff --git a/.github/pull_request_template.md.template.py b/.github/pull_request_template.md.template.py index eb28c95dc8..1fd2f544bd 100644 --- a/.github/pull_request_template.md.template.py +++ b/.github/pull_request_template.md.template.py @@ -958,7 +958,7 @@ def emit(t: T, target_branch: str): *iif(t is T.upgrade, [ { 'type': 'cli', - 'content': 'At least one hour has passed since `anvildev.shared` was last deployed' + 'content': 'At least 24 hours have passed since `anvildev.shared` was last deployed' }, { 'type': 'cli', From 19ba5d9d66493f765023c5963505cde935d4c956 Mon Sep 17 00:00:00 2001 From: Abraham Chavez Date: Tue, 12 Nov 2024 10:22:22 -0800 Subject: [PATCH 31/32] [r] Add updated snapshot for project `6135382f` to `lm8` (#6689) --- deployments/prod/environment.py | 1 + 1 file changed, 1 insertion(+) diff --git a/deployments/prod/environment.py b/deployments/prod/environment.py index 079999b289..8ce9214a9a 100644 --- a/deployments/prod/environment.py +++ b/deployments/prod/environment.py @@ -1181,6 +1181,7 @@ def mkdict(previous_catalog: dict[str, str], mksrc('bigquery', 'datarepo-c9158593', 'lungmap_prod_20037472ea1d4ddb9cd356a11a6f0f76__20220307_20241002_lm8'), mksrc('bigquery', 'datarepo-35a6d7ca', 'lungmap_prod_3a02d15f9c6a4ef7852b4ddec733b70b__20241001_20241002_lm8'), mksrc('bigquery', 'datarepo-131a1234', 'lungmap_prod_4ae8c5c91520437198276935661f6c84__20231004_20241002_lm8'), + mksrc('bigquery', 'datarepo-936db385', 'lungmap_prod_6135382f487d4adb9cf84d6634125b68__20230207_20241106_lm8'), mksrc('bigquery', 'datarepo-3c4905d2', 'lungmap_prod_834e0d1671b64425a8ab022b5000961c__20241001_20241002_lm8'), mksrc('bigquery', 'datarepo-d7447983', 'lungmap_prod_f899709cae2c4bb988f0131142e6c7ec__20220310_20241002_lm8'), mksrc('bigquery', 'datarepo-c11ef363', 'lungmap_prod_fdadee7e209745d5bf81cc280bd8348e__20240206_20241002_lm8'), From b0533888adbb1599d25dffa2a7db1aaf08e643d5 Mon Sep 17 00:00:00 2001 From: Daniel Sotirhos Date: Thu, 14 Nov 2024 13:38:56 -0800 Subject: [PATCH 32/32] [r] Make anvil8 default catalog and remove anvil7 (#6707) --- deployments/anvilprod/environment.py | 1 - 
deployments/hammerbox/environment.py | 1 - 2 files changed, 2 deletions(-) diff --git a/deployments/anvilprod/environment.py b/deployments/anvilprod/environment.py index e7e584c0b8..9f96cd53f4 100644 --- a/deployments/anvilprod/environment.py +++ b/deployments/anvilprod/environment.py @@ -975,7 +975,6 @@ def env() -> Mapping[str, Optional[str]]: repository=dict(name='tdr_anvil')), sources=list(filter(None, sources.values()))) for atlas, catalog, sources in [ - ('anvil', 'anvil7', anvil7_sources), ('anvil', 'anvil8', anvil8_sources), ] for suffix, internal in [ diff --git a/deployments/hammerbox/environment.py b/deployments/hammerbox/environment.py index dab3c1bca0..89693fcf64 100644 --- a/deployments/hammerbox/environment.py +++ b/deployments/hammerbox/environment.py @@ -989,7 +989,6 @@ def env() -> Mapping[str, Optional[str]]: repository=dict(name='tdr_anvil')), sources=list(filter(None, sources.values()))) for atlas, catalog, sources in [ - ('anvil', 'anvil7', anvil7_sources), ('anvil', 'anvil8', anvil8_sources), ] for suffix, internal in [