diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 49bc892e5d..7666d2933d 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1 +1 @@ -* @nadove-ucsc +* @hannes-ucsc diff --git a/.github/pull_request_template.md.template.py b/.github/pull_request_template.md.template.py index 61a2d9ac95..eb28c95dc8 100644 --- a/.github/pull_request_template.md.template.py +++ b/.github/pull_request_template.md.template.py @@ -33,6 +33,7 @@ OrderedSet, ) from azul.strings import ( + back_quote as bq, join_grammatically, ) from azul.template import ( @@ -225,10 +226,6 @@ def shared_deploy_target(self, target_branch: str) -> str: return 'apply' + iif(self.shared_deploy_is_two_phase(target_branch), '_keep_unused') -def bq(s): - return '`' + s + '`' - - def main(): path = Path(sys.argv[1]) for t in T: diff --git a/README.md b/README.md index f4f0a9ea08..0261e39932 100644 --- a/README.md +++ b/README.md @@ -1819,14 +1819,14 @@ the private key and can always be regenerated again later using `make config`. ### 9.1.2 Ensuring split tunnel on client -It is important that you configure the client to only route VPC traffic -through the VPN. The VPN server will not forward any other traffic, in what's -commonly referred to as a *split tunnel*. The key indicator of a split tunnel -is that it doesn't set up a default route on the client system. There will -only be a route to the private 172.… subnet of the GitLab VPC but the default -route remains in place. If you configure the VPN connection to set up a -default route, your Internet access will be severed as soon as you establish -the VPN connection. +Except on stable deployments, you should configure the client to only route VPC +traffic through the VPN. The VPN server will not forward any other traffic, in +what's commonly referred to as a *split tunnel*. The key indicator of a split +tunnel is that it doesn't set up a default route on the client system. There +will only be a route to the private 172.… subnet of the GitLab VPC but the +default route remains in place. + +On stable deployments, split tunnels are prohibited. The `make config` step prints instruction on how to configure a split tunnel on Ubuntu. 
diff --git a/deployments/prod/environment.py b/deployments/prod/environment.py index 708536467d..297961d7bf 100644 --- a/deployments/prod/environment.py +++ b/deployments/prod/environment.py @@ -1126,6 +1126,21 @@ def mkdict(previous_catalog: dict[str, str], # @formatter:on ])) +dcp43_sources = mkdict(dcp42_sources, 476, mkdelta([ + # @formatter:off + mksrc('bigquery', 'datarepo-ac7cee91', 'hca_prod_087efc3c26014de6bbe90114593050d1__20241004_dcp2_20241007_dcp43'), + mksrc('bigquery', 'datarepo-e9df1043', 'hca_prod_248c5dc36b754fb4ad8acc771968483f__20240806_dcp2_20241007_dcp43'), + mksrc('bigquery', 'datarepo-65c49269', 'hca_prod_2ef3655a973d4d699b4121fa4041eed7__20220111_dcp2_20241004_dcp43'), + mksrc('bigquery', 'datarepo-456691e5', 'hca_prod_3627473eb6d645c987b5b9f12ce57a10__20241004_dcp2_20241007_dcp43'), + mksrc('bigquery', 'datarepo-c577eed5', 'hca_prod_7f351a4cd24c4fcd9040f79071b097d0__20220906_dcp2_20241004_dcp43'), + mksrc('bigquery', 'datarepo-1dbd3c50', 'hca_prod_ae9f439bbd474d6ebd7232dc70b35d97__20241004_dcp2_20241004_dcp43', ma), # noqa E501 + mksrc('bigquery', 'datarepo-21d1f89b', 'hca_prod_b39381584e8d4fdb9e139e94270dde16__20241004_dcp2_20241004_dcp43'), + mksrc('bigquery', 'datarepo-550c8f98', 'hca_prod_c3dd819dabab4957b20988f1e0900368__20241004_dcp2_20241004_dcp43'), + mksrc('bigquery', 'datarepo-06a00830', 'hca_prod_c5ca43aa3b2b42168eb3f57adcbc99a1__20220118_dcp2_20241004_dcp43'), + mksrc('bigquery', 'datarepo-55151ed4', 'hca_prod_cdabcf0b76024abf9afb3b410e545703__20230201_dcp2_20241008_dcp43') + # @formatter:on +])) + pilot1_sources = mkdict({}, 4, mkdelta([ # @formatter:off mksrc('bigquery', 'datarepo-11e4dc06', 'hca_prod_59b3bfd9cf454d538c8ee240273cba71__20240410_dcp2_20240410_dcpPilot'), # noqa E501 @@ -1171,6 +1186,17 @@ def mkdict(previous_catalog: dict[str, str], mksrc('bigquery', 'datarepo-43814140', 'lungmap_prod_fdadee7e209745d5bf81cc280bd8348e__20240206_20240626_lm7') ])) +lm8_sources = mkdict(lm7_sources, 12, mkdelta([ + mksrc('bigquery', 'datarepo-2b15227b', 'lungmap_prod_1977dc4784144263a8706b0f207d8ab3__20240206_20241002_lm8'), + mksrc('bigquery', 'datarepo-c9158593', 'lungmap_prod_20037472ea1d4ddb9cd356a11a6f0f76__20220307_20241002_lm8'), + mksrc('bigquery', 'datarepo-35a6d7ca', 'lungmap_prod_3a02d15f9c6a4ef7852b4ddec733b70b__20241001_20241002_lm8'), + mksrc('bigquery', 'datarepo-131a1234', 'lungmap_prod_4ae8c5c91520437198276935661f6c84__20231004_20241002_lm8'), + mksrc('bigquery', 'datarepo-3377446f', 'lungmap_prod_6135382f487d4adb9cf84d6634125b68__20230207_20241002_lm8'), + mksrc('bigquery', 'datarepo-3c4905d2', 'lungmap_prod_834e0d1671b64425a8ab022b5000961c__20241001_20241002_lm8'), + mksrc('bigquery', 'datarepo-d7447983', 'lungmap_prod_f899709cae2c4bb988f0131142e6c7ec__20220310_20241002_lm8'), + mksrc('bigquery', 'datarepo-c11ef363', 'lungmap_prod_fdadee7e209745d5bf81cc280bd8348e__20240206_20241002_lm8'), +])) + def env() -> Mapping[str, Optional[str]]: """ @@ -1211,8 +1237,10 @@ def env() -> Mapping[str, Optional[str]]: sources=mklist(sources)) for atlas, catalog, sources in [ ('hca', 'dcp42', dcp42_sources), + ('hca', 'dcp43', dcp43_sources), ('hca', 'pilot1', pilot1_sources), - ('lungmap', 'lm7', lm7_sources) + ('lungmap', 'lm7', lm7_sources), + ('lungmap', 'lm8', lm8_sources) ] for suffix, internal in [ ('', False), ('-it', True) diff --git a/scripts/convert_environment.py b/scripts/convert_environment.py index 65adeee8c0..1e330cd27b 100644 --- a/scripts/convert_environment.py +++ b/scripts/convert_environment.py @@ -23,6 +23,9 @@ from azul.files 
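Editor's note: the new `dcp43_sources` and `lm8_sources` catalogs above follow the existing delta pattern, where each catalog is the previous one plus a set of added or re-snapshotted sources, with an expected total count as a sanity check. Below is a minimal, self-contained sketch of that pattern; the helper names mirror `mkdict`/`mkdelta`/`mksrc`, but their bodies and signatures here are illustrative assumptions, not the real implementations.

```python
from collections.abc import Mapping


def mksrc(project_id: str, spec: str) -> tuple[str, str]:
    # Hypothetical stand-in: the real mksrc() derives the catalog key from
    # the source spec itself; here we pair an explicit key with the spec.
    return project_id, spec


def mkdelta(items: list[tuple[str, str]]) -> dict[str, str]:
    # The added or updated sources of one catalog release.
    return dict(items)


def mkdict(previous: Mapping[str, str],
           expected_total: int,
           delta: Mapping[str, str]
           ) -> dict[str, str]:
    # New catalog = previous catalog with the delta applied on top. The
    # expected total guards against accidentally dropping sources.
    catalog = {**previous, **delta}
    assert len(catalog) == expected_total, (len(catalog), expected_total)
    return catalog


lm7 = mkdict({}, 2, mkdelta([
    mksrc('project-a', 'lungmap_prod_a__20240206_20240626_lm7'),
    mksrc('project-b', 'lungmap_prod_b__20240206_20240626_lm7'),
]))

# lm8 re-snapshots one project and carries the other over unchanged,
# so the total count stays the same.
lm8 = mkdict(lm7, 2, mkdelta([
    mksrc('project-a', 'lungmap_prod_a__20240206_20241002_lm8'),
]))
```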
import ( write_file_atomically, ) +from azul.strings import ( + single_quote as sq, +) class Variable(NamedTuple): @@ -144,7 +147,7 @@ def sub(m: re.Match): return '{{' + m[1] + '}}' value = re.sub(r'\$?{([^}]+)}|\$([_A-Za-z][_A-Za-z0-9]*)', sub, value) - return f"'{value}'" + return sq(value) if __name__ == '__main__': diff --git a/src/azul/bigquery.py b/src/azul/bigquery.py index aa6f166a54..c95179e6f8 100644 --- a/src/azul/bigquery.py +++ b/src/azul/bigquery.py @@ -10,8 +10,8 @@ Union, ) -from azul import ( - require, +from azul.strings import ( + back_quote as bq, ) BigQueryValue = Union[int, float, bool, str, bytes, datetime, None] @@ -42,10 +42,9 @@ def backtick(table_name: str) -> str: >>> backtick('foo-2.bar`s.my_table') Traceback (most recent call last): ... - azul.RequirementError: foo-2.bar`s.my_table + azul.RequirementError: ('`', 'must not occur in', 'foo-2.bar`s.my_table') """ if table_name_re.fullmatch(table_name): return table_name else: - require('`' not in table_name, table_name) - return f'`{table_name}`' + return bq(table_name) diff --git a/src/azul/chalice.py b/src/azul/chalice.py index f7ad22b12c..0405133ff2 100644 --- a/src/azul/chalice.py +++ b/src/azul/chalice.py @@ -61,6 +61,10 @@ copy_json, json_head, ) +from azul.strings import ( + join_words as jw, + single_quote as sq, +) from azul.types import ( JSON, LambdaContext, @@ -189,15 +193,33 @@ def _api_gateway_context_middleware(self, event, get_response): finally: config.lambda_is_handling_api_gateway_request = False + hsts_max_age = 60 * 60 * 24 * 365 * 2 + + # Headers added to every response from the app, as well as canned 4XX and + # 5XX responses from API Gateway. Use of these headers addresses known + # security vulnerabilities. + # + security_headers = { + 'Content-Security-Policy': jw('default-src', sq('self')), + 'Referrer-Policy': 'strict-origin-when-cross-origin', + 'Strict-Transport-Security': jw(f'max-age={hsts_max_age};', + 'includeSubDomains;', + 'preload'), + 'X-Content-Type-Options': 'nosniff', + 'X-Frame-Options': 'DENY', + 'X-XSS-Protection': '1; mode=block' + } + def _security_headers_middleware(self, event, get_response): """ Add headers to the response """ response = get_response(event) - seconds = 60 * 60 * 24 * 365 - response.headers['Strict-Transport-Security'] = f'max-age={seconds}; includeSubDomains' - response.headers['X-Content-Type-Options'] = 'nosniff' - response.headers['X-Frame-Options'] = 'DENY' + response.headers.update(self.security_headers) + # FIXME: Add a CSP header with a nonce value to text/html responses + # https://github.com/DataBiosphere/azul-private/issues/6 + if response.headers.get('Content-Type') == 'text/html': + del response.headers['Content-Security-Policy'] view_function = self.routes[event.path][event.method].view_function cache_control = getattr(view_function, 'cache_control') response.headers['Cache-Control'] = cache_control diff --git a/src/azul/collections.py b/src/azul/collections.py index d996297a9d..9ddc64629a 100644 --- a/src/azul/collections.py +++ b/src/azul/collections.py @@ -131,6 +131,17 @@ def explode_dict(d: Mapping[K, Union[V, list[V], set[V], tuple[V]]] yield dict(zip(d.keys(), t)) +def none_safe_apply(f: Callable[[K], V], o: K | None) -> V | None: + """ + >>> none_safe_apply(str, 123) + '123' + + >>> none_safe_apply(str, None) is None + True + """ + return None if o is None else f(o) + + def none_safe_key(none_last: bool = False) -> Callable[[Any], Any]: """ Returns a sort key that handles None values. 
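Editor's note: the middleware change above centralizes the response headers in a class-level `security_headers` mapping and strips the CSP from `text/html` responses until a nonce-based policy lands. A rough, framework-agnostic sketch of that flow follows; the `Response` class and `get_response` callable are hypothetical stand-ins for Chalice's middleware machinery.

```python
from typing import Callable

hsts_max_age = 60 * 60 * 24 * 365 * 2  # two years, in seconds

security_headers = {
    'Content-Security-Policy': "default-src 'self'",
    'Referrer-Policy': 'strict-origin-when-cross-origin',
    'Strict-Transport-Security': f'max-age={hsts_max_age}; includeSubDomains; preload',
    'X-Content-Type-Options': 'nosniff',
    'X-Frame-Options': 'DENY',
    'X-XSS-Protection': '1; mode=block',
}


class Response:
    # Hypothetical minimal response object standing in for Chalice's.

    def __init__(self, headers: dict[str, str]):
        self.headers = headers


def security_headers_middleware(event,
                                get_response: Callable[[object], Response]
                                ) -> Response:
    response = get_response(event)
    # Apply the shared headers to every response ...
    response.headers.update(security_headers)
    # ... but omit the CSP for HTML responses until a nonce-based policy
    # can be emitted per response.
    if response.headers.get('Content-Type') == 'text/html':
        del response.headers['Content-Security-Policy']
    return response


html = security_headers_middleware(None, lambda _: Response({'Content-Type': 'text/html'}))
assert 'Content-Security-Policy' not in html.headers
assert html.headers['X-Frame-Options'] == 'DENY'
```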
@@ -270,6 +281,10 @@ def adict(seq: Union[Mapping[K, V], Iterable[tuple[K, V]]] = None, return kwargs if seq is None else dict(seq, **kwargs) +def _athing(cls: type, *args): + return cls(arg for arg in args if arg is not None) + + def atuple(*args: V) -> tuple[V, ...]: """ >>> atuple() @@ -281,7 +296,7 @@ def atuple(*args: V) -> tuple[V, ...]: >>> atuple(0, None) (0,) """ - return tuple(arg for arg in args if arg is not None) + return _athing(tuple, *args) def alist(*args: V) -> list[V]: @@ -295,7 +310,21 @@ def alist(*args: V) -> list[V]: >>> alist(0, None) [0] """ - return list(arg for arg in args if arg is not None) + return _athing(list, *args) + + +def aset(*args: V) -> set[V]: + """ + >>> aset() + set() + + >>> aset(None) + set() + + >>> aset(0, None) + {0} + """ + return _athing(set, *args) class NestedDict(defaultdict): diff --git a/src/azul/indexer/index_service.py b/src/azul/indexer/index_service.py index 8fc5140e61..ab9571afdc 100644 --- a/src/azul/indexer/index_service.py +++ b/src/azul/indexer/index_service.py @@ -38,7 +38,6 @@ from more_itertools import ( first, one, - unzip, ) from azul import ( @@ -303,19 +302,21 @@ def transform(self, log.info('Transforming %i entities in partition %s of bundle %s, version %s.', num_entities, partition, bundle.uuid, bundle.version) contributions = [] - replicas = [] + replicas_by_coords = {} for transformer in transformers: - # The cast is necessary because unzip()'s type stub doesn't - # support heterogeneous tuples. - transforms = cast( - tuple[Iterable[Optional[Contribution]], Iterable[Optional[Replica]]], - unzip(transformer.transform(partition)) - ) - if transforms: - contributions_part, replicas_part = transforms - contributions.extend(filter(None, contributions_part)) - replicas.extend(filter(None, replicas_part)) - return contributions, replicas + for document in transformer.transform(partition): + if isinstance(document, Contribution): + contributions.append(document) + elif isinstance(document, Replica): + try: + dup = replicas_by_coords[document.coordinates] + except KeyError: + replicas_by_coords[document.coordinates] = document + else: + dup.hub_ids.extend(document.hub_ids) + else: + assert False, document + return contributions, list(replicas_by_coords.values()) def create_indices(self, catalog: CatalogName): es_client = ESClientFactory.get() diff --git a/src/azul/indexer/transform.py b/src/azul/indexer/transform.py index cd8ab6fae1..4f6efb7f50 100644 --- a/src/azul/indexer/transform.py +++ b/src/azul/indexer/transform.py @@ -11,6 +11,9 @@ import attr +from azul.collections import ( + alist, +) from azul.indexer import ( Bundle, BundleFQID, @@ -34,11 +37,8 @@ ) from azul.types import ( JSON, - MutableJSON, ) -Transform = tuple[Optional[Contribution], Optional[Replica]] - @attr.s(frozen=True, kw_only=True, auto_attribs=True) class Transformer(metaclass=ABCMeta): @@ -55,12 +55,14 @@ def entity_type(cls) -> EntityType: raise NotImplementedError @abstractmethod - def replica_type(self, entity: EntityReference) -> str: + def _replicate(self, entity: EntityReference) -> tuple[str, JSON]: """ - The name of the type of replica emitted by this transformer for a given - entity. + A tuple consisting of: + + 1. The name of the type of replica emitted by this transformer for a + given entity. See :py:attr:`Replica.replica_type`. - See :py:attr:`Replica.replica_type` + 2. The contents of the replica for that entity. 
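Editor's note: the reworked `IndexService.transform` above replaces the `unzip`-based tuple handling with a single pass that sorts each yielded document into either the contributions list or a replica dictionary keyed by coordinates, merging hub IDs when two entities produce the same replica. A stripped-down sketch of that collation step, using simple stand-in classes:

```python
from dataclasses import dataclass, field


@dataclass
class Contribution:
    entity_id: str


@dataclass
class Replica:
    coordinates: str                               # stand-in for ReplicaCoordinates
    hub_ids: list[str] = field(default_factory=list)


def collate(documents):
    contributions, replicas_by_coords = [], {}
    for document in documents:
        if isinstance(document, Contribution):
            contributions.append(document)
        elif isinstance(document, Replica):
            dup = replicas_by_coords.get(document.coordinates)
            if dup is None:
                replicas_by_coords[document.coordinates] = document
            else:
                # Same replica emitted for another hub: merge the hub IDs
                # instead of indexing a duplicate document.
                dup.hub_ids.extend(document.hub_ids)
        else:
            assert False, document
    return contributions, list(replicas_by_coords.values())


docs = [Contribution('c1'),
        Replica('r1', ['file-1']),
        Replica('r1', ['file-2'])]
contributions, replicas = collate(docs)
assert len(replicas) == 1 and replicas[0].hub_ids == ['file-1', 'file-2']
```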
""" raise NotImplementedError @@ -88,7 +90,9 @@ def estimate(self, partition: BundlePartition) -> int: """ @abstractmethod - def transform(self, partition: BundlePartition) -> Iterable[Transform]: + def transform(self, + partition: BundlePartition + ) -> Iterable[Contribution | Replica]: """ Return the contributions by the current bundle to the entities it contains metadata about. More than one bundle can contribute to a @@ -114,9 +118,10 @@ def aggregator(cls, entity_type: EntityType) -> Optional[EntityAggregator]: raise NotImplementedError def _contribution(self, - contents: MutableJSON, - entity: EntityReference + contents: JSON, + entity_id: EntityID ) -> Contribution: + entity = EntityReference(entity_type=self.entity_type(), entity_id=entity_id) coordinates = ContributionCoordinates(entity=entity, bundle=self.bundle.fqid.upcast(), deleted=self.deleted) @@ -126,17 +131,20 @@ def _contribution(self, contents=contents) def _replica(self, - contents: MutableJSON, entity: EntityReference, - hub_ids: list[EntityID] + *, + file_hub: EntityID | None, ) -> Replica: + replica_type, contents = self._replicate(entity) coordinates = ReplicaCoordinates(content_hash=json_hash(contents).hexdigest(), entity=entity) return Replica(coordinates=coordinates, version=None, - replica_type=self.replica_type(entity), + replica_type=replica_type, contents=contents, - hub_ids=hub_ids) + # The other hubs will be added when the indexer + # consolidates duplicate replicas. + hub_ids=alist(file_hub)) @classmethod @abstractmethod diff --git a/src/azul/plugins/metadata/anvil/__init__.py b/src/azul/plugins/metadata/anvil/__init__.py index 32afeb30a1..fe0c0203af 100644 --- a/src/azul/plugins/metadata/anvil/__init__.py +++ b/src/azul/plugins/metadata/anvil/__init__.py @@ -36,7 +36,6 @@ BiosampleTransformer, BundleTransformer, DatasetTransformer, - DiagnosisTransformer, DonorTransformer, FileTransformer, ) @@ -96,7 +95,6 @@ def transformer_types(self) -> Iterable[type[BaseTransformer]]: BiosampleTransformer, BundleTransformer, DatasetTransformer, - DiagnosisTransformer, DonorTransformer, FileTransformer, ) diff --git a/src/azul/plugins/metadata/anvil/bundle.py b/src/azul/plugins/metadata/anvil/bundle.py index ae25f4b423..e288018574 100644 --- a/src/azul/plugins/metadata/anvil/bundle.py +++ b/src/azul/plugins/metadata/anvil/bundle.py @@ -1,21 +1,26 @@ from abc import ( ABC, ) +from collections import ( + defaultdict, +) from typing import ( AbstractSet, Generic, - Iterable, + Mapping, + Self, TypeVar, ) import attrs -from more_itertools import ( - one, -) from azul import ( CatalogName, ) +from azul.collections import ( + aset, + none_safe_apply, +) from azul.indexer import ( BUNDLE_FQID, Bundle, @@ -43,50 +48,76 @@ class KeyReference: entity_type: EntityType -ENTITY_REF = TypeVar('ENTITY_REF', bound=EntityReference | KeyReference) +REF = TypeVar('REF', bound=EntityReference | KeyReference) @attrs.frozen(kw_only=True, order=False) -class Link(Generic[ENTITY_REF]): - inputs: AbstractSet[ENTITY_REF] = attrs.field(factory=frozenset, - converter=frozenset) +class Link(Generic[REF]): + inputs: AbstractSet[REF] = attrs.field(factory=frozenset, + converter=frozenset) - activity: ENTITY_REF | None = attrs.field(default=None) + activity: REF | None = attrs.field(default=None) - outputs: AbstractSet[ENTITY_REF] = attrs.field(factory=frozenset, - converter=frozenset) + outputs: AbstractSet[REF] = attrs.field(factory=frozenset, + converter=frozenset) @property - def all_entities(self) -> AbstractSet[ENTITY_REF]: - return self.inputs 
| self.outputs | (set() if self.activity is None else {self.activity}) + def all_entities(self) -> AbstractSet[REF]: + return self.inputs | self.outputs | aset(self.activity) @classmethod - def from_json(cls, link: JSON) -> 'Link': + def from_json(cls, link: JSON) -> Self: return cls(inputs=set(map(EntityReference.parse, link['inputs'])), - activity=None if link['activity'] is None else EntityReference.parse(link['activity']), + activity=none_safe_apply(EntityReference.parse, link['activity']), outputs=set(map(EntityReference.parse, link['outputs']))) def to_json(self) -> MutableJSON: return { 'inputs': sorted(map(str, self.inputs)), - 'activity': None if self.activity is None else str(self.activity), + 'activity': none_safe_apply(str, self.activity), 'outputs': sorted(map(str, self.outputs)) } @classmethod - def merge(cls, links: Iterable['Link']) -> 'Link': - return cls(inputs=frozenset.union(*[link.inputs for link in links]), - activity=one({link.activity for link in links}), - outputs=frozenset.union(*[link.outputs for link in links])) - - def __lt__(self, other: 'Link') -> bool: + def group_by_activity(cls, links: set[Self]): + """ + Merge links that share the same (non-null) activity. + """ + groups_by_activity: Mapping[KeyReference, set[Self]] = defaultdict(set) + for link in links: + if link.activity is not None: + groups_by_activity[link.activity].add(link) + for activity, group in groups_by_activity.items(): + if len(group) > 1: + links -= group + merged_link = cls(inputs=frozenset.union(*[link.inputs for link in group]), + activity=activity, + outputs=frozenset.union(*[link.outputs for link in group])) + links.add(merged_link) + + def __lt__(self, other: Self) -> bool: return min(self.inputs) < min(other.inputs) +class EntityLink(Link[EntityReference]): + pass + + +class KeyLink(Link[KeyReference]): + + def to_entity_link(self, + entities_by_key: Mapping[KeyReference, EntityReference] + ) -> EntityLink: + lookup = entities_by_key.__getitem__ + return EntityLink(inputs=set(map(lookup, self.inputs)), + activity=none_safe_apply(lookup, self.activity), + outputs=set(map(lookup, self.outputs))) + + @attrs.define(kw_only=True) class AnvilBundle(Bundle[BUNDLE_FQID], ABC): entities: dict[EntityReference, MutableJSON] = attrs.field(factory=dict) - links: set[Link[EntityReference]] = attrs.field(factory=set) + links: set[EntityLink] = attrs.field(factory=set) def reject_joiner(self, catalog: CatalogName): # FIXME: Optimize joiner rejection and re-enable it for AnVIL @@ -103,12 +134,12 @@ def to_json(self) -> MutableJSON: } @classmethod - def from_json(cls, fqid: BUNDLE_FQID, json_: JSON) -> 'AnvilBundle': + def from_json(cls, fqid: BUNDLE_FQID, json_: JSON) -> Self: return cls( fqid=fqid, entities={ EntityReference.parse(entity_ref): entity for entity_ref, entity in json_['entities'].items() }, - links=set(map(Link.from_json, json_['links'])) + links=set(map(EntityLink.from_json, json_['links'])) ) diff --git a/src/azul/plugins/metadata/anvil/indexer/transform.py b/src/azul/plugins/metadata/anvil/indexer/transform.py index bb6b954ff8..20107fb91d 100644 --- a/src/azul/plugins/metadata/anvil/indexer/transform.py +++ b/src/azul/plugins/metadata/anvil/indexer/transform.py @@ -22,6 +22,7 @@ Callable, Collection, Iterable, + Self, ) from uuid import ( UUID, @@ -53,6 +54,7 @@ EntityReference, EntityType, FieldTypes, + Replica, null_bool, null_int, null_str, @@ -60,12 +62,11 @@ pass_thru_json, ) from azul.indexer.transform import ( - Transform, Transformer, ) from 
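Editor's note: `Link.group_by_activity` replaces the old `_simplify_links`/`Link.merge` pair; links that name the same non-null activity are collapsed into a single link whose inputs and outputs are the unions of the group's. A self-contained approximation with a hashable stand-in link type:

```python
from collections import defaultdict
from dataclasses import dataclass


@dataclass(frozen=True)
class Link:
    inputs: frozenset[str] = frozenset()
    activity: str | None = None
    outputs: frozenset[str] = frozenset()


def group_by_activity(links: set[Link]) -> None:
    # Merge links that share the same (non-null) activity, in place.
    groups = defaultdict(set)
    for link in links:
        if link.activity is not None:
            groups[link.activity].add(link)
    for activity, group in groups.items():
        if len(group) > 1:
            links -= group
            links.add(Link(inputs=frozenset.union(*(lk.inputs for lk in group)),
                           activity=activity,
                           outputs=frozenset.union(*(lk.outputs for lk in group))))


links = {
    Link(inputs=frozenset({'file-1'}), activity='align-1', outputs=frozenset({'bam-1'})),
    Link(inputs=frozenset({'file-2'}), activity='align-1', outputs=frozenset({'bam-1'})),
    Link(inputs=frozenset({'donor-1'}), outputs=frozenset({'biosample-1'})),
}
group_by_activity(links)
assert len(links) == 2  # the two 'align-1' links were merged into one
```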
azul.plugins.metadata.anvil.bundle import ( AnvilBundle, - Link, + EntityLink, ) from azul.plugins.metadata.anvil.indexer.aggregate import ( ActivityAggregator, @@ -97,11 +98,17 @@ class LinkedEntities: def __getitem__(self, item: EntityType) -> set[EntityReference]: return self.ancestors[item] | self.descendants[item] + def __iter__(self) -> Iterable[EntityReference]: + for entities in self.ancestors.values(): + yield from entities + for entities in self.descendants.values(): + yield from entities + @classmethod def from_links(cls, origin: EntityReference, - links: Collection[Link[EntityReference]] - ) -> 'LinkedEntities': + links: Collection[EntityLink] + ) -> Self: return cls(origin=origin, ancestors=cls._search(origin, links, from_='outputs', to='inputs'), descendants=cls._search(origin, links, from_='inputs', to='outputs')) @@ -109,7 +116,7 @@ def from_links(cls, @classmethod def _search(cls, entity_ref: EntityReference, - links: Collection[Link[EntityReference]], + links: Collection[EntityLink], entities: EntityRefsByType | None = None, *, from_: str, @@ -133,9 +140,6 @@ def _search(cls, class BaseTransformer(Transformer, metaclass=ABCMeta): bundle: AnvilBundle - def replica_type(self, entity: EntityReference) -> str: - return f'anvil_{entity.entity_type}' - @classmethod def field_types(cls) -> FieldTypes: return { @@ -167,32 +171,24 @@ def aggregator(cls, entity_type) -> EntityAggregator: def estimate(self, partition: BundlePartition) -> int: return sum(map(partial(self._contains, partition), self.bundle.entities)) - def transform(self, partition: BundlePartition) -> Iterable[Transform]: - return ( - self._transform(entity) - for entity in self._list_entities() - if self._contains(partition, entity) - ) + def transform(self, + partition: BundlePartition + ) -> Iterable[Contribution | Replica]: + for entity in self._list_entities(): + if self._contains(partition, entity): + yield from self._transform(entity) def _list_entities(self) -> Iterable[EntityReference]: return self.bundle.entities @abstractmethod - def _transform(self, entity: EntityReference) -> Transform: + def _transform(self, + entity: EntityReference + ) -> Iterable[Contribution | Replica]: raise NotImplementedError - def _add_replica(self, - contribution: MutableJSON | None, - entity: EntityReference, - hub_ids: list[EntityID] - ) -> Transform: - no_replica = not config.enable_replicas or self.entity_type() == 'bundles' - return ( - None if contribution is None else self._contribution(contribution, entity), - None if no_replica else self._replica(self.bundle.entities[entity], - entity, - hub_ids) - ) + def _replicate(self, entity: EntityReference) -> tuple[str, JSON]: + return f'anvil_{entity.entity_type}', self.bundle.entities[entity] def _pluralize(self, entity_type: str) -> str: if entity_type == 'diagnosis': @@ -334,19 +330,6 @@ def get_bound(field_name: str) -> float | None: for field_prefix in field_prefixes } - def _contribution(self, - contents: MutableJSON, - entity: EntityReference, - ) -> Contribution: - # The entity type is used to determine the index name. - # All activities go into the same index, regardless of their polymorphic type. - # Index names use plural forms. 
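Editor's note: with the `Transform = tuple[Optional[Contribution], Optional[Replica]]` alias gone, a transformer's `transform` is now a generator that yields however many `Contribution` and `Replica` documents each entity warrants. A schematic sketch of the new shape; the document classes and the `_contribution`/`_replica` helpers here are simplified stand-ins, not the real ones.

```python
from typing import Iterable, Union


class Contribution(dict):
    pass


class Replica(dict):
    pass


class SketchTransformer:

    def __init__(self, entities, enable_replicas=True):
        self.entities = entities                # {entity_id: contents}
        self.enable_replicas = enable_replicas  # mirrors config.enable_replicas

    def _contribution(self, contents, entity_id) -> Contribution:
        return Contribution(entity_id=entity_id, contents=contents)

    def _replica(self, entity_id, *, file_hub) -> Replica:
        return Replica(entity_id=entity_id,
                       hub_ids=[] if file_hub is None else [file_hub])

    def transform(self) -> Iterable[Union[Contribution, Replica]]:
        # One pass over the entities; each may yield a contribution,
        # a replica, both, or neither.
        for entity_id, contents in self.entities.items():
            yield self._contribution(contents, entity_id)
            if self.enable_replicas:
                yield self._replica(entity_id, file_hub=entity_id)


docs = list(SketchTransformer({'file-1': {'size': 42}}).transform())
assert len(docs) == 2
```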
- entity_type = pluralize('activity' - if entity.entity_type.endswith('activity') else - entity.entity_type) - entity = attr.evolve(entity, entity_type=entity_type) - return super()._contribution(contents, entity) - def _entity(self, entity: EntityReference, field_types: FieldTypes, @@ -473,8 +456,8 @@ def _complete_dataset_keys(cls) -> AbstractSet[str]: class SingletonTransformer(BaseTransformer, metaclass=ABCMeta): - def _contents(self) -> MutableJSON: - return dict( + def _transform(self, entity: EntityReference) -> Iterable[Contribution]: + contents = dict( activities=self._entities(self._activity, chain.from_iterable( self._entities_by_type[activity_type] for activity_type in self._activity_polymorphic_types @@ -485,6 +468,7 @@ def _contents(self) -> MutableJSON: donors=self._entities(self._donor, self._entities_by_type['donor']), files=self._entities(self._file, self._entities_by_type['file']) ) + yield self._contribution(contents, entity.entity_id) @classmethod def field_types(cls) -> FieldTypes: @@ -503,8 +487,11 @@ def _duos_types(cls) -> FieldTypes: def _duos(self, dataset: EntityReference) -> MutableJSON: return self._entity(dataset, self._duos_types()) + def _is_duos(self, dataset: EntityReference) -> bool: + return 'description' in self.bundle.entities[dataset] + def _dataset(self, dataset: EntityReference) -> MutableJSON: - if 'description' in self.bundle.entities[dataset]: + if self._is_duos(dataset): return self._duos(dataset) else: return super()._dataset(dataset) @@ -523,19 +510,17 @@ class ActivityTransformer(BaseTransformer): def entity_type(cls) -> str: return 'activities' - def _transform(self, entity: EntityReference) -> Transform: + def _transform(self, entity: EntityReference) -> Iterable[Contribution]: linked = self._linked_entities(entity) - files = linked['file'] contents = dict( activities=[self._activity(entity)], biosamples=self._entities(self._biosample, linked['biosample']), datasets=[self._dataset(self._only_dataset())], diagnoses=self._entities(self._diagnosis, linked['diagnosis']), donors=self._entities(self._donor, linked['donor']), - files=self._entities(self._file, files), + files=self._entities(self._file, linked['file']) ) - hub_ids = [f.entity_id for f in files] - return self._add_replica(contents, entity, hub_ids) + yield self._contribution(contents, entity.entity_id) class BiosampleTransformer(BaseTransformer): @@ -544,9 +529,8 @@ class BiosampleTransformer(BaseTransformer): def entity_type(cls) -> str: return 'biosamples' - def _transform(self, entity: EntityReference) -> Transform: + def _transform(self, entity: EntityReference) -> Iterable[Contribution]: linked = self._linked_entities(entity) - files = linked['file'] contents = dict( activities=self._entities(self._activity, chain.from_iterable( linked[activity_type] @@ -556,22 +540,9 @@ def _transform(self, entity: EntityReference) -> Transform: datasets=[self._dataset(self._only_dataset())], diagnoses=self._entities(self._diagnosis, linked['diagnosis']), donors=self._entities(self._donor, linked['donor']), - files=self._entities(self._file, files), + files=self._entities(self._file, linked['file']), ) - hub_ids = [f.entity_id for f in files] - return self._add_replica(contents, entity, hub_ids) - - -class DiagnosisTransformer(BaseTransformer): - - def _transform(self, entity: EntityReference) -> Transform: - files = self._linked_entities(entity)['file'] - hub_ids = [f.entity_id for f in files] - return self._add_replica(None, entity, hub_ids) - - @classmethod - def entity_type(cls) -> 
EntityType: - return 'diagnoses' + yield self._contribution(contents, entity.entity_id) class BundleTransformer(SingletonTransformer): @@ -584,11 +555,6 @@ def _singleton(self) -> EntityReference: return EntityReference(entity_type='bundle', entity_id=self.bundle.uuid) - def _transform(self, entity: EntityReference) -> Transform: - contents = self._contents() - hub_ids = [f.entity_id for f in self._entities_by_type['file']] - return self._add_replica(contents, entity, hub_ids) - class DatasetTransformer(SingletonTransformer): @@ -599,17 +565,12 @@ def entity_type(cls) -> str: def _singleton(self) -> EntityReference: return self._only_dataset() - def _transform(self, entity: EntityReference) -> Transform: - contents = self._contents() - # Every file in a snapshot is linked to that snapshot's singular - # dataset, making an explicit list of hub IDs for the dataset both - # redundant and impractically large (we observe that for large - # snapshots, trying to track this many files in a single data structure - # causes a prohibitively high rate of conflicts during replica updates). - # Therefore, we leave the hub IDs field empty for datasets and rely on - # the tenet that every file is an implicit hub of its parent dataset. - hub_ids = [] - return self._add_replica(contents, entity, hub_ids) + def _transform(self, + entity: EntityReference + ) -> Iterable[Contribution | Replica]: + yield from super()._transform(entity) + if self._is_duos(entity): + yield self._replica(entity, file_hub=None) class DonorTransformer(BaseTransformer): @@ -618,9 +579,8 @@ class DonorTransformer(BaseTransformer): def entity_type(cls) -> str: return 'donors' - def _transform(self, entity: EntityReference) -> Transform: + def _transform(self, entity: EntityReference) -> Iterable[Contribution]: linked = self._linked_entities(entity) - files = linked['file'] contents = dict( activities=self._entities(self._activity, chain.from_iterable( linked[activity_type] @@ -630,10 +590,9 @@ def _transform(self, entity: EntityReference) -> Transform: datasets=[self._dataset(self._only_dataset())], diagnoses=self._entities(self._diagnosis, linked['diagnosis']), donors=[self._donor(entity)], - files=self._entities(self._file, files), + files=self._entities(self._file, linked['file']), ) - hub_ids = [f.entity_id for f in files] - return self._add_replica(contents, entity, hub_ids) + yield self._contribution(contents, entity.entity_id) class FileTransformer(BaseTransformer): @@ -642,7 +601,9 @@ class FileTransformer(BaseTransformer): def entity_type(cls) -> str: return 'files' - def _transform(self, entity: EntityReference) -> Transform: + def _transform(self, + entity: EntityReference + ) -> Iterable[Contribution | Replica]: linked = self._linked_entities(entity) contents = dict( activities=self._entities(self._activity, chain.from_iterable( @@ -655,8 +616,16 @@ def _transform(self, entity: EntityReference) -> Transform: donors=self._entities(self._donor, linked['donor']), files=[self._file(entity)], ) - # The result of the link traversal does not include the starting entity, - # so without this step the file itself wouldn't be included in its hubs - files = (entity, *linked['file']) - hub_ids = [f.entity_id for f in files] - return self._add_replica(contents, entity, hub_ids) + yield self._contribution(contents, entity.entity_id) + if config.enable_replicas: + yield self._replica(entity, file_hub=entity.entity_id) + for linked_entity in linked: + yield self._replica( + linked_entity, + # Datasets are linked to every file in their 
snapshot, + # making an explicit list of hub IDs for the dataset both + # redundant and impractically large. Therefore, we leave the + # hub IDs field empty for datasets and rely on the tenet + # that every file is an implicit hub of its parent dataset. + file_hub=None if linked_entity.entity_type == 'dataset' else entity.entity_id, + ) diff --git a/src/azul/plugins/metadata/hca/indexer/transform.py b/src/azul/plugins/metadata/hca/indexer/transform.py index 87cfd380fa..00380d8564 100644 --- a/src/azul/plugins/metadata/hca/indexer/transform.py +++ b/src/azul/plugins/metadata/hca/indexer/transform.py @@ -69,6 +69,7 @@ Nested, NullableString, PassThrough, + Replica, null_bool, null_datetime, null_int, @@ -78,7 +79,6 @@ pass_thru_json, ) from azul.indexer.transform import ( - Transform, Transformer, ) from azul.iterators import ( @@ -457,10 +457,6 @@ class BaseTransformer(Transformer, metaclass=ABCMeta): bundle: HCABundle api_bundle: api.Bundle - def replica_type(self, entity: EntityReference) -> str: - api_entity = self.api_bundle.entities[UUID(entity.entity_id)] - return api_entity.schema_name - @classmethod def aggregator(cls, entity_type: EntityType) -> EntityAggregator | None: if entity_type == 'files': @@ -497,22 +493,13 @@ def aggregator(cls, entity_type: EntityType) -> EntityAggregator | None: else: return SimpleAggregator() - def _add_replica(self, - contribution: MutableJSON, - entity: api.Entity | DatedEntity, - hub_ids: list[EntityID] - ) -> Transform: - entity_ref = EntityReference(entity_id=str(entity.document_id), - entity_type=self.entity_type()) - if not config.enable_replicas: - replica = None - elif self.entity_type() == 'bundles': - links = self.bundle.links - replica = self._replica(links, entity_ref, hub_ids) + def _replicate(self, entity: EntityReference) -> tuple[str, JSON]: + if entity == self.api_bundle.ref: + content = self.bundle.links else: - assert isinstance(entity, api.Entity), entity - replica = self._replica(entity.json, entity_ref, hub_ids) - return self._contribution(contribution, entity_ref), replica + api_entity = self.api_bundle.entities[UUID(entity.entity_id)] + content = api_entity.json + return entity.entity_type, content def _find_ancestor_samples(self, entity: api.LinkedEntity, @@ -1385,10 +1372,18 @@ def visit(self, entity: api.Entity) -> None: # noinspection PyDeprecation file_name = entity.manifest_entry.name is_zarr, zarr_name, sub_name = _parse_zarr_file_name(file_name) - # FIXME: Remove condition once https://github.com/HumanCellAtlas/metadata-schema/issues/623 is resolved + # zarray files no longer exist in DCP2. This condition may no longer + # be needed to support them, but we don't want to risk removing it. if not is_zarr or sub_name.endswith('.zattrs'): self.files[entity.document_id] = entity + @property + def entities(self) -> Iterable[EntityReference]: + for entity_dict in vars(self).values(): + for entity in entity_dict.values(): + yield EntityReference(entity_type=entity.schema_name, + entity_id=str(entity.document_id)) + ENTITY = TypeVar('ENTITY', bound=api.Entity) @@ -1396,7 +1391,9 @@ def visit(self, entity: api.Entity) -> None: class PartitionedTransformer(BaseTransformer, Generic[ENTITY]): @abstractmethod - def _transform(self, entities: Iterable[ENTITY]) -> Iterable[Transform]: + def _transform(self, + entities: Iterable[ENTITY] + ) -> Iterable[Contribution | Replica]: """ Transform the given outer entities into contributions. 
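Editor's note: `FileTransformer` now emits replicas for the file itself and for each linked entity, and the hub-ID policy described in the comments above comes down to this: the file is the hub for every replica except the dataset's (and, on the HCA side, the project's), whose hub list is left empty because every file is an implicit hub of it. A compact sketch of that decision, using a hypothetical `emit_replicas` helper:

```python
from dataclasses import dataclass


@dataclass
class Replica:
    entity_type: str
    entity_id: str
    hub_ids: list[str]


def emit_replicas(file_id: str,
                  linked_entities: list[tuple[str, str]]
                  ) -> list[Replica]:
    # Hypothetical helper: linked_entities is a list of
    # (entity_type, entity_id) pairs reachable from the file.
    replicas = [Replica('file', file_id, [file_id])]
    for entity_type, entity_id in linked_entities:
        # Every file in a snapshot links to the one dataset, so an explicit
        # hub list for the dataset would be redundant and impractically
        # large; leave it empty and rely on the file being an implicit hub
        # of its dataset.
        file_hub = None if entity_type == 'dataset' else file_id
        replicas.append(Replica(entity_type, entity_id,
                                [] if file_hub is None else [file_hub]))
    return replicas


replicas = emit_replicas('file-1', [('biosample', 'b-1'), ('dataset', 'd-1')])
assert [r.hub_ids for r in replicas] == [['file-1'], ['file-1'], []]
```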
""" @@ -1415,7 +1412,9 @@ def _entities_in(self, partition: BundlePartition) -> Iterator[ENTITY]: def estimate(self, partition: BundlePartition) -> int: return ilen(self._entities_in(partition)) - def transform(self, partition: BundlePartition) -> Iterable[Transform]: + def transform(self, + partition: BundlePartition + ) -> Iterable[Contribution | Replica]: return self._transform(generable(self._entities_in, partition)) @@ -1428,12 +1427,15 @@ def entity_type(cls) -> str: def _entities(self) -> Iterable[api.File]: return self.api_bundle.not_stitched(self.api_bundle.files) - def _transform(self, files: Iterable[api.File]) -> Iterable[Contribution]: + def _transform(self, + files: Iterable[api.File] + ) -> Iterable[Contribution | Replica]: zarr_stores: Mapping[str, list[api.File]] = self.group_zarrs(files) for file in files: file_name = file.manifest_entry.name is_zarr, zarr_name, sub_name = _parse_zarr_file_name(file_name) - # FIXME: Remove condition once https://github.com/HumanCellAtlas/metadata-schema/issues/579 is resolved + # zarray files no longer exist in DCP2. This condition may no longer + # be needed to support them, but we don't want to risk removing it. if not is_zarr or sub_name.endswith('.zattrs'): if is_zarr: # This is the representative file, so add the related files @@ -1467,8 +1469,18 @@ def _transform(self, files: Iterable[api.File]) -> Iterable[Contribution]: additional_contents = self.matrix_stratification_values(file) for entity_type, values in additional_contents.items(): contents[entity_type].extend(values) - hub_ids = list(map(str, visitor.files)) - yield self._add_replica(contents, file, hub_ids) + file_id = file.ref.entity_id + yield self._contribution(contents, file_id) + if config.enable_replicas: + yield self._replica(self.api_bundle.ref, file_hub=file_id) + # Projects are linked to every file in their snapshot, + # making an explicit list of hub IDs for the project both + # redundant and impractically large. Therefore, we leave the + # hub IDs field empty for projects and rely on the tenet + # that every file is an implicit hub of its parent project. 
+ yield self._replica(self._api_project.ref, file_hub=None) + for linked_entity in visitor.entities: + yield self._replica(linked_entity, file_hub=file_id) def matrix_stratification_values(self, file: api.File) -> JSON: """ @@ -1563,8 +1575,7 @@ def _transform(self, ), dates=[self._date(cell_suspension)], projects=[self._project(self._api_project)]) - hub_ids = list(map(str, visitor.files)) - yield self._add_replica(contents, cell_suspension, hub_ids) + yield self._contribution(contents, cell_suspension.ref.entity_id) class SampleTransformer(PartitionedTransformer): @@ -1609,8 +1620,7 @@ def _transform(self, samples: Iterable[Sample]) -> Iterable[Contribution]: ), dates=[self._date(sample)], projects=[self._project(self._api_project)]) - hub_ids = list(map(str, visitor.files)) - yield self._add_replica(contents, sample, hub_ids) + yield self._contribution(contents, sample.ref.entity_id) class BundleAsEntity(DatedEntity): @@ -1646,13 +1656,11 @@ def _dated_entities(self) -> Iterable[DatedEntity]: def estimate(self, partition: BundlePartition) -> int: return int(partition.contains(self._singleton_id)) - def transform(self, partition: BundlePartition) -> Iterable[Transform]: + def transform(self, partition: BundlePartition) -> Iterable[Contribution]: if partition.contains(self._singleton_id): yield self._transform() - else: - return () - def _transform(self) -> Transform: + def _transform(self) -> Contribution: # Project entities are not explicitly linked in the graph. The mere # presence of project metadata in a bundle indicates that all other # entities in that bundle belong to that project. Because of that we @@ -1710,12 +1718,7 @@ def _transform(self) -> Transform: contributed_analyses=contributed_analyses, dates=[self._date(self._singleton_entity())], projects=[self._project(self._api_project)]) - hub_ids = self._hub_ids(visitor) - return self._add_replica(contents, self._singleton_entity(), hub_ids) - - @abstractmethod - def _hub_ids(self, visitor: TransformerVisitor) -> list[str]: - raise NotImplementedError + return self._contribution(contents, str(self._singleton_id)) class ProjectTransformer(SingletonTransformer): @@ -1727,24 +1730,12 @@ def _singleton_entity(self) -> DatedEntity: def entity_type(cls) -> str: return 'projects' - def _hub_ids(self, visitor: TransformerVisitor) -> list[str]: - # Every file in a snapshot is linked to that snapshot's singular - # project, making an explicit list of hub IDs for the project both - # redundant and impractically large. Therefore, we leave the hub IDs - # field empty for projects and rely on the tenet that every file is an - # implicit hub of its parent project. 
- return [] - class BundleTransformer(SingletonTransformer): def _singleton_entity(self) -> DatedEntity: return BundleAsEntity(self.api_bundle) - def replica_type(self, entity: EntityReference) -> str: - assert entity.entity_type == self.entity_type(), entity - return 'links' - @classmethod def aggregator(cls, entity_type: EntityType) -> EntityAggregator | None: if entity_type == 'files': @@ -1755,6 +1746,3 @@ def aggregator(cls, entity_type: EntityType) -> EntityAggregator | None: @classmethod def entity_type(cls) -> str: return 'bundles' - - def _hub_ids(self, visitor: TransformerVisitor) -> list[str]: - return list(map(str, visitor.files)) diff --git a/src/azul/plugins/repository/tdr_anvil/__init__.py b/src/azul/plugins/repository/tdr_anvil/__init__.py index a39ce5b3ac..e276fe6b8a 100644 --- a/src/azul/plugins/repository/tdr_anvil/__init__.py +++ b/src/azul/plugins/repository/tdr_anvil/__init__.py @@ -1,7 +1,4 @@ -from collections import ( - defaultdict, -) -import datetime +import datetime from enum import ( Enum, ) @@ -12,7 +9,7 @@ from typing import ( AbstractSet, Callable, - Mapping, + Iterable, ) import attrs @@ -41,9 +38,10 @@ ) from azul.plugins.metadata.anvil.bundle import ( AnvilBundle, + EntityLink, Key, + KeyLink, KeyReference, - Link, ) from azul.plugins.metadata.anvil.schema import ( anvil_schema, @@ -71,7 +69,7 @@ MutableKeys = set[KeyReference] KeysByType = dict[EntityType, AbstractSet[Key]] MutableKeysByType = dict[EntityType, set[Key]] -KeyLinks = set[Link[KeyReference]] +KeyLinks = set[KeyLink] class BundleEntityType(Enum): @@ -153,21 +151,9 @@ def add_entity(self, crc32='') self.entities[entity] = metadata - def add_links(self, - links: KeyLinks, - entities_by_key: Mapping[KeyReference, EntityReference]) -> None: - def key_ref_to_entity_ref(key_ref: KeyReference) -> EntityReference: - return entities_by_key[key_ref] - - def optional_key_ref_to_entity_ref(key_ref: KeyReference | None) -> EntityReference | None: - return None if key_ref is None else key_ref_to_entity_ref(key_ref) - - self.links.update( - Link(inputs=set(map(key_ref_to_entity_ref, link.inputs)), - activity=optional_key_ref_to_entity_ref(link.activity), - outputs=set(map(key_ref_to_entity_ref, link.outputs))) - for link in links - ) + def add_links(self, links: Iterable[EntityLink]): + self.links.update(links) + EntityLink.group_by_activity(self.links) class Plugin(TDRPlugin[TDRAnvilBundle, TDRSourceSpec, TDRSourceRef, TDRAnvilBundleFQID]): @@ -307,7 +293,6 @@ def _primary_bundle(self, bundle_fqid: TDRAnvilBundleFQID) -> TDRAnvilBundle: log.info('Found %i entities linked to bundle %r: %r', len(keys), bundle_fqid.uuid, arg) - self._simplify_links(links) result = TDRAnvilBundle(fqid=bundle_fqid) entities_by_key: dict[KeyReference, EntityReference] = {} for entity_type, typed_keys in sorted(keys_by_type.items()): @@ -328,7 +313,7 @@ def _primary_bundle(self, bundle_fqid: TDRAnvilBundleFQID) -> TDRAnvilBundle: entity = EntityReference(entity_id=row['datarepo_row_id'], entity_type=entity_type) entities_by_key[key] = entity result.add_entity(entity, self._version, row) - result.add_links(links, entities_by_key) + result.add_links((link.to_entity_link(entities_by_key) for link in links)) return result def _supplementary_bundle(self, bundle_fqid: TDRAnvilBundleFQID) -> TDRAnvilBundle: @@ -350,18 +335,15 @@ def _supplementary_bundle(self, bundle_fqid: TDRAnvilBundleFQID) -> TDRAnvilBund SELECT {', '.join(sorted(columns))} FROM {backtick(self._full_table_name(source, linked_entity_type))} '''))) - entities_by_key = 
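Editor's note: `TDRAnvilBundle.add_links` no longer takes the `entities_by_key` mapping; the repository plugin now resolves each `KeyLink` into an `EntityLink` first via `KeyLink.to_entity_link` and passes the resolved links in. A self-contained sketch of that resolution step with simplified reference types (the real method returns an `EntityLink`; this one returns a plain tuple):

```python
from dataclasses import dataclass


@dataclass(frozen=True)
class KeyReference:
    key: str
    entity_type: str


@dataclass(frozen=True)
class EntityReference:
    entity_type: str
    entity_id: str


@dataclass(frozen=True)
class KeyLink:
    inputs: frozenset[KeyReference]
    outputs: frozenset[KeyReference]
    activity: KeyReference | None = None

    def to_entity_link(self,
                       entities_by_key: dict[KeyReference, EntityReference]
                       ) -> tuple:
        # Translate every key-based reference to the entity reference that
        # was recorded for it while the bundle's rows were being fetched.
        lookup = entities_by_key.__getitem__
        return (frozenset(map(lookup, self.inputs)),
                None if self.activity is None else lookup(self.activity),
                frozenset(map(lookup, self.outputs)))


biosample_key = KeyReference('BS_01', 'biosample')
dataset_key = KeyReference('DS_01', 'dataset')
entities_by_key = {
    biosample_key: EntityReference('biosample', 'datarepo-row-1'),
    dataset_key: EntityReference('dataset', 'datarepo-row-2'),
}
link = KeyLink(inputs=frozenset({dataset_key}), outputs=frozenset({biosample_key}))
inputs, activity, outputs = link.to_entity_link(entities_by_key)
assert activity is None and len(inputs) == len(outputs) == 1
```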
{} link_args = {} for entity_type, row, arg in [ (bundle_entity_type, bundle_entity, 'outputs'), (linked_entity_type, linked_entity, 'inputs') ]: entity_ref = EntityReference(entity_type=entity_type, entity_id=row['datarepo_row_id']) - key_ref = KeyReference(key=row[entity_type + '_id'], entity_type=entity_type) - entities_by_key[key_ref] = entity_ref result.add_entity(entity_ref, self._version, row) - link_args[arg] = {key_ref} - result.add_links({Link(**link_args)}, entities_by_key) + link_args[arg] = {entity_ref} + result.add_links({EntityLink(**link_args)}) return result def _duos_bundle(self, bundle_fqid: TDRAnvilBundleFQID) -> TDRAnvilBundle: @@ -410,15 +392,6 @@ def _consolidate_by_type(self, entities: Keys) -> MutableKeysByType: result[e.entity_type].add(e.key) return result - def _simplify_links(self, links: KeyLinks) -> None: - grouped_links: Mapping[KeyReference, KeyLinks] = defaultdict(set) - for link in links: - grouped_links[link.activity].add(link) - for activity, convergent_links in grouped_links.items(): - if activity is not None and len(convergent_links) > 1: - links -= convergent_links - links.add(Link.merge(convergent_links)) - def _follow_upstream(self, source: TDRSourceSpec, entities: KeysByType @@ -478,13 +451,13 @@ def _upstream_from_biosamples(self, for row in rows: downstream_ref = KeyReference(entity_type='biosample', key=row['biosample_id']) - result.add(Link(outputs={downstream_ref}, - inputs={KeyReference(entity_type='dataset', - key=one(row['part_of_dataset_id']))})) + result.add(KeyLink(outputs={downstream_ref}, + inputs={KeyReference(entity_type='dataset', + key=one(row['part_of_dataset_id']))})) for donor_id in row['donor_id']: - result.add(Link(outputs={downstream_ref}, - inputs={KeyReference(entity_type='donor', - key=donor_id)})) + result.add(KeyLink(outputs={downstream_ref}, + inputs={KeyReference(entity_type='donor', + key=donor_id)})) return result else: return set() @@ -546,7 +519,7 @@ def _upstream_from_files(self, ON f.file_id IN UNNEST(a.generated_file_id) ''') return { - Link( + KeyLink( activity=KeyReference(entity_type=row['activity_table'], key=row['activity_id']), # The generated link is not a complete representation of the # upstream activity because it does not include generated files @@ -575,9 +548,9 @@ def _diagnoses_from_donors(self, WHERE dgn.donor_id IN ({', '.join(map(repr, donor_ids))}) ''') return { - Link(inputs={KeyReference(key=row['diagnosis_id'], entity_type='diagnosis')}, - outputs={KeyReference(key=row['donor_id'], entity_type='donor')}, - activity=None) + KeyLink(inputs={KeyReference(key=row['diagnosis_id'], entity_type='diagnosis')}, + outputs={KeyReference(key=row['donor_id'], entity_type='donor')}, + activity=None) for row in rows } else: @@ -620,12 +593,12 @@ def _downstream_from_biosamples(self, WHERE biosample_id IN ({', '.join(map(repr, biosample_ids))}) ''') return { - Link(inputs={KeyReference(key=row['biosample_id'], entity_type='biosample')}, - outputs={ - KeyReference(key=output_id, entity_type='file') - for output_id in row['generated_file_id'] - }, - activity=KeyReference(key=row['activity_id'], entity_type=row['activity_table'])) + KeyLink(inputs={KeyReference(key=row['biosample_id'], entity_type='biosample')}, + outputs={ + KeyReference(key=output_id, entity_type='file') + for output_id in row['generated_file_id'] + }, + activity=KeyReference(key=row['activity_id'], entity_type=row['activity_table'])) for row in rows } else: @@ -666,12 +639,12 @@ def _downstream_from_files(self, WHERE used_file_id IN ({', 
'.join(map(repr, file_ids))}) ''') return { - Link(inputs={KeyReference(key=row['used_file_id'], entity_type='file')}, - outputs={ - KeyReference(key=file_id, entity_type='file') - for file_id in row['generated_file_id'] - }, - activity=KeyReference(key=row['activity_id'], entity_type=row['activity_table'])) + KeyLink(inputs={KeyReference(key=row['used_file_id'], entity_type='file')}, + outputs={ + KeyReference(key=file_id, entity_type='file') + for file_id in row['generated_file_id'] + }, + activity=KeyReference(key=row['activity_id'], entity_type=row['activity_table'])) for row in rows } else: diff --git a/src/azul/plugins/repository/tdr_hca/__init__.py b/src/azul/plugins/repository/tdr_hca/__init__.py index dcc9f733b6..320b0a4f1b 100644 --- a/src/azul/plugins/repository/tdr_hca/__init__.py +++ b/src/azul/plugins/repository/tdr_hca/__init__.py @@ -56,6 +56,9 @@ TDRBundleFQID, TDRPlugin, ) +from azul.strings import ( + single_quote as sq, +) from azul.terra import ( TDRSourceRef, TDRSourceSpec, @@ -439,22 +442,19 @@ def _retrieve_entities(self, table_name = backtick(self._full_table_name(source, entity_type)) entity_id_type = one(set(map(type, entity_ids))) - def quote(s): - return f"'{s}'" - if entity_type == 'links': assert issubclass(entity_id_type, BundleFQID), entity_id_type entity_ids = cast(set[BundleFQID], entity_ids) where_columns = (pk_column, version_column) where_values = ( - (quote(fqid.uuid), f'TIMESTAMP({quote(fqid.version)})') + (sq(fqid.uuid), f'TIMESTAMP({sq(fqid.version)})') for fqid in entity_ids ) expected = {fqid.uuid for fqid in entity_ids} else: assert issubclass(entity_id_type, str), (entity_type, entity_id_type) where_columns = (pk_column,) - where_values = ((quote(entity_id),) for entity_id in entity_ids) + where_values = ((sq(str(entity_id)),) for entity_id in entity_ids) expected = entity_ids query = f''' SELECT {', '.join({pk_column, *non_pk_columns})} diff --git a/src/azul/service/manifest_service.py b/src/azul/service/manifest_service.py index 953c58ece7..c82222fda2 100644 --- a/src/azul/service/manifest_service.py +++ b/src/azul/service/manifest_service.py @@ -140,6 +140,9 @@ StorageObjectNotFound, StorageService, ) +from azul.strings import ( + double_quote as dq, +) from azul.types import ( AnyJSON, FlatJSON, @@ -887,9 +890,8 @@ def _cmd_exe_quote(cls, s: str) -> str: """ Escape a string for insertion into a `cmd.exe` command line """ - assert '"' not in s, s assert '\\' not in s, s - return f'"{s}"' + return dq(s) @classmethod def command_lines(cls, diff --git a/src/azul/strings.py b/src/azul/strings.py index 494880c8e9..72b7995fca 100644 --- a/src/azul/strings.py +++ b/src/azul/strings.py @@ -9,6 +9,10 @@ minmax, ) +from azul import ( + reject, +) + def to_camel_case(text: str) -> str: camel_cased = ''.join(part.title() for part in text.split('_')) @@ -223,3 +227,109 @@ def longest_common_prefix(strings: Iterable[str]) -> Optional[str]: if s2[i] != c: return s1[:i] return s1 + + +def join_lines(*lines: str) -> str: + """ + Join the arguments with a newline character. + + >>> join_lines() + '' + + >>> join_lines('a') + 'a' + + >>> join_lines('a', 'b') + 'a\\nb' + """ + return '\n'.join(lines) + + +def join_words(*words: str) -> str: + """ + Join the arguments with a space character. 
+ + >>> join_words() + '' + + >>> join_words('a') + 'a' + + >>> join_words('a', 'b') + 'a b' + """ + return ' '.join(words) + + +def delimit(s: str, delimiter: str) -> str: + """ + Prepend and append a delimiter to a string after ensuring that the former + does not occur in the latter. + + >>> delimit('foo', "'") + "'foo'" + + >>> delimit("foo's", "'") + Traceback (most recent call last): + ... + azul.RequirementError: ("'", 'must not occur in', "foo's") + """ + reject(delimiter in s, delimiter, 'must not occur in', s) + return delimiter + s + delimiter + + +def back_quote(*words: str) -> str: + """ + Join the arguments with a space character and enclose the result in back + quotes. The arguments must not contain back quotes. + + >>> back_quote() + '``' + + >>> back_quote('foo', 'bar') + '`foo bar`' + + >>> back_quote('foo`s') + Traceback (most recent call last): + ... + azul.RequirementError: ('`', 'must not occur in', 'foo`s') + """ + return delimit(join_words(*words), '`') + + +def single_quote(*words: str) -> str: + """ + Join the arguments with a space character and enclose the result in single + quotes. The arguments must not contain single quotes. + + >>> single_quote() + "''" + + >>> single_quote('foo', 'bar') + "'foo bar'" + + >>> single_quote("foo", "bar's") + Traceback (most recent call last): + ... + azul.RequirementError: ("'", 'must not occur in', "foo bar's") + """ + return delimit(join_words(*words), "'") + + +def double_quote(*words: str) -> str: + """ + Join the arguments with a space character and enclose the result in double + quotes. The arguments must not contain double quotes. + + >>> double_quote() + '""' + + >>> double_quote('foo', 'bar') + '"foo bar"' + + >>> double_quote('foo', 'b"a"r') + Traceback (most recent call last): + ... + azul.RequirementError: ('"', 'must not occur in', 'foo b"a"r') + """ + return delimit(join_words(*words), '"') diff --git a/src/azul/terraform.py b/src/azul/terraform.py index cd9afe287f..f40382e80f 100644 --- a/src/azul/terraform.py +++ b/src/azul/terraform.py @@ -28,6 +28,9 @@ config, require, ) +from azul.chalice import ( + AzulChaliceApp, +) from azul.deployment import ( aws, ) @@ -802,11 +805,34 @@ def tf_config(self, app_name): # Setting this property using AWS API Gateway extensions to the OpenAPI # specification works around this issue. # + # We ran into similar difficulties when using Terraform to configure + # default responses for the API, so we use these extensions for that + # purpose, too. + # + openapi_spec = json.loads(locals[app_name]) rest_api = resources['aws_api_gateway_rest_api'][app_name] assert 'minimum_compression_size' not in rest_api, rest_api - openapi_spec = json.loads(locals[app_name]) key = 'x-amazon-apigateway-minimum-compression-size' openapi_spec[key] = config.minimum_compression_size + assert 'aws_api_gateway_gateway_response' not in resources, resources + openapi_spec['x-amazon-apigateway-gateway-responses'] = { + f'DEFAULT_{response_type}': { + 'responseParameters': { + # Static value response header parameters must be enclosed + # within a pair of single quotes. + # + # https://docs.aws.amazon.com/apigateway/latest/developerguide/request-response-data-mappings.html#mapping-response-parameters + # https://docs.aws.amazon.com/apigateway/latest/developerguide/api-gateway-swagger-extensions-gateway-responses.html + # + # Note that azul.strings.single_quote() is not used here + # since API Gateway allows internal single quotes in the + # value, which that function would prohibit. 
+ # + f'gatewayresponse.header.{k}': f"'{v}'" + for k, v in AzulChaliceApp.security_headers.items() + } + } for response_type in ['4XX', '5XX'] + } locals[app_name] = json.dumps(openapi_spec) return { diff --git a/src/humancellatlas/data/metadata/api.py b/src/humancellatlas/data/metadata/api.py index 26c650c250..bd637f664c 100644 --- a/src/humancellatlas/data/metadata/api.py +++ b/src/humancellatlas/data/metadata/api.py @@ -138,8 +138,9 @@ def _optional_datetime(self, s: str | None) -> datetime | None: return s if s is None else self._datetime(s) @property - def address(self): - return self.schema_name + '@' + str(self.document_id) + def ref(self) -> EntityReference: + return EntityReference(entity_type=self.schema_name, + entity_id=str(self.document_id)) @property def schema_name(self): @@ -202,9 +203,10 @@ def accept(self, visitor: EntityVisitor): class LinkError(RuntimeError): def __init__(self, entity: LinkedEntity, other_entity: Entity, forward: bool) -> None: - super().__init__(entity.address + - ' cannot ' + ('reference ' if forward else 'be referenced by ') + - other_entity.address) + super().__init__( + f'{entity.ref} cannot {"reference" if forward else "be referenced by"} ' + f'{other_entity.ref}' + ) L = TypeVar('L', bound=LinkedEntity) @@ -1042,6 +1044,10 @@ def sequencing_output(self) -> list[SequenceFile]: if isinstance(f, SequenceFile) and any(ps.is_sequencing_process() for ps in f.from_processes.values())] + @property + def ref(self) -> EntityReference: + return EntityReference(entity_type='links', entity_id=str(self.uuid)) + entity_types = { # Biomaterials diff --git a/terraform/browser/browser.tf.json.template.py b/terraform/browser/browser.tf.json.template.py index d104051235..c1818ce65e 100644 --- a/terraform/browser/browser.tf.json.template.py +++ b/terraform/browser/browser.tf.json.template.py @@ -31,6 +31,11 @@ from azul.deployment import ( aws, ) +from azul.strings import ( + double_quote as dq, + join_words as jw, + single_quote as sq, +) from azul.terraform import ( block_public_s3_bucket_access, emit_tf, @@ -370,7 +375,7 @@ def emit(): '--fail', '--silent', gitlab_helper.curl_auth_flags(), - quote(gitlab_helper.tarball_url(site)) + dq(gitlab_helper.tarball_url(site)) ]), ' '.join([ # --transform is specific to GNU Tar, which, on macOS must be installed @@ -492,60 +497,54 @@ def predicate(s): def content_security_policy() -> str: - def q(s: str) -> str: - return f"'{s}'" - - def s(*args: str) -> str: - return ' '.join(args) - - self = q('self') - none = q('none') - unsafe_inline = q('unsafe-inline') - unsafe_eval = q('unsafe-eval') + self = sq('self') + none = sq('none') + unsafe_inline = sq('unsafe-inline') + unsafe_eval = sq('unsafe-eval') return ';'.join([ - s('default-src', self), - s('object-src', none), - s('frame-src', none), - s('frame-ancestors', none), - s('child-src', none), - s('img-src', - self, - 'data:', - 'https://lh3.googleusercontent.com', - 'https://www.google-analytics.com', - 'https://www.googletagmanager.com'), - s('script-src', - self, - unsafe_inline, - unsafe_eval, - 'https://accounts.google.com/gsi/client', - 'https://www.google-analytics.com', - 'https://www.googletagmanager.com'), - s('style-src', - self, - unsafe_inline, - 'https://fonts.googleapis.com', - 'https://p.typekit.net', - 'https://use.typekit.net'), - s('font-src', - self, - 'data:', - 'https://fonts.gstatic.com', - 'https://use.typekit.net/af/'), - s('connect-src', - self, - 'https://www.google-analytics.com', - 'https://www.googleapis.com/oauth2/v3/userinfo', - 
'https://www.googletagmanager.com', - 'https://support.terra.bio/api/v2/', - str(furl(config.sam_service_url, - path='/register/user/v1')), - str(furl(config.sam_service_url, - path='/register/user/v2/self/termsOfServiceDetails')), - str(furl(config.terra_service_url, - path='/api/nih/status')), - str(config.service_endpoint)) + jw('default-src', self), + jw('object-src', none), + jw('frame-src', none), + jw('frame-ancestors', none), + jw('child-src', none), + jw('img-src', + self, + 'data:', + 'https://lh3.googleusercontent.com', + 'https://www.google-analytics.com', + 'https://www.googletagmanager.com'), + jw('script-src', + self, + unsafe_inline, + unsafe_eval, + 'https://accounts.google.com/gsi/client', + 'https://www.google-analytics.com', + 'https://www.googletagmanager.com'), + jw('style-src', + self, + unsafe_inline, + 'https://fonts.googleapis.com', + 'https://p.typekit.net', + 'https://use.typekit.net'), + jw('font-src', + self, + 'data:', + 'https://fonts.gstatic.com', + 'https://use.typekit.net/af/'), + jw('connect-src', + self, + 'https://www.google-analytics.com', + 'https://www.googleapis.com/oauth2/v3/userinfo', + 'https://www.googletagmanager.com', + 'https://support.terra.bio/api/v2/', + str(furl(config.sam_service_url, + path='/register/user/v1')), + str(furl(config.sam_service_url, + path='/register/user/v2/self/termsOfServiceDetails')), + str(furl(config.terra_service_url, + path='/api/nih/status')), + str(config.service_endpoint)) ]) @@ -613,7 +612,7 @@ def curl_auth_flags(self) -> str: token_type, token = 'PRIVATE', config.gitlab_access_token if token is None: token_type, token = 'JOB', os.environ['CI_JOB_TOKEN'] - header = quote(f'{token_type}-TOKEN: {token}') + header = dq(token_type + '-TOKEN:', token) return '--header ' + header def tarball_hash(self, site: config.BrowserSite) -> str: @@ -652,11 +651,6 @@ def tarball_version(self, branch: str) -> str: return branch.replace('/', '_') -def quote(s): - assert '"' not in s, s - return '"' + s + '"' - - gitlab_helper = GitLabHelper() del GitLabHelper diff --git a/terraform/gitlab/gitlab.tf.json.template.py b/terraform/gitlab/gitlab.tf.json.template.py index 6e4f544853..6c5e0f28a6 100644 --- a/terraform/gitlab/gitlab.tf.json.template.py +++ b/terraform/gitlab/gitlab.tf.json.template.py @@ -26,6 +26,10 @@ ) from azul.strings import ( departition, + double_quote as dq, + join_lines as jl, + join_words as jw, + single_quote as sq, ) from azul.terraform import ( chalice, @@ -272,22 +276,6 @@ def merge(sets: Iterable[Iterable[str]]) -> Iterable[str]: return sorted(set(chain(*sets))) -def jw(*words): - return ' '.join(words) - - -def jl(*lines): - return '\n'.join(lines) - - -def qq(*words): - return '"' + jw(*words) + '"' - - -def sq(*words): - return "'" + jw(*words) + "'" - - emit_tf({} if config.terraform_component != 'gitlab' else { 'data': { 'aws_sns_topic': { @@ -1882,7 +1870,7 @@ def sq(*words): str(clamav_image), '/bin/sh', '-c', - qq( + dq( 'freshclam', '&& echo freshclam succeeded', '|| (echo freshclam failed; false)', diff --git a/terraform/shared/shared.tf.json.template.py b/terraform/shared/shared.tf.json.template.py index 129d879c68..0c46a1a467 100644 --- a/terraform/shared/shared.tf.json.template.py +++ b/terraform/shared/shared.tf.json.template.py @@ -385,9 +385,8 @@ def conformance_pack(name: str) -> str: # security boundary, so vulnerabilities that are detected within # them do not need to be addressed with the same urgency. 
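Editor's note: the browser template now builds its Content-Security-Policy from the shared `join_words`/`single_quote` helpers instead of module-local ones. For reference, a tiny sketch of how those helpers compose into the final header value; the directive list is abbreviated and the helpers are simplified (the real `single_quote` also rejects embedded single quotes).

```python
def join_words(*words: str) -> str:
    return ' '.join(words)


def single_quote(*words: str) -> str:
    # Simplified: no rejection of embedded single quotes here.
    return "'" + join_words(*words) + "'"


jw, sq = join_words, single_quote

self_, none = sq('self'), sq('none')
csp = ';'.join([
    jw('default-src', self_),
    jw('object-src', none),
    jw('img-src', self_, 'data:', 'https://www.googletagmanager.com'),
])
assert csp == ("default-src 'self';object-src 'none';"
               "img-src 'self' data: https://www.googletagmanager.com")
```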
# - # Using CF stack because the AWS provider does not support - # Inspector filters and the AWSCC provider fails due to - # https://github.com/hashicorp/terraform-provider-awscc/issues/1364 + # FIXME: Remove workaround for false Terraform bug + # https://github.com/DataBiosphere/azul/issues/6577 'inspector_filters': { 'name': config.qualified_resource_name('inspectorfilters'), 'template_body': json.dumps({ diff --git a/test/hca_metadata_api/test.py b/test/hca_metadata_api/test.py index 6ae9ac7077..bdb1514158 100644 --- a/test/hca_metadata_api/test.py +++ b/test/hca_metadata_api/test.py @@ -437,7 +437,7 @@ def _assert_bundle(self, uuid, version, manifest, metadata, links, self.assertIn(DonorOrganism, root_entity_types) self.assertTrue({DonorOrganism, SupplementaryFile}.issuperset(root_entity_types)) root_entity = next(iter(root_entities)) - self.assertRegex(root_entity.address, 'donor_organism@.*') + self.assertEqual(root_entity.ref.entity_type, 'donor_organism') self.assertIsInstance(root_entity, DonorOrganism) self.assertEqual(root_entity.organism_age_in_seconds, age_range) self.assertTrue(root_entity.sex in ('female', 'male', 'unknown')) diff --git a/test/indexer/data/aaa96233-bf27-44c7-82df-b4dc15ad4d9d.2018-11-02T11:33:44.698028Z.results.json b/test/indexer/data/aaa96233-bf27-44c7-82df-b4dc15ad4d9d.2018-11-02T11:33:44.698028Z.results.json index b1db16dbda..d7b65274b4 100644 --- a/test/indexer/data/aaa96233-bf27-44c7-82df-b4dc15ad4d9d.2018-11-02T11:33:44.698028Z.results.json +++ b/test/indexer/data/aaa96233-bf27-44c7-82df-b4dc15ad4d9d.2018-11-02T11:33:44.698028Z.results.json @@ -1,6 +1,6 @@ [ { - "_id": "bundles_aaa96233-bf27-44c7-82df-b4dc15ad4d9d_50b88c774d130ce5cb9e680241549c2df665efca", + "_id": "links_aaa96233-bf27-44c7-82df-b4dc15ad4d9d_50b88c774d130ce5cb9e680241549c2df665efca", "_index": "azul_v2_dev_test_replica", "_score": 1.0, "_type": "_doc", @@ -3460,7 +3460,7 @@ } }, { - "_id": "files_0c5ac7c0-817e-40d4-b1b1-34c3d5cfecdb_4628c80cede8a4dd39584f8b50619edacc7f62e7", + "_id": "sequence_file_0c5ac7c0-817e-40d4-b1b1-34c3d5cfecdb_4628c80cede8a4dd39584f8b50619edacc7f62e7", "_index": "azul_v2_dev_test_replica", "_score": 1.0, "_source": { @@ -3491,7 +3491,7 @@ "_type": "_doc" }, { - "_id": "cell_suspensions_412898c5-5b9b-4907-b07c-e9b89666e204_f8ca7504548249c223277135af497beb5fba29df", + "_id": "cell_suspension_412898c5-5b9b-4907-b07c-e9b89666e204_f8ca7504548249c223277135af497beb5fba29df", "_index": "azul_v2_dev_test_replica", "_score": 1.0, "_source": { @@ -3530,7 +3530,7 @@ "_type": "_doc" }, { - "_id": "files_70d1af4a-82c8-478a-8960-e9028b3616ca_8ee6fbddb7c933ebe39b31b977c51af21687f036", + "_id": "sequence_file_70d1af4a-82c8-478a-8960-e9028b3616ca_8ee6fbddb7c933ebe39b31b977c51af21687f036", "_index": "azul_v2_dev_test_replica", "_score": 1.0, "_source": { @@ -3561,7 +3561,7 @@ "_type": "_doc" }, { - "_id": "samples_a21dc760-a500-4236-bcff-da34a0e873d2_0bb754590d94de62d6fb15bdbac5f64f54072cb0", + "_id": "specimen_from_organism_a21dc760-a500-4236-bcff-da34a0e873d2_0bb754590d94de62d6fb15bdbac5f64f54072cb0", "_index": "azul_v2_dev_test_replica", "_score": 1.0, "_source": { @@ -3614,7 +3614,7 @@ "_type": "_doc" }, { - "_id": "projects_e8642221-4c2c-4fd7-b926-a68bce363c88_42d51043dcd0030b5ee9ce47fa30c85bcc92bac6", + "_id": "project_e8642221-4c2c-4fd7-b926-a68bce363c88_42d51043dcd0030b5ee9ce47fa30c85bcc92bac6", "_index": "azul_v2_dev_test_replica", "_score": 1.0, "_source": { @@ -3691,5 +3691,175 @@ "hub_ids": [] }, "_type": "_doc" + }, + { + "_id": 
"donor_organism_7b07b9d0-cc0e-4098-9f64-f4a569f7d746_7dd78e18008a0c036bb25e84f7742c075e16873f", + "_index": "azul_v2_dummy_test_replica", + "_score": 1.0, + "_source": { + "contents": { + "biomaterial_core": { + "biomaterial_id": "DID_scRSq06", + "ncbi_taxon_id": [ + 9606 + ] + }, + "death": { + "cause_of_death": "stroke" + }, + "describedBy": "https://schema.humancellatlas.org/type/biomaterial/10.1.2/donor_organism", + "diseases": [ + { + "ontology": "PATO:0000461", + "ontology_label": "normal", + "text": "normal" + } + ], + "genus_species": [ + { + "ontology": "NCBITaxon:9606", + "ontology_label": "Australopithecus", + "text": "Australopithecus" + } + ], + "human_specific": { + "body_mass_index": 29.5, + "ethnicity": [ + { + "ontology": "hancestro:0005", + "ontology_label": "European", + "text": "European" + } + ] + }, + "is_living": "no", + "organism_age": "38", + "organism_age_unit": { + "ontology": "UO:0000036", + "ontology_label": "year", + "text": "year" + }, + "provenance": { + "document_id": "7b07b9d0-cc0e-4098-9f64-f4a569f7d746", + "submission_date": "2018-11-02T10:02:12.191Z", + "update_date": "2018-11-02T10:07:39.622Z" + }, + "schema_type": "biomaterial", + "sex": "female" + }, + "entity_id": "7b07b9d0-cc0e-4098-9f64-f4a569f7d746", + "hub_ids": [ + "0c5ac7c0-817e-40d4-b1b1-34c3d5cfecdb", + "70d1af4a-82c8-478a-8960-e9028b3616ca" + ], + "replica_type": "donor_organism" + }, + "_type": "_doc" + }, + { + "_id": "library_preparation_protocol_9c32cf70-3ed7-4720-badc-5ee71e8a38af_33c8ff808fc57c3dcb4b3872d1dcbc859e45a85a", + "_index": "azul_v2_dummy_test_replica", + "_score": 1.0, + "_source": { + "contents": { + "describedBy": "https://schema.humancellatlas.org/type/protocol/sequencing/4.3.3/library_preparation_protocol", + "end_bias": "full length", + "input_nucleic_acid_molecule": { + "ontology": "OBI:0000869", + "text": "polyA RNA" + }, + "library_construction_approach": { + "ontology": "EFO:0008931", + "ontology_label": "Smart-seq2", + "text": "Smart-seq2" + }, + "library_construction_kit": { + "manufacturer": "Illumina", + "retail_name": "Nextera XT kit" + }, + "nucleic_acid_source": "single cell", + "primer": "poly-dT", + "protocol_core": { + "protocol_id": "library_preparation_protocol_1" + }, + "provenance": { + "document_id": "9c32cf70-3ed7-4720-badc-5ee71e8a38af", + "submission_date": "2018-11-02T10:05:05.547Z", + "update_date": "2018-11-02T10:05:10.360Z" + }, + "schema_type": "protocol", + "strand": "unstranded" + }, + "entity_id": "9c32cf70-3ed7-4720-badc-5ee71e8a38af", + "hub_ids": [ + "0c5ac7c0-817e-40d4-b1b1-34c3d5cfecdb", + "70d1af4a-82c8-478a-8960-e9028b3616ca" + ], + "replica_type": "library_preparation_protocol" + }, + "_type": "_doc" + }, + { + "_id": "sequencing_protocol_61e629ed-0135-4492-ac8a-5c4ab3ccca8a_187e49da61e5d3fbf91f3a6cf626bc4ce3c2d2be", + "_index": "azul_v2_dummy_test_replica", + "_score": 1.0, + "_source": { + "contents": { + "describedBy": "https://schema.humancellatlas.org/type/protocol/sequencing/9.0.3/sequencing_protocol", + "instrument_manufacturer_model": { + "ontology": "EFO:0008566", + "ontology_label": "Illumina NextSeq 500", + "text": "Illumina NextSeq 500" + }, + "paired_end": true, + "protocol_core": { + "protocol_id": "sequencing_protocol_1" + }, + "provenance": { + "document_id": "61e629ed-0135-4492-ac8a-5c4ab3ccca8a", + "submission_date": "2018-11-02T10:05:05.555Z", + "update_date": "2018-11-02T10:05:10.376Z" + }, + "schema_type": "protocol", + "sequencing_approach": { + "ontology": "EFO:0008896", + "ontology_label": "RNA-Seq", + 
"text": "RNA-Seq" + } + }, + "entity_id": "61e629ed-0135-4492-ac8a-5c4ab3ccca8a", + "hub_ids": [ + "0c5ac7c0-817e-40d4-b1b1-34c3d5cfecdb", + "70d1af4a-82c8-478a-8960-e9028b3616ca" + ], + "replica_type": "sequencing_protocol" + }, + "_type": "_doc" + }, + { + "_id": "process_771ddaf6-3a4f-4314-97fe-6294ff8e25a4_2e08df0ed1adfe582cd63f4daef287d9a637cdf1", + "_index": "azul_v2_dummy_test_replica", + "_score": 1.0, + "_source": { + "contents": { + "describedBy": "https://schema.humancellatlas.org/type/process/6.0.2/process", + "process_core": { + "process_id": "SRR3562915" + }, + "provenance": { + "document_id": "771ddaf6-3a4f-4314-97fe-6294ff8e25a4", + "submission_date": "2018-11-02T10:06:37.087Z", + "update_date": "2018-11-02T10:13:43.197Z" + }, + "schema_type": "process" + }, + "entity_id": "771ddaf6-3a4f-4314-97fe-6294ff8e25a4", + "hub_ids": [ + "0c5ac7c0-817e-40d4-b1b1-34c3d5cfecdb", + "70d1af4a-82c8-478a-8960-e9028b3616ca" + ], + "replica_type": "process" + }, + "_type": "_doc" } ] diff --git a/test/indexer/test_anvil.py b/test/indexer/test_anvil.py index bf0ddbed18..3ad0d3b719 100644 --- a/test/indexer/test_anvil.py +++ b/test/indexer/test_anvil.py @@ -204,6 +204,8 @@ def test_dataset_description(self): bundles = [self.primary_bundle(), self.duos_bundle()] for bundle_fqid in bundles: bundle = cast(TDRAnvilBundle, self._load_canned_bundle(bundle_fqid)) + # To simplify the test, we drop all entities from the bundles + # except for the dataset bundle.links.clear() bundle.entities = {dataset_ref: bundle.entities[dataset_ref]} self._index_bundle(bundle, delete=False) @@ -231,5 +233,7 @@ def test_dataset_description(self): self.assertDictEqual(doc_counts, { DocumentType.aggregate: 1, DocumentType.contribution: 2, - **({DocumentType.replica: 2} if config.enable_replicas else {}) + # No replica is emitted for the primary dataset because we dropped + # the files (hubs) from its bundle above + **({DocumentType.replica: 1} if config.enable_replicas else {}) }) diff --git a/test/indexer/test_indexer.py b/test/indexer/test_indexer.py index 34c395d77b..2fbf8206ba 100644 --- a/test/indexer/test_indexer.py +++ b/test/indexer/test_indexer.py @@ -132,28 +132,12 @@ def new_bundle(self): def metadata_plugin(self) -> MetadataPlugin: return MetadataPlugin.load(self.catalog).create() - def _num_replicas(self, *, num_additions: int, num_dups: int = 0) -> int: - """ - The number of replicas the index is expected to contain. - - :param num_additions: Number of addition contributions written to the - indices - - :param num_dups: How many of those contributions had identical contents - with another contribution - """ - if config.enable_replicas: - assert num_dups <= num_additions - return num_additions - num_dups - else: - return 0 - def _assert_hit_counts(self, hits: list[JSON], num_contribs: int, + num_replicas: int, *, num_aggs: Optional[int] = None, - num_replicas: Optional[int] = None ): """ Verify that the indices contain the correct number of hits of each @@ -163,22 +147,19 @@ def _assert_hit_counts(self, :param num_contribs: Expected number of contributions + :param num_replicas: Expected number of replicas, assuming replicas are + enabled. + :param num_aggs: Expected number of aggregates. If unspecified, ``num_contribs`` becomes the default. - - :param num_replicas: Expected number of replicas. If unspecified, - ``num_contribs`` is used to calculate it, assuming - all contributions have distinct contents. """ if num_aggs is None: # By default, assume 1 aggregate per contribution. 
num_aggs = num_contribs - if num_replicas is None: - num_replicas = self._num_replicas(num_additions=num_contribs) expected = { DocumentType.contribution: num_contribs, DocumentType.aggregate: num_aggs, - DocumentType.replica: num_replicas + DocumentType.replica: num_replicas if config.enable_replicas else 0, } actual = dict.fromkeys(expected.keys(), 0) actual |= Counter(self._parse_index_name(h)[1] for h in hits) @@ -235,13 +216,13 @@ def test_indexing(self): if self._parse_index_name(h)[1] is not DocumentType.replica ] self.assertElasticEqual(expected_hits, hits) - contributions = [] - replicas = [] + contributions = set() + replicas = set() for hit in hits: qualifier, doc_type = self._parse_index_name(hit) if doc_type is DocumentType.replica: entity_id = ReplicaCoordinates.from_hit(hit).entity.entity_id - replicas.append(entity_id) + replicas.add(entity_id) if entity_id == bundle.uuid: expected = bundle.links else: @@ -257,11 +238,15 @@ def test_indexing(self): self.assertEqual(expected, actual) elif doc_type is DocumentType.contribution: entity_id = ContributionCoordinates.from_hit(hit).entity.entity_id - contributions.append(entity_id) - contributions.sort() - replicas.sort() - # Every contribution should be replicated - self.assertEqual(contributions if enable_replicas else [], replicas) + contributions.add(entity_id) + # Every contribution should be replicated, but many + # replicated entities never manifest as outer + # entities in the index and thus have no + # corresponding contribution. + if enable_replicas: + self.assertIsSubset(contributions, replicas) + else: + self.assertEqual(set(), replicas) finally: self.index_service.delete_indices(self.catalog) @@ -271,21 +256,21 @@ def test_deletion(self): """ # Ensure that we have a bundle whose documents are written individually # and another one that's written in bulk. - bundle_sizes = { - self.new_bundle: 6, - self.bundle_fqid(uuid='2a87dc5c-0c3c-4d91-a348-5d784ab48b92', - version='2018-03-29T10:39:45.437487Z'): 258 - } + bundle_sizes = [ + (self.new_bundle, 6, 10), + (self.bundle_fqid(uuid='2a87dc5c-0c3c-4d91-a348-5d784ab48b92', + version='2018-03-29T10:39:45.437487Z'), 258, 263) + ] self.assertTrue( - min(bundle_sizes.values()) + bundle_sizes[0][1] < IndexWriter.bulk_threshold - < max(bundle_sizes.values()) + < bundle_sizes[1][1] ) field_types = self.index_service.catalogued_field_types() aggregate_cls = self.metadata_plugin.aggregate_class() - for bundle_fqid, size in bundle_sizes.items(): - with self.subTest(size=size): + for bundle_fqid, num_contribs, num_replicas in bundle_sizes: + with self.subTest(num_contribs=num_contribs): bundle = self._load_canned_bundle(bundle_fqid) bundle = DSSBundle(fqid=bundle_fqid, manifest=bundle.manifest, @@ -295,7 +280,7 @@ def test_deletion(self): try: self._index_bundle(bundle) hits = self._get_all_hits() - self._assert_hit_counts(hits, size) + self._assert_hit_counts(hits, num_contribs, num_replicas) for hit in hits: qualifier, doc_type = self._parse_index_name(hit) if doc_type is DocumentType.aggregate: @@ -315,12 +300,11 @@ def test_deletion(self): hits = self._get_all_hits() # Twice the number of contributions because deletions create # new documents instead of removing them. - num_contribs = size * 2 + num_contribs += num_contribs # The aggregates are removed when the deletions cause their # contents to become empty. num_aggs = 0 # Deletions do not affect the number of replicas. 
- num_replicas = self._num_replicas(num_additions=size) self._assert_hit_counts(hits, num_contribs=num_contribs, num_aggs=num_aggs, @@ -396,10 +380,10 @@ def test_duplicate_notification(self): entity = next(e for e in tallies_1.keys() if e.entity_type != 'bundles') tallies_1.pop(entity) - entity_contents = one( - metadata + replica_ref, entity_contents = one( + (parsed_ref, metadata) for ref, metadata in bundle.metadata.items() - if EntityReference.parse(ref).entity_id == entity.entity_id + if (parsed_ref := EntityReference.parse(ref)).entity_id == entity.entity_id ) coordinates = [ ContributionCoordinates( @@ -411,7 +395,7 @@ def test_duplicate_notification(self): if config.enable_replicas: coordinates.append( ReplicaCoordinates( - entity=entity, + entity=replica_ref, content_hash=json_hash(entity_contents).hexdigest() ).with_catalog(self.catalog) ) @@ -483,9 +467,9 @@ def test_deletion_before_addition(self): def _assert_index_counts(self, *, just_deletion: bool): # Two files, a project, a cell suspension, a sample, and a bundle - num_entities = 6 - num_addition_contribs = 0 if just_deletion else num_entities - num_deletion_contribs = num_entities + num_old_contribs = 6 + num_addition_contribs = 0 if just_deletion else num_old_contribs + num_deletion_contribs = num_old_contribs hits = self._get_all_hits() @@ -500,7 +484,7 @@ def _assert_index_counts(self, *, just_deletion: bool): num_contribs = num_addition_contribs + num_deletion_contribs # These deletion markers do not affect the number of replicas because we # don't support deleting replicas. - num_replicas = self._num_replicas(num_additions=num_addition_contribs) + num_replicas = 0 if just_deletion else 10 self._assert_hit_counts(hits, num_contribs=num_contribs, num_aggs=0, @@ -1135,7 +1119,9 @@ def test_derived_files(self): hits = self._get_all_hits() num_files = 33 num_expected = dict(files=num_files, samples=1, cell_suspensions=1, projects=1, bundles=1) - self._assert_hit_counts(hits, sum(num_expected.values())) + num_contribs = sum(num_expected.values()) + num_replicas = 41 + self._assert_hit_counts(hits, num_contribs, num_replicas) num_contribs, num_aggregates = Counter(), Counter() for hit in hits: qualifier, doc_type = self._parse_index_name(hit) @@ -1202,37 +1188,27 @@ def _assert_old_bundle(self, *, expect_new_version: bool | None) -> HitsById: :return: A dictionary with all hits from the old bundle """ # Two files, a project, a cell suspension, a sample, and a bundle - num_entities = 6 - - # Expect a replica for each entity in the old version - num_additions = num_entities - if expect_new_version is True: - # Expect an updated replica for each entity in the new version - num_additions += num_entities - elif expect_new_version is None: - # Even after the new version is deleted, the updated replicas - # remain, since deletion of replicas is not supported - num_additions += num_entities - # New and old replicas for `links.json` are identical - num_dups = 0 if expect_new_version is False else 1 - num_replicas = self._num_replicas(num_additions=num_additions, - num_dups=num_dups) - + num_new_contribs = 6 + num_contribs = num_new_contribs # Expect the old version's contributions - num_contribs = num_entities - if expect_new_version is True: - # Expect the new version's contributions - num_contribs += num_entities - elif expect_new_version is None: - # Expect the new version's contributions … - num_contribs += num_entities - # as well as deletion markers for them - num_contribs += num_entities + num_aggs = num_new_contribs + # 
Expect a replica for each entity in the old version + num_replicas = 10 + if expect_new_version in (True, None): + # New and old replicas for `links.json` are identical + num_replicas += num_replicas - 1 + # Expect the new version's contributions... + num_contribs += num_new_contribs + if expect_new_version is None: + # ...as well as deletion markers for them + num_contribs += num_new_contribs + # Even after the new version is deleted, the updated replicas + # remain, since deletion of replicas is not supported hits = self._get_all_hits() self._assert_hit_counts(hits, num_contribs=num_contribs, - num_aggs=num_entities, + num_aggs=num_aggs, num_replicas=num_replicas) num_actual_new_contributions = 0 @@ -1272,9 +1248,9 @@ def _assert_old_bundle(self, *, expect_new_version: bool | None) -> HitsById: # We count the deleted contributions here too since they should have a # corresponding addition contribution - self.assertEqual(0 if expect_new_version is False else num_entities, + self.assertEqual(0 if expect_new_version is False else num_new_contribs, num_actual_new_contributions) - self.assertEqual(num_entities if expect_new_version is None else 0, + self.assertEqual(num_new_contribs if expect_new_version is None else 0, num_actual_new_deleted_contributions) return hits_by_id @@ -1297,23 +1273,21 @@ def _assert_new_bundle(self, self.assertTrue(expect_old_version) # Two files, a project, a cell suspension, a sample, and a bundle - num_entities = 6 + num_old_contribs = 6 + num_contribs = num_old_contribs + num_aggs = num_old_contribs + num_replicas = 10 - # Expect an updaded replica for each entity in the new version - num_contribs = num_entities if expect_old_version: - # Expect a replica for each entity in the old version - num_contribs += num_entities - - # New and old replicas for `links.json` are identical - num_dups = 1 if expect_old_version else 0 - num_replicas = self._num_replicas(num_additions=num_contribs, - num_dups=num_dups) + # Expect an updated contribution for each entity in the old version + num_contribs += num_old_contribs + # New and old replicas for `links.json` are identical + num_replicas += num_replicas - 1 hits = self._get_all_hits() self._assert_hit_counts(hits, num_contribs=num_contribs, - num_aggs=num_entities, + num_aggs=num_aggs, num_replicas=num_replicas) num_actual_old_contributions = 0 @@ -1360,7 +1334,7 @@ def _assert_new_bundle(self, elif doc_type is DocumentType.contribution: self.assertNotIn('Farmers Trucks', [c.get('institution') for c in project['contributors']]) - self.assertEqual(num_entities if expect_old_version else 0, + self.assertEqual(num_old_contribs if expect_old_version else 0, num_actual_old_contributions) def _extract_bundle_version(self, doc_type: DocumentType, source: JSON) -> str: @@ -1409,17 +1383,16 @@ def mocked_mget(self, body, _source_includes): hits = self._get_all_hits() file_uuids = set() # Two files, a project, a cell suspension, a sample, and a bundle - num_entities = 6 + num_contribs = 6 # Two bundles - num_contribs = num_entities * 2 + num_contribs = num_contribs * 2 # Both bundles share the same sample and the project, so they get # aggregated only once num_aggs = num_contribs - 2 # The sample contributions from each bundle are identical and yield a # single replica, but the project contributions have different schema # versions and thus yield two. 
- num_replicas = self._num_replicas(num_additions=num_contribs, - num_dups=1) + num_replicas = 14 self._assert_hit_counts(hits, num_contribs=num_contribs, num_aggs=num_aggs, @@ -1501,7 +1474,9 @@ def test_indexing_matrix_related_files(self): self.assertNotIn('!', related_file['name']) def test_indexing_with_skipped_matrix_file(self): - # FIXME: Remove once https://github.com/HumanCellAtlas/metadata-schema/issues/579 is resolved + # zarray files no longer exist in DCP2. This test covers behavior that + # may no longer be needed to support them, but we don't want to risk + # removing it. bundle_fqid = self.bundle_fqid(uuid='587d74b4-1075-4bbf-b96a-4d1ede0481b2', version='2018-10-10T02:23:43.182000Z') self._index_canned_bundle(bundle_fqid) @@ -1519,8 +1494,6 @@ def test_indexing_with_skipped_matrix_file(self): aggregate_file_names.add(one(files)['name']) else: for file in files: - # FIXME: need for one() is odd, file_format is a group field - # https://github.com/DataBiosphere/azul/issues/612 if qualifier == 'bundles': if file['file_format'] == 'matrix': entities_with_matrix_files.add(hit['_source']['entity_id']) @@ -2169,7 +2142,7 @@ def test_mapper_parsing(self): self._index_canned_bundle(bundle_fqid) hits = self._get_all_hits() - self._assert_hit_counts(hits, 21) + self._assert_hit_counts(hits, num_contribs=21, num_replicas=28) def test_disallow_manifest_column_joiner(self): bundle_fqid = self.bundle_fqid(uuid='1b6d8348-d6e9-406a-aa6a-7ee886e52bf9', diff --git a/test/integration_test.py b/test/integration_test.py index 9623692a3f..ee66f595c2 100644 --- a/test/integration_test.py +++ b/test/integration_test.py @@ -104,6 +104,9 @@ AzulClient, AzulClientNotificationError, ) +from azul.chalice import ( + AzulChaliceApp, +) from azul.drs import ( AccessMethod, ) @@ -2042,11 +2045,6 @@ def test_response_security_headers(self): '/oauth2_redirect': {'Cache-Control': 'no-store'}, '/health/basic': {'Cache-Control': 'no-store'} } - global_headers = { - 'Strict-Transport-Security': 'max-age=31536000; includeSubDomains', - 'X-Content-Type-Options': 'nosniff', - 'X-Frame-Options': 'DENY', - } for endpoint in (config.service_endpoint, config.indexer_endpoint): for path, expected_headers in test_cases.items(): with self.subTest(endpoint=endpoint, path=path): @@ -2055,5 +2053,17 @@ def test_response_security_headers(self): else: response = requests.get(str(endpoint / path)) response.raise_for_status() - expected = expected_headers | global_headers + expected = AzulChaliceApp.security_headers | expected_headers + # FIXME: Add a CSP header with a nonce value to text/html responses + # https://github.com/DataBiosphere/azul-private/issues/6 + if path in ['/', '/oauth2_redirect']: + del expected['Content-Security-Policy'] self.assertIsSubset(expected.items(), response.headers.items()) + + def test_default_4xx_response_headers(self): + for endpoint in (config.service_endpoint, config.indexer_endpoint): + with self.subTest(endpoint=endpoint): + response = requests.get(str(endpoint / 'does-not-exist')) + self.assertEqual(403, response.status_code) + self.assertIsSubset(AzulChaliceApp.security_headers.items(), + response.headers.items()) diff --git a/test/service/data/verbatim/hca/pfb_entities.json b/test/service/data/verbatim/hca/pfb_entities.json index cdb1266adb..7b586923c1 100644 --- a/test/service/data/verbatim/hca/pfb_entities.json +++ b/test/service/data/verbatim/hca/pfb_entities.json @@ -12,6 +12,20 @@ "properties": [], "values": {} }, + { + "links": [], + "name": "donor_organism", + "ontology_reference": 
"", + "properties": [], + "values": {} + }, + { + "links": [], + "name": "library_preparation_protocol", + "ontology_reference": "", + "properties": [], + "values": {} + }, { "links": [], "name": "links", @@ -19,6 +33,13 @@ "properties": [], "values": {} }, + { + "links": [], + "name": "process", + "ontology_reference": "", + "properties": [], + "values": {} + }, { "links": [], "name": "project", @@ -33,6 +54,13 @@ "properties": [], "values": {} }, + { + "links": [], + "name": "sequencing_protocol", + "ontology_reference": "", + "properties": [], + "values": {} + }, { "links": [], "name": "specimen_from_organism", @@ -120,6 +148,140 @@ }, "relations": [] }, + { + "id": "61e629ed-0135-4492-ac8a-5c4ab3ccca8a", + "name": "sequencing_protocol", + "object": { + "describedBy": "https://schema.humancellatlas.org/type/protocol/sequencing/9.0.3/sequencing_protocol", + "instrument_manufacturer_model": { + "ontology": "EFO:0008566", + "ontology_label": "Illumina NextSeq 500", + "text": "Illumina NextSeq 500" + }, + "paired_end": true, + "protocol_core": { + "protocol_id": "sequencing_protocol_1" + }, + "provenance": { + "document_id": "61e629ed-0135-4492-ac8a-5c4ab3ccca8a", + "submission_date": "2018-11-02T10:05:05.555Z", + "update_date": "2018-11-02T10:05:10.376Z" + }, + "schema_type": "protocol", + "sequencing_approach": { + "ontology": "EFO:0008896", + "ontology_label": "RNA-Seq", + "text": "RNA-Seq" + } + }, + "relations": [] + }, + { + "id": "771ddaf6-3a4f-4314-97fe-6294ff8e25a4", + "name": "process", + "object": { + "describedBy": "https://schema.humancellatlas.org/type/process/6.0.2/process", + "process_core": { + "process_id": "SRR3562915" + }, + "provenance": { + "document_id": "771ddaf6-3a4f-4314-97fe-6294ff8e25a4", + "submission_date": "2018-11-02T10:06:37.087Z", + "update_date": "2018-11-02T10:13:43.197Z" + }, + "schema_type": "process" + }, + "relations": [] + }, + { + "id": "7b07b9d0-cc0e-4098-9f64-f4a569f7d746", + "name": "donor_organism", + "object": { + "biomaterial_core": { + "biomaterial_id": "DID_scRSq06", + "ncbi_taxon_id": [ + 9606 + ] + }, + "death": { + "cause_of_death": "stroke" + }, + "describedBy": "https://schema.humancellatlas.org/type/biomaterial/10.1.2/donor_organism", + "diseases": [ + { + "ontology": "PATO:0000461", + "ontology_label": "normal", + "text": "normal" + } + ], + "genus_species": [ + { + "ontology": "NCBITaxon:9606", + "ontology_label": "Australopithecus", + "text": "Australopithecus" + } + ], + "human_specific": { + "body_mass_index": 29.5, + "ethnicity": [ + { + "ontology": "hancestro:0005", + "ontology_label": "European", + "text": "European" + } + ] + }, + "is_living": "no", + "organism_age": "38", + "organism_age_unit": { + "ontology": "UO:0000036", + "ontology_label": "year", + "text": "year" + }, + "provenance": { + "document_id": "7b07b9d0-cc0e-4098-9f64-f4a569f7d746", + "submission_date": "2018-11-02T10:02:12.191Z", + "update_date": "2018-11-02T10:07:39.622Z" + }, + "schema_type": "biomaterial", + "sex": "female" + }, + "relations": [] + }, + { + "id": "9c32cf70-3ed7-4720-badc-5ee71e8a38af", + "name": "library_preparation_protocol", + "object": { + "describedBy": "https://schema.humancellatlas.org/type/protocol/sequencing/4.3.3/library_preparation_protocol", + "end_bias": "full length", + "input_nucleic_acid_molecule": { + "ontology": "OBI:0000869", + "text": "polyA RNA" + }, + "library_construction_approach": { + "ontology": "EFO:0008931", + "ontology_label": "Smart-seq2", + "text": "Smart-seq2" + }, + "library_construction_kit": { + 
"manufacturer": "Illumina", + "retail_name": "Nextera XT kit" + }, + "nucleic_acid_source": "single cell", + "primer": "poly-dT", + "protocol_core": { + "protocol_id": "library_preparation_protocol_1" + }, + "provenance": { + "document_id": "9c32cf70-3ed7-4720-badc-5ee71e8a38af", + "submission_date": "2018-11-02T10:05:05.547Z", + "update_date": "2018-11-02T10:05:10.360Z" + }, + "schema_type": "protocol", + "strand": "unstranded" + }, + "relations": [] + }, { "id": "a21dc760-a500-4236-bcff-da34a0e873d2", "name": "specimen_from_organism", diff --git a/test/service/data/verbatim/hca/pfb_schema.json b/test/service/data/verbatim/hca/pfb_schema.json index af50aa32f7..9a7a044fc4 100644 --- a/test/service/data/verbatim/hca/pfb_schema.json +++ b/test/service/data/verbatim/hca/pfb_schema.json @@ -222,6 +222,365 @@ "name": "cell_suspension", "type": "record" }, + { + "fields": [ + { + "name": "biomaterial_core", + "namespace": "donor_organism", + "type": { + "fields": [ + { + "name": "biomaterial_id", + "namespace": "donor_organism.biomaterial_core", + "type": "string" + }, + { + "name": "ncbi_taxon_id", + "namespace": "donor_organism.biomaterial_core", + "type": { + "items": "long", + "type": "array" + } + } + ], + "name": "donor_organism.biomaterial_core", + "type": "record" + } + }, + { + "name": "death", + "namespace": "donor_organism", + "type": { + "fields": [ + { + "name": "cause_of_death", + "namespace": "donor_organism.death", + "type": "string" + } + ], + "name": "donor_organism.death", + "type": "record" + } + }, + { + "name": "describedBy", + "namespace": "donor_organism", + "type": "string" + }, + { + "name": "diseases", + "namespace": "donor_organism", + "type": { + "items": { + "fields": [ + { + "name": "ontology", + "namespace": "donor_organism.diseases", + "type": "string" + }, + { + "name": "ontology_label", + "namespace": "donor_organism.diseases", + "type": "string" + }, + { + "name": "text", + "namespace": "donor_organism.diseases", + "type": "string" + } + ], + "name": "donor_organism.diseases", + "type": "record" + }, + "type": "array" + } + }, + { + "name": "genus_species", + "namespace": "donor_organism", + "type": { + "items": { + "fields": [ + { + "name": "ontology", + "namespace": "donor_organism.genus_species", + "type": "string" + }, + { + "name": "ontology_label", + "namespace": "donor_organism.genus_species", + "type": "string" + }, + { + "name": "text", + "namespace": "donor_organism.genus_species", + "type": "string" + } + ], + "name": "donor_organism.genus_species", + "type": "record" + }, + "type": "array" + } + }, + { + "name": "human_specific", + "namespace": "donor_organism", + "type": { + "fields": [ + { + "name": "body_mass_index", + "namespace": "donor_organism.human_specific", + "type": "double" + }, + { + "name": "ethnicity", + "namespace": "donor_organism.human_specific", + "type": { + "items": { + "fields": [ + { + "name": "ontology", + "namespace": "donor_organism.human_specific.ethnicity", + "type": "string" + }, + { + "name": "ontology_label", + "namespace": "donor_organism.human_specific.ethnicity", + "type": "string" + }, + { + "name": "text", + "namespace": "donor_organism.human_specific.ethnicity", + "type": "string" + } + ], + "name": "donor_organism.human_specific.ethnicity", + "type": "record" + }, + "type": "array" + } + } + ], + "name": "donor_organism.human_specific", + "type": "record" + } + }, + { + "name": "is_living", + "namespace": "donor_organism", + "type": "string" + }, + { + "name": "organism_age", + "namespace": "donor_organism", + 
"type": "string" + }, + { + "name": "organism_age_unit", + "namespace": "donor_organism", + "type": { + "fields": [ + { + "name": "ontology", + "namespace": "donor_organism.organism_age_unit", + "type": "string" + }, + { + "name": "ontology_label", + "namespace": "donor_organism.organism_age_unit", + "type": "string" + }, + { + "name": "text", + "namespace": "donor_organism.organism_age_unit", + "type": "string" + } + ], + "name": "donor_organism.organism_age_unit", + "type": "record" + } + }, + { + "name": "provenance", + "namespace": "donor_organism", + "type": { + "fields": [ + { + "name": "document_id", + "namespace": "donor_organism.provenance", + "type": "string" + }, + { + "name": "submission_date", + "namespace": "donor_organism.provenance", + "type": "string" + }, + { + "name": "update_date", + "namespace": "donor_organism.provenance", + "type": "string" + } + ], + "name": "donor_organism.provenance", + "type": "record" + } + }, + { + "name": "schema_type", + "namespace": "donor_organism", + "type": "string" + }, + { + "name": "sex", + "namespace": "donor_organism", + "type": "string" + } + ], + "name": "donor_organism", + "type": "record" + }, + { + "fields": [ + { + "name": "describedBy", + "namespace": "library_preparation_protocol", + "type": "string" + }, + { + "name": "end_bias", + "namespace": "library_preparation_protocol", + "type": "string" + }, + { + "name": "input_nucleic_acid_molecule", + "namespace": "library_preparation_protocol", + "type": { + "fields": [ + { + "name": "ontology", + "namespace": "library_preparation_protocol.input_nucleic_acid_molecule", + "type": "string" + }, + { + "name": "text", + "namespace": "library_preparation_protocol.input_nucleic_acid_molecule", + "type": "string" + } + ], + "name": "library_preparation_protocol.input_nucleic_acid_molecule", + "type": "record" + } + }, + { + "name": "library_construction_approach", + "namespace": "library_preparation_protocol", + "type": { + "fields": [ + { + "name": "ontology", + "namespace": "library_preparation_protocol.library_construction_approach", + "type": "string" + }, + { + "name": "ontology_label", + "namespace": "library_preparation_protocol.library_construction_approach", + "type": "string" + }, + { + "name": "text", + "namespace": "library_preparation_protocol.library_construction_approach", + "type": "string" + } + ], + "name": "library_preparation_protocol.library_construction_approach", + "type": "record" + } + }, + { + "name": "library_construction_kit", + "namespace": "library_preparation_protocol", + "type": { + "fields": [ + { + "name": "manufacturer", + "namespace": "library_preparation_protocol.library_construction_kit", + "type": "string" + }, + { + "name": "retail_name", + "namespace": "library_preparation_protocol.library_construction_kit", + "type": "string" + } + ], + "name": "library_preparation_protocol.library_construction_kit", + "type": "record" + } + }, + { + "name": "nucleic_acid_source", + "namespace": "library_preparation_protocol", + "type": "string" + }, + { + "name": "primer", + "namespace": "library_preparation_protocol", + "type": "string" + }, + { + "name": "protocol_core", + "namespace": "library_preparation_protocol", + "type": { + "fields": [ + { + "name": "protocol_id", + "namespace": "library_preparation_protocol.protocol_core", + "type": "string" + } + ], + "name": "library_preparation_protocol.protocol_core", + "type": "record" + } + }, + { + "name": "provenance", + "namespace": "library_preparation_protocol", + "type": { + "fields": [ + { + "name": 
"document_id", + "namespace": "library_preparation_protocol.provenance", + "type": "string" + }, + { + "name": "submission_date", + "namespace": "library_preparation_protocol.provenance", + "type": "string" + }, + { + "name": "update_date", + "namespace": "library_preparation_protocol.provenance", + "type": "string" + } + ], + "name": "library_preparation_protocol.provenance", + "type": "record" + } + }, + { + "name": "schema_type", + "namespace": "library_preparation_protocol", + "type": "string" + }, + { + "name": "strand", + "namespace": "library_preparation_protocol", + "type": "string" + } + ], + "name": "library_preparation_protocol", + "type": "record" + }, { "fields": [ { @@ -310,6 +669,62 @@ "name": "links", "type": "record" }, + { + "fields": [ + { + "name": "describedBy", + "namespace": "process", + "type": "string" + }, + { + "name": "process_core", + "namespace": "process", + "type": { + "fields": [ + { + "name": "process_id", + "namespace": "process.process_core", + "type": "string" + } + ], + "name": "process.process_core", + "type": "record" + } + }, + { + "name": "provenance", + "namespace": "process", + "type": { + "fields": [ + { + "name": "document_id", + "namespace": "process.provenance", + "type": "string" + }, + { + "name": "submission_date", + "namespace": "process.provenance", + "type": "string" + }, + { + "name": "update_date", + "namespace": "process.provenance", + "type": "string" + } + ], + "name": "process.provenance", + "type": "record" + } + }, + { + "name": "schema_type", + "namespace": "process", + "type": "string" + } + ], + "name": "process", + "type": "record" + }, { "fields": [ { @@ -592,6 +1007,117 @@ "name": "sequence_file", "type": "record" }, + { + "fields": [ + { + "name": "describedBy", + "namespace": "sequencing_protocol", + "type": "string" + }, + { + "name": "instrument_manufacturer_model", + "namespace": "sequencing_protocol", + "type": { + "fields": [ + { + "name": "ontology", + "namespace": "sequencing_protocol.instrument_manufacturer_model", + "type": "string" + }, + { + "name": "ontology_label", + "namespace": "sequencing_protocol.instrument_manufacturer_model", + "type": "string" + }, + { + "name": "text", + "namespace": "sequencing_protocol.instrument_manufacturer_model", + "type": "string" + } + ], + "name": "sequencing_protocol.instrument_manufacturer_model", + "type": "record" + } + }, + { + "name": "paired_end", + "namespace": "sequencing_protocol", + "type": "boolean" + }, + { + "name": "protocol_core", + "namespace": "sequencing_protocol", + "type": { + "fields": [ + { + "name": "protocol_id", + "namespace": "sequencing_protocol.protocol_core", + "type": "string" + } + ], + "name": "sequencing_protocol.protocol_core", + "type": "record" + } + }, + { + "name": "provenance", + "namespace": "sequencing_protocol", + "type": { + "fields": [ + { + "name": "document_id", + "namespace": "sequencing_protocol.provenance", + "type": "string" + }, + { + "name": "submission_date", + "namespace": "sequencing_protocol.provenance", + "type": "string" + }, + { + "name": "update_date", + "namespace": "sequencing_protocol.provenance", + "type": "string" + } + ], + "name": "sequencing_protocol.provenance", + "type": "record" + } + }, + { + "name": "schema_type", + "namespace": "sequencing_protocol", + "type": "string" + }, + { + "name": "sequencing_approach", + "namespace": "sequencing_protocol", + "type": { + "fields": [ + { + "name": "ontology", + "namespace": "sequencing_protocol.sequencing_approach", + "type": "string" + }, + { + "name": 
"ontology_label", + "namespace": "sequencing_protocol.sequencing_approach", + "type": "string" + }, + { + "name": "text", + "namespace": "sequencing_protocol.sequencing_approach", + "type": "string" + } + ], + "name": "sequencing_protocol.sequencing_approach", + "type": "record" + } + } + ], + "name": "sequencing_protocol", + "type": "record" + }, { "fields": [ { diff --git a/test/service/test_app_logging.py b/test/service/test_app_logging.py index 2d4251f561..9585bde07f 100644 --- a/test/service/test_app_logging.py +++ b/test/service/test_app_logging.py @@ -15,6 +15,9 @@ from azul.logging import ( configure_test_logging, ) +from azul.strings import ( + single_quote as sq, +) from indexer import ( DCP1CannedBundleTestCase, ) @@ -66,9 +69,12 @@ def test_request_logs(self): 'Returning 200 response with headers {"Access-Control-Allow-Origin": ' '"*", "Access-Control-Allow-Headers": ' '"Authorization,Content-Type,X-Amz-Date,X-Amz-Security-Token,X-Api-Key", ' - '"Strict-Transport-Security": "max-age=31536000; includeSubDomains", ' + f'"Content-Security-Policy": "default-src {sq("self")}", ' + '"Referrer-Policy": "strict-origin-when-cross-origin", ' + '"Strict-Transport-Security": "max-age=63072000; includeSubDomains; preload", ' '"X-Content-Type-Options": "nosniff", ' '"X-Frame-Options": "DENY", ' + '"X-XSS-Protection": "1; mode=block", ' '"Cache-Control": "no-store"}. ' 'See next line for the first 1024 characters of the body.\n' '{"up": true}' diff --git a/test/service/test_manifest.py b/test/service/test_manifest.py index 4513bbbe75..92d0b5546f 100644 --- a/test/service/test_manifest.py +++ b/test/service/test_manifest.py @@ -106,6 +106,9 @@ from azul.service.storage_service import ( StorageService, ) +from azul.strings import ( + single_quote as sq, +) from azul.types import ( JSON, JSONs, @@ -1346,7 +1349,11 @@ def test_verbatim_jsonl_manifest(self): 'project/e8642221-4c2c-4fd7-b926-a68bce363c88', 'sequence_file/70d1af4a-82c8-478a-8960-e9028b3616ca', 'sequence_file/0c5ac7c0-817e-40d4-b1b1-34c3d5cfecdb', - 'specimen_from_organism/a21dc760-a500-4236-bcff-da34a0e873d2' + 'specimen_from_organism/a21dc760-a500-4236-bcff-da34a0e873d2', + 'donor_organism/7b07b9d0-cc0e-4098-9f64-f4a569f7d746', + 'library_preparation_protocol/9c32cf70-3ed7-4720-badc-5ee71e8a38af', + 'sequencing_protocol/61e629ed-0135-4492-ac8a-5c4ab3ccca8a', + 'process/771ddaf6-3a4f-4314-97fe-6294ff8e25a4' ]: expected.append({ 'type': EntityReference.parse(ref).entity_type, @@ -1611,7 +1618,7 @@ def test(*, format: ManifestFormat, fetch: bool, url: Optional[furl] = None): expected_url_for_bash = expected_url else: expected_url = object_url - expected_url_for_bash = f"'{expected_url}'" + expected_url_for_bash = sq(str(expected_url)) if format is ManifestFormat.curl: manifest_options = '--location --fail' file_options = '--fail-early --continue-at - --retry 15 --retry-delay 10' diff --git a/test/test_app_logging.py b/test/test_app_logging.py index 6efda81bf5..2372707577 100644 --- a/test/test_app_logging.py +++ b/test/test_app_logging.py @@ -31,6 +31,9 @@ azul_log_level, configure_test_logging, ) +from azul.strings import ( + single_quote as sq, +) from azul_test_case import ( AlwaysTearDownTestCase, AzulUnitTestCase, @@ -105,9 +108,12 @@ def fail(): self.assertEqual( azul_log.output[2], 'DEBUG:azul.chalice:Returning 500 response with headers {"Content-Type": "text/plain", ' - '"Strict-Transport-Security": "max-age=31536000; includeSubDomains", ' + f'"Content-Security-Policy": "default-src {sq("self")}", ' + '"Referrer-Policy": 
"strict-origin-when-cross-origin", ' + '"Strict-Transport-Security": "max-age=63072000; includeSubDomains; preload", ' '"X-Content-Type-Options": "nosniff", ' '"X-Frame-Options": "DENY", ' + '"X-XSS-Protection": "1; mode=block", ' '"Cache-Control": "no-store"}. ' 'See next line for the first 1024 characters of the body.\n' + response) else: diff --git a/test/test_log_forwarding.py b/test/test_log_forwarding.py index 519fb06144..9994cee56d 100644 --- a/test/test_log_forwarding.py +++ b/test/test_log_forwarding.py @@ -23,6 +23,9 @@ from azul.indexer.log_forwarding_controller import ( LogForwardingController, ) +from azul.strings import ( + double_quote as dq, +) from azul.types import ( JSONs, ) @@ -77,23 +80,23 @@ def test_alb(self): '0.000', '0.002', '0.000', '204', '204', '963', '229', - '"POST ' - f'https://gitlab.dev.singlecell.gi.ucsc.edu:443/api/v4/jobs/request?chars={escaped} ' - 'HTTP/1.1"', - f'"gitlab-runner 15.6.1 (15-6-stable; go1.18.8; linux/amd64; {escaped})"', + dq('POST', + f'https://gitlab.dev.singlecell.gi.ucsc.edu:443/api/v4/jobs/request?chars={escaped}', + 'HTTP/1.1'), + dq(f'gitlab-runner 15.6.1 (15-6-stable; go1.18.8; linux/amd64; {escaped})'), 'ECDHE-RSA-AES128-GCM-SHA256', 'TLSv1.2', 'arn:aws:elasticloadbalancing:us-east-1:122796619775:targetgroup/azul-gitlab-http/136c2d6db59941f6', - '"Root=1-63b0cbd4-7d218b82786295005dbf8b6d"', - '"gitlab.dev.singlecell.gi.ucsc.edu"', - '"arn:aws:acm:us-east-1:122796619775:certificate/81241b8e-c875-4a22-a30e-58003ee139ae"', + dq('Root=1-63b0cbd4-7d218b82786295005dbf8b6d'), + dq('gitlab.dev.singlecell.gi.ucsc.edu'), + dq('arn:aws:acm:us-east-1:122796619775:certificate/81241b8e-c875-4a22-a30e-58003ee139ae'), '0', '2022-12-31T23:55:00.386000Z', - '"forward"', - '"-"', '"-"', - '"172.71.0.215:80"', - '"204"', - '"-"', '"-"', + dq('forward'), + dq('-'), dq('-'), + dq('172.71.0.215:80'), + dq('204'), + dq('-'), dq('-'), ])] expected_output = [{ '_source_bucket': self.log_bucket, @@ -142,16 +145,16 @@ def test_s3(self): 'K829N8AH88F1RX7K', 'REST.PUT.OBJECT', 'health/service', - '"PUT /edu-ucsc-gi-platform-anvil-dev-storage-anvilbox.us-east-1/health/service HTTP/1.1"', + dq('PUT /edu-ucsc-gi-platform-anvil-dev-storage-anvilbox.us-east-1/health/service HTTP/1.1'), '200', '-', '-', '523', '85', '52', - '"-"', - '"Boto3/1.24.94 Python/3.11.5 Linux/4.14.255-301-238.520.amzn2.x86_64 ' - + 'exec-env/AWS_Lambda_python3.11 aws-chalice/1.30.0 Botocore/1.27.94"', + dq('-'), + dq('Boto3/1.24.94 Python/3.11.5 Linux/4.14.255-301-238.520.amzn2.x86_64', + 'exec-env/AWS_Lambda_python3.11 aws-chalice/1.30.0 Botocore/1.27.94'), '-', 'jcmyLMRqqJ7dT4ovtY21rtgwmuTC3qs24vgAtLAkcad9sRV92zC90gf2zGvCkxxsLSaKm48AMjo=', 'SigV4',