
Promotion 2024-10-15 prod (#6630) #6634

Merged 35 commits on Oct 18, 2024

Commits
68baced
Refactor string manipulation functions into strings.py
dsotirho-ucsc Sep 16, 2024
fe05449
Adjust Strict-Transport-Security header value
dsotirho-ucsc Sep 10, 2024
d73f4e0
Refactor security headers in AzulChaliceApp
dsotirho-ucsc Oct 1, 2024
eeadd03
Add additional security headers
dsotirho-ucsc Sep 12, 2024
642c78e
Add FIXME (DataBiosphere/azul-private#6)
dsotirho-ucsc Sep 25, 2024
84de75d
Fix: 4xx and 5xx responses from API Gateway cause Invicti findings (D…
dsotirho-ucsc Jul 4, 2024
bbff541
Fix: 4xx and 5xx responses from API Gateway cause Invicti findings (D…
achave11-ucsc Oct 3, 2024
60a77af
Merge branch 'develop' into backports/42fcfe32
achave11-ucsc Oct 5, 2024
6384292
Remove zarray-related FIXMEs (#6596)
nadove-ucsc Sep 27, 2024
91f8e38
Remove dangling FIXME (#612)
nadove-ucsc Sep 27, 2024
7cd33f8
Add FIXME (#6577)
nadove-ucsc Sep 21, 2024
d95e166
Fix split tunnel section in README
nadove-ucsc Sep 20, 2024
0898a85
Fix code inspection warning
nadove-ucsc Sep 21, 2024
7a1a0c9
Narrow type annotations
nadove-ucsc Sep 20, 2024
d80d5c1
Refactor string representation of `api.Entity`
nadove-ucsc Sep 24, 2024
aaf1725
Use HCA schema name in replica coordinates
nadove-ucsc Sep 24, 2024
1dda557
Transformers emit unions instead of tuples
nadove-ucsc Sep 20, 2024
87532c0
Parameterize indexer test assertions with replica counts
nadove-ucsc Sep 25, 2024
da77107
[r] Fix: No replicas for donors in HCA (#6582)
nadove-ucsc Sep 20, 2024
8cebafe
[r] Fix: No replicas for donors in HCA (#6582, PR #6584)
achave11-ucsc Oct 6, 2024
d4b515f
Merge branch 'develop' into backports/42fcfe32
achave11-ucsc Oct 7, 2024
3104a09
[H] Backport: 42fcfe32 (#6610, PR #6604, PR #6617)
nadove-ucsc Oct 8, 2024
f60f11b
Use `typing.Self`
nadove-ucsc Oct 4, 2024
644aa46
Refactor `collections.py`
nadove-ucsc Oct 4, 2024
dec984e
Add and use none-safe utility functions
nadove-ucsc Oct 4, 2024
64c8d97
Rename typevar
nadove-ucsc Oct 4, 2024
ce8210e
Represent Link generics using subclasses
nadove-ucsc Oct 4, 2024
bfd1bb0
Refactor link grouping (#6612)
nadove-ucsc Oct 4, 2024
9f0cfe3
Refactor link ID conversion (#6612)
nadove-ucsc Oct 4, 2024
f8d1304
Refactor handing of AnVIL links (#6612, PR #6613)
achave11-ucsc Oct 8, 2024
bdccd15
[r] Index lm8 in LungMAP (#6606)
achave11-ucsc Oct 4, 2024
ca35b75
[r] Index lm8 in LungMAP (#6606, PR #6611)
nadove-ucsc Oct 9, 2024
43c3c31
[r] Index dcp43 in prod (#6622)
achave11-ucsc Oct 9, 2024
d7a9077
[r] Index dcp43 in prod (#6622, PR #6625)
nadove-ucsc Oct 9, 2024
54e12dd
Revert "Temporarily make Noa code owner"
hannes-ucsc Oct 14, 2024
2 changes: 1 addition & 1 deletion .github/CODEOWNERS
@@ -1 +1 @@
* @nadove-ucsc
* @hannes-ucsc
5 changes: 1 addition & 4 deletions .github/pull_request_template.md.template.py
@@ -33,6 +33,7 @@
OrderedSet,
)
from azul.strings import (
back_quote as bq,
join_grammatically,
)
from azul.template import (
@@ -225,10 +226,6 @@ def shared_deploy_target(self, target_branch: str) -> str:
return 'apply' + iif(self.shared_deploy_is_two_phase(target_branch), '_keep_unused')


def bq(s):
return '`' + s + '`'


def main():
path = Path(sys.argv[1])
for t in T:
16 changes: 8 additions & 8 deletions README.md
@@ -1819,14 +1819,14 @@ the private key and can always be regenerated again later using `make config`.

### 9.1.2 Ensuring split tunnel on client

It is important that you configure the client to only route VPC traffic
through the VPN. The VPN server will not forward any other traffic, in what's
commonly referred to as a *split tunnel*. The key indicator of a split tunnel
is that it doesn't set up a default route on the client system. There will
only be a route to the private 172.… subnet of the GitLab VPC but the default
route remains in place. If you configure the VPN connection to set up a
default route, your Internet access will be severed as soon as you establish
the VPN connection.
Except on stable deployments, you should configure the client to only route VPC
traffic through the VPN. The VPN server will not forward any other traffic, in
what's commonly referred to as a *split tunnel*. The key indicator of a split
tunnel is that it doesn't set up a default route on the client system. There
will only be a route to the private 172.… subnet of the GitLab VPC but the
default route remains in place.

On stable deployments, split tunnels are prohibited.

The `make config` step prints instructions on how to configure a split tunnel
on Ubuntu.
30 changes: 29 additions & 1 deletion deployments/prod/environment.py
@@ -1126,6 +1126,21 @@ def mkdict(previous_catalog: dict[str, str],
# @formatter:on
]))

dcp43_sources = mkdict(dcp42_sources, 476, mkdelta([
# @formatter:off
mksrc('bigquery', 'datarepo-ac7cee91', 'hca_prod_087efc3c26014de6bbe90114593050d1__20241004_dcp2_20241007_dcp43'),
mksrc('bigquery', 'datarepo-e9df1043', 'hca_prod_248c5dc36b754fb4ad8acc771968483f__20240806_dcp2_20241007_dcp43'),
mksrc('bigquery', 'datarepo-65c49269', 'hca_prod_2ef3655a973d4d699b4121fa4041eed7__20220111_dcp2_20241004_dcp43'),
mksrc('bigquery', 'datarepo-456691e5', 'hca_prod_3627473eb6d645c987b5b9f12ce57a10__20241004_dcp2_20241007_dcp43'),
mksrc('bigquery', 'datarepo-c577eed5', 'hca_prod_7f351a4cd24c4fcd9040f79071b097d0__20220906_dcp2_20241004_dcp43'),
mksrc('bigquery', 'datarepo-1dbd3c50', 'hca_prod_ae9f439bbd474d6ebd7232dc70b35d97__20241004_dcp2_20241004_dcp43', ma), # noqa E501
mksrc('bigquery', 'datarepo-21d1f89b', 'hca_prod_b39381584e8d4fdb9e139e94270dde16__20241004_dcp2_20241004_dcp43'),
mksrc('bigquery', 'datarepo-550c8f98', 'hca_prod_c3dd819dabab4957b20988f1e0900368__20241004_dcp2_20241004_dcp43'),
mksrc('bigquery', 'datarepo-06a00830', 'hca_prod_c5ca43aa3b2b42168eb3f57adcbc99a1__20220118_dcp2_20241004_dcp43'),
mksrc('bigquery', 'datarepo-55151ed4', 'hca_prod_cdabcf0b76024abf9afb3b410e545703__20230201_dcp2_20241008_dcp43')
# @formatter:on
]))

pilot1_sources = mkdict({}, 4, mkdelta([
# @formatter:off
mksrc('bigquery', 'datarepo-11e4dc06', 'hca_prod_59b3bfd9cf454d538c8ee240273cba71__20240410_dcp2_20240410_dcpPilot'), # noqa E501
@@ -1171,6 +1186,17 @@ def mkdict(previous_catalog: dict[str, str],
mksrc('bigquery', 'datarepo-43814140', 'lungmap_prod_fdadee7e209745d5bf81cc280bd8348e__20240206_20240626_lm7')
]))

lm8_sources = mkdict(lm7_sources, 12, mkdelta([
mksrc('bigquery', 'datarepo-2b15227b', 'lungmap_prod_1977dc4784144263a8706b0f207d8ab3__20240206_20241002_lm8'),
mksrc('bigquery', 'datarepo-c9158593', 'lungmap_prod_20037472ea1d4ddb9cd356a11a6f0f76__20220307_20241002_lm8'),
mksrc('bigquery', 'datarepo-35a6d7ca', 'lungmap_prod_3a02d15f9c6a4ef7852b4ddec733b70b__20241001_20241002_lm8'),
mksrc('bigquery', 'datarepo-131a1234', 'lungmap_prod_4ae8c5c91520437198276935661f6c84__20231004_20241002_lm8'),
mksrc('bigquery', 'datarepo-3377446f', 'lungmap_prod_6135382f487d4adb9cf84d6634125b68__20230207_20241002_lm8'),
mksrc('bigquery', 'datarepo-3c4905d2', 'lungmap_prod_834e0d1671b64425a8ab022b5000961c__20241001_20241002_lm8'),
mksrc('bigquery', 'datarepo-d7447983', 'lungmap_prod_f899709cae2c4bb988f0131142e6c7ec__20220310_20241002_lm8'),
mksrc('bigquery', 'datarepo-c11ef363', 'lungmap_prod_fdadee7e209745d5bf81cc280bd8348e__20240206_20241002_lm8'),
]))


def env() -> Mapping[str, Optional[str]]:
"""
@@ -1211,8 +1237,10 @@ def env() -> Mapping[str, Optional[str]]:
sources=mklist(sources))
for atlas, catalog, sources in [
('hca', 'dcp42', dcp42_sources),
('hca', 'dcp43', dcp43_sources),
('hca', 'pilot1', pilot1_sources),
('lungmap', 'lm7', lm7_sources)
('lungmap', 'lm7', lm7_sources),
('lungmap', 'lm8', lm8_sources)
] for suffix, internal in [
('', False),
('-it', True)
5 changes: 4 additions & 1 deletion scripts/convert_environment.py
@@ -23,6 +23,9 @@
from azul.files import (
write_file_atomically,
)
from azul.strings import (
single_quote as sq,
)


class Variable(NamedTuple):
@@ -144,7 +147,7 @@ def sub(m: re.Match):
return '{{' + m[1] + '}}'

value = re.sub(r'\$?{([^}]+)}|\$([_A-Za-z][_A-Za-z0-9]*)', sub, value)
return f"'{value}'"
return sq(value)


if __name__ == '__main__':
9 changes: 4 additions & 5 deletions src/azul/bigquery.py
@@ -10,8 +10,8 @@
Union,
)

from azul import (
require,
from azul.strings import (
back_quote as bq,
)

BigQueryValue = Union[int, float, bool, str, bytes, datetime, None]
@@ -42,10 +42,9 @@ def backtick(table_name: str) -> str:
>>> backtick('foo-2.bar`s.my_table')
Traceback (most recent call last):
...
azul.RequirementError: foo-2.bar`s.my_table
azul.RequirementError: ('`', 'must not occur in', 'foo-2.bar`s.my_table')
"""
if table_name_re.fullmatch(table_name):
return table_name
else:
require('`' not in table_name, table_name)
return f'`{table_name}`'
return bq(table_name)
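The refactored `backtick` delegates quoting to a shared `back_quote` helper from `azul.strings`, whose error message in the doctest above suggests a `require`-based guard. A minimal sketch under stated assumptions — `require`, `back_quote`, and the regex here are simplified stand-ins, not the actual Azul implementations:

```python
import re

def require(condition, *args):
    # Simplified stand-in for azul.require, which raises a
    # RequirementError carrying its arguments.
    if not condition:
        raise ValueError(args)

def back_quote(s):
    # Assumed shape of azul.strings.back_quote, matching the
    # doctest in the diff above: refuse input containing a backtick.
    require('`' not in s, '`', 'must not occur in', s)
    return f'`{s}`'

# Hypothetical pattern; the real table_name_re is defined elsewhere
# in src/azul/bigquery.py and is not shown in this diff.
table_name_re = re.compile(r'[\w-]+(\.\w+)+')

def backtick(table_name):
    # Quote the name only if it is not already a safe identifier.
    if table_name_re.fullmatch(table_name):
        return table_name
    else:
        return back_quote(table_name)
```

The design point of the change is that the backtick-collision check now lives in one helper instead of being inlined at each call site.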
30 changes: 26 additions & 4 deletions src/azul/chalice.py
@@ -61,6 +61,10 @@
copy_json,
json_head,
)
from azul.strings import (
join_words as jw,
single_quote as sq,
)
from azul.types import (
JSON,
LambdaContext,
@@ -189,15 +193,33 @@ def _api_gateway_context_middleware(self, event, get_response):
finally:
config.lambda_is_handling_api_gateway_request = False

hsts_max_age = 60 * 60 * 24 * 365 * 2

# Headers added to every response from the app, as well as canned 4XX and
# 5XX responses from API Gateway. Use of these headers addresses known
# security vulnerabilities.
#
security_headers = {
'Content-Security-Policy': jw('default-src', sq('self')),
'Referrer-Policy': 'strict-origin-when-cross-origin',
'Strict-Transport-Security': jw(f'max-age={hsts_max_age};',
'includeSubDomains;',
'preload'),
'X-Content-Type-Options': 'nosniff',
'X-Frame-Options': 'DENY',
'X-XSS-Protection': '1; mode=block'
}

def _security_headers_middleware(self, event, get_response):
"""
Add headers to the response
"""
response = get_response(event)
seconds = 60 * 60 * 24 * 365
response.headers['Strict-Transport-Security'] = f'max-age={seconds}; includeSubDomains'
response.headers['X-Content-Type-Options'] = 'nosniff'
response.headers['X-Frame-Options'] = 'DENY'
response.headers.update(self.security_headers)
# FIXME: Add a CSP header with a nonce value to text/html responses
# https://github.com/DataBiosphere/azul-private/issues/6
if response.headers.get('Content-Type') == 'text/html':
del response.headers['Content-Security-Policy']
view_function = self.routes[event.path][event.method].view_function
cache_control = getattr(view_function, 'cache_control')
response.headers['Cache-Control'] = cache_control
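The header values in `security_headers` are composed from small string helpers imported as `jw` and `sq`. Assuming `join_words` joins its arguments with single spaces and `single_quote` wraps a string in ASCII single quotes — plausible but hypothetical implementations of the `azul.strings` functions — the dict above would yield:

```python
def join_words(*words):
    # Hypothetical stand-in for azul.strings.join_words.
    return ' '.join(words)

def single_quote(s):
    # Hypothetical stand-in for azul.strings.single_quote.
    return f"'{s}'"

jw, sq = join_words, single_quote

hsts_max_age = 60 * 60 * 24 * 365 * 2  # two years, in seconds

print(jw('default-src', sq('self')))
# default-src 'self'
print(jw(f'max-age={hsts_max_age};', 'includeSubDomains;', 'preload'))
# max-age=63072000; includeSubDomains; preload
```

Note that the HSTS max-age doubles from the one year hard-coded in the removed middleware lines to two years, satisfying the preload-list requirement of at least one year.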
33 changes: 31 additions & 2 deletions src/azul/collections.py
@@ -131,6 +131,17 @@ def explode_dict(d: Mapping[K, Union[V, list[V], set[V], tuple[V]]]
yield dict(zip(d.keys(), t))


def none_safe_apply(f: Callable[[K], V], o: K | None) -> V | None:
"""
>>> none_safe_apply(str, 123)
'123'

>>> none_safe_apply(str, None) is None
True
"""
return None if o is None else f(o)


def none_safe_key(none_last: bool = False) -> Callable[[Any], Any]:
"""
Returns a sort key that handles None values.
@@ -270,6 +281,10 @@ def adict(seq: Union[Mapping[K, V], Iterable[tuple[K, V]]] = None,
return kwargs if seq is None else dict(seq, **kwargs)


def _athing(cls: type, *args):
return cls(arg for arg in args if arg is not None)


def atuple(*args: V) -> tuple[V, ...]:
"""
>>> atuple()
@@ -281,7 +296,7 @@ def atuple(*args: V) -> tuple[V, ...]:
>>> atuple(0, None)
(0,)
"""
return tuple(arg for arg in args if arg is not None)
return _athing(tuple, *args)


def alist(*args: V) -> list[V]:
@@ -295,7 +310,21 @@ def alist(*args: V) -> list[V]:
>>> alist(0, None)
[0]
"""
return list(arg for arg in args if arg is not None)
return _athing(list, *args)


def aset(*args: V) -> set[V]:
"""
>>> aset()
set()

>>> aset(None)
set()

>>> aset(0, None)
{0}
"""
return _athing(set, *args)


class NestedDict(defaultdict):
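The new private `_athing` helper factors out the None-filtering that `atuple` and `alist` previously duplicated, and the new `aset` reuses it. Together with `none_safe_apply`, the behavior shown in the doctests can be sketched self-contained (type annotations omitted for brevity):

```python
def none_safe_apply(f, o):
    # Apply f unless the argument is None.
    return None if o is None else f(o)

def _athing(cls, *args):
    # Build a collection of the given type, dropping None arguments.
    return cls(arg for arg in args if arg is not None)

def atuple(*args):
    return _athing(tuple, *args)

def alist(*args):
    return _athing(list, *args)

def aset(*args):
    return _athing(set, *args)

print(atuple(0, None))           # (0,)
print(aset(0, None, 1))          # {0, 1}
print(none_safe_apply(str, 123))  # '123'
```

Falsy values such as `0` survive the filter because the test is `arg is not None`, not truthiness — the same distinction the original `atuple`/`alist` doctests exercise.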
27 changes: 14 additions & 13 deletions src/azul/indexer/index_service.py
@@ -38,7 +38,6 @@
from more_itertools import (
first,
one,
unzip,
)

from azul import (
@@ -303,19 +302,21 @@ def transform(self,
log.info('Transforming %i entities in partition %s of bundle %s, version %s.',
num_entities, partition, bundle.uuid, bundle.version)
contributions = []
replicas = []
replicas_by_coords = {}
for transformer in transformers:
# The cast is necessary because unzip()'s type stub doesn't
# support heterogeneous tuples.
transforms = cast(
tuple[Iterable[Optional[Contribution]], Iterable[Optional[Replica]]],
unzip(transformer.transform(partition))
)
if transforms:
contributions_part, replicas_part = transforms
contributions.extend(filter(None, contributions_part))
replicas.extend(filter(None, replicas_part))
return contributions, replicas
for document in transformer.transform(partition):
if isinstance(document, Contribution):
contributions.append(document)
elif isinstance(document, Replica):
try:
dup = replicas_by_coords[document.coordinates]
except KeyError:
replicas_by_coords[document.coordinates] = document
else:
dup.hub_ids.extend(document.hub_ids)
else:
assert False, document
return contributions, list(replicas_by_coords.values())

def create_indices(self, catalog: CatalogName):
es_client = ESClientFactory.get()
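The rewritten `transform` drops the `unzip`-based pairing in favor of a single pass that dispatches on document type and consolidates duplicate replicas by their coordinates, merging hub IDs. The consolidation step can be sketched in isolation — `Replica` here is a hypothetical minimal stand-in for Azul's replica document, not the real class:

```python
from dataclasses import dataclass, field

@dataclass
class Replica:
    # Hypothetical, minimal stand-in for azul's Replica document.
    coordinates: str
    hub_ids: list = field(default_factory=list)

def consolidate(replicas):
    # Keep one replica per coordinates; merge hub_ids of duplicates,
    # mirroring the try/except KeyError pattern in the diff above.
    by_coords = {}
    for replica in replicas:
        try:
            dup = by_coords[replica.coordinates]
        except KeyError:
            by_coords[replica.coordinates] = replica
        else:
            dup.hub_ids.extend(replica.hub_ids)
    return list(by_coords.values())

merged = consolidate([
    Replica('c1', ['file1']),
    Replica('c1', ['file2']),
    Replica('c2', ['file3']),
])
print([r.hub_ids for r in merged])
# [['file1', 'file2'], ['file3']]
```

Because Python dicts preserve insertion order, the first occurrence of each coordinate wins and later duplicates only contribute their hub IDs — which is why `_replica` in `transform.py` can now emit a single `file_hub` per replica and rely on the indexer to accumulate the rest.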
36 changes: 22 additions & 14 deletions src/azul/indexer/transform.py
@@ -11,6 +11,9 @@

import attr

from azul.collections import (
alist,
)
from azul.indexer import (
Bundle,
BundleFQID,
@@ -34,11 +37,8 @@
)
from azul.types import (
JSON,
MutableJSON,
)

Transform = tuple[Optional[Contribution], Optional[Replica]]


@attr.s(frozen=True, kw_only=True, auto_attribs=True)
class Transformer(metaclass=ABCMeta):
@@ -55,12 +55,14 @@ def entity_type(cls) -> EntityType:
raise NotImplementedError

@abstractmethod
def replica_type(self, entity: EntityReference) -> str:
def _replicate(self, entity: EntityReference) -> tuple[str, JSON]:
"""
The name of the type of replica emitted by this transformer for a given
entity.
A tuple consisting of:

1. The name of the type of replica emitted by this transformer for a
given entity. See :py:attr:`Replica.replica_type`.

See :py:attr:`Replica.replica_type`
2. The contents of the replica for that entity.
"""
raise NotImplementedError

@@ -88,7 +90,9 @@ def estimate(self, partition: BundlePartition) -> int:
"""

@abstractmethod
def transform(self, partition: BundlePartition) -> Iterable[Transform]:
def transform(self,
partition: BundlePartition
) -> Iterable[Contribution | Replica]:
"""
Return the contributions by the current bundle to the entities it
contains metadata about. More than one bundle can contribute to a
@@ -114,9 +118,10 @@ def aggregator(cls, entity_type: EntityType) -> Optional[EntityAggregator]:
raise NotImplementedError

def _contribution(self,
contents: MutableJSON,
entity: EntityReference
contents: JSON,
entity_id: EntityID
) -> Contribution:
entity = EntityReference(entity_type=self.entity_type(), entity_id=entity_id)
coordinates = ContributionCoordinates(entity=entity,
bundle=self.bundle.fqid.upcast(),
deleted=self.deleted)
@@ -126,17 +131,20 @@ def _replica(self,
contents=contents)

def _replica(self,
contents: MutableJSON,
entity: EntityReference,
hub_ids: list[EntityID]
*,
file_hub: EntityID | None,
) -> Replica:
replica_type, contents = self._replicate(entity)
coordinates = ReplicaCoordinates(content_hash=json_hash(contents).hexdigest(),
entity=entity)
return Replica(coordinates=coordinates,
version=None,
replica_type=self.replica_type(entity),
replica_type=replica_type,
contents=contents,
hub_ids=hub_ids)
# The other hubs will be added when the indexer
# consolidates duplicate replicas.
hub_ids=alist(file_hub))

@classmethod
@abstractmethod
2 changes: 0 additions & 2 deletions src/azul/plugins/metadata/anvil/__init__.py
@@ -36,7 +36,6 @@
BiosampleTransformer,
BundleTransformer,
DatasetTransformer,
DiagnosisTransformer,
DonorTransformer,
FileTransformer,
)
@@ -96,7 +95,6 @@ def transformer_types(self) -> Iterable[type[BaseTransformer]]:
BiosampleTransformer,
BundleTransformer,
DatasetTransformer,
DiagnosisTransformer,
DonorTransformer,
FileTransformer,
)