Skip to content

Commit

Permalink
[r] Promotion 2024-10-15 anvilprod (#6630, PR #6633)
Browse files Browse the repository at this point in the history
  • Loading branch information
dsotirho-ucsc committed Oct 18, 2024
2 parents 10fc36d + 54e12dd commit 345fb5a
Show file tree
Hide file tree
Showing 34 changed files with 1,561 additions and 523 deletions.
2 changes: 1 addition & 1 deletion .github/CODEOWNERS
Original file line number Diff line number Diff line change
@@ -1 +1 @@
* @nadove-ucsc
* @hannes-ucsc
5 changes: 1 addition & 4 deletions .github/pull_request_template.md.template.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
OrderedSet,
)
from azul.strings import (
back_quote as bq,
join_grammatically,
)
from azul.template import (
Expand Down Expand Up @@ -225,10 +226,6 @@ def shared_deploy_target(self, target_branch: str) -> str:
return 'apply' + iif(self.shared_deploy_is_two_phase(target_branch), '_keep_unused')


def bq(s: str) -> str:
    """
    Wrap the given string in a pair of backticks.

    >>> bq('make deploy')
    '`make deploy`'

    >>> bq('')
    '``'

    :param s: the text to quote; backticks already present in it are neither
              escaped nor rejected

    :return: the text with one backtick prepended and one appended
    """
    return '`' + s + '`'


def main():
path = Path(sys.argv[1])
for t in T:
Expand Down
16 changes: 8 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -1819,14 +1819,14 @@ the private key and can always be regenerated again later using `make config`.

### 9.1.2 Ensuring split tunnel on client

It is important that you configure the client to only route VPC traffic
through the VPN. The VPN server will not forward any other traffic, in what's
commonly referred to as a *split tunnel*. The key indicator of a split tunnel
is that it doesn't set up a default route on the client system. There will
only be a route to the private 172.… subnet of the GitLab VPC but the default
route remains in place. If you configure the VPN connection to set up a
default route, your Internet access will be severed as soon as you establish
the VPN connection.
Except on stable deployments, you should configure the client to only route VPC
traffic through the VPN. The VPN server will not forward any other traffic, in
what's commonly referred to as a *split tunnel*. The key indicator of a split
tunnel is that it doesn't set up a default route on the client system. There
will only be a route to the private 172.… subnet of the GitLab VPC but the
default route remains in place.

On stable deployments, split tunnels are prohibited.

The `make config` step prints instructions on how to configure a split tunnel
on Ubuntu.
Expand Down
46 changes: 37 additions & 9 deletions deployments/prod/environment.py
Original file line number Diff line number Diff line change
Expand Up @@ -1115,14 +1115,29 @@ def mkdict(previous_catalog: dict[str, str],

dcp42_sources = mkdict(dcp41_sources, 470, mkdelta([
# @formatter:off
mksrc('bigquery', 'datarepo-db22b6c5', 'hca_prod_19037ec943a74823b93f9e59c694d17e__20240903_dcp2_20240904_dcp42', 16), # noqa E501
mksrc('bigquery', 'datarepo-8e43554a', 'hca_prod_35d5b0573daf4ccd8112196194598893__20240903_dcp2_20240905_dcp42', 303, ma), # noqa E501
mksrc('bigquery', 'datarepo-5b6ac433', 'hca_prod_5f1a1aee6c484dd4a2c4eb4ca6aadf74__20240903_dcp2_20240904_dcp42', 40), # noqa E501
mksrc('bigquery', 'datarepo-d5e4c41e', 'hca_prod_7c75f07c608d4c4aa1b7b13d11c0ad31__20220117_dcp2_20240904_dcp42', 80), # noqa E501
mksrc('bigquery', 'datarepo-eb6182b7', 'hca_prod_888f17664c8443bb8717b5f9d2046097__20240903_dcp2_20240904_dcp42', 111), # noqa E501
mksrc('bigquery', 'datarepo-b9e1d9ec', 'hca_prod_9dd91b6e7c6249d3a3d474f603deffdb__20240903_dcp2_20240904_dcp42', 135), # noqa E501
mksrc('bigquery', 'datarepo-582bf509', 'hca_prod_b176d75662d8493383a48b026380262f__20240903_dcp2_20240904_dcp42', 106), # noqa E501
mksrc('bigquery', 'datarepo-c85d293d', 'hca_prod_f598aee0d269403690e9d6d5b1c84429__20240903_dcp2_20240904_dcp42', 6)
mksrc('bigquery', 'datarepo-db22b6c5', 'hca_prod_19037ec943a74823b93f9e59c694d17e__20240903_dcp2_20240904_dcp42'),
mksrc('bigquery', 'datarepo-8e43554a', 'hca_prod_35d5b0573daf4ccd8112196194598893__20240903_dcp2_20240905_dcp42', ma), # noqa E501
mksrc('bigquery', 'datarepo-5b6ac433', 'hca_prod_5f1a1aee6c484dd4a2c4eb4ca6aadf74__20240903_dcp2_20240904_dcp42',),
mksrc('bigquery', 'datarepo-d5e4c41e', 'hca_prod_7c75f07c608d4c4aa1b7b13d11c0ad31__20220117_dcp2_20240904_dcp42',),
mksrc('bigquery', 'datarepo-eb6182b7', 'hca_prod_888f17664c8443bb8717b5f9d2046097__20240903_dcp2_20240904_dcp42',),
mksrc('bigquery', 'datarepo-b9e1d9ec', 'hca_prod_9dd91b6e7c6249d3a3d474f603deffdb__20240903_dcp2_20240904_dcp42',),
mksrc('bigquery', 'datarepo-582bf509', 'hca_prod_b176d75662d8493383a48b026380262f__20240903_dcp2_20240904_dcp42',),
mksrc('bigquery', 'datarepo-c85d293d', 'hca_prod_f598aee0d269403690e9d6d5b1c84429__20240903_dcp2_20240904_dcp42',)
# @formatter:on
]))

dcp43_sources = mkdict(dcp42_sources, 476, mkdelta([
# @formatter:off
mksrc('bigquery', 'datarepo-ac7cee91', 'hca_prod_087efc3c26014de6bbe90114593050d1__20241004_dcp2_20241007_dcp43'),
mksrc('bigquery', 'datarepo-e9df1043', 'hca_prod_248c5dc36b754fb4ad8acc771968483f__20240806_dcp2_20241007_dcp43'),
mksrc('bigquery', 'datarepo-65c49269', 'hca_prod_2ef3655a973d4d699b4121fa4041eed7__20220111_dcp2_20241004_dcp43'),
mksrc('bigquery', 'datarepo-456691e5', 'hca_prod_3627473eb6d645c987b5b9f12ce57a10__20241004_dcp2_20241007_dcp43'),
mksrc('bigquery', 'datarepo-c577eed5', 'hca_prod_7f351a4cd24c4fcd9040f79071b097d0__20220906_dcp2_20241004_dcp43'),
mksrc('bigquery', 'datarepo-1dbd3c50', 'hca_prod_ae9f439bbd474d6ebd7232dc70b35d97__20241004_dcp2_20241004_dcp43', ma), # noqa E501
mksrc('bigquery', 'datarepo-21d1f89b', 'hca_prod_b39381584e8d4fdb9e139e94270dde16__20241004_dcp2_20241004_dcp43'),
mksrc('bigquery', 'datarepo-550c8f98', 'hca_prod_c3dd819dabab4957b20988f1e0900368__20241004_dcp2_20241004_dcp43'),
mksrc('bigquery', 'datarepo-06a00830', 'hca_prod_c5ca43aa3b2b42168eb3f57adcbc99a1__20220118_dcp2_20241004_dcp43'),
mksrc('bigquery', 'datarepo-55151ed4', 'hca_prod_cdabcf0b76024abf9afb3b410e545703__20230201_dcp2_20241008_dcp43')
# @formatter:on
]))

Expand Down Expand Up @@ -1171,6 +1186,17 @@ def mkdict(previous_catalog: dict[str, str],
mksrc('bigquery', 'datarepo-43814140', 'lungmap_prod_fdadee7e209745d5bf81cc280bd8348e__20240206_20240626_lm7')
]))

lm8_sources = mkdict(lm7_sources, 12, mkdelta([
mksrc('bigquery', 'datarepo-2b15227b', 'lungmap_prod_1977dc4784144263a8706b0f207d8ab3__20240206_20241002_lm8'),
mksrc('bigquery', 'datarepo-c9158593', 'lungmap_prod_20037472ea1d4ddb9cd356a11a6f0f76__20220307_20241002_lm8'),
mksrc('bigquery', 'datarepo-35a6d7ca', 'lungmap_prod_3a02d15f9c6a4ef7852b4ddec733b70b__20241001_20241002_lm8'),
mksrc('bigquery', 'datarepo-131a1234', 'lungmap_prod_4ae8c5c91520437198276935661f6c84__20231004_20241002_lm8'),
mksrc('bigquery', 'datarepo-3377446f', 'lungmap_prod_6135382f487d4adb9cf84d6634125b68__20230207_20241002_lm8'),
mksrc('bigquery', 'datarepo-3c4905d2', 'lungmap_prod_834e0d1671b64425a8ab022b5000961c__20241001_20241002_lm8'),
mksrc('bigquery', 'datarepo-d7447983', 'lungmap_prod_f899709cae2c4bb988f0131142e6c7ec__20220310_20241002_lm8'),
mksrc('bigquery', 'datarepo-c11ef363', 'lungmap_prod_fdadee7e209745d5bf81cc280bd8348e__20240206_20241002_lm8'),
]))


def env() -> Mapping[str, Optional[str]]:
"""
Expand Down Expand Up @@ -1211,8 +1237,10 @@ def env() -> Mapping[str, Optional[str]]:
sources=mklist(sources))
for atlas, catalog, sources in [
('hca', 'dcp42', dcp42_sources),
('hca', 'dcp43', dcp43_sources),
('hca', 'pilot1', pilot1_sources),
('lungmap', 'lm7', lm7_sources)
('lungmap', 'lm7', lm7_sources),
('lungmap', 'lm8', lm8_sources)
] for suffix, internal in [
('', False),
('-it', True)
Expand Down
5 changes: 4 additions & 1 deletion scripts/convert_environment.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@
from azul.files import (
write_file_atomically,
)
from azul.strings import (
single_quote as sq,
)


class Variable(NamedTuple):
Expand Down Expand Up @@ -144,7 +147,7 @@ def sub(m: re.Match):
return '{{' + m[1] + '}}'

value = re.sub(r'\$?{([^}]+)}|\$([_A-Za-z][_A-Za-z0-9]*)', sub, value)
return f"'{value}'"
return sq(value)


if __name__ == '__main__':
Expand Down
9 changes: 4 additions & 5 deletions src/azul/bigquery.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@
Union,
)

from azul import (
require,
from azul.strings import (
back_quote as bq,
)

BigQueryValue = Union[int, float, bool, str, bytes, datetime, None]
Expand Down Expand Up @@ -42,10 +42,9 @@ def backtick(table_name: str) -> str:
>>> backtick('foo-2.bar`s.my_table')
Traceback (most recent call last):
...
azul.RequirementError: foo-2.bar`s.my_table
azul.RequirementError: ('`', 'must not occur in', 'foo-2.bar`s.my_table')
"""
if table_name_re.fullmatch(table_name):
return table_name
else:
require('`' not in table_name, table_name)
return f'`{table_name}`'
return bq(table_name)
30 changes: 26 additions & 4 deletions src/azul/chalice.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,10 @@
copy_json,
json_head,
)
from azul.strings import (
join_words as jw,
single_quote as sq,
)
from azul.types import (
JSON,
LambdaContext,
Expand Down Expand Up @@ -189,15 +193,33 @@ def _api_gateway_context_middleware(self, event, get_response):
finally:
config.lambda_is_handling_api_gateway_request = False

# Two years (of 365 days each), expressed in seconds. Used as the value of the
# `max-age` directive in the Strict-Transport-Security header below.
hsts_max_age = 60 * 60 * 24 * 365 * 2

# Headers added to every response from the app, as well as canned 4XX and
# 5XX responses from API Gateway. Use of these headers addresses known
# security vulnerabilities.
#
security_headers = {
# NOTE(review): presumably renders as `default-src 'self'`; confirm against
# the join_words and single_quote helpers in azul.strings
'Content-Security-Policy': jw('default-src', sq('self')),
'Referrer-Policy': 'strict-origin-when-cross-origin',
'Strict-Transport-Security': jw(f'max-age={hsts_max_age};',
'includeSubDomains;',
'preload'),
'X-Content-Type-Options': 'nosniff',
'X-Frame-Options': 'DENY',
'X-XSS-Protection': '1; mode=block'
}

def _security_headers_middleware(self, event, get_response):
"""
Add headers to the response
"""
response = get_response(event)
seconds = 60 * 60 * 24 * 365
response.headers['Strict-Transport-Security'] = f'max-age={seconds}; includeSubDomains'
response.headers['X-Content-Type-Options'] = 'nosniff'
response.headers['X-Frame-Options'] = 'DENY'
response.headers.update(self.security_headers)
# FIXME: Add a CSP header with a nonce value to text/html responses
# https://github.com/DataBiosphere/azul-private/issues/6
if response.headers.get('Content-Type') == 'text/html':
del response.headers['Content-Security-Policy']
view_function = self.routes[event.path][event.method].view_function
cache_control = getattr(view_function, 'cache_control')
response.headers['Cache-Control'] = cache_control
Expand Down
33 changes: 31 additions & 2 deletions src/azul/collections.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,17 @@ def explode_dict(d: Mapping[K, Union[V, list[V], set[V], tuple[V]]]
yield dict(zip(d.keys(), t))


def none_safe_apply(f: Callable[[K], V], o: K | None) -> V | None:
    """
    Apply a function to a value, unless that value is None.

    >>> none_safe_apply(str, 123)
    '123'

    >>> none_safe_apply(str, None) is None
    True

    Only None short-circuits the call. Other falsy values are passed to the
    function like any other argument.

    >>> none_safe_apply(str, 0)
    '0'

    :param f: a callable that accepts a single positional argument

    :param o: the argument to pass to `f`, or None

    :return: None if `o` is None, otherwise the result of calling `f(o)`
    """
    return None if o is None else f(o)


def none_safe_key(none_last: bool = False) -> Callable[[Any], Any]:
"""
Returns a sort key that handles None values.
Expand Down Expand Up @@ -270,6 +281,10 @@ def adict(seq: Union[Mapping[K, V], Iterable[tuple[K, V]]] = None,
return kwargs if seq is None else dict(seq, **kwargs)


def _athing(cls: type, *args):
return cls(arg for arg in args if arg is not None)


def atuple(*args: V) -> tuple[V, ...]:
"""
>>> atuple()
Expand All @@ -281,7 +296,7 @@ def atuple(*args: V) -> tuple[V, ...]:
>>> atuple(0, None)
(0,)
"""
return tuple(arg for arg in args if arg is not None)
return _athing(tuple, *args)


def alist(*args: V) -> list[V]:
Expand All @@ -295,7 +310,21 @@ def alist(*args: V) -> list[V]:
>>> alist(0, None)
[0]
"""
return list(arg for arg in args if arg is not None)
return _athing(list, *args)


def aset(*args: V) -> set[V]:
    """
    Create a set from the given positional arguments, leaving out any argument
    that is None. Falsy arguments other than None are retained.

    >>> aset()
    set()

    >>> aset(None)
    set()

    >>> aset(0, None)
    {0}

    Arguments that compare equal collapse into one element, as in any set.

    >>> aset(1, 1, None)
    {1}
    """
    return _athing(set, *args)


class NestedDict(defaultdict):
Expand Down
27 changes: 14 additions & 13 deletions src/azul/indexer/index_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@
from more_itertools import (
first,
one,
unzip,
)

from azul import (
Expand Down Expand Up @@ -303,19 +302,21 @@ def transform(self,
log.info('Transforming %i entities in partition %s of bundle %s, version %s.',
num_entities, partition, bundle.uuid, bundle.version)
contributions = []
replicas = []
replicas_by_coords = {}
for transformer in transformers:
# The cast is necessary because unzip()'s type stub doesn't
# support heterogeneous tuples.
transforms = cast(
tuple[Iterable[Optional[Contribution]], Iterable[Optional[Replica]]],
unzip(transformer.transform(partition))
)
if transforms:
contributions_part, replicas_part = transforms
contributions.extend(filter(None, contributions_part))
replicas.extend(filter(None, replicas_part))
return contributions, replicas
for document in transformer.transform(partition):
if isinstance(document, Contribution):
contributions.append(document)
elif isinstance(document, Replica):
try:
dup = replicas_by_coords[document.coordinates]
except KeyError:
replicas_by_coords[document.coordinates] = document
else:
dup.hub_ids.extend(document.hub_ids)
else:
assert False, document
return contributions, list(replicas_by_coords.values())

def create_indices(self, catalog: CatalogName):
es_client = ESClientFactory.get()
Expand Down
36 changes: 22 additions & 14 deletions src/azul/indexer/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@

import attr

from azul.collections import (
alist,
)
from azul.indexer import (
Bundle,
BundleFQID,
Expand All @@ -34,11 +37,8 @@
)
from azul.types import (
JSON,
MutableJSON,
)

Transform = tuple[Optional[Contribution], Optional[Replica]]


@attr.s(frozen=True, kw_only=True, auto_attribs=True)
class Transformer(metaclass=ABCMeta):
Expand All @@ -55,12 +55,14 @@ def entity_type(cls) -> EntityType:
raise NotImplementedError

@abstractmethod
def replica_type(self, entity: EntityReference) -> str:
def _replicate(self, entity: EntityReference) -> tuple[str, JSON]:
"""
The name of the type of replica emitted by this transformer for a given
entity.
A tuple consisting of:
1. The name of the type of replica emitted by this transformer for a
given entity. See :py:attr:`Replica.replica_type`.
See :py:attr:`Replica.replica_type`
2. The contents of the replica for that entity.
"""
raise NotImplementedError

Expand Down Expand Up @@ -88,7 +90,9 @@ def estimate(self, partition: BundlePartition) -> int:
"""

@abstractmethod
def transform(self, partition: BundlePartition) -> Iterable[Transform]:
def transform(self,
partition: BundlePartition
) -> Iterable[Contribution | Replica]:
"""
Return the contributions by the current bundle to the entities it
contains metadata about. More than one bundle can contribute to a
Expand All @@ -114,9 +118,10 @@ def aggregator(cls, entity_type: EntityType) -> Optional[EntityAggregator]:
raise NotImplementedError

def _contribution(self,
contents: MutableJSON,
entity: EntityReference
contents: JSON,
entity_id: EntityID
) -> Contribution:
entity = EntityReference(entity_type=self.entity_type(), entity_id=entity_id)
coordinates = ContributionCoordinates(entity=entity,
bundle=self.bundle.fqid.upcast(),
deleted=self.deleted)
Expand All @@ -126,17 +131,20 @@ def _contribution(self,
contents=contents)

def _replica(self,
contents: MutableJSON,
entity: EntityReference,
hub_ids: list[EntityID]
*,
file_hub: EntityID | None,
) -> Replica:
replica_type, contents = self._replicate(entity)
coordinates = ReplicaCoordinates(content_hash=json_hash(contents).hexdigest(),
entity=entity)
return Replica(coordinates=coordinates,
version=None,
replica_type=self.replica_type(entity),
replica_type=replica_type,
contents=contents,
hub_ids=hub_ids)
# The other hubs will be added when the indexer
# consolidates duplicate replicas.
hub_ids=alist(file_hub))

@classmethod
@abstractmethod
Expand Down
Loading

0 comments on commit 345fb5a

Please sign in to comment.