Merge pull request #68 from 4dn-dcic/kmp_C4-102_C4-92_cgap_bucketing

CGAP bucketing fixes (C4-92, C4-102)
4dn-dcic · Mar 31, 2020 · 95c5ff5 · 95c5ff5
2 parents e00979b + 0a5d77f
commit 95c5ff5
Show file tree

Hide file tree

Showing 10 changed files with 401 additions and 75 deletions.
diff --git a/dcicutils/beanstalk_utils.py b/dcicutils/beanstalk_utils.py
@@ -14,6 +14,7 @@
 from . import ff_utils
 from botocore.exceptions import ClientError
 from .misc_utils import PRINT
+from .env_utils import is_cgap_env, is_stg_or_prd_env, public_url_mappings, blue_green_mirror_env
 
 logging.basicConfig()
 logger = logging.getLogger('logger')
@@ -35,9 +36,22 @@
 
 
 FOURSIGHT_URL = 'https://foursight.4dnucleome.org/'
-# magic CNAME corresponds to data.4dnucleome
-MAGIC_CNAME = 'fourfront-green.us-east-1.elasticbeanstalk.com'
-GOLDEN_DB = 'fourfront-production.co3gwj7b7tpq.us-east-1.rds.amazonaws.com'
+
+# FF_MAGIC_CNAME corresponds to data.4dnucleome.org
+FF_MAGIC_CNAME = 'fourfront-green.us-east-1.elasticbeanstalk.com'
+# CGAP_MAGIC_CNAME corresponds to cgap.hms.harvard.edu
+CGAP_MAGIC_CNAME = 'fourfront-cgap.9wzadzju3p.us-east-1.elasticbeanstalk.com'
+# The legacy name MAGIC_CNAME is deprecated (retained for backward compatibility until a major release boundary).
+MAGIC_CNAME = FF_MAGIC_CNAME
+
+# FF_GOLDEN_DB is the database behind data.4dnucleome.org (and shared by staging.4dnucleome.org)
+FF_GOLDEN_DB = 'fourfront-production.co3gwj7b7tpq.us-east-1.rds.amazonaws.com'
+# CGAP_GOLDEN_DB is the database behind cgap.hms.harvard.edu
+CGAP_GOLDEN_DB = 'fourfront-cgap.co3gwj7b7tpq.us-east-1.rds.amazonaws.com'
+# The name GOLDEN_DB is deprecated (retained for backward compatibility until a major release boundary).
+# Although not visibly used in this repository, this variable is imported by Torb.
+GOLDEN_DB = FF_GOLDEN_DB
+
 REGION = 'us-east-1'
 
 
@@ -206,24 +220,36 @@ def swap_cname(src, dest):
     client.restart_app_server(EnvironmentName=dest)
 
 
-def whodaman():
+def _compute_prd_env_for_project(project):
     '''
     Determines which ElasticBeanstalk environment is currently hosting
     data.4dnucleome.org. Requires IAM permissions for EB!
 
     Returns:
         str: EB environment name hosting data.4dnucleome
     '''
+    magic_cname = CGAP_MAGIC_CNAME if project == 'cgap' else FF_MAGIC_CNAME
     client = boto3.client('elasticbeanstalk', region_name=REGION)
     res = describe_beanstalk_environments(client, ApplicationName="4dn-web")
     logger.info(res)
     for env in res['Environments']:
         logger.info(env)
-        if env.get('CNAME') == MAGIC_CNAME:
+        if env.get('CNAME') == magic_cname:
             # we found data
             return env.get('EnvironmentName')
 
 
+def compute_ff_prd_env():
+    return _compute_prd_env_for_project('ff')
+
+
+whodaman = compute_ff_prd_env  # This naming is deprecated but retained for compatibility.
+
+
+def compute_cgap_prd_env():
+    return _compute_prd_env_for_project('cgap')
+
+
 def beanstalk_info(env):
     """
     Describe a ElasticBeanstalk environment given an environment name
@@ -242,7 +268,7 @@ def beanstalk_info(env):
 def get_beanstalk_real_url(env):
     """
     Return the real url for the elasticbeanstalk with given environment name.
-    Name can be 'data', 'staging', or an actual environment.
+    Name can be 'cgap', 'data', 'staging', or an actual environment.
 
     Args:
         env (str): ElasticBeanstalk environment name
@@ -251,25 +277,24 @@ def get_beanstalk_real_url(env):
         str: url of the ElasticBeanstalk environment
     """
     url = ''
-    urls = {'staging': 'http://staging.4dnucleome.org',
-            'data': 'https://data.4dnucleome.org'}
+    urls = public_url_mappings(env)
 
-    if env in urls:
+    if env in urls:  # Special case handling of 'cgap', 'data', or 'staging' as an argument.
         return urls[env]
 
-    # TODO (C4-91): Reconsider environment names.
-    # This code is too fragile.
-    if 'webprod' in env or 'blue' in env or 'green' in env:
-        data_env = whodaman()
-
-        if data_env == env:
-            url = urls['data']
-        else:
-            url = urls['staging']
-    else:
-        bs_info = beanstalk_info(env)
-        url = "http://" + bs_info['CNAME']
-
+    if is_stg_or_prd_env(env):
+        # What counts as staging/prod depends on whether we're in the CGAP or Fourfront space.
+        data_env = compute_cgap_prd_env() if is_cgap_env(env) else compute_ff_prd_env()
+        # There is only one production environment. Other stg/prd envs are all staging-class, but only
+        # the mirror of the production env is served at the public staging URL.
+        if env == data_env:
+            return urls['data']
+        elif env == blue_green_mirror_env(data_env):
+            # Mirror env might be None, in which case this clause will not be entered
+            return urls['staging']
+
+    bs_info = beanstalk_info(env)
+    url = "http://" + bs_info['CNAME']
     return url
 
 

diff --git a/dcicutils/env_utils.py b/dcicutils/env_utils.py
@@ -34,17 +34,52 @@
 # CGAP_ENV_WEBPROD2_NEW is meaningless here. See CGAP_ENV_STAGING_NEW.
 CGAP_ENV_WOLF_NEW = 'cgap-wolf'  # Maybe not used
 
+# The bucket names were allocated originally and needn't change.
+
+FF_PROD_BUCKET_ENV = FF_ENV_WEBPROD
+CGAP_PROD_BUCKET_ENV = CGAP_ENV_WEBPROD
 
 # Done this way to get maximally compatible behavior.
 FOURFRONT_STG_OR_PRD_TOKENS = ['webprod', 'blue', 'green']
 FOURFRONT_STG_OR_PRD_NAMES = ['staging', 'stagging', 'data']
 
 # Done this way because it's safer going forward.
 CGAP_STG_OR_PRD_TOKENS = []
-CGAP_STG_OR_PRD_NAMES = [CGAP_ENV_WEBPROD, CGAP_ENV_PRODUCTION_GREEN, CGAP_ENV_PRODUCTION_BLUE]
+CGAP_STG_OR_PRD_NAMES = [CGAP_ENV_WEBPROD, CGAP_ENV_PRODUCTION_GREEN, CGAP_ENV_PRODUCTION_BLUE, 'cgap']
+
+
+FF_PUBLIC_URL_STG = 'http://staging.4dnucleome.org'
+FF_PUBLIC_URL_PRD = 'https://data.4dnucleome.org'
 
+FF_PUBLIC_URLS = {
+    'staging': FF_PUBLIC_URL_STG,
+    'data': FF_PUBLIC_URL_PRD,
+}
+
+CGAP_PUBLIC_URL_STG = 'https://staging.cgap.hms.harvard.edu'  # This is a stopgap for testing and may have to change
+CGAP_PUBLIC_URL_PRD = 'https://cgap.hms.harvard.edu'
+
+CGAP_PUBLIC_URLS = {
+    'cgap': CGAP_PUBLIC_URL_PRD,
+    'data': CGAP_PUBLIC_URL_PRD,
+    'staging': CGAP_PUBLIC_URL_STG,
+}
+
+BEANSTALK_PROD_BUCKET_ENVS = {
+    'staging': FF_PROD_BUCKET_ENV,
+    'data': FF_PROD_BUCKET_ENV,
+    FF_ENV_WEBPROD: FF_PROD_BUCKET_ENV,
+    FF_ENV_WEBPROD2: FF_PROD_BUCKET_ENV,
+    FF_ENV_PRODUCTION_BLUE: FF_PROD_BUCKET_ENV,
+    FF_ENV_PRODUCTION_GREEN: FF_PROD_BUCKET_ENV,
+    'cgap': CGAP_PROD_BUCKET_ENV,
+    CGAP_ENV_PRODUCTION_BLUE: CGAP_PROD_BUCKET_ENV,
+    CGAP_ENV_PRODUCTION_GREEN: CGAP_PROD_BUCKET_ENV,
+    CGAP_ENV_WEBPROD: CGAP_PROD_BUCKET_ENV,
+    CGAP_ENV_PRODUCTION_BLUE_NEW: CGAP_PROD_BUCKET_ENV,
+    CGAP_ENV_PRODUCTION_GREEN_NEW: CGAP_PROD_BUCKET_ENV,
+}
 
-# These operate as pairs. Don't add extras.
 BEANSTALK_PROD_MIRRORS = {
 
     FF_ENV_PRODUCTION_BLUE: FF_ENV_PRODUCTION_GREEN,
@@ -61,7 +96,6 @@
 
 }
 
-
 BEANSTALK_TEST_ENVS = [
 
     FF_ENV_HOTSEAT,
@@ -95,6 +129,34 @@ def blue_green_mirror_env(envname):
         return None
 
 
+def prod_bucket_env(envname):
+    """
+    Given a production-class envname returns the envname of the associated production bucket.
+    For other envnames that aren't production envs, this returns None.
+
+    The envname is something that is either a staging or production env, in particular something
+    that is_stg_or_prd_env returns True for.
+
+    This is intended for use when configuring a beanstalk. This functionality is agnostic
+    about whether we're asking on behalf of CGAP or Fourfront, and whether we're using an old or new
+    naming scheme. Just give the current envname as an argument, and it will know (by declaration,
+    see the BEANSTALK_PROD_BUCKET_ENVS table) what the appropriate production bucket name token is for
+    that ecosystem.
+    """
+    return BEANSTALK_PROD_BUCKET_ENVS.get(envname)
+
+
+def public_url_mappings(envname):
+    """
+    Returns a table of the public URLs we use for the ecosystem in which the envname resides.
+    For example, if envname is a CGAP environment name, this returns a table of CGAP public URLs,
+    and otherwise it returns a table of Fourfront public URLs.
+
+    The envname may be 'cgap', 'data', 'staging', or an environment name.
+    """
+    return CGAP_PUBLIC_URLS if is_cgap_env(envname) else FF_PUBLIC_URLS
+
+
 def is_cgap_env(envname):
     """
     Returns True of the given string looks like a CGAP elasticbeanstalk environment name.

diff --git a/dcicutils/ff_utils.py b/dcicutils/ff_utils.py
@@ -7,7 +7,8 @@
 import boto3
 from . import (
     s3_utils,
-    es_utils
+    es_utils,
+    env_utils,
 )
 from .misc_utils import PRINT
 import requests
@@ -20,11 +21,13 @@
     from urllib.parse import urlencode
 
 
+# TODO (C4-92, C4-102): Probably should centralize this information in env_utils. Also figure out relation to CGAP.
 HIGLASS_BUCKETS = ['elasticbeanstalk-fourfront-webprod-wfoutput',
                    'elasticbeanstalk-fourfront-webdev-wfoutput']
 
 
 # TODO (C4-92): Centralize this information, it is repeated in other repos
+# TODO (C4-102): Does this need to include CGAP envs? As part of the same list, or as a separate list?
 PRODUCTION_ENVS = ['fourfront-blue', 'fourfront-green']
 
 
@@ -1067,11 +1070,8 @@ def unified_authentication(auth=None, ff_env=None):
     """
     # first see if key should be obtained from using ff_env
     if not auth and ff_env:
-        # webprod, webprod2 and blue/green all use the fourfront-webprod bucket for keys
-        if 'webprod' in ff_env or ff_env in PRODUCTION_ENVS:
-            use_env = 'fourfront-webprod'
-        else:
-            use_env = ff_env
+        # TODO: The ff_env argument is mis-named, something we should fix sometime. It can be a cgap env, too.
+        use_env = env_utils.prod_bucket_env(ff_env) if env_utils.is_stg_or_prd_env(ff_env) else ff_env
         auth = s3_utils.s3Utils(env=use_env).get_access_keys()
     # see if auth is directly from get_access_keys()
     use_auth = None

diff --git a/dcicutils/qa_utils.py b/dcicutils/qa_utils.py
@@ -0,0 +1,20 @@
+"""
+qa_utils: Tools for use in quality assurance testing.
+"""
+
+from .misc_utils import PRINT
+
+
+def mock_not_called(name):
+    """
+    This can be used in mocking to mock a function that should not be called.
+    Called with the name of a function, it returns a function that if called
+    will raise an AssertionError complaining that such a name was called.
+    """
+    def mocked_function(*args, **kwargs):
+        # It's OK to print here because we're expected to be called in a testing context, and
+        # we're just about to fail a test. The person invoking the tests may want this data.
+        PRINT("args=", args)
+        PRINT("kwargs=", kwargs)
+        raise AssertionError("%s was called where not expected." % name)
+    return mocked_function
diff --git a/dcicutils/s3_utils.py b/dcicutils/s3_utils.py
@@ -6,7 +6,7 @@
 from zipfile import ZipFile
 from io import BytesIO
 import logging
-from .env_utils import is_stg_or_prd_env
+from .env_utils import is_stg_or_prd_env, prod_bucket_env
 from .misc_utils import PRINT
 
 
@@ -33,7 +33,7 @@ def __init__(self, outfile_bucket=None, sys_bucket=None, raw_file_bucket=None,
             if env:
                 if is_stg_or_prd_env(env):
                     self.url = get_beanstalk_real_url(env)
-                    env = 'fourfront-webprod'
+                    env = prod_bucket_env(env)
             # we use standardized naming schema, so s3 buckets always have same prefix
             sys_bucket = "elasticbeanstalk-%s-system" % env
             outfile_bucket = "elasticbeanstalk-%s-wfoutput" % env

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "dcicutils"
-version = "0.12.1"
+version = "0.13.0"
 description = "Utility package for interacting with the 4DN Data Portal and other 4DN resources"
 authors = ["William Ronchetti <[email protected]>"]
 license = "MIT"