Shard affinity for k8s workloads

Instances that are part of the same K8S cluster are scheduled to the same shard (vCenter). The shard filter identifies the K8S cluster by looking at the tags or metadata set by the K8S cluster orchestrator when it creates the instances. Kubernikus and Gardener are supported for now. BigVMs are "out of the picture" and should not adhere to shards; they are only scheduled on their allocated hosts. Change-Id: I73d04ba295d23db1d4728e9db124fc2a27c2d4bc
sapcc · Jul 29, 2023 · 867a490 · 867a490
1 parent 045e331
commit 867a490
Show file tree

Hide file tree

Showing 3 changed files with 416 additions and 47 deletions.
diff --git a/nova/db/main/api.py b/nova/db/main/api.py
@@ -2087,6 +2087,69 @@ def instance_get_active_by_window_joined(context, begin, end=None,
     return _instances_fill_metadata(context, query.all(), manual_joins)
 
 
+@require_context
+@pick_context_manager_reader_allow_async
+def instance_get_host_by_tag(context, tag, filters=None):
+    """Return the host running the most instances tagged with ``tag``.
+
+    Only non-deleted instances are considered.
+
+    :param context: request context providing the DB session
+    :param tag: exact tag value the instances must carry
+    :param filters: optional dict forwarded to
+        ``_handle_instance_host_query_filters`` (``hv_type`` and
+        ``availability_zone`` are understood)
+    :returns: a ``(host, count)`` row for the host with the highest number
+        of matching instances, or ``None`` if no instance matches
+    """
+    count_label = func.count('*').label('count')
+    # Select only the grouped column: selecting the whole Instance entity
+    # together with GROUP BY host is not a valid aggregate query and does
+    # not yield the host this function is named after.
+    query = context.session.query(models.Instance.host, count_label). \
+        join(models.Tag, models.Tag.resource_id == models.Instance.uuid)
+    query = _handle_instance_host_query_filters(query, filters)
+    query = query.filter(models.Instance.deleted == 0,
+                         models.Tag.tag == tag)
+
+    # Highest instance count first; only the top host is of interest.
+    query = query.group_by(models.Instance.host). \
+        order_by(sql.desc(count_label)).limit(1)
+
+    result = query.all()
+    return result[0] if result else None
+
+
+@require_context
+@pick_context_manager_reader_allow_async
+def instance_get_host_by_metadata(context, meta_key, meta_value,
+                                  filters=None):
+    """Return the host running the most instances with the given metadata.
+
+    Only non-deleted instances and non-deleted metadata items count.
+
+    :param context: request context providing the DB session
+    :param meta_key: exact metadata key to match
+    :param meta_value: exact metadata value to match
+    :param filters: optional dict forwarded to
+        ``_handle_instance_host_query_filters``
+    :returns: a ``(host, count)`` row for the host with the highest number
+        of matching instances, or ``None`` if no instance matches
+    """
+    per_host = func.count('*').label('count')
+    on_instance_uuid = (
+        models.InstanceMetadata.instance_uuid == models.Instance.uuid)
+
+    hosts = context.session.query(models.Instance.host, per_host)
+    hosts = hosts.join(models.InstanceMetadata, on_instance_uuid)
+    hosts = _handle_instance_host_query_filters(hosts, filters)
+    hosts = hosts.filter(models.Instance.deleted == 0,
+                         models.InstanceMetadata.deleted == 0,
+                         models.InstanceMetadata.key == meta_key,
+                         models.InstanceMetadata.value == meta_value)
+
+    # Rank hosts by instance count and keep only the busiest one.
+    hosts = hosts.group_by(models.Instance.host)
+    hosts = hosts.order_by(sql.desc(per_host))
+    hosts = hosts.limit(1)
+
+    rows = hosts.all()
+    if not rows:
+        return None
+    return rows[0]
+
+
+def _handle_instance_host_query_filters(query, filters=None):
+    """Narrow an instance/host query by the optional ``filters`` dict.
+
+    Understood keys:
+
+    * ``hv_type`` - joins the compute node of each instance and keeps
+      only non-deleted compute nodes of that hypervisor type
+    * ``availability_zone`` - keeps only instances in that AZ
+
+    :param query: query over ``models.Instance`` to extend
+    :param filters: dict of filters; ``None`` or empty leaves the query
+        untouched
+    :returns: the (possibly extended) query
+    """
+    if not filters:
+        return query
+
+    wanted_hv_type = filters.get('hv_type')
+    wanted_az = filters.get('availability_zone')
+
+    # The join has to be in place before the compute node columns can be
+    # used in a filter further down.
+    if wanted_hv_type:
+        on_node_name = (
+            models.Instance.node == models.ComputeNode.hypervisor_hostname)
+        query = query.join(models.ComputeNode, on_node_name)
+
+    if wanted_az:
+        query = query.filter(
+            models.Instance.availability_zone == wanted_az)
+
+    if wanted_hv_type:
+        query = query.filter(
+            models.ComputeNode.deleted == 0,
+            models.ComputeNode.hypervisor_type == wanted_hv_type)
+
+    return query
+
+
 def _instance_get_all_query(context, project_only=False, joins=None):
     if joins is None:
         joins = ['info_cache', 'security_groups']

diff --git a/nova/scheduler/filters/shard_filter.py b/nova/scheduler/filters/shard_filter.py
@@ -19,6 +19,10 @@
 from oslo_log import log as logging
 
 import nova.conf
+from nova import context as nova_context
+from nova.db.main import api as main_db_api
+from nova.objects.aggregate import AggregateList
+from nova.objects.build_request import BuildRequest
 from nova.scheduler import filters
 from nova.scheduler import utils
 from nova import utils as nova_utils
@@ -28,6 +32,10 @@
 CONF = nova.conf.CONF
 
 _SERVICE_AUTH = None
+# Prefix of the instance metadata keys that mark a Gardener K8S cluster.
+GARDENER_PREFIX = "kubernetes.io-cluster-"
+# Prefix of the instance tags that mark a Kubernikus K8S cluster.
+KKS_PREFIX = "kubernikus:kluster"
+# Flavor-name prefix of instances that are excluded from K8S shard affinity.
+HANA_PREFIX = "hana_"
+# hv_type filter value used when looking up hosts of a K8S cluster.
+VMWARE_HV_TYPE = 'VMware vCenter Server'
 
 
 class ShardFilter(filters.BaseHostFilter):
@@ -37,6 +45,8 @@ class ShardFilter(filters.BaseHostFilter):
 
     Alternatively the project may have the "sharding_enabled" tag set, which
     enables the project for hosts in all shards.
+
+    Implements `filter_all` directly instead of `host_passes`
     """
 
     _PROJECT_SHARD_CACHE = {}
@@ -114,11 +124,63 @@ def _get_shards(self, project_id):
 
         return self._PROJECT_SHARD_CACHE.get(project_id)
 
-    def host_passes(self, host_state, spec_obj):
+    def _get_k8s_shard(self, spec_obj):
+        """Return the shard the instance's K8S cluster already runs in.
+
+        The cluster is identified by the first build-request tag starting
+        with ``KKS_PREFIX`` or, if there is none, by the first instance
+        metadata item whose key starts with ``GARDENER_PREFIX``. The host
+        running the most instances of that cluster is looked up across the
+        cells and mapped to its shard aggregate.
+
+        :param spec_obj: request spec of the instance being scheduled
+        :returns: name of the shard aggregate, or ``None`` if the request
+            is a resize, uses a flavor whose name starts with
+            ``HANA_PREFIX``, does not belong to a K8S cluster, or no
+            existing cluster host/shard could be found
+        """
+        if (spec_obj.flavor.name.startswith(HANA_PREFIX) or
+                utils.request_is_resize(spec_obj)):
+            return None
+
+        elevated = nova_context.get_admin_context()
+        # NOTE(review): presumably the build request only exists while the
+        # instance is being created - confirm this lookup cannot fail for
+        # move operations other than resize.
+        build_request = BuildRequest.get_by_instance_uuid(
+            elevated, spec_obj.instance_uuid)
+
+        kks_tag = next((t.tag for t in build_request.tags
+                        if t.tag.startswith(KKS_PREFIX)), None)
+        gardener_meta = None
+        if not kks_tag:
+            gardener_meta = next(
+                ((k, v) for k, v in build_request.instance.metadata.items()
+                 if k.startswith(GARDENER_PREFIX)), None)
+
+        if not kks_tag and not gardener_meta:
+            return None
+
+        q_filters = {'hv_type': VMWARE_HV_TYPE}
+        if spec_obj.availability_zone:
+            q_filters['availability_zone'] = spec_obj.availability_zone
+
+        if kks_tag:
+            results = nova_context.scatter_gather_skip_cell0(
+                elevated, main_db_api.instance_get_host_by_tag,
+                kks_tag, filters=q_filters)
+        else:
+            meta_key, meta_value = gardener_meta
+            results = nova_context.scatter_gather_skip_cell0(
+                elevated, main_db_api.instance_get_host_by_metadata,
+                meta_key, meta_value, filters=q_filters)
+
+        # scatter-gather returns a dict keyed by cell UUID, not a host: each
+        # value is that cell's row, None, or a failure sentinel. Pick the
+        # host with the highest instance count over all cells.
+        k8s_host = None
+        best_count = 0
+        for cell_uuid, cell_result in results.items():
+            if nova_context.is_cell_failure_sentinel(cell_result):
+                LOG.warning('Cell %(cell)s failed to report K8S hosts; '
+                            'ignoring it for shard affinity.',
+                            {'cell': cell_uuid})
+                continue
+            if cell_result is None:
+                continue
+            host_or_instance, count = cell_result
+            # The first column may be either a host name or an Instance
+            # entity depending on the DB helper; normalise to a host name.
+            host = getattr(host_or_instance, 'host', host_or_instance)
+            if host and count > best_count:
+                k8s_host, best_count = host, count
+
+        if not k8s_host:
+            return None
+
+        return next(
+            (aggr.name for aggr in
+             AggregateList.get_by_host(elevated, k8s_host)
+             if aggr.name.startswith(self._SHARD_PREFIX)), None)
+
+    def filter_all(self, filter_obj_list, spec_obj):
+        """Return the hosts from ``filter_obj_list`` that pass this filter.
+
+        Non-VMware requests are not filtered at all. Otherwise the K8S
+        shard is looked up once for the request and then every host is
+        checked with ``_host_passes``.
+
+        :param filter_obj_list: candidate host states
+        :param spec_obj: request spec of the instance being scheduled
+        :returns: the passing host states
+        """
         # Only VMware
         if utils.is_non_vmware_spec(spec_obj):
-            return True
+            return filter_obj_list
+
+        k8s_shard = self._get_k8s_shard(spec_obj)
+
+        return [host_state for host_state in filter_obj_list
+                if self._host_passes(host_state, spec_obj, k8s_shard)]
 
+    def _host_passes(self, host_state, spec_obj, k8s_shard):
         host_shard_aggrs = [aggr for aggr in host_state.aggregates
                             if aggr.name.startswith(self._SHARD_PREFIX)]
 
@@ -148,18 +210,30 @@ def host_passes(self, host_state, spec_obj):
         if self._ALL_SHARDS in shards:
             LOG.debug('project enabled for all shards %(project_shards)s.',
                       {'project_shards': shards})
-            return True
         elif host_shard_names & set(shards):
             LOG.debug('%(host_state)s shard %(host_shard)s found in project '
                       'shards %(project_shards)s.',
                       {'host_state': host_state,
                        'host_shard': host_shard_names,
                        'project_shards': shards})
-            return True
         else:
             LOG.debug('%(host_state)s shard %(host_shard)s not found in '
                       'project shards %(project_shards)s.',
                       {'host_state': host_state,
                        'host_shard': host_shard_names,
                        'project_shards': shards})
             return False
+
+        if k8s_shard:
+            return any(host_shard == k8s_shard
+                       for host_shard in host_shard_names)
+
+        return True
+
+    def _host_passes_k8s(self, host_shard_aggrs, k8s_hosts):
+        """Check whether any shard aggregate contains a K8S cluster host.
+
+        All instances of one K8S cluster have to land on the same shard;
+        the cluster is recognised through the metadata or tags that its
+        orchestrator (Gardener or Kubernikus) sets.
+
+        :param host_shard_aggrs: shard aggregates of the candidate host
+        :param k8s_hosts: set of host names already used by the cluster
+        :returns: True if at least one aggregate shares a host with
+            ``k8s_hosts``, False otherwise
+        """
+        for shard_aggr in host_shard_aggrs:
+            if set(shard_aggr.hosts) & k8s_hosts:
+                return True
+        return False