Azure: monitor long running resources (#826)

athiruma authored Sep 2, 2024
1 parent 6e44ad0 commit 7a9d70a
Showing 7 changed files with 427 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -217,3 +217,4 @@ empty_test_environment_variables.py
/cloud_governance/main/.test_env
/cloud_governance/policy/send_mail.py
cloudsensei/.env.txt
.vscode
8 changes: 8 additions & 0 deletions cloudsensei/azure/.funcignore
@@ -0,0 +1,8 @@
.git*
.vscode
__azurite_db*__.json
__blobstorage__
__queuestorage__
local.settings.json
test
.venv
135 changes: 135 additions & 0 deletions cloudsensei/azure/.gitignore
@@ -0,0 +1,135 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don’t work, or not
# install all needed dependencies.
#Pipfile.lock

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# Azure Functions artifacts
bin
obj
appsettings.json
local.settings.json

# Azurite artifacts
__blobstorage__
__queuestorage__
__azurite_db*__.json
.python_packages
194 changes: 194 additions & 0 deletions cloudsensei/azure/function_app.py
@@ -0,0 +1,194 @@
import logging
import os
from datetime import datetime, timezone

import azure.functions as func
from azure.mgmt.compute import ComputeManagementClient
from azure.identity import DefaultAzureCredential

from slack_operations import SlackOperations

app = func.FunctionApp()


class AzureComputeOperations:

def __init__(self) -> None:
self.credential = DefaultAzureCredential()
self.subscription_id = os.environ['SUBSCRIPTION_ID']
self.client = ComputeManagementClient(credential=self.credential,
subscription_id=self.subscription_id)

def list_instances(self):
"""
This method returns a list of all instances.
:return:
"""
resources = self.client.virtual_machines.list_all()
return self._item_paged_iterator(resources)

def get_id_dict_data(self, resource_id: str):
"""
        This method parses the resource id into a dictionary of key/value pairs
:param resource_id:
:type resource_id:
:return:
:rtype:
"""
pairs = resource_id.split('/')[1:]
key_pairs = {pairs[i].lower(): pairs[i + 1] for i in range(0, len(pairs), 2)}
return key_pairs

def _item_paged_iterator(self, item_paged_object, as_dict: bool = False):
"""
        This method iterates the paged object and returns the items as a list
:param item_paged_object:
:return:
"""
iterator_list = []
try:
            page_item = next(item_paged_object)
while page_item:
if as_dict:
iterator_list.append(page_item.as_dict())
else:
iterator_list.append(page_item)
                page_item = next(item_paged_object)
except StopIteration:
pass
return iterator_list

def get_instance_statuses(self, resource_group_name: str, vm_name: str) -> dict:
"""
This method returns the virtual machine instance status
:param vm_name:
:type vm_name:
:param resource_group_name:
:type resource_group_name:
:return:
:rtype:
"""
virtual_machine = self.client.virtual_machines.instance_view(resource_group_name=resource_group_name,
vm_name=vm_name)
return virtual_machine.as_dict()

def _get_instance_status(self, resource_group_name: str, vm_name: str):
"""
This method returns the VM status of the Virtual Machine
:param resource_group_name:
:type resource_group_name:
:param vm_name:
:type vm_name:
:return:
:rtype:
"""
instance_statuses = self.get_instance_statuses(resource_group_name=resource_group_name, vm_name=vm_name)
        statuses = instance_statuses.get('statuses', [])
if len(statuses) >= 2:
status = statuses[1].get('display_status', '').lower()
elif len(statuses) == 1:
status = statuses[0].get('display_status', '').lower()
else:
            status = 'unknown status'
return status


class ProcessInstances:
SLACK_ITEM_SIZE = 50

def __init__(self):
self.__azure_operations = AzureComputeOperations()
self.__resource_days = int(os.environ.get('RESOURCE_DAYS', 7))

def get_long_running_instances(self):
"""
This method returns a list of long-running instances.
:return:
"""
instances_list = self.__azure_operations.list_instances()
long_running_instances = []
current_datetime = datetime.now(timezone.utc)
for instance in instances_list:
running_days = (current_datetime - instance.time_created).days
if running_days >= self.__resource_days:
id_dict = self.__azure_operations.get_id_dict_data(instance.id)
resource_group = id_dict.get("resourcegroups")

instance_resource = {
'name': instance.name,
'resource_group': resource_group,
'time_created': instance.time_created.strftime('%Y-%m-%d %H:%M:%S'),
'region': instance.location,
'instance_type': instance.hardware_profile.vm_size,
'status': self.__azure_operations._get_instance_status(resource_group, instance.name)
}
long_running_instances.append(instance_resource)
long_running_instances.sort(key=lambda x: (x['region'], x['resource_group']))
return long_running_instances

def organize_message_to_send_slack(self, resources_list: list):
"""
        This method builds the message blocks to send to Slack
:param resources_list:
:return:
"""

divider = {"type": "divider"}
rows = []
keys = []
for resource in resources_list:
if not keys:
keys = list(resources_list[0].keys())
rows.append({
"type": "section",
"fields": [
{"type": "mrkdwn", "text": f"{value}"}
for key, value in resource.items()]}
)
rows.append(divider)
        item_blocks = [rows[i:i + self.SLACK_ITEM_SIZE] for i in
                       range(0, len(rows), self.SLACK_ITEM_SIZE)]  # split into chunks because Slack allows at most 50 blocks per message
slack_message_block = [[{
"type": "rich_text",
"elements": [
{
"type": "rich_text_section",
"elements": [
{
"type": "text",
"text": "Please look at the following instances and take an respective action",
"style": {
"bold": True
},
}
]
}
]
}], [{
'type': 'section',
'fields': [
{"type": "mrkdwn", "text": f"{item}"} for item in keys
]
}]]
if not item_blocks:
slack_message_block.append([{
"type": "section",
"text": {
"type": "mrkdwn",
"text": "No long running instances"
}
}])
for block in item_blocks:
slack_message_block.append(block)
return slack_message_block


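# The schedule below uses the six-field NCRONTAB format ({second} {minute}
# {hour} {day} {month} {day-of-week}), so "0 0 18 * * *" fires daily at
# 18:00 UTC. run_on_startup=True also triggers a run whenever the host starts.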
@app.schedule(schedule="0 0 18 * * * ", arg_name="myTimer", run_on_startup=True,
use_monitor=False)
def monitor_resources(myTimer: func.TimerRequest) -> None:
process_instances = ProcessInstances()
long_running_resources = process_instances.get_long_running_instances()
slack_message_block = process_instances.organize_message_to_send_slack(long_running_resources)
slack_operations = SlackOperations()
threadts = slack_operations.create_thread(cloud_name='Azure', account_name='PerfScale')
slack_operations.post_message_blocks_in_thread(message_blocks=slack_message_block, thread_ts=threadts)
15 changes: 15 additions & 0 deletions cloudsensei/azure/host.json
@@ -0,0 +1,15 @@
{
"version": "2.0",
"logging": {
"applicationInsights": {
"samplingSettings": {
"isEnabled": true,
"excludedTypes": "Request"
}
}
},
"extensionBundle": {
"id": "Microsoft.Azure.Functions.ExtensionBundle",
"version": "[4.*, 5.0.0)"
}
}
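
function_app.py reads SUBSCRIPTION_ID (required) and RESOURCE_DAYS (optional, defaulting to 7) from the environment. For local development with Azure Functions Core Tools, a local.settings.json along these lines would supply them; the values shown are placeholders, and the file is git-ignored above:

{
  "IsEncrypted": false,
  "Values": {
    "AzureWebJobsStorage": "UseDevelopmentStorage=true",
    "FUNCTIONS_WORKER_RUNTIME": "python",
    "SUBSCRIPTION_ID": "<your-subscription-id>",
    "RESOURCE_DAYS": "7"
  }
}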
7 changes: 7 additions & 0 deletions cloudsensei/azure/requirements.txt
@@ -0,0 +1,7 @@
# DO NOT include azure-functions-worker in this file
# The Python Worker is managed by Azure Functions platform
# Manually managing azure-functions-worker may cause unexpected issues

azure-functions
azure-identity
azure-mgmt-compute
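
With the Azure Functions Core Tools installed, running pip install -r requirements.txt followed by func start inside cloudsensei/azure executes the timer function locally; because run_on_startup=True, one run fires immediately, which is handy for testing. DefaultAzureCredential resolves credentials from the environment (for example, a managed identity in Azure or az login locally).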
(The seventh changed file did not load on this page.)
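function_app.py imports SlackOperations from slack_operations, so the unloaded file is presumably cloudsensei/azure/slack_operations.py. A minimal sketch of what it might contain, inferred only from the two calls made in function_app.py and assuming slack_sdk; the class body, the token handling, and the SLACK_API_TOKEN and SLACK_CHANNEL variable names are all assumptions, not the actual implementation:

import os

from slack_sdk import WebClient


class SlackOperations:

    def __init__(self):
        # Hypothetical environment variables; the real names are not shown in this commit
        self.__client = WebClient(token=os.environ['SLACK_API_TOKEN'])
        self.__channel = os.environ['SLACK_CHANNEL']

    def create_thread(self, cloud_name: str, account_name: str):
        """Post a parent message and return its timestamp for threading."""
        response = self.__client.chat_postMessage(
            channel=self.__channel,
            text=f'{cloud_name}: long running resources report for {account_name}')
        return response['ts']

    def post_message_blocks_in_thread(self, message_blocks: list, thread_ts: str):
        """Post each group of blocks as a reply in the thread."""
        for blocks in message_blocks:
            self.__client.chat_postMessage(channel=self.__channel,
                                           blocks=blocks, thread_ts=thread_ts)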