Azure: monitor long running resources (#826)

athiruma authored Sep 2, 2024
1 parent 6e44ad0 commit 7a9d70a
Showing 7 changed files with 427 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -217,3 +217,4 @@ empty_test_environment_variables.py
/cloud_governance/main/.test_env
/cloud_governance/policy/send_mail.py
cloudsensei/.env.txt
.vscode
8 changes: 8 additions & 0 deletions cloudsensei/azure/.funcignore
@@ -0,0 +1,8 @@
.git*
.vscode
__azurite_db*__.json
__blobstorage__
__queuestorage__
local.settings.json
test
.venv
135 changes: 135 additions & 0 deletions cloudsensei/azure/.gitignore
@@ -0,0 +1,135 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don’t work, or not
# install all needed dependencies.
#Pipfile.lock

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# Azure Functions artifacts
bin
obj
appsettings.json
local.settings.json

# Azurite artifacts
__blobstorage__
__queuestorage__
__azurite_db*__.json
.python_packages
194 changes: 194 additions & 0 deletions cloudsensei/azure/function_app.py
@@ -0,0 +1,194 @@
import logging
import os
from datetime import datetime, timezone

import azure.functions as func
from azure.mgmt.compute import ComputeManagementClient
from azure.identity import DefaultAzureCredential

from slack_operations import SlackOperations

app = func.FunctionApp()


class AzureComputeOperations:

def __init__(self) -> None:
self.credential = DefaultAzureCredential()
self.subscription_id = os.environ['SUBSCRIPTION_ID']
self.client = ComputeManagementClient(credential=self.credential,
subscription_id=self.subscription_id)

def list_instances(self):
"""
This method returns a list of all instances.
:return:
"""
resources = self.client.virtual_machines.list_all()
return self._item_paged_iterator(resources)

def get_id_dict_data(self, resource_id: str):
"""
        This method parses the resource id into a dictionary of key/value pairs
:param resource_id:
:type resource_id:
:return:
:rtype:
"""
pairs = resource_id.split('/')[1:]
key_pairs = {pairs[i].lower(): pairs[i + 1] for i in range(0, len(pairs), 2)}
return key_pairs

def _item_paged_iterator(self, item_paged_object, as_dict: bool = False):
"""
        This method iterates the paged object and returns the items as a list
:param item_paged_object:
:return:
"""
iterator_list = []
try:
            page_item = next(item_paged_object)
while page_item:
if as_dict:
iterator_list.append(page_item.as_dict())
else:
iterator_list.append(page_item)
                page_item = next(item_paged_object)
except StopIteration:
pass
return iterator_list

def get_instance_statuses(self, resource_group_name: str, vm_name: str) -> dict:
"""
This method returns the virtual machine instance status
:param vm_name:
:type vm_name:
:param resource_group_name:
:type resource_group_name:
:return:
:rtype:
"""
virtual_machine = self.client.virtual_machines.instance_view(resource_group_name=resource_group_name,
vm_name=vm_name)
return virtual_machine.as_dict()

def _get_instance_status(self, resource_group_name: str, vm_name: str):
"""
This method returns the VM status of the Virtual Machine
:param resource_group_name:
:type resource_group_name:
:param vm_name:
:type vm_name:
:return:
:rtype:
"""
instance_statuses = self.get_instance_statuses(resource_group_name=resource_group_name, vm_name=vm_name)
        statuses = instance_statuses.get('statuses', [])
if len(statuses) >= 2:
status = statuses[1].get('display_status', '').lower()
elif len(statuses) == 1:
status = statuses[0].get('display_status', '').lower()
else:
            status = 'unknown status'
return status


class ProcessInstances:
SLACK_ITEM_SIZE = 50

def __init__(self):
self.__azure_operations = AzureComputeOperations()
self.__resource_days = int(os.environ.get('RESOURCE_DAYS', 7))

def get_long_running_instances(self):
"""
This method returns a list of long-running instances.
:return:
"""
instances_list = self.__azure_operations.list_instances()
long_running_instances = []
current_datetime = datetime.now(timezone.utc)
for instance in instances_list:
running_days = (current_datetime - instance.time_created).days
if running_days >= self.__resource_days:
id_dict = self.__azure_operations.get_id_dict_data(instance.id)
resource_group = id_dict.get("resourcegroups")

instance_resource = {
'name': instance.name,
'resource_group': resource_group,
'time_created': instance.time_created.strftime('%Y-%m-%d %H:%M:%S'),
'region': instance.location,
'instance_type': instance.hardware_profile.vm_size,
'status': self.__azure_operations._get_instance_status(resource_group, instance.name)
}
long_running_instances.append(instance_resource)
long_running_instances.sort(key=lambda x: (x['region'], x['resource_group']))
return long_running_instances

def organize_message_to_send_slack(self, resources_list: list):
"""
        This method builds the message blocks to send to Slack
:param resources_list:
:return:
"""

divider = {"type": "divider"}
rows = []
keys = []
for resource in resources_list:
if not keys:
keys = list(resources_list[0].keys())
rows.append({
"type": "section",
"fields": [
{"type": "mrkdwn", "text": f"{value}"}
for key, value in resource.items()]}
)
rows.append(divider)
        item_blocks = [rows[i:i + self.SLACK_ITEM_SIZE] for i in
                       range(0, len(rows), self.SLACK_ITEM_SIZE)]  # split into chunks because Slack allows at most 50 blocks per message
slack_message_block = [[{
"type": "rich_text",
"elements": [
{
"type": "rich_text_section",
"elements": [
{
"type": "text",
"text": "Please look at the following instances and take an respective action",
"style": {
"bold": True
},
}
]
}
]
}], [{
'type': 'section',
'fields': [
{"type": "mrkdwn", "text": f"{item}"} for item in keys
]
}]]
if not item_blocks:
slack_message_block.append([{
"type": "section",
"text": {
"type": "mrkdwn",
"text": "No long running instances"
}
}])
for block in item_blocks:
slack_message_block.append(block)
return slack_message_block


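# The schedule below uses the six-field NCRONTAB format ({second} {minute}
# {hour} {day} {month} {day-of-week}), so "0 0 18 * * *" fires daily at
# 18:00 UTC. run_on_startup=True also triggers a run whenever the host starts.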
@app.schedule(schedule="0 0 18 * * * ", arg_name="myTimer", run_on_startup=True,
use_monitor=False)
def monitor_resources(myTimer: func.TimerRequest) -> None:
process_instances = ProcessInstances()
long_running_resources = process_instances.get_long_running_instances()
slack_message_block = process_instances.organize_message_to_send_slack(long_running_resources)
slack_operations = SlackOperations()
threadts = slack_operations.create_thread(cloud_name='Azure', account_name='PerfScale')
slack_operations.post_message_blocks_in_thread(message_blocks=slack_message_block, thread_ts=threadts)
15 changes: 15 additions & 0 deletions cloudsensei/azure/host.json
@@ -0,0 +1,15 @@
{
"version": "2.0",
"logging": {
"applicationInsights": {
"samplingSettings": {
"isEnabled": true,
"excludedTypes": "Request"
}
}
},
"extensionBundle": {
"id": "Microsoft.Azure.Functions.ExtensionBundle",
"version": "[4.*, 5.0.0)"
}
}
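
function_app.py reads SUBSCRIPTION_ID (required) and RESOURCE_DAYS (optional, defaulting to 7) from the environment. For local development with Azure Functions Core Tools, a local.settings.json along these lines would supply them; the values shown are placeholders, and the file is git-ignored above:

{
  "IsEncrypted": false,
  "Values": {
    "AzureWebJobsStorage": "UseDevelopmentStorage=true",
    "FUNCTIONS_WORKER_RUNTIME": "python",
    "SUBSCRIPTION_ID": "<your-subscription-id>",
    "RESOURCE_DAYS": "7"
  }
}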
7 changes: 7 additions & 0 deletions cloudsensei/azure/requirements.txt
@@ -0,0 +1,7 @@
# DO NOT include azure-functions-worker in this file
# The Python Worker is managed by Azure Functions platform
# Manually managing azure-functions-worker may cause unexpected issues

azure-functions
azure-identity
azure-mgmt-compute
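
With the Azure Functions Core Tools installed, running pip install -r requirements.txt followed by func start inside cloudsensei/azure executes the timer function locally; because run_on_startup=True, one run fires immediately, which is handy for testing. DefaultAzureCredential resolves credentials from the environment (for example, a managed identity in Azure or az login locally).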
(The seventh changed file did not load on this page.)
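function_app.py imports SlackOperations from slack_operations, so the unloaded file is presumably cloudsensei/azure/slack_operations.py. A minimal sketch of what it might contain, inferred only from the two calls made in function_app.py and assuming slack_sdk; the class body, the token handling, and the SLACK_API_TOKEN and SLACK_CHANNEL variable names are all assumptions, not the actual implementation:

import os

from slack_sdk import WebClient


class SlackOperations:

    def __init__(self):
        # Hypothetical environment variables; the real names are not shown in this commit
        self.__client = WebClient(token=os.environ['SLACK_API_TOKEN'])
        self.__channel = os.environ['SLACK_CHANNEL']

    def create_thread(self, cloud_name: str, account_name: str):
        """Post a parent message and return its timestamp for threading."""
        response = self.__client.chat_postMessage(
            channel=self.__channel,
            text=f'{cloud_name}: long running resources report for {account_name}')
        return response['ts']

    def post_message_blocks_in_thread(self, message_blocks: list, thread_ts: str):
        """Post each group of blocks as a reply in the thread."""
        for blocks in message_blocks:
            self.__client.chat_postMessage(channel=self.__channel,
                                           blocks=blocks, thread_ts=thread_ts)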