Skip to content

Commit

Permalink
Add chart status and bug fixes (#5)
Browse files Browse the repository at this point in the history
* test(notification): strict checks for required keys

* test: add edge cases

* fix(service): not creating intervalTask when creating service

* test: a simple go test server to check if notification can be reached (need to replace with pytest mocker)

* feat: add endpoint for UptimeRecord statistics

* test: UptimeRecord based on service ID

* feat: chart data API

* feat: support defining an interval for the charting API

* fix: keep sending notification after a service is recovered. Now limited sending to 3 times.

* fix: return UTC time for chartAPI instead of local time

* chores: ignore pycache

* chores: add source of readme chart
  • Loading branch information
AnsonDev42 authored Apr 10, 2024
1 parent ce70540 commit 830f8a0
Show file tree
Hide file tree
Showing 14 changed files with 3,563 additions and 35 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
/django_debug.log
**/__pycache__/
103 changes: 88 additions & 15 deletions apps/monitoring/statistics.py
Original file line number Diff line number Diff line change
@@ -1,42 +1,115 @@
from datetime import timedelta

from django.db.models import Avg
from django.utils import timezone
from django.utils.timezone import now

from apps.monitoring.models import UptimeRecord


# Supported look-back windows for statistics queries, keyed by number of hours.
# -1 is a sentinel meaning "All time" (no time filter).
QUERY_TIME_RANGE_TYPE = {
    1: "Last 1 hour",
    3: "Last 3 hours",
    6: "Last 6 hours",
    24: "Last 24 hours",
    168: "Last 7 days",
    720: "Last 30 days",
    -1: "All time",
}


def calculate_past_summary(time_range=None):
    """
    Summarise all UptimeRecord rows created within the last ``time_range`` hours.

    :param time_range: look-back window in hours; must be a key of
        QUERY_TIME_RANGE_TYPE (e.g. 1, 24, 720, or -1 for "All time").
    :return: tuple ``(total_records, uptime_percentage, average_response_time)``.
        Returns ``(None, None, None)`` when ``time_range`` is missing or
        invalid, so callers that unpack three values never crash (the old code
        returned a single ``None`` here, breaking tuple unpacking).
    """
    if not time_range or time_range not in QUERY_TIME_RANGE_TYPE:
        # Keep the 3-tuple shape expected by callers even on invalid input.
        return None, None, None

    if time_range == -1:
        # -1 means "All time": no time filter. (Previously this produced
        # created_at >= now() + 1 hour, which matched nothing.)
        results = UptimeRecord.objects.all()
    else:
        results = UptimeRecord.objects.filter(
            created_at__gte=now() - timedelta(hours=time_range)
        )

    total_records = results.count()
    up_records = results.filter(status=True).count()
    # Average latency over successful checks only; failed checks carry no
    # meaningful response time. Falls back to 0 when there are no up records.
    average_response_time = (
        results.filter(status=True).aggregate(Avg("response_time"))[
            "response_time__avg"
        ]
        or 0
    )

    uptime_percentage = (up_records / total_records) * 100 if total_records else 0

    return total_records, uptime_percentage, average_response_time


def calculate_past_chart(time_range, split_interval):
    """
    Build chart data for the last ``time_range`` hours, split into
    ``split_interval`` equal buckets.

    :param time_range: look-back window in hours; must be a key of
        QUERY_TIME_RANGE_TYPE.
    :param split_interval: number of buckets to split the window into;
        values below 1 are clamped to 1.
    :return: dict with a ``summary`` entry (overall totals for the window)
        and a ``data`` list holding one entry per bucket with its uptime
        percentage, average response time and a formatted time label.
    :raises KeyError: if ``time_range`` is not a recognised window.
    """
    if not time_range or time_range not in QUERY_TIME_RANGE_TYPE:
        # Fixed: the exception was previously *returned*, so callers would
        # serialise a KeyError object as chart data instead of failing.
        raise KeyError("Invalid time range")
    # NOTE(review): time_range == -1 ("All time") passes validation but yields
    # a negative bucket width below — confirm whether the chart endpoint is
    # meant to accept it.
    if split_interval < 1:
        split_interval = 1

    delta = timedelta(hours=time_range / split_interval)
    window_start = now() - timedelta(hours=time_range)

    start_time = window_start
    total_records, total_up_records = 0, 0
    bucket_entries = []
    bucket_averages = []

    for _ in range(split_interval):
        end_time = start_time + delta
        results = UptimeRecord.objects.filter(
            created_at__gte=start_time, created_at__lt=end_time
        )
        interval_total_records = results.count()
        interval_up_records = results.filter(status=True).count()
        # Average latency over successful checks only; empty buckets report 0.
        average_response_time = (
            results.filter(status=True).aggregate(Avg("response_time"))[
                "response_time__avg"
            ]
            or 0
        )
        bucket_entries.append(
            {
                "uptime_percentage": (interval_up_records / interval_total_records)
                * 100
                if interval_total_records
                else 0,
                "average_response_time": average_response_time,
                # Key kept as "time_start" for API compatibility, although the
                # label shown is the bucket's *end* boundary.
                "time_start": timezone.localtime(end_time).strftime("%b. %-d, %H:%M"),
            }
        )
        total_records += interval_total_records
        total_up_records += interval_up_records
        bucket_averages.append(average_response_time)
        start_time = end_time

    # NOTE(review): this is an unweighted mean of bucket means (empty buckets
    # count as 0), not a record-weighted average — kept for output
    # compatibility; confirm whether a weighted average is wanted.
    total_avg_response_time = sum(bucket_averages) / len(bucket_averages)
    uptime_percentage = (total_up_records / total_records) * 100 if total_records else 0

    summary = {
        "time_range": time_range,
        "total_records": total_records,
        "uptime_percentage": uptime_percentage,
        "average_response_time": total_avg_response_time,
        # Fixed: time_start previously also used now(), labelling the whole
        # window as zero-length.
        "time_start": timezone.localtime(window_start).strftime("%b. %-d, %H:%M"),
        "time_end": timezone.localtime(now()).strftime("%b. %-d, %H:%M"),
    }
    return {
        "summary": summary,
        "data": bucket_entries,
    }
18 changes: 14 additions & 4 deletions apps/monitoring/tasks.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from celery import shared_task
from .models import Service, UptimeRecord
from apps.monitoring.models import Service, UptimeRecord
from apps.notification.models import NotificationChannel, NotificationLog
from .utils import check_service_status
from apps.monitoring.utils import check_service_status


# logger = get_task_logger(__name__)
Expand Down Expand Up @@ -29,20 +29,30 @@ def check_monitor_services_status(service_id=None):
error_message=error_message,
service=service,
)
# breakpoint()
if not Service.objects.filter(id=service_id).exists():
return
if not is_up:
message = f"Service {service.name} is down."
channels = NotificationChannel.objects.all() # Example: Notify all channels
channels = Service.objects.get(id=service_id).notification_channel.all()
for channel in channels:
was_success = channel.send_notification(service, message)
NotificationLog.objects.create(
service=service, message=message, was_success=was_success
)
else:
# check the last three records are up or not, if all up, do not send notification
records = UptimeRecord.objects.filter(service=service).order_by("-check_at")[:3]
if len(records) == 3 and all(record.status for record in records):
return
message = f"Service {service.name} is up."
channels = NotificationChannel.objects.all() # Example: Notify all channels
for channel in channels:
was_success = channel.send_notification(service, message)
NotificationLog.objects.create(
service=service, message=message, was_success=was_success
)


# Manual smoke test: run one status check for service id 1 when this module
# is executed directly (outside Celery).
if __name__ == "__main__":
    check_monitor_services_status(service_id=1)
    print("check_monitor_services_status()")
45 changes: 43 additions & 2 deletions apps/monitoring/views.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,19 @@
from rest_framework import viewsets
from django_celery_beat.models import IntervalSchedule, PeriodicTask
from rest_framework.decorators import action
from rest_framework.response import Response

from .models import UptimeRecord
from .serializers import (
from apps.monitoring.models import UptimeRecord
from apps.monitoring.serializers import (
IntervalScheduleSerializer,
PeriodicTaskSerializer,
UptimeRecordSerializer,
)
from apps.monitoring.statistics import (
QUERY_TIME_RANGE_TYPE,
calculate_past_summary,
calculate_past_chart,
)


class IntervalScheduleViewSet(viewsets.ModelViewSet):
Expand All @@ -22,3 +29,37 @@ class PeriodicTaskViewSet(viewsets.ModelViewSet):
class UptimeRecordViewSet(viewsets.ModelViewSet):
    """CRUD for UptimeRecord plus read-only statistics/chart endpoints."""

    queryset = UptimeRecord.objects.all()
    serializer_class = UptimeRecordSerializer

    @staticmethod
    def _int_param(request, name, default):
        """Parse an integer query parameter; return None on non-numeric input."""
        try:
            return int(request.query_params.get(name, default))
        except (TypeError, ValueError):
            return None

    @action(detail=False, methods=["get"])
    def stats(self, request):
        """Return total/uptime/latency summary for ?time_range= hours."""
        # Fixed: a non-numeric time_range previously made int() raise
        # ValueError, producing an HTTP 500 instead of a 400.
        time_range = self._int_param(request, "time_range", 1)
        if time_range not in QUERY_TIME_RANGE_TYPE:
            # Also covers non-numeric input (None from _int_param).
            return Response({"error": "Invalid time range"}, status=400)
        (
            total_records,
            uptime_percentage,
            average_response_time,
        ) = calculate_past_summary(time_range=time_range)

        data = {
            "total_records": total_records,
            "uptime_percentage": uptime_percentage,
            "average_response_time": average_response_time,
        }
        return Response(data)

    @action(detail=False, methods=["get"])
    def chart(self, request):
        """Return bucketed chart data for ?time_range= and ?split_interval=."""
        time_range = self._int_param(request, "time_range", 1)
        split_interval = self._int_param(request, "split_interval", 6)
        if time_range not in QUERY_TIME_RANGE_TYPE or split_interval is None:
            return Response({"error": "Invalid time range"}, status=400)

        data = calculate_past_chart(
            time_range=time_range, split_interval=split_interval
        )
        return Response(data)
38 changes: 37 additions & 1 deletion apps/notification/serializers.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,15 @@
from pydantic_core import ValidationError
from rest_framework import serializers

from apps.notification.models import NotificationChannel, NotificationType
from apps.notification.notify_services.bark import Bark
from apps.notification.notify_services.telegram import Telegram


class NotificationChannelSerializer(serializers.HyperlinkedModelSerializer):
type = serializers.ChoiceField(choices=NotificationType.choices)
url = serializers.URLField(required=False)

class Meta:
model = NotificationChannel
fields = (
Expand All @@ -13,4 +19,34 @@ class Meta:
"type",
"url",
) # Explicitly include 'id' and other fields you need
type = serializers.ChoiceField(choices=NotificationType.choices)

def validate(self, attrs):
details = attrs.get("details")
channel_type = attrs.get("type")
match channel_type:
case NotificationType.TELEGRAM:
pydantic_model = Telegram
case NotificationType.BARK:
pydantic_model = Bark
case _:
pydantic_model = None

if pydantic_model:
try:
# Validates the details using the Pydantic model
pydantic_model(**details)
except ValidationError as e:
raise serializers.ValidationError({"details": e.errors()})

return attrs

def create(self, validated_data):
    """Persist and return a new NotificationChannel built from validated data."""
    return NotificationChannel.objects.create(**validated_data)

def update(self, instance, validated_data):
    """Apply any provided name/details/type changes to *instance*, save, and return it."""
    for field in ("name", "details", "type"):
        # Fall back to the current value when the field was not supplied.
        setattr(instance, field, validated_data.get(field, getattr(instance, field)))
    instance.save()
    return instance
33 changes: 21 additions & 12 deletions apps/service/serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ class ServiceSerializer(serializers.HyperlinkedModelSerializer):
class Meta:
model = Service
fields = (
"id",
"name",
"description",
"monitoring_endpoint",
Expand All @@ -67,23 +68,31 @@ class Meta:
def create(self, validated_data):
    """
    Create a Service together with its IntervalSchedule and PeriodicTask.

    :param validated_data: serializer-validated payload containing nested
        ``periodic_task_data`` (with an ``interval``) and optional
        ``notification_channel`` entries.
    :return: the newly created Service with its PeriodicTask bound.
    :raises ValueError: when ``periodic_task_data`` or its nested interval is
        missing or invalid.
    """
    periodic_task_data = validated_data.pop("periodic_task_data", None)
    notification_channels_data = validated_data.pop("notification_channel", [])
    # Fixed: popping "interval" from None crashed with AttributeError before
    # the explicit missing-data check below could run.
    interval_data = (
        periodic_task_data.pop("interval", None) if periodic_task_data else None
    )

    # Validate the scheduling payload *before* creating the Service so an
    # invalid request does not leave an orphaned Service row behind.
    if not periodic_task_data or not interval_data:
        raise ValueError(
            "Invalid or missing 'periodic_task_data' for Service creation."
        )

    service = Service.objects.create(**validated_data)

    if notification_channels_data:
        service.notification_channel.set(notification_channels_data)

    # Create the interval schedule (raise_exception=True raises on failure).
    interval_serializer = IntervalScheduleSerializer(data=interval_data)
    if not interval_serializer.is_valid(raise_exception=True):
        raise ValueError("Invalid 'interval' data for PeriodicTask creation.")
    # The periodic task must target this service; overwrite any caller kwargs.
    periodic_task_data["kwargs"] = json.dumps({"service_id": service.id})
    periodic_task_data["interval"] = interval_data
    periodic_task_serializer = PeriodicTaskSerializer(data=periodic_task_data)
    if periodic_task_serializer.is_valid(raise_exception=True):
        periodic_task = periodic_task_serializer.save()
        # Bind the PeriodicTask to the Service.
        service.periodic_task = periodic_task
        service.save()

    return service

Expand Down
Loading

0 comments on commit 830f8a0

Please sign in to comment.