diff --git a/.gitignore b/.gitignore index a11ff3a1a..029ca0a9f 100755 --- a/.gitignore +++ b/.gitignore @@ -57,6 +57,7 @@ docs/_build/ target/ Python.gitignore venv/ +.venv # Notepad++ backups # *.bak diff --git a/Dockerfile b/Dockerfile index e4baa1aa7..af2ebdb38 100644 --- a/Dockerfile +++ b/Dockerfile @@ -44,9 +44,11 @@ RUN set -eux; \ rm -rf /var/lib/apt/lists/* COPY app/requirements /app/requirements +COPY app/signals/apps/classification/requirements.txt /app/signals/apps/classification/requirements.txt RUN set -eux; \ pip install --no-cache -r /app/requirements/requirements.txt; \ + pip install --no-cache -r /app/signals/apps/classification/requirements.txt; \ pip install --no-cache tox; \ chgrp signals /app; \ chmod g+w /app; \ diff --git a/app/requirements/requirements.txt b/app/requirements/requirements.txt index 4ec4ba9a8..086a6365f 100644 --- a/app/requirements/requirements.txt +++ b/app/requirements/requirements.txt @@ -72,7 +72,7 @@ click-repl==0.3.0 # via celery cron-descriptor==1.4.3 # via django-celery-beat -cryptography==43.0.1 +cryptography==43.0.0 # via # azure-storage-blob # josepy diff --git a/app/requirements/requirements_dev.txt b/app/requirements/requirements_dev.txt index ba1031a22..361a86730 100644 --- a/app/requirements/requirements_dev.txt +++ b/app/requirements/requirements_dev.txt @@ -119,7 +119,7 @@ cron-descriptor==1.4.3 # via # -r requirements_test.txt # django-celery-beat -cryptography==43.0.1 +cryptography==43.0.0 # via # -r requirements_test.txt # azure-storage-blob diff --git a/app/requirements/requirements_test.txt b/app/requirements/requirements_test.txt index 4d5f8a49e..786f9cb7a 100644 --- a/app/requirements/requirements_test.txt +++ b/app/requirements/requirements_test.txt @@ -111,7 +111,7 @@ cron-descriptor==1.4.3 # via # -r requirements.txt # django-celery-beat -cryptography==43.0.1 +cryptography==43.0.0 # via # -r requirements.txt # azure-storage-blob diff --git a/app/signals/apps/api/urls.py 
class TrainingSetAdmin(admin.ModelAdmin):
    """
    Admin for uploaded training sets.

    Offers one action that validates each selected spreadsheet and queues a training job for every
    set that passes validation.
    """
    list_display = ('name', 'file', )
    actions = ["run_training_with_training_set"]

    # Header names that must appear in the first row of the active sheet.
    REQUIRED_COLUMNS = ("Main", "Sub", "Text")

    def _validate_training_set(self, training_set):
        """
        Check that the spreadsheet of `training_set` can be used for training.

        :param training_set: the TrainingSet whose `file` is inspected
        :returns: an error message for the admin user, or None when the file is usable
        """
        # Imported locally so this block does not depend on the module-level import list.
        from zipfile import BadZipFile

        from openpyxl.utils.exceptions import InvalidFileException

        try:
            # read_only streams rows instead of building the whole cell model in memory.
            workbook = openpyxl.load_workbook(training_set.file, read_only=True)
        except (InvalidFileException, BadZipFile, OSError, KeyError, ValueError) as exc:
            training_set.file.close()
            return f"Training set { training_set.name } could not be read: {exc}"

        try:
            rows = workbook.active.iter_rows(values_only=True)

            # The first row holds the headers; an empty sheet yields no rows at all.
            headers = next(rows, ())
            missing_columns = [col for col in self.REQUIRED_COLUMNS if col not in headers]
            if missing_columns:
                return (
                    f"Training set { training_set.name } is missing required columns: "
                    f"{', '.join(missing_columns)}"
                )

            # A row counts as data only when at least one cell has a value; a tuple of
            # None values is still truthy, so the cells themselves must be inspected.
            if not any(any(cell is not None for cell in row) for row in rows):
                return f"The training set { training_set.name } does not contain any data rows."
        finally:
            # A read-only workbook keeps its archive open until closed explicitly.
            workbook.close()
            training_set.file.close()

        return None

    @admin.action(description="Train model met geselecteerde dataset")
    def run_training_with_training_set(self, request, queryset):
        """
        Validate every selected training set and queue training for the valid ones.

        Each invalid set gets its own error message and is skipped; the remaining sets are still
        processed. The success message is only shown when at least one job was queued.
        """
        queued = 0

        for training_set in queryset:
            error = self._validate_training_set(training_set)
            if error:
                self.message_user(request, error, messages.ERROR)
                continue

            train_classifier.delay(training_set.id)
            queued += 1

        if queued:
            self.message_user(
                request,
                "Training of the model has been initiated.",
                messages.SUCCESS,
            )


class ClassifierAdmin(admin.ModelAdmin):
    """
    Admin for trained classifiers.

    Adding and changing classifiers by hand is disabled; a training job creates its own classifier
    object. The only mutation offered here is choosing which classifier is active.
    """
    list_display = ('name', 'precision', 'recall', 'accuracy', 'is_active', )
    actions = ["activate_classifier"]
    readonly_fields = ('training_status', 'training_error', )

    @admin.action(description="Maak deze classifier actief")
    def activate_classifier(self, request, queryset):
        """
        Make the single selected classifier active and deactivate all others.

        Refuses to activate a classifier whose training did not complete or that has no stored
        models. Both updates run in one transaction so a failure cannot leave the system without
        an active classifier.
        """
        # Imported locally so this block does not depend on the module-level import list.
        from django.db import DatabaseError, transaction

        # Two rows are enough to tell "none", "exactly one" and "more than one" apart.
        selected = list(queryset[:2])

        if len(selected) > 1:
            self.message_user(
                request,
                "You can only make one classifier active.",
                messages.ERROR
            )
            return

        if not selected:
            return

        classifier = selected[0]

        if classifier.training_status != 'COMPLETED' or not classifier.main_model or not classifier.sub_model:
            self.message_user(
                request,
                f"Classifier { classifier.name } has not finished training successfully "
                f"and cannot be activated.",
                messages.ERROR
            )
            return

        try:
            with transaction.atomic():
                Classifier.objects.exclude(pk=classifier.pk).update(is_active=False)
                Classifier.objects.filter(pk=classifier.pk).update(is_active=True)
        except DatabaseError:
            self.message_user(
                request,
                f"Classifier { classifier.name } has not been activated.",
                messages.ERROR
            )
            return

        self.message_user(
            request,
            f"Classifier { classifier.name } has been activated.",
            messages.SUCCESS
        )

    def get_readonly_fields(self, request, obj=None):
        """Return every model field as read-only for an existing object, none otherwise."""
        if obj:
            return [f.name for f in self.model._meta.fields]
        return []

    def has_add_permission(self, request):
        """Classifiers are created by training jobs only."""
        return False

    def has_change_permission(self, request, obj=None):
        """Classifiers cannot be edited by hand."""
        return False

    def has_delete_permission(self, request, obj=None):
        # NOTE(review): returning True unconditionally skips the per-user permission check of
        # ModelAdmin - confirm that every admin user is meant to be able to delete classifiers.
        return True
class Command(BaseCommand):
    """Management command that trains a classifier from one stored training set."""

    help = "Train specific model"

    def add_arguments(self, parser):
        """Register the single positional argument: the primary key of the training set."""
        parser.add_argument("training_set_id", type=int)

    def handle(self, *args, **options):
        """
        Look up the requested training set and start training for it.

        Raises CommandError when no training set with the given primary key exists.
        """
        requested_pk = options["training_set_id"]

        try:
            selected = TrainingSet.objects.get(pk=requested_pk)
        except TrainingSet.DoesNotExist:
            raise CommandError('Training Set "%s" does not exist' % requested_pk)
        else:
            # Invoked as a plain call (no .delay()), so the command waits for the result.
            train_classifier(selected.id)
signals.apps.services.validator.file.MimeTypeAllowedValidator( + signals.apps.services.domain.mimetypes.MimeTypeFromContentResolverFactory(), + 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet' + ), + signals.apps.services.validator.file.MimeTypeIntegrityValidator( + signals.apps.services.domain.mimetypes.MimeTypeFromContentResolverFactory(), + signals.apps.services.domain.mimetypes.MimeTypeFromFilenameResolverFactory() + ), + signals.apps.services.validator.file.ContentIntegrityValidator( + signals.apps.services.domain.mimetypes.MimeTypeFromContentResolverFactory(), + signals.apps.services.domain.checker_factories.ContentCheckerFactory() + ), + signals.apps.services.validator.file.FileSizeValidator(20971520) + ])), + ], + ), + ] diff --git a/app/signals/apps/classification/migrations/0002_classifier.py b/app/signals/apps/classification/migrations/0002_classifier.py new file mode 100644 index 000000000..5a9d1ef8f --- /dev/null +++ b/app/signals/apps/classification/migrations/0002_classifier.py @@ -0,0 +1,23 @@ +# Generated by Django 4.2.11 on 2024-09-17 11:09 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('classification', '0001_initial'), + ] + + operations = [ + migrations.CreateModel( + name='Classifier', + fields=[ + ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('created_at', models.DateTimeField(auto_now_add=True)), + ('name', models.CharField(max_length=255)), + ('middle_model', models.FileField(max_length=255, upload_to='classification_models/middle/%Y/%m/%d/')), + ('middle_sub_model', models.FileField(max_length=255, upload_to='classification_models/middle_sub/%Y/%m/%d/')), + ], + ), + ] diff --git a/app/signals/apps/classification/migrations/0003_classifier_accuracy_classifier_precision_and_more.py b/app/signals/apps/classification/migrations/0003_classifier_accuracy_classifier_precision_and_more.py new file mode 100644 
index 000000000..304a0cf2a --- /dev/null +++ b/app/signals/apps/classification/migrations/0003_classifier_accuracy_classifier_precision_and_more.py @@ -0,0 +1,28 @@ +# Generated by Django 4.2.11 on 2024-09-17 11:58 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('classification', '0002_classifier'), + ] + + operations = [ + migrations.AddField( + model_name='classifier', + name='accuracy', + field=models.FloatField(default=0), + ), + migrations.AddField( + model_name='classifier', + name='precision', + field=models.FloatField(default=0), + ), + migrations.AddField( + model_name='classifier', + name='recall', + field=models.FloatField(default=0), + ), + ] diff --git a/app/signals/apps/classification/migrations/0004_remove_classifier_middle_model_and_more.py b/app/signals/apps/classification/migrations/0004_remove_classifier_middle_model_and_more.py new file mode 100644 index 000000000..40af53db3 --- /dev/null +++ b/app/signals/apps/classification/migrations/0004_remove_classifier_middle_model_and_more.py @@ -0,0 +1,46 @@ +# Generated by Django 4.2.11 on 2024-09-19 08:07 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('classification', '0003_classifier_accuracy_classifier_precision_and_more'), + ] + + operations = [ + migrations.RemoveField( + model_name='classifier', + name='middle_model', + ), + migrations.RemoveField( + model_name='classifier', + name='middle_sub_model', + ), + migrations.AddField( + model_name='classifier', + name='main_model', + field=models.FileField(blank=True, max_length=255, null=True, upload_to='classification_models/middle/%Y/%m/%d/'), + ), + migrations.AddField( + model_name='classifier', + name='sub_model', + field=models.FileField(blank=True, max_length=255, null=True, upload_to='classification_models/middle_sub/%Y/%m/%d/'), + ), + migrations.AlterField( + model_name='classifier', + name='accuracy', + 
field=models.FloatField(blank=True, default=0, null=True), + ), + migrations.AlterField( + model_name='classifier', + name='precision', + field=models.FloatField(blank=True, default=0, null=True), + ), + migrations.AlterField( + model_name='classifier', + name='recall', + field=models.FloatField(blank=True, default=0, null=True), + ), + ] diff --git a/app/signals/apps/classification/migrations/0005_classifier_is_active_alter_classifier_main_model_and_more.py b/app/signals/apps/classification/migrations/0005_classifier_is_active_alter_classifier_main_model_and_more.py new file mode 100644 index 000000000..36079754b --- /dev/null +++ b/app/signals/apps/classification/migrations/0005_classifier_is_active_alter_classifier_main_model_and_more.py @@ -0,0 +1,28 @@ +# Generated by Django 4.2.15 on 2024-09-27 10:25 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('classification', '0004_remove_classifier_middle_model_and_more'), + ] + + operations = [ + migrations.AddField( + model_name='classifier', + name='is_active', + field=models.BooleanField(default=False), + ), + migrations.AlterField( + model_name='classifier', + name='main_model', + field=models.FileField(blank=True, max_length=255, null=True, upload_to='classification_models/main/%Y/%m/%d/'), + ), + migrations.AlterField( + model_name='classifier', + name='sub_model', + field=models.FileField(blank=True, max_length=255, null=True, upload_to='classification_models/main_sub/%Y/%m/%d/'), + ), + ] diff --git a/app/signals/apps/classification/migrations/0006_classifier_training_error_classifier_training_status_and_more.py b/app/signals/apps/classification/migrations/0006_classifier_training_error_classifier_training_status_and_more.py new file mode 100644 index 000000000..65aa3a757 --- /dev/null +++ b/app/signals/apps/classification/migrations/0006_classifier_training_error_classifier_training_status_and_more.py @@ -0,0 +1,42 @@ +# Generated by Django 
4.2.15 on 2024-10-17 09:55 + +from django.db import migrations, models +import signals.apps.classification.utils +import signals.apps.services.domain.checker_factories +import signals.apps.services.domain.mimetypes +import signals.apps.services.validator.file + + +class Migration(migrations.Migration): + + dependencies = [ + ('classification', '0005_classifier_is_active_alter_classifier_main_model_and_more'), + ] + + operations = [ + migrations.AddField( + model_name='classifier', + name='training_error', + field=models.TextField(blank=True, null=True), + ), + migrations.AddField( + model_name='classifier', + name='training_status', + field=models.CharField(choices=[('RUNNING', 'Running'), ('COMPLETED', 'Completed'), ('FAILED', 'Failed')], default='RUNNING', max_length=20), + ), + migrations.AlterField( + model_name='classifier', + name='main_model', + field=models.FileField(blank=True, max_length=255, null=True, storage=signals.apps.classification.utils._get_storage_backend, upload_to='classification_models/main/%Y/%m/%d/'), + ), + migrations.AlterField( + model_name='classifier', + name='sub_model', + field=models.FileField(blank=True, max_length=255, null=True, storage=signals.apps.classification.utils._get_storage_backend, upload_to='classification_models/main_sub/%Y/%m/%d/'), + ), + migrations.AlterField( + model_name='trainingset', + name='file', + field=models.FileField(max_length=255, storage=signals.apps.classification.utils._get_storage_backend, upload_to='training_sets/%Y/%m/%d/', validators=[signals.apps.services.validator.file.MimeTypeAllowedValidator(signals.apps.services.domain.mimetypes.MimeTypeFromContentResolverFactory(), 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'), signals.apps.services.validator.file.MimeTypeIntegrityValidator(signals.apps.services.domain.mimetypes.MimeTypeFromContentResolverFactory(), signals.apps.services.domain.mimetypes.MimeTypeFromFilenameResolverFactory()), 
class Classifier(models.Model):
    """
    A trained classification model.

    Holds a reference to the stored "Main" model and to the stored "Main, Sub" model, together with
    the evaluation scores and the status of the training run that produced them.
    """
    # Allowed values for `training_status`; new records start as RUNNING.
    TRAINING_STATUSES = (
        ('RUNNING', 'Running'),
        ('COMPLETED', 'Completed'),
        ('FAILED', 'Failed'),
    )

    created_at = models.DateTimeField(editable=False, auto_now_add=True)

    name = models.CharField(max_length=255, null=False, blank=False)

    # Evaluation scores written by the training job.
    precision = models.FloatField(null=True, blank=True, default=0)
    recall = models.FloatField(null=True, blank=True, default=0)
    accuracy = models.FloatField(null=True, blank=True, default=0)

    # Marks the classifier that is used for predictions.
    is_active = models.BooleanField(default=False)

    training_status = models.CharField(default="RUNNING", choices=TRAINING_STATUSES, max_length=20)
    # Error text recorded when a training run fails.
    training_error = models.TextField(null=True, blank=True)

    main_model = models.FileField(
        upload_to='classification_models/main/%Y/%m/%d/',
        null=True,
        blank=True,
        storage=_get_storage_backend,
        max_length=255,
    )

    sub_model = models.FileField(
        upload_to='classification_models/main_sub/%Y/%m/%d/',
        null=True,
        blank=True,
        storage=_get_storage_backend,
        max_length=255,
    )

    def __str__(self):
        """Return the classifier name so admin listings and messages are readable."""
        return self.name
+ ), + ContentIntegrityValidator(MimeTypeFromContentResolverFactory(), ContentCheckerFactory()), + FileSizeValidator(settings.API_MAX_UPLOAD_SIZE), + ], + ) + diff --git a/app/signals/apps/classification/requirements.in b/app/signals/apps/classification/requirements.in new file mode 100644 index 000000000..3d2c4203e --- /dev/null +++ b/app/signals/apps/classification/requirements.in @@ -0,0 +1,4 @@ +nltk +openpyxl +pandas +scikit-learn \ No newline at end of file diff --git a/app/signals/apps/classification/requirements.txt b/app/signals/apps/classification/requirements.txt new file mode 100644 index 000000000..9f830c61f --- /dev/null +++ b/app/signals/apps/classification/requirements.txt @@ -0,0 +1,43 @@ +# +# This file is autogenerated by pip-compile with Python 3.11 +# by the following command: +# +# pip-compile --output-file=requirements.txt requirements.in +# +click==8.1.7 + # via nltk +et-xmlfile==1.1.0 + # via openpyxl +joblib==1.4.2 + # via + # nltk + # scikit-learn +nltk==3.9.1 + # via -r requirements.in +numpy==2.1.2 + # via + # pandas + # scikit-learn + # scipy +openpyxl==3.1.5 + # via -r requirements.in +pandas==2.2.3 + # via -r requirements.in +python-dateutil==2.9.0.post0 + # via pandas +pytz==2024.2 + # via pandas +regex==2024.9.11 + # via nltk +scikit-learn==1.5.2 + # via -r requirements.in +scipy==1.14.1 + # via scikit-learn +six==1.16.0 + # via python-dateutil +threadpoolctl==3.5.0 + # via scikit-learn +tqdm==4.66.5 + # via nltk +tzdata==2024.2 + # via pandas diff --git a/app/signals/apps/classification/tasks.py b/app/signals/apps/classification/tasks.py new file mode 100644 index 000000000..96c903bc1 --- /dev/null +++ b/app/signals/apps/classification/tasks.py @@ -0,0 +1,10 @@ +from signals.apps.classification.train import TrainClassifier +from signals.celery import app + + +@app.task +def train_classifier(training_set_id): + TrainClassifier(training_set_id).run() + + + diff --git a/app/signals/apps/classification/tests/__init__.py 
class TrainClassifier:
    """
    Train, evaluate and store a text classifier for one training set.

    Two models are trained on the "Text" column: one that predicts the "Main" label and one that
    predicts the combined "Main|Sub" label. Both are pickled onto a Classifier record, together
    with the averaged precision, recall and accuracy.
    """

    def __init__(self, training_set_id):
        """
        :param training_set_id: primary key of the TrainingSet to train on
        :raises TrainingSet.DoesNotExist: when no such training set exists
        """
        self.training_set_id = training_set_id
        self.training_set = self.get_training_set()
        self.df = None
        # Created on first use by preprocessor(); building it per document is wasteful.
        self._stemmer = None

    def __getstate__(self):
        """
        Return the picklable state without the training data.

        The fitted pipeline holds the bound method `self.preprocessor`, so pickling (or copying)
        the pipeline also pickles this instance. Dropping the DataFrame and the TrainingSet row
        keeps the stored model files free of the raw training data; `preprocessor` needs neither.
        """
        state = self.__dict__.copy()
        state['df'] = None
        state['training_set'] = None
        return state

    @staticmethod
    def _ensure_stopwords():
        """Download the NLTK stopwords corpus only when it is not available locally."""
        try:
            nltk.data.find('corpora/stopwords')
        except LookupError:
            nltk.download('stopwords')

    def get_training_set(self):
        """Fetch the TrainingSet identified by `self.training_set_id`."""
        return TrainingSet.objects.get(pk=self.training_set_id)

    def read_file(self):
        """
        Load the training set file into `self.df`.

        :raises ValueError: when the file extension is neither .csv nor .xlsx
        """
        _, extension = os.path.splitext(self.training_set.file.name)
        extension = extension.lower()

        if extension == '.csv':
            # sep=None lets pandas sniff the delimiter; that requires the python engine.
            self.df = pd.read_csv(self.training_set.file, sep=None, engine='python')
        elif extension == '.xlsx':
            self.df = pd.read_excel(self.training_set.file)
        else:
            raise ValueError('Could not read input file. Extension should be .csv or .xlsx')

    def preprocess_file(self):
        """Drop incomplete rows and add the `_main_label` and `_sub_label` helper columns."""
        # NOTE(review): this drops a row when ANY column is empty, including columns that are
        # not used for training - confirm that this is intended.
        self.df = self.df.dropna(axis=0)
        self.df["_main_label"] = self.df["Main"]
        # Combine per row; formatting the Series objects themselves would give every row the
        # same string containing the repr of the whole column.
        self.df["_sub_label"] = self.df["Main"].astype(str) + "|" + self.df["Sub"].astype(str)

    def stopper(self):
        """Return the Dutch stop words as a list without duplicates."""
        self._ensure_stopwords()
        stop_words = list(set(nltk.corpus.stopwords.words('dutch')))
        return stop_words

    def preprocessor(self, text):
        """
        Lower-case `text`, split it on whitespace and stem every word.

        :param text: the raw document; any value is converted with str()
        :returns: the stemmed words joined by single spaces
        """
        # getattr: instances unpickled from models stored by older code lack the attribute.
        stemmer = getattr(self, '_stemmer', None)
        if stemmer is None:
            self._ensure_stopwords()
            stemmer = self._stemmer = DutchStemmer(ignore_stopwords=True)

        words = re.split("\\s+", str(text).lower())
        return ' '.join(stemmer.stem(word=word) for word in words)

    def train_test_split(self, columns):
        """
        Split the data into a train and a test part, stratified on the label.

        :param columns: list of label columns; their slugified values are joined with "|"
        :returns: train texts, test texts, train labels, test labels
        :raises ValueError: raised by scikit-learn when the data cannot be stratified
        """
        labels = self.df[columns].map(lambda x: slugify(x)).apply('|'.join, axis=1)

        return train_test_split(
            self.df["Text"], labels, test_size=0.2, stratify=labels
        )

    def train_model(self, train_texts, train_labels):
        """
        Fit a bag-of-words / logistic regression pipeline with a grid search.

        :returns: the fitted GridSearchCV object
        """
        stop_words = self.stopper()

        pipeline = Pipeline([
            ('vect', CountVectorizer(preprocessor=self.preprocessor, stop_words=stop_words)),
            ('tfidf', TfidfTransformer()),
            ('clf', LogisticRegression()),
        ])

        parameters_slow = {
            'clf__class_weight': (None, 'balanced'),
            'clf__max_iter': (300, 500),
            'clf__penalty': ('l1',),
            'clf__multi_class': ('auto',),
            'clf__solver': ('liblinear',),
            'tfidf__norm': ('l2',),
            'tfidf__use_idf': (False,),
            'vect__max_df': (1.0,),
            'vect__max_features': (None,),
            'vect__ngram_range': ((1, 1), (1, 2))
        }

        grid_search = GridSearchCV(pipeline, parameters_slow, verbose=True, n_jobs=1, cv=5)
        grid_search.fit(train_texts, train_labels)

        return grid_search

    def evaluate_model(self, model, test_texts, test_labels):
        """
        Score `model` on the held-out data.

        :returns: tuple of macro precision, macro recall and accuracy
        """
        test_predict = model.predict(test_texts)
        precision = precision_score(test_labels, test_predict, average='macro', zero_division=0)
        # zero_division=0 matches the precision call above.
        recall = recall_score(test_labels, test_predict, average='macro', zero_division=0)
        accuracy = accuracy_score(test_labels, test_predict)

        return precision, recall, accuracy

    def save_model(self, main_model, sub_model, scores):
        """
        Create a new Classifier record and store both models and the scores on it.

        Kept for callers that do not track the training status themselves; `run` uses
        `create_model` and `persist_model` directly.
        """
        classifier = self.create_model()
        self.persist_model(classifier, main_model, sub_model, scores)

    def create_model(self):
        """Create the Classifier record for this run, inactive and in status RUNNING."""
        classifier = Classifier.objects.create(
            name=self.training_set.name,
            is_active=False,
            training_status="RUNNING",
        )

        return classifier

    def persist_model(self, classifier, main_model, sub_model, scores):
        """
        Pickle both models onto `classifier` and store the scores.

        :param scores: iterable of precision, recall and accuracy, in that order
        """
        pickled_main_model = pickle.dumps(main_model, pickle.HIGHEST_PROTOCOL)
        pickled_sub_model = pickle.dumps(sub_model, pickle.HIGHEST_PROTOCOL)

        precision, recall, accuracy = scores

        classifier.main_model = ContentFile(pickled_main_model, '_main_model.pkl')
        classifier.sub_model = ContentFile(pickled_sub_model, '_sub_model.pkl')
        classifier.precision = precision
        classifier.recall = recall
        classifier.accuracy = accuracy
        classifier.save()

    def update_status(self, classifier, status, error):
        """
        Store the training status and error text on `classifier`.

        :param error: None, a message, or an exception; anything but None is stored as str(error)
        """
        classifier.training_status = status
        classifier.training_error = None if error is None else str(error)
        classifier.save()

    def run(self):
        """
        Execute the whole training run.

        The Classifier record is created first so that every failure, including an unreadable
        file, ends up on a record in status FAILED instead of leaving it in RUNNING. A ValueError
        (for example data that cannot be stratified) is recorded and swallowed; any other
        exception is recorded and raised again.
        """
        classifier = self.create_model()

        try:
            self.read_file()
            self.preprocess_file()

            # Train main model
            train_texts, test_texts, train_labels, test_labels = self.train_test_split(['Main'])
            main_model = self.train_model(train_texts, train_labels)
            main_scores = self.evaluate_model(main_model, test_texts, test_labels)

            # Train sub model
            train_texts, test_texts, train_labels, test_labels = self.train_test_split(['Main', 'Sub'])
            sub_model = self.train_model(train_texts, train_labels)
            sub_scores = self.evaluate_model(sub_model, test_texts, test_labels)

            # Report the mean of the main and sub model score for each metric.
            scores = [(x + y) / 2 for x, y in zip(main_scores, sub_scores)]

            self.persist_model(classifier, main_model, sub_model, scores)
            self.update_status(classifier, 'COMPLETED', None)
        except ValueError as e:
            self.update_status(classifier, 'FAILED', e)
        except Exception as e:
            self.update_status(classifier, 'FAILED', e)
            raise
@extend_schema(exclude=True)
class MlPredictCategoryView(APIView):
    """
    Predict the main and sub category for a piece of text.

    Uses the stored models of the active Classifier when there is one, and falls back to the
    legacy ML tool proxy otherwise. Both paths answer in the same shape:
    `{'hoofdrubriek': [[url], [probability]], 'subrubriek': [[url], [probability]]}`.
    """
    ml_tool_client = MLToolClient()

    _default_category_url = None
    default_category = None

    # (cache key, main model, sub model) of the most recently loaded classifier, shared by all
    # view instances so the pickles are not read again on every request.
    _loaded_models = None

    def __init__(self, *args, **kwargs):
        # When we cannot translate we return the 'overig-overig' category url
        self.default_category = Category.objects.get(slug='overig', parent__isnull=False, parent__slug='overig')

        super().__init__(*args, **kwargs)

    @property
    def default_category_url(self):
        """Absolute URL of the default category, computed once per view instance."""
        if not self._default_category_url and self.default_category:
            request = self.request if self.request else None
            self._default_category_url = self.default_category.get_absolute_url(request=request)
        return self._default_category_url

    def get_prediction_old_ml_proxy(self, request):
        """
        Ask the legacy ML tool for a prediction and translate its category URLs.

        A response other than HTTP 200 from the ML tool results in the empty default answer.
        """
        # Default empty response
        data = {'hoofdrubriek': [], 'subrubriek': []}

        try:
            response = self.ml_tool_client.predict(text=request.data['text'])
        except DjangoCoreValidationError as e:
            raise ValidationError(e.message, e.code)
        else:
            if response.status_code == 200:
                response_data = response.json()

                for key in data.keys():
                    try:
                        category = Category.objects.get_from_url(url=response_data[key][0][0])
                    except Category.DoesNotExist:
                        category_url = self.default_category_url
                    else:
                        category_url = category.get_absolute_url(request=request)

                    data[key].append([category_url])
                    data[key].append([response_data[key][1][0]])

        return Response(data)

    @classmethod
    def _load_models(cls, classifier):
        """
        Return the unpickled main and sub model of `classifier`, cached per stored file.

        The cache key contains the file names, so a classifier with new files is reloaded.
        """
        key = (classifier.pk, classifier.main_model.name, classifier.sub_model.name)

        cached = cls._loaded_models
        if cached is not None and cached[0] == key:
            return cached[1], cached[2]

        # NOTE(security): unpickling executes code contained in the file. These files must only
        # ever be written by the training job; never point this at user-supplied uploads.
        with classifier.main_model.open('rb') as main_file:
            main_model = pickle.load(main_file)
        with classifier.sub_model.open('rb') as sub_file:
            sub_model = pickle.load(sub_file)

        cls._loaded_models = (key, main_model, sub_model)
        return main_model, sub_model

    @staticmethod
    def _predict(model, text):
        """Return the predicted label for `text` and the probability of that same label."""
        label = model.predict([text])[0]
        probabilities = model.predict_proba([text])[0]

        # predict_proba is ordered like classes_; index 0 is just the first class, not the
        # predicted one, so the column of the predicted label has to be looked up.
        position = list(model.classes_).index(label)
        return label, float(probabilities[position])

    def get_prediction_new_ml_proxy(self, request, classifier):
        """
        Predict with the stored models of `classifier`.

        :raises ValidationError: when the request body has no 'text' (answered as HTTP 400)
        :returns: HTTP 200 with the prediction, or HTTP 500 when loading or predicting fails
        """
        # Imported locally so this block does not depend on the module-level import list.
        import logging

        try:
            text = request.data['text']
        except (KeyError, TypeError):
            raise ValidationError({'text': ['This field is required.']}) from None

        try:
            main_model, sub_model = self._load_models(classifier)

            main_slug, main_probability = self._predict(main_model, text)
            sub_label, sub_probability = self._predict(sub_model, text)

            # The sub model predicts "main|sub". Use its own main part for the URL; mixing in
            # the main model's answer can name a sub category that does not exist there.
            # NOTE(review): assumes the slugified training labels equal the category slugs.
            sub_parent_slug, _, sub_slug = sub_label.partition('|')

            data = {
                'hoofdrubriek': [
                    [settings.BACKEND_URL + f'/signals/v1/public/terms/categories/{main_slug}'],
                    [main_probability]
                ],
                'subrubriek': [
                    [settings.BACKEND_URL + f'/signals/v1/public/terms/categories/{sub_parent_slug}/sub_categories/{sub_slug}'],
                    [sub_probability]
                ]
            }
        except Exception:
            # Request boundary: keep answering 500 as before, but leave a trace of the cause.
            logging.getLogger(__name__).exception(
                'Prediction with classifier %s failed', classifier.pk
            )
            return Response(status=status.HTTP_500_INTERNAL_SERVER_ERROR)
        else:
            return Response(status=status.HTTP_200_OK, data=data)

    def post(self, request, *args, **kwargs):
        """Predict with the active classifier, or with the legacy proxy when none is active."""
        # Nothing at database level enforces a single active classifier, so take the newest
        # active one instead of failing when there are several.
        classifier = Classifier.objects.filter(is_active=True).order_by('-created_at').first()

        if classifier is None:
            return self.get_prediction_old_ml_proxy(request)
        return self.get_prediction_new_ml_proxy(request, classifier)
'signals.apps.questionnaires', 'signals.apps.my_signals', 'signals.apps.zgw', + 'signals.apps.classification' ] INSTALLED_APPS: list[str] = [ @@ -399,6 +400,7 @@ def is_super_user(user) -> bool: # The URL of the Frontend FRONTEND_URL: str | None = os.getenv('FRONTEND_URL', None) +BACKEND_URL: str | None = os.getenv('BACKEND_URL', 'http://localhost:8000') ML_TOOL_ENDPOINT: str = os.getenv('SIGNALS_ML_TOOL_ENDPOINT', 'https://api.data.amsterdam.nl/signals_mltool') # noqa diff --git a/docker-compose.yml b/docker-compose.yml index 57bf0ae69..2b1de5d16 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -92,7 +92,7 @@ services: celery: condition: service_started env_file: - - docker-compose/environments/.celery_beat + - docker-compose/environments/.celery volumes: - ./app:/app command: celery -A signals beat -l debug --pidfile /tmp/celerybeat.pid