Merge pull request #181 from jdi-testing/rc
Revival of main branch
ivnglkv authored Feb 14, 2024
2 parents 9130dbb + 97c123a commit 82fe341
Showing 618 changed files with 91,202 additions and 4,707 deletions.
Binary file removed .DS_Store
1 change: 1 addition & 0 deletions .env.dist
@@ -0,0 +1 @@
SELENOID_PARALLEL_SESSIONS_COUNT=4
46 changes: 46 additions & 0 deletions .github/workflows/.release-workflow-rc.yml
@@ -0,0 +1,46 @@
name: JDI QASP build workflow

on:
  workflow_call:
    inputs:
      image_tag:
        required: true
        type: string

env:
  REGISTRY: ghcr.io
  REGISTRY_USER: jdi-testing
  IMAGE_NAME: jdi-qasp-ml
  IMAGE_PATH: ghcr.io/jdi-testing/jdi-qasp-ml

jobs:
  release:
    runs-on: ubuntu-latest
    permissions:
      contents: write
      packages: write
    steps:
      - name: Checkout repository
        uses: actions/checkout@v3
        with:
          submodules: true
      - name: Load default environment variables values
        uses: cardinalby/export-env-action@v2
        with:
          envFile: '.env.dist'
      - name: Build
        run: docker compose -f docker-compose.dev.yaml up -d
      - name: Lint and Test
        run: docker compose -f docker-compose.dev.yaml run --rm api make lint
      - name: Log into registry ${{ env.REGISTRY }}
        uses: docker/login-action@v1
        with:
          registry: ${{ env.REGISTRY }}
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}
      - name: Build and push Docker image
        uses: docker/build-push-action@v2
        with:
          context: .
          push: true
          tags: ${{ env.IMAGE_PATH }}:${{ inputs.image_tag }}
30 changes: 27 additions & 3 deletions .github/workflows/.release-workflow.yml
@@ -17,13 +17,37 @@ jobs:
   release:
     runs-on: ubuntu-latest
     permissions:
-      contents: read
+      contents: write
       packages: write
     steps:
       - name: Checkout repository
         uses: actions/checkout@v2
-      - name: Build and test
-        run: docker compose -f docker-compose.dev.yaml run --rm api make unittest
+        with:
+          submodules: recursive
+      - name: Increment version
+        run: |
+          cd model/version
+          OLD_VERSION=$(ls)
+          FIRST_DIGIT=$(echo $OLD_VERSION | cut -d '.' -f 1)
+          SECOND_DIGIT=$(echo $OLD_VERSION | cut -d '.' -f 2)
+          THIRD_DIGIT=$(echo $OLD_VERSION | cut -d '.' -f 3)
+          NEW_THIRD_DIGIT=$((THIRD_DIGIT+1))
+          NEW_VERSION=$FIRST_DIGIT.$SECOND_DIGIT.$NEW_THIRD_DIGIT
+          mv ./$OLD_VERSION ./$NEW_VERSION
+          cd ../..
+      - name: Commit incremented version
+        uses: stefanzweifel/git-auto-commit-action@v4
+        with:
+          commit_message: increment version
+          branch: develop
+      - name: Load default environment variables values
+        uses: cardinalby/export-env-action@v2
+        with:
+          envFile: '.env.dist'
+      - name: Build
+        run: docker compose -f docker-compose.dev.yaml up -d
+      - name: Lint and Test
+        run: docker compose -f docker-compose.dev.yaml run --rm api make lint
       - name: Log into registry ${{ env.REGISTRY }}
         uses: docker/login-action@v1
         with:
11 changes: 11 additions & 0 deletions .github/workflows/release-rc-version.yml
@@ -0,0 +1,11 @@
name: JDI QASP RC version

on:
  push:
    branches: [ "rc" ]

jobs:
  release:
    uses: jdi-testing/jdi-qasp-ml/.github/workflows/.release-workflow-rc.yml@rc
    with:
      image_tag: rc
11 changes: 0 additions & 11 deletions .github/workflows/release-ws-version.yml

This file was deleted.

11 changes: 9 additions & 2 deletions .gitignore
@@ -12,6 +12,9 @@
flask-temp-storage/*

generators/HTMLgenerator/output
generators/HTMLgenerator/dataset
generators/VuetifyGenerator/node_modules
generators/VuetifyGenerator/dist
venv/

data/mui_dataset/*
@@ -39,10 +42,14 @@ dataset/cache-labels/*
dataset/df/20*
.idea/*
HTMLgenerator/output/*
venv/
.pytest_cache/*
.coverage
.coverage.*
.cache
coverage.xml
.DS_Store
.DS_Store

**/.DS_Store
.env
.env.rc
.env.dev
3 changes: 3 additions & 0 deletions .gitmodules
@@ -0,0 +1,3 @@
[submodule "kombu-redis-priority"]
path = kombu-redis-priority
url = https://github.com/Captricity/kombu-redis-priority.git
2 changes: 2 additions & 0 deletions .isort.cfg
@@ -0,0 +1,2 @@
[tool.isort]
profile = "black"
File renamed without changes.
42 changes: 42 additions & 0 deletions Angular_model/build_datasets_for_angular_sites.py
@@ -0,0 +1,42 @@
import os
import sys
import re
from glob import glob
from time import sleep

prefix = os.getcwd().split("jdi-qasp-ml")[0]
dataset_path = os.path.join(prefix, "jdi-qasp-ml", "data/angular_dataset")

sys.path.append(os.path.join(prefix, "jdi-qasp-ml"))

from utils.config import logger # noqa

from utils.dataset_builder import DatasetBuilder # noqa
from utils.common import maximize_window # noqa

os.makedirs(dataset_path, exist_ok=True)

WAIT_TIME_SECONDS = 3

SITE_URLS = [
    "file://" + p.replace("\\", "/") + "/index.html"
    for p in glob(f'{os.path.join(prefix, "jdi-qasp-ml")}/data/angular_dataset/build/*')
]
DATASET_NAMES = [re.search("site-[0-9]+", nm)[0] for nm in SITE_URLS]


class JDIDatasetBuilder(DatasetBuilder):
    def setUp(self, driver):
        self.logger.info("getting page")
        driver.get(self.url)
        maximize_window(driver=driver)
        sleep(WAIT_TIME_SECONDS)


i = 1
for site, ds_name in zip(SITE_URLS, DATASET_NAMES):
    JDIDatasetBuilder(
        url=site, dataset_name=ds_name, headless=True, dataset_root_path=dataset_path,
    )
    logger.info(f"\n------------\n{len(SITE_URLS)-i} SITES LEFT TO PROCESS!")
    i += 1
Binary file added Angular_model/model/count_children_tags.pkl
Binary file added Angular_model/model/count_followers_tags.pkl
Binary file added Angular_model/model/model.pth
Binary file added Angular_model/model/ohe_tag_name.pkl
Binary file added Angular_model/model/ohe_type.pkl
Binary file added Angular_model/model/tfidf_attr_class.pkl
Binary file added Angular_model/model/tfidf_children_tags.pkl
Binary file added Angular_model/model/tfidf_followers_tags.pkl
206 changes: 206 additions & 0 deletions Angular_model/train.py
@@ -0,0 +1,206 @@
# import pandas as pd
# import numpy as np
# import matplotlib.pyplot as plt
import os
import sys
import gc
from tqdm.auto import trange
import pandas as pd
from glob import glob
import logging


import torch
from torch.utils.data import DataLoader

from multiprocessing import freeze_support
from terminaltables import DoubleTable

prefix = os.getcwd().split("jdi-qasp-ml")[0]
sys.path.append(os.path.join(prefix, "jdi-qasp-ml"))

from vars.mui_train_vars import ( # noqa
    BATCH_SIZE, # noqa
    TRAIN_LEN, # noqa
    TEST_LEN, # noqa
    NUM_EPOCHS, # noqa
    EARLY_STOPPING_THRESHOLD, # noqa
    SCHEDULER_STEP, # noqa
) # noqa


from utils.dataset import MUI_JDNDataset # noqa
from utils.model_new import JDIModel # noqa
from utils.common import accuracy #, accuracy_each_class, recall, precision # noqa

model_path = os.path.join(prefix, "jdi-qasp-ml", "Angular_model/model")
df_path = os.path.join(prefix, "jdi-qasp-ml", "data/angular_dataset/df")

ds_files = glob(f"{df_path}/site-*.pkl")
DATASET_NAMES = [os.path.basename(path)[:-4] for path in ds_files]

train_names = DATASET_NAMES[:TRAIN_LEN]
test_names = DATASET_NAMES[TRAIN_LEN : TRAIN_LEN + TEST_LEN] # noqa

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

LOG_FILENAME = "C:/Users/Iuliia_Elizarova/Desktop/JDI/repo_ml\\jdi-qasp-ml/data/angular_dataset/logfile.log"
logging.basicConfig(filename=LOG_FILENAME,
                    format='%(asctime)s %(message)s',
                    filemode='w')
logger = logging.getLogger()

logger.info(f"device: {DEVICE}")


def evaluate(model: JDIModel, dataset: MUI_JDNDataset) -> pd.DataFrame:
    model.eval()
    with torch.no_grad():

        dataloader = DataLoader(dataset, shuffle=False, batch_size=1, pin_memory=True)
        results = []

        with trange(len(dataloader), desc="Evaluating:") as bar:
            with torch.no_grad():
                for x, y in dataloader:
                    y_pred = (
                        torch.round(
                            torch.nn.Softmax(dim=1)(model(x.to(DEVICE)).to("cpu"))
                        )
                        .detach()
                        .numpy()
                    )
                    y_pred = y_pred[0].argmax()
                    y = y.item()

                    results.append(
                        {
                            "y_true": y,
                            "y_pred": y_pred,
                            "y_true_label": dataset.classes_reverse_dict[y],
                            "y_pred_label": dataset.classes_reverse_dict[y_pred],
                        }
                    )
                    bar.update(1)

    results_df = pd.DataFrame(results)
    return accuracy(results_df)
    # return accuracy(results_df), accuracy_each_class(results_df), recall(results_df), precision(results_df)


def train_model(model):

    train_metrics = []
    gc.collect()

    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    scheduler = torch.optim.lr_scheduler.StepLR(
        optimizer, step_size=SCHEDULER_STEP, gamma=0.1
    )

    NUM_BATCHES = len(train_dataloader)

    early_stopping_steps = EARLY_STOPPING_THRESHOLD
    best_accuracy = 0
    for epoch in range(NUM_EPOCHS):

        model.train()
        model.to(DEVICE)

        cumulative_loss = 0.0

        with trange(NUM_BATCHES) as bar:

            for x, y in train_dataloader:
                y_hat = model(x.to(DEVICE))

                optimizer.zero_grad()

                loss = criterion(y_hat, y.long().to(DEVICE)) # \ noqa

                loss.backward()
                optimizer.step()
                cumulative_loss += loss.item()
                bar.set_description(
                    f"Epoch: {epoch}, {round(cumulative_loss,5)}, {round(loss.item(),5)}"
                ) # noqa
                bar.update(1)

            bar.update(1)

        early_stopping_steps -= 1
        test_accuracy = evaluate(model=model, dataset=test_dataset)
        if test_accuracy > best_accuracy:
            best_accuracy = test_accuracy
            logger.info(f"SAVING MODEL WITH THE BEST ACCURACY: {best_accuracy}")
            torch.save(model, f"{model_path}/model.pth")
            early_stopping_steps = EARLY_STOPPING_THRESHOLD

        train_metrics.append(
            {
                "epoch": epoch,
                "mean(loss)": cumulative_loss / NUM_BATCHES,
                "accuracy(test)": test_accuracy,
            }
        )

        # report metrics
        print()
        table_data = [["epoch", "mean(loss)", "accuracy(test)"]]
        for r in train_metrics:
            table_data.append([r["epoch"], r["mean(loss)"], r["accuracy(test)"]])

        print(f"Best accuracy: {best_accuracy}, attempts left: {early_stopping_steps}")

        if early_stopping_steps <= 0:
            logger.info("EARLY STOPPING")
            break
        scheduler.step()

    pd.DataFrame(train_metrics, index=list(range(len(train_metrics)))).to_csv(
        "tmp/train_metrics.csv"
    )

    print(DoubleTable(table_data=table_data).table)


if __name__ == "__main__":

    freeze_support()

    train_dataset = MUI_JDNDataset(
        datasets_list=train_names, dataset_type="angular", rebalance_and_shuffle=True
    )
    test_dataset = MUI_JDNDataset(
        datasets_list=test_names, dataset_type="angular", rebalance_and_shuffle=False
    )

    logger.info(
        f"Train dataset shape: {train_dataset.X.shape}; Test dataset shape: {test_dataset.X.shape}"
    )

    train_dataloader = DataLoader(
        train_dataset,
        batch_size=BATCH_SIZE,
        shuffle=True,
        pin_memory=True,
        drop_last=True,
        num_workers=0,
    )

    IN_FEATURES = next(iter(train_dataloader))[0][0].shape[0]
    OUT_FEATURES = len(train_dataset.classes_dict)

    model = JDIModel(in_features=IN_FEATURES, out_features=OUT_FEATURES)
    train_model(model)
    # best_accuracy, best_accuracy_each_class, \
    # best_recall_each_class, best_precision_each_class = evaluate(model=model, dataset=test_dataset)

    best_accuracy = evaluate(model=model, dataset=test_dataset)

    # logger.info(f"START TRAINING THE MODEL WITH THE BEST ACCURACY: {best_accuracy}, \
    # best accuracy for each class {best_accuracy_each_class}, \
    # best recall for each class {best_recall_each_class}, \
    # best precision for each class {best_precision_each_class}")
    logger.info("\n")