Merge pull request #181 from jdi-testing/rc
Revival of main branch
ivnglkv authored Feb 14, 2024
2 parents 9130dbb + 97c123a commit 82fe341
Showing 618 changed files with 91,202 additions and 4,707 deletions.
Binary file removed .DS_Store
1 change: 1 addition & 0 deletions .env.dist
@@ -0,0 +1 @@
SELENOID_PARALLEL_SESSIONS_COUNT=4
46 changes: 46 additions & 0 deletions .github/workflows/.release-workflow-rc.yml
@@ -0,0 +1,46 @@
name: JDI QASP build workflow

on:
  workflow_call:
    inputs:
      image_tag:
        required: true
        type: string

env:
  REGISTRY: ghcr.io
  REGISTRY_USER: jdi-testing
  IMAGE_NAME: jdi-qasp-ml
  IMAGE_PATH: ghcr.io/jdi-testing/jdi-qasp-ml

jobs:
  release:
    runs-on: ubuntu-latest
    permissions:
      contents: write
      packages: write
    steps:
      - name: Checkout repository
        uses: actions/checkout@v3
        with:
          submodules: true
      - name: Load default environment variables values
        uses: cardinalby/export-env-action@v2
        with:
          envFile: '.env.dist'
      - name: Build
        run: docker compose -f docker-compose.dev.yaml up -d
      - name: Lint and Test
        run: docker compose -f docker-compose.dev.yaml run --rm api make lint
      - name: Log into registry ${{ env.REGISTRY }}
        uses: docker/login-action@v1
        with:
          registry: ${{ env.REGISTRY }}
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}
      - name: Build and push Docker image
        uses: docker/build-push-action@v2
        with:
          context: .
          push: true
          tags: ${{ env.IMAGE_PATH }}:${{ inputs.image_tag }}
30 changes: 27 additions & 3 deletions .github/workflows/.release-workflow.yml
@@ -17,13 +17,37 @@ jobs:
   release:
     runs-on: ubuntu-latest
     permissions:
-      contents: read
+      contents: write
       packages: write
     steps:
       - name: Checkout repository
         uses: actions/checkout@v2
-      - name: Build and test
-        run: docker compose -f docker-compose.dev.yaml run --rm api make unittest
+        with:
+          submodules: recursive
+      - name: Increment version
+        run: |
+          cd model/version
+          OLD_VERSION=$(ls)
+          FIRST_DIGIT=$(echo $OLD_VERSION | cut -d '.' -f 1)
+          SECOND_DIGIT=$(echo $OLD_VERSION | cut -d '.' -f 2)
+          THIRD_DIGIT=$(echo $OLD_VERSION | cut -d '.' -f 3)
+          NEW_THIRD_DIGIT=$((THIRD_DIGIT+1))
+          NEW_VERSION=$FIRST_DIGIT.$SECOND_DIGIT.$NEW_THIRD_DIGIT
+          mv ./$OLD_VERSION ./$NEW_VERSION
+          cd ../..
+      - name: Commit incremented version
+        uses: stefanzweifel/git-auto-commit-action@v4
+        with:
+          commit_message: increment version
+          branch: develop
+      - name: Load default environment variables values
+        uses: cardinalby/export-env-action@v2
+        with:
+          envFile: '.env.dist'
+      - name: Build
+        run: docker compose -f docker-compose.dev.yaml up -d
+      - name: Lint and Test
+        run: docker compose -f docker-compose.dev.yaml run --rm api make lint
       - name: Log into registry ${{ env.REGISTRY }}
         uses: docker/login-action@v1
         with:
11 changes: 11 additions & 0 deletions .github/workflows/release-rc-version.yml
@@ -0,0 +1,11 @@
name: JDI QASP RC version

on:
  push:
    branches: [ "rc" ]

jobs:
  release:
    uses: jdi-testing/jdi-qasp-ml/.github/workflows/.release-workflow-rc.yml@rc
    with:
      image_tag: rc
11 changes: 0 additions & 11 deletions .github/workflows/release-ws-version.yml

This file was deleted.

11 changes: 9 additions & 2 deletions .gitignore
@@ -12,6 +12,9 @@
flask-temp-storage/*

generators/HTMLgenerator/output
generators/HTMLgenerator/dataset
generators/VuetifyGenerator/node_modules
generators/VuetifyGenerator/dist
venv/

data/mui_dataset/*
@@ -39,10 +42,14 @@ dataset/cache-labels/*
dataset/df/20*
.idea/*
HTMLgenerator/output/*
venv/
.pytest_cache/*
.coverage
.coverage.*
.cache
coverage.xml
.DS_Store
.DS_Store

**/.DS_Store
.env
.env.rc
.env.dev
3 changes: 3 additions & 0 deletions .gitmodules
@@ -0,0 +1,3 @@
[submodule "kombu-redis-priority"]
path = kombu-redis-priority
url = https://github.com/Captricity/kombu-redis-priority.git
2 changes: 2 additions & 0 deletions .isort.cfg
@@ -0,0 +1,2 @@
[tool.isort]
profile = "black"
File renamed without changes.
42 changes: 42 additions & 0 deletions Angular_model/build_datasets_for_angular_sites.py
@@ -0,0 +1,42 @@
import os
import sys
import re
from glob import glob
from time import sleep

prefix = os.getcwd().split("jdi-qasp-ml")[0]
dataset_path = os.path.join(prefix, "jdi-qasp-ml", "data/angular_dataset")

sys.path.append(os.path.join(prefix, "jdi-qasp-ml"))

from utils.config import logger # noqa

from utils.dataset_builder import DatasetBuilder # noqa
from utils.common import maximize_window # noqa

os.makedirs(dataset_path, exist_ok=True)

WAIT_TIME_SECONDS = 3

SITE_URLS = [
    "file://" + p.replace("\\", "/") + "/index.html"
    for p in glob(f'{os.path.join(prefix, "jdi-qasp-ml")}/data/angular_dataset/build/*')
]
DATASET_NAMES = [re.search("site-[0-9]+", nm)[0] for nm in SITE_URLS]


class JDIDatasetBuilder(DatasetBuilder):
    def setUp(self, driver):
        self.logger.info("getting page")
        driver.get(self.url)
        maximize_window(driver=driver)
        sleep(WAIT_TIME_SECONDS)


i = 1
for site, ds_name in zip(SITE_URLS, DATASET_NAMES):
    JDIDatasetBuilder(
        url=site, dataset_name=ds_name, headless=True, dataset_root_path=dataset_path,
    )
    logger.info(f"\n------------\n{len(SITE_URLS)-i} SITES LEFT TO PROCESS!")
    i += 1
Binary file added Angular_model/model/count_children_tags.pkl
Binary file added Angular_model/model/count_followers_tags.pkl
Binary file added Angular_model/model/model.pth
Binary file added Angular_model/model/ohe_tag_name.pkl
Binary file added Angular_model/model/ohe_type.pkl
Binary file added Angular_model/model/tfidf_attr_class.pkl
Binary file added Angular_model/model/tfidf_children_tags.pkl
Binary file added Angular_model/model/tfidf_followers_tags.pkl
206 changes: 206 additions & 0 deletions Angular_model/train.py
@@ -0,0 +1,206 @@
# import pandas as pd
# import numpy as np
# import matplotlib.pyplot as plt
import os
import sys
import gc
from tqdm.auto import trange
import pandas as pd
from glob import glob
import logging


import torch
from torch.utils.data import DataLoader

from multiprocessing import freeze_support
from terminaltables import DoubleTable

prefix = os.getcwd().split("jdi-qasp-ml")[0]
sys.path.append(os.path.join(prefix, "jdi-qasp-ml"))

from vars.mui_train_vars import ( # noqa
    BATCH_SIZE, # noqa
    TRAIN_LEN, # noqa
    TEST_LEN, # noqa
    NUM_EPOCHS, # noqa
    EARLY_STOPPING_THRESHOLD, # noqa
    SCHEDULER_STEP, # noqa
) # noqa


from utils.dataset import MUI_JDNDataset # noqa
from utils.model_new import JDIModel # noqa
from utils.common import accuracy #, accuracy_each_class, recall, precision # noqa

model_path = os.path.join(prefix, "jdi-qasp-ml", "Angular_model/model")
df_path = os.path.join(prefix, "jdi-qasp-ml", "data/angular_dataset/df")

ds_files = glob(f"{df_path}/site-*.pkl")
DATASET_NAMES = [os.path.basename(path)[:-4] for path in ds_files]

train_names = DATASET_NAMES[:TRAIN_LEN]
test_names = DATASET_NAMES[TRAIN_LEN : TRAIN_LEN + TEST_LEN] # noqa

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

LOG_FILENAME = "C:/Users/Iuliia_Elizarova/Desktop/JDI/repo_ml\\jdi-qasp-ml/data/angular_dataset/logfile.log"
logging.basicConfig(filename=LOG_FILENAME,
                    format='%(asctime)s %(message)s',
                    filemode='w')
logger = logging.getLogger()

logger.info(f"device: {DEVICE}")


def evaluate(model: JDIModel, dataset: MUI_JDNDataset) -> pd.DataFrame:
    model.eval()
    with torch.no_grad():

        dataloader = DataLoader(dataset, shuffle=False, batch_size=1, pin_memory=True)
        results = []

        with trange(len(dataloader), desc="Evaluating:") as bar:
            with torch.no_grad():
                for x, y in dataloader:
                    y_pred = (
                        torch.round(
                            torch.nn.Softmax(dim=1)(model(x.to(DEVICE)).to("cpu"))
                        )
                        .detach()
                        .numpy()
                    )
                    y_pred = y_pred[0].argmax()
                    y = y.item()

                    results.append(
                        {
                            "y_true": y,
                            "y_pred": y_pred,
                            "y_true_label": dataset.classes_reverse_dict[y],
                            "y_pred_label": dataset.classes_reverse_dict[y_pred],
                        }
                    )
                    bar.update(1)

    results_df = pd.DataFrame(results)
    return accuracy(results_df)
    # return accuracy(results_df), accuracy_each_class(results_df), recall(results_df), precision(results_df)


def train_model(model):

    train_metrics = []
    gc.collect()

    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    scheduler = torch.optim.lr_scheduler.StepLR(
        optimizer, step_size=SCHEDULER_STEP, gamma=0.1
    )

    NUM_BATCHES = len(train_dataloader)

    early_stopping_steps = EARLY_STOPPING_THRESHOLD
    best_accuracy = 0
    for epoch in range(NUM_EPOCHS):

        model.train()
        model.to(DEVICE)

        cumulative_loss = 0.0

        with trange(NUM_BATCHES) as bar:

            for x, y in train_dataloader:
                y_hat = model(x.to(DEVICE))

                optimizer.zero_grad()

                loss = criterion(y_hat, y.long().to(DEVICE)) # \ noqa

                loss.backward()
                optimizer.step()
                cumulative_loss += loss.item()
                bar.set_description(
                    f"Epoch: {epoch}, {round(cumulative_loss,5)}, {round(loss.item(),5)}"
                ) # noqa
                bar.update(1)

            bar.update(1)

        early_stopping_steps -= 1
        test_accuracy = evaluate(model=model, dataset=test_dataset)
        if test_accuracy > best_accuracy:
            best_accuracy = test_accuracy
            logger.info(f"SAVING MODEL WITH THE BEST ACCURACY: {best_accuracy}")
            torch.save(model, f"{model_path}/model.pth")
            early_stopping_steps = EARLY_STOPPING_THRESHOLD

        train_metrics.append(
            {
                "epoch": epoch,
                "mean(loss)": cumulative_loss / NUM_BATCHES,
                "accuracy(test)": test_accuracy,
            }
        )

        # report metrics
        print()
        table_data = [["epoch", "mean(loss)", "accuracy(test)"]]
        for r in train_metrics:
            table_data.append([r["epoch"], r["mean(loss)"], r["accuracy(test)"]])

        print(f"Best accuracy: {best_accuracy}, attempts left: {early_stopping_steps}")

        if early_stopping_steps <= 0:
            logger.info("EARLY STOPPING")
            break
        scheduler.step()

    pd.DataFrame(train_metrics, index=list(range(len(train_metrics)))).to_csv(
        "tmp/train_metrics.csv"
    )

    print(DoubleTable(table_data=table_data).table)


if __name__ == "__main__":

    freeze_support()

    train_dataset = MUI_JDNDataset(
        datasets_list=train_names, dataset_type="angular", rebalance_and_shuffle=True
    )
    test_dataset = MUI_JDNDataset(
        datasets_list=test_names, dataset_type="angular", rebalance_and_shuffle=False
    )

    logger.info(
        f"Train dataset shape: {train_dataset.X.shape}; Test dataset shape: {test_dataset.X.shape}"
    )

    train_dataloader = DataLoader(
        train_dataset,
        batch_size=BATCH_SIZE,
        shuffle=True,
        pin_memory=True,
        drop_last=True,
        num_workers=0,
    )

    IN_FEATURES = next(iter(train_dataloader))[0][0].shape[0]
    OUT_FEATURES = len(train_dataset.classes_dict)

    model = JDIModel(in_features=IN_FEATURES, out_features=OUT_FEATURES)
    train_model(model)
    # best_accuracy, best_accuracy_each_class, \
    # best_recall_each_class, best_precision_each_class = evaluate(model=model, dataset=test_dataset)

    best_accuracy = evaluate(model=model, dataset=test_dataset)

    # logger.info(f"START TRAINING THE MODEL WITH THE BEST ACCURACY: {best_accuracy}, \
    # best accuracy for each class {best_accuracy_each_class}, \
    # best recall for each class {best_recall_each_class}, \
    # best precision for each class {best_precision_each_class}")
    logger.info("\n")