# modelling/frame_classifier/model.py

# %%
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier

import pathlib


# Seed NumPy's global random number generator so NumPy-driven randomness is repeatable.
np.random.seed(27)

# Prepend the directory three levels above this file to sys.path; this is needed
# for the `generate_segment_trajectories` import below to resolve.
# NOTE(review): presumably that directory is the repository root — confirm.
sys.path.insert(0, str(pathlib.Path(__file__).parent.parent.parent.absolute()))
from generate_segment_trajectories import get_basic_dataframe


def get_data(
    normal_folders, anomalous_folders, agent_map_folder="../agent_maps", test_ratio=0.3
):
    """Load nominal and anomalous frames and split them into train/test sets.

    Both folder groups are loaded through ``get_basic_dataframe`` (with
    ``max_agents=1000``), each result is concatenated into a single dataframe,
    and only the min/max velocity, angular-velocity and acceleration columns
    are kept as features. Rows coming from ``normal_folders`` are labelled 0.0
    and rows coming from ``anomalous_folders`` are labelled 1.0.

    Args:
        normal_folders: Passed to ``get_basic_dataframe`` as ``subfolders`` for
            the nominal class.
        anomalous_folders: Passed to ``get_basic_dataframe`` as ``subfolders``
            for the anomalous class.
        agent_map_folder: Forwarded unchanged to ``get_basic_dataframe``.
        test_ratio: Value handed to ``train_test_split`` as ``test_size``.

    Returns:
        Tuple: X_train, X_test, y_train, y_test produced by ``train_test_split``
        with ``random_state=42``.
    """
    feature_columns = [
        "max_velocity_x",
        "max_velocity_y",
        "max_velocity_z",
        "max_ang_velocity_x",
        "max_ang_velocity_y",
        "max_ang_velocity_z",
        "min_velocity_x",
        "min_velocity_y",
        "min_velocity_z",
        "min_ang_velocity_x",
        "min_ang_velocity_y",
        "min_ang_velocity_z",
        "max_acc_x",
        "max_acc_y",
        "max_acc_z",
        "min_acc_x",
        "min_acc_y",
        "min_acc_z",
    ]

    # Both classes are loaded with the same settings; only the subfolders differ.
    loader_options = {"agent_map_folder": agent_map_folder, "max_agents": 1000}
    nominal_parts = get_basic_dataframe(subfolders=normal_folders, **loader_options)
    anomalous_parts = get_basic_dataframe(
        subfolders=anomalous_folders, **loader_options
    )
    print("Got all dataframes")

    nominal_frames = pd.concat(nominal_parts)
    anomalous_frames = pd.concat(anomalous_parts)
    print(
        f"Nominal frame count: {nominal_frames.shape}\nAnomalous frame count:{anomalous_frames.shape}"
    )

    # Features: nominal rows first, anomalous rows after, so the labels below line up.
    features = pd.concat(
        [frames[feature_columns] for frames in (nominal_frames, anomalous_frames)]
    )
    labels = np.concatenate(
        [np.zeros(len(nominal_frames)), np.ones(len(anomalous_frames))]
    )

    train_features, test_features, train_labels, test_labels = train_test_split(
        features, labels, test_size=test_ratio, random_state=42
    )
    return train_features, test_features, train_labels, test_labels


def build_randomforest_model(
    normal_folders, anomalous_folders, agent_map_folder, data=None
):
    """Fit a random forest frame classifier and score it on the test split.

    Args:
        normal_folders: Forwarded to ``get_data``; unused when ``data`` is given.
        anomalous_folders: Forwarded to ``get_data``; unused when ``data`` is given.
        agent_map_folder: Forwarded to ``get_data``; unused when ``data`` is given.
        data: Optional pre-built ``(X_train, X_test, y_train, y_test)`` split.
            When None, the split is generated by calling ``get_data``.

    Returns:
        Tuple: the fitted ``RandomForestClassifier`` and the value returned by
        its ``score`` method on the test split.
    """
    if data is None:
        data = get_data(normal_folders, anomalous_folders, agent_map_folder)
    X_train, X_test, y_train, y_test = data

    forest = RandomForestClassifier(n_estimators=1000, max_depth=10, random_state=0)
    forest.fit(X_train, y_train)
    # score() predicts on X_test itself, so a separate predict() call whose
    # result is thrown away would only double the inference cost.
    return forest, forest.score(X_test, y_test)


def build_mlp_classifier(normal_folders, anomalous_folders, agent_map_folder, data=None):
    """Fit an MLP frame classifier and score it on the test split.

    Args:
        normal_folders: Forwarded to ``get_data``; unused when ``data`` is given.
        anomalous_folders: Forwarded to ``get_data``; unused when ``data`` is given.
        agent_map_folder: Forwarded to ``get_data``; unused when ``data`` is given.
        data: Optional pre-built ``(X_train, X_test, y_train, y_test)`` split,
            mirroring ``build_randomforest_model``. When None (the default, which
            keeps the previous behaviour), the split is generated by ``get_data``.

    Returns:
        Tuple: the fitted ``MLPClassifier`` and the value returned by its
        ``score`` method on the test split.
    """
    if data is None:
        data = get_data(normal_folders, anomalous_folders, agent_map_folder)
    X_train, X_test, y_train, y_test = data

    network = MLPClassifier(
        solver="lbfgs", alpha=1e-5, hidden_layer_sizes=(150, 10), random_state=1
    )
    network.fit(X_train, y_train)
    # score() predicts on X_test itself, so no separate predict() call is needed.
    return network, network.score(X_test, y_test)


def show_permutation_imp(RF, X, y):
    """Plot the permutation importance of every feature as a horizontal boxplot.

    Permutation importance is computed with 10 repeats (``random_state=42``,
    ``n_jobs=2``); features are ordered by ascending mean importance and the
    figure is displayed with ``plt.show()``.

    Args:
        RF: Fitted estimator handed to ``permutation_importance``.
        X: Feature table to evaluate on (e.g. the train or the test split).
            Its ``columns`` attribute supplies the boxplot labels.
        y: Target values matching the rows of ``X``.
    """
    importance = permutation_importance(
        RF, X, y, n_repeats=10, random_state=42, n_jobs=2
    )
    ascending_order = importance.importances_mean.argsort()

    figure, axis = plt.subplots()
    # One box per feature; each box summarises that feature's repeats.
    per_feature_scores = importance.importances[ascending_order].T
    feature_names = X.columns[ascending_order]
    axis.boxplot(per_feature_scores, vert=False, labels=feature_names)
    axis.set_title("Permutation Importances")
    figure.tight_layout()
    plt.show()