"""main.py — train SVM, PLS-DA, and XGBoost on each feature CSV and validate on a held-out set."""
import os
import time
import pandas as pd
from sklearn.cross_decomposition import PLSRegression
from sklearn.preprocessing import LabelEncoder
from core import folder, preprocess, report
from core.models import PLS_DA, SVM, PLS_DA_plot, XGBoost
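# folder, preprocess, and report are project-local helpers; PLS_DA, SVM,
# PLS_DA_plot, and XGBoost are the model wrappers under core/models.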
# Use the "Structure" column of the SAF table as the class labels (y)
df_SAF = pd.read_csv("data/class.csv")
y = df_SAF["Structure"]
validation_csv_file = "data/validation.csv"
# Initialize and fit the LabelEncoder
encoder = LabelEncoder()
y_encoded = encoder.fit_transform(y)
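# fit_transform maps the string class labels to integer codes
# (0 .. n_classes-1); encoder.inverse_transform can map predictions back.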
script_path = os.path.abspath(__file__)
script_dir_path = os.path.dirname(script_path)
output_dir_path = os.path.join(script_dir_path, "features")
# Collect every .csv feature file under the features/ directory
csv_file_paths = folder.find_csv_files(output_dir_path)
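# Each CSV is one feature block; every block gets the same four-step treatment
# (SVM, 2-component PLS-DA plots, best-n PLS-DA, XGBoost) in the loop below.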
for i, csv_file_path in enumerate(csv_file_paths, start=1):
    start_time = time.perf_counter()
    # Load and preprocess the dataset
    X_df, X, columns, scaler_standard, scaler_minmax = preprocess.prepare_standarlize_X_block_(csv_file_path)
    print(
        f"\nProcessing {csv_file_path} with {X.shape[1]} features ({i}/{len(csv_file_paths)})."
    )
    X_val_df, X_val = preprocess.preprocess_validation_data(
        validation_csv_file, scaler_standard, scaler_minmax
    )
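    # The validation rows are transformed with the scalers fitted on the
    # training block (standard, then min-max, judging by the names), so
    # held-out data never influences the scaling parameters.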
    print("(1/4) Running SVM model...")
    file_name = folder.get_file_name(csv_file_path)
    SVM_model_report = SVM.get_report(X, y)
    report.record_model_performance(SVM_model_report, "SVM", csv_file_path)
    probabilities, y_pred_svm = SVM.validate_svc_with_probabilities(
        X, y, X_val, csv_file_path, validation_csv_file
    )
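    # validate_svc_with_probabilities presumably refits the SVC on the full
    # training block and returns class probabilities and hard predictions
    # for the validation rows.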
    print("(2/4) Running PLS_DA n=2...")
    PLS_DA_plot.plot_two_component(X, y, csv_file_path)
    PLS_DA_plot.plot_two_component_with_validation(X, y, X_val, csv_file_path)
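    # A 2-component PLS-DA gives a 2-D scores plot of the training samples
    # (and, in the second call, the projected validation samples) as a visual
    # check of class separation.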
    print("(3/4) Running PLS_DA model with the best n...")
    # Determine the best number of components
    best_n_components = PLS_DA.find_best_n_dim(X, y_encoded, csv_file_path)
    best_pls = PLSRegression(n_components=best_n_components)
    PLS_DA_model_report = PLS_DA.generate_classification_report(X, y, best_pls)
    report.record_model_performance(PLS_DA_model_report, "PLS_DA", csv_file_path)
    PLS_DA.save_feature_importance(
        X, columns, y_encoded, best_pls, best_n_components, csv_file_path
    )
    # Validate the fitted PLS-DA model on the held-out set
    y_pred_val = PLS_DA.validate_PLS_DA_model(
        best_pls, X, y, X_val, csv_file_path, validation_csv_file
    )
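    # PLS-DA here presumably means PLSRegression fitted against the encoded
    # labels, with classes assigned by thresholding/argmax of the predicted
    # response; find_best_n_dim likely selects n_components by cross-validation.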
    print("(4/4) Running XGBoost model...")
    XGBoost_model_report = XGBoost.run_XGBoost(X_df, y)
    report.record_model_performance(XGBoost_model_report, "XGBoost", csv_file_path)
    XGBoost.plot_XGBoost_feature_importance(X_df, y_encoded, csv_file_path)
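    # The importance plot uses the integer-encoded labels (y_encoded); the
    # wrapper presumably ranks features by an XGBoost importance metric such
    # as gain.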
    # Validate XGBoost on the held-out set
    y_pred_val_xgb = XGBoost.validate_XGBoost(
        X, y, X_val, csv_file_path, validation_csv_file
    )
    elapsed_time = time.perf_counter() - start_time
    print(f"===========Elapsed time: {elapsed_time:0.2f} seconds===========")