From b7c61c5273aa4c16c328a7b8491ac07a916b9cde Mon Sep 17 00:00:00 2001
From: Daniel Young
Date: Tue, 30 Jul 2024 16:57:24 -0700
Subject: [PATCH] Moved predictors out to sdk

---
 use_cases/eluc/app/components/prediction.py   |   6 +-
 use_cases/eluc/app/components/prescription.py |   3 +-
 use_cases/eluc/data/torch_data.py             |  28 ---
 .../experiments/predictor_experiments.ipynb   |  10 +-
 .../experiments/predictor_significance.py     |   6 +-
 .../experiments/prescriptor_experiments.ipynb |   5 +-
 .../persistence/persistors/hf_persistor.py    |  70 ------
 .../serializers/neural_network_serializer.py  |  75 -------
 .../serializers/sklearn_serializer.py         |  45 ----
 .../custom/template/template_predictor.py     |   7 -
 .../predictors/neural_network/__init__.py     |   0
 .../neural_network/eluc_neural_net.py         |  55 -----
 .../neural_network/neural_net_predictor.py    | 201 ------------------
 .../percent_change_predictor.py               |   3 -
 use_cases/eluc/predictors/scoring/scorer.py   |   2 +-
 .../predictors/sklearn_predictor/__init__.py  |   0
 .../sklearn_predictor/sklearn_predictor.py    |  80 -------
 .../prescriptors/heuristics/heuristics.py     |   1 -
 .../eluc/prescriptors/nsga2/create_seeds.py   |   3 +-
 .../nsga2/land_use_prescriptor.py             |   6 +-
 .../prescriptors/nsga2/train_prescriptors.py  |   3 +-
 use_cases/eluc/prescriptors/nsga2/trainer.py  |   2 +-
 use_cases/eluc/tests/test_predictors.py       | 125 -----------
 23 files changed, 26 insertions(+), 710 deletions(-)
 delete mode 100644 use_cases/eluc/data/torch_data.py
 delete mode 100644 use_cases/eluc/persistence/persistors/hf_persistor.py
 delete mode 100644 use_cases/eluc/persistence/serializers/neural_network_serializer.py
 delete mode 100644 use_cases/eluc/persistence/serializers/sklearn_serializer.py
 delete mode 100644 use_cases/eluc/predictors/neural_network/__init__.py
 delete mode 100644 use_cases/eluc/predictors/neural_network/eluc_neural_net.py
 delete mode 100644 use_cases/eluc/predictors/neural_network/neural_net_predictor.py
 delete mode 100644 use_cases/eluc/predictors/sklearn_predictor/__init__.py
 delete mode 100644 use_cases/eluc/predictors/sklearn_predictor/sklearn_predictor.py
 delete mode 100644 use_cases/eluc/tests/test_predictors.py

diff --git a/use_cases/eluc/app/components/prediction.py b/use_cases/eluc/app/components/prediction.py
index 128e002..7a58177 100644
--- a/use_cases/eluc/app/components/prediction.py
+++ b/use_cases/eluc/app/components/prediction.py
@@ -8,13 +8,13 @@
 from dash import html
 import pandas as pd
 
+from prsdk.persistence.persistors.hf_persistor import HuggingFacePersistor
+from prsdk.persistence.serializers.neural_network_serializer import NeuralNetSerializer
+from prsdk.persistence.serializers.sklearn_serializer import SKLearnSerializer
 from prsdk.predictors.predictor import Predictor
 
 from app import constants as app_constants
 from data import constants
-from persistence.persistors.hf_persistor import HuggingFacePersistor
-from persistence.serializers.neural_network_serializer import NeuralNetSerializer
-from persistence.serializers.sklearn_serializer import SKLearnSerializer
 from predictors.percent_change.percent_change_predictor import PercentChangePredictor
 
 
diff --git a/use_cases/eluc/app/components/prescription.py b/use_cases/eluc/app/components/prescription.py
index c66a529..809cce4 100644
--- a/use_cases/eluc/app/components/prescription.py
+++ b/use_cases/eluc/app/components/prescription.py
@@ -9,9 +9,10 @@
 import pandas as pd
 import plotly.graph_objects as go
 
+from prsdk.persistence.persistors.hf_persistor import HuggingFacePersistor
+
 from app import constants as app_constants
 from data import constants
-from persistence.persistors.hf_persistor import HuggingFacePersistor
 from persistence.serializers.prescriptor_serializer import PrescriptorSerializer
 from prescriptors.prescriptor_manager import PrescriptorManager
 
diff --git a/use_cases/eluc/data/torch_data.py b/use_cases/eluc/data/torch_data.py
deleted file mode 100644
index 980b616..0000000
--- a/use_cases/eluc/data/torch_data.py
+++ /dev/null
@@ -1,28 +0,0 @@
-"""
-A simple custom PyTorch dataset is created here. This is used to keep our
-datasets standard between models. It is used in both Torch prescription
-and Neural Network training.
-"""
-import numpy as np
-import torch
-from torch.utils.data.dataset import Dataset
-
-
-class TorchDataset(Dataset):
-    """
-    Simple custom torch dataset.
-    :param X: data
-    :param y: labels
-    """
-    def __init__(self, X: np.ndarray, y: np.ndarray, device="cpu"):
-        super().__init__()
-        self.X = torch.tensor(X, dtype=torch.float32, device=device)
-        self.y = torch.tensor(y, device=device)
-        assert len(self.X) == len(self.y), "X and y must have the same length"
-
-    def __len__(self):
-        return len(self.X)
-
-    def __getitem__(self, idx: int) -> tuple:
-        return self.X[idx], self.y[idx]
diff --git a/use_cases/eluc/experiments/predictor_experiments.ipynb b/use_cases/eluc/experiments/predictor_experiments.ipynb
index 1c075e1..0afbd10 100644
--- a/use_cases/eluc/experiments/predictor_experiments.ipynb
+++ b/use_cases/eluc/experiments/predictor_experiments.ipynb
@@ -26,14 +26,14 @@
     "from scipy.stats import ttest_1samp, ttest_ind\n",
     "from sklearn.metrics import mean_absolute_error\n",
     "\n",
+    "from prsdk.persistence.serializers.neural_network_serializer import NeuralNetSerializer\n",
+    "from prsdk.persistence.serializers.sklearn_serializer import SKLearnSerializer\n",
     "from prsdk.predictors.predictor import Predictor\n",
+    "from prsdk.predictors.neural_network.neural_net_predictor import NeuralNetPredictor\n",
+    "from prsdk.predictors.sklearn_predictor.sklearn_predictor import LinearRegressionPredictor, RandomForestPredictor\n",
     "\n",
     "from data.eluc_data import ELUCData\n",
-    "from data import constants\n",
-    "from persistence.serializers.neural_network_serializer import NeuralNetSerializer\n",
-    "from persistence.serializers.sklearn_serializer import SKLearnSerializer\n",
-    "from predictors.neural_network.neural_net_predictor import NeuralNetPredictor\n",
-    "from predictors.sklearn_predictor.sklearn_predictor import LinearRegressionPredictor, RandomForestPredictor"
+    "from data import constants"
    ]
   },
  {
diff --git a/use_cases/eluc/experiments/predictor_significance.py b/use_cases/eluc/experiments/predictor_significance.py
index c74f71a..d2d46de 100644
--- a/use_cases/eluc/experiments/predictor_significance.py
+++ b/use_cases/eluc/experiments/predictor_significance.py
@@ -10,11 +10,13 @@
 from tqdm import tqdm
 from sklearn.metrics import mean_absolute_error
 
+from prsdk.predictors.neural_network.neural_net_predictor import NeuralNetPredictor
+from prsdk.predictors.sklearn_predictors.linear_regression_predictor import LinearRegressionPredictor
+from prsdk.predictors.sklearn_predictors.random_forest_predictor import RandomForestPredictor
+
 from data.eluc_data import ELUCData
 from data import constants
 from data.conversion import construct_countries_df
-from predictors.neural_network.neural_net_predictor import NeuralNetPredictor
-from predictors.sklearn_predictor.sklearn_predictor import RandomForestPredictor, LinearRegressionPredictor
 
 
 def train_and_test(n: int,
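With this change the experiment notebook and significance script construct their predictors from prsdk instead of the local predictors package. A minimal sketch of the new call pattern, assuming the SDK classes keep the constructor config and fit/predict interface of the deleted local implementations quoted later in this patch (the toy frame below is illustrative, not project data):

    import pandas as pd

    from prsdk.predictors.neural_network.neural_net_predictor import NeuralNetPredictor

    # Toy stand-in data; the real experiments pull features and labels from ELUCData.
    X = pd.DataFrame({"a": [1, 2, 3, 4], "b": [4, 5, 6, 4], "c": [7, 8, 9, 4]})
    y = pd.Series([1.0, 2.0, 3.0, 4.0], name="ELUC")

    # Config keys mirror the deleted local NeuralNetPredictor shown below.
    nnp = NeuralNetPredictor({"hidden_sizes": [4], "epochs": 1, "batch_size": 2, "device": "cpu"})
    nnp.fit(X, y)
    print(nnp.predict(X))  # DataFrame indexed like X with a single "ELUC" column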
diff --git a/use_cases/eluc/experiments/prescriptor_experiments.ipynb b/use_cases/eluc/experiments/prescriptor_experiments.ipynb
index 6fb9059..1641a5c 100644
--- a/use_cases/eluc/experiments/prescriptor_experiments.ipynb
+++ b/use_cases/eluc/experiments/prescriptor_experiments.ipynb
@@ -24,14 +24,15 @@
     "import numpy as np\n",
     "from sklearn.linear_model import LinearRegression\n",
     "\n",
+    "from prsdk.predictors.neural_network.neural_net_predictor import NeuralNetPredictor\n",
+    "\n",
     "from data import constants\n",
     "from data.eluc_data import ELUCData\n",
     "from data.eluc_encoder import ELUCEncoder\n",
     "from prescriptors.nsga2.candidate import Candidate\n",
     "from prescriptors.nsga2.land_use_prescriptor import LandUsePrescriptor\n",
     "from prescriptors.prescriptor_manager import PrescriptorManager\n",
-    "from prescriptors.heuristics.heuristics import EvenHeuristic, PerfectHeuristic\n",
-    "from predictors.neural_network.neural_net_predictor import NeuralNetPredictor"
+    "from prescriptors.heuristics.heuristics import EvenHeuristic, PerfectHeuristic"
    ]
   },
  {
diff --git a/use_cases/eluc/persistence/persistors/hf_persistor.py b/use_cases/eluc/persistence/persistors/hf_persistor.py
deleted file mode 100644
index 4e90a78..0000000
--- a/use_cases/eluc/persistence/persistors/hf_persistor.py
+++ /dev/null
@@ -1,70 +0,0 @@
-"""
-Persistor for models to and from HuggingFace repo.
-"""
-from pathlib import Path
-
-from huggingface_hub import HfApi, snapshot_download
-
-from prsdk.persistence.persistors.persistor import Persistor
-
-
-class HuggingFacePersistor(Persistor):
-    """
-    Persists models to and from HuggingFace repo.
-    """
-    def write_readme(self, model_path: str):
-        """
-        Writes readme to model save path to upload.
-        TODO: Need to add more info to the readme and make it a proper template.
-        """
-        model_path = Path(model_path)
-        with open(model_path / "README.md", "w", encoding="utf-8") as file:
-            file.write("This is a demo model created for project resilience")
-
-    def persist(self, model, model_path: Path, repo_id: str, **persistence_args):
-        """
-        Serializes the model to a local path using the file_serializer,
-        then uploads the model to a HuggingFace repo.
-        """
-        # Save model and write readme
-        self.serializer.save(model, model_path)
-        self.write_readme(model_path)
-
-        # Get token if it exists
-        token = persistence_args.get("token", None)
-
-        api = HfApi()
-        # Create repo if it doesn't exist
-        api.create_repo(
-            repo_id=repo_id,
-            repo_type="model",
-            exist_ok=True,
-            token=token
-        )
-
-        # Upload model to repo
-        api.upload_folder(
-            folder_path=model_path,
-            repo_id=repo_id,
-            repo_type="model",
-            token=token
-        )
-
-    def from_pretrained(self, path_or_url: str, **hf_args):
-        """
-        Loads a model from a local path or, if it is not found, from a HuggingFace repo.
-        :param path_or_url: path to the model or url to the huggingface repo.
-        :param hf_args: arguments to pass to the snapshot_download function from huggingface.
- """ - path = Path(path_or_url) - if path.exists() and path.is_dir(): - return self.serializer.load(path) - # TODO: Need a try except block to catch download errors - url_path = path_or_url.replace("/", "--") - local_dir = hf_args.get("local_dir", f"huggingface_models/{url_path}") - - if not Path(local_dir).exists() or not Path(local_dir).is_dir(): - hf_args["local_dir"] = local_dir - snapshot_download(repo_id=path_or_url, **hf_args) - - return self.serializer.load(Path(local_dir)) diff --git a/use_cases/eluc/persistence/serializers/neural_network_serializer.py b/use_cases/eluc/persistence/serializers/neural_network_serializer.py deleted file mode 100644 index 49033d7..0000000 --- a/use_cases/eluc/persistence/serializers/neural_network_serializer.py +++ /dev/null @@ -1,75 +0,0 @@ -""" -Serializer for the Neural Network Predictor class. -""" -import json -from pathlib import Path - -import joblib -import torch - -from prsdk.persistence.serializers.serializer import Serializer - -from predictors.neural_network.eluc_neural_net import ELUCNeuralNet -from predictors.neural_network.neural_net_predictor import NeuralNetPredictor - - -class NeuralNetSerializer(Serializer): - """ - Serializer for the NeuralNetPredictor. - Saves config necessary to recreate the model, the model itself, and the scaler for the data to a folder. - """ - def save(self, model: NeuralNetPredictor, path: Path): - """ - Saves model, config, and scaler into format for loading. - Generates path to folder if it does not exist. - :param path: path to folder to save model files. - """ - if model.model is None: - raise ValueError("Model not fitted yet.") - path.mkdir(parents=True, exist_ok=True) - - config = { - "features": model.features, - "label": model.label, - "hidden_sizes": model.hidden_sizes, - "linear_skip": model.linear_skip, - "dropout": model.dropout, - "device": model.device, - "epochs": model.epochs, - "batch_size": model.batch_size, - "optim_params": model.optim_params, - "train_pct": model.train_pct, - "step_lr_params": model.step_lr_params - } - with open(path / "config.json", "w", encoding="utf-8") as file: - json.dump(config, file) - torch.save(model.model.state_dict(), path / "model.pt") - joblib.dump(model.scaler, path / "scaler.joblib") - - def load(self, path: Path) -> "NeuralNetPredictor": - """ - Loads a model from a given folder. Creates empty model with config, then loads model state dict and scaler. - :param path: path to folder containing model files. 
- """ - if not path.exists() or not path.is_dir(): - raise FileNotFoundError(f"Path {path} does not exist.") - if not (path / "config.json").exists() or \ - not (path / "model.pt").exists() or \ - not (path / "scaler.joblib").exists(): - raise FileNotFoundError("Model files not found in path.") - - # Initialize model with config - with open(path / "config.json", "r", encoding="utf-8") as file: - config = json.load(file) - - nnp = NeuralNetPredictor(config) - - nnp.model = ELUCNeuralNet(len(config["features"]), - config["hidden_sizes"], - config["linear_skip"], - config["dropout"]) - nnp.model.load_state_dict(torch.load(path / "model.pt")) - nnp.model.to(config["device"]) - nnp.model.eval() - nnp.scaler = joblib.load(path / "scaler.joblib") - return nnp diff --git a/use_cases/eluc/persistence/serializers/sklearn_serializer.py b/use_cases/eluc/persistence/serializers/sklearn_serializer.py deleted file mode 100644 index c480c66..0000000 --- a/use_cases/eluc/persistence/serializers/sklearn_serializer.py +++ /dev/null @@ -1,45 +0,0 @@ -""" -Serializer for the SKLearnPredictor class. -""" -import json -from pathlib import Path - -import joblib - -from prsdk.persistence.serializers.serializer import Serializer - -from predictors.sklearn_predictor.sklearn_predictor import SKLearnPredictor - - -class SKLearnSerializer(Serializer): - """ - Serializer for the SKLearnPredictor. - Uses joblib to save the model and json to save the config used to load it. - """ - def save(self, model: SKLearnPredictor, path: Path): - """ - Saves saves model and features into format for loading. - Generates path to folder if it does not exist. - :param path: path to folder to save model files. - """ - path.mkdir(parents=True, exist_ok=True) - with open(path / "config.json", "w", encoding="utf-8") as file: - json.dump(model.config, file) - joblib.dump(model.model, path / "model.joblib") - - def load(self, path: Path) -> "SKLearnPredictor": - """ - Loads saved model and config from a local folder. - :param path: path to folder to load model files from. - """ - load_path = Path(path) - if not load_path.exists() or not load_path.is_dir(): - raise FileNotFoundError(f"Path {path} does not exist.") - if not (load_path / "config.json").exists() or not (load_path / "model.joblib").exists(): - raise FileNotFoundError("Model files not found in path.") - - with open(load_path / "config.json", "r", encoding="utf-8") as file: - config = json.load(file) - model = joblib.load(load_path / "model.joblib") - sklearn_predictor = SKLearnPredictor(model, config) - return sklearn_predictor diff --git a/use_cases/eluc/predictors/custom/template/template_predictor.py b/use_cases/eluc/predictors/custom/template/template_predictor.py index 59f4254..d9fd59a 100644 --- a/use_cases/eluc/predictors/custom/template/template_predictor.py +++ b/use_cases/eluc/predictors/custom/template/template_predictor.py @@ -5,8 +5,6 @@ from prsdk.predictors.predictor import Predictor -from data import constants - class TemplatePredictor(Predictor): """ @@ -14,11 +12,6 @@ class TemplatePredictor(Predictor): The class that gets passed into the Evaluator should call the load method which should return a Predictor. The Predictor just needs to impelement predict. 
""" - def __init__(self): - super().__init__(context=constants.CAO_MAPPING["context"], - actions=constants.CAO_MAPPING["actions"], - outcomes=constants.CAO_MAPPING["outcomes"]) - def fit(self, X_train, y_train): pass diff --git a/use_cases/eluc/predictors/neural_network/__init__.py b/use_cases/eluc/predictors/neural_network/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/use_cases/eluc/predictors/neural_network/eluc_neural_net.py b/use_cases/eluc/predictors/neural_network/eluc_neural_net.py deleted file mode 100644 index ad9433a..0000000 --- a/use_cases/eluc/predictors/neural_network/eluc_neural_net.py +++ /dev/null @@ -1,55 +0,0 @@ -""" -Simple feed-forward neural network to be used in the Neural Network Predictor. -""" -import torch - - -class ELUCNeuralNet(torch.nn.Module): - """ - Custom torch neural network module. - :param in_size: number of input features - :param hidden_sizes: list of hidden layer sizes - :param linear_skip: whether to concatenate input to hidden layer output - :param dropout: dropout probability - """ - class EncBlock(torch.nn.Module): - """ - Encoding block for neural network. - Simple feed forward layer with ReLU activation and optional dropout. - """ - def __init__(self, in_size: int, out_size: int, dropout: float): - super().__init__() - self.model = torch.nn.Sequential( - torch.nn.Linear(in_size, out_size), - torch.nn.ReLU(), - torch.nn.Dropout(p=dropout) - ) - - def forward(self, X: torch.FloatTensor) -> torch.FloatTensor: - """ - Passes input through the block. - """ - return self.model(X) - - def __init__(self, in_size: int, hidden_sizes: list[str], linear_skip: bool, dropout: float): - super().__init__() - self.linear_skip = linear_skip - hidden_sizes = [in_size] + hidden_sizes - enc_blocks = [self.EncBlock(hidden_sizes[i], hidden_sizes[i+1], dropout) for i in range(len(hidden_sizes) - 1)] - self.enc = torch.nn.Sequential(*enc_blocks) - # If we are using linear skip, we concatenate the input to the output of the hidden layers - out_size = hidden_sizes[-1] + in_size if linear_skip else hidden_sizes[-1] - self.linear = torch.nn.Linear(out_size, 1) - - def forward(self, X: torch.FloatTensor) -> torch.FloatTensor: - """ - Performs a forward pass of the neural net. - If linear_skip is True, we concatenate the input to the output of the hidden layers. - :param X: input data - :return: output of the neural net - """ - hid = self.enc(X) - if self.linear_skip: - hid = torch.concatenate([hid, X], dim=1) - out = self.linear(hid) - return out diff --git a/use_cases/eluc/predictors/neural_network/neural_net_predictor.py b/use_cases/eluc/predictors/neural_network/neural_net_predictor.py deleted file mode 100644 index 5a3bfd1..0000000 --- a/use_cases/eluc/predictors/neural_network/neural_net_predictor.py +++ /dev/null @@ -1,201 +0,0 @@ -""" -Implementation of predictor.py using a simple feed-forward NeuralNetwork -implemented in PyTorch. -""" -import copy -import time - -import numpy as np -import pandas as pd -from sklearn.preprocessing import StandardScaler -from tqdm import tqdm - -import torch -from torch.utils.data import DataLoader -from torch.utils.tensorboard import SummaryWriter - -from prsdk.predictors.predictor import Predictor - -from data import constants -from data.torch_data import TorchDataset -from predictors.neural_network.eluc_neural_net import ELUCNeuralNet - - -class NeuralNetPredictor(Predictor): - """ - Simple feed-forward neural network predictor implemented in PyTorch. 
diff --git a/use_cases/eluc/predictors/neural_network/neural_net_predictor.py b/use_cases/eluc/predictors/neural_network/neural_net_predictor.py
deleted file mode 100644
index 5a3bfd1..0000000
--- a/use_cases/eluc/predictors/neural_network/neural_net_predictor.py
+++ /dev/null
@@ -1,201 +0,0 @@
-"""
-Implementation of predictor.py using a simple feed-forward NeuralNetwork
-implemented in PyTorch.
-"""
-import copy
-import time
-
-import numpy as np
-import pandas as pd
-from sklearn.preprocessing import StandardScaler
-from tqdm import tqdm
-
-import torch
-from torch.utils.data import DataLoader
-from torch.utils.tensorboard import SummaryWriter
-
-from prsdk.predictors.predictor import Predictor
-
-from data import constants
-from data.torch_data import TorchDataset
-from predictors.neural_network.eluc_neural_net import ELUCNeuralNet
-
-
-class NeuralNetPredictor(Predictor):
-    """
-    Simple feed-forward neural network predictor implemented in PyTorch.
-    Has the option to use a wide-and-deep architecture, concatenating the input to the output of the hidden layers
-    in order to take advantage of the linear relationships in the data.
-    Data is automatically standardized and the scaler is saved with the model.
-    """
-    def __init__(self, model_config: dict):
-        """
-        Model config should contain the following:
-        features: list of features to use in the model (optional, defaults to all context + actions)
-        label: name of the label column (optional, defaults to passed label in fit)
-        hidden_sizes: list of hidden layer sizes
-        linear_skip: whether to concatenate input to hidden layer output
-        dropout: dropout probability
-        device: device to run the model on
-        epochs: number of epochs to train for
-        batch_size: batch size for training
-        optim_params: dictionary of parameters to pass to the optimizer
-        train_pct: percentage of training data to use
-        step_lr_params: dictionary of parameters to pass to the step learning rate scheduler
-        """
-        super().__init__(constants.CAO_MAPPING["context"], constants.CAO_MAPPING["actions"], ["ELUC"])
-        self.features = model_config.get("features", None)
-        self.label = model_config.get("label", None)
-
-        self.hidden_sizes = model_config.get("hidden_sizes", [4096])
-        self.linear_skip = model_config.get("linear_skip", True)
-        self.dropout = model_config.get("dropout", 0)
-        self.device = model_config.get("device", "cpu")
-        self.epochs = model_config.get("epochs", 3)
-        self.batch_size = model_config.get("batch_size", 2048)
-        self.optim_params = model_config.get("optim_params", {})
-        self.train_pct = model_config.get("train_pct", 1)
-        self.step_lr_params = model_config.get("step_lr_params", {"step_size": 1, "gamma": 0.1})
-
-        self.model = None
-        self.scaler = StandardScaler()
-
-    def fit(self, X_train: pd.DataFrame, y_train: pd.Series,
-            X_val=None, y_val=None,
-            X_test=None, y_test=None,
-            log_path=None, verbose=False) -> dict:
-        """
-        Fits neural network to given data using predefined parameters and hyperparameters.
-        If no features were specified we use all the columns in X_train.
-        We scale based on the training data and apply it to validation and test data.
-        AdamW optimizer is used with L1 loss.
-        :param X_train: training data, may be unscaled and have excess features.
-        :param y_train: training labels.
-        :param X_val: validation data, may be unscaled and have excess features.
-        :param y_val: validation labels.
-        :param X_test: test data, may be unscaled and have excess features.
-        :param y_test: test labels.
-        :param log_path: path to log training data to tensorboard.
-        :param verbose: whether to print progress bars.
-        :return: dictionary of results from training containing time taken, best epoch, best loss,
-        and test loss if applicable.
- """ - if not self.features: - self.features = X_train.columns.tolist() - self.label = y_train.name - - self.model = ELUCNeuralNet(len(self.features), self.hidden_sizes, self.linear_skip, self.dropout) - self.model.to(self.device) - self.model.train() - - start = time.time() - - # Set up train set - X_train = self.scaler.fit_transform(X_train[self.features]) - y_train = y_train.values - train_ds = TorchDataset(X_train, y_train) - sampler = torch.utils.data.RandomSampler(train_ds, num_samples=int(len(train_ds) * self.train_pct)) - train_dl = DataLoader(train_ds, self.batch_size, sampler=sampler) - - # If we pass in a validation set, use them - if X_val is not None and y_val is not None: - X_val = self.scaler.transform(X_val[self.features]) - y_val = y_val.values - val_ds = TorchDataset(X_val, y_val) - val_dl = DataLoader(val_ds, self.batch_size, shuffle=False) - - # Optimization parameters - optimizer = torch.optim.AdamW(self.model.parameters(), **self.optim_params) - loss_fn = torch.nn.L1Loss() - if self.step_lr_params: - scheduler = torch.optim.lr_scheduler.StepLR(optimizer, **self.step_lr_params) - - if log_path: - writer = SummaryWriter(log_path) - - # Keeping track of best performance for validation - result_dict = {} - best_model = None - best_loss = np.inf - end = 0 - - step = 0 - for epoch in range(self.epochs): - self.model.train() - # Standard training loop - train_iter = tqdm(train_dl) if verbose else train_dl - for X, y in train_iter: - X, y = X.to(self.device), y.to(self.device) - optimizer.zero_grad() - out = self.model(X) - loss = loss_fn(out.squeeze(), y.squeeze()) - if log_path: - writer.add_scalar("loss", loss.item(), step) - step += 1 - loss.backward() - optimizer.step() - - # LR Decay - if self.step_lr_params: - scheduler.step() - - # Evaluate epoch - if X_val is not None and y_val is not None: - total = 0 - self.model.eval() - with torch.no_grad(): - for X, y in tqdm(val_dl): - X, y = X.to(self.device), y.to(self.device) - out = self.model(X) - loss = loss_fn(out.squeeze(), y.squeeze()) - total += loss.item() * y.shape[0] - - if log_path: - writer.add_scalar("val_loss", total / len(val_ds), step) - - if total < best_loss: - best_model = copy.deepcopy(self.model.state_dict()) - best_loss = total - end = time.time() - result_dict["best_epoch"] = epoch - result_dict["best_loss"] = total / len(val_ds) - result_dict["time"] = end - start - - print(f"epoch {epoch} mae {total / len(val_ds)}") - - if best_model: - self.model.load_state_dict(best_model) - else: - end = time.time() - result_dict["time"] = end - start - - # If we provide a test dataset - if X_test is not None and y_test is not None: - y_pred = self.predict(X_test) - y_true = y_test.values - mae = np.mean(np.abs(y_pred - y_true)) - result_dict["test_loss"] = mae - - return result_dict - - def predict(self, context_actions_df: pd.DataFrame) -> pd.DataFrame: - """ - Generates prediction from model for given test data. - :param context_actions_df: test data to predict on. - :return: DataFrame of predictions properly labeled. 
- """ - X_test_scaled = self.scaler.transform(context_actions_df[self.features]) - test_ds = TorchDataset(X_test_scaled, np.zeros(len(X_test_scaled))) - test_dl = DataLoader(test_ds, self.batch_size, shuffle=False) - pred_list = [] - with torch.no_grad(): - self.model.eval() - for X, _ in test_dl: - X = X.to(self.device) - pred_list.append(self.model(X)) - - if len(pred_list) > 1: - y_pred = torch.concatenate(pred_list, dim=0).cpu().numpy() - else: - y_pred = pred_list[0].cpu().numpy() - return pd.DataFrame(y_pred, index=context_actions_df.index, columns=[self.label]) diff --git a/use_cases/eluc/predictors/percent_change/percent_change_predictor.py b/use_cases/eluc/predictors/percent_change/percent_change_predictor.py index 8e7082c..98cee81 100644 --- a/use_cases/eluc/predictors/percent_change/percent_change_predictor.py +++ b/use_cases/eluc/predictors/percent_change/percent_change_predictor.py @@ -12,9 +12,6 @@ class PercentChangePredictor(Predictor): """ Heuristic that calculates the percent change of land use from actions and context. """ - def __init__(self): - super().__init__(constants.CAO_MAPPING["context"], constants.CAO_MAPPING["actions"], ["change"]) - def fit(self, X_train: pd.DataFrame, y_train: pd.Series): """ No fitting required for this model. diff --git a/use_cases/eluc/predictors/scoring/scorer.py b/use_cases/eluc/predictors/scoring/scorer.py index 81bb33a..d87978f 100644 --- a/use_cases/eluc/predictors/scoring/scorer.py +++ b/use_cases/eluc/predictors/scoring/scorer.py @@ -8,11 +8,11 @@ import pandas as pd +from prsdk.persistence.persistors.hf_persistor import HuggingFacePersistor from prsdk.predictors.predictor import Predictor import data.constants as constants from data.eluc_data import ELUCData -from persistence.persistors.hf_persistor import HuggingFacePersistor from predictors.scoring.validator import Validator diff --git a/use_cases/eluc/predictors/sklearn_predictor/__init__.py b/use_cases/eluc/predictors/sklearn_predictor/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/use_cases/eluc/predictors/sklearn_predictor/sklearn_predictor.py b/use_cases/eluc/predictors/sklearn_predictor/sklearn_predictor.py deleted file mode 100644 index 1213dc1..0000000 --- a/use_cases/eluc/predictors/sklearn_predictor/sklearn_predictor.py +++ /dev/null @@ -1,80 +0,0 @@ -""" -Abstract SKLearn predictor and its implementations. -Since the SKLearn library is standardized we can easily make more. -""" -from abc import ABC - -import pandas as pd -from sklearn.linear_model import LinearRegression -from sklearn.ensemble import RandomForestRegressor - -from prsdk.predictors.predictor import Predictor - -from data import constants - - -class SKLearnPredictor(Predictor, ABC): - """ - Simple abstract class for sklearn predictors. - Keeps track of features fit on and label to predict. - """ - def __init__(self, model, model_config: dict): - """ - Model config contains the following: - features: list of features to use for prediction (optional, defaults to all features) - label: name of the label to predict (optional, defaults to passed label during fit) - Any other parameters are passed to the model. - """ - super().__init__(constants.CAO_MAPPING["context"], constants.CAO_MAPPING["actions"], ["ELUC"]) - self.config = model_config - self.model = model - - def fit(self, X_train: pd.DataFrame, y_train: pd.Series): - """ - Fits SKLearn model with standard sklearn fit method. - If we passed in features, use those. Otherwise use all columns. 
diff --git a/use_cases/eluc/prescriptors/heuristics/heuristics.py b/use_cases/eluc/prescriptors/heuristics/heuristics.py
index 9b0ca52..dc9ef0e 100644
--- a/use_cases/eluc/prescriptors/heuristics/heuristics.py
+++ b/use_cases/eluc/prescriptors/heuristics/heuristics.py
@@ -18,7 +18,6 @@ class HeuristicPrescriptor(Prescriptor, ABC):
     recommendations based on the heuristic.
     """
     def __init__(self, pct: float):
-        super().__init__(constants.CAO_MAPPING["context"], constants.CAO_MAPPING["actions"])
         self.pct = pct
 
     @abstractmethod
diff --git a/use_cases/eluc/prescriptors/nsga2/create_seeds.py b/use_cases/eluc/prescriptors/nsga2/create_seeds.py
index f707679..d5137ae 100644
--- a/use_cases/eluc/prescriptors/nsga2/create_seeds.py
+++ b/use_cases/eluc/prescriptors/nsga2/create_seeds.py
@@ -8,9 +8,10 @@
 from torch.utils.data import DataLoader, random_split
 from tqdm import tqdm
 
+from prsdk.data.torch_data import TorchDataset
+
 from data import constants
 from data.eluc_data import ELUCData
-from data.torch_data import TorchDataset
 from prescriptors.nsga2.candidate import Candidate
 
diff --git a/use_cases/eluc/prescriptors/nsga2/land_use_prescriptor.py b/use_cases/eluc/prescriptors/nsga2/land_use_prescriptor.py
index b673d82..c56f87a 100644
--- a/use_cases/eluc/prescriptors/nsga2/land_use_prescriptor.py
+++ b/use_cases/eluc/prescriptors/nsga2/land_use_prescriptor.py
@@ -6,11 +6,11 @@
 import torch
 from torch.utils.data import DataLoader
 
+from prsdk.data.torch_data import TorchDataset
 from prsdk.prescriptors.prescriptor import Prescriptor
 
 from data import constants
 from data.eluc_data import ELUCEncoder
-from data.torch_data import TorchDataset
 from prescriptors.nsga2.candidate import Candidate
 
 
@@ -20,7 +20,6 @@ class LandUsePrescriptor(Prescriptor):
     evolution using NSGA-II.
""" def __init__(self, candidate: Candidate, encoder: ELUCEncoder, batch_size: int = 4096): - super().__init__(constants.CAO_MAPPING["context"], constants.CAO_MAPPING["actions"]) self.candidate = candidate self.encoder = encoder self.batch_size = batch_size @@ -47,7 +46,8 @@ def _reco_to_context_actions(self, reco_df: pd.DataFrame, context_df: pd.DataFra presc_actions_df = reco_df - context_df[constants.RECO_COLS] presc_actions_df = presc_actions_df.rename(constants.RECO_MAP, axis=1) presc_actions_df[constants.NO_CHANGE_COLS] = 0 - context_actions_df = pd.concat([context_df[self.context], presc_actions_df[self.actions]], axis=1) + context_actions_df = pd.concat([context_df[constants.CAO_MAPPING["context"]], + presc_actions_df[constants.CAO_MAPPING["actions"]]], axis=1) return context_actions_df def prescribe(self, context_df) -> pd.DataFrame: diff --git a/use_cases/eluc/prescriptors/nsga2/train_prescriptors.py b/use_cases/eluc/prescriptors/nsga2/train_prescriptors.py index 01e38d8..9fb5240 100644 --- a/use_cases/eluc/prescriptors/nsga2/train_prescriptors.py +++ b/use_cases/eluc/prescriptors/nsga2/train_prescriptors.py @@ -7,9 +7,10 @@ import json from pathlib import Path +from prsdk.persistence.serializers.neural_network_serializer import NeuralNetSerializer + from data.eluc_data import ELUCData from data.eluc_encoder import ELUCEncoder -from persistence.serializers.neural_network_serializer import NeuralNetSerializer from prescriptors.nsga2.trainer import TorchTrainer from predictors.percent_change.percent_change_predictor import PercentChangePredictor diff --git a/use_cases/eluc/prescriptors/nsga2/trainer.py b/use_cases/eluc/prescriptors/nsga2/trainer.py index 9c0a889..2a46db6 100644 --- a/use_cases/eluc/prescriptors/nsga2/trainer.py +++ b/use_cases/eluc/prescriptors/nsga2/trainer.py @@ -10,11 +10,11 @@ import torch from torch.utils.data import DataLoader +from prsdk.data.torch_data import TorchDataset from prsdk.predictors.predictor import Predictor from data import constants from data.eluc_data import ELUCEncoder -from data.torch_data import TorchDataset from prescriptors.nsga2 import nsga2_utils from prescriptors.nsga2.candidate import Candidate from prescriptors.nsga2.land_use_prescriptor import LandUsePrescriptor diff --git a/use_cases/eluc/tests/test_predictors.py b/use_cases/eluc/tests/test_predictors.py deleted file mode 100644 index be26f45..0000000 --- a/use_cases/eluc/tests/test_predictors.py +++ /dev/null @@ -1,125 +0,0 @@ -""" -Unit tests for the predictors. -""" -import unittest -import shutil -from pathlib import Path - -import pandas as pd - -from persistence.serializers.neural_network_serializer import NeuralNetSerializer -from persistence.serializers.sklearn_serializer import SKLearnSerializer -from predictors.neural_network.neural_net_predictor import NeuralNetPredictor -from predictors.sklearn_predictor.sklearn_predictor import LinearRegressionPredictor, RandomForestPredictor - - -class TestPredictors(unittest.TestCase): - """ - Tests the 3 base predictor implementations' saving and loading behavior. - """ - def setUp(self): - """ - We set the models up like this so that in test_loaded_same we can instantiate - 2 models with the same parameters, load one from the other's save, and check if - their predictions are the same. 
- """ - self.models = [ - NeuralNetPredictor, - LinearRegressionPredictor, - RandomForestPredictor - ] - self.serializers = [ - NeuralNetSerializer(), - SKLearnSerializer(), - SKLearnSerializer() - ] - self.configs = [ - {'hidden_sizes': [4], 'epochs': 1, 'batch_size': 1, 'device': 'cpu'}, - {'n_jobs': -1}, - {'n_jobs': -1, "n_estimators": 10, "max_depth": 2} - ] - self.dummy_data = pd.DataFrame({"a": [1, 2, 3, 4], "b": [4, 5, 6, 4], "c": [7, 8, 9, 4]}) - self.dummy_target = pd.Series([1, 2, 3, 4], name="label") - self.temp_path = Path("tests/temp") - - def test_save_file_names(self): - """ - Checks to make sure the model's save method creates the correct files. - """ - save_file_names = [ - ["model.pt", "config.json", "scaler.joblib"], - ["model.joblib", "config.json"], - ["model.joblib", "config.json"] - ] - for model, serializer, config, test_names in zip(self.models, self.serializers, self.configs, save_file_names): - with self.subTest(model=model): - predictor = model(config) - predictor.fit(self.dummy_data, self.dummy_target) - serializer.save(predictor, self.temp_path) - files = [f.name for f in self.temp_path.glob("**/*") if f.is_file()] - self.assertEqual(set(files), set(test_names)) - shutil.rmtree(self.temp_path) - self.assertFalse(self.temp_path.exists()) - - def test_loaded_same(self): - """ - Makes sure a predictor's predictions are consistent before and after saving/loading. - Fits a predictor then saves and loads it, then checks if the predictions are the same. - """ - - for model, serializer, config in zip(self.models, self.serializers, self.configs): - with self.subTest(model=model): - predictor = model(config) - predictor.fit(self.dummy_data.iloc[:2], self.dummy_target.iloc[:2]) - output = predictor.predict(self.dummy_data.iloc[2:]) - serializer.save(predictor, self.temp_path) - - loaded = serializer.load(self.temp_path) - loaded_output = loaded.predict(self.dummy_data.iloc[2:]) - - self.assertTrue((output == loaded_output).all().all()) - shutil.rmtree(self.temp_path) - self.assertFalse(self.temp_path.exists()) - - def tearDown(self): - """ - Removes the temp directory if it exists. - """ - if self.temp_path.exists(): - shutil.rmtree(self.temp_path) - - -class TestNeuralNet(unittest.TestCase): - """ - Specifically tests the neural net predictor - """ - - def test_single_input(self): - """ - Tests the neural net with a single input. - """ - predictor = NeuralNetPredictor({"hidden_sizes": [4], "epochs": 1, "batch_size": 1, "device": "cpu"}) - - train_data = pd.DataFrame({"a": [1], "b": [2], "c": [3], "label": [4]}) - test_data = pd.DataFrame({"a": [4], "b": [5], "c": [6]}) - - predictor.fit(train_data[['a', 'b', 'c']], train_data['label']) - out = predictor.predict(test_data) - self.assertEqual(out.shape, (1, 1)) - - def test_multi_input(self): - """ - Tests the neural net with multiple inputs. - """ - predictor = NeuralNetPredictor({"hidden_sizes": [4], "epochs": 1, "batch_size": 1, "device": "cpu"}) - - train_data = pd.DataFrame({"a": [1, 2], "b": [2, 3], "c": [3, 4], "label": [4, 5]}) - test_data = pd.DataFrame({"a": [4, 5], "b": [5, 6], "c": [6, 7]}) - - predictor.fit(train_data[['a', 'b', 'c']], train_data['label']) - out = predictor.predict(test_data) - self.assertEqual(out.shape, (2, 1)) - - -if __name__ == "__main__": - unittest.main()