Merge branch '0.1.0' into frrd-80
Eve-ning authored Jun 4, 2024
2 parents dea588f + 226b0d9 commit 49fdf24
Showing 9 changed files with 105 additions and 120 deletions.
3 changes: 2 additions & 1 deletion .github/workflows/model-tests.yml
@@ -85,7 +85,8 @@ jobs:
working-directory: ${{ github.workspace }}/tests
run: |
git config --global --add safe.directory /__w/FRDC-ML/FRDC-ML
python3 -m model_tests.chestnut_dec_may.train
python3 -m model_tests.chestnut_dec_may.train_mixmatch
python3 -m model_tests.chestnut_dec_may.train_fixmatch
- name: Comment results via CML
run: |
1 change: 0 additions & 1 deletion src/frdc/load/label_studio.py
@@ -8,7 +8,6 @@

from frdc.conf import LABEL_STUDIO_CLIENT


logger = logging.getLogger(__name__)


34 changes: 15 additions & 19 deletions src/frdc/train/fixmatch_module.py
@@ -92,12 +92,12 @@ def training_step(self, batch, batch_idx):
Loss: ℓ_lbl + ℓ_unl
"""

def training_step(self, batch, batch_idx):
(x_lbl, y_lbl), x_unls = batch
opt = self.optimizers()

# Backprop for labelled data
opt.zero_grad()
(x_lbl, y_lbl), x_unls = batch
loss_lbl = F.cross_entropy((y_lbl_pred := self(x_lbl)), y_lbl.long())
self.manual_backward(loss_lbl)
opt.step()
@@ -174,7 +174,9 @@ def training_step(self, batch, batch_idx):
)

def validation_step(self, batch, batch_idx):
x, y = batch
# The batch outputs x_unls due to our on_before_batch_transfer
(x, y), _x_unls = batch
wandb.log({"val/y_lbl": wandb_hist(y, self.n_classes)})
y_pred = self(x)
loss = F.cross_entropy(y_pred, y.long())
acc = accuracy(
@@ -194,7 +196,8 @@ def validation_step(self, batch, batch_idx):
return loss

def test_step(self, batch, batch_idx):
x, y = batch
# The batch outputs x_unls due to our on_before_batch_transfer
(x, y), _x_unls = batch
y_pred = self(x)
loss = F.cross_entropy(y_pred, y.long())

@@ -206,7 +209,7 @@ def test_step(self, batch, batch_idx):
return loss

def predict_step(self, batch, *args, **kwargs) -> Any:
x, y = batch
(x, y), _x_unls = batch
y_pred = self(x)
y_true_str = self.y_encoder.inverse_transform(
y.cpu().numpy().reshape(-1, 1)
@@ -230,23 +233,16 @@ def on_before_batch_transfer(self, batch: Any, dataloader_idx: int) -> Any:
want to export the model alongside the transformations.
"""

# We need to handle the train and val dataloaders differently.
# For training, the unlabelled data is returned while for validation,
# the unlabelled data is just omitted.
if self.training:
(x_lab, y), x_unl = batch
(x_lbl, y_lbl), x_unl = batch
else:
x_lab, y = batch
x_unl = []
x_lbl, y_lbl = batch
x_unl = None

(x_lab_trans, y_trans), x_unl_trans = preprocess(
x_lab=x_lab,
y_lab=y,
x_unl=x_unl,
return preprocess(
x_lbl=x_lbl,
y_lbl=y_lbl,
x_scaler=self.x_scaler,
y_encoder=self.y_encoder,
x_unl=x_unl,
)
if self.training:
return (x_lab_trans, y_trans), x_unl_trans
else:
return x_lab_trans, y_trans
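With this change, on_before_batch_transfer returns the same ((x, y), x_unl) structure for every stage, so validation_step, test_step and predict_step all unpack the batch the same way. A rough illustration of the contract (shapes and values are made up, not the frdc API):

import torch

x_lbl = torch.randn(4, 3, 32, 32)     # hypothetical labelled inputs
y_lbl = torch.randint(0, 10, (4,))    # hypothetical encoded labels
x_unl = [torch.randn(4, 3, 32, 32)]   # training only: a list of unlabelled views

train_batch = (x_lbl, y_lbl), x_unl   # what training_step unpacks
eval_batch = (x_lbl, y_lbl), []       # x_unl comes back empty outside training
(x, y), _x_unls = eval_batch          # matches validation/test/predict_step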
40 changes: 16 additions & 24 deletions src/frdc/train/frdc_datamodule.py
@@ -66,53 +66,45 @@ class FRDCDataModule(LightningDataModule):
def __post_init__(self):
super().__init__()

# This provides a failsafe interface if somehow someone used the
# labelled dataset as the unlabelled dataset.
if isinstance(self.train_unl_ds, FRDCDataset):
self.train_unl_ds.__class__ = FRDCUnlabelledDataset

def train_dataloader(self):
num_samples = self.batch_size * self.train_iters
n_samples = self.batch_size * self.train_iters
if self.sampling_strategy == "stratified":
sampler = lambda ds: RandomStratifiedSampler(
ds.targets, num_samples=num_samples, replacement=True
sampler_fn = lambda ds: RandomStratifiedSampler(
ds.targets,
num_samples=n_samples,
)
elif self.sampling_strategy == "random":
sampler = lambda ds: RandomSampler(
ds, num_samples=num_samples, replacement=True
sampler_fn = lambda ds: RandomSampler(
ds,
num_samples=n_samples,
)
else:
raise ValueError(
f"Invalid sampling strategy: {self.sampling_strategy}"
)
raise ValueError(f"Invalid strategy: {self.sampling_strategy}")

lab_dl = DataLoader(
self.train_lab_ds,
batch_size=self.batch_size,
sampler=sampler(self.train_lab_ds),
sampler=sampler_fn(self.train_lab_ds),
)
unl_dl = (
DataLoader(
self.train_unl_ds,
batch_size=self.batch_size,
sampler=sampler(self.train_unl_ds),
sampler=sampler_fn(self.train_unl_ds),
)
if self.train_unl_ds is not None
# This is a hacky way to create an empty dataloader.
# The size should be the same as the labelled dataloader so that
# the iterator doesn't prematurely stop.
else DataLoader(
empty := [[] for _ in range(len(self.train_lab_ds))],
batch_size=self.batch_size,
sampler=RandomSampler(
empty,
num_samples=num_samples,
),
)
# The size should be the same or larger than the
# labelled dataloader so the iterator doesn't prematurely stop.
else DataLoader([[] for _ in range(len(lab_dl))])
)

return [lab_dl, unl_dl]

def val_dataloader(self):
return DataLoader(
self.val_ds,
batch_size=self.batch_size,
)
return DataLoader(self.val_ds, batch_size=self.batch_size)
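The placeholder DataLoader above exists so that training with no unlabelled dataset still yields a pair of loaders of equal length. A self-contained sketch (plain torch, not the frdc API) of why the placeholder must not be shorter than the labelled loader when the two are consumed zip-style:

from torch.utils.data import DataLoader

lab_dl = DataLoader(list(range(8)), batch_size=2)      # 4 labelled batches
unl_dl = DataLoader([[] for _ in range(len(lab_dl))])  # 4 empty batches

for lab, unl in zip(lab_dl, unl_dl):
    pass  # all 4 labelled batches are visited; a shorter unl_dl would end the loop early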
30 changes: 10 additions & 20 deletions src/frdc/train/mixmatch_module.py
@@ -3,7 +3,6 @@
from abc import abstractmethod
from typing import Any

import numpy as np
import torch
import torch.nn.functional as F
import wandb
@@ -16,8 +15,6 @@
mix_up,
sharpen,
wandb_hist,
x_standard_scale,
y_encode,
preprocess,
)

@@ -194,7 +191,7 @@ def on_after_backward(self) -> None:
self.update_ema()

def validation_step(self, batch, batch_idx):
x, y = batch
(x, y), _x_unls = batch
wandb.log({"val/y_lbl": wandb_hist(y, self.n_classes)})
y_pred = self.ema_model(x)
wandb.log(
@@ -214,7 +211,7 @@ def validation_step(self, batch, batch_idx):
return loss

def test_step(self, batch, batch_idx):
x, y = batch
(x, y), _x_unls = batch
y_pred = self.ema_model(x)
loss = F.cross_entropy(y_pred, y.long())

@@ -226,7 +223,7 @@ def test_step(self, batch, batch_idx):
return loss

def predict_step(self, batch, *args, **kwargs) -> Any:
x, y = batch
(x, y), _x_unls = batch
y_pred = self.ema_model(x)
y_true_str = self.y_encoder.inverse_transform(
y.cpu().numpy().reshape(-1, 1)
@@ -250,23 +247,16 @@ def on_before_batch_transfer(self, batch: Any, dataloader_idx: int) -> Any:
want to export the model alongside the transformations.
"""

# We need to handle the train and val dataloaders differently.
# For training, the unlabelled data is returned while for validation,
# the unlabelled data is just omitted.
if self.training:
(x_lab, y), x_unl = batch
(x_lbl, y_lbl), x_unl = batch
else:
x_lab, y = batch
x_unl = []
x_lbl, y_lbl = batch
x_unl = None

(x_lab_trans, y_trans), x_unl_trans = preprocess(
x_lab=x_lab,
y_lab=y,
x_unl=x_unl,
return preprocess(
x_lbl=x_lbl,
y_lbl=y_lbl,
x_scaler=self.x_scaler,
y_encoder=self.y_encoder,
x_unl=x_unl,
)
if self.training:
return (x_lab_trans, y_trans), x_unl_trans
else:
return x_lab_trans, y_trans
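As with FixMatch above, the batch contract is now uniform across stages. Note that MixMatch evaluates with self.ema_model, which update_ema refreshes after every backward pass (see on_after_backward). A generic sketch of such an EMA update, with the decay value purely assumed rather than taken from the frdc code:

import torch

@torch.no_grad()
def update_ema(model, ema_model, decay=0.999):  # decay is an assumed value
    # Keep an exponential moving average of the raw model's weights.
    for p, ema_p in zip(model.parameters(), ema_model.parameters()):
        ema_p.mul_(decay).add_(p, alpha=1.0 - decay)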
103 changes: 55 additions & 48 deletions src/frdc/train/utils.py
@@ -52,6 +52,60 @@ def sharpen(y: torch.Tensor, temp: float) -> torch.Tensor:
return y_sharp


def preprocess(
x_lbl: torch.Tensor,
y_lbl: torch.Tensor,
x_scaler: StandardScaler,
y_encoder: OrdinalEncoder,
x_unl: list[torch.Tensor] = None,
) -> tuple[tuple[torch.Tensor, torch.Tensor], list[torch.Tensor]]:
"""Preprocesses the data
Notes:
The reason why x and y's preprocessing is coupled is due to the NaN
elimination step. The NaN elimination step is due to unseen labels by y
fn_recursive is to recursively apply some function to a nested list.
This happens due to unlabelled being a list of tensors.
Args:
x_lbl: The data to preprocess.
y_lbl: The labels to preprocess.
x_scaler: The StandardScaler to use.
y_encoder: The OrdinalEncoder to use.
Returns:
The preprocessed data and labels.
"""

x_unl = [] if x_unl is None else x_unl

x_lbl_trans = x_standard_scale(x_scaler, x_lbl)
y_trans = y_encode(y_encoder, y_lbl)
x_unl_trans = fn_recursive(
x_unl,
fn=lambda x: x_standard_scale(x_scaler, x),
type_atom=torch.Tensor,
type_list=list,
)

# Remove nan values from the batch
# Ordinal Encoders can return a np.nan if the value is not in the
# categories. We will remove that from the batch.
nan = ~torch.isnan(y_trans)
x_lbl_trans = x_lbl_trans[nan]
x_lbl_trans = torch.nan_to_num(x_lbl_trans)
x_unl_trans = fn_recursive(
x_unl_trans,
fn=lambda x: torch.nan_to_num(x[nan]),
type_atom=torch.Tensor,
type_list=list,
)
y_trans = y_trans[nan]

return (x_lbl_trans, y_trans.long()), x_unl_trans
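For context, a minimal, self-contained sketch of the NaN-elimination idea (plain sklearn/torch, not the frdc API; the encoder settings are an assumption about how y_encoder is fitted):

import numpy as np
import torch
from sklearn.preprocessing import OrdinalEncoder

enc = OrdinalEncoder(
    handle_unknown="use_encoded_value", unknown_value=np.nan
).fit(np.array([["oak"], ["teak"]]))

y = np.array([["oak"], ["maple"], ["teak"]])           # "maple" was never seen
y_trans = torch.from_numpy(enc.transform(y)[..., 0])   # tensor([0., nan, 1.])
x = torch.randn(3, 4)

keep = ~torch.isnan(y_trans)         # mask of rows with known labels
x, y_trans = x[keep], y_trans[keep]  # x and y drop the same rows together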


def x_standard_scale(
x_scaler: StandardScaler, x: torch.Tensor
) -> torch.Tensor:
@@ -92,57 +146,10 @@ def y_encode(y_encoder: OrdinalEncoder, y: torch.Tensor) -> torch.Tensor:
y: The labels to encode.
"""
return torch.from_numpy(
y_encoder.transform(np.array(y).reshape(-1, 1)).squeeze()
y_encoder.transform(np.array(y).reshape(-1, 1))[..., 0]
)
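The switch from .squeeze() to [..., 0] presumably keeps a single-sample batch one-dimensional; a quick illustration with made-up values:

import numpy as np

out = np.array([[2.0]])   # encoder output for a batch of one
out.squeeze().shape       # (), squeeze collapses it to a 0-d scalar
out[..., 0].shape         # (1,), indexing keeps the batch dimension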


def preprocess(
x_lab: torch.Tensor,
y_lab: torch.Tensor,
x_scaler: StandardScaler,
y_encoder: OrdinalEncoder,
x_unl: list[torch.Tensor] = None,
) -> tuple[tuple[torch.Tensor, torch.Tensor], list[torch.Tensor]]:
"""Preprocesses the data
Args:
x_lab: The data to preprocess.
y_lab: The labels to preprocess.
x_scaler: The StandardScaler to use.
y_encoder: The OrdinalEncoder to use.
Returns:
The preprocessed data and labels.
"""

x_unl = [] if x_unl is None else x_unl

x_lab_trans = x_standard_scale(x_scaler, x_lab)
y_trans = y_encode(y_encoder, y_lab)
x_unl_trans = fn_recursive(
x_unl,
fn=lambda x: x_standard_scale(x_scaler, x),
type_atom=torch.Tensor,
type_list=list,
)

# Remove nan values from the batch
# Ordinal Encoders can return a np.nan if the value is not in the
# categories. We will remove that from the batch.
nan = ~torch.isnan(y_trans)
x_lab_trans = x_lab_trans[nan]
x_lab_trans = torch.nan_to_num(x_lab_trans)
x_unl_trans = fn_recursive(
x_unl_trans,
fn=lambda x: torch.nan_to_num(x[nan]),
type_atom=torch.Tensor,
type_list=list,
)
y_trans = y_trans[nan]

return (x_lab_trans, y_trans.long()), x_unl_trans


def wandb_hist(x: torch.Tensor, num_bins: int) -> wandb.Histogram:
"""Records a W&B Histogram"""
return wandb.Histogram(
1 change: 1 addition & 0 deletions src/frdc/utils/training.py
@@ -1,4 +1,5 @@
from __future__ import annotations

from pathlib import Path

import lightning as pl
6 changes: 3 additions & 3 deletions tests/model_tests/chestnut_dec_may/train_fixmatch.py
@@ -109,9 +109,9 @@ def main(

trainer.fit(m, datamodule=dm)

with open(Path(__file__).parent / "report.md", "w") as f:
with open(Path(__file__).parent / "report.md", "a+") as f:
f.write(
f"# Chestnut Nature Park (Dec 2020 vs May 2021)\n"
f"# Chestnut Nature Park (Dec 2020 vs May 2021) FixMatch\n"
f"- Results: [WandB Report]({wandb.run.get_url()})"
)

Expand All @@ -136,7 +136,7 @@ def main(

if __name__ == "__main__":
BATCH_SIZE = 32
EPOCHS = 50
EPOCHS = 10
TRAIN_ITERS = 25
LR = 3e-3
