Merge branch '0.1.0' into frrd-80
Eve-ning authored Jun 4, 2024
2 parents dea588f + 226b0d9 commit 49fdf24
Showing 9 changed files with 105 additions and 120 deletions.
3 changes: 2 additions & 1 deletion .github/workflows/model-tests.yml
@@ -85,7 +85,8 @@ jobs:
working-directory: ${{ github.workspace }}/tests
run: |
git config --global --add safe.directory /__w/FRDC-ML/FRDC-ML
python3 -m model_tests.chestnut_dec_may.train
python3 -m model_tests.chestnut_dec_may.train_mixmatch
python3 -m model_tests.chestnut_dec_may.train_fixmatch
- name: Comment results via CML
run: |
1 change: 0 additions & 1 deletion src/frdc/load/label_studio.py
@@ -8,7 +8,6 @@

from frdc.conf import LABEL_STUDIO_CLIENT


logger = logging.getLogger(__name__)


34 changes: 15 additions & 19 deletions src/frdc/train/fixmatch_module.py
@@ -92,12 +92,12 @@ def training_step(self, batch, batch_idx):
Loss: ℓ_lbl + ℓ_unl
"""

def training_step(self, batch, batch_idx):
(x_lbl, y_lbl), x_unls = batch
opt = self.optimizers()

# Backprop for labelled data
opt.zero_grad()
(x_lbl, y_lbl), x_unls = batch
loss_lbl = F.cross_entropy((y_lbl_pred := self(x_lbl)), y_lbl.long())
self.manual_backward(loss_lbl)
opt.step()
@@ -174,7 +174,9 @@ def training_step(self, batch, batch_idx):
)

def validation_step(self, batch, batch_idx):
x, y = batch
# The batch outputs x_unls due to our on_before_batch_transfer
(x, y), _x_unls = batch
wandb.log({"val/y_lbl": wandb_hist(y, self.n_classes)})
y_pred = self(x)
loss = F.cross_entropy(y_pred, y.long())
acc = accuracy(
@@ -194,7 +196,8 @@ def validation_step(self, batch, batch_idx):
return loss

def test_step(self, batch, batch_idx):
x, y = batch
# The batch outputs x_unls due to our on_before_batch_transfer
(x, y), _x_unls = batch
y_pred = self(x)
loss = F.cross_entropy(y_pred, y.long())

@@ -206,7 +209,7 @@ def test_step(self, batch, batch_idx):
return loss

def predict_step(self, batch, *args, **kwargs) -> Any:
x, y = batch
(x, y), _x_unls = batch
y_pred = self(x)
y_true_str = self.y_encoder.inverse_transform(
y.cpu().numpy().reshape(-1, 1)
@@ -230,23 +233,16 @@ def on_before_batch_transfer(self, batch: Any, dataloader_idx: int) -> Any:
want to export the model alongside the transformations.
"""

# We need to handle the train and val dataloaders differently.
# For training, the unlabelled data is returned while for validation,
# the unlabelled data is just omitted.
if self.training:
(x_lab, y), x_unl = batch
(x_lbl, y_lbl), x_unl = batch
else:
x_lab, y = batch
x_unl = []
x_lbl, y_lbl = batch
x_unl = None

(x_lab_trans, y_trans), x_unl_trans = preprocess(
x_lab=x_lab,
y_lab=y,
x_unl=x_unl,
return preprocess(
x_lbl=x_lbl,
y_lbl=y_lbl,
x_scaler=self.x_scaler,
y_encoder=self.y_encoder,
x_unl=x_unl,
)
if self.training:
return (x_lab_trans, y_trans), x_unl_trans
else:
return x_lab_trans, y_trans
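With this change, on_before_batch_transfer returns the same ((x, y), x_unl) structure for every stage, so validation_step, test_step and predict_step all unpack the batch the same way. A rough illustration of the contract (shapes and values are made up, not the frdc API):

import torch

x_lbl = torch.randn(4, 3, 32, 32)     # hypothetical labelled inputs
y_lbl = torch.randint(0, 10, (4,))    # hypothetical encoded labels
x_unl = [torch.randn(4, 3, 32, 32)]   # training only: a list of unlabelled views

train_batch = (x_lbl, y_lbl), x_unl   # what training_step unpacks
eval_batch = (x_lbl, y_lbl), []       # x_unl comes back empty outside training
(x, y), _x_unls = eval_batch          # matches validation/test/predict_step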
40 changes: 16 additions & 24 deletions src/frdc/train/frdc_datamodule.py
@@ -66,53 +66,45 @@ class FRDCDataModule(LightningDataModule):
def __post_init__(self):
super().__init__()

# This provides a failsafe interface if somehow someone used the
# labelled dataset as the unlabelled dataset.
if isinstance(self.train_unl_ds, FRDCDataset):
self.train_unl_ds.__class__ = FRDCUnlabelledDataset

def train_dataloader(self):
num_samples = self.batch_size * self.train_iters
n_samples = self.batch_size * self.train_iters
if self.sampling_strategy == "stratified":
sampler = lambda ds: RandomStratifiedSampler(
ds.targets, num_samples=num_samples, replacement=True
sampler_fn = lambda ds: RandomStratifiedSampler(
ds.targets,
num_samples=n_samples,
)
elif self.sampling_strategy == "random":
sampler = lambda ds: RandomSampler(
ds, num_samples=num_samples, replacement=True
sampler_fn = lambda ds: RandomSampler(
ds,
num_samples=n_samples,
)
else:
raise ValueError(
f"Invalid sampling strategy: {self.sampling_strategy}"
)
raise ValueError(f"Invalid strategy: {self.sampling_strategy}")

lab_dl = DataLoader(
self.train_lab_ds,
batch_size=self.batch_size,
sampler=sampler(self.train_lab_ds),
sampler=sampler_fn(self.train_lab_ds),
)
unl_dl = (
DataLoader(
self.train_unl_ds,
batch_size=self.batch_size,
sampler=sampler(self.train_unl_ds),
sampler=sampler_fn(self.train_unl_ds),
)
if self.train_unl_ds is not None
# This is a hacky way to create an empty dataloader.
# The size should be the same as the labelled dataloader so that
# the iterator doesn't prematurely stop.
else DataLoader(
empty := [[] for _ in range(len(self.train_lab_ds))],
batch_size=self.batch_size,
sampler=RandomSampler(
empty,
num_samples=num_samples,
),
)
# The size should be the same or larger than the
# labelled dataloader so the iterator doesn't prematurely stop.
else DataLoader([[] for _ in range(len(lab_dl))])
)

return [lab_dl, unl_dl]

def val_dataloader(self):
return DataLoader(
self.val_ds,
batch_size=self.batch_size,
)
return DataLoader(self.val_ds, batch_size=self.batch_size)
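The placeholder DataLoader above exists so that training with no unlabelled dataset still yields a pair of loaders of equal length. A self-contained sketch (plain torch, not the frdc API) of why the placeholder must not be shorter than the labelled loader when the two are consumed zip-style:

from torch.utils.data import DataLoader

lab_dl = DataLoader(list(range(8)), batch_size=2)      # 4 labelled batches
unl_dl = DataLoader([[] for _ in range(len(lab_dl))])  # 4 empty batches

for lab, unl in zip(lab_dl, unl_dl):
    pass  # all 4 labelled batches are visited; a shorter unl_dl would end the loop early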
30 changes: 10 additions & 20 deletions src/frdc/train/mixmatch_module.py
@@ -3,7 +3,6 @@
from abc import abstractmethod
from typing import Any

import numpy as np
import torch
import torch.nn.functional as F
import wandb
@@ -16,8 +15,6 @@
mix_up,
sharpen,
wandb_hist,
x_standard_scale,
y_encode,
preprocess,
)

@@ -194,7 +191,7 @@ def on_after_backward(self) -> None:
self.update_ema()

def validation_step(self, batch, batch_idx):
x, y = batch
(x, y), _x_unls = batch
wandb.log({"val/y_lbl": wandb_hist(y, self.n_classes)})
y_pred = self.ema_model(x)
wandb.log(
@@ -214,7 +211,7 @@ def validation_step(self, batch, batch_idx):
return loss

def test_step(self, batch, batch_idx):
x, y = batch
(x, y), _x_unls = batch
y_pred = self.ema_model(x)
loss = F.cross_entropy(y_pred, y.long())

@@ -226,7 +223,7 @@ def test_step(self, batch, batch_idx):
return loss

def predict_step(self, batch, *args, **kwargs) -> Any:
x, y = batch
(x, y), _x_unls = batch
y_pred = self.ema_model(x)
y_true_str = self.y_encoder.inverse_transform(
y.cpu().numpy().reshape(-1, 1)
@@ -250,23 +247,16 @@ def on_before_batch_transfer(self, batch: Any, dataloader_idx: int) -> Any:
want to export the model alongside the transformations.
"""

# We need to handle the train and val dataloaders differently.
# For training, the unlabelled data is returned while for validation,
# the unlabelled data is just omitted.
if self.training:
(x_lab, y), x_unl = batch
(x_lbl, y_lbl), x_unl = batch
else:
x_lab, y = batch
x_unl = []
x_lbl, y_lbl = batch
x_unl = None

(x_lab_trans, y_trans), x_unl_trans = preprocess(
x_lab=x_lab,
y_lab=y,
x_unl=x_unl,
return preprocess(
x_lbl=x_lbl,
y_lbl=y_lbl,
x_scaler=self.x_scaler,
y_encoder=self.y_encoder,
x_unl=x_unl,
)
if self.training:
return (x_lab_trans, y_trans), x_unl_trans
else:
return x_lab_trans, y_trans
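As with FixMatch above, the batch contract is now uniform across stages. Note that MixMatch evaluates with self.ema_model, which update_ema refreshes after every backward pass (see on_after_backward). A generic sketch of such an EMA update, with the decay value purely assumed rather than taken from the frdc code:

import torch

@torch.no_grad()
def update_ema(model, ema_model, decay=0.999):  # decay is an assumed value
    # Keep an exponential moving average of the raw model's weights.
    for p, ema_p in zip(model.parameters(), ema_model.parameters()):
        ema_p.mul_(decay).add_(p, alpha=1.0 - decay)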
103 changes: 55 additions & 48 deletions src/frdc/train/utils.py
@@ -52,6 +52,60 @@ def sharpen(y: torch.Tensor, temp: float) -> torch.Tensor:
return y_sharp


def preprocess(
x_lbl: torch.Tensor,
y_lbl: torch.Tensor,
x_scaler: StandardScaler,
y_encoder: OrdinalEncoder,
x_unl: list[torch.Tensor] = None,
) -> tuple[tuple[torch.Tensor, torch.Tensor], list[torch.Tensor]]:
"""Preprocesses the data
Notes:
The reason why x and y's preprocessing is coupled is due to the NaN
elimination step. The NaN elimination step is due to unseen labels by y
fn_recursive is to recursively apply some function to a nested list.
This happens due to unlabelled being a list of tensors.
Args:
x_lbl: The data to preprocess.
y_lbl: The labels to preprocess.
x_scaler: The StandardScaler to use.
y_encoder: The OrdinalEncoder to use.
Returns:
The preprocessed data and labels.
"""

x_unl = [] if x_unl is None else x_unl

x_lbl_trans = x_standard_scale(x_scaler, x_lbl)
y_trans = y_encode(y_encoder, y_lbl)
x_unl_trans = fn_recursive(
x_unl,
fn=lambda x: x_standard_scale(x_scaler, x),
type_atom=torch.Tensor,
type_list=list,
)

# Remove nan values from the batch
# Ordinal Encoders can return a np.nan if the value is not in the
# categories. We will remove that from the batch.
nan = ~torch.isnan(y_trans)
x_lbl_trans = x_lbl_trans[nan]
x_lbl_trans = torch.nan_to_num(x_lbl_trans)
x_unl_trans = fn_recursive(
x_unl_trans,
fn=lambda x: torch.nan_to_num(x[nan]),
type_atom=torch.Tensor,
type_list=list,
)
y_trans = y_trans[nan]

return (x_lbl_trans, y_trans.long()), x_unl_trans
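For context, a minimal, self-contained sketch of the NaN-elimination idea (plain sklearn/torch, not the frdc API; the encoder settings are an assumption about how y_encoder is fitted):

import numpy as np
import torch
from sklearn.preprocessing import OrdinalEncoder

enc = OrdinalEncoder(
    handle_unknown="use_encoded_value", unknown_value=np.nan
).fit(np.array([["oak"], ["teak"]]))

y = np.array([["oak"], ["maple"], ["teak"]])           # "maple" was never seen
y_trans = torch.from_numpy(enc.transform(y)[..., 0])   # tensor([0., nan, 1.])
x = torch.randn(3, 4)

keep = ~torch.isnan(y_trans)         # mask of rows with known labels
x, y_trans = x[keep], y_trans[keep]  # x and y drop the same rows together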


def x_standard_scale(
x_scaler: StandardScaler, x: torch.Tensor
) -> torch.Tensor:
@@ -92,57 +146,10 @@ def y_encode(y_encoder: OrdinalEncoder, y: torch.Tensor) -> torch.Tensor:
y: The labels to encode.
"""
return torch.from_numpy(
y_encoder.transform(np.array(y).reshape(-1, 1)).squeeze()
y_encoder.transform(np.array(y).reshape(-1, 1))[..., 0]
)
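The switch from .squeeze() to [..., 0] presumably keeps a single-sample batch one-dimensional; a quick illustration with made-up values:

import numpy as np

out = np.array([[2.0]])   # encoder output for a batch of one
out.squeeze().shape       # (), squeeze collapses it to a 0-d scalar
out[..., 0].shape         # (1,), indexing keeps the batch dimension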


def preprocess(
x_lab: torch.Tensor,
y_lab: torch.Tensor,
x_scaler: StandardScaler,
y_encoder: OrdinalEncoder,
x_unl: list[torch.Tensor] = None,
) -> tuple[tuple[torch.Tensor, torch.Tensor], list[torch.Tensor]]:
"""Preprocesses the data
Args:
x_lab: The data to preprocess.
y_lab: The labels to preprocess.
x_scaler: The StandardScaler to use.
y_encoder: The OrdinalEncoder to use.
Returns:
The preprocessed data and labels.
"""

x_unl = [] if x_unl is None else x_unl

x_lab_trans = x_standard_scale(x_scaler, x_lab)
y_trans = y_encode(y_encoder, y_lab)
x_unl_trans = fn_recursive(
x_unl,
fn=lambda x: x_standard_scale(x_scaler, x),
type_atom=torch.Tensor,
type_list=list,
)

# Remove nan values from the batch
# Ordinal Encoders can return a np.nan if the value is not in the
# categories. We will remove that from the batch.
nan = ~torch.isnan(y_trans)
x_lab_trans = x_lab_trans[nan]
x_lab_trans = torch.nan_to_num(x_lab_trans)
x_unl_trans = fn_recursive(
x_unl_trans,
fn=lambda x: torch.nan_to_num(x[nan]),
type_atom=torch.Tensor,
type_list=list,
)
y_trans = y_trans[nan]

return (x_lab_trans, y_trans.long()), x_unl_trans


def wandb_hist(x: torch.Tensor, num_bins: int) -> wandb.Histogram:
"""Records a W&B Histogram"""
return wandb.Histogram(
1 change: 1 addition & 0 deletions src/frdc/utils/training.py
@@ -1,4 +1,5 @@
from __future__ import annotations

from pathlib import Path

import lightning as pl
6 changes: 3 additions & 3 deletions tests/model_tests/chestnut_dec_may/train_fixmatch.py
@@ -109,9 +109,9 @@ def main(

trainer.fit(m, datamodule=dm)

with open(Path(__file__).parent / "report.md", "w") as f:
with open(Path(__file__).parent / "report.md", "a+") as f:
f.write(
f"# Chestnut Nature Park (Dec 2020 vs May 2021)\n"
f"# Chestnut Nature Park (Dec 2020 vs May 2021) FixMatch\n"
f"- Results: [WandB Report]({wandb.run.get_url()})"
)

Expand All @@ -136,7 +136,7 @@ def main(

if __name__ == "__main__":
BATCH_SIZE = 32
EPOCHS = 50
EPOCHS = 10
TRAIN_ITERS = 25
LR = 3e-3
