From bffa9ae5f091adb38e592dd8356d21555e5b3fd5 Mon Sep 17 00:00:00 2001 From: Eve-ning Date: Fri, 21 Jun 2024 10:07:43 +0800 Subject: [PATCH 01/12] Improve logging information during initialization --- .env.example | 5 ++ src/frdc/conf.py | 164 ++++++++++++++++++++++++++++++++--------------- 2 files changed, 117 insertions(+), 52 deletions(-) create mode 100644 .env.example diff --git a/.env.example b/.env.example new file mode 100644 index 00000000..197a60d6 --- /dev/null +++ b/.env.example @@ -0,0 +1,5 @@ +LABEL_STUDIO_API_KEY= +LABEL_STUDIO_HOST=10.97.41.70 +LABEL_STUDIO_PORT=8080 +GCS_PROJECT_ID=frmodel +GCS_BUCKET_NAME=frdc-ds diff --git a/src/frdc/conf.py b/src/frdc/conf.py index d9683d23..5d566b4e 100644 --- a/src/frdc/conf.py +++ b/src/frdc/conf.py @@ -9,20 +9,124 @@ from google.cloud import storage as gcs logger = logging.getLogger(__name__) +logging.warning( + "Initializing the project configuration, this may take a moment...\n" + "Note that your project can be configured in the .env file in the root " + "directory of the project." +) +# The ROOT_DIR is the root directory of the project. +# E.g. ROOT_DIR / src / frdc / conf.py is this file. ROOT_DIR = Path(__file__).parents[2] +ENV_FILE = ROOT_DIR / ".env" + +if ENV_FILE.exists(): + logger.info(f"Loading Environment Variables from {ENV_FILE.as_posix()}...") + load_dotenv(ENV_FILE) +else: + import shutil + + ENV_EXAMPLE_FILE = ROOT_DIR / ".env.example" + if ENV_EXAMPLE_FILE.exists(): + shutil.copy(ENV_EXAMPLE_FILE, ENV_FILE) + raise FileNotFoundError( + f"Environment file not found at {ENV_FILE.as_posix()}. " + "A new one has been created from the .env.example file.\n" + "Set the necessary variables and re-run the script." + ) + else: + raise FileNotFoundError( + f"Environment file not found at {ENV_FILE.as_posix()}. " + "Please create one or copy the .env.example file in the GitHub " + "repository." + ) -load_dotenv(ROOT_DIR / ".env") LOCAL_DATASET_ROOT_DIR = ROOT_DIR / "rsc" +logger.info(f"Local Dataset Save Root: {LOCAL_DATASET_ROOT_DIR.as_posix()}") + +# == CONNECT TO GCS =========================================================== + os.environ["GOOGLE_CLOUD_PROJECT"] = "frmodel" -GCS_PROJECT_ID = "frmodel" -GCS_BUCKET_NAME = "frdc-ds" -GCS_CREDENTIALS = None -LABEL_STUDIO_HOST = os.environ.get("LABEL_STUDIO_HOST", "localhost") -LABEL_STUDIO_URL = f"http://{LABEL_STUDIO_HOST}:8080" +GCS_PROJECT_ID = os.environ.get("GCS_PROJECT_ID") +GCS_BUCKET_NAME = os.environ.get("GCS_BUCKET_NAME") +logger.info(f"GCS Project: {GCS_PROJECT_ID}") +logger.info(f"GCS Bucket: {GCS_BUCKET_NAME}") +GCS_CLIENT = None +GCS_BUCKET = None -if not (LABEL_STUDIO_API_KEY := os.environ.get("LABEL_STUDIO_API_KEY", None)): - logger.warning("LABEL_STUDIO_API_KEY not set") +if GCS_PROJECT_ID is None or GCS_BUCKET_NAME is None: + logger.warning("GCS_PROJECT_ID or GCS_BUCKET_NAME not set.") +else: + try: + logger.info("Connecting to GCS...") + GCS_CLIENT = gcs.Client(project=GCS_PROJECT_ID) + GCS_BUCKET = GCS_CLIENT.bucket(GCS_BUCKET_NAME) + logger.info("Connected to GCS.") + except Exception: + logger.warning( + "Couldn't connect to GCS. You will not be able to download files. " + "Check that you've (1) Installed the GCS CLI and (2) Set up the" + "ADC with `gcloud auth application-default login`. " + "GCS_CLIENT will be None." 
+ ) + +# == CONNECT TO LABEL STUDIO ================================================== + +LABEL_STUDIO_HOST = os.environ.get("LABEL_STUDIO_HOST") +LABEL_STUDIO_PORT = os.environ.get("LABEL_STUDIO_PORT") +LABEL_STUDIO_API_KEY = os.environ.get("LABEL_STUDIO_API_KEY", None) +LABEL_STUDIO_URL = f"http://{LABEL_STUDIO_HOST}:{LABEL_STUDIO_PORT}" +LABEL_STUDIO_CLIENT = None + +logger.info(f"Label Studio URL: {LABEL_STUDIO_URL}") +logger.info("Retrieving Label Studio API Key from Environment...") + +if LABEL_STUDIO_API_KEY is None or LABEL_STUDIO_API_KEY == "": + logger.warning( + "Env. Var. LABEL_STUDIO_API_KEY not set. " + "You will not be able to connect to Label Studio to retrieve our " + "datasets. \n" + f"You can set this in your .env file @ {ENV_FILE.as_posix()}, or " + "set it in your machine's environment variables." + ) +else: + try: + logger.info("Connecting to Label Studio...") + requests.get(LABEL_STUDIO_URL) + LABEL_STUDIO_CLIENT = label_studio.Client( + url=LABEL_STUDIO_URL, + api_key=LABEL_STUDIO_API_KEY, + ) + logger.info("Connected to Label Studio.") + try: + logger.info("Retrieving main Label Studio Project id:1...") + LABEL_STUDIO_CLIENT.get_project(1) + logger.info( + "Successfully retrieved main Label Studio Project id:1." + ) + except requests.exceptions.HTTPError: + logger.warning( + "Couldn't get annotation project, " + "live annotations won't work. " + "Check that\n" + "(1) Your API Key is correct.\n" + "(2) Your API Key is for the correct LS instance.\n" + "(3) Your .netrc is not preventing you from accessing the " + "project. " + ) + except requests.exceptions.ConnectionError: + logger.warning( + f"Could not connect to Label Studio at {LABEL_STUDIO_URL}.\n" + f"Check that the server is running in your browser. " + f"Label Studio features won't work. " + ) + +if LABEL_STUDIO_CLIENT is None: + logger.error( + "Failed to connect to Label Studio, LABEL_STUDIO_CLIENT will be None." + ) + +# == OTHER CONSTANTS ========================================================== BAND_CONFIG = OrderedDict( { @@ -47,47 +151,3 @@ "RE": (0, 2**14), "NIR": (0, 2**14), } - -try: - logger.info("Connecting to GCS...") - GCS_CLIENT = gcs.Client( - project=GCS_PROJECT_ID, - credentials=GCS_CREDENTIALS, - ) - GCS_BUCKET = GCS_CLIENT.bucket(GCS_BUCKET_NAME) - logger.info("Connected to GCS.") -except Exception: - logger.warning( - "Could not connect to GCS. Will not be able to download files. " - "Check that you've (1) Installed the GCS CLI and (2) Set up the" - "ADC with `gcloud auth application-default login`. " - "GCS_CLIENT will be None." - ) - GCS_CLIENT = None - GCS_BUCKET = None - -try: - logger.info("Connecting to Label Studio...") - requests.get(LABEL_STUDIO_URL) - LABEL_STUDIO_CLIENT = label_studio.Client( - url=LABEL_STUDIO_URL, - api_key=LABEL_STUDIO_API_KEY, - ) - logger.info("Connected to Label Studio.") - try: - logger.info("Attempting to Get Label Studio Project...") - LABEL_STUDIO_CLIENT.get_project(1) - except requests.exceptions.HTTPError: - logger.warning( - "Could not get main annotation project. " - "Pulling annotations may not work. " - "It's possible that your API Key is incorrect, " - "or somehow your .netrc is preventing you from " - "accessing the project. " - ) -except requests.exceptions.ConnectionError: - logger.warning( - f"Could not connect to Label Studio at {LABEL_STUDIO_URL}. " - f"LABEL_STUDIO_CLIENT will be None." 
- ) - LABEL_STUDIO_CLIENT = None From 5294c3dee4924004828fa3e57c2574deafbfeeb4 Mon Sep 17 00:00:00 2001 From: Eve-ning Date: Fri, 21 Jun 2024 10:57:59 +0800 Subject: [PATCH 02/12] Update FRDCDataset to hide unused methods --- Writerside/topics/Retrieve-our-Datasets.md | 12 ++++++------ Writerside/topics/load.dataset.md | 10 +++++----- .../topics/preprocessing.extract_segments.md | 6 +++--- Writerside/topics/preprocessing.morphology.md | 2 +- Writerside/topics/preprocessing.scale.md | 2 +- src/frdc/load/dataset.py | 18 +++++++++--------- tests/conftest.py | 2 +- tests/unit_tests/load/test_frdc_dataset.py | 8 ++++---- tests/unit_tests/load/test_label_studio.py | 2 +- .../preprocess/test_extract_segments.py | 4 ++-- 10 files changed, 33 insertions(+), 33 deletions(-) diff --git a/Writerside/topics/Retrieve-our-Datasets.md b/Writerside/topics/Retrieve-our-Datasets.md index 9c671cbd..14a4b815 100644 --- a/Writerside/topics/Retrieve-our-Datasets.md +++ b/Writerside/topics/Retrieve-our-Datasets.md @@ -28,8 +28,8 @@ Here, we'll download and load our from frdc.load.preset import FRDCDatasetPreset ds = FRDCDatasetPreset.chestnut_20201218() -ar, order = ds.get_ar_bands() -bounds, labels = ds.get_bounds_and_labels() +ar, order = ds._get_ar_bands() +bounds, labels = ds._get_bounds_and_labels() ``` ### What Datasets are there? {collapsible="true"} @@ -91,8 +91,8 @@ from frdc.load.preset import FRDCDatasetPreset from frdc.preprocess.extract_segments import extract_segments_from_bounds ds = FRDCDatasetPreset.chestnut_20201218() -ar, order = ds.get_ar_bands() -bounds, labels = ds.get_bounds_and_labels() +ar, order = ds._get_ar_bands() +bounds, labels = ds._get_bounds_and_labels() segments = extract_segments_from_bounds(ar, bounds) ``` @@ -115,8 +115,8 @@ from frdc.preprocess.extract_segments import extract_segments_from_bounds from frdc.preprocess.scale import scale_0_1_per_band ds = FRDCDatasetPreset.chestnut_20201218() -ar, order = ds.get_ar_bands() -bounds, labels = ds.get_bounds_and_labels() +ar, order = ds._get_ar_bands() +bounds, labels = ds._get_bounds_and_labels() segments = extract_segments_from_bounds(ar, bounds) segment_0_bgr = segments[0] segment_0_rgb = segment_0_bgr[..., [2, 1, 0]] diff --git a/Writerside/topics/load.dataset.md b/Writerside/topics/load.dataset.md index 7cbf6cbc..e5842d20 100644 --- a/Writerside/topics/load.dataset.md +++ b/Writerside/topics/load.dataset.md @@ -25,9 +25,9 @@ ds = FRDCDatasetPreset.chestnut_20201218() Then, we can use the `ds` object to load objects of the dataset: ```python -ar, order = ds.get_ar_bands() -d = ds.get_ar_bands_as_dict() -bounds, labels = ds.get_bounds_and_labels() +ar, order = ds._get_ar_bands() +d = ds._get_ar_bands_as_dict() +bounds, labels = ds._get_bounds_and_labels() ``` - `ar` is a stacked NDArray of the hyperspectral bands of shape (H x W x C) @@ -54,8 +54,8 @@ argument. For example, to get the Wideband RGB bands, you can do: ```python -ar, order = ds.get_ar_bands(bands=['WR', 'WG', 'WB']) -d = ds.get_ar_bands_as_dict(bands=['WR', 'WG', 'WB']) +ar, order = ds._get_ar_bands(bands=['WR', 'WG', 'WB']) +d = ds._get_ar_bands_as_dict(bands=['WR', 'WG', 'WB']) ``` This will also alter the channel order to the order of the bands provided. 
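For context, a minimal sketch of how the band filter documented above might be used downstream. This sketch assumes the renamed private helper `_get_ar_bands` introduced in this patch and the default `BAND_CONFIG` band names (e.g. `NIR`); it is illustrative only, not part of the patch:

```python
# Hedged sketch: assumes the private _get_ar_bands() helper from this patch
# and the default BAND_CONFIG band names ("WR", "WG", "WB", "NIR", ...).
import numpy as np
from frdc.load.preset import FRDCDatasetPreset

ds = FRDCDatasetPreset.chestnut_20201218()
ar, order = ds._get_ar_bands(bands=["WR", "WG", "WB", "NIR"])

# The channel axis of `ar` follows `order`, so a band can be indexed by name
# instead of hard-coding channel positions.
nir = ar[..., order.index("NIR")]
print(ar.shape, order, nir.shape)
```
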
diff --git a/Writerside/topics/preprocessing.extract_segments.md b/Writerside/topics/preprocessing.extract_segments.md index a83b3060..32c9d78c 100644 --- a/Writerside/topics/preprocessing.extract_segments.md +++ b/Writerside/topics/preprocessing.extract_segments.md @@ -139,8 +139,8 @@ from frdc.load.preset import FRDCDatasetPreset from frdc.preprocess.extract_segments import extract_segments_from_bounds ds = FRDCDatasetPreset.chestnut_20201218() -ar, order = ds.get_ar_bands() -bounds, labels = ds.get_bounds_and_labels() +ar, order = ds._get_ar_bands() +bounds, labels = ds._get_bounds_and_labels() segments: list[np.ndarray] = extract_segments_from_bounds(ar, bounds) ``` @@ -163,7 +163,7 @@ from frdc.preprocess.extract_segments import ( ) ds = FRDCDatasetPreset.chestnut_20201218() -ar, order = ds.get_ar_bands() +ar, order = ds._get_ar_bands() ar = scale_0_1_per_band(ar) ar_mask = threshold_binary_mask(ar, -1, 90 / 256) ar_mask = remove_small_objects(ar_mask, min_size=100, connectivity=2) diff --git a/Writerside/topics/preprocessing.morphology.md b/Writerside/topics/preprocessing.morphology.md index 95289404..3a3a9c50 100644 --- a/Writerside/topics/preprocessing.morphology.md +++ b/Writerside/topics/preprocessing.morphology.md @@ -35,7 +35,7 @@ from frdc.preprocess.morphology import ( ) ds = FRDCDatasetPreset.chestnut_20201218() -ar, order = ds.get_ar_bands() +ar, order = ds._get_ar_bands() mask = threshold_binary_mask(ar, order.index('NIR'), 90 / 256) ar_label = binary_watershed(mask) ``` diff --git a/Writerside/topics/preprocessing.scale.md b/Writerside/topics/preprocessing.scale.md index 0b0e5946..925dcaa8 100644 --- a/Writerside/topics/preprocessing.scale.md +++ b/Writerside/topics/preprocessing.scale.md @@ -42,7 +42,7 @@ from frdc.preprocess.scale import ( from frdc.conf import BAND_MAX_CONFIG ds = FRDCDatasetPreset.chestnut_20201218() -ar, order = ds.get_ar_bands() +ar, order = ds._get_ar_bands() ar_01 = scale_0_1_per_band(ar) ar_norm = scale_normal_per_band(ar) ar_static = scale_static_per_band(ar, order, BAND_MAX_CONFIG) diff --git a/src/frdc/load/dataset.py b/src/frdc/load/dataset.py index 4076e1e9..d7686f4d 100644 --- a/src/frdc/load/dataset.py +++ b/src/frdc/load/dataset.py @@ -115,14 +115,14 @@ def __init__( self.date = date self.version = version - self.ar, self.order = self.get_ar_bands() + self.ar, self.order = self._get_ar_bands() self.targets = None if use_legacy_bounds or (LABEL_STUDIO_CLIENT is None): - bounds, self.targets = self.get_bounds_and_labels() + bounds, self.targets = self._get_bounds_and_labels() self.ar_segments = extract_segments_from_bounds(self.ar, bounds) else: - bounds, self.targets = self.get_polybounds_and_labels() + bounds, self.targets = self._get_polybounds_and_labels() self.ar_segments = extract_segments_from_polybounds( self.ar, bounds, @@ -177,7 +177,7 @@ def dataset_dir(self): f"{self.version + '/' if self.version else ''}" ) - def get_ar_bands_as_dict( + def _get_ar_bands_as_dict( self, bands: Iterable[str] = BAND_CONFIG.keys(), ) -> dict[str, np.ndarray]: @@ -227,7 +227,7 @@ def get_ar_bands_as_dict( return d - def get_ar_bands( + def _get_ar_bands( self, bands: Iterable[str] = BAND_CONFIG.keys(), ) -> tuple[np.ndarray, list[str]]: @@ -252,10 +252,10 @@ def get_ar_bands( (H, W, C) and band_order is a list of band names. 
""" - d: dict[str, np.ndarray] = self.get_ar_bands_as_dict(bands) + d: dict[str, np.ndarray] = self._get_ar_bands_as_dict(bands) return np.concatenate(list(d.values()), axis=-1), list(d.keys()) - def get_bounds_and_labels( + def _get_bounds_and_labels( self, file_name="bounds.csv", ) -> tuple[list[Rect], list[str]]: @@ -285,11 +285,11 @@ def get_bounds_and_labels( df["name"].tolist(), ) - def get_polybounds_and_labels(self): + def _get_polybounds_and_labels(self): """Gets the bounds and labels from Label Studio.""" return get_task( Path(f"{self.dataset_dir}/result.jpg") - ).get_bounds_and_labels() + )._get_bounds_and_labels() @staticmethod def _load_image(path: Path | str) -> np.ndarray: diff --git a/tests/conftest.py b/tests/conftest.py index 26687c52..21697f8f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -15,7 +15,7 @@ def ds() -> FRDCDataset: @pytest.fixture(scope="session") def ar_and_order(ds) -> tuple[np.ndarray, list[str]]: - return ds.get_ar_bands() + return ds._get_ar_bands() @pytest.fixture(scope="session") diff --git a/tests/unit_tests/load/test_frdc_dataset.py b/tests/unit_tests/load/test_frdc_dataset.py index 0a75425c..c887a871 100644 --- a/tests/unit_tests/load/test_frdc_dataset.py +++ b/tests/unit_tests/load/test_frdc_dataset.py @@ -4,24 +4,24 @@ def test_get_ar_bands_as_dict(ds): - d = ds.get_ar_bands_as_dict(BAND_CONFIG) + d = ds._get_ar_bands_as_dict(BAND_CONFIG) assert set(d.keys()) == set(d.keys()) def test_get_ar_bands(ds): - ar, order = ds.get_ar_bands() + ar, order = ds._get_ar_bands() assert ar.shape[-1] == len(BAND_CONFIG) assert order == list(BAND_CONFIG.keys()) def test_get_ar_bands_ordering(ds): - ar, order = ds.get_ar_bands(["WB", "WG"]) + ar, order = ds._get_ar_bands(["WB", "WG"]) assert ar.shape[-1] == 2 assert order == ["WB", "WG"] def test_get_bounds(ds): - bounds, labels = ds.get_bounds_and_labels() + bounds, labels = ds._get_bounds_and_labels() assert all([isinstance(b, Rect) for b in bounds]) assert len(bounds) == len(labels) diff --git a/tests/unit_tests/load/test_label_studio.py b/tests/unit_tests/load/test_label_studio.py index 2059aa51..a38b3c19 100644 --- a/tests/unit_tests/load/test_label_studio.py +++ b/tests/unit_tests/load/test_label_studio.py @@ -5,4 +5,4 @@ @requires_label_studio def test_get_bounds_and_labels(): task = get_task("DEBUG/0/result.jpg") - bounds, labels = task.get_bounds_and_labels() + bounds, labels = task._get_bounds_and_labels() diff --git a/tests/unit_tests/preprocess/test_extract_segments.py b/tests/unit_tests/preprocess/test_extract_segments.py index a16c2a4a..5433e0e8 100644 --- a/tests/unit_tests/preprocess/test_extract_segments.py +++ b/tests/unit_tests/preprocess/test_extract_segments.py @@ -76,13 +76,13 @@ def test_unique_labels( def test_extract_segments_from_bounds_cropped(ds, ar): - bounds, labels = ds.get_bounds_and_labels() + bounds, labels = ds._get_bounds_and_labels() segments = extract_segments_from_bounds(ar, bounds, cropped=True) assert any(segment.shape != ar.shape for segment in segments) def test_extract_segments_from_bounds_no_crop(ds, ar): - bounds, labels = ds.get_bounds_and_labels() + bounds, labels = ds._get_bounds_and_labels() segments = extract_segments_from_bounds(ar, bounds, cropped=False) assert all(segment.shape == ar.shape for segment in segments) From 3d4c7e6b8ea5b41c294aaf24e9bfa922b7db89d9 Mon Sep 17 00:00:00 2001 From: Eve-ning Date: Fri, 21 Jun 2024 11:25:45 +0800 Subject: [PATCH 03/12] Update docs to encourage iterating --- Writerside/topics/Retrieve-our-Datasets.md | 123 
+++++++++------------ Writerside/topics/load.dataset.md | 53 ++++++--- 2 files changed, 91 insertions(+), 85 deletions(-) diff --git a/Writerside/topics/Retrieve-our-Datasets.md b/Writerside/topics/Retrieve-our-Datasets.md index 14a4b815..b503c2eb 100644 --- a/Writerside/topics/Retrieve-our-Datasets.md +++ b/Writerside/topics/Retrieve-our-Datasets.md @@ -15,93 +15,78 @@ In this tutorial, we'll learn how to : ## Retrieve the Data -To retrieve the data, use [FRDCDataset](load.dataset.md) +To retrieve the data, use [FRDCDatasetPreset](load.dataset.md). +This module presets to load explicitly known datasets. -Here, we'll download and load our +For example: +```python +from frdc.load.preset import FRDCDatasetPreset + +ds = FRDCDatasetPreset.chestnut_20201218() +for x, y in ds: + print(x.shape, y) +``` + +You should get something like this: +``` +(831, 700, 8) Falcataria Moluccana +(540, 536, 8) Ficus Variegata +(457, 660, 8) Bridelia Sp. +... +``` + +- `x` is a `torch.Tensor` +- `y` is a `str`. + +> [What if I can't find a preset dataset?](load.dataset.md#i-can-t-find-a-dataset) +{style='warning'} -- `ar`: Hyperspectral Image Data -- `order`: The order of the bands -- `bounds`: The bounds of the trees (segments) -- `labels`: The labels of the trees (segments) +## Iterate through the Data + +The dataset, when you load it, will be automatically segmented by bounds. +Therefore, if you want to simply loop through the segments and labels, +you can treat the dataset as an iterable. ```python from frdc.load.preset import FRDCDatasetPreset ds = FRDCDatasetPreset.chestnut_20201218() -ar, order = ds._get_ar_bands() -bounds, labels = ds._get_bounds_and_labels() +for x, y in ds: + print(x.shape, y) ``` -### What Datasets are there? {collapsible="true"} +If you just want the segments or targets separately, use `.ar_segments` and +`.targets` respectively. -> We recommend to use FRDCDatasetPreset. However, if you want -> to know what other datasets are available, you can run -> [load.gcs](load.gcs.md)'s `list_gcs_datasets()` -> method +```python +from frdc.load.preset import FRDCDatasetPreset -> Note that some datasets do not have `bounds` and `labels` available as they -> have not been annotated yet. -> {style='warning'} +ds = FRDCDatasetPreset.chestnut_20201218() +for x in ds.ar_segments: + print(x.shape) -```python -from frdc.load.gcs import list_gcs_datasets -print(list_gcs_datasets()) -# 0 DEBUG/0 -# 1 casuarina/20220418/183deg -# 2 casuarina/20220418/93deg -# 3 chestnut_nature_park/20201218 -# ... +for y in ds.targets: + print(y) ``` -- The first part of the path is the `site`, and the second part is the `date`. -- The `version` is the rest of the path, if there isn't any, use `None`. - - - - -
-  • site="ds", date="date", version="ver"
-  • site="ds", date="date", version="ver/01/data"
-  • site="ds", date="date", version=None
    -
    -
    - -## Segment the Data - -To segment the data, use [Extract Segments](preprocessing.extract_segments.md). - -Here, we'll segment the data by the bounds. +If you want the entire image, use `.ar`. ```python from frdc.load.preset import FRDCDatasetPreset -from frdc.preprocess.extract_segments import extract_segments_from_bounds ds = FRDCDatasetPreset.chestnut_20201218() -ar, order = ds._get_ar_bands() -bounds, labels = ds._get_bounds_and_labels() -segments = extract_segments_from_bounds(ar, bounds) +ar = ds.ar ``` -`segments` is a list of `np.ndarray` of shape H, W, C, representing a tree. -The order of `segments` is the same as `labels`, so you can use `labels` to -identify the tree. +Finally, to inspect the order of the bands, you can use the `order` attribute. -> While we have not used `order` in our example, it's useful to determine the -> order of the bands in `ar` in other applications. +```python +from frdc.load.preset import FRDCDatasetPreset + +ds = FRDCDatasetPreset.chestnut_20201218() +ds.order +# > ['WB', 'WG', 'WR', 'NB', 'NG', 'NR', 'RE', 'NIR'] +``` ## Plot the Data (Optional) {collapsible="true"} @@ -111,19 +96,15 @@ We can then use these data to plot out the first tree segment. import matplotlib.pyplot as plt from frdc.load.preset import FRDCDatasetPreset -from frdc.preprocess.extract_segments import extract_segments_from_bounds from frdc.preprocess.scale import scale_0_1_per_band ds = FRDCDatasetPreset.chestnut_20201218() -ar, order = ds._get_ar_bands() -bounds, labels = ds._get_bounds_and_labels() -segments = extract_segments_from_bounds(ar, bounds) -segment_0_bgr = segments[0] +segment_0_bgr = ds.ar_segments[0] segment_0_rgb = segment_0_bgr[..., [2, 1, 0]] segment_0_rgb_scaled = scale_0_1_per_band(segment_0_rgb) plt.imshow(segment_0_rgb_scaled) -plt.title(f"Tree {labels[0]}") +plt.title(f"Tree {ds.targets[0]}") plt.show() ``` See also: [preprocessing.scale.scale_0_1_per_band](preprocessing.scale.md) diff --git a/Writerside/topics/load.dataset.md b/Writerside/topics/load.dataset.md index e5842d20..704c012b 100644 --- a/Writerside/topics/load.dataset.md +++ b/Writerside/topics/load.dataset.md @@ -10,11 +10,11 @@ Load dataset objects from our GCS bucket. ## Usage -Firstly, to load a dataset instance, you need to -initiliaze a `FRDCDataset` object, providing the site, date, and -version. - -For example, to load our Chestnut Nature Park dataset. +Firstly, to load a dataset instance, you need to initiliaze a `FRDCDataset` +object, providing the site, date, and version. + +We recommend using the `FRDCDatasetPreset` module to load explicitly known +datasets. ```python from frdc.load.preset import FRDCDatasetPreset @@ -45,19 +45,44 @@ bounds, labels = ds._get_bounds_and_labels() > NDArray, and returns the channel order as well. {style='note'} -## Filters -You can also selectively get the channels for both `get_ar_bands()` and -`get_ar_bands_as_dict()` by providing a list of strings to the `bands` -argument. +### I can't find a dataset! -For example, to get the Wideband RGB bands, you can do: +Some datasets, especially new ones may be unregistered and you must specify +the **exact** site / date / version of it. ```python -ar, order = ds._get_ar_bands(bands=['WR', 'WG', 'WB']) -d = ds._get_ar_bands_as_dict(bands=['WR', 'WG', 'WB']) +from frdc.load.dataset import FRDCDataset + +ds = FRDCDataset(site="mysite", date="mydate", version="myversion") ``` -This will also alter the channel order to the order of the bands provided. +> `version` can be `None` if there isn't one. 
+{style='note'} + +See below for examples on how to format this. + + + + +
+  • site="ds", date="date", version="ver"
+  • site="ds", date="date", version="ver/01/data"
+  • site="ds", date="date", version=None
    +
    +
    -See [load.gcs](load.gcs.md#configuration) for configuration options. From 0bb4734ede16baeefdc8e1a11aec3bd8215f6ec2 Mon Sep 17 00:00:00 2001 From: Eve-ning Date: Wed, 26 Jun 2024 15:13:43 +0800 Subject: [PATCH 04/12] Refactor dataset.py --- src/frdc/load/dataset.py | 135 +++++++++++++++++++++------------------ 1 file changed, 73 insertions(+), 62 deletions(-) diff --git a/src/frdc/load/dataset.py b/src/frdc/load/dataset.py index d7686f4d..7babd794 100644 --- a/src/frdc/load/dataset.py +++ b/src/frdc/load/dataset.py @@ -14,11 +14,8 @@ from torch.utils.data import Dataset, ConcatDataset from torchvision.transforms.v2.functional import hflip -from frdc.conf import ( - BAND_CONFIG, - LABEL_STUDIO_CLIENT, -) -from frdc.load.gcs import download +from frdc.conf import BAND_CONFIG, LABEL_STUDIO_CLIENT +from frdc.load import gcs from frdc.load.label_studio import get_task from frdc.preprocess.extract_segments import ( extract_segments_from_bounds, @@ -29,44 +26,6 @@ logger = logging.getLogger(__name__) -class FRDCConcatDataset(ConcatDataset): - """ConcatDataset for FRDCDataset. - - Notes: - This handles concatenating the targets when you add two datasets - together, furthermore, implements the addition operator to - simplify the syntax. - - Examples: - If you have two datasets, ds1 and ds2, you can concatenate them:: - - ds = ds1 + ds2 - - `ds` will be a FRDCConcatDataset, which is a subclass of ConcatDataset. - - You can further add to a concatenated dataset:: - - ds = ds1 + ds2 - ds = ds + ds3 - - Finallu, all concatenated datasets have the `targets` property, which - is a list of all the targets in the datasets:: - - (ds1 + ds2).targets == ds1.targets + ds2.targets - """ - - def __init__(self, datasets: list[FRDCDataset]): - super().__init__(datasets) - self.datasets: list[FRDCDataset] = datasets - - @property - def targets(self): - return [t for ds in self.datasets for t in ds.targets] - - def __add__(self, other: FRDCDataset) -> FRDCConcatDataset: - return FRDCConcatDataset([*self.datasets, other]) - - @dataclass class FRDCDataset(Dataset): def __init__( @@ -118,18 +77,26 @@ def __init__( self.ar, self.order = self._get_ar_bands() self.targets = None - if use_legacy_bounds or (LABEL_STUDIO_CLIENT is None): - bounds, self.targets = self._get_bounds_and_labels() + if use_legacy_bounds: + bounds, self.targets = self._get_legacy_bounds_and_labels() self.ar_segments = extract_segments_from_bounds(self.ar, bounds) else: - bounds, self.targets = self._get_polybounds_and_labels() - self.ar_segments = extract_segments_from_polybounds( - self.ar, - bounds, - cropped=True, - polycrop=polycrop, - polycrop_value=polycrop_value, - ) + if LABEL_STUDIO_CLIENT: + bounds, self.targets = self._get_polybounds_and_labels() + self.ar_segments = extract_segments_from_polybounds( + self.ar, + bounds, + cropped=True, + polycrop=polycrop, + polycrop_value=polycrop_value, + ) + else: + raise ConnectionError( + "Cannot connect to Label Studio, cannot use live bounds. " + "Retry with use_legacy_bounds=True to attempt to use the " + "legacy bounds.csv file." + ) + self.transform = transform self.target_transform = target_transform @@ -211,19 +178,19 @@ def _get_ar_bands_as_dict( f"Invalid band name. 
Valid band names are {BAND_CONFIG.keys()}" ) - for name, (glob, transform) in config.items(): - fp = download(fp=self.dataset_dir / glob) + for band_name, (glob, band_transform) in config.items(): + fp = gcs.download(fp=self.dataset_dir / glob) # We may use the same file multiple times, so we cache it if fp in fp_cache: logging.debug(f"Cache hit for {fp}, using cached image...") - im = fp_cache[fp] + im_band = fp_cache[fp] else: logging.debug(f"Cache miss for {fp}, loading...") - im = self._load_image(fp) - fp_cache[fp] = im + im_band = self._load_image(fp) + fp_cache[fp] = im_band - d[name] = transform(im) + d[band_name] = band_transform(im_band) return d @@ -255,7 +222,7 @@ def _get_ar_bands( d: dict[str, np.ndarray] = self._get_ar_bands_as_dict(bands) return np.concatenate(list(d.values()), axis=-1), list(d.keys()) - def _get_bounds_and_labels( + def _get_legacy_bounds_and_labels( self, file_name="bounds.csv", ) -> tuple[list[Rect], list[str]]: @@ -278,7 +245,13 @@ def _get_bounds_and_labels( "This is pending to be deprecated in favour of pulling " "annotations from Label Studio." ) - fp = download(fp=self.dataset_dir / file_name) + try: + fp = gcs.download(fp=self.dataset_dir / file_name) + except FileNotFoundError: + raise FileNotFoundError( + f"bounds.csv not found in {self.dataset_dir}. " + f"Please check the file exists." + ) df = pd.read_csv(fp) return ( [Rect(i.x0, i.y0, i.x1, i.y1) for i in df.itertuples()], @@ -289,7 +262,7 @@ def _get_polybounds_and_labels(self): """Gets the bounds and labels from Label Studio.""" return get_task( Path(f"{self.dataset_dir}/result.jpg") - )._get_bounds_and_labels() + ).get_bounds_and_labels() @staticmethod def _load_image(path: Path | str) -> np.ndarray: @@ -373,3 +346,41 @@ def __getitem__(self, idx): x_ = hflip(rot90(x, 3, (1, 2))) return x_, y + + +class FRDCConcatDataset(ConcatDataset): + """ConcatDataset for FRDCDataset. + + Notes: + This handles concatenating the targets when you add two datasets + together, furthermore, implements the addition operator to + simplify the syntax. + + Examples: + If you have two datasets, ds1 and ds2, you can concatenate them:: + + ds = ds1 + ds2 + + `ds` will be a FRDCConcatDataset, which is a subclass of ConcatDataset. + + You can further add to a concatenated dataset:: + + ds = ds1 + ds2 + ds = ds + ds3 + + Finallu, all concatenated datasets have the `targets` property, which + is a list of all the targets in the datasets:: + + (ds1 + ds2).targets == ds1.targets + ds2.targets + """ + + def __init__(self, datasets: list[FRDCDataset]): + super().__init__(datasets) + self.datasets: list[FRDCDataset] = datasets + + @property + def targets(self): + return [t for ds in self.datasets for t in ds.targets] + + def __add__(self, other: FRDCDataset) -> FRDCConcatDataset: + return FRDCConcatDataset([*self.datasets, other]) From 856eed73f44d029467d941a6b6d3f42b450fa79d Mon Sep 17 00:00:00 2001 From: Eve-ning Date: Wed, 26 Jun 2024 15:13:53 +0800 Subject: [PATCH 05/12] Remove unraisable error --- src/frdc/load/gcs.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/frdc/load/gcs.py b/src/frdc/load/gcs.py index 6c84a20b..7c68fb1e 100644 --- a/src/frdc/load/gcs.py +++ b/src/frdc/load/gcs.py @@ -39,8 +39,6 @@ def download( Raises: ValueError: If there are multiple blobs that match the path_glob. FileNotFoundError: If the file does not exist in GCS. - FileExistsError: If the file already exists locally and the hashes - match. Returns: The local path to the downloaded file. 
From 78a5bf6bbd2d1d25ec1254d87b66b3541d54f8a4 Mon Sep 17 00:00:00 2001 From: Eve-ning Date: Wed, 26 Jun 2024 15:15:03 +0800 Subject: [PATCH 06/12] Drop unused imports and add return type hint --- src/frdc/load/label_studio.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/frdc/load/label_studio.py b/src/frdc/load/label_studio.py index 435cc40f..33cbdabf 100644 --- a/src/frdc/load/label_studio.py +++ b/src/frdc/load/label_studio.py @@ -4,8 +4,6 @@ from pathlib import Path from warnings import warn -from label_studio_sdk.data_manager import Filters, Column, Type, Operator - from frdc.conf import LABEL_STUDIO_CLIENT logger = logging.getLogger(__name__) @@ -61,7 +59,7 @@ def get_bounds_and_labels(self) -> tuple[list[tuple[int, int]], list[str]]: bbox["label"] = bbox.pop("polygonlabels")[0] if not bbox["closed"]: logger.warning( - f"Label for {bbox['label']} @ {bbox['points']} not closed. " + f"Label {bbox['label']} @ {bbox['points']} is not closed. " f"Skipping" ) continue @@ -75,7 +73,7 @@ def get_bounds_and_labels(self) -> tuple[list[tuple[int, int]], list[str]]: def get_task( file_name: Path | str = "chestnut_nature_park/20201218/result.jpg", project_id: int = 1, -): +) -> Task: proj = LABEL_STUDIO_CLIENT.get_project(project_id) task_ids = [ task["id"] From b17cb84056785a78e8dc682b3c5052bf76b9fef0 Mon Sep 17 00:00:00 2001 From: Eve-ning Date: Wed, 26 Jun 2024 15:16:09 +0800 Subject: [PATCH 07/12] Update signatures for get legacy bounds --- Writerside/topics/load.dataset.md | 2 +- Writerside/topics/preprocessing.extract_segments.md | 2 +- tests/unit_tests/load/test_frdc_dataset.py | 2 +- tests/unit_tests/load/test_label_studio.py | 2 +- tests/unit_tests/preprocess/test_extract_segments.py | 4 ++-- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Writerside/topics/load.dataset.md b/Writerside/topics/load.dataset.md index 704c012b..7e5e610c 100644 --- a/Writerside/topics/load.dataset.md +++ b/Writerside/topics/load.dataset.md @@ -27,7 +27,7 @@ Then, we can use the `ds` object to load objects of the dataset: ```python ar, order = ds._get_ar_bands() d = ds._get_ar_bands_as_dict() -bounds, labels = ds._get_bounds_and_labels() +bounds, labels = ds._get_legacy_bounds_and_labels() ``` - `ar` is a stacked NDArray of the hyperspectral bands of shape (H x W x C) diff --git a/Writerside/topics/preprocessing.extract_segments.md b/Writerside/topics/preprocessing.extract_segments.md index 32c9d78c..de16038a 100644 --- a/Writerside/topics/preprocessing.extract_segments.md +++ b/Writerside/topics/preprocessing.extract_segments.md @@ -140,7 +140,7 @@ from frdc.preprocess.extract_segments import extract_segments_from_bounds ds = FRDCDatasetPreset.chestnut_20201218() ar, order = ds._get_ar_bands() -bounds, labels = ds._get_bounds_and_labels() +bounds, labels = ds._get_legacy_bounds_and_labels() segments: list[np.ndarray] = extract_segments_from_bounds(ar, bounds) ``` diff --git a/tests/unit_tests/load/test_frdc_dataset.py b/tests/unit_tests/load/test_frdc_dataset.py index c887a871..8376eccb 100644 --- a/tests/unit_tests/load/test_frdc_dataset.py +++ b/tests/unit_tests/load/test_frdc_dataset.py @@ -21,7 +21,7 @@ def test_get_ar_bands_ordering(ds): def test_get_bounds(ds): - bounds, labels = ds._get_bounds_and_labels() + bounds, labels = ds._get_legacy_bounds_and_labels() assert all([isinstance(b, Rect) for b in bounds]) assert len(bounds) == len(labels) diff --git a/tests/unit_tests/load/test_label_studio.py b/tests/unit_tests/load/test_label_studio.py index 
a38b3c19..aba6e757 100644 --- a/tests/unit_tests/load/test_label_studio.py +++ b/tests/unit_tests/load/test_label_studio.py @@ -5,4 +5,4 @@ @requires_label_studio def test_get_bounds_and_labels(): task = get_task("DEBUG/0/result.jpg") - bounds, labels = task._get_bounds_and_labels() + bounds, labels = task._get_legacy_bounds_and_labels() diff --git a/tests/unit_tests/preprocess/test_extract_segments.py b/tests/unit_tests/preprocess/test_extract_segments.py index 5433e0e8..4f6ecbb1 100644 --- a/tests/unit_tests/preprocess/test_extract_segments.py +++ b/tests/unit_tests/preprocess/test_extract_segments.py @@ -76,13 +76,13 @@ def test_unique_labels( def test_extract_segments_from_bounds_cropped(ds, ar): - bounds, labels = ds._get_bounds_and_labels() + bounds, labels = ds._get_legacy_bounds_and_labels() segments = extract_segments_from_bounds(ar, bounds, cropped=True) assert any(segment.shape != ar.shape for segment in segments) def test_extract_segments_from_bounds_no_crop(ds, ar): - bounds, labels = ds._get_bounds_and_labels() + bounds, labels = ds._get_legacy_bounds_and_labels() segments = extract_segments_from_bounds(ar, bounds, cropped=False) assert all(segment.shape == ar.shape for segment in segments) From a584ad32f4cd8ab75f3e0580f9ccfb9eba2d5297 Mon Sep 17 00:00:00 2001 From: Eve-ning Date: Wed, 26 Jun 2024 19:19:25 +0800 Subject: [PATCH 08/12] Clarify attribute naming of order --- src/frdc/load/dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/frdc/load/dataset.py b/src/frdc/load/dataset.py index 7babd794..ca18f446 100644 --- a/src/frdc/load/dataset.py +++ b/src/frdc/load/dataset.py @@ -74,7 +74,7 @@ def __init__( self.date = date self.version = version - self.ar, self.order = self._get_ar_bands() + self.ar, self.band_order = self._get_ar_bands() self.targets = None if use_legacy_bounds: From 1673a90e060909fc914074a90cedd63ff6101447 Mon Sep 17 00:00:00 2001 From: Eve-ning Date: Wed, 26 Jun 2024 19:21:08 +0800 Subject: [PATCH 09/12] Update docs --- Writerside/d.tree | 2 +- Writerside/topics/ML-Architecture.md | 107 +++++++++++++++++++++ Writerside/topics/Retrieve-our-Datasets.md | 53 ++++++++-- Writerside/topics/train.frdc_lightning.md | 70 -------------- Writerside/writerside.cfg | 2 +- docs/HelpTOC.json | 2 +- docs/Map.jhm | 2 +- docs/api-object-digest.json | 1 + docs/config.json | 2 +- docs/current.help.version | 2 +- docs/custom-k-aug-dataloaders.html | 21 +++- docs/get-started-with-dev-containers.html | 19 +++- docs/getting-started.html | 35 +++++-- docs/load-dataset.html | 34 +++++-- docs/load-gcs.html | 21 +++- docs/mix-match-module.html | 27 ++++-- docs/mix-match.html | 17 +++- docs/ml-architecture.html | 16 +++ docs/model-test-chestnut-may-dec.html | 17 +++- docs/overview.html | 17 +++- docs/preprocessing-extract-segments.html | 37 ++++--- docs/preprocessing-glcm-padded.html | 19 +++- docs/preprocessing-morphology.html | 23 ++++- docs/preprocessing-scale.html | 21 +++- docs/retrieve-our-datasets.html | 94 +++++++++++++----- docs/train-frdc-lightning.html | 4 - 26 files changed, 499 insertions(+), 166 deletions(-) create mode 100644 Writerside/topics/ML-Architecture.md delete mode 100644 Writerside/topics/train.frdc_lightning.md create mode 100644 docs/api-object-digest.json create mode 100644 docs/ml-architecture.html delete mode 100644 docs/train-frdc-lightning.html diff --git a/Writerside/d.tree b/Writerside/d.tree index 32778cf1..9918f35d 100644 --- a/Writerside/d.tree +++ b/Writerside/d.tree @@ -8,6 +8,7 @@ start-page="Overview.md"> + @@ 
-28,6 +29,5 @@ - \ No newline at end of file diff --git a/Writerside/topics/ML-Architecture.md b/Writerside/topics/ML-Architecture.md new file mode 100644 index 00000000..058c975d --- /dev/null +++ b/Writerside/topics/ML-Architecture.md @@ -0,0 +1,107 @@ +# ML Architecture + +The architecture is the backbone of the project. If you're interested on how +everything is pieced together, this article is for you. + +In Machine Learning architectures, we mostly care about 2 things the data, +and the model. As the name implies, DataModules, DataLoaders, Datasets deal +with data, and Modules for model construction. + +## Data Classes +There's a small difference between the Data___ classes. Firstly, we load data +in as `Dataset` instances, then preprocessed before being batched by +`DataLoader`, finally, housed in `DataModule`. + +```mermaid +graph LR + src[Data Source] --> load{{Load}} + load --> ds[Dataset] + ds --> prep + subgraph prep[Preprocess] + direction TB + subgraph Augmentations + direction TB + alt[Alternatives] --> dist[Distortions] + dist --> crop[Cropping or Resizing] + end + crop --> sca[Scaling] + end + + prep --> dl[DataLoader] + + subgraph DataModule + direction TB + trndl[Train DataLoader] + valdl[Validation DataLoader] + tesdl[Test DataLoader] + end + + dl --> trndl + dl --> valdl + dl --> tesdl +``` + +There are 2 **IMPORTANT** design decisions here: + +### Dataset and DataLoader + +Data in `Dataset` are unbatched, data in `DataLoader` must be batched. +This means that it's possible to have jagged tensors at this stage, however +they must be made "stackable" before loading into the `DataLoader`. + +For example, the data in `Dataset` could be of shapes +`[(8, 200, 100), (8, 100, 300), ...]`. While, **BEFORE** loading into +`DataLoader` must have equal shapes, for example +`[(8, 100, 100), (8, 100, 100), ...]` + +This is because when you initialize a `DataLoader` you need to include the +`batch_size`, which implies the data are stacked in some manner. + +This also leads to the reason why **preprocessing** must happen before the +`DataLoader` + +### Preprocessing + +Excluding functionalities to **load** the data, this is the step before the +data is set in stone. So, steps such as augmentation, transformation, even +analytics needs to be performed here as the data is in its **"rawest"** form. + +We use this step to +1. Construct alternative augmentations. i.e. images that we **could've** taken + instead. +2. Using those alternatives, add distortions. i.e. unintentional changes + to the photo that reduces quality. +3. Cropping or resizing the image. +4. Scale the data. e.g. Standard Scaling, ZCA Scaling, etc. + +The order of the steps are choice by design. + +## Modules + +We analyze the inheritance structure of the Modules (also the ML Models): + +```mermaid +graph TD + lib([Library Module]) + pytmod([PyTorch Module]) --> litmod([Lightning Module]) + litmod --> frdcmod[FRDC Module] --> fmmod[FixMatch Module] + frdcmod --> mmmod[MixMatch Module] + fmmod --> effmmod[EfficientNetB1 FixMatch Module] + mmmod --> effmmmod[EfficientNetB1 MixMatch Module] + custom[Custom Module] +``` + +Custom Modules are our self-defined classes. +- **FRDC Module**: This is the base class for all our models. Implements common + functionality, such as partial saving of unfrozen parameters. +- **Y Module**: Y is the architecture/framework of the model + in our case, this only defines the method of training, not the actual model + itself. 
+- **X Y Module**: X defines the actual model being used within Y's framework. + +To give an example, we look at `EfficientNetB1FixMatchModule`. Due to its +naming scheme `Module`, we see that it's an EfficientNetB1 +model used in the FixMatch framework. + +Furthermore, because it's well decoupled, implementing a new model is as easy +as overriding some defaults. diff --git a/Writerside/topics/Retrieve-our-Datasets.md b/Writerside/topics/Retrieve-our-Datasets.md index b503c2eb..b7749207 100644 --- a/Writerside/topics/Retrieve-our-Datasets.md +++ b/Writerside/topics/Retrieve-our-Datasets.md @@ -4,9 +4,10 @@ In this tutorial, we'll learn how to : -- Retrieve FRDC's Hyperspectral Image Data as `np.ndarray` -- Retrieve FRDC's Ground Truth bounds and labels -- Slice/segment the image data by the bounds +- Retrieve FRDC's Datasets +- How to inspect the data +- How to integrate it with PyTorch's DataLoader +- How to visualize the data ## Prerequisites @@ -78,16 +79,56 @@ ds = FRDCDatasetPreset.chestnut_20201218() ar = ds.ar ``` -Finally, to inspect the order of the bands, you can use the `order` attribute. +Finally, inspect the order of the bands through the `band_order` attribute. ```python from frdc.load.preset import FRDCDatasetPreset ds = FRDCDatasetPreset.chestnut_20201218() -ds.order -# > ['WB', 'WG', 'WR', 'NB', 'NG', 'NR', 'RE', 'NIR'] +ds.band_order ``` +```Console +> ['WB', 'WG', 'WR', 'NB', 'NG', 'NR', 'RE', 'NIR'] +``` + +## Using with PyTorch's DataLoader + +Every `FRDCDataset` is a `Dataset` object, so you can use it with PyTorch's +`DataLoader`. This allows you to retrieve by batches! + +```python +from torch.utils.data import DataLoader +from torchvision.transforms.v2 import CenterCrop, Compose, Resize, ToImage + +from frdc.load.preset import FRDCDatasetPreset + +ds = FRDCDatasetPreset.chestnut_20201218( + use_legacy_bounds=True, + transform=Compose([ToImage(), Resize(100), CenterCrop(100)]), +) +dl = DataLoader(ds, batch_size=4, shuffle=True) + +for x, y in dl: + print(x.shape, y) +``` + +Which should output + +```Console +torch.Size([4, 8, 100, 100]) ('Falcataria Moluccana', ...) +torch.Size([4, 8, 100, 100]) ('Clausena Excavata', ...) +torch.Size([4, 8, 100, 100]) ('Clausena Excavata', ...) +... +``` + +> **RuntimeError: stack expects each tensor to be equal size**: +> The reason for this error is that `DataLoader` expects equal dimensions +> (image height and width) for all images. To fix this, you can use +> `torchvision.transforms.v2.Resize` to resize the images to a fixed size in +> the above example. +{style='warning'} + ## Plot the Data (Optional) {collapsible="true"} We can then use these data to plot out the first tree segment. diff --git a/Writerside/topics/train.frdc_lightning.md b/Writerside/topics/train.frdc_lightning.md deleted file mode 100644 index dba46e74..00000000 --- a/Writerside/topics/train.frdc_lightning.md +++ /dev/null @@ -1,70 +0,0 @@ -# train.frdc_datamodule & frdc_module - - -The FRDC PyTorch LightningDataModule and LightningModule. - - -These are FRDC specific LightningDataModule and LightningModule, -a core component in the PyTorch Lightning ecosystem to provide a simple -interface to train and evaluate models. - -## Classes - -> It's optional to use these classes, you can use your own training loop -> if you want. We'll use these for our training pipeline. -> {style='note'} - - - -The FRDC PyTorch Lightning DataModule. - - -The FRDC PyTorch Lightning Module. 
- - - -## Usage - -> See our training pipeline for a full example - -## API - - - -Initializes the FRDC PyTorch Lightning DataModule.
    - -
  • segments, labels are retrieved from - -
  • FRDCDataset
  • -
  • Segmentation
  • -
    - -
  • preprocess is a function that takes in a segment and returns a preprocessed segment. In particular, it should accept a list of NumPy NDArrays and return a single stacked PyTorch Tensor.
  • -
  • augmentation is a function that takes in a segment and returns an augmented -segment. In particular, it takes in a PyTorch Tensor and returns another.
  • -
  • train_val_test_split is a function that takes a TensorDataset and returns -a list of 3 TensorDatasets, for train, val and test respectively.
  • -
  • batch_size is the batch size.
  • - -For now, the augmentation is only applied to training -data -
    - -Initializes the FRDC PyTorch Lightning Module.
    - -
  • model_cls is the Class of the model.
  • -
  • model_kwargs is the kwargs to pass to the model.
  • -
  • optim_cls is the Class of the optimizer.
  • -
  • optim_kwargs is the kwargs to pass to the optimizer.
  • -
    -Internally, the module will initialize the model and optimizer as follows: - -model = model_cls(**model_kwargs) -optim = optim_cls(model.parameters(), **optim_kwargs) - -We do not accept the instances of the Model and Optimizer so -that we can pickle them. -
    -
    diff --git a/Writerside/writerside.cfg b/Writerside/writerside.cfg index 9e1b0444..40df6e66 100644 --- a/Writerside/writerside.cfg +++ b/Writerside/writerside.cfg @@ -4,5 +4,5 @@ - + \ No newline at end of file diff --git a/docs/HelpTOC.json b/docs/HelpTOC.json index 5467dd78..59ab5806 100644 --- a/docs/HelpTOC.json +++ b/docs/HelpTOC.json @@ -1 +1 @@ -{"entities":{"pages":{"Overview":{"id":"Overview","title":"Overview","url":"overview.html","level":0,"tabIndex":0},"Getting-Started":{"id":"Getting-Started","title":"Getting Started","url":"getting-started.html","level":0,"pages":["Get-Started-with-Dev-Containers"],"tabIndex":1},"Get-Started-with-Dev-Containers":{"id":"Get-Started-with-Dev-Containers","title":"Get Started with Dev Containers","url":"get-started-with-dev-containers.html","level":1,"parentId":"Getting-Started","tabIndex":0},"f6c570e4_4234":{"id":"f6c570e4_4234","title":"Tutorials","level":0,"pages":["Retrieve-our-Datasets"],"tabIndex":2},"Retrieve-our-Datasets":{"id":"Retrieve-our-Datasets","title":"Retrieve our Datasets","url":"retrieve-our-datasets.html","level":1,"parentId":"f6c570e4_4234","tabIndex":0},"mix-match":{"id":"mix-match","title":"MixMatch","url":"mix-match.html","level":0,"pages":["mix-match-module","custom-k-aug-dataloaders"],"tabIndex":3},"mix-match-module":{"id":"mix-match-module","title":"MixMatch Module","url":"mix-match-module.html","level":1,"parentId":"mix-match","tabIndex":0},"custom-k-aug-dataloaders":{"id":"custom-k-aug-dataloaders","title":"Custom K-Aug Dataloaders","url":"custom-k-aug-dataloaders.html","level":1,"parentId":"mix-match","tabIndex":1},"f6c570e4_4239":{"id":"f6c570e4_4239","title":"Model Tests","level":0,"pages":["Model-Test-Chestnut-May-Dec"],"tabIndex":4},"Model-Test-Chestnut-May-Dec":{"id":"Model-Test-Chestnut-May-Dec","title":"Model Test Chestnut May-Dec","url":"model-test-chestnut-may-dec.html","level":1,"parentId":"f6c570e4_4239","tabIndex":0},"f6c570e4_4241":{"id":"f6c570e4_4241","title":"API","level":0,"pages":["load.dataset","load.gcs","preprocessing.scale","preprocessing.extract_segments","preprocessing.morphology","preprocessing.glcm_padded","train.frdc_lightning"],"tabIndex":5},"load.dataset":{"id":"load.dataset","title":"load.dataset","url":"load-dataset.html","level":1,"parentId":"f6c570e4_4241","tabIndex":0},"load.gcs":{"id":"load.gcs","title":"load.gcs","url":"load-gcs.html","level":1,"parentId":"f6c570e4_4241","tabIndex":1},"preprocessing.scale":{"id":"preprocessing.scale","title":"preprocessing.scale","url":"preprocessing-scale.html","level":1,"parentId":"f6c570e4_4241","tabIndex":2},"preprocessing.extract_segments":{"id":"preprocessing.extract_segments","title":"preprocessing.extract_segments","url":"preprocessing-extract-segments.html","level":1,"parentId":"f6c570e4_4241","tabIndex":3},"preprocessing.morphology":{"id":"preprocessing.morphology","title":"preprocessing.morphology","url":"preprocessing-morphology.html","level":1,"parentId":"f6c570e4_4241","tabIndex":4},"preprocessing.glcm_padded":{"id":"preprocessing.glcm_padded","title":"preprocessing.glcm_padded","url":"preprocessing-glcm-padded.html","level":1,"parentId":"f6c570e4_4241","tabIndex":5},"train.frdc_lightning":{"id":"train.frdc_lightning","title":"train.frdc_datamodule \u0026 frdc_module","url":"train-frdc-lightning.html","level":1,"parentId":"f6c570e4_4241","tabIndex":6}}},"topLevelIds":["Overview","Getting-Started","f6c570e4_4234","mix-match","f6c570e4_4239","f6c570e4_4241"]} \ No newline at end of file 
+{"entities":{"pages":{"Overview":{"id":"Overview","title":"Overview","url":"overview.html","level":0,"tabIndex":0},"ML-Architecture":{"id":"ML-Architecture","title":"ML Architecture","url":"ml-architecture.html","level":0,"tabIndex":1},"Getting-Started":{"id":"Getting-Started","title":"Getting Started","url":"getting-started.html","level":0,"pages":["Get-Started-with-Dev-Containers"],"tabIndex":2},"Get-Started-with-Dev-Containers":{"id":"Get-Started-with-Dev-Containers","title":"Get Started with Dev Containers","url":"get-started-with-dev-containers.html","level":1,"parentId":"Getting-Started","tabIndex":0},"-6vddrq_5799":{"id":"-6vddrq_5799","title":"Tutorials","level":0,"pages":["Retrieve-our-Datasets"],"tabIndex":3},"Retrieve-our-Datasets":{"id":"Retrieve-our-Datasets","title":"Retrieve our Datasets","url":"retrieve-our-datasets.html","level":1,"parentId":"-6vddrq_5799","tabIndex":0},"mix-match":{"id":"mix-match","title":"MixMatch","url":"mix-match.html","level":0,"pages":["mix-match-module","custom-k-aug-dataloaders"],"tabIndex":4},"mix-match-module":{"id":"mix-match-module","title":"MixMatch Module","url":"mix-match-module.html","level":1,"parentId":"mix-match","tabIndex":0},"custom-k-aug-dataloaders":{"id":"custom-k-aug-dataloaders","title":"Custom K-Aug Dataloaders","url":"custom-k-aug-dataloaders.html","level":1,"parentId":"mix-match","tabIndex":1},"-6vddrq_5804":{"id":"-6vddrq_5804","title":"Model Tests","level":0,"pages":["Model-Test-Chestnut-May-Dec"],"tabIndex":5},"Model-Test-Chestnut-May-Dec":{"id":"Model-Test-Chestnut-May-Dec","title":"Model Test Chestnut May-Dec","url":"model-test-chestnut-may-dec.html","level":1,"parentId":"-6vddrq_5804","tabIndex":0},"-6vddrq_5806":{"id":"-6vddrq_5806","title":"API","level":0,"pages":["load.dataset","load.gcs","preprocessing.scale","preprocessing.extract_segments","preprocessing.morphology","preprocessing.glcm_padded"],"tabIndex":6},"load.dataset":{"id":"load.dataset","title":"load.dataset","url":"load-dataset.html","level":1,"parentId":"-6vddrq_5806","tabIndex":0},"load.gcs":{"id":"load.gcs","title":"load.gcs","url":"load-gcs.html","level":1,"parentId":"-6vddrq_5806","tabIndex":1},"preprocessing.scale":{"id":"preprocessing.scale","title":"preprocessing.scale","url":"preprocessing-scale.html","level":1,"parentId":"-6vddrq_5806","tabIndex":2},"preprocessing.extract_segments":{"id":"preprocessing.extract_segments","title":"preprocessing.extract_segments","url":"preprocessing-extract-segments.html","level":1,"parentId":"-6vddrq_5806","tabIndex":3},"preprocessing.morphology":{"id":"preprocessing.morphology","title":"preprocessing.morphology","url":"preprocessing-morphology.html","level":1,"parentId":"-6vddrq_5806","tabIndex":4},"preprocessing.glcm_padded":{"id":"preprocessing.glcm_padded","title":"preprocessing.glcm_padded","url":"preprocessing-glcm-padded.html","level":1,"parentId":"-6vddrq_5806","tabIndex":5}}},"topLevelIds":["Overview","ML-Architecture","Getting-Started","-6vddrq_5799","mix-match","-6vddrq_5804","-6vddrq_5806"]} \ No newline at end of file diff --git a/docs/Map.jhm b/docs/Map.jhm index 9449faa3..7b65cf66 100644 --- a/docs/Map.jhm +++ b/docs/Map.jhm @@ -1 +1 @@ - \ No newline at end of file + \ No newline at end of file diff --git a/docs/api-object-digest.json b/docs/api-object-digest.json new file mode 100644 index 00000000..9e26dfee --- /dev/null +++ b/docs/api-object-digest.json @@ -0,0 +1 @@ +{} \ No newline at end of file diff --git a/docs/config.json b/docs/config.json index 1f05ea18..d1e3c8ce 100644 --- 
a/docs/config.json +++ b/docs/config.json @@ -1 +1 @@ -{"productVersion":"0.0.8","productId":"d","stage":"release","downloadTitle":"Get Documentation","keymaps":{},"searchMaxHits":75,"productName":"Documentation"} \ No newline at end of file +{"productVersion":"0.1.2","productWebUrl":".","productId":"d","stage":"release","downloadTitle":"Get Documentation","keymaps":{},"searchMaxHits":75,"productName":"Documentation"} \ No newline at end of file diff --git a/docs/current.help.version b/docs/current.help.version index 7d6b3eb3..8294c184 100644 --- a/docs/current.help.version +++ b/docs/current.help.version @@ -1 +1 @@ -0.0.8 \ No newline at end of file +0.1.2 \ No newline at end of file diff --git a/docs/custom-k-aug-dataloaders.html b/docs/custom-k-aug-dataloaders.html index 473d540d..0d084f17 100644 --- a/docs/custom-k-aug-dataloaders.html +++ b/docs/custom-k-aug-dataloaders.html @@ -1,4 +1,19 @@ - Custom K-Aug Dataloaders | Documentation

    Documentation 0.0.8 Help

    Custom K-Aug Dataloaders

    In MixMatch, implementing the data loading methods is quite unconventional.

    1. We need to load multiple augmented versions of the same image into the same batch.

    2. The labelled set is usually too small, causing a premature end to the epoch as it runs out of samples to draw from faster than the unlabelled set.

    This can be rather tricky to implement in PyTorch. This tutorial will illustrate how we did it.

    Loading Multiple Augmented Versions of the Same Image

    See: frdc/load/dataset.py FRDCDataset.__getitem__

    In MixMatch, a single train batch must consist of:

    1. A batch of labeled images

    2. K batches of unlabeled images

    Aug
    Aug
    Aug
    Aug
    Get Batch
    Aug Labelled Batch
    Unlabelled Batch
    Aug Unl. Batch 1
    Aug Unl. Batch i
    Aug Unl. Batch K

    Keep in mind that the unlabelled batch, is a single batch of images, not separate draws of batches. It is then "duplicated" K times, and each copy is augmented differently.

    Solution 1: Custom Dataset

    To solve this, we need to understand the role of both a Dataset and a DataLoader.

    • A Dataset represents a collection of data, responsible for loading and returning something.

    • A DataLoader draws samples from a Dataset and returns batched samples.

    The key here is that a Dataset is not limited to returning 1 sample at a time, we can make it return the K augmented versions of the same image.

    Aug
    Aug
    Aug
    Sample
    Aug Sample 1
    Aug Sample i
    Aug Sample K

    In code, this is done by subclassing the Dataset class and overriding the __getitem__ method.

    + +Custom K-Aug Dataloaders | Documentation

    Documentation 0.1.2 Help

    Custom K-Aug Dataloaders

    In MixMatch, implementing the data loading methods is quite unconventional.

    1. We need to load multiple augmented versions of the same image into the same batch.

    2. The labelled set is usually too small, causing a premature end to the epoch as it runs out of samples to draw from faster than the unlabelled set.

    This can be rather tricky to implement in PyTorch. This tutorial will illustrate how we did it.

    Loading Multiple Augmented Versions of the Same Image

    See: frdc/load/dataset.py FRDCDataset.__getitem__

    In MixMatch, a single train batch must consist of:

    1. A batch of labeled images

    2. K batches of unlabeled images

    Aug
    Aug
    Aug
    Aug
    Get Batch
    Aug Labelled Batch
    Unlabelled Batch
    Aug Unl. Batch 1
    Aug Unl. Batch i
    Aug Unl. Batch K

    Keep in mind that the unlabelled batch, is a single batch of images, not separate draws of batches. It is then "duplicated" K times, and each copy is augmented differently.

    Solution 1: Custom Dataset

    To solve this, we need to understand the role of both a Dataset and a DataLoader.

    • A Dataset represents a collection of data, responsible for loading and returning something.

    • A DataLoader draws samples from a Dataset and returns batched samples.

The key here is that a Dataset is not limited to returning 1 sample at a time; we can make it return the K augmented versions of the same image.

    Aug
    Aug
    Aug
    Sample
    Aug Sample 1
    Aug Sample i
    Aug Sample K

    In code, this is done by subclassing the Dataset class and overriding the __getitem__ method.

    def duplicate(x): return x, deepcopy(x), deepcopy(x) @@ -10,7 +25,7 @@ def __getitem__(self, index): x, y = self.dataset[index] return self.aug(x), y -

    In the above example, we have a Dataset that returns 3 duplicate versions of the same image. By leveraging this technique, we can create a Dataset that returns K augmented versions of the same image as a tuple

    Premature End of Epoch due to Small Labelled Set

    See: frdc/train/frdc_datamodule.py

    In MixMatch, the definition of an "epoch" is a bit different. Instead of implying that we have seen all the data once, it implies that we've drawn N batches. The N is referred to as the number of iterations per epoch.

    Take for example, a labelled set of numbers [1, 2, 3] and an unlabelled set [4, 5, 6, 7, 8, 9, 10]. With batch size of 2, we'll run out of labelled samples after 2 iterations, but we'll still have 3 more iterations for the unlabelled set.

    • Draw 1: [1, 2], [4, 5]

    • Draw 2: [3], [6, 7].

    • Epoch ends.

    Solution 2: Random Sampling

    To fix this, instead of sequentially sampling the labelled set (and the unlabelled set), we can sample them randomly. This way, we can ensure that it never runs out.

    • Draw 1: [1, 3], [7, 5]

    • Draw 2: [2, 1], [4, 9]

    • Draw 3: [3, 2], [8, 6]

    • ... and so on.

    Luckily, PyTorch's DataLoader supports random sampling. We just need to use RandomSampler instead of SequentialSampler (which is the default).

    +

In the above example, we have a Dataset that returns 3 duplicate versions of the same image. By leveraging this technique, we can create a Dataset that returns K augmented versions of the same image as a tuple.
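As a minimal sketch of that idea (the class and attribute names here are illustrative, not the actual FRDCDataset API):

from copy import deepcopy

from torch.utils.data import Dataset


class KAugDataset(Dataset):
    """Wraps a dataset and returns K independently augmented views per sample."""

    def __init__(self, dataset, aug, k):
        self.dataset = dataset  # the underlying dataset of (x, y) pairs
        self.aug = aug          # a callable augmentation, e.g. a torchvision transform
        self.k = k

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, index):
        x, y = self.dataset[index]
        # Each call to self.aug draws new random parameters, so the K views differ.
        return tuple(self.aug(deepcopy(x)) for _ in range(self.k)), y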

    Premature End of Epoch due to Small Labelled Set

    See: frdc/train/frdc_datamodule.py

    In MixMatch, the definition of an "epoch" is a bit different. Instead of implying that we have seen all the data once, it implies that we've drawn N batches. The N is referred to as the number of iterations per epoch.

    Take for example, a labelled set of numbers [1, 2, 3] and an unlabelled set [4, 5, 6, 7, 8, 9, 10]. With a batch size of 2, we'll run out of labelled samples after 2 iterations, but we'll still have 3 more iterations for the unlabelled set.

    • Draw 1: [1, 2], [4, 5]

    • Draw 2: [3], [6, 7].

    • Epoch ends.

    Solution 2: Random Sampling

    To fix this, instead of sequentially sampling the labelled set (and the unlabelled set), we can sample them randomly. This way, we can ensure that the labelled set never runs out.

    • Draw 1: [1, 3], [7, 5]

    • Draw 2: [2, 1], [4, 9]

    • Draw 3: [3, 2], [8, 6]

    • ... and so on.

    Luckily, PyTorch's DataLoader supports random sampling. We just need to use RandomSampler instead of SequentialSampler (which is the default).

    from torch.utils.data import DataLoader, RandomSampler dl = DataLoader( @@ -21,4 +36,4 @@ replacement=False, ) ) -

    This will ensure that the "epoch" ends when we've drawn train_iters batches

    Last modified: 29 December 2023
    \ No newline at end of file +

    This will ensure that the "epoch" ends when we've drawn train_iters batches.
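    As a minimal sketch, using the illustrative names lbl_ds, batch_size and train_iters (recent PyTorch versions allow num_samples together with replacement=False):

    from torch.utils.data import DataLoader, RandomSampler

    batch_size = 2
    train_iters = 3  # one "epoch" = train_iters batches

    dl = DataLoader(
        lbl_ds,  # the (small) labelled Dataset, assumed to exist
        batch_size=batch_size,
        sampler=RandomSampler(
            lbl_ds,
            # Draw exactly train_iters * batch_size samples per epoch, so the
            # epoch length no longer depends on the size of the labelled set.
            num_samples=batch_size * train_iters,
            replacement=False,
        ),
    )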

    Last modified: 26 June 2024
    \ No newline at end of file diff --git a/docs/get-started-with-dev-containers.html b/docs/get-started-with-dev-containers.html index be7dc455..5aa1c5f3 100644 --- a/docs/get-started-with-dev-containers.html +++ b/docs/get-started-with-dev-containers.html @@ -1,3 +1,18 @@ - Get Started with Dev Containers | Documentation

    Documentation 0.0.8 Help

    Get Started with Dev Containers

    Dev. Containers are a great way to get started with a project. They define all necessary dependencies and environments, so you can just start coding within the container.

    In this article, we'll only go over additional steps to set up with our project. For more information on how to use Dev Containers, please refer to the official documentation for each IDE. Once you've set up the Dev Container, come back here to finish the setup:

    Python Environment

    The dev environment is already created and is managed by Anaconda /opt/conda/bin/conda. To activate the environment, run the following command:

    + +Get Started with Dev Containers | Documentation

    Documentation 0.1.2 Help

    Get Started with Dev Containers

    Dev. Containers are a great way to get started with a project. They define all necessary dependencies and environments, so you can just start coding within the container.

    In this article, we'll only go over additional steps to set up with our project. For more information on how to use Dev Containers, please refer to the official documentation for each IDE. Once you've set up the Dev Container, come back here to finish the setup:

    Python Environment

    The dev environment is already created and is managed by Anaconda (/opt/conda/bin/conda). To activate the environment, run the following command:

    conda activate base -

    Mark as Sources Root (Add to PYTHONPATH)

    For import statements to work, you need to mark the src folder as the sources root. Optionally, also mark the tests folder as the tests root.

    Additional Setup

    Refer to the Getting Started guide for additional setup steps such as:

    • Google Cloud Application Default Credentials

    • Weights & Biases API Key

    • Label Studio API Key

    Last modified: 29 December 2023
    \ No newline at end of file +

    Mark as Sources Root (Add to PYTHONPATH)

    For import statements to work, you need to mark the src folder as the sources root. Optionally, also mark the tests folder as the tests root.

    Additional Setup

    Refer to the Getting Started guide for additional setup steps such as:

    • Google Cloud Application Default Credentials

    • Weights & Biases API Key

    • Label Studio API Key

    Last modified: 26 June 2024
    \ No newline at end of file diff --git a/docs/getting-started.html b/docs/getting-started.html index 8bc4700d..375a039b 100644 --- a/docs/getting-started.html +++ b/docs/getting-started.html @@ -1,22 +1,37 @@ - Getting Started | Documentation

    Documentation 0.0.8 Help

    Getting Started

    Installing the Dev. Environment

    1. Ensure that you have the right version of Python. The required Python version can be seen in pyproject.toml

      + +Getting Started | Documentation

      Documentation 0.1.2 Help

      Getting Started

      Installing the Dev. Environment

      1. Ensure that you have the right version of Python. The required Python version can be seen in pyproject.toml

        [tool.poetry.dependencies] python = "..." -
      2. Start by cloning our repository.

        +
      3. Start by cloning our repository.

        git clone https://github.com/FR-DC/FRDC-ML.git -
      4. Then, create a Python Virtual Env pyvenv

        python -m venv venv/
        python3 -m venv venv/
      5. Install Poetry Then check if it's installed with

        poetry --version
      6. Activate the virtual environment

        +
      7. Then, create a Python Virtual Env pyvenv

        python -m venv venv/
        python3 -m venv venv/
      8. Install Poetry Then check if it's installed with

        poetry --version
      9. Activate the virtual environment

        cd venv/Scripts activate cd ../.. -
        +
        source venv/bin/activate -
      10. Install the dependencies. You should be in the same directory as pyproject.toml

        +
    2. Install the dependencies. You should be in the same directory as pyproject.toml

      poetry install --with dev -
    3. Install Pre-Commit Hooks

      +
    4. Install Pre-Commit Hooks

      pre-commit install -

    Setting Up Google Cloud

    1. We use Google Cloud to store our datasets. To set up Google Cloud, install the Google Cloud CLI

    2. Then, authenticate your account.

      gcloud auth login
    3. Finally, set up Application Default Credentials (ADC).

      gcloud auth application-default login
    4. To make sure everything is working, run the tests.

    Setting Up Label Studio

    1. We use Label Studio to annotate our datasets. We won't go through how to install Label Studio, for contributors, it should be up on localhost:8080.

    2. Then, retrieve your own API key from Label Studio. Go to your account page and copy the API key.


    3. Set your API key as an environment variable.

      In Windows, go to "Edit environment variables for your account" and add this as a new environment variable with name LABEL_STUDIO_API_KEY.

      Export it as an environment variable.

      export LABEL_STUDIO_API_KEY=...

    Setting Up Weight and Biases

    1. We use W&B to track our experiments. To set up W&B, install the W&B CLI

    2. Then, authenticate your account.

      wandb login

    Pre-commit Hooks

    • +

    Setting Up Google Cloud

    1. We use Google Cloud to store our datasets. To set up Google Cloud, install the Google Cloud CLI

    2. Then, authenticate your account.

      gcloud auth login
    3. Finally, set up Application Default Credentials (ADC).

      gcloud auth application-default login
    4. To make sure everything is working, run the tests.

    Setting Up Label Studio

    1. We use Label Studio to annotate our datasets. We won't go through how to install Label Studio; for contributors, it should be up on localhost:8080.

    2. Then, retrieve your own API key from Label Studio. Go to your account page and copy the API key.


    3. Set your API key as an environment variable.

      In Windows, go to "Edit environment variables for your account" and add this as a new environment variable with name LABEL_STUDIO_API_KEY.

      Export it as an environment variable.

      export LABEL_STUDIO_API_KEY=...

    Setting Up Weights & Biases

    1. We use W&B to track our experiments. To set up W&B, install the W&B CLI

    2. Then, authenticate your account.

      wandb login

    Pre-commit Hooks

    • pre-commit install -

    Running the Tests

    • Run the tests to make sure everything is working

      +

    Running the Tests

    • Run the tests to make sure everything is working

      pytest -

    Troubleshooting

    ModuleNotFoundError

    It's likely that your src and tests directories are not in PYTHONPATH. To fix this, run the following command:

    +

    Troubleshooting

    ModuleNotFoundError

    It's likely that your src and tests directories are not in PYTHONPATH. To fix this, run the following command:

    export PYTHONPATH=$PYTHONPATH:./src:./tests -

    Or, set it in your IDE, for example, IntelliJ allows setting directories as Source Roots.

    google.auth.exceptions.DefaultCredentialsError

    It's likely that you haven't authenticated your Google Cloud account. See Setting Up Google Cloud

    Couldn't connect to Label Studio

    Label Studio must be running locally, exposed on localhost:8080. Furthermore, you need to specify the LABEL_STUDIO_API_KEY environment variable. See Setting Up Label Studio

    Cannot login to W&B

    You need to authenticate your W&B account. See Setting Up Weight and Biases If you're facing difficulties, set the WANDB_MODE environment variable to offline to disable W&B.

    Our Repository Structure

    Before starting development, take a look at our repository structure. This will help you understand where to put your code.

    Core Dependencies
    Resources
    Tests
    Repo Dependencies
    Dataset Loaders
    Preprocessing Fn.
    Train Deps
    Model Architectures
    Datasets ...
    FRDC
    src/frdc/
    rsc/
    tests/
    pyproject.toml,poetry.lock
    ./load/
    ./preprocess/
    ./train/
    ./models/
    ./dataset_name/
    src/frdc/

    Source Code for our package. These are the unit components of our pipeline.

    rsc/

    Resources. These are usually cached datasets

    tests/

    PyTest tests. These are unit, integration, and model tests.

    Unit, Integration, and Pipeline Tests

    We have 3 types of tests:

    • Unit Tests are usually small, single function tests.

    • Integration Tests are larger tests that test a mock pipeline.

    • Model Tests are the true production pipeline tests that will generate a model.

    Where Should I contribute?

    Changing a small component

    If you're changing a small component, such as an argument for preprocessing, a new model architecture, or a new configuration for a dataset, take a look at the src/frdc/ directory.

    Adding a test

    By adding a new component, you'll need to add a new test. Take a look at the tests/ directory.

    Changing the model pipeline

    If you're a ML Researcher, you'll probably be changing the pipeline. Take a look at the tests/model_tests/ directory.

    Adding a dependency

    If you're adding a new dependency, use poetry add PACKAGE and commit the changes to pyproject.toml and poetry.lock.

    Last modified: 29 December 2023
    \ No newline at end of file +

    Or, set it in your IDE, for example, IntelliJ allows setting directories as Source Roots.

    google.auth.exceptions.DefaultCredentialsError

    It's likely that you haven't authenticated your Google Cloud account. See Setting Up Google Cloud

    Couldn't connect to Label Studio

    Label Studio must be running locally, exposed on localhost:8080. Furthermore, you need to specify the LABEL_STUDIO_API_KEY environment variable. See Setting Up Label Studio

    Cannot log in to W&B

    You need to authenticate your W&B account. See Setting Up Weights & Biases. If you're facing difficulties, set the WANDB_MODE environment variable to offline to disable W&B.

    Our Repository Structure

    Before starting development, take a look at our repository structure. This will help you understand where to put your code.

    Core Dependencies
    Resources
    Tests
    Repo Dependencies
    Dataset Loaders
    Preprocessing Fn.
    Train Deps
    Model Architectures
    Datasets ...
    FRDC
    src/frdc/
    rsc/
    tests/
    pyproject.toml,poetry.lock
    ./load/
    ./preprocess/
    ./train/
    ./models/
    ./dataset_name/
    src/frdc/

    Source Code for our package. These are the unit components of our pipeline.

    rsc/

    Resources. These are usually cached datasets.

    tests/

    PyTest tests. These are unit, integration, and model tests.

    Unit, Integration, and Pipeline Tests

    We have 3 types of tests:

    • Unit Tests are usually small, single function tests.

    • Integration Tests are larger tests that test a mock pipeline.

    • Model Tests are the true production pipeline tests that will generate a model.

    Where Should I Contribute?

    Changing a small component

    If you're changing a small component, such as an argument for preprocessing, a new model architecture, or a new configuration for a dataset, take a look at the src/frdc/ directory.

    Adding a test

    If you're adding a new component, you'll need to add a new test. Take a look at the tests/ directory.

    Changing the model pipeline

    If you're an ML Researcher, you'll probably be changing the pipeline. Take a look at the tests/model_tests/ directory.

    Adding a dependency

    If you're adding a new dependency, use poetry add PACKAGE and commit the changes to pyproject.toml and poetry.lock.

    Last modified: 26 June 2024
    \ No newline at end of file diff --git a/docs/load-dataset.html b/docs/load-dataset.html index 8ae04891..7522b773 100644 --- a/docs/load-dataset.html +++ b/docs/load-dataset.html @@ -1,12 +1,28 @@ - load.dataset | Documentation

    Documentation 0.0.8 Help

    load.dataset

    Usage

    Firstly, to load a dataset instance, you need to initialize a FRDCDataset object, providing the site, date, and version.

    For example, to load our Chestnut Nature Park dataset.

    + +load.dataset | Documentation

    Documentation 0.1.2 Help

    load.dataset

    Usage

    Firstly, to load a dataset instance, you need to initialize a FRDCDataset object, providing the site, date, and version.

    We recommend using the FRDCDatasetPreset module to load explicitly known datasets.

    from frdc.load.preset import FRDCDatasetPreset ds = FRDCDatasetPreset.chestnut_20201218() -

    Then, we can use the ds object to load objects of the dataset:

    -ar, order = ds.get_ar_bands() -d = ds.get_ar_bands_as_dict() -bounds, labels = ds.get_bounds_and_labels() -
    • ar is a stacked NDArray of the hyperspectral bands of shape (H x W x C)

    • order is a list of strings, containing the names of the bands, ordered according to the channels of ar

    • d is a dictionary of the hyperspectral bands of shape (H x W), keyed by the band names

    • bounds is a list of bounding boxes, in the format of Rect, a namedtuple of x0, y0, x1, y1

    • labels is a list of strings, containing the labels of the bounding boxes, ordered according to bounds

    Filters

    You can also selectively get the channels for both get_ar_bands() and get_ar_bands_as_dict() by providing a list of strings to the bands argument.

    For example, to get the Wideband RGB bands, you can do:

    -ar, order = ds.get_ar_bands(bands=['WR', 'WG', 'WB']) -d = ds.get_ar_bands_as_dict(bands=['WR', 'WG', 'WB']) -

    This will also alter the channel order to the order of the bands provided.

    See load.gcs for configuration options.

    Last modified: 29 December 2023
    \ No newline at end of file +

    Then, we can use the ds object to load objects of the dataset:

    +ar, order = ds._get_ar_bands() +d = ds._get_ar_bands_as_dict() +bounds, labels = ds._get_legacy_bounds_and_labels() +
    • ar is a stacked NDArray of the hyperspectral bands of shape (H x W x C)

    • order is a list of strings, containing the names of the bands, ordered according to the channels of ar

    • d is a dictionary of the hyperspectral bands of shape (H x W), keyed by the band names

    • bounds is a list of bounding boxes, in the format of Rect, a namedtuple of x0, y0, x1, y1

    • labels is a list of strings, containing the labels of the bounding boxes, ordered according to bounds
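    Putting the above together, a minimal usage sketch (the printed band names are only examples):

    from frdc.load.preset import FRDCDatasetPreset

    ds = FRDCDatasetPreset.chestnut_20201218()

    ar, order = ds._get_ar_bands()
    print(ar.shape)  # (H, W, C)
    print(order)     # band names, one per channel of ar

    d = ds._get_ar_bands_as_dict()
    print(d[order[0]].shape)  # (H, W), a single band keyed by its name

    bounds, labels = ds._get_legacy_bounds_and_labels()
    x0, y0, x1, y1 = bounds[0]  # Rect is a namedtuple of x0, y0, x1, y1
    print(labels[0])            # label of the first bounding box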

    I can't find a dataset!

    Some datasets, especially new ones, may be unregistered, in which case you must specify their exact site / date / version.

    +from frdc.load.dataset import FRDCDataset + +ds = FRDCDataset(site="mysite", date="mydate", version="myversion") +

    See below for examples on how to format this.

    • site="ds", date="date", version="ver"

    • site="ds", date="date", version="ver/01/data"

    • site="ds", date="date", version=None
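    For instance, with the placeholder values above (these are not real dataset names):

    from frdc.load.dataset import FRDCDataset

    ds_a = FRDCDataset(site="ds", date="date", version="ver")          # simple version folder
    ds_b = FRDCDataset(site="ds", date="date", version="ver/01/data")  # nested version path
    ds_c = FRDCDataset(site="ds", date="date", version=None)           # no version folder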

    Last modified: 26 June 2024
    \ No newline at end of file diff --git a/docs/load-gcs.html b/docs/load-gcs.html index ff221005..aa9fd5e1 100644 --- a/docs/load-gcs.html +++ b/docs/load-gcs.html @@ -1,11 +1,26 @@ - load.gcs | Documentation

    Documentation 0.0.8 Help

    load.gcs

    Usage

    These are defined in the top-level load.gcs module.

    list_gcs_datasets

    Lists all datasets in the bucket as a DataFrame. This works by checking which folders have a specific file, which we call the anchor.

    download

    Downloads a file from Google Cloud Storage and returns the local file path.

    open_file

    Downloads and opens a file from Google Cloud Storage. Returns a file handle.

    open_image

    Downloads and returns the PIL image from Google Cloud Storage.

    Pathing

    The path to specify is relative to the bucket, which is frdc-ds by default.

    For example this filesystem on GCS:

    + +load.gcs | Documentation

    Documentation 0.1.2 Help

    load.gcs

    Usage

    These are defined in the top-level load.gcs module.

    list_gcs_datasets

    Lists all datasets in the bucket as a DataFrame. This works by checking which folders have a specific file, which we call the anchor.

    download

    Downloads a file from Google Cloud Storage and returns the local file path.

    open_file

    Downloads and opens a file from Google Cloud Storage. Returns a file handle.

    open_image

    Downloads and returns the PIL image from Google Cloud Storage.

    Pathing

    The path to specify is relative to the bucket, which is frdc-ds by default.

    For example this filesystem on GCS:

    # On Google Cloud Storage frdc-ds ├── chestnut_nature_park │ └── 20201218 │ └── 90deg │ └── bounds.json -

    To download bounds.json, use download(r"chestnut_nature_park/20201218/90deg/bounds.json"). By default, all files will be downloaded to PROJ_DIR/rsc/....

    +

    To download bounds.json, use download(r"chestnut_nature_park/20201218/90deg/bounds.json"). By default, all files will be downloaded to PROJ_DIR/rsc/....

    # On local filesystem PROJ_DIR ├── rsc @@ -13,4 +28,4 @@ │ └── 20201218 │ └── 90deg │ └── bounds.json -
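    A minimal sketch of how these functions fit together (assuming the module is importable as frdc.load.gcs; the image path below is hypothetical):

    from frdc.load import gcs

    # List all datasets found in the bucket as a DataFrame.
    df = gcs.list_gcs_datasets()

    # Download a file; returns the local path under PROJ_DIR/rsc/...
    local_path = gcs.download("chestnut_nature_park/20201218/90deg/bounds.json")

    # Download and open a file handle in one call.
    with gcs.open_file("chestnut_nature_park/20201218/90deg/bounds.json") as f:
        print(f.read()[:80])

    # Download and open an image as a PIL Image (hypothetical file name).
    im = gcs.open_image("chestnut_nature_park/20201218/90deg/result.jpg")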

    Configuration

    If you need granular control over

    • where the files are downloaded

    • the credentials used

    • the project used

    • the bucket used

    Then edit conf.py.

    GCS_CREDENTIALS

    Google Cloud credentials.


    A google.oauth2.service_account.Credentials object. See the object documentation for more information.

    LOCAL_DATASET_ROOT_DIR

    Local directory to download files to.


    Path to a directory, or a Path object.

    GCS_PROJECT_ID

    Google Cloud project ID.


    GCS_BUCKET_NAME

    Google Cloud Storage bucket name.


    Last modified: 29 December 2023
    \ No newline at end of file +

    Configuration

    If you need granular control over

    • where the files are downloaded

    • the credentials used

    • the project used

    • the bucket used

    Then edit conf.py.

    GCS_CREDENTIALS

    Google Cloud credentials.


    A google.oauth2.service_account.Credentials object. See the object documentation for more information.

    LOCAL_DATASET_ROOT_DIR

    Local directory to download files to.


    Path to a directory, or a Path object.

    GCS_PROJECT_ID

    Google Cloud project ID.


    GCS_BUCKET_NAME

    Google Cloud Storage bucket name.


    Last modified: 26 June 2024
    \ No newline at end of file diff --git a/docs/mix-match-module.html b/docs/mix-match-module.html index 9ef02365..e9524435 100644 --- a/docs/mix-match-module.html +++ b/docs/mix-match-module.html @@ -1,4 +1,19 @@ - MixMatch Module | Documentation

    Documentation 0.0.8 Help

    MixMatch Module

    See frdc/train/mixmatch_module.py.

    Quick Recap

    We will go over the essential parts of the code here. Before that, we revise some of the concepts that are used in the code.

    Abstract Methods

    In Python, we can define abstract methods using the abc module. Just like other OOP languages, abstract methods are methods that must be implemented by the child class.

    For example:

    + +MixMatch Module | Documentation

    Documentation 0.1.2 Help

    MixMatch Module

    See frdc/train/mixmatch_module.py.

    Quick Recap

    We will go over the essential parts of the code here. Before that, we revise some of the concepts that are used in the code.

    Abstract Methods

    In Python, we can define abstract methods using the abc module. Just like other OOP languages, abstract methods are methods that must be implemented by the child class.

    For example:

    from abc import ABC, abstractmethod @@ -11,7 +26,7 @@ class MyChildClass(MyAbstractClass): def my_abstract_method(self): print("Hello World!") -

    nn.Module & LightningModule

    If you're unfamiliar with PyTorch, you should read the nn.Module Documentation.

    nn.Module is the base class for all neural network modules in PyTorch. While LightningModule is a PyTorch Lightning class that extends nn.Module, providing it with additional functionality that reduces boilerplate code.

    By implementing it as a LightningModule, we also enter the PyTorch Lightning ecosystem, which provides us with a lot of useful features such as logging, early stopping, and more.

    What do we implement in a Module?

    One key component that nn.Module requires, is the model. So for example:

    +

    nn.Module & LightningModule

    If you're unfamiliar with PyTorch, you should read the nn.Module Documentation.

    nn.Module is the base class for all neural network modules in PyTorch, while LightningModule is a PyTorch Lightning class that extends nn.Module, providing additional functionality that reduces boilerplate code.

    By implementing it as a LightningModule, we also enter the PyTorch Lightning ecosystem, which provides us with a lot of useful features such as logging, early stopping, and more.

    What do we implement in a Module?

    One key component that nn.Module requires is the model. So for example:

    class MyModule(nn.Module): def __init__(self): super().__init__() @@ -23,7 +38,7 @@ def forward(self, x): return self.model(x) -

    PyTorch Lightning builds on top of it, requiring training_step and validation_step. Each "step" is a batch of data, and the model is trained on it. So for example:

    +

    PyTorch Lightning builds on top of it, requiring training_step and validation_step. Each "step" is a batch of data, and the model is trained on it. So for example:

    class MyModule(LightningModule): def __init__(self): ... @@ -40,7 +55,7 @@ y_hat = self(x) loss = F.cross_entropy(y_hat, y) return loss -

    Usually, the training and validation steps are the same, but in some cases, such as MixMatch, they are different. In MixMatch, we not only use a different loss function for train, we also handle a batch differently. The PyTorch Lightning framework allows us to separate the two, and implement them separately.

    Model Embedded Preprocessing on_before_batch_transfer

    In PyTorch Lightning, we can also inject a step before the batch is passed to the model. This is done by overriding the on_before_batch_transfer method.

    Batch
    on_before_batch_transfer
    training_step
    validation_step

    This allows us to do preprocessing on the batch, such as scaling the data, encoding the labels, and more.

    Custom EMA Update on_after_backward

    We also leverage another hook, called on_after_backward. This hook is called after the backward pass, and allows us to do custom operations. In our case, we use it to update the EMA model.

    Batch
    training_step
    on_after_backward
    update_ema

    MixMatch

    We recommend having tests/model_tests/chestnut_dec_may/train.py open while reading this section. It implements a real-world example of MixMatch.

    As a summary:

    1. We learned what is an abstract method, and how to implement it

    2. We implement the model in LightningModule much like we would in nn.Module

    3. We implement on_before_batch_transfer to preprocess the batch

    4. Finally, we implement on_after_backward to update the EMA model

    With the above in mind, let's look at the MixMatch implementation.

    forward (abstract)

    Forward pass of the model

    ema_model (abstract)

    The model that is used for EMA. We expect this property to be implemented by the child class.

    update_ema (abstract)

    The method to update the EMA model. We expect this method to be implemented by the child class.

    loss_unl_scaler (static)

    Takes in the current progress of the training, 0.0 to 1.0, where 0.0 is the start of the training, and 1.0 is the end. Then, returns the multiplier for the unlabeled loss.

    loss_lbl (static)

    Implements the loss for labeled data. Takes in the predicted labels and the ground truth labels, and returns the loss. This is cross entropy for MixMatch.

    loss_unl (static)

    Implements the loss for unlabeled data. Takes in the predicted labels and the ground truth labels, and returns the loss. This is MSE for MixMatch.

    mixup

    Takes in the data and the labels, the beta distribution parameter, and returns the mixed data and labels.

    sharpen

    Takes in the labels and temperature, and returns the sharpened labels.

    guess_labels

    Takes in the unlabeled data, and returns the guessed labels.

    progress

    The current progress of the training, 0.0 to 1.0, where 0.0 is the start of the training, and 1.0 is the end.

    training_step

    The training step runs through 1 batch of data, and returns the loss. Note that this is significantly different from validation step, as we handle the K-Augmented data differently.

    test / validation_step

    The test / validation step runs through 1 batch of data, and returns the loss.

    predict_step

    The predict step runs through 1 batch of data, and returns the actual decoded labels.

    on_after_backward

    The on_after_backward hook is called after the backward pass, and allows us to do custom operations. In our case, we use it to update the EMA model.

    on_before_batch_transfer

    The on_before_batch_transfer hook is called before the batch is transferred to the GPU. In our case, we use it to preprocess the batch.

    A diagram of how these components interact with each other is shown below:

    Batch
    on_before_batch_transfer
    training_step
    guess_labels
    sharpen
    mix_up
    loss_unl
    loss_unl_scaler
    loss
    loss_lbl
    backward
    on_after_backward
    update_ema
    validation_step
    loss

    Finally, we show an example of how to use the MixMatch module:

    +

    Usually, the training and validation steps are the same, but in some cases, such as MixMatch, they are different. In MixMatch, we not only use a different loss function for training, we also handle each batch differently. The PyTorch Lightning framework allows us to separate the two, and implement them separately.

    Model Embedded Preprocessing on_before_batch_transfer

    In PyTorch Lightning, we can also inject a step before the batch is passed to the model. This is done by overriding the on_before_batch_transfer method.

    Batch
    on_before_batch_transfer
    training_step
    validation_step

    This allows us to do preprocessing on the batch, such as scaling the data, encoding the labels, and more.
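    A minimal sketch of overriding this hook (the scaling and encoding helpers are illustrative, not the actual MixMatchModule attributes; depending on your Lightning version the import may be pytorch_lightning instead):

    from lightning import LightningModule


    class MyModule(LightningModule):
        def on_before_batch_transfer(self, batch, dataloader_idx):
            x, y = batch
            # Preprocess here, before the batch is moved to the device and
            # handed to training_step / validation_step.
            x = self.scale(x)   # e.g. standard-scale the bands
            y = self.encode(y)  # e.g. ordinal-encode the string labels
            return x, y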

    Custom EMA Update on_after_backward

    We also leverage another hook, called on_after_backward. This hook is called after the backward pass, and allows us to do custom operations. In our case, we use it to update the EMA model.

    Batch
    training_step
    on_after_backward
    update_ema
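    A minimal sketch of such an EMA update (self.model, self.ema_model and the decay value are illustrative assumptions):

    import torch
    from lightning import LightningModule


    class MyModule(LightningModule):
        def on_after_backward(self) -> None:
            # Called right after loss.backward(); update the EMA weights in-place.
            self.update_ema()

        @torch.no_grad()
        def update_ema(self, decay: float = 0.999) -> None:
            for ema_p, p in zip(
                self.ema_model.parameters(), self.model.parameters()
            ):
                ema_p.mul_(decay).add_(p, alpha=1 - decay)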

    MixMatch

    We recommend having tests/model_tests/chestnut_dec_may/train.py open while reading this section. It implements a real-world example of MixMatch.

    As a summary:

    1. We learned what an abstract method is, and how to implement it

    2. We implement the model in LightningModule much like we would in nn.Module

    3. We implement on_before_batch_transfer to preprocess the batch

    4. Finally, we implement on_after_backward to update the EMA model

    With the above in mind, let's look at the MixMatch implementation.

    forward (abstract)

    Forward pass of the model

    ema_model (abstract)

    The model that is used for EMA. We expect this property to be implemented by the child class.

    update_ema (abstract)

    The method to update the EMA model. We expect this method to be implemented by the child class.

    loss_unl_scaler (static)

    Takes in the current progress of the training, 0.0 to 1.0, where 0.0 is the start of the training, and 1.0 is the end. Then, returns the multiplier for the unlabeled loss.

    loss_lbl (static)

    Implements the loss for labeled data. Takes in the predicted labels and the ground truth labels, and returns the loss. This is cross entropy for MixMatch.

    loss_unl (static)

    Implements the loss for unlabeled data. Takes in the predicted labels and the ground truth labels, and returns the loss. This is MSE for MixMatch.

    mixup

    Takes in the data and the labels, the beta distribution parameter, and returns the mixed data and labels.

    sharpen

    Takes in the labels and temperature, and returns the sharpened labels.

    guess_labels

    Takes in the unlabeled data, and returns the guessed labels.

    progress

    The current progress of the training, 0.0 to 1.0, where 0.0 is the start of the training, and 1.0 is the end.

    training_step

    The training step runs through 1 batch of data, and returns the loss. Note that this is significantly different from validation step, as we handle the K-Augmented data differently.

    test / validation_step

    The test / validation step runs through 1 batch of data, and returns the loss.

    predict_step

    The predict step runs through 1 batch of data, and returns the actual decoded labels.

    on_after_backward

    The on_after_backward hook is called after the backward pass, and allows us to do custom operations. In our case, we use it to update the EMA model.

    on_before_batch_transfer

    The on_before_batch_transfer hook is called before the batch is transferred to the GPU. In our case, we use it to preprocess the batch.

    A diagram of how these components interact with each other is shown below:

    Batch
    on_before_batch_transfer
    training_step
    guess_labels
    sharpen
    mix_up
    loss_unl
    loss_unl_scaler
    loss
    loss_lbl
    backward
    on_after_backward
    update_ema
    validation_step
    loss

    Finally, we show an example of how to use the MixMatch module:

    from sklearn.preprocessing import StandardScaler, OrdinalEncoder from frdc.train.mixmatch_module import MixMatchModule @@ -60,7 +75,7 @@ sharpen_temp=0.5, mix_beta_alpha=0.75, ) -

    In particular, we need to supply some transformations for the preprocessing step. In this case, we use StandardScaler to scale the data, and OrdinalEncoder to encode the labels.

    1. It's best if standardization is done only on the training data, and not the validation data to better fit real-world scenarios.

    2. We use OrdinalEncoder as it handles unseen labels. So if a class doesn't show up in the training data, it will be encoded as np.nan, and will not participate in the loss calculation.

    Design Choices

    Static Method Overriding

    We implement many functions as static, as we believe that a functional style reduces dependencies, thus making the code easier to test and debug.

    Furthermore, it allows the subclasses to easily override the functions, to customize the behavior of the MixMatch module.

    For example, the loss_unl_scaler function is static, thus, we can implement our own scaling function, and pass it to the MixMatch module.

    +

    In particular, we need to supply some transformations for the preprocessing step. In this case, we use StandardScaler to scale the data, and OrdinalEncoder to encode the labels.

    1. It's best if standardization is done only on the training data, and not the validation data to better fit real-world scenarios.

    2. We use OrdinalEncoder as it handles unseen labels. So if a class doesn't show up in the training data, it will be encoded as np.nan, and will not participate in the loss calculation.
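    For reference, this is scikit-learn's handle_unknown behaviour; a small sketch with made-up species names:

    import numpy as np
    from sklearn.preprocessing import OrdinalEncoder

    oe = OrdinalEncoder(
        handle_unknown="use_encoded_value",
        unknown_value=np.nan,
    )
    oe.fit([["Falcataria Moluccana"], ["Ficus Variegata"]])

    # A class unseen during fit is encoded as np.nan instead of raising an error,
    # so it can be masked out of the loss calculation.
    print(oe.transform([["Ficus Variegata"], ["Unseen Species"]]))
    # [[ 1.]
    #  [nan]]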

    Design Choices

    Static Method Overriding

    We implement many functions as static, as we believe that a functional style reduces dependencies, thus making the code easier to test and debug.

    Furthermore, it allows the subclasses to easily override the functions, to customize the behavior of the MixMatch module.

    For example, the loss_unl_scaler function is static, thus, we can implement our own scaling function, and pass it to the MixMatch module.

    def my_loss_unl_scaler(progress: float) -> float: return progress ** 2 @@ -68,4 +83,4 @@ @staticmethod def loss_unl_scaler(progress: float) -> float: return my_loss_unl_scaler(progress) -

    If we had used a method instead, we would have to consider instance state, which would make it harder to override.

    Why not use Dataclasses?

    One of the biggest caveats of nn.Module is that it requires super().__init__() to be called before anything is assigned. While dataclass can leverage __post_init__ to do the same, we felt that this was too much of a hassle to save a few keystrokes. Thus, we opted to use __init__ instead, while more verbose, it is more explicit.

    Why use PyTorch Lightning?

    While we did hit some road blocks implementing SSL, due to its complex and unconventional nature, we felt that the benefits of using PyTorch Lightning outweighed the cons.

    on_before_batch_transfer and on_after_backward are unconventional hooks, and we had to do some digging to find them. It can be argued that by just writing explicit code, we can avoid the need for these hooks, but the PyTorch ecosystem fixes many other issues, so we closed an eye on this.

    References

    Last modified: 29 December 2023
    \ No newline at end of file +

    If we had used a method instead, we would have to consider instance state, which would make it harder to override.

    Why not use Dataclasses?

    One of the biggest caveats of nn.Module is that it requires super().__init__() to be called before anything is assigned. While dataclass can leverage __post_init__ to do the same, we felt that this was too much of a hassle to save a few keystrokes. Thus, we opted to use __init__ instead; while more verbose, it is more explicit.

    Why use PyTorch Lightning?

    While we did hit some road blocks implementing SSL, due to its complex and unconventional nature, we felt that the benefits of using PyTorch Lightning outweighed the cons.

    on_before_batch_transfer and on_after_backward are unconventional hooks, and we had to do some digging to find them. It can be argued that by just writing explicit code, we can avoid the need for these hooks, but the PyTorch ecosystem fixes many other issues, so we turned a blind eye to this.

    References

    Last modified: 26 June 2024
    \ No newline at end of file diff --git a/docs/mix-match.html b/docs/mix-match.html index 80686148..18761d00 100644 --- a/docs/mix-match.html +++ b/docs/mix-match.html @@ -1 +1,16 @@ - MixMatch | Documentation

    Documentation 0.0.8 Help

    MixMatch

    In FRDC-ML, we leverage semi-supervised learning to improve the model's performance through better augmentation consistency and using even unlabelled data.

    The algorithm we use is MixMatch. A state-of-the-art semi-supervised learning algorithm. It is based on the idea of consistency regularization, which encourages models to predict the same class even after augmentations that occur naturally in the real world.

    Our implementation of MixMatch is a refactored version of YU1ut/MixMatch-pytorch We've refactored the code to follow more modern PyTorch practices, allowing us to utilize it with modern PyTorch frameworks such as PyTorch Lightning.

    We won't go through the details of MixMatch here, see Our Documentation in our MixMatch-PyTorch-CIFAR10 repository for more details.

    Implementation Details

    1. How we implemented the MixMatch logic MixMatchModule

    2. How we implemented the unique MixMatch data loading logic Custom MixMatch Data Loading

    References

    Last modified: 29 December 2023
    \ No newline at end of file + +MixMatch | Documentation

    Documentation 0.1.2 Help

    MixMatch

    In FRDC-ML, we leverage semi-supervised learning to improve the model's performance through better augmentation consistency and by making use of even unlabelled data.

    The algorithm we use is MixMatch, a state-of-the-art semi-supervised learning algorithm. It is based on the idea of consistency regularization, which encourages models to predict the same class even after augmentations that occur naturally in the real world.

    Our implementation of MixMatch is a refactored version of YU1ut/MixMatch-pytorch. We've refactored the code to follow more modern PyTorch practices, allowing us to utilize it with modern PyTorch frameworks such as PyTorch Lightning.

    We won't go through the details of MixMatch here, see Our Documentation in our MixMatch-PyTorch-CIFAR10 repository for more details.

    Implementation Details

    1. How we implemented the MixMatch logic MixMatchModule

    2. How we implemented the unique MixMatch data loading logic Custom MixMatch Data Loading

    References

    Last modified: 26 June 2024
    \ No newline at end of file diff --git a/docs/ml-architecture.html b/docs/ml-architecture.html new file mode 100644 index 00000000..3016f86f --- /dev/null +++ b/docs/ml-architecture.html @@ -0,0 +1,16 @@ + +ML Architecture | Documentation

    Documentation 0.1.2 Help

    ML Architecture

    The architecture is the backbone of the project. If you're interested in how everything is pieced together, this article is for you.

    In Machine Learning architectures, we mostly care about 2 things: the data and the model. As the names imply, DataModules, DataLoaders, and Datasets deal with data, while Modules deal with model construction.

    Data Classes

    There's a small difference between the Data___ classes. Firstly, we load data in as Dataset instances, which are then preprocessed before being batched by the DataLoader, and finally housed in the DataModule.

    DataModule
    Train DataLoader
    Validation DataLoader
    Test DataLoader
    Preprocess
    Augmentations
    Distortions
    Alternatives
    Cropping or Resizing
    Scaling
    Data Source
    Load
    Dataset
    DataLoader

    There are 2 IMPORTANT design decisions here:

    Dataset and DataLoader

    Data in a Dataset is unbatched; data in a DataLoader must be batched. This means it's possible to have jagged tensors at this stage; however, they must be made "stackable" before loading into the DataLoader.

    For example, the data in the Dataset could be of shapes [(8, 200, 100), (8, 100, 300), ...], while BEFORE loading into the DataLoader they must have equal shapes, for example [(8, 100, 100), (8, 100, 100), ...].

    This is because when you initialize a DataLoader you need to include the batch_size, which implies the data are stacked in some manner.

    This is also the reason why preprocessing must happen before the DataLoader, as sketched below.
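    A small sketch of the shape requirement, using the shapes from the example above:

    import torch
    from torch.utils.data import DataLoader, Dataset


    class JaggedDataset(Dataset):
        """Items have different H x W, so they can't be batched as-is."""

        def __init__(self):
            self.items = [torch.rand(8, 200, 100), torch.rand(8, 100, 300)]

        def __len__(self):
            return len(self.items)

        def __getitem__(self, i):
            # Preprocess to a common shape (here, crop to 100 x 100) so that the
            # DataLoader's default collate can stack the batch.
            return self.items[i][:, :100, :100]


    dl = DataLoader(JaggedDataset(), batch_size=2)
    print(next(iter(dl)).shape)  # torch.Size([2, 8, 100, 100])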

    Preprocessing

    Excluding the functionality to load the data, this is the last step before the data is set in stone. So, steps such as augmentation, transformation, and even analytics need to be performed here, while the data is in its "rawest" form.

    We use this step to

    1. Construct alternative augmentations. i.e. images that we could've taken instead.

    2. Using those alternatives, add distortions. i.e. unintentional changes to the photo that reduces quality.

    3. Cropping or resizing the image.

    4. Scale the data. e.g. Standard Scaling, ZCA Scaling, etc.

    The order of the steps is a deliberate design choice.
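    As a rough sketch of that ordering with torchvision (the specific transforms and statistics are illustrative, not the project's exact pipeline):

    from torchvision import transforms

    preprocess = transforms.Compose([
        # 1. Alternative augmentations: images we could've taken instead.
        transforms.RandomHorizontalFlip(),
        transforms.RandomVerticalFlip(),
        # 2. Distortions: unintentional changes that reduce quality.
        transforms.ColorJitter(brightness=0.1, contrast=0.1),
        # 3. Cropping or resizing the image.
        transforms.RandomCrop(299, pad_if_needed=True),
        # 4. Scaling the data, e.g. standard scaling per channel.
        transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
    ])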

    Modules

    We analyze the inheritance structure of the Modules (also the ML Models):

    Library Module
    PyTorch Module
    Lightning Module
    FRDC Module
    FixMatch Module
    MixMatch Module
    EfficientNetB1 FixMatch Module
    EfficientNetB1 MixMatch Module
    Custom Module

    Custom Modules are our self-defined classes.

    • FRDC Module: This is the base class for all our models. Implements common functionality, such as partial saving of unfrozen parameters.

    • Y Module: Y is the architecture/framework of the model; in our case, this only defines the method of training, not the actual model itself.

    • X Y Module: X defines the actual model being used within Y's framework.

    To give an example, we look at EfficientNetB1FixMatchModule. Due to its naming scheme <Model><Framework>Module, we see that it's an EfficientNetB1 model used in the FixMatch framework.

    Furthermore, because it's well decoupled, implementing a new model is as easy as overriding some defaults.
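    For example, a hypothetical sketch of swapping in a different backbone (the class name and the attributes being overridden are made up for illustration; only EfficientNetB1FixMatchModule is named in the docs above):

    import torch.nn as nn
    from torchvision.models import resnet50


    class ResNet50FixMatchModule(FixMatchModule):  # FixMatchModule: the FixMatch framework class (assumed)
        def __init__(self, n_classes, **kwargs):
            super().__init__(**kwargs)
            backbone = resnet50(weights="DEFAULT")
            # Replace the classifier head; the training logic is inherited unchanged.
            backbone.fc = nn.Linear(backbone.fc.in_features, n_classes)
            self.model = backbone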

    Last modified: 26 June 2024
    \ No newline at end of file diff --git a/docs/model-test-chestnut-may-dec.html b/docs/model-test-chestnut-may-dec.html index 245427b2..cf969761 100644 --- a/docs/model-test-chestnut-may-dec.html +++ b/docs/model-test-chestnut-may-dec.html @@ -1 +1,16 @@ - Model Test Chestnut May-Dec | Documentation

    Documentation 0.0.8 Help

    Model Test Chestnut May-Dec

    This test is used to evaluate the model performance on the Chestnut Nature Park May & December dataset.

    See this script in model_tests/chestnut_dec_may/train.py.

    Motivation

    The usage of this model will be to classify trees in unseen datasets under different conditions. In this test, we'll evaluate it under a different season.

    A caveat is that it'll be evaluated on the same set of trees, so it's not a representative of a field-test. However, given difficulties of yielding datasets, this still gives us a good preliminary idea of how the model will perform in different conditions.

    Methodology

    We train on the December dataset, and test on the May dataset.

    Labelled Train
    Unlabelled Train
    Test
    DecDataset
    Model
    MayDataset

    Despite not having any true unlabelled data, we use MixMatch by treating the labelled data of the December dataset as unlabelled data.

    Model

    The current Model used is a simple InceptionV3 Transfer Learning model, with the last layer replaced with a fully connected layer(s).

    SSL Loss
    Input
    InceptionV3 Frozen
    FC Layer(s)
    Softmax
    Output

    Preprocessing

    For Training:

    Segment
    RandomCrop 299
    Horizontal Flip 50%
    Vertical Flip 50%
    Normalize By Training Mean & Std

    For Validation:

    Segment
    CenterCrop 299
    Normalize By Training Mean & Std

    For Evaluation:

    Segment
    CenterCrop 299
    Normalize By Training Mean & Std
    As Is
    Horizontal Flip
    Vertical Flip
    Horizontal & Vertical Flip

    For evaluation, we evaluate that the model should be invariant to horizontal and vertical flips, as well as the original image.

    Hyperparameters

    The following hyperparameters are used:

    • Optimizer: Adam

    • Learning Rate: 1e-3

    • Batch Size: 32

    • Epochs: 10

    • Train Iterations: 25~100

    • Validation Iterations: 10~25

    • Early Stopping: 4

    Results

    We evaluate around 40% accuracy on the test set, compared to 100% for the training set. This indicates that the model has saturated and is not able to learn anymore from the training set. There's no indication of overfitting as the validation loss just plateaus.

    W&B Dashboard

    Caveats

    • The test set is very small, so the results are not very representative.

    • The test set is the same set of trees, so it's not a true test of the model performance in different conditions.

    • There are many classes with 1 sample, so the model may not be able to learn the features of these classes well.

    Last modified: 29 December 2023
    \ No newline at end of file + +Model Test Chestnut May-Dec | Documentation

    Documentation 0.1.2 Help

    Model Test Chestnut May-Dec

    This test is used to evaluate the model performance on the Chestnut Nature Park May & December dataset.

    See this script in model_tests/chestnut_dec_may/train.py.

    Motivation

    The usage of this model will be to classify trees in unseen datasets under different conditions. In this test, we'll evaluate it under a different season.

    A caveat is that it'll be evaluated on the same set of trees, so it's not representative of a field test. However, given the difficulty of obtaining datasets, this still gives us a good preliminary idea of how the model will perform under different conditions.

    Methodology

    We train on the December dataset, and test on the May dataset.

    Labelled Train
    Unlabelled Train
    Test
    DecDataset
    Model
    MayDataset

    Despite not having any true unlabelled data, we use MixMatch by treating the labelled data of the December dataset as unlabelled data.

    Model

    The current Model used is a simple InceptionV3 Transfer Learning model, with the last layer replaced by fully connected layer(s).

    SSL Loss
    Input
    InceptionV3 Frozen
    FC Layer(s)
    Softmax
    Output

    Preprocessing

    For Training:

    Segment
    RandomCrop 299
    Horizontal Flip 50%
    Vertical Flip 50%
    Normalize By Training Mean & Std

    For Validation:

    Segment
    CenterCrop 299
    Normalize By Training Mean & Std

    For Evaluation:

    Segment
    CenterCrop 299
    Normalize By Training Mean & Std
    As Is
    Horizontal Flip
    Vertical Flip
    Horizontal & Vertical Flip

    For evaluation, we check that the model is invariant to horizontal and vertical flips by evaluating on the original image as well as its flipped versions.
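    In torchvision terms, the training and validation transforms above roughly correspond to the sketch below (assuming the segments are already tensors and that train_mean / train_std hold the training set statistics):

    from torchvision import transforms

    train_tf = transforms.Compose([
        transforms.RandomCrop(299, pad_if_needed=True),
        transforms.RandomHorizontalFlip(p=0.5),
        transforms.RandomVerticalFlip(p=0.5),
        transforms.Normalize(mean=train_mean, std=train_std),
    ])

    val_tf = transforms.Compose([
        transforms.CenterCrop(299),
        transforms.Normalize(mean=train_mean, std=train_std),
    ])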

    Hyperparameters

    The following hyperparameters are used:

    • Optimizer: Adam

    • Learning Rate: 1e-3

    • Batch Size: 32

    • Epochs: 10

    • Train Iterations: 25~100

    • Validation Iterations: 10~25

    • Early Stopping: 4

    Results

    We achieve around 40% accuracy on the test set, compared to 100% for the training set. This indicates that the model has saturated and is not able to learn any more from the training set. There's no indication of overfitting, as the validation loss just plateaus.

    W&B Dashboard

    Caveats

    • The test set is very small, so the results are not very representative.

    • The test set is the same set of trees, so it's not a true test of the model performance in different conditions.

    • There are many classes with 1 sample, so the model may not be able to learn the features of these classes well.

    Last modified: 26 June 2024
    \ No newline at end of file diff --git a/docs/overview.html b/docs/overview.html index 896ab236..f5833ce7 100644 --- a/docs/overview.html +++ b/docs/overview.html @@ -1 +1,16 @@ - Overview | Documentation

    Documentation 0.0.8 Help

    Overview

    Forest Recovery Digital Companion (FRDC) is a ML-assisted companion for ecologists to automatically classify surveyed trees via an Unmanned Aerial Vehicle (UAV).

    This package, FRDC-ML is the Machine Learning backbone of this project, a centralized repository of tools and model architectures to be used in the FRDC pipeline.

    Get started here

    Other Projects

    FRDC-UI

    The User Interface Repository for FRDC, a WebApp GUI for ecologists to adjust annotations.

    Last modified: 29 December 2023
    \ No newline at end of file + +Overview | Documentation

    Documentation 0.1.2 Help

    Overview

    Forest Recovery Digital Companion (FRDC) is an ML-assisted companion for ecologists to automatically classify surveyed trees via an Unmanned Aerial Vehicle (UAV).

    This package, FRDC-ML is the Machine Learning backbone of this project, a centralized repository of tools and model architectures to be used in the FRDC pipeline.

    Get started here

    Other Projects

    FRDC-UI

    The User Interface Repository for FRDC, a WebApp GUI for ecologists to adjust annotations.

    Last modified: 26 June 2024
    \ No newline at end of file diff --git a/docs/preprocessing-extract-segments.html b/docs/preprocessing-extract-segments.html index 9a248812..e6922357 100644 --- a/docs/preprocessing-extract-segments.html +++ b/docs/preprocessing-extract-segments.html @@ -1,4 +1,19 @@ - preprocessing.extract_segments | Documentation

    Documentation 0.0.8 Help

    preprocessing.extract_segments

    Functions

    extract_segments_from_labels

    Extracts segments from a label classification.

    extract_segments_from_bounds

    Extracts segments from Rect bounds.

    remove_small_segments_from_labels

    Removes small segments from a label classification.

    Extract with Boundaries

    A boundary is a Rect object that represents the minimum bounding box of a segment, with x0, y0, x1, y1 coordinates.

    It simply slices the original image to the bounding box. The origin is the top left corner of the image.

    + +preprocessing.extract_segments | Documentation

    Documentation 0.1.2 Help

    preprocessing.extract_segments

    Functions

    extract_segments_from_labels

    Extracts segments from a label classification.

    extract_segments_from_bounds

    Extracts segments from Rect bounds.

    remove_small_segments_from_labels

    Removes small segments from a label classification.

    Extract with Boundaries

    A boundary is a Rect object that represents the minimum bounding box of a segment, with x0, y0, x1, y1 coordinates.

    It simply slices the original image to the bounding box. The origin is the top left corner of the image.

    +-----------------+ +-----------+ | Original | | Segmented | | Image | | Image | @@ -9,7 +24,7 @@ +-----+-----+-----+ 1, 2, 0, 2 +-----+-----+ | 7 | 8 | 9 | x0 y0 x1 y1 | 8 | 9 | +-----+-----+-----+ +-----+-----+ -
    +
    +-----------------+ +-----------------+ | Original | | Segmented | | Image | | Image | @@ -20,7 +35,7 @@ +-----+-----+-----+ 1, 2, 0, 2 +-----+-----+-----+ | 7 | 8 | 9 | x0 y0 x1 y1 | 0 | 8 | 9 | +-----+-----+-----+ +-----+-----+-----+ -

    Extract with Labels

    A label classification is a np.ndarray where each pixel is mapped to a segment. The segments are mapped to a unique integer. In our project, the 0th label is the background.

    For example, a label classification of 3 segments will look like this:

    +

    Extract with Labels

    A label classification is a np.ndarray where each pixel is mapped to a segment. The segments are mapped to a unique integer. In our project, the 0th label is the background.

    For example, a label classification of 3 segments will look like this:

    +-----------------+ +-----------------+ | Label | | Original | | Classification | | Image | @@ -31,7 +46,7 @@ +-----+-----+-----+ +-----+-----+-----+ | 1 | 1 | 0 | | 7 | 8 | 9 | +-----+-----+-----+ +-----+-----+-----+ -

    The extraction will take the minimum bounding box of each segment and return a list of segments.

    For example, the label 1 and 2 extracted images will be

    +

    The extraction will take the minimum bounding box of each segment and return a list of segments.

    For example, the label 1 and 2 extracted images will be

    +-----------+ +-----------+ | Extracted | | Extracted | | Segment 1 | | Segment 2 | @@ -42,7 +57,7 @@ +-----+-----+ +-----+-----+ | 7 | 8 | +-----+-----+ -
    +
    +-----------------+ +-----------------+ | Extracted | | Extracted | | Segment 1 | | Segment 2 | @@ -53,17 +68,17 @@ +-----+-----+-----+ +-----+-----+-----+ | 7 | 8 | 0 | | 0 | 0 | 0 | +-----+-----+-----+ +-----+-----+-----+ -
    • If cropped is False, the segments are padded with 0s to the original image size. While this can ensure shape consistency, it can consume more memory for large images.

    • If cropped is True, the segments are cropped to the minimum bounding box. This can save memory, but the shape of the segments will be inconsistent.

    Usage

    Extract from Bounds and Labels

    Extract segments from bounds and labels.

    +
    • If cropped is False, the segments are padded with 0s to the original image size. While this can ensure shape consistency, it can consume more memory for large images.

    • If cropped is True, the segments are cropped to the minimum bounding box. This can save memory, but the shape of the segments will be inconsistent.

    Usage

    Extract from Bounds and Labels

    Extract segments from bounds and labels.

    import numpy as np from frdc.load.preset import FRDCDatasetPreset from frdc.preprocess.extract_segments import extract_segments_from_bounds ds = FRDCDatasetPreset.chestnut_20201218() -ar, order = ds.get_ar_bands() -bounds, labels = ds.get_bounds_and_labels() +ar, order = ds._get_ar_bands() +bounds, labels = ds._get_legacy_bounds_and_labels() segments: list[np.ndarray] = extract_segments_from_bounds(ar, bounds) -

    Extract from Auto-Segmentation

    Extract segments from a label classification.

    +

    Extract from Auto-Segmentation

    Extract segments from a label classification.

    from skimage.morphology import remove_small_objects, remove_small_holes import numpy as np @@ -77,7 +92,7 @@ ) ds = FRDCDatasetPreset.chestnut_20201218() -ar, order = ds.get_ar_bands() +ar, order = ds._get_ar_bands() ar = scale_0_1_per_band(ar) ar_mask = threshold_binary_mask(ar, -1, 90 / 256) ar_mask = remove_small_objects(ar_mask, min_size=100, connectivity=2) @@ -87,4 +102,4 @@ min_height=10, min_width=10) segments: list[np.ndarray] = extract_segments_from_labels(ar, ar_labels) -


    API

    extract_segments_from_labels(ar, ar_labels, cropped)

    Extracts segments from a label classification.


    ar_labels is a label classification as a np.ndarray

    extract_segments_from_bounds(ar, bounds, cropped)

    Extracts segments from Rect bounds.


    bounds is a list of Rect bounds.
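    As a hedged illustration (not from the docs), bounds can also be built by hand. Rect is described elsewhere in this documentation as a namedtuple of x0, y0, x1, y1; the import path below is an assumption.

    import numpy as np
    from frdc.preprocess.extract_segments import extract_segments_from_bounds

    try:
        from frdc.load.dataset import Rect  # assumed location of Rect
    except ImportError:
        from collections import namedtuple
        Rect = namedtuple("Rect", ["x0", "y0", "x1", "y1"])

    ar = np.random.rand(100, 100, 8)  # stand-in for a hyperspectral stack
    bounds = [Rect(10, 10, 40, 40), Rect(50, 20, 90, 80)]
    segments = extract_segments_from_bounds(ar, bounds, cropped=True)
    print([s.shape for s in segments])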

    remove_small_segments_from_labels(ar_labels, min_height, min_width)

    Removes small segments from a label classification.


    Last modified: 26 June 2024
    \ No newline at end of file diff --git a/docs/preprocessing-glcm-padded.html b/docs/preprocessing-glcm-padded.html index f9e3da4c..749cfb70 100644 --- a/docs/preprocessing-glcm-padded.html +++ b/docs/preprocessing-glcm-padded.html @@ -1,4 +1,19 @@ - preprocessing.glcm_padded | Documentation

    + +preprocessing.glcm_padded | Documentation

    Documentation 0.1.2 Help

    preprocessing.glcm_padded

    Functions

    glcm_padded

    Computes the GLCM of the NDArray bands with padding.

    glcm_padded_cached

    Computes the GLCM of the NDArray bands with padding, and caches it.

    append_glcm_padded_cached

    Computes the GLCM of the NDArray bands with padding, and caches it and also appends it onto the original array.

    Usage

    We show a few examples of how to use the GLCM functions.

    import numpy as np
    from glcm_cupy import Features
    @@ -23,4 +38,4 @@
    )

    ar_glcm_cached_appended = append_glcm_padded_cached(ar, bin_from=1, bin_to=4, radius=3)
    • ar_glcm is the GLCM of the original array, with the last dimension being the GLCM features. The number of features is determined by the features parameter, which defaults to all features.

    • ar_glcm_2_features selects only 2 features, with the last dimension being the 2 GLCM features specified.

    • ar_glcm_cached caches the GLCM so that if you call it again, it will return the cached version. It stores its data at the project root dir, under .cache/.

    • ar_glcm_cached_appended is a wrapper around ar_glcm_cached, it appends the GLCM features onto the original array. It's equivalent to calling ar_glcm_cached and then np.concatenate on the final axes.
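    A rough sketch of that equivalence. The (H, W, C, F) shape of the cached GLCM and the frdc.preprocess.glcm_padded import path are assumptions, not taken from the source.

    import numpy as np
    from frdc.preprocess.glcm_padded import (  # assumed import path
        glcm_padded_cached,
        append_glcm_padded_cached,
    )

    ar = np.random.rand(64, 64, 8).astype(np.float32)

    ar_glcm_cached = glcm_padded_cached(ar, bin_from=1, bin_to=4, radius=3)
    ar_appended = append_glcm_padded_cached(ar, bin_from=1, bin_to=4, radius=3)

    # Per the description above, appending should equal flattening the last
    # two GLCM axes and concatenating onto the original band axis.
    manual = np.concatenate(
        [ar, ar_glcm_cached.reshape(*ar_glcm_cached.shape[:2], -1)], axis=-1
    )
    assert manual.shape == ar_appended.shape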

    Caching

    GLCM is an expensive operation, so we recommend caching it when the input parameters stay the same. This is especially useful if you're experimenting with the same dataset using constant parameters.

    API

    glcm_padded(ar, bin_from, bin_to, radius, step_size, features)

    Computes the GLCM of the NDArray bands with padding.


    • ar is the input array

    • bin_from is the upper bound of the input

    • bin_to is the upper bound of the GLCM input, i.e. the resolution that GLCM operates on

    • radius is the radius of the GLCM

    • step_size is the step size of the GLCM

    • features is the list of GLCM features to compute

    The return shape is

    See glcm_cupy for the GLCM Features.
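    A sketch of selecting a subset of features, mirroring the Usage section above. The frdc import path and the availability of these two Features members in glcm_cupy are assumptions.

    import numpy as np
    from glcm_cupy import Features
    from frdc.preprocess.glcm_padded import glcm_padded  # assumed import path

    ar = np.random.rand(64, 64, 8).astype(np.float32)
    ar_glcm_2 = glcm_padded(
        ar, bin_from=1, bin_to=4, radius=3,
        features=[Features.CONTRAST, Features.CORRELATION],
    )
    # The feature axis should have length 2, one slice per requested feature.
    print(ar_glcm_2.shape)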

    glcm_padded_cached(ar, bin_from, bin_to, radius, step_size, features)

    Computes the GLCM of the NDArray bands with padding, and caches it.


    See glcm_padded for the parameters and output shape

    append_glcm_padded_cached(ar, bin_from, bin_to, radius, step_size, features)

    Computes the GLCM of the NDArray bands with padding, and caches it and also appends it onto the original array.


    See glcm_padded for the parameters


    The return shape is:

    The function automatically flattens the last 2 dimensions of the GLCM features, and appends it onto the original array.

    Last modified: 26 June 2024
    \ No newline at end of file diff --git a/docs/preprocessing-morphology.html b/docs/preprocessing-morphology.html index 0c21fd83..ba281416 100644 --- a/docs/preprocessing-morphology.html +++ b/docs/preprocessing-morphology.html @@ -1,13 +1,28 @@ - preprocessing.morphology | Documentation

    + +preprocessing.morphology | Documentation

    Documentation 0.1.2 Help

    preprocessing.morphology

    Functions

    threshold_binary_mask

    Thresholds a selected NDArray band to yield a binary mask.

    binary_watershed

    Performs watershed on a binary mask to yield a mapped label classification

    Usage

    Perform auto-segmentation on a dataset to yield a label classification.

    from frdc.load.preset import FRDCDatasetPreset
    from frdc.preprocess.morphology import (
        threshold_binary_mask,
        binary_watershed
    )

    ds = FRDCDatasetPreset.chestnut_20201218()
    -ar, order = ds.get_ar_bands()
    +ar, order = ds._get_ar_bands()
    mask = threshold_binary_mask(ar, order.index('NIR'), 90 / 256)
    ar_label = binary_watershed(mask)


    API

    threshold_binary_mask(ar, band_idx, threshold_value)

    Thresholds a selected NDArray band to yield a binary mask as an np.ndarray.


    This is equivalent to

    ar[..., band_idx] > threshold_value
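    A small sanity-check sketch of that equivalence (assumed usage, not from the docs):

    import numpy as np
    from frdc.preprocess.morphology import threshold_binary_mask

    ar = np.random.rand(32, 32, 8)
    band_idx, threshold_value = -1, 90 / 256  # e.g. the last (NIR) band

    mask = threshold_binary_mask(ar, band_idx, threshold_value)
    # Per the stated equivalence, this should match the direct comparison.
    assert np.array_equal(mask, ar[..., band_idx] > threshold_value)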
    binary_watershed(ar_mask, peaks_footprint, watershed_compactness)

    Performs watershed on a binary mask to yield a mapped label classification as a np.ndarray


    • peaks_footprint is the footprint of skimage.feature.peak_local_max

    • watershed_compactness is the compactness of skimage.morphology.watershed
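    A sketch with both parameters spelled out. The values and the exact types expected for peaks_footprint and watershed_compactness are assumptions; check the function signature before relying on them.

    import numpy as np
    from frdc.preprocess.morphology import binary_watershed

    ar_mask = np.zeros((64, 64), dtype=bool)
    ar_mask[8:28, 8:28] = True       # two separate blobs to split into segments
    ar_mask[36:60, 36:60] = True

    ar_label = binary_watershed(
        ar_mask,
        peaks_footprint=10,          # footprint for skimage.feature.peak_local_max
        watershed_compactness=0,     # compactness for skimage.morphology.watershed
    )
    print(np.unique(ar_label))       # 0 is the background, 1.. are segments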

    Last modified: 26 June 2024
    \ No newline at end of file diff --git a/docs/preprocessing-scale.html b/docs/preprocessing-scale.html index 2dd27886..386d7c47 100644 --- a/docs/preprocessing-scale.html +++ b/docs/preprocessing-scale.html @@ -1,4 +1,19 @@ - preprocessing.scale | Documentation

    + +preprocessing.scale | Documentation

    Documentation 0.1.2 Help

    preprocessing.scale

    Functions

    scale_0_1_per_band

    Scales the NDArray bands to [0, 1] per band.

    scale_normal_per_band

    Scales the NDArray bands to zero mean unit variance per band.

    scale_static_per_band

    Scales the NDArray bands by a predefined configuration. Take a look at frdc.conf.BAND_MAX_CONFIG for an example.

    Usage

    from frdc.load.preset import FRDCDatasetPreset
    from frdc.preprocess.scale import (
        scale_0_1_per_band,
        scale_normal_per_band,
        scale_static_per_band
    @@ -6,8 +21,8 @@
    from frdc.conf import BAND_MAX_CONFIG

    ds = FRDCDatasetPreset.chestnut_20201218()
    -ar, order = ds.get_ar_bands()
    +ar, order = ds._get_ar_bands()
    ar_01 = scale_0_1_per_band(ar)
    ar_norm = scale_normal_per_band(ar)
    ar_static = scale_static_per_band(ar, order, BAND_MAX_CONFIG)
    Last modified: 26 June 2024
    \ No newline at end of file diff --git a/docs/retrieve-our-datasets.html b/docs/retrieve-our-datasets.html index 0f0ef2d9..23964d77 100644 --- a/docs/retrieve-our-datasets.html +++ b/docs/retrieve-our-datasets.html @@ -1,41 +1,87 @@ - Retrieve our Datasets | Documentation

    Documentation 0.0.8 Help

    Retrieve our Datasets

    In this tutorial, we'll learn how to :

    • Retrieve FRDC's Hyperspectral Image Data as np.ndarray

    • Retrieve FRDC's Ground Truth bounds and labels

    • Slice/segment the image data by the bounds

    Prerequisites

    • New here? Get Started.

    • Setup the Google Cloud Authorization to download the data.

    Retrieve the Data

    To retrieve the data, use FRDCDataset

    Here, we'll download and load our

    • ar: Hyperspectral Image Data

    • order: The order of the bands

    • bounds: The bounds of the trees (segments)

    • labels: The labels of the trees (segments)

    + +Retrieve our Datasets | Documentation

    Documentation 0.1.2 Help

    Retrieve our Datasets

    In this tutorial, we'll learn how to :

    • Retrieve FRDC's Datasets

    • How to inspect the data

    • How to integrate it with PyTorch's DataLoader

    • How to visualize the data

    Prerequisites

    • New here? Get Started.

    • Setup the Google Cloud Authorization to download the data.

    Retrieve the Data

    To retrieve the data, use FRDCDatasetPreset. This module provides presets that load explicitly known datasets.

    For example:

    from frdc.load.preset import FRDCDatasetPreset

    ds = FRDCDatasetPreset.chestnut_20201218()
    -ar, order = ds.get_ar_bands()
    -bounds, labels = ds.get_bounds_and_labels()

    What Datasets are there?

    -from frdc.load.gcs import list_gcs_datasets
    -print(list_gcs_datasets())
    -# 0  DEBUG/0
    -# 1  casuarina/20220418/183deg
    -# 2  casuarina/20220418/93deg
    -# 3  chestnut_nature_park/20201218
    -# ...
    • The first part of the path is the site, and the second part is the date.

    • The version is the rest of the path, if there isn't any, use None.

    • site="ds"

    • date="date"

    • version="ver"

    • site="ds"

    • date="date"

    • version="ver/01/data"

    • site="ds"

    • date="date"

    • version=None

    Segment the Data

    To segment the data, use Extract Segments.

    Here, we'll segment the data by the bounds.

    +for x, y in ds:
    +    print(x.shape, y)

    You should get something like this:

    +(831, 700, 8) Falcataria Moluccana
    +(540, 536, 8) Ficus Variegata
    +(457, 660, 8) Bridelia Sp.
    +...
    • x is a torch.Tensor

    • y is a str.

    Iterate through the Data

    The dataset, when you load it, will be automatically segmented by bounds. Therefore, if you want to simply loop through the segments and labels, you can treat the dataset as an iterable.

    from frdc.load.preset import FRDCDatasetPreset
    -from frdc.preprocess.extract_segments import extract_segments_from_bounds

    ds = FRDCDatasetPreset.chestnut_20201218()
    -ar, order = ds.get_ar_bands()
    -bounds, labels = ds.get_bounds_and_labels()
    -segments = extract_segments_from_bounds(ar, bounds)

    segments is a list of np.ndarray of shape H, W, C, representing a tree. The order of segments is the same as labels, so you can use labels to identify the tree.

    Plot the Data (Optional)

    We can then use these data to plot out the first tree segment.

    +for x, y in ds:
    +    print(x.shape, y)

    If you just want the segments or targets separately, use .ar_segments and .targets respectively.

    +from frdc.load.preset import FRDCDatasetPreset
    +
    +ds = FRDCDatasetPreset.chestnut_20201218()
    +for x in ds.ar_segments:
    +    print(x.shape)
    +
    +for y in ds.targets:
    +    print(y)

    If you want the entire image, use .ar.

    +from frdc.load.preset import FRDCDatasetPreset
    +
    +ds = FRDCDatasetPreset.chestnut_20201218()
    +ar = ds.ar

    Finally, inspect the order of the bands through the band_order attribute.

    +from frdc.load.preset import FRDCDatasetPreset
    +
    +ds = FRDCDatasetPreset.chestnut_20201218()
    +ds.band_order
    +> ['WB', 'WG', 'WR', 'NB', 'NG', 'NR', 'RE', 'NIR']

    Using with PyTorch's DataLoader

    Every FRDCDataset is a Dataset object, so you can use it with PyTorch's DataLoader. This allows you to retrieve the data in batches!

    +from torch.utils.data import DataLoader
    +from torchvision.transforms.v2 import CenterCrop, Compose, Resize, ToImage
    +
    +from frdc.load.preset import FRDCDatasetPreset
    +
    +ds = FRDCDatasetPreset.chestnut_20201218(
    +    use_legacy_bounds=True,
    +    transform=Compose([ToImage(), Resize(100), CenterCrop(100)]),
    +)
    +dl = DataLoader(ds, batch_size=4, shuffle=True)
    +
    +for x, y in dl:
    +    print(x.shape, y)

    Which should output

    +torch.Size([4, 8, 100, 100]) ('Falcataria Moluccana', ...)
    +torch.Size([4, 8, 100, 100]) ('Clausena Excavata', ...)
    +torch.Size([4, 8, 100, 100]) ('Clausena Excavata', ...)
    +...

    Plot the Data (Optional)

    We can then use these data to plot out the first tree segment.

    import matplotlib.pyplot as plt

    from frdc.load.preset import FRDCDatasetPreset
    -from frdc.preprocess.extract_segments import extract_segments_from_bounds
    from frdc.preprocess.scale import scale_0_1_per_band

    ds = FRDCDatasetPreset.chestnut_20201218()
    -ar, order = ds.get_ar_bands()
    -bounds, labels = ds.get_bounds_and_labels()
    -segments = extract_segments_from_bounds(ar, bounds)
    -segment_0_bgr = segments[0]
    +segment_0_bgr = ds.ar_segments[0]
    segment_0_rgb = segment_0_bgr[..., [2, 1, 0]]
    segment_0_rgb_scaled = scale_0_1_per_band(segment_0_rgb)

    plt.imshow(segment_0_rgb_scaled)
    -plt.title(f"Tree {labels[0]}")
    +plt.title(f"Tree {ds.targets[0]}")
    plt.show()

    See also: preprocessing.scale.scale_0_1_per_band

    MatPlotLib cannot show the data correctly as-is, so we need to

    • Convert the data from BGR to RGB

    • Scale the data to 0-1 per band

    Last modified: 29 December 2023
    \ No newline at end of file +

    See also: preprocessing.scale.scale_0_1_per_band

    Matplotlib cannot show the data correctly as-is, so we need to:

    • Convert the data from BGR to RGB

    • Scale the data to 0-1 per band

    Last modified: 26 June 2024
    \ No newline at end of file diff --git a/docs/train-frdc-lightning.html b/docs/train-frdc-lightning.html deleted file mode 100644 index 2bdfd346..00000000 --- a/docs/train-frdc-lightning.html +++ /dev/null @@ -1,4 +0,0 @@ - train.frdc_datamodule & frdc_module | Documentation

    Documentation 0.0.8 Help

    train.frdc_datamodule & frdc_module

    These are FRDC specific LightningDataModule and LightningModule, a core component in the PyTorch Lightning ecosystem to provide a simple interface to train and evaluate models.

    Classes

    FRDCDataModule

    The FRDC PyTorch Lightning DataModule.

    FRDCModule

    The FRDC PyTorch Lightning Module.

    Usage

    API

    FRDCDataModule(segments, labels, preprocess, augmentation, train_val_test_split, batch_size)

    Initializes the FRDC PyTorch Lightning DataModule.


    • segments, labels are retrieved from

    • preprocess is a function that takes in a segment and returns a preprocessed segment. In particular, it should accept a list of NumPy NDArrays and return a single stacked PyTorch Tensor.

    • augmentation is a function that takes in a segment and returns an augmented segment. In particular, it takes in a PyTorch Tensor and returns another.

    • train_val_test_split is a function that takes a TensorDataset and returns a list of 3 TensorDatasets, for train, val and test respectively.

    • batch_size is the batch size.

    FRDCModule(model_cls, model_kwargs, optim_cls, optim_kwargs)

    Initializes the FRDC PyTorch Lightning Module.


    • model_cls is the Class of the model.

    • model_kwargs is the kwargs to pass to the model.

    • optim_cls is the Class of the optimizer.

    • optim_kwargs is the kwargs to pass to the optimizer.

    Internally, the module will initialize the model and optimizer as follows:

    -model = model_cls(**model_kwargs)
    -optim = optim_cls(model.parameters(), **optim_kwargs)
    Last modified: 29 December 2023
    \ No newline at end of file From 965b623bcfa03efdae986fd9b7a9a06f0c8d0193 Mon Sep 17 00:00:00 2001 From: Eve-ning Date: Wed, 26 Jun 2024 19:29:01 +0800 Subject: [PATCH 10/12] Default DEBUG to use legacy bounds for tests --- src/frdc/load/preset.py | 24 +++++++++++++----------- tests/conftest.py | 2 +- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/src/frdc/load/preset.py b/src/frdc/load/preset.py index 518dd119..2f39d4ab 100644 --- a/src/frdc/load/preset.py +++ b/src/frdc/load/preset.py @@ -238,14 +238,16 @@ class FRDCDatasetPreset: casuarina_20220418_93deg = FRDCDatasetPartial( "casuarina", "20220418", "93deg" ) - DEBUG = lambda resize=299: FRDCDatasetPartial( - site="DEBUG", date="0", version=None - )( - transform=Compose( - [ - ToImage(), - ToDtype(torch.float32), - Resize((resize, resize)), - ] - ), - ) + + @staticmethod + def _debug(resize=299, use_legacy_bounds=False): + return FRDCDatasetPartial(site="DEBUG", date="0", version=None)( + transform=Compose( + [ + ToImage(), + ToDtype(torch.float32), + Resize((resize, resize)), + ] + ), + use_legacy_bounds=use_legacy_bounds, + ) diff --git a/tests/conftest.py b/tests/conftest.py index 21697f8f..56cf2007 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -10,7 +10,7 @@ @pytest.fixture(scope="session") def ds() -> FRDCDataset: - return FRDCDatasetPreset.DEBUG() + return FRDCDatasetPreset._debug(use_legacy_bounds=True) @pytest.fixture(scope="session") From 6b49b5eeb1deb47e5b62efbf25829b8f3ea81852 Mon Sep 17 00:00:00 2001 From: Eve-ning Date: Wed, 26 Jun 2024 19:45:41 +0800 Subject: [PATCH 11/12] Make .env non-mandatory --- .../topics/Get-Started-with-Dev-Containers.md | 5 + Writerside/topics/Getting-Started.md | 260 +++++++++--------- docs/HelpTOC.json | 2 +- docs/custom-k-aug-dataloaders.html | 8 +- docs/get-started-with-dev-containers.html | 6 +- docs/getting-started.html | 44 +-- docs/load-dataset.html | 10 +- docs/load-gcs.html | 8 +- docs/mix-match-module.html | 14 +- docs/mix-match.html | 4 +- docs/ml-architecture.html | 4 +- docs/model-test-chestnut-may-dec.html | 4 +- docs/overview.html | 4 +- docs/preprocessing-extract-segments.html | 18 +- docs/preprocessing-glcm-padded.html | 6 +- docs/preprocessing-morphology.html | 8 +- docs/preprocessing-scale.html | 6 +- docs/retrieve-our-datasets.html | 22 +- src/frdc/conf.py | 5 +- 19 files changed, 225 insertions(+), 213 deletions(-) diff --git a/Writerside/topics/Get-Started-with-Dev-Containers.md b/Writerside/topics/Get-Started-with-Dev-Containers.md index 750bead5..342721f5 100644 --- a/Writerside/topics/Get-Started-with-Dev-Containers.md +++ b/Writerside/topics/Get-Started-with-Dev-Containers.md @@ -47,3 +47,8 @@ steps such as: - Google Cloud Application Default Credentials - Weight & Bias API Key - Label Studio API Key + +> You can set the API Keys in the `.env` file in the root of the project. +> Be careful not to commit the `.env` file to the repository, which should +> have been ignored by default. +{style='note'} \ No newline at end of file diff --git a/Writerside/topics/Getting-Started.md b/Writerside/topics/Getting-Started.md index c62ee26f..746c93f8 100644 --- a/Writerside/topics/Getting-Started.md +++ b/Writerside/topics/Getting-Started.md @@ -1,155 +1,161 @@ # Getting Started -> Want to use a Dev Container? See [Get Started with Dev Containers](Get-Started-with-Dev-Containers.md) +> Want to use a Dev Container? +> See [Get Started with Dev Containers](Get-Started-with-Dev-Containers.md) - Ensure that you have the right version of Python. 
- The required Python version can be seen in pyproject.toml - - [tool.poetry.dependencies] - python = "..." - - - Start by cloning our repository. - - git clone https://github.com/FR-DC/FRDC-ML.git - - - Then, create a Python Virtual Env pyvenv - - - python -m venv venv/ - - - python3 -m venv venv/ - - - - - Install Poetry - Then check if it's installed with - poetry --version - - If poetry is not found, it's likely not in the user PATH. - - - Activate the virtual environment - - + Ensure that you have the right version of Python. + The required Python version can be seen in pyproject.toml + + [tool.poetry.dependencies] + python = "..." + + + Start by cloning our repository. + + git clone https://github.com/FR-DC/FRDC-ML.git + + + Then, create a Python Virtual Env pyvenv + + + python -m venv venv/ + + + python3 -m venv venv/ + + + + + Install Poetry + Then check if it's installed with + poetry --version + + If poetry is not found, it's likely not in the user PATH. + + + Activate the virtual environment + + - cd venv/Scripts - activate - cd ../.. + cd venv/Scripts + activate + cd ../.. - - + + - source venv/bin/activate + source venv/bin/activate - - - - Install the dependencies. You should be in the same directory as - pyproject.toml - - poetry install --with dev - - - Install Pre-Commit Hooks - - pre-commit install - - + + + + Install the dependencies. You should be in the same directory as + pyproject.toml + + poetry install --with dev + + + Install Pre-Commit Hooks + + pre-commit install + + - - We use Google Cloud to store our datasets. To set up Google Cloud, - - install the Google Cloud CLI - - - - Then, - - authenticate your account - . - gcloud auth login - - - Finally, - - set up Application Default Credentials (ADC) - . - gcloud auth application-default login - - - To make sure everything is working, run the tests. - + + We use Google Cloud to store our datasets. To set up Google Cloud, + + install the Google Cloud CLI + + + + Then, + + authenticate your account + . + gcloud auth login + + + Finally, + + set up Application Default Credentials (ADC) + . + gcloud auth application-default login + + + To make sure everything is working, run the tests. + - This is only necessary if any task requires Label Studio annotations - - We use Label Studio to annotate our datasets. - We won't go through how to install Label Studio, for contributors, it - should be up on localhost:8080. - - - Then, retrieve your own API key from Label Studio. - Go to your account page - and copy the API key.
    - Set your API key as an environment variable. - - + This is only necessary if any task requires Label Studio annotations + + We use Label Studio to annotate our datasets. + We won't go through how to install Label Studio, for contributors, it + should be up on localhost:8080. + + + Then, retrieve your own API key from Label Studio. + Go to your account page + and copy the API key.
    + Set your API key as an environment variable. + + In Windows, go to "Edit environment variables for your account" and add this as a new environment variable with name LABEL_STUDIO_API_KEY. - - + + Export it as an environment variable. export LABEL_STUDIO_API_KEY=... - - - +
    + + In all cases, you can create a .env file in the root of + the project and add the following line: + LABEL_STUDIO_API_KEY=... + +
    +
    - - We use W&B to track our experiments. To set up W&B, - - install the W&B CLI - - - - Then, - - authenticate your account - . - wandb login - + + We use W&B to track our experiments. To set up W&B, + + install the W&B CLI + + + + Then, + + authenticate your account + . + wandb login + - This is optional but recommended. - Pre-commit hooks are a way to ensure that your code is formatted correctly. - This is done by running a series of checks before you commit your code. - - - - pre-commit install - - + This is optional but recommended. + Pre-commit hooks are a way to ensure that your code is formatted correctly. + This is done by running a series of checks before you commit your code. + + + + pre-commit install + + - - Run the tests to make sure everything is working - - pytest - - + + Run the tests to make sure everything is working + + pytest + + ## Troubleshooting @@ -174,13 +180,15 @@ See [Setting Up Google Cloud](#gcloud) ### Couldn't connect to Label Studio Label Studio must be running locally, exposed on `localhost:8080`. Furthermore, -you need to specify the `LABEL_STUDIO_API_KEY` environment variable. See +you need to specify the `LABEL_STUDIO_API_KEY` environment variable. See [Setting Up Label Studio](#ls) ### Cannot login to W&B -You need to authenticate your W&B account. See [Setting Up Weight and Biases](#wandb) -If you're facing difficulties, set the `WANDB_MODE` environment variable to `offline` +You need to authenticate your W&B account. +See [Setting Up Weight and Biases](#wandb) +If you're facing difficulties, set the `WANDB_MODE` environment variable +to `offline` to disable W&B. ## Our Repository Structure diff --git a/docs/HelpTOC.json b/docs/HelpTOC.json index 59ab5806..98bda536 100644 --- a/docs/HelpTOC.json +++ b/docs/HelpTOC.json @@ -1 +1 @@ -{"entities":{"pages":{"Overview":{"id":"Overview","title":"Overview","url":"overview.html","level":0,"tabIndex":0},"ML-Architecture":{"id":"ML-Architecture","title":"ML Architecture","url":"ml-architecture.html","level":0,"tabIndex":1},"Getting-Started":{"id":"Getting-Started","title":"Getting Started","url":"getting-started.html","level":0,"pages":["Get-Started-with-Dev-Containers"],"tabIndex":2},"Get-Started-with-Dev-Containers":{"id":"Get-Started-with-Dev-Containers","title":"Get Started with Dev Containers","url":"get-started-with-dev-containers.html","level":1,"parentId":"Getting-Started","tabIndex":0},"-6vddrq_5799":{"id":"-6vddrq_5799","title":"Tutorials","level":0,"pages":["Retrieve-our-Datasets"],"tabIndex":3},"Retrieve-our-Datasets":{"id":"Retrieve-our-Datasets","title":"Retrieve our Datasets","url":"retrieve-our-datasets.html","level":1,"parentId":"-6vddrq_5799","tabIndex":0},"mix-match":{"id":"mix-match","title":"MixMatch","url":"mix-match.html","level":0,"pages":["mix-match-module","custom-k-aug-dataloaders"],"tabIndex":4},"mix-match-module":{"id":"mix-match-module","title":"MixMatch Module","url":"mix-match-module.html","level":1,"parentId":"mix-match","tabIndex":0},"custom-k-aug-dataloaders":{"id":"custom-k-aug-dataloaders","title":"Custom K-Aug Dataloaders","url":"custom-k-aug-dataloaders.html","level":1,"parentId":"mix-match","tabIndex":1},"-6vddrq_5804":{"id":"-6vddrq_5804","title":"Model Tests","level":0,"pages":["Model-Test-Chestnut-May-Dec"],"tabIndex":5},"Model-Test-Chestnut-May-Dec":{"id":"Model-Test-Chestnut-May-Dec","title":"Model Test Chestnut 
May-Dec","url":"model-test-chestnut-may-dec.html","level":1,"parentId":"-6vddrq_5804","tabIndex":0},"-6vddrq_5806":{"id":"-6vddrq_5806","title":"API","level":0,"pages":["load.dataset","load.gcs","preprocessing.scale","preprocessing.extract_segments","preprocessing.morphology","preprocessing.glcm_padded"],"tabIndex":6},"load.dataset":{"id":"load.dataset","title":"load.dataset","url":"load-dataset.html","level":1,"parentId":"-6vddrq_5806","tabIndex":0},"load.gcs":{"id":"load.gcs","title":"load.gcs","url":"load-gcs.html","level":1,"parentId":"-6vddrq_5806","tabIndex":1},"preprocessing.scale":{"id":"preprocessing.scale","title":"preprocessing.scale","url":"preprocessing-scale.html","level":1,"parentId":"-6vddrq_5806","tabIndex":2},"preprocessing.extract_segments":{"id":"preprocessing.extract_segments","title":"preprocessing.extract_segments","url":"preprocessing-extract-segments.html","level":1,"parentId":"-6vddrq_5806","tabIndex":3},"preprocessing.morphology":{"id":"preprocessing.morphology","title":"preprocessing.morphology","url":"preprocessing-morphology.html","level":1,"parentId":"-6vddrq_5806","tabIndex":4},"preprocessing.glcm_padded":{"id":"preprocessing.glcm_padded","title":"preprocessing.glcm_padded","url":"preprocessing-glcm-padded.html","level":1,"parentId":"-6vddrq_5806","tabIndex":5}}},"topLevelIds":["Overview","ML-Architecture","Getting-Started","-6vddrq_5799","mix-match","-6vddrq_5804","-6vddrq_5806"]} \ No newline at end of file +{"entities":{"pages":{"Overview":{"id":"Overview","title":"Overview","url":"overview.html","level":0,"tabIndex":0},"ML-Architecture":{"id":"ML-Architecture","title":"ML Architecture","url":"ml-architecture.html","level":0,"tabIndex":1},"Getting-Started":{"id":"Getting-Started","title":"Getting Started","url":"getting-started.html","level":0,"pages":["Get-Started-with-Dev-Containers"],"tabIndex":2},"Get-Started-with-Dev-Containers":{"id":"Get-Started-with-Dev-Containers","title":"Get Started with Dev Containers","url":"get-started-with-dev-containers.html","level":1,"parentId":"Getting-Started","tabIndex":0},"-6vddrq_6549":{"id":"-6vddrq_6549","title":"Tutorials","level":0,"pages":["Retrieve-our-Datasets"],"tabIndex":3},"Retrieve-our-Datasets":{"id":"Retrieve-our-Datasets","title":"Retrieve our Datasets","url":"retrieve-our-datasets.html","level":1,"parentId":"-6vddrq_6549","tabIndex":0},"mix-match":{"id":"mix-match","title":"MixMatch","url":"mix-match.html","level":0,"pages":["mix-match-module","custom-k-aug-dataloaders"],"tabIndex":4},"mix-match-module":{"id":"mix-match-module","title":"MixMatch Module","url":"mix-match-module.html","level":1,"parentId":"mix-match","tabIndex":0},"custom-k-aug-dataloaders":{"id":"custom-k-aug-dataloaders","title":"Custom K-Aug Dataloaders","url":"custom-k-aug-dataloaders.html","level":1,"parentId":"mix-match","tabIndex":1},"-6vddrq_6554":{"id":"-6vddrq_6554","title":"Model Tests","level":0,"pages":["Model-Test-Chestnut-May-Dec"],"tabIndex":5},"Model-Test-Chestnut-May-Dec":{"id":"Model-Test-Chestnut-May-Dec","title":"Model Test Chestnut 
May-Dec","url":"model-test-chestnut-may-dec.html","level":1,"parentId":"-6vddrq_6554","tabIndex":0},"-6vddrq_6556":{"id":"-6vddrq_6556","title":"API","level":0,"pages":["load.dataset","load.gcs","preprocessing.scale","preprocessing.extract_segments","preprocessing.morphology","preprocessing.glcm_padded"],"tabIndex":6},"load.dataset":{"id":"load.dataset","title":"load.dataset","url":"load-dataset.html","level":1,"parentId":"-6vddrq_6556","tabIndex":0},"load.gcs":{"id":"load.gcs","title":"load.gcs","url":"load-gcs.html","level":1,"parentId":"-6vddrq_6556","tabIndex":1},"preprocessing.scale":{"id":"preprocessing.scale","title":"preprocessing.scale","url":"preprocessing-scale.html","level":1,"parentId":"-6vddrq_6556","tabIndex":2},"preprocessing.extract_segments":{"id":"preprocessing.extract_segments","title":"preprocessing.extract_segments","url":"preprocessing-extract-segments.html","level":1,"parentId":"-6vddrq_6556","tabIndex":3},"preprocessing.morphology":{"id":"preprocessing.morphology","title":"preprocessing.morphology","url":"preprocessing-morphology.html","level":1,"parentId":"-6vddrq_6556","tabIndex":4},"preprocessing.glcm_padded":{"id":"preprocessing.glcm_padded","title":"preprocessing.glcm_padded","url":"preprocessing-glcm-padded.html","level":1,"parentId":"-6vddrq_6556","tabIndex":5}}},"topLevelIds":["Overview","ML-Architecture","Getting-Started","-6vddrq_6549","mix-match","-6vddrq_6554","-6vddrq_6556"]} \ No newline at end of file diff --git a/docs/custom-k-aug-dataloaders.html b/docs/custom-k-aug-dataloaders.html index 0d084f17..98a565ab 100644 --- a/docs/custom-k-aug-dataloaders.html +++ b/docs/custom-k-aug-dataloaders.html @@ -1,5 +1,5 @@ -Custom K-Aug Dataloaders | Documentation


    Documentation 0.1.2 Help

    Custom K-Aug Dataloaders

    In MixMatch, implementing the data loading methods is quite unconventional.

    1. We need to load multiple augmented versions of the same image into the same batch.

    2. The labelled set is usually too small, causing a premature end to the epoch as it runs out of samples to draw from faster than the unlabelled set.

    This can be rather tricky to implement in PyTorch. This tutorial will illustrate how we did it.

    Loading Multiple Augmented Versions of the Same Image

    See: frdc/load/dataset.py FRDCDataset.__getitem__

    In MixMatch, a single train batch must consist of:

    1. A batch of labeled images

    2. K batches of unlabeled images

    [Diagram: the labelled batch goes through one augmentation, while the single unlabelled batch drawn per step is duplicated and augmented K times into Aug Unl. Batch 1 … K]

    Keep in mind that the unlabelled batch is a single batch of images, not separate draws of batches. It is then "duplicated" K times, and each copy is augmented differently.

    Solution 1: Custom Dataset

    To solve this, we need to understand the role of both a Dataset and a DataLoader.

    • A Dataset represents a collection of data, responsible for loading and returning something.

    • A DataLoader draws samples from a Dataset and returns batched samples.

    The key here is that a Dataset is not limited to returning 1 sample at a time, we can make it return the K augmented versions of the same image.

    [Diagram: a single sample is augmented K times into Aug Sample 1 … K]

    In code, this is done by subclassing the Dataset class and overriding the __getitem__ method.

    def duplicate(x):
        return x, deepcopy(x), deepcopy(x)
    @@ -25,7 +25,7 @@
        def __getitem__(self, index):
            x, y = self.dataset[index]
            return self.aug(x), y


    In the above example, we have a Dataset that returns 3 duplicate versions of the same image. By leveraging this technique, we can create a Dataset that returns K augmented versions of the same image as a tuple, as sketched below.
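    A sketch of that idea (an illustration only, not the project's FRDCDataset implementation): a wrapper Dataset that returns K independently augmented copies of each sample as a tuple.

    from copy import deepcopy
    from torch.utils.data import Dataset

    class KAugDataset(Dataset):
        def __init__(self, dataset, aug, k: int):
            self.dataset = dataset
            self.aug = aug
            self.k = k

        def __len__(self):
            return len(self.dataset)

        def __getitem__(self, index):
            x, y = self.dataset[index]
            # K independently augmented copies of the same image, one label.
            return tuple(self.aug(deepcopy(x)) for _ in range(self.k)), y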

    Premature End of Epoch due to Small Labelled Set

    See: frdc/train/frdc_datamodule.py

    In MixMatch, the definition of an "epoch" is a bit different. Instead of implying that we have seen all the data once, it implies that we've drawn N batches. The N is referred to as the number of iterations per epoch.

    Take for example, a labelled set of numbers [1, 2, 3] and an unlabelled set [4, 5, 6, 7, 8, 9, 10]. With batch size of 2, we'll run out of labelled samples after 2 iterations, but we'll still have 3 more iterations for the unlabelled set.

    • Draw 1: [1, 2], [4, 5]

    • Draw 2: [3], [6, 7].

    • Epoch ends.

    Solution 2: Random Sampling

    To fix this, instead of sequentially sampling the labelled set (and the unlabelled set), we can sample them randomly. This way, we can ensure that it never runs out.

    • Draw 1: [1, 3], [7, 5]

    • Draw 2: [2, 1], [4, 9]

    • Draw 3: [3, 2], [8, 6]

    • ... and so on.

    Luckily, PyTorch's DataLoader supports random sampling. We just need to use RandomSampler instead of SequentialSampler (which is the default).

    from torch.utils.data import DataLoader, RandomSampler

    dl = DataLoader(
    @@ -36,4 +36,4 @@
            replacement=False,
        )
    )


    This will ensure that the "epoch" ends when we've drawn train_iters batches

    Last modified: 26 June 2024
    \ No newline at end of file diff --git a/docs/get-started-with-dev-containers.html b/docs/get-started-with-dev-containers.html index 5aa1c5f3..b0579d13 100644 --- a/docs/get-started-with-dev-containers.html +++ b/docs/get-started-with-dev-containers.html @@ -1,5 +1,5 @@ -Get Started with Dev Containers | Documentation

    Documentation 0.1.2 Help

    Get Started with Dev Containers

    Dev. Containers are a great way to get started with a project. They define all necessary dependencies and environments, so you can just start coding within the container.

    In this article, we'll only go over additional steps to set up with our project. For more information on how to use Dev Containers, please refer to the official documentation for each IDE. Once you've set up the Dev Container, come back here to finish the setup:

    Python Environment

    The dev environment is already created and is managed by Anaconda /opt/conda/bin/conda. To activate the environment, run the following command:

    +}

    Documentation 0.1.2 Help

    Get Started with Dev Containers

    Dev. Containers are a great way to get started with a project. They define all necessary dependencies and environments, so you can just start coding within the container.

    In this article, we'll only go over additional steps to set up our project. For more information on how to use Dev Containers, please refer to the official documentation for each IDE. Once you've set up the Dev Container, come back here to finish the setup:

    Python Environment

    The dev environment is already created and is managed by Anaconda /opt/conda/bin/conda. To activate the environment, run the following command:

    conda activate base -

    Mark as Sources Root (Add to PYTHONPATH)

    For import statements to work, you need to mark the src folder as the sources root. Optionally, also mark the tests folder as the tests root.

    Additional Setup

    Refer to the Getting Started guide for additional setup steps such as:

    • Google Cloud Application Default Credentials

    • Weight & Bias API Key

    • Label Studio API Key

    Last modified: 26 June 2024
    \ No newline at end of file +

    Mark as Sources Root (Add to PYTHONPATH)

    For import statements to work, you need to mark the src folder as the sources root. Optionally, also mark the tests folder as the tests root.

    Additional Setup

    Refer to the Getting Started guide for additional setup steps such as:

    • Google Cloud Application Default Credentials

    • Weight & Bias API Key

    • Label Studio API Key

    Last modified: 26 June 2024
    \ No newline at end of file diff --git a/docs/getting-started.html b/docs/getting-started.html index 375a039b..76619f33 100644 --- a/docs/getting-started.html +++ b/docs/getting-started.html @@ -1,5 +1,5 @@ -Getting Started | Documentation

    Documentation 0.1.2 Help

    Getting Started

    Installing the Dev. Environment

    1. Ensure that you have the right version of Python. The required Python version can be seen in pyproject.toml

      - [tool.poetry.dependencies] - python = "..." -
    2. Start by cloning our repository.

      - git clone https://github.com/FR-DC/FRDC-ML.git -
    3. Then, create a Python Virtual Env pyvenv

      python -m venv venv/
      python3 -m venv venv/
    4. Install Poetry Then check if it's installed with

      poetry --version
    5. Activate the virtual environment

      - cd venv/Scripts - activate - cd ../.. -
      - source venv/bin/activate -
    6. Install the dependencies. You should be in the same directory as pyproject.toml

      - poetry install --with dev -
    7. Install Pre-Commit Hooks

      - pre-commit install -

    Setting Up Google Cloud

    1. We use Google Cloud to store our datasets. To set up Google Cloud, install the Google Cloud CLI

    2. Then, authenticate your account.

      gcloud auth login
    3. Finally, set up Application Default Credentials (ADC).

      gcloud auth application-default login
    4. To make sure everything is working, run the tests.

    Setting Up Label Studio

    1. We use Label Studio to annotate our datasets. We won't go through how to install Label Studio, for contributors, it should be up on localhost:8080.

    2. Then, retrieve your own API key from Label Studio. Go to your account page and copy the API key.


    3. Set your API key as an environment variable.

      In Windows, go to "Edit environment variables for your account" and add this as a new environment variable with name LABEL_STUDIO_API_KEY.

      Export it as an environment variable.

      export LABEL_STUDIO_API_KEY=...

    Setting Up Weight and Biases

    1. We use W&B to track our experiments. To set up W&B, install the W&B CLI

    2. Then, authenticate your account.

      wandb login

    Pre-commit Hooks

    • - pre-commit install -

    Running the Tests

    • Run the tests to make sure everything is working

      - pytest -

    Troubleshooting

    ModuleNotFoundError

    It's likely that your src and tests directories are not in PYTHONPATH. To fix this, run the following command:

    +}

    Documentation 0.1.2 Help

    Getting Started

    Installing the Dev. Environment

    1. Ensure that you have the right version of Python. The required Python version can be seen in pyproject.toml

      + [tool.poetry.dependencies] + python = "..." +
    2. Start by cloning our repository.

      + git clone https://github.com/FR-DC/FRDC-ML.git +
    3. Then, create a Python Virtual Env pyvenv

      python -m venv venv/
      python3 -m venv venv/
    4. Install Poetry Then check if it's installed with

      poetry --version
    5. Activate the virtual environment

      + cd venv/Scripts + activate + cd ../.. +
      + source venv/bin/activate +
    6. Install the dependencies. You should be in the same directory as pyproject.toml

      + poetry install --with dev +
    7. Install Pre-Commit Hooks

      + pre-commit install +

    Setting Up Google Cloud

    1. We use Google Cloud to store our datasets. To set up Google Cloud, install the Google Cloud CLI

    2. Then, authenticate your account.

      gcloud auth login
    3. Finally, set up Application Default Credentials (ADC).

      gcloud auth application-default login
    4. To make sure everything is working, run the tests.

    Setting Up Label Studio

    1. We use Label Studio to annotate our datasets. We won't go through how to install Label Studio, for contributors, it should be up on localhost:8080.

    2. Then, retrieve your own API key from Label Studio. Go to your account page and copy the API key.


    3. Set your API key as an environment variable.

      In Windows, go to "Edit environment variables for your account" and add this as a new environment variable with name LABEL_STUDIO_API_KEY.

      Export it as an environment variable.

      export LABEL_STUDIO_API_KEY=...

      In all cases, you can create a .env file in the root of the project and add the following line: LABEL_STUDIO_API_KEY=...

    Setting Up Weight and Biases

    1. We use W&B to track our experiments. To set up W&B, install the W&B CLI

    2. Then, authenticate your account.

      wandb login

    Pre-commit Hooks

    • + pre-commit install +

    Running the Tests

    • Run the tests to make sure everything is working

      + pytest +

    Troubleshooting

    ModuleNotFoundError

    It's likely that your src and tests directories are not in PYTHONPATH. To fix this, run the following command:

    export PYTHONPATH=$PYTHONPATH:./src:./tests -

    Or, set it in your IDE, for example, IntelliJ allows setting directories as Source Roots.

    google.auth.exceptions.DefaultCredentialsError

    It's likely that you haven't authenticated your Google Cloud account. See Setting Up Google Cloud

    Couldn't connect to Label Studio

    Label Studio must be running locally, exposed on localhost:8080. Furthermore, you need to specify the LABEL_STUDIO_API_KEY environment variable. See Setting Up Label Studio

    Cannot login to W&B

    You need to authenticate your W&B account. See Setting Up Weight and Biases If you're facing difficulties, set the WANDB_MODE environment variable to offline to disable W&B.

    Our Repository Structure

    Before starting development, take a look at our repository structure. This will help you understand where to put your code.

    Core Dependencies
    Resources
    Tests
    Repo Dependencies
    Dataset Loaders
    Preprocessing Fn.
    Train Deps
    Model Architectures
    Datasets ...
    FRDC
    src/frdc/
    rsc/
    tests/
    pyproject.toml,poetry.lock
    ./load/
    ./preprocess/
    ./train/
    ./models/
    ./dataset_name/
    src/frdc/

    Source Code for our package. These are the unit components of our pipeline.

    rsc/

    Resources. These are usually cached datasets

    tests/

    PyTest tests. These are unit, integration, and model tests.

    Unit, Integration, and Pipeline Tests

    We have 3 types of tests:

    • Unit Tests are usually small, single function tests.

    • Integration Tests are larger tests that tests a mock pipeline.

    • Model Tests are the true production pipeline tests that will generate a model.

    Where Should I contribute?

    Changing a small component

    If you're changing a small component, such as a argument for preprocessing, a new model architecture, or a new configuration for a dataset, take a look at the src/frdc/ directory.

    Adding a test

    By adding a new component, you'll need to add a new test. Take a look at the tests/ directory.

    Changing the model pipeline

    If you're a ML Researcher, you'll probably be changing the pipeline. Take a look at the tests/model_tests/ directory.

    Adding a dependency

    If you're adding a new dependency, use poetry add PACKAGE and commit the changes to pyproject.toml and poetry.lock.

    Last modified: 26 June 2024
    \ No newline at end of file +

    Or, set it in your IDE, for example, IntelliJ allows setting directories as Source Roots.

    google.auth.exceptions.DefaultCredentialsError

    It's likely that you haven't authenticated your Google Cloud account. See Setting Up Google Cloud

    Couldn't connect to Label Studio

    Label Studio must be running locally, exposed on localhost:8080. Furthermore, you need to specify the LABEL_STUDIO_API_KEY environment variable. See Setting Up Label Studio

    Cannot login to W&B

    You need to authenticate your W&B account. See Setting Up Weight and Biases If you're facing difficulties, set the WANDB_MODE environment variable to offline to disable W&B.

    Our Repository Structure

    Before starting development, take a look at our repository structure. This will help you understand where to put your code.

    [Diagram: repository structure. FRDC → src/frdc/ (core dependencies: ./load/ dataset loaders, ./preprocess/ preprocessing fn., ./train/ train deps, ./models/ model architectures), rsc/ (resources: ./dataset_name/ datasets …), tests/ (tests), pyproject.toml, poetry.lock (repo dependencies)]
    src/frdc/

    Source Code for our package. These are the unit components of our pipeline.

    rsc/

    Resources. These are usually cached datasets

    tests/

    PyTest tests. These are unit, integration, and model tests.

    Unit, Integration, and Pipeline Tests

    We have 3 types of tests:

    • Unit Tests are usually small, single function tests.

    • Integration Tests are larger tests that test a mock pipeline.

    • Model Tests are the true production pipeline tests that will generate a model.

    Where Should I contribute?

    Changing a small component

    If you're changing a small component, such as an argument for preprocessing, a new model architecture, or a new configuration for a dataset, take a look at the src/frdc/ directory.

    Adding a test

    By adding a new component, you'll need to add a new test. Take a look at the tests/ directory.

    Changing the model pipeline

    If you're a ML Researcher, you'll probably be changing the pipeline. Take a look at the tests/model_tests/ directory.

    Adding a dependency

    If you're adding a new dependency, use poetry add PACKAGE and commit the changes to pyproject.toml and poetry.lock.

    Last modified: 26 June 2024
    \ No newline at end of file diff --git a/docs/load-dataset.html b/docs/load-dataset.html index 7522b773..cdfafda1 100644 --- a/docs/load-dataset.html +++ b/docs/load-dataset.html @@ -1,5 +1,5 @@ -load.dataset | Documentation


    Documentation 0.1.2 Help

    load.dataset

    Usage

    Firstly, to load a dataset instance, you need to initialize a FRDCDataset object, providing the site, date, and version.

    We recommend using the FRDCDatasetPreset module to load explicitly known datasets.

    from frdc.load.preset import FRDCDatasetPreset ds = FRDCDatasetPreset.chestnut_20201218() -


    Then, we can use the ds object to load objects of the dataset:

    ar, order = ds._get_ar_bands()
    d = ds._get_ar_bands_as_dict()
    bounds, labels = ds._get_legacy_bounds_and_labels()
    • ar is a stacked NDArray of the hyperspectral bands of shape (H x W x C)

    • order is a list of strings, containing the names of the bands, ordered according to the channels of ar

    • d is a dictionary of the hyperspectral bands of shape (H x W), keyed by the band names

    • bounds is a list of bounding boxes, in the format of Rect, a namedtuple of x0, y0, x1, y1

    • labels is a list of strings, containing the labels of the bounding boxes, ordered according to bounds
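    As a short sketch (assumed usage built from the descriptions above), order can be used to pull a single band out of ar by name:

    from frdc.load.preset import FRDCDatasetPreset

    ds = FRDCDatasetPreset.chestnut_20201218()
    ar, order = ds._get_ar_bands()

    nir = ar[..., order.index("NIR")]  # (H x W) slice of the NIR band
    print(nir.shape)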

    I can't find a dataset!

    Some datasets, especially new ones may be unregistered and you must specify the exact site / date / version of it.

    from frdc.load.dataset import FRDCDataset

    ds = FRDCDataset(site="mysite", date="mydate", version="myversion")

    See below for examples on how to format this.

    • site="ds"

    • date="date"

    • version="ver"

    • site="ds"

    • date="date"

    • version="ver/01/data"

    • site="ds"

    • date="date"

    • version=None

    Last modified: 26 June 2024
    \ No newline at end of file +

    See below for examples on how to format this.

    • site="ds"

    • date="date"

    • version="ver"

    • site="ds"

    • date="date"

    • version="ver/01/data"

    • site="ds"

    • date="date"

    • version=None

    Last modified: 26 June 2024
    \ No newline at end of file diff --git a/docs/load-gcs.html b/docs/load-gcs.html index aa9fd5e1..bf12b421 100644 --- a/docs/load-gcs.html +++ b/docs/load-gcs.html @@ -1,5 +1,5 @@ -load.gcs | Documentation

    Documentation 0.1.2 Help

    load.gcs

    Usage

    These are defined in the top-level load.gcs module.

    list_gcs_datasets

    Lists all datasets in the bucket as a DataFrame. This works by checking which folders have a specific file, which we call the anchor.

    download

    Downloads a file from Google Cloud Storage and returns the local file path.

    open_file

    Downloads and opens a file from Google Cloud Storage. Returns a file handle.

    open_image

    Downloads and returns the PIL image from Google Cloud Storage.

    Pathing

    The path to specify is relative to the bucket, which is frdc-ds by default.

    For example this filesystem on GCS:


# On Google Cloud Storage
frdc-ds
├── chestnut_nature_park
│   └── 20201218
│       └── 90deg
│           └── bounds.json

    To download bounds.json, use download(r"chestnut_nature_park/20201218/90deg/bounds.json"). By default, all files will be downloaded to PROJ_DIR/rsc/....


# On local filesystem
PROJ_DIR
├── rsc
│   └── chestnut_nature_park
│       └── 20201218
│           └── 90deg
│               └── bounds.json

    Configuration

    If you need granular control over

    • where the files are downloaded

    • the credentials used

    • the project used

    • the bucket used

    Then edit conf.py.

    GCS_CREDENTIALS

    Google Cloud credentials.


    A google.oauth2.service_account.Credentials object. See the object documentation for more information.
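If you do use a service account, constructing such a credentials object typically looks like the sketch below; the key-file path is a placeholder, and the docs above suggest setting the value by editing conf.py.

from google.oauth2 import service_account

GCS_CREDENTIALS = service_account.Credentials.from_service_account_file(
    "path/to/service-account-key.json"  # placeholder path
)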

    LOCAL_DATASET_ROOT_DIR

    Local directory to download files to.


    Path to a directory, or a Path object.

    GCS_PROJECT_ID

    Google Cloud project ID.


    GCS_BUCKET_NAME

    Google Cloud Storage bucket name.


    Last modified: 26 June 2024
diff --git a/docs/mix-match-module.html b/docs/mix-match-module.html
index e9524435..98364a0b 100644
--- a/docs/mix-match-module.html
+++ b/docs/mix-match-module.html

MixMatch Module | Documentation

    MixMatch Module

    See frdc/train/mixmatch_module.py.

    Quick Recap

    We will go over the essential parts of the code here. Before that, we revise some of the concepts that are used in the code.

    Abstract Methods

    In Python, we can define abstract methods using the abc module. Just like other OOP languages, abstract methods are methods that must be implemented by the child class.

    For example:


from abc import ABC, abstractmethod

class MyAbstractClass(ABC):
    @abstractmethod
    def my_abstract_method(self):
        ...

class MyChildClass(MyAbstractClass):
    def my_abstract_method(self):
        print("Hello World!")

    nn.Module & LightningModule

    If you're unfamiliar with PyTorch, you should read the nn.Module Documentation.

nn.Module is the base class for all neural network modules in PyTorch, while LightningModule is a PyTorch Lightning class that extends nn.Module with additional functionality that reduces boilerplate code.

    By implementing it as a LightningModule, we also enter the PyTorch Lightning ecosystem, which provides us with a lot of useful features such as logging, early stopping, and more.

    What do we implement in a Module?

One key component that nn.Module requires is the model. For example:


class MyModule(nn.Module):
    def __init__(self):
        super().__init__()
        # the model itself; any nn.Module works here
        self.model = nn.Sequential(
            nn.Linear(8, 16),
            nn.ReLU(),
            nn.Linear(16, 2),
        )

    def forward(self, x):
        return self.model(x)

    PyTorch Lightning builds on top of it, requiring training_step and validation_step. Each "step" is a batch of data, and the model is trained on it. So for example:


class MyModule(LightningModule):
    def __init__(self):
        ...

    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = F.cross_entropy(y_hat, y)
        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = F.cross_entropy(y_hat, y)
        return loss

    Usually, the training and validation steps are the same, but in some cases, such as MixMatch, they are different. In MixMatch, we not only use a different loss function for train, we also handle a batch differently. The PyTorch Lightning framework allows us to separate the two, and implement them separately.

    Model Embedded Preprocessing on_before_batch_transfer

    In PyTorch Lightning, we can also inject a step before the batch is passed to the model. This is done by overriding the on_before_batch_transfer method.

Batch → on_before_batch_transfer → training_step / validation_step

    This allows us to do preprocessing on the batch, such as scaling the data, encoding the labels, and more.

    Custom EMA Update on_after_backward

    We also leverage another hook, called on_after_backward. This hook is called after the backward pass, and allows us to do custom operations. In our case, we use it to update the EMA model.

Batch → training_step → on_after_backward → update_ema

    MixMatch

    We recommend having tests/model_tests/chestnut_dec_may/train.py open while reading this section. It implements a real-world example of MixMatch.

    As a summary:

1. We learned what an abstract method is, and how to implement one

    2. We implement the model in LightningModule much like we would in nn.Module

    3. We implement on_before_batch_transfer to preprocess the batch

    4. Finally, we implement on_after_backward to update the EMA model

    With the above in mind, let's look at the MixMatch implementation.

    forward (abstract)

    Forward pass of the model

    ema_model (abstract)

    The model that is used for EMA. We expect this property to be implemented by the child class.

    update_ema (abstract)

    The method to update the EMA model. We expect this method to be implemented by the child class.

    loss_unl_scaler (static)

    Takes in the current progress of the training, 0.0 to 1.0, where 0.0 is the start of the training, and 1.0 is the end. Then, returns the multiplier for the unlabeled loss.

    loss_lbl (static)

    Implements the loss for labeled data. Takes in the predicted labels and the ground truth labels, and returns the loss. This is cross entropy for MixMatch.

    loss_unl (static)

    Implements the loss for unlabeled data. Takes in the predicted labels and the ground truth labels, and returns the loss. This is MSE for MixMatch.
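Roughly, the two losses look like the following sketch; this is not the exact code in frdc:

import torch
import torch.nn.functional as F

def loss_lbl(pred_lbl: torch.Tensor, y_lbl: torch.Tensor) -> torch.Tensor:
    # Cross entropy on the labelled batch.
    return F.cross_entropy(pred_lbl, y_lbl)

def loss_unl(pred_unl: torch.Tensor, y_unl: torch.Tensor) -> torch.Tensor:
    # MSE between the predicted probabilities and the guessed (sharpened) labels.
    return F.mse_loss(torch.softmax(pred_unl, dim=1), y_unl)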

    mixup

    Takes in the data and the labels, the beta distribution parameter, and returns the mixed data and labels.
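A minimal sketch of the idea; the actual implementation may differ in details such as how the shuffled pair is chosen:

import torch

def mixup(x: torch.Tensor, y: torch.Tensor, alpha: float):
    # Sample the mixing ratio from Beta(alpha, alpha) and keep the larger side,
    # so the mixed sample stays closer to the first input, as in MixMatch.
    lam = torch.distributions.Beta(alpha, alpha).sample()
    lam = torch.max(lam, 1 - lam)
    perm = torch.randperm(x.shape[0])
    return lam * x + (1 - lam) * x[perm], lam * y + (1 - lam) * y[perm]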

    sharpen

    Takes in the labels and temperature, and returns the sharpened labels.
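A minimal sketch, assuming the labels are probability vectors over classes:

import torch

def sharpen(y: torch.Tensor, temp: float) -> torch.Tensor:
    # Raise each probability to 1/temp, then re-normalize so each row sums to 1.
    y_sharp = y ** (1 / temp)
    return y_sharp / y_sharp.sum(dim=1, keepdim=True)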

    guess_labels

    Takes in the unlabeled data, and returns the guessed labels.
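Conceptually, this averages the model's predictions over the K augmented versions of the same unlabelled batch. A rough sketch:

import torch

def guess_labels(model, x_unls: list[torch.Tensor]) -> torch.Tensor:
    # x_unls holds the K augmentations of one unlabelled batch.
    with torch.no_grad():
        y_hats = [torch.softmax(model(x), dim=1) for x in x_unls]
    return torch.stack(y_hats).mean(dim=0)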

    progress

    The current progress of the training, 0.0 to 1.0, where 0.0 is the start of the training, and 1.0 is the end.

    training_step

    The training step runs through 1 batch of data, and returns the loss. Note that this is significantly different from validation step, as we handle the K-Augmented data differently.

    test / validation_step

    The test / validation step runs through 1 batch of data, and returns the loss.

    predict_step

    The predict step runs through 1 batch of data, and returns the actual decoded labels.

    on_after_backward

    The on_after_backward hook is called after the backward pass, and allows us to do custom operations. In our case, we use it to update the EMA model.

    on_before_batch_transfer

    The on_before_batch_transfer hook is called before the batch is transferred to the GPU. In our case, we use it to preprocess the batch.

    A diagram of how these components interact with each other is shown below:

    Batch
    on_before_batch_transfer
    training_step
    guess_labels
    sharpen
    mix_up
    loss_unl
    loss_unl_scaler
    loss
    loss_lbl
    backward
    on_after_backward
    update_ema
    validation_step
    loss

    Finally, we show an example of how to use the MixMatch module:


from sklearn.preprocessing import StandardScaler, OrdinalEncoder

from frdc.train.mixmatch_module import MixMatchModule

MixMatchModule(
    ...,  # the model, StandardScaler and OrdinalEncoder arguments are elided here
    sharpen_temp=0.5,
    mix_beta_alpha=0.75,
)

    In particular, we need to supply some transformations for the preprocessing step. In this case, we use StandardScaler to scale the data, and OrdinalEncoder to encode the labels.

    1. It's best if standardization is done only on the training data, and not the validation data to better fit real-world scenarios.

2. We use OrdinalEncoder as it handles unseen labels: if a class doesn't show up in the training data, it will be encoded as np.nan and will not participate in the loss calculation (see the sketch below).
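A minimal sketch of that behaviour with scikit-learn; the species names are illustrative:

import numpy as np
from sklearn.preprocessing import OrdinalEncoder

oe = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=np.nan)
oe.fit([["Falcataria Moluccana"], ["Ficus Variegata"]])
print(oe.transform([["Ficus Variegata"], ["Some Unseen Species"]]))
# [[ 1.] [nan]]  the unseen label becomes NaN and can be masked out of the loss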

    Design Choices

    Static Method Overriding

    We implement many functions as static, as we believe that a functional style reduces dependencies, thus making the code easier to test and debug.

    Furthermore, it allows the subclasses to easily override the functions, to customize the behavior of the MixMatch module.

    For example, the loss_unl_scaler function is static, thus, we can implement our own scaling function, and pass it to the MixMatch module.


def my_loss_unl_scaler(progress: float) -> float:
    return progress ** 2

class MyMixMatchModule(MixMatchModule):  # hypothetical subclass overriding the default
    @staticmethod
    def loss_unl_scaler(progress: float) -> float:
        return my_loss_unl_scaler(progress)

    If we had used a method instead, we would have to consider instance state, which would make it harder to override.

    Why not use Dataclasses?

One of the biggest caveats of nn.Module is that it requires super().__init__() to be called before anything is assigned. While dataclass can leverage __post_init__ to do the same, we felt that this was too much of a hassle to save a few keystrokes. Thus, we opted to use __init__ instead: while more verbose, it is more explicit.

    Why use PyTorch Lightning?

    While we did hit some road blocks implementing SSL, due to its complex and unconventional nature, we felt that the benefits of using PyTorch Lightning outweighed the cons.

on_before_batch_transfer and on_after_backward are unconventional hooks, and we had to do some digging to find them. It can be argued that by just writing explicit code we could avoid the need for these hooks, but the PyTorch Lightning ecosystem fixes many other issues, so we accepted this trade-off.

    References

    Last modified: 26 June 2024
diff --git a/docs/mix-match.html b/docs/mix-match.html
index 18761d00..0b1edc52 100644
--- a/docs/mix-match.html
+++ b/docs/mix-match.html

MixMatch | Documentation

    MixMatch

In FRDC-ML, we leverage semi-supervised learning to improve the model's performance through better augmentation consistency and by making use of unlabelled data.

The algorithm we use is MixMatch, a state-of-the-art semi-supervised learning algorithm. It is based on the idea of consistency regularization, which encourages models to predict the same class even after augmentations that occur naturally in the real world.

Our implementation of MixMatch is a refactored version of YU1ut/MixMatch-pytorch. We've refactored the code to follow more modern PyTorch practices, allowing us to utilize it with modern PyTorch frameworks such as PyTorch Lightning.

We won't go through the details of MixMatch here; see Our Documentation in our MixMatch-PyTorch-CIFAR10 repository for more details.

    Implementation Details

    1. How we implemented the MixMatch logic MixMatchModule

    2. How we implemented the unique MixMatch data loading logic Custom MixMatch Data Loading

    References

    Last modified: 26 June 2024
diff --git a/docs/ml-architecture.html b/docs/ml-architecture.html
index 3016f86f..e62735d0 100644
--- a/docs/ml-architecture.html
+++ b/docs/ml-architecture.html

ML Architecture | Documentation

    ML Architecture

The architecture is the backbone of the project. If you're interested in how everything is pieced together, this article is for you.

In Machine Learning architectures, we mostly care about two things: the data and the model. As the names imply, DataModules, DataLoaders, and Datasets deal with data, while Modules handle model construction.

    Data Classes

There's a small difference between the Data___ classes. Firstly, we load data in as Dataset instances; the data is then preprocessed before being batched by the DataLoader, and finally housed in the DataModule.

    DataModule
    Train DataLoader
    Validation DataLoader
    Test DataLoader
    Preprocess
    Augmentations
    Distortions
    Alternatives
    Cropping or Resizing
    Scaling
    Data Source
    Load
    Dataset
    DataLoader

    There are 2 IMPORTANT design decisions here:

    Dataset and DataLoader

Data in a Dataset is unbatched, while data in a DataLoader must be batched. This means that it's possible to have jagged tensors at this stage; however, they must be made "stackable" before loading into the DataLoader.

For example, the data in the Dataset could be of shapes [(8, 200, 100), (8, 100, 300), ...], while, BEFORE loading into the DataLoader, all samples must have equal shapes, for example [(8, 100, 100), (8, 100, 100), ...].

This is because when you initialize a DataLoader you need to include the batch_size, which implies the data are stacked in some manner.

This also leads to the reason why preprocessing must happen before the DataLoader.
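For instance, a minimal sketch of why this matters; the tree labels are illustrative:

import torch
from torch.utils.data import DataLoader
from torchvision.transforms.v2 import CenterCrop

# Two jagged segments of different sizes cannot be stacked into a batch directly,
# so we crop them to a common shape before handing them to the DataLoader.
a, b = torch.rand(8, 200, 100), torch.rand(8, 100, 300)
crop = CenterCrop(100)
ds = [(crop(a), "Falcataria Moluccana"), (crop(b), "Ficus Variegata")]
dl = DataLoader(ds, batch_size=2)
x, y = next(iter(dl))
print(x.shape)  # torch.Size([2, 8, 100, 100])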

    Preprocessing

Excluding functionalities to load the data, this is the step before the data is set in stone. So, steps such as augmentation, transformation, and even analytics need to be performed here, as the data is in its "rawest" form.

    We use this step to

    1. Construct alternative augmentations. i.e. images that we could've taken instead.

    2. Using those alternatives, add distortions. i.e. unintentional changes to the photo that reduces quality.

    3. Cropping or resizing the image.

    4. Scale the data. e.g. Standard Scaling, ZCA Scaling, etc.

The order of these steps is a deliberate design choice.
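As an illustrative sketch only; the real pipeline, parameters, and statistics live in the FRDC code:

from torchvision.transforms.v2 import (
    CenterCrop, Compose, GaussianBlur, Normalize, RandomVerticalFlip,
)

mean, std = [0.5] * 8, [0.25] * 8  # placeholder per-band statistics
preprocess = Compose([
    RandomVerticalFlip(),  # 1. an alternative image we could have taken
    GaussianBlur(3),       # 2. a distortion that degrades the photo
    CenterCrop(299),       # 3. crop / resize
    Normalize(mean, std),  # 4. scale the data
])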

    Modules

    We analyze the inheritance structure of the Modules (also the ML Models):

    Library Module
    PyTorch Module
    Lightning Module
    FRDC Module
    FixMatch Module
    MixMatch Module
    EfficientNetB1 FixMatch Module
    EfficientNetB1 MixMatch Module
    Custom Module

    Custom Modules are our self-defined classes.

    • FRDC Module: This is the base class for all our models. Implements common functionality, such as partial saving of unfrozen parameters.

• Y Module: Y is the architecture/framework of the model. In our case, this only defines the method of training, not the actual model itself.

    • X Y Module: X defines the actual model being used within Y's framework.

    To give an example, we look at EfficientNetB1FixMatchModule. Due to its naming scheme <Model><Framework>Module, we see that it's an EfficientNetB1 model used in the FixMatch framework.

    Furthermore, because it's well decoupled, implementing a new model is as easy as overriding some defaults.

    Last modified: 26 June 2024
diff --git a/docs/model-test-chestnut-may-dec.html b/docs/model-test-chestnut-may-dec.html
index cf969761..585bd93a 100644
--- a/docs/model-test-chestnut-may-dec.html
+++ b/docs/model-test-chestnut-may-dec.html

Model Test Chestnut May-Dec | Documentation

    Model Test Chestnut May-Dec

    This test is used to evaluate the model performance on the Chestnut Nature Park May & December dataset.

    See this script in model_tests/chestnut_dec_may/train.py.

    Motivation

    The usage of this model will be to classify trees in unseen datasets under different conditions. In this test, we'll evaluate it under a different season.

A caveat is that it'll be evaluated on the same set of trees, so it's not representative of a field test. However, given the difficulty of obtaining new datasets, this still gives us a good preliminary idea of how the model will perform in different conditions.

    Methodology

    We train on the December dataset, and test on the May dataset.

    Labelled Train
    Unlabelled Train
    Test
    DecDataset
    Model
    MayDataset

    Despite not having any true unlabelled data, we use MixMatch by treating the labelled data of the December dataset as unlabelled data.

    Model

The current model used is a simple InceptionV3 transfer-learning model, with the last layer replaced by fully connected layer(s).

    SSL Loss
    Input
    InceptionV3 Frozen
    FC Layer(s)
    Softmax
    Output

    Preprocessing

    For Training:

    Segment
    RandomCrop 299
    Horizontal Flip 50%
    Vertical Flip 50%
    Normalize By Training Mean & Std

    For Validation:

    Segment
    CenterCrop 299
    Normalize By Training Mean & Std

    For Evaluation:

    Segment
    CenterCrop 299
    Normalize By Training Mean & Std
    As Is
    Horizontal Flip
    Vertical Flip
    Horizontal & Vertical Flip

For evaluation, we check that the model is invariant to horizontal and vertical flips by evaluating on the original image as well as its flipped versions.

    Hyperparameters

    The following hyperparameters are used:

    • Optimizer: Adam

    • Learning Rate: 1e-3

    • Batch Size: 32

    • Epochs: 10

    • Train Iterations: 25~100

    • Validation Iterations: 10~25

    • Early Stopping: 4
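A rough sketch of how these hyperparameters map onto a PyTorch Lightning setup; the module below is a stand-in, and the monitored metric name is an assumption:

import pytorch_lightning as pl
import torch
from pytorch_lightning.callbacks import EarlyStopping

class SketchModule(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.model = torch.nn.Linear(8, 2)  # stand-in for the InceptionV3 + FC model

    def forward(self, x):
        return self.model(x)

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=1e-3)

trainer = pl.Trainer(
    max_epochs=10,
    limit_train_batches=100,  # train iterations: 25~100
    limit_val_batches=25,     # validation iterations: 10~25
    callbacks=[EarlyStopping(monitor="val_loss", patience=4)],
)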

    Results

We achieve around 40% accuracy on the test set, compared to 100% on the training set. This indicates that the model has saturated and is not able to learn any more from the training set. There's no indication of overfitting, as the validation loss simply plateaus.

    W&B Dashboard

    Caveats

    • The test set is very small, so the results are not very representative.

    • The test set is the same set of trees, so it's not a true test of the model performance in different conditions.

    • There are many classes with 1 sample, so the model may not be able to learn the features of these classes well.

    Last modified: 26 June 2024
diff --git a/docs/overview.html b/docs/overview.html
index f5833ce7..f818369f 100644
--- a/docs/overview.html
+++ b/docs/overview.html

Overview | Documentation

    Overview

Forest Recovery Digital Companion (FRDC) is an ML-assisted companion for ecologists to automatically classify surveyed trees via an Unmanned Aerial Vehicle (UAV).

This package, FRDC-ML, is the Machine Learning backbone of this project: a centralized repository of tools and model architectures to be used in the FRDC pipeline.

    Get started here

    Other Projects

    FRDC-UI

    The User Interface Repository for FRDC, a WebApp GUI for ecologists to adjust annotations.

    Last modified: 26 June 2024
diff --git a/docs/preprocessing-extract-segments.html b/docs/preprocessing-extract-segments.html
index e6922357..56f4caf3 100644
--- a/docs/preprocessing-extract-segments.html
+++ b/docs/preprocessing-extract-segments.html

preprocessing.extract_segments | Documentation

    preprocessing.extract_segments

    Functions

    extract_segments_from_labels

    Extracts segments from a label classification.

    extract_segments_from_bounds

    Extracts segments from Rect bounds.

    remove_small_segments_from_labels

    Removes small segments from a label classification.

    Extract with Boundaries

    A boundary is a Rect object that represents the minimum bounding box of a segment, with x0, y0, x1, y1 coordinates.

    It simply slices the original image to the bounding box. The origin is the top left corner of the image.
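In NumPy terms, this is essentially a slice; the array and coordinates below are stand-ins:

import numpy as np

ar = np.zeros((500, 500, 8))       # stand-in for the (H, W, C) image
x0, y0, x1, y1 = 10, 20, 110, 220  # an illustrative Rect bound
segment = ar[y0:y1, x0:x1]
print(segment.shape)               # (200, 100, 8)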


[Diagram: with cropped=True, the 3×3 original image (cells 1-9) is sliced to the bounding box given by x0, y0, x1, y1, keeping only the cells inside it, e.g. 8 and 9.]

[Diagram: with cropped=False, the segmented image keeps the original 3×3 size, and cells outside the bounding box are set to 0.]

    Extract with Labels

    A label classification is a np.ndarray where each pixel is mapped to a segment. The segments are mapped to a unique integer. In our project, the 0th label is the background.

    For example, a label classification of 3 segments will look like this:


[Diagram: a 3×3 label classification shown next to the 3×3 original image; each pixel holds a segment id (0 is background), e.g. the bottom row is 1, 1, 0 in the classification and 7, 8, 9 in the image.]

    The extraction will take the minimum bounding box of each segment and return a list of segments.

    For example, the label 1 and 2 extracted images will be


[Diagram: with cropped=True, extracted segments 1 and 2 are cut down to their minimum bounding boxes, e.g. segment 1 keeps just the cells 7 and 8.]

[Diagram: with cropped=False, extracted segments 1 and 2 keep the original 3×3 size, with cells outside the segment set to 0, e.g. segment 1's bottom row becomes 7, 8, 0.]
    • If cropped is False, the segments are padded with 0s to the original image size. While this can ensure shape consistency, it can consume more memory for large images.

    • If cropped is True, the segments are cropped to the minimum bounding box. This can save memory, but the shape of the segments will be inconsistent.

    Usage

    Extract from Bounds and Labels

    Extract segments from bounds and labels.


import numpy as np

from frdc.load.preset import FRDCDatasetPreset
from frdc.preprocess.extract_segments import extract_segments_from_bounds

ds = FRDCDatasetPreset.chestnut_20201218()
ar, order = ds._get_ar_bands()
bounds, labels = ds._get_legacy_bounds_and_labels()

segments: list[np.ndarray] = extract_segments_from_bounds(ar, bounds)

    Extract from Auto-Segmentation

    Extract segments from a label classification.


import numpy as np
from skimage.morphology import remove_small_objects, remove_small_holes

from frdc.load.preset import FRDCDatasetPreset
from frdc.preprocess.morphology import threshold_binary_mask, binary_watershed
from frdc.preprocess.extract_segments import (
    extract_segments_from_labels,
    remove_small_segments_from_labels,
)

ds = FRDCDatasetPreset.chestnut_20201218()
ar, order = ds._get_ar_bands()

mask = threshold_binary_mask(ar, order.index("NIR"), 90 / 256)
ar_labels = binary_watershed(mask)
ar_labels = remove_small_segments_from_labels(ar_labels, min_height=10, min_width=10)

segments: list[np.ndarray] = extract_segments_from_labels(ar, ar_labels)

    API

    extract_segments_from_labels(ar, ar_labels, cropped)

    Extracts segments from a label classification.


    ar_labels is a label classification as a np.ndarray

    extract_segments_from_bounds(ar, bounds, cropped)

    Extracts segments from Rect bounds.


    bounds is a list of Rect bounds.

    remove_small_segments_from_labels(ar_labels, min_height, min_width)

    Removes small segments from a label classification.


    Last modified: 26 June 2024
diff --git a/docs/preprocessing-glcm-padded.html b/docs/preprocessing-glcm-padded.html
index 749cfb70..40d174a1 100644
--- a/docs/preprocessing-glcm-padded.html
+++ b/docs/preprocessing-glcm-padded.html

preprocessing.glcm_padded | Documentation

    preprocessing.glcm_padded

    Functions

    glcm_padded

    Computes the GLCM of the NDArray bands with padding.

    glcm_padded_cached

    Computes the GLCM of the NDArray bands with padding, and caches it.

    append_glcm_padded_cached

    Computes the GLCM of the NDArray bands with padding, and caches it and also appends it onto the original array.

    Usage

    We show a few examples of how to use the GLCM functions.


import numpy as np
from glcm_cupy import Features

ar = ...  # a (H, W, C) array of the hyperspectral bands

ar_glcm = glcm_padded(ar, bin_from=1, bin_to=4, radius=3)
ar_glcm_2_features = glcm_padded(
    ar, bin_from=1, bin_to=4, radius=3,
    features=(Features.CONTRAST, Features.CORRELATION),
)
ar_glcm_cached = glcm_padded_cached(ar, bin_from=1, bin_to=4, radius=3)
ar_glcm_cached_appended = append_glcm_padded_cached(ar, bin_from=1, bin_to=4, radius=3)
    • ar_glcm is the GLCM of the original array, with the last dimension being the GLCM features. The number of features is determined by the features parameter, which defaults to all features.

    • ar_glcm_2_features selects only 2 features, with the last dimension being the 2 GLCM features specified.

    • ar_glcm_cached caches the GLCM so that if you call it again, it will return the cached version. It stores its data at the project root dir, under .cache/.

    • ar_glcm_cached_appended is a wrapper around ar_glcm_cached, it appends the GLCM features onto the original array. It's equivalent to calling ar_glcm_cached and then np.concatenate on the final axes.

    Caching

GLCM is an expensive operation, thus we recommend caching it if the input parameters will be the same. This is especially useful if you're experimenting with the same dataset with constant parameters.

    API

    glcm_padded(ar, bin_from, bin_to, radius, step_size, features)

    Computes the GLCM of the NDArray bands with padding.


    • ar is the input array

    • bin_from is the upper bound of the input

    • bin_to is the upper bound of the GLCM input, i.e. the resolution that GLCM operates on

    • radius is the radius of the GLCM

    • step_size is the step size of the GLCM

    • features is the list of GLCM features to compute

The return shape is (H, W, C, F), where F is the number of GLCM features computed.

    See glcm_cupy for the GLCM Features.

    glcm_padded_cached(ar, bin_from, bin_to, radius, step_size, features)

    Computes the GLCM of the NDArray bands with padding, and caches it.


    See glcm_padded for the parameters and output shape

    append_glcm_padded_cached(ar, bin_from, bin_to, radius, step_size, features)

    Computes the GLCM of the NDArray bands with padding, and caches it and also appends it onto the original array.


    See glcm_padded for the parameters


The return shape is (H, W, C + C × F).

    The function automatically flattens the last 2 dimensions of the GLCM features, and appends it onto the original array.
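A rough NumPy sketch of that flattening and appending; the sizes are illustrative:

import numpy as np

H, W, C, F = 64, 64, 8, 7  # illustrative sizes
ar = np.zeros((H, W, C))
ar_glcm = np.zeros((H, W, C, F))

ar_appended = np.concatenate([ar, ar_glcm.reshape(H, W, C * F)], axis=-1)
print(ar_appended.shape)  # (64, 64, 64), i.e. (H, W, C + C * F)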

    Last modified: 26 June 2024
diff --git a/docs/preprocessing-morphology.html b/docs/preprocessing-morphology.html
index ba281416..140058d3 100644
--- a/docs/preprocessing-morphology.html
+++ b/docs/preprocessing-morphology.html

preprocessing.morphology | Documentation

    preprocessing.morphology

    Functions

    threshold_binary_mask

Thresholds a selected NDArray band to yield a binary mask.

    binary_watershed

    Performs watershed on a binary mask to yield a mapped label classification

    Usage

    Perform auto-segmentation on a dataset to yield a label classification.


from frdc.load.preset import FRDCDatasetPreset
from frdc.preprocess.morphology import (
    threshold_binary_mask, binary_watershed
)

ds = FRDCDatasetPreset.chestnut_20201218()
ar, order = ds._get_ar_bands()
mask = threshold_binary_mask(ar, order.index('NIR'), 90 / 256)
ar_label = binary_watershed(mask)

    API

    threshold_binary_mask(ar, band_idx, threshold_value)

Thresholds a selected NDArray band to yield a binary mask as an np.ndarray.


    This is equivalent to


ar[..., band_idx] > threshold_value
    binary_watershed(ar_mask, peaks_footprint, watershed_compactness)

    Performs watershed on a binary mask to yield a mapped label classification as a np.ndarray


    • peaks_footprint is the footprint of skimage.feature.peak_local_max

    • watershed_compactness is the compactness of skimage.morphology.watershed

    Last modified: 26 June 2024
diff --git a/docs/preprocessing-scale.html b/docs/preprocessing-scale.html
index 386d7c47..d5ab0f0e 100644
--- a/docs/preprocessing-scale.html
+++ b/docs/preprocessing-scale.html

preprocessing.scale | Documentation

    preprocessing.scale

    Functions

    scale_0_1_per_band

    Scales the NDArray bands to [0, 1] per band.

    scale_normal_per_band

    Scales the NDArray bands to zero mean unit variance per band.

    scale_static_per_band

    Scales the NDArray bands by a predefined configuration. Take a look at frdc.conf.BAND_MAX_CONFIG for an example.

    Usage


from frdc.conf import BAND_MAX_CONFIG
from frdc.load.preset import FRDCDatasetPreset
from frdc.preprocess.scale import (
    scale_0_1_per_band, scale_normal_per_band, scale_static_per_band
)

ds = FRDCDatasetPreset.chestnut_20201218()
ar, order = ds._get_ar_bands()

ar_01 = scale_0_1_per_band(ar)
ar_norm = scale_normal_per_band(ar)
ar_static = scale_static_per_band(ar, order, BAND_MAX_CONFIG)
    Last modified: 26 June 2024
diff --git a/docs/retrieve-our-datasets.html b/docs/retrieve-our-datasets.html
index 23964d77..d0c2fdce 100644
--- a/docs/retrieve-our-datasets.html
+++ b/docs/retrieve-our-datasets.html

Retrieve our Datasets | Documentation

    Retrieve our Datasets

In this tutorial, we'll learn how to:

    • Retrieve FRDC's Datasets

    • How to inspect the data

    • How to integrate it with PyTorch's DataLoader

    • How to visualize the data

    Prerequisites

    • New here? Get Started.

    • Setup the Google Cloud Authorization to download the data.

    Retrieve the Data

To retrieve the data, use FRDCDatasetPreset. This module provides presets to load explicitly known datasets.

    For example:


from frdc.load.preset import FRDCDatasetPreset

ds = FRDCDatasetPreset.chestnut_20201218()
for x, y in ds:
    print(x.shape, y)

    You should get something like this:


(831, 700, 8) Falcataria Moluccana
(540, 536, 8) Ficus Variegata
(457, 660, 8) Bridelia Sp.
...
    • x is a torch.Tensor

    • y is a str.

    Iterate through the Data

    The dataset, when you load it, will be automatically segmented by bounds. Therefore, if you want to simply loop through the segments and labels, you can treat the dataset as an iterable.


from frdc.load.preset import FRDCDatasetPreset

ds = FRDCDatasetPreset.chestnut_20201218()
for x, y in ds:
    print(x.shape, y)

    If you just want the segments or targets separately, use .ar_segments and .targets respectively.


from frdc.load.preset import FRDCDatasetPreset

ds = FRDCDatasetPreset.chestnut_20201218()

for x in ds.ar_segments:
    print(x.shape)

for y in ds.targets:
    print(y)

    If you want the entire image, use .ar.


from frdc.load.preset import FRDCDatasetPreset

ds = FRDCDatasetPreset.chestnut_20201218()
ar = ds.ar

    Finally, inspect the order of the bands through the band_order attribute.


from frdc.load.preset import FRDCDatasetPreset

ds = FRDCDatasetPreset.chestnut_20201218()
ds.band_order
> ['WB', 'WG', 'WR', 'NB', 'NG', 'NR', 'RE', 'NIR']

    Using with PyTorch's DataLoader

    Every FRDCDataset is a Dataset object, so you can use it with PyTorch's DataLoader. This allows you to retrieve by batches!


from torch.utils.data import DataLoader
from torchvision.transforms.v2 import CenterCrop, Compose, Resize, ToImage

...  # construct ds with a transform that resizes/crops each segment to 100x100, then:

dl = DataLoader(ds, batch_size=4)

for x, y in dl:
    print(x.shape, y)

    Which should output


torch.Size([4, 8, 100, 100]) ('Falcataria Moluccana', ...)
torch.Size([4, 8, 100, 100]) ('Clausena Excavata', ...)
torch.Size([4, 8, 100, 100]) ('Clausena Excavata', ...)
...

    Plot the Data (Optional)

    We can then use these data to plot out the first tree segment.


import matplotlib.pyplot as plt

from frdc.load.preset import FRDCDatasetPreset
from frdc.preprocess.scale import scale_0_1_per_band

ds = FRDCDatasetPreset.chestnut_20201218()

# Take the first segment's BGR bands, reorder to RGB, then scale to [0, 1] per band.
segment_0_bgr = ds.ar_segments[0][..., :3]
segment_0_rgb_scaled = scale_0_1_per_band(segment_0_bgr[..., ::-1])

plt.imshow(segment_0_rgb_scaled)
plt.title(f"Tree {ds.targets[0]}")
plt.show()

    See also: preprocessing.scale.scale_0_1_per_band

Matplotlib cannot show the data correctly as-is, so we need to:

    • Convert the data from BGR to RGB

    • Scale the data to 0-1 per band

    Last modified: 26 June 2024
diff --git a/src/frdc/conf.py b/src/frdc/conf.py
index 5d566b4e..84e153eb 100644
--- a/src/frdc/conf.py
+++ b/src/frdc/conf.py
@@ -29,13 +29,12 @@
     ENV_EXAMPLE_FILE = ROOT_DIR / ".env.example"
     if ENV_EXAMPLE_FILE.exists():
         shutil.copy(ENV_EXAMPLE_FILE, ENV_FILE)
-        raise FileNotFoundError(
+        logger.warning(
             f"Environment file not found at {ENV_FILE.as_posix()}. "
             "A new one has been created from the .env.example file.\n"
-            "Set the necessary variables and re-run the script."
         )
     else:
-        raise FileNotFoundError(
+        logger.warning(
             f"Environment file not found at {ENV_FILE.as_posix()}. "
             "Please create one or copy the .env.example file in the GitHub "
             "repository."

From 656b203af878c774f3107909d1ff3c97ea5fe4be Mon Sep 17 00:00:00 2001
From: Eve-ning
Date: Wed, 26 Jun 2024 19:58:07 +0800
Subject: [PATCH 12/12] Fix issue with .env not being copied in workflow

---
 .github/workflows/basic-tests.yml         |  4 ++++
 .github/workflows/model-tests.yml         |  4 ++++
 Writerside/topics/Getting-Started.md      | 14 +++++++++++-
 docs/HelpTOC.json                         |  2 +-
 docs/custom-k-aug-dataloaders.html        |  8 +++----
 docs/get-started-with-dev-containers.html |  6 ++---
 docs/getting-started.html                 | 28 ++++++++++++++---------
 docs/load-dataset.html                    | 10 ++++----
 docs/load-gcs.html                        |  8 +++----
 docs/mix-match-module.html                | 14 ++++++------
 docs/mix-match.html                       |  4 ++--
 docs/ml-architecture.html                 |  4 ++--
 docs/model-test-chestnut-may-dec.html     |  4 ++--
 docs/overview.html                        |  4 ++--
 docs/preprocessing-extract-segments.html  | 18 +++++++--------
 docs/preprocessing-glcm-padded.html       |  6 ++---
 docs/preprocessing-morphology.html        |  8 +++----
 docs/preprocessing-scale.html             |  6 ++---
 docs/retrieve-our-datasets.html           | 22 +++++++++---------
 19 files changed, 100 insertions(+), 74 deletions(-)

diff --git a/.github/workflows/basic-tests.yml b/.github/workflows/basic-tests.yml
index d3cb80f7..7fb0ddf5 100644
--- a/.github/workflows/basic-tests.yml
+++ b/.github/workflows/basic-tests.yml
@@ -49,6 +49,10 @@ jobs:
           # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
           flake8 src/ --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
 
+      - name: Copy over .env
+        run: |
+          cp .env.example .env
+
       - name: Test with pytest
         run: |
           pytest

diff --git a/.github/workflows/model-tests.yml b/.github/workflows/model-tests.yml
index a4a4991c..ce83de3f 100644
--- a/.github/workflows/model-tests.yml
+++ b/.github/workflows/model-tests.yml
@@ -80,6 +80,10 @@ jobs:
         uses: mxschmitt/action-tmate@v3
         if: ${{ github.event_name == 'workflow_dispatch' && inputs.debug_enabled }}
 
+      - name: Copy over .env
+        run: |
+          cp .env.example .env
+
       - name: Run Model Training
         working-directory: ${{ github.workspace }}/tests
         run: |

diff --git a/Writerside/topics/Getting-Started.md b/Writerside/topics/Getting-Started.md
index 746c93f8..b2eb7a59 100644
--- a/Writerside/topics/Getting-Started.md
+++ b/Writerside/topics/Getting-Started.md
@@ -56,6 +56,19 @@
             poetry install --with dev
 
+    Make a copy of the .env.example file and rename it to .env
+
+    Fill in additional environment variables in the .env file
+
+        LABEL_STUDIO_API_KEY=...
+        LABEL_STUDIO_HOST=10.97.41.70
+        LABEL_STUDIO_PORT=8080
+        GCS_PROJECT_ID=frmodel
+        GCS_BUCKET_NAME=frdc-ds
+
     Install Pre-Commit Hooks
 
         pre-commit install
@@ -63,7 +76,6 @@
-
     We use Google Cloud to store our datasets.
To set up Google Cloud, diff --git a/docs/HelpTOC.json b/docs/HelpTOC.json index 98bda536..aab1ee61 100644 --- a/docs/HelpTOC.json +++ b/docs/HelpTOC.json @@ -1 +1 @@ -{"entities":{"pages":{"Overview":{"id":"Overview","title":"Overview","url":"overview.html","level":0,"tabIndex":0},"ML-Architecture":{"id":"ML-Architecture","title":"ML Architecture","url":"ml-architecture.html","level":0,"tabIndex":1},"Getting-Started":{"id":"Getting-Started","title":"Getting Started","url":"getting-started.html","level":0,"pages":["Get-Started-with-Dev-Containers"],"tabIndex":2},"Get-Started-with-Dev-Containers":{"id":"Get-Started-with-Dev-Containers","title":"Get Started with Dev Containers","url":"get-started-with-dev-containers.html","level":1,"parentId":"Getting-Started","tabIndex":0},"-6vddrq_6549":{"id":"-6vddrq_6549","title":"Tutorials","level":0,"pages":["Retrieve-our-Datasets"],"tabIndex":3},"Retrieve-our-Datasets":{"id":"Retrieve-our-Datasets","title":"Retrieve our Datasets","url":"retrieve-our-datasets.html","level":1,"parentId":"-6vddrq_6549","tabIndex":0},"mix-match":{"id":"mix-match","title":"MixMatch","url":"mix-match.html","level":0,"pages":["mix-match-module","custom-k-aug-dataloaders"],"tabIndex":4},"mix-match-module":{"id":"mix-match-module","title":"MixMatch Module","url":"mix-match-module.html","level":1,"parentId":"mix-match","tabIndex":0},"custom-k-aug-dataloaders":{"id":"custom-k-aug-dataloaders","title":"Custom K-Aug Dataloaders","url":"custom-k-aug-dataloaders.html","level":1,"parentId":"mix-match","tabIndex":1},"-6vddrq_6554":{"id":"-6vddrq_6554","title":"Model Tests","level":0,"pages":["Model-Test-Chestnut-May-Dec"],"tabIndex":5},"Model-Test-Chestnut-May-Dec":{"id":"Model-Test-Chestnut-May-Dec","title":"Model Test Chestnut May-Dec","url":"model-test-chestnut-may-dec.html","level":1,"parentId":"-6vddrq_6554","tabIndex":0},"-6vddrq_6556":{"id":"-6vddrq_6556","title":"API","level":0,"pages":["load.dataset","load.gcs","preprocessing.scale","preprocessing.extract_segments","preprocessing.morphology","preprocessing.glcm_padded"],"tabIndex":6},"load.dataset":{"id":"load.dataset","title":"load.dataset","url":"load-dataset.html","level":1,"parentId":"-6vddrq_6556","tabIndex":0},"load.gcs":{"id":"load.gcs","title":"load.gcs","url":"load-gcs.html","level":1,"parentId":"-6vddrq_6556","tabIndex":1},"preprocessing.scale":{"id":"preprocessing.scale","title":"preprocessing.scale","url":"preprocessing-scale.html","level":1,"parentId":"-6vddrq_6556","tabIndex":2},"preprocessing.extract_segments":{"id":"preprocessing.extract_segments","title":"preprocessing.extract_segments","url":"preprocessing-extract-segments.html","level":1,"parentId":"-6vddrq_6556","tabIndex":3},"preprocessing.morphology":{"id":"preprocessing.morphology","title":"preprocessing.morphology","url":"preprocessing-morphology.html","level":1,"parentId":"-6vddrq_6556","tabIndex":4},"preprocessing.glcm_padded":{"id":"preprocessing.glcm_padded","title":"preprocessing.glcm_padded","url":"preprocessing-glcm-padded.html","level":1,"parentId":"-6vddrq_6556","tabIndex":5}}},"topLevelIds":["Overview","ML-Architecture","Getting-Started","-6vddrq_6549","mix-match","-6vddrq_6554","-6vddrq_6556"]} \ No newline at end of file +{"entities":{"pages":{"Overview":{"id":"Overview","title":"Overview","url":"overview.html","level":0,"tabIndex":0},"ML-Architecture":{"id":"ML-Architecture","title":"ML Architecture","url":"ml-architecture.html","level":0,"tabIndex":1},"Getting-Started":{"id":"Getting-Started","title":"Getting 
Started","url":"getting-started.html","level":0,"pages":["Get-Started-with-Dev-Containers"],"tabIndex":2},"Get-Started-with-Dev-Containers":{"id":"Get-Started-with-Dev-Containers","title":"Get Started with Dev Containers","url":"get-started-with-dev-containers.html","level":1,"parentId":"Getting-Started","tabIndex":0},"-6vddrq_6999":{"id":"-6vddrq_6999","title":"Tutorials","level":0,"pages":["Retrieve-our-Datasets"],"tabIndex":3},"Retrieve-our-Datasets":{"id":"Retrieve-our-Datasets","title":"Retrieve our Datasets","url":"retrieve-our-datasets.html","level":1,"parentId":"-6vddrq_6999","tabIndex":0},"mix-match":{"id":"mix-match","title":"MixMatch","url":"mix-match.html","level":0,"pages":["mix-match-module","custom-k-aug-dataloaders"],"tabIndex":4},"mix-match-module":{"id":"mix-match-module","title":"MixMatch Module","url":"mix-match-module.html","level":1,"parentId":"mix-match","tabIndex":0},"custom-k-aug-dataloaders":{"id":"custom-k-aug-dataloaders","title":"Custom K-Aug Dataloaders","url":"custom-k-aug-dataloaders.html","level":1,"parentId":"mix-match","tabIndex":1},"-6vddrq_7004":{"id":"-6vddrq_7004","title":"Model Tests","level":0,"pages":["Model-Test-Chestnut-May-Dec"],"tabIndex":5},"Model-Test-Chestnut-May-Dec":{"id":"Model-Test-Chestnut-May-Dec","title":"Model Test Chestnut May-Dec","url":"model-test-chestnut-may-dec.html","level":1,"parentId":"-6vddrq_7004","tabIndex":0},"-6vddrq_7006":{"id":"-6vddrq_7006","title":"API","level":0,"pages":["load.dataset","load.gcs","preprocessing.scale","preprocessing.extract_segments","preprocessing.morphology","preprocessing.glcm_padded"],"tabIndex":6},"load.dataset":{"id":"load.dataset","title":"load.dataset","url":"load-dataset.html","level":1,"parentId":"-6vddrq_7006","tabIndex":0},"load.gcs":{"id":"load.gcs","title":"load.gcs","url":"load-gcs.html","level":1,"parentId":"-6vddrq_7006","tabIndex":1},"preprocessing.scale":{"id":"preprocessing.scale","title":"preprocessing.scale","url":"preprocessing-scale.html","level":1,"parentId":"-6vddrq_7006","tabIndex":2},"preprocessing.extract_segments":{"id":"preprocessing.extract_segments","title":"preprocessing.extract_segments","url":"preprocessing-extract-segments.html","level":1,"parentId":"-6vddrq_7006","tabIndex":3},"preprocessing.morphology":{"id":"preprocessing.morphology","title":"preprocessing.morphology","url":"preprocessing-morphology.html","level":1,"parentId":"-6vddrq_7006","tabIndex":4},"preprocessing.glcm_padded":{"id":"preprocessing.glcm_padded","title":"preprocessing.glcm_padded","url":"preprocessing-glcm-padded.html","level":1,"parentId":"-6vddrq_7006","tabIndex":5}}},"topLevelIds":["Overview","ML-Architecture","Getting-Started","-6vddrq_6999","mix-match","-6vddrq_7004","-6vddrq_7006"]} \ No newline at end of file diff --git a/docs/custom-k-aug-dataloaders.html b/docs/custom-k-aug-dataloaders.html index 98a565ab..0a2c46c8 100644 --- a/docs/custom-k-aug-dataloaders.html +++ b/docs/custom-k-aug-dataloaders.html @@ -1,5 +1,5 @@ -Custom K-Aug Dataloaders | Documentation

    Documentation 0.1.2 Help

    Custom K-Aug Dataloaders

    In MixMatch, implementing the data loading methods is quite unconventional.

    1. We need to load multiple augmented versions of the same image into the same batch.

    2. The labelled set is usually too small, causing a premature end to the epoch as it runs out of samples to draw from faster than the unlabelled set.

    This can be rather tricky to implement in PyTorch. This tutorial will illustrate how we did it.

    Loading Multiple Augmented Versions of the Same Image

    See: frdc/load/dataset.py FRDCDataset.__getitem__

    In MixMatch, a single train batch must consist of:

    1. A batch of labeled images

    2. K batches of unlabeled images

    Aug
    Aug
    Aug
    Aug
    Get Batch
    Aug Labelled Batch
    Unlabelled Batch
    Aug Unl. Batch 1
    Aug Unl. Batch i
    Aug Unl. Batch K

    Keep in mind that the unlabelled batch is a single batch of images, not separate draws of batches. It is then "duplicated" K times, and each copy is augmented differently.

    Solution 1: Custom Dataset

    To solve this, we need to understand the role of both a Dataset and a DataLoader.

    • A Dataset represents a collection of data, responsible for loading and returning something.

    • A DataLoader draws samples from a Dataset and returns batched samples.

    The key here is that a Dataset is not limited to returning 1 sample at a time; we can make it return the K augmented versions of the same image.

    Aug
    Aug
    Aug
    Sample
    Aug Sample 1
    Aug Sample i
    Aug Sample K

    In code, this is done by subclassing the Dataset class and overriding the __getitem__ method.

    +}

    Documentation 0.1.2 Help

    Custom K-Aug Dataloaders

    In MixMatch, implementing the data loading methods is quite unconventional.

    1. We need to load multiple augmented versions of the same image into the same batch.

    2. The labelled set is usually too small, causing a premature end to the epoch as it runs out of samples to draw from faster than the unlabelled set.

    This can be rather tricky to implement in PyTorch. This tutorial will illustrate how we did it.

    Loading Multiple Augmented Versions of the Same Image

    See: frdc/load/dataset.py FRDCDataset.__getitem__

    In MixMatch, a single train batch must consist of:

    1. A batch of labeled images

    2. K batches of unlabeled images

    Aug
    Aug
    Aug
    Aug
    Get Batch
    Aug Labelled Batch
    Unlabelled Batch
    Aug Unl. Batch 1
    Aug Unl. Batch i
    Aug Unl. Batch K

    Keep in mind that the unlabelled batch is a single batch of images, not separate draws of batches. It is then "duplicated" K times, and each copy is augmented differently.

    Solution 1: Custom Dataset

    To solve this, we need to understand the role of both a Dataset and a DataLoader.

    • A Dataset represents a collection of data, responsible for loading and returning something.

    • A DataLoader draws samples from a Dataset and returns batched samples.

    The key here is that a Dataset is not limited to returning 1 sample at a time; we can make it return the K augmented versions of the same image.

    Aug
    Aug
    Aug
    Sample
    Aug Sample 1
    Aug Sample i
    Aug Sample K

    In code, this is done by subclassing the Dataset class and overriding the __getitem__ method.

    def duplicate(x): return x, deepcopy(x), deepcopy(x) @@ -25,7 +25,7 @@ def __getitem__(self, index): x, y = self.dataset[index] return self.aug(x), y -

    In the above example, we have a Dataset that returns 3 duplicate versions of the same image. By leveraging this technique, we can create a Dataset that returns K augmented versions of the same image as a tuple.

    Premature End of Epoch due to Small Labelled Set

    See: frdc/train/frdc_datamodule.py

    In MixMatch, the definition of an "epoch" is a bit different. Instead of implying that we have seen all the data once, it implies that we've drawn N batches. The N is referred to as the number of iterations per epoch.

    Take for example, a labelled set of numbers [1, 2, 3] and an unlabelled set [4, 5, 6, 7, 8, 9, 10]. With batch size of 2, we'll run out of labelled samples after 2 iterations, but we'll still have 3 more iterations for the unlabelled set.

    • Draw 1: [1, 2], [4, 5]

    • Draw 2: [3], [6, 7].

    • Epoch ends.

    Solution 2: Random Sampling

    To fix this, instead of sequentially sampling the labelled set (and the unlabelled set), we can sample them randomly. This way, we can ensure that we never run out of labelled samples.

    • Draw 1: [1, 3], [7, 5]

    • Draw 2: [2, 1], [4, 9]

    • Draw 3: [3, 2], [8, 6]

    • ... and so on.

    Luckily, PyTorch's DataLoader supports random sampling. We just need to use RandomSampler instead of SequentialSampler (which is the default).

    +

    In the above example, we have a Dataset that returns 3 duplicate versions of the same image. By leveraging this technique, we can create a Dataset that returns K augmented versions of the same image as a tuple.
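
    As a rough sketch of this idea (the class and argument names here are illustrative, not the actual frdc classes):

    from torch.utils.data import Dataset

    class KAugDataset(Dataset):
        # Wraps another dataset and returns K differently-augmented copies
        # of each sample as a tuple.
        def __init__(self, dataset, aug, k):
            self.dataset = dataset
            self.aug = aug
            self.k = k

        def __len__(self):
            return len(self.dataset)

        def __getitem__(self, index):
            x, y = self.dataset[index]
            return tuple(self.aug(x) for _ in range(self.k)), y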

    Premature End of Epoch due to Small Labelled Set

    See: frdc/train/frdc_datamodule.py

    In MixMatch, the definition of an "epoch" is a bit different. Instead of implying that we have seen all the data once, it implies that we've drawn N batches. The N is referred to as the number of iterations per epoch.

    Take for example, a labelled set of numbers [1, 2, 3] and an unlabelled set [4, 5, 6, 7, 8, 9, 10]. With batch size of 2, we'll run out of labelled samples after 2 iterations, but we'll still have 3 more iterations for the unlabelled set.

    • Draw 1: [1, 2], [4, 5]

    • Draw 2: [3], [6, 7].

    • Epoch ends.

    Solution 2: Random Sampling

    To fix this, instead of sequentially sampling the labelled set (and the unlabelled set), we can sample them randomly. This way, we can ensure that we never run out of labelled samples.

    • Draw 1: [1, 3], [7, 5]

    • Draw 2: [2, 1], [4, 9]

    • Draw 3: [3, 2], [8, 6]

    • ... and so on.

    Luckily, PyTorch's DataLoader supports random sampling. We just need to use RandomSampler instead of SequentialSampler (which is the default).

    from torch.utils.data import DataLoader, RandomSampler dl = DataLoader( @@ -36,4 +36,4 @@ replacement=False, ) ) -

    This will ensure that the "epoch" ends when we've drawn train_iters batches.

    Last modified: 26 June 2024
    \ No newline at end of file +

    This will ensure that the "epoch" ends when we've drawn train_iters batches.

    Last modified: 26 June 2024
    \ No newline at end of file diff --git a/docs/get-started-with-dev-containers.html b/docs/get-started-with-dev-containers.html index b0579d13..bb307e57 100644 --- a/docs/get-started-with-dev-containers.html +++ b/docs/get-started-with-dev-containers.html @@ -1,5 +1,5 @@ -Get Started with Dev Containers | Documentation

    Documentation 0.1.2 Help

    Get Started with Dev Containers

    Dev. Containers are a great way to get started with a project. They define all necessary dependencies and environments, so you can just start coding within the container.

    In this article, we'll only go over the additional steps to set up our project. For more information on how to use Dev Containers, please refer to the official documentation for each IDE. Once you've set up the Dev Container, come back here to finish the setup:

    Python Environment

    The dev environment is already created and is managed by Anaconda /opt/conda/bin/conda. To activate the environment, run the following command:

    +}

    Documentation 0.1.2 Help

    Get Started with Dev Containers

    Dev. Containers are a great way to get started with a project. They define all necessary dependencies and environments, so you can just start coding within the container.

    In this article, we'll only go over the additional steps to set up our project. For more information on how to use Dev Containers, please refer to the official documentation for each IDE. Once you've set up the Dev Container, come back here to finish the setup:

    Python Environment

    The dev environment is already created and is managed by Anaconda /opt/conda/bin/conda. To activate the environment, run the following command:

    conda activate base -

    Mark as Sources Root (Add to PYTHONPATH)

    For import statements to work, you need to mark the src folder as the sources root. Optionally, also mark the tests folder as the tests root.

    Additional Setup

    Refer to the Getting Started guide for additional setup steps such as:

    • Google Cloud Application Default Credentials

    • Weights & Biases API Key

    • Label Studio API Key

    Last modified: 26 June 2024
    \ No newline at end of file +

    Mark as Sources Root (Add to PYTHONPATH)

    For import statements to work, you need to mark the src folder as the sources root. Optionally, also mark the tests folder as the tests root.

    Additional Setup

    Refer to the Getting Started guide for additional setup steps such as:

    • Google Cloud Application Default Credentials

    • Weights & Biases API Key

    • Label Studio API Key

    Last modified: 26 June 2024
    \ No newline at end of file diff --git a/docs/getting-started.html b/docs/getting-started.html index 76619f33..74f3183c 100644 --- a/docs/getting-started.html +++ b/docs/getting-started.html @@ -1,5 +1,5 @@ -Getting Started | Documentation

    Documentation 0.1.2 Help

    Getting Started

    Installing the Dev. Environment

    1. Ensure that you have the right version of Python. The required Python version can be seen in pyproject.toml

      +}

      Documentation 0.1.2 Help

      Getting Started

      Installing the Dev. Environment

      1. Ensure that you have the right version of Python. The required Python version can be seen in pyproject.toml

        [tool.poetry.dependencies] python = "..." -
      2. Start by cloning our repository.

        +
      3. Start by cloning our repository.

        git clone https://github.com/FR-DC/FRDC-ML.git -
      4. Then, create a Python Virtual Env pyvenv

        python -m venv venv/
        python3 -m venv venv/
      5. Install Poetry. Then check if it's installed with

        poetry --version
      6. Activate the virtual environment

        +
      7. Then, create a Python Virtual Env pyvenv

        python -m venv venv/
        python3 -m venv venv/
      8. Install Poetry. Then check if it's installed with

        poetry --version
      9. Activate the virtual environment

        cd venv/Scripts activate cd ../.. -
        +
        source venv/bin/activate -
      10. Install the dependencies. You should be in the same directory as pyproject.toml

        +
    2. Install the dependencies. You should be in the same directory as pyproject.toml

      poetry install --with dev -
    3. Install Pre-Commit Hooks

      +
    4. Make a copy of the .env.example file and rename it to .env

    5. Fill in additional environment variables in the .env file

      + LABEL_STUDIO_API_KEY=... + LABEL_STUDIO_HOST=10.97.41.70 + LABEL_STUDIO_PORT=8080 + GCS_PROJECT_ID=frmodel + GCS_BUCKET_NAME=frdc-ds +
    6. Install Pre-Commit Hooks

      pre-commit install -

    Setting Up Google Cloud

    1. We use Google Cloud to store our datasets. To set up Google Cloud, install the Google Cloud CLI

    2. Then, authenticate your account.

      gcloud auth login
    3. Finally, set up Application Default Credentials (ADC).

      gcloud auth application-default login
    4. To make sure everything is working, run the tests.

    Setting Up Label Studio

    1. We use Label Studio to annotate our datasets. We won't go through how to install Label Studio; for contributors, it should already be up on localhost:8080.

    2. Then, retrieve your own API key from Label Studio. Go to your account page and copy the API key.


    3. Set your API key as an environment variable.

      On Windows, go to "Edit environment variables for your account" and add this as a new environment variable with name LABEL_STUDIO_API_KEY.

      Export it as an environment variable.

      export LABEL_STUDIO_API_KEY=...

      In all cases, you can create a .env file in the root of the project and add the following line: LABEL_STUDIO_API_KEY=...

    Setting Up Weights & Biases

    1. We use W&B to track our experiments. To set up W&B, install the W&B CLI

    2. Then, authenticate your account.

      wandb login

    Pre-commit Hooks

    • +

    Setting Up Google Cloud

    1. We use Google Cloud to store our datasets. To set up Google Cloud, install the Google Cloud CLI

    2. Then, authenticate your account.

      gcloud auth login
    3. Finally, set up Application Default Credentials (ADC).

      gcloud auth application-default login
    4. To make sure everything is working, run the tests.

    Setting Up Label Studio

    1. We use Label Studio to annotate our datasets. We won't go through how to install Label Studio; for contributors, it should already be up on localhost:8080.

    2. Then, retrieve your own API key from Label Studio. Go to your account page and copy the API key.


    3. Set your API key as an environment variable.

      On Windows, go to "Edit environment variables for your account" and add this as a new environment variable with name LABEL_STUDIO_API_KEY.

      Export it as an environment variable.

      export LABEL_STUDIO_API_KEY=...

      In all cases, you can create a .env file in the root of the project and add the following line: LABEL_STUDIO_API_KEY=...

    Setting Up Weights & Biases

    1. We use W&B to track our experiments. To set up W&B, install the W&B CLI

    2. Then, authenticate your account.

      wandb login

    Pre-commit Hooks

    • pre-commit install -

    Running the Tests

    • Run the tests to make sure everything is working

      +

    Running the Tests

    • Run the tests to make sure everything is working

      pytest -

    Troubleshooting

    ModuleNotFoundError

    It's likely that your src and tests directories are not in PYTHONPATH. To fix this, run the following command:

    +

    Troubleshooting

    ModuleNotFoundError

    It's likely that your src and tests directories are not in PYTHONPATH. To fix this, run the following command:

    export PYTHONPATH=$PYTHONPATH:./src:./tests -

    Or, set it in your IDE, for example, IntelliJ allows setting directories as Source Roots.

    google.auth.exceptions.DefaultCredentialsError

    It's likely that you haven't authenticated your Google Cloud account. See Setting Up Google Cloud

    Couldn't connect to Label Studio

    Label Studio must be running locally, exposed on localhost:8080. Furthermore, you need to specify the LABEL_STUDIO_API_KEY environment variable. See Setting Up Label Studio

    Cannot login to W&B

    You need to authenticate your W&B account. See Setting Up Weights & Biases. If you're facing difficulties, set the WANDB_MODE environment variable to offline to disable W&B.

    Our Repository Structure

    Before starting development, take a look at our repository structure. This will help you understand where to put your code.

    Core Dependencies
    Resources
    Tests
    Repo Dependencies
    Dataset Loaders
    Preprocessing Fn.
    Train Deps
    Model Architectures
    Datasets ...
    FRDC
    src/frdc/
    rsc/
    tests/
    pyproject.toml,poetry.lock
    ./load/
    ./preprocess/
    ./train/
    ./models/
    ./dataset_name/
    src/frdc/

    Source Code for our package. These are the unit components of our pipeline.

    rsc/

    Resources. These are usually cached datasets

    tests/

    PyTest tests. These are unit, integration, and model tests.

    Unit, Integration, and Pipeline Tests

    We have 3 types of tests:

    • Unit Tests are usually small, single function tests.

    • Integration Tests are larger tests that test a mock pipeline.

    • Model Tests are the true production pipeline tests that will generate a model.

    Where Should I contribute?

    Changing a small component

    If you're changing a small component, such as an argument for preprocessing, a new model architecture, or a new configuration for a dataset, take a look at the src/frdc/ directory.

    Adding a test

    By adding a new component, you'll need to add a new test. Take a look at the tests/ directory.

    Changing the model pipeline

    If you're an ML Researcher, you'll probably be changing the pipeline. Take a look at the tests/model_tests/ directory.

    Adding a dependency

    If you're adding a new dependency, use poetry add PACKAGE and commit the changes to pyproject.toml and poetry.lock.

    Last modified: 26 June 2024
    \ No newline at end of file +

    Or, set it in your IDE, for example, IntelliJ allows setting directories as Source Roots.

    google.auth.exceptions.DefaultCredentialsError

    It's likely that you haven't authenticated your Google Cloud account. See Setting Up Google Cloud

    Couldn't connect to Label Studio

    Label Studio must be running locally, exposed on localhost:8080. Furthermore, you need to specify the LABEL_STUDIO_API_KEY environment variable. See Setting Up Label Studio

    Cannot login to W&B

    You need to authenticate your W&B account. See Setting Up Weights & Biases. If you're facing difficulties, set the WANDB_MODE environment variable to offline to disable W&B.

    Our Repository Structure

    Before starting development, take a look at our repository structure. This will help you understand where to put your code.

    Core Dependencies
    Resources
    Tests
    Repo Dependencies
    Dataset Loaders
    Preprocessing Fn.
    Train Deps
    Model Architectures
    Datasets ...
    FRDC
    src/frdc/
    rsc/
    tests/
    pyproject.toml,poetry.lock
    ./load/
    ./preprocess/
    ./train/
    ./models/
    ./dataset_name/
    src/frdc/

    Source Code for our package. These are the unit components of our pipeline.

    rsc/

    Resources. These are usually cached datasets

    tests/

    PyTest tests. These are unit, integration, and model tests.

    Unit, Integration, and Pipeline Tests

    We have 3 types of tests:

    • Unit Tests are usually small, single function tests.

    • Integration Tests are larger tests that test a mock pipeline.

    • Model Tests are the true production pipeline tests that will generate a model.

    Where Should I contribute?

    Changing a small component

    If you're changing a small component, such as an argument for preprocessing, a new model architecture, or a new configuration for a dataset, take a look at the src/frdc/ directory.

    Adding a test

    By adding a new component, you'll need to add a new test. Take a look at the tests/ directory.

    Changing the model pipeline

    If you're an ML Researcher, you'll probably be changing the pipeline. Take a look at the tests/model_tests/ directory.

    Adding a dependency

    If you're adding a new dependency, use poetry add PACKAGE and commit the changes to pyproject.toml and poetry.lock.

    Last modified: 26 June 2024
    \ No newline at end of file diff --git a/docs/load-dataset.html b/docs/load-dataset.html index cdfafda1..dafdd9d4 100644 --- a/docs/load-dataset.html +++ b/docs/load-dataset.html @@ -1,5 +1,5 @@ -load.dataset | Documentation

    Documentation 0.1.2 Help

    load.dataset

    Usage

    Firstly, to load a dataset instance, you need to initialize an FRDCDataset object, providing the site, date, and version.

    We recommend using the FRDCDatasetPreset module to load explicitly known datasets.

    +}

    Documentation 0.1.2 Help

    load.dataset

    Usage

    Firstly, to load a dataset instance, you need to initialize an FRDCDataset object, providing the site, date, and version.

    We recommend using the FRDCDatasetPreset module to load explicitly known datasets.

    from frdc.load.preset import FRDCDatasetPreset ds = FRDCDatasetPreset.chestnut_20201218() -

    Then, we can use the ds object to load objects of the dataset:

    +

    Then, we can use the ds object to load objects of the dataset:

    ar, order = ds._get_ar_bands() d = ds._get_ar_bands_as_dict() bounds, labels = ds._get_legacy_bounds_and_labels() -
    • ar is a stacked NDArray of the hyperspectral bands of shape (H x W x C)

    • order is a list of strings, containing the names of the bands, ordered according to the channels of ar

    • d is a dictionary of the hyperspectral bands of shape (H x W), keyed by the band names

    • bounds is a list of bounding boxes, in the format of Rect, a namedtuple of x0, y0, x1, y1

    • labels is a list of strings, containing the labels of the bounding boxes, ordered according to bounds

    I can't find a dataset!

    Some datasets, especially new ones, may be unregistered, and you must specify their exact site / date / version.

    +
    • ar is a stacked NDArray of the hyperspectral bands of shape (H x W x C)

    • order is a list of strings, containing the names of the bands, ordered according to the channels of ar

    • d is a dictionary of the hyperspectral bands of shape (H x W), keyed by the band names

    • bounds is a list of bounding boxes, in the format of Rect, a namedtuple of x0, y0, x1, y1

    • labels is a list of strings, containing the labels of the bounding boxes, ordered according to bounds
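
    As a quick illustration of inspecting these objects (exact shapes and band names depend on the dataset):

    print(ar.shape)              # e.g. (H, W, C), one channel per band
    print(order)                 # band names matching the channels of ar
    print(d[order[0]].shape)     # a single (H, W) band, keyed by its name
    print(bounds[0], labels[0])  # first Rect (x0, y0, x1, y1) and its label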

    I can't find a dataset!

    Some datasets, especially new ones, may be unregistered, and you must specify their exact site / date / version.

    from frdc.load.dataset import FRDCDataset ds = FRDCDataset(site="mysite", date="mydate", version="myversion") -

    See below for examples on how to format this.

    • site="ds"

    • date="date"

    • version="ver"

    • site="ds"

    • date="date"

    • version="ver/01/data"

    • site="ds"

    • date="date"

    • version=None

    Last modified: 26 June 2024
    \ No newline at end of file +

    See below for examples on how to format this.

    • site="ds"

    • date="date"

    • version="ver"

    • site="ds"

    • date="date"

    • version="ver/01/data"

    • site="ds"

    • date="date"

    • version=None

    Last modified: 26 June 2024
    \ No newline at end of file diff --git a/docs/load-gcs.html b/docs/load-gcs.html index bf12b421..0bd67f01 100644 --- a/docs/load-gcs.html +++ b/docs/load-gcs.html @@ -1,5 +1,5 @@ -load.gcs | Documentation

    Documentation 0.1.2 Help

    load.gcs

    Usage

    These are defined in the top-level load.gcs module.

    list_gcs_datasets

    Lists all datasets in the bucket as a DataFrame. This works by checking which folders have a specific file, which we call the anchor.

    download

    Downloads a file from Google Cloud Storage and returns the local file path.

    open_file

    Downloads and opens a file from Google Cloud Storage. Returns a file handle.

    open_image

    Downloads and returns the PIL image from Google Cloud Storage.

    Pathing

    The path to specify is relative to the bucket, which is frdc-ds by default.

    For example, this filesystem on GCS:

    +}

    Documentation 0.1.2 Help

    load.gcs

    Usage

    These are defined in the top-level load.gcs module.

    list_gcs_datasets

    Lists all datasets in the bucket as a DataFrame. This works by checking which folders have a specific file, which we call the anchor.

    download

    Downloads a file from Google Cloud Storage and returns the local file path.

    open_file

    Downloads and opens a file from Google Cloud Storage. Returns a file handle.

    open_image

    Downloads and returns the PIL image from Google Cloud Storage.
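
    A hedged usage sketch of these functions; the import path is assumed from "the top-level load.gcs module" above, and the exact signatures may differ.

    # Import path assumed; signatures may differ from the real module.
    from frdc.load.gcs import list_gcs_datasets, download, open_file

    df = list_gcs_datasets()                 # DataFrame of datasets in the bucket
    local_path = download("chestnut_nature_park/20201218/90deg/bounds.json")
    f = open_file("chestnut_nature_park/20201218/90deg/bounds.json")
    print(local_path, f.read()[:100])        # peek at the downloaded file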

    Pathing

    The path to specify is relative to the bucket, which is frdc-ds by default.

    For example, this filesystem on GCS:

    # On Google Cloud Storage frdc-ds ├── chestnut_nature_park │ └── 20201218 │ └── 90deg │ └── bounds.json -

    To download bounds.json, use download(r"chestnut_nature_park/20201218/90deg/bounds.json"). By default, all files will be downloaded to PROJ_DIR/rsc/....

    +

    To download bounds.json, use download(r"chestnut_nature_park/20201218/90deg/bounds.json"). By default, all files will be downloaded to PROJ_DIR/rsc/....

    # On local filesystem PROJ_DIR ├── rsc @@ -28,4 +28,4 @@ │ └── 20201218 │ └── 90deg │ └── bounds.json -

    Configuration

    If you need granular control over

    • where the files are downloaded

    • the credentials used

    • the project used

    • the bucket used

    Then edit conf.py.

    GCS_CREDENTIALS

    Google Cloud credentials.


    A google.oauth2.service_account.Credentials object. See the object documentation for more information.

    LOCAL_DATASET_ROOT_DIR

    Local directory to download files to.


    Path to a directory, or a Path object.

    GCS_PROJECT_ID

    Google Cloud project ID.


    GCS_BUCKET_NAME

    Google Cloud Storage bucket name.


    Last modified: 26 June 2024
    \ No newline at end of file +

    Configuration

    If you need granular control over

    • where the files are downloaded

    • the credentials used

    • the project used

    • the bucket used

    Then edit conf.py.

    GCS_CREDENTIALS

    Google Cloud credentials.


    A google.oauth2.service_account.Credentials object. See the object documentation for more information.

    LOCAL_DATASET_ROOT_DIR

    Local directory to download files to.


    Path to a directory, or a Path object.

    GCS_PROJECT_ID

    Google Cloud project ID.


    GCS_BUCKET_NAME

    Google Cloud Storage bucket name.


    Last modified: 26 June 2024
    \ No newline at end of file diff --git a/docs/mix-match-module.html b/docs/mix-match-module.html index 98364a0b..fce01e41 100644 --- a/docs/mix-match-module.html +++ b/docs/mix-match-module.html @@ -1,5 +1,5 @@ -MixMatch Module | Documentation

    Documentation 0.1.2 Help

    MixMatch Module

    See frdc/train/mixmatch_module.py.

    Quick Recap

    We will go over the essential parts of the code here. Before that, we revise some of the concepts that are used in the code.

    Abstract Methods

    In Python, we can define abstract methods using the abc module. Just like other OOP languages, abstract methods are methods that must be implemented by the child class.

    For example:

    +}

    Documentation 0.1.2 Help

    MixMatch Module

    See frdc/train/mixmatch_module.py.

    Quick Recap

    We will go over the essential parts of the code here. Before that, we revise some of the concepts that are used in the code.

    Abstract Methods

    In Python, we can define abstract methods using the abc module. Just like other OOP languages, abstract methods are methods that must be implemented by the child class.

    For example:

    from abc import ABC, abstractmethod @@ -26,7 +26,7 @@ class MyChildClass(MyAbstractClass): def my_abstract_method(self): print("Hello World!") -

    nn.Module & LightningModule

    If you're unfamiliar with PyTorch, you should read the nn.Module Documentation.

    nn.Module is the base class for all neural network modules in PyTorch, while LightningModule is a PyTorch Lightning class that extends nn.Module, providing additional functionality that reduces boilerplate code.

    By implementing it as a LightningModule, we also enter the PyTorch Lightning ecosystem, which provides us with a lot of useful features such as logging, early stopping, and more.

    What do we implement in a Module?

    One key component that nn.Module requires is the model. So for example:

    +

    nn.Module & LightningModule

    If you're unfamiliar with PyTorch, you should read the nn.Module Documentation.

    nn.Module is the base class for all neural network modules in PyTorch, while LightningModule is a PyTorch Lightning class that extends nn.Module, providing additional functionality that reduces boilerplate code.

    By implementing it as a LightningModule, we also enter the PyTorch Lightning ecosystem, which provides us with a lot of useful features such as logging, early stopping, and more.

    What do we implement in a Module?

    One key component that nn.Module requires is the model. So for example:

    class MyModule(nn.Module): def __init__(self): super().__init__() @@ -38,7 +38,7 @@ def forward(self, x): return self.model(x) -

    PyTorch Lightning builds on top of it, requiring training_step and validation_step. Each "step" is a batch of data, and the model is trained on it. So for example:

    +

    PyTorch Lightning builds on top of it, requiring training_step and validation_step. Each "step" is a batch of data, and the model is trained on it. So for example:

    class MyModule(LightningModule): def __init__(self): ... @@ -55,7 +55,7 @@ y_hat = self(x) loss = F.cross_entropy(y_hat, y) return loss -

    Usually, the training and validation steps are the same, but in some cases, such as MixMatch, they are different. In MixMatch, we not only use a different loss function for training, but we also handle the batch differently. The PyTorch Lightning framework allows us to separate the two and implement each independently.

    Model Embedded Preprocessing on_before_batch_transfer

    In PyTorch Lightning, we can also inject a step before the batch is passed to the model. This is done by overriding the on_before_batch_transfer method.

    Batch
    on_before_batch_transfer
    training_step
    validation_step

    This allows us to do preprocessing on the batch, such as scaling the data, encoding the labels, and more.

    Custom EMA Update on_after_backward

    We also leverage another hook, called on_after_backward. This hook is called after the backward pass, and allows us to do custom operations. In our case, we use it to update the EMA model.

    Batch
    training_step
    on_after_backward
    update_ema

    MixMatch

    We recommend having tests/model_tests/chestnut_dec_may/train.py open while reading this section. It implements a real-world example of MixMatch.

    As a summary:

    1. We learned what is an abstract method, and how to implement it

    2. We implement the model in LightningModule much like we would in nn.Module

    3. We implement on_before_batch_transfer to preprocess the batch

    4. Finally, we implement on_after_backward to update the EMA model

    With the above in mind, let's look at the MixMatch implementation.

    forward (abstract)

    Forward pass of the model

    ema_model (abstract)

    The model that is used for EMA. We expect this property to be implemented by the child class.

    update_ema (abstract)

    The method to update the EMA model. We expect this method to be implemented by the child class.

    loss_unl_scaler (static)

    Takes in the current progress of the training, 0.0 to 1.0, where 0.0 is the start of the training, and 1.0 is the end. Then, returns the multiplier for the unlabeled loss.

    loss_lbl (static)

    Implements the loss for labeled data. Takes in the predicted labels and the ground truth labels, and returns the loss. This is cross entropy for MixMatch.

    loss_unl (static)

    Implements the loss for unlabeled data. Takes in the predicted labels and the ground truth labels, and returns the loss. This is MSE for MixMatch.

    mixup

    Takes in the data and the labels, the beta distribution parameter, and returns the mixed data and labels.

    sharpen

    Takes in the labels and temperature, and returns the sharpened labels.

    guess_labels

    Takes in the unlabeled data, and returns the guessed labels.

    progress

    The current progress of the training, 0.0 to 1.0, where 0.0 is the start of the training, and 1.0 is the end.

    training_step

    The training step runs through 1 batch of data, and returns the loss. Note that this is significantly different from the validation step, as we handle the K-Augmented data differently.

    test / validation_step

    The test / validation step runs through 1 batch of data, and returns the loss.

    predict_step

    The predict step runs through 1 batch of data, and returns the actual decoded labels.

    on_after_backward

    The on_after_backward hook is called after the backward pass, and allows us to do custom operations. In our case, we use it to update the EMA model.

    on_before_batch_transfer

    The on_before_batch_transfer hook is called before the batch is transferred to the GPU. In our case, we use it to preprocess the batch.

    A diagram of how these components interact with each other is shown below:

    Batch
    on_before_batch_transfer
    training_step
    guess_labels
    sharpen
    mix_up
    loss_unl
    loss_unl_scaler
    loss
    loss_lbl
    backward
    on_after_backward
    update_ema
    validation_step
    loss

    Finally, we show an example of how to use the MixMatch module:

    +

    Usually, the training and validation steps are the same, but in some cases, such as MixMatch, they are different. In MixMatch, we not only use a different loss function for training, but we also handle the batch differently. The PyTorch Lightning framework allows us to separate the two and implement each independently.

    Model Embedded Preprocessing on_before_batch_transfer

    In PyTorch Lightning, we can also inject a step before the batch is passed to the model. This is done by overriding the on_before_batch_transfer method.

    Batch
    on_before_batch_transfer
    training_step
    validation_step

    This allows us to do preprocessing on the batch, such as scaling the data, encoding the labels, and more.
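
    A minimal sketch of overriding this hook; the actual frdc preprocessing is more involved, and the scaling shown is only an example.

    from pytorch_lightning import LightningModule

    class MyModule(LightningModule):
        def on_before_batch_transfer(self, batch, dataloader_idx):
            x, y = batch
            x = (x - x.mean()) / (x.std() + 1e-8)  # e.g. standardize the inputs
            return x, y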

    Custom EMA Update on_after_backward

    We also leverage another hook, called on_after_backward. This hook is called after the backward pass, and allows us to do custom operations. In our case, we use it to update the EMA model.

    Batch
    training_step
    on_after_backward
    update_ema
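
    A hedged sketch of what such an EMA update might look like; the decay value, stand-in model, and attribute names are illustrative, not the actual frdc implementation.

    import copy
    import torch
    from torch import nn
    from pytorch_lightning import LightningModule

    class MyModule(LightningModule):
        def __init__(self):
            super().__init__()
            self.model = nn.Linear(10, 2)                # stand-in model
            self.ema_model = copy.deepcopy(self.model)   # EMA copy of the weights

        def on_after_backward(self):
            decay = 0.999  # illustrative decay factor
            with torch.no_grad():
                for ema_p, p in zip(self.ema_model.parameters(),
                                    self.model.parameters()):
                    ema_p.mul_(decay).add_(p, alpha=1 - decay)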

    MixMatch

    We recommend having tests/model_tests/chestnut_dec_may/train.py open while reading this section. It implements a real-world example of MixMatch.

    As a summary:

    1. We learned what is an abstract method, and how to implement it

    2. We implement the model in LightningModule much like we would in nn.Module

    3. We implement on_before_batch_transfer to preprocess the batch

    4. Finally, we implement on_after_backward to update the EMA model

    With the above in mind, let's look at the MixMatch implementation.

    forward (abstract)

    Forward pass of the model

    ema_model (abstract)

    The model that is used for EMA. We expect this property to be implemented by the child class.

    update_ema (abstract)

    The method to update the EMA model. We expect this method to be implemented by the child class.

    loss_unl_scaler (static)

    Takes in the current progress of the training, 0.0 to 1.0, where 0.0 is the start of the training, and 1.0 is the end. Then, returns the multiplier for the unlabeled loss.

    loss_lbl (static)

    Implements the loss for labeled data. Takes in the predicted labels and the ground truth labels, and returns the loss. This is cross entropy for MixMatch.

    loss_unl (static)

    Implements the loss for unlabeled data. Takes in the predicted labels and the ground truth labels, and returns the loss. This is MSE for MixMatch.

    mixup

    Takes in the data and the labels, the beta distribution parameter, and returns the mixed data and labels.
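
    A short sketch of the idea, following the standard MixMatch variant where lambda is drawn from Beta(alpha, alpha) and clamped so the mix stays closer to the first argument; the exact frdc implementation may differ.

    import numpy as np

    def mixup(x1, y1, x2, y2, alpha):
        lam = np.random.beta(alpha, alpha)
        lam = max(lam, 1 - lam)   # keep the mix biased towards (x1, y1)
        return lam * x1 + (1 - lam) * x2, lam * y1 + (1 - lam) * y2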

    sharpen

    Takes in the labels and temperature, and returns the sharpened labels.
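
    A short sketch of sharpening as defined in the MixMatch paper: raise the class probabilities to 1/T and renormalize.

    import torch

    def sharpen(p, temp):
        p = p ** (1 / temp)
        return p / p.sum(dim=-1, keepdim=True)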

    guess_labels

    Takes in the unlabeled data, and returns the guessed labels.

    progress

    The current progress of the training, 0.0 to 1.0, where 0.0 is the start of the training, and 1.0 is the end.

    training_step

    The training step runs through 1 batch of data, and returns the loss. Note that this is significantly different from the validation step, as we handle the K-Augmented data differently.

    test / validation_step

    The test / validation step runs through 1 batch of data, and returns the loss.

    predict_step

    The predict step runs through 1 batch of data, and returns the actual decoded labels.

    on_after_backward

    The on_after_backward hook is called after the backward pass, and allows us to do custom operations. In our case, we use it to update the EMA model.

    on_before_batch_transfer

    The on_before_batch_transfer hook is called before the batch is transferred to the GPU. In our case, we use it to preprocess the batch.

    A diagram of how these components interact with each other is shown below:

    Batch
    on_before_batch_transfer
    training_step
    guess_labels
    sharpen
    mix_up
    loss_unl
    loss_unl_scaler
    loss
    loss_lbl
    backward
    on_after_backward
    update_ema
    validation_step
    loss

    Finally, we show an example of how to use the MixMatch module:

    from sklearn.preprocessing import StandardScaler, OrdinalEncoder from frdc.train.mixmatch_module import MixMatchModule @@ -75,7 +75,7 @@ sharpen_temp=0.5, mix_beta_alpha=0.75, ) -

    In particular, we need to supply some transformations for the preprocessing step. In this case, we use StandardScaler to scale the data, and OrdinalEncoder to encode the labels.

    1. It's best if standardization is fitted only on the training data, and not the validation data, to better reflect real-world scenarios.

    2. We use OrdinalEncoder as it handles unseen labels. So if a class doesn't show up in the training data, it will be encoded as np.nan, and will not participate in the loss calculation.

    Design Choices

    Static Method Overriding

    We implement many functions as static, as we believe that a functional style reduces dependencies, thus making the code easier to test and debug.

    Furthermore, it allows the subclasses to easily override the functions, to customize the behavior of the MixMatch module.

    For example, the loss_unl_scaler function is static, thus, we can implement our own scaling function, and pass it to the MixMatch module.

    +

    In particular, we need to supply some transformations for the preprocessing step. In this case, we use StandardScaler to scale the data, and OrdinalEncoder to encode the labels.

    1. It's best if standardization is fitted only on the training data, and not the validation data, to better reflect real-world scenarios.

    2. We use OrdinalEncoder as it handles unseen labels. So if a class doesn't show up in the training data, it will be encoded as np.nan, and will not participate in the loss calculation.
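
    A small sketch of that behaviour with scikit-learn; the labels here are made up, and this shows one way to configure the encoder, not necessarily the exact frdc configuration.

    import numpy as np
    from sklearn.preprocessing import OrdinalEncoder

    oe = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=np.nan)
    oe.fit([["oak"], ["maple"]])
    print(oe.transform([["oak"], ["unseen_species"]]))  # unseen label -> nan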

    Design Choices

    Static Method Overriding

    We implement many functions as static, as we believe that a functional style reduces dependencies, thus making the code easier to test and debug.

    Furthermore, it allows the subclasses to easily override the functions, to customize the behavior of the MixMatch module.

    For example, the loss_unl_scaler function is static, thus, we can implement our own scaling function, and pass it to the MixMatch module.

    def my_loss_unl_scaler(progress: float) -> float: return progress ** 2 @@ -83,4 +83,4 @@ @staticmethod def loss_unl_scaler(progress: float) -> float: return my_loss_unl_scaler(progress) -

    If we had used a method instead, we would have to consider instance state, which would make it harder to override.

    Why not use Dataclasses?

    One of the biggest caveats of nn.Module is that it requires super().__init__() to be called before anything is assigned. While dataclass can leverage __post_init__ to do the same, we felt that this was too much of a hassle to save a few keystrokes. Thus, we opted to use __init__ instead; while more verbose, it is more explicit.

    Why use PyTorch Lightning?

    While we did hit some roadblocks implementing SSL, due to its complex and unconventional nature, we felt that the benefits of using PyTorch Lightning outweighed the cons.

    on_before_batch_transfer and on_after_backward are unconventional hooks, and we had to do some digging to find them. It can be argued that by just writing explicit code, we can avoid the need for these hooks, but the PyTorch ecosystem fixes many other issues, so we turned a blind eye to this.

    References

    Last modified: 26 June 2024
    \ No newline at end of file +

    If we had used a method instead, we would have to consider instance state, which would make it harder to override.

    Why not use Dataclasses?

    One of the biggest caveats of nn.Module is that it requires super().__init__() to be called before anything is assigned. While dataclass can leverage __post_init__ to do the same, we felt that this was too much of a hassle to save a few keystrokes. Thus, we opted to use __init__ instead; while more verbose, it is more explicit.

    Why use PyTorch Lightning?

    While we did hit some roadblocks implementing SSL, due to its complex and unconventional nature, we felt that the benefits of using PyTorch Lightning outweighed the cons.

    on_before_batch_transfer and on_after_backward are unconventional hooks, and we had to do some digging to find them. It can be argued that by just writing explicit code, we can avoid the need for these hooks, but the PyTorch ecosystem fixes many other issues, so we turned a blind eye to this.

    References

    Last modified: 26 June 2024
    \ No newline at end of file diff --git a/docs/mix-match.html b/docs/mix-match.html index 0b1edc52..c6e00d48 100644 --- a/docs/mix-match.html +++ b/docs/mix-match.html @@ -1,5 +1,5 @@ -MixMatch | Documentation

    Documentation 0.1.2 Help

    MixMatch

    In FRDC-ML, we leverage semi-supervised learning to improve the model's performance through better augmentation consistency and by making use of unlabelled data.

    The algorithm we use is MixMatch, a state-of-the-art semi-supervised learning algorithm. It is based on the idea of consistency regularization, which encourages models to predict the same class even after augmentations that occur naturally in the real world.

    Our implementation of MixMatch is a refactored version of YU1ut/MixMatch-pytorch. We've refactored the code to follow more modern PyTorch practices, allowing us to use it with modern PyTorch frameworks such as PyTorch Lightning.

    We won't go through the details of MixMatch here; see Our Documentation in our MixMatch-PyTorch-CIFAR10 repository for more details.

    Implementation Details

    1. How we implemented the MixMatch logic MixMatchModule

    2. How we implemented the unique MixMatch data loading logic Custom MixMatch Data Loading

    References

    Last modified: 26 June 2024
    \ No newline at end of file +}

    Documentation 0.1.2 Help

    MixMatch

    In FRDC-ML, we leverage semi-supervised learning to improve the model's performance through better augmentation consistency and by making use of unlabelled data.

    The algorithm we use is MixMatch, a state-of-the-art semi-supervised learning algorithm. It is based on the idea of consistency regularization, which encourages models to predict the same class even after augmentations that occur naturally in the real world.

    Our implementation of MixMatch is a refactored version of YU1ut/MixMatch-pytorch. We've refactored the code to follow more modern PyTorch practices, allowing us to use it with modern PyTorch frameworks such as PyTorch Lightning.

    We won't go through the details of MixMatch here; see Our Documentation in our MixMatch-PyTorch-CIFAR10 repository for more details.

    Implementation Details

    1. How we implemented the MixMatch logic MixMatchModule

    2. How we implemented the unique MixMatch data loading logic Custom MixMatch Data Loading

    References

    Last modified: 26 June 2024
    \ No newline at end of file diff --git a/docs/ml-architecture.html b/docs/ml-architecture.html index e62735d0..402bb875 100644 --- a/docs/ml-architecture.html +++ b/docs/ml-architecture.html @@ -1,5 +1,5 @@ -ML Architecture | Documentation

    Documentation 0.1.2 Help

    ML Architecture

    The architecture is the backbone of the project. If you're interested in how everything is pieced together, this article is for you.

    In Machine Learning architectures, we mostly care about 2 things: the data and the model. As the names imply, DataModules, DataLoaders, and Datasets deal with data, while Modules deal with model construction.

    Data Classes

    There's a small difference between the Data___ classes. Firstly, we load data in as Dataset instances, which are then preprocessed before being batched by a DataLoader, and finally housed in a DataModule.

    DataModule
    Train DataLoader
    Validation DataLoader
    Test DataLoader
    Preprocess
    Augmentations
    Distortions
    Alternatives
    Cropping or Resizing
    Scaling
    Data Source
    Load
    Dataset
    DataLoader

    There are 2 IMPORTANT design decisions here:

    Dataset and DataLoader

    Data in a Dataset are unbatched; data in a DataLoader must be batched. This means that it's possible to have jagged tensors at this stage; however, they must be made "stackable" before loading into the DataLoader.

    For example, the data in a Dataset could be of shapes [(8, 200, 100), (8, 100, 300), ...], but BEFORE loading into the DataLoader they must have equal shapes, for example [(8, 100, 100), (8, 100, 100), ...].

    This is because when you initialize a DataLoader you need to include the batch_size, which implies the data are stacked in some manner.

    This also leads to the reason why preprocessing must happen before the DataLoader

    Preprocessing

    Excluding functionality to load the data, this is the step before the data is set in stone. So, steps such as augmentation, transformation, and even analytics need to be performed here, as the data is in its "rawest" form.

    We use this step to

    1. Construct alternative augmentations. i.e. images that we could've taken instead.

    2. Using those alternatives, add distortions, i.e. unintentional changes to the photo that reduce quality.

    3. Cropping or resizing the image.

    4. Scale the data. e.g. Standard Scaling, ZCA Scaling, etc.

    The order of the steps is a deliberate design choice.

    Modules

    We analyze the inheritance structure of the Modules (also the ML Models):

    Library Module
    PyTorch Module
    Lightning Module
    FRDC Module
    FixMatch Module
    MixMatch Module
    EfficientNetB1 FixMatch Module
    EfficientNetB1 MixMatch Module
    Custom Module

    Custom Modules are our self-defined classes.

    • FRDC Module: This is the base class for all our models. Implements common functionality, such as partial saving of unfrozen parameters.

    • Y Module: Y is the architecture/framework of the model; in our case, this only defines the method of training, not the actual model itself.

    • X Y Module: X defines the actual model being used within Y's framework.

    To give an example, we look at EfficientNetB1FixMatchModule. Due to its naming scheme <Model><Framework>Module, we see that it's an EfficientNetB1 model used in the FixMatch framework.

    Furthermore, because it's well decoupled, implementing a new model is as easy as overriding some defaults.

    Last modified: 26 June 2024
    \ No newline at end of file +}

    Documentation 0.1.2 Help

    ML Architecture

    The architecture is the backbone of the project. If you're interested in how everything is pieced together, this article is for you.

    In Machine Learning architectures, we mostly care about 2 things: the data and the model. As the names imply, DataModules, DataLoaders, and Datasets deal with data, while Modules deal with model construction.

    Data Classes

    There's a small difference between the Data___ classes. Firstly, we load data in as Dataset instances, which are then preprocessed before being batched by a DataLoader, and finally housed in a DataModule.

    DataModule
    Train DataLoader
    Validation DataLoader
    Test DataLoader
    Preprocess
    Augmentations
    Distortions
    Alternatives
    Cropping or Resizing
    Scaling
    Data Source
    Load
    Dataset
    DataLoader

    There are 2 IMPORTANT design decisions here:

    Dataset and DataLoader

    Data in a Dataset are unbatched; data in a DataLoader must be batched. This means that it's possible to have jagged tensors at this stage; however, they must be made "stackable" before loading into the DataLoader.

    For example, the data in a Dataset could be of shapes [(8, 200, 100), (8, 100, 300), ...], but BEFORE loading into the DataLoader they must have equal shapes, for example [(8, 100, 100), (8, 100, 100), ...].

    This is because when you initialize a DataLoader you need to include the batch_size, which implies the data are stacked in some manner.

    This also leads to the reason why preprocessing must happen before the DataLoader
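
    A minimal sketch of this point, using the shapes above; center-cropping is just one way to make the samples stackable.

    import torch
    from torch.utils.data import DataLoader
    from torchvision.transforms.functional import center_crop

    jagged = [torch.rand(8, 200, 100), torch.rand(8, 100, 300)]   # unbatched, jagged
    stackable = [center_crop(x, [100, 100]) for x in jagged]      # all (8, 100, 100)
    dl = DataLoader(stackable, batch_size=2)                      # now batching works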

    Preprocessing

    Excluding functionality to load the data, this is the step before the data is set in stone. So, steps such as augmentation, transformation, and even analytics need to be performed here, as the data is in its "rawest" form.

    We use this step to

    1. Construct alternative augmentations. i.e. images that we could've taken instead.

    2. Using those alternatives, add distortions, i.e. unintentional changes to the photo that reduce quality.

    3. Cropping or resizing the image.

    4. Scale the data. e.g. Standard Scaling, ZCA Scaling, etc.

    The order of the steps is a deliberate design choice.

    Modules

    We analyze the inheritance structure of the Modules (also the ML Models):

    Library Module
    PyTorch Module
    Lightning Module
    FRDC Module
    FixMatch Module
    MixMatch Module
    EfficientNetB1 FixMatch Module
    EfficientNetB1 MixMatch Module
    Custom Module

    Custom Modules are our self-defined classes.

    • FRDC Module: This is the base class for all our models. Implements common functionality, such as partial saving of unfrozen parameters.

    • Y Module: Y is the architecture/framework of the model; in our case, this only defines the method of training, not the actual model itself.

    • X Y Module: X defines the actual model being used within Y's framework.

    To give an example, we look at EfficientNetB1FixMatchModule. Due to its naming scheme <Model><Framework>Module, we see that it's an EfficientNetB1 model used in the FixMatch framework.

    Furthermore, because it's well decoupled, implementing a new model is as easy as overriding some defaults.

    Last modified: 26 June 2024
    \ No newline at end of file diff --git a/docs/model-test-chestnut-may-dec.html b/docs/model-test-chestnut-may-dec.html index 585bd93a..9a4466e1 100644 --- a/docs/model-test-chestnut-may-dec.html +++ b/docs/model-test-chestnut-may-dec.html @@ -1,5 +1,5 @@ -Model Test Chestnut May-Dec | Documentation

    Documentation 0.1.2 Help

    Model Test Chestnut May-Dec

    This test is used to evaluate the model performance on the Chestnut Nature Park May & December dataset.

    See this script in model_tests/chestnut_dec_may/train.py.

    Motivation

    This model will be used to classify trees in unseen datasets under different conditions. In this test, we'll evaluate it under a different season.

    A caveat is that it'll be evaluated on the same set of trees, so it's not representative of a field test. However, given the difficulty of obtaining new datasets, this still gives us a good preliminary idea of how the model will perform under different conditions.

    Methodology

    We train on the December dataset, and test on the May dataset.

    [Diagram: the DecDataset supplies both the Labelled Train and Unlabelled Train splits fed to the Model, while the MayDataset is held out as the Test split.]

    Despite not having any true unlabelled data, we use MixMatch by treating the labelled data of the December dataset as unlabelled data.
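
    One simple way to realise "labelled data treated as unlabelled" is to wrap the December dataset and drop its labels. This is a sketch of the idea only, not the script's actual implementation:

    from torch.utils.data import Dataset


    class IgnoreLabels(Dataset):
        """Wraps a labelled dataset so it can be fed in as the unlabelled split."""

        def __init__(self, ds: Dataset):
            self.ds = ds

        def __len__(self):
            return len(self.ds)

        def __getitem__(self, i):
            x, _y = self.ds[i]  # discard the label
            return x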

    Model

    The current model is a simple InceptionV3 transfer learning model, with the last layer replaced by fully connected layer(s); a sketch follows the diagram below.

    [Diagram: Input → InceptionV3 (frozen) → FC Layer(s) → Softmax → Output, trained with an SSL Loss.]
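
    A minimal torchvision sketch of that architecture (the hidden size, class count, and head structure are placeholders; the test script is the source of truth):

    import torch.nn as nn
    from torchvision.models import inception_v3, Inception_V3_Weights

    n_classes = 10  # placeholder

    model = inception_v3(weights=Inception_V3_Weights.DEFAULT)

    # Freeze the InceptionV3 backbone; only the new head will be trained.
    for p in model.parameters():
        p.requires_grad = False

    # Replace the last layer with fully connected layer(s).
    model.fc = nn.Sequential(
        nn.Linear(model.fc.in_features, 256),
        nn.ReLU(),
        nn.Linear(256, n_classes),
    )
    # Softmax over the output is typically folded into the loss function.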

    Preprocessing

    For Training:

    Segment
    RandomCrop 299
    Horizontal Flip 50%
    Vertical Flip 50%
    Normalize By Training Mean & Std

    For Validation:

    Segment
    CenterCrop 299
    Normalize By Training Mean & Std

    For Evaluation:

    Segment
    CenterCrop 299
    Normalize By Training Mean & Std
    As Is
    Horizontal Flip
    Vertical Flip
    Horizontal & Vertical Flip

    For evaluation, we check that the model is invariant to horizontal and vertical flips, as well as to the original image.
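
    A sketch of these pipelines with torchvision transforms (the per-band normalisation statistics are placeholders, and the Segment step is omitted):

    import torch
    from torchvision.transforms.v2 import (
        CenterCrop, Compose, Normalize, RandomCrop, RandomHorizontalFlip,
        RandomVerticalFlip,
    )

    # Placeholder statistics; the real ones are computed from the training set.
    mean, std = [0.5] * 8, [0.25] * 8

    train_tf = Compose([
        RandomCrop(299),
        RandomHorizontalFlip(p=0.5),
        RandomVerticalFlip(p=0.5),
        Normalize(mean=mean, std=std),
    ])
    val_tf = Compose([CenterCrop(299), Normalize(mean=mean, std=std)])


    def eval_views(x: torch.Tensor) -> list[torch.Tensor]:
        """The four views checked for flip invariance, given a (C, H, W) tensor."""
        x = val_tf(x)
        return [
            x,
            torch.flip(x, dims=[-1]),      # horizontal flip
            torch.flip(x, dims=[-2]),      # vertical flip
            torch.flip(x, dims=[-2, -1]),  # horizontal & vertical flip
        ]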

    Hyperparameters

    The following hyperparameters are used:

    • Optimizer: Adam

    • Learning Rate: 1e-3

    • Batch Size: 32

    • Epochs: 10

    • Train Iterations: 25~100

    • Validation Iterations: 10~25

    • Early Stopping: 4
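
    In code, those settings map to roughly the following (a sketch assuming Lightning 2.x import paths and a val_loss metric; the actual script may differ):

    import torch
    import torch.nn as nn
    from lightning.pytorch.callbacks import EarlyStopping

    model = nn.Linear(8, 2)  # stand-in for the actual module

    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    early_stopping = EarlyStopping(monitor="val_loss", patience=4)

    batch_size = 32
    max_epochs = 10
    # Train iterations per epoch vary between 25 and 100; validation between 10 and 25.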

    Results

    We achieve around 40% accuracy on the test set, compared to 100% on the training set. This indicates that the model has saturated and is not able to learn any more from the training set. There's no indication of overfitting, as the validation loss simply plateaus.

    W&B Dashboard

    Caveats

    • The test set is very small, so the results are not very representative.

    • The test set is the same set of trees, so it's not a true test of the model performance in different conditions.

    • There are many classes with 1 sample, so the model may not be able to learn the features of these classes well.

    Last modified: 26 June 2024

    Overview

    Forest Recovery Digital Companion (FRDC) is an ML-assisted companion for ecologists to automatically classify surveyed trees via an Unmanned Aerial Vehicle (UAV).

    This package, FRDC-ML, is the Machine Learning backbone of this project: a centralized repository of tools and model architectures to be used in the FRDC pipeline.

    Get started here

    Other Projects

    FRDC-UI

    The User Interface Repository for FRDC, a WebApp GUI for ecologists to adjust annotations.

    Last modified: 26 June 2024

    preprocessing.extract_segments

    Functions

    extract_segments_from_labels

    Extracts segments from a label classification.

    extract_segments_from_bounds

    Extracts segments from Rect bounds.

    remove_small_segments_from_labels

    Removes small segments from a label classification.

    Extract with Boundaries

    A boundary is a Rect object that represents the minimum bounding box of a segment, with x0, y0, x1, y1 coordinates.

    It simply slices the original image to the bounding box. The origin is the top left corner of the image.

    [ASCII diagram: the Original Image (a 3 × 3 grid whose bottom row is | 7 | 8 | 9 |) is sliced by the bounds x0, y0, x1, y1 = 1, 2, 0, 2 into a Segmented Image whose bottom row is | 8 | 9 |. A second panel shows the uncropped variant, where the Segmented Image keeps the original 3 × 3 size and the bottom row becomes | 0 | 8 | 9 |.]
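
    In NumPy terms, with a made-up 3 × 3 image (the bounds below are illustrative and not the exact ones in the diagram; extract_segments_from_bounds works with Rect objects rather than raw slices):

    import numpy as np

    ar = np.arange(1, 10).reshape(3, 3)  # the 3 x 3 "original image", values 1-9

    x0, y0, x1, y1 = 1, 1, 3, 3          # an illustrative bounding box

    cropped = ar[y0:y1, x0:x1]           # cropped: only the selected cells remain

    uncropped = np.zeros_like(ar)        # uncropped: original size, zero-padded
    uncropped[y0:y1, x0:x1] = cropped

    print(cropped)    # [[5 6] [8 9]]
    print(uncropped)  # [[0 0 0] [0 5 6] [0 8 9]]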

    Extract with Labels

    A label classification is a np.ndarray where each pixel is mapped to a segment. The segments are mapped to a unique integer. In our project, the 0th label is the background.

    For example, a label classification of 3 segments will look like this:

    [ASCII diagram: a Label Classification grid shown next to the Original Image; the bottom row of the label classification is | 1 | 1 | 0 | while the corresponding row of the original image is | 7 | 8 | 9 |, with 0 marking the background.]
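
    A small NumPy stand-in (values illustrative) makes the idea concrete:

    import numpy as np

    ar = np.arange(1, 10).reshape(3, 3)      # original image
    ar_labels = np.array([[2, 2, 0],
                          [1, 2, 0],
                          [1, 1, 0]])        # 0 = background, 1 and 2 = segments

    # Each segment is simply the set of pixels that share a label value.
    for label in (1, 2):
        print(label, ar[ar_labels == label])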

    The extraction will take the minimum bounding box of each segment and return a list of segments.

    For example, the extracted images for labels 1 and 2 will be:

    [ASCII diagram: Extracted Segment 1 and Extracted Segment 2 shown side by side. When cropped, each segment is cut down to its minimum bounding box (e.g. Segment 1's bottom row is | 7 | 8 |); when uncropped, each segment keeps the original 3 × 3 size with everything outside the segment zeroed (e.g. Segment 1's bottom row is | 7 | 8 | 0 | and Segment 2's is | 0 | 0 | 0 |).]
    • If cropped is False, the segments are padded with 0s to the original image size. While this can ensure shape consistency, it can consume more memory for large images.

    • If cropped is True, the segments are cropped to the minimum bounding box. This can save memory, but the shape of the segments will be inconsistent.

    Usage

    Extract from Bounds and Labels

    Extract segments from bounds and labels.

    import numpy as np

    from frdc.load.preset import FRDCDatasetPreset
    from frdc.preprocess.extract_segments import extract_segments_from_bounds

    ds = FRDCDatasetPreset.chestnut_20201218()
    ar, order = ds._get_ar_bands()
    bounds, labels = ds._get_legacy_bounds_and_labels()

    segments: list[np.ndarray] = extract_segments_from_bounds(ar, bounds)

    Extract from Auto-Segmentation

    Extract segments from a label classification.

    from skimage.morphology import remove_small_objects, remove_small_holes
    import numpy as np

    ...
    ar_labels = remove_small_segments_from_labels(ar_labels,
                                                  min_height=10, min_width=10)
    segments: list[np.ndarray] = extract_segments_from_labels(ar, ar_labels)
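
    The diff elides the middle of that example. A fuller sketch, assuming the same chestnut_20201218 preset and the NIR threshold used on the preprocessing.morphology page, and assuming remove_small_segments_from_labels lives in the same module as the other functions on this page, could look like:

    import numpy as np

    from frdc.load.preset import FRDCDatasetPreset
    from frdc.preprocess.morphology import threshold_binary_mask, binary_watershed
    from frdc.preprocess.extract_segments import (
        extract_segments_from_labels,
        remove_small_segments_from_labels,
    )

    ds = FRDCDatasetPreset.chestnut_20201218()
    ar, order = ds._get_ar_bands()

    # Threshold the NIR band into a binary mask, then watershed it into labels.
    mask = threshold_binary_mask(ar, order.index('NIR'), 90 / 256)
    ar_labels = binary_watershed(mask)

    # Drop tiny segments before extraction (sizes here are illustrative).
    ar_labels = remove_small_segments_from_labels(ar_labels,
                                                  min_height=10, min_width=10)

    segments: list[np.ndarray] = extract_segments_from_labels(ar, ar_labels)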

    API

    extract_segments_from_labels(ar, ar_labels, cropped)

    Extracts segments from a label classification.


    ar_labels is a label classification as a np.ndarray

    extract_segments_from_bounds(ar, bounds, cropped)

    Extracts segments from Rect bounds.


    bounds is a list of Rect bounds.

    remove_small_segments_from_labels(ar_labels, min_height, min_width)

    Removes small segments from a label classification.


    Last modified: 26 June 2024

    preprocessing.glcm_padded

    Functions

    glcm_padded

    Computes the GLCM of the NDArray bands with padding.

    glcm_padded_cached

    Computes the GLCM of the NDArray bands with padding, and caches it.

    append_glcm_padded_cached

    Computes the GLCM of the NDArray bands with padding, caches it, and also appends it onto the original array.

    Usage

    We show a few examples of how to use the GLCM functions.

    import numpy as np
    from glcm_cupy import Features

    ...
    ar_glcm_cached_appended = append_glcm_padded_cached(ar, bin_from=1, bin_to=4,
                                                        radius=3)
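
    The middle of that snippet is elided by the diff. A minimal sketch producing the four arrays discussed below, assuming the functions are importable from frdc.preprocess.glcm_padded, a random input array, and an illustrative two-feature selection:

    import numpy as np
    from glcm_cupy import Features

    from frdc.preprocess.glcm_padded import (
        append_glcm_padded_cached, glcm_padded, glcm_padded_cached,
    )

    ar = np.random.rand(50, 25, 4)  # a random (H, W, C) stand-in for real bands

    # GLCM over all features.
    ar_glcm = glcm_padded(ar, bin_from=1, bin_to=4, radius=3)

    # GLCM restricted to two features (an illustrative choice).
    ar_glcm_2_features = glcm_padded(ar, bin_from=1, bin_to=4, radius=3,
                                     features=(Features.CONTRAST,
                                               Features.CORRELATION))

    # Cached variants: results are written under .cache/ at the project root.
    ar_glcm_cached = glcm_padded_cached(ar, bin_from=1, bin_to=4, radius=3)
    ar_glcm_cached_appended = append_glcm_padded_cached(ar, bin_from=1, bin_to=4,
                                                        radius=3)
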
    • ar_glcm is the GLCM of the original array, with the last dimension being the GLCM features. The number of features is determined by the features parameter, which defaults to all features.

    • ar_glcm_2_features selects only 2 features, with the last dimension being the 2 GLCM features specified.

    • ar_glcm_cached caches the GLCM so that if you call it again, it will return the cached version. It stores its data at the project root dir, under .cache/.

    • ar_glcm_cached_appended is a wrapper around ar_glcm_cached; it appends the GLCM features onto the original array. It's equivalent to calling ar_glcm_cached and then np.concatenate on the final axis.

    Caching

    GLCM is an expensive operation, thus we recommend caching it if the input parameters will be the same. This is especially useful if you're experimenting with the same dataset with constant parameters.

    API

    glcm_padded(ar, bin_from, bin_to, radius, step_size, features)

    Computes the GLCM of the NDArray bands with padding.


    • ar is the input array

    • bin_from is the upper bound of the input

    • bin_to is the upper bound of the GLCM input, i.e. the resolution that GLCM operates on

    • radius is the radius of the GLCM

    • step_size is the step size of the GLCM

    • features is the list of GLCM features to compute

    The return shape is (H, W, C, F), where F is the number of GLCM features computed.

    See glcm_cupy for the GLCM Features.

    glcm_padded_cached(ar, bin_from, bin_to, radius, step_size, features)

    Computes the GLCM of the NDArray bands with padding, and caches it.


    See glcm_padded for the parameters and output shape

    append_glcm_padded_cached(ar, bin_from, bin_to, radius, step_size, features)

    Computes the GLCM of the NDArray bands with padding, caches it, and also appends it onto the original array.


    See glcm_padded for the parameters


    The return shape is (H, W, C + C × F).

    The function automatically flattens the last 2 dimensions of the GLCM features, and appends it onto the original array.
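
    In NumPy shape terms (numbers purely illustrative), the flatten-and-append behaviour amounts to:

    import numpy as np

    H, W, C, F = 32, 32, 8, 7              # F = number of GLCM features
    ar = np.random.rand(H, W, C)           # original bands
    ar_glcm = np.random.rand(H, W, C, F)   # stand-in for the padded GLCM output

    # Flatten the last two dimensions (C, F) -> (C * F), then append to the bands.
    appended = np.concatenate([ar, ar_glcm.reshape(H, W, C * F)], axis=-1)
    print(appended.shape)                  # (32, 32, 8 + 8 * 7) == (32, 32, 64)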

    Last modified: 26 June 2024

    preprocessing.morphology

    Functions

    threshold_binary_mask

    Thresholds a selected NDArray band to yield a binary mask.

    binary_watershed

    Performs watershed on a binary mask to yield a mapped label classification

    Usage

    Perform auto-segmentation on a dataset to yield a label classification.

    from frdc.load.preset import FRDCDatasetPreset
    from frdc.preprocess.morphology import (
        threshold_binary_mask, binary_watershed
    )

    ds = FRDCDatasetPreset.chestnut_20201218()
    ar, order = ds._get_ar_bands()
    mask = threshold_binary_mask(ar, order.index('NIR'), 90 / 256)
    ar_label = binary_watershed(mask)

    API

    threshold_binary_mask(ar, band_idx, threshold_value)

    Thresholds a selected NDArray band to yield a binary mask as an np.ndarray.


    This is equivalent to

    ar[..., band_idx] > threshold_value
    binary_watershed(ar_mask, peaks_footprint, watershed_compactness)

    Performs watershed on a binary mask to yield a mapped label classification as a np.ndarray


    • peaks_footprint is the footprint of skimage.feature.peak_local_max

    • watershed_compactness is the compactness of skimage.morphology.watershed

    Last modified: 26 June 2024

    preprocessing.scale

    Functions

    scale_0_1_per_band

    Scales the NDArray bands to [0, 1] per band.

    scale_normal_per_band

    Scales the NDArray bands to zero mean unit variance per band.

    scale_static_per_band

    Scales the NDArray bands by a predefined configuration. Take a look at frdc.conf.BAND_MAX_CONFIG for an example.

    Usage

    from frdc.conf import BAND_MAX_CONFIG
    from frdc.load.preset import FRDCDatasetPreset
    from frdc.preprocess.scale import (
        scale_0_1_per_band, scale_normal_per_band, scale_static_per_band
    )

    ds = FRDCDatasetPreset.chestnut_20201218()
    ar, order = ds._get_ar_bands()

    ar_01 = scale_0_1_per_band(ar)
    ar_norm = scale_normal_per_band(ar)
    ar_static = scale_static_per_band(ar, order, BAND_MAX_CONFIG)
    Last modified: 26 June 2024

    Retrieve our Datasets

    In this tutorial, we'll learn how to:

    • Retrieve FRDC's datasets

    • Inspect the data

    • Integrate them with PyTorch's DataLoader

    • Visualize the data

    Prerequisites

    • New here? Get Started.

    • Set up the Google Cloud Authorization to download the data.

    Retrieve the Data

    To retrieve the data, use FRDCDatasetPreset. This module provides presets that load explicitly known datasets.

    For example:

    from frdc.load.preset import FRDCDatasetPreset

    ds = FRDCDatasetPreset.chestnut_20201218()
    for x, y in ds:
        print(x.shape, y)

    You should get something like this:

    (831, 700, 8) Falcataria Moluccana
    (540, 536, 8) Ficus Variegata
    (457, 660, 8) Bridelia Sp.
    ...
    • x is a torch.Tensor

    • y is a str.

    Iterate through the Data

    The dataset, when you load it, will be automatically segmented by bounds. Therefore, if you want to simply loop through the segments and labels, you can treat the dataset as an iterable.

    from frdc.load.preset import FRDCDatasetPreset

    ds = FRDCDatasetPreset.chestnut_20201218()
    for x, y in ds:
        print(x.shape, y)

    If you just want the segments or targets separately, use .ar_segments and .targets respectively.

    from frdc.load.preset import FRDCDatasetPreset

    ds = FRDCDatasetPreset.chestnut_20201218()

    for x in ds.ar_segments:
        print(x.shape)

    for y in ds.targets:
        print(y)

    If you want the entire image, use .ar.

    from frdc.load.preset import FRDCDatasetPreset

    ds = FRDCDatasetPreset.chestnut_20201218()
    ar = ds.ar

    Finally, inspect the order of the bands through the band_order attribute.

    from frdc.load.preset import FRDCDatasetPreset

    ds = FRDCDatasetPreset.chestnut_20201218()
    ds.band_order
    > ['WB', 'WG', 'WR', 'NB', 'NG', 'NR', 'RE', 'NIR']

    Using with PyTorch's DataLoader

    Every FRDCDataset is a Dataset object, so you can use it with PyTorch's DataLoader. This allows you to retrieve the data in batches!

    from torch.utils.data import DataLoader
    from torchvision.transforms.v2 import CenterCrop, Compose, Resize, ToImage

    ...
    for x, y in dl:
        print(x.shape, y)
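
    The construction of ds and dl is elided by the diff. A minimal sketch consistent with the output below (batch size 4, 100 × 100 crops), assuming the preset accepts a transform callable, could be:

    from torch.utils.data import DataLoader
    from torchvision.transforms.v2 import CenterCrop, Compose, Resize, ToImage

    from frdc.load.preset import FRDCDatasetPreset

    # Turn each segment into a fixed-size image tensor so it can be batched.
    transform = Compose([ToImage(), Resize(100), CenterCrop(100)])

    ds = FRDCDatasetPreset.chestnut_20201218(transform=transform)
    dl = DataLoader(ds, batch_size=4, shuffle=True)

    for x, y in dl:
        print(x.shape, y)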

    Which should output

    torch.Size([4, 8, 100, 100]) ('Falcataria Moluccana', ...)
    torch.Size([4, 8, 100, 100]) ('Clausena Excavata', ...)
    torch.Size([4, 8, 100, 100]) ('Clausena Excavata', ...)
    ...

    Plot the Data (Optional)

    We can then use these data to plot out the first tree segment.

    import matplotlib.pyplot as plt

    from frdc.load.preset import FRDCDatasetPreset

    ...
    plt.imshow(segment_0_rgb_scaled)
    plt.title(f"Tree {ds.targets[0]}")
    plt.show()

    See also: preprocessing.scale.scale_0_1_per_band

    Matplotlib cannot show the data correctly as-is, so we need to (see the sketch after this list):

    • Convert the data from BGR to RGB

    • Scale the data to 0-1 per band
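
    A minimal sketch of the elided middle of the plotting snippet, assuming the first three bands are the wideband B, G, R channels (as in band_order above) and using scale_0_1_per_band:

    import matplotlib.pyplot as plt

    from frdc.load.preset import FRDCDatasetPreset
    from frdc.preprocess.scale import scale_0_1_per_band

    ds = FRDCDatasetPreset.chestnut_20201218()
    segment_0 = ds.ar_segments[0]

    # The first three bands are B, G, R; reverse them to get RGB for plotting.
    segment_0_rgb = segment_0[..., [2, 1, 0]]

    # Matplotlib expects float RGB values in [0, 1], so scale each band.
    segment_0_rgb_scaled = scale_0_1_per_band(segment_0_rgb)

    plt.imshow(segment_0_rgb_scaled)
    plt.title(f"Tree {ds.targets[0]}")
    plt.show()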

    Last modified: 26 June 2024