From bf92c2c1ccfc8b072ab6d943149c6f9b20a24a45 Mon Sep 17 00:00:00 2001
From: Evening
Date: Tue, 26 Dec 2023 14:19:48 +0800
Subject: [PATCH 01/52] Remove unused evaluate script

---
 src/frdc/evaluate/__init__.py |  2 --
 src/frdc/evaluate/evaluate.py | 31 -------------------------------
 2 files changed, 33 deletions(-)
 delete mode 100644 src/frdc/evaluate/evaluate.py

diff --git a/src/frdc/evaluate/__init__.py b/src/frdc/evaluate/__init__.py
index 48354a4b..8b137891 100644
--- a/src/frdc/evaluate/__init__.py
+++ b/src/frdc/evaluate/__init__.py
@@ -1,3 +1 @@
-from .evaluate import dummy_evaluate
-__all__ = ["dummy_evaluate"]
diff --git a/src/frdc/evaluate/evaluate.py b/src/frdc/evaluate/evaluate.py
deleted file mode 100644
index a0ccfbed..00000000
--- a/src/frdc/evaluate/evaluate.py
+++ /dev/null
@@ -1,31 +0,0 @@
-from typing import Callable
-
-import numpy as np
-from sklearn.base import ClassifierMixin
-
-
-def dummy_evaluate(
-    *,
-    feature_extraction: Callable[[np.ndarray], np.ndarray],
-    classifier: ClassifierMixin,
-    X_test: np.ndarray,
-    y_test: np.ndarray,
-) -> float:
-    """Dummy Evaluation function.
-
-    Notes:
-        This is obviously not final. This is just a placeholder to get the
-        pipeline working.
-
-    Args:
-        feature_extraction: The feature extraction function.
-        classifier: The classifier.
-        X_test: X_test is the test image numpy array of shape (N, H, W, C).
-        y_test: y_test is the test class label, a numpy array of shape (N,).
-
-    Returns:
-        The score of the model.
-    """
-    # TODO: Replace this with how the model scores
-
-    return classifier.score(feature_extraction(X_test), y_test)

From 77ba78a33bed09b7bf4051e889795634311cda20 Mon Sep 17 00:00:00 2001
From: Evening
Date: Tue, 26 Dec 2023 14:20:44 +0800
Subject: [PATCH 02/52] Make GCS error clearer

---
 src/frdc/conf.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/frdc/conf.py b/src/frdc/conf.py
index 89986988..5098025d 100644
--- a/src/frdc/conf.py
+++ b/src/frdc/conf.py
@@ -56,6 +56,8 @@
 except Exception as e:
     logger.warning(
         "Could not connect to GCS. Will not be able to download files. "
+        "Check that you've (1) Installed the GCS CLI and (2) Set up the "
+        "ADC with `gcloud auth application-default login`. "
         "GCS_CLIENT will be None."
     )
     GCS_CLIENT = None

From 5c4a36cfbd672593293dc07ebe1fd30c9f762f63 Mon Sep 17 00:00:00 2001
From: Evening
Date: Tue, 26 Dec 2023 14:20:51 +0800
Subject: [PATCH 03/52] Fix missing default on exception

---
 src/frdc/conf.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/frdc/conf.py b/src/frdc/conf.py
index 5098025d..e2f5958a 100644
--- a/src/frdc/conf.py
+++ b/src/frdc/conf.py
@@ -61,6 +61,7 @@
         "GCS_CLIENT will be None."
) GCS_CLIENT = None + GCS_BUCKET = None try: logger.info("Connecting to Label Studio...") From d46f4e3fb56ae6b5becd2ba81389fd9dd3119e2b Mon Sep 17 00:00:00 2001 From: Evening Date: Tue, 26 Dec 2023 14:21:03 +0800 Subject: [PATCH 04/52] Add dev container spec --- .devcontainer/devcontainer.json | 16 ++++++++++++++++ Dockerfile | 19 +++++++++++++++++++ 2 files changed, 35 insertions(+) create mode 100644 .devcontainer/devcontainer.json create mode 100644 Dockerfile diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json new file mode 100644 index 00000000..9b0313b8 --- /dev/null +++ b/.devcontainer/devcontainer.json @@ -0,0 +1,16 @@ +{ + "name": "frdc", + "build": { + "context": "../", + "dockerfile": "Dockerfile", + }, + "containerEnv": { + "LABEL_STUDIO_HOST": "host.docker.internal", + }, + "runArgs": [ + "--gpus=all", + ], + "hostRequirements": { + "gpu": true, + } +} \ No newline at end of file diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 00000000..e1378f66 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,19 @@ +FROM pytorch/pytorch:2.1.2-cuda12.1-cudnn8-runtime as torch +WORKDIR /devcontainer + +COPY ./pyproject.toml /devcontainer/pyproject.toml + +RUN apt-get update +RUN apt-get install git -y + +RUN pip3 install --upgrade pip && \ + pip3 install poetry + +RUN conda init bash \ + && . ~/.bashrc \ + && conda activate base \ + && poetry config virtualenvs.create false \ + && poetry install --with dev --no-interaction --no-ansi + +RUN apt-get install curl -y && curl -sSL https://sdk.cloud.google.com | bash +ENV PATH $PATH:/root/google-cloud-sdk/bin \ No newline at end of file From c2ba141e24d02c47d88691ca199087c63a553c4b Mon Sep 17 00:00:00 2001 From: Evening Date: Tue, 26 Dec 2023 14:22:54 +0800 Subject: [PATCH 05/52] Delete rsc.dvc --- rsc.dvc | 424 -------------------------------------------------------- 1 file changed, 424 deletions(-) delete mode 100644 rsc.dvc diff --git a/rsc.dvc b/rsc.dvc deleted file mode 100644 index f9ae0e31..00000000 --- a/rsc.dvc +++ /dev/null @@ -1,424 +0,0 @@ -outs: -- hash: md5 - path: rsc - files: - - relpath: DEBUG/0/bounds.csv - md5: ad253d8dfe0cbbc8107647b11d9a2ce2 - size: 325 - cloud: - frdc-ds: - etag: 08b6ab869c8de682031001 - version_id: '1701154195084726' - - relpath: DEBUG/0/result.jpg - md5: 36e390821a45f70fc442bbd5d881f495 - size: 7361 - cloud: - frdc-ds: - etag: 08ccc6cc8191e682031001 - version_id: '1701155213353804' - - relpath: DEBUG/0/result.tif - md5: c700d23b1332113e74051d0b0d187d6e - size: 60154 - cloud: - frdc-ds: - etag: 0893da859c8de682031001 - version_id: '1701154195074323' - - relpath: DEBUG/0/result_Blue.tif - md5: 663cb540356deb8811d0b779e4435bf5 - size: 60134 - cloud: - frdc-ds: - etag: 08fada859c8de682031001 - version_id: '1701154195074426' - - relpath: DEBUG/0/result_Green.tif - md5: 2643f64a2e775d4ee95583e108ab5e9c - size: 60134 - cloud: - frdc-ds: - etag: 08bc99869c8de682031001 - version_id: '1701154195082428' - - relpath: DEBUG/0/result_NIR.tif - md5: 946c2eedf7e658fd9924b0a99b3282fa - size: 60134 - cloud: - frdc-ds: - etag: 08879c869c8de682031001 - version_id: '1701154195082759' - - relpath: DEBUG/0/result_Red.tif - md5: f566accc812ae46a0147b0078d0f4411 - size: 60134 - cloud: - frdc-ds: - etag: 0884fa869c8de682031001 - version_id: '1701154195094788' - - relpath: DEBUG/0/result_RedEdge.tif - md5: ab998dba488a0fd7c362fab04c1b4034 - size: 60134 - cloud: - frdc-ds: - etag: 08a98c869c8de682031001 - version_id: '1701154195080745' - - relpath: casuarina/20220418/183deg/dsm.tif - md5: 
43260454e85e6f5795a5c17213de3334 - size: 36290602 - cloud: - frdc-ds: - etag: 08fd818aa08de682031001 - version_id: '1701154203533565' - - relpath: casuarina/20220418/183deg/gsddsm.tif - md5: f67e3c112989145af0cc0b5a0478fa22 - size: 98938 - cloud: - frdc-ds: - etag: 08ceb6879c8de682031001 - version_id: '1701154195102542' - - relpath: casuarina/20220418/183deg/result.jpg - md5: ceae7eab3f8a39485e3b4f5c5b7d47b3 - size: 13379694 - cloud: - frdc-ds: - etag: 08be83e98191e682031001 - version_id: '1701155213820350' - - relpath: casuarina/20220418/183deg/result.tif - md5: e355405f060d55a6b7374b3542fdc3c1 - size: 197880775 - cloud: - frdc-ds: - etag: 08f2c3ebaf8de682031001 - version_id: '1701154236588530' - - relpath: casuarina/20220418/183deg/result_Blue.tif - md5: 03bf7028f6527db900b95d7621463cce - size: 223640924 - cloud: - frdc-ds: - etag: 08b4bf9eac8de682031001 - version_id: '1701154229034932' - - relpath: casuarina/20220418/183deg/result_Green.tif - md5: d475aef93075d7375f38f73895d5fdb8 - size: 226577241 - cloud: - frdc-ds: - etag: 088b9682ac8de682031001 - version_id: '1701154228570891' - - relpath: casuarina/20220418/183deg/result_NIR.tif - md5: 2c9cc4d48cf9316dea3a6f536fe5a3ac - size: 222016430 - cloud: - frdc-ds: - etag: 089be4c1ab8de682031001 - version_id: '1701154227515931' - - relpath: casuarina/20220418/183deg/result_Red.tif - md5: 73977a9d21760bbbcc8b42f79bcb4692 - size: 226754875 - cloud: - frdc-ds: - etag: 08e99287ac8de682031001 - version_id: '1701154228652393' - - relpath: casuarina/20220418/183deg/result_RedEdge.tif - md5: 3f04518f4d8a35e2efdc3c3d6187860f - size: 223730363 - cloud: - frdc-ds: - etag: 08ffbea1ac8de682031001 - version_id: '1701154229084031' - - relpath: casuarina/20220418/183deg/segment.tif - md5: 830f7ca57c34ac3a0eb6a852cb272373 - size: 26708 - cloud: - frdc-ds: - etag: 08ccdd859c8de682031001 - version_id: '1701154195074764' - - relpath: casuarina/20220418/93deg/dsm.tif - md5: 5fbd6841a0b7426bd01d4a3b68d06a9c - size: 36780878 - cloud: - frdc-ds: - etag: 08f8f3c9a08de682031001 - version_id: '1701154204580344' - - relpath: casuarina/20220418/93deg/gsddsm.tif - md5: ac659a3f17b2e21dc852a04af246906b - size: 91346 - cloud: - frdc-ds: - etag: 08c798869c8de682031001 - version_id: '1701154195082311' - - relpath: casuarina/20220418/93deg/result.jpg - md5: e0f493e7704cf377b4f41132313950d1 - size: 13131131 - cloud: - frdc-ds: - etag: 08bffae48191e682031001 - version_id: '1701155213753663' - - relpath: casuarina/20220418/93deg/result.tif - md5: 83feb031787b457f5017db9acdebf625 - size: 195394032 - cloud: - frdc-ds: - etag: 088df3e1aa8de682031001 - version_id: '1701154225944973' - - relpath: casuarina/20220418/93deg/result_Blue.tif - md5: d97ce2b5a44e896810e6401b4f5ab506 - size: 220670499 - cloud: - frdc-ds: - etag: 08d5d5b0b28de682031001 - version_id: '1701154241915605' - - relpath: casuarina/20220418/93deg/result_Green.tif - md5: 40ef3a8c81ba28366ba47a1fa4989702 - size: 223797748 - cloud: - frdc-ds: - etag: 08d3ec8cac8de682031001 - version_id: '1701154228745811' - - relpath: casuarina/20220418/93deg/result_NIR.tif - md5: a2ce084c7ba1c5f5a6b0b4a1dc93a98c - size: 219316999 - cloud: - frdc-ds: - etag: 08f4b0fdab8de682031001 - version_id: '1701154228492404' - - relpath: casuarina/20220418/93deg/result_Red.tif - md5: 65144fb2a827f6dd1723538afd375902 - size: 224060322 - cloud: - frdc-ds: - etag: 0885ce9fac8de682031001 - version_id: '1701154229053189' - - relpath: casuarina/20220418/93deg/result_RedEdge.tif - md5: ceda32971f11a11af00a4f1838475e16 - size: 221188044 - cloud: - frdc-ds: 
- etag: 08f8dceaab8de682031001 - version_id: '1701154228186744' - - relpath: casuarina/20220418/93deg/segment.tif - md5: ea46f1c799137aa0fd0256b20c5a66d6 - size: 25383 - cloud: - frdc-ds: - etag: 0881da859c8de682031001 - version_id: '1701154195074305' - - relpath: chestnut_nature_park/20201218/bounds.csv - md5: d7043890368db04693865c2451c35a1f - size: 1408 - cloud: - frdc-ds: - etag: 08e98f869c8de682031001 - version_id: '1701154195081193' - - relpath: chestnut_nature_park/20201218/dsm.tif - md5: fbf608de495c51dace8bda86cadb4b65 - size: 14482537 - cloud: - frdc-ds: - etag: 08b8a1c79c8de682031001 - version_id: '1701154196148408' - - relpath: chestnut_nature_park/20201218/result.jpg - md5: 91c1cff8671c332b20e3ca128431d6d7 - size: 4780586 - cloud: - frdc-ds: - etag: 08ddf0c49c8de682031001 - version_id: '1701154196109405' - - relpath: chestnut_nature_park/20201218/result.tif - md5: d08c5b89a54b24ae85dd5d15e2a462fb - size: 80415617 - cloud: - frdc-ds: - etag: 08fafec5a58de682031001 - version_id: '1701154215001978' - - relpath: chestnut_nature_park/20201218/result_Blue.tif - md5: ad5eacd90476ad6a56111009f958a30b - size: 95205235 - cloud: - frdc-ds: - etag: 08cebef79b8de682031001 - version_id: '1701154194841422' - - relpath: chestnut_nature_park/20201218/result_Green.tif - md5: 5c85ec286f81edecaa5f2978086a6f79 - size: 96849701 - cloud: - frdc-ds: - etag: 08d2eff6a08de682031001 - version_id: '1701154205317074' - - relpath: chestnut_nature_park/20201218/result_NIR.tif - md5: fa7df3dece555c289aea03c2c996195d - size: 94504684 - cloud: - frdc-ds: - etag: 08b180c2a68de682031001 - version_id: '1701154217033777' - - relpath: chestnut_nature_park/20201218/result_Red.tif - md5: 09b07ef8678e8f4ce058fd091760eab6 - size: 96915726 - cloud: - frdc-ds: - etag: 088cd0d0a18de682031001 - version_id: '1701154206787596' - - relpath: chestnut_nature_park/20201218/result_RedEdge.tif - md5: 097c3c39629534dcdcb8f0b645ad9736 - size: 95540119 - cloud: - frdc-ds: - etag: 08ffa289a78de682031001 - version_id: '1701154218201471' - - relpath: chestnut_nature_park/20210510/90deg43m85pct255deg/bounds.csv - md5: 58f03982df33a8b52c61e51daf6b450b - size: 1452 - cloud: - frdc-ds: - etag: 08b2b8859c8de682031001 - version_id: '1701154195070002' - - relpath: chestnut_nature_park/20210510/90deg43m85pct255deg/dsm.tif - md5: 041257e74edb3d9719debc3d9fe7d4dd - size: 20773786 - cloud: - frdc-ds: - etag: 08f7b2809f8de682031001 - version_id: '1701154201278839' - - relpath: chestnut_nature_park/20210510/90deg43m85pct255deg/gsddsm.tif - md5: 45b28f2f3735de83d65aea19e2697676 - size: 45878 - cloud: - frdc-ds: - etag: 08f691869c8de682031001 - version_id: '1701154195081462' - - relpath: chestnut_nature_park/20210510/90deg43m85pct255deg/result.jpg - md5: d012b232cc1faed2871ebf5150d20cc1 - size: 6941116 - cloud: - frdc-ds: - etag: 08cfbae38191e682031001 - version_id: '1701155213729103' - - relpath: chestnut_nature_park/20210510/90deg43m85pct255deg/result.tif - md5: 43541fa1f15171ba8548fb9b6ae77cfe - size: 103859274 - cloud: - frdc-ds: - etag: 089b99c3a78de682031001 - version_id: '1701154219150491' - - relpath: chestnut_nature_park/20210510/90deg43m85pct255deg/result_Blue.tif - md5: 87bef8f7bff639951ffa452f6ef9bc3f - size: 117714429 - cloud: - frdc-ds: - etag: 08d8b18ca88de682031001 - version_id: '1701154220349656' - - relpath: chestnut_nature_park/20210510/90deg43m85pct255deg/result_Green.tif - md5: c01de4a50b5caa01ace693e5191b3870 - size: 119925274 - cloud: - frdc-ds: - etag: 08bdfcbaa78de682031001 - version_id: '1701154219015741' - - relpath: 
chestnut_nature_park/20210510/90deg43m85pct255deg/result_NIR.tif - md5: 2d842ef0d2f43e5269c1fc456aeb746d - size: 117211078 - cloud: - frdc-ds: - etag: 08fb9adaa78de682031001 - version_id: '1701154219527547' - - relpath: chestnut_nature_park/20210510/90deg43m85pct255deg/result_Red.tif - md5: 214d97f9eacf93ce29c6898858b7f146 - size: 119913392 - cloud: - frdc-ds: - etag: 08e5ecdda78de682031001 - version_id: '1701154219587173' - - relpath: chestnut_nature_park/20210510/90deg43m85pct255deg/result_RedEdge.tif - md5: b45f71ffe2c14280100e22c23ba871ab - size: 118263239 - cloud: - frdc-ds: - etag: 089798e6a88de682031001 - version_id: '1701154221820951' - - relpath: chestnut_nature_park/20210510/90deg43m85pct255deg/segment.tif - md5: 86191f2ab6e7374ce98c83491ce43527 - size: 10021 - cloud: - frdc-ds: - etag: 08b5e5859c8de682031001 - version_id: '1701154195075765' - - relpath: chestnut_nature_park/20210510/90deg60m84.5pct255deg/dsm.tif - md5: 99fa34f9ca8dc66d95b9b2fb144b334f - size: 10857218 - cloud: - frdc-ds: - etag: 08dc84e09d8de682031001 - version_id: '1701154198651484' - - relpath: chestnut_nature_park/20210510/90deg60m84.5pct255deg/gsddsm.tif - md5: 09f4143d1083c743f6a2bbed2dba5d81 - size: 55570 - cloud: - frdc-ds: - etag: 089e95869c8de682031001 - version_id: '1701154195081886' - - relpath: chestnut_nature_park/20210510/90deg60m84.5pct255deg/result.jpg - md5: cfc21b5bf466b57e888733f37d922017 - size: 3243560 - cloud: - frdc-ds: - etag: 08edc4db8191e682031001 - version_id: '1701155213599341' - - relpath: chestnut_nature_park/20210510/90deg60m84.5pct255deg/result.tif - md5: 6b99351bd16f05fafc2f96e2945ec094 - size: 51407149 - cloud: - frdc-ds: - etag: 08fca18da48de682031001 - version_id: '1701154211975420' - - relpath: chestnut_nature_park/20210510/90deg60m84.5pct255deg/result_Blue.tif - md5: c240b4e607af2cdadc3d4fc96181e005 - size: 59362235 - cloud: - frdc-ds: - etag: 08b3d484a58de682031001 - version_id: '1701154213931571' - - relpath: chestnut_nature_park/20210510/90deg60m84.5pct255deg/result_Green.tif - md5: 08d7b7e3e4823b0cf9c53692d6b55f5b - size: 60380606 - cloud: - frdc-ds: - etag: 08ebe3a2a58de682031001 - version_id: '1701154214425067' - - relpath: chestnut_nature_park/20210510/90deg60m84.5pct255deg/result_NIR.tif - md5: ab3509fac188fe452d01957237181f31 - size: 59034799 - cloud: - frdc-ds: - etag: 0893bcfca48de682031001 - version_id: '1701154213797395' - - relpath: chestnut_nature_park/20210510/90deg60m84.5pct255deg/result_Red.tif - md5: 6b0531431f0d898aacda6334c10fca8a - size: 60369184 - cloud: - frdc-ds: - etag: 08e8a58ea58de682031001 - version_id: '1701154214089448' - - relpath: chestnut_nature_park/20210510/90deg60m84.5pct255deg/result_RedEdge.tif - md5: 75e67bc7c1b01e060c68062354192b8e - size: 59567305 - cloud: - frdc-ds: - etag: 08f0dc91ad8de682031001 - version_id: '1701154230922864' - - relpath: chestnut_nature_park/20210510/90deg60m84.5pct255deg/segment.tif - md5: 2b5856c859b44e54517b7891fd53bd60 - size: 12784 - cloud: - frdc-ds: - etag: 08838e869c8de682031001 - version_id: '1701154195080963' - - relpath: chestnut_nature_park/20210510/Join/adding -90deg 60m data/cloud.las - md5: a4755ad17e0c0f2686a1fb9cacc712b7 - size: 74248398 - cloud: - frdc-ds: - etag: 0892daa1a48de682031001 - version_id: '1701154212310290' - - relpath: chestnut_nature_park/20210510/Join/cloud.las - md5: 8ce9bccccab90a8a5c17ad5b3a982f0d - size: 61187376 - cloud: - frdc-ds: - etag: 08fda5a8a58de682031001 - version_id: '1701154214515453' From 276fa174e0dcac7def581b0e715c97dcc875f760 Mon Sep 17 00:00:00 2001 From: 
Evening Date: Tue, 26 Dec 2023 14:41:50 +0800 Subject: [PATCH 06/52] Get api key from host --- .devcontainer/devcontainer.json | 1 + 1 file changed, 1 insertion(+) diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 9b0313b8..ca281b81 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -6,6 +6,7 @@ }, "containerEnv": { "LABEL_STUDIO_HOST": "host.docker.internal", + "LABEL_STUDIO_API_KEY": "${localEnv:LABEL_STUDIO_API_KEY}", }, "runArgs": [ "--gpus=all", From 70b275ebb3a9b9bed4559a851e437d306c0226f6 Mon Sep 17 00:00:00 2001 From: Evening Date: Tue, 26 Dec 2023 15:10:40 +0800 Subject: [PATCH 07/52] Add missing lightning dep --- Dockerfile | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index e1378f66..9951dc95 100644 --- a/Dockerfile +++ b/Dockerfile @@ -7,7 +7,8 @@ RUN apt-get update RUN apt-get install git -y RUN pip3 install --upgrade pip && \ - pip3 install poetry + pip3 install poetry && \ + pip3 install lightning RUN conda init bash \ && . ~/.bashrc \ @@ -16,4 +17,4 @@ RUN conda init bash \ && poetry install --with dev --no-interaction --no-ansi RUN apt-get install curl -y && curl -sSL https://sdk.cloud.google.com | bash -ENV PATH $PATH:/root/google-cloud-sdk/bin \ No newline at end of file +ENV PATH $PATH:/root/google-cloud-sdk/bin From a1d79c158cfc927cca4a7319e54609e50e64de84 Mon Sep 17 00:00:00 2001 From: Evening Date: Tue, 26 Dec 2023 15:10:53 +0800 Subject: [PATCH 08/52] Add uncommentable local W&B setup --- tests/model_tests/chestnut_dec_may/train.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/model_tests/chestnut_dec_may/train.py b/tests/model_tests/chestnut_dec_may/train.py index 9c2b3c96..f31ee825 100644 --- a/tests/model_tests/chestnut_dec_may/train.py +++ b/tests/model_tests/chestnut_dec_may/train.py @@ -4,7 +4,10 @@ the 20210510 dataset. """ -import os +# Uncomment this to run the W&B monitoring locally +# import os +# os.environ["WANDB_MODE"] = "offline" + from pathlib import Path import lightning as pl From 5d457abd3c17cf7d1884b3d87bd7b891741b7f5c Mon Sep 17 00:00:00 2001 From: Evening Date: Tue, 26 Dec 2023 15:46:13 +0800 Subject: [PATCH 09/52] Update getting started docs for dev container --- Writerside/topics/Getting-Started.md | 125 ++++++++++++++++++----- Writerside/writerside.cfg | 2 +- docs/HelpTOC.json | 2 +- docs/custom-k-aug-dataloaders.html | 6 +- docs/getting-started.html | 24 +++-- docs/icon-192.png | Bin 0 -> 337 bytes docs/icon-512.png | Bin 0 -> 1103 bytes docs/load-dataset.html | 8 +- docs/load-gcs.html | 6 +- docs/mix-match-module.html | 12 +-- docs/mix-match.html | 2 +- docs/model-test-chestnut-may-dec.html | 2 +- docs/overview.html | 2 +- docs/preprocessing-extract-segments.html | 16 +-- docs/preprocessing-glcm-padded.html | 4 +- docs/preprocessing-morphology.html | 6 +- docs/preprocessing-scale.html | 4 +- docs/retrieve-our-datasets.html | 10 +- docs/site.webmanifest | 11 ++ docs/train-frdc-lightning.html | 4 +- 20 files changed, 167 insertions(+), 79 deletions(-) create mode 100644 docs/icon-192.png create mode 100644 docs/icon-512.png create mode 100644 docs/site.webmanifest diff --git a/Writerside/topics/Getting-Started.md b/Writerside/topics/Getting-Started.md index 7615174d..10fe4398 100644 --- a/Writerside/topics/Getting-Started.md +++ b/Writerside/topics/Getting-Started.md @@ -10,7 +10,7 @@ Start by cloning our repository. 
- git clone https://github.com/Forest-Recovery-Digital-Companion/FRDC-ML.git + git clone https://github.com/FR-DC/FRDC-ML.git Then, create a Python Virtual Env pyvenv @@ -60,6 +60,26 @@ + + + Only use Dev. Containers if you're familiar with your IDEs, it's highly + dependent on clicking around the IDE. + + Do not set up a new environment, it'll be included in the environment. + + Ensure that you have installed pre-requisites for respective IDEs. + VSCode + IntelliJ + + Start by cloning our repository. + + git clone https://github.com/FR-DC/FRDC-ML.git + + + Follow steps for respective IDEs to set up the Dev. Container. + Activate the virtual environment. The venv is located in /opt/venv + + We use Google Cloud to store our datasets. To set up Google Cloud, @@ -86,6 +106,49 @@ + + This is only necessary if any task requires Label Studio annotations + + We use Label Studio to annotate our datasets. + We won't go through how to install Label Studio, for contributors, it + should be up on localhost:8080. + + + Then, retrieve your own API key from Label Studio. + Go to your account page + and copy the API key.
+ Set your API key as an environment variable. + + + In Windows, go to "Edit environment variables for + your account" and add this as a new environment variable with name + LABEL_STUDIO_API_KEY. + + + Export it as an environment variable. + export LABEL_STUDIO_API_KEY=... + + + +
+ + + + + We use W&B to track our experiments. To set up W&B, + + install the W&B CLI + + + + Then, + + authenticate your account + . + wandb login + + + This is optional but recommended. Pre-commit hooks are a way to ensure that your code is formatted correctly. @@ -98,30 +161,45 @@ - + Run the tests to make sure everything is working pytest - - In case of errors: - - - If you get this error, it means that you haven't authenticated your - Google Cloud account. - See Setting Up Google Cloud - - - If you get this error, it means that you haven't installed the - dependencies. - See Installing the Dev. Environment - - - +## Troubleshooting + +### ModuleNotFoundError + +It's likely that your `src` and `tests` directories are not in `PYTHONPATH`. +To fix this, run the following command: + +```shell +export PYTHONPATH=$PYTHONPATH:./src:./tests +``` + +Or, set it in your IDE, for example, IntelliJ allows setting directories as +**Source Roots**. + +### google.auth.exceptions.DefaultCredentialsError + +It's likely that you haven't authenticated your Google Cloud account. +See [Setting Up Google Cloud](#gcloud) + +### Couldn't connect to Label Studio + +Label Studio must be running locally, exposed on `localhost:8080`. Furthermore, +you need to specify the `LABEL_STUDIO_API_KEY` environment variable. See +[Setting Up Label Studio](#ls) + +### Cannot login to W&B + +You need to authenticate your W&B account. See [Setting Up Weight and Biases](#wandb) +If you're facing difficulties, set the `WANDB_MODE` environment variable to `offline` +to disable W&B. ## Our Repository Structure @@ -132,7 +210,6 @@ help you understand where to put your code. graph LR FRDC -- " Core Dependencies " --> src/frdc/ FRDC -- " Resources " --> rsc/ - FRDC -- " Pipeline " --> pipeline/ FRDC -- " Tests " --> tests/ FRDC -- " Repo Dependencies " --> pyproject.toml,poetry.lock src/frdc/ -- " Dataset Loaders " --> ./load/ @@ -140,7 +217,6 @@ graph LR src/frdc/ -- " Train Deps " --> ./train/ src/frdc/ -- " Model Architectures " --> ./models/ rsc/ -- " Datasets ... " --> ./dataset_name/ - pipeline/ -- " Model Training Pipeline " --> ./model_tests/ ``` src/frdc/ @@ -149,11 +225,8 @@ src/frdc/ rsc/ : Resources. These are usually cached datasets -pipeline/ -: Pipeline code. These are the full ML tests of our pipeline. - tests/ -: PyTest tests. These are unit tests & integration tests. +: PyTest tests. These are unit, integration, and model tests. ### Unit, Integration, and Pipeline Tests @@ -161,7 +234,7 @@ We have 3 types of tests: - Unit Tests are usually small, single function tests. - Integration Tests are larger tests that tests a mock pipeline. -- Pipeline Tests are the true production pipeline tests that will generate a +- Model Tests are the true production pipeline tests that will generate a model. ### Where Should I contribute? @@ -176,9 +249,9 @@ at the src/frdc/ directory. By adding a new component, you'll need to add a new test. Take a look at the tests/ directory. - + If you're a ML Researcher, you'll probably be changing the pipeline. Take a -look at the pipeline/ directory. +look at the tests/model_tests/ directory. 
If you're adding a new dependency, use poetry add PACKAGE and diff --git a/Writerside/writerside.cfg b/Writerside/writerside.cfg index 39e81f21..9e1b0444 100644 --- a/Writerside/writerside.cfg +++ b/Writerside/writerside.cfg @@ -4,5 +4,5 @@ - + \ No newline at end of file diff --git a/docs/HelpTOC.json b/docs/HelpTOC.json index 54d3f877..107c112d 100644 --- a/docs/HelpTOC.json +++ b/docs/HelpTOC.json @@ -1 +1 @@ -{"entities":{"pages":{"Overview":{"id":"Overview","title":"Overview","url":"overview.html","level":0,"tabIndex":0},"Getting-Started":{"id":"Getting-Started","title":"Getting Started","url":"getting-started.html","level":0,"tabIndex":1},"ae6f1f90_3454":{"id":"ae6f1f90_3454","title":"Tutorials","level":0,"pages":["Retrieve-our-Datasets"],"tabIndex":2},"Retrieve-our-Datasets":{"id":"Retrieve-our-Datasets","title":"Retrieve our Datasets","url":"retrieve-our-datasets.html","level":1,"parentId":"ae6f1f90_3454","tabIndex":0},"mix-match":{"id":"mix-match","title":"MixMatch","url":"mix-match.html","level":0,"pages":["mix-match-module","custom-k-aug-dataloaders"],"tabIndex":3},"mix-match-module":{"id":"mix-match-module","title":"MixMatch Module","url":"mix-match-module.html","level":1,"parentId":"mix-match","tabIndex":0},"custom-k-aug-dataloaders":{"id":"custom-k-aug-dataloaders","title":"Custom K-Aug Dataloaders","url":"custom-k-aug-dataloaders.html","level":1,"parentId":"mix-match","tabIndex":1},"ae6f1f90_3459":{"id":"ae6f1f90_3459","title":"Model Tests","level":0,"pages":["Model-Test-Chestnut-May-Dec"],"tabIndex":4},"Model-Test-Chestnut-May-Dec":{"id":"Model-Test-Chestnut-May-Dec","title":"Model Test Chestnut May-Dec","url":"model-test-chestnut-may-dec.html","level":1,"parentId":"ae6f1f90_3459","tabIndex":0},"ae6f1f90_3461":{"id":"ae6f1f90_3461","title":"API","level":0,"pages":["load.dataset","load.gcs","preprocessing.scale","preprocessing.extract_segments","preprocessing.morphology","preprocessing.glcm_padded","train.frdc_lightning"],"tabIndex":5},"load.dataset":{"id":"load.dataset","title":"load.dataset","url":"load-dataset.html","level":1,"parentId":"ae6f1f90_3461","tabIndex":0},"load.gcs":{"id":"load.gcs","title":"load.gcs","url":"load-gcs.html","level":1,"parentId":"ae6f1f90_3461","tabIndex":1},"preprocessing.scale":{"id":"preprocessing.scale","title":"preprocessing.scale","url":"preprocessing-scale.html","level":1,"parentId":"ae6f1f90_3461","tabIndex":2},"preprocessing.extract_segments":{"id":"preprocessing.extract_segments","title":"preprocessing.extract_segments","url":"preprocessing-extract-segments.html","level":1,"parentId":"ae6f1f90_3461","tabIndex":3},"preprocessing.morphology":{"id":"preprocessing.morphology","title":"preprocessing.morphology","url":"preprocessing-morphology.html","level":1,"parentId":"ae6f1f90_3461","tabIndex":4},"preprocessing.glcm_padded":{"id":"preprocessing.glcm_padded","title":"preprocessing.glcm_padded","url":"preprocessing-glcm-padded.html","level":1,"parentId":"ae6f1f90_3461","tabIndex":5},"train.frdc_lightning":{"id":"train.frdc_lightning","title":"train.frdc_datamodule \u0026 frdc_module","url":"train-frdc-lightning.html","level":1,"parentId":"ae6f1f90_3461","tabIndex":6}}},"topLevelIds":["Overview","Getting-Started","ae6f1f90_3454","mix-match","ae6f1f90_3459","ae6f1f90_3461"]} \ No newline at end of file +{"entities":{"pages":{"Overview":{"id":"Overview","title":"Overview","url":"overview.html","level":0,"tabIndex":0},"Getting-Started":{"id":"Getting-Started","title":"Getting 
Started","url":"getting-started.html","level":0,"tabIndex":1},"e8e19623_38829":{"id":"e8e19623_38829","title":"Tutorials","level":0,"pages":["Retrieve-our-Datasets"],"tabIndex":2},"Retrieve-our-Datasets":{"id":"Retrieve-our-Datasets","title":"Retrieve our Datasets","url":"retrieve-our-datasets.html","level":1,"parentId":"e8e19623_38829","tabIndex":0},"mix-match":{"id":"mix-match","title":"MixMatch","url":"mix-match.html","level":0,"pages":["mix-match-module","custom-k-aug-dataloaders"],"tabIndex":3},"mix-match-module":{"id":"mix-match-module","title":"MixMatch Module","url":"mix-match-module.html","level":1,"parentId":"mix-match","tabIndex":0},"custom-k-aug-dataloaders":{"id":"custom-k-aug-dataloaders","title":"Custom K-Aug Dataloaders","url":"custom-k-aug-dataloaders.html","level":1,"parentId":"mix-match","tabIndex":1},"e8e19623_38834":{"id":"e8e19623_38834","title":"Model Tests","level":0,"pages":["Model-Test-Chestnut-May-Dec"],"tabIndex":4},"Model-Test-Chestnut-May-Dec":{"id":"Model-Test-Chestnut-May-Dec","title":"Model Test Chestnut May-Dec","url":"model-test-chestnut-may-dec.html","level":1,"parentId":"e8e19623_38834","tabIndex":0},"e8e19623_38836":{"id":"e8e19623_38836","title":"API","level":0,"pages":["load.dataset","load.gcs","preprocessing.scale","preprocessing.extract_segments","preprocessing.morphology","preprocessing.glcm_padded","train.frdc_lightning"],"tabIndex":5},"load.dataset":{"id":"load.dataset","title":"load.dataset","url":"load-dataset.html","level":1,"parentId":"e8e19623_38836","tabIndex":0},"load.gcs":{"id":"load.gcs","title":"load.gcs","url":"load-gcs.html","level":1,"parentId":"e8e19623_38836","tabIndex":1},"preprocessing.scale":{"id":"preprocessing.scale","title":"preprocessing.scale","url":"preprocessing-scale.html","level":1,"parentId":"e8e19623_38836","tabIndex":2},"preprocessing.extract_segments":{"id":"preprocessing.extract_segments","title":"preprocessing.extract_segments","url":"preprocessing-extract-segments.html","level":1,"parentId":"e8e19623_38836","tabIndex":3},"preprocessing.morphology":{"id":"preprocessing.morphology","title":"preprocessing.morphology","url":"preprocessing-morphology.html","level":1,"parentId":"e8e19623_38836","tabIndex":4},"preprocessing.glcm_padded":{"id":"preprocessing.glcm_padded","title":"preprocessing.glcm_padded","url":"preprocessing-glcm-padded.html","level":1,"parentId":"e8e19623_38836","tabIndex":5},"train.frdc_lightning":{"id":"train.frdc_lightning","title":"train.frdc_datamodule \u0026 frdc_module","url":"train-frdc-lightning.html","level":1,"parentId":"e8e19623_38836","tabIndex":6}}},"topLevelIds":["Overview","Getting-Started","e8e19623_38829","mix-match","e8e19623_38834","e8e19623_38836"]} \ No newline at end of file diff --git a/docs/custom-k-aug-dataloaders.html b/docs/custom-k-aug-dataloaders.html index f3bc78b7..487648dc 100644 --- a/docs/custom-k-aug-dataloaders.html +++ b/docs/custom-k-aug-dataloaders.html @@ -1,4 +1,4 @@ - Custom K-Aug Dataloaders | Documentation

+ Custom K-Aug Dataloaders | Documentation

Documentation 0.0.7 Help

Custom K-Aug Dataloaders

In MixMatch, implementing the data loading methods is quite unconventional.

  1. We need to load multiple augmented versions of the same image into the same batch.

  2. The labelled set is usually too small, causing a premature end to the epoch as it runs out of samples to draw from faster than the unlabelled set.

This can be rather tricky to implement in PyTorch. This tutorial will illustrate how we did it.

Loading Multiple Augmented Versions of the Same Image

See: frdc/load/dataset.py FRDCDataset.__getitem__

In MixMatch, a single train batch must consist of:

  1. A batch of labeled images

  2. K batches of unlabeled images

[Diagram: Get Batch → Aug → Aug Labelled Batch; Unlabelled Batch → Aug ×K → Aug Unl. Batch 1 … Aug Unl. Batch i … Aug Unl. Batch K]

Keep in mind that the unlabelled batch is a single batch of images, not separate draws of batches. It is then "duplicated" K times, and each copy is augmented differently.

Solution 1: Custom Dataset

To solve this, we need to understand the role of both a Dataset and a DataLoader.

  • A Dataset represents a collection of data, responsible for loading and returning something.

  • A DataLoader draws samples from a Dataset and returns batched samples.

The key here is that a Dataset is not limited to returning 1 sample at a time; we can make it return the K augmented versions of the same image.

[Diagram: Sample → Aug ×K → Aug Sample 1 … Aug Sample i … Aug Sample K]

In code, this is done by subclassing the Dataset class and overriding the __getitem__ method.

def duplicate(x): return x, deepcopy(x), deepcopy(x) @@ -10,7 +10,7 @@ def __getitem__(self, index): x, y = self.dataset[index] return self.aug(x), y -

+

In the above example, we have a Dataset that returns 3 duplicate versions of the same image. By leveraging this technique, we can create a Dataset that returns K augmented versions of the same image as a tuple.
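
As a minimal sketch, such a wrapper could look like this (the name KAugDataset and its arguments are illustrative, not the repo's actual API; see frdc/load/dataset.py for the real implementation):

from torch.utils.data import Dataset

class KAugDataset(Dataset):
    # Wraps a dataset so each draw returns K differently-augmented
    # copies of the same sample.
    def __init__(self, dataset, aug, k):
        self.dataset = dataset
        self.aug = aug
        self.k = k

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, index):
        x, y = self.dataset[index]
        # Each call to self.aug(x) draws fresh random augmentation
        # parameters, so the K copies differ from each other.
        return tuple(self.aug(x) for _ in range(self.k)), y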

Premature End of Epoch due to Small Labelled Set

See: frdc/train/frdc_datamodule.py

In MixMatch, the definition of an "epoch" is a bit different. Instead of implying that we have seen all the data once, it implies that we've drawn N batches. The N is referred to as the number of iterations per epoch.

Take for example, a labelled set of numbers [1, 2, 3] and an unlabelled set [4, 5, 6, 7, 8, 9, 10]. With batch size of 2, we'll run out of labelled samples after 2 iterations, but we'll still have 3 more iterations for the unlabelled set.

  • Draw 1: [1, 2], [4, 5]

  • Draw 2: [3], [6, 7].

  • Epoch ends.

Solution 2: Random Sampling

To fix this, instead of sequentially sampling the labelled set (and the unlabelled set), we can sample them randomly. This way, we can ensure that it never runs out.

  • Draw 1: [1, 3], [7, 5]

  • Draw 2: [2, 1], [4, 9]

  • Draw 3: [3, 2], [8, 6]

  • ... and so on.

Luckily, PyTorch's DataLoader supports random sampling. We just need to use RandomSampler instead of SequentialSampler (which is the default).

from torch.utils.data import DataLoader, RandomSampler dl = DataLoader( @@ -21,4 +21,4 @@ replacement=False, ) ) -

\ No newline at end of file +

This will ensure that the "epoch" ends when we've drawn train_iters batches.
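
For completeness, a fuller sketch of that dataloader (a sketch only — batch_size, num_samples, and the dataset variable are illustrative, filling in the arguments elided above by assumption):

from torch.utils.data import DataLoader, RandomSampler

dl = DataLoader(
    dataset,
    batch_size=32,
    sampler=RandomSampler(
        dataset,
        # Draw exactly train_iters batches per "epoch".
        num_samples=32 * train_iters,
        replacement=False,
    ),
)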

Last modified: 26 December 2023
\ No newline at end of file diff --git a/docs/getting-started.html b/docs/getting-started.html index 172d307d..7cd27c91 100644 --- a/docs/getting-started.html +++ b/docs/getting-started.html @@ -1,20 +1,24 @@ - Getting Started | Documentation

+ Getting Started | Documentation

Documentation 0.0.7 Help

Getting Started

Installing the Dev. Environment

  1. Ensure that you have the right version of Python. The required Python version can be seen in pyproject.toml

    [tool.poetry.dependencies]
    python = "..."

  2. Start by cloning our repository.

    git clone https://github.com/FR-DC/FRDC-ML.git

  3. Then, create a Python Virtual Env pyvenv

    Windows: python -m venv venv/
    Linux: python3 -m venv venv/

  4. Install Poetry. Then check if it's installed with

    poetry --version

  5. Activate the virtual environment

    Windows:
    cd venv/Scripts
    activate
    cd ../..

    Linux:
    source venv/bin/activate

  6. Install the dependencies. You should be in the same directory as pyproject.toml

    poetry install --with dev

  7. Install Pre-Commit Hooks

    pre-commit install

Use a Dev. Container

Only use Dev. Containers if you're familiar with your IDE, as the setup depends heavily on clicking around the IDE. Do not set up a new environment; it's already included in the container.

  1. Ensure that you have installed the pre-requisites for your IDE. VSCode IntelliJ

  2. Start by cloning our repository.

    git clone https://github.com/FR-DC/FRDC-ML.git

  3. Follow the steps for your IDE to set up the Dev. Container.

  4. Activate the virtual environment. The venv is located in /opt/venv

Setting Up Google Cloud

  1. We use Google Cloud to store our datasets. To set up Google Cloud, install the Google Cloud CLI

  2. Then, authenticate your account.

    gcloud auth login

  3. Finally, set up Application Default Credentials (ADC).

    gcloud auth application-default login

  4. To make sure everything is working, run the tests.

Setting Up Label Studio

This is only necessary if a task requires Label Studio annotations.

  1. We use Label Studio to annotate our datasets. We won't go through how to install Label Studio; for contributors, it should be up on localhost:8080.

  2. Then, retrieve your own API key from Label Studio. Go to your account page and copy the API key.

  3. Set your API key as an environment variable.

    Windows: go to "Edit environment variables for your account" and add a new environment variable with the name LABEL_STUDIO_API_KEY.

    Linux: export it as an environment variable.

    export LABEL_STUDIO_API_KEY=...

Setting Up Weight and Biases

  1. We use W&B to track our experiments. To set up W&B, install the W&B CLI

  2. Then, authenticate your account.

    wandb login

Pre-commit Hooks

This is optional but recommended. Pre-commit hooks are a way to ensure that your code is formatted correctly.

  • pre-commit install

Running the Tests

  • Run the tests to make sure everything is working

    pytest

Troubleshooting

ModuleNotFoundError

It's likely that your src and tests directories are not in PYTHONPATH. To fix this, run the following command:

export PYTHONPATH=$PYTHONPATH:./src:./tests

Or, set it in your IDE; for example, IntelliJ allows setting directories as Source Roots.

google.auth.exceptions.DefaultCredentialsError

It's likely that you haven't authenticated your Google Cloud account. See Setting Up Google Cloud

Couldn't connect to Label Studio

Label Studio must be running locally, exposed on localhost:8080. Furthermore, you need to specify the LABEL_STUDIO_API_KEY environment variable. See Setting Up Label Studio

Cannot login to W&B

You need to authenticate your W&B account. See Setting Up Weight and Biases. If you're facing difficulties, set the WANDB_MODE environment variable to offline to disable W&B.

Our Repository Structure

Before starting development, take a look at our repository structure. This will help you understand where to put your code.

[Diagram: FRDC → src/frdc/ (Core Dependencies), rsc/ (Resources), tests/ (Tests), pyproject.toml,poetry.lock (Repo Dependencies); src/frdc/ → ./load/ (Dataset Loaders), ./preprocess/ (Preprocessing Fn.), ./train/ (Train Deps), ./models/ (Model Architectures); rsc/ → ./dataset_name/ (Datasets ...)]

src/frdc/

Source Code for our package. These are the unit components of our pipeline.

rsc/

Resources. These are usually cached datasets

tests/

PyTest tests. These are unit, integration, and model tests.

Unit, Integration, and Pipeline Tests

We have 3 types of tests:

  • Unit Tests are usually small, single-function tests.

  • Integration Tests are larger tests that test a mock pipeline.

  • Model Tests are the true production pipeline tests that will generate a model.

Where Should I contribute?

Changing a small component

If you're changing a small component, such as an argument for preprocessing, a new model architecture, or a new configuration for a dataset, take a look at the src/frdc/ directory.

Adding a test

By adding a new component, you'll need to add a new test. Take a look at the tests/ directory.

Changing the model pipeline

If you're an ML Researcher, you'll probably be changing the pipeline. Take a look at the tests/model_tests/ directory.

Adding a dependency

If you're adding a new dependency, use poetry add PACKAGE and commit the changes to pyproject.toml and poetry.lock.

Last modified: 26 December 2023
\ No newline at end of file diff --git a/docs/icon-192.png b/docs/icon-192.png new file mode 100644 index 0000000000000000000000000000000000000000..5953601c396250504ba6b31c031ea906e92b6cd9 GIT binary patch literal 337 zcmeAS@N?(olHy`uVBq!ia0vp^2SAvE2}s`E_d9@rflSGwb;dxs*#b$G<8}erCCjeWuCzLfaEd zD*IkLs+}#4;Wx^h_m~)^%s}g@2u|;lzdBFn<%jhd{?kf+gl|#zw)&%eYqF~BKhd`* zC-MF7`j+C^Uhi7HY02UJi)gTe~DWM4f DYn5)J literal 0 HcmV?d00001 diff --git a/docs/icon-512.png b/docs/icon-512.png new file mode 100644 index 0000000000000000000000000000000000000000..9840e7b0cd4973a67d66ea20a62c77380047aed1 GIT binary patch literal 1103 zcmeAS@N?(olHy`uVBq!ia0y~yU;;9k7&t&wwUqN(1_l-}PZ!6KinzB|482$b1XvEf z|I5F&QoY|*z-`s1vsK?dmwsVZ>UO9HIT3v b7H)suWWQGE&SGE*!NB0@>gTe~DWM4f4BN6W literal 0 HcmV?d00001 diff --git a/docs/load-dataset.html b/docs/load-dataset.html index 09a97fb1..dc575cd2 100644 --- a/docs/load-dataset.html +++ b/docs/load-dataset.html @@ -1,14 +1,14 @@ - load.dataset | Documentation

+ load.dataset | Documentation

Documentation 0.0.7 Help

load.dataset

Usage

Firstly, to load a dataset instance, you need to initialize an FRDCDataset object, providing the site, date, and version.

For example, to load our Chestnut Nature Park dataset.

from frdc.load import FRDCDataset ds = FRDCDataset(site='chestnut_nature_park', date='20201218', version=None) -

+

Then, we can use the ds object to load objects of the dataset:

ar, order = ds.get_ar_bands() d = ds.get_ar_bands_as_dict() bounds, labels = ds.get_bounds_and_labels() -
+
  • ar is a stacked NDArray of the hyperspectral bands of shape (H x W x C)

  • order is a list of strings, containing the names of the bands, ordered according to the channels of ar

  • d is a dictionary of the hyperspectral bands of shape (H x W), keyed by the band names

  • bounds is a list of bounding boxes, in the format of Rect, a namedtuple of x0, y0, x1, y1

  • labels is a list of strings, containing the labels of the bounding boxes, ordered according to bounds
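
As an illustration of how these pair up, a short sketch (assuming ar, bounds, and labels were loaded with the calls above):

# Crop each labelled bounding box out of the band stack.
for (x0, y0, x1, y1), label in zip(bounds, labels):
    segment = ar[y0:y1, x0:x1]  # an (H x W x C) crop of one tree
    print(label, segment.shape)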

Filters

You can also selectively get the channels for both get_ar_bands() and get_ar_bands_as_dict() by providing a list of strings to the bands argument.

For example, to get the Wideband RGB bands, you can do:

ar, order = ds.get_ar_bands(bands=['WR', 'WG', 'WB']) d = ds.get_ar_bands_as_dict(bands=['WR', 'WG', 'WB']) -

\ No newline at end of file +

This will also alter the channel order to the order of the bands provided.

See load.gcs for configuration options.

Last modified: 26 December 2023
\ No newline at end of file diff --git a/docs/load-gcs.html b/docs/load-gcs.html index e135d6e1..f02e3959 100644 --- a/docs/load-gcs.html +++ b/docs/load-gcs.html @@ -1,11 +1,11 @@ - load.gcs | Documentation

+ load.gcs | Documentation

Documentation 0.0.7 Help

load.gcs

Usage

These are defined in the top-level load.gcs module.

list_gcs_datasets

Lists all datasets in the bucket as a DataFrame. This works by checking which folders have a specific file, which we call the anchor.

download

Downloads a file from Google Cloud Storage and returns the local file path.

open_file

Downloads and opens a file from Google Cloud Storage. Returns a file handle.

open_image

Downloads and returns the PIL image from Google Cloud Storage.
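
A short usage sketch of these helpers (the import path follows the load.gcs module named above, and the file paths are illustrative):

from frdc.load.gcs import list_gcs_datasets, download, open_image

# List every dataset folder that contains the anchor file.
df = list_gcs_datasets()

# Download a file; returns the local path it was cached to.
path = download("chestnut_nature_park/20201218/90deg/bounds.json")

# Download and open an image as a PIL image.
img = open_image("chestnut_nature_park/20201218/result.jpg")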

Pathing

The path to specify is relative to the bucket, which is frdc-ds by default.

For example, this filesystem on GCS:

# On Google Cloud Storage frdc-ds ├── chestnut_nature_park │ └── 20201218 │ └── 90deg │ └── bounds.json -

+

To download bounds.json, use download(r"chestnut_nature_park/20201218/90deg/bounds.json"). By default, all files will be downloaded to PROJ_DIR/rsc/....

# On local filesystem PROJ_DIR ├── rsc @@ -13,4 +13,4 @@ │ └── 20201218 │ └── 90deg │ └── bounds.json -

\ No newline at end of file +

Configuration

If you need granular control over

  • where the files are downloaded

  • the credentials used

  • the project used

  • the bucket used

Then edit conf.py.
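
For instance, a hypothetical edit could look like this (the variable names come from the list below; the values shown are placeholders, not the real defaults):

# src/frdc/conf.py (excerpt, hypothetical values)
from pathlib import Path

LOCAL_DATASET_ROOT_DIR = Path("rsc")  # where downloads are cached
GCS_PROJECT_ID = "my-project-id"      # your Google Cloud project
GCS_BUCKET_NAME = "frdc-ds"           # the default bucket name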

GCS_CREDENTIALS

Google Cloud credentials.


A google.oauth2.service_account.Credentials object. See the object documentation for more information.

LOCAL_DATASET_ROOT_DIR

Local directory to download files to.


Path to a directory, or a Path object.

GCS_PROJECT_ID

Google Cloud project ID.


GCS_BUCKET_NAME

Google Cloud Storage bucket name.


Last modified: 26 December 2023
\ No newline at end of file diff --git a/docs/mix-match-module.html b/docs/mix-match-module.html index f622c5c5..e8d5df2e 100644 --- a/docs/mix-match-module.html +++ b/docs/mix-match-module.html @@ -1,4 +1,4 @@ - MixMatch Module | Documentation

+ MixMatch Module | Documentation

Documentation 0.0.7 Help

MixMatch Module

See frdc/train/mixmatch_module.py.

Quick Recap

We will go over the essential parts of the code here. Before that, we revise some of the concepts that are used in the code.

Abstract Methods

In Python, we can define abstract methods using the abc module. Just like in other OOP languages, abstract methods are methods that must be implemented by the child class.

For example:

from abc import ABC, abstractmethod @@ -11,7 +11,7 @@ class MyChildClass(MyAbstractClass): def my_abstract_method(self): print("Hello World!") -

+

nn.Module & LightningModule

If you're unfamiliar with PyTorch, you should read the nn.Module Documentation.

nn.Module is the base class for all neural network modules in PyTorch, while LightningModule is a PyTorch Lightning class that extends nn.Module, providing additional functionality that reduces boilerplate code.

By implementing it as a LightningModule, we also enter the PyTorch Lightning ecosystem, which provides us with a lot of useful features such as logging, early stopping, and more.

What do we implement in a Module?

One key component that nn.Module requires is the model. So for example:

class MyModule(nn.Module): def __init__(self): super().__init__() @@ -23,7 +23,7 @@ def forward(self, x): return self.model(x) -

+

PyTorch Lightning builds on top of it, requiring training_step and validation_step. Each "step" is a batch of data, and the model is trained on it. So for example:

class MyModule(LightningModule): def __init__(self): ... @@ -40,7 +40,7 @@ y_hat = self(x) loss = F.cross_entropy(y_hat, y) return loss -

Usually, the training and validation steps are the same, but in some cases, such as MixMatch, they are different. In MixMatch, we not only use a different loss function for training, we also handle a batch differently. The PyTorch Lightning framework allows us to separate the two, and implement them separately.

Model Embedded Preprocessing on_before_batch_transfer

In PyTorch Lightning, we can also inject a step before the batch is passed to the model. This is done by overriding the on_before_batch_transfer method.

[Diagram: Batch → on_before_batch_transfer → training_step / validation_step]

This allows us to do preprocessing on the batch, such as scaling the data, encoding the labels, and more.

Custom EMA Update on_after_backward

We also leverage another hook, called on_after_backward. This hook is called after the backward pass, and allows us to do custom operations. In our case, we use it to update the EMA model.

[Diagram: Batch → training_step → on_after_backward → update_ema]

MixMatch

We recommend having tests/model_tests/chestnut_dec_may/train.py open while reading this section. It implements a real-world example of MixMatch.

As a summary:

  1. We learned what an abstract method is, and how to implement it

  2. We implement the model in LightningModule much like we would in nn.Module

  3. We implement on_before_batch_transfer to preprocess the batch

  4. Finally, we implement on_after_backward to update the EMA model

With the above in mind, let's look at the MixMatch implementation.

forward (abstract)

Forward pass of the model

ema_model (abstract)

The model that is used for EMA. We expect this property to be implemented by the child class.

update_ema (abstract)

The method to update the EMA model. We expect this method to be implemented by the child class.

loss_unl_scaler (static)

Takes in the current progress of the training, 0.0 to 1.0, where 0.0 is the start of the training, and 1.0 is the end. Then, returns the multiplier for the unlabeled loss.
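For instance, a minimal sketch of a linear ramp-up; the maximum weight of 100 is an assumed value for illustration:

```python
def loss_unl_scaler(progress: float) -> float:
    # Linearly ramp the unlabelled loss weight from 0 to 100 over training.
    return progress * 100
```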

loss_lbl (static)

Implements the loss for labeled data. Takes in the predicted labels and the ground truth labels, and returns the loss. This is cross entropy for MixMatch.

loss_unl (static)

Implements the loss for unlabeled data. Takes in the predicted labels and the guessed labels, and returns the loss. This is MSE for MixMatch.
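A minimal sketch of both losses as described; the exact reductions and softmax placement used in FRDC are assumptions:

```python
import torch
import torch.nn.functional as F


def loss_lbl(lbl_pred: torch.Tensor, lbl: torch.Tensor) -> torch.Tensor:
    # Cross entropy against the ground truth labels.
    return F.cross_entropy(lbl_pred, lbl)


def loss_unl(unl_pred: torch.Tensor, unl: torch.Tensor) -> torch.Tensor:
    # MSE between the predicted distribution and the guessed labels.
    return F.mse_loss(torch.softmax(unl_pred, dim=1), unl)
```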

mixup

Takes in the data and the labels, the beta distribution parameter, and returns the mixed data and labels.
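A sketch of MixUp as used in MixMatch, where the mix ratio is drawn from a Beta distribution and clamped so the result stays closer to the first sample:

```python
import numpy as np
import torch


def mixup(x: torch.Tensor, y: torch.Tensor, alpha: float):
    # Draw the mixing ratio from Beta(alpha, alpha).
    lam = np.random.beta(alpha, alpha)
    # MixMatch keeps lam >= 0.5 so the mix leans towards the first sample.
    lam = max(lam, 1 - lam)
    perm = torch.randperm(x.shape[0])
    x_mix = lam * x + (1 - lam) * x[perm]
    y_mix = lam * y + (1 - lam) * y[perm]
    return x_mix, y_mix
```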

sharpen

Takes in the labels and temperature, and returns the sharpened labels.
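A sketch of the standard MixMatch sharpening, which raises each probability to the power of 1/temperature and renormalizes:

```python
import torch


def sharpen(y_pred: torch.Tensor, temp: float) -> torch.Tensor:
    # Lower temperatures push the distribution towards one-hot.
    y_sharp = y_pred ** (1 / temp)
    return y_sharp / y_sharp.sum(dim=1, keepdim=True)
```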

guess_labels

Takes in the unlabeled data, and returns the guessed labels.
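A sketch of label guessing as described in the MixMatch paper: the model's predictions over the K augmented copies of the unlabelled batch are averaged:

```python
import torch


@torch.no_grad()
def guess_labels(model, x_unls: list[torch.Tensor]) -> torch.Tensor:
    # Average the softmax predictions across the K augmentations.
    y_preds = [torch.softmax(model(x), dim=1) for x in x_unls]
    return torch.stack(y_preds).mean(dim=0)
```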

progress

The current progress of the training, 0.0 to 1.0, where 0.0 is the start of the training, and 1.0 is the end.
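A sketch of how such a property could be computed from Lightning's bookkeeping; the exact bookkeeping used in FRDC is an assumption:

```python
from pytorch_lightning import LightningModule


class MySSLModule(LightningModule):
    ...

    @property
    def progress(self) -> float:
        # Fraction of training elapsed, from 0.0 to 1.0.
        return self.global_step / self.trainer.estimated_stepping_batches
```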

training_step

The training step runs through 1 batch of data, and returns the loss. Note that this is significantly different from the validation step, as we handle the K-augmented data differently.

test / validation_step

The test / validation step runs through 1 batch of data, and returns the loss.

predict_step

The predict step runs through 1 batch of data, and returns the actual decoded labels.

on_after_backward

The on_after_backward hook is called after the backward pass, and allows us to do custom operations. In our case, we use it to update the EMA model.

on_before_batch_transfer

The on_before_batch_transfer hook is called before the batch is transferred to the GPU. In our case, we use it to preprocess the batch.

A diagram of how these components interact with each other is shown below:

[Flow: Batch → on_before_batch_transfer → training_step → guess_labels → sharpen → mix_up → loss_lbl & loss_unl (weighted by loss_unl_scaler) → loss → backward → on_after_backward → update_ema; Batch → on_before_batch_transfer → validation_step → loss]

Finally, we show an example of how to use the MixMatch module:

from sklearn.preprocessing import StandardScaler, OrdinalEncoder

from frdc.train.mixmatch_module import MixMatchModule

...

module = MixMatchModule(
    ...,
    sharpen_temp=0.5,
    mix_beta_alpha=0.75,
)

In particular, we need to supply some transformations for the preprocessing step. In this case, we use StandardScaler to scale the data, and OrdinalEncoder to encode the labels.

  1. It's best if the standardization is fitted only on the training data, and not the validation data, to better reflect real-world scenarios.

  2. We use OrdinalEncoder as it handles unseen labels. So if a class doesn't show up in the training data, it will be encoded as np.nan and will not participate in the loss calculation. A small sketch of this behaviour follows below.
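A small sketch of this behaviour; the species names are purely illustrative:

```python
import numpy as np
from sklearn.preprocessing import OrdinalEncoder

oe = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=np.nan)
oe.fit([["Oak"], ["Teak"]])
print(oe.transform([["Teak"], ["Mahogany"]]))
# [[ 1.]
#  [nan]]  <- unseen label encoded as np.nan
```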

Design Choices

Static Method Overriding

We implement many functions as static, as we believe that a functional style reduces dependencies, thus making the code easier to test and debug.

Furthermore, it allows the subclasses to easily override the functions, to customize the behavior of the MixMatch module.

For example, the loss_unl_scaler function is static, thus, we can implement our own scaling function, and pass it to the MixMatch module.

def my_loss_unl_scaler(progress: float) -> float:
    return progress ** 2


class MyMixMatchModule(MixMatchModule):  # hypothetical subclass for illustration
    ...

    @staticmethod
    def loss_unl_scaler(progress: float) -> float:
        return my_loss_unl_scaler(progress)

If we had used a method instead, we would have to consider instance state, which would make it harder to override.

Why not use Dataclasses?

One of the biggest caveats of nn.Module is that it requires super().__init__() to be called before anything is assigned. While a dataclass can leverage __post_init__ to do the same, we felt that this was too much of a hassle to save a few keystrokes. Thus, we opted to use __init__ instead; while more verbose, it is more explicit.
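A minimal sketch of the caveat; the exact error message may vary by PyTorch version:

```python
import torch.nn as nn


class Bad(nn.Module):
    def __init__(self):
        # Assigning a submodule before calling super().__init__() fails,
        # because the module's internal state has not been set up yet.
        self.fc = nn.Linear(1, 1)


Bad()  # AttributeError: cannot assign module before Module.__init__() call
```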

Why use PyTorch Lightning?

While we did hit some roadblocks implementing SSL, due to its complex and unconventional nature, we felt that the benefits of using PyTorch Lightning outweighed the cons.

on_before_batch_transfer and on_after_backward are unconventional hooks, and we had to do some digging to find them. It can be argued that by just writing explicit code we could avoid the need for these hooks, but the PyTorch Lightning ecosystem fixes many other issues, so we turned a blind eye to this.

References

Last modified: 26 December 2023
\ No newline at end of file diff --git a/docs/mix-match.html b/docs/mix-match.html index dfe195be..a1dd4670 100644 --- a/docs/mix-match.html +++ b/docs/mix-match.html @@ -1 +1 @@ - MixMatch | Documentation


MixMatch

In FRDC-ML, we leverage semi-supervised learning to improve the model's performance through better augmentation consistency and by making use of unlabelled data.

The algorithm we use is MixMatch, a state-of-the-art semi-supervised learning algorithm. It is based on the idea of consistency regularization, which encourages models to predict the same class even after augmentations that occur naturally in the real world.

Our implementation of MixMatch is a refactored version of YU1ut/MixMatch-pytorch. We've refactored the code to follow more modern PyTorch practices, allowing us to utilize it with modern PyTorch frameworks such as PyTorch Lightning.

We won't go through the details of MixMatch here; see Our Documentation in our MixMatch-PyTorch-CIFAR10 repository for more details.

Implementation Details

  1. How we implemented the MixMatch logic MixMatchModule

  2. How we implemented the unique MixMatch data loading logic Custom MixMatch Data Loading

References

Last modified: 26 December 2023
\ No newline at end of file diff --git a/docs/model-test-chestnut-may-dec.html b/docs/model-test-chestnut-may-dec.html index fc6ce98e..57959a5f 100644 --- a/docs/model-test-chestnut-may-dec.html +++ b/docs/model-test-chestnut-may-dec.html @@ -1 +1 @@ - Model Test Chestnut May-Dec | Documentation


Model Test Chestnut May-Dec

This test is used to evaluate the model performance on the Chestnut Nature Park May & December dataset.

See this script in model_tests/chestnut_dec_may/train.py.

Motivation

The usage of this model will be to classify trees in unseen datasets under different conditions. In this test, we'll evaluate it under a different season.

A caveat is that it'll be evaluated on the same set of trees, so it's not representative of a field test. However, given the difficulty of obtaining datasets, this still gives us a good preliminary idea of how the model will perform in different conditions.

Methodology

We train on the December dataset, and test on the May dataset.

[Diagram: DecDataset supplies the Labelled Train and Unlabelled Train data for the Model; MayDataset supplies the Test data.]

Despite not having any true unlabelled data, we use MixMatch by treating the labelled data of the December dataset as unlabelled data.

Model

The current model used is a simple InceptionV3 transfer learning model, with the last layer replaced with fully connected layer(s).

[Diagram: Input → InceptionV3 (Frozen) → FC Layer(s) → Softmax → Output, trained with the SSL loss.]

Preprocessing

For Training:

Segment
RandomCrop 299
Horizontal Flip 50%
Vertical Flip 50%
Normalize By Training Mean & Std

For Validation:

Segment
CenterCrop 299
Normalize By Training Mean & Std

For Evaluation:

Segment
CenterCrop 299
Normalize By Training Mean & Std
As Is
Horizontal Flip
Vertical Flip
Horizontal & Vertical Flip

For evaluation, we test the model on the original image as well as on its horizontal, vertical, and combined flips, since the model should be invariant to these. A sketch follows below.
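A sketch of such a flip-invariant evaluation; averaging the four predictions is an assumption for illustration:

```python
import torch


@torch.no_grad()
def predict_flip_invariant(model, x: torch.Tensor) -> torch.Tensor:
    # x: (N, C, H, W). Predict on the image and each of its flips.
    preds = [
        model(x),                    # as is
        model(x.flip(-1)),           # horizontal flip
        model(x.flip(-2)),           # vertical flip
        model(x.flip(-1).flip(-2)),  # horizontal & vertical flip
    ]
    return torch.stack(preds).mean(dim=0)
```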

Hyperparameters

The following hyperparameters are used:

  • Optimizer: Adam

  • Learning Rate: 1e-3

  • Batch Size: 32

  • Epochs: 10

  • Train Iterations: 25~100

  • Validation Iterations: 10~25

  • Early Stopping: 4

Results

We achieve around 40% accuracy on the test set, compared to 100% on the training set. This indicates that the model has saturated and is not able to learn any more from the training set. There's no indication of overfitting, as the validation loss simply plateaus.

W&B Dashboard

Caveats

  • The test set is very small, so the results are not very representative.

  • The test set is the same set of trees, so it's not a true test of the model performance in different conditions.

  • There are many classes with 1 sample, so the model may not be able to learn the features of these classes well.

Last modified: 26 December 2023
\ No newline at end of file diff --git a/docs/overview.html b/docs/overview.html index 56310435..11b6ffd2 100644 --- a/docs/overview.html +++ b/docs/overview.html @@ -1 +1 @@ - Overview | Documentation


Overview

Forest Recovery Digital Companion (FRDC) is an ML-assisted companion for ecologists to automatically classify surveyed trees via an Unmanned Aerial Vehicle (UAV).

This package, FRDC-ML is the Machine Learning backbone of this project, a centralized repository of tools and model architectures to be used in the FRDC pipeline.

Get started here

Other Projects

FRDC-UI

The User Interface Repository for FRDC, a WebApp GUI for ecologists to adjust annotations.

Last modified: 26 December 2023
\ No newline at end of file diff --git a/docs/preprocessing-extract-segments.html b/docs/preprocessing-extract-segments.html index 6801c5e6..a2c6eff0 100644 --- a/docs/preprocessing-extract-segments.html +++ b/docs/preprocessing-extract-segments.html @@ -1,4 +1,4 @@ - preprocessing.extract_segments | Documentation

preprocessing.extract_segments

Functions

extract_segments_from_labels

Extracts segments from a label classification.

extract_segments_from_bounds

Extracts segments from Rect bounds.

remove_small_segments_from_labels

Removes small segments from a label classification.

Extract with Boundaries

A boundary is a Rect object that represents the minimum bounding box of a segment, with x0, y0, x1, y1 coordinates.

It simply slices the original image to the bounding box. The origin is the top left corner of the image.
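A minimal sketch of this slicing, assuming a Rect with x0, y0, x1, y1 attributes as described:

```python
import numpy as np


def slice_by_bound(ar: np.ndarray, b) -> np.ndarray:
    # The origin is the top-left corner, so rows index y and columns index x.
    return ar[b.y0:b.y1, b.x0:b.x1]
```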

[ASCII diagram: a 3x3 Original Image (values 1-9) sliced by the Rect bound 1, 2, 0, 2 (x0, y0, x1, y1). With cropped=True the Segmented Image contains only the bounded region (e.g. a bottom row of | 8 | 9 |); with cropped=False it keeps the original 3x3 shape with out-of-bounds pixels zeroed (e.g. a bottom row of | 0 | 8 | 9 |).]

Extract with Labels

A label classification is a np.ndarray where each pixel is mapped to a segment. The segments are mapped to a unique integer. In our project, the 0th label is the background.

For example, a label classification of 3 segments will look like this:

[ASCII diagram: a 3x3 Label Classification shown side by side with the Original Image; e.g. a bottom row of | 1 | 1 | 0 | labels against image values | 7 | 8 | 9 |, where 0 is the background.]

The extraction will take the minimum bounding box of each segment and return a list of segments.

For example, the label 1 and 2 extracted images will be

[ASCII diagram: Extracted Segment 1 and Extracted Segment 2. With cropped=True, each segment is cut to its minimum bounding box (e.g. | 7 | 8 |); with cropped=False, each segment keeps the original 3x3 shape, padded with 0s (e.g. bottom rows of | 7 | 8 | 0 | and | 0 | 0 | 0 |).]
  • If cropped is False, the segments are padded with 0s to the original image size. While this can ensure shape consistency, it can consume more memory for large images.

  • If cropped is True, the segments are cropped to the minimum bounding box. This can save memory, but the shape of the segments will be inconsistent.

Usage

Extract from Bounds and Labels

Extract segments from bounds and labels.

import numpy as np

from frdc.load import FRDCDataset
from frdc.preprocess.extract_segments import extract_segments_from_bounds

ds = FRDCDataset(site="chestnut_nature_park", date="20201218", version=None)
ar, order = ds.get_ar_bands()
bounds, labels = ds.get_bounds_and_labels()
segments: list[np.ndarray] = extract_segments_from_bounds(ar, bounds)

Extract from Auto-Segmentation

Extract segments from a label classification.

from skimage.morphology import remove_small_objects, remove_small_holes

import numpy as np

...

ar_labels = remove_small_segments_from_labels(ar_labels,
                                              min_height=10, min_width=10)
segments: list[np.ndarray] = extract_segments_from_labels(ar, ar_labels)

API

extract_segments_from_labels(ar, ar_labels, cropped)

Extracts segments from a label classification.


ar_labels is a label classification as a np.ndarray

extract_segments_from_bounds(ar, bounds, cropped)

Extracts segments from Rect bounds.


bounds is a list of Rect bounds.

remove_small_segments_from_labels(ar_labels, min_height, min_width)

Removes small segments from a label classification.


Last modified: 26 December 2023
\ No newline at end of file diff --git a/docs/preprocessing-glcm-padded.html b/docs/preprocessing-glcm-padded.html index ff0639fd..ed865bac 100644 --- a/docs/preprocessing-glcm-padded.html +++ b/docs/preprocessing-glcm-padded.html @@ -1,4 +1,4 @@ - preprocessing.glcm_padded | Documentation

preprocessing.glcm_padded

Functions

glcm_padded

Computes the GLCM of the NDArray bands with padding.

glcm_padded_cached

Computes the GLCM of the NDArray bands with padding, and caches it.

append_glcm_padded_cached

Computes the GLCM of the NDArray bands with padding, caches it, and also appends it onto the original array.

Usage

We show a few examples of how to use the GLCM functions.

import numpy as np
from glcm_cupy import Features

...

ar_glcm_cached_appended = append_glcm_padded_cached(
    ar, bin_from=1, bin_to=4, radius=3
)
  • ar_glcm is the GLCM of the original array, with the last dimension being the GLCM features. The number of features is determined by the features parameter, which defaults to all features.

  • ar_glcm_2_features selects only 2 features, with the last dimension being the 2 GLCM features specified.

  • ar_glcm_cached caches the GLCM so that if you call it again, it will return the cached version. It stores its data at the project root dir, under .cache/.

  • ar_glcm_cached_appended is a wrapper around ar_glcm_cached, it appends the GLCM features onto the original array. It's equivalent to calling ar_glcm_cached and then np.concatenate on the final axes.

Caching

GLCM is an expensive operation, thus we recommend caching it if the input parameters will be the same. This is especially useful if you're experimenting with the same dataset with constant parameters.

API

glcm_padded(ar, bin_from, bin_to, radius, step_size, features)

Computes the GLCM of the NDArray bands with padding.


  • ar is the input array

  • bin_from is the upper bound of the input

  • bin_to is the upper bound of the GLCM input, i.e. the resolution that GLCM operates on

  • radius is the radius of the GLCM

  • step_size is the step size of the GLCM

  • features is the list of GLCM features to compute

The return shape is

See glcm_cupy for the GLCM Features.

glcm_padded_cached(ar, bin_from, bin_to, radius, step_size, features)

Computes the GLCM of the NDArray bands with padding, and caches it.


See glcm_padded for the parameters and output shape

append_glcm_padded_cached(ar, bin_from, bin_to, radius, step_size, features)

Computes the GLCM of the NDArray bands with padding, caches it, and also appends it onto the original array.


See glcm_padded for the parameters


The return shape is:

The function automatically flattens the last 2 dimensions of the GLCM features, and appends it onto the original array.

Last modified: 26 December 2023
\ No newline at end of file diff --git a/docs/preprocessing-morphology.html b/docs/preprocessing-morphology.html index edc343b4..6697539f 100644 --- a/docs/preprocessing-morphology.html +++ b/docs/preprocessing-morphology.html @@ -1,4 +1,4 @@ - preprocessing.morphology | Documentation

preprocessing.morphology

Functions

threshold_binary_mask

Thresholds a selected NDArray band to yield a binary mask.

binary_watershed

Performs watershed on a binary mask to yield a mapped label classification

Usage

Perform auto-segmentation on a dataset to yield a label classification.

from frdc.load import FRDCDataset
from frdc.preprocess.morphology import (
    threshold_binary_mask, binary_watershed
)

ds = FRDCDataset(site="chestnut_nature_park", date="20201218", version=None)
ar, order = ds.get_ar_bands()
mask = threshold_binary_mask(ar, order.index('NIR'), 90 / 256)
ar_label = binary_watershed(mask)

API

threshold_binary_mask(ar, band_idx, threshold_value)

Thresholds a selected NDArray band to yield a binary mask as np.ndarray


This is equivalent to

ar[..., band_idx] > threshold_value
binary_watershed(ar_mask, peaks_footprint, watershed_compactness)

Performs watershed on a binary mask to yield a mapped label classification as a np.ndarray


  • peaks_footprint is the footprint of skimage.feature.peak_local_max

  • watershed_compactness is the compactness of skimage.morphology.watershed

Last modified: 26 December 2023
\ No newline at end of file diff --git a/docs/preprocessing-scale.html b/docs/preprocessing-scale.html index c8213f8e..b846445e 100644 --- a/docs/preprocessing-scale.html +++ b/docs/preprocessing-scale.html @@ -1,4 +1,4 @@ - preprocessing.scale | Documentation

preprocessing.scale

Functions

scale_0_1_per_band

Scales the NDArray bands to [0, 1] per band.

scale_normal_per_band

Scales the NDArray bands to zero mean unit variance per band.

scale_static_per_band

Scales the NDArray bands by a predefined configuration. Take a look at frdc.conf.BAND_MAX_CONFIG for an example.

Usage

from frdc.load import FRDCDataset
from frdc.preprocess.scale import (
    scale_0_1_per_band, scale_normal_per_band, scale_static_per_band
)

...

ar_01 = scale_0_1_per_band(ar)
ar_norm = scale_normal_per_band(ar)
ar_static = scale_static_per_band(ar, order, BAND_MAX_CONFIG)
Last modified: 26 December 2023
\ No newline at end of file diff --git a/docs/retrieve-our-datasets.html b/docs/retrieve-our-datasets.html index 644e51f5..bbeecb47 100644 --- a/docs/retrieve-our-datasets.html +++ b/docs/retrieve-our-datasets.html @@ -1,10 +1,10 @@ - Retrieve our Datasets | Documentation

Retrieve our Datasets

In this tutorial, we'll learn how to:

  • Retrieve FRDC's Hyperspectral Image Data as np.ndarray

  • Retrieve FRDC's Ground Truth bounds and labels

  • Slice/segment the image data by the bounds

Prerequisites

  • New here? Get Started.

  • Setup the Google Cloud Authorization to download the data.

Retrieve the Data

To retrieve the data, use FRDCDataset

Here, we'll download and load our

  • ar: Hyperspectral Image Data

  • order: The order of the bands

  • bounds: The bounds of the trees (segments)

  • labels: The labels of the trees (segments)

from frdc.load.dataset import FRDCDataset

ds = FRDCDataset(site="chestnut_nature_park", date="20201218", version=None)
ar, order = ds.get_ar_bands()
bounds, labels = ds.get_bounds_and_labels()

What Datasets are there?

from frdc.load.gcs import list_gcs_datasets

print(list_gcs_datasets())
# 0    DEBUG/0
# ...
# 2    casuarina/20220418/93deg
# 3    chestnut_nature_park/20201218
# ...
  • The first part of the path is the site, and the second part is the date.

  • The version is the rest of the path; if there isn't any, use None.

  • site="ds"

  • date="date"

  • version="ver"

  • site="ds"

  • date="date"

  • version="ver/01/data"

  • site="ds"

  • date="date"

  • version=None

Segment the Data

To segment the data, use Extract Segments.

Here, we'll segment the data by the bounds.

from frdc.load.dataset import FRDCDataset
from frdc.preprocess.extract_segments import extract_segments_from_bounds

ds = FRDCDataset(site="chestnut_nature_park", date="20201218", version=None)
ar, order = ds.get_ar_bands()
bounds, labels = ds.get_bounds_and_labels()
segments = extract_segments_from_bounds(ar, bounds)

segments is a list of np.ndarray of shape H, W, C, representing a tree. The order of segments is the same as labels, so you can use labels to identify the tree.
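For instance, a small sketch of pairing each segment with its label:

```python
# segments and labels are aligned, so zipping pairs each tree segment
# with its label.
for segment, label in zip(segments, labels):
    print(label, segment.shape)
```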

Plot the Data (Optional)

We can then use these data to plot out the first tree segment.

import matplotlib.pyplot as plt

from frdc.load.dataset import FRDCDataset

...

plt.imshow(segment_0_rgb_scaled)
plt.title(f"Tree {labels[0]}")
plt.show()

See also: preprocessing.scale.scale_0_1_per_band

Matplotlib cannot show the data correctly as-is, so we need to:

  • Convert the data from BGR to RGB

  • Scale the data to 0-1 per band

Last modified: 26 December 2023
\ No newline at end of file diff --git a/docs/site.webmanifest b/docs/site.webmanifest new file mode 100644 index 00000000..fe6a9303 --- /dev/null +++ b/docs/site.webmanifest @@ -0,0 +1,11 @@ +{ + "name": "JetBrains", + "short_name": "JetBrains", + "icons": [ + { "src": "icon-192.png", "type": "image/png", "sizes": "192x192" }, + { "src": "icon-512.png", "type": "image/png", "sizes": "512x512" } + ], + "theme_color": "#000000", + "background_color": "#000000", + "display": "standalone" +} \ No newline at end of file diff --git a/docs/train-frdc-lightning.html b/docs/train-frdc-lightning.html index ebcd6054..1886de79 100644 --- a/docs/train-frdc-lightning.html +++ b/docs/train-frdc-lightning.html @@ -1,4 +1,4 @@ - train.frdc_datamodule & frdc_module | Documentation

train.frdc_datamodule & frdc_module

These are FRDC-specific LightningDataModule and LightningModule classes, core components in the PyTorch Lightning ecosystem that provide a simple interface to train and evaluate models.

Classes

FRDCDataModule

The FRDC PyTorch Lightning DataModule.

FRDCModule

The FRDC PyTorch Lightning Module.

Usage

API

FRDCDataModule(segments, labels, preprocess, augmentation, train_val_test_split, batch_size)

Initializes the FRDC PyTorch Lightning DataModule.


  • segments, labels are retrieved from the dataset (see Retrieve our Datasets)

  • preprocess is a function that takes in a segment and returns a preprocessed segment. In particular, it should accept a list of NumPy NDArrays and return a single stacked PyTorch Tensor.

  • augmentation is a function that takes in a segment and returns an augmented segment. In particular, it takes in a PyTorch Tensor and returns another.

  • train_val_test_split is a function that takes a TensorDataset and returns a list of 3 TensorDatasets, for train, val and test respectively. A sketch follows after this list.

  • batch_size is the batch size.
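A minimal sketch of such a split function, assuming a random 60/20/20 split:

```python
from torch.utils.data import TensorDataset, random_split


def train_val_test_split(ds: TensorDataset) -> list:
    # Fractional lengths are resolved against the dataset size.
    return random_split(ds, [0.6, 0.2, 0.2])
```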

FRDCModule(model_cls, model_kwargs, optim_cls, optim_kwargs)

Initializes the FRDC PyTorch Lightning Module.


  • model_cls is the Class of the model.

  • model_kwargs is the kwargs to pass to the model.

  • optim_cls is the Class of the optimizer.

  • optim_kwargs is the kwargs to pass to the optimizer.

Internally, the module will initialize the model and optimizer as follows:

model = model_cls(**model_kwargs)
optim = optim_cls(model.parameters(), **optim_kwargs)
Last modified: 26 December 2023
\ No newline at end of file From 3ad231b037edec6effe1208ede4bc258e6a26dbd Mon Sep 17 00:00:00 2001 From: Evening Date: Tue, 26 Dec 2023 15:47:27 +0800 Subject: [PATCH 10/52] Update README.md --- README.md | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 9ddd6c92..d497d13a 100644 --- a/README.md +++ b/README.md @@ -54,14 +54,6 @@ To illustrate this, take a look at how `tests/model_tests/chestnut_dec_may/train.py` is written. It pulls in relevant modules from each stage and constructs a pipeline. - -> Initially, we evaluated a few ML E2E solutions, despite them offering great -> functionality, their flexibility was -> limited. From a dev perspective, **Active Learning** was a gray area, and we -> foresee heavy shoehorning. -> Ultimately, we decided that the risk was too great, thus we resort to -> creating our own solution. - ## Contributing ### Pre-commit Hooks @@ -80,3 +72,5 @@ If you're using `pip` instead of `poetry`, run the following commands: pip install pre-commit pre-commit install ``` + +Alternatively, you can use Black configured with your own IDE. From 3eb0b40a792286b235486c6baeee3ac0f14e82f9 Mon Sep 17 00:00:00 2001 From: Evening Date: Tue, 26 Dec 2023 15:53:17 +0800 Subject: [PATCH 11/52] Update devcontainer.json --- .devcontainer/devcontainer.json | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index ca281b81..5ca24e93 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -1,8 +1,7 @@ { "name": "frdc", "build": { - "context": "../", - "dockerfile": "Dockerfile", + "dockerfile": "../Dockerfile", }, "containerEnv": { "LABEL_STUDIO_HOST": "host.docker.internal", From d021af7cbb825c88fcb8885bd118f6f51cfca35c Mon Sep 17 00:00:00 2001 From: Evening Date: Tue, 26 Dec 2023 16:29:08 +0800 Subject: [PATCH 12/52] Attempt to fix codespace problem --- Dockerfile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index 9951dc95..6b535c26 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,8 +3,8 @@ WORKDIR /devcontainer COPY ./pyproject.toml /devcontainer/pyproject.toml -RUN apt-get update -RUN apt-get install git -y +RUN apt update && apt upgrade +RUN apt install git -y RUN pip3 install --upgrade pip && \ pip3 install poetry && \ @@ -16,5 +16,5 @@ RUN conda init bash \ && poetry config virtualenvs.create false \ && poetry install --with dev --no-interaction --no-ansi -RUN apt-get install curl -y && curl -sSL https://sdk.cloud.google.com | bash +RUN apt install curl -y && curl -sSL https://sdk.cloud.google.com | bash ENV PATH $PATH:/root/google-cloud-sdk/bin From 2636cf1454a0d2a34b770829fa55f558d242e4ef Mon Sep 17 00:00:00 2001 From: Evening Date: Tue, 26 Dec 2023 16:38:02 +0800 Subject: [PATCH 13/52] Update Dockerfile --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 6b535c26..526daecb 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,7 +3,7 @@ WORKDIR /devcontainer COPY ./pyproject.toml /devcontainer/pyproject.toml -RUN apt update && apt upgrade +RUN apt update -y && apt upgrade -y RUN apt install git -y RUN pip3 install --upgrade pip && \ From bac614adb1e19d71d512e4d52824038146755b7f Mon Sep 17 00:00:00 2001 From: Eve-ning Date: Wed, 27 Dec 2023 16:24:11 +0800 Subject: [PATCH 14/52] Force Dockerfile to LF --- .gitattributes | 0 Dockerfile | 0 2 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 
.gitattributes mode change 100644 => 100755 Dockerfile diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 00000000..e69de29b diff --git a/Dockerfile b/Dockerfile old mode 100644 new mode 100755 From 399fc54ea766b9c1dcb9699cac337a5908ce895e Mon Sep 17 00:00:00 2001 From: Evening Date: Wed, 27 Dec 2023 16:24:51 +0800 Subject: [PATCH 15/52] Force Dockerfile to LF --- .gitattributes | 1 + Dockerfile | 0 2 files changed, 1 insertion(+) mode change 100755 => 100644 Dockerfile diff --git a/.gitattributes b/.gitattributes index e69de29b..d28cb2fc 100644 --- a/.gitattributes +++ b/.gitattributes @@ -0,0 +1 @@ +Dockerfile text=auto eol=lf \ No newline at end of file diff --git a/Dockerfile b/Dockerfile old mode 100755 new mode 100644 From e2cab32a0aef5f1889c0e7f63a8b782dd223a49f Mon Sep 17 00:00:00 2001 From: Evening Date: Wed, 27 Dec 2023 17:22:58 +0800 Subject: [PATCH 16/52] Move Dev Container setup to other page --- Writerside/d.tree | 4 +- .../topics/Get-Started-with-Dev-Containers.md | 49 +++++++++++++++++++ Writerside/topics/Getting-Started.md | 22 +-------- docs/HelpTOC.json | 2 +- docs/config.json | 2 +- docs/current.help.version | 2 +- docs/custom-k-aug-dataloaders.html | 6 +-- docs/getting-started.html | 22 ++++----- docs/load-dataset.html | 8 +-- docs/load-gcs.html | 6 +-- docs/mix-match-module.html | 12 ++--- docs/mix-match.html | 2 +- docs/model-test-chestnut-may-dec.html | 2 +- docs/overview.html | 2 +- docs/preprocessing-extract-segments.html | 16 +++--- docs/preprocessing-glcm-padded.html | 4 +- docs/preprocessing-morphology.html | 6 +-- docs/preprocessing-scale.html | 4 +- docs/retrieve-our-datasets.html | 10 ++-- docs/train-frdc-lightning.html | 4 +- 20 files changed, 108 insertions(+), 77 deletions(-) create mode 100644 Writerside/topics/Get-Started-with-Dev-Containers.md diff --git a/Writerside/d.tree b/Writerside/d.tree index 27fe88b4..32778cf1 100644 --- a/Writerside/d.tree +++ b/Writerside/d.tree @@ -8,7 +8,9 @@ start-page="Overview.md"> - + + + diff --git a/Writerside/topics/Get-Started-with-Dev-Containers.md b/Writerside/topics/Get-Started-with-Dev-Containers.md new file mode 100644 index 00000000..750bead5 --- /dev/null +++ b/Writerside/topics/Get-Started-with-Dev-Containers.md @@ -0,0 +1,49 @@ +# Get Started with Dev Containers + +Dev. Containers are a great way to get started with a project. They define all +necessary dependencies and environments, so you can just start coding within +the container. + +In this article, we'll only go over **additional steps** to set up with our +project. For more information on how to use Dev Containers, please refer to +the official documentation for each IDE. Once you've set up the Dev Container, +come back here to finish the setup: + +- [VSCode](https://code.visualstudio.com/docs/remote/containers). +- [IntelliJ](https://www.jetbrains.com/help/idea/connect-to-devcontainer.html) + +> If you see the error `Error response from daemon: ... the `.git` at the end of the repo URL. +{style='warning'} + +## Python Environment + +> Do not create a new environment +{style='warning'} + +The dev environment is already created and is managed by Anaconda +`/opt/conda/bin/conda`. +To activate the environment, run the following command: + +```bash +conda activate base +``` + +> Refer to your respective IDE's documentation on how to activate the +> environment. + +## Mark as Sources Root (Add to PYTHONPATH) + +For `import` statements to work, you need to mark the `src` folder as the +sources root. 
Optionally, also mark the `tests` folder as the tests root. + +> Refer to your respective IDE's documentation on how to mark folders as +> sources root. (Also known as adding to the `PYTHONPATH`) + +## Additional Setup + +Refer to the [Getting Started](Getting-Started.md) guide for additional setup +steps such as: +- Google Cloud Application Default Credentials +- Weight & Bias API Key +- Label Studio API Key diff --git a/Writerside/topics/Getting-Started.md b/Writerside/topics/Getting-Started.md index 10fe4398..c62ee26f 100644 --- a/Writerside/topics/Getting-Started.md +++ b/Writerside/topics/Getting-Started.md @@ -1,5 +1,7 @@ # Getting Started +> Want to use a Dev Container? See [Get Started with Dev Containers](Get-Started-with-Dev-Containers.md) + Ensure that you have the right version of Python. The required Python version can be seen in pyproject.toml @@ -60,25 +62,6 @@ - - - Only use Dev. Containers if you're familiar with your IDEs, it's highly - dependent on clicking around the IDE. - - Do not set up a new environment, it'll be included in the environment. - - Ensure that you have installed pre-requisites for respective IDEs. - VSCode - IntelliJ - - Start by cloning our repository. - - git clone https://github.com/FR-DC/FRDC-ML.git - - - Follow steps for respective IDEs to set up the Dev. Container. - Activate the virtual environment. The venv is located in /opt/venv - @@ -132,7 +115,6 @@ - We use W&B to track our experiments. To set up W&B, diff --git a/docs/HelpTOC.json b/docs/HelpTOC.json index 107c112d..e076f9bd 100644 --- a/docs/HelpTOC.json +++ b/docs/HelpTOC.json @@ -1 +1 @@ -{"entities":{"pages":{"Overview":{"id":"Overview","title":"Overview","url":"overview.html","level":0,"tabIndex":0},"Getting-Started":{"id":"Getting-Started","title":"Getting Started","url":"getting-started.html","level":0,"tabIndex":1},"e8e19623_38829":{"id":"e8e19623_38829","title":"Tutorials","level":0,"pages":["Retrieve-our-Datasets"],"tabIndex":2},"Retrieve-our-Datasets":{"id":"Retrieve-our-Datasets","title":"Retrieve our Datasets","url":"retrieve-our-datasets.html","level":1,"parentId":"e8e19623_38829","tabIndex":0},"mix-match":{"id":"mix-match","title":"MixMatch","url":"mix-match.html","level":0,"pages":["mix-match-module","custom-k-aug-dataloaders"],"tabIndex":3},"mix-match-module":{"id":"mix-match-module","title":"MixMatch Module","url":"mix-match-module.html","level":1,"parentId":"mix-match","tabIndex":0},"custom-k-aug-dataloaders":{"id":"custom-k-aug-dataloaders","title":"Custom K-Aug Dataloaders","url":"custom-k-aug-dataloaders.html","level":1,"parentId":"mix-match","tabIndex":1},"e8e19623_38834":{"id":"e8e19623_38834","title":"Model Tests","level":0,"pages":["Model-Test-Chestnut-May-Dec"],"tabIndex":4},"Model-Test-Chestnut-May-Dec":{"id":"Model-Test-Chestnut-May-Dec","title":"Model Test Chestnut 
May-Dec","url":"model-test-chestnut-may-dec.html","level":1,"parentId":"e8e19623_38834","tabIndex":0},"e8e19623_38836":{"id":"e8e19623_38836","title":"API","level":0,"pages":["load.dataset","load.gcs","preprocessing.scale","preprocessing.extract_segments","preprocessing.morphology","preprocessing.glcm_padded","train.frdc_lightning"],"tabIndex":5},"load.dataset":{"id":"load.dataset","title":"load.dataset","url":"load-dataset.html","level":1,"parentId":"e8e19623_38836","tabIndex":0},"load.gcs":{"id":"load.gcs","title":"load.gcs","url":"load-gcs.html","level":1,"parentId":"e8e19623_38836","tabIndex":1},"preprocessing.scale":{"id":"preprocessing.scale","title":"preprocessing.scale","url":"preprocessing-scale.html","level":1,"parentId":"e8e19623_38836","tabIndex":2},"preprocessing.extract_segments":{"id":"preprocessing.extract_segments","title":"preprocessing.extract_segments","url":"preprocessing-extract-segments.html","level":1,"parentId":"e8e19623_38836","tabIndex":3},"preprocessing.morphology":{"id":"preprocessing.morphology","title":"preprocessing.morphology","url":"preprocessing-morphology.html","level":1,"parentId":"e8e19623_38836","tabIndex":4},"preprocessing.glcm_padded":{"id":"preprocessing.glcm_padded","title":"preprocessing.glcm_padded","url":"preprocessing-glcm-padded.html","level":1,"parentId":"e8e19623_38836","tabIndex":5},"train.frdc_lightning":{"id":"train.frdc_lightning","title":"train.frdc_datamodule \u0026 frdc_module","url":"train-frdc-lightning.html","level":1,"parentId":"e8e19623_38836","tabIndex":6}}},"topLevelIds":["Overview","Getting-Started","e8e19623_38829","mix-match","e8e19623_38834","e8e19623_38836"]} \ No newline at end of file +{"entities":{"pages":{"Overview":{"id":"Overview","title":"Overview","url":"overview.html","level":0,"tabIndex":0},"Getting-Started":{"id":"Getting-Started","title":"Getting Started","url":"getting-started.html","level":0,"tabIndex":1},"e8e19623_38931":{"id":"e8e19623_38931","title":"Tutorials","level":0,"pages":["Retrieve-our-Datasets"],"tabIndex":2},"Retrieve-our-Datasets":{"id":"Retrieve-our-Datasets","title":"Retrieve our Datasets","url":"retrieve-our-datasets.html","level":1,"parentId":"e8e19623_38931","tabIndex":0},"mix-match":{"id":"mix-match","title":"MixMatch","url":"mix-match.html","level":0,"pages":["mix-match-module","custom-k-aug-dataloaders"],"tabIndex":3},"mix-match-module":{"id":"mix-match-module","title":"MixMatch Module","url":"mix-match-module.html","level":1,"parentId":"mix-match","tabIndex":0},"custom-k-aug-dataloaders":{"id":"custom-k-aug-dataloaders","title":"Custom K-Aug Dataloaders","url":"custom-k-aug-dataloaders.html","level":1,"parentId":"mix-match","tabIndex":1},"e8e19623_38936":{"id":"e8e19623_38936","title":"Model Tests","level":0,"pages":["Model-Test-Chestnut-May-Dec"],"tabIndex":4},"Model-Test-Chestnut-May-Dec":{"id":"Model-Test-Chestnut-May-Dec","title":"Model Test Chestnut 
May-Dec","url":"model-test-chestnut-may-dec.html","level":1,"parentId":"e8e19623_38936","tabIndex":0},"e8e19623_38938":{"id":"e8e19623_38938","title":"API","level":0,"pages":["load.dataset","load.gcs","preprocessing.scale","preprocessing.extract_segments","preprocessing.morphology","preprocessing.glcm_padded","train.frdc_lightning"],"tabIndex":5},"load.dataset":{"id":"load.dataset","title":"load.dataset","url":"load-dataset.html","level":1,"parentId":"e8e19623_38938","tabIndex":0},"load.gcs":{"id":"load.gcs","title":"load.gcs","url":"load-gcs.html","level":1,"parentId":"e8e19623_38938","tabIndex":1},"preprocessing.scale":{"id":"preprocessing.scale","title":"preprocessing.scale","url":"preprocessing-scale.html","level":1,"parentId":"e8e19623_38938","tabIndex":2},"preprocessing.extract_segments":{"id":"preprocessing.extract_segments","title":"preprocessing.extract_segments","url":"preprocessing-extract-segments.html","level":1,"parentId":"e8e19623_38938","tabIndex":3},"preprocessing.morphology":{"id":"preprocessing.morphology","title":"preprocessing.morphology","url":"preprocessing-morphology.html","level":1,"parentId":"e8e19623_38938","tabIndex":4},"preprocessing.glcm_padded":{"id":"preprocessing.glcm_padded","title":"preprocessing.glcm_padded","url":"preprocessing-glcm-padded.html","level":1,"parentId":"e8e19623_38938","tabIndex":5},"train.frdc_lightning":{"id":"train.frdc_lightning","title":"train.frdc_datamodule \u0026 frdc_module","url":"train-frdc-lightning.html","level":1,"parentId":"e8e19623_38938","tabIndex":6}}},"topLevelIds":["Overview","Getting-Started","e8e19623_38931","mix-match","e8e19623_38936","e8e19623_38938"]} \ No newline at end of file diff --git a/docs/config.json b/docs/config.json index a83934cd..1f05ea18 100644 --- a/docs/config.json +++ b/docs/config.json @@ -1 +1 @@ -{"productVersion":"0.0.7","productId":"d","stage":"release","downloadTitle":"Get Documentation","keymaps":{},"searchMaxHits":75,"productName":"Documentation"} \ No newline at end of file +{"productVersion":"0.0.8","productId":"d","stage":"release","downloadTitle":"Get Documentation","keymaps":{},"searchMaxHits":75,"productName":"Documentation"} \ No newline at end of file diff --git a/docs/current.help.version b/docs/current.help.version index 5c4511c3..7d6b3eb3 100644 --- a/docs/current.help.version +++ b/docs/current.help.version @@ -1 +1 @@ -0.0.7 \ No newline at end of file +0.0.8 \ No newline at end of file diff --git a/docs/custom-k-aug-dataloaders.html b/docs/custom-k-aug-dataloaders.html index 487648dc..b51fd57a 100644 --- a/docs/custom-k-aug-dataloaders.html +++ b/docs/custom-k-aug-dataloaders.html @@ -1,4 +1,4 @@ - Custom K-Aug Dataloaders | Documentation

Documentation 0.0.7 Help

Custom K-Aug Dataloaders

In MixMatch, implementing the data loading methods is quite unconventional.

  1. We need to load multiple augmented versions of the same image into the same batch.

  2. The labelled set is usually too small, causing the epoch to end prematurely as it runs out of samples faster than the unlabelled set.

This can be rather tricky to implement in PyTorch. This tutorial will illustrate how we did it.

Loading Multiple Augmented Versions of the Same Image

See: frdc/load/dataset.py FRDCDataset.__getitem__

In MixMatch, a single train batch must consist of:

  1. A batch of labeled images

  2. K batches of unlabeled images

Aug
Aug
Aug
Aug
Get Batch
Aug Labelled Batch
Unlabelled Batch
Aug Unl. Batch 1
Aug Unl. Batch i
Aug Unl. Batch K

Keep in mind that the unlabelled batch is a single batch of images, not separate draws of batches. It is then "duplicated" K times, and each copy is augmented differently.

Solution 1: Custom Dataset

To solve this, we need to understand the role of both a Dataset and a DataLoader.

  • A Dataset represents a collection of data, responsible for loading and returning something.

  • A DataLoader draws samples from a Dataset and returns batched samples.

The key here is that a Dataset is not limited to returning 1 sample at a time; we can make it return the K augmented versions of the same image.

Aug
Aug
Aug
Sample
Aug Sample 1
Aug Sample i
Aug Sample K

In code, this is done by subclassing the Dataset class and overriding the __getitem__ method.

+ Custom K-Aug Dataloaders | Documentation

Documentation 0.0.8 Help

Custom K-Aug Dataloaders

In MixMatch, implementing the data loading methods is quite unconventional.

  1. We need to load multiple augmented versions of the same image into the same batch.

  2. The labelled set is usually too small, causing the epoch to end prematurely as it runs out of samples faster than the unlabelled set.

This can be rather tricky to implement in PyTorch. This tutorial will illustrate how we did it.

Loading Multiple Augmented Versions of the Same Image

See: frdc/load/dataset.py FRDCDataset.__getitem__

In MixMatch, a single train batch must consist of:

  1. A batch of labeled images

  2. K batches of unlabeled images

Aug
Aug
Aug
Aug
Get Batch
Aug Labelled Batch
Unlabelled Batch
Aug Unl. Batch 1
Aug Unl. Batch i
Aug Unl. Batch K

Keep in mind that the unlabelled batch is a single batch of images, not separate draws of batches. It is then "duplicated" K times, and each copy is augmented differently.

Solution 1: Custom Dataset

To solve this, we need to understand the role of both a Dataset and a DataLoader.

  • A Dataset represents a collection of data, responsible for loading and returning something.

  • A DataLoader draws samples from a Dataset and returns batched samples.

The key here is that a Dataset is not limited to returning 1 sample at a time; we can make it return the K augmented versions of the same image.

Aug
Aug
Aug
Sample
Aug Sample 1
Aug Sample i
Aug Sample K

In code, this is done by subclassing the Dataset class and overriding the __getitem__ method.

def duplicate(x): return x, deepcopy(x), deepcopy(x) @@ -10,7 +10,7 @@ def __getitem__(self, index): x, y = self.dataset[index] return self.aug(x), y -

In the above example, we have a Dataset that returns 3 duplicate versions of the same image. By leveraging this technique, we can create a Dataset that returns K augmented versions of the same image as a tuple.

Premature End of Epoch due to Small Labelled Set

See: frdc/train/frdc_datamodule.py

In MixMatch, the definition of an "epoch" is a bit different. Instead of implying that we have seen all the data once, it implies that we've drawn N batches. The N is referred to as the number of iterations per epoch.

Take, for example, a labelled set of numbers [1, 2, 3] and an unlabelled set [4, 5, 6, 7, 8, 9, 10]. With a batch size of 2, we'll run out of labelled samples after 2 iterations, but we'll still have 3 more iterations for the unlabelled set.

  • Draw 1: [1, 2], [4, 5]

  • Draw 2: [3], [6, 7].

  • Epoch ends.

Solution 2: Random Sampling

To fix this, instead of sequentially sampling the labelled set (and the unlabelled set), we can sample them randomly. This way, we can ensure that the labelled set never runs out.

  • Draw 1: [1, 3], [7, 5]

  • Draw 2: [2, 1], [4, 9]

  • Draw 3: [3, 2], [8, 6]

  • ... and so on.

Luckily, PyTorch's DataLoader supports random sampling. We just need to use RandomSampler instead of SequentialSampler (which is the default).

+

In the above example, we have a Dataset that returns 3 duplicate versions of the same image. By leveraging this technique, we can create a Dataset that returns K augmented versions of the same image as a tuple.
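
As a minimal sketch of the idea (illustrative only, not the actual FRDCDataset implementation; the aug callable and k count are assumptions):

from copy import deepcopy

from torch.utils.data import Dataset


class KAugDataset(Dataset):
    """Wraps a dataset so each index yields K augmented views of one image."""

    def __init__(self, dataset, aug, k: int):
        self.dataset = dataset
        self.aug = aug  # any callable transform, e.g. torchvision transforms
        self.k = k

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, index):
        x, y = self.dataset[index]
        # Augment K deep copies of the same image independently,
        # returning them as a tuple.
        return tuple(self.aug(deepcopy(x)) for _ in range(self.k)), y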

Premature End of Epoch due to Small Labelled Set

See: frdc/train/frdc_datamodule.py

In MixMatch, the definition of an "epoch" is a bit different. Instead of implying that we have seen all the data once, it implies that we've drawn N batches. The N is referred to as the number of iterations per epoch.

Take, for example, a labelled set of numbers [1, 2, 3] and an unlabelled set [4, 5, 6, 7, 8, 9, 10]. With a batch size of 2, we'll run out of labelled samples after 2 iterations, but we'll still have 3 more iterations for the unlabelled set.

  • Draw 1: [1, 2], [4, 5]

  • Draw 2: [3], [6, 7].

  • Epoch ends.

Solution 2: Random Sampling

To fix this, instead of sequentially sampling the labelled set (and the unlabelled set), we can sample them randomly. This way, we can ensure that the labelled set never runs out.

  • Draw 1: [1, 3], [7, 5]

  • Draw 2: [2, 1], [4, 9]

  • Draw 3: [3, 2], [8, 6]

  • ... and so on.

Luckily, PyTorch's DataLoader supports random sampling. We just need to use RandomSampler instead of SequentialSampler (which is the default).

from torch.utils.data import DataLoader, RandomSampler dl = DataLoader( @@ -21,4 +21,4 @@ replacement=False, ) ) -

This will ensure that the "epoch" ends when we've drawn train_iters batches.

Last modified: 26 December 2023
\ No newline at end of file +

This will ensure that the "epoch" ends when we've drawn train_iters batches.

Last modified: 27 December 2023
\ No newline at end of file diff --git a/docs/getting-started.html b/docs/getting-started.html index 7cd27c91..ac181d93 100644 --- a/docs/getting-started.html +++ b/docs/getting-started.html @@ -1,24 +1,22 @@ - Getting Started | Documentation

Documentation 0.0.7 Help

Getting Started

Installing the Dev. Environment

  1. Ensure that you have the right version of Python. The required Python version can be seen in pyproject.toml

    + Getting Started | Documentation

    Documentation 0.0.8 Help

    Getting Started

    Installing the Dev. Environment

    1. Ensure that you have the right version of Python. The required Python version can be seen in pyproject.toml

      [tool.poetry.dependencies] python = "..." -
    2. Start by cloning our repository.

      +
    3. Start by cloning our repository.

      git clone https://github.com/FR-DC/FRDC-ML.git -
    4. Then, create a Python Virtual Env pyvenv

      python -m venv venv/
      python3 -m venv venv/
    5. Install Poetry Then check if it's installed with

      poetry --version
    6. Activate the virtual environment

      +
    7. Then, create a Python Virtual Env pyvenv

      python -m venv venv/
      python3 -m venv venv/
    8. Install Poetry Then check if it's installed with

      poetry --version
    9. Activate the virtual environment

      cd venv/Scripts activate cd ../.. -
      +
      source venv/bin/activate -
    10. Install the dependencies. You should be in the same directory as pyproject.toml

      +
  2. Install the dependencies. You should be in the same directory as pyproject.toml

    poetry install --with dev -
  3. Install Pre-Commit Hooks

    +
  4. Install Pre-Commit Hooks

    pre-commit install -

Use a Dev. Container

  1. Ensure that you have installed pre-requisites for respective IDEs. VSCode IntelliJ

  2. Start by cloning our repository.

    - git clone https://github.com/FR-DC/FRDC-ML.git -
  3. Follow steps for respective IDEs to set up the Dev. Container.

  4. Activate the virtual environment. The venv is located in /opt/venv

Setting Up Google Cloud

  1. We use Google Cloud to store our datasets. To set up Google Cloud, install the Google Cloud CLI

  2. Then, authenticate your account.

    gcloud auth login
  3. Finally, set up Application Default Credentials (ADC).

    gcloud auth application-default login
  4. To make sure everything is working, run the tests.

Setting Up Label Studio

  1. We use Label Studio to annotate our datasets. We won't go through how to install Label Studio; for contributors, it should be up on localhost:8080.

  2. Then, retrieve your own API key from Label Studio. Go to your account page and copy the API key.


  3. Set your API key as an environment variable.

    On Windows, go to "Edit environment variables for your account" and add this as a new environment variable with the name LABEL_STUDIO_API_KEY.

    Export it as an environment variable.

    export LABEL_STUDIO_API_KEY=...

Setting Up Weights and Biases

  1. We use W&B to track our experiments. To set up W&B, install the W&B CLI

  2. Then, authenticate your account.

    wandb login

Pre-commit Hooks

  • +

Setting Up Google Cloud

  1. We use Google Cloud to store our datasets. To set up Google Cloud, install the Google Cloud CLI

  2. Then, authenticate your account.

    gcloud auth login
  3. Finally, set up Application Default Credentials (ADC).

    gcloud auth application-default login
  4. To make sure everything is working, run the tests.

Setting Up Label Studio

  1. We use Label Studio to annotate our datasets. We won't go through how to install Label Studio; for contributors, it should be up on localhost:8080.

  2. Then, retrieve your own API key from Label Studio. Go to your account page and copy the API key.


  3. Set your API key as an environment variable.

    On Windows, go to "Edit environment variables for your account" and add this as a new environment variable with the name LABEL_STUDIO_API_KEY.

    Export it as an environment variable.

    export LABEL_STUDIO_API_KEY=...

Setting Up Weights and Biases

  1. We use W&B to track our experiments. To set up W&B, install the W&B CLI

  2. Then, authenticate your account.

    wandb login

Pre-commit Hooks

  • pre-commit install -

Running the Tests

  • Run the tests to make sure everything is working

    +

Running the Tests

  • Run the tests to make sure everything is working

    pytest -

Troubleshooting

ModuleNotFoundError

It's likely that your src and tests directories are not in PYTHONPATH. To fix this, run the following command:

+

Troubleshooting

ModuleNotFoundError

It's likely that your src and tests directories are not in PYTHONPATH. To fix this, run the following command:

export PYTHONPATH=$PYTHONPATH:./src:./tests -

Or, set it in your IDE, for example, IntelliJ allows setting directories as Source Roots.

google.auth.exceptions.DefaultCredentialsError

It's likely that you haven't authenticated your Google Cloud account. See Setting Up Google Cloud

Couldn't connect to Label Studio

Label Studio must be running locally, exposed on localhost:8080. Furthermore, you need to specify the LABEL_STUDIO_API_KEY environment variable. See Setting Up Label Studio

Cannot log in to W&B

You need to authenticate your W&B account. See Setting Up Weights and Biases. If you're facing difficulties, set the WANDB_MODE environment variable to offline to disable W&B.

Our Repository Structure

Before starting development, take a look at our repository structure. This will help you understand where to put your code.

Core Dependencies
Resources
Tests
Repo Dependencies
Dataset Loaders
Preprocessing Fn.
Train Deps
Model Architectures
Datasets ...
FRDC
src/frdc/
rsc/
tests/
pyproject.toml,poetry.lock
./load/
./preprocess/
./train/
./models/
./dataset_name/
src/frdc/

Source Code for our package. These are the unit components of our pipeline.

rsc/

Resources. These are usually cached datasets.

tests/

PyTest tests. These are unit, integration, and model tests.

Unit, Integration, and Pipeline Tests

We have 3 types of tests:

  • Unit Tests are usually small, single function tests.

  • Integration Tests are larger tests that test a mock pipeline.

  • Model Tests are the true production pipeline tests that will generate a model.

Where Should I contribute?

Changing a small component

If you're changing a small component, such as an argument for preprocessing, a new model architecture, or a new configuration for a dataset, take a look at the src/frdc/ directory.

Adding a test

If you're adding a new component, you'll need to add a new test. Take a look at the tests/ directory.

Changing the model pipeline

If you're a ML Researcher, you'll probably be changing the pipeline. Take a look at the tests/model_tests/ directory.

Adding a dependency

If you're adding a new dependency, use poetry add PACKAGE and commit the changes to pyproject.toml and poetry.lock.

Last modified: 26 December 2023
\ No newline at end of file +

Or, set it in your IDE, for example, IntelliJ allows setting directories as Source Roots.

google.auth.exceptions.DefaultCredentialsError

It's likely that you haven't authenticated your Google Cloud account. See Setting Up Google Cloud

Couldn't connect to Label Studio

Label Studio must be running locally, exposed on localhost:8080. Furthermore, you need to specify the LABEL_STUDIO_API_KEY environment variable. See Setting Up Label Studio

Cannot log in to W&B

You need to authenticate your W&B account. See Setting Up Weights and Biases. If you're facing difficulties, set the WANDB_MODE environment variable to offline to disable W&B.

Our Repository Structure

Before starting development, take a look at our repository structure. This will help you understand where to put your code.

Core Dependencies
Resources
Tests
Repo Dependencies
Dataset Loaders
Preprocessing Fn.
Train Deps
Model Architectures
Datasets ...
FRDC
src/frdc/
rsc/
tests/
pyproject.toml,poetry.lock
./load/
./preprocess/
./train/
./models/
./dataset_name/
src/frdc/

Source Code for our package. These are the unit components of our pipeline.

rsc/

Resources. These are usually cached datasets.

tests/

PyTest tests. These are unit, integration, and model tests.

Unit, Integration, and Pipeline Tests

We have 3 types of tests:

  • Unit Tests are usually small, single function tests.

  • Integration Tests are larger tests that test a mock pipeline.

  • Model Tests are the true production pipeline tests that will generate a model.

Where Should I contribute?

Changing a small component

If you're changing a small component, such as an argument for preprocessing, a new model architecture, or a new configuration for a dataset, take a look at the src/frdc/ directory.

Adding a test

If you're adding a new component, you'll need to add a new test. Take a look at the tests/ directory.

Changing the model pipeline

If you're a ML Researcher, you'll probably be changing the pipeline. Take a look at the tests/model_tests/ directory.

Adding a dependency

If you're adding a new dependency, use poetry add PACKAGE and commit the changes to pyproject.toml and poetry.lock.

Last modified: 27 December 2023
\ No newline at end of file diff --git a/docs/load-dataset.html b/docs/load-dataset.html index dc575cd2..4f936c9f 100644 --- a/docs/load-dataset.html +++ b/docs/load-dataset.html @@ -1,14 +1,14 @@ - load.dataset | Documentation

Documentation 0.0.7 Help

load.dataset

Usage

Firstly, to load a dataset instance, you need to initialize a FRDCDataset object, providing the site, date, and version.

For example, to load our Chestnut Nature Park dataset.

+ load.dataset | Documentation

Documentation 0.0.8 Help

load.dataset

Usage

Firstly, to load a dataset instance, you need to initialize a FRDCDataset object, providing the site, date, and version.

For example, to load our Chestnut Nature Park dataset.

from frdc.load import FRDCDataset ds = FRDCDataset(site='chestnut_nature_park', date='20201218', version=None) -

Then, we can use the ds object to load objects of the dataset:

+

Then, we can use the ds object to load objects of the dataset:

ar, order = ds.get_ar_bands() d = ds.get_ar_bands_as_dict() bounds, labels = ds.get_bounds_and_labels() -
  • ar is a stacked NDArray of the hyperspectral bands of shape (H x W x C)

  • order is a list of strings, containing the names of the bands, ordered according to the channels of ar

  • d is a dictionary of the hyperspectral bands of shape (H x W), keyed by the band names

  • bounds is a list of bounding boxes, in the format of Rect, a namedtuple of x0, y0, x1, y1

  • labels is a list of strings, containing the labels of the bounding boxes, ordered according to bounds

Filters

You can also selectively get the channels for both get_ar_bands() and get_ar_bands_as_dict() by providing a list of strings to the bands argument.

For example, to get the Wideband RGB bands, you can do:

+
  • ar is a stacked NDArray of the hyperspectral bands of shape (H x W x C)

  • order is a list of strings, containing the names of the bands, ordered according to the channels of ar

  • d is a dictionary of the hyperspectral bands of shape (H x W), keyed by the band names

  • bounds is a list of bounding boxes, in the format of Rect, a namedtuple of x0, y0, x1, y1

  • labels is a list of strings, containing the labels of the bounding boxes, ordered according to bounds
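
To make the shapes above concrete, here is a quick sketch that inspects these return values, reusing the ds object from the example above (the printed values are illustrative):

ar, order = ds.get_ar_bands()
print(ar.shape)  # (H, W, C), where C == len(order)
print(order)     # e.g. ['WB', 'WG', 'WR', ...] (illustrative)

bounds, labels = ds.get_bounds_and_labels()
x0, y0, x1, y1 = bounds[0]  # Rect is a namedtuple, so it unpacks directly
print(labels[0])            # the label string for bounds[0]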

Filters

You can also selectively get the channels for both get_ar_bands() and get_ar_bands_as_dict() by providing a list of strings to the bands argument.

For example, to get the Wideband RGB bands, you can do:

ar, order = ds.get_ar_bands(bands=['WR', 'WG', 'WB']) d = ds.get_ar_bands_as_dict(bands=['WR', 'WG', 'WB']) -

This will also alter the channel order to the order of the bands provided.

See load.gcs for configuration options.

Last modified: 26 December 2023
\ No newline at end of file +

This will also alter the channel order to the order of the bands provided.

See load.gcs for configuration options.

Last modified: 27 December 2023
\ No newline at end of file diff --git a/docs/load-gcs.html b/docs/load-gcs.html index f02e3959..b94802eb 100644 --- a/docs/load-gcs.html +++ b/docs/load-gcs.html @@ -1,11 +1,11 @@ - load.gcs | Documentation

Documentation 0.0.7 Help

load.gcs

Usage

These are defined in the top-level load.gcs module.

list_gcs_datasets

Lists all datasets in the bucket as a DataFrame. This works by checking which folders have a specific file, which we call the anchor.

download

Downloads a file from Google Cloud Storage and returns the local file path.

open_file

Downloads and opens a file from Google Cloud Storage. Returns a file handle.

open_image

Downloads and returns the PIL image from Google Cloud Storage.

Pathing

The path to specify is relative to the bucket, which is frdc-ds by default.

For example, this filesystem on GCS:

+ load.gcs | Documentation

Documentation 0.0.8 Help

load.gcs

Usage

These are defined in the top-level load.gcs module.

list_gcs_datasets

Lists all datasets in the bucket as a DataFrame. This works by checking which folders have a specific file, which we call the anchor.

download

Downloads a file from Google Cloud Storage and returns the local file path.

open_file

Downloads and opens a file from Google Cloud Storage. Returns a file handle.

open_image

Downloads and returns the PIL image from Google Cloud Storage.
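
A minimal usage sketch of these functions, assuming they are importable from frdc.load.gcs (the paths are illustrative, following the example layout in Pathing below):

from frdc.load.gcs import download, open_file, open_image

# Paths are relative to the bucket root.
local_path = download("chestnut_nature_park/20201218/90deg/bounds.json")

with open_file("chestnut_nature_park/20201218/90deg/bounds.json") as f:
    data = f.read()

im = open_image("chestnut_nature_park/20201218/90deg/result.jpg")  # PIL image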

Pathing

The path to specify is relative to the bucket, which is frdc-ds by default.

For example, this filesystem on GCS:

# On Google Cloud Storage frdc-ds ├── chestnut_nature_park │ └── 20201218 │ └── 90deg │ └── bounds.json -

To download bounds.json, use download(r"chestnut_nature_park/20201218/90deg/bounds.json"). By default, all files will be downloaded to PROJ_DIR/rsc/....

+

To download bounds.json, use download(r"chestnut_nature_park/20201218/90deg/bounds.json"). By default, all files will be downloaded to PROJ_DIR/rsc/....

# On local filesystem PROJ_DIR ├── rsc @@ -13,4 +13,4 @@ │ └── 20201218 │ └── 90deg │ └── bounds.json -

Configuration

If you need granular control over

  • where the files are downloaded

  • the credentials used

  • the project used

  • the bucket used

Then edit conf.py.

GCS_CREDENTIALS

Google Cloud credentials.


A google.oauth2.service_account.Credentials object. See the object documentation for more information.

LOCAL_DATASET_ROOT_DIR

Local directory to download files to.


Path to a directory, or a Path object.

GCS_PROJECT_ID

Google Cloud project ID.


GCS_BUCKET_NAME

Google Cloud Storage bucket name.


Last modified: 26 December 2023
\ No newline at end of file +

Configuration

If you need granular control over

  • where the files are downloaded

  • the credentials used

  • the project used

  • the bucket used

Then edit conf.py.

GCS_CREDENTIALS

Google Cloud credentials.


A google.oauth2.service_account.Credentials object. See the object documentation for more information.

LOCAL_DATASET_ROOT_DIR

Local directory to download files to.


Path to a directory, or a Path object.

GCS_PROJECT_ID

Google Cloud project ID.


GCS_BUCKET_NAME

Google Cloud Storage bucket name.


Last modified: 27 December 2023
\ No newline at end of file diff --git a/docs/mix-match-module.html b/docs/mix-match-module.html index e8d5df2e..ceef074e 100644 --- a/docs/mix-match-module.html +++ b/docs/mix-match-module.html @@ -1,4 +1,4 @@ - MixMatch Module | Documentation

Documentation 0.0.7 Help

MixMatch Module

See frdc/train/mixmatch_module.py.

Quick Recap

We will go over the essential parts of the code here. Before that, we revise some of the concepts that are used in the code.

Abstract Methods

In Python, we can define abstract methods using the abc module. Just like in other OOP languages, abstract methods must be implemented by the child class.

For example:

+ MixMatch Module | Documentation

Documentation 0.0.8 Help

MixMatch Module

See frdc/train/mixmatch_module.py.

Quick Recap

We will go over the essential parts of the code here. Before that, we revise some of the concepts that are used in the code.

Abstract Methods

In Python, we can define abstract methods using the abc module. Just like in other OOP languages, abstract methods must be implemented by the child class.

For example:

from abc import ABC, abstractmethod @@ -11,7 +11,7 @@ class MyChildClass(MyAbstractClass): def my_abstract_method(self): print("Hello World!") -

nn.Module & LightningModule

If you're unfamiliar with PyTorch, you should read the nn.Module Documentation.

nn.Module is the base class for all neural network modules in PyTorch, while LightningModule is a PyTorch Lightning class that extends nn.Module, providing it with additional functionality that reduces boilerplate code.

By implementing it as a LightningModule, we also enter the PyTorch Lightning ecosystem, which provides us with a lot of useful features such as logging, early stopping, and more.

What do we implement in a Module?

One key component that nn.Module requires is the model. For example:

+

nn.Module & LightningModule

If you're unfamiliar with PyTorch, you should read the nn.Module Documentation.

nn.Module is the base class for all neural network modules in PyTorch, while LightningModule is a PyTorch Lightning class that extends nn.Module, providing it with additional functionality that reduces boilerplate code.

By implementing it as a LightningModule, we also enter the PyTorch Lightning ecosystem, which provides us with a lot of useful features such as logging, early stopping, and more.

What do we implement in a Module?

One key component that nn.Module requires is the model. For example:

class MyModule(nn.Module): def __init__(self): super().__init__() @@ -23,7 +23,7 @@ def forward(self, x): return self.model(x) -

PyTorch Lightning builds on top of it, requiring training_step and validation_step. Each "step" is a batch of data, and the model is trained on it. So for example:

+

PyTorch Lightning builds on top of it, requiring training_step and validation_step. Each "step" is a batch of data, and the model is trained on it. So for example:

class MyModule(LightningModule): def __init__(self): ... @@ -40,7 +40,7 @@ y_hat = self(x) loss = F.cross_entropy(y_hat, y) return loss -

Usually, the training and validation steps are the same, but in some cases, such as MixMatch, they are different. In MixMatch, not only do we use a different loss function for training, we also handle batches differently. The PyTorch Lightning framework allows us to separate the two and implement them separately.

Model Embedded Preprocessing on_before_batch_transfer

In PyTorch Lightning, we can also inject a step before the batch is passed to the model. This is done by overriding the on_before_batch_transfer method.

Batch
on_before_batch_transfer
training_step
validation_step

This allows us to do preprocessing on the batch, such as scaling the data, encoding the labels, and more.

Custom EMA Update on_after_backward

We also leverage another hook, called on_after_backward. This hook is called after the backward pass, and allows us to do custom operations. In our case, we use it to update the EMA model.

Batch
training_step
on_after_backward
update_ema

MixMatch

We recommend having tests/model_tests/chestnut_dec_may/train.py open while reading this section. It implements a real-world example of MixMatch.

As a summary:

  1. We learned what an abstract method is, and how to implement it

  2. We implement the model in LightningModule much like we would in nn.Module

  3. We implement on_before_batch_transfer to preprocess the batch

  4. Finally, we implement on_after_backward to update the EMA model

With the above in mind, let's look at the MixMatch implementation.

forward (abstract)

Forward pass of the model

ema_model (abstract)

The model that is used for EMA. We expect this property to be implemented by the child class.

update_ema (abstract)

The method to update the EMA model. We expect this method to be implemented by the child class.

loss_unl_scaler (static)

Takes in the current progress of the training, 0.0 to 1.0, where 0.0 is the start of the training, and 1.0 is the end. Then, returns the multiplier for the unlabeled loss.

loss_lbl (static)

Implements the loss for labeled data. Takes in the predicted labels and the ground truth labels, and returns the loss. This is cross entropy for MixMatch.

loss_unl (static)

Implements the loss for unlabeled data. Takes in the predicted labels and the ground truth labels, and returns the loss. This is MSE for MixMatch.

mixup

Takes in the data and the labels, the beta distribution parameter, and returns the mixed data and labels.

sharpen

Takes in the labels and temperature, and returns the sharpened labels.

guess_labels

Takes in the unlabeled data, and returns the guessed labels.

progress

The current progress of the training, 0.0 to 1.0, where 0.0 is the start of the training, and 1.0 is the end.

training_step

The training step runs through 1 batch of data and returns the loss. Note that this is significantly different from the validation step, as we handle the K-Augmented data differently.

test / validation_step

The test / validation step runs through 1 batch of data, and returns the loss.

predict_step

The predict step runs through 1 batch of data, and returns the actual decoded labels.

on_after_backward

The on_after_backward hook is called after the backward pass, and allows us to do custom operations. In our case, we use it to update the EMA model.

on_before_batch_transfer

The on_before_batch_transfer hook is called before the batch is transferred to the GPU. In our case, we use it to preprocess the batch.

A diagram of how these components interact with each other is shown below:

Batch
on_before_batch_transfer
training_step
guess_labels
sharpen
mix_up
loss_unl
loss_unl_scaler
loss
loss_lbl
backward
on_after_backward
update_ema
validation_step
loss

Finally, we show an example of how to use the MixMatch module:

+

Usually, the training and validation steps are the same, but in some cases, such as MixMatch, they are different. In MixMatch, not only do we use a different loss function for training, we also handle batches differently. The PyTorch Lightning framework allows us to separate the two and implement them separately.

Model Embedded Preprocessing on_before_batch_transfer

In PyTorch Lightning, we can also inject a step before the batch is passed to the model. This is done by overriding the on_before_batch_transfer method.

Batch
on_before_batch_transfer
training_step
validation_step

This allows us to do preprocessing on the batch, such as scaling the data, encoding the labels, and more.
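
As a sketch of what such a hook could look like (the scale and encode calls are placeholders, not the actual FRDC implementation):

class MyModule(LightningModule):
    ...
    def on_before_batch_transfer(self, batch, dataloader_idx):
        # Runs before the batch is moved to the device: a convenient
        # place for CPU-side preprocessing such as scaling and encoding.
        x, y = batch
        return self.scale(x), self.encode(y)  # placeholder transforms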

Custom EMA Update on_after_backward

We also leverage another hook, called on_after_backward. This hook is called after the backward pass, and allows us to do custom operations. In our case, we use it to update the EMA model.

Batch
training_step
on_after_backward
update_ema
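
As a sketch, a typical EMA update over the model parameters might look like this (the decay value is illustrative):

import torch

@torch.no_grad()
def update_ema(model, ema_model, decay: float = 0.999):
    # Exponential moving average of the trained model's weights.
    for p, ema_p in zip(model.parameters(), ema_model.parameters()):
        ema_p.mul_(decay).add_(p, alpha=1 - decay)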

MixMatch

We recommend having tests/model_tests/chestnut_dec_may/train.py open while reading this section. It implements a real-world example of MixMatch.

As a summary:

  1. We learned what an abstract method is, and how to implement it

  2. We implement the model in LightningModule much like we would in nn.Module

  3. We implement on_before_batch_transfer to preprocess the batch

  4. Finally, we implement on_after_backward to update the EMA model

With the above in mind, let's look at the MixMatch implementation.

forward (abstract)

Forward pass of the model

ema_model (abstract)

The model that is used for EMA. We expect this property to be implemented by the child class.

update_ema (abstract)

The method to update the EMA model. We expect this method to be implemented by the child class.

loss_unl_scaler (static)

Takes in the current progress of the training, 0.0 to 1.0, where 0.0 is the start of the training, and 1.0 is the end. Then, returns the multiplier for the unlabeled loss.

loss_lbl (static)

Implements the loss for labeled data. Takes in the predicted labels and the ground truth labels, and returns the loss. This is cross entropy for MixMatch.

loss_unl (static)

Implements the loss for unlabeled data. Takes in the predicted labels and the ground truth labels, and returns the loss. This is MSE for MixMatch.

mixup

Takes in the data and the labels, the beta distribution parameter, and returns the mixed data and labels.

sharpen

Takes in the labels and temperature, and returns the sharpened labels.

guess_labels

Takes in the unlabeled data, and returns the guessed labels.

progress

The current progress of the training, 0.0 to 1.0, where 0.0 is the start of the training, and 1.0 is the end.

training_step

The training step runs through 1 batch of data and returns the loss. Note that this is significantly different from the validation step, as we handle the K-Augmented data differently.

test / validation_step

The test / validation step runs through 1 batch of data, and returns the loss.

predict_step

The predict step runs through 1 batch of data, and returns the actual decoded labels.

on_after_backward

The on_after_backward hook is called after the backward pass, and allows us to do custom operations. In our case, we use it to update the EMA model.

on_before_batch_transfer

The on_before_batch_transfer hook is called before the batch is transferred to the GPU. In our case, we use it to preprocess the batch.
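
To ground two of these components, here are hedged sketches of sharpen and mixup following the MixMatch paper (not necessarily line-for-line with our implementation):

import torch

def sharpen(y: torch.Tensor, temp: float) -> torch.Tensor:
    # Raise predictions to the power 1/T and renormalize;
    # a lower temperature yields sharper (more confident) labels.
    y_sharp = y ** (1 / temp)
    return y_sharp / y_sharp.sum(dim=1, keepdim=True)

def mixup(x: torch.Tensor, y: torch.Tensor, alpha: float):
    # Sample the mix ratio from Beta(alpha, alpha), biased towards the
    # first argument as in MixMatch, then mix with a shuffled batch.
    lam = torch.distributions.Beta(alpha, alpha).sample()
    lam = torch.max(lam, 1 - lam)
    perm = torch.randperm(x.shape[0])
    return lam * x + (1 - lam) * x[perm], lam * y + (1 - lam) * y[perm]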

A diagram of how these components interact with each other is shown below:

Batch
on_before_batch_transfer
training_step
guess_labels
sharpen
mix_up
loss_unl
loss_unl_scaler
loss
loss_lbl
backward
on_after_backward
update_ema
validation_step
loss

Finally, we show an example of how to use the MixMatch module:

from sklearn.preprocessing import StandardScaler, OrdinalEncoder from frdc.train.mixmatch_module import MixMatchModule @@ -60,7 +60,7 @@ sharpen_temp=0.5, mix_beta_alpha=0.75, ) -

In particular, we need to supply some transformations for the preprocessing step. In this case, we use StandardScaler to scale the data, and OrdinalEncoder to encode the labels.

  1. It's best if standardization is done only on the training data, and not the validation data to better fit real-world scenarios.

  2. We use OrdinalEncoder as it handles unseen labels. So if a class doesn't show up in the training data, it will be encoded as np.nan, and will not participate in the loss calculation.

Design Choices

Static Method Overriding

We implement many functions as static, as we believe that a functional style reduces dependencies, thus making the code easier to test and debug.

Furthermore, it allows the subclasses to easily override the functions, to customize the behavior of the MixMatch module.

For example, the loss_unl_scaler function is static, thus, we can implement our own scaling function, and pass it to the MixMatch module.

+

In particular, we need to supply some transformations for the preprocessing step. In this case, we use StandardScaler to scale the data, and OrdinalEncoder to encode the labels.

  1. It's best if standardization is done only on the training data, and not the validation data to better fit real-world scenarios.

  2. We use OrdinalEncoder as it handles unseen labels. So if a class doesn't show up in the training data, it will be encoded as np.nan, and will not participate in the loss calculation.
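
As a sketch, an OrdinalEncoder can be configured this way in recent scikit-learn versions (the labels here are illustrative):

import numpy as np
from sklearn.preprocessing import OrdinalEncoder

enc = OrdinalEncoder(
    handle_unknown="use_encoded_value",
    unknown_value=np.nan,
)
enc.fit([["Oak"], ["Maple"]])
print(enc.transform([["Oak"], ["Elm"]]))  # "Elm" is unseen, encoded as nan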

Design Choices

Static Method Overriding

We implement many functions as static, as we believe that a functional style reduces dependencies, thus making the code easier to test and debug.

Furthermore, it allows the subclasses to easily override the functions, to customize the behavior of the MixMatch module.

For example, the loss_unl_scaler function is static, thus, we can implement our own scaling function, and pass it to the MixMatch module.

def my_loss_unl_scaler(progress: float) -> float: return progress ** 2 @@ -68,4 +68,4 @@ @staticmethod def loss_unl_scaler(progress: float) -> float: return my_loss_unl_scaler(progress) -

If we had used a method instead, we would have to consider instance state, which would make it harder to override.

Why not use Dataclasses?

One of the biggest caveats of nn.Module is that it requires super().__init__() to be called before anything is assigned. While dataclass can leverage __post_init__ to do the same, we felt that this was too much of a hassle to save a few keystrokes. Thus, we opted to use __init__ instead; while more verbose, it is more explicit.

Why use PyTorch Lightning?

While we did hit some roadblocks implementing SSL, due to its complex and unconventional nature, we felt that the benefits of using PyTorch Lightning outweighed the cons.

on_before_batch_transfer and on_after_backward are unconventional hooks, and we had to do some digging to find them. It can be argued that by just writing explicit code, we can avoid the need for these hooks, but the PyTorch ecosystem fixes many other issues, so we let this one slide.

References

Last modified: 26 December 2023
\ No newline at end of file +

If we had used a method instead, we would have to consider instance state, which would make it harder to override.

Why not use Dataclasses?

One of the biggest caveats of nn.Module is that it requires super().__init__() to be called before anything is assigned. While dataclass can leverage __post_init__ to do the same, we felt that this was too much of a hassle to save a few keystrokes. Thus, we opted to use __init__ instead; while more verbose, it is more explicit.

Why use PyTorch Lightning?

While we did hit some roadblocks implementing SSL, due to its complex and unconventional nature, we felt that the benefits of using PyTorch Lightning outweighed the cons.

on_before_batch_transfer and on_after_backward are unconventional hooks, and we had to do some digging to find them. It can be argued that by just writing explicit code, we can avoid the need for these hooks, but the PyTorch ecosystem fixes many other issues, so we let this one slide.

References

Last modified: 27 December 2023
\ No newline at end of file diff --git a/docs/mix-match.html b/docs/mix-match.html index a1dd4670..514c3298 100644 --- a/docs/mix-match.html +++ b/docs/mix-match.html @@ -1 +1 @@ - MixMatch | Documentation

Documentation 0.0.7 Help

MixMatch

In FRDC-ML, we leverage semi-supervised learning to improve the model's performance through better augmentation consistency and the use of unlabelled data.

The algorithm we use is MixMatch, a state-of-the-art semi-supervised learning algorithm. It is based on the idea of consistency regularization, which encourages models to predict the same class even after augmentations that occur naturally in the real world.

Our implementation of MixMatch is a refactored version of YU1ut/MixMatch-pytorch. We've refactored the code to follow more modern PyTorch practices, allowing us to utilize it with modern PyTorch frameworks such as PyTorch Lightning.

We won't go through the details of MixMatch here; see Our Documentation in our MixMatch-PyTorch-CIFAR10 repository for more details.

Implementation Details

  1. How we implemented the MixMatch logic MixMatchModule

  2. How we implemented the unique MixMatch data loading logic Custom MixMatch Data Loading

References

Last modified: 26 December 2023
\ No newline at end of file + MixMatch | Documentation

Documentation 0.0.8 Help

MixMatch

In FRDC-ML, we leverage semi-supervised learning to improve the model's performance through better augmentation consistency and the use of unlabelled data.

The algorithm we use is MixMatch, a state-of-the-art semi-supervised learning algorithm. It is based on the idea of consistency regularization, which encourages models to predict the same class even after augmentations that occur naturally in the real world.

Our implementation of MixMatch is a refactored version of YU1ut/MixMatch-pytorch. We've refactored the code to follow more modern PyTorch practices, allowing us to utilize it with modern PyTorch frameworks such as PyTorch Lightning.

We won't go through the details of MixMatch here; see Our Documentation in our MixMatch-PyTorch-CIFAR10 repository for more details.

Implementation Details

  1. How we implemented the MixMatch logic MixMatchModule

  2. How we implemented the unique MixMatch data loading logic Custom MixMatch Data Loading

References

Last modified: 27 December 2023
\ No newline at end of file diff --git a/docs/model-test-chestnut-may-dec.html b/docs/model-test-chestnut-may-dec.html index 57959a5f..1de3317d 100644 --- a/docs/model-test-chestnut-may-dec.html +++ b/docs/model-test-chestnut-may-dec.html @@ -1 +1 @@ - Model Test Chestnut May-Dec | Documentation

Documentation 0.0.7 Help

Model Test Chestnut May-Dec

This test is used to evaluate the model performance on the Chestnut Nature Park May & December dataset.

See this script in model_tests/chestnut_dec_may/train.py.

Motivation

This model will be used to classify trees in unseen datasets under different conditions. In this test, we'll evaluate it in a different season.

A caveat is that it'll be evaluated on the same set of trees, so it's not representative of a field test. However, given the difficulty of obtaining datasets, this still gives us a good preliminary idea of how the model will perform in different conditions.

Methodology

We train on the December dataset, and test on the May dataset.

Labelled Train
Unlabelled Train
Test
DecDataset
Model
MayDataset

Despite not having any true unlabelled data, we use MixMatch by treating the labelled data of the December dataset as unlabelled data.

Model

The current model is a simple InceptionV3 transfer learning model, with the last layer replaced by fully connected layer(s).

SSL Loss
Input
InceptionV3 Frozen
FC Layer(s)
Softmax
Output

Preprocessing

For Training:

Segment
RandomCrop 299
Horizontal Flip 50%
Vertical Flip 50%
Normalize By Training Mean & Std

For Validation:

Segment
CenterCrop 299
Normalize By Training Mean & Std

For Evaluation:

Segment
CenterCrop 299
Normalize By Training Mean & Std
As Is
Horizontal Flip
Vertical Flip
Horizontal & Vertical Flip

For evaluation, we verify that the model is invariant to horizontal and vertical flips by evaluating on the original image as well as its flipped variants.

Hyperparameters

The following hyperparameters are used:

  • Optimizer: Adam

  • Learning Rate: 1e-3

  • Batch Size: 32

  • Epochs: 10

  • Train Iterations: 25~100

  • Validation Iterations: 10~25

  • Early Stopping: 4

Results

We achieve around 40% accuracy on the test set, compared to 100% on the training set. This indicates that the model has saturated and can no longer learn from the training set. There's no indication of overfitting, as the validation loss simply plateaus.

W&B Dashboard

Caveats

  • The test set is very small, so the results are not very representative.

  • The test set is the same set of trees, so it's not a true test of the model performance in different conditions.

  • There are many classes with 1 sample, so the model may not be able to learn the features of these classes well.

Last modified: 26 December 2023
\ No newline at end of file + Model Test Chestnut May-Dec | Documentation

Documentation 0.0.8 Help

Model Test Chestnut May-Dec

This test is used to evaluate the model performance on the Chestnut Nature Park May & December dataset.

See this script in model_tests/chestnut_dec_may/train.py.

Motivation

This model will be used to classify trees in unseen datasets under different conditions. In this test, we'll evaluate it in a different season.

A caveat is that it'll be evaluated on the same set of trees, so it's not representative of a field test. However, given the difficulty of obtaining datasets, this still gives us a good preliminary idea of how the model will perform in different conditions.

Methodology

We train on the December dataset, and test on the May dataset.

Labelled Train
Unlabelled Train
Test
DecDataset
Model
MayDataset

Despite not having any true unlabelled data, we use MixMatch by treating the labelled data of the December dataset as unlabelled data.

Model

The current model is a simple InceptionV3 transfer learning model, with the last layer replaced by fully connected layer(s).

SSL Loss
Input
InceptionV3 Frozen
FC Layer(s)
Softmax
Output

Preprocessing

For Training:

Segment
RandomCrop 299
Horizontal Flip 50%
Vertical Flip 50%
Normalize By Training Mean & Std

For Validation:

Segment
CenterCrop 299
Normalize By Training Mean & Std

For Evaluation:

Segment
CenterCrop 299
Normalize By Training Mean & Std
As Is
Horizontal Flip
Vertical Flip
Horizontal & Vertical Flip

For evaluation, we verify that the model is invariant to horizontal and vertical flips by evaluating on the original image as well as its flipped variants.

Hyperparameters

The following hyperparameters are used:

  • Optimizer: Adam

  • Learning Rate: 1e-3

  • Batch Size: 32

  • Epochs: 10

  • Train Iterations: 25~100

  • Validation Iterations: 10~25

  • Early Stopping: 4
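
As a sketch, these could be collected into a single config block (the names are illustrative; see the training script for the actual values):

# Illustrative hyperparameters; see
# tests/model_tests/chestnut_dec_may/train.py for the real values.
hparams = dict(
    optimizer="adam",
    lr=1e-3,
    batch_size=32,
    epochs=10,
    train_iters=25,  # 25~100 in practice
    val_iters=10,    # 10~25 in practice
    early_stopping_patience=4,
)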

Results

We achieve around 40% accuracy on the test set, compared to 100% on the training set. This indicates that the model has saturated and can no longer learn from the training set. There's no indication of overfitting, as the validation loss simply plateaus.

W&B Dashboard

Caveats

  • The test set is very small, so the results are not very representative.

  • The test set is the same set of trees, so it's not a true test of the model performance in different conditions.

  • There are many classes with 1 sample, so the model may not be able to learn the features of these classes well.

Last modified: 27 December 2023
\ No newline at end of file diff --git a/docs/overview.html b/docs/overview.html index 11b6ffd2..0a5fbe11 100644 --- a/docs/overview.html +++ b/docs/overview.html @@ -1 +1 @@ - Overview | Documentation

Documentation 0.0.7 Help

Overview

Forest Recovery Digital Companion (FRDC) is an ML-assisted companion for ecologists to automatically classify surveyed trees via an Unmanned Aerial Vehicle (UAV).

This package, FRDC-ML, is the Machine Learning backbone of this project, a centralized repository of tools and model architectures to be used in the FRDC pipeline.

Get started here

Other Projects

FRDC-UI

The User Interface Repository for FRDC, a WebApp GUI for ecologists to adjust annotations.

Last modified: 26 December 2023
\ No newline at end of file + Overview | Documentation

Documentation 0.0.8 Help

Overview

Forest Recovery Digital Companion (FRDC) is an ML-assisted companion for ecologists to automatically classify surveyed trees via an Unmanned Aerial Vehicle (UAV).

This package, FRDC-ML, is the Machine Learning backbone of this project, a centralized repository of tools and model architectures to be used in the FRDC pipeline.

Get started here

Other Projects

FRDC-UI

The User Interface Repository for FRDC, a WebApp GUI for ecologists to adjust annotations.

Last modified: 27 December 2023
\ No newline at end of file diff --git a/docs/preprocessing-extract-segments.html b/docs/preprocessing-extract-segments.html index a2c6eff0..ee047f28 100644 --- a/docs/preprocessing-extract-segments.html +++ b/docs/preprocessing-extract-segments.html @@ -1,4 +1,4 @@ - preprocessing.extract_segments | Documentation

Documentation 0.0.7 Help

preprocessing.extract_segments

Functions

extract_segments_from_labels

Extracts segments from a label classification.

extract_segments_from_bounds

Extracts segments from Rect bounds.

remove_small_segments_from_labels

Removes small segments from a label classification.

Extract with Boundaries

A boundary is a Rect object that represents the minimum bounding box of a segment, with x0, y0, x1, y1 coordinates.

It simply slices the original image to the bounding box. The origin is the top left corner of the image.

+ preprocessing.extract_segments | Documentation

Documentation 0.0.8 Help

preprocessing.extract_segments

Functions

extract_segments_from_labels

Extracts segments from a label classification.

extract_segments_from_bounds

Extracts segments from Rect bounds.

remove_small_segments_from_labels

Removes small segments from a label classification.

Extract with Boundaries

A boundary is a Rect object that represents the minimum bounding box of a segment, with x0, y0, x1, y1 coordinates.

It simply slices the original image to the bounding box. The origin is the top left corner of the image.

+-----------------+ +-----------+ | Original | | Segmented | | Image | | Image | @@ -9,7 +9,7 @@ +-----+-----+-----+ 1, 2, 0, 2 +-----+-----+ | 7 | 8 | 9 | x0 y0 x1 y1 | 8 | 9 | +-----+-----+-----+ +-----+-----+ -
+
+-----------------+ +-----------------+ | Original | | Segmented | | Image | | Image | @@ -20,7 +20,7 @@ +-----+-----+-----+ 1, 2, 0, 2 +-----+-----+-----+ | 7 | 8 | 9 | x0 y0 x1 y1 | 0 | 8 | 9 | +-----+-----+-----+ +-----+-----+-----+ -

Extract with Labels

A label classification is an np.ndarray where each pixel is mapped to a segment, and each segment is mapped to a unique integer. In our project, the 0th label is the background.

For example, a label classification of 3 segments will look like this:

+

Extract with Labels

A label classification is an np.ndarray where each pixel is mapped to a segment, and each segment is mapped to a unique integer. In our project, the 0th label is the background.

For example, a label classification of 3 segments will look like this:

+-----------------+ +-----------------+ | Label | | Original | | Classification | | Image | @@ -31,7 +31,7 @@ +-----+-----+-----+ +-----+-----+-----+ | 1 | 1 | 0 | | 7 | 8 | 9 | +-----+-----+-----+ +-----+-----+-----+ -

The extraction will take the minimum bounding box of each segment and return a list of segments.

For example, the label 1 and 2 extracted images will be

+

The extraction will take the minimum bounding box of each segment and return a list of segments.

For example, the label 1 and 2 extracted images will be

+-----------+ +-----------+ | Extracted | | Extracted | | Segment 1 | | Segment 2 | @@ -42,7 +42,7 @@ +-----+-----+ +-----+-----+ | 7 | 8 | +-----+-----+ -
+
+-----------------+ +-----------------+ | Extracted | | Extracted | | Segment 1 | | Segment 2 | @@ -53,7 +53,7 @@ +-----+-----+-----+ +-----+-----+-----+ | 7 | 8 | 0 | | 0 | 0 | 0 | +-----+-----+-----+ +-----+-----+-----+ -
  • If cropped is False, the segments are padded with 0s to the original image size. While this can ensure shape consistency, it can consume more memory for large images.

  • If cropped is True, the segments are cropped to the minimum bounding box. This can save memory, but the shape of the segments will be inconsistent.

Usage

Extract from Bounds and Labels

Extract segments from bounds and labels.

+
  • If cropped is False, the segments are padded with 0s to the original image size. While this can ensure shape consistency, it can consume more memory for large images.

  • If cropped is True, the segments are cropped to the minimum bounding box. This can save memory, but the shape of the segments will be inconsistent.

Usage

Extract from Bounds and Labels

Extract segments from bounds and labels.

import numpy as np from frdc.load import FRDCDataset from frdc.preprocess.extract_segments import extract_segments_from_bounds @@ -65,7 +65,7 @@ bounds, labels = ds.get_bounds_and_labels() segments: list[np.ndarray] = extract_segments_from_bounds(ar, bounds) -

Extract from Auto-Segmentation

Extract segments from a label classification.

+

Extract from Auto-Segmentation

Extract segments from a label classification.

from skimage.morphology import remove_small_objects, remove_small_holes import numpy as np @@ -91,4 +91,4 @@ min_height=10, min_width=10) segments: list[np.ndarray] = extract_segments_from_labels(ar, ar_labels) -

API

extract_segments_from_labels(ar, ar_labels, cropped)

Extracts segments from a label classification.


ar_labels is a label classification as a np.ndarray

extract_segments_from_bounds(ar, bounds, cropped)

Extracts segments from Rect bounds.


bounds is a list of Rect bounds.

remove_small_segments_from_labels(ar_labels, min_height, min_width)

Removes small segments from a label classification.


Last modified: 26 December 2023
\ No newline at end of file +

API

extract_segments_from_labels(ar, ar_labels, cropped)

Extracts segments from a label classification.


ar_labels is a label classification as a np.ndarray

extract_segments_from_bounds(ar, bounds, cropped)

Extracts segments from Rect bounds.


bounds is a list of Rect bounds.

remove_small_segments_from_labels(ar_labels, min_height, min_width)

Removes small segments from a label classification.


Last modified: 27 December 2023
\ No newline at end of file diff --git a/docs/preprocessing-glcm-padded.html b/docs/preprocessing-glcm-padded.html index ed865bac..4fe5fd50 100644 --- a/docs/preprocessing-glcm-padded.html +++ b/docs/preprocessing-glcm-padded.html @@ -1,4 +1,4 @@ - preprocessing.glcm_padded | Documentation

Documentation 0.0.7 Help

preprocessing.glcm_padded

Functions

glcm_padded

Computes the GLCM of the NDArray bands with padding.

glcm_padded_cached

Computes the GLCM of the NDArray bands with padding, and caches it.

append_glcm_padded_cached

Computes the GLCM of the NDArray bands with padding, caches it, and appends it onto the original array.

Usage

We show a few examples of how to use the GLCM functions.

+ preprocessing.glcm_padded | Documentation

Documentation 0.0.8 Help

preprocessing.glcm_padded

Functions

glcm_padded

Computes the GLCM of the NDArray bands with padding.

glcm_padded_cached

Computes the GLCM of the NDArray bands with padding, and caches it.

append_glcm_padded_cached

Computes the GLCM of the NDArray bands with padding, caches it, and appends it onto the original array.

Usage

We show a few examples of how to use the GLCM functions.

import numpy as np from glcm_cupy import Features @@ -23,4 +23,4 @@ ar_glcm_cached_appended = append_glcm_padded_cached(ar, bin_from=1, bin_to=4, radius=3) -
  • ar_glcm is the GLCM of the original array, with the last dimension being the GLCM features. The number of features is determined by the features parameter, which defaults to all features.

  • ar_glcm_2_features selects only 2 features, with the last dimension being the 2 GLCM features specified.

  • ar_glcm_cached caches the GLCM so that if you call it again, it will return the cached version. It stores its data at the project root dir, under .cache/.

  • ar_glcm_cached_appended is a wrapper around ar_glcm_cached, it appends the GLCM features onto the original array. It's equivalent to calling ar_glcm_cached and then np.concatenate on the final axes.

Caching

GLCM is an expensive operation, thus we recommend caching it if the input parameters will be the same. This is especially useful if you're experimenting with the same dataset with constant parameters.

API

glcm_padded(ar, bin_from, bin_to, radius, step_size, features)

Computes the GLCM of the NDArray bands with padding.


  • ar is the input array

  • bin_from is the upper bound of the input

  • bin_to is the upper bound of the GLCM input, i.e. the resolution that GLCM operates on

  • radius is the radius of the GLCM

  • step_size is the step size of the GLCM

  • features is the list of GLCM features to compute

The return shape is (H, W, C, F), where F is the number of GLCM features.

See glcm_cupy for the GLCM Features.

glcm_padded_cached(ar, bin_from, bin_to, radius, step_size, features)

Computes the GLCM of the NDArray bands with padding, and caches it.


See glcm_padded for the parameters and output shape

append_glcm_padded_cached(ar, bin_from, bin_to, radius, step_size, features)

Computes the GLCM of the NDArray bands with padding, caches it, and appends it onto the original array.


See glcm_padded for the parameters


The return shape is (H, W, C × (F + 1)), where F is the number of GLCM features:

The function automatically flattens the last 2 dimensions of the GLCM features, and appends it onto the original array.

Last modified: 26 December 2023
\ No newline at end of file +
  • ar_glcm is the GLCM of the original array, with the last dimension being the GLCM features. The number of features is determined by the features parameter, which defaults to all features.

  • ar_glcm_2_features selects only 2 features, with the last dimension being the 2 GLCM features specified.

  • ar_glcm_cached caches the GLCM so that if you call it again, it will return the cached version. It stores its data at the project root dir, under .cache/.

  • ar_glcm_cached_appended is a wrapper around ar_glcm_cached, it appends the GLCM features onto the original array. It's equivalent to calling ar_glcm_cached and then np.concatenate on the final axes.
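
As a sketch, the append step is roughly equivalent to the following (shape names follow the bullets above; ar and ar_glcm come from the earlier example):

import numpy as np

# ar:      (H, W, C)     original bands
# ar_glcm: (H, W, C, F)  F GLCM features per band
ar_glcm_flat = ar_glcm.reshape(*ar_glcm.shape[:-2], -1)    # (H, W, C * F)
ar_appended = np.concatenate([ar, ar_glcm_flat], axis=-1)  # (H, W, C + C*F)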

Caching

GLCM is an expensive operation, thus we recommend caching it if the input parameters will be the same. This is especially useful if you're experimenting with the same dataset with constant parameters.

API

glcm_padded(ar, bin_from, bin_to, radius, step_size, features)

Computes the GLCM of the NDArray bands with padding.


  • ar is the input array

  • bin_from is the upper bound of the input

  • bin_to is the upper bound of the GLCM input, i.e. the resolution that GLCM operates on

  • radius is the radius of the GLCM

  • step_size is the step size of the GLCM

  • features is the list of GLCM features to compute

The return shape is (H, W, C, F), where F is the number of GLCM features.

See glcm_cupy for the GLCM Features.

glcm_padded_cached(ar, bin_from, bin_to, radius, step_size, features)

Computes the GLCM of the NDArray bands with padding, and caches it.


See glcm_padded for the parameters and output shape

append_glcm_padded_cached(ar, bin_from, bin_to, radius, step_size, features)

Computes the GLCM of the NDArray bands with padding, caches it, and appends it onto the original array.


See glcm_padded for the parameters


The return shape is (H, W, C × (F + 1)), where F is the number of GLCM features:

The function automatically flattens the last 2 dimensions of the GLCM features, and appends it onto the original array.

Last modified: 27 December 2023
\ No newline at end of file diff --git a/docs/preprocessing-morphology.html b/docs/preprocessing-morphology.html index 6697539f..fbe3df70 100644 --- a/docs/preprocessing-morphology.html +++ b/docs/preprocessing-morphology.html @@ -1,4 +1,4 @@ - preprocessing.morphology | Documentation

Documentation 0.0.7 Help

preprocessing.morphology

Functions

threshold_binary_mask

Thresholds a selected NDArray band to yield a binary mask.

binary_watershed

Performs watershed on a binary mask to yield a mapped label classification

Usage

Perform auto-segmentation on a dataset to yield a label classification.

+ preprocessing.morphology | Documentation

Documentation 0.0.8 Help

preprocessing.morphology

Functions

threshold_binary_mask

Thresholds a selected NDArray band to yield a binary mask.

binary_watershed

Performs watershed on a binary mask to yield a mapped label classification.

Usage

Perform auto-segmentation on a dataset to yield a label classification.

from frdc.load import FRDCDataset from frdc.preprocess.morphology import ( threshold_binary_mask, binary_watershed @@ -10,6 +10,6 @@ ar, order = ds.get_ar_bands() mask = threshold_binary_mask(ar, order.index('NIR'), 90 / 256) ar_label = binary_watershed(mask) -

API

threshold_binary_mask(ar, band_idx, threshold_value)

Thresholds a selected NDArray band to yield a binary mask as an np.ndarray


This is equivalent to

+

API

threshold_binary_mask(ar, band_idx, threshold_value)

Thresholds a selected NDArray band to yield a binary mask as an np.ndarray


This is equivalent to

ar[..., band_idx] > threshold_value -
binary_watershed(ar_mask, peaks_footprint, watershed_compactness)

Performs watershed on a binary mask to yield a mapped label classification as an np.ndarray


  • peaks_footprint is the footprint of skimage.feature.peak_local_max

  • watershed_compactness is the compactness of skimage.morphology.watershed

Last modified: 26 December 2023
\ No newline at end of file +
binary_watershed(ar_mask, peaks_footprint, watershed_compactness)

Performs watershed on a binary mask to yield a mapped label classification as an np.ndarray


  • peaks_footprint is the footprint of skimage.feature.peak_local_max

  • watershed_compactness is the compactness of skimage.morphology.watershed
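
For example, a minimal sketch reusing the mask from the Usage section; the parameter values here are illustrative, not the library's defaults:

from frdc.preprocess.morphology import binary_watershed

ar_label = binary_watershed(
    mask,                        # binary mask from threshold_binary_mask
    peaks_footprint=200,         # footprint for skimage.feature.peak_local_max
    watershed_compactness=0.1,   # compactness for skimage.morphology.watershed
)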

Last modified: 27 December 2023
\ No newline at end of file diff --git a/docs/preprocessing-scale.html b/docs/preprocessing-scale.html index b846445e..ebb1b8dc 100644 --- a/docs/preprocessing-scale.html +++ b/docs/preprocessing-scale.html @@ -1,4 +1,4 @@ - preprocessing.scale | Documentation

Documentation 0.0.7 Help

preprocessing.scale

Functions

scale_0_1_per_band

Scales the NDArray bands to [0, 1] per band.

scale_normal_per_band

Scales the NDArray bands to zero mean unit variance per band.

scale_static_per_band

Scales the NDArray bands by a predefined configuration. Take a look at frdc.conf.BAND_MAX_CONFIG for an example.

Usage

+ preprocessing.scale | Documentation

Documentation 0.0.8 Help

preprocessing.scale

Functions

scale_0_1_per_band

Scales the NDArray bands to [0, 1] per band.

scale_normal_per_band

Scales the NDArray bands to zero mean unit variance per band.

scale_static_per_band

Scales the NDArray bands by a predefined configuration. Take a look at frdc.conf.BAND_MAX_CONFIG for an example.

Usage

from frdc.load import FRDCDataset from frdc.preprocess.scale import ( scale_0_1_per_band, scale_normal_per_band, scale_static_per_band @@ -12,4 +12,4 @@ ar_01 = scale_0_1_per_band(ar) ar_norm = scale_normal_per_band(ar) ar_static = scale_static_per_band(ar, order, BAND_MAX_CONFIG) -
Last modified: 26 December 2023
\ No newline at end of file +
Last modified: 27 December 2023
\ No newline at end of file diff --git a/docs/retrieve-our-datasets.html b/docs/retrieve-our-datasets.html index bbeecb47..641bbf98 100644 --- a/docs/retrieve-our-datasets.html +++ b/docs/retrieve-our-datasets.html @@ -1,10 +1,10 @@ - Retrieve our Datasets | Documentation

Documentation 0.0.7 Help

Retrieve our Datasets

In this tutorial, we'll learn how to:

  • Retrieve FRDC's Hyperspectral Image Data as np.ndarray

  • Retrieve FRDC's Ground Truth bounds and labels

  • Slice/segment the image data by the bounds

Prerequisites

  • New here? Get Started.

  • Set up the Google Cloud Authorization to download the data.

Retrieve the Data

To retrieve the data, use FRDCDataset

Here, we'll download and load our

  • ar: Hyperspectral Image Data

  • order: The order of the bands

  • bounds: The bounds of the trees (segments)

  • labels: The labels of the trees (segments)

+ Retrieve our Datasets | Documentation

Documentation 0.0.8 Help

Retrieve our Datasets

In this tutorial, we'll learn how to:

  • Retrieve FRDC's Hyperspectral Image Data as np.ndarray

  • Retrieve FRDC's Ground Truth bounds and labels

  • Slice/segment the image data by the bounds

Prerequisites

  • New here? Get Started.

  • Set up the Google Cloud Authorization to download the data.

Retrieve the Data

To retrieve the data, use FRDCDataset

Here, we'll download and load our

  • ar: Hyperspectral Image Data

  • order: The order of the bands

  • bounds: The bounds of the trees (segments)

  • labels: The labels of the trees (segments)

from frdc.load.dataset import FRDCDataset ds = FRDCDataset(site="chestnut_nature_park", date="20201218", version=None) ar, order = ds.get_ar_bands() bounds, labels = ds.get_bounds_and_labels() -

What Datasets are there?

+

What Datasets are there?

from frdc.load.gcs import list_gcs_datasets print(list_gcs_datasets()) # 0 DEBUG/0 @@ -12,7 +12,7 @@ # 2 casuarina/20220418/93deg # 3 chestnut_nature_park/20201218 # ... -
  • The first part of the path is the site, and the second part is the date.

  • The version is the rest of the path; if there isn't any, use None.

For example:

  • "ds/date/ver" → site="ds", date="date", version="ver"

  • "ds/date/ver/01/data" → site="ds", date="date", version="ver/01/data"

  • "ds/date" → site="ds", date="date", version=None

Segment the Data

To segment the data, use Extract Segments.

Here, we'll segment the data by the bounds.

+
  • The first part of the path is the site, and the second part is the date.

  • The version is the rest of the path; if there isn't any, use None.

For example:

  • "ds/date/ver" → site="ds", date="date", version="ver"

  • "ds/date/ver/01/data" → site="ds", date="date", version="ver/01/data"

  • "ds/date" → site="ds", date="date", version=None

Segment the Data

To segment the data, use Extract Segments.

Here, we'll segment the data by the bounds.

from frdc.load.dataset import FRDCDataset from frdc.preprocess.extract_segments import extract_segments_from_bounds @@ -20,7 +20,7 @@ ar, order = ds.get_ar_bands() bounds, labels = ds.get_bounds_and_labels() segments = extract_segments_from_bounds(ar, bounds) -

segments is a list of np.ndarray of shape (H, W, C), each representing a tree. The order of segments is the same as labels, so you can use labels to identify the tree.

Plot the Data (Optional)

We can then use these data to plot out the first tree segment.

+

segments is a list of np.ndarray of shape (H, W, C), each representing a tree. The order of segments is the same as labels, so you can use labels to identify the tree.
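
Since both lists share the same order, a quick sketch of pairing them up (names taken from the snippet above):

for label, segment in zip(labels, segments):
    print(label, segment.shape)  # each segment is one (H, W, C) tree crop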

Plot the Data (Optional)

We can then use these data to plot out the first tree segment.

import matplotlib.pyplot as plt from frdc.load.dataset import FRDCDataset @@ -38,4 +38,4 @@ plt.imshow(segment_0_rgb_scaled) plt.title(f"Tree {labels[0]}") plt.show() -

See also: preprocessing.scale.scale_0_1_per_band

Matplotlib cannot show the data correctly as-is, so we need to

  • Convert the data from BGR to RGB

  • Scale the data to 0-1 per band

Last modified: 26 December 2023
\ No newline at end of file +

See also: preprocessing.scale.scale_0_1_per_band

Matplotlib cannot show the data correctly as-is, so we need to

  • Convert the data from BGR to RGB

  • Scale the data to 0-1 per band

Last modified: 27 December 2023
\ No newline at end of file diff --git a/docs/train-frdc-lightning.html b/docs/train-frdc-lightning.html index 1886de79..52cde410 100644 --- a/docs/train-frdc-lightning.html +++ b/docs/train-frdc-lightning.html @@ -1,4 +1,4 @@ - train.frdc_datamodule & frdc_module | Documentation

Documentation 0.0.7 Help

train.frdc_datamodule & frdc_module

These are the FRDC-specific LightningDataModule and LightningModule, core components in the PyTorch Lightning ecosystem that provide a simple interface to train and evaluate models.

Classes

FRDCDataModule

The FRDC PyTorch Lightning DataModule.

FRDCModule

The FRDC PyTorch Lightning Module.

Usage

API

FRDCDataModule(segments, labels, preprocess, augmentation, train_val_test_split, batch_size)

Initializes the FRDC PyTorch Lightning DataModule.


  • segments, labels are retrieved from FRDCDataset; see the Retrieve our Datasets tutorial.

  • preprocess is a function that takes in a segment and returns a preprocessed segment. In particular, it should accept a list of NumPy NDArrays and return a single stacked PyTorch Tensor.

  • augmentation is a function that takes in a segment and returns an augmented segment. In particular, it takes in a PyTorch Tensor and returns another.

  • train_val_test_split is a function that takes a TensorDataset and returns a list of 3 TensorDatasets, for train, val and test respectively.

  • batch_size is the batch size.

FRDCModule(model_cls, model_kwargs, optim_cls, optim_kwargs)

Initializes the FRDC PyTorch Lightning Module.


  • model_cls is the Class of the model.

  • model_kwargs is the kwargs to pass to the model.

  • optim_cls is the Class of the optimizer.

  • optim_kwargs is the kwargs to pass to the optimizer.

Internally, the module will initialize the model and optimizer as follows:

+ train.frdc_datamodule & frdc_module | Documentation

Documentation 0.0.8 Help

train.frdc_datamodule & frdc_module

These are the FRDC-specific LightningDataModule and LightningModule, core components in the PyTorch Lightning ecosystem that provide a simple interface to train and evaluate models.

Classes

FRDCDataModule

The FRDC PyTorch Lightning DataModule.

FRDCModule

The FRDC PyTorch Lightning Module.

Usage

API

FRDCDataModule(segments, labels, preprocess, augmentation, train_val_test_split, batch_size)

Initializes the FRDC PyTorch Lightning DataModule.


  • segments, labels are retrieved from FRDCDataset; see the Retrieve our Datasets tutorial.

  • preprocess is a function that takes in a segment and returns a preprocessed segment. In particular, it should accept a list of NumPy NDArrays and return a single stacked PyTorch Tensor.

  • augmentation is a function that takes in a segment and returns an augmented segment. In particular, it takes in a PyTorch Tensor and returns another.

  • train_val_test_split is a function that takes a TensorDataset and returns a list of 3 TensorDatasets, for train, val and test respectively.

  • batch_size is the batch size.
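
For example, a construction sketch using only the parameters above; the my_* functions are hypothetical placeholders with the described signatures, and the import path is assumed from frdc/train/frdc_datamodule.py:

from frdc.train.frdc_datamodule import FRDCDataModule  # assumed import path

dm = FRDCDataModule(
    segments=segments,              # e.g. from extract_segments_from_bounds
    labels=labels,                  # e.g. from FRDCDataset.get_bounds_and_labels
    preprocess=my_preprocess,       # list[np.ndarray] -> stacked torch.Tensor
    augmentation=my_augment,        # torch.Tensor -> torch.Tensor
    train_val_test_split=my_split,  # TensorDataset -> [train, val, test]
    batch_size=32,
)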

FRDCModule(model_cls, model_kwargs, optim_cls, optim_kwargs)

Initializes the FRDC PyTorch Lightning Module.


  • model_cls is the Class of the model.

  • model_kwargs is the kwargs to pass to the model.

  • optim_cls is the Class of the optimizer.

  • optim_kwargs is the kwargs to pass to the optimizer.

Internally, the module will initialize the model and optimizer as follows:

model = model_cls(**model_kwargs) optim = optim_cls(model.parameters(), **optim_kwargs) -
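
Putting it together, a sketch of constructing the module; MyModel is a hypothetical nn.Module subclass, and the import path is assumed:

import torch

from frdc.train.frdc_module import FRDCModule  # assumed import path

m = FRDCModule(
    model_cls=MyModel,                # instantiated as model_cls(**model_kwargs)
    model_kwargs=dict(n_classes=10),
    optim_cls=torch.optim.Adam,       # optim_cls(model.parameters(), **optim_kwargs)
    optim_kwargs=dict(lr=1e-3),
)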
Last modified: 26 December 2023
\ No newline at end of file +
Last modified: 27 December 2023
\ No newline at end of file From 64eb8087f11a4418e3be05da702cde308be7cb5a Mon Sep 17 00:00:00 2001 From: Evening Date: Wed, 27 Dec 2023 17:28:36 +0800 Subject: [PATCH 17/52] Add missing page --- docs/HelpTOC.json | 2 +- docs/Map.jhm | 2 +- docs/custom-k-aug-dataloaders.html | 6 +++--- docs/get-started-with-dev-containers.html | 3 +++ docs/getting-started.html | 20 ++++++++++---------- docs/load-dataset.html | 8 ++++---- docs/load-gcs.html | 6 +++--- docs/mix-match-module.html | 12 ++++++------ docs/mix-match.html | 2 +- docs/model-test-chestnut-may-dec.html | 2 +- docs/overview.html | 2 +- docs/preprocessing-extract-segments.html | 16 ++++++++-------- docs/preprocessing-glcm-padded.html | 4 ++-- docs/preprocessing-morphology.html | 6 +++--- docs/preprocessing-scale.html | 4 ++-- docs/retrieve-our-datasets.html | 10 +++++----- docs/train-frdc-lightning.html | 4 ++-- 17 files changed, 56 insertions(+), 53 deletions(-) create mode 100644 docs/get-started-with-dev-containers.html diff --git a/docs/HelpTOC.json b/docs/HelpTOC.json index e076f9bd..d0eb4e12 100644 --- a/docs/HelpTOC.json +++ b/docs/HelpTOC.json @@ -1 +1 @@ -{"entities":{"pages":{"Overview":{"id":"Overview","title":"Overview","url":"overview.html","level":0,"tabIndex":0},"Getting-Started":{"id":"Getting-Started","title":"Getting Started","url":"getting-started.html","level":0,"tabIndex":1},"e8e19623_38931":{"id":"e8e19623_38931","title":"Tutorials","level":0,"pages":["Retrieve-our-Datasets"],"tabIndex":2},"Retrieve-our-Datasets":{"id":"Retrieve-our-Datasets","title":"Retrieve our Datasets","url":"retrieve-our-datasets.html","level":1,"parentId":"e8e19623_38931","tabIndex":0},"mix-match":{"id":"mix-match","title":"MixMatch","url":"mix-match.html","level":0,"pages":["mix-match-module","custom-k-aug-dataloaders"],"tabIndex":3},"mix-match-module":{"id":"mix-match-module","title":"MixMatch Module","url":"mix-match-module.html","level":1,"parentId":"mix-match","tabIndex":0},"custom-k-aug-dataloaders":{"id":"custom-k-aug-dataloaders","title":"Custom K-Aug Dataloaders","url":"custom-k-aug-dataloaders.html","level":1,"parentId":"mix-match","tabIndex":1},"e8e19623_38936":{"id":"e8e19623_38936","title":"Model Tests","level":0,"pages":["Model-Test-Chestnut-May-Dec"],"tabIndex":4},"Model-Test-Chestnut-May-Dec":{"id":"Model-Test-Chestnut-May-Dec","title":"Model Test Chestnut 
May-Dec","url":"model-test-chestnut-may-dec.html","level":1,"parentId":"e8e19623_38936","tabIndex":0},"e8e19623_38938":{"id":"e8e19623_38938","title":"API","level":0,"pages":["load.dataset","load.gcs","preprocessing.scale","preprocessing.extract_segments","preprocessing.morphology","preprocessing.glcm_padded","train.frdc_lightning"],"tabIndex":5},"load.dataset":{"id":"load.dataset","title":"load.dataset","url":"load-dataset.html","level":1,"parentId":"e8e19623_38938","tabIndex":0},"load.gcs":{"id":"load.gcs","title":"load.gcs","url":"load-gcs.html","level":1,"parentId":"e8e19623_38938","tabIndex":1},"preprocessing.scale":{"id":"preprocessing.scale","title":"preprocessing.scale","url":"preprocessing-scale.html","level":1,"parentId":"e8e19623_38938","tabIndex":2},"preprocessing.extract_segments":{"id":"preprocessing.extract_segments","title":"preprocessing.extract_segments","url":"preprocessing-extract-segments.html","level":1,"parentId":"e8e19623_38938","tabIndex":3},"preprocessing.morphology":{"id":"preprocessing.morphology","title":"preprocessing.morphology","url":"preprocessing-morphology.html","level":1,"parentId":"e8e19623_38938","tabIndex":4},"preprocessing.glcm_padded":{"id":"preprocessing.glcm_padded","title":"preprocessing.glcm_padded","url":"preprocessing-glcm-padded.html","level":1,"parentId":"e8e19623_38938","tabIndex":5},"train.frdc_lightning":{"id":"train.frdc_lightning","title":"train.frdc_datamodule \u0026 frdc_module","url":"train-frdc-lightning.html","level":1,"parentId":"e8e19623_38938","tabIndex":6}}},"topLevelIds":["Overview","Getting-Started","e8e19623_38931","mix-match","e8e19623_38936","e8e19623_38938"]} \ No newline at end of file +{"entities":{"pages":{"Overview":{"id":"Overview","title":"Overview","url":"overview.html","level":0,"tabIndex":0},"Getting-Started":{"id":"Getting-Started","title":"Getting Started","url":"getting-started.html","level":0,"pages":["Get-Started-with-Dev-Containers"],"tabIndex":1},"Get-Started-with-Dev-Containers":{"id":"Get-Started-with-Dev-Containers","title":"Get Started with Dev Containers","url":"get-started-with-dev-containers.html","level":1,"parentId":"Getting-Started","tabIndex":0},"e8e19623_66291":{"id":"e8e19623_66291","title":"Tutorials","level":0,"pages":["Retrieve-our-Datasets"],"tabIndex":2},"Retrieve-our-Datasets":{"id":"Retrieve-our-Datasets","title":"Retrieve our Datasets","url":"retrieve-our-datasets.html","level":1,"parentId":"e8e19623_66291","tabIndex":0},"mix-match":{"id":"mix-match","title":"MixMatch","url":"mix-match.html","level":0,"pages":["mix-match-module","custom-k-aug-dataloaders"],"tabIndex":3},"mix-match-module":{"id":"mix-match-module","title":"MixMatch Module","url":"mix-match-module.html","level":1,"parentId":"mix-match","tabIndex":0},"custom-k-aug-dataloaders":{"id":"custom-k-aug-dataloaders","title":"Custom K-Aug Dataloaders","url":"custom-k-aug-dataloaders.html","level":1,"parentId":"mix-match","tabIndex":1},"e8e19623_66296":{"id":"e8e19623_66296","title":"Model Tests","level":0,"pages":["Model-Test-Chestnut-May-Dec"],"tabIndex":4},"Model-Test-Chestnut-May-Dec":{"id":"Model-Test-Chestnut-May-Dec","title":"Model Test Chestnut 
May-Dec","url":"model-test-chestnut-may-dec.html","level":1,"parentId":"e8e19623_66296","tabIndex":0},"e8e19623_66298":{"id":"e8e19623_66298","title":"API","level":0,"pages":["load.dataset","load.gcs","preprocessing.scale","preprocessing.extract_segments","preprocessing.morphology","preprocessing.glcm_padded","train.frdc_lightning"],"tabIndex":5},"load.dataset":{"id":"load.dataset","title":"load.dataset","url":"load-dataset.html","level":1,"parentId":"e8e19623_66298","tabIndex":0},"load.gcs":{"id":"load.gcs","title":"load.gcs","url":"load-gcs.html","level":1,"parentId":"e8e19623_66298","tabIndex":1},"preprocessing.scale":{"id":"preprocessing.scale","title":"preprocessing.scale","url":"preprocessing-scale.html","level":1,"parentId":"e8e19623_66298","tabIndex":2},"preprocessing.extract_segments":{"id":"preprocessing.extract_segments","title":"preprocessing.extract_segments","url":"preprocessing-extract-segments.html","level":1,"parentId":"e8e19623_66298","tabIndex":3},"preprocessing.morphology":{"id":"preprocessing.morphology","title":"preprocessing.morphology","url":"preprocessing-morphology.html","level":1,"parentId":"e8e19623_66298","tabIndex":4},"preprocessing.glcm_padded":{"id":"preprocessing.glcm_padded","title":"preprocessing.glcm_padded","url":"preprocessing-glcm-padded.html","level":1,"parentId":"e8e19623_66298","tabIndex":5},"train.frdc_lightning":{"id":"train.frdc_lightning","title":"train.frdc_datamodule \u0026 frdc_module","url":"train-frdc-lightning.html","level":1,"parentId":"e8e19623_66298","tabIndex":6}}},"topLevelIds":["Overview","Getting-Started","e8e19623_66291","mix-match","e8e19623_66296","e8e19623_66298"]} \ No newline at end of file diff --git a/docs/Map.jhm b/docs/Map.jhm index 2442fa52..9449faa3 100644 --- a/docs/Map.jhm +++ b/docs/Map.jhm @@ -1 +1 @@ - \ No newline at end of file + \ No newline at end of file diff --git a/docs/custom-k-aug-dataloaders.html b/docs/custom-k-aug-dataloaders.html index b51fd57a..4863aad8 100644 --- a/docs/custom-k-aug-dataloaders.html +++ b/docs/custom-k-aug-dataloaders.html @@ -1,4 +1,4 @@ - Custom K-Aug Dataloaders | Documentation

Documentation 0.0.8 Help

Custom K-Aug Dataloaders

In MixMatch, implementing the data loading methods is quite unconventional.

  1. We need to load multiple augmented versions of the same image into the same batch.

  2. The labelled set is usually too small, causing a premature end to the epoch as it runs out of samples to draw from faster than the unlabelled set.

This can be rather tricky to implement in PyTorch. This tutorial will illustrate how we did it.

Loading Multiple Augmented Versions of the Same Image

See: frdc/load/dataset.py FRDCDataset.__getitem__

In MixMatch, a single train batch must consist of:

  1. A batch of labeled images

  2. K batches of unlabeled images

[Diagram: Get Batch → (Aug) → Aug Labelled Batch; Get Batch → Unlabelled Batch → (Aug ×K) → Aug Unl. Batch 1 … K]

Keep in mind that the unlabelled batch, is a single batch of images, not separate draws of batches. It is then "duplicated" K times, and each copy is augmented differently.

Solution 1: Custom Dataset

To solve this, we need to understand the role of both a Dataset and a DataLoader.

  • A Dataset represents a collection of data, responsible for loading and returning something.

  • A DataLoader draws samples from a Dataset and returns batched samples.

The key here is that a Dataset is not limited to returning 1 sample at a time; we can make it return the K augmented versions of the same image.

[Diagram: Sample → (Aug ×K) → Aug Sample 1 … K]

In code, this is done by subclassing the Dataset class and overriding the __getitem__ method.

+ Custom K-Aug Dataloaders | Documentation

Documentation 0.0.8 Help

Custom K-Aug Dataloaders

In MixMatch, implementing the data loading methods is quite unconventional.

  1. We need to load multiple augmented versions of the same image into the same batch.

  2. The labelled set is usually too small, causing a premature end to the epoch as it runs out of samples to draw from faster than the unlabelled set.

This can be rather tricky to implement in PyTorch. This tutorial will illustrate how we did it.

Loading Multiple Augmented Versions of the Same Image

See: frdc/load/dataset.py FRDCDataset.__getitem__

In MixMatch, a single train batch must consist of:

  1. A batch of labeled images

  2. K batches of unlabeled images

[Diagram: Get Batch → (Aug) → Aug Labelled Batch; Get Batch → Unlabelled Batch → (Aug ×K) → Aug Unl. Batch 1 … K]

Keep in mind that the unlabelled batch is a single batch of images, not separate draws of batches. It is then "duplicated" K times, and each copy is augmented differently.

Solution 1: Custom Dataset

To solve this, we need to understand the role of both a Dataset and a DataLoader.

  • A Dataset represents a collection of data, responsible for loading and returning something.

  • A DataLoader draws samples from a Dataset and returns batched samples.

The key here is that a Dataset is not limited to returning 1 sample at a time; we can make it return the K augmented versions of the same image.

[Diagram: Sample → (Aug ×K) → Aug Sample 1 … K]

In code, this is done by subclassing the Dataset class and overriding the __getitem__ method.

def duplicate(x): return x, deepcopy(x), deepcopy(x) @@ -10,7 +10,7 @@ def __getitem__(self, index): x, y = self.dataset[index] return self.aug(x), y -

In the above example, we have a Dataset that returns 3 duplicate versions of the same image. By leveraging this technique, we can create a Dataset that returns K augmented versions of the same image as a tuple.

Premature End of Epoch due to Small Labelled Set

See: frdc/train/frdc_datamodule.py

In MixMatch, the definition of an "epoch" is a bit different. Instead of implying that we have seen all the data once, it implies that we've drawn N batches. The N is referred to as the number of iterations per epoch.

Take, for example, a labelled set of numbers [1, 2, 3] and an unlabelled set [4, 5, 6, 7, 8, 9, 10]. With a batch size of 2, we'll run out of labelled samples after 2 iterations, but we'll still have 3 more iterations for the unlabelled set.

  • Draw 1: [1, 2], [4, 5]

  • Draw 2: [3], [6, 7].

  • Epoch ends.

Solution 2: Random Sampling

To fix this, instead of sequentially sampling the labelled set (and the unlabelled set), we can sample them randomly. This way, we can ensure that it never runs out.

  • Draw 1: [1, 3], [7, 5]

  • Draw 2: [2, 1], [4, 9]

  • Draw 3: [3, 2], [8, 6]

  • ... and so on.

Luckily, PyTorch's DataLoader supports random sampling. We just need to use RandomSampler instead of SequentialSampler (which is the default).

+

In the above example, we have a Dataset that returns 3 duplicate versions of the same image. By leveraging this technique, we can create a Dataset that returns K augmented versions of the same image as a tuple, as sketched below.
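
A minimal sketch of such a Dataset; the class name KAugDataset is ours, and aug is assumed to be a random (non-deterministic) transform:

from copy import deepcopy

from torch.utils.data import Dataset


class KAugDataset(Dataset):
    def __init__(self, dataset, aug, k):
        self.dataset = dataset
        self.aug = aug
        self.k = k

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, index):
        x, y = self.dataset[index]
        # K independent random augmentations of the same sample.
        return tuple(self.aug(deepcopy(x)) for _ in range(self.k)), y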

Premature End of Epoch due to Small Labelled Set

See: frdc/train/frdc_datamodule.py

In MixMatch, the definition of an "epoch" is a bit different. Instead of implying that we have seen all the data once, it implies that we've drawn N batches. The N is referred to as the number of iterations per epoch.

Take, for example, a labelled set of numbers [1, 2, 3] and an unlabelled set [4, 5, 6, 7, 8, 9, 10]. With a batch size of 2, we'll run out of labelled samples after 2 iterations, but we'll still have 3 more iterations for the unlabelled set.

  • Draw 1: [1, 2], [4, 5]

  • Draw 2: [3], [6, 7].

  • Epoch ends.

Solution 2: Random Sampling

To fix this, instead of sequentially sampling the labelled set (and the unlabelled set), we can sample them randomly. This way, we can ensure that it never runs out.

  • Draw 1: [1, 3], [7, 5]

  • Draw 2: [2, 1], [4, 9]

  • Draw 3: [3, 2], [8, 6]

  • ... and so on.

Luckily, PyTorch's DataLoader supports random sampling. We just need to use RandomSampler instead of SequentialSampler (which is the default).

from torch.utils.data import DataLoader, RandomSampler dl = DataLoader( @@ -21,4 +21,4 @@ replacement=False, ) ) -

This will ensure that the "epoch" ends when we've drawn train_iters batches.

Last modified: 27 December 2023
\ No newline at end of file +

This will ensure that the "epoch" ends when we've drawn train_iters batches.

Last modified: 27 December 2023
\ No newline at end of file diff --git a/docs/get-started-with-dev-containers.html b/docs/get-started-with-dev-containers.html new file mode 100644 index 00000000..5d793d5a --- /dev/null +++ b/docs/get-started-with-dev-containers.html @@ -0,0 +1,3 @@ + Get Started with Dev Containers | Documentation

Documentation 0.0.8 Help

Get Started with Dev Containers

Dev Containers are a great way to get started with a project. They define all the necessary dependencies and environments, so you can just start coding within the container.

In this article, we'll only go over additional steps to set up with our project. For more information on how to use Dev Containers, please refer to the official documentation for each IDE. Once you've set up the Dev Container, come back here to finish the setup:

Python Environment

The dev environment is already created and is managed by Anaconda (/opt/conda/bin/conda). To activate the environment, run the following command:

conda activate base

Mark as Sources Root (Add to PYTHONPATH)

For import statements to work, you need to mark the src folder as the sources root. Optionally, also mark the tests folder as the tests root.

Additional Setup

Refer to the Getting Started guide for additional setup steps such as:

  • Google Cloud Application Default Credentials

  • Weight & Bias API Key

  • Label Studio API Key

Last modified: 27 December 2023
\ No newline at end of file diff --git a/docs/getting-started.html b/docs/getting-started.html index ac181d93..8f064904 100644 --- a/docs/getting-started.html +++ b/docs/getting-started.html @@ -1,22 +1,22 @@ - Getting Started | Documentation

Documentation 0.0.8 Help

Getting Started

Installing the Dev. Environment

  1. Ensure that you have the right version of Python. The required Python version can be seen in pyproject.toml

    + Getting Started | Documentation

    Documentation 0.0.8 Help

    Getting Started

    Installing the Dev. Environment

    1. Ensure that you have the right version of Python. The required Python version can be seen in pyproject.toml

      [tool.poetry.dependencies] python = "..." -
    2. Start by cloning our repository.

      +
    3. Start by cloning our repository.

      git clone https://github.com/FR-DC/FRDC-ML.git -
    4. Then, create a Python Virtual Env pyvenv

      python -m venv venv/
      python3 -m venv venv/
    5. Install Poetry. Then check if it's installed with

      poetry --version
    6. Activate the virtual environment

      +
    7. Then, create a Python Virtual Env pyvenv

      python -m venv venv/
      python3 -m venv venv/
    8. Install Poetry. Then check if it's installed with

      poetry --version
    9. Activate the virtual environment

      cd venv/Scripts activate cd ../.. -
      +
      source venv/bin/activate -
    10. Install the dependencies. You should be in the same directory as pyproject.toml

      +
  2. Install the dependencies. You should be in the same directory as pyproject.toml

    poetry install --with dev -
  3. Install Pre-Commit Hooks

    +
  4. Install Pre-Commit Hooks

    pre-commit install -

Setting Up Google Cloud

  1. We use Google Cloud to store our datasets. To set up Google Cloud, install the Google Cloud CLI

  2. Then, authenticate your account.

    gcloud auth login
  3. Finally, set up Application Default Credentials (ADC).

    gcloud auth application-default login
  4. To make sure everything is working, run the tests.

Setting Up Label Studio

  1. We use Label Studio to annotate our datasets. We won't go through how to install Label Studio; for contributors, it should be up on localhost:8080.

  2. Then, retrieve your own API key from Label Studio. Go to your account page and copy the API key.


  3. Set your API key as an environment variable.

    In Windows, go to "Edit environment variables for your account" and add this as a new environment variable with name LABEL_STUDIO_API_KEY.

    Export it as an environment variable.

    export LABEL_STUDIO_API_KEY=...

Setting Up Weights and Biases

  1. We use W&B to track our experiments. To set up W&B, install the W&B CLI

  2. Then, authenticate your account.

    wandb login

Pre-commit Hooks

  • +

Setting Up Google Cloud

  1. We use Google Cloud to store our datasets. To set up Google Cloud, install the Google Cloud CLI

  2. Then, authenticate your account.

    gcloud auth login
  3. Finally, set up Application Default Credentials (ADC).

    gcloud auth application-default login
  4. To make sure everything is working, run the tests.

Setting Up Label Studio

  1. We use Label Studio to annotate our datasets. We won't go through how to install Label Studio; for contributors, it should be up on localhost:8080.

  2. Then, retrieve your own API key from Label Studio. Go to your account page and copy the API key.


  3. Set your API key as an environment variable.

    In Windows, go to "Edit environment variables for your account" and add this as a new environment variable with name LABEL_STUDIO_API_KEY.

    Export it as an environment variable.

    export LABEL_STUDIO_API_KEY=...

Setting Up Weights and Biases

  1. We use W&B to track our experiments. To set up W&B, install the W&B CLI

  2. Then, authenticate your account.

    wandb login

Pre-commit Hooks

  • pre-commit install -

Running the Tests

  • Run the tests to make sure everything is working

    +

Running the Tests

  • Run the tests to make sure everything is working

    pytest -

Troubleshooting

ModuleNotFoundError

It's likely that your src and tests directories are not in PYTHONPATH. To fix this, run the following command:

+

Troubleshooting

ModuleNotFoundError

It's likely that your src and tests directories are not in PYTHONPATH. To fix this, run the following command:

export PYTHONPATH=$PYTHONPATH:./src:./tests -

Or, set it in your IDE, for example, IntelliJ allows setting directories as Source Roots.

google.auth.exceptions.DefaultCredentialsError

It's likely that you haven't authenticated your Google Cloud account. See Setting Up Google Cloud

Couldn't connect to Label Studio

Label Studio must be running locally, exposed on localhost:8080. Furthermore, you need to specify the LABEL_STUDIO_API_KEY environment variable. See Setting Up Label Studio

Cannot login to W&B

You need to authenticate your W&B account. See Setting Up Weights and Biases. If you're facing difficulties, set the WANDB_MODE environment variable to offline to disable W&B.

Our Repository Structure

Before starting development, take a look at our repository structure. This will help you understand where to put your code.

[Diagram: FRDC → src/frdc/ (Core Dependencies: ./load/ Dataset Loaders, ./preprocess/ Preprocessing Fn., ./train/ Train Deps, ./models/ Model Architectures), rsc/ (Resources: ./dataset_name/ Datasets ...), tests/ (Tests), pyproject.toml, poetry.lock (Repo Dependencies)]
src/frdc/

Source Code for our package. These are the unit components of our pipeline.

rsc/

Resources. These are usually cached datasets

tests/

PyTest tests. These are unit, integration, and model tests.

Unit, Integration, and Pipeline Tests

We have 3 types of tests:

  • Unit Tests are usually small, single function tests.

  • Integration Tests are larger tests that test a mock pipeline.

  • Model Tests are the true production pipeline tests that will generate a model.

Where Should I contribute?

Changing a small component

If you're changing a small component, such as an argument for preprocessing, a new model architecture, or a new configuration for a dataset, take a look at the src/frdc/ directory.

Adding a test

When adding a new component, you'll need to add a new test. Take a look at the tests/ directory.

Changing the model pipeline

If you're a ML Researcher, you'll probably be changing the pipeline. Take a look at the tests/model_tests/ directory.

Adding a dependency

If you're adding a new dependency, use poetry add PACKAGE and commit the changes to pyproject.toml and poetry.lock.

Last modified: 27 December 2023
\ No newline at end of file +

Or, set it in your IDE, for example, IntelliJ allows setting directories as Source Roots.

google.auth.exceptions.DefaultCredentialsError

It's likely that you haven't authenticated your Google Cloud account. See Setting Up Google Cloud

Couldn't connect to Label Studio

Label Studio must be running locally, exposed on localhost:8080. Furthermore, you need to specify the LABEL_STUDIO_API_KEY environment variable. See Setting Up Label Studio

Cannot login to W&B

You need to authenticate your W&B account. See Setting Up Weights and Biases. If you're facing difficulties, set the WANDB_MODE environment variable to offline to disable W&B.

Our Repository Structure

Before starting development, take a look at our repository structure. This will help you understand where to put your code.

[Diagram: FRDC → src/frdc/ (Core Dependencies: ./load/ Dataset Loaders, ./preprocess/ Preprocessing Fn., ./train/ Train Deps, ./models/ Model Architectures), rsc/ (Resources: ./dataset_name/ Datasets ...), tests/ (Tests), pyproject.toml, poetry.lock (Repo Dependencies)]
src/frdc/

Source Code for our package. These are the unit components of our pipeline.

rsc/

Resources. These are usually cached datasets

tests/

PyTest tests. These are unit, integration, and model tests.

Unit, Integration, and Pipeline Tests

We have 3 types of tests:

  • Unit Tests are usually small, single function tests.

  • Integration Tests are larger tests that test a mock pipeline.

  • Model Tests are the true production pipeline tests that will generate a model.

Where Should I contribute?

Changing a small component

If you're changing a small component, such as an argument for preprocessing, a new model architecture, or a new configuration for a dataset, take a look at the src/frdc/ directory.

Adding a test

When adding a new component, you'll need to add a new test. Take a look at the tests/ directory.

Changing the model pipeline

If you're a ML Researcher, you'll probably be changing the pipeline. Take a look at the tests/model_tests/ directory.

Adding a dependency

If you're adding a new dependency, use poetry add PACKAGE and commit the changes to pyproject.toml and poetry.lock.

Last modified: 27 December 2023
\ No newline at end of file diff --git a/docs/load-dataset.html b/docs/load-dataset.html index 4f936c9f..bd5caa57 100644 --- a/docs/load-dataset.html +++ b/docs/load-dataset.html @@ -1,14 +1,14 @@ - load.dataset | Documentation

Documentation 0.0.8 Help

load.dataset

Usage

Firstly, to load a dataset instance, you need to initialize a FRDCDataset object, providing the site, date, and version.

For example, to load our Chestnut Nature Park dataset.

+ load.dataset | Documentation

Documentation 0.0.8 Help

load.dataset

Usage

Firstly, to load a dataset instance, you need to initialize a FRDCDataset object, providing the site, date, and version.

For example, to load our Chestnut Nature Park dataset.

from frdc.load import FRDCDataset ds = FRDCDataset(site='chestnut_nature_park', date='20201218', version=None) -

Then, we can use the ds object to load objects of the dataset:

+

Then, we can use the ds object to load objects of the dataset:

ar, order = ds.get_ar_bands() d = ds.get_ar_bands_as_dict() bounds, labels = ds.get_bounds_and_labels() -
  • ar is a stacked NDArray of the hyperspectral bands of shape (H x W x C)

  • order is a list of strings, containing the names of the bands, ordered according to the channels of ar

  • d is a dictionary of the hyperspectral bands of shape (H x W), keyed by the band names

  • bounds is a list of bounding boxes, in the format of Rect, a namedtuple of x0, y0, x1, y1

  • labels is a list of strings, containing the labels of the bounding boxes, ordered according to bounds

Filters

You can also selectively get the channels for both get_ar_bands() and get_ar_bands_as_dict() by providing a list of strings to the bands argument.

For example, to get the Wideband RGB bands, you can do:

+
  • ar is a stacked NDArray of the hyperspectral bands of shape (H x W x C)

  • order is a list of strings, containing the names of the bands, ordered according to the channels of ar

  • d is a dictionary of the hyperspectral bands of shape (H x W), keyed by the band names

  • bounds is a list of bounding boxes, in the format of Rect, a namedtuple of x0, y0, x1, y1

  • labels is a list of strings, containing the labels of the bounding boxes, ordered according to bounds
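
For example, a quick sketch showing that the stacked and dict forms expose the same bands (the band name 'NIR' is taken from our tutorials):

ar, order = ds.get_ar_bands()
d = ds.get_ar_bands_as_dict()
nir_by_index = ar[..., order.index('NIR')]  # (H, W), selected by channel position
nir_by_name = d['NIR']                      # (H, W), selected by band name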

Filters

You can also selectively get the channels for both get_ar_bands() and get_ar_bands_as_dict() by providing a list of strings to the bands argument.

For example, to get the Wideband RGB bands, you can do:

ar, order = ds.get_ar_bands(bands=['WR', 'WG', 'WB']) d = ds.get_ar_bands_as_dict(bands=['WR', 'WG', 'WB']) -

This will also alter the channel order to the order of the bands provided.

See load.gcs for configuration options.

Last modified: 27 December 2023
\ No newline at end of file +

This will also alter the channel order to the order of the bands provided.

See load.gcs for configuration options.

Last modified: 27 December 2023
\ No newline at end of file diff --git a/docs/load-gcs.html b/docs/load-gcs.html index b94802eb..c6faadf2 100644 --- a/docs/load-gcs.html +++ b/docs/load-gcs.html @@ -1,11 +1,11 @@ - load.gcs | Documentation

Documentation 0.0.8 Help

load.gcs

Usage

These are defined in the top-level load.gcs module.

list_gcs_datasets

Lists all datasets in the bucket as a DataFrame. This works by checking which folders have a specific file, which we call the anchor.

download

Downloads a file from Google Cloud Storage and returns the local file path.

open_file

Downloads and opens a file from Google Cloud Storage. Returns a file handle.

open_image

Downloads and returns the PIL image from Google Cloud Storage.

Pathing

The path to specify is relative to the bucket, which is frdc-ds by default.

For example this filesystem on GCS:

+ load.gcs | Documentation

Documentation 0.0.8 Help

load.gcs

Usage

These are defined in the top-level load.gcs module.

list_gcs_datasets

Lists all datasets in the bucket as a DataFrame. This works by checking which folders have a specific file, which we call the anchor.

download

Downloads a file from Google Cloud Storage and returns the local file path.

open_file

Downloads and opens a file from Google Cloud Storage. Returns a file handle.

open_image

Downloads and returns the PIL image from Google Cloud Storage.
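
A minimal usage sketch; each function takes the bucket-relative path described under Pathing below, and any signature details beyond the path argument are assumptions:

from frdc.load.gcs import download, open_file, open_image

fp = download("chestnut_nature_park/20201218/90deg/bounds.json")  # local file path
with open_file("chestnut_nature_park/20201218/90deg/bounds.json") as f:
    print(f.read())
# im = open_image("chestnut_nature_park/20201218/90deg/result.jpg")  # hypothetical path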

Pathing

The path to specify is relative to the bucket, which is frdc-ds by default.

For example this filesystem on GCS:

# On Google Cloud Storage frdc-ds ├── chestnut_nature_park │ └── 20201218 │ └── 90deg │ └── bounds.json -

To download bounds.json, use download(r"chestnut_nature_park/20201218/90deg/bounds.json"). By default, all files will be downloaded to PROJ_DIR/rsc/....

+

To download bounds.json, use download(r"chestnut_nature_park/20201218/90deg/bounds.json"). By default, all files will be downloaded to PROJ_DIR/rsc/....

# On local filesystem PROJ_DIR ├── rsc @@ -13,4 +13,4 @@ │ └── 20201218 │ └── 90deg │ └── bounds.json -

Configuration

If you need granular control over

  • where the files are downloaded

  • the credentials used

  • the project used

  • the bucket used

Then edit conf.py.

GCS_CREDENTIALS

Google Cloud credentials.


A google.oauth2.service_account.Credentials object. See the object documentation for more information.

LOCAL_DATASET_ROOT_DIR

Local directory to download files to.


Path to a directory, or a Path object.

GCS_PROJECT_ID

Google Cloud project ID.


GCS_BUCKET_NAME

Google Cloud Storage bucket name.


Last modified: 27 December 2023
\ No newline at end of file +

Configuration

If you need granular control over

  • where the files are downloaded

  • the credentials used

  • the project used

  • the bucket used

Then edit conf.py.

GCS_CREDENTIALS

Google Cloud credentials.


A google.oauth2.service_account.Credentials object. See the object documentation for more information.

LOCAL_DATASET_ROOT_DIR

Local directory to download files to.


Path to a directory, or a Path object.

GCS_PROJECT_ID

Google Cloud project ID.


GCS_BUCKET_NAME

Google Cloud Storage bucket name.


Last modified: 27 December 2023
\ No newline at end of file diff --git a/docs/mix-match-module.html b/docs/mix-match-module.html index ceef074e..e2512d44 100644 --- a/docs/mix-match-module.html +++ b/docs/mix-match-module.html @@ -1,4 +1,4 @@ - MixMatch Module | Documentation

Documentation 0.0.8 Help

MixMatch Module

See frdc/train/mixmatch_module.py.

Quick Recap

We will go over the essential parts of the code here. Before that, we revise some of the concepts that are used in the code.

Abstract Methods

In Python, we can define abstract methods using the abc module. Just like other OOP languages, abstract methods are methods that must be implemented by the child class.

For example:

+ MixMatch Module | Documentation

Documentation 0.0.8 Help

MixMatch Module

See frdc/train/mixmatch_module.py.

Quick Recap

We will go over the essential parts of the code here. Before that, we revise some of the concepts that are used in the code.

Abstract Methods

In Python, we can define abstract methods using the abc module. Just like other OOP languages, abstract methods are methods that must be implemented by the child class.

For example:

from abc import ABC, abstractmethod @@ -11,7 +11,7 @@ class MyChildClass(MyAbstractClass): def my_abstract_method(self): print("Hello World!") -

nn.Module & LightningModule

If you're unfamiliar with PyTorch, you should read the nn.Module Documentation.

nn.Module is the base class for all neural network modules in PyTorch, while LightningModule is a PyTorch Lightning class that extends nn.Module, providing additional functionality that reduces boilerplate code.

By implementing it as a LightningModule, we also enter the PyTorch Lightning ecosystem, which provides us with a lot of useful features such as logging, early stopping, and more.

What do we implement in a Module?

One key component that nn.Module requires is the model. For example:

+

nn.Module & LightningModule

If you're unfamiliar with PyTorch, you should read the nn.Module Documentation.

nn.Module is the base class for all neural network modules in PyTorch, while LightningModule is a PyTorch Lightning class that extends nn.Module, providing additional functionality that reduces boilerplate code.

By implementing it as a LightningModule, we also enter the PyTorch Lightning ecosystem, which provides us with a lot of useful features such as logging, early stopping, and more.

What do we implement in a Module?

One key component that nn.Module requires is the model. For example:

class MyModule(nn.Module): def __init__(self): super().__init__() @@ -23,7 +23,7 @@ def forward(self, x): return self.model(x) -

PyTorch Lightning builds on top of it, requiring training_step and validation_step. Each "step" is a batch of data, and the model is trained on it. So for example:

+

PyTorch Lightning builds on top of it, requiring training_step and validation_step. Each "step" is a batch of data, and the model is trained on it. So for example:

class MyModule(LightningModule): def __init__(self): ... @@ -40,7 +40,7 @@ y_hat = self(x) loss = F.cross_entropy(y_hat, y) return loss -

Usually, the training and validation steps are the same, but in some cases, such as MixMatch, they are different. In MixMatch, we not only use a different loss function for training, we also handle a batch differently. The PyTorch Lightning framework allows us to separate the two, and implement them separately.

Model Embedded Preprocessing on_before_batch_transfer

In PyTorch Lightning, we can also inject a step before the batch is passed to the model. This is done by overriding the on_before_batch_transfer method.

[Diagram: Batch → on_before_batch_transfer → training_step / validation_step]

This allows us to do preprocessing on the batch, such as scaling the data, encoding the labels, and more.

Custom EMA Update on_after_backward

We also leverage another hook, called on_after_backward. This hook is called after the backward pass, and allows us to do custom operations. In our case, we use it to update the EMA model.

[Diagram: Batch → training_step → on_after_backward → update_ema]

MixMatch

We recommend having tests/model_tests/chestnut_dec_may/train.py open while reading this section. It implements a real-world example of MixMatch.

As a summary:

  1. We learned what an abstract method is, and how to implement it

  2. We implement the model in LightningModule much like we would in nn.Module

  3. We implement on_before_batch_transfer to preprocess the batch

  4. Finally, we implement on_after_backward to update the EMA model

With the above in mind, let's look at the MixMatch implementation.

forward (abstract)

Forward pass of the model

ema_model (abstract)

The model that is used for EMA. We expect this property to be implemented by the child class.

update_ema (abstract)

The method to update the EMA model. We expect this method to be implemented by the child class.

loss_unl_scaler (static)

Takes in the current progress of the training, 0.0 to 1.0, where 0.0 is the start of the training, and 1.0 is the end. Then, returns the multiplier for the unlabeled loss.

loss_lbl (static)

Implements the loss for labeled data. Takes in the predicted labels and the ground truth labels, and returns the loss. This is cross entropy for MixMatch.

loss_unl (static)

Implements the loss for unlabeled data. Takes in the predicted labels and the ground truth labels, and returns the loss. This is MSE for MixMatch.

mixup

Takes in the data and the labels, the beta distribution parameter, and returns the mixed data and labels.

sharpen

Takes in the labels and temperature, and returns the sharpened labels.

guess_labels

Takes in the unlabeled data, and returns the guessed labels.

progress

The current progress of the training, 0.0 to 1.0, where 0.0 is the start of the training, and 1.0 is the end.

training_step

The training step runs through 1 batch of data, and returns the loss. Note that this is significantly different from validation step, as we handle the K-Augmented data differently.

test / validation_step

The test / validation step runs through 1 batch of data, and returns the loss.

predict_step

The predict step runs through 1 batch of data, and returns the actual decoded labels.

on_after_backward

The on_after_backward hook is called after the backward pass, and allows us to do custom operations. In our case, we use it to update the EMA model.

on_before_batch_transfer

The on_before_batch_transfer hook is called before the batch is transferred to the GPU. In our case, we use it to preprocess the batch.

A diagram of how these components interact with each other is shown below:

[Diagram: Batch → on_before_batch_transfer → training_step → guess_labels → sharpen → mix_up → loss_lbl & loss_unl (scaled by loss_unl_scaler) → loss → backward → on_after_backward → update_ema; validation_step → loss]

Finally, we show an example of how to use the MixMatch module:

+

Usually, the training and validation steps are the same, but in some cases, such as MixMatch, they are different. In MixMatch, we not only use a different loss function for training, we also handle a batch differently. The PyTorch Lightning framework allows us to separate the two, and implement them separately.

Model Embedded Preprocessing on_before_batch_transfer

In PyTorch Lightning, we can also inject a step before the batch is passed to the model. This is done by overriding the on_before_batch_transfer method.

[Diagram: Batch → on_before_batch_transfer → training_step / validation_step]

This allows us to do preprocessing on the batch, such as scaling the data, encoding the labels, and more.
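
For instance, a minimal sketch of overriding the hook; the x_scaler attribute is a hypothetical fitted sklearn StandardScaler, not the module's actual field:

import pytorch_lightning as pl
import torch


class MyModule(pl.LightningModule):
    def on_before_batch_transfer(self, batch, dataloader_idx):
        x, y = batch
        # Illustrative preprocessing: standardize x with a fitted scaler.
        x_flat = self.x_scaler.transform(x.reshape(len(x), -1))
        x = torch.as_tensor(x_flat, dtype=torch.float32).reshape(x.shape)
        return x, y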

Custom EMA Update on_after_backward

We also leverage another hook, called on_after_backward. This hook is called after the backward pass, and allows us to do custom operations. In our case, we use it to update the EMA model.

[Diagram: Batch → training_step → on_after_backward → update_ema]
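
A minimal sketch of wiring this hook to a Polyak-averaging EMA update; the class, attribute names, and decay value are illustrative:

from copy import deepcopy

import pytorch_lightning as pl
import torch
import torch.nn as nn


class EMAExample(pl.LightningModule):
    def __init__(self, model: nn.Module, ema_decay: float = 0.999):
        super().__init__()
        self.model = model
        self.ema_model = deepcopy(model)
        self.ema_decay = ema_decay
        for p in self.ema_model.parameters():
            p.requires_grad_(False)

    def update_ema(self):
        # Polyak averaging: ema <- d * ema + (1 - d) * model
        with torch.no_grad():
            for p, q in zip(self.model.parameters(),
                            self.ema_model.parameters()):
                q.mul_(self.ema_decay).add_(p, alpha=1 - self.ema_decay)

    def on_after_backward(self):
        self.update_ema()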

MixMatch

We recommend having tests/model_tests/chestnut_dec_may/train.py open while reading this section. It implements a real-world example of MixMatch.

As a summary:

  1. We learned what an abstract method is, and how to implement it

  2. We implement the model in LightningModule much like we would in nn.Module

  3. We implement on_before_batch_transfer to preprocess the batch

  4. Finally, we implement on_after_backward to update the EMA model

With the above in mind, let's look at the MixMatch implementation.

forward (abstract)

Forward pass of the model

ema_model (abstract)

The model that is used for EMA. We expect this property to be implemented by the child class.

update_ema (abstract)

The method to update the EMA model. We expect this method to be implemented by the child class.

loss_unl_scaler (static)

Takes in the current progress of the training, 0.0 to 1.0, where 0.0 is the start of the training, and 1.0 is the end. Then, returns the multiplier for the unlabeled loss.

loss_lbl (static)

Implements the loss for labeled data. Takes in the predicted labels and the ground truth labels, and returns the loss. This is cross entropy for MixMatch.

loss_unl (static)

Implements the loss for unlabeled data. Takes in the predicted labels and the ground truth labels, and returns the loss. This is MSE for MixMatch.

mixup

Takes in the data and the labels, the beta distribution parameter, and returns the mixed data and labels.

sharpen

Takes in the labels and temperature, and returns the sharpened labels.
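
As a sketch, sharpening as defined in the MixMatch paper raises each class probability to the power 1/T and re-normalizes; this mirrors the paper's definition, not necessarily the module's exact code:

import torch


def sharpen(y_pred: torch.Tensor, temp: float) -> torch.Tensor:
    # T < 1 pushes the predicted distribution towards one-hot.
    y_sharp = y_pred ** (1 / temp)
    return y_sharp / y_sharp.sum(dim=1, keepdim=True)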

guess_labels

Takes in the unlabeled data, and returns the guessed labels.

progress

The current progress of the training, 0.0 to 1.0, where 0.0 is the start of the training, and 1.0 is the end.

training_step

The training step runs through 1 batch of data, and returns the loss. Note that this is significantly different from validation step, as we handle the K-Augmented data differently.

test / validation_step

The test / validation step runs through 1 batch of data, and returns the loss.

predict_step

The predict step runs through 1 batch of data, and returns the actual decoded labels.

on_after_backward

The on_after_backward hook is called after the backward pass, and allows us to do custom operations. In our case, we use it to update the EMA model.

on_before_batch_transfer

The on_before_batch_transfer hook is called before the batch is transferred to the GPU. In our case, we use it to preprocess the batch.

A diagram of how these components interact with each other is shown below:

[Diagram: Batch → on_before_batch_transfer → training_step → guess_labels → sharpen → mix_up → loss_lbl & loss_unl (scaled by loss_unl_scaler) → loss → backward → on_after_backward → update_ema; validation_step → loss]

Finally, we show an example of how to use the MixMatch module:

from sklearn.preprocessing import StandardScaler, OrdinalEncoder from frdc.train.mixmatch_module import MixMatchModule @@ -60,7 +60,7 @@ sharpen_temp=0.5, mix_beta_alpha=0.75, ) -

In particular, we need to supply some transformations for the preprocessing step. In this case, we use StandardScaler to scale the data, and OrdinalEncoder to encode the labels.

  1. It's best if standardization is done only on the training data, and not the validation data to better fit real-world scenarios.

  2. We use OrdinalEncoder as it handles unseen labels. So if a class doesn't show up in the training data, it will be encoded as np.nan, and will not participate in the loss calculation.

Design Choices

Static Method Overriding

We implement many functions as static, as we believe that a functional style reduces dependencies, thus making the code easier to test and debug.

Furthermore, it allows the subclasses to easily override the functions, to customize the behavior of the MixMatch module.

For example, the loss_unl_scaler function is static, thus, we can implement our own scaling function, and pass it to the MixMatch module.

+

In particular, we need to supply some transformations for the preprocessing step. In this case, we use StandardScaler to scale the data, and OrdinalEncoder to encode the labels.

  1. It's best if standardization is done only on the training data, and not the validation data to better fit real-world scenarios.

  2. We use OrdinalEncoder as it handles unseen labels. So if a class doesn't show up in the training data, it will be encoded as np.nan, and will not participate in the loss calculation.

Design Choices

Static Method Overriding

We implement many functions as static, as we believe that a functional style reduces dependencies, thus making the code easier to test and debug.

Furthermore, it allows the subclasses to easily override the functions, to customize the behavior of the MixMatch module.

For example, the loss_unl_scaler function is static, thus, we can implement our own scaling function, and pass it to the MixMatch module.

def my_loss_unl_scaler(progress: float) -> float: return progress ** 2 @@ -68,4 +68,4 @@ @staticmethod def loss_unl_scaler(progress: float) -> float: return my_loss_unl_scaler(progress) -

If we had used a method instead, we would have to consider instance state, which would make it harder to override.

Why not use Dataclasses?

One of the biggest caveats of nn.Module is that it requires super().__init__() to be called before anything is assigned. While dataclass can leverage __post_init__ to do the same, we felt that this was too much of a hassle to save a few keystrokes. Thus, we opted to use __init__ instead; while more verbose, it is more explicit.

Why use PyTorch Lightning?

While we did hit some roadblocks implementing SSL, due to its complex and unconventional nature, we felt that the benefits of using PyTorch Lightning outweighed the cons.

on_before_batch_transfer and on_after_backward are unconventional hooks, and we had to do some digging to find them. It can be argued that by just writing explicit code, we can avoid the need for these hooks, but the PyTorch ecosystem fixes many other issues, so we accepted this trade-off.

References

Last modified: 27 December 2023
\ No newline at end of file +

If we had used a method instead, we would have to consider instance state, which would make it harder to override.

Why not use Dataclasses?

One of the biggest caveats of nn.Module is that it requires super().__init__() to be called before anything is assigned. While dataclass can leverage __post_init__ to do the same, we felt that this was too much of a hassle to save a few keystrokes. Thus, we opted to use __init__ instead; while more verbose, it is more explicit.

Why use PyTorch Lightning?

While we did hit some roadblocks implementing SSL, due to its complex and unconventional nature, we felt that the benefits of using PyTorch Lightning outweighed the cons.

on_before_batch_transfer and on_after_backward are unconventional hooks, and we had to do some digging to find them. It can be argued that by just writing explicit code, we can avoid the need for these hooks, but the PyTorch ecosystem fixes many other issues, so we accepted this trade-off.

References

Last modified: 27 December 2023
\ No newline at end of file diff --git a/docs/mix-match.html b/docs/mix-match.html index 514c3298..5d0f4795 100644 --- a/docs/mix-match.html +++ b/docs/mix-match.html @@ -1 +1 @@ - MixMatch | Documentation

Documentation 0.0.8 Help

MixMatch

In FRDC-ML, we leverage semi-supervised learning to improve the model's performance through better augmentation consistency and by making use of unlabelled data.

The algorithm we use is MixMatch, a state-of-the-art semi-supervised learning algorithm. It is based on the idea of consistency regularization, which encourages models to predict the same class even after augmentations that occur naturally in the real world.

Our implementation of MixMatch is a refactored version of YU1ut/MixMatch-pytorch. We've refactored the code to follow more modern PyTorch practices, allowing us to utilize it with modern PyTorch frameworks such as PyTorch Lightning.

We won't go through the details of MixMatch here, see Our Documentation in our MixMatch-PyTorch-CIFAR10 repository for more details.

Implementation Details

  1. How we implemented the MixMatch logic MixMatchModule

  2. How we implemented the unique MixMatch data loading logic Custom MixMatch Data Loading

References

Last modified: 27 December 2023
\ No newline at end of file + MixMatch | Documentation

Documentation 0.0.8 Help

MixMatch

In FRDC-ML, we leverage semi-supervised learning to improve the model's performance through better augmentation consistency and the use of unlabelled data.

The algorithm we use is MixMatch, a state-of-the-art semi-supervised learning algorithm. It is based on the idea of consistency regularization, which encourages models to predict the same class even after augmentations that occur naturally in the real world.
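Schematically, the training objective looks like this (a simplification for intuition; all names here are illustrative, not our actual API):

# supervised loss on labelled data, plus a scaled consistency loss
# on (augmented) unlabelled data against its guessed labels
loss = cross_entropy(model(x_lab), y_lab) \
       + scale(progress) * mse(model(augment(x_unl)), guess_labels(x_unl))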

Our implementation of MixMatch is a refactored version of YU1ut/MixMatch-pytorch. We've refactored the code to follow more modern PyTorch practices, allowing us to use it with frameworks such as PyTorch Lightning.

We won't go through the details of MixMatch here; see Our Documentation in our MixMatch-PyTorch-CIFAR10 repository for more details.

Implementation Details

  1. How we implemented the MixMatch logic MixMatchModule

  2. How we implemented the unique MixMatch data loading logic Custom MixMatch Data Loading

References

Last modified: 27 December 2023
\ No newline at end of file diff --git a/docs/model-test-chestnut-may-dec.html b/docs/model-test-chestnut-may-dec.html index 1de3317d..91b6538a 100644 --- a/docs/model-test-chestnut-may-dec.html +++ b/docs/model-test-chestnut-may-dec.html @@ -1 +1 @@ - Model Test Chestnut May-Dec | Documentation

Documentation 0.0.8 Help

Model Test Chestnut May-Dec

This test is used to evaluate the model performance on the Chestnut Nature Park May & December dataset.

See this script in model_tests/chestnut_dec_may/train.py.

Motivation

The usage of this model will be to classify trees in unseen datasets under different conditions. In this test, we'll evaluate it under a different season.

A caveat is that it'll be evaluated on the same set of trees, so it's not representative of a field test. However, given the difficulty of obtaining datasets, this still gives us a good preliminary idea of how the model will perform in different conditions.

Methodology

We train on the December dataset, and test on the May dataset.

Labelled Train
Unlabelled Train
Test
DecDataset
Model
MayDataset

Despite not having any true unlabelled data, we use MixMatch by treating the labelled data of the December dataset as unlabelled data.

Model

The current model is a simple InceptionV3 transfer learning model, with the last layer replaced by fully connected layer(s).

SSL Loss
Input
InceptionV3 Frozen
FC Layer(s)
Softmax
Output

Preprocessing

For Training:

Segment
RandomCrop 299
Horizontal Flip 50%
Vertical Flip 50%
Normalize By Training Mean & Std

For Validation:

Segment
CenterCrop 299
Normalize By Training Mean & Std

For Evaluation:

Segment
CenterCrop 299
Normalize By Training Mean & Std
As Is
Horizontal Flip
Vertical Flip
Horizontal & Vertical Flip

For evaluation, we check that the model is invariant to horizontal and vertical flips, evaluating on the flipped variants as well as the original image.

Hyperparameters

The following hyperparameters are used:

  • Optimizer: Adam

  • Learning Rate: 1e-3

  • Batch Size: 32

  • Epochs: 10

  • Train Iterations: 25~100

  • Validation Iterations: 10~25

  • Early Stopping: 4

Results

We achieve around 40% accuracy on the test set, compared to 100% on the training set. This indicates that the model has saturated and cannot learn any more from the training set. There's no indication of overfitting, as the validation loss simply plateaus.

W&B Dashboard

Caveats

  • The test set is very small, so the results are not very representative.

  • The test set is the same set of trees, so it's not a true test of the model performance in different conditions.

  • There are many classes with only 1 sample, so the model may not be able to learn the features of these classes well.

Last modified: 27 December 2023
\ No newline at end of file + Model Test Chestnut May-Dec | Documentation

Documentation 0.0.8 Help

Model Test Chestnut May-Dec

This test is used to evaluate the model performance on the Chestnut Nature Park May & December dataset.

See this script in model_tests/chestnut_dec_may/train.py.

Motivation

The usage of this model will be to classify trees in unseen datasets under different conditions. In this test, we'll evaluate it under a different season.

A caveat is that it'll be evaluated on the same set of trees, so it's not representative of a field test. However, given the difficulty of obtaining datasets, this still gives us a good preliminary idea of how the model will perform in different conditions.

Methodology

We train on the December dataset, and test on the May dataset.

Labelled Train
Unlabelled Train
Test
DecDataset
Model
MayDataset

Despite not having any true unlabelled data, we use MixMatch by treating the labelled data of the December dataset as unlabelled data.
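This mirrors the training script in this patch series (tests/model_tests/chestnut_dec_may/train.py); the preprocess helpers are defined in the test utilities, and the import path is as of this patch series:

from frdc.load.dataset import FRDCDatasetPreset as ds

train_lab_ds = ds.chestnut_20201218(transform=train_preprocess)
# The same December data, with labels masked away:
train_unl_ds = ds.chestnut_20201218.unlabelled(transform=train_unl_preprocess(2))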

Model

The current model is a simple InceptionV3 transfer learning model, with the last layer replaced by fully connected layer(s).

SSL Loss
Input
InceptionV3 Frozen
FC Layer(s)
Softmax
Output

Preprocessing

For Training:

Segment
RandomCrop 299
Horizontal Flip 50%
Vertical Flip 50%
Normalize By Training Mean & Std

For Validation:

Segment
CenterCrop 299
Normalize By Training Mean & Std

For Evaluation:

Segment
CenterCrop 299
Normalize By Training Mean & Std
As Is
Horizontal Flip
Vertical Flip
Horizontal & Vertical Flip

For evaluation, we check that the model is invariant to horizontal and vertical flips, evaluating on the flipped variants as well as the original image, as sketched below.
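A sketch of the four views we evaluate, mirroring FRDCDatasetFlipped in the test utilities, where x is a preprocessed segment tensor:

from torchvision.transforms.v2 import RandomHorizontalFlip, RandomVerticalFlip

views = [
    x,                                                       # as-is
    RandomHorizontalFlip(p=1)(x),                            # horizontal flip
    RandomVerticalFlip(p=1)(x),                              # vertical flip
    RandomHorizontalFlip(p=1)(RandomVerticalFlip(p=1)(x)),   # both
]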

Hyperparameters

The following hyperparameters are used:

  • Optimizer: Adam

  • Learning Rate: 1e-3

  • Batch Size: 32

  • Epochs: 10

  • Train Iterations: 25~100

  • Validation Iterations: 10~25

  • Early Stopping: 4
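In code, these might be set up as follows (a sketch; the monitored metric name is an assumption, not taken from the training script):

import torch
from lightning.pytorch.callbacks import EarlyStopping

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
early_stopping = EarlyStopping(monitor="val_loss", patience=4)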

Results

We achieve around 40% accuracy on the test set, compared to 100% on the training set. This indicates that the model has saturated and cannot learn any more from the training set. There's no indication of overfitting, as the validation loss simply plateaus.

W&B Dashboard

Caveats

  • The test set is very small, so the results are not very representative.

  • The test set is the same set of trees, so it's not a true test of the model performance in different conditions.

  • There are many classes with only 1 sample, so the model may not be able to learn the features of these classes well.

Last modified: 27 December 2023
\ No newline at end of file diff --git a/docs/overview.html b/docs/overview.html index 0a5fbe11..627976e2 100644 --- a/docs/overview.html +++ b/docs/overview.html @@ -1 +1 @@ - Overview | Documentation

Documentation 0.0.8 Help

Overview

Forest Recovery Digital Companion (FRDC) is an ML-assisted companion for ecologists to automatically classify surveyed trees via an Unmanned Aerial Vehicle (UAV).

This package, FRDC-ML, is the Machine Learning backbone of this project: a centralized repository of tools and model architectures to be used in the FRDC pipeline.

Get started here

Other Projects

FRDC-UI

The User Interface Repository for FRDC, a WebApp GUI for ecologists to adjust annotations.

Last modified: 27 December 2023
\ No newline at end of file + Overview | Documentation

Documentation 0.0.8 Help

Overview

Forest Recovery Digital Companion (FRDC) is an ML-assisted companion for ecologists to automatically classify surveyed trees via an Unmanned Aerial Vehicle (UAV).

This package, FRDC-ML, is the Machine Learning backbone of this project: a centralized repository of tools and model architectures to be used in the FRDC pipeline.

Get started here

Other Projects

FRDC-UI

The User Interface Repository for FRDC, a WebApp GUI for ecologists to adjust annotations.

Last modified: 27 December 2023
\ No newline at end of file diff --git a/docs/preprocessing-extract-segments.html b/docs/preprocessing-extract-segments.html index ee047f28..03e0ae62 100644 --- a/docs/preprocessing-extract-segments.html +++ b/docs/preprocessing-extract-segments.html @@ -1,4 +1,4 @@ - preprocessing.extract_segments | Documentation

Documentation 0.0.8 Help

preprocessing.extract_segments

Functions

extract_segments_from_labels

Extracts segments from a label classification.

extract_segments_from_bounds

Extracts segments from Rect bounds.

remove_small_segments_from_labels

Removes small segments from a label classification.

Extract with Boundaries

A boundary is a Rect object that represents the minimum bounding box of a segment, with x0, y0, x1, y1 coordinates.

It simply slices the original image to the bounding box. The origin is the top left corner of the image.

+ preprocessing.extract_segments | Documentation

Documentation 0.0.8 Help

preprocessing.extract_segments

Functions

extract_segments_from_labels

Extracts segments from a label classification.

extract_segments_from_bounds

Extracts segments from Rect bounds.

remove_small_segments_from_labels

Removes small segments from a label classification.

Extract with Boundaries

A boundary is a Rect object that represents the minimum bounding box of a segment, with x0, y0, x1, y1 coordinates.

It simply slices the original image to the bounding box. The origin is the top left corner of the image.

+-----------------+ +-----------+ | Original | | Segmented | | Image | | Image | @@ -9,7 +9,7 @@ +-----+-----+-----+ 1, 2, 0, 2 +-----+-----+ | 7 | 8 | 9 | x0 y0 x1 y1 | 8 | 9 | +-----+-----+-----+ +-----+-----+ -
+
+-----------------+ +-----------------+ | Original | | Segmented | | Image | | Image | @@ -20,7 +20,7 @@ +-----+-----+-----+ 1, 2, 0, 2 +-----+-----+-----+ | 7 | 8 | 9 | x0 y0 x1 y1 | 0 | 8 | 9 | +-----+-----+-----+ +-----+-----+-----+ -
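In essence, the extraction is just a NumPy slice into the original array (a sketch, assuming a Rect exposing x0, y0, x1, y1 attributes):

import numpy as np

def slice_by_bound(ar: np.ndarray, b) -> np.ndarray:
    # Origin at the top-left: rows index y, columns index x.
    return ar[b.y0:b.y1, b.x0:b.x1]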

Extract with Labels

A label classification is a np.ndarray where each pixel is mapped to a segment. The segments are mapped to a unique integer. In our project, the 0th label is the background.

For example, a label classification of 3 segments will look like this:

+

Extract with Labels

A label classification is a np.ndarray where each pixel is mapped to a segment. The segments are mapped to a unique integer. In our project, the 0th label is the background.

For example, a label classification of 3 segments will look like this:

+-----------------+ +-----------------+ | Label | | Original | | Classification | | Image | @@ -31,7 +31,7 @@ +-----+-----+-----+ +-----+-----+-----+ | 1 | 1 | 0 | | 7 | 8 | 9 | +-----+-----+-----+ +-----+-----+-----+ -

The extraction will take the minimum bounding box of each segment and return a list of segments.

For example, the label 1 and 2 extracted images will be

+

The extraction will take the minimum bounding box of each segment and return a list of segments.

For example, the label 1 and 2 extracted images will be

+-----------+ +-----------+ | Extracted | | Extracted | | Segment 1 | | Segment 2 | @@ -42,7 +42,7 @@ +-----+-----+ +-----+-----+ | 7 | 8 | +-----+-----+ -
+
+-----------------+ +-----------------+ | Extracted | | Extracted | | Segment 1 | | Segment 2 | @@ -53,7 +53,7 @@ +-----+-----+-----+ +-----+-----+-----+ | 7 | 8 | 0 | | 0 | 0 | 0 | +-----+-----+-----+ +-----+-----+-----+ -
  • If cropped is False, the segments are padded with 0s to the original image size. While this can ensure shape consistency, it can consume more memory for large images.

  • If cropped is True, the segments are cropped to the minimum bounding box. This can save memory, but the shape of the segments will be inconsistent.

Usage

Extract from Bounds and Labels

Extract segments from bounds and labels.

+
  • If cropped is False, the segments are padded with 0s to the original image size. While this can ensure shape consistency, it can consume more memory for large images.

  • If cropped is True, the segments are cropped to the minimum bounding box. This can save memory, but the shape of the segments will be inconsistent.
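For example, using the bounds-based extraction from the Usage section below:

segments_padded = extract_segments_from_bounds(ar, bounds, cropped=False)
segments_cropped = extract_segments_from_bounds(ar, bounds, cropped=True)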

Usage

Extract from Bounds and Labels

Extract segments from bounds and labels.

import numpy as np from frdc.load import FRDCDataset from frdc.preprocess.extract_segments import extract_segments_from_bounds @@ -65,7 +65,7 @@ bounds, labels = ds.get_bounds_and_labels() segments: list[np.ndarray] = extract_segments_from_bounds(ar, bounds) -

Extract from Auto-Segmentation

Extract segments from a label classification.

+

Extract from Auto-Segmentation

Extract segments from a label classification.

from skimage.morphology import remove_small_objects, remove_small_holes import numpy as np @@ -91,4 +91,4 @@ min_height=10, min_width=10) segments: list[np.ndarray] = extract_segments_from_labels(ar, ar_labels) -

API

extract_segments_from_labels(ar, ar_labels, cropped)

Extracts segments from a label classification.


ar_labels is a label classification as a np.ndarray

extract_segments_from_bounds(ar, bounds, cropped)

Extracts segments from Rect bounds.


bounds is a list of Rect bounds.

remove_small_segments_from_labels(ar_labels, min_height, min_width)

Removes small segments from a label classification.


Last modified: 27 December 2023
\ No newline at end of file +

API

extract_segments_from_labels(ar, ar_labels, cropped)

Extracts segments from a label classification.


ar_labels is a label classification as a np.ndarray

extract_segments_from_bounds(ar, bounds, cropped)

Extracts segments from Rect bounds.


bounds is a list of Rect bounds.

remove_small_segments_from_labels(ar_labels, min_height, min_width)

Removes small segments from a label classification.


Last modified: 27 December 2023
\ No newline at end of file diff --git a/docs/preprocessing-glcm-padded.html b/docs/preprocessing-glcm-padded.html index 4fe5fd50..0f99613f 100644 --- a/docs/preprocessing-glcm-padded.html +++ b/docs/preprocessing-glcm-padded.html @@ -1,4 +1,4 @@ - preprocessing.glcm_padded | Documentation

Documentation 0.0.8 Help

preprocessing.glcm_padded

Functions

glcm_padded

Computes the GLCM of the NDArray bands with padding.

glcm_padded_cached

Computes the GLCM of the NDArray bands with padding, and caches it.

append_glcm_padded_cached

Computes the GLCM of the NDArray bands with padding, and caches it and also appends it onto the original array.

Usage

We show a few examples of how to use the GLCM functions.

+ preprocessing.glcm_padded | Documentation

Documentation 0.0.8 Help

preprocessing.glcm_padded

Functions

glcm_padded

Computes the GLCM of the NDArray bands with padding.

glcm_padded_cached

Computes the GLCM of the NDArray bands with padding, and caches it.

append_glcm_padded_cached

Computes the GLCM of the NDArray bands with padding, and caches it and also appends it onto the original array.

Usage

We show a few examples of how to use the GLCM functions.

import numpy as np from glcm_cupy import Features @@ -23,4 +23,4 @@ ar_glcm_cached_appended = append_glcm_padded_cached(ar, bin_from=1, bin_to=4, radius=3) -
  • ar_glcm is the GLCM of the original array, with the last dimension being the GLCM features. The number of features is determined by the features parameter, which defaults to all features.

  • ar_glcm_2_features selects only 2 features, with the last dimension being the 2 GLCM features specified.

  • ar_glcm_cached caches the GLCM so that if you call it again, it will return the cached version. It stores its data at the project root dir, under .cache/.

  • ar_glcm_cached_appended is a wrapper around glcm_padded_cached; it appends the GLCM features onto the original array. It's equivalent to calling glcm_padded_cached and then np.concatenate on the final axis.

Caching

GLCM is an expensive operation, thus we recommend caching it if the input parameters will be the same. This is especially useful if you're experimenting with the same dataset with constant parameters.

API

glcm_padded(ar, bin_from, bin_to, radius, step_size, features)

Computes the GLCM of the NDArray bands with padding.


  • ar is the input array

  • bin_from is the upper bound of the input

  • bin_to is the upper bound of the GLCM input, i.e. the resolution that GLCM operates on

  • radius is the radius of the GLCM

  • step_size is the step size of the GLCM

  • features is the list of GLCM features to compute

The return shape is (H, W, C, GLCM Features).

See glcm_cupy for the GLCM Features.

glcm_padded_cached(ar, bin_from, bin_to, radius, step_size, features)

Computes the GLCM of the NDArray bands with padding, and caches it.


See glcm_padded for the parameters and output shape

append_glcm_padded_cached(ar, bin_from, bin_to, radius, step_size, features)

Computes the GLCM of the NDArray bands with padding, and caches it and also appends it onto the original array.


See glcm_padded for the parameters


The return shape is (H, W, C + C × GLCM Features).

The function automatically flattens the last 2 dimensions of the GLCM features and appends them onto the original array.

Last modified: 27 December 2023
\ No newline at end of file +
  • ar_glcm is the GLCM of the original array, with the last dimension being the GLCM features. The number of features is determined by the features parameter, which defaults to all features.

  • ar_glcm_2_features selects only 2 features, with the last dimension being the 2 GLCM features specified.

  • ar_glcm_cached caches the GLCM so that if you call it again, it will return the cached version. It stores its data at the project root dir, under .cache/.

  • ar_glcm_cached_appended is a wrapper around glcm_padded_cached; it appends the GLCM features onto the original array. It's equivalent to calling glcm_padded_cached and then np.concatenate on the final axis.

Caching

GLCM is an expensive operation, thus we recommend caching it if the input parameters will be the same. This is especially useful if you're experimenting with the same dataset with constant parameters.
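For example, calling the cached variant twice with identical parameters only computes once (parameters as in the usage example above):

ar_glcm = glcm_padded_cached(ar, bin_from=1, bin_to=4, radius=3)
# The second call with the same parameters is a cache hit, loaded from .cache/
ar_glcm = glcm_padded_cached(ar, bin_from=1, bin_to=4, radius=3)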

API

glcm_padded(ar, bin_from, bin_to, radius, step_size, features)

Computes the GLCM of the NDArray bands with padding.


  • ar is the input array

  • bin_from is the upper bound of the input

  • bin_to is the upper bound of the GLCM input, i.e. the resolution that GLCM operates on

  • radius is the radius of the GLCM

  • step_size is the step size of the GLCM

  • features is the list of GLCM features to compute

The return shape is (H, W, C, GLCM Features).

See glcm_cupy for the GLCM Features.

glcm_padded_cached(ar, bin_from, bin_to, radius, step_size, features)

Computes the GLCM of the NDArray bands with padding, and caches it.


See glcm_padded for the parameters and output shape

append_glcm_padded_cached(ar, bin_from, bin_to, radius, step_size, features)

Computes the GLCM of the NDArray bands with padding, and caches it and also appends it onto the original array.


See glcm_padded for the parameters


The return shape is (H, W, C + C × GLCM Features).

The function automatically flattens the last 2 dimensions of the GLCM features and appends them onto the original array.

Last modified: 27 December 2023
\ No newline at end of file diff --git a/docs/preprocessing-morphology.html b/docs/preprocessing-morphology.html index fbe3df70..7817824f 100644 --- a/docs/preprocessing-morphology.html +++ b/docs/preprocessing-morphology.html @@ -1,4 +1,4 @@ - preprocessing.morphology | Documentation

Documentation 0.0.8 Help

preprocessing.morphology

Functions

threshold_binary_mask

Thresholds a selected band of an NDArray to yield a binary mask.

binary_watershed

Performs watershed on a binary mask to yield a mapped label classification

Usage

Perform auto-segmentation on a dataset to yield a label classification.

+ preprocessing.morphology | Documentation

Documentation 0.0.8 Help

preprocessing.morphology

Functions

threshold_binary_mask

Thresholds a selected band of an NDArray to yield a binary mask.

binary_watershed

Performs watershed on a binary mask to yield a mapped label classification

Usage

Perform auto-segmentation on a dataset to yield a label classification.

from frdc.load import FRDCDataset from frdc.preprocess.morphology import ( threshold_binary_mask, binary_watershed @@ -10,6 +10,6 @@ ar, order = ds.get_ar_bands() mask = threshold_binary_mask(ar, order.index('NIR'), 90 / 256) ar_label = binary_watershed(mask) -

API

threshold_binary_mask(ar, band_idx, threshold_value)

Thresholds a selected band of an NDArray to yield a binary mask as np.ndarray


This is equivalent to

+

API

threshold_binary_mask(ar, band_idx, threshold_value)

Thresholds a selected band of an NDArray to yield a binary mask as np.ndarray


This is equivalent to

ar[..., band_idx] > threshold_value -
binary_watershed(ar_mask, peaks_footprint, watershed_compactness)

Performs watershed on a binary mask to yield a mapped label classification as a np.ndarray


  • peaks_footprint is the footprint of skimage.feature.peak_local_max

  • watershed_compactness is the compactness of skimage.morphology.watershed

Last modified: 27 December 2023
\ No newline at end of file +
binary_watershed(ar_mask, peaks_footprint, watershed_compactness)

Performs watershed on a binary mask to yield a mapped label classification as a np.ndarray


  • peaks_footprint is the footprint of skimage.feature.peak_local_max

  • watershed_compactness is the compactness of skimage.morphology.watershed

Last modified: 27 December 2023
\ No newline at end of file diff --git a/docs/preprocessing-scale.html b/docs/preprocessing-scale.html index ebb1b8dc..cdf27d34 100644 --- a/docs/preprocessing-scale.html +++ b/docs/preprocessing-scale.html @@ -1,4 +1,4 @@ - preprocessing.scale | Documentation

Documentation 0.0.8 Help

preprocessing.scale

Functions

scale_0_1_per_band

Scales the NDArray bands to [0, 1] per band.

scale_normal_per_band

Scales the NDArray bands to zero mean unit variance per band.

scale_static_per_band

Scales the NDArray bands by a predefined configuration. Take a look at frdc.conf.BAND_MAX_CONFIG for an example.

Usage

+ preprocessing.scale | Documentation

Documentation 0.0.8 Help

preprocessing.scale

Functions

scale_0_1_per_band

Scales the NDArray bands to [0, 1] per band.

scale_normal_per_band

Scales the NDArray bands to zero mean unit variance per band.

scale_static_per_band

Scales the NDArray bands by a predefined configuration. Take a look at frdc.conf.BAND_MAX_CONFIG for an example.

Usage

from frdc.load import FRDCDataset from frdc.preprocess.scale import ( scale_0_1_per_band, scale_normal_per_band, scale_static_per_band @@ -12,4 +12,4 @@ ar_01 = scale_0_1_per_band(ar) ar_norm = scale_normal_per_band(ar) ar_static = scale_static_per_band(ar, order, BAND_MAX_CONFIG) -
Last modified: 27 December 2023
\ No newline at end of file +
Last modified: 27 December 2023
\ No newline at end of file diff --git a/docs/retrieve-our-datasets.html b/docs/retrieve-our-datasets.html index 641bbf98..34b21eb2 100644 --- a/docs/retrieve-our-datasets.html +++ b/docs/retrieve-our-datasets.html @@ -1,10 +1,10 @@ - Retrieve our Datasets | Documentation

Documentation 0.0.8 Help

Retrieve our Datasets

In this tutorial, we'll learn how to:

  • Retrieve FRDC's Hyperspectral Image Data as np.ndarray

  • Retrieve FRDC's Ground Truth bounds and labels

  • Slice/segment the image data by the bounds

Prerequisites

  • New here? Get Started.

  • Setup the Google Cloud Authorization to download the data.

Retrieve the Data

To retrieve the data, use FRDCDataset

Here, we'll download and load our

  • ar: Hyperspectral Image Data

  • order: The order of the bands

  • bounds: The bounds of the trees (segments)

  • labels: The labels of the trees (segments)

+ Retrieve our Datasets | Documentation

Documentation 0.0.8 Help

Retrieve our Datasets

In this tutorial, we'll learn how to:

  • Retrieve FRDC's Hyperspectral Image Data as np.ndarray

  • Retrieve FRDC's Ground Truth bounds and labels

  • Slice/segment the image data by the bounds

Prerequisites

  • New here? Get Started.

  • Setup the Google Cloud Authorization to download the data.

Retrieve the Data

To retrieve the data, use FRDCDataset

Here, we'll download and load our

  • ar: Hyperspectral Image Data

  • order: The order of the bands

  • bounds: The bounds of the trees (segments)

  • labels: The labels of the trees (segments)

from frdc.load.dataset import FRDCDataset ds = FRDCDataset(site="chestnut_nature_park", date="20201218", version=None) ar, order = ds.get_ar_bands() bounds, labels = ds.get_bounds_and_labels() -

What Datasets are there?

+

What Datasets are there?

from frdc.load.gcs import list_gcs_datasets print(list_gcs_datasets()) # 0 DEBUG/0 @@ -12,7 +12,7 @@ # 2 casuarina/20220418/93deg # 3 chestnut_nature_park/20201218 # ... -
  • The first part of the path is the site, and the second part is the date.

  • The version is the rest of the path; if there isn't any, use None.

For example:

  • For "ds/date/ver": site="ds", date="date", version="ver"

  • For "ds/date/ver/01/data": site="ds", date="date", version="ver/01/data"

  • For "ds/date": site="ds", date="date", version=None

Segment the Data

To segment the data, use Extract Segments.

Here, we'll segment the data by the bounds.

+
  • The first part of the path is the site, and the second part is the date.

  • The version is the rest of the path; if there isn't any, use None.

For example:

  • For "ds/date/ver": site="ds", date="date", version="ver"

  • For "ds/date/ver/01/data": site="ds", date="date", version="ver/01/data"

  • For "ds/date": site="ds", date="date", version=None

Segment the Data

To segment the data, use Extract Segments.

Here, we'll segment the data by the bounds.

from frdc.load.dataset import FRDCDataset from frdc.preprocess.extract_segments import extract_segments_from_bounds @@ -20,7 +20,7 @@ ar, order = ds.get_ar_bands() bounds, labels = ds.get_bounds_and_labels() segments = extract_segments_from_bounds(ar, bounds) -

segments is a list of np.ndarray of shape (H, W, C), each representing a tree. The order of segments is the same as labels, so you can use labels to identify each tree.

Plot the Data (Optional)

We can then use these data to plot out the first tree segment.

+

segments is a list of np.ndarray of shape (H, W, C), each representing a tree. The order of segments is the same as labels, so you can use labels to identify each tree.
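For example, to pair each segment with its label:

for segment, label in zip(segments, labels):
    print(label, segment.shape)  # each segment is (H, W, C)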

Plot the Data (Optional)

We can then use these data to plot out the first tree segment.

import matplotlib.pyplot as plt from frdc.load.dataset import FRDCDataset @@ -38,4 +38,4 @@ plt.imshow(segment_0_rgb_scaled) plt.title(f"Tree {labels[0]}") plt.show() -

See also: preprocessing.scale.scale_0_1_per_band

Matplotlib cannot show the data correctly as-is, so we need to:

  • Convert the data from BGR to RGB

  • Scale the data to 0-1 per band

Last modified: 27 December 2023
\ No newline at end of file +

See also: preprocessing.scale.scale_0_1_per_band

Matplotlib cannot show the data correctly as-is, so we need to:

  • Convert the data from BGR to RGB

  • Scale the data to 0-1 per band

Last modified: 27 December 2023
\ No newline at end of file diff --git a/docs/train-frdc-lightning.html b/docs/train-frdc-lightning.html index 52cde410..19ba79c2 100644 --- a/docs/train-frdc-lightning.html +++ b/docs/train-frdc-lightning.html @@ -1,4 +1,4 @@ - train.frdc_datamodule & frdc_module | Documentation

Documentation 0.0.8 Help

train.frdc_datamodule & frdc_module

These are FRDC-specific LightningDataModule and LightningModule classes, core components of the PyTorch Lightning ecosystem that provide a simple interface to train and evaluate models.

Classes

FRDCDataModule

The FRDC PyTorch Lightning DataModule.

FRDCModule

The FRDC PyTorch Lightning Module.

Usage

API

FRDCDataModule(segments, labels, preprocess, augmentation, train_val_test_split, batch_size)

Initializes the FRDC PyTorch Lightning DataModule.


  • segments, labels are retrieved from our dataset loading utilities (see Retrieve our Datasets).

  • preprocess is a function that takes in a segment and returns a preprocessed segment. In particular, it should accept a list of NumPy NDArrays and return a single stacked PyTorch Tensor.

  • augmentation is a function that takes in a segment and returns an augmented segment. In particular, it takes in a PyTorch Tensor and returns another.

  • train_val_test_split is a function that takes a TensorDataset and returns a list of 3 TensorDatasets, for train, val and test respectively.

  • batch_size is the batch size.

FRDCModule(model_cls, model_kwargs, optim_cls, optim_kwargs)

Initializes the FRDC PyTorch Lightning Module.


  • model_cls is the Class of the model.

  • model_kwargs is the kwargs to pass to the model.

  • optim_cls is the Class of the optimizer.

  • optim_kwargs is the kwargs to pass to the optimizer.

Internally, the module will initialize the model and optimizer as follows:

+ train.frdc_datamodule & frdc_module | Documentation

Documentation 0.0.8 Help

train.frdc_datamodule & frdc_module

These are FRDC-specific LightningDataModule and LightningModule classes, core components of the PyTorch Lightning ecosystem that provide a simple interface to train and evaluate models.

Classes

FRDCDataModule

The FRDC PyTorch Lightning DataModule.

FRDCModule

The FRDC PyTorch Lightning Module.

Usage
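
A minimal sketch of how the two classes fit together, based on the signatures in the API section below (all helper names here are assumptions, not part of the API):

import torch

dm = FRDCDataModule(
    segments=segments,               # from our dataset loading utilities
    labels=labels,
    preprocess=my_preprocess,        # list[np.ndarray] -> torch.Tensor
    augmentation=my_augment,         # torch.Tensor -> torch.Tensor
    train_val_test_split=my_split,   # TensorDataset -> [train, val, test]
    batch_size=32,
)
m = FRDCModule(
    model_cls=MyModel,
    model_kwargs=dict(n_classes=10),
    optim_cls=torch.optim.Adam,
    optim_kwargs=dict(lr=1e-3),
)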

API

FRDCDataModule(segments, labels, preprocess, augmentation, train_val_test_split, batch_size)

Initializes the FRDC PyTorch Lightning DataModule.


  • segments, labels are retrieved from our dataset loading utilities (see Retrieve our Datasets).

  • preprocess is a function that takes in a segment and returns a preprocessed segment. In particular, it should accept a list of NumPy NDArrays and return a single stacked PyTorch Tensor.

  • augmentation is a function that takes in a segment and returns an augmented segment. In particular, it takes in a PyTorch Tensor and returns another.

  • train_val_test_split is a function that takes a TensorDataset and returns a list of 3 TensorDatasets, for train, val and test respectively.

  • batch_size is the batch size.

FRDCModule(model_cls, model_kwargs, optim_cls, optim_kwargs)

Initializes the FRDC PyTorch Lightning Module.


  • model_cls is the Class of the model.

  • model_kwargs is the kwargs to pass to the model.

  • optim_cls is the Class of the optimizer.

  • optim_kwargs is the kwargs to pass to the optimizer.

Internally, the module will initialize the model and optimizer as follows:

model = model_cls(**model_kwargs) optim = optim_cls(model.parameters(), **optim_kwargs) -
Last modified: 27 December 2023
\ No newline at end of file +
Last modified: 27 December 2023
\ No newline at end of file From ddc7e1c8b38fea5850a07ea6aaef0a6571e83b6e Mon Sep 17 00:00:00 2001 From: Evening Date: Thu, 28 Dec 2023 14:48:01 +0800 Subject: [PATCH 18/52] Implement Preset Class This class will help reduce errors when getting datasets IDE autocomplete prompts --- src/frdc/load/dataset.py | 97 +++++++++++++++++++++++++++------------- 1 file changed, 67 insertions(+), 30 deletions(-) diff --git a/src/frdc/load/dataset.py b/src/frdc/load/dataset.py index b3654cca..34099fc2 100644 --- a/src/frdc/load/dataset.py +++ b/src/frdc/load/dataset.py @@ -4,7 +4,7 @@ from collections import OrderedDict from dataclasses import dataclass from pathlib import Path -from typing import Iterable, Callable, Any +from typing import Iterable, Callable, Any, Protocol import numpy as np import pandas as pd @@ -40,6 +40,10 @@ def __init__( ): """Initializes the FRDC Dataset. + Notes: + We recommend to check FRDCDatasetPreset if you want to use a + pre-defined dataset. + Args: site: The site of the dataset, e.g. "chestnut_nature_park". date: The date of the dataset, e.g. "20201218". @@ -83,35 +87,6 @@ def __getitem__(self, idx): else self.targets[idx], ) - @staticmethod - def _load_debug_dataset(resize: int = 299) -> FRDCDataset: - """Loads a debug dataset from Google Cloud Storage. - - Returns: - A dictionary of the dataset, with keys as the filenames and values - as the images. - """ - from torchvision.transforms.v2 import ( - Compose, - ToImage, - ToDtype, - Resize, - ) - - return FRDCDataset( - site="DEBUG", - date="0", - version=None, - transform=Compose( - [ - ToImage(), - ToDtype(torch.float32), - Resize((resize, resize)), - ] - ), - target_transform=None, - ) - @property def dataset_dir(self): return Path( @@ -247,6 +222,68 @@ def _load_image(path: Path | str) -> np.ndarray: return np.expand_dims(ar, axis=-1) if ar.ndim == 2 else ar +class FRDCDatasetPartial(Protocol): + """This class is used to provide type hints for FRDCDatasetPreset.""" + + def __call__( + self, + transform: Callable[[list[np.ndarray]], Any] = None, + target_transform: Callable[[list[str]], list[str]] = None, + use_legacy_bounds: bool = False, + ): + ... + + +# This curries the FRDCDataset class, so that we can shorthand the preset +# definitions. +def dataset(site: str, date: str, version: str | None) -> FRDCDatasetPartial: + def inner( + transform: Callable[[list[np.ndarray]], Any] = None, + target_transform: Callable[[list[str]], list[str]] = None, + use_legacy_bounds: bool = False, + ): + return FRDCDataset( + site, date, version, transform, target_transform, use_legacy_bounds + ) + + return inner + + +from torchvision.transforms.v2 import ( + Compose, + ToImage, + ToDtype, + Resize, +) + + +@dataclass +class FRDCDatasetPreset: + chestnut_20201218 = dataset("chestnut_nature_park", "20201218", None) + chestnut_20210510_43m = dataset( + "chestnut_nature_park", "20210510", "90deg43m85pct255deg" + ) + chestnut_20210510_60m = dataset( + "chestnut_nature_park", "20210510", "90deg60m84.5pct255deg" + ) + casuarina_20220418_183deg = dataset( + "casuarina_nature_park", "20220418", "183deg" + ) + casuarina_20220418_93deg = dataset( + "casuarina_nature_park", "20220418", "93deg" + ) + DEBUG = lambda resize=299: dataset(site="DEBUG", date="0", version=None)( + transform=Compose( + [ + ToImage(), + ToDtype(torch.float32), + Resize((resize, resize)), + ] + ), + target_transform=None, + ) + + # TODO: Kind of hacky, the unlabelled dataset should somehow come from the # labelled dataset by filtering out the unknown labels. 
But we'll # figure out this later when we do get unlabelled data. From 76b4dff776745a84badcb3fd41ebfb33ca472fc3 Mon Sep 17 00:00:00 2001 From: Evening Date: Thu, 28 Dec 2023 14:48:19 +0800 Subject: [PATCH 19/52] Update debug dataset loading --- tests/conftest.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/conftest.py b/tests/conftest.py index 1e9d84bd..d420f691 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -2,11 +2,12 @@ import pytest from frdc.load import FRDCDataset +from frdc.load.dataset import FRDCDatasetPreset @pytest.fixture(scope="session") def ds() -> FRDCDataset: - return FRDCDataset._load_debug_dataset() + return FRDCDatasetPreset.DEBUG() @pytest.fixture(scope="session") From 2fbd4d410b50a99ccf707d6be6dbc59dbf16bca8 Mon Sep 17 00:00:00 2001 From: Evening Date: Thu, 28 Dec 2023 14:48:44 +0800 Subject: [PATCH 20/52] Update preset loading for chestnut training --- tests/model_tests/chestnut_dec_may/train.py | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/tests/model_tests/chestnut_dec_may/train.py b/tests/model_tests/chestnut_dec_may/train.py index f31ee825..64dc7c8a 100644 --- a/tests/model_tests/chestnut_dec_may/train.py +++ b/tests/model_tests/chestnut_dec_may/train.py @@ -22,7 +22,7 @@ from sklearn.preprocessing import StandardScaler, OrdinalEncoder from frdc.load import FRDCDataset -from frdc.load.dataset import FRDCUnlabelledDataset +from frdc.load.dataset import FRDCUnlabelledDataset, FRDCDatasetPreset from frdc.models.inceptionv3 import InceptionV3MixMatchModule from frdc.train.frdc_datamodule import FRDCDataModule from model_tests.utils import ( @@ -44,11 +44,8 @@ def main( run = wandb.init() logger = WandbLogger(name="chestnut_dec_may", project="frdc") # Prepare the dataset - train_lab_ds = FRDCDataset( - "chestnut_nature_park", - "20201218", - None, - transform=train_preprocess, + train_lab_ds = FRDCDatasetPreset.chestnut_20201218( + transform=train_preprocess ) # TODO: This is a hacky impl of the unlabelled dataset, see the docstring @@ -60,13 +57,7 @@ def main( transform=train_unl_preprocess(2), ) - # Subset(train_ds, np.argwhere(train_ds.targets == 0).reshape(-1)) - val_ds = FRDCDataset( - "chestnut_nature_park", - "20210510", - "90deg43m85pct255deg", - transform=preprocess, - ) + val_ds = FRDCDatasetPreset.chestnut_20210510_43m(transform=preprocess) oe = OrdinalEncoder( handle_unknown="use_encoded_value", From b5a465a976e0415a7803c14561719689e9c35b4d Mon Sep 17 00:00:00 2001 From: Evening Date: Thu, 28 Dec 2023 14:51:44 +0800 Subject: [PATCH 21/52] Move import to top --- src/frdc/load/dataset.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/src/frdc/load/dataset.py b/src/frdc/load/dataset.py index 34099fc2..54b9c918 100644 --- a/src/frdc/load/dataset.py +++ b/src/frdc/load/dataset.py @@ -11,6 +11,12 @@ import torch from PIL import Image from torch.utils.data import Dataset, ConcatDataset +from torchvision.transforms.v2 import ( + Compose, + ToImage, + ToDtype, + Resize, +) from frdc.conf import ( BAND_CONFIG, @@ -249,14 +255,6 @@ def inner( return inner -from torchvision.transforms.v2 import ( - Compose, - ToImage, - ToDtype, - Resize, -) - - @dataclass class FRDCDatasetPreset: chestnut_20201218 = dataset("chestnut_nature_park", "20201218", None) From 334daa7c50fb0ae410f1780f9ef0884369f94eb7 Mon Sep 17 00:00:00 2001 From: Evening Date: Thu, 28 Dec 2023 15:32:58 +0800 Subject: [PATCH 22/52] Implement interface to use add op to concat --- 
src/frdc/load/dataset.py | 33 ++++++++++++---------- tests/unit_tests/load/test_frdc_dataset.py | 12 ++++++++ 2 files changed, 30 insertions(+), 15 deletions(-) diff --git a/src/frdc/load/dataset.py b/src/frdc/load/dataset.py index 54b9c918..578f913c 100644 --- a/src/frdc/load/dataset.py +++ b/src/frdc/load/dataset.py @@ -33,6 +33,20 @@ logger = logging.getLogger(__name__) +# This is not yet used much as we don't have sufficient training data. +class FRDCConcatDataset(ConcatDataset): + def __init__(self, datasets: list[FRDCDataset]): + super().__init__(datasets) + self.datasets: list[FRDCDataset] = datasets + + @property + def targets(self): + return [t for ds in self.datasets for t in ds.targets] + + def __add__(self, other: FRDCDataset) -> FRDCConcatDataset: + return FRDCConcatDataset([*self.datasets, other]) + + @dataclass class FRDCDataset(Dataset): def __init__( @@ -63,6 +77,7 @@ def __init__( self.version = version self.ar, self.order = self.get_ar_bands() + self.targets = None if use_legacy_bounds or (LABEL_STUDIO_CLIENT is None): logger.warning( @@ -227,6 +242,9 @@ def _load_image(path: Path | str) -> np.ndarray: ar = np.asarray(im) return np.expand_dims(ar, axis=-1) if ar.ndim == 2 else ar + def __add__(self, other) -> FRDCConcatDataset: + return FRDCConcatDataset([self, other]) + class FRDCDatasetPartial(Protocol): """This class is used to provide type hints for FRDCDatasetPreset.""" @@ -296,18 +314,3 @@ def __getitem__(self, item): if self.transform else self.ar_segments[item] ) - - -# This is not yet used much as we don't have sufficient training data. -class FRDCConcatDataset(ConcatDataset): - def __init__(self, datasets: list[FRDCDataset]): - super().__init__(datasets) - self.datasets = datasets - - def __getitem__(self, idx): - x, y = super().__getitem__(idx) - return x, y - - @property - def targets(self): - return [t for ds in self.datasets for t in ds.targets] diff --git a/tests/unit_tests/load/test_frdc_dataset.py b/tests/unit_tests/load/test_frdc_dataset.py index c0e2c838..0a75425c 100644 --- a/tests/unit_tests/load/test_frdc_dataset.py +++ b/tests/unit_tests/load/test_frdc_dataset.py @@ -1,4 +1,5 @@ from frdc.conf import BAND_CONFIG +from frdc.load.dataset import FRDCConcatDataset from frdc.utils import Rect @@ -23,3 +24,14 @@ def test_get_bounds(ds): bounds, labels = ds.get_bounds_and_labels() assert all([isinstance(b, Rect) for b in bounds]) assert len(bounds) == len(labels) + + +def test_ds_add_ds_creates_concat_ds(ds): + assert isinstance(ds + ds, FRDCConcatDataset) + assert len(ds + ds) == len(ds) * 2 + + +def test_concat_ds_add_ds_creates_concat_ds(ds): + cds = ds + ds + assert isinstance(cds + ds, FRDCConcatDataset) + assert len(cds + ds) == len(ds) * 3 From ef57cf4b35d118253e92efb363697cdaebb3b7d4 Mon Sep 17 00:00:00 2001 From: Evening Date: Thu, 28 Dec 2023 15:34:03 +0800 Subject: [PATCH 23/52] Remove unused import --- tests/model_tests/chestnut_dec_may/train.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/model_tests/chestnut_dec_may/train.py b/tests/model_tests/chestnut_dec_may/train.py index 64dc7c8a..35999543 100644 --- a/tests/model_tests/chestnut_dec_may/train.py +++ b/tests/model_tests/chestnut_dec_may/train.py @@ -21,7 +21,6 @@ from lightning.pytorch.loggers import WandbLogger from sklearn.preprocessing import StandardScaler, OrdinalEncoder -from frdc.load import FRDCDataset from frdc.load.dataset import FRDCUnlabelledDataset, FRDCDatasetPreset from frdc.models.inceptionv3 import InceptionV3MixMatchModule from frdc.train.frdc_datamodule 
import FRDCDataModule From f04b9a3882c579186163855a8ff2be70322bfd71 Mon Sep 17 00:00:00 2001 From: Evening Date: Thu, 28 Dec 2023 15:50:11 +0800 Subject: [PATCH 24/52] Move warning to func def --- src/frdc/load/dataset.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/frdc/load/dataset.py b/src/frdc/load/dataset.py index 578f913c..06fa4f97 100644 --- a/src/frdc/load/dataset.py +++ b/src/frdc/load/dataset.py @@ -80,11 +80,6 @@ def __init__( self.targets = None if use_legacy_bounds or (LABEL_STUDIO_CLIENT is None): - logger.warning( - "Using legacy bounds.csv file for dataset." - "This is pending to be deprecated in favour of pulling " - "annotations from Label Studio." - ) bounds, self.targets = self.get_bounds_and_labels() self.ar_segments = extract_segments_from_bounds(self.ar, bounds) else: @@ -211,6 +206,11 @@ def get_bounds_and_labels( A tuple of (bounds, labels), where bounds is a list of (x0, y0, x1, y1) and labels is a list of labels. """ + logger.warning( + "Using legacy bounds.csv file for dataset." + "This is pending to be deprecated in favour of pulling " + "annotations from Label Studio." + ) fp = download(fp=self.dataset_dir / file_name) df = pd.read_csv(fp) return ( From 419fd9ef54552dcec457251ac060f5605a3914e4 Mon Sep 17 00:00:00 2001 From: Evening Date: Thu, 28 Dec 2023 17:23:52 +0800 Subject: [PATCH 25/52] Improve syntax of creating unlabelled datasets --- src/frdc/load/dataset.py | 199 ++++++++++++++++++++++++++++++++------- 1 file changed, 165 insertions(+), 34 deletions(-) diff --git a/src/frdc/load/dataset.py b/src/frdc/load/dataset.py index 06fa4f97..258aac9c 100644 --- a/src/frdc/load/dataset.py +++ b/src/frdc/load/dataset.py @@ -4,7 +4,7 @@ from collections import OrderedDict from dataclasses import dataclass from pathlib import Path -from typing import Iterable, Callable, Any, Protocol +from typing import Iterable, Callable, Any import numpy as np import pandas as pd @@ -33,8 +33,32 @@ logger = logging.getLogger(__name__) -# This is not yet used much as we don't have sufficient training data. class FRDCConcatDataset(ConcatDataset): + """ConcatDataset for FRDCDataset. + + Notes: + This handles concatenating the targets when you add two datasets + together, furthermore, implements the addition operator to + simplify the syntax. + + Examples: + If you have two datasets, ds1 and ds2, you can concatenate them:: + + ds = ds1 + ds2 + + `ds` will be a FRDCConcatDataset, which is a subclass of ConcatDataset. + + You can further add to a concatenated dataset:: + + ds = ds1 + ds2 + ds = ds + ds3 + + Finallu, all concatenated datasets have the `targets` property, which + is a list of all the targets in the datasets:: + + (ds1 + ds2).targets == ds1.targets + ds2.targets + """ + def __init__(self, datasets: list[FRDCDataset]): super().__init__(datasets) self.datasets: list[FRDCDataset] = datasets @@ -64,6 +88,13 @@ def __init__( We recommend to check FRDCDatasetPreset if you want to use a pre-defined dataset. + You can concatenate datasets using the addition operator, e.g.:: + + ds = FRDCDataset(...) + FRDCDataset(...) + + This will return a FRDCConcatDataset, see FRDCConcatDataset for + more information. + Args: site: The site of the dataset, e.g. "chestnut_nature_park". date: The date of the dataset, e.g. "20201218". @@ -71,6 +102,9 @@ def __init__( transform: The transform to apply to each segment. target_transform: The transform to apply to each label. use_legacy_bounds: Whether to use the legacy bounds.csv file. 
+ This will automatically be set to True if LABEL_STUDIO_CLIENT + is None, which happens when Label Studio cannot be connected + to. """ self.site = site self.date = date @@ -105,6 +139,7 @@ def __getitem__(self, idx): @property def dataset_dir(self): + """Returns the path format of the dataset.""" return Path( f"{self.site}/{self.date}/" f"{self.version + '/' if self.version else ''}" @@ -219,6 +254,7 @@ def get_bounds_and_labels( ) def get_polybounds_and_labels(self): + """Gets the bounds and labels from Label Studio.""" return get_task( Path(f"{self.dataset_dir}/result.jpg") ).get_bounds_and_labels() @@ -246,8 +282,32 @@ def __add__(self, other) -> FRDCConcatDataset: return FRDCConcatDataset([self, other]) -class FRDCDatasetPartial(Protocol): - """This class is used to provide type hints for FRDCDatasetPreset.""" +# This curries the FRDCDataset class, so that we can shorthand the preset +# definitions. +@dataclass +class FRDCDatasetPartial: + """Partial class for FRDCDataset. + + Notes: + This is used internally by FRDCDatasetPreset to define the presets + in a more concise manner:: + + # Instead of + lambda *args, **kwargs: + FRDCDataset("chestnut_nature_park", "20201218", None, + *args, **kwargs) + + # Using partial, we can do this instead + FRDCDatasetPartial("chestnut_nature_park", "20201218", None)( + *args, **kwargs + ) + + See FRDCDatasetPreset for usage. + """ + + site: str + date: str + version: str | None def __call__( self, @@ -255,40 +315,127 @@ def __call__( target_transform: Callable[[list[str]], list[str]] = None, use_legacy_bounds: bool = False, ): - ... - + """Alias for labelled().""" + return self.labelled( + transform, + target_transform, + use_legacy_bounds, + ) -# This curries the FRDCDataset class, so that we can shorthand the preset -# definitions. -def dataset(site: str, date: str, version: str | None) -> FRDCDatasetPartial: - def inner( + def labelled( + self, transform: Callable[[list[np.ndarray]], Any] = None, target_transform: Callable[[list[str]], list[str]] = None, use_legacy_bounds: bool = False, ): + """Returns the Labelled Dataset.""" return FRDCDataset( - site, date, version, transform, target_transform, use_legacy_bounds + self.site, + self.date, + self.version, + transform, + target_transform, + use_legacy_bounds, + ) + + def unlabelled( + self, + transform: Callable[[list[np.ndarray]], Any] = None, + target_transform: Callable[[list[str]], list[str]] = None, + use_legacy_bounds: bool = False, + ): + """Returns the Unlabelled Dataset. + + Notes: + This simply masks away the labels during __getitem__. + The same behaviour can be achieved by setting __class__ to + FRDCUnlabelledDataset, but this is a more convenient way to do so. + """ + return FRDCUnlabelledDataset( + self.site, + self.date, + self.version, + transform, + target_transform, + use_legacy_bounds, ) - return inner + +class FRDCUnlabelledDataset(FRDCDataset): + """An implementation of FRDCDataset that masks away the labels. + + Notes: + If you already have a FRDCDataset, you can simply set __class__ to + FRDCUnlabelledDataset to achieve the same behaviour:: + + ds.__class__ = FRDCUnlabelledDataset + + This will replace the __getitem__ method with the one below. + + However, it's also perfectly fine to initialize this directly:: + + ds_unl = FRDCUnlabelledDataset(...) 
+ """ + + def __getitem__(self, item): + return ( + self.transform(self.ar_segments[item]) + if self.transform + else self.ar_segments[item] + ) @dataclass class FRDCDatasetPreset: - chestnut_20201218 = dataset("chestnut_nature_park", "20201218", None) - chestnut_20210510_43m = dataset( + """Presets for the FRDCDataset. + + Examples: + Each variable is a preset for the FRDCDataset. + + You can use it like this:: + + FRDCDatasetPreset.chestnut_20201218() + + Which returns a FRDCDataset. + + Furthermore, if you're interested in the unlabelled dataset, you can + use:: + + FRDCDatasetPreset.chestnut_20201218.unlabelled() + + Which returns a FRDCUnlabelledDataset. + + If you'd like to keep the syntax consistent for labelled and unlabelled + datasets, you can use:: + + FRDCDatasetPreset.chestnut_20201218.labelled() + FRDCDatasetPreset.chestnut_20201218.unlabelled() + + The `labelled` method is simply an alias for the `__call__` method. + + The DEBUG dataset is a special dataset that is used for debugging, + which pulls from GCS a small cropped image and dummy label + bounds. + + """ + + chestnut_20201218 = FRDCDatasetPartial( + "chestnut_nature_park", "20201218", None + ) + chestnut_20210510_43m = FRDCDatasetPartial( "chestnut_nature_park", "20210510", "90deg43m85pct255deg" ) - chestnut_20210510_60m = dataset( + chestnut_20210510_60m = FRDCDatasetPartial( "chestnut_nature_park", "20210510", "90deg60m84.5pct255deg" ) - casuarina_20220418_183deg = dataset( + casuarina_20220418_183deg = FRDCDatasetPartial( "casuarina_nature_park", "20220418", "183deg" ) - casuarina_20220418_93deg = dataset( + casuarina_20220418_93deg = FRDCDatasetPartial( "casuarina_nature_park", "20220418", "93deg" ) - DEBUG = lambda resize=299: dataset(site="DEBUG", date="0", version=None)( + DEBUG = lambda resize=299: FRDCDatasetPartial( + site="DEBUG", date="0", version=None + )( transform=Compose( [ ToImage(), @@ -298,19 +445,3 @@ class FRDCDatasetPreset: ), target_transform=None, ) - - -# TODO: Kind of hacky, the unlabelled dataset should somehow come from the -# labelled dataset by filtering out the unknown labels. But we'll -# figure out this later when we do get unlabelled data. -# I'm thinking some API that's like -# FRDCDataset.filter_labels(...) -> FRDCSubset, FRDCSubset -# It could be more intuitive if it returns FRDCDataset, so we don't have -# to implement another class. 
-class FRDCUnlabelledDataset(FRDCDataset): - def __getitem__(self, item): - return ( - self.transform(self.ar_segments[item]) - if self.transform - else self.ar_segments[item] - ) From 7d3183e86aeb790fb00c6617d7a01bfb2304b033 Mon Sep 17 00:00:00 2001 From: Evening Date: Thu, 28 Dec 2023 17:24:49 +0800 Subject: [PATCH 26/52] Implement auto casting of labelled to unlabelled This provides a failsafe interface if somehow someone forgot to use the unlabelled set, which is totally fine --- src/frdc/train/frdc_datamodule.py | 45 +++++++++++++++++++------------ 1 file changed, 28 insertions(+), 17 deletions(-) diff --git a/src/frdc/train/frdc_datamodule.py b/src/frdc/train/frdc_datamodule.py index 6138c7e5..97daaa6b 100644 --- a/src/frdc/train/frdc_datamodule.py +++ b/src/frdc/train/frdc_datamodule.py @@ -1,11 +1,14 @@ from __future__ import annotations from dataclasses import dataclass +from types import MethodType +from typing import Any from lightning import LightningDataModule from torch.utils.data import DataLoader, RandomSampler from frdc.load import FRDCDataset +from frdc.load.dataset import FRDCUnlabelledDataset @dataclass @@ -14,34 +17,39 @@ class FRDCDataModule(LightningDataModule): Notes: This is a special datamodule for semi-supervised learning, which - requires two dataloaders for the labelled and unlabelled datasets. - It can also be used for supervised learning, by passing in None for - the unlabelled dataset. + can accept an optional dataloaders for an unlabelled dataset. + + Without an unsupervised dataset it can be used for supervised learning, + by passing in None for the unlabelled dataset. If you're using our MixMatch Module, using None for the unlabelled dataset will skip the MixMatch. However, note that this is not equivalent to passing the Labelled set as unlabelled as well. - For example: - >>> FRDCSSLDataModule( - ... train_lab_ds=train_lab_ds, - ... train_unl_ds=train_lab_ds, - ... ... - ... ) + For example:: + + FRDCDataModule( + train_lab_ds=train_lab_ds, + train_unl_ds=train_lab_ds, + ... + ) + + Does not have the same performance as:: - Does not have the same performance as: - >>> FRDCSSLDataModule( - ... train_lab_ds=train_lab_ds, - ... train_unl_ds=None, - ... ... - ... ) + FRDCSSLDataModule( + train_lab_ds=train_lab_ds, + train_unl_ds=None, + ... + ) As partially, some samples in MixMatch uses the unlabelled loss. Args: train_lab_ds: The labelled training dataset. train_unl_ds: The unlabelled training dataset. Can be None, which will - default to a DataModule suitable for supervised learning. + default to a DataModule suitable for supervised learning. If + train_unl_ds is a FRDCDataset, it will be converted to a + FRDCUnlabelledDataset, which simply masks away the labels. val_ds: The validation dataset. batch_size: The batch size to use for the dataloaders. 
train_iters: The number of iterations to run for the labelled training @@ -52,7 +60,7 @@ class FRDCDataModule(LightningDataModule): train_lab_ds: FRDCDataset val_ds: FRDCDataset - train_unl_ds: FRDCDataset | None = None + train_unl_ds: FRDCDataset | FRDCUnlabelledDataset | None = None batch_size: int = 4 train_iters: int = 100 val_iters: int = 100 @@ -60,6 +68,9 @@ class FRDCDataModule(LightningDataModule): def __post_init__(self): super().__init__() + if isinstance(self.train_unl_ds, FRDCDataset): + self.train_unl_ds.__class__ = FRDCUnlabelledDataset + def train_dataloader(self): num_samples = self.batch_size * self.train_iters lab_dl = DataLoader( From a34d3677da69fe81d44c30d6ce3bb71a45096b3e Mon Sep 17 00:00:00 2001 From: Evening Date: Thu, 28 Dec 2023 17:25:29 +0800 Subject: [PATCH 27/52] Refactor unlabelled to use the preset --- tests/model_tests/chestnut_dec_may/train.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/tests/model_tests/chestnut_dec_may/train.py b/tests/model_tests/chestnut_dec_may/train.py index 35999543..53099ece 100644 --- a/tests/model_tests/chestnut_dec_may/train.py +++ b/tests/model_tests/chestnut_dec_may/train.py @@ -47,12 +47,7 @@ def main( transform=train_preprocess ) - # TODO: This is a hacky impl of the unlabelled dataset, see the docstring - # for future work. - train_unl_ds = FRDCUnlabelledDataset( - "chestnut_nature_park", - "20201218", - None, + train_unl_ds = FRDCDatasetPreset.chestnut_20201218.unlabelled( transform=train_unl_preprocess(2), ) From 58c977dc86ca02ea48fdc8075c9cf16ec4e86144 Mon Sep 17 00:00:00 2001 From: Evening Date: Thu, 28 Dec 2023 17:25:45 +0800 Subject: [PATCH 28/52] Refactor the preprocessing step --- src/frdc/train/mixmatch_module.py | 87 ++++++++++++++----------------- 1 file changed, 38 insertions(+), 49 deletions(-) diff --git a/src/frdc/train/mixmatch_module.py b/src/frdc/train/mixmatch_module.py index 194928ad..784380b6 100644 --- a/src/frdc/train/mixmatch_module.py +++ b/src/frdc/train/mixmatch_module.py @@ -241,72 +241,61 @@ def on_before_batch_transfer(self, batch: Any, dataloader_idx: int) -> Any: We leverage this to do some preprocessing on the data. Namely, we use the StandardScaler and OrdinalEncoder to transform the data. - """ - - # TODO: ngl, this is pretty chunky. - # It works, but it's not very pretty. - if self.training: - (x_lab, y), x_unl = batch - xs = [x_lab, *x_unl] - - b, c, h, w = x_lab.shape - - # Move Channel to the last dimension then transform - xs_ss: list[np.ndarray] = [ - self.x_scaler.transform(x.permute(0, 2, 3, 1).reshape(-1, c)) - for x in xs - ] - - # Move Channel back to the second dimension - xs_: list[torch.Tensor] = [ - torch.from_numpy(x_ss.reshape(b, h, w, c)) - .permute(0, 3, 1, 2) - .float() - for x_ss in xs_ss - ] - - y: tuple[str] - y_: torch.Tensor = torch.from_numpy( - self.y_encoder.transform(np.array(y).reshape(-1, 1)).squeeze() - ) - - # Ordinal Encoders can return a np.nan if the value is not in the - # categories. We will remove that from the batch. - x_ = xs_[0][~torch.isnan(y_)] - y_ = y_[~torch.isnan(y_)] - return (x_, y_.long()), xs_[1:] - - else: - x, y = batch - - x: torch.Tensor - b, c, h, w = x.shape + Notes: + PyTorch Lightning may complain about this being on the Module + instead of the DataModule. However, this is intentional as we + want to export the model alongside the transformations. + """ + def x_trans_fn(x): # Standard Scaler only accepts (n_samples, n_features), # so we need to do some fancy reshaping. 
# Note that moving dimensions then reshaping is different from just # reshaping! + # Move Channel to the last dimension then transform - x_ss: np.ndarray = self.x_scaler.transform( + # B x C x H x W -> B x H x W x C + b, c, h, w = x.shape + x_ss = self.x_scaler.transform( x.permute(0, 2, 3, 1).reshape(-1, c) ) # Move Channel back to the second dimension - x_: torch.Tensor = ( + # B x H x W x C -> B x C x H x W + return ( torch.from_numpy(x_ss.reshape(b, h, w, c)) .permute(0, 3, 1, 2) .float() ) - y: tuple[str] - y_: torch.Tensor = torch.from_numpy( + def y_trans_fn(y): + return torch.from_numpy( self.y_encoder.transform(np.array(y).reshape(-1, 1)).squeeze() ) - # Ordinal Encoders can return a np.nan if the value is not in the - # categories. We will remove that from the batch. - x_ = x_[~torch.isnan(y_)] - y_ = y_[~torch.isnan(y_)] + # We need to handle the train and val dataloaders differently. + # For training, the unlabelled data is returned while for validation, + # the unlabelled data is just omitted. + if self.training: + (x_lab, y), x_unl = batch + else: + x_lab, y = batch + x_unl = [] + + x_lab_trans = x_trans_fn(x_lab) + y_trans = y_trans_fn(y) + x_unl_trans = [x_trans_fn(x) for x in x_unl] - return x_, y_.long() + # Remove nan values from the batch + # Ordinal Encoders can return a np.nan if the value is not in the + # categories. We will remove that from the batch. + nan = ~torch.isnan(y_trans) + x_lab_trans = x_lab_trans[nan] + x_unl_trans = [x[nan] for x in x_unl_trans] + y_trans = y_trans[nan] + + if self.training: + return (x_lab_trans, y_trans.long()), x_unl_trans + else: + return x_lab_trans, y_trans.long() From 2928147e91b694ad6fc60a9c26497daa09ebd589 Mon Sep 17 00:00:00 2001 From: Evening Date: Thu, 28 Dec 2023 18:02:37 +0800 Subject: [PATCH 29/52] Move common scripts to utils --- src/frdc/utils/training.py | 67 ++++++++++++++++++++++++++++++++++++++ tests/model_tests/utils.py | 50 ++-------------------------- 2 files changed, 69 insertions(+), 48 deletions(-) create mode 100644 src/frdc/utils/training.py diff --git a/src/frdc/utils/training.py b/src/frdc/utils/training.py new file mode 100644 index 00000000..87593f89 --- /dev/null +++ b/src/frdc/utils/training.py @@ -0,0 +1,67 @@ +from __future__ import annotations +from pathlib import Path + +import lightning as pl +import numpy as np +from matplotlib import pyplot as plt +from seaborn import heatmap +from sklearn.metrics import confusion_matrix +from torch.utils.data import DataLoader + +from frdc.load import FRDCDataset + + +def get_latest_ckpt_path(search_dir: Path, extention: str = "ckpt"): + # This fetches all possible checkpoints and gets the latest one + return sorted( + search_dir.glob(f"**/*.{extention}"), + key=lambda x: x.stat().st_mtime_ns, + )[-1] + + +def plot_confusion_matrix( + y_trues, y_preds, labels +) -> tuple[plt.Figure, plt.Axes]: + # Plot the confusion matrix + cm = confusion_matrix(y_trues, y_preds) + + fig, ax = plt.subplots(figsize=(10, 10)) + + heatmap( + cm, + annot=True, + xticklabels=labels, + yticklabels=labels, + cbar=False, + ax=ax, + ) + + fig.tight_layout(pad=3) + ax.set_xlabel("Predicted Label") + ax.set_ylabel("True Label") + + return fig, ax + + +def predict( + ds: FRDCDataset, + model_cls: type[pl.LightningModule], + ckpt_pth: Path | str | None = None, +) -> tuple[np.ndarray, np.ndarray]: + m = model_cls.load_from_checkpoint(ckpt_pth) + # Make predictions + trainer = pl.Trainer(logger=False) + pred = trainer.predict(m, dataloaders=DataLoader(ds, batch_size=32)) + + y_preds = [] + 
y_trues = [] + for y_true, y_pred in pred: + y_preds.append(y_pred) + y_trues.append(y_true) + y_trues = np.concatenate(y_trues) + y_preds = np.concatenate(y_preds) + return y_trues, y_preds + + +def accuracy(y_trues, y_preds) -> float: + return (y_trues == y_preds).mean() diff --git a/tests/model_tests/utils.py b/tests/model_tests/utils.py index 593bae11..f578a87e 100644 --- a/tests/model_tests/utils.py +++ b/tests/model_tests/utils.py @@ -1,12 +1,8 @@ +from __future__ import annotations + from pathlib import Path -import lightning as pl -import numpy as np import torch -from matplotlib import pyplot as plt -from seaborn import heatmap -from sklearn.metrics import confusion_matrix -from torch.utils.data import DataLoader from torchvision.transforms import RandomVerticalFlip from torchvision.transforms.v2 import ( Compose, @@ -49,48 +45,6 @@ def __getitem__(self, idx): return RandomHorizontalFlip(p=1)(RandomVerticalFlip(p=1)(x)), y -def evaluate( - ds: FRDCDataset, ckpt_pth: Path | str | None = None -) -> tuple[plt.Figure, float]: - if ckpt_pth is None: - # This fetches all possible checkpoints and gets the latest one - ckpt_pth = sorted( - THIS_DIR.glob("**/*.ckpt"), key=lambda x: x.stat().st_mtime_ns - )[-1] - - m = InceptionV3MixMatchModule.load_from_checkpoint(ckpt_pth) - # Make predictions - trainer = pl.Trainer(logger=False) - pred = trainer.predict(m, dataloaders=DataLoader(ds, batch_size=32)) - - y_trues = [] - y_preds = [] - for y_true, y_pred in pred: - y_trues.append(y_true) - y_preds.append(y_pred) - y_trues = np.concatenate(y_trues) - y_preds = np.concatenate(y_preds) - acc = (y_trues == y_preds).mean() - - # Plot the confusion matrix - cm = confusion_matrix(y_trues, y_preds) - - plt.figure(figsize=(10, 10)) - - heatmap( - cm, - annot=True, - xticklabels=m.y_encoder.categories_[0], - yticklabels=m.y_encoder.categories_[0], - cbar=False, - ) - plt.title(f"Accuracy: {acc:.2%}") - plt.tight_layout(pad=3) - plt.xlabel("Predicted Label") - plt.ylabel("True Label") - return plt.gcf(), acc - - def preprocess(x): return Compose( [ From 60b835d335b37256d4a82d3a71aeae19112b9614 Mon Sep 17 00:00:00 2001 From: Evening Date: Thu, 28 Dec 2023 18:02:49 +0800 Subject: [PATCH 30/52] Migrate references for train --- tests/model_tests/chestnut_dec_may/train.py | 27 ++++++++++++--------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/tests/model_tests/chestnut_dec_may/train.py b/tests/model_tests/chestnut_dec_may/train.py index 53099ece..97123f02 100644 --- a/tests/model_tests/chestnut_dec_may/train.py +++ b/tests/model_tests/chestnut_dec_may/train.py @@ -5,8 +5,11 @@ """ # Uncomment this to run the W&B monitoring locally -# import os -# os.environ["WANDB_MODE"] = "offline" +import os + +from frdc.utils.training import predict, plot_confusion_matrix + +os.environ["WANDB_MODE"] = "offline" from pathlib import Path @@ -21,14 +24,13 @@ from lightning.pytorch.loggers import WandbLogger from sklearn.preprocessing import StandardScaler, OrdinalEncoder -from frdc.load.dataset import FRDCUnlabelledDataset, FRDCDatasetPreset +from frdc.load.dataset import FRDCDatasetPreset as ds from frdc.models.inceptionv3 import InceptionV3MixMatchModule from frdc.train.frdc_datamodule import FRDCDataModule from model_tests.utils import ( train_preprocess, train_unl_preprocess, preprocess, - evaluate, FRDCDatasetFlipped, ) @@ -43,15 +45,13 @@ def main( run = wandb.init() logger = WandbLogger(name="chestnut_dec_may", project="frdc") # Prepare the dataset - train_lab_ds = 
FRDCDatasetPreset.chestnut_20201218( - transform=train_preprocess - ) + train_lab_ds = ds.chestnut_20201218(transform=train_preprocess) - train_unl_ds = FRDCDatasetPreset.chestnut_20201218.unlabelled( - transform=train_unl_preprocess(2), + train_unl_ds = ds.chestnut_20201218.unlabelled( + transform=train_unl_preprocess(2) ) - val_ds = FRDCDatasetPreset.chestnut_20210510_43m(transform=preprocess) + val_ds = ds.chestnut_20210510_43m(transform=preprocess) oe = OrdinalEncoder( handle_unknown="use_encoded_value", @@ -106,15 +106,20 @@ def main( f"- Results: [WandB Report]({run.get_url()})" ) - fig, acc = evaluate( + y_true, y_pred = predict( ds=FRDCDatasetFlipped( "chestnut_nature_park", "20210510", "90deg43m85pct255deg", transform=preprocess, ), + model_cls=InceptionV3MixMatchModule, ckpt_pth=Path(ckpt.best_model_path), ) + fig, ax = plot_confusion_matrix(y_true, y_pred, oe.categories_[0]) + acc = np.sum(y_true == y_pred) / len(y_true) + ax.set_title(f"Accuracy: {acc:.2%}") + wandb.log({"confusion_matrix": wandb.Image(fig)}) wandb.log({"eval_accuracy": acc}) From a78446a4b849170a3f5d3268d411114c62083352 Mon Sep 17 00:00:00 2001 From: Evening Date: Thu, 28 Dec 2023 18:47:06 +0800 Subject: [PATCH 31/52] Fix error in documentation signature --- src/frdc/train/frdc_datamodule.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/frdc/train/frdc_datamodule.py b/src/frdc/train/frdc_datamodule.py index 97daaa6b..9f35ce03 100644 --- a/src/frdc/train/frdc_datamodule.py +++ b/src/frdc/train/frdc_datamodule.py @@ -36,7 +36,7 @@ class FRDCDataModule(LightningDataModule): Does not have the same performance as:: - FRDCSSLDataModule( + FRDCDataModule( train_lab_ds=train_lab_ds, train_unl_ds=None, ... From a0583d6f7fc8f7f907fc67afee3476860e44b7a1 Mon Sep 17 00:00:00 2001 From: Evening Date: Thu, 28 Dec 2023 18:47:24 +0800 Subject: [PATCH 32/52] Make wandb online by default --- tests/model_tests/chestnut_dec_may/train.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/tests/model_tests/chestnut_dec_may/train.py b/tests/model_tests/chestnut_dec_may/train.py index 97123f02..df2d6814 100644 --- a/tests/model_tests/chestnut_dec_may/train.py +++ b/tests/model_tests/chestnut_dec_may/train.py @@ -5,11 +5,9 @@ """ # Uncomment this to run the W&B monitoring locally -import os - -from frdc.utils.training import predict, plot_confusion_matrix - -os.environ["WANDB_MODE"] = "offline" +# import os +# from frdc.utils.training import predict, plot_confusion_matrix +# os.environ["WANDB_MODE"] = "offline" from pathlib import Path From a64e59a4aaf151143464a672fbbcb37a4623b700 Mon Sep 17 00:00:00 2001 From: Evening Date: Fri, 29 Dec 2023 10:43:07 +0800 Subject: [PATCH 33/52] Migrate Preset classes to preset.py Also forces imports to use load.dataset for FRDCDataset to avoid confusion and circular imports --- Writerside/topics/load.dataset.md | 2 +- .../topics/preprocessing.extract_segments.md | 4 +- Writerside/topics/preprocessing.morphology.md | 2 +- Writerside/topics/preprocessing.scale.md | 2 +- src/frdc/load/__init__.py | 3 - src/frdc/load/dataset.py | 148 ---------------- src/frdc/load/label_studio.py | 10 -- src/frdc/load/preset.py | 159 ++++++++++++++++++ src/frdc/train/frdc_datamodule.py | 5 +- src/frdc/utils/training.py | 2 +- tests/conftest.py | 4 +- tests/model_tests/chestnut_dec_may/train.py | 2 +- tests/model_tests/utils.py | 2 +- 13 files changed, 170 insertions(+), 175 deletions(-) create mode 100644 src/frdc/load/preset.py diff --git a/Writerside/topics/load.dataset.md 
b/Writerside/topics/load.dataset.md index e5a57699..0c5dbf24 100644 --- a/Writerside/topics/load.dataset.md +++ b/Writerside/topics/load.dataset.md @@ -17,7 +17,7 @@ version. For example, to load our Chestnut Nature Park dataset. ```python -from frdc.load import FRDCDataset +from frdc.load.dataset import FRDCDataset ds = FRDCDataset(site='chestnut_nature_park', date='20201218', diff --git a/Writerside/topics/preprocessing.extract_segments.md b/Writerside/topics/preprocessing.extract_segments.md index ed257e8d..6f422569 100644 --- a/Writerside/topics/preprocessing.extract_segments.md +++ b/Writerside/topics/preprocessing.extract_segments.md @@ -135,7 +135,7 @@ Extract segments from bounds and labels. ```python import numpy as np -from frdc.load import FRDCDataset +from frdc.load.dataset import FRDCDataset from frdc.preprocess.extract_segments import extract_segments_from_bounds ds = FRDCDataset(site='chestnut_nature_park', @@ -155,7 +155,7 @@ Extract segments from a label classification. from skimage.morphology import remove_small_objects, remove_small_holes import numpy as np -from frdc.load import FRDCDataset +from frdc.load.dataset import FRDCDataset from frdc.preprocess.morphology import ( threshold_binary_mask, binary_watershed ) diff --git a/Writerside/topics/preprocessing.morphology.md b/Writerside/topics/preprocessing.morphology.md index 50b5d7b2..062eb464 100644 --- a/Writerside/topics/preprocessing.morphology.md +++ b/Writerside/topics/preprocessing.morphology.md @@ -29,7 +29,7 @@ classification Perform auto-segmentation on a dataset to yield a label classification. ```python -from frdc.load import FRDCDataset +from frdc.load.dataset import FRDCDataset from frdc.preprocess.morphology import ( threshold_binary_mask, binary_watershed ) diff --git a/Writerside/topics/preprocessing.scale.md b/Writerside/topics/preprocessing.scale.md index 2ce224be..513e9d5e 100644 --- a/Writerside/topics/preprocessing.scale.md +++ b/Writerside/topics/preprocessing.scale.md @@ -35,7 +35,7 @@ Take a look at frdc.conf.BAND_MAX_CONFIG for an example. ## Usage ```python -from frdc.load import FRDCDataset +from frdc.load.dataset import FRDCDataset from frdc.preprocess.scale import ( scale_0_1_per_band, scale_normal_per_band, scale_static_per_band ) diff --git a/src/frdc/load/__init__.py b/src/frdc/load/__init__.py index 06860a70..e69de29b 100644 --- a/src/frdc/load/__init__.py +++ b/src/frdc/load/__init__.py @@ -1,3 +0,0 @@ -from .dataset import FRDCDataset - -__all__ = ["FRDCDataset"] diff --git a/src/frdc/load/dataset.py b/src/frdc/load/dataset.py index 258aac9c..466aad93 100644 --- a/src/frdc/load/dataset.py +++ b/src/frdc/load/dataset.py @@ -8,15 +8,8 @@ import numpy as np import pandas as pd -import torch from PIL import Image from torch.utils.data import Dataset, ConcatDataset -from torchvision.transforms.v2 import ( - Compose, - ToImage, - ToDtype, - Resize, -) from frdc.conf import ( BAND_CONFIG, @@ -282,85 +275,6 @@ def __add__(self, other) -> FRDCConcatDataset: return FRDCConcatDataset([self, other]) -# This curries the FRDCDataset class, so that we can shorthand the preset -# definitions. -@dataclass -class FRDCDatasetPartial: - """Partial class for FRDCDataset. 
- - Notes: - This is used internally by FRDCDatasetPreset to define the presets - in a more concise manner:: - - # Instead of - lambda *args, **kwargs: - FRDCDataset("chestnut_nature_park", "20201218", None, - *args, **kwargs) - - # Using partial, we can do this instead - FRDCDatasetPartial("chestnut_nature_park", "20201218", None)( - *args, **kwargs - ) - - See FRDCDatasetPreset for usage. - """ - - site: str - date: str - version: str | None - - def __call__( - self, - transform: Callable[[list[np.ndarray]], Any] = None, - target_transform: Callable[[list[str]], list[str]] = None, - use_legacy_bounds: bool = False, - ): - """Alias for labelled().""" - return self.labelled( - transform, - target_transform, - use_legacy_bounds, - ) - - def labelled( - self, - transform: Callable[[list[np.ndarray]], Any] = None, - target_transform: Callable[[list[str]], list[str]] = None, - use_legacy_bounds: bool = False, - ): - """Returns the Labelled Dataset.""" - return FRDCDataset( - self.site, - self.date, - self.version, - transform, - target_transform, - use_legacy_bounds, - ) - - def unlabelled( - self, - transform: Callable[[list[np.ndarray]], Any] = None, - target_transform: Callable[[list[str]], list[str]] = None, - use_legacy_bounds: bool = False, - ): - """Returns the Unlabelled Dataset. - - Notes: - This simply masks away the labels during __getitem__. - The same behaviour can be achieved by setting __class__ to - FRDCUnlabelledDataset, but this is a more convenient way to do so. - """ - return FRDCUnlabelledDataset( - self.site, - self.date, - self.version, - transform, - target_transform, - use_legacy_bounds, - ) - - class FRDCUnlabelledDataset(FRDCDataset): """An implementation of FRDCDataset that masks away the labels. @@ -383,65 +297,3 @@ def __getitem__(self, item): if self.transform else self.ar_segments[item] ) - - -@dataclass -class FRDCDatasetPreset: - """Presets for the FRDCDataset. - - Examples: - Each variable is a preset for the FRDCDataset. - - You can use it like this:: - - FRDCDatasetPreset.chestnut_20201218() - - Which returns a FRDCDataset. - - Furthermore, if you're interested in the unlabelled dataset, you can - use:: - - FRDCDatasetPreset.chestnut_20201218.unlabelled() - - Which returns a FRDCUnlabelledDataset. - - If you'd like to keep the syntax consistent for labelled and unlabelled - datasets, you can use:: - - FRDCDatasetPreset.chestnut_20201218.labelled() - FRDCDatasetPreset.chestnut_20201218.unlabelled() - - The `labelled` method is simply an alias for the `__call__` method. - - The DEBUG dataset is a special dataset that is used for debugging, - which pulls from GCS a small cropped image and dummy label + bounds. 
- - """ - - chestnut_20201218 = FRDCDatasetPartial( - "chestnut_nature_park", "20201218", None - ) - chestnut_20210510_43m = FRDCDatasetPartial( - "chestnut_nature_park", "20210510", "90deg43m85pct255deg" - ) - chestnut_20210510_60m = FRDCDatasetPartial( - "chestnut_nature_park", "20210510", "90deg60m84.5pct255deg" - ) - casuarina_20220418_183deg = FRDCDatasetPartial( - "casuarina_nature_park", "20220418", "183deg" - ) - casuarina_20220418_93deg = FRDCDatasetPartial( - "casuarina_nature_park", "20220418", "93deg" - ) - DEBUG = lambda resize=299: FRDCDatasetPartial( - site="DEBUG", date="0", version=None - )( - transform=Compose( - [ - ToImage(), - ToDtype(torch.float32), - Resize((resize, resize)), - ] - ), - target_transform=None, - ) diff --git a/src/frdc/load/label_studio.py b/src/frdc/load/label_studio.py index b8287ff3..6383cfe4 100644 --- a/src/frdc/load/label_studio.py +++ b/src/frdc/load/label_studio.py @@ -8,16 +8,6 @@ from frdc.conf import LABEL_STUDIO_CLIENT -# try: -# client.check_connection() -# except ConnectionError: -# raise ConnectionError( -# f"Could not connect to Label Studio at {LABEL_STUDIO_URL}. " -# "This uses Label Studio's check_connection() method," -# "which performs retries. " -# "Use utils.is_label_studio_up() as a faster alternative to check if " -# "Label Studio is up." -# ) logger = logging.getLogger(__name__) diff --git a/src/frdc/load/preset.py b/src/frdc/load/preset.py new file mode 100644 index 00000000..4f892349 --- /dev/null +++ b/src/frdc/load/preset.py @@ -0,0 +1,159 @@ +from __future__ import annotations + +import logging +from dataclasses import dataclass +from typing import Callable, Any + +import numpy as np +import torch +from torchvision.transforms.v2 import ( + Compose, + ToImage, + ToDtype, + Resize, +) + +from frdc.load.dataset import FRDCDataset, FRDCUnlabelledDataset + +logger = logging.getLogger(__name__) + + +# This curries the FRDCDataset class, so that we can shorthand the preset +# definitions. +@dataclass +class FRDCDatasetPartial: + """Partial class for FRDCDataset. + + Notes: + This is used internally by FRDCDatasetPreset to define the presets + in a more concise manner:: + + # Instead of + lambda *args, **kwargs: + FRDCDataset("chestnut_nature_park", "20201218", None, + *args, **kwargs) + + # Using partial, we can do this instead + FRDCDatasetPartial("chestnut_nature_park", "20201218", None)( + *args, **kwargs + ) + + See FRDCDatasetPreset for usage. + """ + + site: str + date: str + version: str | None + + def __call__( + self, + transform: Callable[[list[np.ndarray]], Any] = None, + target_transform: Callable[[list[str]], list[str]] = None, + use_legacy_bounds: bool = False, + ): + """Alias for labelled().""" + return self.labelled( + transform, + target_transform, + use_legacy_bounds, + ) + + def labelled( + self, + transform: Callable[[list[np.ndarray]], Any] = None, + target_transform: Callable[[list[str]], list[str]] = None, + use_legacy_bounds: bool = False, + ): + """Returns the Labelled Dataset.""" + return FRDCDataset( + self.site, + self.date, + self.version, + transform, + target_transform, + use_legacy_bounds, + ) + + def unlabelled( + self, + transform: Callable[[list[np.ndarray]], Any] = None, + target_transform: Callable[[list[str]], list[str]] = None, + use_legacy_bounds: bool = False, + ): + """Returns the Unlabelled Dataset. + + Notes: + This simply masks away the labels during __getitem__. 
+ The same behaviour can be achieved by setting __class__ to + FRDCUnlabelledDataset, but this is a more convenient way to do so. + """ + return FRDCUnlabelledDataset( + self.site, + self.date, + self.version, + transform, + target_transform, + use_legacy_bounds, + ) + + +@dataclass +class FRDCDatasetPreset: + """Presets for the FRDCDataset. + + Examples: + Each variable is a preset for the FRDCDataset. + + You can use it like this:: + + FRDCDatasetPreset.chestnut_20201218() + + Which returns a FRDCDataset. + + Furthermore, if you're interested in the unlabelled dataset, you can + use:: + + FRDCDatasetPreset.chestnut_20201218.unlabelled() + + Which returns a FRDCUnlabelledDataset. + + If you'd like to keep the syntax consistent for labelled and unlabelled + datasets, you can use:: + + FRDCDatasetPreset.chestnut_20201218.labelled() + FRDCDatasetPreset.chestnut_20201218.unlabelled() + + The `labelled` method is simply an alias for the `__call__` method. + + The DEBUG dataset is a special dataset that is used for debugging, + which pulls from GCS a small cropped image and dummy label + bounds. + + """ + + chestnut_20201218 = FRDCDatasetPartial( + "chestnut_nature_park", "20201218", None + ) + chestnut_20210510_43m = FRDCDatasetPartial( + "chestnut_nature_park", "20210510", "90deg43m85pct255deg" + ) + chestnut_20210510_60m = FRDCDatasetPartial( + "chestnut_nature_park", "20210510", "90deg60m84.5pct255deg" + ) + casuarina_20220418_183deg = FRDCDatasetPartial( + "casuarina_nature_park", "20220418", "183deg" + ) + casuarina_20220418_93deg = FRDCDatasetPartial( + "casuarina_nature_park", "20220418", "93deg" + ) + DEBUG = lambda resize=299: FRDCDatasetPartial( + site="DEBUG", date="0", version=None + )( + transform=Compose( + [ + ToImage(), + ToDtype(torch.float32), + Resize((resize, resize)), + ] + ), + target_transform=None, + ) diff --git a/src/frdc/train/frdc_datamodule.py b/src/frdc/train/frdc_datamodule.py index 9f35ce03..cabcb604 100644 --- a/src/frdc/train/frdc_datamodule.py +++ b/src/frdc/train/frdc_datamodule.py @@ -1,14 +1,11 @@ from __future__ import annotations from dataclasses import dataclass -from types import MethodType -from typing import Any from lightning import LightningDataModule from torch.utils.data import DataLoader, RandomSampler -from frdc.load import FRDCDataset -from frdc.load.dataset import FRDCUnlabelledDataset +from frdc.load.dataset import FRDCDataset, FRDCUnlabelledDataset @dataclass diff --git a/src/frdc/utils/training.py b/src/frdc/utils/training.py index 87593f89..d8130b3a 100644 --- a/src/frdc/utils/training.py +++ b/src/frdc/utils/training.py @@ -8,7 +8,7 @@ from sklearn.metrics import confusion_matrix from torch.utils.data import DataLoader -from frdc.load import FRDCDataset +from frdc.load.dataset import FRDCDataset def get_latest_ckpt_path(search_dir: Path, extention: str = "ckpt"): diff --git a/tests/conftest.py b/tests/conftest.py index d420f691..b7bf6357 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,8 +1,8 @@ import numpy as np import pytest -from frdc.load import FRDCDataset -from frdc.load.dataset import FRDCDatasetPreset +from frdc.load.dataset import FRDCDataset +from frdc.load.preset import FRDCDatasetPreset @pytest.fixture(scope="session") diff --git a/tests/model_tests/chestnut_dec_may/train.py b/tests/model_tests/chestnut_dec_may/train.py index df2d6814..70f3dada 100644 --- a/tests/model_tests/chestnut_dec_may/train.py +++ b/tests/model_tests/chestnut_dec_may/train.py @@ -22,7 +22,7 @@ from lightning.pytorch.loggers import 
WandbLogger from sklearn.preprocessing import StandardScaler, OrdinalEncoder -from frdc.load.dataset import FRDCDatasetPreset as ds +from frdc.load.preset import FRDCDatasetPreset as ds from frdc.models.inceptionv3 import InceptionV3MixMatchModule from frdc.train.frdc_datamodule import FRDCDataModule from model_tests.utils import ( diff --git a/tests/model_tests/utils.py b/tests/model_tests/utils.py index f578a87e..bc820f53 100644 --- a/tests/model_tests/utils.py +++ b/tests/model_tests/utils.py @@ -14,7 +14,7 @@ ) from torchvision.transforms.v2 import RandomHorizontalFlip -from frdc.load import FRDCDataset +from frdc.load.dataset import FRDCDataset from frdc.models.inceptionv3 import InceptionV3MixMatchModule THIS_DIR = Path(__file__).parent From 314774c4b2c809292aaa323c1246ae674f93eb1b Mon Sep 17 00:00:00 2001 From: Evening Date: Fri, 29 Dec 2023 11:50:37 +0800 Subject: [PATCH 34/52] Update docs to prefer preset --- Writerside/topics/Retrieve-our-Datasets.md | 15 ++++++++------- Writerside/topics/load.dataset.md | 6 ++---- .../topics/preprocessing.extract_segments.md | 12 ++++-------- Writerside/topics/preprocessing.morphology.md | 6 ++---- Writerside/topics/preprocessing.scale.md | 6 ++---- 5 files changed, 18 insertions(+), 27 deletions(-) diff --git a/Writerside/topics/Retrieve-our-Datasets.md b/Writerside/topics/Retrieve-our-Datasets.md index 46141afe..9c671cbd 100644 --- a/Writerside/topics/Retrieve-our-Datasets.md +++ b/Writerside/topics/Retrieve-our-Datasets.md @@ -25,16 +25,17 @@ Here, we'll download and load our - `labels`: The labels of the trees (segments) ```python -from frdc.load.dataset import FRDCDataset +from frdc.load.preset import FRDCDatasetPreset -ds = FRDCDataset(site="chestnut_nature_park", date="20201218", version=None) +ds = FRDCDatasetPreset.chestnut_20201218() ar, order = ds.get_ar_bands() bounds, labels = ds.get_bounds_and_labels() ``` ### What Datasets are there? {collapsible="true"} -> To know what datasets are available, you can run +> We recommend to use FRDCDatasetPreset. However, if you want +> to know what other datasets are available, you can run > [load.gcs](load.gcs.md)'s `list_gcs_datasets()` > method @@ -86,10 +87,10 @@ To segment the data, use [Extract Segments](preprocessing.extract_segments.md). Here, we'll segment the data by the bounds. ```python -from frdc.load.dataset import FRDCDataset +from frdc.load.preset import FRDCDatasetPreset from frdc.preprocess.extract_segments import extract_segments_from_bounds -ds = FRDCDataset(site="chestnut_nature_park", date="20201218", version=None) +ds = FRDCDatasetPreset.chestnut_20201218() ar, order = ds.get_ar_bands() bounds, labels = ds.get_bounds_and_labels() segments = extract_segments_from_bounds(ar, bounds) @@ -109,11 +110,11 @@ We can then use these data to plot out the first tree segment. 
```python import matplotlib.pyplot as plt -from frdc.load.dataset import FRDCDataset +from frdc.load.preset import FRDCDatasetPreset from frdc.preprocess.extract_segments import extract_segments_from_bounds from frdc.preprocess.scale import scale_0_1_per_band -ds = FRDCDataset(site="chestnut_nature_park", date="20201218", version=None) +ds = FRDCDatasetPreset.chestnut_20201218() ar, order = ds.get_ar_bands() bounds, labels = ds.get_bounds_and_labels() segments = extract_segments_from_bounds(ar, bounds) diff --git a/Writerside/topics/load.dataset.md b/Writerside/topics/load.dataset.md index 0c5dbf24..7cbf6cbc 100644 --- a/Writerside/topics/load.dataset.md +++ b/Writerside/topics/load.dataset.md @@ -17,11 +17,9 @@ version. For example, to load our Chestnut Nature Park dataset. ```python -from frdc.load.dataset import FRDCDataset +from frdc.load.preset import FRDCDatasetPreset -ds = FRDCDataset(site='chestnut_nature_park', - date='20201218', - version=None) +ds = FRDCDatasetPreset.chestnut_20201218() ``` Then, we can use the `ds` object to load objects of the dataset: diff --git a/Writerside/topics/preprocessing.extract_segments.md b/Writerside/topics/preprocessing.extract_segments.md index 6f422569..a83b3060 100644 --- a/Writerside/topics/preprocessing.extract_segments.md +++ b/Writerside/topics/preprocessing.extract_segments.md @@ -135,12 +135,10 @@ Extract segments from bounds and labels. ```python import numpy as np -from frdc.load.dataset import FRDCDataset +from frdc.load.preset import FRDCDatasetPreset from frdc.preprocess.extract_segments import extract_segments_from_bounds -ds = FRDCDataset(site='chestnut_nature_park', - date='20201218', - version=None, ) +ds = FRDCDatasetPreset.chestnut_20201218() ar, order = ds.get_ar_bands() bounds, labels = ds.get_bounds_and_labels() @@ -155,7 +153,7 @@ Extract segments from a label classification. from skimage.morphology import remove_small_objects, remove_small_holes import numpy as np -from frdc.load.dataset import FRDCDataset +from frdc.load.preset import FRDCDatasetPreset from frdc.preprocess.morphology import ( threshold_binary_mask, binary_watershed ) @@ -164,9 +162,7 @@ from frdc.preprocess.extract_segments import ( extract_segments_from_labels, remove_small_segments_from_labels ) -ds = FRDCDataset(site='chestnut_nature_park', - date='20201218', - version=None, ) +ds = FRDCDatasetPreset.chestnut_20201218() ar, order = ds.get_ar_bands() ar = scale_0_1_per_band(ar) ar_mask = threshold_binary_mask(ar, -1, 90 / 256) diff --git a/Writerside/topics/preprocessing.morphology.md b/Writerside/topics/preprocessing.morphology.md index 062eb464..95289404 100644 --- a/Writerside/topics/preprocessing.morphology.md +++ b/Writerside/topics/preprocessing.morphology.md @@ -29,14 +29,12 @@ classification Perform auto-segmentation on a dataset to yield a label classification. 
```python -from frdc.load.dataset import FRDCDataset +from frdc.load.preset import FRDCDatasetPreset from frdc.preprocess.morphology import ( threshold_binary_mask, binary_watershed ) -ds = FRDCDataset(site='chestnut_nature_park', - date='20201218', - version=None, ) +ds = FRDCDatasetPreset.chestnut_20201218() ar, order = ds.get_ar_bands() mask = threshold_binary_mask(ar, order.index('NIR'), 90 / 256) ar_label = binary_watershed(mask) diff --git a/Writerside/topics/preprocessing.scale.md b/Writerside/topics/preprocessing.scale.md index 513e9d5e..0b0e5946 100644 --- a/Writerside/topics/preprocessing.scale.md +++ b/Writerside/topics/preprocessing.scale.md @@ -35,15 +35,13 @@ Take a look at frdc.conf.BAND_MAX_CONFIG for an example. ## Usage ```python -from frdc.load.dataset import FRDCDataset +from frdc.load.preset import FRDCDatasetPreset from frdc.preprocess.scale import ( scale_0_1_per_band, scale_normal_per_band, scale_static_per_band ) from frdc.conf import BAND_MAX_CONFIG -ds = FRDCDataset(site='chestnut_nature_park', - date='20201218', - version=None, ) +ds = FRDCDatasetPreset.chestnut_20201218() ar, order = ds.get_ar_bands() ar_01 = scale_0_1_per_band(ar) ar_norm = scale_normal_per_band(ar) From 1673227701272a2e210578927a8202dc2e352352 Mon Sep 17 00:00:00 2001 From: Evening Date: Fri, 29 Dec 2023 11:53:47 +0800 Subject: [PATCH 35/52] update html docs --- docs/HelpTOC.json | 2 +- docs/custom-k-aug-dataloaders.html | 6 ++--- docs/get-started-with-dev-containers.html | 4 ++-- docs/getting-started.html | 20 ++++++++-------- docs/icon-192.png | Bin 337 -> 0 bytes docs/icon-512.png | Bin 1103 -> 0 bytes docs/load-dataset.html | 14 +++++------ docs/load-gcs.html | 6 ++--- docs/mix-match-module.html | 12 +++++----- docs/mix-match.html | 2 +- docs/model-test-chestnut-may-dec.html | 2 +- docs/overview.html | 2 +- docs/preprocessing-extract-segments.html | 28 ++++++++++------------ docs/preprocessing-glcm-padded.html | 4 ++-- docs/preprocessing-morphology.html | 12 ++++------ docs/preprocessing-scale.html | 10 ++++---- docs/retrieve-our-datasets.html | 22 ++++++++--------- docs/site.webmanifest | 11 --------- docs/train-frdc-lightning.html | 4 ++-- 19 files changed, 70 insertions(+), 91 deletions(-) delete mode 100644 docs/icon-192.png delete mode 100644 docs/icon-512.png delete mode 100644 docs/site.webmanifest diff --git a/docs/HelpTOC.json b/docs/HelpTOC.json index d0eb4e12..5467dd78 100644 --- a/docs/HelpTOC.json +++ b/docs/HelpTOC.json @@ -1 +1 @@ -{"entities":{"pages":{"Overview":{"id":"Overview","title":"Overview","url":"overview.html","level":0,"tabIndex":0},"Getting-Started":{"id":"Getting-Started","title":"Getting Started","url":"getting-started.html","level":0,"pages":["Get-Started-with-Dev-Containers"],"tabIndex":1},"Get-Started-with-Dev-Containers":{"id":"Get-Started-with-Dev-Containers","title":"Get Started with Dev Containers","url":"get-started-with-dev-containers.html","level":1,"parentId":"Getting-Started","tabIndex":0},"e8e19623_66291":{"id":"e8e19623_66291","title":"Tutorials","level":0,"pages":["Retrieve-our-Datasets"],"tabIndex":2},"Retrieve-our-Datasets":{"id":"Retrieve-our-Datasets","title":"Retrieve our Datasets","url":"retrieve-our-datasets.html","level":1,"parentId":"e8e19623_66291","tabIndex":0},"mix-match":{"id":"mix-match","title":"MixMatch","url":"mix-match.html","level":0,"pages":["mix-match-module","custom-k-aug-dataloaders"],"tabIndex":3},"mix-match-module":{"id":"mix-match-module","title":"MixMatch 
Module","url":"mix-match-module.html","level":1,"parentId":"mix-match","tabIndex":0},"custom-k-aug-dataloaders":{"id":"custom-k-aug-dataloaders","title":"Custom K-Aug Dataloaders","url":"custom-k-aug-dataloaders.html","level":1,"parentId":"mix-match","tabIndex":1},"e8e19623_66296":{"id":"e8e19623_66296","title":"Model Tests","level":0,"pages":["Model-Test-Chestnut-May-Dec"],"tabIndex":4},"Model-Test-Chestnut-May-Dec":{"id":"Model-Test-Chestnut-May-Dec","title":"Model Test Chestnut May-Dec","url":"model-test-chestnut-may-dec.html","level":1,"parentId":"e8e19623_66296","tabIndex":0},"e8e19623_66298":{"id":"e8e19623_66298","title":"API","level":0,"pages":["load.dataset","load.gcs","preprocessing.scale","preprocessing.extract_segments","preprocessing.morphology","preprocessing.glcm_padded","train.frdc_lightning"],"tabIndex":5},"load.dataset":{"id":"load.dataset","title":"load.dataset","url":"load-dataset.html","level":1,"parentId":"e8e19623_66298","tabIndex":0},"load.gcs":{"id":"load.gcs","title":"load.gcs","url":"load-gcs.html","level":1,"parentId":"e8e19623_66298","tabIndex":1},"preprocessing.scale":{"id":"preprocessing.scale","title":"preprocessing.scale","url":"preprocessing-scale.html","level":1,"parentId":"e8e19623_66298","tabIndex":2},"preprocessing.extract_segments":{"id":"preprocessing.extract_segments","title":"preprocessing.extract_segments","url":"preprocessing-extract-segments.html","level":1,"parentId":"e8e19623_66298","tabIndex":3},"preprocessing.morphology":{"id":"preprocessing.morphology","title":"preprocessing.morphology","url":"preprocessing-morphology.html","level":1,"parentId":"e8e19623_66298","tabIndex":4},"preprocessing.glcm_padded":{"id":"preprocessing.glcm_padded","title":"preprocessing.glcm_padded","url":"preprocessing-glcm-padded.html","level":1,"parentId":"e8e19623_66298","tabIndex":5},"train.frdc_lightning":{"id":"train.frdc_lightning","title":"train.frdc_datamodule \u0026 frdc_module","url":"train-frdc-lightning.html","level":1,"parentId":"e8e19623_66298","tabIndex":6}}},"topLevelIds":["Overview","Getting-Started","e8e19623_66291","mix-match","e8e19623_66296","e8e19623_66298"]} \ No newline at end of file +{"entities":{"pages":{"Overview":{"id":"Overview","title":"Overview","url":"overview.html","level":0,"tabIndex":0},"Getting-Started":{"id":"Getting-Started","title":"Getting Started","url":"getting-started.html","level":0,"pages":["Get-Started-with-Dev-Containers"],"tabIndex":1},"Get-Started-with-Dev-Containers":{"id":"Get-Started-with-Dev-Containers","title":"Get Started with Dev Containers","url":"get-started-with-dev-containers.html","level":1,"parentId":"Getting-Started","tabIndex":0},"f6c570e4_4234":{"id":"f6c570e4_4234","title":"Tutorials","level":0,"pages":["Retrieve-our-Datasets"],"tabIndex":2},"Retrieve-our-Datasets":{"id":"Retrieve-our-Datasets","title":"Retrieve our Datasets","url":"retrieve-our-datasets.html","level":1,"parentId":"f6c570e4_4234","tabIndex":0},"mix-match":{"id":"mix-match","title":"MixMatch","url":"mix-match.html","level":0,"pages":["mix-match-module","custom-k-aug-dataloaders"],"tabIndex":3},"mix-match-module":{"id":"mix-match-module","title":"MixMatch Module","url":"mix-match-module.html","level":1,"parentId":"mix-match","tabIndex":0},"custom-k-aug-dataloaders":{"id":"custom-k-aug-dataloaders","title":"Custom K-Aug Dataloaders","url":"custom-k-aug-dataloaders.html","level":1,"parentId":"mix-match","tabIndex":1},"f6c570e4_4239":{"id":"f6c570e4_4239","title":"Model 
Tests","level":0,"pages":["Model-Test-Chestnut-May-Dec"],"tabIndex":4},"Model-Test-Chestnut-May-Dec":{"id":"Model-Test-Chestnut-May-Dec","title":"Model Test Chestnut May-Dec","url":"model-test-chestnut-may-dec.html","level":1,"parentId":"f6c570e4_4239","tabIndex":0},"f6c570e4_4241":{"id":"f6c570e4_4241","title":"API","level":0,"pages":["load.dataset","load.gcs","preprocessing.scale","preprocessing.extract_segments","preprocessing.morphology","preprocessing.glcm_padded","train.frdc_lightning"],"tabIndex":5},"load.dataset":{"id":"load.dataset","title":"load.dataset","url":"load-dataset.html","level":1,"parentId":"f6c570e4_4241","tabIndex":0},"load.gcs":{"id":"load.gcs","title":"load.gcs","url":"load-gcs.html","level":1,"parentId":"f6c570e4_4241","tabIndex":1},"preprocessing.scale":{"id":"preprocessing.scale","title":"preprocessing.scale","url":"preprocessing-scale.html","level":1,"parentId":"f6c570e4_4241","tabIndex":2},"preprocessing.extract_segments":{"id":"preprocessing.extract_segments","title":"preprocessing.extract_segments","url":"preprocessing-extract-segments.html","level":1,"parentId":"f6c570e4_4241","tabIndex":3},"preprocessing.morphology":{"id":"preprocessing.morphology","title":"preprocessing.morphology","url":"preprocessing-morphology.html","level":1,"parentId":"f6c570e4_4241","tabIndex":4},"preprocessing.glcm_padded":{"id":"preprocessing.glcm_padded","title":"preprocessing.glcm_padded","url":"preprocessing-glcm-padded.html","level":1,"parentId":"f6c570e4_4241","tabIndex":5},"train.frdc_lightning":{"id":"train.frdc_lightning","title":"train.frdc_datamodule \u0026 frdc_module","url":"train-frdc-lightning.html","level":1,"parentId":"f6c570e4_4241","tabIndex":6}}},"topLevelIds":["Overview","Getting-Started","f6c570e4_4234","mix-match","f6c570e4_4239","f6c570e4_4241"]} \ No newline at end of file diff --git a/docs/custom-k-aug-dataloaders.html b/docs/custom-k-aug-dataloaders.html index 4863aad8..473d540d 100644 --- a/docs/custom-k-aug-dataloaders.html +++ b/docs/custom-k-aug-dataloaders.html @@ -1,4 +1,4 @@ - Custom K-Aug Dataloaders | Documentation

+ Custom K-Aug Dataloaders | Documentation


Custom K-Aug Dataloaders

In MixMatch, implementing the data loading methods is quite unconventional.

  1. We need to load multiple augmented versions of the same image into the same batch.

  2. The labelled set is usually too small, causing a premature end to the epoch as it runs out of samples to draw from faster than the unlabelled set.

This can be rather tricky to implement in PyTorch. This tutorial will illustrate how we did it.

Loading Multiple Augmented Versions of the Same Image

See: frdc/load/dataset.py FRDCDataset.__getitem__

In MixMatch, a single train batch must consist of:

  1. A batch of labelled images

  2. K batches of unlabelled images

[Diagram: Get Batch → the labelled batch is augmented once (Aug Labelled Batch); the unlabelled batch is duplicated and augmented K times (Aug Unl. Batch 1 … Aug Unl. Batch K)]

Keep in mind that the unlabelled batch is a single batch of images, not separate draws of batches. It is then "duplicated" K times, and each copy is augmented differently.

Solution 1: Custom Dataset

To solve this, we need to understand the role of both a Dataset and a DataLoader.

  • A Dataset represents a collection of data, responsible for loading and returning something.

  • A DataLoader draws samples from a Dataset and returns batched samples.

The key here is that a Dataset is not limited to returning 1 sample at a time, we can make it return the K augmented versions of the same image.

[Diagram: a single Sample is augmented K times, yielding Aug Sample 1 … Aug Sample K]

In code, this is done by subclassing the Dataset class and overriding the __getitem__ method.

def duplicate(x): return x, deepcopy(x), deepcopy(x) @@ -10,7 +10,7 @@ def __getitem__(self, index): x, y = self.dataset[index] return self.aug(x), y -


In the above example, we have a Dataset that returns 3 duplicate versions of the same image. By leveraging this technique, we can create a Dataset that returns K augmented versions of the same image as a tuple.
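
A minimal sketch of such a K-augmentation wrapper (illustrative only, not the exact FRDCDataset implementation; the names KAugDataset, aug, and k are assumptions):

from copy import deepcopy

from torch.utils.data import Dataset

class KAugDataset(Dataset):
    def __init__(self, dataset, aug, k):
        self.dataset = dataset  # wrapped dataset yielding (x, y) pairs
        self.aug = aug  # a random transform, applied independently per copy
        self.k = k  # number of augmented copies to return

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, index):
        x, y = self.dataset[index]
        # Deep-copy before augmenting so each of the K copies is
        # randomized separately.
        return tuple(self.aug(deepcopy(x)) for _ in range(self.k)), y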

Premature End of Epoch due to Small Labelled Set

See: frdc/train/frdc_datamodule.py

In MixMatch, the definition of an "epoch" is a bit different. Instead of implying that we have seen all the data once, it implies that we've drawn N batches. This N is referred to as the number of iterations per epoch.

Take, for example, a labelled set of numbers [1, 2, 3] and an unlabelled set [4, 5, 6, 7, 8, 9, 10]. With a batch size of 2, we'll run out of labelled samples after 2 iterations, but we'll still have 3 more iterations' worth of unlabelled samples left, as the sketch after this list shows.

  • Draw 1: [1, 2], [4, 5]

  • Draw 2: [3], [6, 7].

  • Epoch ends.
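
A quick way to see this premature end is a toy sketch with plain, sequentially-sampled DataLoaders (the numbers mirror the example above; this is an illustration, not project code):

from torch.utils.data import DataLoader

lab = [1, 2, 3]
unl = [4, 5, 6, 7, 8, 9, 10]

# zip() stops at the shorter iterator, just as the labelled set
# ends the epoch after 2 draws.
draws = list(zip(DataLoader(lab, batch_size=2), DataLoader(unl, batch_size=2)))
print(len(draws))  # 2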

Solution 2: Random Sampling

To fix this, instead of sequentially sampling the labelled set (and the unlabelled set), we can sample them randomly. This way, we can ensure that the labelled set never runs out early.

  • Draw 1: [1, 3], [7, 5]

  • Draw 2: [2, 1], [4, 9]

  • Draw 3: [3, 2], [8, 6]

  • ... and so on.

Luckily, PyTorch's DataLoader supports random sampling. We just need to use RandomSampler instead of SequentialSampler (which is the default).

from torch.utils.data import DataLoader, RandomSampler dl = DataLoader( @@ -21,4 +21,4 @@ replacement=False, ) ) -

This will ensure that the "epoch" ends when we've drawn train_iters batches.
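
The hunk above elides the middle of the DataLoader call. A minimal reconstruction of the full pattern, with illustrative batch_size and train_iters values rather than the project's defaults (note that passing num_samples with replacement=False requires a recent PyTorch, such as the torch 2.x this repo pins):

import torch
from torch.utils.data import DataLoader, RandomSampler, TensorDataset

# Stand-in dataset; in FRDC this would be the train dataset.
dataset = TensorDataset(torch.arange(10).float(), torch.zeros(10))
batch_size, train_iters = 2, 25

dl = DataLoader(
    dataset,
    batch_size=batch_size,
    sampler=RandomSampler(
        dataset,
        # Draw exactly train_iters batches per "epoch", even though
        # the dataset has only 10 samples.
        num_samples=batch_size * train_iters,
        replacement=False,
    ),
)
assert len(list(dl)) == train_iters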

Last modified: 29 December 2023
\ No newline at end of file diff --git a/docs/get-started-with-dev-containers.html b/docs/get-started-with-dev-containers.html index 5d793d5a..be7dc455 100644 --- a/docs/get-started-with-dev-containers.html +++ b/docs/get-started-with-dev-containers.html @@ -1,3 +1,3 @@ - Get Started with Dev Containers | Documentation

+ Get Started with Dev Containers | Documentation


Get Started with Dev Containers

Dev. Containers are a great way to get started with a project. They define all necessary dependencies and environments, so you can just start coding within the container.

In this article, we'll only go over additional steps to set up with our project. For more information on how to use Dev Containers, please refer to the official documentation for each IDE. Once you've set up the Dev Container, come back here to finish the setup:

Python Environment

The dev environment is already created and is managed by Anaconda (/opt/conda/bin/conda). To activate the environment, run the following command:

conda activate base -

Mark as Sources Root (Add to PYTHONPATH)

For import statements to work, you need to mark the src folder as the sources root. Optionally, also mark the tests folder as the tests root.
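
Outside the IDE, an equivalent (assuming commands are run from the repository root, as in the Getting Started guide's troubleshooting section) is to extend PYTHONPATH:

export PYTHONPATH=$PYTHONPATH:./src:./tests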

Additional Setup

Refer to the Getting Started guide for additional setup steps such as:

  • Google Cloud Application Default Credentials

  • Weights & Biases API Key

  • Label Studio API Key

Last modified: 29 December 2023
\ No newline at end of file diff --git a/docs/getting-started.html b/docs/getting-started.html index 8f064904..8bc4700d 100644 --- a/docs/getting-started.html +++ b/docs/getting-started.html @@ -1,22 +1,22 @@ - Getting Started | Documentation

Documentation 0.0.8 Help

Getting Started

Installing the Dev. Environment

  1. Ensure that you have the right version of Python. The required Python version can be seen in pyproject.toml

    + Getting Started | Documentation

    Documentation 0.0.8 Help

    Getting Started

    Installing the Dev. Environment

    1. Ensure that you have the right version of Python. The required Python version can be seen in pyproject.toml

      [tool.poetry.dependencies] python = "..." -
    2. Start by cloning our repository.

      +
    3. Start by cloning our repository.

      git clone https://github.com/FR-DC/FRDC-ML.git -
    4. Then, create a Python Virtual Env pyvenv

      python -m venv venv/
      python3 -m venv venv/
    5. Install Poetry Then check if it's installed with

      poetry --version
    6. Activate the virtual environment

      +
    7. Then, create a Python Virtual Env pyvenv

      python -m venv venv/
      python3 -m venv venv/
    8. Install Poetry Then check if it's installed with

      poetry --version
    9. Activate the virtual environment

      cd venv/Scripts activate cd ../.. -
      +
      source venv/bin/activate -
    10. Install the dependencies. You should be in the same directory as pyproject.toml

      +
  2. Install the dependencies. You should be in the same directory as pyproject.toml

    poetry install --with dev -
  3. Install Pre-Commit Hooks

    +
  4. Install Pre-Commit Hooks

    pre-commit install -

Setting Up Google Cloud

  1. We use Google Cloud to store our datasets. To set up Google Cloud, install the Google Cloud CLI

  2. Then, authenticate your account.

    gcloud auth login
  3. Finally, set up Application Default Credentials (ADC).

    gcloud auth application-default login
  4. To make sure everything is working, run the tests.

Setting Up Label Studio

  1. We use Label Studio to annotate our datasets. We won't go through how to install Label Studio, for contributors, it should be up on localhost:8080.

  2. Then, retrieve your own API key from Label Studio. Go to your account page and copy the API key.


  3. Set your API key as an environment variable.

    In Windows, go to "Edit environment variables for your account" and add this as a new environment variable with name LABEL_STUDIO_API_KEY.

    Export it as an environment variable.

    export LABEL_STUDIO_API_KEY=...

Setting Up Weight and Biases

  1. We use W&B to track our experiments. To set up W&B, install the W&B CLI

  2. Then, authenticate your account.

    wandb login

Pre-commit Hooks

  • +

Setting Up Google Cloud

  1. We use Google Cloud to store our datasets. To set up Google Cloud, install the Google Cloud CLI

  2. Then, authenticate your account.

    gcloud auth login
  3. Finally, set up Application Default Credentials (ADC).

    gcloud auth application-default login
  4. To make sure everything is working, run the tests.

Setting Up Label Studio

  1. We use Label Studio to annotate our datasets. We won't go through how to install Label Studio, for contributors, it should be up on localhost:8080.

  2. Then, retrieve your own API key from Label Studio. Go to your account page and copy the API key.


  3. Set your API key as an environment variable.

    In Windows, go to "Edit environment variables for your account" and add this as a new environment variable with name LABEL_STUDIO_API_KEY.

    Export it as an environment variable.

    export LABEL_STUDIO_API_KEY=...

Setting Up Weight and Biases

  1. We use W&B to track our experiments. To set up W&B, install the W&B CLI

  2. Then, authenticate your account.

    wandb login

Pre-commit Hooks

  • pre-commit install -

Running the Tests

  • Run the tests to make sure everything is working

    +

Running the Tests

  • Run the tests to make sure everything is working

    pytest -

Troubleshooting

ModuleNotFoundError

It's likely that your src and tests directories are not in PYTHONPATH. To fix this, run the following command:

+

Troubleshooting

ModuleNotFoundError

It's likely that your src and tests directories are not in PYTHONPATH. To fix this, run the following command:

export PYTHONPATH=$PYTHONPATH:./src:./tests -

Or, set it in your IDE, for example, IntelliJ allows setting directories as Source Roots.

google.auth.exceptions.DefaultCredentialsError

It's likely that you haven't authenticated your Google Cloud account. See Setting Up Google Cloud

Couldn't connect to Label Studio

Label Studio must be running locally, exposed on localhost:8080. Furthermore, you need to specify the LABEL_STUDIO_API_KEY environment variable. See Setting Up Label Studio

Cannot login to W&B

You need to authenticate your W&B account. See Setting Up Weight and Biases. If you're facing difficulties, set the WANDB_MODE environment variable to offline to disable W&B.

Our Repository Structure

Before starting development, take a look at our repository structure. This will help you understand where to put your code.

[Diagram: FRDC repository layout — src/frdc/ (core dependencies: ./load/ dataset loaders, ./preprocess/ preprocessing fn., ./train/ train deps, ./models/ model architectures), rsc/ (resources: ./dataset_name/ datasets), tests/, and pyproject.toml, poetry.lock (repo dependencies)]
src/frdc/

Source Code for our package. These are the unit components of our pipeline.

rsc/

Resources. These are usually cached datasets.

tests/

PyTest tests. These are unit, integration, and model tests.

Unit, Integration, and Pipeline Tests

We have 3 types of tests:

  • Unit Tests are usually small, single function tests.

  • Integration Tests are larger tests that test a mock pipeline.

  • Model Tests are the true production pipeline tests that will generate a model.

Where Should I contribute?

Changing a small component

If you're changing a small component, such as an argument for preprocessing, a new model architecture, or a new configuration for a dataset, take a look at the src/frdc/ directory.

Adding a test

By adding a new component, you'll need to add a new test. Take a look at the tests/ directory.

Changing the model pipeline

If you're an ML Researcher, you'll probably be changing the pipeline. Take a look at the tests/model_tests/ directory.

Adding a dependency

If you're adding a new dependency, use poetry add PACKAGE and commit the changes to pyproject.toml and poetry.lock.

Last modified: 27 December 2023
\ No newline at end of file +

Or, set it in your IDE, for example, IntelliJ allows setting directories as Source Roots.

google.auth.exceptions.DefaultCredentialsError

It's likely that you haven't authenticated your Google Cloud account. See Setting Up Google Cloud

Couldn't connect to Label Studio

Label Studio must be running locally, exposed on localhost:8080. Furthermore, you need to specify the LABEL_STUDIO_API_KEY environment variable. See Setting Up Label Studio

Cannot login to W&B

You need to authenticate your W&B account. See Setting Up Weight and Biases If you're facing difficulties, set the WANDB_MODE environment variable to offline to disable W&B.

Our Repository Structure

Before starting development, take a look at our repository structure. This will help you understand where to put your code.

Core Dependencies
Resources
Tests
Repo Dependencies
Dataset Loaders
Preprocessing Fn.
Train Deps
Model Architectures
Datasets ...
FRDC
src/frdc/
rsc/
tests/
pyproject.toml,poetry.lock
./load/
./preprocess/
./train/
./models/
./dataset_name/
src/frdc/

Source Code for our package. These are the unit components of our pipeline.

rsc/

Resources. These are usually cached datasets

tests/

PyTest tests. These are unit, integration, and model tests.

Unit, Integration, and Pipeline Tests

We have 3 types of tests:

  • Unit Tests are usually small, single function tests.

  • Integration Tests are larger tests that tests a mock pipeline.

  • Model Tests are the true production pipeline tests that will generate a model.

Where Should I contribute?

Changing a small component

If you're changing a small component, such as a argument for preprocessing, a new model architecture, or a new configuration for a dataset, take a look at the src/frdc/ directory.

Adding a test

By adding a new component, you'll need to add a new test. Take a look at the tests/ directory.

Changing the model pipeline

If you're a ML Researcher, you'll probably be changing the pipeline. Take a look at the tests/model_tests/ directory.

Adding a dependency

If you're adding a new dependency, use poetry add PACKAGE and commit the changes to pyproject.toml and poetry.lock.

Last modified: 29 December 2023
\ No newline at end of file diff --git a/docs/icon-192.png b/docs/icon-192.png deleted file mode 100644 index 5953601c396250504ba6b31c031ea906e92b6cd9..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 337 zcmeAS@N?(olHy`uVBq!ia0vp^2SAvE2}s`E_d9@rflSGwb;dxs*#b$G<8}erCCjeWuCzLfaEd zD*IkLs+}#4;Wx^h_m~)^%s}g@2u|;lzdBFn<%jhd{?kf+gl|#zw)&%eYqF~BKhd`* zC-MF7`j+C^Uhi7HY02UJi)gTe~DWM4f DYn5)J diff --git a/docs/icon-512.png b/docs/icon-512.png deleted file mode 100644 index 9840e7b0cd4973a67d66ea20a62c77380047aed1..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1103 zcmeAS@N?(olHy`uVBq!ia0y~yU;;9k7&t&wwUqN(1_l-}PZ!6KinzB|482$b1XvEf z|I5F&QoY|*z-`s1vsK?dmwsVZ>UO9HIT3v b7H)suWWQGE&SGE*!NB0@>gTe~DWM4f4BN6W diff --git a/docs/load-dataset.html b/docs/load-dataset.html index bd5caa57..8ae04891 100644 --- a/docs/load-dataset.html +++ b/docs/load-dataset.html @@ -1,14 +1,12 @@ - load.dataset | Documentation

Documentation 0.0.8 Help

load.dataset

Usage

Firstly, to load a dataset instance, you need to initialize a FRDCDataset object, providing the site, date, and version.

For example, to load our Chestnut Nature Park dataset.

-from frdc.load import FRDCDataset + load.dataset | Documentation

Documentation 0.0.8 Help

load.dataset

Usage

Firstly, to load a dataset instance, you need to initialize a FRDCDataset object, providing the site, date, and version.

For example, to load our Chestnut Nature Park dataset.

+from frdc.load.preset import FRDCDatasetPreset -ds = FRDCDataset(site='chestnut_nature_park', - date='20201218', - version=None) -

Then, we can use the ds object to load objects of the dataset:

+ds = FRDCDatasetPreset.chestnut_20201218() +

Then, we can use the ds object to load objects of the dataset:

ar, order = ds.get_ar_bands() d = ds.get_ar_bands_as_dict() bounds, labels = ds.get_bounds_and_labels() -
  • ar is a stacked NDArray of the hyperspectral bands of shape (H x W x C)

  • order is a list of strings, containing the names of the bands, ordered according to the channels of ar

  • d is a dictionary of the hyperspectral bands of shape (H x W), keyed by the band names

  • bounds is a list of bounding boxes, in the format of Rect, a namedtuple of x0, y0, x1, y1

  • labels is a list of strings, containing the labels of the bounding boxes, ordered according to bounds

Filters

You can also selectively get the channels for both get_ar_bands() and get_ar_bands_as_dict() by providing a list of strings to the bands argument.

For example, to get the Wideband RGB bands, you can do:

+
  • ar is a stacked NDArray of the hyperspectral bands of shape (H x W x C)

  • order is a list of strings, containing the names of the bands, ordered according to the channels of ar

  • d is a dictionary of the hyperspectral bands of shape (H x W), keyed by the band names

  • bounds is a list of bounding boxes, in the format of Rect, a namedtuple of x0, y0, x1, y1

  • labels is a list of strings, containing the labels of the bounding boxes, ordered according to bounds

Filters

You can also selectively get the channels for both get_ar_bands() and get_ar_bands_as_dict() by providing a list of strings to the bands argument.

For example, to get the Wideband RGB bands, you can do:

ar, order = ds.get_ar_bands(bands=['WR', 'WG', 'WB']) d = ds.get_ar_bands_as_dict(bands=['WR', 'WG', 'WB']) -

This will also alter the channel order to the order of the bands provided.

See load.gcs for configuration options.

Last modified: 27 December 2023
\ No newline at end of file +

This will also alter the channel order to the order of the bands provided.

See load.gcs for configuration options.

Last modified: 29 December 2023
\ No newline at end of file diff --git a/docs/load-gcs.html b/docs/load-gcs.html index c6faadf2..ff221005 100644 --- a/docs/load-gcs.html +++ b/docs/load-gcs.html @@ -1,11 +1,11 @@ - load.gcs | Documentation

Documentation 0.0.8 Help

load.gcs

Usage

These are defined in the top-level load.gcs module.

list_gcs_datasets

Lists all datasets in the bucket as a DataFrame. This works by checking which folders have a specific file, which we call the anchor.

download

Downloads a file from Google Cloud Storage and returns the local file path.

open_file

Downloads and opens a file from Google Cloud Storage. Returns a file handle.

open_image

Downloads and returns the PIL image from Google Cloud Storage.

Pathing

The path to specify is relative to the bucket, which is frdc-ds by default.

For example, take this filesystem on GCS:

# On Google Cloud Storage
frdc-ds
├── chestnut_nature_park
│   └── 20201218
│       └── 90deg
│           └── bounds.json

To download bounds.json, use download(r"chestnut_nature_park/20201218/90deg/bounds.json"). By default, all files will be downloaded to PROJ_DIR/rsc/....
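A minimal sketch of the functions above (the JSON path comes from the example; the image path is a hypothetical placeholder, not a path this page documents):

from frdc.load.gcs import download, open_image

local_fp = download(r"chestnut_nature_park/20201218/90deg/bounds.json")
im = open_image(r"chestnut_nature_park/20201218/90deg/result.jpg")  # hypothetical path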

# On local filesystem
PROJ_DIR
├── rsc
│   └── chestnut_nature_park
│       └── 20201218
│           └── 90deg
│               └── bounds.json

Configuration

If you need granular control over

  • where the files are downloaded

  • the credentials used

  • the project used

  • the bucket used

Then edit conf.py.

GCS_CREDENTIALS

Google Cloud credentials.

A google.oauth2.service_account.Credentials object. See the object documentation for more information.

LOCAL_DATASET_ROOT_DIR

Local directory to download files to.

Path to a directory, or a Path object.

GCS_PROJECT_ID

Google Cloud project ID.

GCS_BUCKET_NAME

Google Cloud Storage bucket name.
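Putting it together, a conf.py override might look like this (a sketch; the key path and project ID are placeholders, not real values):

from pathlib import Path
from google.oauth2 import service_account

GCS_CREDENTIALS = service_account.Credentials.from_service_account_file(
    "path/to/service-account-key.json"  # placeholder
)
LOCAL_DATASET_ROOT_DIR = Path("rsc")
GCS_PROJECT_ID = "my-project-id"  # placeholder
GCS_BUCKET_NAME = "frdc-ds"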
Last modified: 29 December 2023

diff --git a/docs/mix-match-module.html b/docs/mix-match-module.html
index e2512d44..9ef02365 100644
--- a/docs/mix-match-module.html
+++ b/docs/mix-match-module.html

Documentation 0.0.8 Help

MixMatch Module

See frdc/train/mixmatch_module.py.

Quick Recap

We will go over the essential parts of the code here. Before that, we revise some of the concepts that are used in the code.

Abstract Methods

In Python, we can define abstract methods using the abc module. Just like in other OOP languages, abstract methods are methods that must be implemented by the child class.

For example:

from abc import ABC, abstractmethod


class MyAbstractClass(ABC):
    @abstractmethod
    def my_abstract_method(self):
        ...


class MyChildClass(MyAbstractClass):
    def my_abstract_method(self):
        print("Hello World!")

nn.Module & LightningModule

If you're unfamiliar with PyTorch, you should read the nn.Module Documentation.

nn.Module is the base class for all neural network modules in PyTorch, while LightningModule is a PyTorch Lightning class that extends nn.Module, providing it with additional functionality that reduces boilerplate code.

By implementing it as a LightningModule, we also enter the PyTorch Lightning ecosystem, which provides us with a lot of useful features such as logging, early stopping, and more.

What do we implement in a Module?

One key component that nn.Module requires is the model. So for example:

class MyModule(nn.Module):
    def __init__(self):
        super().__init__()
        ...

    def forward(self, x):
        return self.model(x)

PyTorch Lightning builds on top of it, requiring training_step and validation_step. Each "step" is a batch of data, and the model is trained on it. So for example:

class MyModule(LightningModule):
    def __init__(self):
        ...

    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = F.cross_entropy(y_hat, y)
        return loss

Usually, the training and validation steps are the same, but in some cases, such as MixMatch, they are different. In MixMatch, we not only use a different loss function for training, we also handle a batch differently. The PyTorch Lightning framework allows us to separate the two and implement them separately.

Model Embedded Preprocessing on_before_batch_transfer

In PyTorch Lightning, we can also inject a step before the batch is passed to the model. This is done by overriding the on_before_batch_transfer method.

Batch
on_before_batch_transfer
training_step
validation_step

This allows us to do preprocessing on the batch, such as scaling the data, encoding the labels, and more.

Custom EMA Update on_after_backward

We also leverage another hook, called on_after_backward. This hook is called after the backward pass, and allows us to do custom operations. In our case, we use it to update the EMA model.

Batch
training_step
on_after_backward
update_ema

MixMatch

We recommend having tests/model_tests/chestnut_dec_may/train.py open while reading this section. It implements a real-world example of MixMatch.

As a summary:

  1. We learned what an abstract method is, and how to implement it

  2. We implement the model in LightningModule much like we would in nn.Module

  3. We implement on_before_batch_transfer to preprocess the batch

  4. Finally, we implement on_after_backward to update the EMA model

With the above in mind, let's look at the MixMatch implementation.

forward (abstract)

Forward pass of the model.

ema_model (abstract)

The model that is used for EMA. We expect this property to be implemented by the child class.

update_ema (abstract)

The method to update the EMA model. We expect this method to be implemented by the child class.

loss_unl_scaler (static)

Takes in the current progress of the training, 0.0 to 1.0, where 0.0 is the start of the training, and 1.0 is the end. Then, returns the multiplier for the unlabeled loss.

loss_lbl (static)

Implements the loss for labeled data. Takes in the predicted labels and the ground truth labels, and returns the loss. This is cross entropy for MixMatch.

loss_unl (static)

Implements the loss for unlabeled data. Takes in the predicted labels and the guessed labels, and returns the loss. This is MSE for MixMatch.

mixup

Takes in the data and the labels, the beta distribution parameter, and returns the mixed data and labels.

sharpen

Takes in the labels and temperature, and returns the sharpened labels.

guess_labels

Takes in the unlabeled data, and returns the guessed labels.

progress

The current progress of the training, 0.0 to 1.0, where 0.0 is the start of the training, and 1.0 is the end.

training_step

The training step runs through 1 batch of data, and returns the loss. Note that this is significantly different from the validation step, as we handle the K-Augmented data differently.

test / validation_step

The test / validation step runs through 1 batch of data, and returns the loss.

predict_step

The predict step runs through 1 batch of data, and returns the actual decoded labels.

on_after_backward

The on_after_backward hook is called after the backward pass, and allows us to do custom operations. In our case, we use it to update the EMA model.

on_before_batch_transfer

The on_before_batch_transfer hook is called before the batch is transferred to the GPU. In our case, we use it to preprocess the batch.

A diagram of how these components interact with each other is shown below:

Batch
on_before_batch_transfer
training_step
guess_labels
sharpen
mix_up
loss_unl
loss_unl_scaler
loss
loss_lbl
backward
on_after_backward
update_ema
validation_step
loss
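For intuition, sharpen and mixup in the standard MixMatch formulation look roughly like this (a sketch of the textbook algorithm, assuming batches of probability vectors; not necessarily the exact code in mixmatch_module.py):

import torch

def sharpen(y: torch.Tensor, temp: float) -> torch.Tensor:
    # Raise each class probability to 1/temp and renormalize;
    # lower temperatures push the distribution towards one-hot.
    y_sharp = y ** (1 / temp)
    return y_sharp / y_sharp.sum(dim=1, keepdim=True)

def mixup(x: torch.Tensor, y: torch.Tensor, alpha: float):
    # Convexly combine a batch with a shuffled copy of itself,
    # with the mixing ratio drawn from Beta(alpha, alpha).
    ratio = torch.distributions.Beta(alpha, alpha).sample()
    ratio = torch.maximum(ratio, 1 - ratio)  # keep the original sample dominant
    perm = torch.randperm(x.shape[0])
    return (ratio * x + (1 - ratio) * x[perm],
            ratio * y + (1 - ratio) * y[perm])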

Finally, we show an example of how to use the MixMatch module:

from sklearn.preprocessing import StandardScaler, OrdinalEncoder

from frdc.train.mixmatch_module import MixMatchModule

...
    sharpen_temp=0.5,
    mix_beta_alpha=0.75,
)

In particular, we need to supply some transformations for the preprocessing step. In this case, we use StandardScaler to scale the data, and OrdinalEncoder to encode the labels.

  1. It's best if standardization is done only on the training data, and not the validation data, to better fit real-world scenarios.

  2. We use OrdinalEncoder as it handles unseen labels. So if a class doesn't show up in the training data, it will be encoded as np.nan, and will not participate in the loss calculation.

Design Choices

Static Method Overriding

We implement many functions as static, as we believe that a functional style reduces dependencies, thus making the code easier to test and debug.

Furthermore, it allows the subclasses to easily override the functions, to customize the behavior of the MixMatch module.

For example, the loss_unl_scaler function is static, thus, we can implement our own scaling function, and pass it to the MixMatch module.

def my_loss_unl_scaler(progress: float) -> float:
    return progress ** 2

...
    @staticmethod
    def loss_unl_scaler(progress: float) -> float:
        return my_loss_unl_scaler(progress)

If we had used a method instead, we would have to consider instance state, which would make it harder to override.

Why not use Dataclasses?

One of the biggest caveats of nn.Module is that it requires super().__init__() to be called before anything is assigned. While dataclass can leverage __post_init__ to do the same, we felt that this was too much of a hassle to save a few keystrokes. Thus, we opted to use __init__ instead; while more verbose, it is more explicit.

Why use PyTorch Lightning?

While we did hit some roadblocks implementing SSL, due to its complex and unconventional nature, we felt that the benefits of using PyTorch Lightning outweighed the cons.

on_before_batch_transfer and on_after_backward are unconventional hooks, and we had to do some digging to find them. It can be argued that by just writing explicit code, we can avoid the need for these hooks, but the PyTorch ecosystem fixes many other issues, so we closed an eye on this.

References

Last modified: 29 December 2023

diff --git a/docs/mix-match.html b/docs/mix-match.html
index 5d0f4795..80686148 100644
--- a/docs/mix-match.html
+++ b/docs/mix-match.html

Documentation 0.0.8 Help

MixMatch

In FRDC-ML, we leverage semi-supervised learning to improve the model's performance through better augmentation consistency and the use of unlabelled data.

The algorithm we use is MixMatch, a state-of-the-art semi-supervised learning algorithm. It is based on the idea of consistency regularization, which encourages models to predict the same class even after augmentations that occur naturally in the real world.

Our implementation of MixMatch is a refactored version of YU1ut/MixMatch-pytorch. We've refactored the code to follow more modern PyTorch practices, allowing us to utilize it with modern PyTorch frameworks such as PyTorch Lightning.

We won't go through the details of MixMatch here; see Our Documentation in our MixMatch-PyTorch-CIFAR10 repository for more details.

Implementation Details

  1. How we implemented the MixMatch logic: MixMatchModule

  2. How we implemented the unique MixMatch data loading logic: Custom MixMatch Data Loading

References

Last modified: 29 December 2023

diff --git a/docs/model-test-chestnut-may-dec.html b/docs/model-test-chestnut-may-dec.html
index 91b6538a..245427b2 100644
--- a/docs/model-test-chestnut-may-dec.html
+++ b/docs/model-test-chestnut-may-dec.html

Documentation 0.0.8 Help

Model Test Chestnut May-Dec

This test is used to evaluate the model performance on the Chestnut Nature Park May & December dataset.

See this script in model_tests/chestnut_dec_may/train.py.

Motivation

The usage of this model will be to classify trees in unseen datasets under different conditions. In this test, we'll evaluate it under a different season.

A caveat is that it'll be evaluated on the same set of trees, so it's not representative of a field test. However, given the difficulty of obtaining datasets, this still gives us a good preliminary idea of how the model will perform in different conditions.

Methodology

We train on the December dataset, and test on the May dataset.

Labelled Train
Unlabelled Train
Test
DecDataset
Model
MayDataset

Despite not having any true unlabelled data, we use MixMatch by treating the labelled data of the December dataset as unlabelled data.

Model

The current model used is a simple InceptionV3 transfer learning model, with the last layer replaced with fully connected layer(s).

SSL Loss
Input
InceptionV3 Frozen
FC Layer(s)
Softmax
Output

Preprocessing

For Training:

Segment
RandomCrop 299
Horizontal Flip 50%
Vertical Flip 50%
Normalize By Training Mean & Std

For Validation:

Segment
CenterCrop 299
Normalize By Training Mean & Std

For Evaluation:

Segment
CenterCrop 299
Normalize By Training Mean & Std
As Is
Horizontal Flip
Vertical Flip
Horizontal & Vertical Flip

For evaluation, we check that the model is invariant to horizontal and vertical flips, as well as to the original image.
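A rough torchvision sketch of the training preprocessing described above (train_mean and train_std are placeholders for the training set's statistics, which this page does not list; the transforms operate on CxHxW tensors):

import torch
from torchvision import transforms

train_mean = [0.5, 0.5, 0.5]     # placeholder statistics
train_std = [0.25, 0.25, 0.25]   # placeholder statistics

train_tf = transforms.Compose([
    transforms.RandomCrop(299, pad_if_needed=True),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomVerticalFlip(p=0.5),
    transforms.Normalize(mean=train_mean, std=train_std),
])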

Hyperparameters

The following hyperparameters are used:

  • Optimizer: Adam

  • Learning Rate: 1e-3

  • Batch Size: 32

  • Epochs: 10

  • Train Iterations: 25~100

  • Validation Iterations: 10~25

  • Early Stopping: 4

Results

We evaluate around 40% accuracy on the test set, compared to 100% for the training set. This indicates that the model has saturated and is not able to learn any more from the training set. There's no indication of overfitting, as the validation loss just plateaus.

W&B Dashboard

Caveats

  • The test set is very small, so the results are not very representative.

  • The test set is the same set of trees, so it's not a true test of the model performance in different conditions.

  • There are many classes with only 1 sample, so the model may not be able to learn the features of these classes well.

Last modified: 29 December 2023

diff --git a/docs/overview.html b/docs/overview.html
index 627976e2..896ab236 100644
--- a/docs/overview.html
+++ b/docs/overview.html

Documentation 0.0.8 Help

Overview

Forest Recovery Digital Companion (FRDC) is an ML-assisted companion for ecologists to automatically classify surveyed trees via an Unmanned Aerial Vehicle (UAV).

This package, FRDC-ML, is the Machine Learning backbone of this project: a centralized repository of tools and model architectures to be used in the FRDC pipeline.

Get started here

Other Projects

FRDC-UI

The User Interface Repository for FRDC, a WebApp GUI for ecologists to adjust annotations.

Last modified: 29 December 2023

diff --git a/docs/preprocessing-extract-segments.html b/docs/preprocessing-extract-segments.html
index 03e0ae62..9a248812 100644
--- a/docs/preprocessing-extract-segments.html
+++ b/docs/preprocessing-extract-segments.html

Documentation 0.0.8 Help

preprocessing.extract_segments

Functions

extract_segments_from_labels

Extracts segments from a label classification.

extract_segments_from_bounds

Extracts segments from Rect bounds.

remove_small_segments_from_labels

Removes small segments from a label classification.

Extract with Boundaries

A boundary is a Rect object that represents the minimum bounding box of a segment, with x0, y0, x1, y1 coordinates.

It simply slices the original image to the bounding box. The origin is the top left corner of the image.

+-----------------+                 +-----------+
| Original        |                 | Segmented |
| Image           |                 | Image     |
...
+-----+-----+-----+   1, 2, 0, 2    +-----+-----+
|  7  |  8  |  9  |   x0 y0 x1 y1   |  8  |  9  |
+-----+-----+-----+                 +-----+-----+

+-----------------+                 +-----------------+
| Original        |                 | Segmented       |
| Image           |                 | Image           |
...
+-----+-----+-----+   1, 2, 0, 2    +-----+-----+-----+
|  7  |  8  |  9  |   x0 y0 x1 y1   |  0  |  8  |  9  |
+-----+-----+-----+                 +-----+-----+-----+

Extract with Labels

A label classification is a np.ndarray where each pixel is mapped to a segment. The segments are mapped to a unique integer. In our project, the 0th label is the background.

For example, a label classification of 3 segments will look like this:

+-----------------+   +-----------------+
| Label           |   | Original        |
| Classification  |   | Image           |
...
+-----+-----+-----+   +-----+-----+-----+
|  1  |  1  |  0  |   |  7  |  8  |  9  |
+-----+-----+-----+   +-----+-----+-----+

The extraction will take the minimum bounding box of each segment and return a list of segments.

For example, the label 1 and 2 extracted images will be

+-----------+   +-----------+
| Extracted |   | Extracted |
| Segment 1 |   | Segment 2 |
...
+-----+-----+
|  7  |  8  |
+-----+-----+

+-----------------+   +-----------------+
| Extracted       |   | Extracted       |
| Segment 1       |   | Segment 2       |
...
+-----+-----+-----+   +-----+-----+-----+
|  7  |  8  |  0  |   |  0  |  0  |  0  |
+-----+-----+-----+   +-----+-----+-----+
  • If cropped is False, the segments are padded with 0s to the original image size. While this can ensure shape consistency, it can consume more memory for large images.

  • If cropped is True, the segments are cropped to the minimum bounding box. This can save memory, but the shape of the segments will be inconsistent.

Usage

Extract from Bounds and Labels

Extract segments from bounds and labels.

 import numpy as np
-from frdc.load import FRDCDataset
+from frdc.load.preset import FRDCDatasetPreset
 from frdc.preprocess.extract_segments import extract_segments_from_bounds

-ds = FRDCDataset(site='chestnut_nature_park',
-                 date='20201218',
-                 version=None, )
+ds = FRDCDatasetPreset.chestnut_20201218()
 ar, order = ds.get_ar_bands()
 bounds, labels = ds.get_bounds_and_labels()
 segments: list[np.ndarray] = extract_segments_from_bounds(ar, bounds)
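The cropped flag described earlier can be passed here too; a sketch, using the parameter listed in the API section below:

segments_padded = extract_segments_from_bounds(ar, bounds, cropped=False)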

Extract from Auto-Segmentation

Extract segments from a label classification.

 from skimage.morphology import remove_small_objects, remove_small_holes

 import numpy as np
-from frdc.load import FRDCDataset
+from frdc.load.preset import FRDCDatasetPreset
 from frdc.preprocess.morphology import (
     threshold_binary_mask, binary_watershed
 )
...
     extract_segments_from_labels, remove_small_segments_from_labels
 )

-ds = FRDCDataset(site='chestnut_nature_park',
-                 date='20201218',
-                 version=None, )
+ds = FRDCDatasetPreset.chestnut_20201218()
 ar, order = ds.get_ar_bands()
 ar = scale_0_1_per_band(ar)
 ar_mask = threshold_binary_mask(ar, -1, 90 / 256)
...
                                               min_height=10, min_width=10)
 segments: list[np.ndarray] = extract_segments_from_labels(ar, ar_labels)

API

extract_segments_from_labels(ar, ar_labels, cropped)

Extracts segments from a label classification.

ar_labels is a label classification as a np.ndarray

extract_segments_from_bounds(ar, bounds, cropped)

Extracts segments from Rect bounds.

bounds is a list of Rect bounds.

remove_small_segments_from_labels(ar_labels, min_height, min_width)

Removes small segments from a label classification.

Last modified: 29 December 2023

diff --git a/docs/preprocessing-glcm-padded.html b/docs/preprocessing-glcm-padded.html
index 0f99613f..f9e3da4c 100644
--- a/docs/preprocessing-glcm-padded.html
+++ b/docs/preprocessing-glcm-padded.html

Documentation 0.0.8 Help

preprocessing.glcm_padded

Functions

glcm_padded

Computes the GLCM of the NDArray bands with padding.

glcm_padded_cached

Computes the GLCM of the NDArray bands with padding, and caches it.

append_glcm_padded_cached

Computes the GLCM of the NDArray bands with padding, caches it, and also appends it onto the original array.

Usage

We show a few examples of how to use the GLCM functions.

import numpy as np
from glcm_cupy import Features

...
ar_glcm_cached_appended = append_glcm_padded_cached(ar, bin_from=1, bin_to=4,
                                                    radius=3)
  • ar_glcm is the GLCM of the original array, with the last dimension being the GLCM features. The number of features is determined by the features parameter, which defaults to all features.

  • ar_glcm_2_features selects only 2 features, with the last dimension being the 2 GLCM features specified.

  • ar_glcm_cached caches the GLCM so that if you call it again, it will return the cached version. It stores its data at the project root dir, under .cache/.

  • ar_glcm_cached_appended is a wrapper around ar_glcm_cached; it appends the GLCM features onto the original array. It's equivalent to calling ar_glcm_cached and then np.concatenate on the final axes.

Caching

GLCM is an expensive operation, thus we recommend caching it if the input parameters will be the same. This is especially useful if you're experimenting with the same dataset with constant parameters.

API

glcm_padded(ar, bin_from, bin_to, radius, step_size, features)

Computes the GLCM of the NDArray bands with padding.

  • ar is the input array

  • bin_from is the upper bound of the input

  • bin_to is the upper bound of the GLCM input, i.e. the resolution that GLCM operates on

  • radius is the radius of the GLCM

  • step_size is the step size of the GLCM

  • features is the list of GLCM features to compute

The return shape is (H, W, C, F), where F is the number of GLCM features; padding preserves the spatial dimensions of the input.

See glcm_cupy for the GLCM Features.

glcm_padded_cached(ar, bin_from, bin_to, radius, step_size, features)

Computes the GLCM of the NDArray bands with padding, and caches it.

See glcm_padded for the parameters and output shape

append_glcm_padded_cached(ar, bin_from, bin_to, radius, step_size, features)

Computes the GLCM of the NDArray bands with padding, caches it, and also appends it onto the original array.

See glcm_padded for the parameters

The return shape is (H, W, C x (1 + F)): the function automatically flattens the last 2 dimensions of the GLCM features, and appends the result onto the original array.

Last modified: 29 December 2023

diff --git a/docs/preprocessing-morphology.html b/docs/preprocessing-morphology.html
index 7817824f..0c21fd83 100644
--- a/docs/preprocessing-morphology.html
+++ b/docs/preprocessing-morphology.html

Documentation 0.0.8 Help

preprocessing.morphology

Functions

threshold_binary_mask

Thresholds a selected NDArray band to yield a binary mask.

binary_watershed

Performs watershed on a binary mask to yield a mapped label classification.

Usage

Perform auto-segmentation on a dataset to yield a label classification.
-from frdc.load import FRDCDataset
+from frdc.load.preset import FRDCDatasetPreset
 from frdc.preprocess.morphology import (
     threshold_binary_mask, binary_watershed
 )

-ds = FRDCDataset(site='chestnut_nature_park',
-                 date='20201218',
-                 version=None, )
+ds = FRDCDatasetPreset.chestnut_20201218()
 ar, order = ds.get_ar_bands()
 mask = threshold_binary_mask(ar, order.index('NIR'), 90 / 256)
 ar_label = binary_watershed(mask)
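To eyeball the result, a quick sketch (assuming matplotlib is installed):

import matplotlib.pyplot as plt

plt.imshow(ar_label)
plt.title("Auto-segmentation labels")
plt.show()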

API

threshold_binary_mask(ar, band_idx, threshold_value)

Thresholds a selected NDArray band to yield a binary mask as np.ndarray.

This is equivalent to

ar[..., band_idx] > threshold_value

binary_watershed(ar_mask, peaks_footprint, watershed_compactness)

Performs watershed on a binary mask to yield a mapped label classification as a np.ndarray.

  • peaks_footprint is the footprint of skimage.feature.peak_local_max

  • watershed_compactness is the compactness of skimage.morphology.watershed

Last modified: 29 December 2023

diff --git a/docs/preprocessing-scale.html b/docs/preprocessing-scale.html
index cdf27d34..2dd27886 100644
--- a/docs/preprocessing-scale.html
+++ b/docs/preprocessing-scale.html

Documentation 0.0.8 Help

preprocessing.scale

Functions

scale_0_1_per_band

Scales the NDArray bands to [0, 1] per band.

scale_normal_per_band

Scales the NDArray bands to zero mean unit variance per band.

scale_static_per_band

Scales the NDArray bands by a predefined configuration. Take a look at frdc.conf.BAND_MAX_CONFIG for an example.

Usage

-from frdc.load import FRDCDataset
+from frdc.load.preset import FRDCDatasetPreset
 from frdc.preprocess.scale import (
     scale_0_1_per_band, scale_normal_per_band, scale_static_per_band
 )
 from frdc.conf import BAND_MAX_CONFIG

-ds = FRDCDataset(site='chestnut_nature_park',
-                 date='20201218',
-                 version=None, )
+ds = FRDCDatasetPreset.chestnut_20201218()
 ar, order = ds.get_ar_bands()
 ar_01 = scale_0_1_per_band(ar)
 ar_norm = scale_normal_per_band(ar)
 ar_static = scale_static_per_band(ar, order, BAND_MAX_CONFIG)

Last modified: 29 December 2023

diff --git a/docs/retrieve-our-datasets.html b/docs/retrieve-our-datasets.html
index 34b21eb2..0f0ef2d9 100644
--- a/docs/retrieve-our-datasets.html
+++ b/docs/retrieve-our-datasets.html

Documentation 0.0.8 Help

Retrieve our Datasets

In this tutorial, we'll learn how to:

  • Retrieve FRDC's Hyperspectral Image Data as np.ndarray

  • Retrieve FRDC's Ground Truth bounds and labels

  • Slice/segment the image data by the bounds

Prerequisites

  • New here? Get Started.

  • Setup the Google Cloud Authorization to download the data.

Retrieve the Data

To retrieve the data, use FRDCDataset.

Here, we'll download and load our

  • ar: Hyperspectral Image Data

  • order: The order of the bands

  • bounds: The bounds of the trees (segments)

  • labels: The labels of the trees (segments)

-from frdc.load.dataset import FRDCDataset
+from frdc.load.preset import FRDCDatasetPreset

-ds = FRDCDataset(site="chestnut_nature_park", date="20201218", version=None)
+ds = FRDCDatasetPreset.chestnut_20201218()
 ar, order = ds.get_ar_bands()
 bounds, labels = ds.get_bounds_and_labels()

What Datasets are there?

from frdc.load.gcs import list_gcs_datasets
print(list_gcs_datasets())
# 0  DEBUG/0
...
# 2  casuarina/20220418/93deg
# 3  chestnut_nature_park/20201218
# ...
  • The first part of the path is the site, and the second part is the date.

  • The version is the rest of the path, if there isn't any, use None.

  • site="ds"

  • date="date"

  • version="ver"

  • site="ds"

  • date="date"

  • version="ver/01/data"

  • site="ds"

  • date="date"

  • version=None

Segment the Data

To segment the data, use Extract Segments.

Here, we'll segment the data by the bounds.

-from frdc.load.dataset import FRDCDataset
+from frdc.load.preset import FRDCDatasetPreset
 from frdc.preprocess.extract_segments import extract_segments_from_bounds

-ds = FRDCDataset(site="chestnut_nature_park", date="20201218", version=None)
+ds = FRDCDatasetPreset.chestnut_20201218()
 ar, order = ds.get_ar_bands()
 bounds, labels = ds.get_bounds_and_labels()
 segments = extract_segments_from_bounds(ar, bounds)

segments is a list of np.ndarray of shape H, W, C, representing a tree. The order of segments is the same as labels, so you can use labels to identify the tree.

Plot the Data (Optional)

We can then use these data to plot out the first tree segment.

 import matplotlib.pyplot as plt

-from frdc.load.dataset import FRDCDataset
+from frdc.load.preset import FRDCDatasetPreset
 from frdc.preprocess.extract_segments import extract_segments_from_bounds
 from frdc.preprocess.scale import scale_0_1_per_band

-ds = FRDCDataset(site="chestnut_nature_park", date="20201218", version=None)
+ds = FRDCDatasetPreset.chestnut_20201218()
 ar, order = ds.get_ar_bands()
 bounds, labels = ds.get_bounds_and_labels()
 segments = extract_segments_from_bounds(ar, bounds)
...
 plt.imshow(segment_0_rgb_scaled)
 plt.title(f"Tree {labels[0]}")
 plt.show()

See also: preprocessing.scale.scale_0_1_per_band

MatPlotLib cannot show the data correctly as-is, so we need to

  • Convert the data from BGR to RGB

  • Scale the data to 0-1 per band

Last modified: 29 December 2023

diff --git a/docs/site.webmanifest b/docs/site.webmanifest
deleted file mode 100644
index fe6a9303..00000000
--- a/docs/site.webmanifest
+++ /dev/null
@@ -1,11 +0,0 @@
-{
-  "name": "JetBrains",
-  "short_name": "JetBrains",
-  "icons": [
-    { "src": "icon-192.png", "type": "image/png", "sizes": "192x192" },
-    { "src": "icon-512.png", "type": "image/png", "sizes": "512x512" }
-  ],
-  "theme_color": "#000000",
-  "background_color": "#000000",
-  "display": "standalone"
-}

diff --git a/docs/train-frdc-lightning.html b/docs/train-frdc-lightning.html
index 19ba79c2..2bdfd346 100644
--- a/docs/train-frdc-lightning.html
+++ b/docs/train-frdc-lightning.html

Documentation 0.0.8 Help

train.frdc_datamodule & frdc_module

These are FRDC-specific LightningDataModule and LightningModule classes, core components in the PyTorch Lightning ecosystem that provide a simple interface to train and evaluate models.

Classes

FRDCDataModule

The FRDC PyTorch Lightning DataModule.

FRDCModule

The FRDC PyTorch Lightning Module.

Usage
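A minimal sketch of wiring the two classes together (the preprocess, augmentation, and split functions here are illustrative assumptions, not the package's defaults; a real preprocess must bring all segments to a common shape before stacking):

import torch
from torch.utils.data import random_split

from frdc.load.preset import FRDCDatasetPreset
from frdc.preprocess.extract_segments import extract_segments_from_bounds
from frdc.train.frdc_datamodule import FRDCDataModule

ds = FRDCDatasetPreset.chestnut_20201218()
ar, order = ds.get_ar_bands()
bounds, labels = ds.get_bounds_and_labels()
segments = extract_segments_from_bounds(ar, bounds)

dm = FRDCDataModule(
    segments=segments,
    labels=labels,
    # Must accept a list of NumPy NDArrays and return one stacked Tensor.
    preprocess=lambda xs: torch.stack([torch.from_numpy(x) for x in xs]),
    augmentation=lambda x: x,  # identity; plug in real transforms here
    # Fractional lengths require torch >= 1.13.
    train_val_test_split=lambda tds: random_split(tds, [0.6, 0.2, 0.2]),
    batch_size=4,
)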

API

FRDCDataModule(segments, labels, preprocess, augmentation, train_val_test_split, batch_size)

Initializes the FRDC PyTorch Lightning DataModule.

  • segments, labels are retrieved from the dataset (see Retrieve our Datasets).

  • preprocess is a function that takes in a segment and returns a preprocessed segment. In particular, it should accept a list of NumPy NDArrays and return a single stacked PyTorch Tensor.

  • augmentation is a function that takes in a segment and returns an augmented segment. In particular, it takes in a PyTorch Tensor and returns another.

  • train_val_test_split is a function that takes a TensorDataset and returns a list of 3 TensorDatasets, for train, val and test respectively.

  • batch_size is the batch size.

FRDCModule(model_cls, model_kwargs, optim_cls, optim_kwargs)

Initializes the FRDC PyTorch Lightning Module.

  • model_cls is the Class of the model.

  • model_kwargs is the kwargs to pass to the model.

  • optim_cls is the Class of the optimizer.

  • optim_kwargs is the kwargs to pass to the optimizer.

Internally, the module will initialize the model and optimizer as follows:

model = model_cls(**model_kwargs)
optim = optim_cls(model.parameters(), **optim_kwargs)
Last modified: 29 December 2023

From 3289d49491bb24497112abaddcb3f75b832688a3 Mon Sep 17 00:00:00 2001
From: Evening
Date: Fri, 29 Dec 2023 16:03:03 +0800
Subject: [PATCH 36/52] Implement Stratified Sampling

---
 src/frdc/train/stratified_sampling.py | 60 +++++++++++++++++++++++++++
 1 file changed, 60 insertions(+)
 create mode 100644 src/frdc/train/stratified_sampling.py

diff --git a/src/frdc/train/stratified_sampling.py b/src/frdc/train/stratified_sampling.py
new file mode 100644
index 00000000..4d1e96cd
--- /dev/null
+++ b/src/frdc/train/stratified_sampling.py
@@ -0,0 +1,60 @@
+from __future__ import annotations
+
+from typing import Iterator
+
+import torch
+from torch.utils.data import Sampler
+
+
+class RandomStratifiedSampler(Sampler[int]):
+    def __init__(
+        self,
+        targets: torch.Tensor,
+        num_samples: int | None = None,
+    ) -> None:
+        """Stratified sampling from a dataset, such that each class is
+        sampled with equal probability.
+
+        Examples:
+            Use this with DataLoader to sample from a dataset in a stratified
+            fashion. For example::
+
+                ds = TensorDataset(...)
+                dl = DataLoader(
+                    ds,
+                    batch_size=...,
+                    sampler=RandomStratifiedSampler(targets),
+                )
+
+            This will use the targets' frequency as the inverse probability
+            for sampling. For example, if the targets are [0, 0, 1, 2],
+            then the probability of sampling each index is
+            [1/6, 1/6, 1/3, 1/3] respectively.
+
+        Args:
+            targets: The targets to stratify by. Must be integers.
+            num_samples: The number of samples to draw. If None, the
+                number of samples is equal to the length of the dataset.
+        """
+        super().__init__()
+
+        # Given targets [0, 0, 1]
+        # bincount = [2, 1]
+        # 1 / bincount = [0.5, 1]
+        # 1 / bincount / len(bincount) = [0.25, 0.5]
+        # The indexing then just projects it to the original targets.
+        self.target_probs: torch.Tensor = (
+            1 / (bincount := torch.bincount(targets)) / len(bincount)
+        )[targets]
+
+        self.num_samples = num_samples if num_samples else len(targets)
+
+    def __len__(self) -> int:
+        return self.num_samples
+
+    def __iter__(self) -> Iterator[int]:
+        """This should be a generator that yields indices from the dataset."""
+        yield from torch.multinomial(
+            self.target_probs,
+            num_samples=self.num_samples,
+            replacement=True,
+        )

From fdfa17a99b8d384c32f0d0793532640fa3783af2 Mon Sep 17 00:00:00 2001
From: Evening
Date: Fri, 29 Dec 2023 16:03:09 +0800
Subject: [PATCH 37/52] Add test for Stratified Sampling

---
 tests/unit_tests/train/__init__.py            |  0
 .../train/test_stratified_sampling.py         | 47 +++++++++++++++++++
 2 files changed, 47 insertions(+)
 create mode 100644 tests/unit_tests/train/__init__.py
 create mode 100644 tests/unit_tests/train/test_stratified_sampling.py

diff --git a/tests/unit_tests/train/__init__.py b/tests/unit_tests/train/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/unit_tests/train/test_stratified_sampling.py b/tests/unit_tests/train/test_stratified_sampling.py
new file mode 100644
index 00000000..78eecb7b
--- /dev/null
+++ b/tests/unit_tests/train/test_stratified_sampling.py
@@ -0,0 +1,47 @@
+from __future__ import annotations
+
+import torch
+from torch.utils.data import DataLoader, TensorDataset
+
+from frdc.train.stratified_sampling import RandomStratifiedSampler
+
+
+def test_stratified_sampling_has_correct_probs():
+    sampler = RandomStratifiedSampler(torch.tensor([0, 0, 1]))
+
+    assert torch.all(sampler.target_probs == torch.tensor([0.25, 0.25, 0.5]))
+
+
+def test_stratified_sampling_fairly_samples():
+    """This test checks that the stratified sampler works with a dataloader."""
+
+    # This is a simple example of a dataset with 2 classes.
+    # The first 2 samples are class 0, the third is class 1.
+    x = torch.tensor([0, 1, 2])
+    y = torch.tensor([0, 0, 1])
+
+    # To check that it's truly stratified, we'll sample 1000 times
+    # then assert that both classes are sampled roughly equally.
+
+    # In this case, the first 2 x should be sampled roughly 250 times,
+    # and the third x should be sampled roughly 500 times.
+
+    num_samples = 1000
+    batch_size = 10
+    dl = DataLoader(
+        TensorDataset(x),
+        batch_size=batch_size,
+        sampler=RandomStratifiedSampler(y, num_samples=num_samples),
+    )
+
+    # Note that when we sample from a TensorDataset, we get a tuple of
+    # tensors, so we need to unpack the tuple.
+    x_samples = torch.cat([x for (x,) in dl])
+
+    assert len(x_samples) == num_samples
+    assert torch.allclose(
+        torch.bincount(x_samples),
+        torch.tensor([250, 250, 500]),
+        # atol is the absolute tolerance, so the result can differ by 50
+        atol=50,
+    )

From 349e7cd3051924899351a03edc7ec544e3f9aa90 Mon Sep 17 00:00:00 2001
From: Evening
Date: Tue, 2 Jan 2024 11:36:30 +0800
Subject: [PATCH 38/52] Implement Stratified Sampling on DM

---
 src/frdc/train/frdc_datamodule.py | 31 +++++++++++++++++++------------
 1 file changed, 19 insertions(+), 12 deletions(-)

diff --git a/src/frdc/train/frdc_datamodule.py b/src/frdc/train/frdc_datamodule.py
index cabcb604..5e4e6dbd 100644
--- a/src/frdc/train/frdc_datamodule.py
+++ b/src/frdc/train/frdc_datamodule.py
@@ -1,11 +1,13 @@
 from __future__ import annotations
 
 from dataclasses import dataclass
+from typing import Literal
 
 from lightning import LightningDataModule
-from torch.utils.data import DataLoader, RandomSampler
+from torch.utils.data import DataLoader, RandomSampler, Sampler
 
 from frdc.load.dataset import FRDCDataset, FRDCUnlabelledDataset
+from frdc.train.stratified_sampling import RandomStratifiedSampler
 
 
 @dataclass
@@ -61,6 +63,7 @@ class FRDCDataModule(LightningDataModule):
     batch_size: int = 4
     train_iters: int = 100
     val_iters: int = 100
+    sampling_strategy: Literal["stratified", "random"] = "stratified"
 
     def __post_init__(self):
         super().__init__()
@@ -70,24 +73,29 @@ def train_dataloader(self):
         num_samples = self.batch_size * self.train_iters
+        if self.sampling_strategy == "stratified":
+            sampler = lambda ds: RandomStratifiedSampler(
+                ds.targets, num_samples=num_samples, replacement=True
+            )
+        elif self.sampling_strategy == "random":
+            sampler = lambda ds: RandomSampler(
+                ds, num_samples=num_samples, replacement=True
+            )
+        else:
+            raise ValueError(
+                f"Invalid sampling strategy: {self.sampling_strategy}"
+            )
+
         lab_dl = DataLoader(
             self.train_lab_ds,
             batch_size=self.batch_size,
-            sampler=RandomSampler(
-                self.train_lab_ds,
-                num_samples=num_samples,
-                replacement=False,
-            ),
+            sampler=sampler(self.train_lab_ds),
         )
         unl_dl = (
             DataLoader(
                 self.train_unl_ds,
                 batch_size=self.batch_size,
-                sampler=RandomSampler(
-                    self.train_unl_ds,
-                    num_samples=self.batch_size * self.train_iters,
-                    replacement=False,
-                ),
+                sampler=sampler(self.train_unl_ds),
             )
             if self.train_unl_ds is not None
             # This is a hacky way to create an empty dataloader.
From dc05b35ab13f9bde505efc1fd450dd2b9aa421f2 Mon Sep 17 00:00:00 2001
From: Evening 
Date: Tue, 2 Jan 2024 11:42:16 +0800
Subject: [PATCH 39/52] Allow Stratified Sampling for arbitrary seq types

---
 src/frdc/train/stratified_sampling.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/src/frdc/train/stratified_sampling.py b/src/frdc/train/stratified_sampling.py
index 4d1e96cd..dd17762c 100644
--- a/src/frdc/train/stratified_sampling.py
+++ b/src/frdc/train/stratified_sampling.py
@@ -1,16 +1,18 @@
 from __future__ import annotations
 
-from typing import Iterator
+from typing import Iterator, Any, Sequence
 
 import torch
+from sklearn.preprocessing import LabelEncoder
 from torch.utils.data import Sampler
 
 
 class RandomStratifiedSampler(Sampler[int]):
     def __init__(
         self,
-        targets: torch.Tensor,
+        targets: Sequence[Any],
         num_samples: int | None = None,
+        replacement: bool = True,
     ) -> None:
         """Stratified sampling from a dataset, such that each class is
         sampled with equal probability.
@@ -42,11 +44,13 @@ def __init__(
         # 1 / bincount = [0.5, 1]
         # 1 / bincount / len(bincount) = [0.25, 0.5]
         # The indexing then just projects it to the original targets.
+        targets_lab = torch.tensor(LabelEncoder().fit_transform(targets))
         self.target_probs: torch.Tensor = (
-            1 / (bincount := torch.bincount(targets)) / len(bincount)
-        )[targets]
+            1 / (bincount := torch.bincount(targets_lab)) / len(bincount)
+        )[targets_lab]
 
         self.num_samples = num_samples if num_samples else len(targets)
+        self.replacement = replacement
 
     def __len__(self) -> int:
         return self.num_samples
@@ -56,5 +60,5 @@ def __iter__(self) -> Iterator[int]:
         yield from torch.multinomial(
             self.target_probs,
             num_samples=self.num_samples,
-            replacement=True,
+            replacement=self.replacement,
         )

From a8dcafcf9acbad694fb1b1b40395d6b5ac482f9b Mon Sep 17 00:00:00 2001
From: Evening 
Date: Tue, 2 Jan 2024 11:42:35 +0800
Subject: [PATCH 40/52] Fix missing imports for pred and plot

---
 tests/model_tests/chestnut_dec_may/train.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/model_tests/chestnut_dec_may/train.py b/tests/model_tests/chestnut_dec_may/train.py
index 70f3dada..c37404e8 100644
--- a/tests/model_tests/chestnut_dec_may/train.py
+++ b/tests/model_tests/chestnut_dec_may/train.py
@@ -25,6 +25,7 @@
 from frdc.load.preset import FRDCDatasetPreset as ds
 from frdc.models.inceptionv3 import InceptionV3MixMatchModule
 from frdc.train.frdc_datamodule import FRDCDataModule
+from frdc.utils.training import predict, plot_confusion_matrix
 from model_tests.utils import (
     train_preprocess,
     train_unl_preprocess,

From e6f6a9c6ea984ebe1658d48380168c29903cb0bb Mon Sep 17 00:00:00 2001
From: Evening 
Date: Tue, 2 Jan 2024 11:42:53 +0800
Subject: [PATCH 41/52] Change test to use str list

---
 tests/unit_tests/train/test_stratified_sampling.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/unit_tests/train/test_stratified_sampling.py b/tests/unit_tests/train/test_stratified_sampling.py
index 78eecb7b..e8019b64 100644
--- a/tests/unit_tests/train/test_stratified_sampling.py
+++ b/tests/unit_tests/train/test_stratified_sampling.py
@@ -7,7 +7,7 @@
 
 
 def test_stratified_sampling_has_correct_probs():
-    sampler = RandomStratifiedSampler(torch.tensor([0, 0, 1]))
+    sampler = RandomStratifiedSampler(["A", "A", "B"])
 
     assert torch.all(sampler.target_probs == torch.tensor([0.25, 0.25, 
0.5])) @@ -18,7 +18,7 @@ def test_stratified_sampling_fairly_samples(): # This is a simple example of a dataset with 2 classes. # The first 2 samples are class 0, the third is class 1. x = torch.tensor([0, 1, 2]) - y = torch.tensor([0, 0, 1]) + y = ["A", "A", "B"] # To check that it's truly stratified, we'll sample 1000 times # then assert that both classes are sampled roughly equally. From 86d11df7a27d085496e6c4f7666db74c4f96da74 Mon Sep 17 00:00:00 2001 From: Evening Date: Tue, 2 Jan 2024 14:44:40 +0800 Subject: [PATCH 42/52] Implement W&B vis of label spread --- src/frdc/train/mixmatch_module.py | 68 +++++++++++++++++++++++++++++-- 1 file changed, 64 insertions(+), 4 deletions(-) diff --git a/src/frdc/train/mixmatch_module.py b/src/frdc/train/mixmatch_module.py index 784380b6..75e581e8 100644 --- a/src/frdc/train/mixmatch_module.py +++ b/src/frdc/train/mixmatch_module.py @@ -8,6 +8,7 @@ import torch.nn.functional as F import torch.nn.parallel import torch.nn.parallel +import wandb from lightning import LightningModule from sklearn.preprocessing import StandardScaler, OrdinalEncoder from torch.nn.functional import one_hot @@ -52,6 +53,7 @@ def __init__( self.sharpen_temp = sharpen_temp self.mix_beta_alpha = mix_beta_alpha self.save_hyperparameters() + self.lbl_logger = WandBLabelLogger() @property @abstractmethod @@ -150,10 +152,12 @@ def progress(self): ) / self.trainer.max_epochs def training_step(self, batch, batch_idx): - # Progress is a linear ramp from 0 to 1 over the course of training. (x_lbl, y_lbl), x_unls = batch + self.lbl_logger( + self.logger.experiment, "Input Y Label", y_lbl, flush_every=10 + ) - y_lbl = one_hot(y_lbl.long(), num_classes=self.n_classes) + y_lbl_ohe = one_hot(y_lbl.long(), num_classes=self.n_classes) # If x_unls is Truthy, then we are using MixMatch. # Otherwise, we are just using supervised learning. 
@@ -164,7 +168,7 @@ def training_step(self, batch, batch_idx):
             y_unl = self.sharpen(y_unl, self.sharpen_temp)
 
             x = torch.cat([x_lbl, *x_unls], dim=0)
-            y = torch.cat([y_lbl, *(y_unl,) * len(x_unls)], dim=0)
+            y = torch.cat([y_lbl_ohe, *(y_unl,) * len(x_unls)], dim=0)
             x_mix, y_mix = self.mix_up(x, y, self.mix_beta_alpha)
 
             # This had interleaving, but it was removed as it's not
@@ -177,7 +181,19 @@ def training_step(self, batch, batch_idx):
             y_mix_unl = y_mix[batch_size:]
 
             loss_lbl = self.loss_lbl(y_mix_lbl_pred, y_mix_lbl)
+            self.lbl_logger(
+                self.logger.experiment,
+                "Labelled Y Pred",
+                torch.argmax(y_mix_lbl_pred, dim=1),
+                flush_every=10,
+            )
             loss_unl = self.loss_unl(y_mix_unl_pred, y_mix_unl)
+            self.lbl_logger(
+                self.logger.experiment,
+                "Unlabelled Y Pred",
+                torch.argmax(y_mix_unl_pred, dim=1),
+                flush_every=10,
+            )
             loss_unl_scale = self.loss_unl_scaler(progress=self.progress)
 
             loss = loss_lbl + loss_unl * loss_unl_scale
@@ -188,7 +204,7 @@ def training_step(self, batch, batch_idx):
         else:
             # This route implies that we are just using supervised learning
             y_pred = self(x_lbl)
-            loss = self.loss_lbl(y_pred, y_lbl.float())
+            loss = self.loss_lbl(y_pred, y_lbl_ohe.float())
 
         self.log("train_loss", loss)
         return loss
@@ -201,7 +217,16 @@ def on_after_backward(self) -> None:
 
     def validation_step(self, batch, batch_idx):
         x, y = batch
+        self.lbl_logger(
+            self.logger.experiment, "Val Input Y Label", y, flush_every=1
+        )
         y_pred = self.ema_model(x)
+        self.lbl_logger(
+            self.logger.experiment,
+            "Val Pred Y Label",
+            torch.argmax(y_pred, dim=1),
+            flush_every=1,
+        )
         loss = F.cross_entropy(y_pred, y.long())
 
         acc = accuracy(
@@ -299,3 +324,38 @@ def y_trans_fn(y):
             return (x_lab_trans, y_trans.long()), x_unl_trans
         else:
             return x_lab_trans, y_trans.long()
+
+
+class WandBLabelLogger(dict):
+    """Logger to log y labels to WandB"""
+
+    def __call__(
+        self,
+        logger: wandb.sdk.wandb_run.Run,
+        key: str,
+        value: torch.Tensor,
+        flush_every: int = 10,
+    ):
+        """Log the labels to WandB
+
+        Args:
+            logger: The W&B logger. Accessible through `self.logger.experiment`
+            key: The key to log the labels under.
+            value: The labels to log.
+            flush_every: How often to flush the labels to WandB.
+
+        """
+        if key not in self.keys():
+            self[key] = [value]
+        else:
+            self[key].append(value)
+
+        if len(self[key]) % flush_every == 0:
+            logger.log(
+                {
+                    key: wandb.Histogram(
+                        torch.flatten(value).detach().cpu().tolist()
+                    )
+                }
+            )
+            self[key] = []
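WandBLabelLogger above buffers label tensors per key and, on every
flush_every-th call, logs the most recent batch as a W&B histogram. A minimal
sketch of driving it outside Lightning (this assumes an active W&B run;
wandb.init and Run.log are the standard W&B APIs, while inside the module it
receives self.logger.experiment instead):

    import torch
    import wandb

    run = wandb.init(project="frdc", mode="offline")  # offline: no server needed
    lbl_logger = WandBLabelLogger()
    for _ in range(20):
        y_batch = torch.randint(0, 10, (32,))  # fake integer class labels
        # Buffered under "Input Y Label"; a histogram is logged on every
        # 10th call, matching how training_step uses it above.
        lbl_logger(run, "Input Y Label", y_batch, flush_every=10)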
From a355c39843f0cb835c7a1d9034346f479b191488 Mon Sep 17 00:00:00 2001
From: Evening 
Date: Tue, 2 Jan 2024 14:45:01 +0800
Subject: [PATCH 43/52] Clean up train.py

---
 tests/model_tests/chestnut_dec_may/train.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/model_tests/chestnut_dec_may/train.py b/tests/model_tests/chestnut_dec_may/train.py
index c37404e8..28e79a92 100644
--- a/tests/model_tests/chestnut_dec_may/train.py
+++ b/tests/model_tests/chestnut_dec_may/train.py
@@ -43,13 +43,12 @@ def main(
 ):
     run = wandb.init()
     logger = WandbLogger(name="chestnut_dec_may", project="frdc")
+
     # Prepare the dataset
     train_lab_ds = ds.chestnut_20201218(transform=train_preprocess)
-
     train_unl_ds = ds.chestnut_20201218.unlabelled(
         transform=train_unl_preprocess(2)
     )
-
     val_ds = ds.chestnut_20210510_43m(transform=preprocess)
 
     oe = OrdinalEncoder(
@@ -65,12 +64,12 @@ def main(
     # Prepare the datamodule and trainer
     dm = FRDCDataModule(
         train_lab_ds=train_lab_ds,
-        # Pass in None to use the default supervised DM
-        train_unl_ds=train_unl_ds,
+        train_unl_ds=train_unl_ds,  # None to use supervised DM
         val_ds=val_ds,
         batch_size=batch_size,
         train_iters=train_iters,
         val_iters=val_iters,
+        sampling_strategy="stratified",
     )
 
     trainer = pl.Trainer(
@@ -90,6 +89,7 @@ def main(
         ],
         logger=logger,
     )
+
     m = InceptionV3MixMatchModule(
         n_classes=n_classes,
         lr=lr,

From dff83781b1461a91c524b5df5ef2a086720b3f8e Mon Sep 17 00:00:00 2001
From: Evening 
Date: Tue, 2 Jan 2024 14:45:10 +0800
Subject: [PATCH 44/52] Make W&B Watch model

---
 tests/model_tests/chestnut_dec_may/train.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/model_tests/chestnut_dec_may/train.py b/tests/model_tests/chestnut_dec_may/train.py
index 28e79a92..863a9476 100644
--- a/tests/model_tests/chestnut_dec_may/train.py
+++ b/tests/model_tests/chestnut_dec_may/train.py
@@ -96,6 +96,7 @@ def main(
         x_scaler=ss,
         y_encoder=oe,
     )
+    logger.watch(m)
 
     trainer.fit(m, datamodule=dm)
 

From 53d38e5d8b9205443d2d1d4de10b1fc7cb502707 Mon Sep 17 00:00:00 2001
From: Evening 
Date: Mon, 8 Jan 2024 12:59:33 +0800
Subject: [PATCH 45/52] Add warning on Label Studio connection issue

---
 src/frdc/conf.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/src/frdc/conf.py b/src/frdc/conf.py
index e2f5958a..68016c60 100644
--- a/src/frdc/conf.py
+++ b/src/frdc/conf.py
@@ -77,3 +77,15 @@
         f"LABEL_STUDIO_CLIENT will be None."
     )
     LABEL_STUDIO_CLIENT = None
+
+try:
+    logger.info("Attempting to Get Label Studio Project...")
+    LABEL_STUDIO_CLIENT.get_project(1)
+except requests.exceptions.HTTPError:
+    logger.warning(
+        f"Could not get main annotation project. "
+        f"Pulling annotations may not work. "
+        f"It's possible that your API Key is incorrect, "
+        f"or somehow your .netrc is preventing you from "
+        f"accessing the project. 
" + ) From 74b15c0435b04d2213a94011d32a8ee654ef8934 Mon Sep 17 00:00:00 2001 From: Evening Date: Mon, 8 Jan 2024 13:07:59 +0800 Subject: [PATCH 46/52] Add dumping script --- src/label-studio/dump.sh | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 src/label-studio/dump.sh diff --git a/src/label-studio/dump.sh b/src/label-studio/dump.sh new file mode 100644 index 00000000..d651f42e --- /dev/null +++ b/src/label-studio/dump.sh @@ -0,0 +1,16 @@ +echo "Creating backups directory..." +docker exec label-studio-db-1 sh -c "if [ ! -d \"/var/lib/postgresql/backups/\" ]; then mkdir -p \"/var/lib/postgresql/backups/\"; fi" + +echo "Checking if label-studio-db-1 is running..." +docker exec label-studio-db-1 sh -c "pg_isready -U postgres" + +if [ $? -ne 0 ]; then + echo "label-studio-db-1 is not running. Exiting..." + exit 1 +fi + +echo "Dumping database... to /var/lib/postgresql/backups/" +docker exec label-studio-db-1 sh -c "pg_dump -Fc -U postgres -d postgres -f \"/var/lib/postgresql/backups/$(date +'%d-%m-%Y_%HH%MM%SS').backup\"" + +echo "Dumping database in SQL format... to /var/lib/postgresql/backups/" +docker exec label-studio-db-1 sh -c "pg_dump -U postgres -d postgres -f \"/var/lib/postgresql/backups/$(date +'%d-%m-%Y_%HH%MM%SS').sql\"" From 13a611811d0c90198209ee8c47ac04aeba114a84 Mon Sep 17 00:00:00 2001 From: Evening Date: Mon, 8 Jan 2024 13:08:03 +0800 Subject: [PATCH 47/52] Update .gitignore --- src/.gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/src/.gitignore b/src/.gitignore index 57b22671..d3514034 100644 --- a/src/.gitignore +++ b/src/.gitignore @@ -1,2 +1,3 @@ label-studio/ !label-studio/docker-compose.yml +!label-studio/dump.sh From aeca5565993297e3697a81fca4d496e27b01bd9b Mon Sep 17 00:00:00 2001 From: Evening Date: Mon, 8 Jan 2024 13:08:12 +0800 Subject: [PATCH 48/52] Mount backups to host --- src/label-studio/docker-compose.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/src/label-studio/docker-compose.yml b/src/label-studio/docker-compose.yml index 4a6d909f..9340fb8b 100644 --- a/src/label-studio/docker-compose.yml +++ b/src/label-studio/docker-compose.yml @@ -67,6 +67,7 @@ services: - POSTGRES_HOST_AUTH_METHOD=trust volumes: - ${POSTGRES_DATA_DIR:-./postgres-data}:/var/lib/postgresql/data + - ${POSTGRES_DATA_DIR:-./postgres-backups}:/var/lib/postgresql/backups - ./deploy/pgsql/certs:/var/lib/postgresql/certs:ro networks: - label-studio From fb938909a504c8ec769bcd6b4aaf77eeb421fb3d Mon Sep 17 00:00:00 2001 From: Evening Date: Mon, 8 Jan 2024 13:13:24 +0800 Subject: [PATCH 49/52] Fix issue with Label Studio being None exception --- src/frdc/conf.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/src/frdc/conf.py b/src/frdc/conf.py index 68016c60..bbbbdf68 100644 --- a/src/frdc/conf.py +++ b/src/frdc/conf.py @@ -71,6 +71,17 @@ api_key=LABEL_STUDIO_API_KEY, ) logger.info("Connected to Label Studio.") + try: + logger.info("Attempting to Get Label Studio Project...") + LABEL_STUDIO_CLIENT.get_project(1) + except requests.exceptions.HTTPError: + logger.warning( + f"Could not get main annotation project. " + f"Pulling annotations may not work. " + f"It's possible that your API Key is incorrect, " + f"or somehow your .netrc is preventing you from " + f"accessing the project. " + ) except requests.exceptions.ConnectionError: logger.warning( f"Could not connect to Label Studio at {LABEL_STUDIO_URL}. 
" @@ -78,14 +89,4 @@ ) LABEL_STUDIO_CLIENT = None -try: - logger.info("Attempting to Get Label Studio Project...") - LABEL_STUDIO_CLIENT.get_project(1) -except requests.exceptions.HTTPError: - logger.warning( - f"Could not get main annotation project. " - f"Pulling annotations may not work. " - f"It's possible that your API Key is incorrect, " - f"or somehow your .netrc is preventing you from " - f"accessing the project. " - ) + From d96030ddacc5c224eb1bfa283bd0202069da6fdb Mon Sep 17 00:00:00 2001 From: Evening Date: Mon, 8 Jan 2024 13:50:17 +0800 Subject: [PATCH 50/52] Minor Black formatting --- src/frdc/conf.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/frdc/conf.py b/src/frdc/conf.py index bbbbdf68..0d32eb45 100644 --- a/src/frdc/conf.py +++ b/src/frdc/conf.py @@ -88,5 +88,3 @@ f"LABEL_STUDIO_CLIENT will be None." ) LABEL_STUDIO_CLIENT = None - - From 3794501c943c9e3ee007cac941bb7afb4d2ea176 Mon Sep 17 00:00:00 2001 From: Evening Date: Mon, 8 Jan 2024 13:50:47 +0800 Subject: [PATCH 51/52] Fix issue with WandB hist logger too many bins --- src/frdc/train/mixmatch_module.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/src/frdc/train/mixmatch_module.py b/src/frdc/train/mixmatch_module.py index 75e581e8..9e3af191 100644 --- a/src/frdc/train/mixmatch_module.py +++ b/src/frdc/train/mixmatch_module.py @@ -154,7 +154,11 @@ def progress(self): def training_step(self, batch, batch_idx): (x_lbl, y_lbl), x_unls = batch self.lbl_logger( - self.logger.experiment, "Input Y Label", y_lbl, flush_every=10 + self.logger.experiment, + "Input Y Label", + y_lbl, + flush_every=10, + num_bins=self.n_classes, ) y_lbl_ohe = one_hot(y_lbl.long(), num_classes=self.n_classes) @@ -186,6 +190,7 @@ def training_step(self, batch, batch_idx): "Labelled Y Pred", torch.argmax(y_mix_lbl_pred, dim=1), flush_every=10, + num_bins=self.n_classes, ) loss_unl = self.loss_unl(y_mix_unl_pred, y_mix_unl) self.lbl_logger( @@ -193,6 +198,7 @@ def training_step(self, batch, batch_idx): "Unlabelled Y Pred", torch.argmax(y_mix_unl_pred, dim=1), flush_every=10, + num_bins=self.n_classes, ) loss_unl_scale = self.loss_unl_scaler(progress=self.progress) @@ -218,7 +224,11 @@ def on_after_backward(self) -> None: def validation_step(self, batch, batch_idx): x, y = batch self.lbl_logger( - self.logger.experiment, "Val Input Y Label", y, flush_every=1 + self.logger.experiment, + "Val Input Y Label", + y, + flush_every=1, + num_bins=self.n_classes, ) y_pred = self.ema_model(x) self.lbl_logger( @@ -226,6 +236,7 @@ def validation_step(self, batch, batch_idx): "Val Pred Y Label", torch.argmax(y_pred, dim=1), flush_every=1, + num_bins=self.n_classes, ) loss = F.cross_entropy(y_pred, y.long()) @@ -334,6 +345,7 @@ def __call__( logger: wandb.sdk.wandb_run.Run, key: str, value: torch.Tensor, + num_bins: int, flush_every: int = 10, ): """Log the labels to WandB @@ -354,7 +366,8 @@ def __call__( logger.log( { key: wandb.Histogram( - torch.flatten(value).detach().cpu().tolist() + torch.flatten(value).detach().cpu().tolist(), + num_bins=num_bins, ) } ) From c8b050a2ebe10fe88d17e39b068f66f1e30480f9 Mon Sep 17 00:00:00 2001 From: Evening Date: Mon, 8 Jan 2024 13:51:06 +0800 Subject: [PATCH 52/52] Fix issue with redundant initializing wandb --- tests/model_tests/chestnut_dec_may/train.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/model_tests/chestnut_dec_may/train.py b/tests/model_tests/chestnut_dec_may/train.py index 863a9476..8d4aad1c 100644 --- 
From c8b050a2ebe10fe88d17e39b068f66f1e30480f9 Mon Sep 17 00:00:00 2001
From: Evening 
Date: Mon, 8 Jan 2024 13:51:06 +0800
Subject: [PATCH 52/52] Fix issue with redundant initializing wandb

---
 tests/model_tests/chestnut_dec_may/train.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/tests/model_tests/chestnut_dec_may/train.py b/tests/model_tests/chestnut_dec_may/train.py
index 863a9476..8d4aad1c 100644
--- a/tests/model_tests/chestnut_dec_may/train.py
+++ b/tests/model_tests/chestnut_dec_may/train.py
@@ -3,6 +3,7 @@
 This test is done by training a model on the 20201218 dataset, then testing on
 the 20210510 dataset.
 """
+import os
 
 # Uncomment this to run the W&B monitoring locally
 # import os
@@ -41,9 +42,6 @@ def main(
     val_iters=15,
     lr=1e-3,
 ):
-    run = wandb.init()
-    logger = WandbLogger(name="chestnut_dec_may", project="frdc")
-
     # Prepare the dataset
     train_lab_ds = ds.chestnut_20201218(transform=train_preprocess)
     train_unl_ds = ds.chestnut_20201218.unlabelled(
@@ -87,7 +85,9 @@ def main(
             monitor="val_loss", mode="min", save_top_k=1
         ),
     ],
-        logger=logger,
+        logger=(
+            logger := WandbLogger(name="chestnut_dec_may", project="frdc")
+        ),
     )
 
     m = InceptionV3MixMatchModule(
@@ -103,7 +103,7 @@ def main(
     with open(Path(__file__).parent / "report.md", "w") as f:
         f.write(
             f"# Chestnut Nature Park (Dec 2020 vs May 2021)\n"
-            f"- Results: [WandB Report]({run.get_url()})"
+            f"- Results: [WandB Report]({wandb.run.get_url()})"
         )
 
     y_true, y_pred = predict(
@@ -133,8 +133,8 @@
     VAL_ITERS = 15
     LR = 1e-3
 
-    assert wandb.run is None
-    wandb.setup(wandb.Settings(program=__name__, program_relpath=__name__))
+    wandb.login(key=os.environ["WANDB_API_KEY"])
+
     main(
         batch_size=BATCH_SIZE,
         epochs=EPOCHS,