From c73e67b5edb8034bd07947bc0a886fed00d09a69 Mon Sep 17 00:00:00 2001 From: Rupesh Shrestha Date: Tue, 18 Feb 2025 09:31:50 -0500 Subject: [PATCH] updates to toc --- README.md | 3 +- _quarto.yml | 2 +- .../avng_invasive_esaworkshop_noS3.ipynb | 1341 ----------------- 3 files changed, 3 insertions(+), 1343 deletions(-) delete mode 100644 tutorials/avirisng/avng_invasive_esaworkshop_noS3.ipynb diff --git a/README.md b/README.md index a423d6d..2875abb 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,5 @@ -# BIOSCAPE DEMO / BIOSPACE 25 +# BIOSCAPE Workshop at BIOSPACE25 Harnessing analysis tools for biodiversity applications using field, airborne, and orbital remote sensing data from NASA's BioSCAPE campaign +https://nasa-openscapes.github.io/2025-biospace/ \ No newline at end of file diff --git a/_quarto.yml b/_quarto.yml index 49bbcf3..f618c26 100644 --- a/_quarto.yml +++ b/_quarto.yml @@ -29,7 +29,7 @@ website: - tutorials/prerequisites.md - section: "Mapping invasive species using AVIRIS-NG" contents: - - href: tutorials/avirisng/avng_invasive_esaworkshop_noS3.ipynb + - href: tutorials/avirisng/avng_invasive_esaworkshop.ipynb text: "Machine learning with AVIRIS-NG" - section: "LVIS/GEDI Data for Ecosystem Structure" contents: diff --git a/tutorials/avirisng/avng_invasive_esaworkshop_noS3.ipynb b/tutorials/avirisng/avng_invasive_esaworkshop_noS3.ipynb deleted file mode 100644 index afa7312..0000000 --- a/tutorials/avirisng/avng_invasive_esaworkshop_noS3.ipynb +++ /dev/null @@ -1,1341 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "3e87abc9-7ff7-42e5-914d-48c2a73c0839", - "metadata": {}, - "source": [ - "# BIOSPACE25 Workshop: \n", - "\n", - "## Harnessing analysis tools for biodiversity applications using field, airborne, and orbital remote sensing data from NASA's BioSCAPE campaign\n", - "\n", - "Michele Thornton, Rupesh Shrestha, Erin Hestir, Adam Wilson, Jasper Slingsby, Anabelle Cardoso\n", - "\n", - "**Date:** February 12, 2025, Frascati (Rome), Italy\n", - "\n", - "![BIOSPACE25](images/BioSpace25_clip_50.jpg)\n" - ] - }, - { - "cell_type": "markdown", - "id": "2a39fb4b-1206-40fe-8cb1-28869f68db47", - "metadata": {}, - "source": [ - "# Tutorial: Mapping invasive species using supervised machine learning and AVIRIS-NG \n", - "\n", - "## Overview \n", - "\n", - "In this notebook, we will use existing data of verified land cover and alien species locations to extract spectra from AVIRIS NG surface reflectance data.\n", - "\n", - "## Learning Objectives\n", - "1. Understand how to inspect and prepare data for machine learning models\n", - "2. Train and interpret a machine learning model\n", - "3. 
Apply a trained model to AVIRIS imagery to create alien species maps" - ] - }, - { - "cell_type": "markdown", - "id": "e66219ca-026f-4dc5-a4ac-3abf1f59d50b", - "metadata": {}, - "source": [ - "### Load Python Modules" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "dce9f802-0a0b-4589-9a21-e562ae338e07", - "metadata": {}, - "outputs": [], - "source": [ - "#!pip install --user xvec\n", - "#!pip install --user shap\n", - "#!pip install --user xgboost" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "67e71344-6b12-40c6-8d34-af2245dcd2dd", - "metadata": {}, - "outputs": [], - "source": [ - "from os import path\n", - "import geopandas as gpd\n", - "import s3fs\n", - "import pandas as pd\n", - "import xarray as xr\n", - "from shapely.geometry import box, mapping\n", - "import rioxarray as riox\n", - "import numpy as np\n", - "import netCDF4 as nc\n", - "import hvplot.xarray\n", - "import holoviews as hv\n", - "import xvec\n", - "import matplotlib.pyplot as plt\n", - "from dask.diagnostics import ProgressBar\n", - "import warnings\n", - "#our functions\n", - "from utils import get_first_xr\n", - "\n", - "warnings.filterwarnings('ignore')\n", - "hvplot.extension('bokeh')" - ] - }, - { - "cell_type": "markdown", - "id": "061d90fc-5cdf-4ee6-912f-09167312586f", - "metadata": {}, - "source": [ - "### Explore Sample Land Type Plot-Level Data\n", - "A small dataset over the Cape Town Peninsula of South Africa of manually collected invasive plant and land cover label\n", - "- `ct_invasive.gpkg`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4dd7ef19-7cd7-4e07-b67c-7d77ece5b102", - "metadata": {}, - "outputs": [], - "source": [ - "# let's create a DataFrame and assign labels to each class\n", - "\n", - "label_df = pd.DataFrame({'LandType': ['Bare ground/Rock','Mature Fynbos', \n", - " 'Recently Burnt Fynbos', 'Wetland', \n", - " 'Forest', 'Pine', 'Eucalyptus' , 'Wattle', 'Water'],\n", - " 'class': ['0','1','2','3','4','5','6','7','8']})\n", - "\n", - "label_df" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6332d4b6-6332-405a-85c7-a1e5618927d4", - "metadata": {}, - "outputs": [], - "source": [ - "# open the dataset and project to the South African UTM projection also used by the AVIRIS-NG airborne data \n", - "class_data = gpd.read_file('data/ct_invasive.gpkg')\n", - "# class_data.crs\n", - "class_data_utm = (class_data\n", - " .to_crs(\"EPSG:32734\")\n", - " .merge(label_df, on='class', how='left')\n", - " )\n", - "class_data_utm" - ] - }, - { - "cell_type": "markdown", - "id": "f96d4b5a-316f-4092-936c-7cb027975f05", - "metadata": {}, - "source": [ - "### Summarize and Visualize the Land Types" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "65e0f1b4-335c-4666-9e57-460a8387b2c3", - "metadata": {}, - "outputs": [], - "source": [ - "class_data_utm.groupby(['LandType']).size()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "443dca14-cc04-4644-b4a0-053527af8a4d", - "metadata": {}, - "outputs": [], - "source": [ - "class_data_utm.groupby(['group']).size()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ebddeb77-d922-4224-ad6a-7cdd0914dc54", - "metadata": {}, - "outputs": [], - "source": [ - "# Let's visualize the plot data in an interactive map, with color by class, using a Google satellite basemap\n", - "map = class_data_utm[['LandType', 'geometry']].explore('LandType', tiles='https://mt1.google.com/vt/lyrs=s&x={x}&y={y}&z={z}', 
attr='Google')\n", - "map" - ] - }, - { - "cell_type": "markdown", - "id": "7f4655f3-aa26-4a0f-80a5-3d79e4739702", - "metadata": {}, - "source": [ - "### AVIRIS-NG Data over Cape Town Peninsula\n", - "\n", - "The BioSCape Science Team provides a coverage file containing the bounding box of each AVIRIS-NG flight scene. \n", - "- ANGv2_Coverage.geojson\n", - "\n", - "Each flight line is delivered as smaller sections, which we'll refer to as scenes. The data for each scene within a flight line is seamless with the adjacent scenes. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "83f66965-db11-4f18-af16-5330e132c31c", - "metadata": {}, - "outputs": [], - "source": [ - "# read and plot the AVNG coverage file\n", - "AVNG_Coverage = gpd.read_file('data/ANGv2_Coverage.geojson', driver='GeoJSON')\n", - "AVNG_Coverage.keys()" - ] - }, - { - "cell_type": "markdown", - "id": "aff6f1eb-9ced-4d5f-ac59-5484ce72b89c", - "metadata": {}, - "source": [ - "- Note that the 'RFL s3' key is pre-populated in the GeoJSON file.\n", - "- We'll use this S3 file list in an upcoming step." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "721faecc-dec9-46d0-adf4-03c376020d6a", - "metadata": {}, - "outputs": [], - "source": [ - "AVNG_Coverage.crs" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c17dc0d7-9dd6-418b-9265-02efe38ee57b", - "metadata": {}, - "outputs": [], - "source": [ - "AVNG_Coverage" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1857fe8e-fd3e-46a1-acd5-cf6b440fa361", - "metadata": {}, - "outputs": [], - "source": [ - "# Let's visualize the AVIRIS-NG coverage in an interactive map using a Google satellite basemap\n", - "map = AVNG_Coverage[['fid', 'geometry']].explore(tiles='https://mt1.google.com/vt/lyrs=s&x={x}&y={y}&z={z}', attr='Google')\n", - "#map = AVNG_Coverage[['fid', 'geometry']].explore('fid')\n", - "map" - ] - }, - { - "cell_type": "markdown", - "id": "97dd8516-3859-4084-b34c-b28bdd7132fd", - "metadata": {}, - "source": [ - "- The AVIRIS-NG Principal Investigator team is finalizing formats and standards for the AVIRIS-NG airborne radiance and reflectance files. When finalized, the data will be published to NASA Earthdata. \n", - "\n", - "- For now, JPL provides preliminary AVIRIS-NG data [**here**](https://popo.jpl.nasa.gov/pub/bioscape_netCDF/). Once finalized, AVIRIS-NG data from the BioSCape Campaign will be available from NASA Earthdata Cloud Storage." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f73b5727-e7cf-4d91-b319-24c24b72c530", - "metadata": {}, - "outputs": [], - "source": [ - "# Workshop participants will download this file from JPL\n", - "# If you need to download this file, uncomment the wget line and run this code block.\n", - "# !wget https://popo.jpl.nasa.gov/pub/bioscape_netCDF/rfl/ang20231109t133124_005_L2A_OE_0b4f48b4_RFL_ORT.nc -P /home/jovyan/2025-biospace/tutorials/avirisng/data/ang" - ] - }, - { - "cell_type": "markdown", - "id": "0e455fb5-8d98-408a-a08e-874839843757", - "metadata": {}, - "source": [ - "### Select the AVIRIS-NG flight line data using selected parameters and create lists to use later\n", - "For the analysis demonstrated in this notebook, we'll narrow the flight lines to the area of the Cape Peninsula and to flights that took place on 2023-11-09.\n", - "- The Python **`GeoDataFrame.to_crs`** method transforms geometries to a new coordinate reference system."
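- "\n",
- "As a minimal illustration (a sketch only, assuming the `end_time` column parses cleanly with `pd.to_datetime`), the date filter and reprojection performed in the next cell could also be written with parsed timestamps rather than string comparisons:\n",
- "\n",
- "```python\n",
- "# illustrative variant: parse end_time, keep only the 2023-11-09 flights,\n",
- "# then reproject to the UTM zone 34S grid (EPSG:32734) used by the AVIRIS-NG scenes\n",
- "end_times = pd.to_datetime(AVNG_Coverage['end_time'])\n",
- "AVNG_CP = AVNG_Coverage[end_times.dt.date == pd.Timestamp('2023-11-09').date()]\n",
- "AVNG_CP = AVNG_CP.to_crs('EPSG:32734')\n",
- "```"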
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a1ea7a44-1335-4e2d-9ffc-1eafcfeba0bb", - "metadata": {}, - "outputs": [], - "source": [ - "# temporal filter: filter dates to between midnight on 2023-11-09 and 23:59:59 on 2023-11-09\n", - "AVNG_CP = AVNG_Coverage[(AVNG_Coverage['end_time'] >= '2023-11-09 00:00:00') & (AVNG_Coverage['end_time'] <= '2023-11-09 23:59:59')]\n", - "AVNG_CP = AVNG_CP.to_crs(\"EPSG:32734\")\n", - "\n", - "#keep only AVNG_CP that intersects with class_data\n", - "AVNG_CP = AVNG_CP[AVNG_CP.intersects(class_data_utm.unary_union)]\n", - "#AVNG_CP\n", - "\n", - "files_s3 = AVNG_CP['RFL s3'].tolist()\n", - "files_AVNG_geo = AVNG_CP['geometry'].tolist()\n", - "files_AVNG_geo\n", - "\n", - "#Visualize the selected flight lines\n", - "#m = AVNG_CP[['fid','geometry']].explore('fid')\n", - "m = AVNG_CP[['fid', 'geometry']].explore('fid', tiles='https://mt1.google.com/vt/lyrs=s&x={x}&y={y}&z={z}', attr='Google')\n", - "#explore('LandType', tiles='https://mt1.google.com/vt/lyrs=s&x={x}&y={y}&z={z}', attr='Google')\n", - "m\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a0884783-b56c-4cbd-af1e-910f9e61f621", - "metadata": {}, - "outputs": [], - "source": [ - "AVNG_CP.to_file('AVNG_CP.geojson', driver='GeoJSON')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "433d917f-df68-47f7-b0b1-de20be9dec9c", - "metadata": {}, - "outputs": [], - "source": [ - "AVNG_CP.crs" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f6cd5b47-3ce0-44b1-8087-1962eea2fdcb", - "metadata": {}, - "outputs": [], - "source": [ - "print(AVNG_CP['fid'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9ae176e1-b9d6-4dd1-b26a-7e4f21552738", - "metadata": {}, - "outputs": [], - "source": [ - "files_s3[26]" - ] - }, - { - "cell_type": "markdown", - "id": "ccbd8ad3-0924-45fa-9cf1-6d31f82a0eb4", - "metadata": { - "scrolled": true - }, - "source": [ - "#### The AVIRIS-NG files are also in S3 buckets in a BioSCape Science Managed Cloud Environment (SMCE). \n", - "- SMCE's support NASA Funded researchers by providing a secure hub to store and analyze data. These SMCE's are in AWS US-West. Workshop instructors are able to access these files. \n" - ] - }, - { - "cell_type": "markdown", - "id": "1abffda6-49a3-42cb-aacb-db81dca8b634", - "metadata": {}, - "source": [ - "#### S3 access is commented out for workshop participants" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0431d079-e0c9-4cab-b5d4-3c93fef145cf", - "metadata": {}, - "outputs": [], - "source": [ - "# Using BioSCape AWS Credentials to acces BioSCape SMCE\n", - "# import s3fs\n", - "# secret_key=\n", - "# access_key=\n", - "# token =\n", - "# fs = s3fs.S3FileSystem(anon=False, \n", - "# secret=secret_key,\n", - "# key=access_key,\n", - "# token=token)\n" - ] - }, - { - "cell_type": "markdown", - "id": "b2907fea-6581-4cff-8e0a-a73dcd01452c", - "metadata": {}, - "source": [ - "### Explore the BioSCape S3 Data Holdings\n", - "- **S3** = Amazon Simple Storage Service (S3) is a cloud storage service that allows users to store and retrieve data\n", - "- **S3 Bucket** = Buckets are the basic containers that hold data. Buckets can be likened to file folders and object storage\n", - "- **S3Fs** is a `Pythonic` open source tool that mounts S3 object storage locally. 
S3Fs provides a filesystem-like interface for accessing objects on S3.\n", - ">import s3fs\n", - ">\n", - ">fs = s3fs.S3FileSystem(anon=False)\n", - "\n", - "- The top-level class **`S3FileSystem`** holds connection information and allows typical file-system style operations like `ls`, `cp`, `mv`\n", - " - `ls` is a UNIX command to list computer files and directories" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3e2d210f-4a68-4d2c-bb1e-05cbd5b06ff5", - "metadata": {}, - "outputs": [], - "source": [ - "#fs.ls('bioscape-data/')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "14bea73f-6662-4ee8-8128-59bcc5fdb8aa", - "metadata": {}, - "outputs": [], - "source": [ - "#fs.ls('bioscape-data/AVNG_V2/')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "edd319d0-9366-455b-a8fe-b9072850fb8b", - "metadata": {}, - "outputs": [], - "source": [ - "#fs.ls('bioscape-data/AVNG_V2/ang20231109t133124/ang20231109t133124_005')" - ] - }, - { - "cell_type": "markdown", - "id": "f5bbdfa6-71d4-4eec-9769-0db095832947", - "metadata": {}, - "source": [ - "#### Single AVIRIS-NG flight scene Reflectance file **`ang20231109t133124_005_L2A_OE_0b4f48b4_RFL_ORT.nc`**" - ] - }, - { - "cell_type": "markdown", - "id": "607cc880-a891-4f53-8d47-7885497be10f", - "metadata": {}, - "source": [ - "### Open a single AVIRIS-NG Reflectance file to inspect the data" - ] - }, - { - "cell_type": "markdown", - "id": "5fa0e145-6eeb-4277-bbb6-0c4a813e722c", - "metadata": {}, - "source": [ - "- **`S3Fs`** can be used to mount S3 object storage locally\n", - "- **`xarray`** is an open source project and Python package that introduces labels in the form of dimensions, coordinates, and attributes on top of raw NumPy-like arrays" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ca1e1459-59f4-4c4e-ad9b-1a9c3a65a331", - "metadata": {}, - "outputs": [], - "source": [ - "## Sample code to open a file from an S3 bucket using S3Fs\n", - "\n", - "#rfl_netcdf = xr.open_datatree(fs.open(files_s3[26], 'rb'),\n", - "# engine='h5netcdf', chunks={})\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bfe9a4c8-e9cc-4b43-b7aa-39a7612dece0", - "metadata": {}, - "outputs": [], - "source": [ - "# For this workshop, we're using a local AVIRIS-NG scence \n", - "rfl_netcdf_2i2c = 'data/ang/ang20231109t134249_006_L2A_OE_0b4f48b4_RFL_ORT.nc'\n", - "rfl_netcdf_2i2c" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "13f66965-d621-4e25-b84f-0cc97ca3bf55", - "metadata": {}, - "outputs": [], - "source": [ - "#rfl_netcdf = xr.open_datatree(fs.open(files_s3[26], 'rb'),\n", - "# engine='h5netcdf', chunks={})\n", - "\n", - "rfl_netcdf = xr.open_datatree(rfl_netcdf_2i2c, engine='h5netcdf', chunks={})\n", - "rfl_netcdf = rfl_netcdf.reflectance.to_dataset()\n", - "rfl_netcdf = rfl_netcdf.reflectance.where(rfl_netcdf.reflectance>0)\n", - "rfl_netcdf" - ] - }, - { - "cell_type": "markdown", - "id": "068c45cf-27e2-440e-a9d2-e59281bcfb57", - "metadata": {}, - "source": [ - "### Plot a true color image" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "dda7578f-913b-4aef-9350-49a1f2f98cff", - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "h = rfl_netcdf.sel(wavelength=[660, 570, 480], method=\"nearest\").hvplot.rgb('easting', 'northing',\n", - " rasterize=True, data_aspect=1,\n", - " bands='wavelength', frame_width=400)\n", - "h" - ] - }, - { - "cell_type": "markdown", - "id": 
"87a64646-2def-49ba-b833-cfd32e605911", - "metadata": { - "scrolled": true - }, - "source": [ - "### Plot just a red reflectance" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7b06cab1-2ca6-4a25-85b3-a88c24592fb5", - "metadata": {}, - "outputs": [], - "source": [ - "h = rfl_netcdf.sel({'wavelength': 660},method='nearest').hvplot('easting', 'northing',\n", - " rasterize=True, data_aspect=1,\n", - " cmap='magma',frame_width=400,clim=(0,0.3))\n", - "h" - ] - }, - { - "cell_type": "markdown", - "id": "e73ef8ca-1f8b-4e85-913c-219adf1ebf7d", - "metadata": {}, - "source": [ - "### Extract Spectra for each Land Plot" - ] - }, - { - "cell_type": "markdown", - "id": "775b10c6-0ffa-42d7-a009-d9f0ba237c54", - "metadata": {}, - "source": [ - "#### Now that we are familiar with the data, we want to get the AVIRIS spectra at each label location. Below is a function that does this and returns the result as a xarray" - ] - }, - { - "cell_type": "markdown", - "id": "78d70841-d832-4f47-befb-d66704fba623", - "metadata": {}, - "source": [ - "Recall some files we created earlier:\n", - "- `files_s3` = list; S3 netCDF files directories from the Cape Penisula subset area\n", - "- `files_AVNG_geo` = list; coordinates of bounding boxes of the flight line scenes from the Cape Penisula area \n", - "- `class_data_utm` = gpd; Cape Penisula Land Types with UTM geography" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5dfa83e3-1a79-4854-9ba6-97e88a0d5fef", - "metadata": {}, - "outputs": [], - "source": [ - "#the function takes a filepath to a file on s3, and the point locations for extraction\n", - "#this function requires hitting files on the BioSCape SMCE\n", - "\n", - "# def extract_points(s3uri, geof, points):\n", - "# ds = xr.open_datatree(fs.open(s3uri, 'rb'), decode_coords='all',\n", - "# engine='h5netcdf', chunks='auto')\n", - " \n", - "# # Clip the raw data to the bounding box \n", - "# points = points.clip(geof)\n", - "# print(f'got {points.shape[0]} point from {s3uri}')\n", - "# points = points.to_crs(ds.transverse_mercator.crs_wkt)\n", - " \n", - " \n", - "# # Extract points\n", - "# #extracted = ds.to_dataset().xvec.extract_points(points['geometry'], x_coords=\"easting\", y_coords=\"northing\",index=True)\n", - "# extracted = ds.reflectance.to_dataset().xvec.extract_points(points['geometry'], \n", - "# x_coords=\"easting\", \n", - "# y_coords=\"northing\",\n", - "# index=True)\n", - "# return extracted" - ] - }, - { - "cell_type": "markdown", - "id": "89cfc6e0-261a-45a3-8aea-09136d9aa4de", - "metadata": {}, - "source": [ - "When we call the function, we'll iterate through the list of files (files_s3). Each file will overlap with several land class points." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "36321f50-3c41-4e9b-b8fb-c50abc8738c2", - "metadata": {}, - "outputs": [], - "source": [ - "# ds_all = [extract_points(file, geo, class_data_utm) for file, geo in zip(files_s3, files_AVNG_geo)]\n", - "# ds_all = xr.concat(ds_all, dim='file')\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c89d9db1-29c4-4d57-86c9-aa93ab207bb4", - "metadata": {}, - "outputs": [], - "source": [ - "#ds_all" - ] - }, - { - "cell_type": "markdown", - "id": "57e8bb0f-84b0-4e59-8cfe-96adceab6879", - "metadata": { - "scrolled": true - }, - "source": [ - "Because some points are covered by multiple AVIRIS scenes, some points have multiple spectra for each location, and thus we have an extra dim in this. 
We will simply extract the first valid reflectance measurement for each geometry. We have a custom function, `get_first_xr()`, to do this." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2eaa4021-7198-4a04-9f12-b3b41918dbb6", - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "# ds = get_first_xr(ds_all)\n", - "# ds" - ] - }, - { - "cell_type": "markdown", - "id": "895b4f9b-842a-43aa-9226-cb20c1106c17", - "metadata": {}, - "source": [ - "This dataset just has the spectra. We need to merge it with the point data to add the labels." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cea14253-bf16-42ba-a2eb-712634cac76d", - "metadata": {}, - "outputs": [], - "source": [ - "# class_xr =class_data_utm[['class','group']].to_xarray()\n", - "# ds = ds.merge(class_xr.astype(int),join='left')\n", - "# ds" - ] - }, - { - "cell_type": "markdown", - "id": "19eff926-f01d-4856-91ea-02b81572948b", - "metadata": {}, - "source": [ - "We have defined all the operations we want, but because of xarray's lazy computation, the calculations have not yet been performed. We will now force xarray to perform these calculations. We want to keep the result in chunks, so we use .persist() and not .compute(). This should take approximately 2-3 minutes." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6b1f1e08-dbed-4329-9777-7957ceef389f", - "metadata": {}, - "outputs": [], - "source": [ - "## DUE TO RUN TIME LENGTH, WE WILL NOT RUN THIS IN THE WORKSHOP - HAVE SAVED THIS OUTPUT FOR NEXT STEP\n", - "# with ProgressBar():\n", - "# dsp = ds.persist()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "318d800d-ad44-4840-a218-f055d1dfb389", - "metadata": {}, - "outputs": [], - "source": [ - "dsp = xr.open_dataset('dsp.nc')\n", - "dsp" - ] - }, - { - "cell_type": "markdown", - "id": "a7620adb-7dd3-44c7-99e6-03216d0c5c96", - "metadata": {}, - "source": [ - "### Inspect AVIRIS spectra" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3ca9c4ea-12ac-4718-867d-9b9953c2ade1", - "metadata": {}, - "outputs": [], - "source": [ - "# recall the class types\n", - "label_df" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cc0332b5-3eb5-4cc0-8860-c764f03f33b9", - "metadata": {}, - "outputs": [], - "source": [ - "dsp_plot = dsp.where(dsp['class']==5, drop=True)\n", - "h = dsp_plot['reflectance'].hvplot.line(x='wavelength',by='index',\n", - "                                        color='green', alpha=0.5,legend=False)\n", - "h" - ] - }, - { - "cell_type": "markdown", - "id": "32fd3b5e-e390-4eb5-9b21-ede5fdaddced", - "metadata": {}, - "source": [ - "> At this point in a real machine learning workflow, you should closely inspect the spectra you have for each class. Do they make sense? Are there some spectra that look weird? You should re-evaluate your data to make sure that the assigned labels are correct. This is a very important step." - ] - }, - { - "cell_type": "markdown", - "id": "2781095e-98e8-4a72-90b6-c2f0a0bd664c", - "metadata": {}, - "source": [ - "#### Prep data for ML model\n", - "\n", - "Not all of the wavelengths in the data are of equal quality; some are degraded by atmospheric water absorption features or other factors. We should remove the bands that we are not confident in from the analysis. The most rigorous way to do this is to use the uncertainties provided along with the reflectance files (sketched below for reference). Here we will simply use some prior knowledge to screen out the worst bands."
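- "\n",
- "For reference, a minimal sketch of that uncertainty-based screening (hypothetical: it assumes an uncertainty array `unc` with the same `index` and `wavelength` coordinates had been extracted alongside the reflectance, which we do not do in this workshop):\n",
- "\n",
- "```python\n",
- "# drop any band whose mean reflectance uncertainty exceeds a chosen threshold\n",
- "max_unc = 0.05  # illustrative threshold, not a recommended value\n",
- "noisy_bands = unc.wavelength.where(unc.mean(dim='index') > max_unc, drop=True)\n",
- "dsp = dsp.drop_sel(wavelength=noisy_bands)\n",
- "```"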
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "28ef2ccf-025a-43fc-8148-b224c13d42b1", - "metadata": {}, - "outputs": [], - "source": [ - "wavelengths_to_drop = dsp.wavelength.where(\n", - " (dsp.wavelength < 450) |\n", - " (dsp.wavelength >= 1340) & (dsp.wavelength <= 1480) |\n", - " (dsp.wavelength >= 1800) & (dsp.wavelength <= 1980) |\n", - " (dsp.wavelength > 2400), drop=True\n", - ")\n", - "\n", - "# Use drop_sel() to remove those specific wavelength ranges\n", - "dsp = dsp.drop_sel(wavelength=wavelengths_to_drop)\n", - "\n", - "mask = (dsp['reflectance'] > -1).all(dim='wavelength') # Create a mask where all values along 'z' are non-negative\n", - "dsp = dsp.sel(index=mask)\n", - "dsp" - ] - }, - { - "cell_type": "markdown", - "id": "b85b58ab-66b1-4559-b3fd-0da97abbf06b", - "metadata": {}, - "source": [ - "Next we will normalize the data, there are a number of difference normalizations to try. In a ML workflow you should try a few and see which work best. We will only use a Brightness Normalization. In essence, we scale the reflectance of each wavelength by the total brightness of the spectra. This retains info on important shape features and relative reflectance, and removes info on absolute reflectance." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "86886370-d346-4547-8db4-4a7b27e479aa", - "metadata": {}, - "outputs": [], - "source": [ - "# Calculate the L2 norm along the 'wavelength' dimension\n", - "l2_norm = np.sqrt((dsp['reflectance'] ** 2).sum(dim='wavelength'))\n", - "\n", - "# Normalize the reflectance by dividing by the L2 norm\n", - "dsp['reflectance'] = dsp['reflectance'] / l2_norm" - ] - }, - { - "cell_type": "markdown", - "id": "e53e157b-1e16-4a64-bb68-037a450c5585", - "metadata": {}, - "source": [ - "Plot the new, clean spectra" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fdea565b-f455-4aa3-8f76-ca6ae28d00fd", - "metadata": {}, - "outputs": [], - "source": [ - "dsp_norm_plot = dsp.where(dsp['class']==5, drop=True)\n", - "h = dsp_norm_plot['reflectance'].hvplot.line(x='wavelength',by='index',\n", - " color='green',ylim=(-0.01,0.2),alpha=0.5,legend=False)\n", - "h" - ] - }, - { - "cell_type": "markdown", - "id": "f38ac6b4-711a-4322-9209-e1d82345f232", - "metadata": {}, - "source": [ - "### Train and evaluate the ML model" - ] - }, - { - "cell_type": "markdown", - "id": "8628a508-5f72-429e-9fb1-09a441a2ec20", - "metadata": {}, - "source": [ - "We will be using a model called `xgboost`. There are many, many different kinds of ML models. `xgboost` is a class of models called gradient boosted trees, related to random forests. When used for classification, random forests work by creating multiple decision trees, each trained on a random subset of the data and features, and then averaging their predictions to improve accuracy and reduce overfitting. Gradient boosted trees differ in that they build trees sequentially, with each new tree focusing on correcting the errors of the previous ones. This sequential approach allows `xgboost` to create highly accurate models by iteratively refining predictions and addressing the weaknesses of earlier trees." - ] - }, - { - "cell_type": "markdown", - "id": "9bca1ea9-9cb3-427b-abaa-7ecbf237c5af", - "metadata": {}, - "source": [ - "Import the Machine Learning libraries we will use." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "73e9f2ba-072d-4f51-8314-8c002dd60803", - "metadata": {}, - "outputs": [], - "source": [ - "import xgboost as xgb\n", - "from sklearn.model_selection import GridSearchCV\n", - "from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, ConfusionMatrixDisplay" - ] - }, - { - "cell_type": "markdown", - "id": "013634d9-b759-450d-a7b8-80b41b171bb1", - "metadata": {}, - "source": [ - "Our dataset has a label indicating which set (training or test), our data belong to. We wil use this to split it" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4c2d3cff-1402-4415-bb9e-23e634ffdc54", - "metadata": {}, - "outputs": [], - "source": [ - "# recall groups\n", - "class_data_utm.groupby(['group']).size()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "812a85a0-2788-4b70-be09-b4ab315358be", - "metadata": {}, - "outputs": [], - "source": [ - "class_data_utm.crs" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f9d2c637-fd44-47ad-8c3d-c29510b3c4d3", - "metadata": {}, - "outputs": [], - "source": [ - "dtrain = dsp.where(dsp['group']==1,drop=True)\n", - "dtest = dsp.where(dsp['group']==2,drop=True)\n", - "\n", - "#create separte datasets for labels and features\n", - "y_train = dtrain['class'].values.astype(int)\n", - "y_test = dtest['class'].values.astype(int)\n", - "X_train = dtrain['reflectance'].values\n", - "X_test = dtest['reflectance'].values" - ] - }, - { - "cell_type": "markdown", - "id": "36f993d6-bed9-4f71-83b5-1a50c9034fb6", - "metadata": {}, - "source": [ - "#### Train ML model\n", - "The steps we will go through to train the model are:\n", - "\n", - "First, we define the hyperparameter grid. Initially, we set up a comprehensive grid (param_grid) with multiple values for several hyperparameters of the XGBoost model. \n", - "\n", - "Next, we create an XGBoost classifier object using the XGBClassifier class from the XGBoost library.\n", - "\n", - "We then set up the GridSearchCV object using our defined XGBoost model and the hyperparameter grid. GridSearchCV allows us to perform an exhaustive search over the specified hyperparameter values to find the optimal combination that results in the best model performance. We choose a 5-fold cross-validation strategy (cv=5), meaning we split our training data into five subsets to validate the model's performance across different data splits. We use accuracy as our scoring metric to evaluate the models.\n", - "\n", - "After setting up the grid search, we fit the GridSearchCV object to our training data (X_train and y_train). This process involves training multiple models with different hyperparameter combinations and evaluating their performance using cross-validation. Our goal is to identify the set of hyperparameters that yields the highest accuracy.\n", - "\n", - "Once the grid search completes, we print out the best set of hyperparameters and the corresponding best score. The grid_search.best_params_ attribute provides the combination of hyperparameters that achieved the highest cross-validation accuracy, while the grid_search.best_score_ attribute shows the corresponding accuracy score. Finally, we extract the best model (best_model) from the grid search results. 
This model is trained with the optimal hyperparameters and is ready for making predictions or further analysis in our classification task.\n", - "\n", - "This will take approx __30 seconds__" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d617f51d-b299-4c9b-810a-962cd8fa303a", - "metadata": {}, - "outputs": [], - "source": [ - "# Define the hyperparameter grid\n", - "param_grid = {\n", - " 'max_depth': [5],\n", - " 'learning_rate': [0.1],\n", - " 'subsample': [0.75],\n", - " 'n_estimators' : [50,100]\n", - "}\n", - "\n", - "# Create the XGBoost model object\n", - "xgb_model = xgb.XGBClassifier(tree_method='hist')\n", - "\n", - "# Create the GridSearchCV object\n", - "grid_search = GridSearchCV(xgb_model, param_grid, cv=5, scoring='accuracy')\n", - "\n", - "# Fit the GridSearchCV object to the training data\n", - "grid_search.fit(X_train, y_train)\n", - "\n", - "# Print the best set of hyperparameters and the corresponding score\n", - "print(\"Best set of hyperparameters: \", grid_search.best_params_)\n", - "print(\"Best score: \", grid_search.best_score_)\n", - "best_model = grid_search.best_estimator_" - ] - }, - { - "cell_type": "markdown", - "id": "8562cfe4-e227-4f64-bb01-4b19f4667e35", - "metadata": {}, - "source": [ - "### Evaluate model performance\n", - "\n", - "We will use our best model to predict the classes of the test data Then, we calculate the F1 score using f1_score, which balances precision and recall, and print it to evaluate overall performance.\n", - "\n", - "Next, we assess how well the model performs for predicting Pine trees by calculating its precision and recall. Precision measures the accuracy of the positive predictions. It answers the question, \"Of all the instances we labeled as Pines, how many were actually Pines?\". Recall measures the model's ability to identify all actual positive instances. It answers the question, \"Of all the actual Pines, how many did we correctly identify?\". You may also be familiar with the terms Users' and Producers' Accuracy. Precision = User' Accuracy, and Recall = Producers' Accuracy.\n", - "\n", - "Finally, we create and display a confusion matrix to visualize the model's prediction accuracy across all classes" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d97ca76e-2c9e-4d4c-a451-c3df092adc7c", - "metadata": {}, - "outputs": [], - "source": [ - "y_pred = best_model.predict(X_test)\n", - "\n", - "# Step 2: Calculate acc and F1 score for the entire dataset\n", - "acc = accuracy_score(y_test, y_pred)\n", - "print(f\"Accuracy: {acc}\")\n", - "\n", - "f1 = f1_score(y_test, y_pred, average='weighted') # 'weighted' accounts for class imbalance\n", - "print(f\"F1 Score (weighted): {f1}\")\n", - "\n", - "# Step 3: Calculate precision and recall for class 5 (Pine)\n", - "precision_class_5 = precision_score(y_test, y_pred, labels=[5], average='macro', zero_division=0)\n", - "recall_class_5 = recall_score(y_test, y_pred, labels=[5], average='macro', zero_division=0)\n", - "\n", - "print(f\"Precision for Class 5: {precision_class_5}\")\n", - "print(f\"Recall for Class 5: {recall_class_5}\")\n", - "\n", - "# Step 4: Plot the confusion matrix\n", - "conf_matrix = confusion_matrix(y_test, y_pred)\n", - "\n", - "ConfusionMatrixDisplay(confusion_matrix=conf_matrix).plot()\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "id": "b6899f56-6edf-4c54-9c98-6dfa353642f5", - "metadata": {}, - "source": [ - "### Skipping Some steps in Glenn's BioSCape Workshop Tutorial\n", - "`8.2.1.8. 
Interpret and understand ML model`\n", - "\n", - "https://ornldaac.github.io/bioscape_workshop_sa/tutorials/Machine_Learning/Invasive_AVIRIS.html#interpret-and-understand-ml-model" - ] - }, - { - "cell_type": "markdown", - "id": "86524208-33fd-4935-b623-275427972b27", - "metadata": {}, - "source": [ - "### Predict over an example AVIRIS scene\n", - "\n", - "We now have a trained model and are ready to deploy it to generate predictions across an entire AVIRIS scene and map the distribution of invasive plants. This involves handling a large volume of data, so we need to write the code to do this intelligently. We will accomplish this by applying the `.predict()` method of our trained model in parallel across the chunks of the AVIRIS xarray. The model will receive one chunk at a time so that the data is not too large, but it will be able to perform this operation in parallel across multiple chunks, and therefore will not take too long." - ] - }, - { - "cell_type": "markdown", - "id": "75ed9114-93ed-410b-9c41-84ab106f0efe", - "metadata": {}, - "source": [ - "This model was only trained on data covering natural vegetaton in the Cape Peninsula, It is important that we only predict in the areas that match our training data. We will therefore filter to scenes that cover the Cape Peninsula and mask out non-protected areas" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "974fce3f-dee4-45c9-874c-ec7b150cf902", - "metadata": {}, - "outputs": [], - "source": [ - "#south africa protected areas\n", - "SAPAD = (gpd.read_file('data/SAPAD_2024.gpkg')\n", - " .query(\"SITE_TYPE!='Marine Protected Area'\")\n", - " )\n", - "#SAPAD.plot()\n", - "#SAPAD.to_crs(\"EPSG:32734\")\n", - "SAPAD.crs" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0d3f20e7-0c23-4a40-b2af-8725dd4d721d", - "metadata": {}, - "outputs": [], - "source": [ - "# Get the bounding box of the training data\n", - "bbox = class_data_utm.total_bounds # (minx, miny, maxx, maxy)\n", - "#bbox\n", - "gdf_bbox = gpd.GeoDataFrame({'geometry': [box(*bbox)]}, crs=class_data_utm.crs) # Specify the CRS\n", - "gdf_bbox['geometry'] = gdf_bbox.buffer(500)\n", - "gdf_bbox.crs" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fdc16d41-14e2-4f35-a9cb-f6551af4b4f4", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b00ce1b9-b2c5-47a1-b3d2-9786b5f98e30", - "metadata": {}, - "outputs": [], - "source": [ - "#south africa protected areas\n", - "SAPAD = (gpd.read_file('data/SAPAD_2024.gpkg')\n", - " .query(\"SITE_TYPE!='Marine Protected Area'\")\n", - " )\n", - "SAPAD = SAPAD.to_crs(\"EPSG:32734\")\n", - "\n", - "# Get the bounding box of the training data\n", - "bbox = class_data_utm.total_bounds # (minx, miny, maxx, maxy)\n", - "gdf_bbox = gpd.GeoDataFrame({'geometry': [box(*bbox)]}, crs=class_data_utm.crs) # Specify the CRS\n", - "gdf_bbox['geometry'] = gdf_bbox.buffer(500)\n", - "\n", - "# protected areas that intersect with the training data\n", - "SAPAD_CT = SAPAD.overlay(gdf_bbox,how='intersection')\n", - "\n", - "#keep only AVIRIS scenes that intersects with CT protected areas\n", - "AVNG_sapad = AVNG_CP[AVNG_CP.intersects(SAPAD_CT.unary_union)]\n", - "\n", - "#a list of files to predict\n", - "files_sapad = AVNG_sapad['RFL s3'].tolist()\n", - "\n", - "#how many files?\n", - "len(files_sapad)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a4c134e3-f12f-4a17-aae7-898737fdfa41", - "metadata": {}, - 
"outputs": [], - "source": [ - "m = AVNG_sapad[['fid','geometry']].explore('fid')\n", - "m" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "17490875-d8c9-4256-ab5f-576e944097c2", - "metadata": {}, - "outputs": [], - "source": [ - "SAPAD.keys()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7ee0c900-56c1-49a1-be6c-0edddae355b4", - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "#map = AVNG_Coverage[['fid', 'geometry']].explore('fid')\n", - "map = SAPAD[['SITE_TYPE', 'geometry']].explore('SITE_TYPE')\n", - "map" - ] - }, - { - "cell_type": "markdown", - "id": "88ed40f0-2b5e-4970-8493-082f38a2de93", - "metadata": {}, - "source": [ - "Here is the function that we will actually apply to each chunk. Simple really. The hard work is getting the data into and out of this functiON" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0158fee1-4b09-4999-acfd-f5c4dd8ed19d", - "metadata": {}, - "outputs": [], - "source": [ - "def predict_on_chunk(chunk, model):\n", - " probabilities = model.predict_proba(chunk)\n", - " return probabilities" - ] - }, - { - "cell_type": "markdown", - "id": "7c2d8e1b-c32d-4911-9a7c-21f6cfa449e2", - "metadata": {}, - "source": [ - "Now we define the funciton that takes as input the path to the AVIRIS file and pass the data to the predict function. THhs is composed of 4 parts:\n", - "\n", - "Part 1: Opens the AVIRIS data file using xarray and sets a condition to identify valid data points where reflectance values are greater than zero.\n", - "\n", - "Part 2: Applies all the transformations that need to be done before the data goes to the model. It the spatial dimensions (x and y) into a single dimension, filters wavelengths, and normalizes the reflectance data.\n", - "\n", - "Part 3: Applies the machine learning model to the normalized data in parallel, predicting class probabilities for each data point using xarray's apply_ufunc method. Most of the function invloves defining what to do with the dimensions of the old dataset and the new output\n", - "\n", - "Part 4: Unstacks the data to restore its original dimensions, sets spatial dimensions and coordinate reference system (CRS), clips the data, and transposes the data to match expected formats before returning the results." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7ed0c90c-c056-4637-9945-a6a4d21af109", - "metadata": {}, - "outputs": [], - "source": [ - "def predict_xr(file,geometries):\n", - "\n", - " #part 1 - opening file\n", - " #open the file\n", - " print(f'file: {file}')\n", - " ds = xr.open_datatree(rfl_netcdf_2i2c, engine='h5netcdf', decode_coords=\"all\",\n", - " chunks='auto')\n", - "\n", - " #get the geometries of the protected areas for masking\n", - " ds_crs = ds.transverse_mercator.crs_wkt\n", - " geometries = geometries.to_crs(ds_crs).geometry.apply(mapping)\n", - "\n", - " #condition to use for masking no data later\n", - " condition = (ds['reflectance'] > -1).any(dim='wavelength')\n", - "\n", - " #stack the data into a single dimension. 
This will be important for applying the model later\n", - " ds = ds.reflectance.to_dataset().stack(sample=('easting','northing'))\n", - " \n", - " #part 2 - pre-processing\n", - " #remove bad wavelenghts\n", - " wavelengths_to_drop = ds.wavelength.where(\n", - " (ds.wavelength < 450) |\n", - " (ds.wavelength >= 1340) & (ds.wavelength <= 1480) |\n", - " (ds.wavelength >= 1800) & (ds.wavelength <= 1980) |\n", - " (ds.wavelength > 2400), drop=True\n", - " )\n", - " # Use drop_sel() to remove those specific wavelength ranges\n", - " ds = ds.drop_sel(wavelength=wavelengths_to_drop)\n", - " \n", - " #normalise the data\n", - " l2_norm = np.sqrt((ds['reflectance'] ** 2).sum(dim='wavelength'))\n", - " ds['reflectance'] = ds['reflectance'] / l2_norm\n", - "\n", - " \n", - " #part 3 - apply the model over chunks\n", - " result = xr.apply_ufunc(\n", - " predict_on_chunk,\n", - " ds['reflectance'].chunk(dict(wavelength=-1)),\n", - " input_core_dims=[['wavelength']],#input dim with features\n", - " output_core_dims=[['class']], # name for the new output dim\n", - " exclude_dims=set(('wavelength',)), #dims to drop in result\n", - " output_sizes={'class': 9}, #length of the new dimension\n", - " output_dtypes=[np.float32],\n", - " dask=\"parallelized\",\n", - " kwargs={'model': best_model}\n", - " )\n", - "\n", - " #part 4 - post-processing\n", - " result = result.where((result >= 0) & (result <= 1), np.nan) #valid values\n", - " result = result.unstack('sample') #remove the stack\n", - " result = result.rio.set_spatial_dims(x_dim='easting',y_dim='northing') #set the spatial dims\n", - " result = result.rio.write_crs(ds_crs) #set the CRS\n", - " result = result.rio.clip(geometries) #clip to the protected areas and no data\n", - " result = result.transpose('class', 'northing', 'easting') #transpose the data rio expects it this way\n", - " return result " - ] - }, - { - "cell_type": "markdown", - "id": "6ed10603-660c-49cd-a40c-1702201631b8", - "metadata": {}, - "source": [ - "Let's test that it works on a single file before we run it through 100s of GB of data." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "96e10e7e-2514-4a62-b967-0a83d1c5d6db", - "metadata": {}, - "outputs": [], - "source": [ - "#files_sapad[25]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "aeb17f14-5cf0-4684-a293-02db53dd3b0b", - "metadata": {}, - "outputs": [], - "source": [ - "test = predict_xr(rfl_netcdf_2i2c,SAPAD_CT)\n", - "test" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "faec501e-747c-41f9-a301-eebdf58319d8", - "metadata": {}, - "outputs": [], - "source": [ - "label_df" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "48c27393-076f-4b8a-b0bb-3ac628a8e8dc", - "metadata": {}, - "outputs": [], - "source": [ - "test = test.rio.reproject(\"EPSG:4326\",nodata=np.nan)\n", - "h = test.isel({'class':5}).hvplot(tiles=hv.element.tiles.EsriImagery(), \n", - " project=True,rasterize=True,clim=(0,1),\n", - " cmap='magma',frame_width=400,data_aspect=1,alpha=0.5)\n", - "h" - ] - }, - { - "cell_type": "markdown", - "id": "65cbe64a-fc6b-427b-bb72-cf13ab4259b7", - "metadata": {}, - "source": [ - "ML models typically provide a single prediction of the most likely outcomes. You can also get probability-like scores (values from 0 to 1) from these models, but they are not true probabilities. If the model gives you a score of 0.6, that means it is more likely than a prediction of 0.5, and less likely than 0.7. 
However, it does not mean that in a large sample your prediction would be right 60 times out of 100. To get calibrated probabilities from our models, we have to apply additional steps. We can also get a set of predictions from a model rather than a single prediction, reflecting the model's true uncertainty, using a technique called conformal prediction. Read more about conformal prediction for geospatial machine learning in this amazing paper:\n", - "\n", - "[Singh, G., Moncrieff, G., Venter, Z., Cawse-Nicholson, K., Slingsby, J., & Robinson, T. B. (2024). Uncertainty quantification for probabilistic machine learning in earth observation using conformal prediction. Scientific Reports, 14(1), 16166.](https://www.nature.com/articles/s41598-024-65954-w)" - ] - }, - { - "cell_type": "markdown", - "id": "06e7ad3a-5549-45f6-b2a7-d301c2993366", - "metadata": {}, - "source": [ - "### Final steps of the full ML classification are time-intensive and are not described in this workshop\n", - "\n", - "Steps in Glenn Moncrieff's BioSCape Workshop Tutorial:\n", - "\n", - "`8.2.1.10. Merge and mosaic results`\n", - "- https://ornldaac.github.io/bioscape_workshop_sa/tutorials/Machine_Learning/Invasive_AVIRIS.html#merge-and-mosaic-results" - ] - }, - { - "cell_type": "markdown", - "id": "a205f922-6cbc-4da0-91fc-75723694b5b3", - "metadata": {}, - "source": [ - "### CREDITS: \n", - "\n", - "Find all of the October 2024 BioSCape Data Workshop Materials/Notebooks:\n", - "\n", - "- https://ornldaac.github.io/bioscape_workshop_sa/intro.html\n", - "\n", - "This notebook is an adaptation of **Glenn Moncrieff**'s BioSCape Data Workshop notebook: [**Mapping invasive species using supervised machine learning and AVIRIS-NG**](https://ornldaac.github.io/bioscape_workshop_sa/tutorials/Machine_Learning/Invasive_AVIRIS.html)\n", - "- This notebook accesses and uses an updated version of the AVIRIS-NG data with improved corrections, provided in netCDF format.\n", - "\n", - "Glenn's lesson borrowed from:\n", - "\n", - "- [``Land cover mapping example on Microsoft Planetary Computer``](https://planetarycomputer.microsoft.com/docs/tutorials/landcover)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "013472a8-9f07-449a-966c-280d989ae060", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.11" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -}