Commit

Merge pull request #4 from Forest-Recovery-Digital-Companion/FRML-21
Update Integration Tests to better suit Researcher Use Cases
Eve-ning authored Sep 20, 2023
2 parents ad54787 + 9e32530 commit f017e94
Showing 18 changed files with 471 additions and 333 deletions.
29 changes: 2 additions & 27 deletions rsc/README.md
@@ -1,36 +1,11 @@
# FRDC Resources

We have 2 types of resources:

1) Raw: Uncompressed, raw data from our UAV drones
2) Debug: Compressed version of the raw data used for
1) Unit Testing
2) Integration Testing
3) Experimentation Debugging

The differences are:
1) Debug is stored lossy, in `.jpeg` format, while Raw is lossless, in `.tiff`. (A sketch of producing a debug copy follows this list.)
2) Debug is committed to Git, while Raw is fetched only on demand.
3) Debug is used in our tests, while Raw is used only if absolutely necessary. This is to reduce I/O costs.
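A minimal sketch of how a debug copy might be produced from a raw capture (the function name, paths, and JPEG quality are illustrative, and 8-bit imagery is assumed; the actual pipeline may differ):

```python
from pathlib import Path

from PIL import Image


def make_debug_copy(raw_tif: Path, debug_dir: Path, quality: int = 85) -> Path:
    """Compress a lossless raw .tif into a lossy .jpeg debug copy."""
    debug_dir.mkdir(parents=True, exist_ok=True)
    out = debug_dir / raw_tif.with_suffix(".jpeg").name
    Image.open(raw_tif).convert("RGB").save(out, quality=quality)
    return out
```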

## Structure
The file structure should follow this layout:

Each folder with data should have the following files:
- `result.tif`
- `result_Blue.tif`
- `result_Green.tif`
- `result_NIR.tif`
- `result_Red.tif`
- `result_RedEdge.tif`
- `bounds.csv` _(Optional: Will eventually be deprecated)_
```
surveyed-site/
    survey-date/
        result.tif
        result_Blue.tif
        result_Green.tif
        result_NIR.tif
        result_Red.tif
        result_RedEdge.tif
        bounds.csv
```
You shouldn't need to touch this; it will be used to cache large raw `.tif` files. A small check of the expected layout is sketched below.
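The sketch mentioned above (it assumes folder and file names match the structure exactly):

```python
from pathlib import Path

EXPECTED_FILES = ["result.tif", "result_Blue.tif", "result_Green.tif",
                  "result_NIR.tif", "result_Red.tif", "result_RedEdge.tif"]


def missing_files(survey_date_dir: Path) -> list[str]:
    """Return the expected band files absent from a survey-date folder."""
    return [name for name in EXPECTED_FILES if not (survey_date_dir / name).exists()]
```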
4 changes: 2 additions & 2 deletions src/frdc/load/__init__.py
@@ -1,3 +1,3 @@
-from .frdc_dataset import FRDCDataset
+from .dataset import FRDCDownloader, FRDCDataset

-__all__ = ['FRDCDataset']
+__all__ = ['FRDCDownloader', 'FRDCDataset']
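With this change, both classes are importable from the package root, e.g. (assuming the package is installed):

```python
from frdc.load import FRDCDownloader, FRDCDataset
```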
162 changes: 162 additions & 0 deletions src/frdc/load/dataset.py
@@ -0,0 +1,162 @@
from __future__ import annotations

import base64
import hashlib
import logging
from dataclasses import dataclass, field
from pathlib import Path
from typing import Iterable

import numpy as np
import pandas as pd
from PIL import Image
from google.cloud import storage
from google.oauth2.service_account import Credentials

from frdc.conf import LOCAL_DATASET_ROOT_DIR, SECRETS_DIR, GCS_PROJECT_ID, GCS_BUCKET_NAME, Band


@dataclass
class FRDCDownloader:
    credentials: Credentials | None = None
local_dataset_root_dir: Path = LOCAL_DATASET_ROOT_DIR
project_id: str = GCS_PROJECT_ID
bucket_name: str = GCS_BUCKET_NAME
bucket: storage.Bucket = field(init=False)

def __post_init__(self):
        # Resolve the credentials here instead of in the field default, so the try-except below can
        # raise a clear FileNotFoundError if no service account JSON is found.
if self.credentials is None:
try:
self.credentials = Credentials.from_service_account_file(next(SECRETS_DIR.glob("*.json")).as_posix())
except StopIteration:
raise FileNotFoundError(f"No credentials found in {SECRETS_DIR.as_posix()}")

client = storage.Client(project=self.project_id, credentials=self.credentials)
self.bucket = client.bucket(self.bucket_name)

    def list_gcs_datasets(self, anchor=Band.FILE_NAMES[0]) -> pd.Series:
        """ Lists all datasets from Google Cloud Storage.
        Args:
            anchor: The anchor file used to locate each dataset. For example, to find the dataset
                "chestnut_nature_park/20201218/183deg/result_Red.tif", we can use "result_Red.tif" as the
                anchor file.
        Returns:
            A Series of unique dataset directories, e.g. "chestnut_nature_park/20201218/183deg".
        """

# The anchor file to find the dataset
# E.g. "result_Red.tif"
df = (
# The list of all blobs in the bucket that contains the anchor file
# E.g. "chestnut_nature_park/20201218/183deg/result_Red.tif"
pd.Series([blob.name for blob in self.bucket.list_blobs(match_glob=f"**/{anchor}")])
# Remove the anchor file name
# E.g. "chestnut_nature_park/20201218/183deg"
.str.replace(f"/{anchor}", "")
.rename("dataset_dir")
.drop_duplicates()
)

return df
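    # Usage sketch (illustrative bucket contents): if the bucket holds
    # "chestnut_nature_park/20201218/183deg/result_Red.tif", this method
    # returns a Series containing "chestnut_nature_park/20201218/183deg".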

def download_file(self, *, path: Path | str, local_exists_ok: bool = True) -> Path:
""" Downloads a file from Google Cloud Storage. If the file already exists locally, and the hashes match, it
will not download the file.
Args:
path: Path to the file in GCS.
local_exists_ok: If True, will not raise an error if the file already exists locally and the hashes match.
Examples:
If our file in GCS is in gs://frdc-scan/casuarina/20220418/183deg/result_Blue.tif
then we can download it with:
>>> download_file(path=Path("casuarina/20220418/183deg/result_Blue.tif"))
Raises:
FileNotFoundError: If the file does not exist in GCS.
FileExistsError: If the file already exists locally and the hashes match.
Returns:
The local path to the downloaded file.
"""
local_path = self.local_dataset_root_dir / path
gcs_path = path.as_posix() if isinstance(path, Path) else path
gcs_blob = self.bucket.blob(gcs_path)

# If not exists in GCS, raise error
if not gcs_blob.exists():
raise FileNotFoundError(f"{gcs_path} does not exist in GCS.")

        # If the file exists locally and the hashes match, skip the download (or raise if local_exists_ok is False)
if local_path.exists():
gcs_blob.reload() # Necessary to get the md5_hash
gcs_hash = base64.b64decode(gcs_blob.md5_hash).hex()
            local_hash = hashlib.md5(local_path.read_bytes()).hexdigest()
logging.debug(f"Local hash: {local_hash}, GCS hash: {gcs_hash}")
if gcs_hash == local_hash:
if local_exists_ok:
# If local_exists_ok, then don't raise
return local_path
else:
raise FileExistsError(f"{local_path} already exists and hashes match.")

# Else, download
logging.info(f"Downloading {gcs_blob.name} to {local_path}...")
local_path.parent.mkdir(parents=True, exist_ok=True)
gcs_blob.download_to_filename(local_path.as_posix())
return local_path


@dataclass
class FRDCDataset:
site: str
date: str
version: str | None
dl: FRDCDownloader = field(default_factory=FRDCDownloader)

@staticmethod
def _load_debug_dataset() -> FRDCDataset:
""" Loads a debug dataset from Google Cloud Storage.
Returns:
A dictionary of the dataset, with keys as the filenames and values as the images.
"""
return FRDCDataset(site='DEBUG', date='0', version=None)

@property
def dataset_dir(self):
return Path(f"{self.site}/{self.date}/{self.version + '/' if self.version else ''}")
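    # e.g. FRDCDataset(site="DEBUG", date="0", version=None).dataset_dir == Path("DEBUG/0")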

    def get_ar_bands(self, band_names=Band.FILE_NAMES) -> np.ndarray:
        """ Downloads the band files and stacks them into a single (H, W, C) array. """
        bands_dict = {}
        for band_name in band_names:
            fp = self.dl.download_file(path=self.dataset_dir / band_name)
            ar_im = self._load_image(fp)
            bands_dict[band_name] = ar_im

        # Stack the bands in the canonical Band.FILE_NAMES order for a consistent channel layout,
        # skipping any bands that weren't requested.
        return np.stack([bands_dict[name] for name in Band.FILE_NAMES if name in bands_dict], axis=-1)

    def get_bounds_and_labels(self, file_name='bounds.csv') -> tuple[Iterable[Iterable[int]], Iterable[str]]:
        """ Gets the bounds and labels from the bounds CSV, which must have columns x0, y0, x1, y1, name.
        Returns:
            A tuple of (bounds, labels): bounds as (x0, y0, x1, y1) tuples, labels as the matching names.
        """
        fp = self.dl.download_file(path=self.dataset_dir / file_name)
        df = pd.read_csv(fp)
        return [(i.x0, i.y0, i.x1, i.y1) for i in df.itertuples()], df['name'].tolist()

@staticmethod
def _load_image(path: Path | str) -> np.ndarray:
""" Loads an Image from a path.
Args:
path: Path to image. pathlib.Path is preferred, but str is also accepted.
Returns:
Image as numpy array.
"""

im = Image.open(Path(path).as_posix())
return np.array(im)
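Taken together, a typical researcher workflow with the new module might look like the sketch below. The site/date/version values are illustrative (borrowed from the docstring example above), and running it requires valid GCS credentials in `SECRETS_DIR`:

```python
from frdc.load import FRDCDownloader, FRDCDataset

# List every dataset directory currently in the bucket.
dl = FRDCDownloader()
print(dl.list_gcs_datasets())

# Load one survey; site/date/version mirror the GCS directory layout.
ds = FRDCDataset(site="chestnut_nature_park", date="20201218", version="183deg")
ar = ds.get_ar_bands()                       # (H, W, C) array of all bands
bounds, labels = ds.get_bounds_and_labels()  # tree bounds and label names
```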
186 changes: 0 additions & 186 deletions src/frdc/load/frdc_dataset.py

This file was deleted.

