
Commit

Merge branch 'f/dresden_capsule_integration' into 'main'
Dresden Capsule Dataset Integration

See merge request es/ai/hannah/hannah!371
cgerum committed Feb 22, 2024
2 parents 0cee578 + 10e05b0 commit ebebd62
Showing 3 changed files with 65 additions and 14 deletions.
2 changes: 1 addition & 1 deletion experiments/dresden_capsule/config.yaml
@@ -31,7 +31,7 @@ dataset:
   data_folder: ${oc.env:HANNAH_DATA_FOLDER,${hydra:runtime.cwd}/../../datasets/}
 
 module:
-  batch_size: 32
+  batch_size: 128
 
 trainer:
   max_epochs: 15
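The data_folder default above uses OmegaConf's oc.env resolver, which falls back to the value after the comma when the environment variable is unset (the ${hydra:runtime.cwd} part additionally requires Hydra at runtime). A minimal standalone sketch of the fallback behaviour, with an illustrative default path:

    from omegaconf import OmegaConf

    # ${oc.env:VAR,default} resolves to the environment variable VAR if it is
    # set, otherwise to the default given after the comma.
    cfg = OmegaConf.create({"data_folder": "${oc.env:HANNAH_DATA_FOLDER,/tmp/datasets}"})
    print(cfg.data_folder)  # value of HANNAH_DATA_FOLDER, or /tmp/datasets if unset
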
14 changes: 13 additions & 1 deletion hannah/conf/dataset/dresden_capsule.yaml
@@ -25,5 +25,17 @@ dataset: dresden_capsule
 sampler: random
 weighted_loss: false
 
-task: section
+task: sections # Check splits folder for other task options.
 split: split_0
+
+downsampling:
+  enabled: true
+  ratio:
+    binary: 1 # ratio normal : anomaly
+    # Proportions of each class that should be used.
+    sections: [1, 1, 0.05, 0.01, 0.005]
+    technical_multilabel_bubbles_dirt: [1, 1]
+    technical_multiclass_view: [1, 1, 1]
+  anomalies_fraction: 0.3 # only relevant for binary task
+
+seed: 1234
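
For intuition: for the multiclass tasks, class i keeps int(ratio[i] * class_size) samples, as implemented in downsampling() in dresden_capsule.py below; for the binary task, int(anomalies_fraction * n_anomalies) anomaly samples are kept and the normal class is resampled to binary times that count. A quick sketch of the multiclass case with invented class sizes:

    # Hypothetical per-class image counts for the five "sections" classes.
    class_counts = [2000, 1500, 40000, 90000, 200000]
    ratios = [1, 1, 0.05, 0.01, 0.005]

    # Each class keeps int(ratio * count) samples.
    kept = [int(r * n) for r, n in zip(ratios, class_counts)]
    print(kept)  # [2000, 1500, 2000, 900, 1000]
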
63 changes: 51 additions & 12 deletions hannah/datasets/vision/dresden_capsule.py
@@ -25,29 +25,65 @@
 
 import logging
 import pathlib
+import shutil
 
 import numpy as np
 import pandas as pd
 import torchvision
 from albumentations.pytorch import ToTensorV2
 import albumentations as A
+import tqdm
 
+from sklearn.utils import resample
 from .base import ImageDatasetBase
 
 logger = logging.getLogger(__name__)
 
 
 def prepare_data(study_folder: pathlib.Path, data: pd.DataFrame):
     label_names = list(data.columns)[:-1]
     files = [study_folder / image for image in data["path"].to_list()]
-    labels = np.argmax(data.iloc[:, :-1].values, axis=1)
+
+    if len(label_names) > 1:  # True for the section and technical tasks
+        labels = np.argmax(data.iloc[:, :-1].values, axis=1)
+    else:  # Assuming a binary task (some anomaly vs. normal)
+        label_names.insert(0, "normal")
+        labels = np.max(data.iloc[:, :-1].values, axis=1)
+
     labels = [label_names[x] for x in labels]
 
     return files, labels, label_names
+
+
+def downsampling(X: list, y: list, labels: list, config: dict):
+    task = config.task
+    seed = config.seed
+    y = np.array(y)
+
+    if task in ("sections", "technical_multiclass_view", "technical_multilabel_bubbles_dirt"):
+        idx_resampled = np.empty(0, dtype=int)
+        for i in range(len(labels)):
+            idx_y = np.where(y == labels[i])[0]  # indices of a single class
+            n_samples = int(config.downsampling.ratio[task][i] * len(idx_y))
+            # Downsample this class to n_samples.
+            idx_resampled_temp = resample(idx_y, n_samples=n_samples, random_state=seed)
+            idx_resampled = np.concatenate([idx_resampled, idx_resampled_temp])
+    else:  # Assuming the binary task
+        ratio = config.downsampling.ratio.binary
+        normal_idx = np.where(y == labels[0])[0]
+        anomaly_idx = np.where(y == labels[1])[0]
+        n_samples = int(len(anomaly_idx) * config.downsampling.anomalies_fraction)
+        idx_resampled_anomaly = resample(anomaly_idx, n_samples=n_samples, random_state=seed)
+        idx_resampled_normal = resample(normal_idx, n_samples=int(n_samples * ratio), random_state=seed)
+        idx_resampled = np.concatenate([idx_resampled_anomaly, idx_resampled_normal])
+
+    ordered_idx_resampled = np.sort(idx_resampled)
+    y = y[ordered_idx_resampled]
+    X = np.array(X)[ordered_idx_resampled]
+    assert len(X) == len(y)
+
+    return X, y
 
 
 class DresdenCapsuleDataset(ImageDatasetBase):
     @classmethod
@@ -56,9 +92,9 @@ def prepare(cls, config):
 
     @classmethod
     def splits(cls, config):
-        data_folder = pathlib.Path(config["data_folder"]) / "dresden_capsule"
+        data_folder = pathlib.Path(config["data_folder"]) / "dresden-capsule"
         study_folder = data_folder / "images"
-        split_folder = data_folder / "splits" / config.task
+        split_folder = data_folder / "splits_tuebingen" / config.task
 
         test_data = pd.read_csv(split_folder / "test.csv")
         val_data = pd.read_csv(split_folder / config.split / "val.csv")
@@ -68,13 +104,16 @@ def splits(cls, config):
         X_val, y_val, labels = prepare_data(study_folder, val_data)
         X_test, y_test, labels = prepare_data(study_folder, test_data)
 
+        # Resampling
+        if config.downsampling.enabled:
+            X_train, y_train = downsampling(X_train, y_train, labels, config)
+            X_val, y_val = downsampling(X_val, y_val, labels, config)
+
         transform = A.Compose([A.augmentations.geometric.resize.Resize(config.sensor.resolution[0], config.sensor.resolution[1]), ToTensorV2()])
+        test_transform = A.Compose([A.augmentations.geometric.resize.Resize(config.sensor.resolution[0], config.sensor.resolution[1]), ToTensorV2()])
         train_set = cls(X_train, y_train, labels, transform=transform)
-        val_set = cls(X_val, y_val, labels)
-        test_set = cls(X_test, y_test, labels)
-
-        # RANDOM, RANDOM_PER_STUDY Splits
-        # preprocessing,
+        val_set = cls(X_val, y_val, labels, transform=test_transform)
+        test_set = cls(X_test, y_test, labels, transform=test_transform)
 
         return (
             train_set,
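A toy sanity check of the updated prepare_data(), using a fabricated one-hot annotation frame in the layout the function expects (label columns first, "path" last); the column names and paths here are invented, and the import assumes the hannah package is installed:

    import pathlib

    import pandas as pd

    from hannah.datasets.vision.dresden_capsule import prepare_data

    # One-hot label columns, with the relative image path in the last column.
    df = pd.DataFrame(
        {
            "mouth": [1, 0, 0],
            "stomach": [0, 1, 0],
            "intestine": [0, 0, 1],
            "path": ["a.png", "b.png", "c.png"],
        }
    )

    files, labels, label_names = prepare_data(pathlib.Path("/data/images"), df)
    print(labels)       # ['mouth', 'stomach', 'intestine']
    print(label_names)  # ['mouth', 'stomach', 'intestine']

With a single label column, the new else-branch inserts "normal" at index 0 and takes the row-wise max, so an all-zero row is labelled "normal".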
