Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Work in progress: Multiclass possible now #62

Merged
merged 25 commits into from
May 28, 2024
Merged
Show file tree
Hide file tree
Changes from 18 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
cb69197
Work in progress: Multiclass possible now
May 23, 2024
99353f2
Merge branch 'main' of https://github.com/mlederbauer/NMRcraft into f…
May 23, 2024
d86a2b8
fix: standard scaling
May 23, 2024
3a17d15
feat: simplified data loader
May 23, 2024
0eb6bad
chore: refactor name to dataloader
May 23, 2024
c486683
feat: add more columns to results df
May 24, 2024
ae94e2f
move default args of dataloader to classifier
May 24, 2024
b6508db
test multiple targets
May 24, 2024
7246f14
feat: removed data folder from gitignore
May 25, 2024
1c83a3b
fix: change absolute path of data file
May 26, 2024
4aec0dd
feat: barebone baslines script
May 26, 2024
3bdb20a
fix: testing for now lol
May 26, 2024
4af718e
feat: functional multiclass models
kbiniek May 26, 2024
c5ee67a
resolve conflicts
kbiniek May 26, 2024
49c820f
Merge pull request #68 from mlederbauer/chore/minimal-dataloader
kbiniek May 26, 2024
b32dcb5
feat: working baseline
May 26, 2024
168e266
Merge pull request #69 from mlederbauer/feat/baselines
kbiniek May 26, 2024
0be3994
feat: new evaluation
May 26, 2024
ebe51ff
fix: targets instead of y_labels
May 26, 2024
1d0a7a4
feat: add default parameters to DataLoader
kbiniek May 26, 2024
01de0b0
Merge pull request #70 from mlederbauer/feat/unify-evaluation
kbiniek May 26, 2024
8598603
feat: fix confusion matrix plot and bootstrap
kbiniek May 26, 2024
9fddfe5
feat: add multioutput models
kbiniek May 27, 2024
2b77d23
Feat: Added statistics for the bootstrapped Metrics
May 27, 2024
8ac7432
Chore/47 plotting functions (#71)
mlederbauer May 27, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
mlruns/
scratch/
dataset/
data/
plots/
data/

docs/source

Expand Down
76 changes: 76 additions & 0 deletions nmrcraft/data/data_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
"""Load and preprocess data."""

import os

import pandas as pd
from datasets import load_dataset


class DatasetLoadError(FileNotFoundError):
    """Exception raised when the raw dataset file could not be loaded,
    even after attempting to regenerate it from Hugging Face."""

    def __init__(self, t):
        message = f"Could not load raw Dataset '{t}'"
        super().__init__(message)


class InvalidTargetError(ValueError):
    """Exception raised when the specified target column is not a valid target.

    (The previous docstring mentioned "model name", but the message shows
    this is raised for invalid *targets*.)
    """

    def __init__(self, t):
        super().__init__(f"Invalid target '{t}'")


def filename_to_ligands(dataset: pd.DataFrame) -> pd.DataFrame:
    """
    Extract ligands from the filename and add them as columns to the dataset.

    Filenames are assumed to be underscore-separated in the order
    metal_geometry_E_X1_X2_X3_X4[_L]. Missing trailing parts become NaN,
    except L_ligand which defaults to "none".

    Args:
        dataset: DataFrame with a "file_name" column; modified in place.

    Returns:
        pd.DataFrame: The same DataFrame with the ligand columns added.
    """
    filename_parts = dataset["file_name"].str.split("_", expand=True)
    part_columns = [
        "metal",
        "geometry",
        "E_ligand",
        "X1_ligand",
        "X2_ligand",
        "X3_ligand",
        "X4_ligand",
        "L_ligand",
    ]
    for idx, column in enumerate(part_columns):
        # .get(idx) returns None when no filename has that many parts;
        # fall back to an all-NaN column so the assignment (and the
        # fillna below) never crashes on short filenames.
        series = filename_parts.get(idx)
        dataset[column] = series if series is not None else float("nan")
    # Complexes without an L ligand are encoded as 'none'.
    dataset["L_ligand"] = dataset["L_ligand"].fillna("none")
    return dataset


def load_dummy_dataset_locally(datset_path: str = "tests/data.csv"):
    """Read the small dummy dataset used for testing from a local CSV file.

    Args:
        datset_path: Path to the CSV file. Defaults to "tests/data.csv".

    Returns:
        pandas.DataFrame: The dummy dataset.
    """
    return pd.read_csv(datset_path)


def load_dataset_from_hf(
    dataset_name: str = "NMRcraft/nmrcraft", data_files: str = "all_no_nan.csv"
):
    """Load the dataset, caching it locally under dataset/dataset.csv.

    This function loads the dataset using the specified dataset name and data files.
    It assumes that you have logged into the Hugging Face CLI prior to calling this function.

    Args:
        dataset_name (str, optional): The name of the dataset. Defaults to "NMRcraft/nmrcraft".
        data_files (str, optional): The name of the data file. Defaults to 'all_no_nan.csv'.

    Returns:
        pandas.DataFrame: The loaded dataset as a pandas DataFrame.

    Raises:
        DatasetLoadError: If the dataset could not be downloaded from
            Hugging Face and no cached copy exists.
    """
    cache_path = os.path.join("dataset", "dataset.csv")
    # Download and cache the dataset on first use.
    if not os.path.isfile(cache_path):
        os.makedirs("dataset", exist_ok=True)
        try:
            dataset = load_dataset(dataset_name, data_files=data_files)[
                "train"
            ].to_pandas()
        except Exception as err:
            # Previously DatasetLoadError was given the FileNotFoundError
            # *class*, producing a useless message; report the path and
            # chain the underlying cause instead.
            raise DatasetLoadError(cache_path) from err
        # index=False: otherwise re-reading the cache adds a spurious
        # "Unnamed: 0" column that the freshly downloaded frame lacks.
        dataset.to_csv(cache_path, index=False)
    return pd.read_csv(cache_path)
219 changes: 219 additions & 0 deletions nmrcraft/data/dataloader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,219 @@
"""Load and preprocess data."""

from typing import List, Tuple

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import (
LabelEncoder,
StandardScaler,
)

from nmrcraft.data.data_utils import (
filename_to_ligands,
load_dataset_from_hf,
load_dummy_dataset_locally,
)
from nmrcraft.utils.set_seed import set_seed

# Seed global RNG state at import time so sampling/splitting is reproducible.
set_seed()

# All categorical columns that can serve as prediction targets; any of these
# NOT selected as a target is treated as a structural feature instead
# (see DataLoader.encode_categorical_features).
TARGET_TYPES = [
    "metal",
    "X1_ligand",
    "X2_ligand",
    "X3_ligand",
    "X4_ligand",
    "L_ligand",
    "E_ligand",
]


class DataLoader:
    """Load, filter, encode, and split the NMR dataset for model training.

    The loader pulls the raw dataset (from Hugging Face, or a local dummy
    CSV when ``testing`` is True), derives ligand columns from the file
    names, filters by complex geometry, label-encodes the targets and the
    optional structural features, standard-scales the NMR tensor features,
    and returns a train/test split.
    """

    def __init__(
        self,
        target_columns: List[str],
        dataset_size: float,
        include_structural_features: bool = False,
        complex_geometry: str = "oct",
        test_size: float = 0.2,
        random_state: int = 42,
        testing: bool = False,
        feature_columns=None,
    ):
        """Initialize the DataLoader and fetch the raw dataset.

        Args:
            target_columns: Names of the target columns to predict; must be
                a subset of TARGET_TYPES. (Was annotated ``str``, but the
                code iterates it as a list of column names.)
            dataset_size: Fraction of the dataset to sample, in (0, 1].
            include_structural_features: If True, append the label-encoded
                structural features to the scaled NMR features.
            complex_geometry: Geometry to keep ("oct", "spy" or "tbp"); any
                other value currently skips filtering (see choose_geometry).
            test_size: Fraction of samples held out for the test set.
            random_state: Seed for subsampling and the train/test split.
            testing: If True, load the local dummy dataset instead of
                downloading from Hugging Face.
            feature_columns: NMR tensor columns used as numerical features;
                defaults to the six sigma ppm columns.
        """
        # Resolve the default here to avoid a mutable default argument.
        if feature_columns is None:
            feature_columns = [
                "M_sigma11_ppm",
                "M_sigma22_ppm",
                "M_sigma33_ppm",
                "E_sigma11_ppm",
                "E_sigma22_ppm",
                "E_sigma33_ppm",
            ]
        self.feature_columns = feature_columns
        self.test_size = test_size
        self.random_state = random_state
        self.dataset_size = dataset_size
        self.target_columns = target_columns
        self.complex_geometry = complex_geometry
        self.include_structural_features = include_structural_features

        self.dataset = (
            load_dummy_dataset_locally()
            if testing
            else load_dataset_from_hf()
        )

    def load_data(
        self,
    ) -> Tuple[
        np.ndarray, np.ndarray, np.ndarray, np.ndarray, List[List[str]]
    ]:
        """Preprocess the dataset and return the train/test split.

        (The previous return annotation said ``pd.DataFrame``; the method
        actually returns the tuple from split_and_preprocess.)

        Returns:
            Tuple of (X_train, X_test, y_train, y_test, y_labels); see
            split_and_preprocess for details.
        """
        self.dataset = filename_to_ligands(self.dataset)
        # Pass random_state so the subsample is reproducible per instance,
        # not dependent on global RNG state.
        self.dataset = self.dataset.sample(
            frac=self.dataset_size, random_state=self.random_state
        )
        self.choose_geometry()
        return self.split_and_preprocess()

    def choose_geometry(self) -> None:
        """
        Filters the dataset based on the complex geometry.

        Keeps only the rows whose "geometry" column matches the
        ``complex_geometry`` attribute when it is one of the known
        geometries; otherwise the dataset is left unfiltered.
        """
        valid_geometries = {"oct", "spy", "tbp"}
        if self.complex_geometry in valid_geometries:
            self.dataset = self.dataset[
                self.dataset["geometry"] == self.complex_geometry
            ]
        # NOTE(review): an unknown geometry currently skips filtering
        # silently; the raise below is intentionally disabled (original
        # FIXME) — re-enable once callers/tests no longer rely on it.
        # else:
        #     raise ValueError(f"Invalid geometry '{self.complex_geometry}'")

    def encode_categorical_features(self) -> np.ndarray:
        """
        Encodes the structural (non-target) categorical columns with
        LabelEncoder.

        The fitted encoders are stored in ``self.encoders`` (one per
        column, in order) so the transform can be inverted later.

        Returns:
            np.ndarray: Integer-encoded features of shape
            (n_samples, n_structural_features).
        """
        structural_columns = [
            col for col in TARGET_TYPES if col not in self.target_columns
        ]
        # Transpose so each row of the array is one feature column.
        structural_features = self.dataset[structural_columns].to_numpy().T

        encoded_features = []
        self.encoders = []  # kept for potential inverse_transform
        for column_values in structural_features:
            encoder = LabelEncoder()
            encoded_features.append(encoder.fit_transform(column_values))
            self.encoders.append(encoder)

        # Transpose back to (n_samples, n_features).
        return np.array(encoded_features).T

    def encode_targets(self) -> Tuple[np.ndarray, dict]:
        """
        Encodes the target variables in the dataset using LabelEncoder.

        The fitted encoders are stored in ``self.target_encoders`` so
        integer predictions can be mapped back to class names.

        Returns:
            Tuple[np.ndarray, dict]: Encoded targets of shape
            (n_samples, n_targets), and a dict mapping each target name to
            its list of class labels (index == encoded value).
        """
        encoded_targets = []
        self.target_encoders = []
        y_labels_dict = {}

        for target_name in self.target_columns:
            encoder = LabelEncoder()
            encoded = encoder.fit_transform(
                self.dataset[target_name].to_numpy()
            )
            encoded_targets.append(encoded)
            self.target_encoders.append(encoder)
            # Human-readable class names for this target.
            y_labels_dict[target_name] = encoder.classes_.tolist()

        # (n_targets, n_samples) -> (n_samples, n_targets)
        return np.array(encoded_targets).T, y_labels_dict

    def split_and_preprocess(
        self,
    ) -> Tuple[
        np.ndarray, np.ndarray, np.ndarray, np.ndarray, List[List[str]]
    ]:
        """
        Split the dataset into training and testing sets, scale the NMR
        features, and return the preprocessed data.

        The StandardScaler is fit on the training split only, so no
        information leaks from the test set. Structural features are
        appended unscaled (they are label-encoded integers).

        Returns:
            Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, List[List[str]]]:
            (X_train, X_test, y_train, y_test, y_labels); the target arrays
            are squeezed so a single target yields a 1-D array.
        """
        X_NMR = self.dataset[self.feature_columns].to_numpy()
        X_Structural = self.encode_categorical_features()
        y_encoded, y_labels = self.encode_targets()

        # Split both feature blocks and the targets with one call so the
        # rows stay aligned.
        (
            X_train_NMR,
            X_test_NMR,
            X_train_Structural,
            X_test_Structural,
            y_train,
            y_test,
        ) = train_test_split(
            X_NMR,
            X_Structural,
            y_encoded,
            test_size=self.test_size,
            random_state=self.random_state,
        )

        # Scale numerical features (the NMR tensor); fit on train only.
        scaler = StandardScaler()
        X_train_NMR_scaled = scaler.fit_transform(X_train_NMR)
        X_test_NMR_scaled = scaler.transform(X_test_NMR)

        # Combine features if structural features are included
        if self.include_structural_features:
            X_train = np.concatenate(
                [X_train_NMR_scaled, X_train_Structural], axis=1
            )
            X_test = np.concatenate(
                [X_test_NMR_scaled, X_test_Structural], axis=1
            )
        else:
            X_train = X_train_NMR_scaled
            X_test = X_test_NMR_scaled

        return (
            X_train,
            X_test,
            np.squeeze(y_train),
            np.squeeze(y_test),
            y_labels,
        )
Loading
Loading