Skip to content

Commit

Permalink
Refactoring
Browse files Browse the repository at this point in the history
  • Loading branch information
TiagoW committed May 21, 2024
1 parent 1f1582d commit 7208e67
Showing 1 changed file with 28 additions and 33 deletions.
61 changes: 28 additions & 33 deletions nmrcraft/data/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,15 @@ def load_dataset_from_hf(
return dataset


def transpose(array) -> list:
    """Return the transpose of a 2-D array-like as a list of lists.

    Rows of the input become columns of the output, so that
    ``result[j][i] == array[i][j]``. The input is only iterated and never
    modified, so no defensive copy is made.

    Args:
        array: Iterable of rows, e.g. a list of lists or a 2-D NumPy array.
            Rows of unequal length are truncated to the shortest row,
            following the behavior of ``zip``.

    Returns:
        A new list containing one list per input column. An empty input
        yields an empty list.
    """
    # zip(*rows) yields each column as a tuple; convert each one so callers
    # always receive plain, mutable lists.
    return [list(column) for column in zip(*array)]


def get_target_columns(target_columns: str):
"""
Function takes target columns in underscore-separated format, e.g. 'metal_X1_X4_X2_L' and
Expand Down Expand Up @@ -251,13 +260,13 @@ def choose_geometry(self):
self.dataset["geometry"] == "tbp"
] # only load trigonal bipyramidal complexes

def preprocess_features(self, X):
def scale(self, X):
"""
Apply standard normalization to the feature set.
"""
# NOTE(review): a new StandardScaler is fitted on every call, so separate
# calls (e.g. one for train data and one for test data) each standardize
# with their own fitted statistics rather than sharing one fitted scaler.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
return X_scaled, scaler
return X_scaled

def get_target_columns_separated(self):
"""Returns the column indices of the target array nicely sorted.
Expand Down Expand Up @@ -293,12 +302,7 @@ def categorical_target_decoder(self, y):
ys = y[:] # copy y so it's not modified
target_encoders = self.target_label_encoders
ys_decoded = []
ys = [ # rotate the y array to each column be a target
list(x) if i == 0 else x
for i, x in enumerate(
map(list, zip(*ys))
) # y[:] should leave original y in peace?
]
ys = transpose(ys)

# Decode columnwise
for i, target_column in enumerate(ys):
Expand Down Expand Up @@ -373,18 +377,16 @@ def categorical_endocode_X(self):
X_Structural_Features_Columns
].to_numpy()

# Rotate the array so instead of a row corresponding to a complex a row is a target
X_Structural_Features = [
list(x) if i == 0 else x
for i, x in enumerate(map(list, zip(*X_Structural_Features)))
]
# Transpose the array
X_Structural_Features = transpose(X_Structural_Features)

# Target-wise encoding with Label encoder and save encoders for later decoding
xs = []
for i in range(len(X_Structural_Features)):
tmp_encoder = LabelEncoder()
tmp_encoder.fit(X_Structural_Features[i])
xs.append(tmp_encoder.transform(X_Structural_Features[i]))
X_Structural_Features = list(zip(*xs))
X_Structural_Features = list(zip(*xs)) # Kind of backtransposing

return X_NMR, X_Structural_Features

Expand All @@ -393,10 +395,7 @@ def categorical_endocode_y(self):
y_labels_rotated = self.dataset[self.target_columns].to_numpy()

# rotate the list of list (array-like)
y_labels = [
list(x) if i == 0 else x
for i, x in enumerate(map(list, zip(*y_labels_rotated)))
]
y_labels = transpose(y_labels_rotated)

# Do targetwise encoding using the label encoder and save the label encoders for later decoding
ys = []
Expand Down Expand Up @@ -424,9 +423,10 @@ def split_and_preprocess_categorical(self):
# Encode y in a categorical fashion with the label encoder columnwise
y, readable_labels = self.categorical_endocode_y()

# Train Test splitting
(
X_NMR_train,
X_NMR_test,
X_train_NMR,
X_test_NMR,
X_train_structural,
X_test_structural,
y_train,
Expand All @@ -440,10 +440,9 @@ def split_and_preprocess_categorical(self):
)

# Normalize features with no leakage from test set
X_train_NMR_scaled, scaler = self.preprocess_features(X_NMR_train)
X_test_NMR_scaled = scaler.transform(
X_NMR_test
) # Apply the same transformation to test set
X_train_NMR_scaled = self.scale(X_train_NMR)
X_test_NMR_scaled = self.scale(X_test_NMR)

X_train_scaled = np.concatenate(
[X_train_NMR_scaled, X_train_structural], axis=1
)
Expand All @@ -467,10 +466,7 @@ def split_and_preprocess_one_hot(self):

# Get the Targets, rotate, apply binarization, fuse into a single array
y_labels_rotated = self.dataset[self.target_columns].to_numpy()
y_labels = [
list(x) if i == 0 else x
for i, x in enumerate(map(list, zip(*y_labels_rotated)))
]
y_labels = transpose(y_labels_rotated)
self.target_unique_labels = target_unique_labels
ys = []
readable_labels = []
Expand Down Expand Up @@ -516,11 +512,10 @@ def split_and_preprocess_one_hot(self):
)

# Normalize features with no leakage from test set
X_train_NMR_scaled, scaler = self.preprocess_features(X_train_NMR)
X_test_NMR_scaled = scaler.transform(
X_test_NMR
) # Apply the same transformation to test set
# Combine scaled NMR features with structural features
X_train_NMR_scaled = self.scale(X_train_NMR)
X_test_NMR_scaled = self.scale(X_test_NMR)

# Combine scaled NMR features with structural features to get final X
X_train_scaled = np.concatenate(
[X_train_NMR_scaled, X_train_structural], axis=1
)
Expand Down

0 comments on commit 7208e67

Please sign in to comment.