From f0a34a52eec2d7d242999d948ee48bdce8197652 Mon Sep 17 00:00:00 2001 From: Clecio De Bom <29743719+cdebom@users.noreply.github.com> Date: Thu, 30 Jul 2020 12:59:20 -0300 Subject: [PATCH 01/27] Add files via upload --- .../classifiers/LSTM_bidirectional.py | 145 ++++++++++++++++++ 1 file changed, 145 insertions(+) create mode 100644 tomo_challenge/classifiers/LSTM_bidirectional.py diff --git a/tomo_challenge/classifiers/LSTM_bidirectional.py b/tomo_challenge/classifiers/LSTM_bidirectional.py new file mode 100644 index 00000000..1de9c735 --- /dev/null +++ b/tomo_challenge/classifiers/LSTM_bidirectional.py @@ -0,0 +1,145 @@ +""" +Bidirectional LSTM Classifier + +This is an example tomographic bin generator using a convolutional bidirectional LSTM neural network. + +This solution was developed by the Brazilian Center for Physics Research AI 4 Astrophysics team. + +Authors: Clecio R. Bom, Bernardo M. Fraga, Gabriel Teixeira and Elizabeth Gonzalez. + +In our preliminary tests we found a SNR 3X2 of ~1930 for n=10 bins. + +This is an example tomographic bin generator using a convolutional bidirectional LSTM neural network. + + +Every classifier module needs to: + - have construction of the type + __init__ (self, bands, options) (see examples below) + - implement two functions: + train (self, training_data,training_z) + apply (self, data). + - define valid_options class varible. + +See Classifier Documentation below. +""" + +from .base import Tomographer +import numpy as np +import tensorflow as tf +from tensorflow import keras +from sklearn.preprocessing import MinMaxScaler + +class LSTM(Tomographer): + """ Bidirectional LSTM deep classifier """ + + # valid parameter -- see below + valid_options = ['bins'] + # this settings means arrays will be sent to train and apply instead + # of dictionaries + wants_arrays = True + + def __init__ (self, bands, options): + """Constructor + + Parameters: + ----------- + bands: str + string containg valid bands, like 'riz' or 'griz' + options: dict + options come through here. Valid keys are listed as valid_options + class variable. + + Note: + ----- + Valid options are: + 'bins' - number of tomographic bins + + """ + self.bands = bands + self.opt = options + + def train (self, training_data, training_z): + """Trains the classifier + + Parameters: + ----------- + training_data: numpy array, size Ngalaxes x Nbands + training data, each row is a galaxy, each column is a band as per + band defined above + training_z: numpy array, size Ngalaxies + true redshift for the training sample + + """ + + n_bin = self.opt['bins'] + print("Finding bins for training data") + # Now put the training data into redshift bins. 
+ # Use zero so that the one object with minimum + # z in the whole survey will be in the lowest bin + training_bin = np.zeros(training_z.size) + + # Find the edges that split the redshifts into n_z bins of + # equal number counts in each + p = np.linspace(0, 100, n_bin + 1) + z_edges = np.percentile(training_z, p) + + # Now find all the objects in each of these bins + for i in range(n_bin): + z_low = z_edges[i] + z_high = z_edges[i + 1] + training_bin[(training_z > z_low) & (training_z < z_high)] = i + + + lstm_out = np.int(486*2) + inp = keras.layers.Input(shape=inp_shape) + + x = keras.layers.Conv1D(64, 3, padding='same')(inp) + x = keras.layers.Activation('tanh')(x) + x = keras.layers.MaxPooling1D(2, padding='same')(x) + + x = keras.layers.Conv1D(128, 3, padding='same')(x) + x = keras.layers.Activation('tanh')(x) + x = keras.layers.MaxPooling1D(2, padding='same')(x) + + x = keras.layers.Bidirectional(keras.layers.LSTM(lstm_out, return_sequences=False), merge_mode='concat')(x) + + x = keras.layers.Dense(lstm_out // 2, activation='tanh')(x) + x = keras.layers.Dense(lstm_out // 2, activation='tanh')(x) + + x = keras.layers.Dense(n_bins, activation='softmax')(x) + + model = keras.models.Model(inp, x) + + model.compile(loss='sparse_categorical_crossentropy', optimizer=keras.optimizers.Adam(0.001), metrics=['accuracy']) + + # Can be replaced with any classifier + scaler = MinMaxScaler() + x_train = scaler.fit_transform(training_data) + print("Fitting classifier") + # Lots of data, so this will take some time + model.fit(x_train, training_bin, epochs=15, verbose=0) + + self.classifier = model + self.z_edges = z_edges + self.scaler = scaler + + + def apply (self, data): + """Applies training to the data. + + Parameters: + ----------- + Data: numpy array, size Ngalaxes x Nbands + testing data, each row is a galaxy, each column is a band as per + band defined above + + Returns: + tomographic_selections: numpy array, int, size Ngalaxies + tomographic selection for galaxies return as bin number for + each galaxy. + """ + x_test = self.scaler.transform(data) + preds = self.classifier.predict(x_test) + tomo_bin = np.argmax(preds, axis=1) + return tomo_bin + From 49e0388df417e32e9538512722df96545761fd7c Mon Sep 17 00:00:00 2001 From: Clecio De Bom <29743719+cdebom@users.noreply.github.com> Date: Thu, 30 Jul 2020 14:28:26 -0300 Subject: [PATCH 02/27] Add files via upload --- .../classifiers/LSTM_bidirectional.py | 21 +-- tomo_challenge/classifiers/conv_ak.py | 146 ++++++++++++++++++ 2 files changed, 157 insertions(+), 10 deletions(-) create mode 100644 tomo_challenge/classifiers/conv_ak.py diff --git a/tomo_challenge/classifiers/LSTM_bidirectional.py b/tomo_challenge/classifiers/LSTM_bidirectional.py index 1de9c735..01ca1f2e 100644 --- a/tomo_challenge/classifiers/LSTM_bidirectional.py +++ b/tomo_challenge/classifiers/LSTM_bidirectional.py @@ -6,11 +6,9 @@ This solution was developed by the Brazilian Center for Physics Research AI 4 Astrophysics team. Authors: Clecio R. Bom, Bernardo M. Fraga, Gabriel Teixeira and Elizabeth Gonzalez. +contact: debom |at |cbpf| dot| br -In our preliminary tests we found a SNR 3X2 of ~1930 for n=10 bins. - -This is an example tomographic bin generator using a convolutional bidirectional LSTM neural network. - +In our preliminary tests we found a SNR 3X2 of ~1930 for n=10 bins and ~2240 for 14 bins. Every classifier module needs to: - have construction of the type @@ -19,8 +17,6 @@ train (self, training_data,training_z) apply (self, data). 
- define valid_options class varible. - -See Classifier Documentation below. """ from .base import Tomographer @@ -88,10 +84,12 @@ def train (self, training_data, training_z): z_low = z_edges[i] z_high = z_edges[i + 1] training_bin[(training_z > z_low) & (training_z < z_high)] = i - + #cut = np.random.uniform(0, 1, training_z.size) < 0.05 + #training_data = training_data[cut] + #training_bin = training_bin[cut] lstm_out = np.int(486*2) - inp = keras.layers.Input(shape=inp_shape) + inp = keras.layers.Input(shape=(training_data.shape[1], 1)) x = keras.layers.Conv1D(64, 3, padding='same')(inp) x = keras.layers.Activation('tanh')(x) @@ -106,7 +104,7 @@ def train (self, training_data, training_z): x = keras.layers.Dense(lstm_out // 2, activation='tanh')(x) x = keras.layers.Dense(lstm_out // 2, activation='tanh')(x) - x = keras.layers.Dense(n_bins, activation='softmax')(x) + x = keras.layers.Dense(n_bin, activation='softmax')(x) model = keras.models.Model(inp, x) @@ -115,9 +113,11 @@ def train (self, training_data, training_z): # Can be replaced with any classifier scaler = MinMaxScaler() x_train = scaler.fit_transform(training_data) + x_train = np.expand_dims(x_train, axis=-1) + y_train = np.expand_dims(training_bin, axis=-1) print("Fitting classifier") # Lots of data, so this will take some time - model.fit(x_train, training_bin, epochs=15, verbose=0) + model.fit(x_train, y_train, epochs=15, verbose=0) self.classifier = model self.z_edges = z_edges @@ -139,6 +139,7 @@ def apply (self, data): each galaxy. """ x_test = self.scaler.transform(data) + x_test = np.expand_dims(x_test, axis=-1) preds = self.classifier.predict(x_test) tomo_bin = np.argmax(preds, axis=1) return tomo_bin diff --git a/tomo_challenge/classifiers/conv_ak.py b/tomo_challenge/classifiers/conv_ak.py new file mode 100644 index 00000000..687242c9 --- /dev/null +++ b/tomo_challenge/classifiers/conv_ak.py @@ -0,0 +1,146 @@ +""" +Conv1D Classifier + +This is an example tomographic bin generator using a convolutional 1d neural network. +The initial model was optimized using AutoKeras + +This solution was developed by the Brazilian Center for Physics Research AI 4 Astrophysics team. + +Authors: Clecio R. Bom, Bernardo M. Fraga, Gabriel Teixeira and Elizabeth Gonzalez. +contact: debom |at |cbpf| dot| br + +In our preliminary tests we found a SNR 3X2 of ~1910 for n=10 bins and ~2200 for 14 bins. + +This is an example tomographic bin generator using a convolutional bidirectional LSTM neural network. + + +Every classifier module needs to: + - have construction of the type + __init__ (self, bands, options) (see examples below) + - implement two functions: + train (self, training_data,training_z) + apply (self, data). + - define valid_options class varible. +""" +from .base import Tomographer +import numpy as np +import tensorflow as tf +from tensorflow import keras +from sklearn.preprocessing import MinMaxScaler + +class CNN(Tomographer): + """ Convolutional deep classifier """ + + # valid parameter -- see below + valid_options = ['bins'] + # this settings means arrays will be sent to train and apply instead + # of dictionaries + wants_arrays = True + + def __init__ (self, bands, options): + """Constructor + + Parameters: + ----------- + bands: str + string containg valid bands, like 'riz' or 'griz' + options: dict + options come through here. Valid keys are listed as valid_options + class variable. 
+ + Note: + ----- + Valid options are: + 'bins' - number of tomographic bins + + """ + self.bands = bands + self.opt = options + + def train (self, training_data, training_z): + """Trains the classifier + + Parameters: + ----------- + training_data: numpy array, size Ngalaxes x Nbands + training data, each row is a galaxy, each column is a band as per + band defined above + training_z: numpy array, size Ngalaxies + true redshift for the training sample + + """ + + n_bin = self.opt['bins'] + print("Finding bins for training data") + # Now put the training data into redshift bins. + # Use zero so that the one object with minimum + # z in the whole survey will be in the lowest bin + training_bin = np.zeros(training_z.size) + + # Find the edges that split the redshifts into n_z bins of + # equal number counts in each + p = np.linspace(0, 100, n_bin + 1) + z_edges = np.percentile(training_z, p) + + # Now find all the objects in each of these bins + for i in range(n_bin): + z_low = z_edges[i] + z_high = z_edges[i + 1] + training_bin[(training_z > z_low) & (training_z < z_high)] = i + #cut = np.random.uniform(0, 1, training_z.size) < 0.05 + #training_bin = training_bin[cut] + #training_data = training_data[cut] + + inp = keras.layers.Input(shape=(training_data.shape[1], 1)) + + x = keras.layers.Conv1D(256, 7, padding='same', activation='relu')(inp) + x = keras.layers.Conv1D(512, 7, padding='same', activation='relu')(x) + x = keras.layers.MaxPooling1D(6, padding='same')(x) + x = keras.layers.Dropout(0.25)(x) + + x = keras.layers.Conv1D(128, 7, padding='same', activation='relu')(x) + x = keras.layers.Conv1D(256, 7, padding='same', activation='relu')(x) + x = keras.layers.MaxPooling1D(6, padding='same')(x) + x = keras.layers.Dropout(0.25)(x) + x = keras.layers.GlobalAveragePooling1D()(x) + + x = keras.layers.Dense(n_bin, activation='softmax')(x) + + model = keras.models.Model(inp, x) + + model.compile(loss='sparse_categorical_crossentropy', optimizer=keras.optimizers.Adam(0.001), metrics=['accuracy']) + + # Can be replaced with any classifier + scaler = MinMaxScaler() + x_train = scaler.fit_transform(training_data) + print("Fitting classifier") + # Lots of data, so this will take some time + x_train = np.expand_dims(x_train, axis=-1) + y_train = np.expand_dims(training_bin, axis=-1) + model.fit(x_train, y_train, epochs=15, verbose=0) + + self.classifier = model + self.z_edges = z_edges + self.scaler = scaler + + + def apply (self, data): + """Applies training to the data. + + Parameters: + ----------- + Data: numpy array, size Ngalaxes x Nbands + testing data, each row is a galaxy, each column is a band as per + band defined above + + Returns: + tomographic_selections: numpy array, int, size Ngalaxies + tomographic selection for galaxies return as bin number for + each galaxy. + """ + x_test = self.scaler.transform(data) + x_test = np.expand_dims(x_test, axis=-1) + preds = self.classifier.predict(x_test) + tomo_bin = np.argmax(preds, axis=1) + return tomo_bin + From c9b467cdb2ad802867de2f642dd7b035cae2b0d4 Mon Sep 17 00:00:00 2001 From: Clecio De Bom <29743719+cdebom@users.noreply.github.com> Date: Mon, 24 Aug 2020 10:16:09 -0300 Subject: [PATCH 03/27] Add a simple LightGBM classifier. We found, in our tests, SNR_3X2 ~1926 with 10 bins. 
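For reference, the new classifier plugs into the same Tomographer interface as
the other entries. Below is a minimal sketch of how a harness could drive it;
the band string, bin count and random stand-in arrays are illustrative
assumptions only, not part of this patch:

    import numpy as np
    from tomo_challenge.classifiers.LGBM_classifier import LGBM

    rng = np.random.default_rng(0)
    mags = rng.uniform(18.0, 26.0, size=(1000, 4))   # stand-in griz magnitudes
    zs = rng.uniform(0.0, 2.0, size=1000)            # stand-in true redshifts

    clf = LGBM('griz', {'bins': 10})
    clf.train(mags, zs)
    tomo_bin = clf.apply(mags)                       # one bin index per galaxy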
---
 tomo_challenge/classifiers/LGBM_classifier.py | 115 ++++++++++++++++++
 1 file changed, 115 insertions(+)
 create mode 100644 tomo_challenge/classifiers/LGBM_classifier.py

diff --git a/tomo_challenge/classifiers/LGBM_classifier.py b/tomo_challenge/classifiers/LGBM_classifier.py
new file mode 100644
index 00000000..3f3639a7
--- /dev/null
+++ b/tomo_challenge/classifiers/LGBM_classifier.py
@@ -0,0 +1,115 @@
+"""
+LGBM Classifier
+This is an example tomographic bin generator using a Light Gradient Boosted Machine as classifier.
+This solution was developed by the Brazilian Center for Physics Research AI 4 Astrophysics team.
+Authors: Clecio R. Bom, Gabriel Teixeira, Bernardo M. Fraga, Eduardo Cypriano.
+contact: debom |at |cbpf| dot| br
+In our preliminary tests we found a SNR 3X2 of ~1930 for n=10 bins.
+Every classifier module needs to:
+ - have a constructor of the type
+    __init__ (self, bands, options) (see examples below)
+ - implement two functions:
+        train (self, training_data, training_z)
+        apply (self, data).
+ - define a valid_options class variable.
+"""
+
+from .base import Tomographer
+import numpy as np
+from lightgbm import LGBMClassifier
+
+
+class LGBM(Tomographer):
+    """ Light Gradient Boosted Machine Classifier """
+
+    # valid parameter -- see below
+    valid_options = ['bins']
+    # this setting means arrays will be sent to train and apply instead
+    # of dictionaries
+    wants_arrays = True
+
+    def __init__ (self, bands, options):
+        """Constructor
+
+        Parameters:
+        -----------
+        bands: str
+          string containing valid bands, like 'riz' or 'griz'
+        options: dict
+          options come through here. Valid keys are listed as valid_options
+          class variable.
+        Note:
+        -----
+        Valid options are:
+            'bins' - number of tomographic bins
+        """
+        self.bands = bands
+        self.opt = options
+
+    def train (self, training_data, training_z):
+        """Trains the classifier
+
+        Parameters:
+        -----------
+        training_data: numpy array, size Ngalaxies x Nbands
+          training data, each row is a galaxy, each column is a band as per
+          band defined above
+        training_z: numpy array, size Ngalaxies
+          true redshift for the training sample
+        """
+
+        n_bin = self.opt['bins']
+        print("Finding bins for training data")
+        # Now put the training data into redshift bins.
+        # Use zero so that the one object with minimum
+        # z in the whole survey will be in the lowest bin
+        training_bin = np.zeros(training_z.size)
+
+        # Find the edges that split the redshifts into n_z bins of
+        # equal number counts in each
+        p = np.linspace(0, 100, n_bin + 1)
+        z_edges = np.percentile(training_z, p)
+
+        # Now find all the objects in each of these bins
+        for i in range(n_bin):
+            z_low = z_edges[i]
+            z_high = z_edges[i + 1]
+            training_bin[(training_z > z_low) & (training_z < z_high)] = i
+
+        # For speed, it's possible to cut down to a percentage of the original size
+        #cut = np.random.uniform(0, 1, training_z.size) < 0.05
+        #training_data = training_data[cut]
+        #training_bin = training_bin[cut]
+
+        # instantiate the model; can be replaced with any classifier
+        model = LGBMClassifier()
+
+        print("Fitting classifier")
+        # Lots of data, so this will take some time
+        model.fit(training_data, training_bin)
+
+        self.classifier = model
+        self.z_edges = z_edges
+
+
+    def apply (self, data):
+        """Applies training to the data.
+
+        Parameters:
+        -----------
+        Data: numpy array, size Ngalaxies x Nbands
+          testing data, each row is a galaxy, each column is a band as per
+          band defined above
+        Returns:
+        tomographic_selections: numpy array, int, size Ngalaxies
+          tomographic selection for galaxies returned as bin number for
+          each galaxy.
+        """
+
+        # predict_proba gives an (Ngalaxies, n_bin) probability matrix;
+        # the bin with the highest probability is the selection
+        preds = self.classifier.predict_proba(data)
+        tomo_bin = np.argmax(preds, axis=1)
+        return tomo_bin

From d42091f7eb1924abcfda5ee585f939e3ecf3dafe Mon Sep 17 00:00:00 2001
From: Clecio De Bom <29743719+cdebom@users.noreply.github.com>
Date: Mon, 24 Aug 2020 10:17:54 -0300
Subject: [PATCH 04/27] I corrected the file's description

---
 tomo_challenge/classifiers/LGBM_classifier.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tomo_challenge/classifiers/LGBM_classifier.py b/tomo_challenge/classifiers/LGBM_classifier.py
index 3f3639a7..7633be7b 100644
--- a/tomo_challenge/classifiers/LGBM_classifier.py
+++ b/tomo_challenge/classifiers/LGBM_classifier.py
@@ -4,7 +4,7 @@
 This solution was developed by the Brazilian Center for Physics Research AI 4 Astrophysics team.
 Authors: Clecio R. Bom, Gabriel Teixeira, Bernardo M. Fraga, Eduardo Cypriano.
 contact: debom |at |cbpf| dot| br
-In our preliminary tests we found a SNR 3X2 of ~1930 for n=10 bins.
+In our preliminary tests we found a SNR 3X2 of ~1926 for n=10 bins.
 Every classifier module needs to:
  - have a constructor of the type
     __init__ (self, bands, options) (see examples below)

From dd086e2de97166a8455e1df3cb127833729d4911 Mon Sep 17 00:00:00 2001
From: Clecio De Bom <29743719+cdebom@users.noreply.github.com>
Date: Mon, 31 Aug 2020 10:32:47 -0300
Subject: [PATCH 05/27] Added new models, added loader to others

Add TCN and AutoML LSTM classifiers; add a custom data loader to the
AutoML conv1d and bidirectional LSTM.
---
 .../classifiers/LSTM_bidirectional.py         |  49 ++++-
 tomo_challenge/classifiers/TCN.py             | 177 +++++++++++++++++
 tomo_challenge/classifiers/autokeras_LSTM.py  | 185 ++++++++++++++++++
 tomo_challenge/classifiers/conv_ak.py         |  53 ++++-
 tomo_challenge/classifiers/requirements.txt   |  40 ++++
 5 files changed, 495 insertions(+), 9 deletions(-)
 create mode 100644 tomo_challenge/classifiers/TCN.py
 create mode 100644 tomo_challenge/classifiers/autokeras_LSTM.py
 create mode 100644 tomo_challenge/classifiers/requirements.txt

diff --git a/tomo_challenge/classifiers/LSTM_bidirectional.py b/tomo_challenge/classifiers/LSTM_bidirectional.py
index 01ca1f2e..6fca9cab 100644
--- a/tomo_challenge/classifiers/LSTM_bidirectional.py
+++ b/tomo_challenge/classifiers/LSTM_bidirectional.py
@@ -2,13 +2,15 @@
 Bidirectional LSTM Classifier
 
 This is an example tomographic bin generator using a convolutional bidirectional LSTM neural network.
-
+We also added a custom data loader we tested.
 This solution was developed by the Brazilian Center for Physics Research AI 4 Astrophysics team.
 
 Authors: Clecio R. Bom, Bernardo M. Fraga, Gabriel Teixeira and Elizabeth Gonzalez.
+
 contact: debom |at |cbpf| dot| br
 
-In our preliminary tests we found a SNR 3X2 of ~1930 for n=10 bins and ~2240 for 14 bins.
+
+
 
 Every classifier module needs to:
  - have construction of the type
@@ -17,6 +19,8 @@
         train (self, training_data,training_z)
         apply (self, data).
  - define valid_options class varible.
+
+See Classifier Documentation below.
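+
+A minimal usage sketch (the band string and bin count are illustrative
+assumptions; the challenge harness supplies the real ones):
+
+    clf = LSTM('griz', {'bins': 10})
+    clf.train(training_data, training_z)
+    tomo_bin = clf.apply(validation_data)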
""" from .base import Tomographer @@ -24,6 +28,8 @@ import tensorflow as tf from tensorflow import keras from sklearn.preprocessing import MinMaxScaler +import h5py + class LSTM(Tomographer): """ Bidirectional LSTM deep classifier """ @@ -54,6 +60,45 @@ class variable. self.bands = bands self.opt = options + def load_data(fname, take_colors=True, cutoff=0.0): + n_bins = self.opt['bins'] + data = h5py.File(fname, 'r') + r_mag = data['r_mag'] + g_mag = data['g_mag'] + i_mag = data['i_mag'] + z_mag = data['z_mag'] + redshift = data['redshift_true'] + all_mags = np.vstack([g_mag, r_mag, i_mag, z_mag]) + all_mags = all_mags.T + mask = (all_mags != np.inf).all(axis=1) + all_mags = all_mags[mask,:] + redshift = redshift[mask] + gr_color = all_mags[:,0] - all_mags[:,1] + ri_color = all_mags[:,1] - all_mags[:,2] + iz_color = all_mags[:,2] - all_mags[:,3] + all_colors = np.vstack([gr_color, ri_color, iz_color]) + all_colors = all_colors.T + p = np.linspace(0, 100, n_bins+1) + z_edges = np.percentile(redshift, p) + train_bin = np.zeros(all_mags.shape[0]) + for i in range(n_bins): + z_low = z_edges[i] + z_high = z_edges[i+1] + train_bin[(redshift > z_low) & (redshift <= z_high)] = i + if cutoff != 0.0: + cut = np.random.uniform(0, 1, all_mags.shape[0]) < cutoff + train_bin = train_bin[cut].reshape(-1,1) + all_mags = all_mags[cut] + all_colors = all_colors[cut] + redshift = redshift[cut] + else: + train_bin = train_bin.reshape(-1,1) + if take_colors: + return np.hstack([all_mags, all_colors]), redshift, train_bin.astype(int), z_edges + else: + return mags, redshift, train_bin.astype(int), z_edges + + def train (self, training_data, training_z): """Trains the classifier diff --git a/tomo_challenge/classifiers/TCN.py b/tomo_challenge/classifiers/TCN.py new file mode 100644 index 00000000..6bce4ac4 --- /dev/null +++ b/tomo_challenge/classifiers/TCN.py @@ -0,0 +1,177 @@ +""" +TCN classifier + +This is an example tomographic bin generator using a temporal convolutional neural network (TCN). +We also added a custom data loader we tested. +This solution was developed by the Brazilian Center for Physics Research AI 4 Astrophysics team. + +Authors: Clecio R. Bom, Bernardo M. Fraga, Gabriel Teixeira, Eduardo Cypriano and Elizabeth Gonzalez. +contact: debom |at |cbpf| dot| br + + +Every classifier module needs to: + - have construction of the type + __init__ (self, bands, options) (see examples below) + - implement two functions: + train (self, training_data,training_z) + apply (self, data). + - define valid_options class varible. + +See Classifier Documentation below. +""" + +from .base import Tomographer +import numpy as np +import tensorflow as tf +from tensorflow import keras +from sklearn.preprocessing import MinMaxScaler +from tcn import TCN +import h5py + + +class TCN(Tomographer): + """ TCN deep classifier """ + + # valid parameter -- see below + valid_options = ['bins'] + # this settings means arrays will be sent to train and apply instead + # of dictionaries + wants_arrays = True + + def __init__ (self, bands, options): + """Constructor + + Parameters: + ----------- + bands: str + string containg valid bands, like 'riz' or 'griz' + options: dict + options come through here. Valid keys are listed as valid_options + class variable. 
+
+        Note:
+        -----
+        Valid options are:
+            'bins' - number of tomographic bins
+
+        """
+        self.bands = bands
+        self.opt = options
+
+    def load_data(self, fname, take_colors=True, cutoff=0.0):
+        # needs self so it can read the configured number of bins
+        n_bins = self.opt['bins']
+        data = h5py.File(fname, 'r')
+        r_mag = data['r_mag']
+        g_mag = data['g_mag']
+        i_mag = data['i_mag']
+        z_mag = data['z_mag']
+        redshift = data['redshift_true']
+        all_mags = np.vstack([g_mag, r_mag, i_mag, z_mag])
+        all_mags = all_mags.T
+        mask = (all_mags != np.inf).all(axis=1)
+        all_mags = all_mags[mask,:]
+        redshift = redshift[mask]
+        gr_color = all_mags[:,0] - all_mags[:,1]
+        ri_color = all_mags[:,1] - all_mags[:,2]
+        iz_color = all_mags[:,2] - all_mags[:,3]
+        all_colors = np.vstack([gr_color, ri_color, iz_color])
+        all_colors = all_colors.T
+        p = np.linspace(0, 100, n_bins+1)
+        z_edges = np.percentile(redshift, p)
+        train_bin = np.zeros(all_mags.shape[0])
+        for i in range(n_bins):
+            z_low = z_edges[i]
+            z_high = z_edges[i+1]
+            train_bin[(redshift > z_low) & (redshift <= z_high)] = i
+        if cutoff != 0.0:
+            cut = np.random.uniform(0, 1, all_mags.shape[0]) < cutoff
+            train_bin = train_bin[cut].reshape(-1,1)
+            all_mags = all_mags[cut]
+            all_colors = all_colors[cut]
+            redshift = redshift[cut]
+        else:
+            train_bin = train_bin.reshape(-1,1)
+        if take_colors:
+            return np.hstack([all_mags, all_colors]), redshift, train_bin.astype(int), z_edges
+        else:
+            return all_mags, redshift, train_bin.astype(int), z_edges
+
+
+    def train (self, training_data, training_z):
+        """Trains the classifier
+
+        Parameters:
+        -----------
+        training_data: numpy array, size Ngalaxies x Nbands
+          training data, each row is a galaxy, each column is a band as per
+          band defined above
+        training_z: numpy array, size Ngalaxies
+          true redshift for the training sample
+
+        """
+
+        n_bin = self.opt['bins']
+        print("Finding bins for training data")
+        # Now put the training data into redshift bins.
+        # Use zero so that the one object with minimum
+        # z in the whole survey will be in the lowest bin
+        training_bin = np.zeros(training_z.size)
+
+        # Find the edges that split the redshifts into n_z bins of
+        # equal number counts in each
+        p = np.linspace(0, 100, n_bin + 1)
+        z_edges = np.percentile(training_z, p)
+
+        # Now find all the objects in each of these bins
+        for i in range(n_bin):
+            z_low = z_edges[i]
+            z_high = z_edges[i + 1]
+            training_bin[(training_z > z_low) & (training_z < z_high)] = i
+        #cut = np.random.uniform(0, 1, training_z.size) < 0.05
+        #training_data = training_data[cut]
+        #training_bin = training_bin[cut]
+
+
+        inp = keras.layers.Input(shape=(training_data.shape[1], 1))
+        x = TCN_layer(nb_filters=[128, 128], kernel_size=2, dilations=[1, 2], nb_stacks=2, activation='relu', return_sequences=False, use_batch_norm=True, use_skip_connections=True)(inp)
+
+        x = keras.layers.Dense(n_bin, activation='softmax')(x)
+
+        model = keras.models.Model(inp, x)
+
+        model.compile(loss='sparse_categorical_crossentropy', optimizer=keras.optimizers.Adam(0.001), metrics=['accuracy'])
+
+        # Can be replaced with any classifier
+        scaler = MinMaxScaler()
+        x_train = scaler.fit_transform(training_data)
+        x_train = np.expand_dims(x_train, axis=-1)
+        y_train = np.expand_dims(training_bin, axis=-1)
+        print("Fitting classifier")
+        # Lots of data, so this will take some time
+        model.fit(x_train, y_train, epochs=20, verbose=0)
+
+        self.classifier = model
+        self.z_edges = z_edges
+        self.scaler = scaler
+
+
+    def apply (self, data):
+        """Applies training to the data.
+ + Parameters: + ----------- + Data: numpy array, size Ngalaxes x Nbands + testing data, each row is a galaxy, each column is a band as per + band defined above + + Returns: + tomographic_selections: numpy array, int, size Ngalaxies + tomographic selection for galaxies return as bin number for + each galaxy. + """ + x_test = self.scaler.transform(data) + x_test = np.expand_dims(x_test, axis=-1) + preds = self.classifier.predict(x_test) + tomo_bin = np.argmax(preds, axis=1) + return tomo_bin + diff --git a/tomo_challenge/classifiers/autokeras_LSTM.py b/tomo_challenge/classifiers/autokeras_LSTM.py new file mode 100644 index 00000000..5e3d7981 --- /dev/null +++ b/tomo_challenge/classifiers/autokeras_LSTM.py @@ -0,0 +1,185 @@ +""" +Bidirectional AutoML LSTM Classifier + +This is an example tomographic bin generator using an Autokeras optimized convolutional LSTM neural network. +We also added a custom data loader we tested. +This solution was developed by the Brazilian Center for Physics Research AI 4 Astrophysics team. + +Authors: Clecio R. Bom, Bernardo M. Fraga, Gabriel Teixeira, Eduardo Cypriano and Elizabeth Gonzalez. +contact: debom |at |cbpf| dot| br + + +Every classifier module needs to: + - have construction of the type + __init__ (self, bands, options) (see examples below) + - implement two functions: + train (self, training_data,training_z) + apply (self, data). + - define valid_options class varible. + +See Classifier Documentation below. +""" + +from .base import Tomographer +import numpy as np +import tensorflow as tf +from tensorflow import keras +from sklearn.preprocessing import MinMaxScaler +import h5py + +class Autokeras_LSTM(Tomographer): + """ LSTM optimised by AutoKeras deep classifier """ + + # valid parameter -- see below + valid_options = ['bins'] + # this settings means arrays will be sent to train and apply instead + # of dictionaries + wants_arrays = True + + def __init__ (self, bands, options): + """Constructor + + Parameters: + ----------- + bands: str + string containg valid bands, like 'riz' or 'griz' + options: dict + options come through here. Valid keys are listed as valid_options + class variable. 
+ + Note: + ----- + Valid options are: + 'bins' - number of tomographic bins + + """ + self.bands = bands + self.opt = options + + def load_data(fname, take_colors=True, cutoff=0.0): + n_bins = self.opt['bins'] + data = h5py.File(fname, 'r') + r_mag = data['r_mag'] + g_mag = data['g_mag'] + i_mag = data['i_mag'] + z_mag = data['z_mag'] + redshift = data['redshift_true'] + all_mags = np.vstack([g_mag, r_mag, i_mag, z_mag]) + all_mags = all_mags.T + mask = (all_mags != np.inf).all(axis=1) + all_mags = all_mags[mask,:] + redshift = redshift[mask] + gr_color = all_mags[:,0] - all_mags[:,1] + ri_color = all_mags[:,1] - all_mags[:,2] + iz_color = all_mags[:,2] - all_mags[:,3] + all_colors = np.vstack([gr_color, ri_color, iz_color]) + all_colors = all_colors.T + p = np.linspace(0, 100, n_bins+1) + z_edges = np.percentile(redshift, p) + train_bin = np.zeros(all_mags.shape[0]) + for i in range(n_bins): + z_low = z_edges[i] + z_high = z_edges[i+1] + train_bin[(redshift > z_low) & (redshift <= z_high)] = i + if cutoff != 0.0: + cut = np.random.uniform(0, 1, all_mags.shape[0]) < cutoff + train_bin = train_bin[cut].reshape(-1,1) + all_mags = all_mags[cut] + all_colors = all_colors[cut] + redshift = redshift[cut] + else: + train_bin = train_bin.reshape(-1,1) + if take_colors: + return np.hstack([all_mags, all_colors]), redshift, train_bin.astype(int), z_edges + else: + return mags, redshift, train_bin.astype(int), z_edges + + + def train (self, training_data, training_z): + """Trains the classifier + + Parameters: + ----------- + training_data: numpy array, size Ngalaxes x Nbands + training data, each row is a galaxy, each column is a band as per + band defined above + training_z: numpy array, size Ngalaxies + true redshift for the training sample + + """ + + n_bin = self.opt['bins'] + print("Finding bins for training data") + # Now put the training data into redshift bins. 
+ # Use zero so that the one object with minimum + # z in the whole survey will be in the lowest bin + training_bin = np.zeros(training_z.size) + + # Find the edges that split the redshifts into n_z bins of + # equal number counts in each + p = np.linspace(0, 100, n_bin + 1) + z_edges = np.percentile(training_z, p) + + # Now find all the objects in each of these bins + for i in range(n_bin): + z_low = z_edges[i] + z_high = z_edges[i + 1] + training_bin[(training_z > z_low) & (training_z < z_high)] = i + #cut = np.random.uniform(0, 1, training_z.size) < 0.05 + #training_data = training_data[cut] + #training_bin = training_bin[cut] + + + inp = keras.layers.Input(shape=(training_data.shape[1], 1)) + x = keras.layers.Conv1D(32, 5, padding='same', activation='relu')(inp) + x = keras.layers.Conv1D(512, 5, padding='same', activation='relu')(x) + x = keras.layers.Conv1D(128, 5, padding='same', activation='relu')(x) + + x = keras.layers.Bidirectional(keras.layers.LSTM(128, return_sequences=True), merge_mode='concat')(x) + x = keras.layers.Bidirectional(keras.layers.LSTM(128, return_sequences=False), merge_mode='concat')(x) + + x = keras.layers.Dense(512)(x) + x = keras.layers.BatchNormalization()(x) + x = keras.layers.Activation('relu')(x) + x = keras.layers.Dropout(0.25)(x) + x = keras.layers.Dropout(0.5)(x) + x = keras.layers.Dense(n_bin, activation='softmax')(x) + + model = keras.models.Model(inp, x) + + model.compile(loss='sparse_categorical_crossentropy', optimizer=keras.optimizers.Adam(0.001), metrics=['accuracy']) + + # Can be replaced with any classifier + scaler = MinMaxScaler() + x_train = scaler.fit_transform(training_data) + x_train = np.expand_dims(x_train, axis=-1) + y_train = np.expand_dims(training_bin, axis=-1) + print("Fitting classifier") + # Lots of data, so this will take some time + model.fit(x_train, y_train, epochs=30, verbose=0) + + self.classifier = model + self.z_edges = z_edges + self.scaler = scaler + + + def apply (self, data): + """Applies training to the data. + + Parameters: + ----------- + Data: numpy array, size Ngalaxes x Nbands + testing data, each row is a galaxy, each column is a band as per + band defined above + + Returns: + tomographic_selections: numpy array, int, size Ngalaxies + tomographic selection for galaxies return as bin number for + each galaxy. + """ + x_test = self.scaler.transform(data) + x_test = np.expand_dims(x_test, axis=-1) + preds = self.classifier.predict(x_test) + tomo_bin = np.argmax(preds, axis=1) + return tomo_bin + diff --git a/tomo_challenge/classifiers/conv_ak.py b/tomo_challenge/classifiers/conv_ak.py index 687242c9..6f88faf4 100644 --- a/tomo_challenge/classifiers/conv_ak.py +++ b/tomo_challenge/classifiers/conv_ak.py @@ -2,18 +2,14 @@ Conv1D Classifier This is an example tomographic bin generator using a convolutional 1d neural network. -The initial model was optimized using AutoKeras - +The initial model was optimized using AutoKeras. +We also added a custom data loader we tested. This solution was developed by the Brazilian Center for Physics Research AI 4 Astrophysics team. + Authors: Clecio R. Bom, Bernardo M. Fraga, Gabriel Teixeira and Elizabeth Gonzalez. contact: debom |at |cbpf| dot| br -In our preliminary tests we found a SNR 3X2 of ~1910 for n=10 bins and ~2200 for 14 bins. - -This is an example tomographic bin generator using a convolutional bidirectional LSTM neural network. 
- - Every classifier module needs to: - have construction of the type __init__ (self, bands, options) (see examples below) @@ -21,12 +17,16 @@ train (self, training_data,training_z) apply (self, data). - define valid_options class varible. + +See Classifier Documentation below. """ + from .base import Tomographer import numpy as np import tensorflow as tf from tensorflow import keras from sklearn.preprocessing import MinMaxScaler +import h5py class CNN(Tomographer): """ Convolutional deep classifier """ @@ -57,6 +57,45 @@ class variable. self.bands = bands self.opt = options + def load_data(fname, take_colors=True, cutoff=0.0): + n_bins = self.opt['bins'] + data = h5py.File(fname, 'r') + r_mag = data['r_mag'] + g_mag = data['g_mag'] + i_mag = data['i_mag'] + z_mag = data['z_mag'] + redshift = data['redshift_true'] + all_mags = np.vstack([g_mag, r_mag, i_mag, z_mag]) + all_mags = all_mags.T + mask = (all_mags != np.inf).all(axis=1) + all_mags = all_mags[mask,:] + redshift = redshift[mask] + gr_color = all_mags[:,0] - all_mags[:,1] + ri_color = all_mags[:,1] - all_mags[:,2] + iz_color = all_mags[:,2] - all_mags[:,3] + all_colors = np.vstack([gr_color, ri_color, iz_color]) + all_colors = all_colors.T + p = np.linspace(0, 100, n_bins+1) + z_edges = np.percentile(redshift, p) + train_bin = np.zeros(all_mags.shape[0]) + for i in range(n_bins): + z_low = z_edges[i] + z_high = z_edges[i+1] + train_bin[(redshift > z_low) & (redshift <= z_high)] = i + if cutoff != 0.0: + cut = np.random.uniform(0, 1, all_mags.shape[0]) < cutoff + train_bin = train_bin[cut].reshape(-1,1) + all_mags = all_mags[cut] + all_colors = all_colors[cut] + redshift = redshift[cut] + else: + train_bin = train_bin.reshape(-1,1) + if take_colors: + return np.hstack([all_mags, all_colors]), redshift, train_bin.astype(int), z_edges + else: + return mags, redshift, train_bin.astype(int), z_edges + + def train (self, training_data, training_z): """Trains the classifier diff --git a/tomo_challenge/classifiers/requirements.txt b/tomo_challenge/classifiers/requirements.txt new file mode 100644 index 00000000..7baec943 --- /dev/null +++ b/tomo_challenge/classifiers/requirements.txt @@ -0,0 +1,40 @@ +astropy==4.0.1.post1 +camb==1.1.3 +git+https://github.com/cmbant/camb +click==7.1.2 +cosmosis-standalone==0.2.2 +DynamicTable==0.0.4 +firecrown @ git+https://github.com/LSSTDESC/firecrown/@fd4bb423aac8221ed0826a5921bc5590770f4bcb +gast==0.2.2 +gmpy2==2.1.0b1 +graphviz==0.14 +h5py==2.10.0 +importlib-metadata==1.7.0 +jax==0.1.75 +jax-cosmo==0.1rc6 +jaxlib @ https://storage.googleapis.com/jax-releases/cuda101/jaxlib-0.1.52-cp38-none-manylinux2010_x86_64.whl +joblib==0.15.1 +keract==4.2.2 +keras-tcn==3.1.1 +lightgbm==2.0.12 +matplotlib==3.1.3 +mock==4.0.2 +mpi4py==3.0.3 +numpy==1.18.1 +pandas==0.25.3 +pillow==7.1.2 +progressbar==2.5 +pyccl==2.1.0 +pydot==1.4.1 +PyYAML==5.3.1 +sacc==0.4.2 +schwimmbad==0.3.1 +scikit-learn==0.22.2.post1 +scipy==1.4.1 +sip==4.19.13 +statsmodels==0.11.1 +tensorboard==2.2.2 +tensorflow==2.2.0 +tensorflow-addons==0.10.0 +tensorflow-estimator==2.2.0 +tensorflow-probability==0.10.0 From 88de451928025619d61b2caf4b0b172856c30b8d Mon Sep 17 00:00:00 2001 From: Clecio De Bom <29743719+cdebom@users.noreply.github.com> Date: Mon, 31 Aug 2020 10:53:44 -0300 Subject: [PATCH 06/27] Added Deep Ensemble Model Added Deep Ensemble model, including resnet, autolsm, fcn. 
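
The members (fcn, autolstm, resnet) are combined by simple probability
averaging: each network contributes its softmax output and the final bin is
the argmax of the mean, as done in de_nne.py. A sketch of the combination
step, where the three prediction arrays are hypothetical (Ngalaxies, n_bin)
matrices produced by the member models:

    import numpy as np

    member_preds = [fcn_probs, autolstm_probs, resnet_probs]  # hypothetical softmax outputs
    y_pred = np.mean(member_preds, axis=0)  # average the class probabilities
    tomo_bin = np.argmax(y_pred, axis=1)    # final tomographic bin per galaxy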
--- .../classifiers/DEEP_ENSEMBLE_main.py | 200 ++++++++++++++++++ tomo_challenge/classifiers/de_autolstm.py | 84 ++++++++ tomo_challenge/classifiers/de_fcn.py | 81 +++++++ tomo_challenge/classifiers/de_nne.py | 78 +++++++ tomo_challenge/classifiers/de_resnet.py | 142 +++++++++++++ 5 files changed, 585 insertions(+) create mode 100644 tomo_challenge/classifiers/DEEP_ENSEMBLE_main.py create mode 100644 tomo_challenge/classifiers/de_autolstm.py create mode 100644 tomo_challenge/classifiers/de_fcn.py create mode 100644 tomo_challenge/classifiers/de_nne.py create mode 100644 tomo_challenge/classifiers/de_resnet.py diff --git a/tomo_challenge/classifiers/DEEP_ENSEMBLE_main.py b/tomo_challenge/classifiers/DEEP_ENSEMBLE_main.py new file mode 100644 index 00000000..e1f1c93b --- /dev/null +++ b/tomo_challenge/classifiers/DEEP_ENSEMBLE_main.py @@ -0,0 +1,200 @@ +""" +Deep Ensemble Classifier +This is an example tomographic bin generator using a Deep Ensemble Classifier. +This solution was developed by the Brazilian Center for Physics Research AI 4 Astrophysics team. +Authors: Clecio R. Bom, Gabriel Teixeira, Bernardo M. Fraga, Eduardo Cypriano and Elizabeth Gonzalez. +contact: debom |at |cbpf| dot| br +In our preliminary tests we found a SNR 3X2 of ~1930 for n=10 bins. +Every classifier module needs to: + - have construction of the type + __init__ (self, bands, options) (see examples below) + - implement two functions: + train (self, training_data,training_z) + apply (self, data). + - define valid_options class varible. +""" + +from tomo_challenge.utils.utils import transform_labels +from tomo_challenge.utils.utils import create_directory +from sklearn.preprocessing import MinMaxScaler +import sklearn +import os +from .base import Tomographer +import numpy as np +import sys + + + +class ENSEMBLE(Tomographer): + """ ENSEMBLE Classifier """ + + # valid parameter -- see below + valid_options = ['bins'] + # this settings means arrays will be sent to train and apply instead + # of dictionaries + wants_arrays = True + + def __init__ (self, bands, options): + """Constructor + + Parameters: + ----------- + bands: str + string containg valid bands, like 'riz' or 'griz' + options: dict + options come through here. Valid keys are listed as valid_options + class variable. 
+ dir_out: str + directory for the outputs of training + Note: + ----- + Valid options are: + 'bins' - number of tomographic bins + """ + + self.bands = bands + self.opt = options + self.CLASSIFIERS = ['fcn', 'autolstm', 'resnet'] + + def propare_data(self, training_data, traning_bin): + + x_train = training_data + y_train = traning_bin + + nb_classes = len(np.unique(y_train)) + print(f'n_bins = {nb_classes}') + + + # make the min to zero of labels + y_train = transform_labels(y_train) + + # transform the labels from integers to one hot vectors + enc = sklearn.preprocessing.OneHotEncoder() + enc.fit(y_train.reshape(-1, 1)) + y_train = enc.transform(y_train.reshape(-1, 1)).toarray() + + scaler = MinMaxScaler() + x_train = scaler.fit_transform(x_train) + if len(x_train.shape) == 2: # if univariate + # add a dimension to make it multivariate with one dimension and normalizing + x_train = x_train.reshape((x_train.shape[0], x_train.shape[1], 1)) + + return x_train, y_train, nb_classes, scaler + + def fit_classifier(self, classifier_name, x_train, y_train, nb_classes, output_directory, load_weights=False): + + input_shape = x_train.shape[1:] + + classifier = self.create_classifier(classifier_name, input_shape, nb_classes, output_directory, + load_weights=load_weights) + + classifier.fit(x_train, y_train) + + def create_classifier(self, classifier_name, input_shape, nb_classes, output_directory, verbose=True, + build=True, load_weights=False): + if classifier_name == 'fcn': + from tomo_challenge.classifiers import de_fcn as fcn + return fcn.Classifier_FCN(output_directory, input_shape, nb_classes, verbose, build=build) + + if classifier_name == 'autolstm': + from tomo_challenge.classifiers import de_autolstm as autolstm + return autolstm.Classifier_LSTM(output_directory, input_shape, nb_classes, verbose, build=build) + + if classifier_name == 'resnet': + from tomo_challenge.classifiers import de_resnet as resnet + return resnet.Classifier_RESNET(output_directory, input_shape, nb_classes, verbose, + build=build, load_weights=load_weights) + + def train (self, training_data, training_z): + """Trains the classifier + + Parameters: + ----------- + training_data: numpy array, size Ngalaxes x Nbands + training data, each row is a galaxy, each column is a band as per + band defined above + training_z: numpy array, size Ngalaxies + true redshift for the training sample + """ + + #Create a directory for the results and the models + path_parent = os.getcwd() + dir_out = path_parent+'/ENSEMBLE' + if not os.path.exists(dir_out): + create_directory(dir_out) + self.dir_out = dir_out + + + + n_bin = self.opt['bins'] + print("Finding bins for training data") + # Now put the training data into redshift bins. 
+ # Use zero so that the one object with minimum + # z in the whole survey will be in the lowest bin + training_bin = np.zeros(training_z.size) + + # Find the edges that split the redshifts into n_z bins of + # equal number counts in each + p = np.linspace(0, 100, n_bin + 1) + z_edges = np.percentile(training_z, p) + + # Now find all the objects in each of these bins + for i in range(n_bin): + z_low = z_edges[i] + z_high = z_edges[i + 1] + training_bin[(training_z > z_low) & (training_z < z_high)] = i + + # For speed, it's possible cut down a percentage of original size + #cut = np.random.uniform(0, 1, training_z.size) < 0.05 + #training_data = training_data[cut] + #training_bin = training_bin[cut] + + x_train, y_train, nb_classes, scaler = self.propare_data(training_data=training_data, traning_bin=training_bin) + print("Fitting classifier") + for classifier_name in self.CLASSIFIERS: + print('classifier_name', classifier_name) + + output_directory = self.dir_out +'/' + classifier_name + '/' + create_directory(output_directory) + + print("Training") + + self.fit_classifier(classifier_name=classifier_name, x_train=x_train, y_train=y_train, + nb_classes=nb_classes, output_directory=output_directory) + + create_directory(output_directory + '/DONE') + print('\t\t\t\tDONE') + # Can be replaced with any classifier + + + + self.z_edges = z_edges + self.scaler = scaler + + + def apply (self, data): + """Applies training to the data. + + Parameters: + ----------- + Data: numpy array, size Ngalaxes x Nbands + testing data, each row is a galaxy, each column is a band as per + band defined above + Returns: + tomographic_selections: numpy array, int, size Ngalaxies + tomographic selection for galaxies return as bin number for + each galaxy. + """ + data = self.scaler.transform(data) + if len(data.shape) == 2: # if univariate + # add a dimension to make it multivariate with one dimension and normalizing + data = data.reshape((data.shape[0], data.shape[1], 1)) + + from tomo_challenge.classifiers import de_nne as nne + result_dir = self.dir_out+'/ensemble_result/' + create_directory(self.dir_out+'/ensemble_result/') + ensemble = nne.Classifier_NNE(result_dir) + + preds = ensemble.predict(data) + tomo_bin = np.argmax(preds, axis=1) + return tomo_bin diff --git a/tomo_challenge/classifiers/de_autolstm.py b/tomo_challenge/classifiers/de_autolstm.py new file mode 100644 index 00000000..8cd83fc3 --- /dev/null +++ b/tomo_challenge/classifiers/de_autolstm.py @@ -0,0 +1,84 @@ +# AutoLSTM model for Deep Ensemble +import tensorflow.keras as keras +import numpy as np +import time + +from tomo_challenge.utils.utils import save_logs +from tomo_challenge.utils.utils import calculate_metrics +from tomo_challenge.utils.utils import get_available_gpus + +class Classifier_LSTM: + + def __init__(self, output_directory, input_shape, nb_classes, verbose=True,build=True): + self.output_directory = output_directory + if build == True: + self.model = self.build_model(input_shape, nb_classes) + if(verbose==True): + self.model.summary() + self.verbose = verbose + if 'autolstm' in self.output_directory: + self.model.save_weights(self.output_directory+'model_init.hdf5') + return + + def build_model(self, input_shape, nb_classes): + input_layer = keras.layers.Input(input_shape) + + x = keras.layers.Conv1D(32, 5, padding='same', activation='relu')(input_layer) + x = keras.layers.Conv1D(512, 5, padding='same', activation='relu')(x) + x = keras.layers.Conv1D(128, 5, padding='same', activation='relu')(x) + + x = 
keras.layers.Bidirectional(keras.layers.LSTM(128, return_sequences=True), merge_mode='concat')(x) + x = keras.layers.Bidirectional(keras.layers.LSTM(128, return_sequences=False), merge_mode='concat')(x) + + x = keras.layers.Dense(512)(x) + x = keras.layers.BatchNormalization()(x) + x = keras.layers.Activation('relu')(x) + x = keras.layers.Dropout(0.25)(x) + x = keras.layers.Dropout(0.5)(x) + + output_layer = keras.layers.Dense(nb_classes, activation='softmax')(x) + + model = keras.models.Model(inputs=input_layer, outputs=output_layer) + + model.compile(loss='categorical_crossentropy', optimizer = keras.optimizers.Adam(0.0005), + metrics=['accuracy']) + + reduce_lr = keras.callbacks.ReduceLROnPlateau(monitor='loss', factor=0.5, patience=50, + min_lr=0.0001) + + file_path = self.output_directory+'best_model.hdf5' + + model_checkpoint = keras.callbacks.ModelCheckpoint(filepath=file_path, monitor='loss', + save_best_only=True) + + self.callbacks = [reduce_lr,model_checkpoint] + + return model + + def fit(self, x_train, y_train): + if len(get_available_gpus())==0: + print('error') + exit() + # x_val and y_val are only used to monitor the test loss and NOT for training + batch_size = 512 + nb_epochs = 20 + + mini_batch_size = int(min(x_train.shape[0]/10, batch_size)) + + start_time = time.time() + self.model.fit(x_train, y_train, batch_size=mini_batch_size, epochs=nb_epochs, + verbose=1, callbacks=self.callbacks) + + duration = time.time() - start_time + print(f'Duration - AutoLSTM - Training = {duration:.2f}s') + self.model.save(self.output_directory+'last_model.hdf5') + + keras.backend.clear_session() + + def predict(self, x_test): + model_path = self.output_directory.replace('ensemble_result', 'autolstm') + model_path = model_path + 'best_model.hdf5' + model = keras.models.load_model(model_path) + y_pred = model.predict(x_test) + + return y_pred diff --git a/tomo_challenge/classifiers/de_fcn.py b/tomo_challenge/classifiers/de_fcn.py new file mode 100644 index 00000000..6d1a66ba --- /dev/null +++ b/tomo_challenge/classifiers/de_fcn.py @@ -0,0 +1,81 @@ +# FCN model for Deep Ensemble +import keras +import numpy as np +import time +import sys +from tomo_challenge.utils.utils import get_available_gpus + +class Classifier_FCN: + + def __init__(self, output_directory, input_shape, nb_classes, verbose=True,build=True): + self.output_directory = output_directory + if build == True: + self.model = self.build_model(input_shape, nb_classes) + if(verbose==True): + self.model.summary() + self.verbose = verbose + self.model.save_weights(self.output_directory+'model_init.hdf5') + return + + def build_model(self, input_shape, nb_classes): + input_layer = keras.layers.Input(input_shape) + + conv1 = keras.layers.Conv1D(filters=128, kernel_size=8, padding='same')(input_layer) + conv1 = keras.layers.normalization.BatchNormalization()(conv1) + conv1 = keras.layers.Activation(activation='relu')(conv1) + + conv2 = keras.layers.Conv1D(filters=256, kernel_size=5, padding='same')(conv1) + conv2 = keras.layers.normalization.BatchNormalization()(conv2) + conv2 = keras.layers.Activation('relu')(conv2) + + conv3 = keras.layers.Conv1D(128, kernel_size=3,padding='same')(conv2) + conv3 = keras.layers.normalization.BatchNormalization()(conv3) + conv3 = keras.layers.Activation('relu')(conv3) + + gap_layer = keras.layers.pooling.GlobalAveragePooling1D()(conv3) + + output_layer = keras.layers.Dense(nb_classes, activation='softmax')(gap_layer) + + model = keras.models.Model(inputs=input_layer, outputs=output_layer) + + 
model.compile(loss='categorical_crossentropy', optimizer = keras.optimizers.Adam(), + metrics=['accuracy']) + + reduce_lr = keras.callbacks.ReduceLROnPlateau(monitor='loss', factor=0.5, patience=50, + min_lr=0.0001) + + file_path = self.output_directory+'best_model.hdf5' + + model_checkpoint = keras.callbacks.ModelCheckpoint(filepath=file_path, monitor='loss', + save_best_only=True) + + self.callbacks = [reduce_lr,model_checkpoint] + + return model + + def fit(self, x_train, y_train): + if len(get_available_gpus())==0: + print('error') + exit() + # x_val and y_val are only used to monitor the test loss and NOT for training + batch_size = 512 + nb_epochs = 20 + + mini_batch_size = int(min(x_train.shape[0]/10, batch_size)) + + start_time = time.time() + self.model.fit(x_train, y_train, batch_size=mini_batch_size, epochs=nb_epochs, + verbose=self.verbose, callbacks=self.callbacks) + + self.model.save(self.output_directory+'last_model.hdf5') + duration = time.time() - start_time + print(f'Duration - FCN - Training = {duration:.2f}s') + + keras.backend.clear_session() + + def predict(self, x_test): + model_path = self.output_directory.replace('ensemble_result', 'fcn') + model_path = model_path + 'best_model.hdf5' + model = keras.models.load_model(model_path) + y_pred = model.predict(x_test) + return y_pred diff --git a/tomo_challenge/classifiers/de_nne.py b/tomo_challenge/classifiers/de_nne.py new file mode 100644 index 00000000..b3bd6348 --- /dev/null +++ b/tomo_challenge/classifiers/de_nne.py @@ -0,0 +1,78 @@ +# NNE model for Deep Ensemble +import tensorflow.keras as keras +import numpy as np + +from tomo_challenge.utils.utils import create_directory +from tomo_challenge.utils.utils import check_if_file_exits +import gc +import time +import sys + + +class Classifier_NNE: + def create_classifier(self, model_name, input_shape, nb_classes, output_directory, verbose=False, + build=True, load_weights=False): + if model_name == 'fcn': + from tomo_challenge.classifiers import fcn + return fcn.Classifier_FCN(output_directory, input_shape, nb_classes, verbose, build=build) + if model_name == 'autolstm': + from tomo_challenge.classifiers import autolstm + return autolstm.Classifier_LSTM(output_directory, input_shape, nb_classes, verbose, build=build) + if model_name == 'resnet': + from tomo_challenge.classifiers import resnet + return resnet.Classifier_RESNET(output_directory, input_shape, nb_classes, verbose, + build=build, load_weights=load_weights) + + + def __init__(self, output_directory, verbose=False): + self.classifiers = ['fcn', 'autolstm', 'resnet'] + + self.output_directory = output_directory + create_directory(self.output_directory) + self.verbose = verbose + self.models_dir = output_directory.replace('ensemble_results','classifier') + self.iterations = 1 + + def predict(self, x_test): + # no training since models are pre-trained + start_time = time.time() + + l = 0 + # loop through all classifiers + for model_name in self.classifiers: + + # loop through different initialization of classifiers + + curr_dir = self.models_dir.replace('classifier',model_name) + + model = self.create_classifier(model_name, None, None, + curr_dir, build=False) + print(model_name) + + predictions_file_name = curr_dir+'y_pred.npy' + + print(f"geting predict for {model_name}") + curr_y_pred = model.predict(x_test=x_test) + keras.backend.clear_session() + + np.save(predictions_file_name,curr_y_pred) + + if l == 0: + y_pred = np.zeros(shape=curr_y_pred.shape) + + y_pred = y_pred+curr_y_pred + l+=1 + + + # average 
predictions + y_pred = y_pred / l + + # save predictiosn + np.save(self.output_directory+'y_pred.npy', y_pred) + + duration = time.time() - start_time + + # the creation of this directory means + create_directory(self.output_directory + '/DONE') + + return y_pred diff --git a/tomo_challenge/classifiers/de_resnet.py b/tomo_challenge/classifiers/de_resnet.py new file mode 100644 index 00000000..b26e19fe --- /dev/null +++ b/tomo_challenge/classifiers/de_resnet.py @@ -0,0 +1,142 @@ +# ResNet model +import keras +import numpy as np +import time + +from tomo_challenge.utils.utils import save_logs +from tomo_challenge.utils.utils import calculate_metrics +from tomo_challenge.utils.utils import get_available_gpus + +class Classifier_RESNET: + + def __init__(self, output_directory, input_shape, nb_classes, verbose=True,build=True,load_weights=False): + self.output_directory = output_directory + if build==True: + self.model = self.build_model(input_shape, nb_classes) + if(verbose==True): + self.model.summary() + self.verbose = verbose + if load_weights == True: + self.model.load_weights(self.output_directory + .replace('resnet_augment','resnet') + .replace('TSC_itr_augment_x_10','TSC_itr_10') + +'/model_init.hdf5') + else: + self.model.save_weights(self.output_directory+'model_init.hdf5') + return + + def build_model(self, input_shape, nb_classes): + n_feature_maps = 64 + + input_layer = keras.layers.Input(input_shape) + + # BLOCK 1 + + conv_x = keras.layers.Conv1D(filters=n_feature_maps, kernel_size=8, padding='same')(input_layer) + conv_x = keras.layers.normalization.BatchNormalization()(conv_x) + conv_x = keras.layers.Activation('relu')(conv_x) + + conv_y = keras.layers.Conv1D(filters=n_feature_maps, kernel_size=5, padding='same')(conv_x) + conv_y = keras.layers.normalization.BatchNormalization()(conv_y) + conv_y = keras.layers.Activation('relu')(conv_y) + + conv_z = keras.layers.Conv1D(filters=n_feature_maps, kernel_size=3, padding='same')(conv_y) + conv_z = keras.layers.normalization.BatchNormalization()(conv_z) + + # expand channels for the sum + shortcut_y = keras.layers.Conv1D(filters=n_feature_maps, kernel_size=1, padding='same')(input_layer) + shortcut_y = keras.layers.normalization.BatchNormalization()(shortcut_y) + + output_block_1 = keras.layers.add([shortcut_y, conv_z]) + output_block_1 = keras.layers.Activation('relu')(output_block_1) + + # BLOCK 2 + + conv_x = keras.layers.Conv1D(filters=n_feature_maps*2, kernel_size=8, padding='same')(output_block_1) + conv_x = keras.layers.normalization.BatchNormalization()(conv_x) + conv_x = keras.layers.Activation('relu')(conv_x) + + conv_y = keras.layers.Conv1D(filters=n_feature_maps*2, kernel_size=5, padding='same')(conv_x) + conv_y = keras.layers.normalization.BatchNormalization()(conv_y) + conv_y = keras.layers.Activation('relu')(conv_y) + + conv_z = keras.layers.Conv1D(filters=n_feature_maps*2, kernel_size=3, padding='same')(conv_y) + conv_z = keras.layers.normalization.BatchNormalization()(conv_z) + + # expand channels for the sum + shortcut_y = keras.layers.Conv1D(filters=n_feature_maps*2, kernel_size=1, padding='same')(output_block_1) + shortcut_y = keras.layers.normalization.BatchNormalization()(shortcut_y) + + output_block_2 = keras.layers.add([shortcut_y, conv_z]) + output_block_2 = keras.layers.Activation('relu')(output_block_2) + + # BLOCK 3 + + conv_x = keras.layers.Conv1D(filters=n_feature_maps*2, kernel_size=8, padding='same')(output_block_2) + conv_x = keras.layers.normalization.BatchNormalization()(conv_x) + conv_x = 
keras.layers.Activation('relu')(conv_x) + + conv_y = keras.layers.Conv1D(filters=n_feature_maps*2, kernel_size=5, padding='same')(conv_x) + conv_y = keras.layers.normalization.BatchNormalization()(conv_y) + conv_y = keras.layers.Activation('relu')(conv_y) + + conv_z = keras.layers.Conv1D(filters=n_feature_maps*2, kernel_size=3, padding='same')(conv_y) + conv_z = keras.layers.normalization.BatchNormalization()(conv_z) + + # no need to expand channels because they are equal + shortcut_y = keras.layers.normalization.BatchNormalization()(output_block_2) + + output_block_3 = keras.layers.add([shortcut_y, conv_z]) + output_block_3 = keras.layers.Activation('relu')(output_block_3) + + # FINAL + + gap_layer = keras.layers.GlobalAveragePooling1D()(output_block_3) + + output_layer = keras.layers.Dense(nb_classes, activation='softmax')(gap_layer) + + model = keras.models.Model(inputs=input_layer, outputs=output_layer) + + model.compile(loss='categorical_crossentropy', optimizer=keras.optimizers.Adam(), + metrics=['accuracy']) + + reduce_lr = keras.callbacks.ReduceLROnPlateau(monitor='loss', factor=0.5, patience=50, min_lr=0.0001) + + file_path = self.output_directory+'best_model.hdf5' + + model_checkpoint = keras.callbacks.ModelCheckpoint(filepath=file_path, monitor='loss', + save_best_only=True) + + self.callbacks = [reduce_lr,model_checkpoint] + + return model + + def fit(self, x_train, y_train): + if len(get_available_gpus())==0: + print('error') + exit() + # x_val and y_val are only used to monitor the test loss and NOT for training + batch_size = 512 + nb_epochs = 20 + + mini_batch_size = int(min(x_train.shape[0]/10, batch_size)) + + start_time = time.time() + + self.model.fit(x_train, y_train, batch_size=mini_batch_size, epochs=nb_epochs, + verbose=self.verbose, callbacks=self.callbacks) + + duration = time.time() - start_time + print(f'Duration - ResNet - Training = {duration:.2f}s') + self.model.save(self.output_directory + 'last_model.hdf5') + + + keras.backend.clear_session() + + + def predict(self, x_test): + model_path = self.output_directory.replace('ensemble_result', 'resnet') + model_path = model_path + 'best_model.hdf5' + model = keras.models.load_model(model_path) + y_pred = model.predict(x_test) + return y_pred \ No newline at end of file From ce001b2de7da3c24209f14c6cd4a52a6011362c5 Mon Sep 17 00:00:00 2001 From: Clecio De Bom <29743719+cdebom@users.noreply.github.com> Date: Mon, 31 Aug 2020 10:58:15 -0300 Subject: [PATCH 07/27] Added Keras contrib --- tomo_challenge/classifiers/requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/tomo_challenge/classifiers/requirements.txt b/tomo_challenge/classifiers/requirements.txt index 7baec943..70f7f99d 100644 --- a/tomo_challenge/classifiers/requirements.txt +++ b/tomo_challenge/classifiers/requirements.txt @@ -38,3 +38,4 @@ tensorflow==2.2.0 tensorflow-addons==0.10.0 tensorflow-estimator==2.2.0 tensorflow-probability==0.10.0 +keras_contrib # installed using pip3 install git+https://www.github.com/keras-team/keras-contrib.git From 7e6a7c48e8eb085059ddc6d443d8c2c19c91908f Mon Sep 17 00:00:00 2001 From: Clecio De Bom <29743719+cdebom@users.noreply.github.com> Date: Mon, 31 Aug 2020 14:33:45 -0300 Subject: [PATCH 08/27] small correction --- tomo_challenge/classifiers/de_nne.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tomo_challenge/classifiers/de_nne.py b/tomo_challenge/classifiers/de_nne.py index b3bd6348..d1734f4b 100644 --- a/tomo_challenge/classifiers/de_nne.py +++ 
b/tomo_challenge/classifiers/de_nne.py @@ -13,13 +13,13 @@ class Classifier_NNE: def create_classifier(self, model_name, input_shape, nb_classes, output_directory, verbose=False, build=True, load_weights=False): if model_name == 'fcn': - from tomo_challenge.classifiers import fcn + from tomo_challenge.classifiers import de_fcn as fcn return fcn.Classifier_FCN(output_directory, input_shape, nb_classes, verbose, build=build) if model_name == 'autolstm': - from tomo_challenge.classifiers import autolstm + from tomo_challenge.classifiers import de_autolstm as autolstm return autolstm.Classifier_LSTM(output_directory, input_shape, nb_classes, verbose, build=build) if model_name == 'resnet': - from tomo_challenge.classifiers import resnet + from tomo_challenge.classifiers import de_resnet as resnet return resnet.Classifier_RESNET(output_directory, input_shape, nb_classes, verbose, build=build, load_weights=load_weights) From 16733c94ed54df4ad7a523dcba0d740591f08053 Mon Sep 17 00:00:00 2001 From: Clecio De Bom <29743719+cdebom@users.noreply.github.com> Date: Mon, 31 Aug 2020 16:32:24 -0300 Subject: [PATCH 09/27] Jax/Flax based LSTM Here we optimize for SNR score. --- tomo_challenge/classifiers/lstm_flax.py | 189 ++++++++++++++++++++++++ 1 file changed, 189 insertions(+) create mode 100644 tomo_challenge/classifiers/lstm_flax.py diff --git a/tomo_challenge/classifiers/lstm_flax.py b/tomo_challenge/classifiers/lstm_flax.py new file mode 100644 index 00000000..80245a9c --- /dev/null +++ b/tomo_challenge/classifiers/lstm_flax.py @@ -0,0 +1,189 @@ +""" +Flax LSTM + +This is an example tomographic bin generator using a convolutional LSTM neural network using JAX/FLAX +We also added a custom data loader we tested. +This solution was developed by the Brazilian Center for Physics Research AI 4 Astrophysics team. +Authors: Clecio R. Bom, Bernardo M. Fraga, Gabriel Teixeira, Eduardo Cypriano and Elizabeth Gonzalez. +contact: debom |at |cbpf| dot| br + + +Every classifier module needs to: + - have construction of the type + __init__ (self, bands, options) (see examples below) + - implement two functions: + train (self, training_data,training_z) + apply (self, data). + - define valid_options class varible. + +See Classifier Documentation below. 
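+
+A minimal usage sketch (hypothetical option values; the expected feature
+layout follows our custom loader, i.e. the four griz magnitudes plus the
+three colors):
+
+    clf = Flax_LSTM('griz', {'bins': 10, 'n_feats': 7})
+    clf.train(training_data, training_z)
+    tomo_bin = clf.apply(validation_data)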
+""" + +from .base import Tomographer +import numpy as np +import tensorflow as tf +from tensorflow import keras +from sklearn.preprocessing import MinMaxScaler +import h5py +from flax import nn, optim, jax_utils +import jax_random as rand +import jax_numpy as jnp +import jax +from .tomo_challenge import jax_metrics as j_metrics + +def get_classifier(n_bin, n_features): + class BinningNN(nn.Module): + def apply(self, x): + batch_size = x.shape[0] + net = nn.Conv(x, features=64, kernel_size=(2,), padding='SAME', name='fc1') + net = nn.tanh(net) + net = nn.max_pool(net, window_shape=(2,), strides=(2,), padding='SAME') + net = nn.Conv(net, 128, kernel_size=(2,), padding='SAME', name='fc2') + net = nn.tanh(net) + net = nn.max_pool(net, (2,), strides=(2,), padding='SAME') + carry = nn.LSTMCell.initialize_carry(rand.PRNGKey(0), (batch_size,), 972) + _, outputs = jax_utils.scan_in_dim(nn.LSTMCell.partial(), carry, net, axis=1) + net_dense = outputs.reshape((outputs.shape[0], -1)) + net_dense = nn.Dense(net_dense, 486, name='fc3') + net_dense = nn.tanh(net_dense) + net_dense = nn.Dense(net_dense, 486) + net_dense = nn.tanh(net_dense) + return nn.softmax(nn.Dense(net_dense, n_bin)) + _, initial_params = BinningNN.init_by_shape( rand.PRNGKey(0), [((1, n_features,1), jnp.float32)]) + return nn.Model(BinningNN, initial_params) + +class Flax_LSTM(Tomographer): + """ LSTM using flax, optimising for SNR """ + + # valid parameter -- see below + valid_options = ['bins', 'n_feats'] + # this settings means arrays will be sent to train and apply instead + # of dictionaries + wants_arrays = True + + def __init__ (self, bands, options): + """Constructor + + Parameters: + ----------- + bands: str + string containg valid bands, like 'riz' or 'griz' + options: dict + options come through here. Valid keys are listed as valid_options + class variable. 
+
+        Note:
+        -----
+        Valid options are:
+            'bins' - number of tomographic bins
+            'n_feats' - number of features
+
+        """
+        self.bands = bands
+        self.opt = options
+        self.n_bins = options['bins']
+        self.n_features = options['n_feats']
+        self.model = get_classifier(self.n_bins, self.n_features)
+        self.scaler = MinMaxScaler()
+
+    def load_data(self, fname, take_colors=True, cutoff=0.0):
+
+        data = h5py.File(fname, 'r')
+        r_mag = data['r_mag']
+        g_mag = data['g_mag']
+        i_mag = data['i_mag']
+        z_mag = data['z_mag']
+        redshift = data['redshift_true']
+        all_mags = np.vstack([g_mag, r_mag, i_mag, z_mag])
+        all_mags = all_mags.T
+        mask = (all_mags != np.inf).all(axis=1)
+        all_mags = all_mags[mask,:]
+        redshift = redshift[mask]
+        gr_color = all_mags[:,0] - all_mags[:,1]
+        ri_color = all_mags[:,1] - all_mags[:,2]
+        iz_color = all_mags[:,2] - all_mags[:,3]
+        all_colors = np.vstack([gr_color, ri_color, iz_color])
+        all_colors = all_colors.T
+        p = np.linspace(0, 100, self.n_bins+1)
+        z_edges = np.percentile(redshift, p)
+        train_bin = np.zeros(all_mags.shape[0])
+        for i in range(self.n_bins):
+            z_low = z_edges[i]
+            z_high = z_edges[i+1]
+            train_bin[(redshift > z_low) & (redshift <= z_high)] = i
+        if cutoff != 0.0:
+            cut = np.random.uniform(0, 1, all_mags.shape[0]) < cutoff
+            train_bin = train_bin[cut].reshape(-1,1)
+            all_mags = all_mags[cut]
+            all_colors = all_colors[cut]
+            redshift = redshift[cut]
+        else:
+            train_bin = train_bin.reshape(-1,1)
+        if take_colors:
+            return np.hstack([all_mags, all_colors]), redshift, train_bin.astype(int), z_edges
+        else:
+            return all_mags, redshift, train_bin.astype(int), z_edges
+
+
+
+    def train(self, training_data, training_z, batch_size=512, epochs=20):
+
+        x_train = self.scaler.fit_transform(training_data)
+        x_train = np.expand_dims(x_train, axis=-1)
+        lr = 0.001
+        optimizer = optim.Adam(learning_rate=lr).create(self.model)
+
+        @jax.jit
+        def train_step(optimizer, x, y):
+            # This is the loss function
+            def loss_fn(model):
+                # Apply classifier to features
+                w = model(x)
+                return 1000./ j_metrics.compute_snr_score(w, y)
+            # Compute gradients
+            loss, g = jax.value_and_grad(loss_fn)(optimizer.target)
+            # Perform gradient descent
+            optimizer = optimizer.apply_gradient(g)
+            return optimizer, loss
+
+        def get_batches():
+            train_dataset = tf.data.Dataset.from_tensor_slices((x_train, training_z))
+            train_dataset = train_dataset.shuffle(buffer_size=2048).batch(batch_size)
+            return train_dataset
+
+        losses = []
+        for e in range(epochs):
+            for i, (x_batch, labels) in enumerate(get_batches().as_numpy_iterator()):
+                optimizer, loss = train_step(optimizer, x_batch, labels)
+                losses.append(loss)
+
+            print('Epoch {}\nLoss = {}'.format(e, loss))
+
+
+        self.model = optimizer.target
+
+
+    def apply (self, data):
+        """Applies training to the data.
+
+        Parameters:
+        -----------
+        data: numpy array, size Ngalaxies x Nbands
+            testing data, each row is a galaxy, each column is a band as per
+            band defined above
+
+        Returns:
+        tomographic_selections: numpy array, int, size Ngalaxies
+            tomographic selection for galaxies returned as bin number for
+            each galaxy.
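+
+        Example (hypothetical shapes): an input array of shape
+        (Ngalaxies, n_feats) yields an integer array of shape (Ngalaxies,)
+        with values in [0, bins).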
+ """ + x_test = self.scaler.transform(data) + x_test = np.expand_dims(x_test, axis=-1) + + bs = 512 + s = len(x_test) + weights = np.concatenate([self.model(x_test[bs*i:min((bs*(i+1)), s)]) for i in range(s//bs + 1)]) + + tomo_bin = weights.argmax(axis=1) + return tomo_bin + From 2b803181677a5ae65dd994ead19882fab808d115 Mon Sep 17 00:00:00 2001 From: Clecio De Bom <29743719+cdebom@users.noreply.github.com> Date: Fri, 4 Sep 2020 21:01:44 -0300 Subject: [PATCH 10/27] Jax/Flax Deep Ensemble --- .../classifiers/Deep_Ensemble_jax.py | 201 ++++++++++++++++++ tomo_challenge/classifiers/autolstm_jax.py | 87 ++++++++ tomo_challenge/classifiers/nne_jax.py | 77 +++++++ tomo_challenge/classifiers/resnet_jax.py | 118 ++++++++++ 4 files changed, 483 insertions(+) create mode 100644 tomo_challenge/classifiers/Deep_Ensemble_jax.py create mode 100644 tomo_challenge/classifiers/autolstm_jax.py create mode 100644 tomo_challenge/classifiers/nne_jax.py create mode 100644 tomo_challenge/classifiers/resnet_jax.py diff --git a/tomo_challenge/classifiers/Deep_Ensemble_jax.py b/tomo_challenge/classifiers/Deep_Ensemble_jax.py new file mode 100644 index 00000000..527cd30d --- /dev/null +++ b/tomo_challenge/classifiers/Deep_Ensemble_jax.py @@ -0,0 +1,201 @@ +""" +Deep Ensemble Classifier +This is an example tomographic bin generator using a Deep Ensemble Classifier with JAX/FLAX capable of optimizing to multiple losses including SNR/FOM +This solution was developed by the Brazilian Center for Physics Research AI 4 Astrophysics team. +Authors: Clecio R. Bom, Bernardo M. Fraga, Gabriel Teixeira, Eduardo Cypriano and Elizabeth Gonzalez. +contact: debom |at |cbpf| dot| br +Every classifier module needs to: + - have construction of the type + __init__ (self, bands, options) (see examples below) + - implement two functions: + train (self, training_data,training_z) + apply (self, data). + - define valid_options class varible. +""" + +from tomo_challenge.utils.utils import transform_labels +from tomo_challenge.utils.utils import create_directory +from sklearn.preprocessing import MinMaxScaler +import sklearn +import os +from .base import Tomographer +import numpy as np +from flax import nn, optim, serialization, jax_utils +import sys + + + +class ENSEMBLE(Tomographer): + """ ENSEMBLE Classifier """ + + # valid parameter -- see below + valid_options = ['bins'] + # this settings means arrays will be sent to train and apply instead + # of dictionaries + wants_arrays = True + + def __init__ (self, bands, options): + """Constructor + + Parameters: + ----------- + bands: str + string containg valid bands, like 'riz' or 'griz' + options: dict + options come through here. Valid keys are listed as valid_options + class variable. 
+ dir_out: str + directory for the outputs of training + Note: + ----- + Valid options are: + 'bins' - number of tomographic bins + """ + + self.bands = bands + self.opt = options + self.CLASSIFIERS = ['resnet_jax', 'autolstm_jax'] + + def propare_data(self, training_data, traning_bin): + + x_train = training_data + y_train = traning_bin + + nb_classes = len(np.unique(y_train)) + print(f'n_bins = {nb_classes}') + + + # make the min to zero of labels + y_train = transform_labels(y_train) + + # transform the labels from integers to one hot vectors + enc = sklearn.preprocessing.OneHotEncoder() + enc.fit(y_train.reshape(-1, 1)) + y_train = enc.transform(y_train.reshape(-1, 1)).toarray() + + scaler = MinMaxScaler() + x_train = scaler.fit_transform(x_train) + if len(x_train.shape) == 2: # if univariate + # add a dimension to make it multivariate with one dimension and normalizing + x_train = x_train.reshape((x_train.shape[0], x_train.shape[1], 1)) + + return x_train, y_train, nb_classes, scaler + + def fit_classifier(self, classifier_name, x_train, z_train, nb_classes, output_directory, load_weights=False): + + input_shape = x_train.shape[1:] + + if classifier_name == 'resnet_jax': + from tomo_challenge.classifiers import resnet_jax + # resnet = resnet_jax.Resnet(nb_classes) + resnet = resnet_jax.Resnet(n_bins=nb_classes) + _, model = resnet_jax.train(resnet,x_train.shape[1], nb_classes, x_train, z_train) + self.model_res = model + else: + from tomo_challenge.classifiers import autolstm_jax + _, model = autolstm_jax.train(x_train.shape[1], nb_classes, x_train, z_train) + self.model_auto = model + + def create_classifier(self, classifier_name, input_shape, nb_classes, output_directory, verbose=True, + build=True, load_weights=False): + pass + # if classifier_name == 'resnet_jax': + # from tomo_challenge.classifiers import resnet_jax + # return 0#resnet_jax.Resnet(nb_classes) + # if classifier_name == 'autolstm_jax': + # from tomo_challenge.classifiers import autolstm_jax + # return autolstm_jax.AutoLSTM(nb_classes) + + def train (self, training_data, training_z): + """Trains the classifier + + Parameters: + ----------- + training_data: numpy array, size Ngalaxes x Nbands + training data, each row is a galaxy, each column is a band as per + band defined above + training_z: numpy array, size Ngalaxies + true redshift for the training sample + """ + + #Create a directory for the results and the models + path_parent = os.getcwd() + dir_out = path_parent+'/ENSEMBLE' + if not os.path.exists(dir_out): + create_directory(dir_out) + self.dir_out = dir_out + + + + n_bin = self.opt['bins'] + print("Finding bins for training data") + # Now put the training data into redshift bins. 
+ # Use zero so that the one object with minimum + # z in the whole survey will be in the lowest bin + training_bin = np.zeros(training_z.size) + + # Find the edges that split the redshifts into n_z bins of + # equal number counts in each + p = np.linspace(0, 100, n_bin + 1) + z_edges = np.percentile(training_z, p) + + # Now find all the objects in each of these bins + for i in range(n_bin): + z_low = z_edges[i] + z_high = z_edges[i + 1] + training_bin[(training_z > z_low) & (training_z < z_high)] = i + + # For speed, it's possible cut down a percentage of original size + #cut = np.random.uniform(0, 1, training_z.size) < 0.05 + #training_data = training_data[cut] + #training_bin = training_bin[cut] + + x_train, y_train, nb_classes, scaler = self.propare_data(training_data=training_data, traning_bin=training_bin) + print("Fitting classifier") + for classifier_name in self.CLASSIFIERS: + print('classifier_name', classifier_name) + + output_directory = self.dir_out +'/' + classifier_name + '/' + create_directory(output_directory) + + print("Training") + + self.fit_classifier(classifier_name=classifier_name, x_train=x_train, z_train=training_z, + nb_classes=nb_classes, output_directory=output_directory) + + create_directory(output_directory + '/DONE') + print('\t\t\t\tDONE') + # Can be replaced with any classifier + + + + self.z_edges = z_edges + self.scaler = scaler + + + def apply (self, data): + """Applies training to the data. + + Parameters: + ----------- + Data: numpy array, size Ngalaxes x Nbands + testing data, each row is a galaxy, each column is a band as per + band defined above + Returns: + tomographic_selections: numpy array, int, size Ngalaxies + tomographic selection for galaxies return as bin number for + each galaxy. + """ + data = self.scaler.transform(data) + if len(data.shape) == 2: # if univariate + # add a dimension to make it multivariate with one dimension and normalizing + data = data.reshape((data.shape[0], data.shape[1], 1)) + + from tomo_challenge.classifiers import nne + result_dir = self.dir_out+'/ensemble_result/' + create_directory(self.dir_out+'/ensemble_result/') + ensemble = nne.Classifier_NNE(result_dir) + + preds = ensemble.predict(data, self.model_res, self.model_auto) + tomo_bin = np.argmax(preds, axis=1) + return tomo_bin diff --git a/tomo_challenge/classifiers/autolstm_jax.py b/tomo_challenge/classifiers/autolstm_jax.py new file mode 100644 index 00000000..4a1ae5cd --- /dev/null +++ b/tomo_challenge/classifiers/autolstm_jax.py @@ -0,0 +1,87 @@ +import tomo_challenge.jax_metrics as j_metrics +from flax import nn, optim, serialization, jax_utils +import jax.random as rand +import jax.numpy as jnp +import jax + +class AutoLSTM(nn.Module): + def __init__(self, n_bins): + self.bins = n_bins + def apply(self, x): + batch_size = x.shape[0] + net = nn.Conv(x, features=32, kernel_size=(5,), padding='SAME') + net = nn.leaky_relu(net) + net = nn.Conv(net, 512, kernel_size=(5,), padding='SAME') + net = nn.leaky_relu(net) + net = nn.Conv(net, 128, kernel_size=(5,), padding='SAME') + net = nn.leaky_relu(net) + + carry = nn.LSTMCell.initialize_carry(rand.PRNGKey(0), (batch_size,), 128) + new_carry, output_1 = jax_utils.scan_in_dim(nn.LSTMCell.partial(), carry, net, axis=1) + _, output_2 = jax_utils.scan_in_dim(nn.LSTMCell.partial(), new_carry, output_1, axis=1) + + net_dense = output_2.reshape((output_2.shape[0], -1)) + net_dense = nn.Dense(net_dense, 512) + net_dense = nn.BatchNorm(net_dense) + net_dense = nn.leaky_relu(net_dense) + + net_dense = 
nn.dropout(net_dense, 0.25) + + net_dense = nn.dropout(net_dense, 0.5) + return nn.softmax(nn.Dense(gap, self.bins)) + + + +def get_classifier(n_features, n_bins): + with nn.stochastic(rand.PRNGKey(0)): + _, initial_params = AutoLSTM(n_bins).init_by_shape( rand.PRNGKey(1), [((1, n_features,1), jnp.float32)]) + return nn.Model(AutoLSTM(n_bins), initial_params) + +def get_batches(features, redshift, batch_size, train=True): + if train: + dataset = tf.data.Dataset.from_tensor_slices((features, redshift)) + else: + dataset = tf.data.Dataset.from_tensor_slices(features) + dataset = dataset.shuffle(buffer_size=2048).batch(batch_size) + return dataset + +def train(n_features, n_bins, training_data, training_z, batch_size=20, epochs=20, lr=0.001): + model = get_classifier(n_features, n_bins) + optimizer = optim.Adam(learning_rate=lr).create(model) + + @jax.jit + def train_step(optimizer, x, y): + # This is the loss function + def loss_fn(model): + # Apply classifier to features + w = model(x) + # returns - score, because we want to maximize score + return 1000./ j_metrics.compute_snr_score(w, y) + # Compute gradients + loss, g = jax.value_and_grad(loss_fn)(optimizer.target) + # Perform gradient descent + optimizer = optimizer.apply_gradient(g) + return optimizer, loss + + + losses = [] + with nn.stochastic(rand.PRNGKey(0)): + for e in range(epochs): + print('starting epoch {}'.format(e)) + batches = get_batches(training_data, training_z, batch_size) + for i, (x_train, labels) in enumerate(batches.as_numpy_iterator()): + optimizer, loss = train_step(optimizer, x_train, labels) + losses.append(loss) + print('Epoch {}\nLoss = {}'.format(e, loss)) + model = optimizer.target + return losses, model + +def predict(model, data, batch_size): + batches = get_batches(data, 0., train=False) + preds = [] + with nn.stochastic(rand.PRNGKey(0)): + for i, x_test in enumerate(batches.as_numpy_iterator()): + p = model(x_test) + preds.append(p) + result = np.concatenate([p for p in preds], axis=0) + return result diff --git a/tomo_challenge/classifiers/nne_jax.py b/tomo_challenge/classifiers/nne_jax.py new file mode 100644 index 00000000..4440caba --- /dev/null +++ b/tomo_challenge/classifiers/nne_jax.py @@ -0,0 +1,77 @@ +import tensorflow.keras as keras +import numpy as np + +from tomo_challenge.utils.utils import create_directory +from tomo_challenge.utils.utils import check_if_file_exits +import gc +import time +import sys + + +class Classifier_NNE: + def create_classifier(self, model_name, input_shape, nb_classes, output_directory, verbose=False, + build=True, load_weights=False): + if model_name == 'fcn': + from tomo_challenge.classifiers import fcn + return fcn.Classifier_FCN(output_directory, input_shape, nb_classes, verbose, build=build) + if model_name == 'autolstm': + from tomo_challenge.classifiers import autolstm + return autolstm.Classifier_LSTM(output_directory, input_shape, nb_classes, verbose, build=build) + if model_name == 'resnet': + from tomo_challenge.classifiers import resnet + return resnet.Classifier_RESNET(output_directory, input_shape, nb_classes, verbose, + build=build, load_weights=load_weights) + + + def __init__(self, output_directory, verbose=False): + self.classifiers = ['fcn', 'autolstm', 'resnet'] + + self.output_directory = output_directory + create_directory(self.output_directory) + self.verbose = verbose + self.models_dir = output_directory.replace('ensemble_results','classifier') + self.iterations = 1 + + def predict(self, x_test): + # no training since models are pre-trained + 
start_time = time.time()
+
+        l = 0
+        # loop through all classifiers
+        for model_name in self.classifiers:
+
+            # locate the pre-trained model for this classifier
+
+            curr_dir = self.models_dir.replace('classifier',model_name)
+
+            model = self.create_classifier(model_name, None, None,
+                                           curr_dir, build=False)
+            print(model_name)
+
+            predictions_file_name = curr_dir+'y_pred.npy'
+
+            print(f"getting predictions for {model_name}")
+            curr_y_pred = model.predict(x_test=x_test)
+            keras.backend.clear_session()
+
+            np.save(predictions_file_name,curr_y_pred)
+
+            if l == 0:
+                y_pred = np.zeros(shape=curr_y_pred.shape)
+
+            y_pred = y_pred+curr_y_pred
+            l+=1
+
+
+        # average predictions
+        y_pred = y_pred / l
+
+        # save predictions
+        np.save(self.output_directory+'y_pred.npy', y_pred)
+
+        duration = time.time() - start_time
+
+        # the creation of this directory means the ensemble run is done
+        create_directory(self.output_directory + '/DONE')
+
+        return y_pred
diff --git a/tomo_challenge/classifiers/resnet_jax.py b/tomo_challenge/classifiers/resnet_jax.py
new file mode 100644
index 00000000..5631cb09
--- /dev/null
+++ b/tomo_challenge/classifiers/resnet_jax.py
@@ -0,0 +1,118 @@
+import tomo_challenge.jax_metrics as j_metrics
+from flax import nn, optim, serialization, jax_utils
+import jax.random as rand
+import jax.numpy as jnp
+import jax
+
+class Resnet(nn.Module):
+    def __init__(self, n_bins):
+        self.bins = n_bins
+    def apply(self, x):
+        #1st block
+        conv_x = nn.Conv(x, features=64, kernel_size=(8,), padding='SAME')
+        conv_x = nn.BatchNorm(conv_x)
+        conv_x = nn.leaky_relu(conv_x)
+
+        conv_y = nn.Conv(conv_x, 64, kernel_size=(5,), padding='SAME')
+        conv_y = nn.BatchNorm(conv_y)
+        conv_y = nn.leaky_relu(conv_y)
+
+        conv_z = nn.Conv(conv_y, 64, kernel_size=(3,), padding='SAME')
+        conv_z = nn.BatchNorm(conv_z)
+
+        short_y = nn.Conv(x, 64, kernel_size=(1,), padding='SAME')
+        short_y = nn.BatchNorm(short_y)
+
+        output_1 = nn.leaky_relu(short_y + conv_z)
+
+        #2nd block
+        conv_x = nn.Conv(output_1, features=64*2, kernel_size=(8,), padding='SAME')
+        conv_x = nn.BatchNorm(conv_x)
+        conv_x = nn.leaky_relu(conv_x)
+
+        conv_y = nn.Conv(conv_x, 64*2, kernel_size=(5,), padding='SAME')
+        conv_y = nn.BatchNorm(conv_y)
+        conv_y = nn.leaky_relu(conv_y)
+
+        conv_z = nn.Conv(conv_y, 64*2, kernel_size=(3,), padding='SAME')
+        conv_z = nn.BatchNorm(conv_z)
+
+        short_y = nn.Conv(output_1, 64*2, kernel_size=(1,), padding='SAME')
+        short_y = nn.BatchNorm(short_y)
+
+        output_2 = nn.leaky_relu(short_y + conv_z)
+
+        #3rd block
+        conv_x = nn.Conv(output_2, features=64*2, kernel_size=(8,), padding='SAME')
+        conv_x = nn.BatchNorm(conv_x)
+        conv_x = nn.leaky_relu(conv_x)
+
+        conv_y = nn.Conv(conv_x, 64*2, kernel_size=(5,), padding='SAME')
+        conv_y = nn.BatchNorm(conv_y)
+        conv_y = nn.leaky_relu(conv_y)
+
+        conv_z = nn.Conv(conv_y, 64*2, kernel_size=(3,), padding='SAME')
+        conv_z = nn.BatchNorm(conv_z)
+
+        short_y = nn.BatchNorm(output_2)
+
+        output_3 = nn.leaky_relu(short_y + conv_z)
+
+        gap = jnp.mean(output_3, axis=1)
+        return nn.softmax(nn.Dense(gap, self.bins))
+
+
+
+
+
+def get_batches(features, redshift, batch_size, train=True):
+    if train:
+        dataset = tf.data.Dataset.from_tensor_slices((features, redshift))
+    else:
+        dataset = tf.data.Dataset.from_tensor_slices(features)
+    dataset = dataset.shuffle(buffer_size=2048).batch(batch_size)
+    return dataset
+
+def train(resnet,n_features, n_bins, training_data, training_z, batch_size=20, epochs=20, lr=0.001):
+    def get_classifier(n_features, n_bins):
+        _, initial_params = resnet.init_by_shape(rand.PRNGKey(0), 
[((1, n_features, 1), jnp.float32)]) + return nn.Model(resnet, initial_params) + + model = get_classifier(n_features, n_bins) + optimizer = optim.Adam(learning_rate=lr).create(model) + + @jax.jit + def train_step(optimizer, x, y): + # This is the loss function + def loss_fn(model): + # Apply classifier to features + w = model(x) + # returns - score, because we want to maximize score + return 1000./ j_metrics.compute_snr_score(w, y) + # Compute gradients + loss, g = jax.value_and_grad(loss_fn)(optimizer.target) + # Perform gradient descent + optimizer = optimizer.apply_gradient(g) + return optimizer, loss + + + losses = [] + + for e in range(epochs): + print('starting epoch {}'.format(e)) + batches = get_batches(training_data, training_z, batch_size) + for i, (x_train, labels) in enumerate(batches.as_numpy_iterator()): + optimizer, loss = train_step(optimizer, x_train, labels) + losses.append(loss) + print('Epoch {}\nLoss = {}'.format(e, loss)) + model = optimizer.target + return losses, model + +def predict(model, data, batch_size): + batches = get_batches(data, 0., train=False) + preds = [] + for i, x_test in enumerate(batches.as_numpy_iterator()): + p = model(x_test) + preds.append(p) + result = np.concatenate([p for p in preds], axis=0) + return result From 4f2d24917018dccb43f90d71875548cb5d6e67d2 Mon Sep 17 00:00:00 2001 From: Clecio De Bom <29743719+cdebom@users.noreply.github.com> Date: Mon, 14 Sep 2020 22:03:42 -0300 Subject: [PATCH 11/27] Adding example of some classifiers --- example/example_autolstm.yaml | 24 ++++++++++++++++++++++++ example/example_cnn.yaml | 24 ++++++++++++++++++++++++ example/example_lstm.yaml | 24 ++++++++++++++++++++++++ example/example_lstm_flax.yaml | 24 ++++++++++++++++++++++++ example/example_tcn.yaml | 24 ++++++++++++++++++++++++ 5 files changed, 120 insertions(+) create mode 100644 example/example_autolstm.yaml create mode 100644 example/example_cnn.yaml create mode 100644 example/example_lstm.yaml create mode 100644 example/example_lstm_flax.yaml create mode 100644 example/example_tcn.yaml diff --git a/example/example_autolstm.yaml b/example/example_autolstm.yaml new file mode 100644 index 00000000..0d8b2e8d --- /dev/null +++ b/example/example_autolstm.yaml @@ -0,0 +1,24 @@ +metrics: [SNR_3x2, FOM_3x2] +bands: griz +training_file: data/training.hdf5 +validation_file: data/validation.hdf5 +output_file: example/autolstm_10bins.txt +metrics_impl: jax-cosmo +loader: custom + +run: + # This is a class name which will be looked up + #RandomForest: + Autokeras_LSTM: + run_3: + # This setting is sent to the classifier + bins: 10 + # These special settings decide whether the + # color and error colums are passed to the classifier + # as well as the magnitudes + colors: True + errors: False + + # IBandOnly: + # run_3: + # bins: 3 diff --git a/example/example_cnn.yaml b/example/example_cnn.yaml new file mode 100644 index 00000000..38d2bce6 --- /dev/null +++ b/example/example_cnn.yaml @@ -0,0 +1,24 @@ +metrics: [SNR_3x2, FOM_3x2] +bands: griz +training_file: data/training.hdf5 +validation_file: data/validation.hdf5 +output_file: example/cnn_10bins.txt +metrics_impl: jax-cosmo +loader: custom + +run: + # This is a class name which will be looked up + #RandomForest: + CNN: + run_3: + # This setting is sent to the classifier + bins: 10 + # These special settings decide whether the + # color and error colums are passed to the classifier + # as well as the magnitudes + colors: True + errors: False + + # IBandOnly: + # run_3: + # bins: 3 diff --git 
a/example/example_lstm.yaml b/example/example_lstm.yaml new file mode 100644 index 00000000..18477deb --- /dev/null +++ b/example/example_lstm.yaml @@ -0,0 +1,24 @@ +metrics: [SNR_3x2, FOM_3x2] +bands: griz +training_file: data/training.hdf5 +validation_file: data/validation.hdf5 +output_file: example/bidirectional_lstm_10bins.txt +metrics_impl: jax-cosmo +loader: custom + +run: + # This is a class name which will be looked up + #RandomForest: + LSTM: + run_3: + # This setting is sent to the classifier + bins: 10 + # These special settings decide whether the + # color and error colums are passed to the classifier + # as well as the magnitudes + colors: True + errors: False + + # IBandOnly: + # run_3: + # bins: 3 diff --git a/example/example_lstm_flax.yaml b/example/example_lstm_flax.yaml new file mode 100644 index 00000000..6a62768c --- /dev/null +++ b/example/example_lstm_flax.yaml @@ -0,0 +1,24 @@ +metrics: [SNR_3x2, FOM_3x2] +bands: griz +training_file: data/training.hdf5 +validation_file: data/validation.hdf5 +output_file: example/flax_lstm_10bins.txt +metrics_impl: jax-cosmo +loader: custom + +run: + # This is a class name which will be looked up + #RandomForest: + Flax_LSTM: + run_3: + # This setting is sent to the classifier + bins: 10 + # These special settings decide whether the + # color and error colums are passed to the classifier + # as well as the magnitudes + colors: True + errors: False + + # IBandOnly: + # run_3: + # bins: 3 diff --git a/example/example_tcn.yaml b/example/example_tcn.yaml new file mode 100644 index 00000000..44a5c9ee --- /dev/null +++ b/example/example_tcn.yaml @@ -0,0 +1,24 @@ +metrics: [SNR_3x2, FOM_3x2] +bands: griz +training_file: data/training.hdf5 +validation_file: data/validation.hdf5 +output_file: example/tcn_10bins.txt +metrics_impl: jax-cosmo +loader: custom + +run: + # This is a class name which will be looked up + #RandomForest: + TCN: + run_3: + # This setting is sent to the classifier + bins: 10 + # These special settings decide whether the + # color and error colums are passed to the classifier + # as well as the magnitudes + colors: True + errors: False + + # IBandOnly: + # run_3: + # bins: 3 From 1d45a33b3a9929b8b0841a2b27aa8e8d57e858b3 Mon Sep 17 00:00:00 2001 From: Clecio De Bom <29743719+cdebom@users.noreply.github.com> Date: Mon, 14 Sep 2020 22:10:48 -0300 Subject: [PATCH 12/27] Fixes in the loader --- tomo_challenge/classifiers/LSTM_bidirectional.py | 10 +--------- tomo_challenge/classifiers/TCN.py | 7 +------ tomo_challenge/classifiers/autokeras_LSTM.py | 7 +------ tomo_challenge/classifiers/conv_ak.py | 7 +------ 4 files changed, 4 insertions(+), 27 deletions(-) diff --git a/tomo_challenge/classifiers/LSTM_bidirectional.py b/tomo_challenge/classifiers/LSTM_bidirectional.py index 6fca9cab..aa857066 100644 --- a/tomo_challenge/classifiers/LSTM_bidirectional.py +++ b/tomo_challenge/classifiers/LSTM_bidirectional.py @@ -1,17 +1,10 @@ """ Bidirectional LSTM Classifier - This is an example tomographic bin generator using a convolutional bidirectional LSTM neural network. We also added a custom data loader we tested. This solution was developed by the Brazilian Center for Physics Research AI 4 Astrophysics team. - Authors: Clecio R. Bom, Bernardo M. Fraga, Gabriel Teixeira and Elizabeth Gonzalez. - contact: debom |at |cbpf| dot| br - - - - Every classifier module needs to: - have construction of the type __init__ (self, bands, options) (see examples below) @@ -19,7 +12,6 @@ train (self, training_data,training_z) apply (self, data). 
- define valid_options class varible. - See Classifier Documentation below. """ @@ -38,7 +30,7 @@ class LSTM(Tomographer): valid_options = ['bins'] # this settings means arrays will be sent to train and apply instead # of dictionaries - wants_arrays = True + wants_arrays = False def __init__ (self, bands, options): """Constructor diff --git a/tomo_challenge/classifiers/TCN.py b/tomo_challenge/classifiers/TCN.py index 6bce4ac4..e2db3b7a 100644 --- a/tomo_challenge/classifiers/TCN.py +++ b/tomo_challenge/classifiers/TCN.py @@ -1,14 +1,10 @@ """ TCN classifier - This is an example tomographic bin generator using a temporal convolutional neural network (TCN). We also added a custom data loader we tested. This solution was developed by the Brazilian Center for Physics Research AI 4 Astrophysics team. - Authors: Clecio R. Bom, Bernardo M. Fraga, Gabriel Teixeira, Eduardo Cypriano and Elizabeth Gonzalez. contact: debom |at |cbpf| dot| br - - Every classifier module needs to: - have construction of the type __init__ (self, bands, options) (see examples below) @@ -16,7 +12,6 @@ train (self, training_data,training_z) apply (self, data). - define valid_options class varible. - See Classifier Documentation below. """ @@ -36,7 +31,7 @@ class TCN(Tomographer): valid_options = ['bins'] # this settings means arrays will be sent to train and apply instead # of dictionaries - wants_arrays = True + wants_arrays = False def __init__ (self, bands, options): """Constructor diff --git a/tomo_challenge/classifiers/autokeras_LSTM.py b/tomo_challenge/classifiers/autokeras_LSTM.py index 5e3d7981..a146bdd3 100644 --- a/tomo_challenge/classifiers/autokeras_LSTM.py +++ b/tomo_challenge/classifiers/autokeras_LSTM.py @@ -1,14 +1,10 @@ """ Bidirectional AutoML LSTM Classifier - This is an example tomographic bin generator using an Autokeras optimized convolutional LSTM neural network. We also added a custom data loader we tested. This solution was developed by the Brazilian Center for Physics Research AI 4 Astrophysics team. - Authors: Clecio R. Bom, Bernardo M. Fraga, Gabriel Teixeira, Eduardo Cypriano and Elizabeth Gonzalez. contact: debom |at |cbpf| dot| br - - Every classifier module needs to: - have construction of the type __init__ (self, bands, options) (see examples below) @@ -16,7 +12,6 @@ train (self, training_data,training_z) apply (self, data). - define valid_options class varible. - See Classifier Documentation below. """ @@ -34,7 +29,7 @@ class Autokeras_LSTM(Tomographer): valid_options = ['bins'] # this settings means arrays will be sent to train and apply instead # of dictionaries - wants_arrays = True + wants_arrays = False def __init__ (self, bands, options): """Constructor diff --git a/tomo_challenge/classifiers/conv_ak.py b/tomo_challenge/classifiers/conv_ak.py index 6f88faf4..a50280ec 100644 --- a/tomo_challenge/classifiers/conv_ak.py +++ b/tomo_challenge/classifiers/conv_ak.py @@ -1,15 +1,11 @@ """ Conv1D Classifier - This is an example tomographic bin generator using a convolutional 1d neural network. The initial model was optimized using AutoKeras. We also added a custom data loader we tested. This solution was developed by the Brazilian Center for Physics Research AI 4 Astrophysics team. - - Authors: Clecio R. Bom, Bernardo M. Fraga, Gabriel Teixeira and Elizabeth Gonzalez. 
contact: debom |at |cbpf| dot| br - Every classifier module needs to: - have construction of the type __init__ (self, bands, options) (see examples below) @@ -17,7 +13,6 @@ train (self, training_data,training_z) apply (self, data). - define valid_options class varible. - See Classifier Documentation below. """ @@ -35,7 +30,7 @@ class CNN(Tomographer): valid_options = ['bins'] # this settings means arrays will be sent to train and apply instead # of dictionaries - wants_arrays = True + wants_arrays = False def __init__ (self, bands, options): """Constructor From ae7817ca36381dea1821c21786124725a7bef0b5 Mon Sep 17 00:00:00 2001 From: Clecio De Bom <29743719+cdebom@users.noreply.github.com> Date: Mon, 14 Sep 2020 22:17:31 -0300 Subject: [PATCH 13/27] Add files via upload --- tomo_challenge/classifiers/lstm_flax.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/tomo_challenge/classifiers/lstm_flax.py b/tomo_challenge/classifiers/lstm_flax.py index 80245a9c..669e6142 100644 --- a/tomo_challenge/classifiers/lstm_flax.py +++ b/tomo_challenge/classifiers/lstm_flax.py @@ -1,13 +1,10 @@ """ Flax LSTM - This is an example tomographic bin generator using a convolutional LSTM neural network using JAX/FLAX We also added a custom data loader we tested. This solution was developed by the Brazilian Center for Physics Research AI 4 Astrophysics team. Authors: Clecio R. Bom, Bernardo M. Fraga, Gabriel Teixeira, Eduardo Cypriano and Elizabeth Gonzalez. contact: debom |at |cbpf| dot| br - - Every classifier module needs to: - have construction of the type __init__ (self, bands, options) (see examples below) @@ -15,7 +12,6 @@ train (self, training_data,training_z) apply (self, data). - define valid_options class varible. - See Classifier Documentation below. """ @@ -59,7 +55,7 @@ class Flax_LSTM(Tomographer): valid_options = ['bins', 'n_feats'] # this settings means arrays will be sent to train and apply instead # of dictionaries - wants_arrays = True + wants_arrays = False def __init__ (self, bands, options): """Constructor From a4b95123db8f4250ee01238a76ca8c6cfad25b9d Mon Sep 17 00:00:00 2001 From: Clecio De Bom <29743719+cdebom@users.noreply.github.com> Date: Mon, 14 Sep 2020 22:41:49 -0300 Subject: [PATCH 14/27] Add custom data loader. 
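
The custom loader reads the challenge HDF5 files directly: it stacks the
griz magnitudes, drops galaxies with undetected (infinite) magnitudes and
appends the g-r, r-i and i-z colors as extra features. A minimal sketch of
the feature construction (variable names are illustrative; the actual code
is in the file added below):

    import numpy as np
    mags = np.vstack([g_mag, r_mag, i_mag, z_mag]).T  # shape (N, 4)
    mags = mags[(mags != np.inf).all(axis=1)]         # keep detections only
    colors = mags[:, :-1] - mags[:, 1:]               # g-r, r-i, i-z
    features = np.hstack([mags, colors])              # shape (N, 7)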
---
 tomo_challenge/custom_loader.py | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)
 create mode 100644 tomo_challenge/custom_loader.py

diff --git a/tomo_challenge/custom_loader.py b/tomo_challenge/custom_loader.py
new file mode 100644
index 00000000..6cf29e17
--- /dev/null
+++ b/tomo_challenge/custom_loader.py
@@ -0,0 +1,29 @@
+import numpy as np
+import h5py
+
+def custom_loader(fname):
+    data = h5py.File(fname, 'r')
+    r_mag = data['r_mag']
+    g_mag = data['g_mag']
+    i_mag = data['i_mag']
+    z_mag = data['z_mag']
+    redshift = data['redshift_true']
+    all_mags = np.vstack([g_mag, r_mag, i_mag, z_mag])
+    all_mags = all_mags.T
+    mask = (all_mags != np.inf).all(axis=1)
+    all_mags = all_mags[mask,:]
+    redshift = redshift[mask]
+    gr_color = all_mags[:,0] - all_mags[:,1]
+    ri_color = all_mags[:,1] - all_mags[:,2]
+    iz_color = all_mags[:,2] - all_mags[:,3]
+    all_colors = np.vstack([gr_color, ri_color, iz_color])
+    all_colors = all_colors.T
+    return np.hstack([all_mags, all_colors]), redshift
+
+def custom_redshift_loader(fname, bands=None, errors=None, colors=None):
+    _, redshift = custom_loader(fname)
+    return redshift
+
+def custom_data_loader(fname, bands=None, errors=None, colors=None):
+    features, _ = custom_loader(fname)
+    return features

From 7f73431c93749fb03b26fd15d80fa08e7ea9befa Mon Sep 17 00:00:00 2001
From: Clecio De Bom <29743719+cdebom@users.noreply.github.com>
Date: Mon, 14 Sep 2020 22:42:49 -0300
Subject: [PATCH 15/27] Add a custom data loader option.

---
 bin/challenge.py | 39 +++++++++++++++++++++++++++++----------
 1 file changed, 29 insertions(+), 10 deletions(-)

diff --git a/bin/challenge.py b/bin/challenge.py
index 7cf11812..66aefcf3 100755
--- a/bin/challenge.py
+++ b/bin/challenge.py
@@ -37,36 +37,53 @@ def main(config_yaml):
 
     bands = config['bands']
 
-    training_data = tc.load_data(
+######## New loader ####################
+    if config['loader'] == 'custom':
+        data_loader = tc.custom_data_loader
+        z_loader = tc.custom_redshift_loader
+    else:
+        data_loader = tc.load_data
+        z_loader = tc.load_redshift
+########################################
+
+    training_data = data_loader(
         config['training_file'],
         bands,
         errors=anyone_wants_errors,
         colors=anyone_wants_colors
     )
 
-    validation_data = tc.load_data(
+    validation_data = data_loader(
         config['validation_file'],
         bands,
         errors=anyone_wants_errors,
         colors=anyone_wants_colors
    )
 
-    training_z = tc.load_redshift(config['training_file'])
-    validation_z = tc.load_redshift(config['validation_file'])
+    training_z = z_loader(config['training_file'])
+    validation_z = z_loader(config['validation_file'])
+
+    if (('metrics_impl' not in config) or
+        (config['metrics_impl'] == 'firecrown')):
+        metrics_fn = tc.compute_scores
+    elif config['metrics_impl'] == 'jax-cosmo':
+        metrics_fn = tc.jc_compute_scores
+    else:
+        raise ValueError('Unknown metrics_impl value')
 
     with open(config['output_file'],'w') as output_file:
         for classifier_name, runs in config['run'].items():
             for run, settings in runs.items():
                 scores = run_one(classifier_name, bands, settings,
                                  training_data, training_z, validation_data, validation_z,
-                                 config['metrics'])
+                                 config['metrics'], metrics_fn)
 
                 output_file.write (f"{classifier_name} {run} {settings} {scores} \n")
 
 
 
 def run_one(classifier_name, bands, settings, train_data, train_z, valid_data,
-            valid_z, metrics):
+            valid_z, metrics, metrics_fn):
     classifier = tc.Tomographer._find_subclass(classifier_name)
 
     if classifier.wants_arrays:
@@ -74,14 +91,16 @@ def run_one(classifier_name, bands, settings, train_data, train_z, valid_data,
         errors = settings.get('errors')
         colors = settings.get('colors')
train_data = tc.dict_to_array(train_data, bands, errors=errors, colors=colors) valid_data = tc.dict_to_array(valid_data, bands, errors=errors, colors=colors) - + #cut = int(0.5 * valid_data.size) + #valid_data = valid_data[:cut] + mask = (valid_data < 30).all(axis=1) + valid_z = valid_z[mask] print ("Executing: ", classifier_name, bands, settings) ## first check if options are valid - print (settings, classifier.valid_options) for key in settings.keys(): if key not in classifier.valid_options and key not in ['errors', 'colors']: - raise ValueError(f"Key {key} is not recognized by classifier {classifier_name}") + raise ValueError(f"Key {key} is not recognized by classifier {name}") print ("Initializing classifier...") C=classifier(bands, settings) @@ -93,7 +112,7 @@ def run_one(classifier_name, bands, settings, train_data, train_z, valid_data, results = C.apply(valid_data) print ("Getting metric...") - scores = tc.compute_scores(results, valid_z, metrics=metrics) + scores = metrics_fn(results, valid_z, metrics=metrics) return scores From 153648851fadb86f57579433e5c54ae2f0dcc7fe Mon Sep 17 00:00:00 2001 From: Clecio De Bom <29743719+cdebom@users.noreply.github.com> Date: Mon, 14 Sep 2020 23:16:49 -0300 Subject: [PATCH 16/27] Revert "Add a custom data loader option." This reverts commit 7f73431c93749fb03b26fd15d80fa08e7ea9befa. --- bin/challenge.py | 39 ++++++++++----------------------------- 1 file changed, 10 insertions(+), 29 deletions(-) diff --git a/bin/challenge.py b/bin/challenge.py index 66aefcf3..7cf11812 100755 --- a/bin/challenge.py +++ b/bin/challenge.py @@ -37,53 +37,36 @@ def main(config_yaml): bands = config['bands'] -######## New loader #################### - if config['loader'] == 'custom': - data_loader = tc.custom_data_loader - z_loader = tc.custom_redshift_loader - else: - data_loader = tc.load_data - z_loader = tc.load_redshift -######################################## - - training_data = data_loader( + training_data = tc.load_data( config['training_file'], bands, errors=anyone_wants_errors, colors=anyone_wants_colors ) - validation_data = data_loader( + validation_data = tc.load_data( config['validation_file'], bands, errors=anyone_wants_errors, colors=anyone_wants_colors ) - training_z = z_loader(config['training_file']) - validation_z = z_loader(config['validation_file']) - - if (('metrics_impl' not in config) or - (config['metrics_impl'] == 'firecrown')): - metrics_fn = tc.compute_scores - elif config['metrics_impl'] == 'jax-cosmo': - metrics_fn = tc.jc_compute_scores - else: - raise ValueError('Unknown metrics_impl value') + training_z = tc.load_redshift(config['training_file']) + validation_z = tc.load_redshift(config['validation_file']) with open(config['output_file'],'w') as output_file: for classifier_name, runs in config['run'].items(): for run, settings in runs.items(): scores = run_one(classifier_name, bands, settings, training_data, training_z, validation_data, validation_z, - config['metrics'], metrics_fn) + config['metrics']) output_file.write (f"{classifier_name} {run} {settings} {scores} \n") def run_one(classifier_name, bands, settings, train_data, train_z, valid_data, - valid_z, metrics, metrics_fn): + valid_z, metrics): classifier = tc.Tomographer._find_subclass(classifier_name) if classifier.wants_arrays: @@ -91,16 +74,14 @@ def run_one(classifier_name, bands, settings, train_data, train_z, valid_data, colors = settings.get('colors') train_data = tc.dict_to_array(train_data, bands, errors=errors, colors=colors) valid_data = 
tc.dict_to_array(valid_data, bands, errors=errors, colors=colors) - #cut = int(0.5 * valid_data.size) - #valid_data = valid_data[:cut] - mask = (valid_data < 30).all(axis=1) - valid_z = valid_z[mask] + print ("Executing: ", classifier_name, bands, settings) ## first check if options are valid + print (settings, classifier.valid_options) for key in settings.keys(): if key not in classifier.valid_options and key not in ['errors', 'colors']: - raise ValueError(f"Key {key} is not recognized by classifier {name}") + raise ValueError(f"Key {key} is not recognized by classifier {classifier_name}") print ("Initializing classifier...") C=classifier(bands, settings) @@ -112,7 +93,7 @@ def run_one(classifier_name, bands, settings, train_data, train_z, valid_data, results = C.apply(valid_data) print ("Getting metric...") - scores = metrics_fn(results, valid_z, metrics=metrics) + scores = tc.compute_scores(results, valid_z, metrics=metrics) return scores From 3aa3a473e45c041013ebad4cbc4144bcb35742e6 Mon Sep 17 00:00:00 2001 From: Clecio De Bom <29743719+cdebom@users.noreply.github.com> Date: Tue, 15 Sep 2020 01:35:39 -0300 Subject: [PATCH 17/27] version of challenge.py to include the custom loader We included a version of challenge.py file that is under bin folder to include the option to run our custom loader. --- tomo_challenge/challenge_custom_loader.py | 123 ++++++++++++++++++++++ 1 file changed, 123 insertions(+) create mode 100644 tomo_challenge/challenge_custom_loader.py diff --git a/tomo_challenge/challenge_custom_loader.py b/tomo_challenge/challenge_custom_loader.py new file mode 100644 index 00000000..66aefcf3 --- /dev/null +++ b/tomo_challenge/challenge_custom_loader.py @@ -0,0 +1,123 @@ +#!/usr/bin/env python +import sys +import os +import click +import yaml +import jinja2 + +## get root dir, one dir above this script +root_dir=os.path.join(os.path.split(sys.argv[0])[0],"..") +sys.path.append(root_dir) +import tomo_challenge as tc + +@click.command() +@click.argument('config_yaml', type=str) +def main(config_yaml): + with open(config_yaml, 'r') as fp: + config_str = jinja2.Template(fp.read()).render() + config = yaml.load(config_str, Loader=yaml.Loader) + + # Get the classes associated with each + for name in config['run']: + try: + config['cls'] = tc.Tomographer._find_subclass(name) + except KeyError: + raise ValueError(f"Tomographer {name} is not defined") + + # Decide if anyone needs the colors calculating and/or errors loading + anyone_wants_colors = False + anyone_wants_errors = False + for run in config['run'].values(): + for version in run.values(): + if version.get('errors'): + anyone_wants_errors = True + if version.get('colors'): + anyone_wants_colors = True + + + bands = config['bands'] + +######## New loader #################### + if config['loader'] == 'custom': + data_loader = tc.custom_data_loader + z_loader = tc.custom_redshift_loader + else: + data_loader = tc.load_data + z_loader = tc.load_redshift +######################################## + + training_data = data_loader( + config['training_file'], + bands, + errors=anyone_wants_errors, + colors=anyone_wants_colors + ) + + validation_data = data_loader( + config['validation_file'], + bands, + errors=anyone_wants_errors, + colors=anyone_wants_colors + ) + + training_z = z_loader(config['training_file']) + validation_z = z_loader(config['validation_file']) + + if (('metrics_impl' not in config) or + (config['metrics_impl'] == 'firecrown')): + metrics_fn = tc.compute_scores + elif config['metrics_impl'] == 'jax-cosmo': + 
metrics_fn = tc.jc_compute_scores
+    else:
+        raise ValueError('Unknown metrics_impl value')
+
+    with open(config['output_file'],'w') as output_file:
+        for classifier_name, runs in config['run'].items():
+            for run, settings in runs.items():
+                scores = run_one(classifier_name, bands, settings,
+                                 training_data, training_z, validation_data, validation_z,
+                                 config['metrics'], metrics_fn)
+
+                output_file.write (f"{classifier_name} {run} {settings} {scores} \n")
+
+
+
+def run_one(classifier_name, bands, settings, train_data, train_z, valid_data,
+            valid_z, metrics, metrics_fn):
+    classifier = tc.Tomographer._find_subclass(classifier_name)
+
+    if classifier.wants_arrays:
+        errors = settings.get('errors')
+        colors = settings.get('colors')
+        train_data = tc.dict_to_array(train_data, bands, errors=errors, colors=colors)
+        valid_data = tc.dict_to_array(valid_data, bands, errors=errors, colors=colors)
+    #cut = int(0.5 * valid_data.size)
+    #valid_data = valid_data[:cut]
+    mask = (valid_data < 30).all(axis=1)
+    valid_z = valid_z[mask]
+    print ("Executing: ", classifier_name, bands, settings)
+
+    ## first check if options are valid
+    for key in settings.keys():
+        if key not in classifier.valid_options and key not in ['errors', 'colors']:
+            raise ValueError(f"Key {key} is not recognized by classifier {classifier_name}")
+
+    print ("Initializing classifier...")
+    C=classifier(bands, settings)
+
+    print ("Training...")
+    C.train(train_data,train_z)
+
+    print ("Applying...")
+    results = C.apply(valid_data)
+
+    print ("Getting metric...")
+    scores = metrics_fn(results, valid_z, metrics=metrics)
+
+    return scores
+
+
+if __name__=="__main__":
+    main()
+
+

From 22f342e9a57100c2d8a025b410ff069b54c75faa Mon Sep 17 00:00:00 2001
From: Clecio De Bom <29743719+cdebom@users.noreply.github.com>
Date: Tue, 15 Sep 2020 02:20:12 -0300
Subject: [PATCH 18/27] Small Fix in JAX Deep Ensemble

---
 .../classifiers/Deep_Ensemble_jax.py       | 134 +++-------
 tomo_challenge/classifiers/autolstm_jax.py | 166 +++++++++---
 tomo_challenge/classifiers/nne_jax.py      |  55 ++--
 tomo_challenge/classifiers/resnet_jax.py   | 247 ++++++++++++------
 4 files changed, 368 insertions(+), 234 deletions(-)

diff --git a/tomo_challenge/classifiers/Deep_Ensemble_jax.py b/tomo_challenge/classifiers/Deep_Ensemble_jax.py
index 527cd30d..2624e0b6 100644
--- a/tomo_challenge/classifiers/Deep_Ensemble_jax.py
+++ b/tomo_challenge/classifiers/Deep_Ensemble_jax.py
@@ -13,15 +13,13 @@
     - define valid_options class varible.
 """
 
-from tomo_challenge.utils.utils import transform_labels
-from tomo_challenge.utils.utils import create_directory
-from sklearn.preprocessing import MinMaxScaler
-import sklearn
+
+#from tomo_challenge.utils.utils import create_directory
 import os
 from .base import Tomographer
 import numpy as np
-from flax import nn, optim, serialization, jax_utils
-import sys
+import subprocess
 
 
 
@@ -32,7 +30,7 @@ class ENSEMBLE(Tomographer):
     valid_options = ['bins']
     # this settings means arrays will be sent to train and apply instead
    # of dictionaries
-    wants_arrays = True
+    wants_arrays = False
 
     def __init__ (self, bands, options):
        """Constructor
@@ -54,59 +52,9 @@ class variable.
self.bands = bands self.opt = options - self.CLASSIFIERS = ['resnet_jax', 'autolstm_jax'] - - def propare_data(self, training_data, traning_bin): - - x_train = training_data - y_train = traning_bin - - nb_classes = len(np.unique(y_train)) - print(f'n_bins = {nb_classes}') - - - # make the min to zero of labels - y_train = transform_labels(y_train) - - # transform the labels from integers to one hot vectors - enc = sklearn.preprocessing.OneHotEncoder() - enc.fit(y_train.reshape(-1, 1)) - y_train = enc.transform(y_train.reshape(-1, 1)).toarray() - - scaler = MinMaxScaler() - x_train = scaler.fit_transform(x_train) - if len(x_train.shape) == 2: # if univariate - # add a dimension to make it multivariate with one dimension and normalizing - x_train = x_train.reshape((x_train.shape[0], x_train.shape[1], 1)) - - return x_train, y_train, nb_classes, scaler - - def fit_classifier(self, classifier_name, x_train, z_train, nb_classes, output_directory, load_weights=False): - - input_shape = x_train.shape[1:] + self.CLASSIFIERS = ['resnet', 'autolstm'] - if classifier_name == 'resnet_jax': - from tomo_challenge.classifiers import resnet_jax - # resnet = resnet_jax.Resnet(nb_classes) - resnet = resnet_jax.Resnet(n_bins=nb_classes) - _, model = resnet_jax.train(resnet,x_train.shape[1], nb_classes, x_train, z_train) - self.model_res = model - else: - from tomo_challenge.classifiers import autolstm_jax - _, model = autolstm_jax.train(x_train.shape[1], nb_classes, x_train, z_train) - self.model_auto = model - - def create_classifier(self, classifier_name, input_shape, nb_classes, output_directory, verbose=True, - build=True, load_weights=False): - pass - # if classifier_name == 'resnet_jax': - # from tomo_challenge.classifiers import resnet_jax - # return 0#resnet_jax.Resnet(nb_classes) - # if classifier_name == 'autolstm_jax': - # from tomo_challenge.classifiers import autolstm_jax - # return autolstm_jax.AutoLSTM(nb_classes) - - def train (self, training_data, training_z): + def train (self, training_data, validation_data): """Trains the classifier Parameters: @@ -125,32 +73,13 @@ def train (self, training_data, training_z): create_directory(dir_out) self.dir_out = dir_out - - + data_path = path_parent + '/data/' + training_path = data_path + 'training.hdf5' + validation_path = data_path + 'validation.hdf5' + classifier_path = os.path.dirname(os.path.abspath(__file__)) + #print('path = ', classifier_path) n_bin = self.opt['bins'] - print("Finding bins for training data") - # Now put the training data into redshift bins. 
- # Use zero so that the one object with minimum - # z in the whole survey will be in the lowest bin - training_bin = np.zeros(training_z.size) - - # Find the edges that split the redshifts into n_z bins of - # equal number counts in each - p = np.linspace(0, 100, n_bin + 1) - z_edges = np.percentile(training_z, p) - - # Now find all the objects in each of these bins - for i in range(n_bin): - z_low = z_edges[i] - z_high = z_edges[i + 1] - training_bin[(training_z > z_low) & (training_z < z_high)] = i - - # For speed, it's possible cut down a percentage of original size - #cut = np.random.uniform(0, 1, training_z.size) < 0.05 - #training_data = training_data[cut] - #training_bin = training_bin[cut] - - x_train, y_train, nb_classes, scaler = self.propare_data(training_data=training_data, traning_bin=training_bin) + print("Fitting classifier") for classifier_name in self.CLASSIFIERS: print('classifier_name', classifier_name) @@ -159,21 +88,20 @@ def train (self, training_data, training_z): create_directory(output_directory) print("Training") - - self.fit_classifier(classifier_name=classifier_name, x_train=x_train, z_train=training_z, - nb_classes=nb_classes, output_directory=output_directory) + print(os.getcwd()) + self.p = subprocess.Popen(['python3','-u', str(classifier_path) + '/' + classifier_name + '.py', + training_path, + validation_path, + output_directory, + str(n_bin)]) + subprocess.Popen.wait(self.p) create_directory(output_directory + '/DONE') print('\t\t\t\tDONE') # Can be replaced with any classifier - - self.z_edges = z_edges - self.scaler = scaler - - - def apply (self, data): + def apply (self,data): """Applies training to the data. Parameters: @@ -186,16 +114,24 @@ def apply (self, data): tomographic selection for galaxies return as bin number for each galaxy. 
""" - data = self.scaler.transform(data) - if len(data.shape) == 2: # if univariate - # add a dimension to make it multivariate with one dimension and normalizing - data = data.reshape((data.shape[0], data.shape[1], 1)) - from tomo_challenge.classifiers import nne + from tomo_challenge.classifiers import nne_jax as nne result_dir = self.dir_out+'/ensemble_result/' create_directory(self.dir_out+'/ensemble_result/') ensemble = nne.Classifier_NNE(result_dir) - preds = ensemble.predict(data, self.model_res, self.model_auto) + preds = ensemble.predict() tomo_bin = np.argmax(preds, axis=1) return tomo_bin + + +def create_directory(directory_path): + if os.path.exists(directory_path): + return None + else: + try: + os.makedirs(directory_path) + except: + + return None + return directory_path diff --git a/tomo_challenge/classifiers/autolstm_jax.py b/tomo_challenge/classifiers/autolstm_jax.py index 4a1ae5cd..e4e416bb 100644 --- a/tomo_challenge/classifiers/autolstm_jax.py +++ b/tomo_challenge/classifiers/autolstm_jax.py @@ -1,51 +1,62 @@ -import tomo_challenge.jax_metrics as j_metrics +import jax_metrics as j_metrics from flax import nn, optim, serialization, jax_utils import jax.random as rand import jax.numpy as jnp import jax +import os +#os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID' +#os.environ['CUDA_VISIBLE_DEVICES'] = '2' +from sklearn.preprocessing import MinMaxScaler +import numpy as np +import sys +import h5py +import tensorflow as tf -class AutoLSTM(nn.Module): - def __init__(self, n_bins): - self.bins = n_bins - def apply(self, x): - batch_size = x.shape[0] - net = nn.Conv(x, features=32, kernel_size=(5,), padding='SAME') - net = nn.leaky_relu(net) - net = nn.Conv(net, 512, kernel_size=(5,), padding='SAME') - net = nn.leaky_relu(net) - net = nn.Conv(net, 128, kernel_size=(5,), padding='SAME') - net = nn.leaky_relu(net) - - carry = nn.LSTMCell.initialize_carry(rand.PRNGKey(0), (batch_size,), 128) - new_carry, output_1 = jax_utils.scan_in_dim(nn.LSTMCell.partial(), carry, net, axis=1) - _, output_2 = jax_utils.scan_in_dim(nn.LSTMCell.partial(), new_carry, output_1, axis=1) - - net_dense = output_2.reshape((output_2.shape[0], -1)) - net_dense = nn.Dense(net_dense, 512) - net_dense = nn.BatchNorm(net_dense) - net_dense = nn.leaky_relu(net_dense) - - net_dense = nn.dropout(net_dense, 0.25) - - net_dense = nn.dropout(net_dense, 0.5) - return nn.softmax(nn.Dense(gap, self.bins)) +def get_classifier(n_features, n_bins): + class AutoLSTM(nn.Module): + def apply(self, x): + batch_size = x.shape[0] + net = nn.Conv(x, features=32, kernel_size=(5,), padding='SAME') + net = nn.leaky_relu(net) + net = nn.Conv(net, 512, kernel_size=(5,), padding='SAME') + net = nn.leaky_relu(net) + net = nn.Conv(net, 128, kernel_size=(5,), padding='SAME') + net = nn.leaky_relu(net) + carry = nn.LSTMCell.initialize_carry(rand.PRNGKey(0), (batch_size,), 128) + new_carry, output_1 = jax_utils.scan_in_dim(nn.LSTMCell.partial(), carry, net, axis=1) + _, output_2 = jax_utils.scan_in_dim(nn.LSTMCell.partial(), new_carry, output_1, axis=1) + net_dense = output_2.reshape((output_2.shape[0], -1)) + net_dense = nn.Dense(net_dense, 512) + net_dense = nn.BatchNorm(net_dense) + net_dense = nn.leaky_relu(net_dense) + + net_dense = nn.dropout(net_dense, 0.25) + + net_dense = nn.dropout(net_dense, 0.5) + return nn.softmax(nn.Dense(net_dense, n_bins)) -def get_classifier(n_features, n_bins): with nn.stochastic(rand.PRNGKey(0)): - _, initial_params = AutoLSTM(n_bins).init_by_shape( rand.PRNGKey(1), [((1, n_features,1), jnp.float32)]) - 
return nn.Model(AutoLSTM(n_bins), initial_params) + _, initial_params = AutoLSTM.init_by_shape(rand.PRNGKey(0), [((1, n_features, 1), jnp.float32)]) + return nn.Model(AutoLSTM, initial_params) + + +#def get_classifier(n_features, n_bins): +# _, initial_params = Resnet(n_bins).init_by_shape( rand.PRNGKey(0), [((1, n_features,1), jnp.float32)]) +# return nn.Model(Resnet(n_bins), initial_params) def get_batches(features, redshift, batch_size, train=True): if train: dataset = tf.data.Dataset.from_tensor_slices((features, redshift)) + dataset = dataset.shuffle(buffer_size=2048).batch(batch_size) else: - dataset = tf.data.Dataset.from_tensor_slices(features) - dataset = dataset.shuffle(buffer_size=2048).batch(batch_size) + dataset = tf.data.Dataset.from_tensor_slices(features).batch(batch_size) return dataset -def train(n_features, n_bins, training_data, training_z, batch_size=20, epochs=20, lr=0.001): + + +def train(n_features, n_bins, training_data, training_z, batch_size=512, epochs=20, lr=0.001): model = get_classifier(n_features, n_bins) optimizer = optim.Adam(learning_rate=lr).create(model) @@ -65,6 +76,7 @@ def loss_fn(model): losses = [] + with nn.stochastic(rand.PRNGKey(0)): for e in range(epochs): print('starting epoch {}'.format(e)) @@ -74,14 +86,98 @@ def loss_fn(model): losses.append(loss) print('Epoch {}\nLoss = {}'.format(e, loss)) model = optimizer.target + return losses, model -def predict(model, data, batch_size): - batches = get_batches(data, 0., train=False) +def predict(model, data, batch_size=512): + + batches = get_batches(data, 0., batch_size, train=False) preds = [] + n_batches = data.shape[0] // batch_size with nn.stochastic(rand.PRNGKey(0)): for i, x_test in enumerate(batches.as_numpy_iterator()): + print('batch {} of {}'.format(i, n_batches)) p = model(x_test) preds.append(p) - result = np.concatenate([p for p in preds], axis=0) + result = np.concatenate([p for p in preds], axis=0) return result + +def load_data(n_bins, fname, no_inf=True, take_colors=True, cutoff=0.0): + + #print(fname) + #sys.exit() + data = h5py.File(fname, 'r') + + r_mag = data['r_mag'] + g_mag = data['g_mag'] + i_mag = data['i_mag'] + z_mag = data['z_mag'] + redshift = data['redshift_true'] + all_mags = np.vstack([g_mag, r_mag, i_mag, z_mag]) + all_mags = all_mags.T + if no_inf: + mask = (all_mags != np.inf).all(axis=1) + all_mags = all_mags[mask,:] + redshift = redshift[mask] + else: + bad = ~np.isfinite(all_mags) + all_mags[bad] = 30 + gr_color = all_mags[:,0] - all_mags[:,1] + ri_color = all_mags[:,1] - all_mags[:,2] + iz_color = all_mags[:,2] - all_mags[:,3] + all_colors = np.vstack([gr_color, ri_color, iz_color]) + all_colors = all_colors.T + p = np.linspace(0, 100, n_bins+1) + z_edges = np.percentile(redshift, p) + train_bin = np.zeros(all_mags.shape[0]) + for i in range(n_bins): + z_low = z_edges[i] + z_high = z_edges[i+1] + train_bin[(redshift > z_low) & (redshift <= z_high)] = i + if cutoff != 0.0: + cut = np.random.uniform(0, 1, all_mags.shape[0]) < cutoff + train_bin = train_bin[cut].reshape(-1,1) + all_mags = all_mags[cut] + all_colors = all_colors[cut] + redshift = redshift[cut] + else: + train_bin = train_bin.reshape(-1,1) + if take_colors: + return np.hstack([all_mags, all_colors]), redshift, train_bin.astype(int), z_edges + else: + return mags, redshift, train_bin.astype(int), z_edges + +def prepare_data(data, scaler=None): + if scaler == None: + scaler = MinMaxScaler() + result = scaler.fit_transform(data) + result = np.expand_dims(result, axis=-1) + return result, scaler + else: 
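# [Annotation — not part of the patch] This branch of prepare_data() reuses a
# scaler that was already fitted on the training set, so validation data is
# only transformed, never refitted; that keeps both sets on the same scale
# without leaking validation statistics into the scaling. A self-contained
# sketch of the pattern, assuming scikit-learn and hypothetical arrays:
import numpy as np
from sklearn.preprocessing import MinMaxScaler

train = np.array([[0.0, 10.0], [5.0, 20.0]])
val = np.array([[2.5, 15.0]])
scaler = MinMaxScaler().fit(train)    # learn min/max from the training set only
x_val = scaler.transform(val)         # apply the same affine map to validation data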
+ result = scaler.transform(data) + result = np.expand_dims(result, axis=-1) + return result + + +if __name__ == '__main__': + n_bins = int(sys.argv[4]) + n_features = 7 + print('preparing data') + feats_train, z_train, bins_train, edges_train = load_data(n_bins, sys.argv[1], no_inf=False) + + x_train, scaler = prepare_data(feats_train) + + print('training') + losses, trained_model = train(n_features, n_bins, x_train, z_train, epochs=20) + print('training finished') + + val_feats, z_val, bins_val, edges_val = load_data(n_bins, sys.argv[2], no_inf=False) + x_val = prepare_data(val_feats, scaler=scaler) + print(x_val.shape) + print('predicting') + preds = predict(trained_model, x_val, 512) + + #classes = preds.argmax(axis=1) + print('saving') + output_dir = sys.argv[3] + np.save(output_dir+'y_pred.npy', preds) diff --git a/tomo_challenge/classifiers/nne_jax.py b/tomo_challenge/classifiers/nne_jax.py index 4440caba..aae54cde 100644 --- a/tomo_challenge/classifiers/nne_jax.py +++ b/tomo_challenge/classifiers/nne_jax.py @@ -1,8 +1,9 @@ -import tensorflow.keras as keras +#import tensorflow.keras as keras import numpy as np -from tomo_challenge.utils.utils import create_directory -from tomo_challenge.utils.utils import check_if_file_exits +#from tomo_challenge.utils.utils import create_directory +import os +#from tomo_challenge.utils.utils import check_if_file_exits import gc import time import sys @@ -11,28 +12,25 @@ class Classifier_NNE: def create_classifier(self, model_name, input_shape, nb_classes, output_directory, verbose=False, build=True, load_weights=False): - if model_name == 'fcn': - from tomo_challenge.classifiers import fcn - return fcn.Classifier_FCN(output_directory, input_shape, nb_classes, verbose, build=build) if model_name == 'autolstm': - from tomo_challenge.classifiers import autolstm + from tomo_challenge.classifiers import autolstm_jax as autolstm return autolstm.Classifier_LSTM(output_directory, input_shape, nb_classes, verbose, build=build) if model_name == 'resnet': - from tomo_challenge.classifiers import resnet + from tomo_challenge.classifiers import resnet_jax as resnet return resnet.Classifier_RESNET(output_directory, input_shape, nb_classes, verbose, build=build, load_weights=load_weights) def __init__(self, output_directory, verbose=False): - self.classifiers = ['fcn', 'autolstm', 'resnet'] + self.classifiers = ['autolstm', 'resnet'] self.output_directory = output_directory create_directory(self.output_directory) self.verbose = verbose - self.models_dir = output_directory.replace('ensemble_results','classifier') + self.models_dir = output_directory.replace('ensemble_result','classifier') self.iterations = 1 - def predict(self, x_test): + def predict(self): # no training since models are pre-trained start_time = time.time() @@ -43,22 +41,25 @@ def predict(self, x_test): # loop through different initialization of classifiers curr_dir = self.models_dir.replace('classifier',model_name) - - model = self.create_classifier(model_name, None, None, - curr_dir, build=False) - print(model_name) + print('curr_dir:' ,curr_dir) + #model = self.create_classifier(model_name, None, None, + # curr_dir, build=False) + #print(model_name) predictions_file_name = curr_dir+'y_pred.npy' print(f"geting predict for {model_name}") - curr_y_pred = model.predict(x_test=x_test) - keras.backend.clear_session() - - np.save(predictions_file_name,curr_y_pred) + curr_y_pred = np.load(predictions_file_name) if l == 0: y_pred = np.zeros(shape=curr_y_pred.shape) - + else: + max_val = 
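# [Annotation — not part of the patch] This hunk replaces live inference with
# loading each model's saved y_pred.npy; when the saved arrays differ in
# length, the running sum and the new predictions are both truncated to the
# shorter one before adding. A standalone sketch of that averaging logic,
# with hypothetical per-model softmax outputs:
import numpy as np

preds = [np.random.rand(10, 5), np.random.rand(8, 5)]
n = min(p.shape[0] for p in preds)                  # align lengths, like max_val above
ensemble = sum(p[:n] for p in preds) / len(preds)   # mean probability over models
tomo_bin = ensemble.argmax(axis=1)                  # final bin per galaxy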
min(y_pred.shape[0], curr_y_pred.shape[0]) + print(max_val) + y_pred = y_pred[:max_val] + print(y_pred.shape) + curr_y_pred = curr_y_pred[:max_val] + print(curr_y_pred.shape) y_pred = y_pred+curr_y_pred l+=1 @@ -75,3 +76,17 @@ def predict(self, x_test): create_directory(self.output_directory + '/DONE') return y_pred + +def create_directory(directory_path): + if os.path.exists(directory_path): + return None + else: + try: + os.makedirs(directory_path) + except: + + return None + return directory_path + +def check_if_file_exits(file_name): + return os.path.exists(file_name) diff --git a/tomo_challenge/classifiers/resnet_jax.py b/tomo_challenge/classifiers/resnet_jax.py index 5631cb09..353178d8 100644 --- a/tomo_challenge/classifiers/resnet_jax.py +++ b/tomo_challenge/classifiers/resnet_jax.py @@ -1,84 +1,90 @@ -import tomo_challenge.jax_metrics as j_metrics +import jax_metrics as j_metrics from flax import nn, optim, serialization, jax_utils import jax.random as rand import jax.numpy as jnp import jax - -class Resnet(nn.Module): - def __init__(self, n_bins): - self.bins = n_bins - def apply(self, x): +import os +os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID' +os.environ['CUDA_VISIBLE_DEVICES'] = '2' +from sklearn.preprocessing import MinMaxScaler +import numpy as np +import sys +import h5py +import tensorflow as tf + +def get_classifier(n_features, n_bins): + class Resnet(nn.Module): + def apply(self, x): #1st block - conv_x = nn.Conv(x, features=64, kernel_size=(8,), padding='SAME') - conv_x = nn.BatchNorm(conv_x) - conv_x = nn.leaky_relu(conv_x) - - conv_y = nn.Conv(conv_x, 64, kernel_size=(5,), padding='SAME') - conv_y = nn.BatchNorm(conv_y) - conv_y = nn.leaky_relu(conv_y) - - conv_z = nn.Conv(conv_y, 64, kernel_size=(3,), padding='SAME') - conv_z = nn.BatchNorm(conv_z) - - short_y = nn.Conv(x, 64, kernel_size=(1,), padding='SAME') - short_y = nn.BatchNorm(short_y) - - output_1 = nn.leaky_relu(short_y + conv_z) - - #2nd block - conv_x = nn.Conv(output_1, features=64*2, kernel_size=(8,), padding='SAME') - conv_x = nn.BatchNorm(conv_x) - conv_x = nn.leaky_relu(conv_x) - - conv_y = nn.Conv(conv_x, 64*2, kernel_size=(5,), padding='SAME') - conv_y = nn.BatchNorm(conv_y) - conv_y = nn.leaky_relu(conv_y) - - conv_z = nn.Conv(conv_y, 64*2, kernel_size=(3,), padding='SAME') - conv_z = nn.BatchNorm(conv_z) - - short_y = nn.Conv(output_1, 64*2, kernel_size=(1,), padding='SAME') - short_y = nn.BatchNorm(short_y) - - output_2 = nn.leaky_relu(short_y + conv_z) - - #3rd block - conv_x = nn.Conv(output_2, features=64*2, kernel_size=(8,), padding='SAME') - conv_x = nn.BatchNorm(conv_x) - conv_x = nn.leaky_relu(conv_x) - - conv_y = nn.Conv(conv_x, 64*2, kernel_size=(5,), padding='SAME') - conv_y = nn.BatchNorm(conv_y) - conv_y = nn.leaky_relu(conv_y) - - conv_z = nn.Conv(conv_y, 64*2, kernel_size=(3,), padding='SAME') - conv_z = nn.BatchNorm(conv_z) - - short_y = nn.BatchNorm(output_2) - - output_3 = nn.leaky_relu(short_y + conv_z) - - gap = jnp.mean(output_3, axis=1) - return nn.softmax(nn.Dense(gap, self.bins)) - - - - + conv_x = nn.Conv(x, features=64, kernel_size=(8,), padding='SAME') + conv_x = nn.BatchNorm(conv_x) + conv_x = nn.leaky_relu(conv_x) + + conv_y = nn.Conv(conv_x, 64, kernel_size=(5,), padding='SAME') + conv_y = nn.BatchNorm(conv_y) + conv_y = nn.leaky_relu(conv_y) + + conv_z = nn.Conv(conv_y, 64, kernel_size=(3,), padding='SAME') + conv_z = nn.BatchNorm(conv_z) + + short_y = nn.Conv(x, 64, kernel_size=(1,), padding='SAME') + short_y = nn.BatchNorm(short_y) + + output_1 = 
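# [Annotation — not part of the patch] Each block in this Resnet follows the
# same recipe: three convolutions with kernel sizes 8/5/3, batch norm after
# each, a 1x1-convolution shortcut where the channel counts need matching,
# and addition before the activation. An equivalent sketch in tf.keras
# (illustrative only — the patch itself uses the old flax.nn API):
from tensorflow import keras

def resnet_block(x, filters):
    y = keras.layers.Conv1D(filters, 8, padding='same')(x)
    y = keras.layers.BatchNormalization()(y)
    y = keras.layers.LeakyReLU()(y)
    y = keras.layers.Conv1D(filters, 5, padding='same')(y)
    y = keras.layers.BatchNormalization()(y)
    y = keras.layers.LeakyReLU()(y)
    y = keras.layers.Conv1D(filters, 3, padding='same')(y)
    y = keras.layers.BatchNormalization()(y)
    shortcut = keras.layers.Conv1D(filters, 1, padding='same')(x)  # match channels
    shortcut = keras.layers.BatchNormalization()(shortcut)
    return keras.layers.LeakyReLU()(keras.layers.Add()([shortcut, y]))

inp = keras.Input(shape=(7, 1))
out = resnet_block(inp, 64)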
nn.leaky_relu(short_y + conv_z) + + #2nd block + conv_x = nn.Conv(output_1, features=64*2, kernel_size=(8,), padding='SAME') + conv_x = nn.BatchNorm(conv_x) + conv_x = nn.leaky_relu(conv_x) + + conv_y = nn.Conv(conv_x, 64*2, kernel_size=(5,), padding='SAME') + conv_y = nn.BatchNorm(conv_y) + conv_y = nn.leaky_relu(conv_y) + + conv_z = nn.Conv(conv_y, 64*2, kernel_size=(3,), padding='SAME') + conv_z = nn.BatchNorm(conv_z) + + short_y = nn.Conv(output_1, 64*2, kernel_size=(1,), padding='SAME') + short_y = nn.BatchNorm(short_y) + + output_2 = nn.leaky_relu(short_y + conv_z) + + #3rd block + conv_x = nn.Conv(output_2, features=64*2, kernel_size=(8,), padding='SAME') + conv_x = nn.BatchNorm(conv_x) + conv_x = nn.leaky_relu(conv_x) + + conv_y = nn.Conv(conv_x, 64*2, kernel_size=(5,), padding='SAME') + conv_y = nn.BatchNorm(conv_y) + conv_y = nn.leaky_relu(conv_y) + + conv_z = nn.Conv(conv_y, 64*2, kernel_size=(3,), padding='SAME') + conv_z = nn.BatchNorm(conv_z) + + short_y = nn.BatchNorm(output_2) + + output_3 = nn.leaky_relu(short_y + conv_z) + + gap = jnp.mean(output_3, axis=1) + return nn.softmax(nn.Dense(gap, n_bins)) + _, initial_params = Resnet.init_by_shape(rand.PRNGKey(0), [((1, n_features, 1), jnp.float32)]) + return nn.Model(Resnet, initial_params) + + +#def get_classifier(n_features, n_bins): +# _, initial_params = Resnet(n_bins).init_by_shape( rand.PRNGKey(0), [((1, n_features,1), jnp.float32)]) +# return nn.Model(Resnet(n_bins), initial_params) def get_batches(features, redshift, batch_size, train=True): if train: dataset = tf.data.Dataset.from_tensor_slices((features, redshift)) + dataset = dataset.shuffle(buffer_size=2048).batch(batch_size) else: - dataset = tf.data.Dataset.from_tensor_slices(features) - dataset = dataset.shuffle(buffer_size=2048).batch(batch_size) + dataset = tf.data.Dataset.from_tensor_slices(features).batch(batch_size) return dataset -def train(resnet,n_features, n_bins, training_data, training_z, batch_size=20, epochs=20, lr=0.001): - def get_classifier(n_features, n_bins): - _, initial_params = resnet.init_by_shape(rand.PRNGKey(0), [((1, n_features, 1), jnp.float32)]) - return nn.Model(resnet, initial_params) - - model = get_classifier(n_features, n_bins) +def train(n_bins, training_data, training_z, batch_size=512, epochs=40, lr=0.001): + model = get_classifier(training_data.shape[1], n_bins) optimizer = optim.Adam(learning_rate=lr).create(model) @jax.jit @@ -94,25 +100,106 @@ def loss_fn(model): # Perform gradient descent optimizer = optimizer.apply_gradient(g) return optimizer, loss - + + #def get_batches(): + # train_dataset = tf.data.Dataset.from_tensor_slices((training_data, training_z)) + # train_dataset = train_dataset.shuffle(buffer_size=2048).batch(batch_size) + # return train_dataset losses = [] - - for e in range(epochs): - print('starting epoch {}'.format(e)) - batches = get_batches(training_data, training_z, batch_size) - for i, (x_train, labels) in enumerate(batches.as_numpy_iterator()): - optimizer, loss = train_step(optimizer, x_train, labels) - losses.append(loss) - print('Epoch {}\nLoss = {}'.format(e, loss)) - model = optimizer.target + + with nn.stochastic(rand.PRNGKey(0)): + for e in range(epochs): + print('starting epoch {}'.format(e)) + batches = get_batches(training_data, training_z, batch_size) + for i, (x_train, labels) in enumerate(batches.as_numpy_iterator()): + optimizer, loss = train_step(optimizer, x_train, labels) + losses.append(loss) + print('Epoch {}\nLoss = {}'.format(e, loss)) + model = optimizer.target return losses, 
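# [Annotation — not part of the patch] get_batches() above now shuffles only
# in training mode; inference batches must keep their order so predictions
# line up row-for-row with the input catalogue. A minimal tf.data sketch of
# the same idea, with hypothetical arrays:
import numpy as np
import tensorflow as tf

feats = np.random.rand(100, 7, 1).astype('float32')
z = np.random.rand(100).astype('float32')
train_ds = tf.data.Dataset.from_tensor_slices((feats, z)).shuffle(2048).batch(512)
test_ds = tf.data.Dataset.from_tensor_slices(feats).batch(512)   # no shuffle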
model def predict(model, data, batch_size): - batches = get_batches(data, 0., train=False) + batches = get_batches(data, 0., batch_size, train=False) preds = [] for i, x_test in enumerate(batches.as_numpy_iterator()): + #print(x_test.shape) p = model(x_test) preds.append(p) result = np.concatenate([p for p in preds], axis=0) return result + +def load_data(n_bins, fname, no_inf=True, take_colors=True, cutoff=0.0): + + data = h5py.File(fname, 'r') + r_mag = data['r_mag'] + g_mag = data['g_mag'] + i_mag = data['i_mag'] + z_mag = data['z_mag'] + redshift = data['redshift_true'] + all_mags = np.vstack([g_mag, r_mag, i_mag, z_mag]) + all_mags = all_mags.T + if no_inf: + mask = (all_mags != np.inf).all(axis=1) + all_mags = all_mags[mask,:] + redshift = redshift[mask] + else: + bad = ~np.isfinite(all_mags) + all_mags[bad] = 30.0 + gr_color = all_mags[:,0] - all_mags[:,1] + ri_color = all_mags[:,1] - all_mags[:,2] + iz_color = all_mags[:,2] - all_mags[:,3] + all_colors = np.vstack([gr_color, ri_color, iz_color]) + all_colors = all_colors.T + p = np.linspace(0, 100, n_bins+1) + z_edges = np.percentile(redshift, p) + train_bin = np.zeros(all_mags.shape[0]) + for i in range(n_bins): + z_low = z_edges[i] + z_high = z_edges[i+1] + train_bin[(redshift > z_low) & (redshift <= z_high)] = i + if cutoff != 0.0: + cut = np.random.uniform(0, 1, all_mags.shape[0]) < cutoff + train_bin = train_bin[cut].reshape(-1,1) + all_mags = all_mags[cut] + all_colors = all_colors[cut] + redshift = redshift[cut] + else: + train_bin = train_bin.reshape(-1,1) + if take_colors: + return np.hstack([all_mags, all_colors]), redshift, train_bin.astype(int), z_edges + else: + return mags, redshift, train_bin.astype(int), z_edges + +def prepare_data(data, scaler=None): + if scaler is None: + scaler = MinMaxScaler() + scaled = scaler.fit_transform(data) + return np.expand_dims(scaled, axis=-1), scaler + else: + scaled = scaler.transform(data) + return np.expand_dims(scaled, axis=-1) + + +if __name__ == '__main__': + n_bins = int(sys.argv[4]) + print('preparing data') + feats_train, z_train, bins_train, edges_train = load_data(n_bins, sys.argv[1], no_inf=False) + + x_train, scaler = prepare_data(feats_train) + + #print(x_train.shape) + #model = get_classifier(x_train.shape[1], n_bins) + print('training') + losses, trained_model = train(n_bins, x_train, z_train, epochs=20) + print('training finished') + val_feats, z_val, val_bins, edges_val = load_data(n_bins, sys.argv[2], no_inf=False) + x_val = prepare_data(val_feats, scaler=scaler) + #print(x_val.shape) + print('predicting') + preds = predict(trained_model, x_val, 512) + + #classes = preds.argmax(axis=1) + print('saving') + output_dir = sys.argv[3] + np.save(output_dir+'y_pred.npy', preds) From e177e8fa474d74504306dac404850946643af66d Mon Sep 17 00:00:00 2001 From: Clecio De Bom <29743719+cdebom@users.noreply.github.com> Date: Tue, 15 Sep 2020 02:25:17 -0300 Subject: [PATCH 19/27] import fix in TF resnet --- tomo_challenge/classifiers/de_resnet.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/tomo_challenge/classifiers/de_resnet.py b/tomo_challenge/classifiers/de_resnet.py index b26e19fe..0b34bb43 100644 --- a/tomo_challenge/classifiers/de_resnet.py +++ b/tomo_challenge/classifiers/de_resnet.py @@ -3,9 +3,9 @@ import numpy as np import time -from tomo_challenge.utils.utils import save_logs -from tomo_challenge.utils.utils import calculate_metrics -from tomo_challenge.utils.utils import get_available_gpus +#from tomo_challenge.utils.utils import 
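# [Annotation — not part of the patch] load_data() in the file above assigns
# each galaxy to one of n_bins tomographic bins of equal galaxy counts by
# taking percentile edges of the true redshift distribution. A compact sketch
# of that binning, with hypothetical redshifts:
import numpy as np

redshift = np.random.rand(1000)
n_bins = 5
z_edges = np.percentile(redshift, np.linspace(0, 100, n_bins + 1))
train_bin = np.zeros(redshift.size, dtype=int)
for i in range(n_bins):
    train_bin[(redshift > z_edges[i]) & (redshift <= z_edges[i + 1])] = i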
save_logs +#from tomo_challenge.utils.utils import calculate_metrics +#from tomo_challenge.utils.utils import get_available_gpus class Classifier_RESNET: @@ -139,4 +139,8 @@ def predict(self, x_test): model_path = model_path + 'best_model.hdf5' model = keras.models.load_model(model_path) y_pred = model.predict(x_test) - return y_pred \ No newline at end of file + +def get_available_gpus(): + local_device_protos = device_lib.list_local_devices() + return [x.name for x in local_device_protos if x.device_type == 'GPU'] + return y_pred From 39b407b93db657c0681a3e00b1c6a0a8d2f32763 Mon Sep 17 00:00:00 2001 From: Clecio De Bom <29743719+cdebom@users.noreply.github.com> Date: Tue, 15 Sep 2020 02:26:33 -0300 Subject: [PATCH 20/27] Update de_resnet.py --- tomo_challenge/classifiers/de_resnet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tomo_challenge/classifiers/de_resnet.py b/tomo_challenge/classifiers/de_resnet.py index 0b34bb43..672a1dcb 100644 --- a/tomo_challenge/classifiers/de_resnet.py +++ b/tomo_challenge/classifiers/de_resnet.py @@ -139,8 +139,8 @@ def predict(self, x_test): model_path = model_path + 'best_model.hdf5' model = keras.models.load_model(model_path) y_pred = model.predict(x_test) + return y_pred def get_available_gpus(): local_device_protos = device_lib.list_local_devices() return [x.name for x in local_device_protos if x.device_type == 'GPU'] - return y_pred From 0cc731a6a071cde2a3f8565be2af94470c34d525 Mon Sep 17 00:00:00 2001 From: Clecio De Bom <29743719+cdebom@users.noreply.github.com> Date: Tue, 15 Sep 2020 02:27:40 -0300 Subject: [PATCH 21/27] Update de_resnet.py --- tomo_challenge/classifiers/de_resnet.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tomo_challenge/classifiers/de_resnet.py b/tomo_challenge/classifiers/de_resnet.py index 672a1dcb..9ded1db4 100644 --- a/tomo_challenge/classifiers/de_resnet.py +++ b/tomo_challenge/classifiers/de_resnet.py @@ -142,5 +142,5 @@ def predict(self, x_test): return y_pred def get_available_gpus(): - local_device_protos = device_lib.list_local_devices() - return [x.name for x in local_device_protos if x.device_type == 'GPU'] + local_device_protos = device_lib.list_local_devices() + return [x.name for x in local_device_protos if x.device_type == 'GPU'] From c74df532fedfcdfc6fd4ee3aa034ff2f735b18c0 Mon Sep 17 00:00:00 2001 From: Clecio De Bom <29743719+cdebom@users.noreply.github.com> Date: Tue, 15 Sep 2020 02:29:46 -0300 Subject: [PATCH 22/27] import fix in Ensemble LSTM --- tomo_challenge/classifiers/de_autolstm.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tomo_challenge/classifiers/de_autolstm.py b/tomo_challenge/classifiers/de_autolstm.py index 8cd83fc3..ab7b8365 100644 --- a/tomo_challenge/classifiers/de_autolstm.py +++ b/tomo_challenge/classifiers/de_autolstm.py @@ -3,9 +3,9 @@ import numpy as np import time -from tomo_challenge.utils.utils import save_logs -from tomo_challenge.utils.utils import calculate_metrics -from tomo_challenge.utils.utils import get_available_gpus +#from tomo_challenge.utils.utils import save_logs +#from tomo_challenge.utils.utils import calculate_metrics +#from tomo_challenge.utils.utils import get_available_gpus class Classifier_LSTM: @@ -82,3 +82,6 @@ def predict(self, x_test): y_pred = model.predict(x_test) return y_pred +def get_available_gpus(): + local_device_protos = device_lib.list_local_devices() + return [x.name for x in local_device_protos if x.device_type == 'GPU'] From 18566b97584e573117b4c3d403e68582b8a7783b 
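[Annotation — not part of the patch] Patches 19 through 25 all make the same
repair: the classifiers previously imported helpers from
tomo_challenge.utils.utils, and those imports are commented out and replaced
by local copies. The recurring helper, shown here in isolation, enumerates
the GPUs TensorFlow can see:

from tensorflow.python.client import device_lib

def get_available_gpus():
    local_device_protos = device_lib.list_local_devices()   # all visible devices
    return [x.name for x in local_device_protos if x.device_type == 'GPU']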
Mon Sep 17 00:00:00 2001 From: Clecio De Bom <29743719+cdebom@users.noreply.github.com> Date: Tue, 15 Sep 2020 02:30:39 -0300 Subject: [PATCH 23/27] Update de_autolstm.py --- tomo_challenge/classifiers/de_autolstm.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tomo_challenge/classifiers/de_autolstm.py b/tomo_challenge/classifiers/de_autolstm.py index ab7b8365..c277dd21 100644 --- a/tomo_challenge/classifiers/de_autolstm.py +++ b/tomo_challenge/classifiers/de_autolstm.py @@ -6,6 +6,7 @@ #from tomo_challenge.utils.utils import save_logs #from tomo_challenge.utils.utils import calculate_metrics #from tomo_challenge.utils.utils import get_available_gpus +from tensorflow.python.client import device_lib class Classifier_LSTM: From 12ca8c7697521953115b54da4e6a5fcc391f83f3 Mon Sep 17 00:00:00 2001 From: Clecio De Bom <29743719+cdebom@users.noreply.github.com> Date: Tue, 15 Sep 2020 02:31:54 -0300 Subject: [PATCH 24/27] Update de_resnet.py --- tomo_challenge/classifiers/de_resnet.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tomo_challenge/classifiers/de_resnet.py b/tomo_challenge/classifiers/de_resnet.py index 9ded1db4..6201eff1 100644 --- a/tomo_challenge/classifiers/de_resnet.py +++ b/tomo_challenge/classifiers/de_resnet.py @@ -3,6 +3,7 @@ import numpy as np import time +from tensorflow.python.client import device_lib #from tomo_challenge.utils.utils import save_logs #from tomo_challenge.utils.utils import calculate_metrics #from tomo_challenge.utils.utils import get_available_gpus From 596eff78fda42432d7942321c054e565ff823add Mon Sep 17 00:00:00 2001 From: Clecio De Bom <29743719+cdebom@users.noreply.github.com> Date: Tue, 15 Sep 2020 02:35:19 -0300 Subject: [PATCH 25/27] Update de_fcn.py --- tomo_challenge/classifiers/de_fcn.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tomo_challenge/classifiers/de_fcn.py b/tomo_challenge/classifiers/de_fcn.py index 6d1a66ba..6ec12558 100644 --- a/tomo_challenge/classifiers/de_fcn.py +++ b/tomo_challenge/classifiers/de_fcn.py @@ -3,7 +3,8 @@ import numpy as np import time import sys -from tomo_challenge.utils.utils import get_available_gpus +#from tomo_challenge.utils.utils import get_available_gpus +from tensorflow.python.client import device_lib class Classifier_FCN: @@ -79,3 +80,6 @@ def predict(self, x_test): model = keras.models.load_model(model_path) y_pred = model.predict(x_test) return y_pred +def get_available_gpus(): + local_device_protos = device_lib.list_local_devices() + return [x.name for x in local_device_protos if x.device_type == 'GPU'] From 39dad6562fb1a240764a733ef2fbf7ba688c64bc Mon Sep 17 00:00:00 2001 From: Clecio De Bom <29743719+cdebom@users.noreply.github.com> Date: Tue, 15 Sep 2020 02:42:19 -0300 Subject: [PATCH 26/27] import fix --- tomo_challenge/classifiers/de_nne.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/tomo_challenge/classifiers/de_nne.py b/tomo_challenge/classifiers/de_nne.py index d1734f4b..acabce7b 100644 --- a/tomo_challenge/classifiers/de_nne.py +++ b/tomo_challenge/classifiers/de_nne.py @@ -2,8 +2,9 @@ import tensorflow.keras as keras import numpy as np -from tomo_challenge.utils.utils import create_directory -from tomo_challenge.utils.utils import check_if_file_exits +#from tomo_challenge.utils.utils import create_directory +#from tomo_challenge.utils.utils import check_if_file_exits +import os import gc import time import sys @@ -76,3 +77,16 @@ def predict(self, x_test): create_directory(self.output_directory + '/DONE') return y_pred 
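# [Annotation — not part of the patch] The create_directory() added below
# deliberately returns None both when the directory already exists and when
# creation fails, so callers never crash on a pre-existing results directory.
# On Python 3 the same idempotent behaviour is available more directly — a
# suggested simplification, not part of the patch:
import os

def create_directory(directory_path):
    os.makedirs(directory_path, exist_ok=True)   # no error if it already exists
    return directory_path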
+ +def check_if_file_exits(file_name): + return os.path.exists(file_name) + +def create_directory(directory_path): + if os.path.exists(directory_path): + return None + else: + try: + os.makedirs(directory_path) + except: + return None + return directory_path From c9a5cd1c4dbf844bffcb7fad7107820fb7c043ac Mon Sep 17 00:00:00 2001 From: Clecio De Bom <29743719+cdebom@users.noreply.github.com> Date: Tue, 15 Sep 2020 02:48:09 -0300 Subject: [PATCH 27/27] import fix --- .../classifiers/DEEP_ENSEMBLE_main.py | 49 ++++++++++++++++++- 1 file changed, 47 insertions(+), 2 deletions(-) diff --git a/tomo_challenge/classifiers/DEEP_ENSEMBLE_main.py b/tomo_challenge/classifiers/DEEP_ENSEMBLE_main.py index e1f1c93b..57c5e0a7 100644 --- a/tomo_challenge/classifiers/DEEP_ENSEMBLE_main.py +++ b/tomo_challenge/classifiers/DEEP_ENSEMBLE_main.py @@ -14,8 +14,8 @@ - define valid_options class varible. """ -from tomo_challenge.utils.utils import transform_labels -from tomo_challenge.utils.utils import create_directory +#from tomo_challenge.utils.utils import transform_labels +#from tomo_challenge.utils.utils import create_directory from sklearn.preprocessing import MinMaxScaler import sklearn import os @@ -198,3 +198,48 @@ def apply (self, data): preds = ensemble.predict(data) tomo_bin = np.argmax(preds, axis=1) return tomo_bin +def create_directory(directory_path): + if os.path.exists(directory_path): + return None + else: + try: + os.makedirs(directory_path) + except: + return None + return directory_path +def transform_labels(y_train,y_test=np.array([]),y_val=None): + """ + Transform label to min equal zero and continuous + For example if we have [1,3,4] ---> [0,1,2] + """ + if not y_val is None : + # index for when resplitting the concatenation + idx_y_val = len(y_train) + idx_y_test = idx_y_val + len(y_val) + # init the encoder + encoder = LabelEncoder() + # concat train and test to fit + y_train_val_test = np.concatenate((y_train,y_val,y_test),axis =0) + # fit the encoder + encoder.fit(y_train_val_test) + # transform to min zero and continuous labels + new_y_train_val_test = encoder.transform(y_train_val_test) + # resplit the train and test + new_y_train = new_y_train_val_test[0:idx_y_val] + new_y_val = new_y_train_val_test[idx_y_val:idx_y_test] + new_y_test = new_y_train_val_test[idx_y_test:] + return new_y_train, new_y_val,new_y_test + else: + # no validation split + # init the encoder + encoder = LabelEncoder() + # concat train and test to fit + y_train_test = np.concatenate((y_train,y_test),axis =0) + # fit the encoder + encoder.fit(y_train_test) + # transform to min zero and continuous labels + new_y_train_test = encoder.transform(y_train_test) + # resplit the train and test + new_y_train = new_y_train_test[0:len(y_train)] + new_y_test = new_y_train_test[len(y_train):] + return new_y_train
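[Annotation — not part of the patch] The inlined transform_labels() calls
LabelEncoder, so DEEP_ENSEMBLE_main.py also needs
"from sklearn.preprocessing import LabelEncoder" unless that import already
exists outside the hunks shown. A self-contained demonstration of the
remapping described in its docstring ([1, 3, 4] -> [0, 1, 2]):

import numpy as np
from sklearn.preprocessing import LabelEncoder

y_train, y_test = np.array([1, 3, 4, 3]), np.array([4, 1])
encoder = LabelEncoder().fit(np.concatenate([y_train, y_test]))
new_y_train = encoder.transform(y_train)   # -> [0, 1, 2, 1]
new_y_test = encoder.transform(y_test)     # -> [2, 0]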