diff --git a/example/example_autolstm.yaml b/example/example_autolstm.yaml
new file mode 100644
index 00000000..0d8b2e8d
--- /dev/null
+++ b/example/example_autolstm.yaml
@@ -0,0 +1,24 @@
+metrics: [SNR_3x2, FOM_3x2]
+bands: griz
+training_file: data/training.hdf5
+validation_file: data/validation.hdf5
+output_file: example/autolstm_10bins.txt
+metrics_impl: jax-cosmo
+loader: custom
+
+run:
+  # This is a class name which will be looked up
+  #RandomForest:
+  Autokeras_LSTM:
+    run_3:
+      # This setting is sent to the classifier
+      bins: 10
+      # These special settings decide whether the
+      # color and error columns are passed to the classifier
+      # as well as the magnitudes
+      colors: True
+      errors: False
+
+  # IBandOnly:
+  #   run_3:
+  #     bins: 3
diff --git a/example/example_cnn.yaml b/example/example_cnn.yaml
new file mode 100644
index 00000000..38d2bce6
--- /dev/null
+++ b/example/example_cnn.yaml
@@ -0,0 +1,24 @@
+metrics: [SNR_3x2, FOM_3x2]
+bands: griz
+training_file: data/training.hdf5
+validation_file: data/validation.hdf5
+output_file: example/cnn_10bins.txt
+metrics_impl: jax-cosmo
+loader: custom
+
+run:
+  # This is a class name which will be looked up
+  #RandomForest:
+  CNN:
+    run_3:
+      # This setting is sent to the classifier
+      bins: 10
+      # These special settings decide whether the
+      # color and error columns are passed to the classifier
+      # as well as the magnitudes
+      colors: True
+      errors: False
+
+  # IBandOnly:
+  #   run_3:
+  #     bins: 3
diff --git a/example/example_lstm.yaml b/example/example_lstm.yaml
new file mode 100644
index 00000000..18477deb
--- /dev/null
+++ b/example/example_lstm.yaml
@@ -0,0 +1,24 @@
+metrics: [SNR_3x2, FOM_3x2]
+bands: griz
+training_file: data/training.hdf5
+validation_file: data/validation.hdf5
+output_file: example/bidirectional_lstm_10bins.txt
+metrics_impl: jax-cosmo
+loader: custom
+
+run:
+  # This is a class name which will be looked up
+  #RandomForest:
+  LSTM:
+    run_3:
+      # This setting is sent to the classifier
+      bins: 10
+      # These special settings decide whether the
+      # color and error columns are passed to the classifier
+      # as well as the magnitudes
+      colors: True
+      errors: False
+
+  # IBandOnly:
+  #   run_3:
+  #     bins: 3
diff --git a/example/example_lstm_flax.yaml b/example/example_lstm_flax.yaml
new file mode 100644
index 00000000..6a62768c
--- /dev/null
+++ b/example/example_lstm_flax.yaml
@@ -0,0 +1,24 @@
+metrics: [SNR_3x2, FOM_3x2]
+bands: griz
+training_file: data/training.hdf5
+validation_file: data/validation.hdf5
+output_file: example/flax_lstm_10bins.txt
+metrics_impl: jax-cosmo
+loader: custom
+
+run:
+  # This is a class name which will be looked up
+  #RandomForest:
+  Flax_LSTM:
+    run_3:
+      # This setting is sent to the classifier
+      bins: 10
+      # These special settings decide whether the
+      # color and error columns are passed to the classifier
+      # as well as the magnitudes
+      colors: True
+      errors: False
+
+  # IBandOnly:
+  #   run_3:
+  #     bins: 3
diff --git a/example/example_tcn.yaml b/example/example_tcn.yaml
new file mode 100644
index 00000000..44a5c9ee
--- /dev/null
+++ b/example/example_tcn.yaml
@@ -0,0 +1,24 @@
+metrics: [SNR_3x2, FOM_3x2]
+bands: griz
+training_file: data/training.hdf5
+validation_file: data/validation.hdf5
+output_file: example/tcn_10bins.txt
+metrics_impl: jax-cosmo
+loader: custom
+
+run:
+  # This is a class name which will be looked up
+  #RandomForest:
+  TCN:
+    run_3:
+      # This setting is sent to the classifier
+      bins: 10
+      # These special settings decide whether the
+      # color and error columns are passed to the classifier
+      #
as well as the magnitudes + colors: True + errors: False + + # IBandOnly: + # run_3: + # bins: 3 diff --git a/tomo_challenge/challenge_custom_loader.py b/tomo_challenge/challenge_custom_loader.py new file mode 100644 index 00000000..66aefcf3 --- /dev/null +++ b/tomo_challenge/challenge_custom_loader.py @@ -0,0 +1,123 @@ +#!/usr/bin/env python +import sys +import os +import click +import yaml +import jinja2 + +## get root dir, one dir above this script +root_dir=os.path.join(os.path.split(sys.argv[0])[0],"..") +sys.path.append(root_dir) +import tomo_challenge as tc + +@click.command() +@click.argument('config_yaml', type=str) +def main(config_yaml): + with open(config_yaml, 'r') as fp: + config_str = jinja2.Template(fp.read()).render() + config = yaml.load(config_str, Loader=yaml.Loader) + + # Get the classes associated with each + for name in config['run']: + try: + config['cls'] = tc.Tomographer._find_subclass(name) + except KeyError: + raise ValueError(f"Tomographer {name} is not defined") + + # Decide if anyone needs the colors calculating and/or errors loading + anyone_wants_colors = False + anyone_wants_errors = False + for run in config['run'].values(): + for version in run.values(): + if version.get('errors'): + anyone_wants_errors = True + if version.get('colors'): + anyone_wants_colors = True + + + bands = config['bands'] + +######## New loader #################### + if config['loader'] == 'custom': + data_loader = tc.custom_data_loader + z_loader = tc.custom_redshift_loader + else: + data_loader = tc.load_data + z_loader = tc.load_redshift +######################################## + + training_data = data_loader( + config['training_file'], + bands, + errors=anyone_wants_errors, + colors=anyone_wants_colors + ) + + validation_data = data_loader( + config['validation_file'], + bands, + errors=anyone_wants_errors, + colors=anyone_wants_colors + ) + + training_z = z_loader(config['training_file']) + validation_z = z_loader(config['validation_file']) + + if (('metrics_impl' not in config) or + (config['metrics_impl'] == 'firecrown')): + metrics_fn = tc.compute_scores + elif config['metrics_impl'] == 'jax-cosmo': + metrics_fn = tc.jc_compute_scores + else: + raise ValueError('Unknown metrics_impl value') + + with open(config['output_file'],'w') as output_file: + for classifier_name, runs in config['run'].items(): + for run, settings in runs.items(): + scores = run_one(classifier_name, bands, settings, + training_data, training_z, validation_data, validation_z, + config['metrics'], metrics_fn) + + output_file.write (f"{classifier_name} {run} {settings} {scores} \n") + + + +def run_one(classifier_name, bands, settings, train_data, train_z, valid_data, + valid_z, metrics, metrics_fn): + classifier = tc.Tomographer._find_subclass(classifier_name) + + if classifier.wants_arrays: + errors = settings.get('errors') + colors = settings.get('colors') + train_data = tc.dict_to_array(train_data, bands, errors=errors, colors=colors) + valid_data = tc.dict_to_array(valid_data, bands, errors=errors, colors=colors) + #cut = int(0.5 * valid_data.size) + #valid_data = valid_data[:cut] + mask = (valid_data < 30).all(axis=1) + valid_z = valid_z[mask] + print ("Executing: ", classifier_name, bands, settings) + + ## first check if options are valid + for key in settings.keys(): + if key not in classifier.valid_options and key not in ['errors', 'colors']: + raise ValueError(f"Key {key} is not recognized by classifier {name}") + + print ("Initializing classifier...") + C=classifier(bands, settings) + + print 
("Training...") + C.train(train_data,train_z) + + print ("Applying...") + results = C.apply(valid_data) + + print ("Getting metric...") + scores = metrics_fn(results, valid_z, metrics=metrics) + + return scores + + +if __name__=="__main__": + main() + + diff --git a/tomo_challenge/classifiers/DEEP_ENSEMBLE_main.py b/tomo_challenge/classifiers/DEEP_ENSEMBLE_main.py new file mode 100644 index 00000000..57c5e0a7 --- /dev/null +++ b/tomo_challenge/classifiers/DEEP_ENSEMBLE_main.py @@ -0,0 +1,245 @@ +""" +Deep Ensemble Classifier +This is an example tomographic bin generator using a Deep Ensemble Classifier. +This solution was developed by the Brazilian Center for Physics Research AI 4 Astrophysics team. +Authors: Clecio R. Bom, Gabriel Teixeira, Bernardo M. Fraga, Eduardo Cypriano and Elizabeth Gonzalez. +contact: debom |at |cbpf| dot| br +In our preliminary tests we found a SNR 3X2 of ~1930 for n=10 bins. +Every classifier module needs to: + - have construction of the type + __init__ (self, bands, options) (see examples below) + - implement two functions: + train (self, training_data,training_z) + apply (self, data). + - define valid_options class varible. +""" + +#from tomo_challenge.utils.utils import transform_labels +#from tomo_challenge.utils.utils import create_directory +from sklearn.preprocessing import MinMaxScaler +import sklearn +import os +from .base import Tomographer +import numpy as np +import sys + + + +class ENSEMBLE(Tomographer): + """ ENSEMBLE Classifier """ + + # valid parameter -- see below + valid_options = ['bins'] + # this settings means arrays will be sent to train and apply instead + # of dictionaries + wants_arrays = True + + def __init__ (self, bands, options): + """Constructor + + Parameters: + ----------- + bands: str + string containg valid bands, like 'riz' or 'griz' + options: dict + options come through here. Valid keys are listed as valid_options + class variable. 
+ dir_out: str + directory for the outputs of training + Note: + ----- + Valid options are: + 'bins' - number of tomographic bins + """ + + self.bands = bands + self.opt = options + self.CLASSIFIERS = ['fcn', 'autolstm', 'resnet'] + + def propare_data(self, training_data, traning_bin): + + x_train = training_data + y_train = traning_bin + + nb_classes = len(np.unique(y_train)) + print(f'n_bins = {nb_classes}') + + + # make the min to zero of labels + y_train = transform_labels(y_train) + + # transform the labels from integers to one hot vectors + enc = sklearn.preprocessing.OneHotEncoder() + enc.fit(y_train.reshape(-1, 1)) + y_train = enc.transform(y_train.reshape(-1, 1)).toarray() + + scaler = MinMaxScaler() + x_train = scaler.fit_transform(x_train) + if len(x_train.shape) == 2: # if univariate + # add a dimension to make it multivariate with one dimension and normalizing + x_train = x_train.reshape((x_train.shape[0], x_train.shape[1], 1)) + + return x_train, y_train, nb_classes, scaler + + def fit_classifier(self, classifier_name, x_train, y_train, nb_classes, output_directory, load_weights=False): + + input_shape = x_train.shape[1:] + + classifier = self.create_classifier(classifier_name, input_shape, nb_classes, output_directory, + load_weights=load_weights) + + classifier.fit(x_train, y_train) + + def create_classifier(self, classifier_name, input_shape, nb_classes, output_directory, verbose=True, + build=True, load_weights=False): + if classifier_name == 'fcn': + from tomo_challenge.classifiers import de_fcn as fcn + return fcn.Classifier_FCN(output_directory, input_shape, nb_classes, verbose, build=build) + + if classifier_name == 'autolstm': + from tomo_challenge.classifiers import de_autolstm as autolstm + return autolstm.Classifier_LSTM(output_directory, input_shape, nb_classes, verbose, build=build) + + if classifier_name == 'resnet': + from tomo_challenge.classifiers import de_resnet as resnet + return resnet.Classifier_RESNET(output_directory, input_shape, nb_classes, verbose, + build=build, load_weights=load_weights) + + def train (self, training_data, training_z): + """Trains the classifier + + Parameters: + ----------- + training_data: numpy array, size Ngalaxes x Nbands + training data, each row is a galaxy, each column is a band as per + band defined above + training_z: numpy array, size Ngalaxies + true redshift for the training sample + """ + + #Create a directory for the results and the models + path_parent = os.getcwd() + dir_out = path_parent+'/ENSEMBLE' + if not os.path.exists(dir_out): + create_directory(dir_out) + self.dir_out = dir_out + + + + n_bin = self.opt['bins'] + print("Finding bins for training data") + # Now put the training data into redshift bins. 
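+        # (Illustrative numbers only: with bins = 4 and redshift quartiles at
+        #  0.0, 0.4, 0.7, 1.1 and 3.0, np.percentile(training_z, [0, 25, 50, 75, 100])
+        #  returns exactly those edges, so each bin then holds ~25% of the galaxies.)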
+ # Use zero so that the one object with minimum + # z in the whole survey will be in the lowest bin + training_bin = np.zeros(training_z.size) + + # Find the edges that split the redshifts into n_z bins of + # equal number counts in each + p = np.linspace(0, 100, n_bin + 1) + z_edges = np.percentile(training_z, p) + + # Now find all the objects in each of these bins + for i in range(n_bin): + z_low = z_edges[i] + z_high = z_edges[i + 1] + training_bin[(training_z > z_low) & (training_z < z_high)] = i + + # For speed, it's possible cut down a percentage of original size + #cut = np.random.uniform(0, 1, training_z.size) < 0.05 + #training_data = training_data[cut] + #training_bin = training_bin[cut] + + x_train, y_train, nb_classes, scaler = self.propare_data(training_data=training_data, traning_bin=training_bin) + print("Fitting classifier") + for classifier_name in self.CLASSIFIERS: + print('classifier_name', classifier_name) + + output_directory = self.dir_out +'/' + classifier_name + '/' + create_directory(output_directory) + + print("Training") + + self.fit_classifier(classifier_name=classifier_name, x_train=x_train, y_train=y_train, + nb_classes=nb_classes, output_directory=output_directory) + + create_directory(output_directory + '/DONE') + print('\t\t\t\tDONE') + # Can be replaced with any classifier + + + + self.z_edges = z_edges + self.scaler = scaler + + + def apply (self, data): + """Applies training to the data. + + Parameters: + ----------- + Data: numpy array, size Ngalaxes x Nbands + testing data, each row is a galaxy, each column is a band as per + band defined above + Returns: + tomographic_selections: numpy array, int, size Ngalaxies + tomographic selection for galaxies return as bin number for + each galaxy. + """ + data = self.scaler.transform(data) + if len(data.shape) == 2: # if univariate + # add a dimension to make it multivariate with one dimension and normalizing + data = data.reshape((data.shape[0], data.shape[1], 1)) + + from tomo_challenge.classifiers import de_nne as nne + result_dir = self.dir_out+'/ensemble_result/' + create_directory(self.dir_out+'/ensemble_result/') + ensemble = nne.Classifier_NNE(result_dir) + + preds = ensemble.predict(data) + tomo_bin = np.argmax(preds, axis=1) + return tomo_bin +def create_directory(directory_path): + if os.path.exists(directory_path): + return None + else: + try: + os.makedirs(directory_path) + except: + return None + return directory_path +def transform_labels(y_train,y_test=np.array([]),y_val=None): + """ + Transform label to min equal zero and continuous + For example if we have [1,3,4] ---> [0,1,2] + """ + if not y_val is None : + # index for when resplitting the concatenation + idx_y_val = len(y_train) + idx_y_test = idx_y_val + len(y_val) + # init the encoder + encoder = LabelEncoder() + # concat train and test to fit + y_train_val_test = np.concatenate((y_train,y_val,y_test),axis =0) + # fit the encoder + encoder.fit(y_train_val_test) + # transform to min zero and continuous labels + new_y_train_val_test = encoder.transform(y_train_val_test) + # resplit the train and test + new_y_train = new_y_train_val_test[0:idx_y_val] + new_y_val = new_y_train_val_test[idx_y_val:idx_y_test] + new_y_test = new_y_train_val_test[idx_y_test:] + return new_y_train, new_y_val,new_y_test + else: + # no validation split + # init the encoder + encoder = LabelEncoder() + # concat train and test to fit + y_train_test = np.concatenate((y_train,y_test),axis =0) + # fit the encoder + encoder.fit(y_train_test) + # transform to min zero and 
continuous labels + new_y_train_test = encoder.transform(y_train_test) + # resplit the train and test + new_y_train = new_y_train_test[0:len(y_train)] + new_y_test = new_y_train_test[len(y_train):] + return new_y_train diff --git a/tomo_challenge/classifiers/Deep_Ensemble_jax.py b/tomo_challenge/classifiers/Deep_Ensemble_jax.py new file mode 100644 index 00000000..2624e0b6 --- /dev/null +++ b/tomo_challenge/classifiers/Deep_Ensemble_jax.py @@ -0,0 +1,137 @@ +""" +Deep Ensemble Classifier +This is an example tomographic bin generator using a Deep Ensemble Classifier with JAX/FLAX capable of optimizing to multiple losses including SNR/FOM +This solution was developed by the Brazilian Center for Physics Research AI 4 Astrophysics team. +Authors: Clecio R. Bom, Bernardo M. Fraga, Gabriel Teixeira, Eduardo Cypriano and Elizabeth Gonzalez. +contact: debom |at |cbpf| dot| br +Every classifier module needs to: + - have construction of the type + __init__ (self, bands, options) (see examples below) + - implement two functions: + train (self, training_data,training_z) + apply (self, data). + - define valid_options class varible. +""" + + +#from tomo_challenge.utils.utils import create_directory +import os +from .base import Tomographer +import numpy as np +import subprocess + + + + +class ENSEMBLE(Tomographer): + """ ENSEMBLE Classifier """ + + # valid parameter -- see below + valid_options = ['bins'] + # this settings means arrays will be sent to train and apply instead + # of dictionaries + wants_arrays = False + + def __init__ (self, bands, options): + """Constructor + + Parameters: + ----------- + bands: str + string containg valid bands, like 'riz' or 'griz' + options: dict + options come through here. Valid keys are listed as valid_options + class variable. + dir_out: str + directory for the outputs of training + Note: + ----- + Valid options are: + 'bins' - number of tomographic bins + """ + + self.bands = bands + self.opt = options + self.CLASSIFIERS = ['resnet', 'autolstm'] + + def train (self, training_data, validation_data): + """Trains the classifier + + Parameters: + ----------- + training_data: numpy array, size Ngalaxes x Nbands + training data, each row is a galaxy, each column is a band as per + band defined above + training_z: numpy array, size Ngalaxies + true redshift for the training sample + """ + + #Create a directory for the results and the models + path_parent = os.getcwd() + dir_out = path_parent+'/ENSEMBLE' + if not os.path.exists(dir_out): + create_directory(dir_out) + self.dir_out = dir_out + + data_path = path_parent + '/data/' + training_path = data_path + 'training.hdf5' + validation_path = data_path + 'validation.hdf5' + classifier_path = os.path.dirname(os.path.abspath(__file__)) + #print('path = ', classifier_path) + n_bin = self.opt['bins'] + + print("Fitting classifier") + for classifier_name in self.CLASSIFIERS: + print('classifier_name', classifier_name) + + output_directory = self.dir_out +'/' + classifier_name + '/' + create_directory(output_directory) + + print("Training") + print(os.getcwd()) + self.p = subprocess.Popen(['python3','-u', str(classifier_path) + '/' + classifier_name + '.py', + training_path, + validation_path, + output_directory, + str(n_bin)]) + subprocess.Popen.wait(self.p) + + create_directory(output_directory + '/DONE') + print('\t\t\t\tDONE') + # Can be replaced with any classifier + + + def apply (self,data): + """Applies training to the data. 
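+        (the ensemble prediction is delegated to nne_jax.Classifier_NNE, which
+        collects the per-network predictions produced above and combines them)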
+ + Parameters: + ----------- + Data: numpy array, size Ngalaxes x Nbands + testing data, each row is a galaxy, each column is a band as per + band defined above + Returns: + tomographic_selections: numpy array, int, size Ngalaxies + tomographic selection for galaxies return as bin number for + each galaxy. + """ + + from tomo_challenge.classifiers import nne_jax as nne + result_dir = self.dir_out+'/ensemble_result/' + create_directory(self.dir_out+'/ensemble_result/') + ensemble = nne.Classifier_NNE(result_dir) + + preds = ensemble.predict() + tomo_bin = np.argmax(preds, axis=1) + return tomo_bin + + +def create_directory(directory_path): + if os.path.exists(directory_path): + return None + else: + try: + os.makedirs(directory_path) + except: + + return None + return directory_path diff --git a/tomo_challenge/classifiers/LGBM_classifier.py b/tomo_challenge/classifiers/LGBM_classifier.py new file mode 100644 index 00000000..7633be7b --- /dev/null +++ b/tomo_challenge/classifiers/LGBM_classifier.py @@ -0,0 +1,115 @@ +""" +LGBM Classifier +This is an example tomographic bin generator using a Light Gradient Boosted Machine as Classifier. +This solution was developed by the Brazilian Center for Physics Research AI 4 Astrophysics team. +Authors: Clecio R. Bom, Gabriel Teixeira, Bernardo M. Fraga, Eduardo Cypriano. +contact: debom |at |cbpf| dot| br +In our preliminary tests we found a SNR 3X2 of ~1926 for n=10 bins. +Every classifier module needs to: + - have construction of the type + __init__ (self, bands, options) (see examples below) + - implement two functions: + train (self, training_data,training_z) + apply (self, data). + - define valid_options class varible. +""" + +from .base import Tomographer +import numpy as np +from lightgbm import LGBMClassifier + + +class LGBM(Tomographer): + """ Light Gradient Boosted Machine Classifier """ + + # valid parameter -- see below + valid_options = ['bins'] + # this settings means arrays will be sent to train and apply instead + # of dictionaries + wants_arrays = True + + def __init__ (self, bands, options): + """Constructor + + Parameters: + ----------- + bands: str + string containg valid bands, like 'riz' or 'griz' + options: dict + options come through here. Valid keys are listed as valid_options + class variable. + Note: + ----- + Valid options are: + 'bins' - number of tomographic bins + """ + self.bands = bands + self.opt = options + + def train (self, training_data, training_z): + """Trains the classifier + + Parameters: + ----------- + training_data: numpy array, size Ngalaxes x Nbands + training data, each row is a galaxy, each column is a band as per + band defined above + training_z: numpy array, size Ngalaxies + true redshift for the training sample + """ + + n_bin = self.opt['bins'] + print("Finding bins for training data") + # Now put the training data into redshift bins. 
+ # Use zero so that the one object with minimum + # z in the whole survey will be in the lowest bin + training_bin = np.zeros(training_z.size) + + # Find the edges that split the redshifts into n_z bins of + # equal number counts in each + p = np.linspace(0, 100, n_bin + 1) + z_edges = np.percentile(training_z, p) + + # Now find all the objects in each of these bins + for i in range(n_bin): + z_low = z_edges[i] + z_high = z_edges[i + 1] + training_bin[(training_z > z_low) & (training_z < z_high)] = i + + # For speed, it's possible cut down a percentage of original size + #cut = np.random.uniform(0, 1, training_z.size) < 0.05 + #training_data = training_data[cut] + #training_bin = training_bin[cut] + + + model = LGBMClassifier + # Can be replaced with any classifier + + + print("Fitting classifier") + # Lots of data, so this will take some time + model.fit(training_data, training_bin) + + self.classifier = model + self.z_edges = z_edges + self.scaler = scaler + + + def apply (self, data): + """Applies training to the data. + + Parameters: + ----------- + Data: numpy array, size Ngalaxes x Nbands + testing data, each row is a galaxy, each column is a band as per + band defined above + Returns: + tomographic_selections: numpy array, int, size Ngalaxies + tomographic selection for galaxies return as bin number for + each galaxy. + """ + + + preds = self.classifier.predict(data) + tomo_bin = np.argmax(preds, axis=1) + return tomo_bin diff --git a/tomo_challenge/classifiers/LSTM_bidirectional.py b/tomo_challenge/classifiers/LSTM_bidirectional.py new file mode 100644 index 00000000..aa857066 --- /dev/null +++ b/tomo_challenge/classifiers/LSTM_bidirectional.py @@ -0,0 +1,183 @@ +""" +Bidirectional LSTM Classifier +This is an example tomographic bin generator using a convolutional bidirectional LSTM neural network. +We also added a custom data loader we tested. +This solution was developed by the Brazilian Center for Physics Research AI 4 Astrophysics team. +Authors: Clecio R. Bom, Bernardo M. Fraga, Gabriel Teixeira and Elizabeth Gonzalez. +contact: debom |at |cbpf| dot| br +Every classifier module needs to: + - have construction of the type + __init__ (self, bands, options) (see examples below) + - implement two functions: + train (self, training_data,training_z) + apply (self, data). + - define valid_options class varible. +See Classifier Documentation below. +""" + +from .base import Tomographer +import numpy as np +import tensorflow as tf +from tensorflow import keras +from sklearn.preprocessing import MinMaxScaler +import h5py + + +class LSTM(Tomographer): + """ Bidirectional LSTM deep classifier """ + + # valid parameter -- see below + valid_options = ['bins'] + # this settings means arrays will be sent to train and apply instead + # of dictionaries + wants_arrays = False + + def __init__ (self, bands, options): + """Constructor + + Parameters: + ----------- + bands: str + string containg valid bands, like 'riz' or 'griz' + options: dict + options come through here. Valid keys are listed as valid_options + class variable. 
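+            (for example, example/example_lstm.yaml requests 'bins': 10
+            for this class)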
+ + Note: + ----- + Valid options are: + 'bins' - number of tomographic bins + + """ + self.bands = bands + self.opt = options + + def load_data(fname, take_colors=True, cutoff=0.0): + n_bins = self.opt['bins'] + data = h5py.File(fname, 'r') + r_mag = data['r_mag'] + g_mag = data['g_mag'] + i_mag = data['i_mag'] + z_mag = data['z_mag'] + redshift = data['redshift_true'] + all_mags = np.vstack([g_mag, r_mag, i_mag, z_mag]) + all_mags = all_mags.T + mask = (all_mags != np.inf).all(axis=1) + all_mags = all_mags[mask,:] + redshift = redshift[mask] + gr_color = all_mags[:,0] - all_mags[:,1] + ri_color = all_mags[:,1] - all_mags[:,2] + iz_color = all_mags[:,2] - all_mags[:,3] + all_colors = np.vstack([gr_color, ri_color, iz_color]) + all_colors = all_colors.T + p = np.linspace(0, 100, n_bins+1) + z_edges = np.percentile(redshift, p) + train_bin = np.zeros(all_mags.shape[0]) + for i in range(n_bins): + z_low = z_edges[i] + z_high = z_edges[i+1] + train_bin[(redshift > z_low) & (redshift <= z_high)] = i + if cutoff != 0.0: + cut = np.random.uniform(0, 1, all_mags.shape[0]) < cutoff + train_bin = train_bin[cut].reshape(-1,1) + all_mags = all_mags[cut] + all_colors = all_colors[cut] + redshift = redshift[cut] + else: + train_bin = train_bin.reshape(-1,1) + if take_colors: + return np.hstack([all_mags, all_colors]), redshift, train_bin.astype(int), z_edges + else: + return mags, redshift, train_bin.astype(int), z_edges + + + def train (self, training_data, training_z): + """Trains the classifier + + Parameters: + ----------- + training_data: numpy array, size Ngalaxes x Nbands + training data, each row is a galaxy, each column is a band as per + band defined above + training_z: numpy array, size Ngalaxies + true redshift for the training sample + + """ + + n_bin = self.opt['bins'] + print("Finding bins for training data") + # Now put the training data into redshift bins. 
+ # Use zero so that the one object with minimum + # z in the whole survey will be in the lowest bin + training_bin = np.zeros(training_z.size) + + # Find the edges that split the redshifts into n_z bins of + # equal number counts in each + p = np.linspace(0, 100, n_bin + 1) + z_edges = np.percentile(training_z, p) + + # Now find all the objects in each of these bins + for i in range(n_bin): + z_low = z_edges[i] + z_high = z_edges[i + 1] + training_bin[(training_z > z_low) & (training_z < z_high)] = i + #cut = np.random.uniform(0, 1, training_z.size) < 0.05 + #training_data = training_data[cut] + #training_bin = training_bin[cut] + + lstm_out = np.int(486*2) + inp = keras.layers.Input(shape=(training_data.shape[1], 1)) + + x = keras.layers.Conv1D(64, 3, padding='same')(inp) + x = keras.layers.Activation('tanh')(x) + x = keras.layers.MaxPooling1D(2, padding='same')(x) + + x = keras.layers.Conv1D(128, 3, padding='same')(x) + x = keras.layers.Activation('tanh')(x) + x = keras.layers.MaxPooling1D(2, padding='same')(x) + + x = keras.layers.Bidirectional(keras.layers.LSTM(lstm_out, return_sequences=False), merge_mode='concat')(x) + + x = keras.layers.Dense(lstm_out // 2, activation='tanh')(x) + x = keras.layers.Dense(lstm_out // 2, activation='tanh')(x) + + x = keras.layers.Dense(n_bin, activation='softmax')(x) + + model = keras.models.Model(inp, x) + + model.compile(loss='sparse_categorical_crossentropy', optimizer=keras.optimizers.Adam(0.001), metrics=['accuracy']) + + # Can be replaced with any classifier + scaler = MinMaxScaler() + x_train = scaler.fit_transform(training_data) + x_train = np.expand_dims(x_train, axis=-1) + y_train = np.expand_dims(training_bin, axis=-1) + print("Fitting classifier") + # Lots of data, so this will take some time + model.fit(x_train, y_train, epochs=15, verbose=0) + + self.classifier = model + self.z_edges = z_edges + self.scaler = scaler + + + def apply (self, data): + """Applies training to the data. + + Parameters: + ----------- + Data: numpy array, size Ngalaxes x Nbands + testing data, each row is a galaxy, each column is a band as per + band defined above + + Returns: + tomographic_selections: numpy array, int, size Ngalaxies + tomographic selection for galaxies return as bin number for + each galaxy. + """ + x_test = self.scaler.transform(data) + x_test = np.expand_dims(x_test, axis=-1) + preds = self.classifier.predict(x_test) + tomo_bin = np.argmax(preds, axis=1) + return tomo_bin + diff --git a/tomo_challenge/classifiers/TCN.py b/tomo_challenge/classifiers/TCN.py new file mode 100644 index 00000000..e2db3b7a --- /dev/null +++ b/tomo_challenge/classifiers/TCN.py @@ -0,0 +1,172 @@ +""" +TCN classifier +This is an example tomographic bin generator using a temporal convolutional neural network (TCN). +We also added a custom data loader we tested. +This solution was developed by the Brazilian Center for Physics Research AI 4 Astrophysics team. +Authors: Clecio R. Bom, Bernardo M. Fraga, Gabriel Teixeira, Eduardo Cypriano and Elizabeth Gonzalez. +contact: debom |at |cbpf| dot| br +Every classifier module needs to: + - have construction of the type + __init__ (self, bands, options) (see examples below) + - implement two functions: + train (self, training_data,training_z) + apply (self, data). + - define valid_options class varible. +See Classifier Documentation below. 
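+A minimal sketch of that contract (the class and bodies below are illustrative
+only, they are not part of this module):
+
+    class MyTomographer(Tomographer):
+        valid_options = ['bins']
+        wants_arrays = True
+
+        def __init__(self, bands, options):
+            self.bands, self.opt = bands, options
+
+        def train(self, training_data, training_z):
+            ...  # fit a model mapping photometry to tomographic bins
+
+        def apply(self, data):
+            ...  # return one integer bin index per galaxy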
+""" + +from .base import Tomographer +import numpy as np +import tensorflow as tf +from tensorflow import keras +from sklearn.preprocessing import MinMaxScaler +from tcn import TCN +import h5py + + +class TCN(Tomographer): + """ TCN deep classifier """ + + # valid parameter -- see below + valid_options = ['bins'] + # this settings means arrays will be sent to train and apply instead + # of dictionaries + wants_arrays = False + + def __init__ (self, bands, options): + """Constructor + + Parameters: + ----------- + bands: str + string containg valid bands, like 'riz' or 'griz' + options: dict + options come through here. Valid keys are listed as valid_options + class variable. + + Note: + ----- + Valid options are: + 'bins' - number of tomographic bins + + """ + self.bands = bands + self.opt = options + + def load_data(fname, take_colors=True, cutoff=0.0): + n_bins = self.opt['bins'] + data = h5py.File(fname, 'r') + r_mag = data['r_mag'] + g_mag = data['g_mag'] + i_mag = data['i_mag'] + z_mag = data['z_mag'] + redshift = data['redshift_true'] + all_mags = np.vstack([g_mag, r_mag, i_mag, z_mag]) + all_mags = all_mags.T + mask = (all_mags != np.inf).all(axis=1) + all_mags = all_mags[mask,:] + redshift = redshift[mask] + gr_color = all_mags[:,0] - all_mags[:,1] + ri_color = all_mags[:,1] - all_mags[:,2] + iz_color = all_mags[:,2] - all_mags[:,3] + all_colors = np.vstack([gr_color, ri_color, iz_color]) + all_colors = all_colors.T + p = np.linspace(0, 100, n_bins+1) + z_edges = np.percentile(redshift, p) + train_bin = np.zeros(all_mags.shape[0]) + for i in range(n_bins): + z_low = z_edges[i] + z_high = z_edges[i+1] + train_bin[(redshift > z_low) & (redshift <= z_high)] = i + if cutoff != 0.0: + cut = np.random.uniform(0, 1, all_mags.shape[0]) < cutoff + train_bin = train_bin[cut].reshape(-1,1) + all_mags = all_mags[cut] + all_colors = all_colors[cut] + redshift = redshift[cut] + else: + train_bin = train_bin.reshape(-1,1) + if take_colors: + return np.hstack([all_mags, all_colors]), redshift, train_bin.astype(int), z_edges + else: + return mags, redshift, train_bin.astype(int), z_edges + + + def train (self, training_data, training_z): + """Trains the classifier + + Parameters: + ----------- + training_data: numpy array, size Ngalaxes x Nbands + training data, each row is a galaxy, each column is a band as per + band defined above + training_z: numpy array, size Ngalaxies + true redshift for the training sample + + """ + + n_bin = self.opt['bins'] + print("Finding bins for training data") + # Now put the training data into redshift bins. 
+ # Use zero so that the one object with minimum + # z in the whole survey will be in the lowest bin + training_bin = np.zeros(training_z.size) + + # Find the edges that split the redshifts into n_z bins of + # equal number counts in each + p = np.linspace(0, 100, n_bin + 1) + z_edges = np.percentile(training_z, p) + + # Now find all the objects in each of these bins + for i in range(n_bin): + z_low = z_edges[i] + z_high = z_edges[i + 1] + training_bin[(training_z > z_low) & (training_z < z_high)] = i + #cut = np.random.uniform(0, 1, training_z.size) < 0.05 + #training_data = training_data[cut] + #training_bin = training_bin[cut] + + + inp = keras.layers.Input(shape=(training_data.shape[1], 1)) + x = TCN(nb_filters=[128, 128], kernel_size=2, dilations=[1, 2], nb_stacks=2, activation='relu', return_sequences=False, use_batch_norm=True, use_skip_connections=True)(inp) + + x = keras.layers.Dense(n_bin, activation='softmax')(x) + + model = keras.models.Model(inp, x) + + model.compile(loss='sparse_categorical_crossentropy', optimizer=keras.optimizers.Adam(0.001), metrics=['accuracy']) + + # Can be replaced with any classifier + scaler = MinMaxScaler() + x_train = scaler.fit_transform(training_data) + x_train = np.expand_dims(x_train, axis=-1) + y_train = np.expand_dims(training_bin, axis=-1) + print("Fitting classifier") + # Lots of data, so this will take some time + model.fit(x_train, y_train, epochs=20, verbose=0) + + self.classifier = model + self.z_edges = z_edges + self.scaler = scaler + + + def apply (self, data): + """Applies training to the data. + + Parameters: + ----------- + Data: numpy array, size Ngalaxes x Nbands + testing data, each row is a galaxy, each column is a band as per + band defined above + + Returns: + tomographic_selections: numpy array, int, size Ngalaxies + tomographic selection for galaxies return as bin number for + each galaxy. + """ + x_test = self.scaler.transform(data) + x_test = np.expand_dims(x_test, axis=-1) + preds = self.classifier.predict(x_test) + tomo_bin = np.argmax(preds, axis=1) + return tomo_bin + diff --git a/tomo_challenge/classifiers/autokeras_LSTM.py b/tomo_challenge/classifiers/autokeras_LSTM.py new file mode 100644 index 00000000..a146bdd3 --- /dev/null +++ b/tomo_challenge/classifiers/autokeras_LSTM.py @@ -0,0 +1,180 @@ +""" +Bidirectional AutoML LSTM Classifier +This is an example tomographic bin generator using an Autokeras optimized convolutional LSTM neural network. +We also added a custom data loader we tested. +This solution was developed by the Brazilian Center for Physics Research AI 4 Astrophysics team. +Authors: Clecio R. Bom, Bernardo M. Fraga, Gabriel Teixeira, Eduardo Cypriano and Elizabeth Gonzalez. +contact: debom |at |cbpf| dot| br +Every classifier module needs to: + - have construction of the type + __init__ (self, bands, options) (see examples below) + - implement two functions: + train (self, training_data,training_z) + apply (self, data). + - define valid_options class varible. +See Classifier Documentation below. 
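+The class is picked up by name from the run: block of the config file; the
+matching example added in this patch, example/example_autolstm.yaml, contains:
+
+    run:
+      Autokeras_LSTM:
+        run_3:
+          bins: 10
+          colors: True
+          errors: False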
+""" + +from .base import Tomographer +import numpy as np +import tensorflow as tf +from tensorflow import keras +from sklearn.preprocessing import MinMaxScaler +import h5py + +class Autokeras_LSTM(Tomographer): + """ LSTM optimised by AutoKeras deep classifier """ + + # valid parameter -- see below + valid_options = ['bins'] + # this settings means arrays will be sent to train and apply instead + # of dictionaries + wants_arrays = False + + def __init__ (self, bands, options): + """Constructor + + Parameters: + ----------- + bands: str + string containg valid bands, like 'riz' or 'griz' + options: dict + options come through here. Valid keys are listed as valid_options + class variable. + + Note: + ----- + Valid options are: + 'bins' - number of tomographic bins + + """ + self.bands = bands + self.opt = options + + def load_data(fname, take_colors=True, cutoff=0.0): + n_bins = self.opt['bins'] + data = h5py.File(fname, 'r') + r_mag = data['r_mag'] + g_mag = data['g_mag'] + i_mag = data['i_mag'] + z_mag = data['z_mag'] + redshift = data['redshift_true'] + all_mags = np.vstack([g_mag, r_mag, i_mag, z_mag]) + all_mags = all_mags.T + mask = (all_mags != np.inf).all(axis=1) + all_mags = all_mags[mask,:] + redshift = redshift[mask] + gr_color = all_mags[:,0] - all_mags[:,1] + ri_color = all_mags[:,1] - all_mags[:,2] + iz_color = all_mags[:,2] - all_mags[:,3] + all_colors = np.vstack([gr_color, ri_color, iz_color]) + all_colors = all_colors.T + p = np.linspace(0, 100, n_bins+1) + z_edges = np.percentile(redshift, p) + train_bin = np.zeros(all_mags.shape[0]) + for i in range(n_bins): + z_low = z_edges[i] + z_high = z_edges[i+1] + train_bin[(redshift > z_low) & (redshift <= z_high)] = i + if cutoff != 0.0: + cut = np.random.uniform(0, 1, all_mags.shape[0]) < cutoff + train_bin = train_bin[cut].reshape(-1,1) + all_mags = all_mags[cut] + all_colors = all_colors[cut] + redshift = redshift[cut] + else: + train_bin = train_bin.reshape(-1,1) + if take_colors: + return np.hstack([all_mags, all_colors]), redshift, train_bin.astype(int), z_edges + else: + return mags, redshift, train_bin.astype(int), z_edges + + + def train (self, training_data, training_z): + """Trains the classifier + + Parameters: + ----------- + training_data: numpy array, size Ngalaxes x Nbands + training data, each row is a galaxy, each column is a band as per + band defined above + training_z: numpy array, size Ngalaxies + true redshift for the training sample + + """ + + n_bin = self.opt['bins'] + print("Finding bins for training data") + # Now put the training data into redshift bins. 
+ # Use zero so that the one object with minimum + # z in the whole survey will be in the lowest bin + training_bin = np.zeros(training_z.size) + + # Find the edges that split the redshifts into n_z bins of + # equal number counts in each + p = np.linspace(0, 100, n_bin + 1) + z_edges = np.percentile(training_z, p) + + # Now find all the objects in each of these bins + for i in range(n_bin): + z_low = z_edges[i] + z_high = z_edges[i + 1] + training_bin[(training_z > z_low) & (training_z < z_high)] = i + #cut = np.random.uniform(0, 1, training_z.size) < 0.05 + #training_data = training_data[cut] + #training_bin = training_bin[cut] + + + inp = keras.layers.Input(shape=(training_data.shape[1], 1)) + x = keras.layers.Conv1D(32, 5, padding='same', activation='relu')(inp) + x = keras.layers.Conv1D(512, 5, padding='same', activation='relu')(x) + x = keras.layers.Conv1D(128, 5, padding='same', activation='relu')(x) + + x = keras.layers.Bidirectional(keras.layers.LSTM(128, return_sequences=True), merge_mode='concat')(x) + x = keras.layers.Bidirectional(keras.layers.LSTM(128, return_sequences=False), merge_mode='concat')(x) + + x = keras.layers.Dense(512)(x) + x = keras.layers.BatchNormalization()(x) + x = keras.layers.Activation('relu')(x) + x = keras.layers.Dropout(0.25)(x) + x = keras.layers.Dropout(0.5)(x) + x = keras.layers.Dense(n_bin, activation='softmax')(x) + + model = keras.models.Model(inp, x) + + model.compile(loss='sparse_categorical_crossentropy', optimizer=keras.optimizers.Adam(0.001), metrics=['accuracy']) + + # Can be replaced with any classifier + scaler = MinMaxScaler() + x_train = scaler.fit_transform(training_data) + x_train = np.expand_dims(x_train, axis=-1) + y_train = np.expand_dims(training_bin, axis=-1) + print("Fitting classifier") + # Lots of data, so this will take some time + model.fit(x_train, y_train, epochs=30, verbose=0) + + self.classifier = model + self.z_edges = z_edges + self.scaler = scaler + + + def apply (self, data): + """Applies training to the data. + + Parameters: + ----------- + Data: numpy array, size Ngalaxes x Nbands + testing data, each row is a galaxy, each column is a band as per + band defined above + + Returns: + tomographic_selections: numpy array, int, size Ngalaxies + tomographic selection for galaxies return as bin number for + each galaxy. 
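+            (with 'bins' = 10 the network outputs an (Ngalaxies, 10) softmax
+            array and np.argmax(..., axis=1) reduces it to integers 0-9)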
+ """ + x_test = self.scaler.transform(data) + x_test = np.expand_dims(x_test, axis=-1) + preds = self.classifier.predict(x_test) + tomo_bin = np.argmax(preds, axis=1) + return tomo_bin + diff --git a/tomo_challenge/classifiers/autolstm_jax.py b/tomo_challenge/classifiers/autolstm_jax.py new file mode 100644 index 00000000..e4e416bb --- /dev/null +++ b/tomo_challenge/classifiers/autolstm_jax.py @@ -0,0 +1,183 @@ +import jax_metrics as j_metrics +from flax import nn, optim, serialization, jax_utils +import jax.random as rand +import jax.numpy as jnp +import jax +import os +#os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID' +#os.environ['CUDA_VISIBLE_DEVICES'] = '2' +from sklearn.preprocessing import MinMaxScaler +import numpy as np +import sys +import h5py +import tensorflow as tf + +def get_classifier(n_features, n_bins): + class AutoLSTM(nn.Module): + def apply(self, x): + batch_size = x.shape[0] + net = nn.Conv(x, features=32, kernel_size=(5,), padding='SAME') + net = nn.leaky_relu(net) + net = nn.Conv(net, 512, kernel_size=(5,), padding='SAME') + net = nn.leaky_relu(net) + net = nn.Conv(net, 128, kernel_size=(5,), padding='SAME') + net = nn.leaky_relu(net) + + carry = nn.LSTMCell.initialize_carry(rand.PRNGKey(0), (batch_size,), 128) + new_carry, output_1 = jax_utils.scan_in_dim(nn.LSTMCell.partial(), carry, net, axis=1) + _, output_2 = jax_utils.scan_in_dim(nn.LSTMCell.partial(), new_carry, output_1, axis=1) + + net_dense = output_2.reshape((output_2.shape[0], -1)) + net_dense = nn.Dense(net_dense, 512) + net_dense = nn.BatchNorm(net_dense) + net_dense = nn.leaky_relu(net_dense) + + net_dense = nn.dropout(net_dense, 0.25) + + net_dense = nn.dropout(net_dense, 0.5) + return nn.softmax(nn.Dense(net_dense, n_bins)) + + with nn.stochastic(rand.PRNGKey(0)): + _, initial_params = AutoLSTM.init_by_shape(rand.PRNGKey(0), [((1, n_features, 1), jnp.float32)]) + return nn.Model(AutoLSTM, initial_params) + + +#def get_classifier(n_features, n_bins): +# _, initial_params = Resnet(n_bins).init_by_shape( rand.PRNGKey(0), [((1, n_features,1), jnp.float32)]) +# return nn.Model(Resnet(n_bins), initial_params) + +def get_batches(features, redshift, batch_size, train=True): + if train: + dataset = tf.data.Dataset.from_tensor_slices((features, redshift)) + dataset = dataset.shuffle(buffer_size=2048).batch(batch_size) + else: + dataset = tf.data.Dataset.from_tensor_slices(features).batch(batch_size) + return dataset + + + +def train(n_features, n_bins, training_data, training_z, batch_size=512, epochs=20, lr=0.001): + model = get_classifier(n_features, n_bins) + optimizer = optim.Adam(learning_rate=lr).create(model) + + @jax.jit + def train_step(optimizer, x, y): + # This is the loss function + def loss_fn(model): + # Apply classifier to features + w = model(x) + # returns - score, because we want to maximize score + return 1000./ j_metrics.compute_snr_score(w, y) + # Compute gradients + loss, g = jax.value_and_grad(loss_fn)(optimizer.target) + # Perform gradient descent + optimizer = optimizer.apply_gradient(g) + return optimizer, loss + + + losses = [] + + with nn.stochastic(rand.PRNGKey(0)): + for e in range(epochs): + print('starting epoch {}'.format(e)) + batches = get_batches(training_data, training_z, batch_size) + for i, (x_train, labels) in enumerate(batches.as_numpy_iterator()): + optimizer, loss = train_step(optimizer, x_train, labels) + losses.append(loss) + print('Epoch {}\nLoss = {}'.format(e, loss)) + model = optimizer.target + + return losses, model + +def predict(model, data, batch_size=512): + 
+ batches = get_batches(data, 0., batch_size, train=False) + preds = [] + n_batches = data.shape[0] // batch_size + with nn.stochastic(rand.PRNGKey(0)): + for i, x_test in enumerate(batches.as_numpy_iterator()): + print('batch {} of {}'.format(i, n_batches)) + p = model(x_test) + preds.append(p) + result = np.concatenate([p for p in preds], axis=0) + return result + +def load_data(n_bins, fname, no_inf=True, take_colors=True, cutoff=0.0): + + #print(fname) + #sys.exit() + data = h5py.File(fname, 'r') + + r_mag = data['r_mag'] + g_mag = data['g_mag'] + i_mag = data['i_mag'] + z_mag = data['z_mag'] + redshift = data['redshift_true'] + all_mags = np.vstack([g_mag, r_mag, i_mag, z_mag]) + all_mags = all_mags.T + if no_inf: + mask = (all_mags != np.inf).all(axis=1) + all_mags = all_mags[mask,:] + redshift = redshift[mask] + else: + bad = ~np.isfinite(all_mags) + all_mags[bad] = 30 + gr_color = all_mags[:,0] - all_mags[:,1] + ri_color = all_mags[:,1] - all_mags[:,2] + iz_color = all_mags[:,2] - all_mags[:,3] + all_colors = np.vstack([gr_color, ri_color, iz_color]) + all_colors = all_colors.T + p = np.linspace(0, 100, n_bins+1) + z_edges = np.percentile(redshift, p) + train_bin = np.zeros(all_mags.shape[0]) + for i in range(n_bins): + z_low = z_edges[i] + z_high = z_edges[i+1] + train_bin[(redshift > z_low) & (redshift <= z_high)] = i + if cutoff != 0.0: + cut = np.random.uniform(0, 1, all_mags.shape[0]) < cutoff + train_bin = train_bin[cut].reshape(-1,1) + all_mags = all_mags[cut] + all_colors = all_colors[cut] + redshift = redshift[cut] + else: + train_bin = train_bin.reshape(-1,1) + if take_colors: + return np.hstack([all_mags, all_colors]), redshift, train_bin.astype(int), z_edges + else: + return mags, redshift, train_bin.astype(int), z_edges + +def prepare_data(data, scaler=None): + if scaler == None: + scaler = MinMaxScaler() + result = scaler.fit_transform(data) + result = np.expand_dims(result, axis=-1) + return result, scaler + else: + result = scaler.transform(data) + result = np.expand_dims(result, axis=-1) + return result + + +if __name__ == '__main__': + n_bins = int(sys.argv[4]) + n_features = 7 + print('preparing data') + feats_train, z_train, bins_train, edges_train = load_data(n_bins, sys.argv[1], no_inf=False) + + x_train, scaler = prepare_data(feats_train) + + print('training') + losses, trained_model = train(n_features, n_bins, x_train, z_train, epochs=20) + print('training finished') + + val_feats, z_val, bins_val, edges_val = load_data(n_bins, sys.argv[2], no_inf=False) + x_val = prepare_data(val_feats, scaler=scaler) + print(x_val.shape) + print('predicting') + preds = predict(trained_model, x_val, 512) + + #classes = preds.argmax(axis=1) + print('saving') + output_dir = sys.argv[3] + np.save(output_dir+'y_pred.npy', preds) diff --git a/tomo_challenge/classifiers/conv_ak.py b/tomo_challenge/classifiers/conv_ak.py new file mode 100644 index 00000000..a50280ec --- /dev/null +++ b/tomo_challenge/classifiers/conv_ak.py @@ -0,0 +1,180 @@ +""" +Conv1D Classifier +This is an example tomographic bin generator using a convolutional 1d neural network. +The initial model was optimized using AutoKeras. +We also added a custom data loader we tested. +This solution was developed by the Brazilian Center for Physics Research AI 4 Astrophysics team. +Authors: Clecio R. Bom, Bernardo M. Fraga, Gabriel Teixeira and Elizabeth Gonzalez. 
+contact: debom |at |cbpf| dot| br +Every classifier module needs to: + - have construction of the type + __init__ (self, bands, options) (see examples below) + - implement two functions: + train (self, training_data,training_z) + apply (self, data). + - define valid_options class varible. +See Classifier Documentation below. +""" + +from .base import Tomographer +import numpy as np +import tensorflow as tf +from tensorflow import keras +from sklearn.preprocessing import MinMaxScaler +import h5py + +class CNN(Tomographer): + """ Convolutional deep classifier """ + + # valid parameter -- see below + valid_options = ['bins'] + # this settings means arrays will be sent to train and apply instead + # of dictionaries + wants_arrays = False + + def __init__ (self, bands, options): + """Constructor + + Parameters: + ----------- + bands: str + string containg valid bands, like 'riz' or 'griz' + options: dict + options come through here. Valid keys are listed as valid_options + class variable. + + Note: + ----- + Valid options are: + 'bins' - number of tomographic bins + + """ + self.bands = bands + self.opt = options + + def load_data(fname, take_colors=True, cutoff=0.0): + n_bins = self.opt['bins'] + data = h5py.File(fname, 'r') + r_mag = data['r_mag'] + g_mag = data['g_mag'] + i_mag = data['i_mag'] + z_mag = data['z_mag'] + redshift = data['redshift_true'] + all_mags = np.vstack([g_mag, r_mag, i_mag, z_mag]) + all_mags = all_mags.T + mask = (all_mags != np.inf).all(axis=1) + all_mags = all_mags[mask,:] + redshift = redshift[mask] + gr_color = all_mags[:,0] - all_mags[:,1] + ri_color = all_mags[:,1] - all_mags[:,2] + iz_color = all_mags[:,2] - all_mags[:,3] + all_colors = np.vstack([gr_color, ri_color, iz_color]) + all_colors = all_colors.T + p = np.linspace(0, 100, n_bins+1) + z_edges = np.percentile(redshift, p) + train_bin = np.zeros(all_mags.shape[0]) + for i in range(n_bins): + z_low = z_edges[i] + z_high = z_edges[i+1] + train_bin[(redshift > z_low) & (redshift <= z_high)] = i + if cutoff != 0.0: + cut = np.random.uniform(0, 1, all_mags.shape[0]) < cutoff + train_bin = train_bin[cut].reshape(-1,1) + all_mags = all_mags[cut] + all_colors = all_colors[cut] + redshift = redshift[cut] + else: + train_bin = train_bin.reshape(-1,1) + if take_colors: + return np.hstack([all_mags, all_colors]), redshift, train_bin.astype(int), z_edges + else: + return mags, redshift, train_bin.astype(int), z_edges + + + def train (self, training_data, training_z): + """Trains the classifier + + Parameters: + ----------- + training_data: numpy array, size Ngalaxes x Nbands + training data, each row is a galaxy, each column is a band as per + band defined above + training_z: numpy array, size Ngalaxies + true redshift for the training sample + + """ + + n_bin = self.opt['bins'] + print("Finding bins for training data") + # Now put the training data into redshift bins. 
+ # Use zero so that the one object with minimum + # z in the whole survey will be in the lowest bin + training_bin = np.zeros(training_z.size) + + # Find the edges that split the redshifts into n_z bins of + # equal number counts in each + p = np.linspace(0, 100, n_bin + 1) + z_edges = np.percentile(training_z, p) + + # Now find all the objects in each of these bins + for i in range(n_bin): + z_low = z_edges[i] + z_high = z_edges[i + 1] + training_bin[(training_z > z_low) & (training_z < z_high)] = i + #cut = np.random.uniform(0, 1, training_z.size) < 0.05 + #training_bin = training_bin[cut] + #training_data = training_data[cut] + + inp = keras.layers.Input(shape=(training_data.shape[1], 1)) + + x = keras.layers.Conv1D(256, 7, padding='same', activation='relu')(inp) + x = keras.layers.Conv1D(512, 7, padding='same', activation='relu')(x) + x = keras.layers.MaxPooling1D(6, padding='same')(x) + x = keras.layers.Dropout(0.25)(x) + + x = keras.layers.Conv1D(128, 7, padding='same', activation='relu')(x) + x = keras.layers.Conv1D(256, 7, padding='same', activation='relu')(x) + x = keras.layers.MaxPooling1D(6, padding='same')(x) + x = keras.layers.Dropout(0.25)(x) + x = keras.layers.GlobalAveragePooling1D()(x) + + x = keras.layers.Dense(n_bin, activation='softmax')(x) + + model = keras.models.Model(inp, x) + + model.compile(loss='sparse_categorical_crossentropy', optimizer=keras.optimizers.Adam(0.001), metrics=['accuracy']) + + # Can be replaced with any classifier + scaler = MinMaxScaler() + x_train = scaler.fit_transform(training_data) + print("Fitting classifier") + # Lots of data, so this will take some time + x_train = np.expand_dims(x_train, axis=-1) + y_train = np.expand_dims(training_bin, axis=-1) + model.fit(x_train, y_train, epochs=15, verbose=0) + + self.classifier = model + self.z_edges = z_edges + self.scaler = scaler + + + def apply (self, data): + """Applies training to the data. + + Parameters: + ----------- + Data: numpy array, size Ngalaxes x Nbands + testing data, each row is a galaxy, each column is a band as per + band defined above + + Returns: + tomographic_selections: numpy array, int, size Ngalaxies + tomographic selection for galaxies return as bin number for + each galaxy. 
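+            (the MinMaxScaler fitted in train() is reused here, so apply()
+            is only valid after train() has been called)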
+ """ + x_test = self.scaler.transform(data) + x_test = np.expand_dims(x_test, axis=-1) + preds = self.classifier.predict(x_test) + tomo_bin = np.argmax(preds, axis=1) + return tomo_bin + diff --git a/tomo_challenge/classifiers/de_autolstm.py b/tomo_challenge/classifiers/de_autolstm.py new file mode 100644 index 00000000..c277dd21 --- /dev/null +++ b/tomo_challenge/classifiers/de_autolstm.py @@ -0,0 +1,88 @@ +# AutoLSTM model for Deep Ensemble +import tensorflow.keras as keras +import numpy as np +import time + +#from tomo_challenge.utils.utils import save_logs +#from tomo_challenge.utils.utils import calculate_metrics +#from tomo_challenge.utils.utils import get_available_gpus +from tensorflow.python.client import device_lib + +class Classifier_LSTM: + + def __init__(self, output_directory, input_shape, nb_classes, verbose=True,build=True): + self.output_directory = output_directory + if build == True: + self.model = self.build_model(input_shape, nb_classes) + if(verbose==True): + self.model.summary() + self.verbose = verbose + if 'autolstm' in self.output_directory: + self.model.save_weights(self.output_directory+'model_init.hdf5') + return + + def build_model(self, input_shape, nb_classes): + input_layer = keras.layers.Input(input_shape) + + x = keras.layers.Conv1D(32, 5, padding='same', activation='relu')(input_layer) + x = keras.layers.Conv1D(512, 5, padding='same', activation='relu')(x) + x = keras.layers.Conv1D(128, 5, padding='same', activation='relu')(x) + + x = keras.layers.Bidirectional(keras.layers.LSTM(128, return_sequences=True), merge_mode='concat')(x) + x = keras.layers.Bidirectional(keras.layers.LSTM(128, return_sequences=False), merge_mode='concat')(x) + + x = keras.layers.Dense(512)(x) + x = keras.layers.BatchNormalization()(x) + x = keras.layers.Activation('relu')(x) + x = keras.layers.Dropout(0.25)(x) + x = keras.layers.Dropout(0.5)(x) + + output_layer = keras.layers.Dense(nb_classes, activation='softmax')(x) + + model = keras.models.Model(inputs=input_layer, outputs=output_layer) + + model.compile(loss='categorical_crossentropy', optimizer = keras.optimizers.Adam(0.0005), + metrics=['accuracy']) + + reduce_lr = keras.callbacks.ReduceLROnPlateau(monitor='loss', factor=0.5, patience=50, + min_lr=0.0001) + + file_path = self.output_directory+'best_model.hdf5' + + model_checkpoint = keras.callbacks.ModelCheckpoint(filepath=file_path, monitor='loss', + save_best_only=True) + + self.callbacks = [reduce_lr,model_checkpoint] + + return model + + def fit(self, x_train, y_train): + if len(get_available_gpus())==0: + print('error') + exit() + # x_val and y_val are only used to monitor the test loss and NOT for training + batch_size = 512 + nb_epochs = 20 + + mini_batch_size = int(min(x_train.shape[0]/10, batch_size)) + + start_time = time.time() + self.model.fit(x_train, y_train, batch_size=mini_batch_size, epochs=nb_epochs, + verbose=1, callbacks=self.callbacks) + + duration = time.time() - start_time + print(f'Duration - AutoLSTM - Training = {duration:.2f}s') + self.model.save(self.output_directory+'last_model.hdf5') + + keras.backend.clear_session() + + def predict(self, x_test): + model_path = self.output_directory.replace('ensemble_result', 'autolstm') + model_path = model_path + 'best_model.hdf5' + model = keras.models.load_model(model_path) + y_pred = model.predict(x_test) + + return y_pred +def get_available_gpus(): + local_device_protos = device_lib.list_local_devices() + return [x.name for x in local_device_protos if x.device_type == 'GPU'] diff --git 
a/tomo_challenge/classifiers/de_fcn.py b/tomo_challenge/classifiers/de_fcn.py new file mode 100644 index 00000000..6ec12558 --- /dev/null +++ b/tomo_challenge/classifiers/de_fcn.py @@ -0,0 +1,85 @@ +# FCN model for Deep Ensemble +import keras +import numpy as np +import time +import sys +#from tomo_challenge.utils.utils import get_available_gpus +from tensorflow.python.client import device_lib + +class Classifier_FCN: + + def __init__(self, output_directory, input_shape, nb_classes, verbose=True,build=True): + self.output_directory = output_directory + if build == True: + self.model = self.build_model(input_shape, nb_classes) + if(verbose==True): + self.model.summary() + self.verbose = verbose + self.model.save_weights(self.output_directory+'model_init.hdf5') + return + + def build_model(self, input_shape, nb_classes): + input_layer = keras.layers.Input(input_shape) + + conv1 = keras.layers.Conv1D(filters=128, kernel_size=8, padding='same')(input_layer) + conv1 = keras.layers.normalization.BatchNormalization()(conv1) + conv1 = keras.layers.Activation(activation='relu')(conv1) + + conv2 = keras.layers.Conv1D(filters=256, kernel_size=5, padding='same')(conv1) + conv2 = keras.layers.normalization.BatchNormalization()(conv2) + conv2 = keras.layers.Activation('relu')(conv2) + + conv3 = keras.layers.Conv1D(128, kernel_size=3,padding='same')(conv2) + conv3 = keras.layers.normalization.BatchNormalization()(conv3) + conv3 = keras.layers.Activation('relu')(conv3) + + gap_layer = keras.layers.pooling.GlobalAveragePooling1D()(conv3) + + output_layer = keras.layers.Dense(nb_classes, activation='softmax')(gap_layer) + + model = keras.models.Model(inputs=input_layer, outputs=output_layer) + + model.compile(loss='categorical_crossentropy', optimizer = keras.optimizers.Adam(), + metrics=['accuracy']) + + reduce_lr = keras.callbacks.ReduceLROnPlateau(monitor='loss', factor=0.5, patience=50, + min_lr=0.0001) + + file_path = self.output_directory+'best_model.hdf5' + + model_checkpoint = keras.callbacks.ModelCheckpoint(filepath=file_path, monitor='loss', + save_best_only=True) + + self.callbacks = [reduce_lr,model_checkpoint] + + return model + + def fit(self, x_train, y_train): + if len(get_available_gpus())==0: + print('error') + exit() + # x_val and y_val are only used to monitor the test loss and NOT for training + batch_size = 512 + nb_epochs = 20 + + mini_batch_size = int(min(x_train.shape[0]/10, batch_size)) + + start_time = time.time() + self.model.fit(x_train, y_train, batch_size=mini_batch_size, epochs=nb_epochs, + verbose=self.verbose, callbacks=self.callbacks) + + self.model.save(self.output_directory+'last_model.hdf5') + duration = time.time() - start_time + print(f'Duration - FCN - Training = {duration:.2f}s') + + keras.backend.clear_session() + + def predict(self, x_test): + model_path = self.output_directory.replace('ensemble_result', 'fcn') + model_path = model_path + 'best_model.hdf5' + model = keras.models.load_model(model_path) + y_pred = model.predict(x_test) + return y_pred +def get_available_gpus(): + local_device_protos = device_lib.list_local_devices() + return [x.name for x in local_device_protos if x.device_type == 'GPU'] diff --git a/tomo_challenge/classifiers/de_nne.py b/tomo_challenge/classifiers/de_nne.py new file mode 100644 index 00000000..acabce7b --- /dev/null +++ b/tomo_challenge/classifiers/de_nne.py @@ -0,0 +1,92 @@ +# NNE model for Deep Ensemble +import tensorflow.keras as keras +import numpy as np + +#from tomo_challenge.utils.utils import create_directory +#from 
tomo_challenge.utils.utils import check_if_file_exits +import os +import gc +import time +import sys + + +class Classifier_NNE: + def create_classifier(self, model_name, input_shape, nb_classes, output_directory, verbose=False, + build=True, load_weights=False): + if model_name == 'fcn': + from tomo_challenge.classifiers import de_fcn as fcn + return fcn.Classifier_FCN(output_directory, input_shape, nb_classes, verbose, build=build) + if model_name == 'autolstm': + from tomo_challenge.classifiers import de_autolstm as autolstm + return autolstm.Classifier_LSTM(output_directory, input_shape, nb_classes, verbose, build=build) + if model_name == 'resnet': + from tomo_challenge.classifiers import de_resnet as resnet + return resnet.Classifier_RESNET(output_directory, input_shape, nb_classes, verbose, + build=build, load_weights=load_weights) + + + def __init__(self, output_directory, verbose=False): + self.classifiers = ['fcn', 'autolstm', 'resnet'] + + self.output_directory = output_directory + create_directory(self.output_directory) + self.verbose = verbose + self.models_dir = output_directory.replace('ensemble_results','classifier') + self.iterations = 1 + + def predict(self, x_test): + # no training since models are pre-trained + start_time = time.time() + + l = 0 + # loop through all classifiers + for model_name in self.classifiers: + + # loop through different initialization of classifiers + + curr_dir = self.models_dir.replace('classifier',model_name) + + model = self.create_classifier(model_name, None, None, + curr_dir, build=False) + print(model_name) + + predictions_file_name = curr_dir+'y_pred.npy' + + print(f"geting predict for {model_name}") + curr_y_pred = model.predict(x_test=x_test) + keras.backend.clear_session() + + np.save(predictions_file_name,curr_y_pred) + + if l == 0: + y_pred = np.zeros(shape=curr_y_pred.shape) + + y_pred = y_pred+curr_y_pred + l+=1 + + + # average predictions + y_pred = y_pred / l + + # save predictiosn + np.save(self.output_directory+'y_pred.npy', y_pred) + + duration = time.time() - start_time + + # the creation of this directory means + create_directory(self.output_directory + '/DONE') + + return y_pred + +def check_if_file_exits(file_name): + return os.path.exists(file_name) + +def create_directory(directory_path): + if os.path.exists(directory_path): + return None + else: + try: + os.makedirs(directory_path) + except: + return None + return directory_path diff --git a/tomo_challenge/classifiers/de_resnet.py b/tomo_challenge/classifiers/de_resnet.py new file mode 100644 index 00000000..6201eff1 --- /dev/null +++ b/tomo_challenge/classifiers/de_resnet.py @@ -0,0 +1,147 @@ +# ResNet model +import keras +import numpy as np +import time + +from tensorflow.python.client import device_lib +#from tomo_challenge.utils.utils import save_logs +#from tomo_challenge.utils.utils import calculate_metrics +#from tomo_challenge.utils.utils import get_available_gpus + +class Classifier_RESNET: + + def __init__(self, output_directory, input_shape, nb_classes, verbose=True,build=True,load_weights=False): + self.output_directory = output_directory + if build==True: + self.model = self.build_model(input_shape, nb_classes) + if(verbose==True): + self.model.summary() + self.verbose = verbose + if load_weights == True: + self.model.load_weights(self.output_directory + .replace('resnet_augment','resnet') + .replace('TSC_itr_augment_x_10','TSC_itr_10') + +'/model_init.hdf5') + else: + self.model.save_weights(self.output_directory+'model_init.hdf5') + return + + def 
build_model(self, input_shape, nb_classes): + n_feature_maps = 64 + + input_layer = keras.layers.Input(input_shape) + + # BLOCK 1 + + conv_x = keras.layers.Conv1D(filters=n_feature_maps, kernel_size=8, padding='same')(input_layer) + conv_x = keras.layers.normalization.BatchNormalization()(conv_x) + conv_x = keras.layers.Activation('relu')(conv_x) + + conv_y = keras.layers.Conv1D(filters=n_feature_maps, kernel_size=5, padding='same')(conv_x) + conv_y = keras.layers.normalization.BatchNormalization()(conv_y) + conv_y = keras.layers.Activation('relu')(conv_y) + + conv_z = keras.layers.Conv1D(filters=n_feature_maps, kernel_size=3, padding='same')(conv_y) + conv_z = keras.layers.normalization.BatchNormalization()(conv_z) + + # expand channels for the sum + shortcut_y = keras.layers.Conv1D(filters=n_feature_maps, kernel_size=1, padding='same')(input_layer) + shortcut_y = keras.layers.normalization.BatchNormalization()(shortcut_y) + + output_block_1 = keras.layers.add([shortcut_y, conv_z]) + output_block_1 = keras.layers.Activation('relu')(output_block_1) + + # BLOCK 2 + + conv_x = keras.layers.Conv1D(filters=n_feature_maps*2, kernel_size=8, padding='same')(output_block_1) + conv_x = keras.layers.normalization.BatchNormalization()(conv_x) + conv_x = keras.layers.Activation('relu')(conv_x) + + conv_y = keras.layers.Conv1D(filters=n_feature_maps*2, kernel_size=5, padding='same')(conv_x) + conv_y = keras.layers.normalization.BatchNormalization()(conv_y) + conv_y = keras.layers.Activation('relu')(conv_y) + + conv_z = keras.layers.Conv1D(filters=n_feature_maps*2, kernel_size=3, padding='same')(conv_y) + conv_z = keras.layers.normalization.BatchNormalization()(conv_z) + + # expand channels for the sum + shortcut_y = keras.layers.Conv1D(filters=n_feature_maps*2, kernel_size=1, padding='same')(output_block_1) + shortcut_y = keras.layers.normalization.BatchNormalization()(shortcut_y) + + output_block_2 = keras.layers.add([shortcut_y, conv_z]) + output_block_2 = keras.layers.Activation('relu')(output_block_2) + + # BLOCK 3 + + conv_x = keras.layers.Conv1D(filters=n_feature_maps*2, kernel_size=8, padding='same')(output_block_2) + conv_x = keras.layers.normalization.BatchNormalization()(conv_x) + conv_x = keras.layers.Activation('relu')(conv_x) + + conv_y = keras.layers.Conv1D(filters=n_feature_maps*2, kernel_size=5, padding='same')(conv_x) + conv_y = keras.layers.normalization.BatchNormalization()(conv_y) + conv_y = keras.layers.Activation('relu')(conv_y) + + conv_z = keras.layers.Conv1D(filters=n_feature_maps*2, kernel_size=3, padding='same')(conv_y) + conv_z = keras.layers.normalization.BatchNormalization()(conv_z) + + # no need to expand channels because they are equal + shortcut_y = keras.layers.normalization.BatchNormalization()(output_block_2) + + output_block_3 = keras.layers.add([shortcut_y, conv_z]) + output_block_3 = keras.layers.Activation('relu')(output_block_3) + + # FINAL + + gap_layer = keras.layers.GlobalAveragePooling1D()(output_block_3) + + output_layer = keras.layers.Dense(nb_classes, activation='softmax')(gap_layer) + + model = keras.models.Model(inputs=input_layer, outputs=output_layer) + + model.compile(loss='categorical_crossentropy', optimizer=keras.optimizers.Adam(), + metrics=['accuracy']) + + reduce_lr = keras.callbacks.ReduceLROnPlateau(monitor='loss', factor=0.5, patience=50, min_lr=0.0001) + + file_path = self.output_directory+'best_model.hdf5' + + model_checkpoint = keras.callbacks.ModelCheckpoint(filepath=file_path, monitor='loss', + save_best_only=True) + + self.callbacks = 
[reduce_lr,model_checkpoint] + + return model + + def fit(self, x_train, y_train): + if len(get_available_gpus())==0: + print('error') + exit() + # x_val and y_val are only used to monitor the test loss and NOT for training + batch_size = 512 + nb_epochs = 20 + + mini_batch_size = int(min(x_train.shape[0]/10, batch_size)) + + start_time = time.time() + + self.model.fit(x_train, y_train, batch_size=mini_batch_size, epochs=nb_epochs, + verbose=self.verbose, callbacks=self.callbacks) + + duration = time.time() - start_time + print(f'Duration - ResNet - Training = {duration:.2f}s') + self.model.save(self.output_directory + 'last_model.hdf5') + + + keras.backend.clear_session() + + + def predict(self, x_test): + model_path = self.output_directory.replace('ensemble_result', 'resnet') + model_path = model_path + 'best_model.hdf5' + model = keras.models.load_model(model_path) + y_pred = model.predict(x_test) + return y_pred + +def get_available_gpus(): + local_device_protos = device_lib.list_local_devices() + return [x.name for x in local_device_protos if x.device_type == 'GPU'] diff --git a/tomo_challenge/classifiers/lstm_flax.py b/tomo_challenge/classifiers/lstm_flax.py new file mode 100644 index 00000000..669e6142 --- /dev/null +++ b/tomo_challenge/classifiers/lstm_flax.py @@ -0,0 +1,185 @@ +""" +Flax LSTM +This is an example tomographic bin generator using a convolutional LSTM neural network using JAX/FLAX +We also added a custom data loader we tested. +This solution was developed by the Brazilian Center for Physics Research AI 4 Astrophysics team. +Authors: Clecio R. Bom, Bernardo M. Fraga, Gabriel Teixeira, Eduardo Cypriano and Elizabeth Gonzalez. +contact: debom |at |cbpf| dot| br +Every classifier module needs to: + - have construction of the type + __init__ (self, bands, options) (see examples below) + - implement two functions: + train (self, training_data,training_z) + apply (self, data). + - define valid_options class varible. +See Classifier Documentation below. 
+""" + +from .base import Tomographer +import numpy as np +import tensorflow as tf +from tensorflow import keras +from sklearn.preprocessing import MinMaxScaler +import h5py +from flax import nn, optim, jax_utils +import jax_random as rand +import jax_numpy as jnp +import jax +from .tomo_challenge import jax_metrics as j_metrics + +def get_classifier(n_bin, n_features): + class BinningNN(nn.Module): + def apply(self, x): + batch_size = x.shape[0] + net = nn.Conv(x, features=64, kernel_size=(2,), padding='SAME', name='fc1') + net = nn.tanh(net) + net = nn.max_pool(net, window_shape=(2,), strides=(2,), padding='SAME') + net = nn.Conv(net, 128, kernel_size=(2,), padding='SAME', name='fc2') + net = nn.tanh(net) + net = nn.max_pool(net, (2,), strides=(2,), padding='SAME') + carry = nn.LSTMCell.initialize_carry(rand.PRNGKey(0), (batch_size,), 972) + _, outputs = jax_utils.scan_in_dim(nn.LSTMCell.partial(), carry, net, axis=1) + net_dense = outputs.reshape((outputs.shape[0], -1)) + net_dense = nn.Dense(net_dense, 486, name='fc3') + net_dense = nn.tanh(net_dense) + net_dense = nn.Dense(net_dense, 486) + net_dense = nn.tanh(net_dense) + return nn.softmax(nn.Dense(net_dense, n_bin)) + _, initial_params = BinningNN.init_by_shape( rand.PRNGKey(0), [((1, n_features,1), jnp.float32)]) + return nn.Model(BinningNN, initial_params) + +class Flax_LSTM(Tomographer): + """ LSTM using flax, optimising for SNR """ + + # valid parameter -- see below + valid_options = ['bins', 'n_feats'] + # this settings means arrays will be sent to train and apply instead + # of dictionaries + wants_arrays = False + + def __init__ (self, bands, options): + """Constructor + + Parameters: + ----------- + bands: str + string containg valid bands, like 'riz' or 'griz' + options: dict + options come through here. Valid keys are listed as valid_options + class variable. 
+ + Note: + ----- + Valid options are: + 'bins' - number of tomographic bins + 'n_feats' - number of features + + """ + self.bands = bands + self.opt = options + self.n_bins = options['bins'] + self.n_features = options['n_feats'] + self.model = get_classifier(self.n_features, self.n_bins) + self.scaler = MinMaxScaler() + + def load_data(fname, take_colors=True, cutoff=0.0): + + data = h5py.File(fname, 'r') + r_mag = data['r_mag'] + g_mag = data['g_mag'] + i_mag = data['i_mag'] + z_mag = data['z_mag'] + redshift = data['redshift_true'] + all_mags = np.vstack([g_mag, r_mag, i_mag, z_mag]) + all_mags = all_mags.T + mask = (all_mags != np.inf).all(axis=1) + all_mags = all_mags[mask,:] + redshift = redshift[mask] + gr_color = all_mags[:,0] - all_mags[:,1] + ri_color = all_mags[:,1] - all_mags[:,2] + iz_color = all_mags[:,2] - all_mags[:,3] + all_colors = np.vstack([gr_color, ri_color, iz_color]) + all_colors = all_colors.T + p = np.linspace(0, 100, self.n_bins+1) + z_edges = np.percentile(redshift, p) + train_bin = np.zeros(all_mags.shape[0]) + for i in range(self.n_bins): + z_low = z_edges[i] + z_high = z_edges[i+1] + train_bin[(redshift > z_low) & (redshift <= z_high)] = i + if cutoff != 0.0: + cut = np.random.uniform(0, 1, all_mags.shape[0]) < cutoff + train_bin = train_bin[cut].reshape(-1,1) + all_mags = all_mags[cut] + all_colors = all_colors[cut] + redshift = redshift[cut] + else: + train_bin = train_bin.reshape(-1,1) + if take_colors: + return np.hstack([all_mags, all_colors]), redshift, train_bin.astype(int), z_edges + else: + return mags, redshift, train_bin.astype(int), z_edges + + + + def train(training_data, training_z, batch_size=512, epochs=20): + + x_train = self.scaler.fit_transform(training_data) + x_train = np.expand_dims(x_train, axis=1) + lr = 0.001 + optimizer = optim.Adam(learning_rate=lr).create(self.model) + + @jax.jit + def train_step(optimizer, x, y): + # This is the loss function + def loss_fn(model): + # Apply classifier to features + w = model(x) + return 1000./ j_metrics.compute_snr_score(w, y) + # Compute gradients + loss, g = jax.value_and_grad(loss_fn)(optimizer.target) + # Perform gradient descent + optimizer = optimizer.apply_gradient(g) + return optimizer, loss + + def get_batches(): + train_dataset = tf.data.Dataset.from_tensor_slices((x_train, training_z)) + train_dataset = train_dataset.shuffle(buffer_size=2048).batch(batch_size) + return train_dataset + + losses = [] + for e in range(epochs): + for i, (x_train, labels) in enumerate(get_batches().as_numpy_iterator()): + optimizer, loss = train_step(optimizer, x_train, labels) + losses.append(loss) + + print('Epoch {}\nLoss = {}'.format(e, loss)) + + + self.model = optimizer.target + + + def apply (self, data): + """Applies training to the data. + + Parameters: + ----------- + Data: numpy array, size Ngalaxes x Nbands + testing data, each row is a galaxy, each column is a band as per + band defined above + + Returns: + tomographic_selections: numpy array, int, size Ngalaxies + tomographic selection for galaxies return as bin number for + each galaxy. 
+ """ + x_test = self.scaler.transform(data) + x_test = np.expand_dims(x_test, axis=-1) + + bs = 512 + s = len(x_test) + weights = np.concatenate([self.model(x_test[bs*i:min((bs*(i+1)), s)]) for i in range(s//bs + 1)]) + + tomo_bin = weights.argmax(axis=1) + return tomo_bin + diff --git a/tomo_challenge/classifiers/nne_jax.py b/tomo_challenge/classifiers/nne_jax.py new file mode 100644 index 00000000..aae54cde --- /dev/null +++ b/tomo_challenge/classifiers/nne_jax.py @@ -0,0 +1,92 @@ +#import tensorflow.keras as keras +import numpy as np + +#from tomo_challenge.utils.utils import create_directory +import os +#from tomo_challenge.utils.utils import check_if_file_exits +import gc +import time +import sys + + +class Classifier_NNE: + def create_classifier(self, model_name, input_shape, nb_classes, output_directory, verbose=False, + build=True, load_weights=False): + if model_name == 'autolstm': + from tomo_challenge.classifiers import autolstm_jax as autolstm + return autolstm.Classifier_LSTM(output_directory, input_shape, nb_classes, verbose, build=build) + if model_name == 'resnet': + from tomo_challenge.classifiers import resnet_jax as resnet + return resnet.Classifier_RESNET(output_directory, input_shape, nb_classes, verbose, + build=build, load_weights=load_weights) + + + def __init__(self, output_directory, verbose=False): + self.classifiers = ['autolstm', 'resnet'] + + self.output_directory = output_directory + create_directory(self.output_directory) + self.verbose = verbose + self.models_dir = output_directory.replace('ensemble_result','classifier') + self.iterations = 1 + + def predict(self): + # no training since models are pre-trained + start_time = time.time() + + l = 0 + # loop through all classifiers + for model_name in self.classifiers: + + # loop through different initialization of classifiers + + curr_dir = self.models_dir.replace('classifier',model_name) + print('curr_dir:' ,curr_dir) + #model = self.create_classifier(model_name, None, None, + # curr_dir, build=False) + #print(model_name) + + predictions_file_name = curr_dir+'y_pred.npy' + + print(f"geting predict for {model_name}") + curr_y_pred = np.load(predictions_file_name) + + if l == 0: + y_pred = np.zeros(shape=curr_y_pred.shape) + else: + max_val = min(y_pred.shape[0], curr_y_pred.shape[0]) + print(max_val) + y_pred = y_pred[:max_val] + print(y_pred.shape) + curr_y_pred = curr_y_pred[:max_val] + print(curr_y_pred.shape) + y_pred = y_pred+curr_y_pred + l+=1 + + + # average predictions + y_pred = y_pred / l + + # save predictiosn + np.save(self.output_directory+'y_pred.npy', y_pred) + + duration = time.time() - start_time + + # the creation of this directory means + create_directory(self.output_directory + '/DONE') + + return y_pred + +def create_directory(directory_path): + if os.path.exists(directory_path): + return None + else: + try: + os.makedirs(directory_path) + except: + + return None + return directory_path + +def check_if_file_exits(file_name): + return os.path.exists(file_name) diff --git a/tomo_challenge/classifiers/requirements.txt b/tomo_challenge/classifiers/requirements.txt new file mode 100644 index 00000000..70f7f99d --- /dev/null +++ b/tomo_challenge/classifiers/requirements.txt @@ -0,0 +1,41 @@ +astropy==4.0.1.post1 +camb==1.1.3 +git+https://github.com/cmbant/camb +click==7.1.2 +cosmosis-standalone==0.2.2 +DynamicTable==0.0.4 +firecrown @ git+https://github.com/LSSTDESC/firecrown/@fd4bb423aac8221ed0826a5921bc5590770f4bcb +gast==0.2.2 +gmpy2==2.1.0b1 +graphviz==0.14 +h5py==2.10.0 
+importlib-metadata==1.7.0 +jax==0.1.75 +jax-cosmo==0.1rc6 +jaxlib @ https://storage.googleapis.com/jax-releases/cuda101/jaxlib-0.1.52-cp38-none-manylinux2010_x86_64.whl +joblib==0.15.1 +keract==4.2.2 +keras-tcn==3.1.1 +lightgbm==2.0.12 +matplotlib==3.1.3 +mock==4.0.2 +mpi4py==3.0.3 +numpy==1.18.1 +pandas==0.25.3 +pillow==7.1.2 +progressbar==2.5 +pyccl==2.1.0 +pydot==1.4.1 +PyYAML==5.3.1 +sacc==0.4.2 +schwimmbad==0.3.1 +scikit-learn==0.22.2.post1 +scipy==1.4.1 +sip==4.19.13 +statsmodels==0.11.1 +tensorboard==2.2.2 +tensorflow==2.2.0 +tensorflow-addons==0.10.0 +tensorflow-estimator==2.2.0 +tensorflow-probability==0.10.0 +keras_contrib # installed using pip3 install git+https://www.github.com/keras-team/keras-contrib.git diff --git a/tomo_challenge/classifiers/resnet_jax.py b/tomo_challenge/classifiers/resnet_jax.py new file mode 100644 index 00000000..353178d8 --- /dev/null +++ b/tomo_challenge/classifiers/resnet_jax.py @@ -0,0 +1,205 @@ +import jax_metrics as j_metrics +from flax import nn, optim, serialization, jax_utils +import jax.random as rand +import jax.numpy as jnp +import jax +import os +os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID' +os.environ['CUDA_VISIBLE_DEVICES'] = '2' +from sklearn.preprocessing import MinMaxScaler +import numpy as np +import sys +import h5py +import tensorflow as tf + +def get_classifier(n_features, n_bins): + class Resnet(nn.Module): + def apply(self, x): + #1st block + conv_x = nn.Conv(x, features=64, kernel_size=(8,), padding='SAME') + conv_x = nn.BatchNorm(conv_x) + conv_x = nn.leaky_relu(conv_x) + + conv_y = nn.Conv(conv_x, 64, kernel_size=(5,), padding='SAME') + conv_y = nn.BatchNorm(conv_y) + conv_y = nn.leaky_relu(conv_y) + + conv_z = nn.Conv(conv_y, 64, kernel_size=(3,), padding='SAME') + conv_z = nn.BatchNorm(conv_z) + + short_y = nn.Conv(x, 64, kernel_size=(1,), padding='SAME') + short_y = nn.BatchNorm(short_y) + + output_1 = nn.leaky_relu(short_y + conv_z) + + #2nd block + conv_x = nn.Conv(output_1, features=64*2, kernel_size=(8,), padding='SAME') + conv_x = nn.BatchNorm(conv_x) + conv_x = nn.leaky_relu(conv_x) + + conv_y = nn.Conv(conv_x, 64*2, kernel_size=(5,), padding='SAME') + conv_y = nn.BatchNorm(conv_y) + conv_y = nn.leaky_relu(conv_y) + + conv_z = nn.Conv(conv_y, 64*2, kernel_size=(3,), padding='SAME') + conv_z = nn.BatchNorm(conv_z) + + short_y = nn.Conv(output_1, 64*2, kernel_size=(1,), padding='SAME') + short_y = nn.BatchNorm(short_y) + + output_2 = nn.leaky_relu(short_y + conv_z) + + #3rd block + conv_x = nn.Conv(output_2, features=64*2, kernel_size=(8,), padding='SAME') + conv_x = nn.BatchNorm(conv_x) + conv_x = nn.leaky_relu(conv_x) + + conv_y = nn.Conv(conv_x, 64*2, kernel_size=(5,), padding='SAME') + conv_y = nn.BatchNorm(conv_y) + conv_y = nn.leaky_relu(conv_y) + + conv_z = nn.Conv(conv_y, 64*2, kernel_size=(3,), padding='SAME') + conv_z = nn.BatchNorm(conv_z) + + short_y = nn.BatchNorm(output_2) + + output_3 = nn.leaky_relu(short_y + conv_z) + + gap = jnp.mean(output_3, axis=1) + return nn.softmax(nn.Dense(gap, n_bins)) + _, initial_params = Resnet.init_by_shape(rand.PRNGKey(0), [((1, n_features, 1), jnp.float32)]) + return nn.Model(Resnet, initial_params) + + +#def get_classifier(n_features, n_bins): +# _, initial_params = Resnet(n_bins).init_by_shape( rand.PRNGKey(0), [((1, n_features,1), jnp.float32)]) +# return nn.Model(Resnet(n_bins), initial_params) + +def get_batches(features, redshift, batch_size, train=True): + if train: + dataset = tf.data.Dataset.from_tensor_slices((features, redshift)) + dataset = 
dataset.shuffle(buffer_size=2048).batch(batch_size) + else: + dataset = tf.data.Dataset.from_tensor_slices(features).batch(batch_size) + return dataset + +def train(n_bins, training_data, training_z, batch_size=512, epochs=40, lr=0.001): + model = get_classifier(training_data.shape[1], n_bins) + optimizer = optim.Adam(learning_rate=lr).create(model) + + @jax.jit + def train_step(optimizer, x, y): + # This is the loss function + def loss_fn(model): + # Apply classifier to features + w = model(x) + # returns - score, because we want to maximize score + return 1000./ j_metrics.compute_snr_score(w, y) + # Compute gradients + loss, g = jax.value_and_grad(loss_fn)(optimizer.target) + # Perform gradient descent + optimizer = optimizer.apply_gradient(g) + return optimizer, loss + + #def get_batches(): + # train_dataset = tf.data.Dataset.from_tensor_slices((training_data, training_z)) + # train_dataset = train_dataset.shuffle(buffer_size=2048).batch(batch_size) + # return train_dataset + + losses = [] + + with nn.stochastic(rand.PRNGKey(0)): + for e in range(epochs): + print('starting epoch {}'.format(e)) + batches = get_batches(training_data, training_z, batch_size) + for i, (x_train, labels) in enumerate(batches.as_numpy_iterator()): + optimizer, loss = train_step(optimizer, x_train, labels) + losses.append(loss) + print('Epoch {}\nLoss = {}'.format(e, loss)) + model = optimizer.target + return losses, model + +def predict(model, data, batch_size): + batches = get_batches(data, 0., batch_size, train=False) + preds = [] + for i, x_test in enumerate(batches.as_numpy_iterator()): + #print(x_test.shape) + p = model(x_test) + preds.append(p) + result = np.concatenate([p for p in preds], axis=0) + return result + +def load_data(n_bins, fname, no_inf=True, take_colors=True, cutoff=0.0): + + data = h5py.File(fname, 'r') + r_mag = data['r_mag'] + g_mag = data['g_mag'] + i_mag = data['i_mag'] + z_mag = data['z_mag'] + redshift = data['redshift_true'] + all_mags = np.vstack([g_mag, r_mag, i_mag, z_mag]) + all_mags = all_mags.T + if no_inf: + mask = (all_mags != np.inf).all(axis=1) + all_mags = all_mags[mask,:] + redshift = redshift[mask] + else: + bad = ~np.isfinite(all_mags) + all_mags[bad] = 30.0 + gr_color = all_mags[:,0] - all_mags[:,1] + ri_color = all_mags[:,1] - all_mags[:,2] + iz_color = all_mags[:,2] - all_mags[:,3] + all_colors = np.vstack([gr_color, ri_color, iz_color]) + all_colors = all_colors.T + p = np.linspace(0, 100, n_bins+1) + z_edges = np.percentile(redshift, p) + train_bin = np.zeros(all_mags.shape[0]) + for i in range(n_bins): + z_low = z_edges[i] + z_high = z_edges[i+1] + train_bin[(redshift > z_low) & (redshift <= z_high)] = i + if cutoff != 0.0: + cut = np.random.uniform(0, 1, all_mags.shape[0]) < cutoff + train_bin = train_bin[cut].reshape(-1,1) + all_mags = all_mags[cut] + all_colors = all_colors[cut] + redshift = redshift[cut] + else: + train_bin = train_bin.reshape(-1,1) + if take_colors: + return np.hstack([all_mags, all_colors]), redshift, train_bin.astype(int), z_edges + else: + return mags, redshift, train_bin.astype(int), z_edges + +def prepare_data(data, scaler=None): + if scaler is None: + scaler = MinMaxScaler() + scaled = scaler.fit_transform(data) + return np.expand_dims(scaled, axis=-1), scaler + else: + scaled = scaler.transform(data) + return np.expand_dims(scaled, axis=-1) + + +if __name__ == '__main__': + n_bins = int(sys.argv[4]) + print('preparing data') + feats_train, z_train, bins_train, edges_train = load_data(n_bins, sys.argv[1], no_inf=False) + + x_train, 
scaler = prepare_data(feats_train)
+
+    #print(x_train.shape)
+    #model = get_classifier(x_train.shape[1], n_bins)
+    print('training')
+    losses, trained_model = train(n_bins, x_train, z_train, epochs=20)
+    print('training finished')
+    val_feats, z_val, val_bins, edges_val = load_data(n_bins, sys.argv[2], no_inf=False)
+    x_val = prepare_data(val_feats, scaler=scaler)
+    #print(x_val.shape)
+    print('predicting')
+    preds = predict(trained_model, x_val, 512)
+
+    #classes = preds.argmax(axis=1)
+    print('saving')
+    output_dir = sys.argv[3]
+    np.save(output_dir+'y_pred.npy', preds)
diff --git a/tomo_challenge/custom_loader.py b/tomo_challenge/custom_loader.py
new file mode 100644
index 00000000..6cf29e17
--- /dev/null
+++ b/tomo_challenge/custom_loader.py
@@ -0,0 +1,29 @@
+import numpy as np
+import h5py
+
+def custom_loader(fname):
+    data = h5py.File(fname, 'r')
+    r_mag = data['r_mag']
+    g_mag = data['g_mag']
+    i_mag = data['i_mag']
+    z_mag = data['z_mag']
+    redshift = data['redshift_true']
+    all_mags = np.vstack([g_mag, r_mag, i_mag, z_mag])
+    all_mags = all_mags.T
+    mask = (all_mags != np.inf).all(axis=1)
+    all_mags = all_mags[mask,:]
+    redshift = redshift[mask]
+    gr_color = all_mags[:,0] - all_mags[:,1]
+    ri_color = all_mags[:,1] - all_mags[:,2]
+    iz_color = all_mags[:,2] - all_mags[:,3]
+    all_colors = np.vstack([gr_color, ri_color, iz_color])
+    all_colors = all_colors.T
+    return np.hstack([all_mags, all_colors]), redshift
+
+def custom_redshift_loader(fname, bands=None, errors=None, colors=None):
+    _, redshift = custom_loader(fname)
+    return redshift
+
+def custom_data_loader(fname):
+    features, _ = custom_loader(fname)
+    return features
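
Usage note: a minimal sketch of driving the custom loader directly, assuming an HDF5 catalogue at data/validation.hdf5 that contains the g/r/i/z magnitude and redshift_true columns read above; the file path and the bin count are placeholders, not values fixed by this patch.

    import numpy as np
    from tomo_challenge.custom_loader import custom_data_loader, custom_redshift_loader

    # custom_data_loader returns an (N, 7) array: griz magnitudes plus the
    # g-r, r-i, i-z colours, with rows containing infinite magnitudes masked out.
    features = custom_data_loader("data/validation.hdf5")
    redshift = custom_redshift_loader("data/validation.hdf5")
    assert features.shape[0] == redshift.shape[0]

    # Equal-occupancy redshift bin edges, mirroring the percentile binning used
    # by the training helpers in this patch (n_bins = 10 is only an example value).
    n_bins = 10
    z_edges = np.percentile(redshift, np.linspace(0, 100, n_bins + 1))
    print(z_edges)

Because both loaders re-read the same file through custom_loader, the feature rows and redshifts stay aligned after the infinite-magnitude mask is applied.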