Skip to content

Commit

Permalink
Merge pull request #125 from EpistasisLab/dev
Browse files Browse the repository at this point in the history
Dev
  • Loading branch information
jay-m-dev authored Apr 19, 2024
2 parents 3c771d4 + fe42853 commit 18652bf
Show file tree
Hide file tree
Showing 13 changed files with 301 additions and 175 deletions.
255 changes: 146 additions & 109 deletions Tutorial/8_Genetic_Algorithm_Overview.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def calculate_version():
'update_checker>=0.16',
'tqdm>=4.36.1',
'stopit>=1.1.1',
'pandas>=1.5.3,<2.0.0',
'pandas>=2.2.0',
'joblib>=1.1.1',
'xgboost>=1.7.0',
'matplotlib>=3.6.2',
Expand Down
2 changes: 1 addition & 1 deletion tpot2/_version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '0.1.5-alpha'
__version__ = '0.1.6-alpha'
30 changes: 22 additions & 8 deletions tpot2/builtin_modules/column_one_hot_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,16 +11,13 @@



def auto_select_categorical_features(X, min_unique=10):
    """Select the columns of ``X`` that have fewer than ``min_unique`` distinct values.

    Parameters
    ----------
    X : pandas.DataFrame or 2-D array-like
        The data to inspect, one feature per column.
    min_unique : int, default=10
        A column is selected when its number of distinct values is strictly
        less than this threshold.

    Returns
    -------
    list
        Column labels when ``X`` is a DataFrame, otherwise integer column
        indices.
    """
    if isinstance(X, pd.DataFrame):
        # Iterate (label, Series) pairs instead of indexing by label:
        # X[label] yields a DataFrame when labels are duplicated, which has
        # no .unique().
        return [
            label
            for label, series in X.items()
            if len(series.unique()) < min_unique
        ]

    # Accept any 2-D array-like (e.g. nested lists), not just ndarrays.
    X = np.asarray(X)
    return [
        i for i in range(X.shape[1]) if len(np.unique(X[:, i])) < min_unique
    ]


def _X_selected(X, selected):
Expand All @@ -41,6 +38,21 @@ class ColumnOneHotEncoder(BaseEstimator, TransformerMixin):


def __init__(self, columns='auto', drop=None, handle_unknown='error', sparse_output=False, min_frequency=None,max_categories=None):
'''
Parameters
----------
columns : str, list, default='auto'
- 'auto' : Automatically select categorical features based on columns with fewer than 10 unique values
- 'categorical' : Automatically select categorical features
- 'numeric' : Automatically select numeric features
- 'all' : Select all features
- list : A list of columns to select
drop, handle_unknown, sparse_output, min_frequency, max_categories : see sklearn.preprocessing.OneHotEncoder
'''

self.columns = columns
self.drop = drop
Expand Down Expand Up @@ -73,6 +85,8 @@ def fit(self, X, y=None):
self.columns_ = list(X.select_dtypes(exclude='number').columns)
elif self.columns == "numeric":
self.columns_ = [col for col in X.columns if is_numeric_dtype(X[col])]
elif self.columns == "auto":
self.columns_ = auto_select_categorical_features(X)
elif self.columns == "all":
if isinstance(X, pd.DataFrame):
self.columns_ = X.columns
Expand Down
4 changes: 2 additions & 2 deletions tpot2/config/transformers.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from functools import partial
import numpy as np

from tpot2.builtin_modules import ZeroCount, OneHotEncoder
from tpot2.builtin_modules import ZeroCount, OneHotEncoder, ColumnOneHotEncoder
from sklearn.preprocessing import Binarizer
from sklearn.decomposition import FastICA
from sklearn.cluster import FeatureAgglomeration
Expand Down Expand Up @@ -99,5 +99,5 @@ def make_transformer_config_dictionary(random_state=None, n_features=10):
RobustScaler: {},
StandardScaler: {},
ZeroCount: params_tpot_builtins_ZeroCount,
OneHotEncoder: params_tpot_builtins_OneHotEncoder,
ColumnOneHotEncoder: params_tpot_builtins_OneHotEncoder,
}
21 changes: 11 additions & 10 deletions tpot2/evolvers/base_evolver.py
Original file line number Diff line number Diff line change
Expand Up @@ -483,9 +483,10 @@ def optimize(self, generations=None):
except KeyboardInterrupt:
if self.verbose >= 3:
print("KeyboardInterrupt")

self.population.remove_invalid_from_population(column_names=self.objective_names, invalid_value="INVALID")
self.population.remove_invalid_from_population(column_names=self.objective_names, invalid_value="TIMEOUT")
self.population.remove_invalid_from_population(column_names="Eval Error", invalid_value="INVALID")
self.population.remove_invalid_from_population(column_names="Eval Error", invalid_value="TIMEOUT")



Expand Down Expand Up @@ -623,17 +624,17 @@ def evaluate_population_full(self, budget=None):
parallel_timeout = 10

#scores = tpot2.utils.eval_utils.parallel_eval_objective_list(individuals_to_evaluate, self.objective_functions, self.n_jobs, verbose=self.verbose, timeout=self.max_eval_time_seconds, budget=budget, n_expected_columns=len(self.objective_names), client=self._client, parallel_timeout=parallel_timeout, **self.objective_kwargs)
scores, start_times, end_times = tpot2.utils.eval_utils.parallel_eval_objective_list2(individuals_to_evaluate, self.objective_functions, verbose=self.verbose, max_eval_time_seconds=self.max_eval_time_seconds, budget=budget, n_expected_columns=len(self.objective_names), client=self._client, **self.objective_kwargs)

scores, start_times, end_times, eval_errors = tpot2.utils.eval_utils.parallel_eval_objective_list2(individuals_to_evaluate, self.objective_functions, verbose=self.verbose, max_eval_time_seconds=self.max_eval_time_seconds, budget=budget, n_expected_columns=len(self.objective_names), client=self._client, **self.objective_kwargs)

self.population.update_column(individuals_to_evaluate, column_names=self.objective_names, data=scores)
if budget is not None:
self.population.update_column(individuals_to_evaluate, column_names="Budget", data=budget)

self.population.update_column(individuals_to_evaluate, column_names="Submitted Timestamp", data=start_times)
self.population.update_column(individuals_to_evaluate, column_names="Completed Timestamp", data=end_times)
self.population.remove_invalid_from_population(column_names=self.objective_names)
self.population.remove_invalid_from_population(column_names=self.objective_names, invalid_value="TIMEOUT")
self.population.update_column(individuals_to_evaluate, column_names="Eval Error", data=eval_errors)
self.population.remove_invalid_from_population(column_names="Eval Error")
self.population.remove_invalid_from_population(column_names="Eval Error", invalid_value="TIMEOUT")

def get_unevaluated_individuals(self, column_names, budget=None, individual_list=None):
if individual_list is not None:
Expand Down Expand Up @@ -695,7 +696,7 @@ def evaluate_population_selection_early_stop(self,survival_counts, thresholds=No
if parallel_timeout < 0:
parallel_timeout = 10

scores, start_times, end_times = tpot2.utils.eval_utils.parallel_eval_objective_list2(individual_list=unevaluated_individuals_this_step,
scores, start_times, end_times, eval_errors = tpot2.utils.eval_utils.parallel_eval_objective_list2(individual_list=unevaluated_individuals_this_step,
objective_list=self.objective_functions,
verbose=self.verbose,
max_eval_time_seconds=self.max_eval_time_seconds,
Expand All @@ -706,14 +707,14 @@ def evaluate_population_selection_early_stop(self,survival_counts, thresholds=No
client=self._client,
**self.objective_kwargs,
)

self.population.update_column(unevaluated_individuals_this_step, column_names=this_step_names, data=scores)
self.population.update_column(unevaluated_individuals_this_step, column_names="Submitted Timestamp", data=start_times)
self.population.update_column(unevaluated_individuals_this_step, column_names="Completed Timestamp", data=end_times)
self.population.update_column(unevaluated_individuals_this_step, column_names="Eval Error", data=eval_errors)


self.population.remove_invalid_from_population(column_names=this_step_names)
self.population.remove_invalid_from_population(column_names=this_step_names, invalid_value="TIMEOUT")
self.population.remove_invalid_from_population(column_names="Eval Error")
self.population.remove_invalid_from_population(column_names="Eval Error", invalid_value="TIMEOUT")

#remove invalids:
invalids = []
Expand Down
107 changes: 77 additions & 30 deletions tpot2/evolvers/steady_state_evolver.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,15 @@
import dask
import warnings


def ind_mutate(ind, rng_):
    """Call ``ind.mutate`` with a NumPy generator built from ``rng_``.

    Parameters
    ----------
    ind : object
        Any object exposing ``mutate(rng_=...)``.
    rng_ : None, int, or numpy.random.Generator
        Passed to ``numpy.random.default_rng``; an existing Generator is
        used as-is.

    Returns
    -------
    Whatever ``ind.mutate`` returns.
    """
    rng = np.random.default_rng(rng_)
    return ind.mutate(rng_=rng)

def ind_crossover(ind1, ind2, rng_):
    """Call ``ind1.crossover(ind2, ...)`` with a NumPy generator built from ``rng_``.

    Parameters
    ----------
    ind1 : object
        Any object exposing ``crossover(other, rng_=...)``.
    ind2 : object
        The other individual, passed positionally to ``ind1.crossover``.
    rng_ : None, int, or numpy.random.Generator
        Passed to ``numpy.random.default_rng``; an existing Generator is
        used as-is.

    Returns
    -------
    Whatever ``ind1.crossover`` returns.
    """
    rng = np.random.default_rng(rng_)
    return ind1.crossover(ind2, rng_=rng)

class SteadyStateEvolver():
def __init__( self,
individual_generator ,
Expand Down Expand Up @@ -241,6 +250,8 @@ def optimize(self):

done = False
start_time = time.time()

enough_parents_evaluated=False
while not done:

###############################
Expand All @@ -257,20 +268,31 @@ def optimize(self):

#Loop through all futures, collect completed and timeout futures.
for completed_future in list(submitted_futures.keys()):

eval_error = None
#get scores and update
if completed_future.done(): #if future is done
#If the future is done but threw an error, record the error
if completed_future.exception() or completed_future.status == "error": #if the future is done and threw an error
print("Exception in future")
print(completed_future.exception())
scores = ["INVALID" for _ in range(len(self.objective_names))]
scores = [np.nan for _ in range(len(self.objective_names))]
eval_error = "INVALID"
elif completed_future.cancelled(): #if the future is done and was cancelled
print("Cancelled future (likely memory related)")
scores = ["INVALID" for _ in range(len(self.objective_names))]
scores = [np.nan for _ in range(len(self.objective_names))]
eval_error = "INVALID"
else: #if the future is done and did not throw an error, get the scores
try:
scores = completed_future.result()

#check if scores contain "INVALID" or "TIMEOUT"
if "INVALID" in scores:
eval_error = "INVALID"
scores = [np.nan]
elif "TIMEOUT" in scores:
eval_error = "TIMEOUT"
scores = [np.nan]

except Exception as e:
print("Exception in future, but not caught by dask")
print(e)
Expand All @@ -279,7 +301,8 @@ def optimize(self):
print("status", completed_future.status)
print("done", completed_future.done())
print("cancelld ", completed_future.cancelled())
scores = ["INVALID" for _ in range(len(self.objective_names))]
scores = [np.nan for _ in range(len(self.objective_names))]
eval_error = "INVALID"
else: #if future is not done

#check if the future has been running for too long, cancel the future
Expand All @@ -289,7 +312,8 @@ def optimize(self):
if self.verbose >= 4:
print(f'WARNING AN INDIVIDUAL TIMED OUT (Fallback): \n {submitted_futures[completed_future]} \n')

scores = ["TIMEOUT" for _ in range(len(self.objective_names))]
scores = [np.nan for _ in range(len(self.objective_names))]
eval_error = "TIMEOUT"
else:
continue #otherwise, continue to next future

Expand All @@ -304,6 +328,7 @@ def optimize(self):
scores = [scores[0] for _ in range(len(self.objective_names))]
self.population.update_column(this_individual, column_names=self.objective_names, data=scores)
self.population.update_column(this_individual, column_names="Completed Timestamp", data=time.time())
self.population.update_column(this_individual, column_names="Eval Error", data=eval_error)
if budget is not None:
self.population.update_column(this_individual, column_names="Budget", data=this_budget)

Expand All @@ -314,9 +339,8 @@ def optimize(self):

#now we have a list of completed futures


self.population.remove_invalid_from_population(column_names=self.objective_names, invalid_value="INVALID")
self.population.remove_invalid_from_population(column_names=self.objective_names, invalid_value="TIMEOUT")
self.population.remove_invalid_from_population(column_names="Eval Error", invalid_value="INVALID")
self.population.remove_invalid_from_population(column_names="Eval Error", invalid_value="TIMEOUT")


###############################
Expand Down Expand Up @@ -429,33 +453,56 @@ def optimize(self):
###############################
n_individuals_to_submit = self.max_queue_size - len(submitted_futures)
if n_individuals_to_submit > 0:
parents_df = self.population.get_column(self.population.population, column_names=self.objective_names+ ["Individual"], to_numpy=False)
parents_df = parents_df[~parents_df[self.objective_names].isin(["TIMEOUT","INVALID"]).any(axis=1)]
parents_df = parents_df[~parents_df[self.objective_names].isna().any(axis=1)]

cur_evaluated_population = parents_df["Individual"].to_numpy()
if len(cur_evaluated_population) > 0:
scores = parents_df[self.objective_names].to_numpy()
weighted_scores = scores * self.objective_function_weights
#number of crossover pairs and mutation only parent to generate

if len(parents_df) < 2:
var_ops = ["mutate" for _ in range(n_individuals_to_submit)]
else:
var_ops = [self.rng.choice(["crossover","mutate_then_crossover","crossover_then_mutate",'mutate'],p=[self.crossover_probability,self.mutate_then_crossover_probability, self.crossover_then_mutate_probability,self.mutate_probability]) for _ in range(n_individuals_to_submit)]

parents = []
for op in var_ops:
#count non-nan values in the objective columns
if not enough_parents_evaluated:
parents_df = self.population.get_column(self.population.population, column_names=self.objective_names, to_numpy=False)
scores = parents_df[self.objective_names[0]].to_numpy()
#count non-nan values in the objective columns
n_evaluated = np.count_nonzero(~np.isnan(scores))
if n_evaluated >0 :
enough_parents_evaluated=True

# parents_df = self.population.get_column(self.population.population, column_names=self.objective_names+ ["Individual"], to_numpy=False)
# parents_df = parents_df[~parents_df[self.objective_names].isin(["TIMEOUT","INVALID"]).any(axis=1)]
# parents_df = parents_df[~parents_df[self.objective_names].isna().any(axis=1)]

# cur_evaluated_population = parents_df["Individual"].to_numpy()
# if len(cur_evaluated_population) > 0:
# scores = parents_df[self.objective_names].to_numpy()
# weighted_scores = scores * self.objective_function_weights
# #number of crossover pairs and mutation only parent to generate

# if len(parents_df) < 2:
# var_ops = ["mutate" for _ in range(n_individuals_to_submit)]
# else:
# var_ops = [self.rng.choice(["crossover","mutate_then_crossover","crossover_then_mutate",'mutate'],p=[self.crossover_probability,self.mutate_then_crossover_probability, self.crossover_then_mutate_probability,self.mutate_probability]) for _ in range(n_individuals_to_submit)]

# parents = []
# for op in var_ops:
# if op == "mutate":
# parents.extend(np.array(cur_evaluated_population)[self.parent_selector(weighted_scores, k=1, n_parents=1, rng_=self.rng)])
# else:
# parents.extend(np.array(cur_evaluated_population)[self.parent_selector(weighted_scores, k=1, n_parents=2, rng_=self.rng)])

# #_offspring = self.population.create_offspring2(parents, var_ops, rng_=self.rng, add_to_population=True)
# offspring = self.population.create_offspring2(parents, var_ops, [ind_mutate], None, [ind_crossover], None, add_to_population=True, keep_repeats=False, mutate_until_unique=True, rng_=self.rng)

if enough_parents_evaluated:

parents = self.population.parent_select(selector=self.parent_selector, weights=self.objective_function_weights, columns_names=self.objective_names, k=n_individuals_to_submit, n_parents=2, rng_=self.rng)
p = np.array([self.crossover_probability, self.mutate_then_crossover_probability, self.crossover_then_mutate_probability, self.mutate_probability])
p = p / p.sum()
var_op_list = self.rng.choice(["crossover", "mutate_then_crossover", "crossover_then_mutate", "mutate"], size=n_individuals_to_submit, p=p)

for i, op in enumerate(var_op_list):
if op == "mutate":
parents.extend(np.array(cur_evaluated_population)[self.parent_selector(weighted_scores, k=1, n_parents=1, rng_=self.rng)])
else:
parents.extend(np.array(cur_evaluated_population)[self.parent_selector(weighted_scores, k=1, n_parents=2, rng_=self.rng)])
parents[i] = parents[i][0] #mutations take a single individual

_offspring = self.population.create_offspring(parents, var_ops, rng_=self.rng, n_jobs=1, add_to_population=True)
offspring = self.population.create_offspring2(parents, var_op_list, [ind_mutate], None, [ind_crossover], None, add_to_population=True, keep_repeats=False, mutate_until_unique=True, rng_=self.rng)

# If we don't have enough evaluated individuals to use as parents for variation, we create new individuals randomly
# This can happen if the individuals in the initial population are invalid
if len(cur_evaluated_population) == 0 and len(submitted_futures) < self.max_queue_size:
elif len(submitted_futures) < self.max_queue_size:

initial_population = self.population.evaluated_individuals.iloc[:self.initial_population_size*3]
invalid_initial_population = initial_population[initial_population[self.objective_names].isin(["TIMEOUT","INVALID"]).any(axis=1)]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1137,7 +1137,10 @@ def _cached_transform(cache_nunber=0):
pass

def __str__(self):
    """Return a human-readable description of this individual.

    The exported pipeline's string form is embedded in the result. If
    exporting or formatting the pipeline raises, a fixed placeholder is
    returned instead so that printing an individual never fails.
    """
    try:
        # Plain str.format: the previous f-string evaluated ``{0}`` to the
        # literal 0 before .format() ran, so the pipeline never appeared.
        return "<GraphIndividual {0}>".format(self.export_pipeline())
    except Exception:
        # Exception rather than a bare except, so KeyboardInterrupt and
        # SystemExit still propagate.
        return "<Invalid GraphIndividual>"

def unique_id(self) -> GraphKey:
if self.key is None:
Expand Down
Loading

0 comments on commit 18652bf

Please sign in to comment.