From 60382e61cc57e92369addb19d1a72cffe0a4ae15 Mon Sep 17 00:00:00 2001 From: Adam Valenta Date: Wed, 21 Jun 2023 15:34:54 +0200 Subject: [PATCH 01/34] GH-6723 - First version of Adaboost with hardcoded weaklearner to DRF - works similarly to scikit on prostate, airlines, and higgs --- .../src/main/java/hex/adaboost/AdaBoost.java | 156 ++++++++++++++ .../main/java/hex/adaboost/AdaBoostModel.java | 130 ++++++++++++ .../src/main/java/hex/tree/drf/DRFModel.java | 8 + .../test/java/hex/adaboost/AdaBoostTest.java | 200 ++++++++++++++++++ .../src/test/java/hex/tree/drf/DRFTest.java | 35 +++ h2o-core/src/main/java/hex/Model.java | 1 + .../main/java/hex/ModelMetricsBinomial.java | 1 - 7 files changed, 530 insertions(+), 1 deletion(-) create mode 100644 h2o-algos/src/main/java/hex/adaboost/AdaBoost.java create mode 100644 h2o-algos/src/main/java/hex/adaboost/AdaBoostModel.java create mode 100644 h2o-algos/src/test/java/hex/adaboost/AdaBoostTest.java diff --git a/h2o-algos/src/main/java/hex/adaboost/AdaBoost.java b/h2o-algos/src/main/java/hex/adaboost/AdaBoost.java new file mode 100644 index 000000000000..40f4f2835b9a --- /dev/null +++ b/h2o-algos/src/main/java/hex/adaboost/AdaBoost.java @@ -0,0 +1,156 @@ +package hex.adaboost; + +import hex.ModelBuilder; +import hex.ModelCategory; +import hex.genmodel.algos.tree.SharedTreeSubgraph; +import hex.tree.drf.DRF; +import hex.tree.drf.DRFModel; +import org.apache.log4j.Logger; +import water.*; +import water.exceptions.H2OModelBuilderIllegalArgumentException; +import water.fvec.Chunk; +import water.fvec.Frame; +import water.fvec.Vec; + +/** + * TODO valenad1 + * + * @author Adam Valenta + */ +public class AdaBoost extends ModelBuilder { + private static final Logger LOG = Logger.getLogger(AdaBoost.class); + + private AdaBoostModel _model; + + // Called from an http request + public AdaBoost(AdaBoostModel.AdaBoostParameters parms) { + super(parms); + init(false); + } + + private class AdaBoostDriver extends Driver { + + @Override + 
public void computeImpl() { + _model = null; + try { + init(false); + if (error_count() > 0) { + throw H2OModelBuilderIllegalArgumentException.makeFromBuilder(AdaBoost.this); + } + _model = new AdaBoostModel(dest(), _parms, + new AdaBoostModel.AdaBoostOutput(AdaBoost.this)); + _model.delete_and_lock(_job); + buildAdaboost(); + LOG.info(_model.toString()); + } finally { + if (_model != null) + _model.unlock(_job); + } + } + + private void buildAdaboost() { + _model._output.alphas = new double[(int)_parms._n_estimators]; + _model._output.models = new Key[(int)_parms._n_estimators]; + System.out.println(train().toTwoDimTable(0,10,false)); + train().add("weights", Vec.makeCon(1.0, train().numRows())); + train()._key = Key.make(); + DKV.put(train()); + Scope.track(train()); + for (int n = 0; n < _parms._n_estimators; n++) { + DRFModel.DRFParameters parms = new DRFModel.DRFParameters(); + parms._train = train()._key; + parms._response_column = _parms._response_column; + parms._mtries = 1; + parms._min_rows = 1; + parms._weights_column = "weights"; + parms._seed = _parms._seed + n; + parms._ntrees = 1; + parms._sample_rate = 1; + parms._max_depth = 1; + DRF job = new DRF(parms); + DRFModel drf = job.trainModel().get(); + DKV.put(drf); + Scope.untrack(drf._key); + _model._output.models[n] = drf._key; + Frame score = drf.score(train()); + Scope.track(score); + + CountWe countWe = new CountWe().doAll(train().vec("weights"), train().vec(_parms._response_column), score.vec("predict")); + double e_m = countWe.We / countWe.W; + double alpha_m = _parms._learning_rate * Math.log((1 - e_m) / e_m); + _model._output.alphas[n] = alpha_m; + + UpdateW updateW = new UpdateW(alpha_m); + updateW.doAll(train().vec("weights"), train().vec(_parms._response_column), score.vec("predict")); + } + } + } + + private class CountWe extends MRTask { + double W = 0; + double We = 0; + + @Override + public void map(Chunk weights, Chunk response, Chunk predict) { + for (int row = 0; row < weights._len; 
row++) { + double weight = weights.atd(row); + W += weight; + if (response.at8(row) != predict.at8(row)) { + We += weight; + } + } + } + + @Override + public void reduce(CountWe mrt) { + W += mrt.W; + We += mrt.We; + } + } + + private class UpdateW extends MRTask { + double exp_am; + double exp_am_inverse; + + public UpdateW(double alpha_m) { + exp_am = Math.exp(alpha_m); + exp_am_inverse = Math.exp(-alpha_m); + } + + @Override + public void map(Chunk weights, Chunk response, Chunk predict) { + for (int row = 0; row < weights._len; row++) { + double weight = weights.atd(row); + if (response.at8(row) != predict.at8(row)) { + weights.set(row, weight*exp_am); + } else { + weights.set(row, weight*exp_am_inverse); + } + } + } + } + + @Override + protected Driver trainModelImpl() { + return new AdaBoostDriver(); + } + + @Override + public BuilderVisibility builderVisibility() { + return BuilderVisibility.Experimental; + } + + @Override + public ModelCategory[] can_build() { + return new ModelCategory[]{ + ModelCategory.Binomial, + }; + } + + @Override + public boolean isSupervised() { + return true; + } + +} diff --git a/h2o-algos/src/main/java/hex/adaboost/AdaBoostModel.java b/h2o-algos/src/main/java/hex/adaboost/AdaBoostModel.java new file mode 100644 index 000000000000..92baeb3bb9d5 --- /dev/null +++ b/h2o-algos/src/main/java/hex/adaboost/AdaBoostModel.java @@ -0,0 +1,130 @@ +package hex.adaboost; + +import hex.*; +import hex.tree.drf.DRFModel; +import hex.tree.isoforextended.isolationtree.CompressedIsolationTree; +import org.apache.log4j.Logger; +import water.*; +import water.fvec.Frame; + +public class AdaBoostModel extends Model { + private static final Logger LOG = Logger.getLogger(AdaBoostModel.class); + + public AdaBoostModel(Key selfKey, AdaBoostParameters parms, + AdaBoostOutput output) { + super(selfKey, parms, output); + } + + @Override + public ModelMetrics.MetricBuilder makeMetricBuilder(String[] domain) { + if (_output.getModelCategory() == 
ModelCategory.Binomial) { + return new ModelMetricsBinomial.MetricBuilderBinomial(domain); + } + throw H2O.unimpl("AdaBoost currently support only binary classification"); + } + + @Override + protected String[] makeScoringNames(){ + return new String[]{"predict", "p0", "p1"}; + } + + @Override + protected double[] score0(double[] data, double[] preds) { + double alphas0 = 0; + double alphas1 = 0; + for (int i = 0; i < _output.alphas.length; i++) { + DRFModel drfModel = DKV.getGet(_output.models[i]); + if (drfModel.score(data) == 0) { + alphas0 += _output.alphas[i]; + } else { + alphas1 += _output.alphas[i]; + } + } + preds[1] = alphas0 > alphas1 ? 1 : 0; + preds[2] = alphas0 < alphas1 ? 1 : 0; + return preds; + } + + public static class AdaBoostOutput extends Model.Output { + public double[] alphas; + public Key[] models; + + public AdaBoostOutput(AdaBoost adaBoostModel) { + super(adaBoostModel); + } + + @Override + public boolean isAdaboost() { + return true; + } + } + + @Override + protected Futures remove_impl(Futures fs, boolean cascade) { + for (Key iTreeKey : _output.models) { + Keyed.remove(iTreeKey, fs, true); + } + return super.remove_impl(fs, cascade); + } + + @Override + protected AutoBuffer writeAll_impl(AutoBuffer ab) { + for (Key iTreeKey : _output.models) { + ab.putKey(iTreeKey); + } + return super.writeAll_impl(ab); + } + + @Override + protected Keyed readAll_impl(AutoBuffer ab, Futures fs) { + for (Key iTreeKey : _output.models) { + ab.getKey(iTreeKey, fs); + } + return super.readAll_impl(ab,fs); + } + + public static class AdaBoostParameters extends Model.Parameters { + + /** + * TODO valenad1 + */ + public long _n_estimators; + + /** + * TODO valenad1 + */ + public String _weak_learner; + + /** + * TODO valenad1 + */ + public double _learning_rate; + + @Override + public String algoName() { + return "AdaBoost"; + } + + @Override + public String fullName() { + return "AdaBoost"; + } + + @Override + public String javaName() { + return 
AdaBoostModel.class.getName(); + } + + @Override + public long progressUnits() { + return _n_estimators; + } + + public AdaBoostParameters() { + super(); + _n_estimators = 50; + _weak_learner = "DRF"; + _learning_rate = 0.5; + } + } +} diff --git a/h2o-algos/src/main/java/hex/tree/drf/DRFModel.java b/h2o-algos/src/main/java/hex/tree/drf/DRFModel.java index 578e8fb2d045..73b63dd37516 100644 --- a/h2o-algos/src/main/java/hex/tree/drf/DRFModel.java +++ b/h2o-algos/src/main/java/hex/tree/drf/DRFModel.java @@ -7,6 +7,7 @@ import water.Key; import water.fvec.Frame; import water.fvec.NewChunk; +import water.util.ArrayUtils; import water.util.MathUtils; public class DRFModel extends SharedTreeModelWithContributions { @@ -100,6 +101,13 @@ protected ScoreContributionsTask getScoreContributionsSoringTask(SharedTreeModel return preds; } + @Override + public double score(double[] data) { + double[] pred = score0(data, new double[_output.nclasses() + 1], 0, _output._ntrees); + score0PostProcessSupervised(pred, data); + return pred[0]; + } + @Override protected SharedTreePojoWriter makeTreePojoWriter() { CompressedForest compressedForest = new CompressedForest(_output._treeKeys, _output._domains); diff --git a/h2o-algos/src/test/java/hex/adaboost/AdaBoostTest.java b/h2o-algos/src/test/java/hex/adaboost/AdaBoostTest.java new file mode 100644 index 000000000000..a3850fb5e4e8 --- /dev/null +++ b/h2o-algos/src/test/java/hex/adaboost/AdaBoostTest.java @@ -0,0 +1,200 @@ +package hex.adaboost; + +import org.apache.commons.io.FileUtils; +import org.junit.Before; +import org.junit.Rule; +import org.junit.Test; +import org.junit.contrib.java.lang.system.EnvironmentVariables; +import org.junit.runner.RunWith; +import water.Scope; +import water.TestUtil; +import water.fvec.Frame; +import water.runner.CloudSize; +import water.runner.H2ORunner; + +import java.io.File; +import java.io.IOException; + +import static org.junit.Assert.assertNotNull; + +@CloudSize(1) +@RunWith(H2ORunner.class) 
+public class AdaBoostTest extends TestUtil { + + public boolean print = false; + + @Rule + public EnvironmentVariables environmentVariables = new EnvironmentVariables(); + + @Before + public void beforeClass() { + final File h2oHomeDir = new File(System.getProperty("user.dir")).getParentFile(); + environmentVariables.set("H2O_FILES_SEARCH_PATH", h2oHomeDir.getAbsolutePath()); + } + + @Test + public void testBasicTrain() { + try { + Scope.enter(); + Frame train = Scope.track(parseTestFile("smalldata/prostate/prostate.csv")); + String response = "CAPSULE"; + train.toCategoricalCol(response); + AdaBoostModel.AdaBoostParameters p = new AdaBoostModel.AdaBoostParameters(); + p._train = train._key; + p._seed = 0xDECAF; + p._n_estimators = 50; + p._response_column = response; + + AdaBoost adaBoost = new AdaBoost(p); + AdaBoostModel adaBoostModel = adaBoost.trainModel().get(); + Scope.track_generic(adaBoostModel); + assertNotNull(adaBoostModel); + } finally { + Scope.exit(); + } + } + + @Test + public void testBasicTrainLarge() { + try { + Scope.enter(); + Frame train = Scope.track(parseTestFile("bigdata/laptop/creditcardfraud/creditcardfraud.csv")); + String response = "Class"; + train.toCategoricalCol(response); + AdaBoostModel.AdaBoostParameters p = new AdaBoostModel.AdaBoostParameters(); + p._train = train._key; + p._seed = 0xDECAF; + p._n_estimators = 50; + p._response_column = response; + + AdaBoost adaBoost = new AdaBoost(p); + AdaBoostModel adaBoostModel = adaBoost.trainModel().get(); + Scope.track_generic(adaBoostModel); + assertNotNull(adaBoostModel); + } finally { + Scope.exit(); + } + } + + @Test + public void testBasicTrainAndScore() { + try { + Scope.enter(); + Frame train = Scope.track(parseTestFile("smalldata/prostate/prostate.csv")); + Frame test = Scope.track(parseTestFile("smalldata/prostate/prostate.csv")); + String response = "CAPSULE"; + train.toCategoricalCol(response); + test.toCategoricalCol(response); + AdaBoostModel.AdaBoostParameters p = new 
AdaBoostModel.AdaBoostParameters(); + p._train = train._key; + p._seed = 0xDECAF; + p._n_estimators = 50; + p._response_column = response; + + AdaBoost adaBoost = new AdaBoost(p); + AdaBoostModel adaBoostModel = adaBoost.trainModel().get(); + Scope.track_generic(adaBoostModel); + assertNotNull(adaBoostModel); + + Frame score = adaBoostModel.score(test); + Scope.track(score); + toCSV(score, "../prostatescore.csv"); + } finally { + Scope.exit(); + } + } + + @Test + public void testBasicTrainAndScoreLarge() { + try { + Scope.enter(); + Frame train = Scope.track(parseTestFile("bigdata/laptop/creditcardfraud/creditcardfraud.csv")); + Frame test = Scope.track(parseTestFile("bigdata/laptop/creditcardfraud/creditcardfraud.csv")); + String response = "Class"; + train.toCategoricalCol(response); + AdaBoostModel.AdaBoostParameters p = new AdaBoostModel.AdaBoostParameters(); + p._train = train._key; + p._seed = 0xDECAF; + p._n_estimators = 50; + p._response_column = response; + + AdaBoost adaBoost = new AdaBoost(p); + AdaBoostModel adaBoostModel = adaBoost.trainModel().get(); + Scope.track_generic(adaBoostModel); + assertNotNull(adaBoostModel); + + Frame score = adaBoostModel.score(test); + Scope.track(score); + toCSV(score, "../creditcardfraudscore.csv"); + } finally { + Scope.exit(); + } + } + + @Test + public void testBasicTrainAirlines() { + try { + Scope.enter(); + Frame train = Scope.track(parseTestFile("smalldata/testng/airlines_train_preprocessed.csv")); + Frame test = Scope.track(parseTestFile("smalldata/testng/airlines_test_preprocessed.csv")); + String response = "IsDepDelayed"; + train.toCategoricalCol(response); + test.toCategoricalCol(response); + AdaBoostModel.AdaBoostParameters p = new AdaBoostModel.AdaBoostParameters(); + p._train = train._key; + p._seed = 0xDECAF; + p._n_estimators = 50; + p._response_column = response; + + AdaBoost adaBoost = new AdaBoost(p); + AdaBoostModel adaBoostModel = adaBoost.trainModel().get(); + Scope.track_generic(adaBoostModel); + 
assertNotNull(adaBoostModel); + + Frame score = adaBoostModel.score(test); + Scope.track(score); + toCSV(score, "../airlinesscore.csv"); + } finally { + Scope.exit(); + } + } + + @Test + public void testBasicTrainHiggs() { + try { + Scope.enter(); + Frame train = Scope.track(parseTestFile("smalldata/higgs/higgs_train_5k.csv")); + Frame test = Scope.track(parseTestFile("smalldata/higgs/higgs_test_5k.csv")); + String response = "response"; + train.toCategoricalCol(response); + test.toCategoricalCol(response); + AdaBoostModel.AdaBoostParameters p = new AdaBoostModel.AdaBoostParameters(); + p._train = train._key; + p._seed = 0xDECAF; + p._n_estimators = 50; + p._response_column = response; + + AdaBoost adaBoost = new AdaBoost(p); + AdaBoostModel adaBoostModel = adaBoost.trainModel().get(); + Scope.track_generic(adaBoostModel); + assertNotNull(adaBoostModel); + + Frame score = adaBoostModel.score(test); + Scope.track(score); + toCSV(score, "../higgsscore.csv"); + } finally { + Scope.exit(); + } + } + + private void toCSV(Frame frame, String filename) { + if (print) { + File targetFile = new File(filename); + try { + FileUtils.copyInputStreamToFile(frame.toCSV(new Frame.CSVStreamParams()), targetFile); + } catch (IOException e) { + e.printStackTrace(); + } + } + } +} diff --git a/h2o-algos/src/test/java/hex/tree/drf/DRFTest.java b/h2o-algos/src/test/java/hex/tree/drf/DRFTest.java index 73723daddec9..21d8b1482022 100755 --- a/h2o-algos/src/test/java/hex/tree/drf/DRFTest.java +++ b/h2o-algos/src/test/java/hex/tree/drf/DRFTest.java @@ -2290,4 +2290,39 @@ public void reproducePUBDEV8298() throws Exception { Scope.exit(); } } + + @Test + public void testDRFMinimalize() { + Frame tfr = null, vfr = null; + DRFModel drf = null; + + Scope.enter(); + try { + tfr = parseTestFile("smalldata/junit/weights_all_ones.csv"); + DKV.put(tfr); + DRFModel.DRFParameters parms = new DRFModel.DRFParameters(); + parms._train = tfr._key; + parms._response_column = "response"; + 
parms._weights_column = "weight"; + parms._seed = 234; + parms._min_rows = 1; + parms._max_depth = 2; + parms._ntrees = 3; + + // Build a first model; all remaining models should be equal + drf = new DRF(parms).trainModel().get(); + + // OOB + ModelMetricsBinomial mm = (ModelMetricsBinomial)drf._output._training_metrics; + assertEquals(_AUC, mm.auc_obj()._auc, 1e-8); + assertEquals(_MSE, mm.mse(), 1e-8); + assertEquals(_LogLoss, mm.logloss(), 1e-6); + + } finally { + if (tfr != null) tfr.remove(); + if (vfr != null) vfr.remove(); + if (drf != null) drf.delete(); + Scope.exit(); + } + } } diff --git a/h2o-core/src/main/java/hex/Model.java b/h2o-core/src/main/java/hex/Model.java index 46c5d4f33a11..0828f2ea2418 100755 --- a/h2o-core/src/main/java/hex/Model.java +++ b/h2o-core/src/main/java/hex/Model.java @@ -1127,6 +1127,7 @@ public String[] features() { public boolean hasFold () { return _hasFold;} public boolean hasTreatment() { return _hasTreatment;} public boolean hasResponse() { return isSupervised(); } + public boolean isAdaboost() {return false;}; public String responseName() { return isSupervised()?_names[responseIdx()]:null;} public String weightsName () { return _hasWeights ?_names[weightsIdx()]:null;} public String offsetName () { return _hasOffset ?_names[offsetIdx()]:null;} diff --git a/h2o-core/src/main/java/hex/ModelMetricsBinomial.java b/h2o-core/src/main/java/hex/ModelMetricsBinomial.java index 725c866055d6..04a40a3f5ace 100755 --- a/h2o-core/src/main/java/hex/ModelMetricsBinomial.java +++ b/h2o-core/src/main/java/hex/ModelMetricsBinomial.java @@ -163,7 +163,6 @@ public static class MetricBuilderBinomial> ex protected double _logloss; protected AUC2.AUCBuilder _auc; - public MetricBuilderBinomial() {} public MetricBuilderBinomial( String[] domain ) { super(2,domain); _auc = new AUC2.AUCBuilder(AUC2.NBINS); } public double auc() {return new AUC2(_auc)._auc;} From 11a0e2627f67ae38e6c606a7342d47e6c0fa666a Mon Sep 17 00:00:00 2001 From: Adam Valenta 
Date: Wed, 26 Jul 2023 18:11:37 +0200 Subject: [PATCH 02/34] WIP add glm learner --- .../src/main/java/hex/adaboost/AdaBoost.java | 77 ++++++++++++----- .../main/java/hex/adaboost/AdaBoostModel.java | 8 +- h2o-algos/src/main/java/hex/glm/GLMModel.java | 6 ++ .../test/java/hex/adaboost/AdaBoostTest.java | 82 ++++++++++++++++++- h2o-core/src/main/java/hex/Model.java | 2 +- 5 files changed, 150 insertions(+), 25 deletions(-) diff --git a/h2o-algos/src/main/java/hex/adaboost/AdaBoost.java b/h2o-algos/src/main/java/hex/adaboost/AdaBoost.java index 40f4f2835b9a..7de28af469d9 100644 --- a/h2o-algos/src/main/java/hex/adaboost/AdaBoost.java +++ b/h2o-algos/src/main/java/hex/adaboost/AdaBoost.java @@ -1,12 +1,17 @@ package hex.adaboost; +import hex.Model; import hex.ModelBuilder; import hex.ModelCategory; -import hex.genmodel.algos.tree.SharedTreeSubgraph; +import hex.glm.GLM; +import hex.glm.GLMModel; import hex.tree.drf.DRF; import hex.tree.drf.DRFModel; import org.apache.log4j.Logger; -import water.*; +import water.DKV; +import water.Key; +import water.MRTask; +import water.Scope; import water.exceptions.H2OModelBuilderIllegalArgumentException; import water.fvec.Chunk; import water.fvec.Frame; @@ -28,6 +33,16 @@ public AdaBoost(AdaBoostModel.AdaBoostParameters parms) { init(false); } + @Override + public void init(boolean expensive) { + super.init(expensive); + if (expensive) { + if (_parms._weak_learner == AdaBoostModel.Algorithm.AUTO) { + _parms._weak_learner = AdaBoostModel.Algorithm.DRF; + } + } + } + private class AdaBoostDriver extends Driver { @Override @@ -52,28 +67,18 @@ public void computeImpl() { private void buildAdaboost() { _model._output.alphas = new double[(int)_parms._n_estimators]; _model._output.models = new Key[(int)_parms._n_estimators]; - System.out.println(train().toTwoDimTable(0,10,false)); train().add("weights", Vec.makeCon(1.0, train().numRows())); train()._key = Key.make(); DKV.put(train()); Scope.track(train()); for (int n = 0; n < 
_parms._n_estimators; n++) { - DRFModel.DRFParameters parms = new DRFModel.DRFParameters(); - parms._train = train()._key; - parms._response_column = _parms._response_column; - parms._mtries = 1; - parms._min_rows = 1; - parms._weights_column = "weights"; - parms._seed = _parms._seed + n; - parms._ntrees = 1; - parms._sample_rate = 1; - parms._max_depth = 1; - DRF job = new DRF(parms); - DRFModel drf = job.trainModel().get(); - DKV.put(drf); - Scope.untrack(drf._key); - _model._output.models[n] = drf._key; - Frame score = drf.score(train()); + ModelBuilder job = chooseWeakLearner(); + job._parms._seed += n; + Model model = (Model) job.trainModel().get(); + DKV.put(model); + Scope.untrack(model._key); + _model._output.models[n] = model._key; + Frame score = model.score(train()); Scope.track(score); CountWe countWe = new CountWe().doAll(train().vec("weights"), train().vec(_parms._response_column), score.vec("predict")); @@ -129,7 +134,7 @@ public void map(Chunk weights, Chunk response, Chunk predict) { } } } - } + } @Override protected Driver trainModelImpl() { @@ -152,5 +157,37 @@ public ModelCategory[] can_build() { public boolean isSupervised() { return true; } + + private ModelBuilder chooseWeakLearner() { + switch (_parms._weak_learner) { + case GLM: + return getGLMWeakLearner(); + default: + case DRF: + return getDRFWeakLearner(); + + } + } + + private DRF getDRFWeakLearner() { + DRFModel.DRFParameters parms = new DRFModel.DRFParameters(); + parms._train = train()._key; + parms._response_column = _parms._response_column; + parms._mtries = 1; + parms._min_rows = 1; + parms._weights_column = "weights"; + parms._ntrees = 1; + parms._sample_rate = 1; + parms._max_depth = 1; + parms._seed = _parms._seed; + return new DRF(parms); + } + + private GLM getGLMWeakLearner() { + GLMModel.GLMParameters parms = new GLMModel.GLMParameters(); + parms._train = train()._key; + parms._response_column = _parms._response_column; + return new GLM(parms); + } } diff --git 
a/h2o-algos/src/main/java/hex/adaboost/AdaBoostModel.java b/h2o-algos/src/main/java/hex/adaboost/AdaBoostModel.java index 92baeb3bb9d5..47d3ad785590 100644 --- a/h2o-algos/src/main/java/hex/adaboost/AdaBoostModel.java +++ b/h2o-algos/src/main/java/hex/adaboost/AdaBoostModel.java @@ -10,6 +10,8 @@ public class AdaBoostModel extends Model { private static final Logger LOG = Logger.getLogger(AdaBoostModel.class); + public enum Algorithm {DRF, GLM, AUTO} + public AdaBoostModel(Key selfKey, AdaBoostParameters parms, AdaBoostOutput output) { super(selfKey, parms, output); @@ -33,7 +35,7 @@ protected double[] score0(double[] data, double[] preds) { double alphas0 = 0; double alphas1 = 0; for (int i = 0; i < _output.alphas.length; i++) { - DRFModel drfModel = DKV.getGet(_output.models[i]); + Model drfModel = DKV.getGet(_output.models[i]); if (drfModel.score(data) == 0) { alphas0 += _output.alphas[i]; } else { @@ -93,7 +95,7 @@ public static class AdaBoostParameters extends Model.Parameters { /** * TODO valenad1 */ - public String _weak_learner; + public Algorithm _weak_learner; /** * TODO valenad1 @@ -123,7 +125,7 @@ public long progressUnits() { public AdaBoostParameters() { super(); _n_estimators = 50; - _weak_learner = "DRF"; + _weak_learner = Algorithm.AUTO; _learning_rate = 0.5; } } diff --git a/h2o-algos/src/main/java/hex/glm/GLMModel.java b/h2o-algos/src/main/java/hex/glm/GLMModel.java index 3a221a7c7669..bfb94d3966c3 100755 --- a/h2o-algos/src/main/java/hex/glm/GLMModel.java +++ b/h2o-algos/src/main/java/hex/glm/GLMModel.java @@ -2087,6 +2087,12 @@ public TwoDimTable generateSummaryHGLM(Key train, int iter){ } @Override protected boolean needsPostProcess() { return false; /* pred[0] is already set by score0 */ } + @Override + public double score(double[] data) { + double[] pred = score0(data, new double[_output.nclasses() + 1], 0); + return pred[0]; + } + @Override protected void toJavaPredictBody(SBPrintStream body, CodeGeneratorPipeline classCtx, 
CodeGeneratorPipeline fileCtx, diff --git a/h2o-algos/src/test/java/hex/adaboost/AdaBoostTest.java b/h2o-algos/src/test/java/hex/adaboost/AdaBoostTest.java index a3850fb5e4e8..200b055940d8 100644 --- a/h2o-algos/src/test/java/hex/adaboost/AdaBoostTest.java +++ b/h2o-algos/src/test/java/hex/adaboost/AdaBoostTest.java @@ -1,5 +1,7 @@ package hex.adaboost; +import hex.glm.GLM; +import hex.glm.GLMModel; import org.apache.commons.io.FileUtils; import org.junit.Before; import org.junit.Rule; @@ -54,6 +56,53 @@ public void testBasicTrain() { } } + @Test + public void testBasicTrainGLM() { + try { + Scope.enter(); + Frame train = Scope.track(parseTestFile("smalldata/prostate/prostate.csv")); + String response = "CAPSULE"; + train.toCategoricalCol(response); + AdaBoostModel.AdaBoostParameters p = new AdaBoostModel.AdaBoostParameters(); + p._train = train._key; + p._seed = 0xDECAF; + p._n_estimators = 50; + p._weak_learner = AdaBoostModel.Algorithm.GLM; + p._response_column = response; + + AdaBoost adaBoost = new AdaBoost(p); + AdaBoostModel adaBoostModel = adaBoost.trainModel().get(); + Scope.track_generic(adaBoostModel); + assertNotNull(adaBoostModel); + } finally { + Scope.exit(); + } + } + + @Test + public void testBasicTrainGLMWeakLerner() { + try { + Scope.enter(); + Frame train = Scope.track(parseTestFile("smalldata/prostate/prostate.csv")); + String response = "CAPSULE"; + train.toCategoricalCol(response); + GLMModel.GLMParameters p = new GLMModel.GLMParameters(); + p._train = train._key; + p._seed = 0xDECAF; + p._response_column = response; + + GLM adaBoost = new GLM(p); + GLMModel adaBoostModel = adaBoost.trainModel().get(); + Scope.track_generic(adaBoostModel); + assertNotNull(adaBoostModel); + Frame score = adaBoostModel.score(train); + Scope.track(score); + System.out.println("score.toTwoDimTable(0,10,false) = " + score.toTwoDimTable(0, 10, false)); + } finally { + Scope.exit(); + } + } + @Test public void testBasicTrainLarge() { try { @@ -95,10 +144,41 @@ 
public void testBasicTrainAndScore() { AdaBoostModel adaBoostModel = adaBoost.trainModel().get(); Scope.track_generic(adaBoostModel); assertNotNull(adaBoostModel); - + Frame score = adaBoostModel.score(test); Scope.track(score); toCSV(score, "../prostatescore.csv"); + + Frame scoreOriginal = Scope.track(parseTestFile("../prostatescore_original.csv")); + assertFrameEquals(scoreOriginal, score, 0); + } finally { + Scope.exit(); + } + } + + @Test + public void testBasicTrainAndScoreGLM() { + try { + Scope.enter(); + Frame train = Scope.track(parseTestFile("smalldata/prostate/prostate.csv")); + Frame test = Scope.track(parseTestFile("smalldata/prostate/prostate.csv")); + String response = "CAPSULE"; + train.toCategoricalCol(response); + AdaBoostModel.AdaBoostParameters p = new AdaBoostModel.AdaBoostParameters(); + p._train = train._key; + p._seed = 0xDECAF; + p._n_estimators = 2; + p._weak_learner = AdaBoostModel.Algorithm.GLM; + p._response_column = response; + + AdaBoost adaBoost = new AdaBoost(p); + AdaBoostModel adaBoostModel = adaBoost.trainModel().get(); + Scope.track_generic(adaBoostModel); + assertNotNull(adaBoostModel); + + Frame score = adaBoostModel.score(test); + Scope.track(score); + toCSV(score, "../prostatescoreglm.csv"); } finally { Scope.exit(); } diff --git a/h2o-core/src/main/java/hex/Model.java b/h2o-core/src/main/java/hex/Model.java index 0828f2ea2418..81aaa9d23dd1 100755 --- a/h2o-core/src/main/java/hex/Model.java +++ b/h2o-core/src/main/java/hex/Model.java @@ -2373,7 +2373,7 @@ protected double[] score0(double data[/*ncols*/], double preds[/*nclasses+1*/], // Version where the user has just ponied-up an array of data to be scored. // Data must be in proper order. Handy for JUnit tests. public double score(double[] data){ - double[] pred = score0(data, new double[_output.nclasses()]); + double[] pred = score0(data, new double[_output.nclasses() + 1]); return _output.nclasses() == 1 ? 
pred[0] /* regression */ : ArrayUtils.maxIndex(pred) /*classification?*/; } From 2cc97856df0ca85420375e73a4d65525c5a67680 Mon Sep 17 00:00:00 2001 From: Adam Valenta Date: Fri, 28 Jul 2023 17:44:56 +0200 Subject: [PATCH 03/34] Add probabilities by Obtaining Calibrated Probabilities from Boosting paper --- .../main/java/hex/adaboost/AdaBoostModel.java | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/h2o-algos/src/main/java/hex/adaboost/AdaBoostModel.java b/h2o-algos/src/main/java/hex/adaboost/AdaBoostModel.java index 47d3ad785590..5461a4190603 100644 --- a/h2o-algos/src/main/java/hex/adaboost/AdaBoostModel.java +++ b/h2o-algos/src/main/java/hex/adaboost/AdaBoostModel.java @@ -1,11 +1,12 @@ package hex.adaboost; -import hex.*; +import hex.Model; +import hex.ModelCategory; +import hex.ModelMetrics; +import hex.ModelMetricsBinomial; import hex.tree.drf.DRFModel; -import hex.tree.isoforextended.isolationtree.CompressedIsolationTree; import org.apache.log4j.Logger; import water.*; -import water.fvec.Frame; public class AdaBoostModel extends Model { private static final Logger LOG = Logger.getLogger(AdaBoostModel.class); @@ -34,19 +35,25 @@ protected String[] makeScoringNames(){ protected double[] score0(double[] data, double[] preds) { double alphas0 = 0; double alphas1 = 0; + double linearCombination = 0; for (int i = 0; i < _output.alphas.length; i++) { Model drfModel = DKV.getGet(_output.models[i]); if (drfModel.score(data) == 0) { + linearCombination += _output.alphas[i]*-1; alphas0 += _output.alphas[i]; } else { + linearCombination += _output.alphas[i]*1; alphas1 += _output.alphas[i]; } } - preds[1] = alphas0 > alphas1 ? 1 : 0; - preds[2] = alphas0 < alphas1 ? 1 : 0; + preds[0] = alphas0 > alphas1 ? 
0 : 1; + preds[2] = 1/(1 + Math.exp(-2*linearCombination)); + preds[1] = 1 - preds[2]; return preds; } + @Override protected boolean needsPostProcess() { return false; /* pred[0] is already set by score0 */ } + public static class AdaBoostOutput extends Model.Output { public double[] alphas; public Key[] models; From 87c13d99719ed7377d843dc594dd7384757d95f2 Mon Sep 17 00:00:00 2001 From: Adam Valenta Date: Fri, 18 Aug 2023 19:12:01 +0200 Subject: [PATCH 04/34] try to fix a test --- .../test/java/hex/adaboost/AdaBoostTest.java | 59 +++++++++---------- 1 file changed, 29 insertions(+), 30 deletions(-) diff --git a/h2o-algos/src/test/java/hex/adaboost/AdaBoostTest.java b/h2o-algos/src/test/java/hex/adaboost/AdaBoostTest.java index 200b055940d8..e5f2a7aa8817 100644 --- a/h2o-algos/src/test/java/hex/adaboost/AdaBoostTest.java +++ b/h2o-algos/src/test/java/hex/adaboost/AdaBoostTest.java @@ -148,41 +148,40 @@ public void testBasicTrainAndScore() { Frame score = adaBoostModel.score(test); Scope.track(score); toCSV(score, "../prostatescore.csv"); - - Frame scoreOriginal = Scope.track(parseTestFile("../prostatescore_original.csv")); - assertFrameEquals(scoreOriginal, score, 0); +// Frame scoreOriginal = Scope.track(parseTestFile("../prostatescore_original.csv")); +// assertFrameEquals(scoreOriginal, score, 0); } finally { Scope.exit(); } } - @Test - public void testBasicTrainAndScoreGLM() { - try { - Scope.enter(); - Frame train = Scope.track(parseTestFile("smalldata/prostate/prostate.csv")); - Frame test = Scope.track(parseTestFile("smalldata/prostate/prostate.csv")); - String response = "CAPSULE"; - train.toCategoricalCol(response); - AdaBoostModel.AdaBoostParameters p = new AdaBoostModel.AdaBoostParameters(); - p._train = train._key; - p._seed = 0xDECAF; - p._n_estimators = 2; - p._weak_learner = AdaBoostModel.Algorithm.GLM; - p._response_column = response; - - AdaBoost adaBoost = new AdaBoost(p); - AdaBoostModel adaBoostModel = adaBoost.trainModel().get(); - 
Scope.track_generic(adaBoostModel); - assertNotNull(adaBoostModel); - - Frame score = adaBoostModel.score(test); - Scope.track(score); - toCSV(score, "../prostatescoreglm.csv"); - } finally { - Scope.exit(); - } - } +// @Test +// public void testBasicTrainAndScoreGLM() { +// try { +// Scope.enter(); +// Frame train = Scope.track(parseTestFile("smalldata/prostate/prostate.csv")); +// Frame test = Scope.track(parseTestFile("smalldata/prostate/prostate.csv")); +// String response = "CAPSULE"; +// train.toCategoricalCol(response); +// AdaBoostModel.AdaBoostParameters p = new AdaBoostModel.AdaBoostParameters(); +// p._train = train._key; +// p._seed = 0xDECAF; +// p._n_estimators = 2; +// p._weak_learner = AdaBoostModel.Algorithm.GLM; +// p._response_column = response; +// +// AdaBoost adaBoost = new AdaBoost(p); +// AdaBoostModel adaBoostModel = adaBoost.trainModel().get(); +// Scope.track_generic(adaBoostModel); +// assertNotNull(adaBoostModel); +// +// Frame score = adaBoostModel.score(test); +// Scope.track(score); +// toCSV(score, "../prostatescoreglm.csv"); +// } finally { +// Scope.exit(); +// } +// } @Test public void testBasicTrainAndScoreLarge() { From 6fa3b5d116b8302f21c5fe2c03c682830ac0b517 Mon Sep 17 00:00:00 2001 From: Adam Valenta Date: Fri, 25 Aug 2023 19:25:17 +0200 Subject: [PATCH 05/34] try to fix test --- h2o-algos/src/main/java/hex/adaboost/AdaBoost.java | 13 ++++++++----- .../src/test/java/hex/adaboost/AdaBoostTest.java | 12 ++++++++++-- h2o-core/src/main/java/hex/ModelBuilder.java | 2 +- 3 files changed, 19 insertions(+), 8 deletions(-) diff --git a/h2o-algos/src/main/java/hex/adaboost/AdaBoost.java b/h2o-algos/src/main/java/hex/adaboost/AdaBoost.java index 7de28af469d9..936386b92c3b 100644 --- a/h2o-algos/src/main/java/hex/adaboost/AdaBoost.java +++ b/h2o-algos/src/main/java/hex/adaboost/AdaBoost.java @@ -67,10 +67,13 @@ public void computeImpl() { private void buildAdaboost() { _model._output.alphas = new double[(int)_parms._n_estimators]; 
_model._output.models = new Key[(int)_parms._n_estimators]; - train().add("weights", Vec.makeCon(1.0, train().numRows())); - train()._key = Key.make(); - DKV.put(train()); - Scope.track(train()); +// Vec weights = train().anyVec().makeCons(1,1,null,null)[0]; +// train().add("weights", weights); +// DKV.put(weights); +// train()._key = Key.make(); +// DKV.put(train()); +// Scope.track(weights); + System.out.println("train().toTwoDimTable() = " + train().toTwoDimTable()); for (int n = 0; n < _parms._n_estimators; n++) { ModelBuilder job = chooseWeakLearner(); job._parms._seed += n; @@ -171,7 +174,7 @@ private ModelBuilder chooseWeakLearner() { private DRF getDRFWeakLearner() { DRFModel.DRFParameters parms = new DRFModel.DRFParameters(); - parms._train = train()._key; + parms._train = _parms._train; parms._response_column = _parms._response_column; parms._mtries = 1; parms._min_rows = 1; diff --git a/h2o-algos/src/test/java/hex/adaboost/AdaBoostTest.java b/h2o-algos/src/test/java/hex/adaboost/AdaBoostTest.java index e5f2a7aa8817..23355c444b4b 100644 --- a/h2o-algos/src/test/java/hex/adaboost/AdaBoostTest.java +++ b/h2o-algos/src/test/java/hex/adaboost/AdaBoostTest.java @@ -8,6 +8,7 @@ import org.junit.Test; import org.junit.contrib.java.lang.system.EnvironmentVariables; import org.junit.runner.RunWith; +import water.DKV; import water.Scope; import water.TestUtil; import water.fvec.Frame; @@ -130,14 +131,19 @@ public void testBasicTrainAndScore() { try { Scope.enter(); Frame train = Scope.track(parseTestFile("smalldata/prostate/prostate.csv")); + train.add("weights", train.anyVec().makeCons(1,1,null,null)[0]); + DKV.put(train); Frame test = Scope.track(parseTestFile("smalldata/prostate/prostate.csv")); + test.add("weights", test.anyVec().makeCons(1,1,null,null)[0]); + DKV.put(test); String response = "CAPSULE"; train.toCategoricalCol(response); test.toCategoricalCol(response); AdaBoostModel.AdaBoostParameters p = new AdaBoostModel.AdaBoostParameters(); p._train = 
train._key; p._seed = 0xDECAF; - p._n_estimators = 50; + p._n_estimators = 1; + p._weights_column = "weights"; p._response_column = response; AdaBoost adaBoost = new AdaBoost(p); @@ -145,7 +151,9 @@ public void testBasicTrainAndScore() { Scope.track_generic(adaBoostModel); assertNotNull(adaBoostModel); - Frame score = adaBoostModel.score(test); + System.out.println("train.toTwoDimTable() = " + train.toTwoDimTable()); + + Frame score = adaBoostModel.score(train); Scope.track(score); toCSV(score, "../prostatescore.csv"); // Frame scoreOriginal = Scope.track(parseTestFile("../prostatescore_original.csv")); diff --git a/h2o-core/src/main/java/hex/ModelBuilder.java b/h2o-core/src/main/java/hex/ModelBuilder.java index e647a1a6c8f7..969dc7a16d1f 100644 --- a/h2o-core/src/main/java/hex/ModelBuilder.java +++ b/h2o-core/src/main/java/hex/ModelBuilder.java @@ -1403,7 +1403,7 @@ public void init(boolean expensive) { initWorkspace(expensive); assert _parms != null; // Parms must already be set in - if( _parms._train == null ) { + if( _parms._train == null && _train == null) { if (expensive) error("_train", "Missing training frame"); return; From 89f2d44dc4584f6a8c45861890420e7860217c77 Mon Sep 17 00:00:00 2001 From: Adam Valenta Date: Mon, 28 Aug 2023 16:37:10 +0200 Subject: [PATCH 06/34] Try to fix the tests --- .../src/main/java/hex/adaboost/AdaBoost.java | 2 +- .../test/java/hex/adaboost/AdaBoostTest.java | 24 +++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/h2o-algos/src/main/java/hex/adaboost/AdaBoost.java b/h2o-algos/src/main/java/hex/adaboost/AdaBoost.java index 936386b92c3b..2d945ebe2de8 100644 --- a/h2o-algos/src/main/java/hex/adaboost/AdaBoost.java +++ b/h2o-algos/src/main/java/hex/adaboost/AdaBoost.java @@ -188,7 +188,7 @@ private DRF getDRFWeakLearner() { private GLM getGLMWeakLearner() { GLMModel.GLMParameters parms = new GLMModel.GLMParameters(); - parms._train = train()._key; + parms._train = _parms._train; parms._response_column 
= _parms._response_column; return new GLM(parms); } diff --git a/h2o-algos/src/test/java/hex/adaboost/AdaBoostTest.java b/h2o-algos/src/test/java/hex/adaboost/AdaBoostTest.java index 23355c444b4b..ece13dafc456 100644 --- a/h2o-algos/src/test/java/hex/adaboost/AdaBoostTest.java +++ b/h2o-algos/src/test/java/hex/adaboost/AdaBoostTest.java @@ -40,12 +40,15 @@ public void testBasicTrain() { try { Scope.enter(); Frame train = Scope.track(parseTestFile("smalldata/prostate/prostate.csv")); + train.add("weights", train.anyVec().makeCons(1,1,null,null)[0]); + DKV.put(train); String response = "CAPSULE"; train.toCategoricalCol(response); AdaBoostModel.AdaBoostParameters p = new AdaBoostModel.AdaBoostParameters(); p._train = train._key; p._seed = 0xDECAF; p._n_estimators = 50; + p._weights_column = "weights"; p._response_column = response; AdaBoost adaBoost = new AdaBoost(p); @@ -62,6 +65,8 @@ public void testBasicTrainGLM() { try { Scope.enter(); Frame train = Scope.track(parseTestFile("smalldata/prostate/prostate.csv")); + train.add("weights", train.anyVec().makeCons(1,1,null,null)[0]); + DKV.put(train); String response = "CAPSULE"; train.toCategoricalCol(response); AdaBoostModel.AdaBoostParameters p = new AdaBoostModel.AdaBoostParameters(); @@ -69,6 +74,7 @@ public void testBasicTrainGLM() { p._seed = 0xDECAF; p._n_estimators = 50; p._weak_learner = AdaBoostModel.Algorithm.GLM; + p._weights_column = "weights"; p._response_column = response; AdaBoost adaBoost = new AdaBoost(p); @@ -109,12 +115,15 @@ public void testBasicTrainLarge() { try { Scope.enter(); Frame train = Scope.track(parseTestFile("bigdata/laptop/creditcardfraud/creditcardfraud.csv")); + train.add("weights", train.anyVec().makeCons(1,1,null,null)[0]); + DKV.put(train); String response = "Class"; train.toCategoricalCol(response); AdaBoostModel.AdaBoostParameters p = new AdaBoostModel.AdaBoostParameters(); p._train = train._key; p._seed = 0xDECAF; p._n_estimators = 50; + p._weights_column = "weights"; 
p._response_column = response; AdaBoost adaBoost = new AdaBoost(p); @@ -196,13 +205,18 @@ public void testBasicTrainAndScoreLarge() { try { Scope.enter(); Frame train = Scope.track(parseTestFile("bigdata/laptop/creditcardfraud/creditcardfraud.csv")); + train.add("weights", train.anyVec().makeCons(1,1,null,null)[0]); + DKV.put(train); Frame test = Scope.track(parseTestFile("bigdata/laptop/creditcardfraud/creditcardfraud.csv")); + test.add("weights", test.anyVec().makeCons(1,1,null,null)[0]); + DKV.put(test); String response = "Class"; train.toCategoricalCol(response); AdaBoostModel.AdaBoostParameters p = new AdaBoostModel.AdaBoostParameters(); p._train = train._key; p._seed = 0xDECAF; p._n_estimators = 50; + p._weights_column = "weights"; p._response_column = response; AdaBoost adaBoost = new AdaBoost(p); @@ -223,7 +237,11 @@ public void testBasicTrainAirlines() { try { Scope.enter(); Frame train = Scope.track(parseTestFile("smalldata/testng/airlines_train_preprocessed.csv")); + train.add("weights", train.anyVec().makeCons(1,1,null,null)[0]); + DKV.put(train); Frame test = Scope.track(parseTestFile("smalldata/testng/airlines_test_preprocessed.csv")); + test.add("weights", test.anyVec().makeCons(1,1,null,null)[0]); + DKV.put(test); String response = "IsDepDelayed"; train.toCategoricalCol(response); test.toCategoricalCol(response); @@ -231,6 +249,7 @@ public void testBasicTrainAirlines() { p._train = train._key; p._seed = 0xDECAF; p._n_estimators = 50; + p._weights_column = "weights"; p._response_column = response; AdaBoost adaBoost = new AdaBoost(p); @@ -251,7 +270,11 @@ public void testBasicTrainHiggs() { try { Scope.enter(); Frame train = Scope.track(parseTestFile("smalldata/higgs/higgs_train_5k.csv")); + train.add("weights", train.anyVec().makeCons(1,1,null,null)[0]); + DKV.put(train); Frame test = Scope.track(parseTestFile("smalldata/higgs/higgs_test_5k.csv")); + test.add("weights", train.anyVec().makeCons(1,1,null,null)[0]); + DKV.put(test); String response = 
"response"; train.toCategoricalCol(response); test.toCategoricalCol(response); @@ -259,6 +282,7 @@ public void testBasicTrainHiggs() { p._train = train._key; p._seed = 0xDECAF; p._n_estimators = 50; + p._weights_column = "weights"; p._response_column = response; AdaBoost adaBoost = new AdaBoost(p); From d23e46c8929747fd340255e8c04e353727fc7cb9 Mon Sep 17 00:00:00 2001 From: Adam Valenta Date: Tue, 29 Aug 2023 12:22:16 +0200 Subject: [PATCH 07/34] try to fix the tests - this works - pass weights externally each time --- .../test/java/hex/adaboost/AdaBoostTest.java | 49 ++++++++----------- 1 file changed, 20 insertions(+), 29 deletions(-) diff --git a/h2o-algos/src/test/java/hex/adaboost/AdaBoostTest.java b/h2o-algos/src/test/java/hex/adaboost/AdaBoostTest.java index ece13dafc456..cc1cadcb958f 100644 --- a/h2o-algos/src/test/java/hex/adaboost/AdaBoostTest.java +++ b/h2o-algos/src/test/java/hex/adaboost/AdaBoostTest.java @@ -39,9 +39,10 @@ public void beforeClass() { public void testBasicTrain() { try { Scope.enter(); - Frame train = Scope.track(parseTestFile("smalldata/prostate/prostate.csv")); + Frame train = parseTestFile("smalldata/prostate/prostate.csv"); train.add("weights", train.anyVec().makeCons(1,1,null,null)[0]); DKV.put(train); + Scope.track(train); String response = "CAPSULE"; train.toCategoricalCol(response); AdaBoostModel.AdaBoostParameters p = new AdaBoostModel.AdaBoostParameters(); @@ -64,9 +65,10 @@ public void testBasicTrain() { public void testBasicTrainGLM() { try { Scope.enter(); - Frame train = Scope.track(parseTestFile("smalldata/prostate/prostate.csv")); + Frame train = parseTestFile("smalldata/prostate/prostate.csv"); train.add("weights", train.anyVec().makeCons(1,1,null,null)[0]); DKV.put(train); + Scope.track(train); String response = "CAPSULE"; train.toCategoricalCol(response); AdaBoostModel.AdaBoostParameters p = new AdaBoostModel.AdaBoostParameters(); @@ -104,7 +106,6 @@ public void testBasicTrainGLMWeakLerner() { 
assertNotNull(adaBoostModel); Frame score = adaBoostModel.score(train); Scope.track(score); - System.out.println("score.toTwoDimTable(0,10,false) = " + score.toTwoDimTable(0, 10, false)); } finally { Scope.exit(); } @@ -114,9 +115,10 @@ public void testBasicTrainGLMWeakLerner() { public void testBasicTrainLarge() { try { Scope.enter(); - Frame train = Scope.track(parseTestFile("bigdata/laptop/creditcardfraud/creditcardfraud.csv")); + Frame train = parseTestFile("bigdata/laptop/creditcardfraud/creditcardfraud.csv"); train.add("weights", train.anyVec().makeCons(1,1,null,null)[0]); DKV.put(train); + Scope.track(train); String response = "Class"; train.toCategoricalCol(response); AdaBoostModel.AdaBoostParameters p = new AdaBoostModel.AdaBoostParameters(); @@ -139,15 +141,12 @@ public void testBasicTrainLarge() { public void testBasicTrainAndScore() { try { Scope.enter(); - Frame train = Scope.track(parseTestFile("smalldata/prostate/prostate.csv")); + Frame train = parseTestFile("smalldata/prostate/prostate.csv"); train.add("weights", train.anyVec().makeCons(1,1,null,null)[0]); DKV.put(train); - Frame test = Scope.track(parseTestFile("smalldata/prostate/prostate.csv")); - test.add("weights", test.anyVec().makeCons(1,1,null,null)[0]); - DKV.put(test); + Scope.track(train); String response = "CAPSULE"; train.toCategoricalCol(response); - test.toCategoricalCol(response); AdaBoostModel.AdaBoostParameters p = new AdaBoostModel.AdaBoostParameters(); p._train = train._key; p._seed = 0xDECAF; @@ -204,12 +203,10 @@ public void testBasicTrainAndScore() { public void testBasicTrainAndScoreLarge() { try { Scope.enter(); - Frame train = Scope.track(parseTestFile("bigdata/laptop/creditcardfraud/creditcardfraud.csv")); + Frame train = parseTestFile("bigdata/laptop/creditcardfraud/creditcardfraud.csv"); train.add("weights", train.anyVec().makeCons(1,1,null,null)[0]); - DKV.put(train); - Frame test = Scope.track(parseTestFile("bigdata/laptop/creditcardfraud/creditcardfraud.csv")); - 
test.add("weights", test.anyVec().makeCons(1,1,null,null)[0]); - DKV.put(test); + DKV.put(train); + Scope.track(train); String response = "Class"; train.toCategoricalCol(response); AdaBoostModel.AdaBoostParameters p = new AdaBoostModel.AdaBoostParameters(); @@ -224,7 +221,7 @@ public void testBasicTrainAndScoreLarge() { Scope.track_generic(adaBoostModel); assertNotNull(adaBoostModel); - Frame score = adaBoostModel.score(test); + Frame score = adaBoostModel.score(train); Scope.track(score); toCSV(score, "../creditcardfraudscore.csv"); } finally { @@ -236,15 +233,12 @@ public void testBasicTrainAndScoreLarge() { public void testBasicTrainAirlines() { try { Scope.enter(); - Frame train = Scope.track(parseTestFile("smalldata/testng/airlines_train_preprocessed.csv")); + Frame train = parseTestFile("smalldata/testng/airlines_train_preprocessed.csv"); train.add("weights", train.anyVec().makeCons(1,1,null,null)[0]); - DKV.put(train); - Frame test = Scope.track(parseTestFile("smalldata/testng/airlines_test_preprocessed.csv")); - test.add("weights", test.anyVec().makeCons(1,1,null,null)[0]); - DKV.put(test); + DKV.put(train); + Scope.track(train); String response = "IsDepDelayed"; train.toCategoricalCol(response); - test.toCategoricalCol(response); AdaBoostModel.AdaBoostParameters p = new AdaBoostModel.AdaBoostParameters(); p._train = train._key; p._seed = 0xDECAF; @@ -257,7 +251,7 @@ public void testBasicTrainAirlines() { Scope.track_generic(adaBoostModel); assertNotNull(adaBoostModel); - Frame score = adaBoostModel.score(test); + Frame score = adaBoostModel.score(train); Scope.track(score); toCSV(score, "../airlinesscore.csv"); } finally { @@ -269,15 +263,12 @@ public void testBasicTrainAirlines() { public void testBasicTrainHiggs() { try { Scope.enter(); - Frame train = Scope.track(parseTestFile("smalldata/higgs/higgs_train_5k.csv")); + Frame train = parseTestFile("smalldata/higgs/higgs_train_5k.csv"); train.add("weights", train.anyVec().makeCons(1,1,null,null)[0]); - 
DKV.put(train); - Frame test = Scope.track(parseTestFile("smalldata/higgs/higgs_test_5k.csv")); - test.add("weights", train.anyVec().makeCons(1,1,null,null)[0]); - DKV.put(test); + DKV.put(train); + Scope.track(train); String response = "response"; train.toCategoricalCol(response); - test.toCategoricalCol(response); AdaBoostModel.AdaBoostParameters p = new AdaBoostModel.AdaBoostParameters(); p._train = train._key; p._seed = 0xDECAF; @@ -290,7 +281,7 @@ public void testBasicTrainHiggs() { Scope.track_generic(adaBoostModel); assertNotNull(adaBoostModel); - Frame score = adaBoostModel.score(test); + Frame score = adaBoostModel.score(train); Scope.track(score); toCSV(score, "../higgsscore.csv"); } finally { From 4c768fbed39d4ebfbdc8cde2e6f5f3504cc40873 Mon Sep 17 00:00:00 2001 From: Adam Valenta Date: Tue, 29 Aug 2023 17:33:15 +0200 Subject: [PATCH 08/34] Add weights inside of the algorithm - this is working --- .../src/main/java/hex/adaboost/AdaBoost.java | 43 +++++++++---------- .../test/java/hex/adaboost/AdaBoostTest.java | 25 +---------- 2 files changed, 23 insertions(+), 45 deletions(-) diff --git a/h2o-algos/src/main/java/hex/adaboost/AdaBoost.java b/h2o-algos/src/main/java/hex/adaboost/AdaBoost.java index 2d945ebe2de8..a69a0965a6f4 100644 --- a/h2o-algos/src/main/java/hex/adaboost/AdaBoost.java +++ b/h2o-algos/src/main/java/hex/adaboost/AdaBoost.java @@ -2,16 +2,14 @@ import hex.Model; import hex.ModelBuilder; +import hex.ModelBuilderHelper; import hex.ModelCategory; import hex.glm.GLM; import hex.glm.GLMModel; import hex.tree.drf.DRF; import hex.tree.drf.DRFModel; import org.apache.log4j.Logger; -import water.DKV; -import water.Key; -import water.MRTask; -import water.Scope; +import water.*; import water.exceptions.H2OModelBuilderIllegalArgumentException; import water.fvec.Chunk; import water.fvec.Frame; @@ -49,7 +47,7 @@ private class AdaBoostDriver extends Driver { public void computeImpl() { _model = null; try { - init(false); + init(true); if (error_count() 
> 0) { throw H2OModelBuilderIllegalArgumentException.makeFromBuilder(AdaBoost.this); } @@ -67,31 +65,32 @@ public void computeImpl() { private void buildAdaboost() { _model._output.alphas = new double[(int)_parms._n_estimators]; _model._output.models = new Key[(int)_parms._n_estimators]; -// Vec weights = train().anyVec().makeCons(1,1,null,null)[0]; -// train().add("weights", weights); -// DKV.put(weights); -// train()._key = Key.make(); -// DKV.put(train()); -// Scope.track(weights); - System.out.println("train().toTwoDimTable() = " + train().toTwoDimTable()); + Frame _trainWithWeights = new Frame(train()); + Vec weights = _trainWithWeights.anyVec().makeCons(1,1,null,null)[0]; + _trainWithWeights.add("weights", weights); + DKV.put(_trainWithWeights); + Scope.track(weights); for (int n = 0; n < _parms._n_estimators; n++) { - ModelBuilder job = chooseWeakLearner(); + ModelBuilder job = chooseWeakLearner(_trainWithWeights); job._parms._seed += n; Model model = (Model) job.trainModel().get(); DKV.put(model); Scope.untrack(model._key); _model._output.models[n] = model._key; - Frame score = model.score(train()); + Frame score = model.score(_trainWithWeights); Scope.track(score); - CountWe countWe = new CountWe().doAll(train().vec("weights"), train().vec(_parms._response_column), score.vec("predict")); + CountWe countWe = new CountWe().doAll(_trainWithWeights.vec("weights"), _trainWithWeights.vec(_parms._response_column), score.vec("predict")); double e_m = countWe.We / countWe.W; double alpha_m = _parms._learning_rate * Math.log((1 - e_m) / e_m); _model._output.alphas[n] = alpha_m; UpdateW updateW = new UpdateW(alpha_m); - updateW.doAll(train().vec("weights"), train().vec(_parms._response_column), score.vec("predict")); + updateW.doAll(_trainWithWeights.vec("weights"), _trainWithWeights.vec(_parms._response_column), score.vec("predict")); + _job.update(1); + _model.update(_job); } + DKV.remove(_trainWithWeights._key); } } @@ -161,20 +160,20 @@ public boolean 
isSupervised() { return true; } - private ModelBuilder chooseWeakLearner() { + private ModelBuilder chooseWeakLearner(Frame frame) { switch (_parms._weak_learner) { case GLM: - return getGLMWeakLearner(); + return getGLMWeakLearner(frame); default: case DRF: - return getDRFWeakLearner(); + return getDRFWeakLearner(frame); } } - private DRF getDRFWeakLearner() { + private DRF getDRFWeakLearner(Frame frame) { DRFModel.DRFParameters parms = new DRFModel.DRFParameters(); - parms._train = _parms._train; + parms._train = frame._key; parms._response_column = _parms._response_column; parms._mtries = 1; parms._min_rows = 1; @@ -186,7 +185,7 @@ private DRF getDRFWeakLearner() { return new DRF(parms); } - private GLM getGLMWeakLearner() { + private GLM getGLMWeakLearner(Frame frame) { GLMModel.GLMParameters parms = new GLMModel.GLMParameters(); parms._train = _parms._train; parms._response_column = _parms._response_column; diff --git a/h2o-algos/src/test/java/hex/adaboost/AdaBoostTest.java b/h2o-algos/src/test/java/hex/adaboost/AdaBoostTest.java index cc1cadcb958f..7e8c993990ee 100644 --- a/h2o-algos/src/test/java/hex/adaboost/AdaBoostTest.java +++ b/h2o-algos/src/test/java/hex/adaboost/AdaBoostTest.java @@ -40,16 +40,13 @@ public void testBasicTrain() { try { Scope.enter(); Frame train = parseTestFile("smalldata/prostate/prostate.csv"); - train.add("weights", train.anyVec().makeCons(1,1,null,null)[0]); - DKV.put(train); - Scope.track(train); String response = "CAPSULE"; train.toCategoricalCol(response); + Scope.track(train); AdaBoostModel.AdaBoostParameters p = new AdaBoostModel.AdaBoostParameters(); p._train = train._key; p._seed = 0xDECAF; p._n_estimators = 50; - p._weights_column = "weights"; p._response_column = response; AdaBoost adaBoost = new AdaBoost(p); @@ -66,8 +63,6 @@ public void testBasicTrainGLM() { try { Scope.enter(); Frame train = parseTestFile("smalldata/prostate/prostate.csv"); - train.add("weights", train.anyVec().makeCons(1,1,null,null)[0]); - 
DKV.put(train); Scope.track(train); String response = "CAPSULE"; train.toCategoricalCol(response); @@ -76,7 +71,6 @@ public void testBasicTrainGLM() { p._seed = 0xDECAF; p._n_estimators = 50; p._weak_learner = AdaBoostModel.Algorithm.GLM; - p._weights_column = "weights"; p._response_column = response; AdaBoost adaBoost = new AdaBoost(p); @@ -116,8 +110,6 @@ public void testBasicTrainLarge() { try { Scope.enter(); Frame train = parseTestFile("bigdata/laptop/creditcardfraud/creditcardfraud.csv"); - train.add("weights", train.anyVec().makeCons(1,1,null,null)[0]); - DKV.put(train); Scope.track(train); String response = "Class"; train.toCategoricalCol(response); @@ -125,7 +117,6 @@ public void testBasicTrainLarge() { p._train = train._key; p._seed = 0xDECAF; p._n_estimators = 50; - p._weights_column = "weights"; p._response_column = response; AdaBoost adaBoost = new AdaBoost(p); @@ -142,16 +133,13 @@ public void testBasicTrainAndScore() { try { Scope.enter(); Frame train = parseTestFile("smalldata/prostate/prostate.csv"); - train.add("weights", train.anyVec().makeCons(1,1,null,null)[0]); - DKV.put(train); Scope.track(train); String response = "CAPSULE"; train.toCategoricalCol(response); AdaBoostModel.AdaBoostParameters p = new AdaBoostModel.AdaBoostParameters(); p._train = train._key; p._seed = 0xDECAF; - p._n_estimators = 1; - p._weights_column = "weights"; + p._n_estimators = 50; p._response_column = response; AdaBoost adaBoost = new AdaBoost(p); @@ -204,8 +192,6 @@ public void testBasicTrainAndScoreLarge() { try { Scope.enter(); Frame train = parseTestFile("bigdata/laptop/creditcardfraud/creditcardfraud.csv"); - train.add("weights", train.anyVec().makeCons(1,1,null,null)[0]); - DKV.put(train); Scope.track(train); String response = "Class"; train.toCategoricalCol(response); @@ -213,7 +199,6 @@ public void testBasicTrainAndScoreLarge() { p._train = train._key; p._seed = 0xDECAF; p._n_estimators = 50; - p._weights_column = "weights"; p._response_column = response; 
AdaBoost adaBoost = new AdaBoost(p); @@ -234,8 +219,6 @@ public void testBasicTrainAirlines() { try { Scope.enter(); Frame train = parseTestFile("smalldata/testng/airlines_train_preprocessed.csv"); - train.add("weights", train.anyVec().makeCons(1,1,null,null)[0]); - DKV.put(train); Scope.track(train); String response = "IsDepDelayed"; train.toCategoricalCol(response); @@ -243,7 +226,6 @@ public void testBasicTrainAirlines() { p._train = train._key; p._seed = 0xDECAF; p._n_estimators = 50; - p._weights_column = "weights"; p._response_column = response; AdaBoost adaBoost = new AdaBoost(p); @@ -264,8 +246,6 @@ public void testBasicTrainHiggs() { try { Scope.enter(); Frame train = parseTestFile("smalldata/higgs/higgs_train_5k.csv"); - train.add("weights", train.anyVec().makeCons(1,1,null,null)[0]); - DKV.put(train); Scope.track(train); String response = "response"; train.toCategoricalCol(response); @@ -273,7 +253,6 @@ public void testBasicTrainHiggs() { p._train = train._key; p._seed = 0xDECAF; p._n_estimators = 50; - p._weights_column = "weights"; p._response_column = response; AdaBoost adaBoost = new AdaBoost(p); From 40e628515ab0edb24499bc020fa6cffdc5d744d0 Mon Sep 17 00:00:00 2001 From: Adam Valenta Date: Wed, 30 Aug 2023 17:37:55 +0200 Subject: [PATCH 09/34] fix for GLM --- .../src/main/java/hex/adaboost/AdaBoost.java | 50 ++----------------- .../src/main/java/hex/adaboost/CountWe.java | 26 ++++++++++ 2 files changed, 29 insertions(+), 47 deletions(-) create mode 100644 h2o-algos/src/main/java/hex/adaboost/CountWe.java diff --git a/h2o-algos/src/main/java/hex/adaboost/AdaBoost.java b/h2o-algos/src/main/java/hex/adaboost/AdaBoost.java index a69a0965a6f4..6cd7f2b8a758 100644 --- a/h2o-algos/src/main/java/hex/adaboost/AdaBoost.java +++ b/h2o-algos/src/main/java/hex/adaboost/AdaBoost.java @@ -2,7 +2,6 @@ import hex.Model; import hex.ModelBuilder; -import hex.ModelBuilderHelper; import hex.ModelCategory; import hex.glm.GLM; import hex.glm.GLMModel; @@ -11,7 +10,6 @@ 
import org.apache.log4j.Logger; import water.*; import water.exceptions.H2OModelBuilderIllegalArgumentException; -import water.fvec.Chunk; import water.fvec.Frame; import water.fvec.Vec; @@ -65,11 +63,13 @@ public void computeImpl() { private void buildAdaboost() { _model._output.alphas = new double[(int)_parms._n_estimators]; _model._output.models = new Key[(int)_parms._n_estimators]; + Frame _trainWithWeights = new Frame(train()); Vec weights = _trainWithWeights.anyVec().makeCons(1,1,null,null)[0]; _trainWithWeights.add("weights", weights); DKV.put(_trainWithWeights); Scope.track(weights); + for (int n = 0; n < _parms._n_estimators; n++) { ModelBuilder job = chooseWeakLearner(_trainWithWeights); job._parms._seed += n; @@ -93,50 +93,6 @@ private void buildAdaboost() { DKV.remove(_trainWithWeights._key); } } - - private class CountWe extends MRTask { - double W = 0; - double We = 0; - - @Override - public void map(Chunk weights, Chunk response, Chunk predict) { - for (int row = 0; row < weights._len; row++) { - double weight = weights.atd(row); - W += weight; - if (response.at8(row) != predict.at8(row)) { - We += weight; - } - } - } - - @Override - public void reduce(CountWe mrt) { - W += mrt.W; - We += mrt.We; - } - } - - private class UpdateW extends MRTask { - double exp_am; - double exp_am_inverse; - - public UpdateW(double alpha_m) { - exp_am = Math.exp(alpha_m); - exp_am_inverse = Math.exp(-alpha_m); - } - - @Override - public void map(Chunk weights, Chunk response, Chunk predict) { - for (int row = 0; row < weights._len; row++) { - double weight = weights.atd(row); - if (response.at8(row) != predict.at8(row)) { - weights.set(row, weight*exp_am); - } else { - weights.set(row, weight*exp_am_inverse); - } - } - } - } @Override protected Driver trainModelImpl() { @@ -187,7 +143,7 @@ private DRF getDRFWeakLearner(Frame frame) { private GLM getGLMWeakLearner(Frame frame) { GLMModel.GLMParameters parms = new GLMModel.GLMParameters(); - parms._train = _parms._train; 
+ parms._train = frame._key; parms._response_column = _parms._response_column; return new GLM(parms); } diff --git a/h2o-algos/src/main/java/hex/adaboost/CountWe.java b/h2o-algos/src/main/java/hex/adaboost/CountWe.java new file mode 100644 index 000000000000..9222a65de58f --- /dev/null +++ b/h2o-algos/src/main/java/hex/adaboost/CountWe.java @@ -0,0 +1,26 @@ +package hex.adaboost; + +import water.MRTask; +import water.fvec.Chunk; + +class CountWe extends MRTask { + double W = 0; + double We = 0; + + @Override + public void map(Chunk weights, Chunk response, Chunk predict) { + for (int row = 0; row < weights._len; row++) { + double weight = weights.atd(row); + W += weight; + if (response.at8(row) != predict.at8(row)) { + We += weight; + } + } + } + + @Override + public void reduce(CountWe mrt) { + W += mrt.W; + We += mrt.We; + } +} From 0fd0a43eacb02d2133477049ac819bfb611db656 Mon Sep 17 00:00:00 2001 From: Adam Valenta Date: Wed, 30 Aug 2023 18:11:42 +0200 Subject: [PATCH 10/34] GH-6723 - add unit test to inner tasks --- .../src/main/java/hex/adaboost/AdaBoost.java | 6 +-- .../{CountWe.java => CountWeTask.java} | 7 ++- .../java/hex/adaboost/UpdateWeightsTask.java | 29 ++++++++++ .../test/java/hex/adaboost/AdaBoostTest.java | 54 ++++++++++++++++++- 4 files changed, 90 insertions(+), 6 deletions(-) rename h2o-algos/src/main/java/hex/adaboost/{CountWe.java => CountWeTask.java} (73%) create mode 100644 h2o-algos/src/main/java/hex/adaboost/UpdateWeightsTask.java diff --git a/h2o-algos/src/main/java/hex/adaboost/AdaBoost.java b/h2o-algos/src/main/java/hex/adaboost/AdaBoost.java index 6cd7f2b8a758..34cfe0dbf43b 100644 --- a/h2o-algos/src/main/java/hex/adaboost/AdaBoost.java +++ b/h2o-algos/src/main/java/hex/adaboost/AdaBoost.java @@ -80,13 +80,13 @@ private void buildAdaboost() { Frame score = model.score(_trainWithWeights); Scope.track(score); - CountWe countWe = new CountWe().doAll(_trainWithWeights.vec("weights"), _trainWithWeights.vec(_parms._response_column), 
score.vec("predict")); + CountWeTask countWe = new CountWeTask().doAll(_trainWithWeights.vec("weights"), _trainWithWeights.vec(_parms._response_column), score.vec("predict")); double e_m = countWe.We / countWe.W; double alpha_m = _parms._learning_rate * Math.log((1 - e_m) / e_m); _model._output.alphas[n] = alpha_m; - UpdateW updateW = new UpdateW(alpha_m); - updateW.doAll(_trainWithWeights.vec("weights"), _trainWithWeights.vec(_parms._response_column), score.vec("predict")); + UpdateWeightsTask updateWeightsTask = new UpdateWeightsTask(alpha_m); + updateWeightsTask.doAll(_trainWithWeights.vec("weights"), _trainWithWeights.vec(_parms._response_column), score.vec("predict")); _job.update(1); _model.update(_job); } diff --git a/h2o-algos/src/main/java/hex/adaboost/CountWe.java b/h2o-algos/src/main/java/hex/adaboost/CountWeTask.java similarity index 73% rename from h2o-algos/src/main/java/hex/adaboost/CountWe.java rename to h2o-algos/src/main/java/hex/adaboost/CountWeTask.java index 9222a65de58f..0a763ca64f2d 100644 --- a/h2o-algos/src/main/java/hex/adaboost/CountWe.java +++ b/h2o-algos/src/main/java/hex/adaboost/CountWeTask.java @@ -3,7 +3,10 @@ import water.MRTask; import water.fvec.Chunk; -class CountWe extends MRTask { +/** + * Count sum of all weights and sum of bad predicted weights for AdaBoost purpose + */ +class CountWeTask extends MRTask { double W = 0; double We = 0; @@ -19,7 +22,7 @@ public void map(Chunk weights, Chunk response, Chunk predict) { } @Override - public void reduce(CountWe mrt) { + public void reduce(CountWeTask mrt) { W += mrt.W; We += mrt.We; } diff --git a/h2o-algos/src/main/java/hex/adaboost/UpdateWeightsTask.java b/h2o-algos/src/main/java/hex/adaboost/UpdateWeightsTask.java new file mode 100644 index 000000000000..bc09ec1a996a --- /dev/null +++ b/h2o-algos/src/main/java/hex/adaboost/UpdateWeightsTask.java @@ -0,0 +1,29 @@ +package hex.adaboost; + +import water.MRTask; +import water.fvec.Chunk; + +/** + * Update weights according to 
AdaBoost algorithm + */ +class UpdateWeightsTask extends MRTask { + double exp_am; + double exp_am_inverse; + + public UpdateWeightsTask(double alpha_m) { + exp_am = Math.exp(alpha_m); + exp_am_inverse = Math.exp(-alpha_m); + } + + @Override + public void map(Chunk weights, Chunk response, Chunk predict) { + for (int row = 0; row < weights._len; row++) { + double weight = weights.atd(row); + if (response.at8(row) != predict.at8(row)) { + weights.set(row, weight * exp_am); + } else { + weights.set(row, weight * exp_am_inverse); + } + } + } +} diff --git a/h2o-algos/src/test/java/hex/adaboost/AdaBoostTest.java b/h2o-algos/src/test/java/hex/adaboost/AdaBoostTest.java index 7e8c993990ee..9d99a5ddf684 100644 --- a/h2o-algos/src/test/java/hex/adaboost/AdaBoostTest.java +++ b/h2o-algos/src/test/java/hex/adaboost/AdaBoostTest.java @@ -8,16 +8,20 @@ import org.junit.Test; import org.junit.contrib.java.lang.system.EnvironmentVariables; import org.junit.runner.RunWith; -import water.DKV; import water.Scope; import water.TestUtil; import water.fvec.Frame; +import water.fvec.TestFrameBuilder; +import water.fvec.Vec; import water.runner.CloudSize; import water.runner.H2ORunner; +import water.util.FrameUtils; +import water.util.VecUtils; import java.io.File; import java.io.IOException; +import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotNull; @CloudSize(1) @@ -278,4 +282,52 @@ private void toCSV(Frame frame, String filename) { } } } + + @Test + public void testCountWe() { + Scope.enter(); + try { + Frame train = new TestFrameBuilder() + .withVecTypes(Vec.T_NUM, Vec.T_CAT, Vec.T_CAT) + .withDataForCol(0, ard(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0)) + .withDataForCol(1, ar("0", "0", "0", "0", "0", "1", "1", "1", "1", "1")) + .withDataForCol(2, ar("1", "1", "1", "1", "1", "0", "0", "0", "0", "0")) + .build(); + train = ensureDistributed(train); + Scope.track(train); + + CountWeTask countWeTask = new CountWeTask().doAll(train); + 
assertEquals("Sum of weights is not correct",10, countWeTask.W, 0); + assertEquals("Sum of error weights is not correct",10, countWeTask.We, 0); + } finally { + Scope.exit(); + } + } + + @Test + public void testUpdateWeights() { + Scope.enter(); + try { + Frame train = new TestFrameBuilder() + .withVecTypes(Vec.T_NUM, Vec.T_CAT, Vec.T_CAT) + .withDataForCol(0, ard(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0)) + .withDataForCol(1, ar("1", "0", "0", "0", "0", "1", "1", "1", "1", "1")) + .withDataForCol(2, ar("1", "1", "1", "1", "1", "0", "0", "0", "0", "0")) + .build(); + train = ensureDistributed(train); + Scope.track(train); + + double alpha = 2; + UpdateWeightsTask updateWeightsTask = new UpdateWeightsTask(alpha); + updateWeightsTask.doAll(train); + + Vec weightsExpected = Vec.makeCon(Math.exp(alpha),train.numRows()); + weightsExpected.set(0, Math.exp(-alpha)); + System.out.println("weights = "); + System.out.println(new Frame(train.vec(0)).toTwoDimTable(0, (int) train.numRows(), false)); + assertVecEquals("Weights are not correctly updated", weightsExpected, train.vec(0),0); + } finally { + Scope.exit(); + } + } } From 109b348b98ec226f3a8b167ff983570b1c2cf9c5 Mon Sep 17 00:00:00 2001 From: Adam Valenta Date: Thu, 31 Aug 2023 19:01:45 +0200 Subject: [PATCH 11/34] use test files again in the large tests --- .../src/test/java/hex/adaboost/AdaBoostTest.java | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/h2o-algos/src/test/java/hex/adaboost/AdaBoostTest.java b/h2o-algos/src/test/java/hex/adaboost/AdaBoostTest.java index 9d99a5ddf684..5bd41433c224 100644 --- a/h2o-algos/src/test/java/hex/adaboost/AdaBoostTest.java +++ b/h2o-algos/src/test/java/hex/adaboost/AdaBoostTest.java @@ -28,7 +28,7 @@ @RunWith(H2ORunner.class) public class AdaBoostTest extends TestUtil { - public boolean print = false; + public boolean print = true; @Rule public EnvironmentVariables environmentVariables = new EnvironmentVariables(); @@ -224,8 +224,11 @@ 
public void testBasicTrainAirlines() { Scope.enter(); Frame train = parseTestFile("smalldata/testng/airlines_train_preprocessed.csv"); Scope.track(train); + Frame test = parseTestFile("smalldata/testng/airlines_test_preprocessed.csv"); + Scope.track(test); String response = "IsDepDelayed"; train.toCategoricalCol(response); + test.toCategoricalCol(response); AdaBoostModel.AdaBoostParameters p = new AdaBoostModel.AdaBoostParameters(); p._train = train._key; p._seed = 0xDECAF; @@ -237,7 +240,7 @@ public void testBasicTrainAirlines() { Scope.track_generic(adaBoostModel); assertNotNull(adaBoostModel); - Frame score = adaBoostModel.score(train); + Frame score = adaBoostModel.score(test); Scope.track(score); toCSV(score, "../airlinesscore.csv"); } finally { @@ -251,8 +254,11 @@ public void testBasicTrainHiggs() { Scope.enter(); Frame train = parseTestFile("smalldata/higgs/higgs_train_5k.csv"); Scope.track(train); + Frame test = parseTestFile("smalldata/higgs/higgs_test_5k.csv"); + Scope.track(test); String response = "response"; train.toCategoricalCol(response); + test.toCategoricalCol(response); AdaBoostModel.AdaBoostParameters p = new AdaBoostModel.AdaBoostParameters(); p._train = train._key; p._seed = 0xDECAF; @@ -264,7 +270,7 @@ public void testBasicTrainHiggs() { Scope.track_generic(adaBoostModel); assertNotNull(adaBoostModel); - Frame score = adaBoostModel.score(train); + Frame score = adaBoostModel.score(test); Scope.track(score); toCSV(score, "../higgsscore.csv"); } finally { From f994b1e269d10cd32b50e86560e34d11afea99b8 Mon Sep 17 00:00:00 2001 From: Adam Valenta Date: Tue, 5 Sep 2023 15:46:16 +0200 Subject: [PATCH 12/34] Improve basic training test to look into structure of weak learners --- .../test/java/hex/adaboost/AdaBoostTest.java | 25 ++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/h2o-algos/src/test/java/hex/adaboost/AdaBoostTest.java b/h2o-algos/src/test/java/hex/adaboost/AdaBoostTest.java index 
5bd41433c224..b84bc13a2c4c 100644 --- a/h2o-algos/src/test/java/hex/adaboost/AdaBoostTest.java +++ b/h2o-algos/src/test/java/hex/adaboost/AdaBoostTest.java @@ -1,13 +1,16 @@ package hex.adaboost; +import hex.genmodel.algos.tree.SharedTreeSubgraph; import hex.glm.GLM; import hex.glm.GLMModel; +import hex.tree.drf.DRFModel; import org.apache.commons.io.FileUtils; import org.junit.Before; import org.junit.Rule; import org.junit.Test; import org.junit.contrib.java.lang.system.EnvironmentVariables; import org.junit.runner.RunWith; +import water.DKV; import water.Scope; import water.TestUtil; import water.fvec.Frame; @@ -21,8 +24,7 @@ import java.io.File; import java.io.IOException; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.*; @CloudSize(1) @RunWith(H2ORunner.class) @@ -57,6 +59,23 @@ public void testBasicTrain() { AdaBoostModel adaBoostModel = adaBoost.trainModel().get(); Scope.track_generic(adaBoostModel); assertNotNull(adaBoostModel); + + for (int i = 0; i < adaBoostModel._output.models.length; i++) { + System.out.println("Tree = " + i); + DRFModel drfModel = DKV.getGet(adaBoostModel._output.models[i]); + SharedTreeSubgraph tree = drfModel.getSharedTreeSubgraph(0,0); + if (tree.rootNode.getColName() == null) { + // FIXME - why are some of the trees empty? Are all of the columns bad for split? 
+ System.out.println(" Empty tree"); + continue; + } + System.out.println(" Root = " + tree.rootNode.getColName() + " " + tree.rootNode.getSplitValue()); + System.out.println(" Left = " + tree.rootNode.getLeftChild().isLeaf() + " " + tree.rootNode.getLeftChild().getPredValue()); + System.out.println(" Right = " + tree.rootNode.getRightChild().isLeaf() + " " + tree.rootNode.getRightChild().getPredValue()); + assertNotNull(tree.rootNode.getColName()); + assertTrue(tree.rootNode.getLeftChild().isLeaf()); + assertTrue(tree.rootNode.getRightChild().isLeaf()); + } } finally { Scope.exit(); } @@ -157,7 +176,7 @@ public void testBasicTrainAndScore() { Scope.track(score); toCSV(score, "../prostatescore.csv"); // Frame scoreOriginal = Scope.track(parseTestFile("../prostatescore_original.csv")); -// assertFrameEquals(scoreOriginal, score, 0); +// assertFrameEquals(new Frame(scoreOriginal.vec(0)), new Frame(score.vec(0)), 0); } finally { Scope.exit(); } From 3c07b2558cebbdd592c6c21868438450d7e82bbd Mon Sep 17 00:00:00 2001 From: Adam Valenta Date: Tue, 5 Sep 2023 16:34:55 +0200 Subject: [PATCH 13/34] cleanup - remove isAdaBoost --- h2o-algos/src/main/java/hex/adaboost/AdaBoostModel.java | 5 ----- h2o-core/src/main/java/hex/Model.java | 1 - 2 files changed, 6 deletions(-) diff --git a/h2o-algos/src/main/java/hex/adaboost/AdaBoostModel.java b/h2o-algos/src/main/java/hex/adaboost/AdaBoostModel.java index 5461a4190603..c586d06d6a85 100644 --- a/h2o-algos/src/main/java/hex/adaboost/AdaBoostModel.java +++ b/h2o-algos/src/main/java/hex/adaboost/AdaBoostModel.java @@ -61,11 +61,6 @@ public static class AdaBoostOutput extends Model.Output { public AdaBoostOutput(AdaBoost adaBoostModel) { super(adaBoostModel); } - - @Override - public boolean isAdaboost() { - return true; - } } @Override diff --git a/h2o-core/src/main/java/hex/Model.java b/h2o-core/src/main/java/hex/Model.java index 81aaa9d23dd1..6a83f98935df 100755 --- a/h2o-core/src/main/java/hex/Model.java +++ 
b/h2o-core/src/main/java/hex/Model.java @@ -1127,7 +1127,6 @@ public String[] features() { public boolean hasFold () { return _hasFold;} public boolean hasTreatment() { return _hasTreatment;} public boolean hasResponse() { return isSupervised(); } - public boolean isAdaboost() {return false;}; public String responseName() { return isSupervised()?_names[responseIdx()]:null;} public String weightsName () { return _hasWeights ?_names[weightsIdx()]:null;} public String offsetName () { return _hasOffset ?_names[offsetIdx()]:null;} From 98191ba294b4bcae3b0c82659ee47cd894ba3433 Mon Sep 17 00:00:00 2001 From: Adam Valenta Date: Tue, 5 Sep 2023 16:41:04 +0200 Subject: [PATCH 14/34] add simple model summary --- .../src/main/java/hex/adaboost/AdaBoost.java | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/h2o-algos/src/main/java/hex/adaboost/AdaBoost.java b/h2o-algos/src/main/java/hex/adaboost/AdaBoost.java index 34cfe0dbf43b..37c37c7689c2 100644 --- a/h2o-algos/src/main/java/hex/adaboost/AdaBoost.java +++ b/h2o-algos/src/main/java/hex/adaboost/AdaBoost.java @@ -12,6 +12,10 @@ import water.exceptions.H2OModelBuilderIllegalArgumentException; import water.fvec.Frame; import water.fvec.Vec; +import water.util.TwoDimTable; + +import java.util.ArrayList; +import java.util.List; /** * TODO valenad1 @@ -91,6 +95,7 @@ private void buildAdaboost() { _model.update(_job); } DKV.remove(_trainWithWeights._key); + _model._output._model_summary = createModelSummaryTable(); } } @@ -148,4 +153,31 @@ private GLM getGLMWeakLearner(Frame frame) { return new GLM(parms); } + public TwoDimTable createModelSummaryTable() { + List colHeaders = new ArrayList<>(); + List colTypes = new ArrayList<>(); + List colFormat = new ArrayList<>(); + + colHeaders.add("Number of weak learners"); colTypes.add("int"); colFormat.add("%d"); + colHeaders.add("Learning rate"); colTypes.add("int"); colFormat.add("%d"); + colHeaders.add("Weak learner"); colTypes.add("int"); colFormat.add("%d"); + 
colHeaders.add("Seed"); colTypes.add("long"); colFormat.add("%d"); + + final int rows = 1; + TwoDimTable table = new TwoDimTable( + "Model Summary", null, + new String[rows], + colHeaders.toArray(new String[0]), + colTypes.toArray(new String[0]), + colFormat.toArray(new String[0]), + ""); + int row = 0; + int col = 0; + table.set(row, col++, _parms._n_estimators); + table.set(row, col++, _parms._learning_rate); + table.set(row, col++, _parms._weak_learner.toString()); + table.set(row, col, _parms._seed); + return table; + } + } From 589033db6a3d406ccb5b97065a8f9a1987c3f87d Mon Sep 17 00:00:00 2001 From: Adam Valenta Date: Wed, 6 Sep 2023 18:21:57 +0200 Subject: [PATCH 15/34] fix java api --- .../src/main/java/hex/adaboost/AdaBoost.java | 15 +++++++++++++++ .../src/main/java/hex/adaboost/AdaBoostModel.java | 2 +- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/h2o-algos/src/main/java/hex/adaboost/AdaBoost.java b/h2o-algos/src/main/java/hex/adaboost/AdaBoost.java index 37c37c7689c2..e61b98400faf 100644 --- a/h2o-algos/src/main/java/hex/adaboost/AdaBoost.java +++ b/h2o-algos/src/main/java/hex/adaboost/AdaBoost.java @@ -7,6 +7,7 @@ import hex.glm.GLMModel; import hex.tree.drf.DRF; import hex.tree.drf.DRFModel; +import hex.tree.dt.DTModel; import org.apache.log4j.Logger; import water.*; import water.exceptions.H2OModelBuilderIllegalArgumentException; @@ -33,6 +34,20 @@ public AdaBoost(AdaBoostModel.AdaBoostParameters parms) { init(false); } + public AdaBoost(boolean startup_once) { + super(new AdaBoostModel.AdaBoostParameters(), startup_once); + } + + @Override + public boolean havePojo() { + return false; + } + + @Override + public boolean haveMojo() { + return false; + } + @Override public void init(boolean expensive) { super.init(expensive); diff --git a/h2o-algos/src/main/java/hex/adaboost/AdaBoostModel.java b/h2o-algos/src/main/java/hex/adaboost/AdaBoostModel.java index c586d06d6a85..f56dd2927fbc 100644 --- 
a/h2o-algos/src/main/java/hex/adaboost/AdaBoostModel.java +++ b/h2o-algos/src/main/java/hex/adaboost/AdaBoostModel.java @@ -92,7 +92,7 @@ public static class AdaBoostParameters extends Model.Parameters { /** * TODO valenad1 */ - public long _n_estimators; + public int _n_estimators; /** * TODO valenad1 From b4135358d14c535587647a6d404b2dfc09465d13 Mon Sep 17 00:00:00 2001 From: Adam Valenta Date: Thu, 14 Sep 2023 18:58:16 +0200 Subject: [PATCH 16/34] Implement possibility to have a custom weights column and ensure that the created weights column will always be the one we use --- .../src/main/java/hex/adaboost/AdaBoost.java | 35 +++++- .../test/java/hex/adaboost/AdaBoostTest.java | 123 +++++++++++++++++- 2 files changed, 145 insertions(+), 13 deletions(-) diff --git a/h2o-algos/src/main/java/hex/adaboost/AdaBoost.java b/h2o-algos/src/main/java/hex/adaboost/AdaBoost.java index e61b98400faf..b1a41c6d843b 100644 --- a/h2o-algos/src/main/java/hex/adaboost/AdaBoost.java +++ b/h2o-algos/src/main/java/hex/adaboost/AdaBoost.java @@ -7,7 +7,6 @@ import hex.glm.GLMModel; import hex.tree.drf.DRF; import hex.tree.drf.DRFModel; -import hex.tree.dt.DTModel; import org.apache.log4j.Logger; import water.*; import water.exceptions.H2OModelBuilderIllegalArgumentException; @@ -27,6 +26,7 @@ public class AdaBoost extends ModelBuilder Date: Thu, 14 Sep 2023 19:06:43 +0200 Subject: [PATCH 17/34] Add categorical test --- .../test/java/hex/adaboost/AdaBoostTest.java | 34 +++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/h2o-algos/src/test/java/hex/adaboost/AdaBoostTest.java b/h2o-algos/src/test/java/hex/adaboost/AdaBoostTest.java index 79b419683dda..2d63e580f713 100644 --- a/h2o-algos/src/test/java/hex/adaboost/AdaBoostTest.java +++ b/h2o-algos/src/test/java/hex/adaboost/AdaBoostTest.java @@ -1,5 +1,6 @@ package hex.adaboost; +import hex.Model; import hex.genmodel.algos.tree.SharedTreeSubgraph; import hex.glm.GLM; import hex.glm.GLMModel; @@ -182,6 +183,39 @@ public void
testBasicTrainAndScore() { } } + @Test + public void testBasicTrainAndScoreCategorical() { + try { + Scope.enter(); + Frame train = parseTestFile("smalldata/prostate/prostate.csv"); + Scope.track(train); + String response = "CAPSULE"; + train.toCategoricalCol(response); + train.toCategoricalCol("RACE"); + train.toCategoricalCol("DPROS"); + train.toCategoricalCol("DCAPS"); + train.toCategoricalCol("GLEASON"); + AdaBoostModel.AdaBoostParameters p = new AdaBoostModel.AdaBoostParameters(); + p._train = train._key; + p._seed = 0xDECAF; + p._n_estimators = 50; + p._response_column = response; + p._categorical_encoding = Model.Parameters.CategoricalEncodingScheme.OneHotExplicit; + + AdaBoost adaBoost = new AdaBoost(p); + AdaBoostModel adaBoostModel = adaBoost.trainModel().get(); + Scope.track_generic(adaBoostModel); + assertNotNull(adaBoostModel); + + System.out.println("train.toTwoDimTable() = " + train.toTwoDimTable()); + + Frame score = adaBoostModel.score(train); + Scope.track(score); + } finally { + Scope.exit(); + } + } + // @Test // public void testBasicTrainAndScoreGLM() { // try { From 7a65eb7b8edd4a73f6b692f53aa9b9b5f631ec54 Mon Sep 17 00:00:00 2001 From: Adam Valenta Date: Fri, 15 Sep 2023 17:19:08 +0200 Subject: [PATCH 18/34] Cleanup that didn't change tests results --- .../src/test/java/hex/tree/drf/DRFTest.java | 35 ------------------- h2o-core/src/main/java/hex/Model.java | 2 +- h2o-core/src/main/java/hex/ModelBuilder.java | 2 +- 3 files changed, 2 insertions(+), 37 deletions(-) diff --git a/h2o-algos/src/test/java/hex/tree/drf/DRFTest.java b/h2o-algos/src/test/java/hex/tree/drf/DRFTest.java index 21d8b1482022..73723daddec9 100755 --- a/h2o-algos/src/test/java/hex/tree/drf/DRFTest.java +++ b/h2o-algos/src/test/java/hex/tree/drf/DRFTest.java @@ -2290,39 +2290,4 @@ public void reproducePUBDEV8298() throws Exception { Scope.exit(); } } - - @Test - public void testDRFMinimalize() { - Frame tfr = null, vfr = null; - DRFModel drf = null; - - Scope.enter(); - try 
{ - tfr = parseTestFile("smalldata/junit/weights_all_ones.csv"); - DKV.put(tfr); - DRFModel.DRFParameters parms = new DRFModel.DRFParameters(); - parms._train = tfr._key; - parms._response_column = "response"; - parms._weights_column = "weight"; - parms._seed = 234; - parms._min_rows = 1; - parms._max_depth = 2; - parms._ntrees = 3; - - // Build a first model; all remaining models should be equal - drf = new DRF(parms).trainModel().get(); - - // OOB - ModelMetricsBinomial mm = (ModelMetricsBinomial)drf._output._training_metrics; - assertEquals(_AUC, mm.auc_obj()._auc, 1e-8); - assertEquals(_MSE, mm.mse(), 1e-8); - assertEquals(_LogLoss, mm.logloss(), 1e-6); - - } finally { - if (tfr != null) tfr.remove(); - if (vfr != null) vfr.remove(); - if (drf != null) drf.delete(); - Scope.exit(); - } - } } diff --git a/h2o-core/src/main/java/hex/Model.java b/h2o-core/src/main/java/hex/Model.java index 6a83f98935df..46c5d4f33a11 100755 --- a/h2o-core/src/main/java/hex/Model.java +++ b/h2o-core/src/main/java/hex/Model.java @@ -2372,7 +2372,7 @@ protected double[] score0(double data[/*ncols*/], double preds[/*nclasses+1*/], // Version where the user has just ponied-up an array of data to be scored. // Data must be in proper order. Handy for JUnit tests. public double score(double[] data){ - double[] pred = score0(data, new double[_output.nclasses() + 1]); + double[] pred = score0(data, new double[_output.nclasses()]); return _output.nclasses() == 1 ? 
pred[0] /* regression */ : ArrayUtils.maxIndex(pred) /*classification?*/; } diff --git a/h2o-core/src/main/java/hex/ModelBuilder.java b/h2o-core/src/main/java/hex/ModelBuilder.java index 969dc7a16d1f..e647a1a6c8f7 100644 --- a/h2o-core/src/main/java/hex/ModelBuilder.java +++ b/h2o-core/src/main/java/hex/ModelBuilder.java @@ -1403,7 +1403,7 @@ public void init(boolean expensive) { initWorkspace(expensive); assert _parms != null; // Parms must already be set in - if( _parms._train == null && _train == null) { + if( _parms._train == null ) { if (expensive) error("_train", "Missing training frame"); return; From e0d4845946119b17eb4cbc3915ce0ee428591028 Mon Sep 17 00:00:00 2001 From: Adam Valenta Date: Fri, 15 Sep 2023 17:25:16 +0200 Subject: [PATCH 19/34] Remove toCSV since there is API already and commented code --- .../test/java/hex/adaboost/AdaBoostTest.java | 49 +------------------ 1 file changed, 1 insertion(+), 48 deletions(-) diff --git a/h2o-algos/src/test/java/hex/adaboost/AdaBoostTest.java b/h2o-algos/src/test/java/hex/adaboost/AdaBoostTest.java index 2d63e580f713..1bb1cace6f8d 100644 --- a/h2o-algos/src/test/java/hex/adaboost/AdaBoostTest.java +++ b/h2o-algos/src/test/java/hex/adaboost/AdaBoostTest.java @@ -30,8 +30,6 @@ @CloudSize(1) @RunWith(H2ORunner.class) public class AdaBoostTest extends TestUtil { - - public boolean print = false; @Rule public EnvironmentVariables environmentVariables = new EnvironmentVariables(); @@ -175,9 +173,6 @@ public void testBasicTrainAndScore() { Frame score = adaBoostModel.score(train); Scope.track(score); - toCSV(score, "../prostatescore.csv"); -// Frame scoreOriginal = Scope.track(parseTestFile("../prostatescore_original.csv")); -// assertFrameEquals(new Frame(scoreOriginal.vec(0)), new Frame(score.vec(0)), 0); } finally { Scope.exit(); } @@ -216,34 +211,6 @@ public void testBasicTrainAndScoreCategorical() { } } -// @Test -// public void testBasicTrainAndScoreGLM() { -// try { -// Scope.enter(); -// Frame train = 
Scope.track(parseTestFile("smalldata/prostate/prostate.csv")); -// Frame test = Scope.track(parseTestFile("smalldata/prostate/prostate.csv")); -// String response = "CAPSULE"; -// train.toCategoricalCol(response); -// AdaBoostModel.AdaBoostParameters p = new AdaBoostModel.AdaBoostParameters(); -// p._train = train._key; -// p._seed = 0xDECAF; -// p._n_estimators = 2; -// p._weak_learner = AdaBoostModel.Algorithm.GLM; -// p._response_column = response; -// -// AdaBoost adaBoost = new AdaBoost(p); -// AdaBoostModel adaBoostModel = adaBoost.trainModel().get(); -// Scope.track_generic(adaBoostModel); -// assertNotNull(adaBoostModel); -// -// Frame score = adaBoostModel.score(test); -// Scope.track(score); -// toCSV(score, "../prostatescoreglm.csv"); -// } finally { -// Scope.exit(); -// } -// } - @Test public void testBasicTrainAndScoreLarge() { try { @@ -265,7 +232,6 @@ public void testBasicTrainAndScoreLarge() { Frame score = adaBoostModel.score(train); Scope.track(score); - toCSV(score, "../creditcardfraudscore.csv"); } finally { Scope.exit(); } @@ -295,7 +261,6 @@ public void testBasicTrainAirlines() { Frame score = adaBoostModel.score(test); Scope.track(score); - toCSV(score, "../airlinesscore.csv"); } finally { Scope.exit(); } @@ -325,23 +290,11 @@ public void testBasicTrainHiggs() { Frame score = adaBoostModel.score(test); Scope.track(score); - toCSV(score, "../higgsscore.csv"); } finally { Scope.exit(); } } - - private void toCSV(Frame frame, String filename) { - if (print) { - File targetFile = new File(filename); - try { - FileUtils.copyInputStreamToFile(frame.toCSV(new Frame.CSVStreamParams()), targetFile); - } catch (IOException e) { - e.printStackTrace(); - } - } - } - + @Test public void testCountWe() { Scope.enter(); From f84d9cb7a6f73065371ae66d0a3767cd5dc8ce0d Mon Sep 17 00:00:00 2001 From: Adam Valenta Date: Fri, 15 Sep 2023 18:28:24 +0200 Subject: [PATCH 20/34] Refactor learning rate to learn rate --- 
h2o-algos/src/main/java/hex/adaboost/AdaBoost.java | 6 +++--- h2o-algos/src/main/java/hex/adaboost/AdaBoostModel.java | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/h2o-algos/src/main/java/hex/adaboost/AdaBoost.java b/h2o-algos/src/main/java/hex/adaboost/AdaBoost.java index b1a41c6d843b..123fec0e0648 100644 --- a/h2o-algos/src/main/java/hex/adaboost/AdaBoost.java +++ b/h2o-algos/src/main/java/hex/adaboost/AdaBoost.java @@ -112,7 +112,7 @@ private void buildAdaboost() { CountWeTask countWe = new CountWeTask().doAll(_trainWithWeights.vec(_weightsName), _trainWithWeights.vec(_parms._response_column), score.vec("predict")); double e_m = countWe.We / countWe.W; - double alpha_m = _parms._learning_rate * Math.log((1 - e_m) / e_m); + double alpha_m = _parms._learn_rate * Math.log((1 - e_m) / e_m); _model._output.alphas[n] = alpha_m; UpdateWeightsTask updateWeightsTask = new UpdateWeightsTask(alpha_m); @@ -187,7 +187,7 @@ public TwoDimTable createModelSummaryTable() { List colFormat = new ArrayList<>(); colHeaders.add("Number of weak learners"); colTypes.add("int"); colFormat.add("%d"); - colHeaders.add("Learning rate"); colTypes.add("int"); colFormat.add("%d"); + colHeaders.add("Learn rate"); colTypes.add("int"); colFormat.add("%d"); colHeaders.add("Weak learner"); colTypes.add("int"); colFormat.add("%d"); colHeaders.add("Seed"); colTypes.add("long"); colFormat.add("%d"); @@ -202,7 +202,7 @@ public TwoDimTable createModelSummaryTable() { int row = 0; int col = 0; table.set(row, col++, _parms._n_estimators); - table.set(row, col++, _parms._learning_rate); + table.set(row, col++, _parms._learn_rate); table.set(row, col++, _parms._weak_learner.toString()); table.set(row, col, _parms._seed); return table; diff --git a/h2o-algos/src/main/java/hex/adaboost/AdaBoostModel.java b/h2o-algos/src/main/java/hex/adaboost/AdaBoostModel.java index f56dd2927fbc..dee7954b98c4 100644 --- a/h2o-algos/src/main/java/hex/adaboost/AdaBoostModel.java +++ 
b/h2o-algos/src/main/java/hex/adaboost/AdaBoostModel.java @@ -102,7 +102,7 @@ public static class AdaBoostParameters extends Model.Parameters { /** * TODO valenad1 */ - public double _learning_rate; + public double _learn_rate; @Override public String algoName() { @@ -128,7 +128,7 @@ public AdaBoostParameters() { super(); _n_estimators = 50; _weak_learner = Algorithm.AUTO; - _learning_rate = 0.5; + _learn_rate = 0.5; } } } From e3a03099e8ce69f883c52f40c3afa2444af668e8 Mon Sep 17 00:00:00 2001 From: Adam Valenta Date: Fri, 15 Sep 2023 18:33:35 +0200 Subject: [PATCH 21/34] Add documentation and validation to parameters --- .../src/main/java/hex/adaboost/AdaBoost.java | 21 ++++++++++++------- .../main/java/hex/adaboost/AdaBoostModel.java | 6 +++--- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/h2o-algos/src/main/java/hex/adaboost/AdaBoost.java b/h2o-algos/src/main/java/hex/adaboost/AdaBoost.java index 123fec0e0648..1219b4ff2dc6 100644 --- a/h2o-algos/src/main/java/hex/adaboost/AdaBoost.java +++ b/h2o-algos/src/main/java/hex/adaboost/AdaBoost.java @@ -24,6 +24,7 @@ */ public class AdaBoost extends ModelBuilder { private static final Logger LOG = Logger.getLogger(AdaBoost.class); + private static final int MAX_ESTIMATORS = 100_000; private AdaBoostModel _model; private String _weightsName = "weights"; @@ -51,14 +52,18 @@ public boolean haveMojo() { @Override public void init(boolean expensive) { super.init(expensive); - if (expensive) { - if (_parms._weak_learner == AdaBoostModel.Algorithm.AUTO) { - _parms._weak_learner = AdaBoostModel.Algorithm.DRF; - } - if (_parms._weights_column != null) { - // _parms._weights_column cannot be used all time since it breaks scoring - _weightsName = _parms._weights_column; - } + if(_parms._n_estimators < 1 || _parms._n_estimators > MAX_ESTIMATORS) + error("n_estimators", "Parameter n_estimators must be in interval [1, " + + MAX_ESTIMATORS + "] but it is " + _parms._n_estimators); + if (_parms._weak_learner == 
AdaBoostModel.Algorithm.AUTO) { + _parms._weak_learner = AdaBoostModel.Algorithm.DRF; + } + if (_parms._weights_column != null) { + // _parms._weights_column cannot be used all the time since it breaks scoring + _weightsName = _parms._weights_column; + } + if( !(0. < _parms._learn_rate && _parms._learn_rate <= 1.0) ) { + error("learn_rate", "learn_rate must be between 0 and 1"); } } diff --git a/h2o-algos/src/main/java/hex/adaboost/AdaBoostModel.java b/h2o-algos/src/main/java/hex/adaboost/AdaBoostModel.java index dee7954b98c4..e22eb1d2e9a4 100644 --- a/h2o-algos/src/main/java/hex/adaboost/AdaBoostModel.java +++ b/h2o-algos/src/main/java/hex/adaboost/AdaBoostModel.java @@ -90,17 +90,17 @@ public static class AdaBoostParameters extends Model.Parameters { /** - * TODO valenad1 + * Number of weak learners to train. Defaults to 50. */ public int _n_estimators; /** - * TODO valenad1 + * Choose a weak learner type. Defaults to DRF. */ public Algorithm _weak_learner; /** - * TODO valenad1 + * Specify how quickly the training converges. Number in (0,1]. Defaults to 0.5.
*/ public double _learn_rate; From 5345b9b887e58c3502706684ebbeec9c1e33ee71 Mon Sep 17 00:00:00 2001 From: Adam Valenta Date: Fri, 15 Sep 2023 18:45:01 +0200 Subject: [PATCH 22/34] Add documentation to AdaBoost class --- h2o-algos/src/main/java/hex/adaboost/AdaBoost.java | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/h2o-algos/src/main/java/hex/adaboost/AdaBoost.java b/h2o-algos/src/main/java/hex/adaboost/AdaBoost.java index 1219b4ff2dc6..897aabe0cfa7 100644 --- a/h2o-algos/src/main/java/hex/adaboost/AdaBoost.java +++ b/h2o-algos/src/main/java/hex/adaboost/AdaBoost.java @@ -18,7 +18,11 @@ import java.util.List; /** - * TODO valenad1 + * Implementation of AdaBoost algorithm based on + * + * Raul Rojas, "Adaboost and the Super Bowl of Classifiers A Tutorial Introduction to Adaptive Boosting" + * Alexandru Niculescu-Mizil and Richard A. Caruana, "Obtaining Calibrated Probabilities from Boosting" + * Y. Freund, R. Schapire, “A Decision-Theoretic Generalization of on-Line Learning and an Application to Boosting”, 1995. 
* * @author Adam Valenta */ From 1840cab302dbb2e2ff929e98059377adf373e93c Mon Sep 17 00:00:00 2001 From: Adam Valenta Date: Fri, 15 Sep 2023 19:07:36 +0200 Subject: [PATCH 23/34] add log --- h2o-algos/src/main/java/hex/adaboost/AdaBoost.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/h2o-algos/src/main/java/hex/adaboost/AdaBoost.java b/h2o-algos/src/main/java/hex/adaboost/AdaBoost.java index 897aabe0cfa7..7a5c08f29c5e 100644 --- a/h2o-algos/src/main/java/hex/adaboost/AdaBoost.java +++ b/h2o-algos/src/main/java/hex/adaboost/AdaBoost.java @@ -12,6 +12,7 @@ import water.exceptions.H2OModelBuilderIllegalArgumentException; import water.fvec.Frame; import water.fvec.Vec; +import water.util.Timer; import water.util.TwoDimTable; import java.util.ArrayList; @@ -110,6 +111,7 @@ private void buildAdaboost() { } for (int n = 0; n < _parms._n_estimators; n++) { + Timer timer = new Timer(); ModelBuilder job = chooseWeakLearner(_trainWithWeights); job._parms._seed += n; Model model = (Model) job.trainModel().get(); @@ -128,6 +130,8 @@ private void buildAdaboost() { updateWeightsTask.doAll(_trainWithWeights.vec(_weightsName), _trainWithWeights.vec(_parms._response_column), score.vec("predict")); _job.update(1); _model.update(_job); + LOG.info((n + 1) + ". 
estimator was built in " + timer.toString()); + LOG.info("*********************************************************************"); } if (_trainWithWeights != _parms.train()) { DKV.remove(_trainWithWeights._key); From 0485715b8b9542f379ee623de61a7da0711047c5 Mon Sep 17 00:00:00 2001 From: Adam Valenta Date: Mon, 18 Sep 2023 16:33:43 +0200 Subject: [PATCH 24/34] Refactor AdaBoost - simple refactor --- h2o-algos/src/main/java/hex/adaboost/AdaBoost.java | 2 +- h2o-algos/src/main/java/hex/adaboost/AdaBoostModel.java | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/h2o-algos/src/main/java/hex/adaboost/AdaBoost.java b/h2o-algos/src/main/java/hex/adaboost/AdaBoost.java index 7a5c08f29c5e..eb01a9d5ea14 100644 --- a/h2o-algos/src/main/java/hex/adaboost/AdaBoost.java +++ b/h2o-algos/src/main/java/hex/adaboost/AdaBoost.java @@ -177,9 +177,9 @@ private DRF getDRFWeakLearner(Frame frame) { DRFModel.DRFParameters parms = new DRFModel.DRFParameters(); parms._train = frame._key; parms._response_column = _parms._response_column; + parms._weights_column = _weightsName; parms._mtries = 1; parms._min_rows = 1; - parms._weights_column = _weightsName; parms._ntrees = 1; parms._sample_rate = 1; parms._max_depth = 1; diff --git a/h2o-algos/src/main/java/hex/adaboost/AdaBoostModel.java b/h2o-algos/src/main/java/hex/adaboost/AdaBoostModel.java index e22eb1d2e9a4..fb22947c7f16 100644 --- a/h2o-algos/src/main/java/hex/adaboost/AdaBoostModel.java +++ b/h2o-algos/src/main/java/hex/adaboost/AdaBoostModel.java @@ -37,8 +37,8 @@ protected double[] score0(double[] data, double[] preds) { double alphas1 = 0; double linearCombination = 0; for (int i = 0; i < _output.alphas.length; i++) { - Model drfModel = DKV.getGet(_output.models[i]); - if (drfModel.score(data) == 0) { + Model model = DKV.getGet(_output.models[i]); + if (model.score(data) == 0) { linearCombination += _output.alphas[i]*-1; alphas0 += _output.alphas[i]; } else { From 4810d4fd3238d2cf8ffe0f2434e0c0b4253d73ea Mon 
Sep 17 00:00:00 2001 From: Adam Valenta Date: Mon, 18 Sep 2023 17:19:46 +0200 Subject: [PATCH 25/34] Fix GLM as a weak learner --- .../src/main/java/hex/adaboost/AdaBoost.java | 2 ++ .../test/java/hex/adaboost/AdaBoostTest.java | 27 +++++++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/h2o-algos/src/main/java/hex/adaboost/AdaBoost.java b/h2o-algos/src/main/java/hex/adaboost/AdaBoost.java index eb01a9d5ea14..0d467989da11 100644 --- a/h2o-algos/src/main/java/hex/adaboost/AdaBoost.java +++ b/h2o-algos/src/main/java/hex/adaboost/AdaBoost.java @@ -191,6 +191,8 @@ private GLM getGLMWeakLearner(Frame frame) { GLMModel.GLMParameters parms = new GLMModel.GLMParameters(); parms._train = frame._key; parms._response_column = _parms._response_column; + parms._weights_column = _weightsName; + parms._seed = _parms._seed; return new GLM(parms); } diff --git a/h2o-algos/src/test/java/hex/adaboost/AdaBoostTest.java b/h2o-algos/src/test/java/hex/adaboost/AdaBoostTest.java index 1bb1cace6f8d..37b1644d5d6f 100644 --- a/h2o-algos/src/test/java/hex/adaboost/AdaBoostTest.java +++ b/h2o-algos/src/test/java/hex/adaboost/AdaBoostTest.java @@ -461,4 +461,31 @@ public void testBasicTrainAndScoreWithDuplicatedWeightsColumn() { Scope.exit(); } } + + @Test + public void testBasicTrainAndScoreGLM() { + try { + Scope.enter(); + Frame train = Scope.track(parseTestFile("smalldata/prostate/prostate.csv")); + Frame test = Scope.track(parseTestFile("smalldata/prostate/prostate.csv")); + String response = "CAPSULE"; + train.toCategoricalCol(response); + AdaBoostModel.AdaBoostParameters p = new AdaBoostModel.AdaBoostParameters(); + p._train = train._key; + p._seed = 0xDECAF; + p._n_estimators = 50; + p._weak_learner = AdaBoostModel.Algorithm.GLM; + p._response_column = response; + + AdaBoost adaBoost = new AdaBoost(p); + AdaBoostModel adaBoostModel = adaBoost.trainModel().get(); + Scope.track_generic(adaBoostModel); + assertNotNull(adaBoostModel); + + Frame score = adaBoostModel.score(test); 
+ Scope.track(score); + } finally { + Scope.exit(); + } + } } From 40bca78e273ab3268e7695e54795b1a53558efcf Mon Sep 17 00:00:00 2001 From: Adam Valenta Date: Mon, 18 Sep 2023 17:21:43 +0200 Subject: [PATCH 26/34] Add GBM as a weak learner --- .../src/main/java/hex/adaboost/AdaBoost.java | 17 ++++++++++++ .../main/java/hex/adaboost/AdaBoostModel.java | 2 +- .../src/main/java/hex/tree/gbm/GBMModel.java | 7 +++++ .../test/java/hex/adaboost/AdaBoostTest.java | 27 +++++++++++++++++++ 4 files changed, 52 insertions(+), 1 deletion(-) diff --git a/h2o-algos/src/main/java/hex/adaboost/AdaBoost.java b/h2o-algos/src/main/java/hex/adaboost/AdaBoost.java index 0d467989da11..1e461fea578f 100644 --- a/h2o-algos/src/main/java/hex/adaboost/AdaBoost.java +++ b/h2o-algos/src/main/java/hex/adaboost/AdaBoost.java @@ -7,6 +7,8 @@ import hex.glm.GLMModel; import hex.tree.drf.DRF; import hex.tree.drf.DRFModel; +import hex.tree.gbm.GBM; +import hex.tree.gbm.GBMModel; import org.apache.log4j.Logger; import water.*; import water.exceptions.H2OModelBuilderIllegalArgumentException; @@ -166,6 +168,8 @@ private ModelBuilder chooseWeakLearner(Frame frame) { switch (_parms._weak_learner) { case GLM: return getGLMWeakLearner(frame); + case GBM: + return getGBMWeakLearner(frame); default: case DRF: return getDRFWeakLearner(frame); @@ -196,6 +200,19 @@ private GLM getGLMWeakLearner(Frame frame) { return new GLM(parms); } + private GBM getGBMWeakLearner(Frame frame) { + GBMModel.GBMParameters parms = new GBMModel.GBMParameters(); + parms._train = frame._key; + parms._response_column = _parms._response_column; + parms._weights_column = _weightsName; + parms._min_rows = 1; + parms._ntrees = 1; + parms._sample_rate = 1; + parms._max_depth = 1; + parms._seed = _parms._seed; + return new GBM(parms); + } + public TwoDimTable createModelSummaryTable() { List colHeaders = new ArrayList<>(); List colTypes = new ArrayList<>(); diff --git a/h2o-algos/src/main/java/hex/adaboost/AdaBoostModel.java 
b/h2o-algos/src/main/java/hex/adaboost/AdaBoostModel.java index fb22947c7f16..e769779d376c 100644 --- a/h2o-algos/src/main/java/hex/adaboost/AdaBoostModel.java +++ b/h2o-algos/src/main/java/hex/adaboost/AdaBoostModel.java @@ -11,7 +11,7 @@ public class AdaBoostModel extends Model { private static final Logger LOG = Logger.getLogger(AdaBoostModel.class); - public enum Algorithm {DRF, GLM, AUTO} + public enum Algorithm {DRF, GLM, GBM, AUTO} public AdaBoostModel(Key selfKey, AdaBoostParameters parms, AdaBoostOutput output) { diff --git a/h2o-algos/src/main/java/hex/tree/gbm/GBMModel.java b/h2o-algos/src/main/java/hex/tree/gbm/GBMModel.java index 4598e66c664e..8eee5526ac59 100755 --- a/h2o-algos/src/main/java/hex/tree/gbm/GBMModel.java +++ b/h2o-algos/src/main/java/hex/tree/gbm/GBMModel.java @@ -378,4 +378,11 @@ public void map(Chunk[] chk, NewChunk[] nchk) { }.withPostMapAction(JobUpdatePostMap.forJob(j)).doAll(types, vs).outputFrame(destination_key, names, domains); } + @Override + public double score(double[] data) { + double[] pred = score0(data, new double[_output.nclasses() + 1], 0, _output._ntrees); + score0PostProcessSupervised(pred, data); + return pred[0]; + } + } diff --git a/h2o-algos/src/test/java/hex/adaboost/AdaBoostTest.java b/h2o-algos/src/test/java/hex/adaboost/AdaBoostTest.java index 37b1644d5d6f..65de33ee8830 100644 --- a/h2o-algos/src/test/java/hex/adaboost/AdaBoostTest.java +++ b/h2o-algos/src/test/java/hex/adaboost/AdaBoostTest.java @@ -488,4 +488,31 @@ public void testBasicTrainAndScoreGLM() { Scope.exit(); } } + + @Test + public void testBasicTrainAndScoreGBM() { + try { + Scope.enter(); + Frame train = Scope.track(parseTestFile("smalldata/prostate/prostate.csv")); + Frame test = Scope.track(parseTestFile("smalldata/prostate/prostate.csv")); + String response = "CAPSULE"; + train.toCategoricalCol(response); + AdaBoostModel.AdaBoostParameters p = new AdaBoostModel.AdaBoostParameters(); + p._train = train._key; + p._seed = 0xDECAF; + 
p._n_estimators = 50; + p._weak_learner = AdaBoostModel.Algorithm.GBM; + p._response_column = response; + + AdaBoost adaBoost = new AdaBoost(p); + AdaBoostModel adaBoostModel = adaBoost.trainModel().get(); + Scope.track_generic(adaBoostModel); + assertNotNull(adaBoostModel); + + Frame score = adaBoostModel.score(test); + Scope.track(score); + } finally { + Scope.exit(); + } + } } From 32d96b20c8d1eb576d70b21f45be158c0cfb11e1 Mon Sep 17 00:00:00 2001 From: Adam Valenta Date: Tue, 19 Sep 2023 17:50:04 +0200 Subject: [PATCH 27/34] test cleanup --- .../test/java/hex/adaboost/AdaBoostTest.java | 37 +++---------------- 1 file changed, 5 insertions(+), 32 deletions(-) diff --git a/h2o-algos/src/test/java/hex/adaboost/AdaBoostTest.java b/h2o-algos/src/test/java/hex/adaboost/AdaBoostTest.java index 65de33ee8830..9258b84caf8d 100644 --- a/h2o-algos/src/test/java/hex/adaboost/AdaBoostTest.java +++ b/h2o-algos/src/test/java/hex/adaboost/AdaBoostTest.java @@ -2,10 +2,9 @@ import hex.Model; import hex.genmodel.algos.tree.SharedTreeSubgraph; -import hex.glm.GLM; -import hex.glm.GLMModel; import hex.tree.drf.DRFModel; -import org.apache.commons.io.FileUtils; +import hex.tree.gbm.GBM; +import hex.tree.gbm.GBMModel; import org.junit.Before; import org.junit.Rule; import org.junit.Test; @@ -22,7 +21,6 @@ import water.util.FrameUtils; import java.io.File; -import java.io.IOException; import java.util.Arrays; import static org.junit.Assert.*; @@ -102,30 +100,7 @@ public void testBasicTrainGLM() { } finally { Scope.exit(); } - } - - @Test - public void testBasicTrainGLMWeakLerner() { - try { - Scope.enter(); - Frame train = Scope.track(parseTestFile("smalldata/prostate/prostate.csv")); - String response = "CAPSULE"; - train.toCategoricalCol(response); - GLMModel.GLMParameters p = new GLMModel.GLMParameters(); - p._train = train._key; - p._seed = 0xDECAF; - p._response_column = response; - - GLM adaBoost = new GLM(p); - GLMModel adaBoostModel = adaBoost.trainModel().get(); - 
Scope.track_generic(adaBoostModel); - assertNotNull(adaBoostModel); - Frame score = adaBoostModel.score(train); - Scope.track(score); - } finally { - Scope.exit(); - } - } + } @Test public void testBasicTrainLarge() { @@ -467,7 +442,6 @@ public void testBasicTrainAndScoreGLM() { try { Scope.enter(); Frame train = Scope.track(parseTestFile("smalldata/prostate/prostate.csv")); - Frame test = Scope.track(parseTestFile("smalldata/prostate/prostate.csv")); String response = "CAPSULE"; train.toCategoricalCol(response); AdaBoostModel.AdaBoostParameters p = new AdaBoostModel.AdaBoostParameters(); @@ -482,7 +456,7 @@ public void testBasicTrainAndScoreGLM() { Scope.track_generic(adaBoostModel); assertNotNull(adaBoostModel); - Frame score = adaBoostModel.score(test); + Frame score = adaBoostModel.score(train); Scope.track(score); } finally { Scope.exit(); @@ -494,7 +468,6 @@ public void testBasicTrainAndScoreGBM() { try { Scope.enter(); Frame train = Scope.track(parseTestFile("smalldata/prostate/prostate.csv")); - Frame test = Scope.track(parseTestFile("smalldata/prostate/prostate.csv")); String response = "CAPSULE"; train.toCategoricalCol(response); AdaBoostModel.AdaBoostParameters p = new AdaBoostModel.AdaBoostParameters(); @@ -509,7 +482,7 @@ public void testBasicTrainAndScoreGBM() { Scope.track_generic(adaBoostModel); assertNotNull(adaBoostModel); - Frame score = adaBoostModel.score(test); + Frame score = adaBoostModel.score(train); Scope.track(score); } finally { Scope.exit(); From 8574bd2e80bcbdb6e30159ceb32d7253b9588455 Mon Sep 17 00:00:00 2001 From: Adam Valenta Date: Tue, 19 Sep 2023 18:30:56 +0200 Subject: [PATCH 28/34] Refactor n_estimators to nlearners --- .../src/main/java/hex/adaboost/AdaBoost.java | 14 +++++----- .../main/java/hex/adaboost/AdaBoostModel.java | 6 ++-- .../test/java/hex/adaboost/AdaBoostTest.java | 28 +++++++++---------- 3 files changed, 23 insertions(+), 25 deletions(-) diff --git a/h2o-algos/src/main/java/hex/adaboost/AdaBoost.java 
b/h2o-algos/src/main/java/hex/adaboost/AdaBoost.java index 1e461fea578f..89a7e04a938f 100644 --- a/h2o-algos/src/main/java/hex/adaboost/AdaBoost.java +++ b/h2o-algos/src/main/java/hex/adaboost/AdaBoost.java @@ -31,7 +31,7 @@ */ public class AdaBoost extends ModelBuilder { private static final Logger LOG = Logger.getLogger(AdaBoost.class); - private static final int MAX_ESTIMATORS = 100_000; + private static final int MAX_LEARNERS = 100_000; private AdaBoostModel _model; private String _weightsName = "weights"; @@ -59,9 +59,9 @@ public boolean haveMojo() { @Override public void init(boolean expensive) { super.init(expensive); - if(_parms._n_estimators < 1 || _parms._n_estimators > MAX_ESTIMATORS) + if(_parms._nlearners < 1 || _parms._nlearners > MAX_LEARNERS) error("n_estimators", "Parameter n_estimators must be in interval [1, " - + MAX_ESTIMATORS + "] but it is " + _parms._n_estimators); + + MAX_LEARNERS + "] but it is " + _parms._nlearners); if (_parms._weak_learner == AdaBoostModel.Algorithm.AUTO) { _parms._weak_learner = AdaBoostModel.Algorithm.DRF; } @@ -96,8 +96,8 @@ public void computeImpl() { } private void buildAdaboost() { - _model._output.alphas = new double[(int)_parms._n_estimators]; - _model._output.models = new Key[(int)_parms._n_estimators]; + _model._output.alphas = new double[(int)_parms._nlearners]; + _model._output.models = new Key[(int)_parms._nlearners]; Frame _trainWithWeights; if (_parms._weights_column == null) { @@ -112,7 +112,7 @@ private void buildAdaboost() { _trainWithWeights = _parms.train(); } - for (int n = 0; n < _parms._n_estimators; n++) { + for (int n = 0; n < _parms._nlearners; n++) { Timer timer = new Timer(); ModelBuilder job = chooseWeakLearner(_trainWithWeights); job._parms._seed += n; @@ -233,7 +233,7 @@ public TwoDimTable createModelSummaryTable() { ""); int row = 0; int col = 0; - table.set(row, col++, _parms._n_estimators); + table.set(row, col++, _parms._nlearners); table.set(row, col++, _parms._learn_rate); 
table.set(row, col++, _parms._weak_learner.toString()); table.set(row, col, _parms._seed); diff --git a/h2o-algos/src/main/java/hex/adaboost/AdaBoostModel.java b/h2o-algos/src/main/java/hex/adaboost/AdaBoostModel.java index e769779d376c..e9789a53088b 100644 --- a/h2o-algos/src/main/java/hex/adaboost/AdaBoostModel.java +++ b/h2o-algos/src/main/java/hex/adaboost/AdaBoostModel.java @@ -92,7 +92,7 @@ public static class AdaBoostParameters extends Model.Parameters { /** * Number of weak learners to train. Defaults to 50. */ - public int _n_estimators; + public int _nlearners; /** * Choose a weak learner type. Defaults to DRF. @@ -121,12 +121,12 @@ public String javaName() { @Override public long progressUnits() { - return _n_estimators; + return _nlearners; } public AdaBoostParameters() { super(); - _n_estimators = 50; + _nlearners = 50; _weak_learner = Algorithm.AUTO; _learn_rate = 0.5; } diff --git a/h2o-algos/src/test/java/hex/adaboost/AdaBoostTest.java b/h2o-algos/src/test/java/hex/adaboost/AdaBoostTest.java index 9258b84caf8d..839858ea95d5 100644 --- a/h2o-algos/src/test/java/hex/adaboost/AdaBoostTest.java +++ b/h2o-algos/src/test/java/hex/adaboost/AdaBoostTest.java @@ -3,8 +3,6 @@ import hex.Model; import hex.genmodel.algos.tree.SharedTreeSubgraph; import hex.tree.drf.DRFModel; -import hex.tree.gbm.GBM; -import hex.tree.gbm.GBMModel; import org.junit.Before; import org.junit.Rule; import org.junit.Test; @@ -49,7 +47,7 @@ public void testBasicTrain() { AdaBoostModel.AdaBoostParameters p = new AdaBoostModel.AdaBoostParameters(); p._train = train._key; p._seed = 0xDECAF; - p._n_estimators = 50; + p._nlearners = 50; p._response_column = response; AdaBoost adaBoost = new AdaBoost(p); @@ -89,7 +87,7 @@ public void testBasicTrainGLM() { AdaBoostModel.AdaBoostParameters p = new AdaBoostModel.AdaBoostParameters(); p._train = train._key; p._seed = 0xDECAF; - p._n_estimators = 50; + p._nlearners = 50; p._weak_learner = AdaBoostModel.Algorithm.GLM; p._response_column = 
response; @@ -113,7 +111,7 @@ public void testBasicTrainLarge() { AdaBoostModel.AdaBoostParameters p = new AdaBoostModel.AdaBoostParameters(); p._train = train._key; p._seed = 0xDECAF; - p._n_estimators = 50; + p._nlearners = 50; p._response_column = response; AdaBoost adaBoost = new AdaBoost(p); @@ -136,7 +134,7 @@ public void testBasicTrainAndScore() { AdaBoostModel.AdaBoostParameters p = new AdaBoostModel.AdaBoostParameters(); p._train = train._key; p._seed = 0xDECAF; - p._n_estimators = 50; + p._nlearners = 50; p._response_column = response; AdaBoost adaBoost = new AdaBoost(p); @@ -168,7 +166,7 @@ public void testBasicTrainAndScoreCategorical() { AdaBoostModel.AdaBoostParameters p = new AdaBoostModel.AdaBoostParameters(); p._train = train._key; p._seed = 0xDECAF; - p._n_estimators = 50; + p._nlearners = 50; p._response_column = response; p._categorical_encoding = Model.Parameters.CategoricalEncodingScheme.OneHotExplicit; @@ -197,7 +195,7 @@ public void testBasicTrainAndScoreLarge() { AdaBoostModel.AdaBoostParameters p = new AdaBoostModel.AdaBoostParameters(); p._train = train._key; p._seed = 0xDECAF; - p._n_estimators = 50; + p._nlearners = 50; p._response_column = response; AdaBoost adaBoost = new AdaBoost(p); @@ -226,7 +224,7 @@ public void testBasicTrainAirlines() { AdaBoostModel.AdaBoostParameters p = new AdaBoostModel.AdaBoostParameters(); p._train = train._key; p._seed = 0xDECAF; - p._n_estimators = 50; + p._nlearners = 50; p._response_column = response; AdaBoost adaBoost = new AdaBoost(p); @@ -255,7 +253,7 @@ public void testBasicTrainHiggs() { AdaBoostModel.AdaBoostParameters p = new AdaBoostModel.AdaBoostParameters(); p._train = train._key; p._seed = 0xDECAF; - p._n_estimators = 50; + p._nlearners = 50; p._response_column = response; AdaBoost adaBoost = new AdaBoost(p); @@ -330,7 +328,7 @@ public void testBasicTrainAndScoreWithExternalWeightsColumn() { AdaBoostModel.AdaBoostParameters p = new AdaBoostModel.AdaBoostParameters(); p._train = train._key; 
p._seed = 0xDECAF; - p._n_estimators = 10; + p._nlearners = 10; p._response_column = response; AdaBoost adaBoostReference = new AdaBoost(p); @@ -374,7 +372,7 @@ public void testBasicTrainAndScoreWithCustomWeightsColumn() { AdaBoostModel.AdaBoostParameters p = new AdaBoostModel.AdaBoostParameters(); p._train = train._key; p._seed = 0xDECAF; - p._n_estimators = 10; + p._nlearners = 10; p._response_column = response; AdaBoost adaBoostReference = new AdaBoost(p); @@ -419,7 +417,7 @@ public void testBasicTrainAndScoreWithDuplicatedWeightsColumn() { AdaBoostModel.AdaBoostParameters p = new AdaBoostModel.AdaBoostParameters(); p._train = train._key; p._seed = 0xDECAF; - p._n_estimators = 10; + p._nlearners = 10; p._response_column = response; p._ignore_const_cols = false; @@ -447,7 +445,7 @@ public void testBasicTrainAndScoreGLM() { AdaBoostModel.AdaBoostParameters p = new AdaBoostModel.AdaBoostParameters(); p._train = train._key; p._seed = 0xDECAF; - p._n_estimators = 50; + p._nlearners = 50; p._weak_learner = AdaBoostModel.Algorithm.GLM; p._response_column = response; @@ -473,7 +471,7 @@ public void testBasicTrainAndScoreGBM() { AdaBoostModel.AdaBoostParameters p = new AdaBoostModel.AdaBoostParameters(); p._train = train._key; p._seed = 0xDECAF; - p._n_estimators = 50; + p._nlearners = 50; p._weak_learner = AdaBoostModel.Algorithm.GBM; p._response_column = response; From bf7a4355c45ec7af398677d353d62aeb76d783f5 Mon Sep 17 00:00:00 2001 From: Adam Valenta Date: Fri, 22 Sep 2023 18:36:05 +0200 Subject: [PATCH 29/34] fixup! 
Implement possibility to have a custom weights column and ensure that the created weights collumn will always be the one we use --- h2o-algos/src/main/java/hex/adaboost/AdaBoost.java | 1 - 1 file changed, 1 deletion(-) diff --git a/h2o-algos/src/main/java/hex/adaboost/AdaBoost.java b/h2o-algos/src/main/java/hex/adaboost/AdaBoost.java index 89a7e04a938f..3edb0f68631a 100644 --- a/h2o-algos/src/main/java/hex/adaboost/AdaBoost.java +++ b/h2o-algos/src/main/java/hex/adaboost/AdaBoost.java @@ -107,7 +107,6 @@ private void buildAdaboost() { _trainWithWeights.add(_weightsName, weights); DKV.put(_trainWithWeights); Scope.track(weights); - _weightsName = _trainWithWeights.lastVecName(); } else { _trainWithWeights = _parms.train(); } From 445dc395070f1d1619365ba32e8bd6fde10a4ec2 Mon Sep 17 00:00:00 2001 From: Adam Valenta Date: Fri, 22 Sep 2023 18:38:05 +0200 Subject: [PATCH 30/34] Fix for different model as a weak learner - use upperclass instead of DRF --- h2o-algos/src/main/java/hex/adaboost/AdaBoostModel.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/h2o-algos/src/main/java/hex/adaboost/AdaBoostModel.java b/h2o-algos/src/main/java/hex/adaboost/AdaBoostModel.java index e9789a53088b..b8a549d7dda4 100644 --- a/h2o-algos/src/main/java/hex/adaboost/AdaBoostModel.java +++ b/h2o-algos/src/main/java/hex/adaboost/AdaBoostModel.java @@ -56,7 +56,7 @@ protected double[] score0(double[] data, double[] preds) { public static class AdaBoostOutput extends Model.Output { public double[] alphas; - public Key[] models; + public Key[] models; public AdaBoostOutput(AdaBoost adaBoostModel) { super(adaBoostModel); @@ -65,7 +65,7 @@ public AdaBoostOutput(AdaBoost adaBoostModel) { @Override protected Futures remove_impl(Futures fs, boolean cascade) { - for (Key iTreeKey : _output.models) { + for (Key iTreeKey : _output.models) { Keyed.remove(iTreeKey, fs, true); } return super.remove_impl(fs, cascade); @@ -73,7 +73,7 @@ protected Futures remove_impl(Futures fs, 
boolean cascade) { @Override protected AutoBuffer writeAll_impl(AutoBuffer ab) { - for (Key iTreeKey : _output.models) { + for (Key iTreeKey : _output.models) { ab.putKey(iTreeKey); } return super.writeAll_impl(ab); @@ -81,7 +81,7 @@ protected AutoBuffer writeAll_impl(AutoBuffer ab) { @Override protected Keyed readAll_impl(AutoBuffer ab, Futures fs) { - for (Key iTreeKey : _output.models) { + for (Key iTreeKey : _output.models) { ab.getKey(iTreeKey, fs); } return super.readAll_impl(ab,fs); From 4ec922962e5057c7e325e78c98ece2892073ae2b Mon Sep 17 00:00:00 2001 From: Adam Valenta Date: Fri, 22 Sep 2023 18:40:35 +0200 Subject: [PATCH 31/34] Ensure that adaboost create exactly nlearners models --- h2o-algos/src/test/java/hex/adaboost/AdaBoostTest.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/h2o-algos/src/test/java/hex/adaboost/AdaBoostTest.java b/h2o-algos/src/test/java/hex/adaboost/AdaBoostTest.java index 839858ea95d5..a5b2aef87979 100644 --- a/h2o-algos/src/test/java/hex/adaboost/AdaBoostTest.java +++ b/h2o-algos/src/test/java/hex/adaboost/AdaBoostTest.java @@ -42,18 +42,20 @@ public void testBasicTrain() { Scope.enter(); Frame train = parseTestFile("smalldata/prostate/prostate.csv"); String response = "CAPSULE"; + int nlearners = 50; train.toCategoricalCol(response); Scope.track(train); AdaBoostModel.AdaBoostParameters p = new AdaBoostModel.AdaBoostParameters(); p._train = train._key; p._seed = 0xDECAF; - p._nlearners = 50; + p._nlearners = nlearners; p._response_column = response; AdaBoost adaBoost = new AdaBoost(p); AdaBoostModel adaBoostModel = adaBoost.trainModel().get(); Scope.track_generic(adaBoostModel); assertNotNull(adaBoostModel); + assertEquals("Model should contain all the weak learners", nlearners, adaBoostModel._output.models.length); for (int i = 0; i < adaBoostModel._output.models.length; i++) { System.out.println("Tree = " + i); From e52c2873a1b692db7c1773b89cca08c4c0861e52 Mon Sep 17 00:00:00 2001 From: Adam Valenta 
Date: Fri, 22 Sep 2023 18:54:28 +0200 Subject: [PATCH 32/34] Refactoring according to suggestions --- .../src/main/java/hex/adaboost/AdaBoost.java | 16 ++++++++-------- .../main/java/hex/adaboost/AdaBoostModel.java | 2 +- .../java/hex/adaboost/UpdateWeightsTask.java | 14 +++++++------- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/h2o-algos/src/main/java/hex/adaboost/AdaBoost.java b/h2o-algos/src/main/java/hex/adaboost/AdaBoost.java index 3edb0f68631a..b606027c6bfa 100644 --- a/h2o-algos/src/main/java/hex/adaboost/AdaBoost.java +++ b/h2o-algos/src/main/java/hex/adaboost/AdaBoost.java @@ -119,16 +119,16 @@ private void buildAdaboost() { DKV.put(model); Scope.untrack(model._key); _model._output.models[n] = model._key; - Frame score = model.score(_trainWithWeights); - Scope.track(score); + Frame predictions = model.score(_trainWithWeights); + Scope.track(predictions); - CountWeTask countWe = new CountWeTask().doAll(_trainWithWeights.vec(_weightsName), _trainWithWeights.vec(_parms._response_column), score.vec("predict")); - double e_m = countWe.We / countWe.W; - double alpha_m = _parms._learn_rate * Math.log((1 - e_m) / e_m); - _model._output.alphas[n] = alpha_m; + CountWeTask countWe = new CountWeTask().doAll(_trainWithWeights.vec(_weightsName), _trainWithWeights.vec(_parms._response_column), predictions.vec("predict")); + double eM = countWe.We / countWe.W; + double alphaM = _parms._learn_rate * Math.log((1 - eM) / eM); + _model._output.alphas[n] = alphaM; - UpdateWeightsTask updateWeightsTask = new UpdateWeightsTask(alpha_m); - updateWeightsTask.doAll(_trainWithWeights.vec(_weightsName), _trainWithWeights.vec(_parms._response_column), score.vec("predict")); + UpdateWeightsTask updateWeightsTask = new UpdateWeightsTask(alphaM); + updateWeightsTask.doAll(_trainWithWeights.vec(_weightsName), _trainWithWeights.vec(_parms._response_column), predictions.vec("predict")); _job.update(1); _model.update(_job); LOG.info((n + 1) + ". 
estimator was built in " + timer.toString()); diff --git a/h2o-algos/src/main/java/hex/adaboost/AdaBoostModel.java b/h2o-algos/src/main/java/hex/adaboost/AdaBoostModel.java index b8a549d7dda4..6c92294eef2a 100644 --- a/h2o-algos/src/main/java/hex/adaboost/AdaBoostModel.java +++ b/h2o-algos/src/main/java/hex/adaboost/AdaBoostModel.java @@ -42,7 +42,7 @@ protected double[] score0(double[] data, double[] preds) { linearCombination += _output.alphas[i]*-1; alphas0 += _output.alphas[i]; } else { - linearCombination += _output.alphas[i]*1; + linearCombination += _output.alphas[i]; alphas1 += _output.alphas[i]; } } diff --git a/h2o-algos/src/main/java/hex/adaboost/UpdateWeightsTask.java b/h2o-algos/src/main/java/hex/adaboost/UpdateWeightsTask.java index bc09ec1a996a..e3d4f0d08cee 100644 --- a/h2o-algos/src/main/java/hex/adaboost/UpdateWeightsTask.java +++ b/h2o-algos/src/main/java/hex/adaboost/UpdateWeightsTask.java @@ -7,12 +7,12 @@ * Update weights according to AdaBoost algorithm */ class UpdateWeightsTask extends MRTask { - double exp_am; - double exp_am_inverse; + double expAm; + double expAmInverse; - public UpdateWeightsTask(double alpha_m) { - exp_am = Math.exp(alpha_m); - exp_am_inverse = Math.exp(-alpha_m); + public UpdateWeightsTask(double alphaM) { + expAm = Math.exp(alphaM); + expAmInverse = Math.exp(-alphaM); } @Override @@ -20,9 +20,9 @@ public void map(Chunk weights, Chunk response, Chunk predict) { for (int row = 0; row < weights._len; row++) { double weight = weights.atd(row); if (response.at8(row) != predict.at8(row)) { - weights.set(row, weight * exp_am); + weights.set(row, weight * expAm); } else { - weights.set(row, weight * exp_am_inverse); + weights.set(row, weight * expAmInverse); } } } From aeaa11bed8f98f9c7412858ca088b9743d3debea Mon Sep 17 00:00:00 2001 From: Adam Valenta Date: Fri, 22 Sep 2023 19:13:49 +0200 Subject: [PATCH 33/34] fixup! 
Fix for different model as a weak learner - use upperclass instead of DRF --- h2o-algos/src/main/java/hex/adaboost/AdaBoostModel.java | 1 - 1 file changed, 1 deletion(-) diff --git a/h2o-algos/src/main/java/hex/adaboost/AdaBoostModel.java b/h2o-algos/src/main/java/hex/adaboost/AdaBoostModel.java index 6c92294eef2a..24d848b27b16 100644 --- a/h2o-algos/src/main/java/hex/adaboost/AdaBoostModel.java +++ b/h2o-algos/src/main/java/hex/adaboost/AdaBoostModel.java @@ -4,7 +4,6 @@ import hex.ModelCategory; import hex.ModelMetrics; import hex.ModelMetricsBinomial; -import hex.tree.drf.DRFModel; import org.apache.log4j.Logger; import water.*; From 5b2b780fac5e125e3c5d79cc587bc1120e613b1b Mon Sep 17 00:00:00 2001 From: Adam Valenta Date: Tue, 26 Sep 2023 17:02:35 +0200 Subject: [PATCH 34/34] GH-6723 AdaBoost API (#15732) * GH-6723 - implement AdaBoost API for python and R * Expose weights column * Refactor learning_rate to learn_rate * Add GBM to the weak_learners * Refactor n_estimators to nlearners --- .../src/main/java/hex/api/RegisterAlgos.java | 3 +- .../java/hex/schemas/AdaBoostModelV3.java | 30 +++ .../src/main/java/hex/schemas/AdaBoostV3.java | 41 ++++ .../META-INF/services/water.api.Schema | 4 + h2o-bindings/bin/custom/R/gen_adaboost.py | 41 ++++ .../bin/custom/python/gen_adaboost.py | 8 + h2o-bindings/bin/gen_R.py | 2 + h2o-bindings/bin/gen_python.py | 1 + h2o-py/docs/modeling.rst | 6 + h2o-py/h2o/estimators/__init__.py | 18 +- h2o-py/h2o/estimators/adaboost.py | 220 ++++++++++++++++++ .../adaboost/pyunit_adaboost_saveload.py | 38 +++ .../adaboost/pyunit_adaboost_smoke.py | 31 +++ .../pyunit_sklearn_generic_all_estimators.py | 1 + ...yunit_sklearn_regression_all_estimators.py | 3 +- .../testdir_multi_jvm/test_rest_api.py | 2 +- h2o-r/H2O_Load.R | 3 +- h2o-r/h2o-package/R/adaboost.R | 170 ++++++++++++++ h2o-r/h2o-package/pkgdown/_pkgdown.yml | 1 + h2o-r/scripts/h2o-r-test-setup.R | 2 +- .../adaboost/runit_adaboost_smoke.R | 20 ++ 21 files changed, 632 insertions(+), 
13 deletions(-) create mode 100644 h2o-algos/src/main/java/hex/schemas/AdaBoostModelV3.java create mode 100644 h2o-algos/src/main/java/hex/schemas/AdaBoostV3.java create mode 100644 h2o-bindings/bin/custom/R/gen_adaboost.py create mode 100644 h2o-bindings/bin/custom/python/gen_adaboost.py create mode 100644 h2o-py/h2o/estimators/adaboost.py create mode 100644 h2o-py/tests/testdir_algos/adaboost/pyunit_adaboost_saveload.py create mode 100644 h2o-py/tests/testdir_algos/adaboost/pyunit_adaboost_smoke.py create mode 100644 h2o-r/h2o-package/R/adaboost.R create mode 100644 h2o-r/tests/testdir_algos/adaboost/runit_adaboost_smoke.R diff --git a/h2o-algos/src/main/java/hex/api/RegisterAlgos.java b/h2o-algos/src/main/java/hex/api/RegisterAlgos.java index be7e255f35d3..b31bbcc39834 100644 --- a/h2o-algos/src/main/java/hex/api/RegisterAlgos.java +++ b/h2o-algos/src/main/java/hex/api/RegisterAlgos.java @@ -38,7 +38,8 @@ public void registerEndPoints(RestApiContext context) { new hex.tree.uplift.UpliftDRF (true), new hex.modelselection.ModelSelection (true), new hex.isotonic .IsotonicRegression(true), - new hex.tree.dt .DT (true) + new hex.tree.dt .DT (true), + new hex.adaboost. 
AdaBoost (true) }; // "Word2Vec", "Example", "Grep" diff --git a/h2o-algos/src/main/java/hex/schemas/AdaBoostModelV3.java b/h2o-algos/src/main/java/hex/schemas/AdaBoostModelV3.java new file mode 100644 index 000000000000..9229af5dc9ab --- /dev/null +++ b/h2o-algos/src/main/java/hex/schemas/AdaBoostModelV3.java @@ -0,0 +1,30 @@ +package hex.schemas; + +import hex.adaboost.AdaBoostModel; +import water.api.schemas3.ModelOutputSchemaV3; +import water.api.schemas3.ModelSchemaV3; + +public class AdaBoostModelV3 extends ModelSchemaV3 { + + public static final class AdaBoostModelOutputV3 extends ModelOutputSchemaV3 { + // nothing + } + + public AdaBoostV3.AdaBoostParametersV3 createParametersSchema() { return new AdaBoostV3.AdaBoostParametersV3(); } + public AdaBoostModelOutputV3 createOutputSchema() { return new AdaBoostModelOutputV3(); } + + //========================== + // Custom adapters go here + + // Version&Schema-specific filling into the impl + @Override public AdaBoostModel createImpl() { + AdaBoostV3.AdaBoostParametersV3 p = this.parameters; + AdaBoostModel.AdaBoostParameters parms = p.createImpl(); + return new AdaBoostModel( model_id.key(), parms, new AdaBoostModel.AdaBoostOutput(null) ); + } +} diff --git a/h2o-algos/src/main/java/hex/schemas/AdaBoostV3.java b/h2o-algos/src/main/java/hex/schemas/AdaBoostV3.java new file mode 100644 index 000000000000..1a1edb52189c --- /dev/null +++ b/h2o-algos/src/main/java/hex/schemas/AdaBoostV3.java @@ -0,0 +1,41 @@ +package hex.schemas; + +import hex.adaboost.AdaBoost; +import hex.adaboost.AdaBoostModel; +import water.api.API; +import water.api.schemas3.ModelParametersSchemaV3; + +public class AdaBoostV3 extends ModelBuilderSchema< + AdaBoost, + AdaBoostV3, + AdaBoostV3.AdaBoostParametersV3> { + + public static final class AdaBoostParametersV3 extends ModelParametersSchemaV3 { + static public String[] fields = new String[]{ + "model_id", + "training_frame", + "ignored_columns", + "ignore_const_cols", + 
"categorical_encoding", + "weights_column", + + // AdaBoost specific + "nlearners", + "weak_learner", + "learn_rate", + "seed", + }; + + @API(help = "Number of AdaBoost weak learners.", gridable = true) + public int nlearners; + + @API(help = "Choose a weak learner type. Defaults to AUTO, which means DRF.", gridable = true, values = {"AUTO", "DRF", "GLM", "GBM"}) + public AdaBoostModel.Algorithm weak_learner; + + @API(help="Learning rate (from 0.0 to 1.0)", gridable = true) + public double learn_rate; + + @API(help = "Seed for pseudo random number generator (if applicable)", gridable = true) + public long seed; + } +} diff --git a/h2o-algos/src/main/resources/META-INF/services/water.api.Schema b/h2o-algos/src/main/resources/META-INF/services/water.api.Schema index b23d07eec489..18c3ea3dc972 100644 --- a/h2o-algos/src/main/resources/META-INF/services/water.api.Schema +++ b/h2o-algos/src/main/resources/META-INF/services/water.api.Schema @@ -114,3 +114,7 @@ hex.schemas.UpliftDRFModelV3 hex.schemas.UpliftDRFModelV3$UpliftDRFModelOutputV3 hex.schemas.UpliftDRFV3 hex.schemas.UpliftDRFV3$UpliftDRFParametersV3 +hex.schemas.AdaBoostModelV3 +hex.schemas.AdaBoostModelV3$AdaBoostModelOutputV3 +hex.schemas.AdaBoostV3 +hex.schemas.AdaBoostV3$AdaBoostParametersV3 diff --git a/h2o-bindings/bin/custom/R/gen_adaboost.py b/h2o-bindings/bin/custom/R/gen_adaboost.py new file mode 100644 index 000000000000..7dbf86f2a3fa --- /dev/null +++ b/h2o-bindings/bin/custom/R/gen_adaboost.py @@ -0,0 +1,41 @@ +extensions = dict( + skip_default_set_params_for=['training_frame', 'ignored_columns', 'response_column', + 'max_confusion_matrix_size', 'distribution', 'offset_column'], + set_required_params=""" +parms$training_frame <- training_frame +args <- .verify_dataxy(training_frame, x, y) +parms$ignored_columns <- args$x_ignore +parms$response_column <- args$y +""", +) + + +doc = dict( + preamble=""" +Build an AdaBoost model + +Builds an AdaBoost model on an H2OFrame. 
+""", + returns=""" +Creates a \linkS4class{H2OModel} object of the right type. +""", + seealso=""" +\code{\link{predict.H2OModel}} for prediction +""", + examples=""" +library(h2o) +h2o.init() + +# Import the airlines dataset +f <- "https://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv" +data <- h2o.importFile(f) + +# Set predictors and response; set response as a factor +data["CAPSULE"] <- as.factor(data["CAPSULE"]) +predictors <- c("AGE","RACE","DPROS","DCAPS","PSA","VOL","GLEASON") +response <- "CAPSULE" + +# Train the AdaBoost model +h2o_adaboost <- h2o.adaBoost(x = predictors, y = response, training_frame = data, seed = 1234) +""" +) diff --git a/h2o-bindings/bin/custom/python/gen_adaboost.py b/h2o-bindings/bin/custom/python/gen_adaboost.py new file mode 100644 index 000000000000..646c8e820871 --- /dev/null +++ b/h2o-bindings/bin/custom/python/gen_adaboost.py @@ -0,0 +1,8 @@ +options = dict( +) + +doc = dict( + __class__=""" +Builds an AdaBoost model +""" +) diff --git a/h2o-bindings/bin/gen_R.py b/h2o-bindings/bin/gen_R.py index 14a895e584b1..b99875a2a1f4 100644 --- a/h2o-bindings/bin/gen_R.py +++ b/h2o-bindings/bin/gen_R.py @@ -272,6 +272,7 @@ def algo_to_modelname(algo): if algo == "gam": return "Generalized Additive Model" if algo == "modelselection": return "Model Selection" if algo == "infogram": return "Infogram" + if algo == "adaboost": return "AdaBoost Model" return algo @@ -347,6 +348,7 @@ def main(): if name == "stackedensemble": module = "stackedEnsemble" if name == "pca": module = "prcomp" if name == "modelselection": module = "modelSelection" + if name == "adaboost": module = "adaBoost" bi.vprint("Generating model: " + name) bi.write_to_file("%s.R" % file_name, gen_module(mb, name, module)) diff --git a/h2o-bindings/bin/gen_python.py b/h2o-bindings/bin/gen_python.py index 98fd3db325b8..6fa044f77d0e 100755 --- a/h2o-bindings/bin/gen_python.py +++ b/h2o-bindings/bin/gen_python.py @@ -351,6 +351,7 @@ def 
algo_to_classname(algo): if algo == "rulefit": return "H2ORuleFitEstimator" if algo == "modelselection": return "H2OModelSelectionEstimator" if algo == "isotonicregression": return "H2OIsotonicRegressionEstimator" + if algo == "adaboost": return "H2OAdaBoostEstimator" return "H2O" + algo.capitalize() + "Estimator" diff --git a/h2o-py/docs/modeling.rst b/h2o-py/docs/modeling.rst index 89b4f9cfa408..9ceecb83b361 100644 --- a/h2o-py/docs/modeling.rst +++ b/h2o-py/docs/modeling.rst @@ -8,6 +8,12 @@ Modeling In H2O Supervised ++++++++++ +:mod:`H2OAdaBoostEstimator` +--------------------------- +.. autoclass:: h2o.estimators.adaboost.H2OAdaBoostEstimator + :show-inheritance: + :members: + :mod:`H2OANOVAGLMEstimator` --------------------------- .. autoclass:: h2o.estimators.anovaglm.H2OANOVAGLMEstimator diff --git a/h2o-py/h2o/estimators/__init__.py b/h2o-py/h2o/estimators/__init__.py index d261ff829f13..766e1678b950 100644 --- a/h2o-py/h2o/estimators/__init__.py +++ b/h2o-py/h2o/estimators/__init__.py @@ -7,6 +7,7 @@ import inspect import sys +from .adaboost import H2OAdaBoostEstimator from .aggregator import H2OAggregatorEstimator from .anovaglm import H2OANOVAGLMEstimator from .coxph import H2OCoxProportionalHazardsEstimator @@ -60,12 +61,13 @@ def create_estimator(algo, **params): __all__ = ( "create_estimator", - "H2OAggregatorEstimator", "H2OANOVAGLMEstimator", "H2OCoxProportionalHazardsEstimator", "H2ODecisionTreeEstimator", - "H2OAutoEncoderEstimator", "H2ODeepLearningEstimator", "H2OEstimator", "H2OExtendedIsolationForestEstimator", - "H2OGeneralizedAdditiveEstimator", "H2OGradientBoostingEstimator", "H2OGenericEstimator", - "H2OGeneralizedLinearEstimator", "H2OGeneralizedLowRankEstimator", "H2OInfogram", "H2OIsolationForestEstimator", - "H2OIsotonicRegressionEstimator", "H2OKMeansEstimator", "H2OModelSelectionEstimator", "H2ONaiveBayesEstimator", - "H2OPrincipalComponentAnalysisEstimator", "H2OSupportVectorMachineEstimator", "H2ORandomForestEstimator", - 
"H2ORuleFitEstimator", "H2OStackedEnsembleEstimator", "H2OSingularValueDecompositionEstimator", - "H2OTargetEncoderEstimator", "H2OUpliftRandomForestEstimator", "H2OWord2vecEstimator", "H2OXGBoostEstimator" + "H2OAdaBoostEstimator", "H2OAggregatorEstimator", "H2OANOVAGLMEstimator", "H2OCoxProportionalHazardsEstimator", + "H2ODecisionTreeEstimator", "H2OAutoEncoderEstimator", "H2ODeepLearningEstimator", "H2OEstimator", + "H2OExtendedIsolationForestEstimator", "H2OGeneralizedAdditiveEstimator", "H2OGradientBoostingEstimator", + "H2OGenericEstimator", "H2OGeneralizedLinearEstimator", "H2OGeneralizedLowRankEstimator", "H2OInfogram", + "H2OIsolationForestEstimator", "H2OIsotonicRegressionEstimator", "H2OKMeansEstimator", "H2OModelSelectionEstimator", + "H2ONaiveBayesEstimator", "H2OPrincipalComponentAnalysisEstimator", "H2OSupportVectorMachineEstimator", + "H2ORandomForestEstimator", "H2ORuleFitEstimator", "H2OStackedEnsembleEstimator", + "H2OSingularValueDecompositionEstimator", "H2OTargetEncoderEstimator", "H2OUpliftRandomForestEstimator", + "H2OWord2vecEstimator", "H2OXGBoostEstimator" ) diff --git a/h2o-py/h2o/estimators/adaboost.py b/h2o-py/h2o/estimators/adaboost.py new file mode 100644 index 000000000000..09495202531c --- /dev/null +++ b/h2o-py/h2o/estimators/adaboost.py @@ -0,0 +1,220 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- +# +# This file is auto-generated by h2o-3/h2o-bindings/bin/gen_python.py +# Copyright 2016 H2O.ai; Apache License Version 2.0 (see LICENSE for details) +# + +from h2o.estimators.estimator_base import H2OEstimator +from h2o.exceptions import H2OValueError +from h2o.frame import H2OFrame +from h2o.utils.typechecks import assert_is_type, Enum, numeric + + +class H2OAdaBoostEstimator(H2OEstimator): + """ + AdaBoost + + Builds an AdaBoost model + """ + + algo = "adaboost" + supervised_learning = True + + def __init__(self, + model_id=None, # type: Optional[Union[None, str, H2OEstimator]] + training_frame=None, # type: 
Optional[Union[None, str, H2OFrame]] + ignored_columns=None, # type: Optional[List[str]] + ignore_const_cols=True, # type: bool + categorical_encoding="auto", # type: Literal["auto", "enum", "one_hot_internal", "one_hot_explicit", "binary", "eigen", "label_encoder", "sort_by_response", "enum_limited"] + weights_column=None, # type: Optional[str] + nlearners=50, # type: int + weak_learner="auto", # type: Literal["auto", "drf", "glm", "gbm"] + learn_rate=0.5, # type: float + seed=-1, # type: int + ): + """ + :param model_id: Destination id for this model; auto-generated if not specified. + Defaults to ``None``. + :type model_id: Union[None, str, H2OEstimator], optional + :param training_frame: Id of the training data frame. + Defaults to ``None``. + :type training_frame: Union[None, str, H2OFrame], optional + :param ignored_columns: Names of columns to ignore for training. + Defaults to ``None``. + :type ignored_columns: List[str], optional + :param ignore_const_cols: Ignore constant columns. + Defaults to ``True``. + :type ignore_const_cols: bool + :param categorical_encoding: Encoding scheme for categorical features + Defaults to ``"auto"``. + :type categorical_encoding: Literal["auto", "enum", "one_hot_internal", "one_hot_explicit", "binary", "eigen", "label_encoder", + "sort_by_response", "enum_limited"] + :param weights_column: Column with observation weights. Giving some observation a weight of zero is equivalent + to excluding it from the dataset; giving an observation a relative weight of 2 is equivalent to repeating + that row twice. Negative weights are not allowed. Note: Weights are per-row observation weights and do + not increase the size of the data frame. This is typically the number of times a row is repeated, but + non-integer values are supported as well. During training, rows with higher weights matter more, due to + the larger loss function pre-factor. 
If you set weight = 0 for a row, the returned prediction frame at + that row is zero and this is incorrect. To get an accurate prediction, remove all rows with weight == 0. + Defaults to ``None``. + :type weights_column: str, optional + :param nlearners: Number of AdaBoost weak learners. + Defaults to ``50``. + :type nlearners: int + :param weak_learner: Choose a weak learner type. Defaults to AUTO, which means DRF. + Defaults to ``"auto"``. + :type weak_learner: Literal["auto", "drf", "glm", "gbm"] + :param learn_rate: Learning rate (from 0.0 to 1.0) + Defaults to ``0.5``. + :type learn_rate: float + :param seed: Seed for pseudo random number generator (if applicable) + Defaults to ``-1``. + :type seed: int + """ + super(H2OAdaBoostEstimator, self).__init__() + self._parms = {} + self._id = self._parms['model_id'] = model_id + self.training_frame = training_frame + self.ignored_columns = ignored_columns + self.ignore_const_cols = ignore_const_cols + self.categorical_encoding = categorical_encoding + self.weights_column = weights_column + self.nlearners = nlearners + self.weak_learner = weak_learner + self.learn_rate = learn_rate + self.seed = seed + + @property + def training_frame(self): + """ + Id of the training data frame. + + Type: ``Union[None, str, H2OFrame]``. + """ + return self._parms.get("training_frame") + + @training_frame.setter + def training_frame(self, training_frame): + self._parms["training_frame"] = H2OFrame._validate(training_frame, 'training_frame') + + @property + def ignored_columns(self): + """ + Names of columns to ignore for training. + + Type: ``List[str]``. + """ + return self._parms.get("ignored_columns") + + @ignored_columns.setter + def ignored_columns(self, ignored_columns): + assert_is_type(ignored_columns, None, [str]) + self._parms["ignored_columns"] = ignored_columns + + @property + def ignore_const_cols(self): + """ + Ignore constant columns. + + Type: ``bool``, defaults to ``True``. 
+ """ + return self._parms.get("ignore_const_cols") + + @ignore_const_cols.setter + def ignore_const_cols(self, ignore_const_cols): + assert_is_type(ignore_const_cols, None, bool) + self._parms["ignore_const_cols"] = ignore_const_cols + + @property + def categorical_encoding(self): + """ + Encoding scheme for categorical features + + Type: ``Literal["auto", "enum", "one_hot_internal", "one_hot_explicit", "binary", "eigen", "label_encoder", + "sort_by_response", "enum_limited"]``, defaults to ``"auto"``. + """ + return self._parms.get("categorical_encoding") + + @categorical_encoding.setter + def categorical_encoding(self, categorical_encoding): + assert_is_type(categorical_encoding, None, Enum("auto", "enum", "one_hot_internal", "one_hot_explicit", "binary", "eigen", "label_encoder", "sort_by_response", "enum_limited")) + self._parms["categorical_encoding"] = categorical_encoding + + @property + def weights_column(self): + """ + Column with observation weights. Giving some observation a weight of zero is equivalent to excluding it from the + dataset; giving an observation a relative weight of 2 is equivalent to repeating that row twice. Negative + weights are not allowed. Note: Weights are per-row observation weights and do not increase the size of the data + frame. This is typically the number of times a row is repeated, but non-integer values are supported as well. + During training, rows with higher weights matter more, due to the larger loss function pre-factor. If you set + weight = 0 for a row, the returned prediction frame at that row is zero and this is incorrect. To get an + accurate prediction, remove all rows with weight == 0. + + Type: ``str``. + """ + return self._parms.get("weights_column") + + @weights_column.setter + def weights_column(self, weights_column): + assert_is_type(weights_column, None, str) + self._parms["weights_column"] = weights_column + + @property + def nlearners(self): + """ + Number of AdaBoost weak learners. 
+ + Type: ``int``, defaults to ``50``. + """ + return self._parms.get("nlearners") + + @nlearners.setter + def nlearners(self, nlearners): + assert_is_type(nlearners, None, int) + self._parms["nlearners"] = nlearners + + @property + def weak_learner(self): + """ + Choose a weak learner type. Defaults to AUTO, which means DRF. + + Type: ``Literal["auto", "drf", "glm", "gbm"]``, defaults to ``"auto"``. + """ + return self._parms.get("weak_learner") + + @weak_learner.setter + def weak_learner(self, weak_learner): + assert_is_type(weak_learner, None, Enum("auto", "drf", "glm", "gbm")) + self._parms["weak_learner"] = weak_learner + + @property + def learn_rate(self): + """ + Learning rate (from 0.0 to 1.0) + + Type: ``float``, defaults to ``0.5``. + """ + return self._parms.get("learn_rate") + + @learn_rate.setter + def learn_rate(self, learn_rate): + assert_is_type(learn_rate, None, numeric) + self._parms["learn_rate"] = learn_rate + + @property + def seed(self): + """ + Seed for pseudo random number generator (if applicable) + + Type: ``int``, defaults to ``-1``. 
+ """ + return self._parms.get("seed") + + @seed.setter + def seed(self, seed): + assert_is_type(seed, None, int) + self._parms["seed"] = seed + + diff --git a/h2o-py/tests/testdir_algos/adaboost/pyunit_adaboost_saveload.py b/h2o-py/tests/testdir_algos/adaboost/pyunit_adaboost_saveload.py new file mode 100644 index 000000000000..9de757289e05 --- /dev/null +++ b/h2o-py/tests/testdir_algos/adaboost/pyunit_adaboost_saveload.py @@ -0,0 +1,38 @@ +import sys, os +sys.path.insert(1, os.path.join("..","..","..")) +import h2o +from tests import pyunit_utils +from h2o.estimators import H2OAdaBoostEstimator + + +def adaBoost_save_and_load(): + print("AdaBoost Save Load Test") + + train = h2o.import_file(pyunit_utils.locate("smalldata/prostate/prostate.csv")) + train["CAPSULE"] = train["CAPSULE"].asfactor() + + adaboost_model = H2OAdaBoostEstimator(nlearners=7, seed=12) + adaboost_model.train(training_frame=train, y="CAPSULE") + predict = adaboost_model.predict(train) + + path = pyunit_utils.locate("results") + + assert os.path.isdir(path), "Expected save directory {0} to exist, but it does not.".format(path) + model_path = h2o.save_model(adaboost_model, path=path, force=True) + + assert os.path.isfile(model_path), "Expected load file {0} to exist, but it does not.".format(model_path) + reloaded = h2o.load_model(model_path) + predict_reloaded = reloaded.predict(train) + + assert isinstance(reloaded, + H2OAdaBoostEstimator), \ + "Expected and H2OAdaBoostEstimator, but got {0}"\ + .format(reloaded) + + assert pyunit_utils.compare_frames_local(predict, predict_reloaded, returnResult=True) + + +if __name__ == "__main__": + pyunit_utils.standalone_test(adaBoost_save_and_load) +else: + adaBoost_save_and_load() diff --git a/h2o-py/tests/testdir_algos/adaboost/pyunit_adaboost_smoke.py b/h2o-py/tests/testdir_algos/adaboost/pyunit_adaboost_smoke.py new file mode 100644 index 000000000000..5e52a11f9859 --- /dev/null +++ b/h2o-py/tests/testdir_algos/adaboost/pyunit_adaboost_smoke.py @@ 
-0,0 +1,31 @@ +import sys, os +sys.path.insert(1, os.path.join("..","..","..")) +import h2o +from tests import pyunit_utils +from h2o.estimators import H2OAdaBoostEstimator + + +def adaboost(): + print("AdaBoost Smoke Test") + + train = h2o.import_file(pyunit_utils.locate("smalldata/prostate/prostate.csv")) + train["CAPSULE"] = train["CAPSULE"].asfactor() + + adaboost_model = H2OAdaBoostEstimator(nlearners=55, seed=0xBEEF, weak_learner="GLM", learn_rate=0.6) + adaboost_model.train(training_frame=train, y="CAPSULE") + predict = adaboost_model.predict(train) + + print("") + print(adaboost_model) + print("") + print(predict) + + assert 55 == adaboost_model._model_json["output"]["model_summary"]["number_of_weak_learners"][0], "Python API is not working!" + assert "GLM" == adaboost_model._model_json["output"]["model_summary"]["weak_learner"][0], "Python API is not working!" + assert 0.6 == adaboost_model._model_json["output"]["model_summary"]["learn_rate"][0], "Python API is not working!" + + +if __name__ == "__main__": + pyunit_utils.standalone_test(adaboost) +else: + adaboost() diff --git a/h2o-py/tests/testdir_sklearn/pyunit_sklearn_generic_all_estimators.py b/h2o-py/tests/testdir_sklearn/pyunit_sklearn_generic_all_estimators.py index c584ea25d6d1..c38651b8b69c 100644 --- a/h2o-py/tests/testdir_sklearn/pyunit_sklearn_generic_all_estimators.py +++ b/h2o-py/tests/testdir_sklearn/pyunit_sklearn_generic_all_estimators.py @@ -199,6 +199,7 @@ def make_tests(classifier): 'H2OWord2vecEstimator', # needs a separate test (requires pre_trained model as parameter) 'H2OUpliftRandomForestEstimator', # generic part is not implemented yet 'H2ODecisionTreeEstimator', # generic part is not implemented yet + 'H2OAdaBoostEstimator', # generic part is not implemented yet or test needs to be adjusted just for classification ] estimators = [cls for name, cls in inspect.getmembers(h2o.sklearn, inspect.isclass) if name.endswith('Estimator') and name not in ['H2OAutoMLEstimator'] + failing] 
diff --git a/h2o-py/tests/testdir_sklearn/pyunit_sklearn_regression_all_estimators.py b/h2o-py/tests/testdir_sklearn/pyunit_sklearn_regression_all_estimators.py index 0debe9691020..839733595178 100644 --- a/h2o-py/tests/testdir_sklearn/pyunit_sklearn_regression_all_estimators.py +++ b/h2o-py/tests/testdir_sklearn/pyunit_sklearn_regression_all_estimators.py @@ -138,7 +138,8 @@ def make_tests(classifier): 'H2OCoxProportionalHazardsRegressor', # doesn't support regression? 'H2OStackedEnsembleRegressor', # needs a separate test (requires models as parameters), 'H2OUpliftRandomForestRegressor', # does not support regression yet - 'H2ODecisionTreeRegressor' # does not support regression yet + 'H2ODecisionTreeRegressor', # does not support regression yet + 'H2OAdaBoostRegressor' # does not support regression yet ] regressors = [cls for name, cls in inspect.getmembers(h2o.sklearn, inspect.isclass) if name.endswith('Regressor') and name not in ['H2OAutoMLRegressor']+failing] diff --git a/h2o-py/tests_rest_smoke/testdir_multi_jvm/test_rest_api.py b/h2o-py/tests_rest_smoke/testdir_multi_jvm/test_rest_api.py index 1fec73fc1a1e..13c93693ae8e 100644 --- a/h2o-py/tests_rest_smoke/testdir_multi_jvm/test_rest_api.py +++ b/h2o-py/tests_rest_smoke/testdir_multi_jvm/test_rest_api.py @@ -23,7 +23,7 @@ algos = ['coxph', 'kmeans', 'deeplearning', 'drf', 'glm', 'gbm', 'pca', 'naivebayes', 'glrm', 'svd', 'isotonicregression', 'psvm', 'aggregator', 'word2vec', 'stackedensemble', 'xgboost', 'isolationforest', 'gam', 'generic', 'targetencoder', 'rulefit', 'extendedisolationforest', 'anovaglm', 'modelselection', - 'upliftdrf', 'infogram', 'dt'] + 'upliftdrf', 'infogram', 'dt', 'adaboost'] algo_additional_default_params = { 'grep' : { 'regex' : '.*' }, 'kmeans' : { 'k' : 2 }, diff --git a/h2o-r/H2O_Load.R b/h2o-r/H2O_Load.R index 96058f24465e..74af068e4558 100755 --- a/h2o-r/H2O_Load.R +++ b/h2o-r/H2O_Load.R @@ -17,7 +17,8 @@ function() { "edicts.R", "coxph.R", "coxphutils.R", "glm.R", "gam.R", 
"glrm.R", "pca.R", "kmeans.R", "gbm.R", "deeplearning.R", "naivebayes.R", "randomforest.R", "svd.R", "locate.R", "predict.R", "rulefit.R", "isolationforest.R", "psvm.R", "tf-idf.R", "permutation_varimp.R", "extendedisolationforest.R", - "anovaglm.R", "modelselection.R", "upliftrandomforest.R", "infogram.R", "admissibleml.R", "decisiontree.R") + "anovaglm.R", "modelselection.R", "upliftrandomforest.R", "infogram.R", "admissibleml.R", "decisiontree.R", + "adaBoost.R") require(jsonlite); require(RCurl) invisible(lapply(to_src,function(x){source(paste(FULL.PATH, x, sep = ""))})) } diff --git a/h2o-r/h2o-package/R/adaboost.R b/h2o-r/h2o-package/R/adaboost.R new file mode 100644 index 000000000000..526467977831 --- /dev/null +++ b/h2o-r/h2o-package/R/adaboost.R @@ -0,0 +1,170 @@ +# This file is auto-generated by h2o-3/h2o-bindings/bin/gen_R.py +# Copyright 2016 H2O.ai; Apache License Version 2.0 (see LICENSE for details) +#' +# -------------------------- AdaBoost Model -------------------------- # +#' +#' Build an AdaBoost model +#' +#' Builds an AdaBoost model on an H2OFrame. +#' +#' @param x (Optional) A vector containing the names or indices of the predictor variables to use in building the model. +#' If x is missing, then all columns except y are used. +#' @param y The name or column index of the response variable in the data. +#' The response must be either a numeric or a categorical/factor variable. +#' If the response is numeric, then a regression model will be trained, otherwise it will train a classification model. +#' @param training_frame Id of the training data frame. +#' @param model_id Destination id for this model; auto-generated if not specified. +#' @param ignore_const_cols \code{Logical}. Ignore constant columns. Defaults to TRUE. +#' @param categorical_encoding Encoding scheme for categorical features Must be one of: "AUTO", "Enum", "OneHotInternal", "OneHotExplicit", +#' "Binary", "Eigen", "LabelEncoder", "SortByResponse", "EnumLimited". 
Defaults to AUTO. +#' @param weights_column Column with observation weights. Giving some observation a weight of zero is equivalent to excluding it from +#' the dataset; giving an observation a relative weight of 2 is equivalent to repeating that row twice. Negative +#' weights are not allowed. Note: Weights are per-row observation weights and do not increase the size of the +#' data frame. This is typically the number of times a row is repeated, but non-integer values are supported as +#' well. During training, rows with higher weights matter more, due to the larger loss function pre-factor. If +#' you set weight = 0 for a row, the returned prediction frame at that row is zero and this is incorrect. To get +#' an accurate prediction, remove all rows with weight == 0. +#' @param nlearners Number of AdaBoost weak learners. Defaults to 50. +#' @param weak_learner Choose a weak learner type. Defaults to AUTO, which means DRF. Must be one of: "AUTO", "DRF", "GLM", "GBM". +#' Defaults to AUTO. +#' @param learn_rate Learning rate (from 0.0 to 1.0) Defaults to 0.5. +#' @param seed Seed for random numbers (affects certain parts of the algo that are stochastic and those might or might not be enabled by default). +#' Defaults to -1 (time-based random number). +#' @return Creates a \linkS4class{H2OModel} object of the right type. 
+#' @seealso \code{\link{predict.H2OModel}} for prediction
+#' @examples
+#' \dontrun{
+#' library(h2o)
+#' h2o.init()
+#'
+#' # Import the prostate dataset
+#' f <- "https://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv"
+#' data <- h2o.importFile(f)
+#'
+#' # Set predictors and response; set response as a factor
+#' data["CAPSULE"] <- as.factor(data["CAPSULE"])
+#' predictors <- c("AGE","RACE","DPROS","DCAPS","PSA","VOL","GLEASON")
+#' response <- "CAPSULE"
+#'
+#' # Train the AdaBoost model
+#' h2o_adaboost <- h2o.adaBoost(x = predictors, y = response, training_frame = data, seed = 1234)
+#' }
+#' @export
+h2o.adaBoost <- function(x,
+                         y,
+                         training_frame,
+                         model_id = NULL,
+                         ignore_const_cols = TRUE,
+                         categorical_encoding = c("AUTO", "Enum", "OneHotInternal", "OneHotExplicit", "Binary", "Eigen", "LabelEncoder", "SortByResponse", "EnumLimited"),
+                         weights_column = NULL,
+                         nlearners = 50,
+                         weak_learner = c("AUTO", "DRF", "GLM", "GBM"),
+                         learn_rate = 0.5,
+                         seed = -1)
+{
+  # Validate required training_frame first and other frame args: should be a valid key or an H2OFrame object
+  training_frame <- .validate.H2OFrame(training_frame, required=TRUE)
+
+  # Validate other required args
+  # If x is missing, then assume user wants to use all columns as features.
+ if (missing(x)) { + if (is.numeric(y)) { + x <- setdiff(col(training_frame), y) + } else { + x <- setdiff(colnames(training_frame), y) + } + } + + # Build parameter list to send to model builder + parms <- list() + parms$training_frame <- training_frame + args <- .verify_dataxy(training_frame, x, y) + parms$ignored_columns <- args$x_ignore + parms$response_column <- args$y + + if (!missing(model_id)) + parms$model_id <- model_id + if (!missing(ignore_const_cols)) + parms$ignore_const_cols <- ignore_const_cols + if (!missing(categorical_encoding)) + parms$categorical_encoding <- categorical_encoding + if (!missing(weights_column)) + parms$weights_column <- weights_column + if (!missing(nlearners)) + parms$nlearners <- nlearners + if (!missing(weak_learner)) + parms$weak_learner <- weak_learner + if (!missing(learn_rate)) + parms$learn_rate <- learn_rate + if (!missing(seed)) + parms$seed <- seed + + # Error check and build model + model <- .h2o.modelJob('adaboost', parms, h2oRestApiVersion=3, verbose=FALSE) + return(model) +} +.h2o.train_segments_adaboost <- function(x, + y, + training_frame, + ignore_const_cols = TRUE, + categorical_encoding = c("AUTO", "Enum", "OneHotInternal", "OneHotExplicit", "Binary", "Eigen", "LabelEncoder", "SortByResponse", "EnumLimited"), + weights_column = NULL, + nlearners = 50, + weak_learner = c("AUTO", "DRF", "GLM", "GBM"), + learn_rate = 0.5, + seed = -1, + segment_columns = NULL, + segment_models_id = NULL, + parallelism = 1) +{ + # formally define variables that were excluded from function parameters + model_id <- NULL + verbose <- NULL + destination_key <- NULL + # Validate required training_frame first and other frame args: should be a valid key or an H2OFrame object + training_frame <- .validate.H2OFrame(training_frame, required=TRUE) + + # Validate other required args + # If x is missing, then assume user wants to use all columns as features. 
+ if (missing(x)) { + if (is.numeric(y)) { + x <- setdiff(col(training_frame), y) + } else { + x <- setdiff(colnames(training_frame), y) + } + } + + # Build parameter list to send to model builder + parms <- list() + parms$training_frame <- training_frame + args <- .verify_dataxy(training_frame, x, y) + parms$ignored_columns <- args$x_ignore + parms$response_column <- args$y + + if (!missing(ignore_const_cols)) + parms$ignore_const_cols <- ignore_const_cols + if (!missing(categorical_encoding)) + parms$categorical_encoding <- categorical_encoding + if (!missing(weights_column)) + parms$weights_column <- weights_column + if (!missing(nlearners)) + parms$nlearners <- nlearners + if (!missing(weak_learner)) + parms$weak_learner <- weak_learner + if (!missing(learn_rate)) + parms$learn_rate <- learn_rate + if (!missing(seed)) + parms$seed <- seed + + # Build segment-models specific parameters + segment_parms <- list() + if (!missing(segment_columns)) + segment_parms$segment_columns <- segment_columns + if (!missing(segment_models_id)) + segment_parms$segment_models_id <- segment_models_id + segment_parms$parallelism <- parallelism + + # Error check and build segment models + segment_models <- .h2o.segmentModelsJob('adaboost', segment_parms, parms, h2oRestApiVersion=3) + return(segment_models) +} diff --git a/h2o-r/h2o-package/pkgdown/_pkgdown.yml b/h2o-r/h2o-package/pkgdown/_pkgdown.yml index 6e170838b4db..149fd0526bc7 100644 --- a/h2o-r/h2o-package/pkgdown/_pkgdown.yml +++ b/h2o-r/h2o-package/pkgdown/_pkgdown.yml @@ -38,6 +38,7 @@ reference: - h2o - h2o.abs - h2o.acos + - h2o.adaBoost - h2o.aggregated_frame - h2o.aggregator - h2o.aic diff --git a/h2o-r/scripts/h2o-r-test-setup.R b/h2o-r/scripts/h2o-r-test-setup.R index 9dd64ce0e762..61e5d8e919a4 100755 --- a/h2o-r/scripts/h2o-r-test-setup.R +++ b/h2o-r/scripts/h2o-r-test-setup.R @@ -187,7 +187,7 @@ function() { "coxph.R", "coxphutils.R", "gbm.R", "glm.R", "gam.R", "anovaglm.R", "glrm.R", "kmeans.R", "deeplearning.R", 
"randomforest.R", "generic.R", "naivebayes.R", "pca.R", "svd.R", "locate.R", "grid.R", "word2vec.R", "w2vutils.R", "stackedensemble.R", "rulefit.R", "modelselection.R", "predict.R", "xgboost.R", "isolationforest.R", "psvm.R", "segment.R", "tf-idf.R", "explain.R", "permutation_varimp.R", "extendedisolationforest.R", - "upliftrandomforest.R", "infogram.R", "isotonicregression.R", "admissibleml.R", "decisiontree.R") + "upliftrandomforest.R", "infogram.R", "isotonicregression.R", "admissibleml.R", "decisiontree.R", "adaboost.R") src_path <- paste(h2oRDir,"h2o-package","R",sep=.Platform$file.sep) invisible(lapply(to_src,function(x){source(paste(src_path, x, sep = .Platform$file.sep))})) diff --git a/h2o-r/tests/testdir_algos/adaboost/runit_adaboost_smoke.R b/h2o-r/tests/testdir_algos/adaboost/runit_adaboost_smoke.R new file mode 100644 index 000000000000..ca59093b6fbb --- /dev/null +++ b/h2o-r/tests/testdir_algos/adaboost/runit_adaboost_smoke.R @@ -0,0 +1,20 @@ +setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f"))) +source("../../../scripts/h2o-r-test-setup.R") + + + +test.adaBoost.smoke <- function() { + f <- "https://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv" + data <- h2o.importFile(f) + + # Set predictors and response; set response as a factor + data["CAPSULE"] <- as.factor(data["CAPSULE"]) + predictors <- c("AGE","RACE","DPROS","DCAPS","PSA","VOL","GLEASON") + response <- "CAPSULE" + + # Train the AdaBoost model + h2o_adaboost <- h2o.adaBoost(x = predictors, y = response, training_frame = data, seed = 1234) + expect_equal(is.null(h2o_adaboost), FALSE) +} + +doTest("adaBoost: Smoke Test", test.adaBoost.smoke)