diff --git a/h2o-algos/src/main/java/hex/adaboost/AdaBoost.java b/h2o-algos/src/main/java/hex/adaboost/AdaBoost.java
new file mode 100644
index 000000000000..b606027c6bfa
--- /dev/null
+++ b/h2o-algos/src/main/java/hex/adaboost/AdaBoost.java
@@ -0,0 +1,242 @@
+package hex.adaboost;
+
+import hex.Model;
+import hex.ModelBuilder;
+import hex.ModelCategory;
+import hex.glm.GLM;
+import hex.glm.GLMModel;
+import hex.tree.drf.DRF;
+import hex.tree.drf.DRFModel;
+import hex.tree.gbm.GBM;
+import hex.tree.gbm.GBMModel;
+import org.apache.log4j.Logger;
+import water.*;
+import water.exceptions.H2OModelBuilderIllegalArgumentException;
+import water.fvec.Frame;
+import water.fvec.Vec;
+import water.util.Timer;
+import water.util.TwoDimTable;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Implementation of the AdaBoost algorithm based on:
+ *
+ * Raul Rojas, "AdaBoost and the Super Bowl of Classifiers: A Tutorial Introduction to Adaptive Boosting"
+ * Alexandru Niculescu-Mizil and Richard A. Caruana, "Obtaining Calibrated Probabilities from Boosting"
+ * Y. Freund, R. Schapire, "A Decision-Theoretic Generalization of On-Line Learning and an Application to Boosting", 1995.
+ *
+ * @author Adam Valenta
+ */
+public class AdaBoost extends ModelBuilder<AdaBoostModel, AdaBoostModel.AdaBoostParameters, AdaBoostModel.AdaBoostOutput> {
+  private static final Logger LOG = Logger.getLogger(AdaBoost.class);
+  private static final int MAX_LEARNERS = 100_000;
+
+  private AdaBoostModel _model;
+  private String _weightsName = "weights";
+
+  // Called from an HTTP request
+  public AdaBoost(AdaBoostModel.AdaBoostParameters parms) {
+    super(parms);
+    init(false);
+  }
+
+  public AdaBoost(boolean startup_once) {
+    super(new AdaBoostModel.AdaBoostParameters(), startup_once);
+  }
+
+  @Override
+  public boolean havePojo() {
+    return false;
+  }
+
+  @Override
+  public boolean haveMojo() {
+    return false;
+  }
+
+  @Override
+  public void init(boolean expensive) {
+    super.init(expensive);
+    if (_parms._nlearners < 1 || _parms._nlearners > MAX_LEARNERS)
+      error("nlearners", "Parameter nlearners must be in interval [1, "
+              + MAX_LEARNERS + "] but it is " + _parms._nlearners);
+    if (_parms._weak_learner == AdaBoostModel.Algorithm.AUTO) {
+      _parms._weak_learner = AdaBoostModel.Algorithm.DRF;
+    }
+    if (_parms._weights_column != null) {
+      // _parms._weights_column cannot be used directly all the time since it breaks scoring
+      _weightsName = _parms._weights_column;
+    }
+    if (!(0. < _parms._learn_rate && _parms._learn_rate <= 1.0)) {
+      error("learn_rate", "learn_rate must be in (0, 1] but it is " + _parms._learn_rate);
+    }
+  }
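+
+  // Training (discrete AdaBoost, following the references above):
+  //   1. initialize every row weight to 1,
+  //   2. fit a weak learner to the weighted data,
+  //   3. compute its weighted error eM = We / W (CountWeTask),
+  //   4. record its vote alphaM = learn_rate * ln((1 - eM) / eM),
+  //   5. multiply the weights of misclassified rows by exp(alphaM) and of correctly
+  //      classified rows by exp(-alphaM) (UpdateWeightsTask), then repeat from 2.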
+
+  private class AdaBoostDriver extends Driver {
+
+    @Override
+    public void computeImpl() {
+      _model = null;
+      try {
+        init(true);
+        if (error_count() > 0) {
+          throw H2OModelBuilderIllegalArgumentException.makeFromBuilder(AdaBoost.this);
+        }
+        _model = new AdaBoostModel(dest(), _parms,
+                new AdaBoostModel.AdaBoostOutput(AdaBoost.this));
+        _model.delete_and_lock(_job);
+        buildAdaboost();
+        LOG.info(_model.toString());
+      } finally {
+        if (_model != null)
+          _model.unlock(_job);
+      }
+    }
+
+    private void buildAdaboost() {
+      _model._output.alphas = new double[_parms._nlearners];
+      _model._output.models = new Key[_parms._nlearners];
+
+      Frame trainWithWeights;
+      if (_parms._weights_column == null) {
+        trainWithWeights = new Frame(train());
+        Vec weights = trainWithWeights.anyVec().makeCons(1, 1, null, null)[0];
+        _weightsName = trainWithWeights.uniquify(_weightsName); // make sure we do not accidentally shadow a column already present in the train frame
+        trainWithWeights.add(_weightsName, weights);
+        DKV.put(trainWithWeights);
+        Scope.track(weights);
+      } else {
+        trainWithWeights = _parms.train();
+      }
+
+      for (int n = 0; n < _parms._nlearners; n++) {
+        Timer timer = new Timer();
+        ModelBuilder job = chooseWeakLearner(trainWithWeights);
+        job._parms._seed += n;
+        Model model = (Model) job.trainModel().get();
+        DKV.put(model);
+        Scope.untrack(model._key);
+        _model._output.models[n] = model._key;
+        Frame predictions = model.score(trainWithWeights);
+        Scope.track(predictions);
+
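+        // Weighted error of the n-th learner: eM = We / W, where W sums all row weights and We
+        // sums the weights of misclassified rows (CountWeTask). Its vote in the final ensemble,
+        // alphaM = learn_rate * ln((1 - eM) / eM), is positive while eM < 0.5 and grows as the
+        // learner gets more accurate.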
+        CountWeTask countWe = new CountWeTask().doAll(trainWithWeights.vec(_weightsName), trainWithWeights.vec(_parms._response_column), predictions.vec("predict"));
+        double eM = countWe.We / countWe.W;
+        double alphaM = _parms._learn_rate * Math.log((1 - eM) / eM);
+        _model._output.alphas[n] = alphaM;
+
+        UpdateWeightsTask updateWeightsTask = new UpdateWeightsTask(alphaM);
+        updateWeightsTask.doAll(trainWithWeights.vec(_weightsName), trainWithWeights.vec(_parms._response_column), predictions.vec("predict"));
+        _job.update(1);
+        _model.update(_job);
+        LOG.info((n + 1) + ". estimator was built in " + timer.toString());
+        LOG.info("*********************************************************************");
+      }
+      if (trainWithWeights != _parms.train()) {
+        DKV.remove(trainWithWeights._key);
+      }
+      _model._output._model_summary = createModelSummaryTable();
+    }
+  }
+
+  @Override
+  protected Driver trainModelImpl() {
+    return new AdaBoostDriver();
+  }
+
+  @Override
+  public BuilderVisibility builderVisibility() {
+    return BuilderVisibility.Experimental;
+  }
+
+  @Override
+  public ModelCategory[] can_build() {
+    return new ModelCategory[]{
+            ModelCategory.Binomial,
+    };
+  }
+
+  @Override
+  public boolean isSupervised() {
+    return true;
+  }
+
+  private ModelBuilder chooseWeakLearner(Frame frame) {
+    switch (_parms._weak_learner) {
+      case GLM:
+        return getGLMWeakLearner(frame);
+      case GBM:
+        return getGBMWeakLearner(frame);
+      default:
+      case DRF:
+        return getDRFWeakLearner(frame);
+    }
+  }
+
+  private DRF getDRFWeakLearner(Frame frame) {
+    DRFModel.DRFParameters parms = new DRFModel.DRFParameters();
+    parms._train = frame._key;
+    parms._response_column = _parms._response_column;
+    parms._weights_column = _weightsName;
+    // a single decision stump: one tree of depth 1, splitting on one randomly chosen column
+    parms._mtries = 1;
+    parms._min_rows = 1;
+    parms._ntrees = 1;
+    parms._sample_rate = 1;
+    parms._max_depth = 1;
+    parms._seed = _parms._seed;
+    return new DRF(parms);
+  }
+
+  private GLM getGLMWeakLearner(Frame frame) {
+    GLMModel.GLMParameters parms = new GLMModel.GLMParameters();
+    parms._train = frame._key;
+    parms._response_column = _parms._response_column;
+    parms._weights_column = _weightsName;
+    parms._seed = _parms._seed;
+    return new GLM(parms);
+  }
+
+  private GBM getGBMWeakLearner(Frame frame) {
+    GBMModel.GBMParameters parms = new GBMModel.GBMParameters();
+    parms._train = frame._key;
+    parms._response_column = _parms._response_column;
+    parms._weights_column = _weightsName;
+    // a single depth-1 tree, analogous to the DRF stump above
+    parms._min_rows = 1;
+    parms._ntrees = 1;
+    parms._sample_rate = 1;
+    parms._max_depth = 1;
+    parms._seed = _parms._seed;
+    return new GBM(parms);
+  }
+
+  public TwoDimTable createModelSummaryTable() {
+    List<String> colHeaders = new ArrayList<>();
+    List<String> colTypes = new ArrayList<>();
+    List<String> colFormat = new ArrayList<>();
+
+    colHeaders.add("Number of weak learners"); colTypes.add("int"); colFormat.add("%d");
+    colHeaders.add("Learn rate"); colTypes.add("double"); colFormat.add("%.5f");
+    colHeaders.add("Weak learner"); colTypes.add("string"); colFormat.add("%s");
+    colHeaders.add("Seed"); colTypes.add("long"); colFormat.add("%d");
+
+    final int rows = 1;
+    TwoDimTable table = new TwoDimTable(
+            "Model Summary", null,
+            new String[rows],
+            colHeaders.toArray(new String[0]),
+            colTypes.toArray(new String[0]),
+            colFormat.toArray(new String[0]),
+            "");
+    int row = 0;
+    int col = 0;
+    table.set(row, col++, _parms._nlearners);
+    table.set(row, col++, _parms._learn_rate);
+    table.set(row, col++, _parms._weak_learner.toString());
+    table.set(row, col, _parms._seed);
+    return table;
+  }
+
+}
diff --git a/h2o-algos/src/main/java/hex/adaboost/AdaBoostModel.java b/h2o-algos/src/main/java/hex/adaboost/AdaBoostModel.java
new file mode 100644
index 000000000000..24d848b27b16
--- /dev/null
+++ b/h2o-algos/src/main/java/hex/adaboost/AdaBoostModel.java
@@ -0,0 +1,133 @@
+package hex.adaboost;
+
+import hex.Model;
+import hex.ModelCategory;
+import hex.ModelMetrics;
+import hex.ModelMetricsBinomial;
+import org.apache.log4j.Logger;
+import water.*;
+
+public class AdaBoostModel extends Model<AdaBoostModel, AdaBoostModel.AdaBoostParameters, AdaBoostModel.AdaBoostOutput> {
+  private static final Logger LOG = Logger.getLogger(AdaBoostModel.class);
+
+  public enum Algorithm {DRF,
GLM, GBM, AUTO}
+
+  public AdaBoostModel(Key<AdaBoostModel> selfKey, AdaBoostParameters parms,
+                       AdaBoostOutput output) {
+    super(selfKey, parms, output);
+  }
+
+  @Override
+  public ModelMetrics.MetricBuilder makeMetricBuilder(String[] domain) {
+    if (_output.getModelCategory() == ModelCategory.Binomial) {
+      return new ModelMetricsBinomial.MetricBuilderBinomial(domain);
+    }
+    throw H2O.unimpl("AdaBoost currently supports only binary classification");
+  }
+
+  @Override
+  protected String[] makeScoringNames() {
+    return new String[]{"predict", "p0", "p1"};
+  }
+
+  @Override
+  protected double[] score0(double[] data, double[] preds) {
+    double alphas0 = 0;
+    double alphas1 = 0;
+    double linearCombination = 0;
+    for (int i = 0; i < _output.alphas.length; i++) {
+      Model model = DKV.getGet(_output.models[i]);
+      if (model.score(data) == 0) {
+        linearCombination += _output.alphas[i] * -1;
+        alphas0 += _output.alphas[i];
+      } else {
+        linearCombination += _output.alphas[i];
+        alphas1 += _output.alphas[i];
+      }
+    }
+    // the label is the alpha-weighted majority vote of the weak learners ...
+    preds[0] = alphas0 > alphas1 ? 0 : 1;
+    // ... and p1 squashes the signed weighted vote through a logistic transform
+    // (cf. the calibration paper by Niculescu-Mizil and Caruana cited in AdaBoost.java)
+    preds[2] = 1 / (1 + Math.exp(-2 * linearCombination));
+    preds[1] = 1 - preds[2];
+    return preds;
+  }
+
+  @Override protected boolean needsPostProcess() { return false; /* pred[0] is already set by score0 */ }
+
+  public static class AdaBoostOutput extends Model.Output {
+    public double[] alphas;
+    public Key[] models;
+
+    public AdaBoostOutput(AdaBoost adaBoostModel) {
+      super(adaBoostModel);
+    }
+  }
+
+  @Override
+  protected Futures remove_impl(Futures fs, boolean cascade) {
+    for (Key modelKey : _output.models) {
+      Keyed.remove(modelKey, fs, true);
+    }
+    return super.remove_impl(fs, cascade);
+  }
+
+  @Override
+  protected AutoBuffer writeAll_impl(AutoBuffer ab) {
+    for (Key modelKey : _output.models) {
+      ab.putKey(modelKey);
+    }
+    return super.writeAll_impl(ab);
+  }
+
+  @Override
+  protected Keyed readAll_impl(AutoBuffer ab, Futures fs) {
+    for (Key modelKey : _output.models) {
+      ab.getKey(modelKey, fs);
+    }
+    return super.readAll_impl(ab, fs);
+  }
+
+  public static class AdaBoostParameters extends Model.Parameters {
+
+    /**
+     * Number of weak learners to train. Defaults to 50.
+     */
+    public int _nlearners;
+
+    /**
+     * Choose a weak learner type. Defaults to DRF.
+     */
+    public Algorithm _weak_learner;
+
+    /**
+     * Specify how quickly the training converges. Number in (0,1]. Defaults to 0.5.
+ */ + public double _learn_rate; + + @Override + public String algoName() { + return "AdaBoost"; + } + + @Override + public String fullName() { + return "AdaBoost"; + } + + @Override + public String javaName() { + return AdaBoostModel.class.getName(); + } + + @Override + public long progressUnits() { + return _nlearners; + } + + public AdaBoostParameters() { + super(); + _nlearners = 50; + _weak_learner = Algorithm.AUTO; + _learn_rate = 0.5; + } + } +} diff --git a/h2o-algos/src/main/java/hex/adaboost/CountWeTask.java b/h2o-algos/src/main/java/hex/adaboost/CountWeTask.java new file mode 100644 index 000000000000..0a763ca64f2d --- /dev/null +++ b/h2o-algos/src/main/java/hex/adaboost/CountWeTask.java @@ -0,0 +1,29 @@ +package hex.adaboost; + +import water.MRTask; +import water.fvec.Chunk; + +/** + * Count sum of all weights and sum of bad predicted weights for AdaBoost purpose + */ +class CountWeTask extends MRTask { + double W = 0; + double We = 0; + + @Override + public void map(Chunk weights, Chunk response, Chunk predict) { + for (int row = 0; row < weights._len; row++) { + double weight = weights.atd(row); + W += weight; + if (response.at8(row) != predict.at8(row)) { + We += weight; + } + } + } + + @Override + public void reduce(CountWeTask mrt) { + W += mrt.W; + We += mrt.We; + } +} diff --git a/h2o-algos/src/main/java/hex/adaboost/UpdateWeightsTask.java b/h2o-algos/src/main/java/hex/adaboost/UpdateWeightsTask.java new file mode 100644 index 000000000000..e3d4f0d08cee --- /dev/null +++ b/h2o-algos/src/main/java/hex/adaboost/UpdateWeightsTask.java @@ -0,0 +1,29 @@ +package hex.adaboost; + +import water.MRTask; +import water.fvec.Chunk; + +/** + * Update weights according to AdaBoost algorithm + */ +class UpdateWeightsTask extends MRTask { + double expAm; + double expAmInverse; + + public UpdateWeightsTask(double alphaM) { + expAm = Math.exp(alphaM); + expAmInverse = Math.exp(-alphaM); + } + + @Override + public void map(Chunk weights, Chunk response, Chunk predict) { + for (int row = 0; row < weights._len; row++) { + double weight = weights.atd(row); + if (response.at8(row) != predict.at8(row)) { + weights.set(row, weight * expAm); + } else { + weights.set(row, weight * expAmInverse); + } + } + } +} diff --git a/h2o-algos/src/main/java/hex/api/RegisterAlgos.java b/h2o-algos/src/main/java/hex/api/RegisterAlgos.java index be7e255f35d3..b31bbcc39834 100644 --- a/h2o-algos/src/main/java/hex/api/RegisterAlgos.java +++ b/h2o-algos/src/main/java/hex/api/RegisterAlgos.java @@ -38,7 +38,8 @@ public void registerEndPoints(RestApiContext context) { new hex.tree.uplift.UpliftDRF (true), new hex.modelselection.ModelSelection (true), new hex.isotonic .IsotonicRegression(true), - new hex.tree.dt .DT (true) + new hex.tree.dt .DT (true), + new hex.adaboost. 
AdaBoost (true) }; // "Word2Vec", "Example", "Grep" diff --git a/h2o-algos/src/main/java/hex/glm/GLMModel.java b/h2o-algos/src/main/java/hex/glm/GLMModel.java index 3a221a7c7669..bfb94d3966c3 100755 --- a/h2o-algos/src/main/java/hex/glm/GLMModel.java +++ b/h2o-algos/src/main/java/hex/glm/GLMModel.java @@ -2087,6 +2087,12 @@ public TwoDimTable generateSummaryHGLM(Key train, int iter){ } @Override protected boolean needsPostProcess() { return false; /* pred[0] is already set by score0 */ } + @Override + public double score(double[] data) { + double[] pred = score0(data, new double[_output.nclasses() + 1], 0); + return pred[0]; + } + @Override protected void toJavaPredictBody(SBPrintStream body, CodeGeneratorPipeline classCtx, CodeGeneratorPipeline fileCtx, diff --git a/h2o-algos/src/main/java/hex/schemas/AdaBoostModelV3.java b/h2o-algos/src/main/java/hex/schemas/AdaBoostModelV3.java new file mode 100644 index 000000000000..9229af5dc9ab --- /dev/null +++ b/h2o-algos/src/main/java/hex/schemas/AdaBoostModelV3.java @@ -0,0 +1,30 @@ +package hex.schemas; + +import hex.adaboost.AdaBoostModel; +import water.api.schemas3.ModelOutputSchemaV3; +import water.api.schemas3.ModelSchemaV3; + +public class AdaBoostModelV3 extends ModelSchemaV3 { + + public static final class AdaBoostModelOutputV3 extends ModelOutputSchemaV3 { + // nothing + } + + public AdaBoostV3.AdaBoostParametersV3 createParametersSchema() { return new AdaBoostV3.AdaBoostParametersV3(); } + public AdaBoostModelOutputV3 createOutputSchema() { return new AdaBoostModelOutputV3(); } + + //========================== + // Custom adapters go here + + // Version&Schema-specific filling into the impl + @Override public AdaBoostModel createImpl() { + AdaBoostV3.AdaBoostParametersV3 p = this.parameters; + AdaBoostModel.AdaBoostParameters parms = p.createImpl(); + return new AdaBoostModel( model_id.key(), parms, new AdaBoostModel.AdaBoostOutput(null) ); + } +} diff --git a/h2o-algos/src/main/java/hex/schemas/AdaBoostV3.java b/h2o-algos/src/main/java/hex/schemas/AdaBoostV3.java new file mode 100644 index 000000000000..1a1edb52189c --- /dev/null +++ b/h2o-algos/src/main/java/hex/schemas/AdaBoostV3.java @@ -0,0 +1,41 @@ +package hex.schemas; + +import hex.adaboost.AdaBoost; +import hex.adaboost.AdaBoostModel; +import water.api.API; +import water.api.schemas3.ModelParametersSchemaV3; + +public class AdaBoostV3 extends ModelBuilderSchema< + AdaBoost, + AdaBoostV3, + AdaBoostV3.AdaBoostParametersV3> { + + public static final class AdaBoostParametersV3 extends ModelParametersSchemaV3 { + static public String[] fields = new String[]{ + "model_id", + "training_frame", + "ignored_columns", + "ignore_const_cols", + "categorical_encoding", + "weights_column", + + // AdaBoost specific + "nlearners", + "weak_learner", + "learn_rate", + "seed", + }; + + @API(help = "Number of AdaBoost weak learners.", gridable = true) + public int nlearners; + + @API(help = "Choose a weak learner type. 
Defaults to AUTO, which means DRF.", gridable = true, values = {"AUTO", "DRF", "GLM", "GBM"}) + public AdaBoostModel.Algorithm weak_learner; + + @API(help="Learning rate (from 0.0 to 1.0)", gridable = true) + public double learn_rate; + + @API(help = "Seed for pseudo random number generator (if applicable)", gridable = true) + public long seed; + } +} diff --git a/h2o-algos/src/main/java/hex/tree/drf/DRFModel.java b/h2o-algos/src/main/java/hex/tree/drf/DRFModel.java index 578e8fb2d045..73b63dd37516 100644 --- a/h2o-algos/src/main/java/hex/tree/drf/DRFModel.java +++ b/h2o-algos/src/main/java/hex/tree/drf/DRFModel.java @@ -7,6 +7,7 @@ import water.Key; import water.fvec.Frame; import water.fvec.NewChunk; +import water.util.ArrayUtils; import water.util.MathUtils; public class DRFModel extends SharedTreeModelWithContributions { @@ -100,6 +101,13 @@ protected ScoreContributionsTask getScoreContributionsSoringTask(SharedTreeModel return preds; } + @Override + public double score(double[] data) { + double[] pred = score0(data, new double[_output.nclasses() + 1], 0, _output._ntrees); + score0PostProcessSupervised(pred, data); + return pred[0]; + } + @Override protected SharedTreePojoWriter makeTreePojoWriter() { CompressedForest compressedForest = new CompressedForest(_output._treeKeys, _output._domains); diff --git a/h2o-algos/src/main/java/hex/tree/gbm/GBMModel.java b/h2o-algos/src/main/java/hex/tree/gbm/GBMModel.java index 4598e66c664e..8eee5526ac59 100755 --- a/h2o-algos/src/main/java/hex/tree/gbm/GBMModel.java +++ b/h2o-algos/src/main/java/hex/tree/gbm/GBMModel.java @@ -378,4 +378,11 @@ public void map(Chunk[] chk, NewChunk[] nchk) { }.withPostMapAction(JobUpdatePostMap.forJob(j)).doAll(types, vs).outputFrame(destination_key, names, domains); } + @Override + public double score(double[] data) { + double[] pred = score0(data, new double[_output.nclasses() + 1], 0, _output._ntrees); + score0PostProcessSupervised(pred, data); + return pred[0]; + } + } diff --git a/h2o-algos/src/main/resources/META-INF/services/water.api.Schema b/h2o-algos/src/main/resources/META-INF/services/water.api.Schema index b23d07eec489..18c3ea3dc972 100644 --- a/h2o-algos/src/main/resources/META-INF/services/water.api.Schema +++ b/h2o-algos/src/main/resources/META-INF/services/water.api.Schema @@ -114,3 +114,7 @@ hex.schemas.UpliftDRFModelV3 hex.schemas.UpliftDRFModelV3$UpliftDRFModelOutputV3 hex.schemas.UpliftDRFV3 hex.schemas.UpliftDRFV3$UpliftDRFParametersV3 +hex.schemas.AdaBoostModelV3 +hex.schemas.AdaBoostModelV3$AdaBoostModelOutputV3 +hex.schemas.AdaBoostV3 +hex.schemas.AdaBoostV3$AdaBoostParametersV3 diff --git a/h2o-algos/src/test/java/hex/adaboost/AdaBoostTest.java b/h2o-algos/src/test/java/hex/adaboost/AdaBoostTest.java new file mode 100644 index 000000000000..a5b2aef87979 --- /dev/null +++ b/h2o-algos/src/test/java/hex/adaboost/AdaBoostTest.java @@ -0,0 +1,491 @@ +package hex.adaboost; + +import hex.Model; +import hex.genmodel.algos.tree.SharedTreeSubgraph; +import hex.tree.drf.DRFModel; +import org.junit.Before; +import org.junit.Rule; +import org.junit.Test; +import org.junit.contrib.java.lang.system.EnvironmentVariables; +import org.junit.runner.RunWith; +import water.DKV; +import water.Scope; +import water.TestUtil; +import water.fvec.Frame; +import water.fvec.TestFrameBuilder; +import water.fvec.Vec; +import water.runner.CloudSize; +import water.runner.H2ORunner; +import water.util.FrameUtils; + +import java.io.File; +import java.util.Arrays; + +import static org.junit.Assert.*; + +@CloudSize(1) 
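+// NOTE: tests parse datasets from smalldata/bigdata; beforeClass() points H2O_FILES_SEARCH_PATH at the repository root so those paths resolve.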
+@RunWith(H2ORunner.class) +public class AdaBoostTest extends TestUtil { + + @Rule + public EnvironmentVariables environmentVariables = new EnvironmentVariables(); + + @Before + public void beforeClass() { + final File h2oHomeDir = new File(System.getProperty("user.dir")).getParentFile(); + environmentVariables.set("H2O_FILES_SEARCH_PATH", h2oHomeDir.getAbsolutePath()); + } + + @Test + public void testBasicTrain() { + try { + Scope.enter(); + Frame train = parseTestFile("smalldata/prostate/prostate.csv"); + String response = "CAPSULE"; + int nlearners = 50; + train.toCategoricalCol(response); + Scope.track(train); + AdaBoostModel.AdaBoostParameters p = new AdaBoostModel.AdaBoostParameters(); + p._train = train._key; + p._seed = 0xDECAF; + p._nlearners = nlearners; + p._response_column = response; + + AdaBoost adaBoost = new AdaBoost(p); + AdaBoostModel adaBoostModel = adaBoost.trainModel().get(); + Scope.track_generic(adaBoostModel); + assertNotNull(adaBoostModel); + assertEquals("Model should contain all the weak learners", nlearners, adaBoostModel._output.models.length); + + for (int i = 0; i < adaBoostModel._output.models.length; i++) { + System.out.println("Tree = " + i); + DRFModel drfModel = DKV.getGet(adaBoostModel._output.models[i]); + SharedTreeSubgraph tree = drfModel.getSharedTreeSubgraph(0,0); + if (tree.rootNode.getColName() == null) { + // FIXME - why are some of the trees empty? Are all of the columns bad for split? + System.out.println(" Empty tree"); + continue; + } + System.out.println(" Root = " + tree.rootNode.getColName() + " " + tree.rootNode.getSplitValue()); + System.out.println(" Left = " + tree.rootNode.getLeftChild().isLeaf() + " " + tree.rootNode.getLeftChild().getPredValue()); + System.out.println(" Right = " + tree.rootNode.getRightChild().isLeaf() + " " + tree.rootNode.getRightChild().getPredValue()); + assertNotNull(tree.rootNode.getColName()); + assertTrue(tree.rootNode.getLeftChild().isLeaf()); + assertTrue(tree.rootNode.getRightChild().isLeaf()); + } + } finally { + Scope.exit(); + } + } + + @Test + public void testBasicTrainGLM() { + try { + Scope.enter(); + Frame train = parseTestFile("smalldata/prostate/prostate.csv"); + Scope.track(train); + String response = "CAPSULE"; + train.toCategoricalCol(response); + AdaBoostModel.AdaBoostParameters p = new AdaBoostModel.AdaBoostParameters(); + p._train = train._key; + p._seed = 0xDECAF; + p._nlearners = 50; + p._weak_learner = AdaBoostModel.Algorithm.GLM; + p._response_column = response; + + AdaBoost adaBoost = new AdaBoost(p); + AdaBoostModel adaBoostModel = adaBoost.trainModel().get(); + Scope.track_generic(adaBoostModel); + assertNotNull(adaBoostModel); + } finally { + Scope.exit(); + } + } + + @Test + public void testBasicTrainLarge() { + try { + Scope.enter(); + Frame train = parseTestFile("bigdata/laptop/creditcardfraud/creditcardfraud.csv"); + Scope.track(train); + String response = "Class"; + train.toCategoricalCol(response); + AdaBoostModel.AdaBoostParameters p = new AdaBoostModel.AdaBoostParameters(); + p._train = train._key; + p._seed = 0xDECAF; + p._nlearners = 50; + p._response_column = response; + + AdaBoost adaBoost = new AdaBoost(p); + AdaBoostModel adaBoostModel = adaBoost.trainModel().get(); + Scope.track_generic(adaBoostModel); + assertNotNull(adaBoostModel); + } finally { + Scope.exit(); + } + } + + @Test + public void testBasicTrainAndScore() { + try { + Scope.enter(); + Frame train = parseTestFile("smalldata/prostate/prostate.csv"); + Scope.track(train); + String response = "CAPSULE"; + 
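// AdaBoost can build only binomial models, so the 0/1 response must be converted to a categorical column first
+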
train.toCategoricalCol(response); + AdaBoostModel.AdaBoostParameters p = new AdaBoostModel.AdaBoostParameters(); + p._train = train._key; + p._seed = 0xDECAF; + p._nlearners = 50; + p._response_column = response; + + AdaBoost adaBoost = new AdaBoost(p); + AdaBoostModel adaBoostModel = adaBoost.trainModel().get(); + Scope.track_generic(adaBoostModel); + assertNotNull(adaBoostModel); + + System.out.println("train.toTwoDimTable() = " + train.toTwoDimTable()); + + Frame score = adaBoostModel.score(train); + Scope.track(score); + } finally { + Scope.exit(); + } + } + + @Test + public void testBasicTrainAndScoreCategorical() { + try { + Scope.enter(); + Frame train = parseTestFile("smalldata/prostate/prostate.csv"); + Scope.track(train); + String response = "CAPSULE"; + train.toCategoricalCol(response); + train.toCategoricalCol("RACE"); + train.toCategoricalCol("DPROS"); + train.toCategoricalCol("DCAPS"); + train.toCategoricalCol("GLEASON"); + AdaBoostModel.AdaBoostParameters p = new AdaBoostModel.AdaBoostParameters(); + p._train = train._key; + p._seed = 0xDECAF; + p._nlearners = 50; + p._response_column = response; + p._categorical_encoding = Model.Parameters.CategoricalEncodingScheme.OneHotExplicit; + + AdaBoost adaBoost = new AdaBoost(p); + AdaBoostModel adaBoostModel = adaBoost.trainModel().get(); + Scope.track_generic(adaBoostModel); + assertNotNull(adaBoostModel); + + System.out.println("train.toTwoDimTable() = " + train.toTwoDimTable()); + + Frame score = adaBoostModel.score(train); + Scope.track(score); + } finally { + Scope.exit(); + } + } + + @Test + public void testBasicTrainAndScoreLarge() { + try { + Scope.enter(); + Frame train = parseTestFile("bigdata/laptop/creditcardfraud/creditcardfraud.csv"); + Scope.track(train); + String response = "Class"; + train.toCategoricalCol(response); + AdaBoostModel.AdaBoostParameters p = new AdaBoostModel.AdaBoostParameters(); + p._train = train._key; + p._seed = 0xDECAF; + p._nlearners = 50; + p._response_column = response; + + AdaBoost adaBoost = new AdaBoost(p); + AdaBoostModel adaBoostModel = adaBoost.trainModel().get(); + Scope.track_generic(adaBoostModel); + assertNotNull(adaBoostModel); + + Frame score = adaBoostModel.score(train); + Scope.track(score); + } finally { + Scope.exit(); + } + } + + @Test + public void testBasicTrainAirlines() { + try { + Scope.enter(); + Frame train = parseTestFile("smalldata/testng/airlines_train_preprocessed.csv"); + Scope.track(train); + Frame test = parseTestFile("smalldata/testng/airlines_test_preprocessed.csv"); + Scope.track(test); + String response = "IsDepDelayed"; + train.toCategoricalCol(response); + test.toCategoricalCol(response); + AdaBoostModel.AdaBoostParameters p = new AdaBoostModel.AdaBoostParameters(); + p._train = train._key; + p._seed = 0xDECAF; + p._nlearners = 50; + p._response_column = response; + + AdaBoost adaBoost = new AdaBoost(p); + AdaBoostModel adaBoostModel = adaBoost.trainModel().get(); + Scope.track_generic(adaBoostModel); + assertNotNull(adaBoostModel); + + Frame score = adaBoostModel.score(test); + Scope.track(score); + } finally { + Scope.exit(); + } + } + + @Test + public void testBasicTrainHiggs() { + try { + Scope.enter(); + Frame train = parseTestFile("smalldata/higgs/higgs_train_5k.csv"); + Scope.track(train); + Frame test = parseTestFile("smalldata/higgs/higgs_test_5k.csv"); + Scope.track(test); + String response = "response"; + train.toCategoricalCol(response); + test.toCategoricalCol(response); + AdaBoostModel.AdaBoostParameters p = new 
AdaBoostModel.AdaBoostParameters(); + p._train = train._key; + p._seed = 0xDECAF; + p._nlearners = 50; + p._response_column = response; + + AdaBoost adaBoost = new AdaBoost(p); + AdaBoostModel adaBoostModel = adaBoost.trainModel().get(); + Scope.track_generic(adaBoostModel); + assertNotNull(adaBoostModel); + + Frame score = adaBoostModel.score(test); + Scope.track(score); + } finally { + Scope.exit(); + } + } + + @Test + public void testCountWe() { + Scope.enter(); + try { + Frame train = new TestFrameBuilder() + .withVecTypes(Vec.T_NUM, Vec.T_CAT, Vec.T_CAT) + .withDataForCol(0, ard(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0)) + .withDataForCol(1, ar("0", "0", "0", "0", "0", "1", "1", "1", "1", "1")) + .withDataForCol(2, ar("1", "1", "1", "1", "1", "0", "0", "0", "0", "0")) + .build(); + train = ensureDistributed(train); + Scope.track(train); + + CountWeTask countWeTask = new CountWeTask().doAll(train); + assertEquals("Sum of weights is not correct",10, countWeTask.W, 0); + assertEquals("Sum of error weights is not correct",10, countWeTask.We, 0); + } finally { + Scope.exit(); + } + } + + @Test + public void testUpdateWeights() { + Scope.enter(); + try { + Frame train = new TestFrameBuilder() + .withVecTypes(Vec.T_NUM, Vec.T_CAT, Vec.T_CAT) + .withDataForCol(0, ard(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0)) + .withDataForCol(1, ar("1", "0", "0", "0", "0", "1", "1", "1", "1", "1")) + .withDataForCol(2, ar("1", "1", "1", "1", "1", "0", "0", "0", "0", "0")) + .build(); + train = ensureDistributed(train); + Scope.track(train); + + double alpha = 2; + UpdateWeightsTask updateWeightsTask = new UpdateWeightsTask(alpha); + updateWeightsTask.doAll(train); + + Vec weightsExpected = Vec.makeCon(Math.exp(alpha),train.numRows()); + weightsExpected.set(0, Math.exp(-alpha)); + System.out.println("weights = "); + System.out.println(new Frame(train.vec(0)).toTwoDimTable(0, (int) train.numRows(), false)); + assertVecEquals("Weights are not correctly updated", weightsExpected, train.vec(0),0); + } finally { + Scope.exit(); + } + } + + @Test + public void testBasicTrainAndScoreWithExternalWeightsColumn() { + try { + // Train reference model + Scope.enter(); + Frame train = parseTestFile("smalldata/prostate/prostate.csv"); + String response = "CAPSULE"; + train.toCategoricalCol(response); + Scope.track(train); + AdaBoostModel.AdaBoostParameters p = new AdaBoostModel.AdaBoostParameters(); + p._train = train._key; + p._seed = 0xDECAF; + p._nlearners = 10; + p._response_column = response; + + AdaBoost adaBoostReference = new AdaBoost(p); + AdaBoostModel adaBoostReferenceModel = adaBoostReference.trainModel().get(); + Scope.track_generic(adaBoostReferenceModel); + assertNotNull(adaBoostReferenceModel); + + // Add weights column to frame and train different model + Vec weights = train.anyVec().makeCons(1,1,null,null)[0]; + train.add("weights", weights); + DKV.put(train); + Scope.track(train); + p._weights_column = "weights"; + + AdaBoost adaBoostWithExternalWeights = new AdaBoost(p); + AdaBoostModel adaBoostModelWithExternalWeights = adaBoostWithExternalWeights.trainModel().get(); + Scope.track_generic(adaBoostModelWithExternalWeights); + assertNotNull(adaBoostModelWithExternalWeights); + + // Check that output is identical + Frame scoreReference = adaBoostReferenceModel.score(train); + Scope.track(scoreReference); + Frame scoreWithExternalWeights = adaBoostModelWithExternalWeights.score(train); + Scope.track(scoreWithExternalWeights); + assertFrameEquals(scoreReference, scoreWithExternalWeights, 0); 
// output should be identical
+      assertFalse("Weights column should be changed by the training", weights.isConst());
+    } finally {
+      Scope.exit();
+    }
+  }
+
+  @Test
+  public void testBasicTrainAndScoreWithCustomWeightsColumn() {
+    try {
+      // Train reference model
+      Scope.enter();
+      Frame train = parseTestFile("smalldata/prostate/prostate.csv");
+      String response = "CAPSULE";
+      train.toCategoricalCol(response);
+      Scope.track(train);
+      AdaBoostModel.AdaBoostParameters p = new AdaBoostModel.AdaBoostParameters();
+      p._train = train._key;
+      p._seed = 0xDECAF;
+      p._nlearners = 10;
+      p._response_column = response;
+
+      AdaBoost adaBoostReference = new AdaBoost(p);
+      AdaBoostModel adaBoostReferenceModel = adaBoostReference.trainModel().get();
+      Scope.track_generic(adaBoostReferenceModel);
+      assertNotNull(adaBoostReferenceModel);
+
+      // Set custom weights column
+      p._weights_column = "RACE";
+      double maxReference = train.vec("RACE").max(); // for a later assert
+      AdaBoost adaBoostWithExternalWeights = new AdaBoost(p);
+      AdaBoostModel adaBoostModelWithExternalWeights = adaBoostWithExternalWeights.trainModel().get();
+      Scope.track_generic(adaBoostModelWithExternalWeights);
+      assertNotNull(adaBoostModelWithExternalWeights);
+
+      // Check that the outputs differ
+      Frame scoreReference = adaBoostReferenceModel.score(train);
+      Scope.track(scoreReference);
+      Frame scoreWithExternalWeights = adaBoostModelWithExternalWeights.score(train);
+      Scope.track(scoreWithExternalWeights);
+      // output should be different since the weights are intentionally not initialized to ones
+      assertFalse(Arrays.equals(FrameUtils.asDoubles(scoreReference.vec("predict")), FrameUtils.asDoubles(scoreWithExternalWeights.vec("predict"))));
+      assertNotEquals("RACE column should be changed by the training", maxReference, train.vec("RACE").max());
+    } finally {
+      Scope.exit();
+    }
+  }
+
+  @Test
+  public void testBasicTrainAndScoreWithDuplicatedWeightsColumn() {
+    try {
+      Scope.enter();
+      Frame train = parseTestFile("smalldata/prostate/prostate.csv");
+      // Add weights column to frame
+      Vec weights = train.anyVec().makeCons(1, 1, null, null)[0];
+      train.add("weights", weights);
+      DKV.put(train);
+      String response = "CAPSULE";
+      train.toCategoricalCol(response);
+      Scope.track(train);
+
+      AdaBoostModel.AdaBoostParameters p = new AdaBoostModel.AdaBoostParameters();
+      p._train = train._key;
+      p._seed = 0xDECAF;
+      p._nlearners = 10;
+      p._response_column = response;
+      p._ignore_const_cols = false;
+
+      AdaBoost adaBoost = new AdaBoost(p);
+      AdaBoostModel adaBoostModel = adaBoost.trainModel().get();
+      Scope.track_generic(adaBoostModel);
+      assertNotNull(adaBoostModel);
+
+      // Check that the user-supplied "weights" column is left untouched (AdaBoost works on its own uniquified copy)
+      Frame score = adaBoostModel.score(train);
+      Scope.track(score);
+      assertTrue("Weights column should not be changed by the training", weights.isConst());
+    } finally {
+      Scope.exit();
+    }
+  }
+
+  @Test
+  public void testBasicTrainAndScoreGLM() {
+    try {
+      Scope.enter();
+      Frame train = Scope.track(parseTestFile("smalldata/prostate/prostate.csv"));
+      String response = "CAPSULE";
+      train.toCategoricalCol(response);
+      AdaBoostModel.AdaBoostParameters p = new AdaBoostModel.AdaBoostParameters();
+      p._train = train._key;
+      p._seed = 0xDECAF;
+      p._nlearners = 50;
+      p._weak_learner = AdaBoostModel.Algorithm.GLM;
+      p._response_column = response;
+
+      AdaBoost adaBoost = new AdaBoost(p);
+      AdaBoostModel adaBoostModel = adaBoost.trainModel().get();
+      Scope.track_generic(adaBoostModel);
+      assertNotNull(adaBoostModel);
+
+      Frame score = adaBoostModel.score(train);
+
Scope.track(score); + } finally { + Scope.exit(); + } + } + + @Test + public void testBasicTrainAndScoreGBM() { + try { + Scope.enter(); + Frame train = Scope.track(parseTestFile("smalldata/prostate/prostate.csv")); + String response = "CAPSULE"; + train.toCategoricalCol(response); + AdaBoostModel.AdaBoostParameters p = new AdaBoostModel.AdaBoostParameters(); + p._train = train._key; + p._seed = 0xDECAF; + p._nlearners = 50; + p._weak_learner = AdaBoostModel.Algorithm.GBM; + p._response_column = response; + + AdaBoost adaBoost = new AdaBoost(p); + AdaBoostModel adaBoostModel = adaBoost.trainModel().get(); + Scope.track_generic(adaBoostModel); + assertNotNull(adaBoostModel); + + Frame score = adaBoostModel.score(train); + Scope.track(score); + } finally { + Scope.exit(); + } + } +} diff --git a/h2o-bindings/bin/custom/R/gen_adaboost.py b/h2o-bindings/bin/custom/R/gen_adaboost.py new file mode 100644 index 000000000000..7dbf86f2a3fa --- /dev/null +++ b/h2o-bindings/bin/custom/R/gen_adaboost.py @@ -0,0 +1,41 @@ +extensions = dict( + skip_default_set_params_for=['training_frame', 'ignored_columns', 'response_column', + 'max_confusion_matrix_size', 'distribution', 'offset_column'], + set_required_params=""" +parms$training_frame <- training_frame +args <- .verify_dataxy(training_frame, x, y) +parms$ignored_columns <- args$x_ignore +parms$response_column <- args$y +""", +) + + +doc = dict( + preamble=""" +Build an AdaBoost model + +Builds an AdaBoost model on an H2OFrame. +""", + returns=""" +Creates a \linkS4class{H2OModel} object of the right type. +""", + seealso=""" +\code{\link{predict.H2OModel}} for prediction +""", + examples=""" +library(h2o) +h2o.init() + +# Import the airlines dataset +f <- "https://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv" +data <- h2o.importFile(f) + +# Set predictors and response; set response as a factor +data["CAPSULE"] <- as.factor(data["CAPSULE"]) +predictors <- c("AGE","RACE","DPROS","DCAPS","PSA","VOL","GLEASON") +response <- "CAPSULE" + +# Train the AdaBoost model +h2o_adaboost <- h2o.adaBoost(x = predictors, y = response, training_frame = data, seed = 1234) +""" +) diff --git a/h2o-bindings/bin/custom/python/gen_adaboost.py b/h2o-bindings/bin/custom/python/gen_adaboost.py new file mode 100644 index 000000000000..646c8e820871 --- /dev/null +++ b/h2o-bindings/bin/custom/python/gen_adaboost.py @@ -0,0 +1,8 @@ +options = dict( +) + +doc = dict( + __class__=""" +Builds an AdaBoost model +""" +) diff --git a/h2o-bindings/bin/gen_R.py b/h2o-bindings/bin/gen_R.py index 14a895e584b1..b99875a2a1f4 100644 --- a/h2o-bindings/bin/gen_R.py +++ b/h2o-bindings/bin/gen_R.py @@ -272,6 +272,7 @@ def algo_to_modelname(algo): if algo == "gam": return "Generalized Additive Model" if algo == "modelselection": return "Model Selection" if algo == "infogram": return "Infogram" + if algo == "adaboost": return "AdaBoost Model" return algo @@ -347,6 +348,7 @@ def main(): if name == "stackedensemble": module = "stackedEnsemble" if name == "pca": module = "prcomp" if name == "modelselection": module = "modelSelection" + if name == "adaboost": module = "adaBoost" bi.vprint("Generating model: " + name) bi.write_to_file("%s.R" % file_name, gen_module(mb, name, module)) diff --git a/h2o-bindings/bin/gen_python.py b/h2o-bindings/bin/gen_python.py index 98fd3db325b8..6fa044f77d0e 100755 --- a/h2o-bindings/bin/gen_python.py +++ b/h2o-bindings/bin/gen_python.py @@ -351,6 +351,7 @@ def algo_to_classname(algo): if algo == "rulefit": return "H2ORuleFitEstimator" if algo 
== "modelselection": return "H2OModelSelectionEstimator" if algo == "isotonicregression": return "H2OIsotonicRegressionEstimator" + if algo == "adaboost": return "H2OAdaBoostEstimator" return "H2O" + algo.capitalize() + "Estimator" diff --git a/h2o-core/src/main/java/hex/ModelMetricsBinomial.java b/h2o-core/src/main/java/hex/ModelMetricsBinomial.java index 725c866055d6..04a40a3f5ace 100755 --- a/h2o-core/src/main/java/hex/ModelMetricsBinomial.java +++ b/h2o-core/src/main/java/hex/ModelMetricsBinomial.java @@ -163,7 +163,6 @@ public static class MetricBuilderBinomial> ex protected double _logloss; protected AUC2.AUCBuilder _auc; - public MetricBuilderBinomial() {} public MetricBuilderBinomial( String[] domain ) { super(2,domain); _auc = new AUC2.AUCBuilder(AUC2.NBINS); } public double auc() {return new AUC2(_auc)._auc;} diff --git a/h2o-py/docs/modeling.rst b/h2o-py/docs/modeling.rst index 89b4f9cfa408..9ceecb83b361 100644 --- a/h2o-py/docs/modeling.rst +++ b/h2o-py/docs/modeling.rst @@ -8,6 +8,12 @@ Modeling In H2O Supervised ++++++++++ +:mod:`H2OAdaBoostEstimator` +--------------------------- +.. autoclass:: h2o.estimators.adaboost.H2OAdaBoostEstimator + :show-inheritance: + :members: + :mod:`H2OANOVAGLMEstimator` --------------------------- .. autoclass:: h2o.estimators.anovaglm.H2OANOVAGLMEstimator diff --git a/h2o-py/h2o/estimators/__init__.py b/h2o-py/h2o/estimators/__init__.py index d261ff829f13..766e1678b950 100644 --- a/h2o-py/h2o/estimators/__init__.py +++ b/h2o-py/h2o/estimators/__init__.py @@ -7,6 +7,7 @@ import inspect import sys +from .adaboost import H2OAdaBoostEstimator from .aggregator import H2OAggregatorEstimator from .anovaglm import H2OANOVAGLMEstimator from .coxph import H2OCoxProportionalHazardsEstimator @@ -60,12 +61,13 @@ def create_estimator(algo, **params): __all__ = ( "create_estimator", - "H2OAggregatorEstimator", "H2OANOVAGLMEstimator", "H2OCoxProportionalHazardsEstimator", "H2ODecisionTreeEstimator", - "H2OAutoEncoderEstimator", "H2ODeepLearningEstimator", "H2OEstimator", "H2OExtendedIsolationForestEstimator", - "H2OGeneralizedAdditiveEstimator", "H2OGradientBoostingEstimator", "H2OGenericEstimator", - "H2OGeneralizedLinearEstimator", "H2OGeneralizedLowRankEstimator", "H2OInfogram", "H2OIsolationForestEstimator", - "H2OIsotonicRegressionEstimator", "H2OKMeansEstimator", "H2OModelSelectionEstimator", "H2ONaiveBayesEstimator", - "H2OPrincipalComponentAnalysisEstimator", "H2OSupportVectorMachineEstimator", "H2ORandomForestEstimator", - "H2ORuleFitEstimator", "H2OStackedEnsembleEstimator", "H2OSingularValueDecompositionEstimator", - "H2OTargetEncoderEstimator", "H2OUpliftRandomForestEstimator", "H2OWord2vecEstimator", "H2OXGBoostEstimator" + "H2OAdaBoostEstimator", "H2OAggregatorEstimator", "H2OANOVAGLMEstimator", "H2OCoxProportionalHazardsEstimator", + "H2ODecisionTreeEstimator", "H2OAutoEncoderEstimator", "H2ODeepLearningEstimator", "H2OEstimator", + "H2OExtendedIsolationForestEstimator", "H2OGeneralizedAdditiveEstimator", "H2OGradientBoostingEstimator", + "H2OGenericEstimator", "H2OGeneralizedLinearEstimator", "H2OGeneralizedLowRankEstimator", "H2OInfogram", + "H2OIsolationForestEstimator", "H2OIsotonicRegressionEstimator", "H2OKMeansEstimator", "H2OModelSelectionEstimator", + "H2ONaiveBayesEstimator", "H2OPrincipalComponentAnalysisEstimator", "H2OSupportVectorMachineEstimator", + "H2ORandomForestEstimator", "H2ORuleFitEstimator", "H2OStackedEnsembleEstimator", + "H2OSingularValueDecompositionEstimator", "H2OTargetEncoderEstimator", 
"H2OUpliftRandomForestEstimator", + "H2OWord2vecEstimator", "H2OXGBoostEstimator" ) diff --git a/h2o-py/h2o/estimators/adaboost.py b/h2o-py/h2o/estimators/adaboost.py new file mode 100644 index 000000000000..09495202531c --- /dev/null +++ b/h2o-py/h2o/estimators/adaboost.py @@ -0,0 +1,220 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- +# +# This file is auto-generated by h2o-3/h2o-bindings/bin/gen_python.py +# Copyright 2016 H2O.ai; Apache License Version 2.0 (see LICENSE for details) +# + +from h2o.estimators.estimator_base import H2OEstimator +from h2o.exceptions import H2OValueError +from h2o.frame import H2OFrame +from h2o.utils.typechecks import assert_is_type, Enum, numeric + + +class H2OAdaBoostEstimator(H2OEstimator): + """ + AdaBoost + + Builds an AdaBoost model + """ + + algo = "adaboost" + supervised_learning = True + + def __init__(self, + model_id=None, # type: Optional[Union[None, str, H2OEstimator]] + training_frame=None, # type: Optional[Union[None, str, H2OFrame]] + ignored_columns=None, # type: Optional[List[str]] + ignore_const_cols=True, # type: bool + categorical_encoding="auto", # type: Literal["auto", "enum", "one_hot_internal", "one_hot_explicit", "binary", "eigen", "label_encoder", "sort_by_response", "enum_limited"] + weights_column=None, # type: Optional[str] + nlearners=50, # type: int + weak_learner="auto", # type: Literal["auto", "drf", "glm", "gbm"] + learn_rate=0.5, # type: float + seed=-1, # type: int + ): + """ + :param model_id: Destination id for this model; auto-generated if not specified. + Defaults to ``None``. + :type model_id: Union[None, str, H2OEstimator], optional + :param training_frame: Id of the training data frame. + Defaults to ``None``. + :type training_frame: Union[None, str, H2OFrame], optional + :param ignored_columns: Names of columns to ignore for training. + Defaults to ``None``. + :type ignored_columns: List[str], optional + :param ignore_const_cols: Ignore constant columns. + Defaults to ``True``. + :type ignore_const_cols: bool + :param categorical_encoding: Encoding scheme for categorical features + Defaults to ``"auto"``. + :type categorical_encoding: Literal["auto", "enum", "one_hot_internal", "one_hot_explicit", "binary", "eigen", "label_encoder", + "sort_by_response", "enum_limited"] + :param weights_column: Column with observation weights. Giving some observation a weight of zero is equivalent + to excluding it from the dataset; giving an observation a relative weight of 2 is equivalent to repeating + that row twice. Negative weights are not allowed. Note: Weights are per-row observation weights and do + not increase the size of the data frame. This is typically the number of times a row is repeated, but + non-integer values are supported as well. During training, rows with higher weights matter more, due to + the larger loss function pre-factor. If you set weight = 0 for a row, the returned prediction frame at + that row is zero and this is incorrect. To get an accurate prediction, remove all rows with weight == 0. + Defaults to ``None``. + :type weights_column: str, optional + :param nlearners: Number of AdaBoost weak learners. + Defaults to ``50``. + :type nlearners: int + :param weak_learner: Choose a weak learner type. Defaults to AUTO, which means DRF. + Defaults to ``"auto"``. + :type weak_learner: Literal["auto", "drf", "glm", "gbm"] + :param learn_rate: Learning rate (from 0.0 to 1.0) + Defaults to ``0.5``. 
+ :type learn_rate: float + :param seed: Seed for pseudo random number generator (if applicable) + Defaults to ``-1``. + :type seed: int + """ + super(H2OAdaBoostEstimator, self).__init__() + self._parms = {} + self._id = self._parms['model_id'] = model_id + self.training_frame = training_frame + self.ignored_columns = ignored_columns + self.ignore_const_cols = ignore_const_cols + self.categorical_encoding = categorical_encoding + self.weights_column = weights_column + self.nlearners = nlearners + self.weak_learner = weak_learner + self.learn_rate = learn_rate + self.seed = seed + + @property + def training_frame(self): + """ + Id of the training data frame. + + Type: ``Union[None, str, H2OFrame]``. + """ + return self._parms.get("training_frame") + + @training_frame.setter + def training_frame(self, training_frame): + self._parms["training_frame"] = H2OFrame._validate(training_frame, 'training_frame') + + @property + def ignored_columns(self): + """ + Names of columns to ignore for training. + + Type: ``List[str]``. + """ + return self._parms.get("ignored_columns") + + @ignored_columns.setter + def ignored_columns(self, ignored_columns): + assert_is_type(ignored_columns, None, [str]) + self._parms["ignored_columns"] = ignored_columns + + @property + def ignore_const_cols(self): + """ + Ignore constant columns. + + Type: ``bool``, defaults to ``True``. + """ + return self._parms.get("ignore_const_cols") + + @ignore_const_cols.setter + def ignore_const_cols(self, ignore_const_cols): + assert_is_type(ignore_const_cols, None, bool) + self._parms["ignore_const_cols"] = ignore_const_cols + + @property + def categorical_encoding(self): + """ + Encoding scheme for categorical features + + Type: ``Literal["auto", "enum", "one_hot_internal", "one_hot_explicit", "binary", "eigen", "label_encoder", + "sort_by_response", "enum_limited"]``, defaults to ``"auto"``. + """ + return self._parms.get("categorical_encoding") + + @categorical_encoding.setter + def categorical_encoding(self, categorical_encoding): + assert_is_type(categorical_encoding, None, Enum("auto", "enum", "one_hot_internal", "one_hot_explicit", "binary", "eigen", "label_encoder", "sort_by_response", "enum_limited")) + self._parms["categorical_encoding"] = categorical_encoding + + @property + def weights_column(self): + """ + Column with observation weights. Giving some observation a weight of zero is equivalent to excluding it from the + dataset; giving an observation a relative weight of 2 is equivalent to repeating that row twice. Negative + weights are not allowed. Note: Weights are per-row observation weights and do not increase the size of the data + frame. This is typically the number of times a row is repeated, but non-integer values are supported as well. + During training, rows with higher weights matter more, due to the larger loss function pre-factor. If you set + weight = 0 for a row, the returned prediction frame at that row is zero and this is incorrect. To get an + accurate prediction, remove all rows with weight == 0. + + Type: ``str``. + """ + return self._parms.get("weights_column") + + @weights_column.setter + def weights_column(self, weights_column): + assert_is_type(weights_column, None, str) + self._parms["weights_column"] = weights_column + + @property + def nlearners(self): + """ + Number of AdaBoost weak learners. + + Type: ``int``, defaults to ``50``. 
+ """ + return self._parms.get("nlearners") + + @nlearners.setter + def nlearners(self, nlearners): + assert_is_type(nlearners, None, int) + self._parms["nlearners"] = nlearners + + @property + def weak_learner(self): + """ + Choose a weak learner type. Defaults to AUTO, which means DRF. + + Type: ``Literal["auto", "drf", "glm", "gbm"]``, defaults to ``"auto"``. + """ + return self._parms.get("weak_learner") + + @weak_learner.setter + def weak_learner(self, weak_learner): + assert_is_type(weak_learner, None, Enum("auto", "drf", "glm", "gbm")) + self._parms["weak_learner"] = weak_learner + + @property + def learn_rate(self): + """ + Learning rate (from 0.0 to 1.0) + + Type: ``float``, defaults to ``0.5``. + """ + return self._parms.get("learn_rate") + + @learn_rate.setter + def learn_rate(self, learn_rate): + assert_is_type(learn_rate, None, numeric) + self._parms["learn_rate"] = learn_rate + + @property + def seed(self): + """ + Seed for pseudo random number generator (if applicable) + + Type: ``int``, defaults to ``-1``. + """ + return self._parms.get("seed") + + @seed.setter + def seed(self, seed): + assert_is_type(seed, None, int) + self._parms["seed"] = seed + + diff --git a/h2o-py/tests/testdir_algos/adaboost/pyunit_adaboost_saveload.py b/h2o-py/tests/testdir_algos/adaboost/pyunit_adaboost_saveload.py new file mode 100644 index 000000000000..9de757289e05 --- /dev/null +++ b/h2o-py/tests/testdir_algos/adaboost/pyunit_adaboost_saveload.py @@ -0,0 +1,38 @@ +import sys, os +sys.path.insert(1, os.path.join("..","..","..")) +import h2o +from tests import pyunit_utils +from h2o.estimators import H2OAdaBoostEstimator + + +def adaBoost_save_and_load(): + print("AdaBoost Save Load Test") + + train = h2o.import_file(pyunit_utils.locate("smalldata/prostate/prostate.csv")) + train["CAPSULE"] = train["CAPSULE"].asfactor() + + adaboost_model = H2OAdaBoostEstimator(nlearners=7, seed=12) + adaboost_model.train(training_frame=train, y="CAPSULE") + predict = adaboost_model.predict(train) + + path = pyunit_utils.locate("results") + + assert os.path.isdir(path), "Expected save directory {0} to exist, but it does not.".format(path) + model_path = h2o.save_model(adaboost_model, path=path, force=True) + + assert os.path.isfile(model_path), "Expected load file {0} to exist, but it does not.".format(model_path) + reloaded = h2o.load_model(model_path) + predict_reloaded = reloaded.predict(train) + + assert isinstance(reloaded, + H2OAdaBoostEstimator), \ + "Expected and H2OAdaBoostEstimator, but got {0}"\ + .format(reloaded) + + assert pyunit_utils.compare_frames_local(predict, predict_reloaded, returnResult=True) + + +if __name__ == "__main__": + pyunit_utils.standalone_test(adaBoost_save_and_load) +else: + adaBoost_save_and_load() diff --git a/h2o-py/tests/testdir_algos/adaboost/pyunit_adaboost_smoke.py b/h2o-py/tests/testdir_algos/adaboost/pyunit_adaboost_smoke.py new file mode 100644 index 000000000000..5e52a11f9859 --- /dev/null +++ b/h2o-py/tests/testdir_algos/adaboost/pyunit_adaboost_smoke.py @@ -0,0 +1,31 @@ +import sys, os +sys.path.insert(1, os.path.join("..","..","..")) +import h2o +from tests import pyunit_utils +from h2o.estimators import H2OAdaBoostEstimator + + +def adaboost(): + print("AdaBoost Smoke Test") + + train = h2o.import_file(pyunit_utils.locate("smalldata/prostate/prostate.csv")) + train["CAPSULE"] = train["CAPSULE"].asfactor() + + adaboost_model = H2OAdaBoostEstimator(nlearners=55, seed=0xBEEF, weak_learner="GLM", learn_rate=0.6) + adaboost_model.train(training_frame=train, 
y="CAPSULE") + predict = adaboost_model.predict(train) + + print("") + print(adaboost_model) + print("") + print(predict) + + assert 55 == adaboost_model._model_json["output"]["model_summary"]["number_of_weak_learners"][0], "Python API is not working!" + assert "GLM" == adaboost_model._model_json["output"]["model_summary"]["weak_learner"][0], "Python API is not working!" + assert 0.6 == adaboost_model._model_json["output"]["model_summary"]["learn_rate"][0], "Python API is not working!" + + +if __name__ == "__main__": + pyunit_utils.standalone_test(adaboost) +else: + adaboost() diff --git a/h2o-py/tests/testdir_sklearn/pyunit_sklearn_generic_all_estimators.py b/h2o-py/tests/testdir_sklearn/pyunit_sklearn_generic_all_estimators.py index c584ea25d6d1..c38651b8b69c 100644 --- a/h2o-py/tests/testdir_sklearn/pyunit_sklearn_generic_all_estimators.py +++ b/h2o-py/tests/testdir_sklearn/pyunit_sklearn_generic_all_estimators.py @@ -199,6 +199,7 @@ def make_tests(classifier): 'H2OWord2vecEstimator', # needs a separate test (requires pre_trained model as parameter) 'H2OUpliftRandomForestEstimator', # generic part is not implemented yet 'H2ODecisionTreeEstimator', # generic part is not implemented yet + 'H2OAdaBoostEstimator', # generic part is not implemented yet or test needs to be adjusted just for classification ] estimators = [cls for name, cls in inspect.getmembers(h2o.sklearn, inspect.isclass) if name.endswith('Estimator') and name not in ['H2OAutoMLEstimator'] + failing] diff --git a/h2o-py/tests/testdir_sklearn/pyunit_sklearn_regression_all_estimators.py b/h2o-py/tests/testdir_sklearn/pyunit_sklearn_regression_all_estimators.py index 0debe9691020..839733595178 100644 --- a/h2o-py/tests/testdir_sklearn/pyunit_sklearn_regression_all_estimators.py +++ b/h2o-py/tests/testdir_sklearn/pyunit_sklearn_regression_all_estimators.py @@ -138,7 +138,8 @@ def make_tests(classifier): 'H2OCoxProportionalHazardsRegressor', # doesn't support regression? 
'H2OStackedEnsembleRegressor', # needs a separate test (requires models as parameters), 'H2OUpliftRandomForestRegressor', # does not support regression yet - 'H2ODecisionTreeRegressor' # does not support regression yet + 'H2ODecisionTreeRegressor', # does not support regression yet + 'H2OAdaBoostRegressor' # does not support regression yet ] regressors = [cls for name, cls in inspect.getmembers(h2o.sklearn, inspect.isclass) if name.endswith('Regressor') and name not in ['H2OAutoMLRegressor']+failing] diff --git a/h2o-py/tests_rest_smoke/testdir_multi_jvm/test_rest_api.py b/h2o-py/tests_rest_smoke/testdir_multi_jvm/test_rest_api.py index 1fec73fc1a1e..13c93693ae8e 100644 --- a/h2o-py/tests_rest_smoke/testdir_multi_jvm/test_rest_api.py +++ b/h2o-py/tests_rest_smoke/testdir_multi_jvm/test_rest_api.py @@ -23,7 +23,7 @@ algos = ['coxph', 'kmeans', 'deeplearning', 'drf', 'glm', 'gbm', 'pca', 'naivebayes', 'glrm', 'svd', 'isotonicregression', 'psvm', 'aggregator', 'word2vec', 'stackedensemble', 'xgboost', 'isolationforest', 'gam', 'generic', 'targetencoder', 'rulefit', 'extendedisolationforest', 'anovaglm', 'modelselection', - 'upliftdrf', 'infogram', 'dt'] + 'upliftdrf', 'infogram', 'dt', 'adaboost'] algo_additional_default_params = { 'grep' : { 'regex' : '.*' }, 'kmeans' : { 'k' : 2 }, diff --git a/h2o-r/H2O_Load.R b/h2o-r/H2O_Load.R index 96058f24465e..74af068e4558 100755 --- a/h2o-r/H2O_Load.R +++ b/h2o-r/H2O_Load.R @@ -17,7 +17,8 @@ function() { "edicts.R", "coxph.R", "coxphutils.R", "glm.R", "gam.R", "glrm.R", "pca.R", "kmeans.R", "gbm.R", "deeplearning.R", "naivebayes.R", "randomforest.R", "svd.R", "locate.R", "predict.R", "rulefit.R", "isolationforest.R", "psvm.R", "tf-idf.R", "permutation_varimp.R", "extendedisolationforest.R", - "anovaglm.R", "modelselection.R", "upliftrandomforest.R", "infogram.R", "admissibleml.R", "decisiontree.R") + "anovaglm.R", "modelselection.R", "upliftrandomforest.R", "infogram.R", "admissibleml.R", "decisiontree.R", + "adaBoost.R") require(jsonlite); require(RCurl) invisible(lapply(to_src,function(x){source(paste(FULL.PATH, x, sep = ""))})) } diff --git a/h2o-r/h2o-package/R/adaboost.R b/h2o-r/h2o-package/R/adaboost.R new file mode 100644 index 000000000000..526467977831 --- /dev/null +++ b/h2o-r/h2o-package/R/adaboost.R @@ -0,0 +1,170 @@ +# This file is auto-generated by h2o-3/h2o-bindings/bin/gen_R.py +# Copyright 2016 H2O.ai; Apache License Version 2.0 (see LICENSE for details) +#' +# -------------------------- AdaBoost Model -------------------------- # +#' +#' Build an AdaBoost model +#' +#' Builds an AdaBoost model on an H2OFrame. +#' +#' @param x (Optional) A vector containing the names or indices of the predictor variables to use in building the model. +#' If x is missing, then all columns except y are used. +#' @param y The name or column index of the response variable in the data. +#' The response must be either a numeric or a categorical/factor variable. +#' If the response is numeric, then a regression model will be trained, otherwise it will train a classification model. +#' @param training_frame Id of the training data frame. +#' @param model_id Destination id for this model; auto-generated if not specified. +#' @param ignore_const_cols \code{Logical}. Ignore constant columns. Defaults to TRUE. +#' @param categorical_encoding Encoding scheme for categorical features Must be one of: "AUTO", "Enum", "OneHotInternal", "OneHotExplicit", +#' "Binary", "Eigen", "LabelEncoder", "SortByResponse", "EnumLimited". Defaults to AUTO. 
+#' @param weights_column Column with observation weights. Giving some observation a weight of zero is equivalent to excluding it from
+#'        the dataset; giving an observation a relative weight of 2 is equivalent to repeating that row twice. Negative
+#'        weights are not allowed. Note: Weights are per-row observation weights and do not increase the size of the
+#'        data frame. This is typically the number of times a row is repeated, but non-integer values are supported as
+#'        well. During training, rows with higher weights matter more, due to the larger loss function pre-factor. If
+#'        you set weight = 0 for a row, the returned prediction frame at that row is zero and this is incorrect. To get
+#'        an accurate prediction, remove all rows with weight == 0.
+#' @param nlearners Number of AdaBoost weak learners. Defaults to 50.
+#' @param weak_learner Choose a weak learner type. Must be one of: "AUTO", "DRF", "GLM", "GBM". Defaults to AUTO, which means DRF.
+#' @param learn_rate Learning rate (from 0.0 to 1.0). Defaults to 0.5.
+#' @param seed Seed for random numbers (affects certain parts of the algo that are stochastic and those might or might not be enabled by default).
+#'        Defaults to -1 (time-based random number).
+#' @return Creates a \linkS4class{H2OModel} object of the right type.
+#' @seealso \code{\link{predict.H2OModel}} for prediction
+#' @examples
+#' \dontrun{
+#' library(h2o)
+#' h2o.init()
+#'
+#' # Import the prostate dataset
+#' f <- "https://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv"
+#' data <- h2o.importFile(f)
+#'
+#' # Set predictors and response; set response as a factor
+#' data["CAPSULE"] <- as.factor(data["CAPSULE"])
+#' predictors <- c("AGE","RACE","DPROS","DCAPS","PSA","VOL","GLEASON")
+#' response <- "CAPSULE"
+#'
+#' # Train the AdaBoost model
+#' h2o_adaboost <- h2o.adaBoost(x = predictors, y = response, training_frame = data, seed = 1234)
+#' }
+#' @export
+h2o.adaBoost <- function(x,
+                         y,
+                         training_frame,
+                         model_id = NULL,
+                         ignore_const_cols = TRUE,
+                         categorical_encoding = c("AUTO", "Enum", "OneHotInternal", "OneHotExplicit", "Binary", "Eigen", "LabelEncoder", "SortByResponse", "EnumLimited"),
+                         weights_column = NULL,
+                         nlearners = 50,
+                         weak_learner = c("AUTO", "DRF", "GLM", "GBM"),
+                         learn_rate = 0.5,
+                         seed = -1)
+{
+  # Validate required training_frame first and other frame args: should be a valid key or an H2OFrame object
+  training_frame <- .validate.H2OFrame(training_frame, required=TRUE)
+
+  # Validate other required args
+  # If x is missing, then assume user wants to use all columns as features.
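+  # For example, with the prostate frame from the example above and y = "CAPSULE",
+  # x defaults to all remaining column names; with a numeric y (e.g. y = 1), it
+  # defaults to the remaining column indices instead.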
+  if (missing(x)) {
+     if (is.numeric(y)) {
+         x <- setdiff(col(training_frame), y)
+     } else {
+         x <- setdiff(colnames(training_frame), y)
+     }
+  }
+
+  # Build parameter list to send to model builder
+  parms <- list()
+  parms$training_frame <- training_frame
+  args <- .verify_dataxy(training_frame, x, y)
+  parms$ignored_columns <- args$x_ignore
+  parms$response_column <- args$y
+
+  if (!missing(model_id))
+    parms$model_id <- model_id
+  if (!missing(ignore_const_cols))
+    parms$ignore_const_cols <- ignore_const_cols
+  if (!missing(categorical_encoding))
+    parms$categorical_encoding <- categorical_encoding
+  if (!missing(weights_column))
+    parms$weights_column <- weights_column
+  if (!missing(nlearners))
+    parms$nlearners <- nlearners
+  if (!missing(weak_learner))
+    parms$weak_learner <- weak_learner
+  if (!missing(learn_rate))
+    parms$learn_rate <- learn_rate
+  if (!missing(seed))
+    parms$seed <- seed
+
+  # Error check and build model
+  model <- .h2o.modelJob('adaboost', parms, h2oRestApiVersion=3, verbose=FALSE)
+  return(model)
+}
+.h2o.train_segments_adaboost <- function(x,
+                                         y,
+                                         training_frame,
+                                         ignore_const_cols = TRUE,
+                                         categorical_encoding = c("AUTO", "Enum", "OneHotInternal", "OneHotExplicit", "Binary", "Eigen", "LabelEncoder", "SortByResponse", "EnumLimited"),
+                                         weights_column = NULL,
+                                         nlearners = 50,
+                                         weak_learner = c("AUTO", "DRF", "GLM", "GBM"),
+                                         learn_rate = 0.5,
+                                         seed = -1,
+                                         segment_columns = NULL,
+                                         segment_models_id = NULL,
+                                         parallelism = 1)
+{
+  # formally define variables that were excluded from function parameters
+  model_id <- NULL
+  verbose <- NULL
+  destination_key <- NULL
+  # Validate required training_frame first and other frame args: should be a valid key or an H2OFrame object
+  training_frame <- .validate.H2OFrame(training_frame, required=TRUE)
+
+  # Validate other required args
+  # If x is missing, then assume user wants to use all columns as features.
+  if (missing(x)) {
+     if (is.numeric(y)) {
+         x <- setdiff(col(training_frame), y)
+     } else {
+         x <- setdiff(colnames(training_frame), y)
+     }
+  }
+
+  # Build parameter list to send to model builder
+  parms <- list()
+  parms$training_frame <- training_frame
+  args <- .verify_dataxy(training_frame, x, y)
+  parms$ignored_columns <- args$x_ignore
+  parms$response_column <- args$y
+
+  if (!missing(ignore_const_cols))
+    parms$ignore_const_cols <- ignore_const_cols
+  if (!missing(categorical_encoding))
+    parms$categorical_encoding <- categorical_encoding
+  if (!missing(weights_column))
+    parms$weights_column <- weights_column
+  if (!missing(nlearners))
+    parms$nlearners <- nlearners
+  if (!missing(weak_learner))
+    parms$weak_learner <- weak_learner
+  if (!missing(learn_rate))
+    parms$learn_rate <- learn_rate
+  if (!missing(seed))
+    parms$seed <- seed
+
+  # Build segment-models specific parameters
+  segment_parms <- list()
+  if (!missing(segment_columns))
+    segment_parms$segment_columns <- segment_columns
+  if (!missing(segment_models_id))
+    segment_parms$segment_models_id <- segment_models_id
+  segment_parms$parallelism <- parallelism
+
+  # Error check and build segment models
+  segment_models <- .h2o.segmentModelsJob('adaboost', segment_parms, parms, h2oRestApiVersion=3)
+  return(segment_models)
+}
diff --git a/h2o-r/h2o-package/pkgdown/_pkgdown.yml b/h2o-r/h2o-package/pkgdown/_pkgdown.yml
index 6e170838b4db..149fd0526bc7 100644
--- a/h2o-r/h2o-package/pkgdown/_pkgdown.yml
+++ b/h2o-r/h2o-package/pkgdown/_pkgdown.yml
@@ -38,6 +38,7 @@ reference:
     - h2o
     - h2o.abs
     - h2o.acos
+    - h2o.adaBoost
     - h2o.aggregated_frame
     - h2o.aggregator
     - h2o.aic
diff --git a/h2o-r/scripts/h2o-r-test-setup.R b/h2o-r/scripts/h2o-r-test-setup.R
index 9dd64ce0e762..61e5d8e919a4 100755
--- a/h2o-r/scripts/h2o-r-test-setup.R
+++ b/h2o-r/scripts/h2o-r-test-setup.R
@@ -187,7 +187,7 @@ function() {
              "coxph.R", "coxphutils.R", "gbm.R", "glm.R", "gam.R", "anovaglm.R", "glrm.R", "kmeans.R",
              "deeplearning.R", "randomforest.R", "generic.R", "naivebayes.R", "pca.R", "svd.R", "locate.R",
              "grid.R", "word2vec.R", "w2vutils.R", "stackedensemble.R", "rulefit.R", "modelselection.R", "predict.R",
              "xgboost.R", "isolationforest.R", "psvm.R", "segment.R", "tf-idf.R", "explain.R", "permutation_varimp.R", "extendedisolationforest.R",
-             "upliftrandomforest.R", "infogram.R", "isotonicregression.R", "admissibleml.R", "decisiontree.R")
+             "upliftrandomforest.R", "infogram.R", "isotonicregression.R", "admissibleml.R", "decisiontree.R", "adaboost.R")
   src_path <- paste(h2oRDir,"h2o-package","R",sep=.Platform$file.sep)
   invisible(lapply(to_src,function(x){source(paste(src_path, x, sep = .Platform$file.sep))}))
diff --git a/h2o-r/tests/testdir_algos/adaboost/runit_adaboost_smoke.R b/h2o-r/tests/testdir_algos/adaboost/runit_adaboost_smoke.R
new file mode 100644
index 000000000000..ca59093b6fbb
--- /dev/null
+++ b/h2o-r/tests/testdir_algos/adaboost/runit_adaboost_smoke.R
@@ -0,0 +1,20 @@
+setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f")))
+source("../../../scripts/h2o-r-test-setup.R")
+
+
+
+test.adaBoost.smoke <- function() {
+  f <- "https://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv"
+  data <- h2o.importFile(f)
+
+  # Set predictors and response; set response as a factor
+  data["CAPSULE"] <- as.factor(data["CAPSULE"])
+  predictors <- c("AGE","RACE","DPROS","DCAPS","PSA","VOL","GLEASON")
+  response <- "CAPSULE"
+
+  # Train the AdaBoost model
+  h2o_adaboost <- h2o.adaBoost(x = predictors, y = response,
+                               training_frame = data, seed = 1234)
+  expect_equal(is.null(h2o_adaboost), FALSE)
+}
+
+doTest("adaBoost: Smoke Test", test.adaBoost.smoke)
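
Putting the pieces together, the new R binding can be exercised end to end as in the sketch below. It mirrors the roxygen example from adaboost.R; the explicit weak_learner = "GBM" and learn_rate = 0.3 are illustrative choices rather than values taken from this diff (the diff's defaults are weak_learner = AUTO, meaning DRF, and learn_rate = 0.5):

library(h2o)
h2o.init()

# Import the prostate dataset and make the binary response a factor
f <- "https://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv"
data <- h2o.importFile(f)
data["CAPSULE"] <- as.factor(data["CAPSULE"])

predictors <- c("AGE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON")

# Train AdaBoost with an explicit weak learner and learning rate;
# nlearners, weak_learner, and learn_rate map directly to the parameters documented above
model <- h2o.adaBoost(x = predictors,
                      y = "CAPSULE",
                      training_frame = data,
                      nlearners = 50,
                      weak_learner = "GBM",
                      learn_rate = 0.3,
                      seed = 1234)

# Score back on the training frame and inspect the model summary
pred <- h2o.predict(model, newdata = data)
print(model)
print(head(pred))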