diff --git a/h2o-algos/src/main/java/hex/schemas/UpliftDRFModelV3.java b/h2o-algos/src/main/java/hex/schemas/UpliftDRFModelV3.java index 565214aafedc..f334b0870384 100644 --- a/h2o-algos/src/main/java/hex/schemas/UpliftDRFModelV3.java +++ b/h2o-algos/src/main/java/hex/schemas/UpliftDRFModelV3.java @@ -1,6 +1,7 @@ package hex.schemas; import hex.tree.uplift.UpliftDRFModel; +import water.api.API; public class UpliftDRFModelV3 extends SharedTreeModelV3 { - public static final class UpliftDRFModelOutputV3 extends SharedTreeModelV3.SharedTreeModelOutputV3 {} + public static final class UpliftDRFModelOutputV3 extends SharedTreeModelV3.SharedTreeModelOutputV3 { + @API(help="Default thresholds to calculate AUUC metric. If validation is enabled, thresholds from validation metrics is saved here. Otherwise thresholds are from training metrics.") + public double[] default_auuc_thresholds; + + @Override public UpliftDRFModelV3.UpliftDRFModelOutputV3 fillFromImpl(UpliftDRFModel.UpliftDRFOutput impl) { + UpliftDRFModelV3.UpliftDRFModelOutputV3 uov3 = super.fillFromImpl(impl); + uov3.default_auuc_thresholds = impl._defaultAuucThresholds; + return uov3; + } + } + public UpliftDRFV3.UpliftDRFParametersV3 createParametersSchema() { return new UpliftDRFV3.UpliftDRFParametersV3(); } public UpliftDRFModelOutputV3 createOutputSchema() { return new UpliftDRFModelOutputV3(); } diff --git a/h2o-algos/src/main/java/hex/tree/Score.java b/h2o-algos/src/main/java/hex/tree/Score.java index 5efd075e120b..f894515c4ada 100755 --- a/h2o-algos/src/main/java/hex/tree/Score.java +++ b/h2o-algos/src/main/java/hex/tree/Score.java @@ -4,6 +4,7 @@ import hex.genmodel.GenModel; import hex.genmodel.utils.DistributionFamily; import hex.tree.gbm.GBMModel; +import hex.tree.uplift.UpliftDRFModel; import org.apache.log4j.Logger; import water.Iced; import water.Key; diff --git a/h2o-algos/src/main/java/hex/tree/SharedTree.java b/h2o-algos/src/main/java/hex/tree/SharedTree.java index 4cf64ca9ad64..65ace57f5a90 
100755 --- a/h2o-algos/src/main/java/hex/tree/SharedTree.java +++ b/h2o-algos/src/main/java/hex/tree/SharedTree.java @@ -838,7 +838,6 @@ protected final boolean doScoringAndSaveModel(boolean finalScoring, boolean oob, out._training_metrics = mm; if (oob) out._training_metrics._description = "Metrics reported on Out-Of-Bag training samples"; out._scored_train[out._ntrees].fillFrom(mm); - // Score again on validation data if( _parms._valid != null) { Frame v = new Frame(valid()); diff --git a/h2o-algos/src/main/java/hex/tree/SharedTreeModel.java b/h2o-algos/src/main/java/hex/tree/SharedTreeModel.java index 176da3ddf4dd..fd63d8f74bfb 100755 --- a/h2o-algos/src/main/java/hex/tree/SharedTreeModel.java +++ b/h2o-algos/src/main/java/hex/tree/SharedTreeModel.java @@ -5,7 +5,7 @@ import hex.genmodel.algos.tree.SharedTreeMojoModel; import hex.genmodel.algos.tree.SharedTreeNode; import hex.genmodel.algos.tree.SharedTreeSubgraph; -import hex.glm.GLMModel; +import hex.tree.uplift.UpliftDRFModel; import hex.util.LinearAlgebraUtils; import org.apache.log4j.Logger; import water.*; @@ -166,6 +166,7 @@ public boolean forceStrictlyReproducibleHistograms() { case Binomial: return new ModelMetricsBinomial.MetricBuilderBinomial(domain); case Multinomial: return new ModelMetricsMultinomial.MetricBuilderMultinomial(_output.nclasses(),domain, _parms._auc_type); case Regression: return new ModelMetricsRegression.MetricBuilderRegression(); + case BinomialUplift: return new ModelMetricsBinomialUplift.MetricBuilderBinomialUplift(domain, ((UpliftDRFModel.UpliftDRFOutput)_output)._defaultAuucThresholds); default: throw H2O.unimpl(); } } diff --git a/h2o-algos/src/main/java/hex/tree/uplift/UpliftDRF.java b/h2o-algos/src/main/java/hex/tree/uplift/UpliftDRF.java index eef583f66bfa..b5be40651918 100644 --- a/h2o-algos/src/main/java/hex/tree/uplift/UpliftDRF.java +++ b/h2o-algos/src/main/java/hex/tree/uplift/UpliftDRF.java @@ -1,6 +1,8 @@ package hex.tree.uplift; import hex.*; +import 
hex.genmodel.MojoModel; +import hex.genmodel.algos.upliftdrf.UpliftDrfMojoModel; import hex.genmodel.utils.DistributionFamily; import hex.tree.*; import org.apache.log4j.Logger; @@ -50,12 +52,12 @@ public UpliftDRF(boolean startup_once) { @Override public boolean haveMojo() { - return false; + return true; } @Override public boolean havePojo() { - return false; + return true; } @Override @@ -472,6 +474,22 @@ static TwoDimTable createUpliftScoringHistoryTable(Model.Output _output, return table; } + @Override + public PojoWriter makePojoWriter(Model genericModel, MojoModel mojoModel) { + UpliftDrfMojoModel upliftDrfMojoModel = (UpliftDrfMojoModel) mojoModel; + CompressedTree[][] trees = MojoUtils.extractCompressedTrees(upliftDrfMojoModel); + return new UpliftDrfPojoWriter(genericModel, upliftDrfMojoModel.getCategoricalEncoding(), false, trees, upliftDrfMojoModel._balanceClasses); + } + + @Override + protected void addCustomInfo(UpliftDRFModel.UpliftDRFOutput out) { + if(out._validation_metrics != null){ + out._defaultAuucThresholds = ((ModelMetricsBinomialUplift)out._validation_metrics)._auuc._ths; + } else { + out._defaultAuucThresholds = ((ModelMetricsBinomialUplift)out._training_metrics)._auuc._ths; + } + } + @Override protected UpliftScoreExtension makeScoreExtension() { return new UpliftScoreExtension(); diff --git a/h2o-algos/src/main/java/hex/tree/uplift/UpliftDRFModel.java b/h2o-algos/src/main/java/hex/tree/uplift/UpliftDRFModel.java index 931ef874e6ad..dbe8277d576d 100644 --- a/h2o-algos/src/main/java/hex/tree/uplift/UpliftDRFModel.java +++ b/h2o-algos/src/main/java/hex/tree/uplift/UpliftDRFModel.java @@ -1,10 +1,11 @@ package hex.tree.uplift; import hex.*; +import hex.tree.CompressedForest; import hex.tree.SharedTreeModel; import hex.tree.SharedTreeModelWithContributions; +import hex.tree.SharedTreePojoWriter; import hex.util.EffectiveParametersUtils; -import water.H2O; import water.Key; public class UpliftDRFModel extends SharedTreeModel { @@ -13,7 +14,6 
@@ public static class UpliftDRFParameters extends SharedTreeModel.SharedTreeParame public String algoName() { return "UpliftDRF"; } public String fullName() { return "Uplift Distributed Random Forest"; } public String javaName() { return UpliftDRFModel.class.getName(); } - public boolean _binomial_double_trees = false; public enum UpliftMetricType { AUTO, KL, ChiSquared, Euclidean } @@ -36,6 +36,9 @@ public long progressUnits() { } public static class UpliftDRFOutput extends SharedTreeModelWithContributions.SharedTreeOutput { + + public double[] _defaultAuucThresholds; // thresholds for AUUC to calculate metrics + public UpliftDRFOutput( UpliftDRF b) { super(b); } @Override @@ -45,7 +48,11 @@ public ModelCategory getModelCategory() { @Override public boolean isBinomialClassifier() { - return false; + return true; + } + + public void setDefaultAuucThresholds(double[] defaultAuucThresholds) { + this._defaultAuucThresholds = defaultAuucThresholds; } } @@ -77,10 +84,18 @@ public void initActualParamValues() { } @Override public ModelMetrics.MetricBuilder makeMetricBuilder(String[] domain) { - if (_output.getModelCategory() == ModelCategory.BinomialUplift) { - return new ModelMetricsBinomialUplift.MetricBuilderBinomialUplift(domain); - } - throw H2O.unimpl(); + return new ModelMetricsBinomialUplift.MetricBuilderBinomialUplift(domain, _output._defaultAuucThresholds); } + @Override + public UpliftDrfMojoWriter getMojo() { + return new UpliftDrfMojoWriter(this); + } + + @Override + protected SharedTreePojoWriter makeTreePojoWriter() { + CompressedForest compressedForest = new CompressedForest(_output._treeKeys, _output._domains); + CompressedForest.LocalCompressedForest localCompressedForest = compressedForest.fetch(); + return new UpliftDrfPojoWriter(this, localCompressedForest._trees); + } } diff --git a/h2o-algos/src/main/java/hex/tree/uplift/UpliftDrfMojoWriter.java b/h2o-algos/src/main/java/hex/tree/uplift/UpliftDrfMojoWriter.java new file mode 100644 index 
000000000000..1d8cad5855ba --- /dev/null +++ b/h2o-algos/src/main/java/hex/tree/uplift/UpliftDrfMojoWriter.java @@ -0,0 +1,23 @@ +package hex.tree.uplift; + +import hex.tree.SharedTreeMojoWriter; + +import java.io.IOException; + +public class UpliftDrfMojoWriter extends SharedTreeMojoWriter { + + @SuppressWarnings("unused") // Called through reflection in ModelBuildersHandler + public UpliftDrfMojoWriter() {} + + public UpliftDrfMojoWriter(UpliftDRFModel model) { super(model); } + + @Override public String mojoVersion() { + return "1.40"; + } + + @Override + protected void writeModelData() throws IOException { + super.writeModelData(); + writekv("thresholds", model._output._defaultAuucThresholds); // key must match the one read in UpliftDrfMojoReader.readModelData() + } +} diff --git a/h2o-algos/src/main/java/hex/tree/uplift/UpliftDrfPojoWriter.java b/h2o-algos/src/main/java/hex/tree/uplift/UpliftDrfPojoWriter.java new file mode 100644 index 000000000000..ebe1627c9b72 --- /dev/null +++ b/h2o-algos/src/main/java/hex/tree/uplift/UpliftDrfPojoWriter.java @@ -0,0 +1,28 @@ +package hex.tree.uplift; + +import hex.Model; +import hex.genmodel.CategoricalEncoding; +import hex.tree.CompressedTree; +import hex.tree.SharedTreePojoWriter; +import water.util.SBPrintStream; + +public class UpliftDrfPojoWriter extends SharedTreePojoWriter { + + UpliftDrfPojoWriter(UpliftDRFModel model, CompressedTree[][] trees) { + super(model._key, model._output, model.getGenModelEncoding(), model.binomialOpt(), + trees, model._output._treeStats); + } + + UpliftDrfPojoWriter(Model model, CategoricalEncoding encoding, + boolean binomialOpt, CompressedTree[][] trees, + boolean balanceClasses) { + super(model._key, model._output, encoding, binomialOpt, trees, null); + } + + @Override + protected void toJavaUnifyPreds(SBPrintStream body) { + body.ip("preds[1] /= " + _trees.length + ";").nl(); + body.ip("preds[2] /= " + _trees.length + ";").nl(); + body.ip("preds[0] = preds[1] - preds[2];").nl(); // emitted statement needs its ';' or the generated POJO source will not compile + } +} diff --git 
a/h2o-algos/src/test/java/hex/tree/gbm/GBMTest.java b/h2o-algos/src/test/java/hex/tree/gbm/GBMTest.java index 515cfdb237b4..6b302ff98711 100755 --- a/h2o-algos/src/test/java/hex/tree/gbm/GBMTest.java +++ b/h2o-algos/src/test/java/hex/tree/gbm/GBMTest.java @@ -4458,6 +4458,53 @@ public void testResetThreshold() throws Exception { } } + @Test + public void testMojoMetrics() throws Exception { + GBMModel gbm = null; + try { + Scope.enter(); + Frame frame = new TestFrameBuilder() + .withName("data") + .withColNames("ColA", "ColB", "Response") + .withVecTypes(Vec.T_NUM, Vec.T_NUM, Vec.T_NUM) + .withDataForCol(0, ard(0, 1, 0, 1, 0, 1, 0)) + .withDataForCol(1, ard(Double.NaN, 1, 2, 3, 4, 5.6, 7)) + .withDataForCol(2, ard(1, 0, 1, 1, 1, 0, 1)) + .build(); + + frame = frame.toCategoricalCol(2); + + Frame frameVal = new TestFrameBuilder() + .withName("dataVal") + .withColNames("ColA", "ColB", "Response") + .withVecTypes(Vec.T_NUM, Vec.T_NUM, Vec.T_NUM) + .withDataForCol(0, ard(0, 1, 1, 1, 0, 0, 1)) + .withDataForCol(1, ard(Double.NaN, 1, 3, 2, 4, 8, 7)) + .withDataForCol(2, ard(1, 1, 1, 0, 0, 1, 1)) + .build(); + + frameVal = frameVal.toCategoricalCol(2); + + GBMModel.GBMParameters parms = new GBMModel.GBMParameters(); + parms._train = frame._key; + parms._valid = frameVal._key; + parms._response_column = "Response"; + parms._ntrees = 1; + parms._min_rows = 0.1; + parms._distribution = bernoulli; + + gbm = new GBM(parms).trainModel().get(); + Scope.track_generic(gbm); + Frame train_score = gbm.score(frame); + Scope.track_generic(train_score); + + assertTrue(gbm.testJavaScoring(frame, train_score, 1e-15)); + + } finally { + Scope.exit(); + } + } + @Test public void testGBMFeatureInteractions() { Scope.enter(); diff --git a/h2o-algos/src/test/java/hex/tree/uplift/UpliftDRFTest.java b/h2o-algos/src/test/java/hex/tree/uplift/UpliftDRFTest.java index f5104a55cc2a..93b2c47ab6a3 100644 --- a/h2o-algos/src/test/java/hex/tree/uplift/UpliftDRFTest.java +++ 
b/h2o-algos/src/test/java/hex/tree/uplift/UpliftDRFTest.java @@ -1,10 +1,16 @@ package hex.tree.uplift; import hex.ScoreKeeper; +import hex.genmodel.MojoModel; +import hex.genmodel.easy.EasyPredictModelWrapper; +import hex.genmodel.easy.RowData; +import hex.genmodel.easy.prediction.UpliftBinomialModelPrediction; import hex.genmodel.utils.ArrayUtils; import hex.genmodel.utils.DistributionFamily; +import org.junit.Assume; import org.junit.Test; import org.junit.runner.RunWith; +import water.H2O; import water.Scope; import water.TestUtil; import water.exceptions.H2OModelBuilderIllegalArgumentException; @@ -14,7 +20,14 @@ import water.runner.CloudSize; import water.runner.H2ORunner; +import java.io.IOException; +import java.util.ArrayList; import java.util.Arrays; +import java.util.List; +import java.util.concurrent.Callable; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; import static org.junit.Assert.*; @@ -356,4 +369,85 @@ public void testPredictCorrectOutput() { Scope.exit(); } } + + @Test + public void testMojo() { + try { + Scope.enter(); + Frame train = new TestFrameBuilder() + .withColNames("C0", "C1", "treatment", "conversion") + .withVecTypes(Vec.T_NUM, Vec.T_NUM, Vec.T_CAT, Vec.T_CAT) + .withDataForCol(0, ard(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0)) + .withDataForCol(1, ard(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0)) + .withDataForCol(2, ar("T", "C", "T", "T", "T", "C", "C", "C", "C", "C")) + .withDataForCol(3, ar("Yes", "No", "Yes", "No", "Yes", "No", "Yes", "No", "Yes", "Yes")) + .build(); + train.toCategoricalCol("treatment"); + train.toCategoricalCol("conversion"); + UpliftDRFModel.UpliftDRFParameters p = new UpliftDRFModel.UpliftDRFParameters(); + p._train = train._key; + p._response_column = "conversion"; + p._treatment_column = "treatment"; + p._ntrees = 4; + + UpliftDRF udrf = new UpliftDRF(p); + UpliftDRFModel model = udrf.trainModel().get(); + 
Scope.track_generic(model); + Frame preds = model.score(train); + Scope.track_generic(preds); + + assertTrue(model.testJavaScoring(train, preds,1e-15)); + } finally { + Scope.exit(); + } + } + + @Test + public void testEasyPredictMojo() throws Exception { + try { + Scope.enter(); + Frame train = new TestFrameBuilder() + .withColNames("C0", "C1", "treatment", "conversion") + .withVecTypes(Vec.T_NUM, Vec.T_NUM, Vec.T_CAT, Vec.T_CAT) + .withDataForCol(0, ard(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0)) + .withDataForCol(1, ard(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0)) + .withDataForCol(2, ar("T", "C", "T", "T", "T", "C", "C", "C", "C", "C")) + .withDataForCol(3, ar("Yes", "No", "Yes", "No", "Yes", "No", "Yes", "No", "Yes", "Yes")) + .build(); + train.toCategoricalCol("treatment"); + train.toCategoricalCol("conversion"); + Scope.track_generic(train); + UpliftDRFModel.UpliftDRFParameters p = new UpliftDRFModel.UpliftDRFParameters(); + p._train = train._key; + p._response_column = "conversion"; + p._treatment_column = "treatment"; + p._ntrees = 4; + + UpliftDRF udrf = new UpliftDRF(p); + UpliftDRFModel model = udrf.trainModel().get(); + Scope.track_generic(model); + MojoModel mojo = model.toMojo(); + EasyPredictModelWrapper wrapper = new EasyPredictModelWrapper( + new EasyPredictModelWrapper.Config() + .setModel(mojo) + .setEnableContributions(false) + ); + Frame featureFr = train.subframe(mojo.features()); + Scope.track_generic(featureFr); + for (int i = 0; i < featureFr.numRows(); i++) { + RowData row = new RowData(); + for (String feat : featureFr.names()) { + if (!featureFr.vec(feat).isNA(i)) { + double value = featureFr.vec(feat).at(i); + row.put(feat, value); + } + } + UpliftBinomialModelPrediction pred = wrapper.predictUpliftBinomial(row); + assertEquals(pred.predictions.length,3); + assertEquals(pred.predictions[0], pred.predictions[1]-pred.predictions[2], 0); + } + } finally { + Scope.exit(); + } + } } diff --git 
a/h2o-core/src/main/java/hex/AUUC.java b/h2o-core/src/main/java/hex/AUUC.java index 442775e95a99..842ec0869ad9 100644 --- a/h2o-core/src/main/java/hex/AUUC.java +++ b/h2o-core/src/main/java/hex/AUUC.java @@ -59,72 +59,93 @@ private int getIndexByAUUCType(AUUCType type){ public double[] upliftByType(AUUCType type){ int idx = getIndexByAUUCType(type); - return _uplift[idx]; + return idx < 0 ? null : _uplift[idx]; } public double[] upliftNormalizedByType(AUUCType type){ int idx = getIndexByAUUCType(type); - return _upliftNormalized[idx]; + return idx < 0 ? null : _upliftNormalized[idx]; } public double[] upliftRandomByType(AUUCType type){ int idx = getIndexByAUUCType(type); - return _upliftRandom[idx]; + return idx < 0 ? null : _upliftRandom[idx]; } - + public AUUC(Vec probs, Vec y, Vec uplift, AUUCType auucType, int nbins) { - this(nbins, probs, y, uplift, auucType); - } - - public AUUC(int nBins, Vec probs, Vec y, Vec uplift, AUUCType auucType) { - this(new AUUCImpl(calculateQuantileThresholds(nBins, probs)).doAll(probs, y, uplift)._bldr, auucType); + this(new AUUCImpl(calculateQuantileThresholds(nbins, probs)).doAll(probs, y, uplift)._bldr, auucType); } public AUUC(AUUCBuilder bldr, AUUCType auucType) { this(bldr, true, auucType); } + - private AUUC(AUUCBuilder bldr, boolean trueProbabilities, AUUCType auucType) { + public AUUC(double[] customThresholds, Vec probs, Vec y, Vec uplift, AUUCType auucType) { + this(new AUUCImpl(customThresholds).doAll(probs, y, uplift)._bldr, auucType); + } + + public AUUC(AUUCBuilder bldr, boolean trueProbabilities, AUUCType auucType) { _auucType = auucType; _auucTypeIndx = getIndexByAUUCType(_auucType); _nBins = bldr._nBins; - assert _nBins >= 1 : "Must have >= 1 bins for AUUC calculation, but got " + _nBins; - assert trueProbabilities || bldr._thresholds[_nBins - 1] == 1 : "Bins need to contain pred = 1 when 0-1 probabilities are used"; - _n = bldr._n; - _ths = Arrays.copyOf(bldr._thresholds,_nBins); - _treatment = 
Arrays.copyOf(bldr._treatment,_nBins); - _control = Arrays.copyOf(bldr._control,_nBins); - _yTreatment = Arrays.copyOf(bldr._yTreatment,_nBins); - _yControl = Arrays.copyOf(bldr._yControl,_nBins); - _frequency = Arrays.copyOf(bldr._frequency, _nBins); - _frequencyCumsum = Arrays.copyOf(bldr._frequency, _nBins); - _uplift = new double[AUUCType.values().length][_nBins]; - _upliftRandom = new double[AUUCType.values().length][_nBins]; - _upliftNormalized = new double[AUUCType.values().length][_nBins]; - - // Rollup counts - long tmpt=0, tmpc=0, tmptp=0, tmpcp=0, tmpf=0; - for( int i=0; i<_nBins; i++ ) { - tmpt += _treatment[i]; _treatment[i] = tmpt; - tmpc += _control[i]; _control[i] = tmpc; - tmptp += _yTreatment[i]; _yTreatment[i] = tmptp; - tmpcp += _yControl[i]; _yControl[i] = tmpcp; - tmpf += _frequencyCumsum[i]; _frequencyCumsum[i] = tmpf; - } - - // these methods need to be call in this order - setUplift(); - setUpliftRandom(); - setUpliftNormalized(); - - if (trueProbabilities) { - _auucs = computeAuucs(); - _auucsRandom = computeAuucsRandom(); - _aecu = computeAecu(); - _auucsNormalized = computeAuucsNormalized(); - _maxIdx = _auucType.maxCriterionIdx(this); + //assert _nBins >= 1 : "Must have >= 1 bins for AUUC calculation, but got " + _nBins; + if (_nBins > 0) { + assert trueProbabilities || bldr._thresholds[_nBins - 1] == 1 : "Bins need to contain pred = 1 when 0-1 probabilities are used"; + _n = bldr._n; + _ths = Arrays.copyOf(bldr._thresholds, _nBins); + _treatment = Arrays.copyOf(bldr._treatment, _nBins); + _control = Arrays.copyOf(bldr._control, _nBins); + _yTreatment = Arrays.copyOf(bldr._yTreatment, _nBins); + _yControl = Arrays.copyOf(bldr._yControl, _nBins); + _frequency = Arrays.copyOf(bldr._frequency, _nBins); + _frequencyCumsum = Arrays.copyOf(bldr._frequency, _nBins); + _uplift = new double[AUUCType.values().length][_nBins]; + _upliftRandom = new double[AUUCType.values().length][_nBins]; + _upliftNormalized = new 
double[AUUCType.values().length][_nBins]; + + // Rollup counts + long tmpt = 0, tmpc = 0, tmptp = 0, tmpcp = 0, tmpf = 0; + for (int i = 0; i < _nBins; i++) { + tmpt += _treatment[i]; + _treatment[i] = tmpt; + tmpc += _control[i]; + _control[i] = tmpc; + tmptp += _yTreatment[i]; + _yTreatment[i] = tmptp; + tmpcp += _yControl[i]; + _yControl[i] = tmpcp; + tmpf += _frequencyCumsum[i]; + _frequencyCumsum[i] = tmpf; + } + + // these methods need to be call in this order + setUplift(); + setUpliftRandom(); + setUpliftNormalized(); + + if (trueProbabilities) { + _auucs = computeAuucs(); + _auucsRandom = computeAuucsRandom(); + _aecu = computeAecu(); + _auucsNormalized = computeAuucsNormalized(); + _maxIdx = _auucType.maxCriterionIdx(this); + } else { + _maxIdx = 0; + } } else { - _maxIdx = 0; + _maxIdx = -1; + _n = 0; + _ths = null; + _treatment = null; + _control = null; + _yTreatment = null; + _yControl = null; + _frequency = null; + _frequencyCumsum = null; + _uplift = null; + _upliftRandom = null; + _upliftNormalized = null; } } @@ -213,7 +234,9 @@ public static double[] calculateQuantileThresholds(int groups, Vec preds) { if (qm != null) qm.remove(); if (fr != null) DKV.remove(fr._key); } - if(Double.isNaN(quantiles[0])){ + if(quantiles == null){ + quantiles = new double[]{0}; + } else if(Double.isNaN(quantiles[0])){ quantiles[0] = 0; } return quantiles; @@ -227,17 +250,23 @@ private double[] computeAuucsRandom(){ return computeAuucs(_upliftRandom); } - private double[] computeAuucsNormalized() {return computeAuucs(_upliftNormalized);} + private double[] computeAuucsNormalized() { + return computeAuucs(_upliftNormalized); + } private double[] computeAuucs(double[][] uplift){ AUUCType[] auucTypes = AUUCType.VALUES; double[] auucs = new double[auucTypes.length]; for(int i = 0; i < auucTypes.length; i++ ) { - double area = 0; - for(int j = 0; j < _nBins; j++) { - area += uplift[i][j] * frequency(j); + if(_n == 0){ + auucs[i] = Double.NaN; + } else { + double area = 0; 
+ for (int j = 0; j < _nBins; j++) { + area += uplift[i][j] * frequency(j); + } + auucs[i] = area / (_n + 1); } - auucs[i] = area/(_n+1); } return auucs; } @@ -270,29 +299,31 @@ public double auucNormalizedByType(AUUCType type){ return auucNormalized(idx); } - public double auuc(int idx){ return _auucs[idx]; } + public double auuc (int idx){ + return _n == 0 || idx < 0 ? Double.NaN : _auucs[idx]; + } public double auuc(){ return auuc(_auucTypeIndx); } public double auucRandom(int idx){ - return _auucsRandom[idx]; + return _n == 0 || idx < 0 ? Double.NaN : _auucsRandom[idx]; } public double auucRandom(){ return auucRandom(_auucTypeIndx); } - public double aecu(int idx) { return _aecu[idx];} + public double aecu(int idx) { return _n == 0 || idx < 0 ? Double.NaN : _aecu[idx];} public double qini(){ return aecuByType(AUUCType.qini);} - public double auucNormalized(int idx){ return _auucsNormalized[idx]; } + public double auucNormalized(int idx){ return _n == 0 || idx < 0 ? Double.NaN : _auucsNormalized[idx]; } public double auucNormalized(){ return auucNormalized(_auucTypeIndx); } - private static class AUUCImpl extends MRTask { + public static class AUUCImpl extends MRTask { final double[] _thresholds; AUUCBuilder _bldr; - AUUCImpl(double[] thresholds) { + public AUUCImpl(double[] thresholds) { _thresholds = thresholds; } diff --git a/h2o-core/src/main/java/hex/Model.java b/h2o-core/src/main/java/hex/Model.java index 0afdbbb8387e..46c5d4f33a11 100755 --- a/h2o-core/src/main/java/hex/Model.java +++ b/h2o-core/src/main/java/hex/Model.java @@ -2221,8 +2221,8 @@ protected void setupLocal() { if (isCancelled() || _j != null && _j.stop_requested()) return; Chunk weightsChunk = _hasWeights && _computeMetrics ? chks[_output.weightsIdx()] : null; Chunk offsetChunk = _output.hasOffset() ? chks[_output.offsetIdx()] : null; + Chunk treatmentChunk = _output.hasTreatment() ? 
chks[_output.treatmentIdx()] : null; Chunk responseChunk = null; - Chunk treatmentChunk = null; float [] actual = null; _mb = Model.this.makeMetricBuilder(_domain); if (_computeMetrics) { diff --git a/h2o-core/src/main/java/hex/ModelMetricsBinomialUplift.java b/h2o-core/src/main/java/hex/ModelMetricsBinomialUplift.java index 5031e53bff0a..a58ca933aa4f 100644 --- a/h2o-core/src/main/java/hex/ModelMetricsBinomialUplift.java +++ b/h2o-core/src/main/java/hex/ModelMetricsBinomialUplift.java @@ -4,7 +4,7 @@ import water.Scope; import water.exceptions.H2OIllegalArgumentException; import water.fvec.*; -import water.util.ArrayUtils; +import water.util.Log; import java.util.Arrays; @@ -70,35 +70,26 @@ public String toString() { protected StringBuilder appendToStringMetrics(StringBuilder sb) { return sb; } - - /** - * Build a Binomial ModelMetrics object from target-class probabilities, from actual labels, and a given domain for both labels (and domain[1] is the target class) - * @param targetClassProbs A Vec containing target class probabilities - * @param actualLabels A Vec containing the actual labels (can be for fewer labels than what's in domain, since the predictions can be for a small subset of the data) - * @return ModelMetrics object - */ - static public ModelMetricsBinomialUplift make(Vec targetClassProbs, Vec actualLabels, Vec treatment, AUUC.AUUCType auucType, int nbins) { - return make(targetClassProbs, actualLabels, treatment, actualLabels.domain(), auucType, nbins); - } /** - * Build a Binomial ModelMetrics object from target-class probabilities, from actual labels, and a given domain for both labels (and domain[1] is the target class) - * @param targetClassProbs A Vec containing target class probabilities + * Build a Binomial ModelMetrics object from predicted probabilities, from actual labels, and a given domain for both labels (and domain[1] is the target class) + * @param predictedProbs A Vec containing predicted probabilities * @param actualLabels A Vec 
containing the actual labels (can be for fewer labels than what's in domain, since the predictions can be for a small subset of the data) * @param treatment A Vec containing the treatment values * @param domain The two class labels (domain[0] is the non-target class, domain[1] is the target class, for which probabilities are given) * @param auucType Type of default AUUC - * @param auucNbins Number of bins to calculate AUUC (-1 means default value 1000, the number has to be higher than zero) + * @param auucNbins Number of bins to calculate AUUC (-1 means default value 1000, the number has to be higher than zero) + * @param customAuucThresholds custom threshold to calculate AUUC, if is not specified, the thresholds will be calculated from prediction vector * @return ModelMetrics object */ - static public ModelMetricsBinomialUplift make(Vec targetClassProbs, Vec actualLabels, Vec treatment, String[] domain, AUUC.AUUCType auucType, int auucNbins) { + static public ModelMetricsBinomialUplift make(Vec predictedProbs, Vec actualLabels, Vec treatment, String[] domain, AUUC.AUUCType auucType, int auucNbins) { Scope.enter(); try { Vec labels = actualLabels.toCategoricalVec(); if (domain == null) domain = labels.domain(); - if (labels == null || targetClassProbs == null || treatment == null) - throw new IllegalArgumentException("Missing actualLabels or predictedProbs or treatment values for uplift binomial metrics!"); - if (!targetClassProbs.isNumeric()) + if (labels == null || predictedProbs == null || treatment == null) + throw new IllegalArgumentException("Missing actualLabels or predicted probabilities or treatment values for uplift binomial metrics!"); + if (!predictedProbs.isNumeric()) throw new IllegalArgumentException("Predicted probabilities must be numeric per-class probabilities for uplift binomial metrics."); if (domain.length != 2) throw new IllegalArgumentException("Domain must have 2 class labels, but is " + Arrays.toString(domain) + " for uplift binomial 
metrics."); @@ -112,14 +103,13 @@ static public ModelMetricsBinomialUplift make(Vec targetClassProbs, Vec actualLa throw new IllegalArgumentException("The number of bins to calculate AUUC need to be -1 (default value) or higher than zero, but less than data size."); if(auucNbins == -1) auucNbins = AUUC.NBINS > dataSize ? (int) dataSize : AUUC.NBINS; - Frame fr = new Frame(targetClassProbs); + Frame fr = new Frame(predictedProbs); fr.add("labels", labels); fr.add("treatment", treatment); - MetricBuilderBinomialUplift mb = new UpliftBinomialMetrics(labels.domain(), AUUC.calculateQuantileThresholds(auucNbins, targetClassProbs)).doAll(fr)._mb; + MetricBuilderBinomialUplift mb; + mb = new UpliftBinomialMetrics(labels.domain(), AUUC.calculateQuantileThresholds(auucNbins, predictedProbs)).doAll(fr)._mb; labels.remove(); - Frame preds = new Frame(targetClassProbs); - ModelMetricsBinomialUplift mm = (ModelMetricsBinomialUplift) mb.makeModelMetrics(null, fr, preds, - fr.vec("labels"), fr.vec("treatment"), auucType, auucNbins); // use the Vecs from the frame (to make sure the ESPC is identical) + ModelMetricsBinomialUplift mm = (ModelMetricsBinomialUplift) mb.makeModelMetrics(null, fr, auucType); mm._description = "Computed on user-given predictions and labels."; return mm; } finally { @@ -127,7 +117,7 @@ static public ModelMetricsBinomialUplift make(Vec targetClassProbs, Vec actualLa } } - // helper to build a ModelMetricsBinomial for a N-class problem from a Frame that contains N per-class probability columns, and the actual label as the (N+1)-th column + // helper to build a ModelMetricsBinomialUplift from a Frame that contains prediction probability column and the actual label private static class UpliftBinomialMetrics extends MRTask { String[] domain; double[] thresholds; @@ -168,18 +158,14 @@ public MetricBuilderBinomialUplift( String[] domain, double[] thresholds) { _auuc = new AUUC.AUUCBuilder(thresholds); } } - - public MetricBuilderBinomialUplift( String[] domain) { 
- super(2,domain); - } - + @Override public double[] perRow(double[] ds, float[] yact, Model m) { return perRow(ds, yact,1, 0, m); } @Override public double[] perRow(double[] ds, float[] yact, double weight, double offset, Model m) { - assert _auuc == null || yact.length == 2 : "Treatment must be included in `yact` when calculating AUUC"; + assert yact.length == 2 : "Treatment must be included in `yact` when calculating AUUC"; if(Float .isNaN(yact[0])) return ds; // No errors if actual is missing if(weight == 0 || Double.isNaN(weight)) return ds; int y = (int)yact[0]; @@ -233,7 +219,7 @@ public double[] perRow(double[] ds, float[] yact, double weight, double offset, treatment = frameWithExtraColumns.vec(m._parms._treatment_column); } } - int auucNbins = m==null || m._parms._auuc_nbins == -1? + int auucNbins = m==null || m._parms._auuc_nbins == -1? AUUC.NBINS : m._parms._auuc_nbins; return makeModelMetrics(m, f, preds, resp, treatment, auucType, auucNbins); } @@ -243,17 +229,17 @@ private ModelMetrics makeModelMetrics(final Model m, final Frame f, final Frame AUUC auuc = null; if (preds != null) { if (resp != null) { - if (_auuc == null) { auuc = new AUUC(preds.vec(0), resp, treatment, auucType, nbins); - } else { - auuc = new AUUC(_auuc, auucType); - } } } return makeModelMetrics(m, f, auuc); } - private ModelMetrics makeModelMetrics(Model m, Frame f, AUUC auuc) { + private ModelMetrics makeModelMetrics(final Model m, final Frame f, AUUC.AUUCType auucType) { + return makeModelMetrics(m, f, new AUUC(_auuc, auucType)); + } + + public ModelMetrics makeModelMetrics(Model m, Frame f, AUUC auuc) { double sigma = Double.NaN; double ate = Double.NaN; double atc = Double.NaN; diff --git a/h2o-core/src/main/java/water/api/ModelMetricsHandler.java b/h2o-core/src/main/java/water/api/ModelMetricsHandler.java index 0fb8a0213020..4ce6f63d90f7 100644 --- a/h2o-core/src/main/java/water/api/ModelMetricsHandler.java +++ b/h2o-core/src/main/java/water/api/ModelMetricsHandler.java @@ 
-173,6 +173,7 @@ public static final class ModelMetricsListSchemaV3 extends RequestSchemaV3major.minor * format, where minor is a 2-digit number. For example "1.00", @@ -242,7 +246,7 @@ private void readAll(final boolean readModelMetadata) throws IOException { String[] columns = (String[]) _lkv.get("[columns]"); String[][] domains = parseModelDomains(columns.length); boolean isSupervised = readkv("supervised"); - _model = makeModel(columns, domains, isSupervised ? columns[columns.length - 1] : null); + _model = makeModel(columns, domains, isSupervised ? columns[columns.length - 1] : null, (String) readkv("treatment_column")); _model._uuid = readkv("uuid"); _model._algoName = readkv("algo"); _model._h2oVersion = readkv("h2o_version", "unknown"); diff --git a/h2o-genmodel/src/main/java/hex/genmodel/MojoModel.java b/h2o-genmodel/src/main/java/hex/genmodel/MojoModel.java index fb94d4cdb809..e58769912bb0 100644 --- a/h2o-genmodel/src/main/java/hex/genmodel/MojoModel.java +++ b/h2o-genmodel/src/main/java/hex/genmodel/MojoModel.java @@ -83,4 +83,8 @@ public static MojoModel load(MojoReaderBackend mojoReader) throws IOException { protected MojoModel(String[] columns, String[][] domains, String responseColumn) { super(columns, domains, responseColumn); } + + protected MojoModel(String[] columns, String[][] domains, String responseColumn, String treatmentColumn) { + super(columns, domains, responseColumn, treatmentColumn); + } } diff --git a/h2o-genmodel/src/main/java/hex/genmodel/algos/tree/SharedTreeMojoModel.java b/h2o-genmodel/src/main/java/hex/genmodel/algos/tree/SharedTreeMojoModel.java index 53e940ae91cb..e904d3275184 100644 --- a/h2o-genmodel/src/main/java/hex/genmodel/algos/tree/SharedTreeMojoModel.java +++ b/h2o-genmodel/src/main/java/hex/genmodel/algos/tree/SharedTreeMojoModel.java @@ -832,6 +832,10 @@ protected SharedTreeMojoModel(String[] columns, String[][] domains, String respo super(columns, domains, responseColumn); } + protected SharedTreeMojoModel(String[] 
columns, String[][] domains, String responseColumn, String treatmentColumn) { + super(columns, domains, responseColumn, treatmentColumn); + } + /** * Score all trees and fill in the `preds` array. */ diff --git a/h2o-genmodel/src/main/java/hex/genmodel/algos/upliftdrf/UpliftDrfMojoModel.java b/h2o-genmodel/src/main/java/hex/genmodel/algos/upliftdrf/UpliftDrfMojoModel.java new file mode 100644 index 000000000000..3ba78cb48859 --- /dev/null +++ b/h2o-genmodel/src/main/java/hex/genmodel/algos/upliftdrf/UpliftDrfMojoModel.java @@ -0,0 +1,47 @@ +package hex.genmodel.algos.upliftdrf; + +import hex.ModelCategory; +import hex.genmodel.algos.tree.SharedTreeMojoModel; + +public class UpliftDrfMojoModel extends SharedTreeMojoModel { + + protected double[] _thresholds; + + public UpliftDrfMojoModel(String[] columns, String[][] domains, String responseColumn, String treatmentColumn){ + super(columns, domains, responseColumn, treatmentColumn); + } + + @Override + public double[] unifyPreds(double[] row, double offset, double[] preds) { + assert _nclasses == 2; + preds[1] /= _ntree_groups; + preds[2] /= _ntree_groups; + preds[0] = preds[1] - preds[2]; + return preds; + } + + @Override + public double[] score0(double[] row, double[] preds) { + super.scoreAllTrees(row, preds); + return unifyPreds(row, 0, preds); + } + + @Override + public double getInitF() { + return 0; + } + + public double[] getThresholds() { + return _thresholds; + } + + @Override + public int getPredsSize() { + return 3; + } + + @Override + public int getPredsSize(ModelCategory mc) { + return getPredsSize(); + } +} diff --git a/h2o-genmodel/src/main/java/hex/genmodel/algos/upliftdrf/UpliftDrfMojoReader.java b/h2o-genmodel/src/main/java/hex/genmodel/algos/upliftdrf/UpliftDrfMojoReader.java new file mode 100644 index 000000000000..a8b3a877b540 --- /dev/null +++ b/h2o-genmodel/src/main/java/hex/genmodel/algos/upliftdrf/UpliftDrfMojoReader.java @@ -0,0 +1,35 @@ +package hex.genmodel.algos.upliftdrf; +import 
hex.genmodel.algos.tree.SharedTreeMojoReader; + +import java.io.IOException; + +/** + * MOJO reader for Uplift DRF models: in addition to what {@link SharedTreeMojoReader} reads, it loads the {@code treatment_column} and {@code thresholds} entries into the model. + */ +public class UpliftDrfMojoReader extends SharedTreeMojoReader { + + @Override + public String getModelName() { + return "Distributed Uplift Random Forest"; + } + + @Override + protected void readModelData() throws IOException { + super.readModelData(); + _model._treatmentColumn = readkv("treatment_column"); + _model._thresholds = readkv("thresholds"); + } + + @Override + protected UpliftDrfMojoModel makeModel(String[] columns, String[][] domains, String responseColumn) { + return null; + } + + @Override + protected UpliftDrfMojoModel makeModel(String[] columns, String[][] domains, String responseColumn, String treatmentColumn) { + return new UpliftDrfMojoModel(columns, domains, responseColumn, treatmentColumn); + } + + @Override public String mojoVersion() { + return "1.40"; + } +} diff --git a/h2o-genmodel/src/main/java/hex/genmodel/easy/EasyPredictModelWrapper.java b/h2o-genmodel/src/main/java/hex/genmodel/easy/EasyPredictModelWrapper.java index 9fb766b30db3..844a592b31b4 100644 --- a/h2o-genmodel/src/main/java/hex/genmodel/easy/EasyPredictModelWrapper.java +++ b/h2o-genmodel/src/main/java/hex/genmodel/easy/EasyPredictModelWrapper.java @@ -385,6 +385,8 @@ public AbstractPrediction predict(RowData data, ModelCategory mc) throws Predict return predictKLime(data); case CoxPH: return predictCoxPH(data); + case BinomialUplift: + return predictUpliftBinomial(data); case Unknown: throw new PredictException("Unknown model category"); default: @@ -679,6 +681,19 @@ public BinomialModelPrediction predictBinomial(RowData data, double offset) thro return p; } + /** + * Make a prediction on a new data point using Uplift Binomial model. + * @param data A new data point. Unknown or missing column name is treated as a NaN or ignored. Column names are case sensitive. + * @return The prediction.
+ * @throws PredictException + */ + public UpliftBinomialModelPrediction predictUpliftBinomial(RowData data) throws PredictException { + double[] preds = preamble(ModelCategory.BinomialUplift, data, 0); + UpliftBinomialModelPrediction p = new UpliftBinomialModelPrediction(); + p.predictions = preds; + return p; + } + /** * @deprecated Use {@link #predictTargetEncoding(RowData)} instead. */ diff --git a/h2o-genmodel/src/main/java/hex/genmodel/easy/prediction/UpliftBinomialModelPrediction.java b/h2o-genmodel/src/main/java/hex/genmodel/easy/prediction/UpliftBinomialModelPrediction.java new file mode 100644 index 000000000000..7968a5f2eb98 --- /dev/null +++ b/h2o-genmodel/src/main/java/hex/genmodel/easy/prediction/UpliftBinomialModelPrediction.java @@ -0,0 +1,5 @@ +package hex.genmodel.easy.prediction; + +public class UpliftBinomialModelPrediction extends AbstractPrediction { + public double[] predictions; +} diff --git a/h2o-py/h2o/h2o.py b/h2o-py/h2o/h2o.py index 1f7d6adabf0a..7af8d9dcb4bd 100644 --- a/h2o-py/h2o/h2o.py +++ b/h2o-py/h2o/h2o.py @@ -2048,6 +2048,7 @@ def make_metrics(predicted, actual, domain=None, distribution=None, weights=None params = {"domain": domain, "distribution": distribution} if weights is not None: params["weights_frame"] = weights.frame_id + params["auc_type"] = auc_type if treatment is not None: assert treatment.ncol == 1, "`treatment` frame should have exactly 1 column" params["treatment_frame"] = treatment.frame_id @@ -2056,7 +2057,6 @@ def make_metrics(predicted, actual, domain=None, distribution=None, weights=None params["auuc_type"] = auuc_type assert auuc_nbins == -1 or auuc_nbins > 0, "auuc_nbis should be -1 or higner than 0." 
params["auuc_nbins"] = auuc_nbins - params["auc_type"] = auc_type res = api("POST /3/ModelMetrics/predictions_frame/%s/actuals_frame/%s" % (predicted.frame_id, actual.frame_id), data=params) return res["model_metrics"] diff --git a/h2o-py/h2o/model/model_base.py b/h2o-py/h2o/model/model_base.py index fd7a1570e379..d1af1ae2cdfe 100644 --- a/h2o-py/h2o/model/model_base.py +++ b/h2o-py/h2o/model/model_base.py @@ -461,16 +461,15 @@ def training_model_metrics(self): return self._model_json["output"]["training_metrics"]._metric_json def model_performance(self, test_data=None, train=False, valid=False, xval=False, auc_type=None, - auuc_type=None, auuc_nbins=-1): + auuc_type=None): """ Generate model metrics for this model on ``test_data``. :param H2OFrame test_data: Data set for which model metrics shall be computed against. All three of train, valid and xval arguments are ignored if ``test_data`` is not ``None``. - :param bool train: Report the training metrics for the model. - :param bool valid: Report the validation metrics for the model. - :param bool xval: Report the cross-validation metrics for the model. If train and valid are ``True``, then it - defaults to True. + :param bool train: Report the training metrics for the model. Defaults false. + :param bool valid: Report the validation metrics for the model. Defaults false. + :param bool xval: Report the cross-validation metrics for the model. Defaults false. :param String auc_type: Change default AUC type for multinomial classification AUC/AUCPR calculation when ``test_data`` is not ``None``. One of: - ``"auto"`` @@ -482,15 +481,14 @@ def model_performance(self, test_data=None, train=False, valid=False, xval=False If type is ``"auto"`` or ``"none"``, AUC and AUCPR are not calculated. :param String auuc_type: Change default AUUC type for uplift binomial classification AUUC calculation - when ``test_data`` is not None. One of: - - - ``"AUTO"`` (default) + when ``test_data`` is not None. 
One of: + - ``"AUTO"`` - ``"qini"`` - ``"lift"`` - ``"gain"`` + - None (default) If type is ``"auto"`` ("qini"), AUUC is calculated. - :param int auuc_nbins: Number of bins for calculation AUUC. Defaults to ``-1``, which means 1000. :returns: An instance of :class:`~h2o.model.metrics_base.MetricsBase` or one of its subclass. """ @@ -521,8 +519,9 @@ def model_performance(self, test_data=None, train=False, valid=False, xval=False if (self._model_json["treatment_column_name"] is not None) and not(self._model_json["treatment_column_name"] in test_data.names): print("WARNING: Model metrics cannot be calculated and metric_json is empty due to the absence of the treatment column in your dataset.") return + res = h2o.api("POST /3/ModelMetrics/models/%s/frames/%s" % (self.model_id, test_data.frame_id), - data={"auuc_type": auuc_type, "auuc_nbins": auuc_nbins}) + data={"auuc_type": auuc_type}) else: res = h2o.api("POST /3/ModelMetrics/models/%s/frames/%s" % (self.model_id, test_data.frame_id)) # FIXME need to do the client-side filtering... 
(https://github.com/h2oai/h2o-3/issues/13862) diff --git a/h2o-py/h2o/model/models/uplift.py b/h2o-py/h2o/model/models/uplift.py index 621464cb663f..5b0f8b7891fa 100644 --- a/h2o-py/h2o/model/models/uplift.py +++ b/h2o-py/h2o/model/models/uplift.py @@ -368,6 +368,9 @@ def qini(self, train=False, valid=False): >>> uplift_model.qini(train=True) """ return self._delegate_to_metrics(method='qini', train=train, valid=valid) + + def default_auuc_thresholds(self): + return self._model_json['output']['default_auuc_thresholds'] def ate(self, train=False, valid=False): """ diff --git a/h2o-py/tests/testdir_misc/pyunit_make_metrics.py b/h2o-py/tests/testdir_misc/pyunit_make_metrics.py index 0a92cb4f0e8c..c839d4c11baf 100644 --- a/h2o-py/tests/testdir_misc/pyunit_make_metrics.py +++ b/h2o-py/tests/testdir_misc/pyunit_make_metrics.py @@ -189,6 +189,7 @@ def pyunit_make_metrics(weights_col=None): def pyunit_make_metrics_uplift(): + print("======= UPLIFT BINOMIAL ========") treatment_column = "treatment" response_column = "outcome" feature_cols = ["feature_"+str(x) for x in range(1,13)] @@ -197,41 +198,55 @@ def pyunit_make_metrics_uplift(): train[treatment_column] = train[treatment_column].asfactor() train[response_column] = train[response_column].asfactor() - test = h2o.import_file(pyunit_utils.locate("smalldata/uplift/upliftml_test.csv")) - test[treatment_column] = test[treatment_column].asfactor() - test[response_column] = test[response_column].asfactor() + valid = h2o.import_file(pyunit_utils.locate("smalldata/uplift/upliftml_test.csv")) + valid[treatment_column] = valid[treatment_column].asfactor() + valid[response_column] = valid[response_column].asfactor() nbins = 20 model = H2OUpliftRandomForestEstimator( treatment_column=treatment_column, seed=42, auuc_nbins=nbins, - score_each_iteration=True + score_each_iteration=True, + ntrees=3 ) - model.train(y=response_column, x=feature_cols, training_frame=train, validation_frame=test) + model.train(y=response_column, 
x=feature_cols, training_frame=train, validation_frame=valid) # test on validation data, train metrics are affected by sample rate m0 = model.model_performance(valid=True) - predicted = h2o.assign(model.predict(test)[0], "pred") - actual = test[response_column] - treatment = test[treatment_column] - m1 = model.model_performance(test_data=test, auuc_type="AUTO", auuc_nbins=nbins) + predicted = h2o.assign(model.predict(valid)[0], "pred") + actual = valid[response_column] + treatment = valid[treatment_column] + m1 = model.model_performance(test_data=valid, auuc_type="AUTO") m2 = h2o.make_metrics(predicted, actual, treatment=treatment, auuc_type="AUTO", auuc_nbins=nbins) - - err = 1e-5 - assert abs(m0.auuc() - m1.auuc()) < err - assert abs(m1.auuc() - m2.auuc()) < err + new_nbins = nbins - 10 + m3 = h2o.make_metrics(predicted, actual, treatment=treatment, auuc_type="AUTO", auuc_nbins=new_nbins) + + print("Model AUUC: {}".format(model.auuc())) + print("thresholds: {}".format(model.default_auuc_thresholds())) + print("Model performance AUUC: {}".format(m0.auuc())) + print("thresholds: {}".format(m0.thresholds())) + print("Model performance AUUC recalculate with data: {}".format(m1.auuc())) + print("thresholds: {}".format(m1.thresholds())) + print("Make AUUC: {}".format(m2.auuc())) + print("thresholds: {}".format(m2.thresholds())) + print("Make AUUC with new number of bins: {}".format(m3.auuc())) + print("thresholds: {}".format(m3.thresholds())) + + tol = 1e-5 + + # default model auuc is calculated from train data, default thresholds are from validation data + assert abs(model.auuc() - m0.auuc()) > tol + # model performance uses default thresholds, so AUUCs are same + assert abs(m0.auuc() - m1.auuc()) < tol + # make method calculates new thresholds but from the same data with same nbins so AUUCs are same + assert abs(m1.auuc() - m2.auuc()) < tol + # make method with the new auuc_nbins parameter calculates the new thresholds + assert abs(m2.auuc() - m3.auuc()) > tol + + 
print("===========================") - assert abs(m0.ate() - m1.ate()) < err - assert abs(m1.ate() - m2.ate()) < err - - assert abs(m0.att() - m1.att()) < err - assert abs(m1.att() - m2.att()) < err - - assert abs(m0.atc() - m1.atc()) < err - assert abs(m1.atc() - m2.atc()) < err - def suite_model_metrics(): diff --git a/h2o-r/h2o-package/R/models.R b/h2o-r/h2o-package/R/models.R index a8ebac661cd1..a19dd34fbf32 100755 --- a/h2o-r/h2o-package/R/models.R +++ b/h2o-r/h2o-package/R/models.R @@ -1021,6 +1021,7 @@ h2o.feature_frequencies <- feature_frequencies.H2OModel #' @param data (DEPRECATED) An H2OFrame. This argument is now called `newdata`. #' @param auc_type For multinomial model only. Set default multinomial AUC type. Must be one of: "AUTO", "NONE", "MACRO_OVR", "WEIGHTED_OVR", "MACRO_OVO", #' "WEIGHTED_OVO". Default is "NONE" +#' @param auuc_type For uplift binomial model only. Set default AUUC type. Must be one of: "AUTO", "GINI", "GAIN", "LIFT". Default is NULL. #' @return Returns an object of the \linkS4class{H2OModelMetrics} subclass.
#' @examples #' \dontrun{ @@ -1039,7 +1040,7 @@ h2o.feature_frequencies <- feature_frequencies.H2OModel #' h2o.performance(model = prostate_gbm_balanced, train = TRUE) #' } #' @export -h2o.performance <- function(model, newdata=NULL, train=FALSE, valid=FALSE, xval=FALSE, data=NULL, auc_type="NONE") { +h2o.performance <- function(model, newdata=NULL, train=FALSE, valid=FALSE, xval=FALSE, data=NULL, auc_type="NONE", auuc_type=NULL) { # data is now deprecated and the new arg name is newdata if (!is.null(data)) { @@ -1056,11 +1057,15 @@ h2o.performance <- function(model, newdata=NULL, train=FALSE, valid=FALSE, xval= if(!is.logical(xval) || length(xval) != 1L || is.na(xval)) stop("`xval` must be TRUE or FALSE") if(sum(valid, xval, train) > 1) stop("only one of `train`, `valid`, and `xval` can be TRUE") if(!(auc_type %in% c("AUTO", "NONE", "MACRO_OVR", "WEIGHTED_OVR", "MACRO_OVO", "WEIGHTED_OVO"))) stop("`auc_type` must be \"AUTO\", \"NONE\", \"MACRO_OVR\", \"WEIGHTED_OVR\", \"MACRO_OVO\", or \"WEIGHTED_OVO\".") + if(!is.null(auuc_type) && !(auuc_type %in% c("AUTO", "GINI", "LIFT", "GAIN"))) stop("`auuc_type` must be \"AUTO\", \"GINI\", \"LIFT\" or \"GAIN\"." 
) missingNewdata <- missing(newdata) || is.null(newdata) if( missingNewdata && auc_type != "NONE") { print("WARNING: The `auc_type` parameter is set but it is not used because the `newdata` parameter is NULL.") } + if( missingNewdata && !is.null(auuc_type)) { + print("WARNING: The `auuc_type` parameter is set but it is not used because the `newdata` parameter is NULL.") + } if( !missingNewdata ) { if (!is.null(model@parameters$y) && !(model@parameters$y %in% names(newdata))) { print("WARNING: Model metrics cannot be calculated and metric_json is empty due to the absence of the response column in your dataset.") @@ -1075,6 +1080,11 @@ h2o.performance <- function(model, newdata=NULL, train=FALSE, valid=FALSE, xval= } else if(!is.null(model@parameters$auc_type) && model@parameters$auc_type != "NONE"){ parms[["auc_type"]] <- model@parameters$auc_type } + if(!is.null(auuc_type)){ + parms[["auuc_type"]] <- auuc_type + } else if(!is.null(model@parameters$auuc_type) && !is.null(model@parameters$auuc_type)){ + parms[["auuc_type"]] <- model@parameters$auuc_type + } res <- .h2o.__remoteSend(method = "POST", .h2o.__MODEL_METRICS(model@model_id, newdata.id), .params = parms) #### diff --git a/h2o-r/tests/testdir_misc/runit_make_metrics_uplift_binomial.R b/h2o-r/tests/testdir_misc/runit_make_metrics_uplift_binomial.R index c56634b99833..07846a9a24fd 100644 --- a/h2o-r/tests/testdir_misc/runit_make_metrics_uplift_binomial.R +++ b/h2o-r/tests/testdir_misc/runit_make_metrics_uplift_binomial.R @@ -5,77 +5,114 @@ test.make_metrics_uplift_binomial <- function() { response <- "outcome" treatment <- "treatment" train <- h2o.importFile(locate("smalldata/uplift/upliftml_train.csv")) + valid <- h2o.importFile(locate("smalldata/uplift/upliftml_test.csv")) train$treatment <- as.factor(train$treatment) train$outcome <- as.factor(train$outcome) + valid$treatment <- as.factor(valid$treatment) + valid$outcome <- as.factor(valid$outcome) predictors <- sprintf("feature_%s",seq(0:11)) + nbins <- 20 
model <- h2o.upliftRandomForest(training_frame=train, + validation_frame=valid, x=predictors, y=response, - ntrees=5, - max_depth=5, treatment_column=treatment, - min_rows=10, - nbins=100, - seed=1234) + seed=42, + auuc_nbins=nbins, + score_each_iteration=TRUE, + ntrees=3) print(model) - pred <- h2o.assign(h2o.predict(model,train)[,1],"pred") - actual <- h2o.assign(train[,response],"act") - treat <- h2o.assign(train[,treatment],"treatment") - print(treat) + pred <- h2o.assign(h2o.predict(model,valid)[,1], "pred") + actual <- h2o.assign(valid[,response], "act") + treat <- h2o.assign(valid[,treatment], "treatment") - m0 <- h2o.make_metrics(pred, actual, treatment=treat) - print(m0) - m1 <- h2o.performance(model, train) + thresholds <- model@model$default_auuc_thresholds + + m0 <- h2o.performance(model, valid=TRUE) + thresholds0 <- m0@metrics$thresholds$thresholds + + m1 <- h2o.make_metrics(pred, actual, treatment=treat, auuc_nbins=nbins) + thresholds1 <- m1@metrics$thresholds$thresholds print(m1) + + m2 <- h2o.performance(model, valid) + thresholds2 <- m2@metrics$thresholds$thresholds + print(m2) + + tol <- 1e-10 + + # thresholds should be the same + expect_equal(thresholds, thresholds0, tolerance=tol) + expect_equal(thresholds0, thresholds1, tolerance=tol) + expect_equal(thresholds0, thresholds2, tolerance=tol) auuc0 <- h2o.auuc(m0) auuc1 <- h2o.auuc(m1) + auuc2 <- h2o.auuc(m2) + + expect_equal(auuc0, auuc1, tolerance=tol) + expect_equal(auuc0, auuc2, tolerance=tol) auuc_table0 <- h2o.auuc_table(m0) auuc_table1 <- h2o.auuc_table(m1) + auuc_table2 <- h2o.auuc_table(m2) expect_true(is.data.frame(auuc_table0)) expect_true(is.data.frame(auuc_table1)) + expect_true(is.data.frame(auuc_table2)) - expect_equal(auuc0, auuc1) - expect_equal(auuc_table0, auuc_table1) + expect_equal(auuc_table0, auuc_table1, tolerance=tol) + expect_equal(auuc_table0, auuc_table2, tolerance=tol) thr_table0 <- h2o.thresholds_and_metric_scores(m0) thr_table1 <- h2o.thresholds_and_metric_scores(m1) 
+ thr_table2 <- h2o.thresholds_and_metric_scores(m2) + + expect_equal(thr_table0, thr_table1, tolerance=tol) + expect_equal(thr_table0, thr_table2, tolerance=tol) - expect_equal(thr_table0, thr_table1) - qini0 <- h2o.qini(m0) qini1 <- h2o.qini(m1) + qini2 <- h2o.qini(m2) - expect_equal(qini0, qini1) + expect_equal(qini0, qini1, tolerance=tol) + expect_equal(qini0, qini2, tolerance=tol) aecu_table0 <- h2o.aecu_table(m0) aecu_table1 <- h2o.aecu_table(m1) + aecu_table2 <- h2o.aecu_table(m2) expect_true(is.data.frame(aecu_table0)) expect_true(is.data.frame(aecu_table1)) - - expect_equal(aecu_table0, aecu_table1) + expect_true(is.data.frame(aecu_table2)) + + expect_equal(aecu_table0, aecu_table1, tolerance=tol) + expect_equal(aecu_table0, aecu_table2, tolerance=tol) ate0 <- h2o.ate(m0) ate1 <- h2o.ate(m1) + ate2 <- h2o.ate(m2) - expect_equal(ate0, ate1) + expect_equal(ate0, ate1, tolerance=tol) + expect_equal(ate0, ate2, tolerance=tol) att0 <- h2o.att(m0) att1 <- h2o.att(m1) + att2 <- h2o.att(m2) - expect_equal(att0, att1) + expect_equal(att0, att1, tolerance=tol) + expect_equal(att0, att2, tolerance=tol) atc0 <- h2o.atc(m0) atc1 <- h2o.atc(m1) + atc2 <- h2o.atc(m2) - expect_equal(atc0, atc1) + expect_equal(atc0, atc1, tolerance=tol) + expect_equal(atc0, atc2, tolerance=tol) } doSuite("Check making uplift binomial model metrics.", makeSuite(