Skip to content

Commit

Permalink
Merged version 2.2.2
Browse files Browse the repository at this point in the history
  • Loading branch information
vruusmann committed Apr 27, 2023
2 parents 3b32624 + 12ec394 commit 5d057b8
Show file tree
Hide file tree
Showing 34 changed files with 3,653 additions and 2,390 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/maven.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ name: maven

on:
push:
branches: [ '2.0.X', '2.1.X', '2.2.X', master ]
branches: [ '2.0.X', '2.1.X', '2.2.X', '2.3.X', master ]

jobs:
build:
Expand Down
7 changes: 4 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ Java library and command-line application for converting Apache Spark ML pipelin
* Trigonometric functions `sin`, `asin`, `sinh`, `cos`, `acos`, `cosh`, `tan`, `atan`, `tanh`.
* Aggregation functions `greatest` and `least`.
* RegExp functions `regexp_replace` and `rlike`.
* String functions `char_length`, `character_length`, `concat`, `lcase`, `length`, `lower`, `substring`, `trim`, `ucase` and `upper`.
* String functions `char_length`, `character_length`, `concat`, `lcase`, `length`, `lower`, `replace`, `substring`, `trim`, `ucase` and `upper`.
* Type cast functions `boolean`, `cast`, `double`, `int` and `string`.
* Value functions `in`, `isnan`, `isnull`, `isnotnull`, `negative` and `positive`.
* [`feature.StandardScalerModel`](https://spark.apache.org/docs/latest/api/java/org/apache/spark/ml/feature/StandardScalerModel.html) (the result of fitting a `feature.StandardScaler`)
Expand Down Expand Up @@ -129,7 +129,7 @@ Java library and command-line application for converting Apache Spark ML pipelin

# Prerequisites #

* Apache Spark 1.5.X, 1.6.X, 2.0.X, 2.1.X, 2.2.X, 2.3.X, 2.4.X, 3.0.X, 3.1.X, 3.2.X or 3.3.X.
* Apache Spark 3.0.X, 3.1.X, 3.2.X, 3.3.X or 3.4.X.

# Installation #

Expand All @@ -156,7 +156,8 @@ Active development branches:
| 3.0.X | [`2.0.X`](https://github.com/jpmml/jpmml-sparkml/tree/2.0.X) |
| 3.1.X | [`2.1.X`](https://github.com/jpmml/jpmml-sparkml/tree/2.1.X) |
| 3.2.X | [`2.2.X`](https://github.com/jpmml/jpmml-sparkml/tree/2.2.X) |
| 3.3.X | [`master`](https://github.com/jpmml/jpmml-sparkml/tree/master) |
| 3.3.X | [`2.3.X`](https://github.com/jpmml/jpmml-sparkml/tree/2.3.X) |
| 3.4.X | [`master`](https://github.com/jpmml/jpmml-sparkml/tree/master) |

Archived development branches:

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,16 @@
*/
package org.jpmml.sparkml.lightgbm.testing;

import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.function.Predicate;

import com.google.common.base.Equivalence;
import org.jpmml.converter.testing.OptionsUtil;
import org.jpmml.evaluator.ResultField;
import org.jpmml.evaluator.testing.PMMLEquivalence;
import org.jpmml.lightgbm.HasLightGBMOptions;
import org.jpmml.sparkml.testing.SparkMLEncoderBatch;
import org.jpmml.sparkml.testing.SparkMLEncoderBatchTest;
import org.junit.AfterClass;
Expand All @@ -39,7 +44,24 @@ public LightGBMTest(){
public SparkMLEncoderBatch createBatch(String algorithm, String dataset, Predicate<ResultField> columnFilter, Equivalence<Object> equivalence){
columnFilter = columnFilter.and(SparkMLEncoderBatchTest.excludePredictionFields());

return super.createBatch(algorithm, dataset, columnFilter, equivalence);
SparkMLEncoderBatch result = new SparkMLEncoderBatch(algorithm, dataset, columnFilter, equivalence){

@Override
public LightGBMTest getArchiveBatchTest(){
return LightGBMTest.this;
}

@Override
public List<Map<String, Object>> getOptionsMatrix(){
Map<String, Object> options = new LinkedHashMap<>();

options.put(HasLightGBMOptions.OPTION_COMPACT, new Boolean[]{false, true});

return OptionsUtil.generateOptionsMatrix(options);
}
};

return result;
}

@Test
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,12 @@
import java.util.function.Function;

import ml.dmlc.xgboost4j.scala.Booster;
import ml.dmlc.xgboost4j.scala.spark.params.GeneralParams;
import org.apache.spark.ml.Model;
import org.apache.spark.ml.param.shared.HasPredictionCol;
import org.dmg.pmml.DataType;
import org.dmg.pmml.Field;
import org.dmg.pmml.mining.MiningModel;
import org.jpmml.converter.BinaryFeature;
import org.jpmml.converter.ContinuousFeature;
import org.jpmml.converter.Feature;
import org.jpmml.converter.Schema;
Expand All @@ -43,7 +46,9 @@ private BoosterUtil(){
}

static
public <C extends ModelConverter<?> & HasXGBoostOptions> MiningModel encodeBooster(C converter, Booster booster, Schema schema){
public <M extends Model<M> & HasPredictionCol & GeneralParams, C extends ModelConverter<M> & HasSparkMLXGBoostOptions> MiningModel encodeBooster(C converter, Booster booster, Schema schema){
M model = converter.getModel();

byte[] bytes;

try {
Expand All @@ -60,30 +65,56 @@ public <C extends ModelConverter<?> & HasXGBoostOptions> MiningModel encodeBoost
throw new RuntimeException(ioe);
}

Function<Feature, Feature> function = new Function<Feature, Feature>(){
Boolean inputFloat = (Boolean)converter.getOption(HasSparkMLXGBoostOptions.OPTION_INPUT_FLOAT, null);
if((Boolean.TRUE).equals(inputFloat)){
Function<Feature, Feature> function = new Function<Feature, Feature>(){

@Override
public Feature apply(Feature feature){

@Override
public Feature apply(Feature feature){
if(feature instanceof ContinuousFeature){
ContinuousFeature continuousFeature = (ContinuousFeature)feature;

if(feature instanceof BinaryFeature){
BinaryFeature binaryFeature = (BinaryFeature)feature;
DataType dataType = continuousFeature.getDataType();
switch(dataType){
case INTEGER:
case FLOAT:
break;
case DOUBLE:
{
Field<?> field = continuousFeature.getField();

return binaryFeature;
} else
field.setDataType(DataType.FLOAT);

{
ContinuousFeature continuousFeature = feature.toContinuousFeature(DataType.FLOAT);
return new ContinuousFeature(continuousFeature.getEncoder(), field);
}
default:
break;
}
}

return continuousFeature;
return feature;
}
}
};
};

schema = schema.toTransformedSchema(function);
}

Float missing = model.getMissing();
if(missing.isNaN()){
missing = null;
}

Map<String, Object> options = new LinkedHashMap<>();
options.put(HasXGBoostOptions.OPTION_MISSING, converter.getOption(HasXGBoostOptions.OPTION_MISSING, missing));
options.put(HasXGBoostOptions.OPTION_COMPACT, converter.getOption(HasXGBoostOptions.OPTION_COMPACT, false));
options.put(HasXGBoostOptions.OPTION_NUMERIC, converter.getOption(HasXGBoostOptions.OPTION_NUMERIC, true));
options.put(HasXGBoostOptions.OPTION_PRUNE, converter.getOption(HasXGBoostOptions.OPTION_PRUNE, false));
options.put(HasXGBoostOptions.OPTION_NTREE_LIMIT, converter.getOption(HasXGBoostOptions.OPTION_NTREE_LIMIT, null));

Schema xgbSchema = schema.toTransformedSchema(function);
Boolean numeric = (Boolean)options.get(HasXGBoostOptions.OPTION_NUMERIC);

Schema xgbSchema = learner.toXGBoostSchema(numeric, schema);

return learner.encodeMiningModel(options, xgbSchema);
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
/*
* Copyright (c) 2023 Villu Ruusmann
*
* This file is part of JPMML-SparkML
*
* JPMML-SparkML is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* JPMML-SparkML is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with JPMML-SparkML. If not, see <http://www.gnu.org/licenses/>.
*/
package org.jpmml.sparkml.xgboost;

import org.jpmml.sparkml.HasSparkMLOptions;
import org.jpmml.xgboost.HasXGBoostOptions;

public interface HasSparkMLXGBoostOptions extends HasSparkMLOptions, HasXGBoostOptions {

String OPTION_INPUT_FLOAT = "input_float";
}
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,8 @@
import org.jpmml.converter.Schema;
import org.jpmml.converter.mining.MiningModelUtil;
import org.jpmml.sparkml.ProbabilisticClassificationModelConverter;
import org.jpmml.xgboost.HasXGBoostOptions;

public class XGBoostClassificationModelConverter extends ProbabilisticClassificationModelConverter<XGBoostClassificationModel> implements HasXGBoostOptions {
public class XGBoostClassificationModelConverter extends ProbabilisticClassificationModelConverter<XGBoostClassificationModel> implements HasSparkMLXGBoostOptions {

public XGBoostClassificationModelConverter(XGBoostClassificationModel model){
super(model);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,8 @@
import org.jpmml.sparkml.PredictionModelConverter;
import org.jpmml.sparkml.SparkMLEncoder;
import org.jpmml.sparkml.model.HasPredictionModelOptions;
import org.jpmml.xgboost.HasXGBoostOptions;

public class XGBoostRegressionModelConverter extends PredictionModelConverter<XGBoostRegressionModel> implements HasXGBoostOptions {
public class XGBoostRegressionModelConverter extends PredictionModelConverter<XGBoostRegressionModel> implements HasSparkMLXGBoostOptions {

public XGBoostRegressionModelConverter(XGBoostRegressionModel model){
super(model);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@
*/
package org.jpmml.sparkml.xgboost.testing;

import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.function.Predicate;

import com.google.common.base.Equivalence;
Expand All @@ -26,11 +29,14 @@
import org.dmg.pmml.Visitor;
import org.dmg.pmml.VisitorAction;
import org.jpmml.converter.testing.Fields;
import org.jpmml.converter.testing.OptionsUtil;
import org.jpmml.evaluator.ResultField;
import org.jpmml.evaluator.testing.FloatEquivalence;
import org.jpmml.model.visitors.AbstractVisitor;
import org.jpmml.sparkml.testing.SparkMLEncoderBatch;
import org.jpmml.sparkml.testing.SparkMLEncoderBatchTest;
import org.jpmml.sparkml.xgboost.HasSparkMLXGBoostOptions;
import org.jpmml.xgboost.HasXGBoostOptions;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Test;
Expand All @@ -52,6 +58,18 @@ public XGBoostTest getArchiveBatchTest(){
return XGBoostTest.this;
}

@Override
public List<Map<String, Object>> getOptionsMatrix(){
Map<String, Object> options = new LinkedHashMap<>();

options.put(HasSparkMLXGBoostOptions.OPTION_INPUT_FLOAT, new Boolean[]{false, true});

options.put(HasXGBoostOptions.OPTION_COMPACT, new Boolean[]{false, true});
options.put(HasXGBoostOptions.OPTION_PRUNE, false);

return OptionsUtil.generateOptionsMatrix(options);
}

@Override
public PMML getPMML() throws Exception {
PMML pmml = super.getPMML();
Expand Down Expand Up @@ -86,6 +104,11 @@ public void evaluateAuto() throws Exception {
evaluate("XGBoost", "Auto");
}

@Test
public void evaluateHousing() throws Exception {
evaluate("XGBoost", "Housing");
}

@Test
public void evaluateIris() throws Exception {
evaluate("XGBoost", "Iris", new FloatEquivalence(16));
Expand Down
5 changes: 3 additions & 2 deletions pmml-sparkml-xgboost/src/test/resources/XGBoostAuto.scala
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ import java.io.File
import ml.dmlc.xgboost4j.scala.spark.{TrackerConf, XGBoostRegressor}
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature._
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.types.{FloatType, StringType}
import org.jpmml.sparkml.{DatasetUtil, PipelineModelUtil}
import org.jpmml.sparkml.xgboost.SparseToDenseTransformer

Expand All @@ -22,7 +22,7 @@ val assembler = new VectorAssembler().setInputCols(ohe.getOutputCols ++ cont_col
val sparse2dense = new SparseToDenseTransformer().setInputCol(assembler.getOutputCol).setOutputCol("denseFeatureVec")

val trackerConf = TrackerConf(0, "scala")
val regressor = new XGBoostRegressor(Map("objective" -> "reg:squarederror", "num_round" -> 101, "num_workers" -> 1, "skip_clean_checkpoint" -> true, "tracker_conf" -> trackerConf)).setLabelCol("mpg").setFeaturesCol(sparse2dense.getOutputCol)
val regressor = new XGBoostRegressor(Map("objective" -> "reg:squarederror", "num_round" -> 101, "num_workers" -> 1, "tracker_conf" -> trackerConf)).setLabelCol("mpg").setFeaturesCol(sparse2dense.getOutputCol)

val pipeline = new Pipeline().setStages(Array(indexer, ohe, assembler, sparse2dense, regressor))
val pipelineModel = pipeline.fit(df)
Expand All @@ -31,5 +31,6 @@ PipelineModelUtil.storeZip(pipelineModel, new File("pipeline/XGBoostAuto.zip"))

var xgbDf = pipelineModel.transform(df)
xgbDf = xgbDf.selectExpr("prediction as mpg")
xgbDf = DatasetUtil.castColumn(xgbDf, "mpg", FloatType)

DatasetUtil.storeCsv(xgbDf, new File("csv/XGBoostAuto.csv"))
31 changes: 31 additions & 0 deletions pmml-sparkml-xgboost/src/test/resources/XGBoostHousing.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import java.io.File

import ml.dmlc.xgboost4j.scala.spark.{TrackerConf, XGBoostRegressor}
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature._
import org.apache.spark.sql.types.FloatType
import org.jpmml.sparkml.{DatasetUtil, PipelineModelUtil}

var df = DatasetUtil.loadCsv(spark, new File("csv/Housing.csv"))

DatasetUtil.storeSchema(df, new File("schema/Housing.json"))

val cat_cols = Array("CHAS", "RAD", "TAX")
val cont_cols = Array("CRIM", "ZN", "INDUS", "NOX", "RM", "AGE", "DIS", "PTRATIO", "B", "LSTAT")

val assembler = new VectorAssembler().setInputCols(cat_cols ++ cont_cols).setOutputCol("featureVector")
val indexer = new VectorIndexer().setInputCol(assembler.getOutputCol).setOutputCol("catFeatureVector")

val trackerConf = TrackerConf(0, "scala")
val regressor = new XGBoostRegressor(Map("objective" -> "reg:squarederror", "num_round" -> 101, "num_workers" -> 1, "tracker_conf" -> trackerConf)).setMissing(-1).setLabelCol("MEDV").setFeaturesCol(indexer.getOutputCol)

val pipeline = new Pipeline().setStages(Array(assembler, indexer, regressor))
val pipelineModel = pipeline.fit(df)

PipelineModelUtil.storeZip(pipelineModel, new File("pipeline/XGBoostHousing.zip"))

var xgbDf = pipelineModel.transform(df)
xgbDf = xgbDf.selectExpr("prediction as MEDV")
xgbDf = DatasetUtil.castColumn(xgbDf, "MEDV", FloatType)

DatasetUtil.storeCsv(xgbDf, new File("csv/XGBoostHousing.csv"))
7 changes: 6 additions & 1 deletion pmml-sparkml-xgboost/src/test/resources/XGBoostIris.scala
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,16 @@ import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature._
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.sql.functions.{lit, udf}
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.types.{DataType, StringType, StructType}
import org.jpmml.sparkml.{DatasetUtil, PipelineModelUtil}

var df = DatasetUtil.loadCsv(spark, new File("csv/Iris.csv"))

val schema = df.schema
val floatSchema = DataType.fromJson(schema.json.replaceAll("double", "float"))

df = DatasetUtil.castColumns(df, floatSchema.asInstanceOf[StructType])

DatasetUtil.storeSchema(df, new File("schema/Iris.json"))

val labelIndexer = new StringIndexer().setInputCol("Species").setOutputCol("idx_Species")
Expand Down
Loading

0 comments on commit 5d057b8

Please sign in to comment.