Merged version 2.0.3

jpmml · Dec 27, 2023 · 39847ee
2 parents 8636f12 + acafc4c
commit 39847ee
Show file tree

Hide file tree

Showing 53 changed files with 30,343 additions and 2,580 deletions.
diff --git a/.github/workflows/maven.yml b/.github/workflows/maven.yml
@@ -22,4 +22,4 @@ jobs:
         path: ~/.m2
         key: ${{ runner.os }}-m2-${{ hashFiles('**/pom.xml') }}
         restore-keys: ${{ runner.os }}-m2
-    - run: mvn -B package --file pom.xml
+    - run: mvn -Dxgboost4j-spark.version=2.0.1 -B package --file pom.xml
diff --git a/README.md b/README.md
@@ -109,6 +109,14 @@ Java library and command-line application for converting Apache Spark ML pipelin
     * [`tuning.TrainValidationSplitModel`](https://spark.apache.org/docs/latest/api/java/org/apache/spark/ml/tuning/TrainValidationSplitModel.html)
 </details>
 
+<details>
+  <summary>JPMML-SparkML</summary>
+
+  * Feature transformers:
+    * `org.jpmml.sparkml.feature.InvalidCategoryTransformer`
+    * `org.jpmml.sparkml.feature.SparseToDenseTransformer`
+</details>
+
 <details>
   <summary>LightGBM</summary>
 
@@ -120,8 +128,6 @@ Java library and command-line application for converting Apache Spark ML pipelin
 <details>
   <summary>XGBoost</summary>
 
-  * Feature transformers:
-    * `org.jpmml.sparkml.xgboost.SparseToDenseTransformer`
   * Prediction models:
     * [`ml.dmlc.xgboost4j.scala.spark.XGBoostClassificationModel`](https://xgboost.readthedocs.io/en/latest/jvm/scaladocs/xgboost4j-spark/ml/dmlc/xgboost4j/scala/spark/XGBoostClassificationModel.html)
     * [`ml.dmlc.xgboost4j.scala.spark.XGBoostRegressionModel`](https://xgboost.readthedocs.io/en/latest/jvm/scaladocs/xgboost4j-spark/ml/dmlc/xgboost4j/scala/spark/XGBoostRegressionModel.html)
@@ -247,6 +253,7 @@ spark-submit --master local --class org.jpmml.sparkml.example.Main pmml-sparkml-
 
 # Documentation #
 
+* [Training PySpark LightGBM pipelines](https://openscoring.io/blog/2023/05/26/pyspark_lightgbm_pipeline/)
 * [Converting logistic regression models to PMML documents](https://openscoring.io/blog/2020/01/19/converting_logistic_regression_pmml/#apache-spark)
 * [Deploying Apache Spark ML pipeline models on Openscoring REST web service](https://openscoring.io/blog/2020/02/16/deploying_sparkml_pipeline_openscoring_rest/)
 * [Converting Apache Spark ML pipeline models to PMML documents](https://openscoring.io/blog/2018/07/09/converting_sparkml_pipeline_pmml/)

diff --git a/pmml-sparkml-example/pom.xml b/pmml-sparkml-example/pom.xml
@@ -67,7 +67,7 @@
 			<plugin>
 				<groupId>org.apache.maven.plugins</groupId>
 				<artifactId>maven-shade-plugin</artifactId>
-				<version>3.4.1</version>
+				<version>3.5.1</version>
 				<executions>
 					<execution>
 						<phase>package</phase>

diff --git a/pmml-sparkml-example/src/main/java/org/jpmml/sparkml/example/Main.java b/pmml-sparkml-example/src/main/java/org/jpmml/sparkml/example/Main.java
@@ -37,6 +37,7 @@
 import org.apache.spark.sql.types.StructType;
 import org.dmg.pmml.PMML;
 import org.jpmml.model.metro.MetroJAXBUtil;
+import org.jpmml.sparkml.ArchiveUtil;
 import org.jpmml.sparkml.PMMLBuilder;
 import org.jpmml.sparkml.PipelineModelUtil;
 import org.jpmml.sparkml.model.HasPredictionModelOptions;
@@ -187,7 +188,7 @@ private void run() throws Exception {
 			logger.info("Loading pipeline model..");
 
 			if(this.pipelineInput.isFile()){
-				this.pipelineInput = PipelineModelUtil.uncompress(this.pipelineInput);
+				this.pipelineInput = ArchiveUtil.uncompress(this.pipelineInput);
 			}
 
 			long begin = System.currentTimeMillis();

diff --git a/pmml-sparkml-lightgbm/src/test/java/org/jpmml/sparkml/lightgbm/testing/LightGBMTest.java b/pmml-sparkml-lightgbm/src/test/java/org/jpmml/sparkml/lightgbm/testing/LightGBMTest.java
@@ -21,14 +21,15 @@
 import java.util.function.Predicate;
 
 import com.google.common.base.Equivalence;
+import org.jpmml.converter.testing.Datasets;
 import org.jpmml.evaluator.ResultField;
 import org.jpmml.evaluator.testing.IntegrationTest;
 import org.jpmml.evaluator.testing.PMMLEquivalence;
 import org.jpmml.evaluator.testing.SimpleArchiveBatch;
 import org.jpmml.sparkml.testing.SparkMLEncoderBatchTest;
 import org.junit.Test;
 
-public class LightGBMTest extends IntegrationTest {
+public class LightGBMTest extends IntegrationTest implements Datasets {
 
 	public LightGBMTest(){
 		super(new PMMLEquivalence(1e-14, 1e-14));
@@ -43,16 +44,26 @@ public SimpleArchiveBatch createBatch(String algorithm, String dataset, Predicat
 
 	@Test
 	public void evaluateLightGBMAudit() throws Exception {
-		evaluate("LightGBM", "Audit");
+		evaluate("LightGBM", AUDIT);
+	}
+
+	@Test
+	public void evaluateLightGBMAuditNA() throws Exception {
+		evaluate("LightGBM", AUDIT_NA);
 	}
 
 	@Test
 	public void evaluateLightGBMAuto() throws Exception {
-		evaluate("LightGBM", "Auto");
+		evaluate("LightGBM", AUTO);
+	}
+
+	@Test
+	public void evaluateLightGBMAutoNA() throws Exception {
+		evaluate("LightGBM", AUTO_NA);
 	}
 
 	@Test
 	public void evaluateLightGBMIris() throws Exception {
-		evaluate("LightGBM", "Iris");
+		evaluate("LightGBM", IRIS);
 	}
 }
diff --git a/pmml-sparkml-lightgbm/src/test/resources/LightGBMAuditNA.scala b/pmml-sparkml-lightgbm/src/test/resources/LightGBMAuditNA.scala
@@ -0,0 +1,54 @@
+// Fits a LightGBM binary classification pipeline on csv/AuditNA.csv and writes
+// pmml/LightGBMAuditNA.pmml and csv/LightGBMAuditNA.csv.
+// Relies on a `spark` session value supplied by the environment that runs this script.
+import java.io.File
+
+import com.microsoft.azure.synapse.ml.lightgbm.LightGBMClassifier
+import org.apache.spark.ml.Pipeline
+import org.apache.spark.ml.feature._
+import org.apache.spark.ml.linalg.Vector
+import org.apache.spark.sql.functions.{lit, udf}
+import org.apache.spark.sql.types.StringType
+import org.jpmml.sparkml.{DatasetUtil, PipelineModelUtil, PMMLBuilder}
+import org.jpmml.sparkml.feature.InvalidCategoryTransformer
+
+// The label column is cast to string before it is indexed by `labelIndexer` below.
+var df = DatasetUtil.loadCsv(spark, new File("csv/AuditNA.csv"))
+df = DatasetUtil.castColumn(df, "Adjusted", StringType)
+
+//DatasetUtil.storeSchema(df, new File("schema/AuditNA.json"))
+
+val cat_cols = Array("Education", "Employment", "Gender", "Marital", "Occupation")
+val cont_cols = Array("Age", "Hours", "Income")
+
+val labelIndexer = new StringIndexer().setInputCol("Adjusted").setOutputCol("idx_Adjusted")
+
+// Categorical columns are indexed with handleInvalid = "keep"; the indexed columns are then
+// passed through InvalidCategoryTransformer.
+// NOTE(review): presumably this turns the extra "invalid" index into a missing value - confirm.
+val indexer = new StringIndexer().setInputCols(cat_cols).setOutputCols(cat_cols.map(cat_col => "idx_" + cat_col)).setHandleInvalid("keep")
+val indexTransformer = new InvalidCategoryTransformer().setInputCols(indexer.getOutputCols).setOutputCols(cat_cols.map(cat_col => "idxTransformed_" + cat_col))
+
+val assembler = new VectorAssembler().setInputCols(indexTransformer.getOutputCols ++ cont_cols).setOutputCol("featureVector").setHandleInvalid("keep")
+
+val classifier = new LightGBMClassifier().setObjective("binary").setNumIterations(101).setLabelCol(labelIndexer.getOutputCol).setFeaturesCol(assembler.getOutputCol)
+
+val pipeline = new Pipeline().setStages(Array(labelIndexer, indexer, indexTransformer, assembler, classifier))
+val pipelineModel = pipeline.fit(df)
+
+//PipelineModelUtil.storeZip(pipelineModel, new File("pipeline/LightGBMAuditNA.zip"))
+
+new PMMLBuilder(df.schema, pipelineModel).buildFile(new File("pmml/LightGBMAuditNA.pmml"))
+
+// Renders the numeric prediction as an integer string (e.g. 1.0 -> "1").
+val predLabel = udf{ (value: Float) => value.toInt.toString }
+// Extracts a single element of a vector column by position.
+val vectorToColumn = udf{ (vec: Vector, index: Int) => vec(index) }
+
+// Keep only the model outputs, rename them to the CSV column names, and drop the originals.
+var lgbDf = pipelineModel.transform(df)
+lgbDf = lgbDf.selectExpr("prediction", "probability")
+lgbDf = lgbDf.withColumn("Adjusted", predLabel(lgbDf("prediction"))).drop("prediction")
+lgbDf = lgbDf.withColumn("probability(0)", vectorToColumn(lgbDf("probability"), lit(0))).withColumn("probability(1)", vectorToColumn(lgbDf("probability"), lit(1))).drop("probability")
+
+DatasetUtil.storeCsv(lgbDf, new File("csv/LightGBMAuditNA.csv"))
diff --git a/pmml-sparkml-lightgbm/src/test/resources/LightGBMAutoNA.scala b/pmml-sparkml-lightgbm/src/test/resources/LightGBMAutoNA.scala
@@ -0,0 +1,39 @@
+// Fits a LightGBM regression pipeline on csv/AutoNA.csv and writes
+// pmml/LightGBMAutoNA.pmml and csv/LightGBMAutoNA.csv.
+// Relies on a `spark` session value supplied by the environment that runs this script.
+import java.io.File
+
+import com.microsoft.azure.synapse.ml.lightgbm.LightGBMRegressor
+import org.apache.spark.ml.Pipeline
+import org.apache.spark.ml.feature._
+import org.jpmml.sparkml.{DatasetUtil, PipelineModelUtil, PMMLBuilder}
+import org.jpmml.sparkml.feature.InvalidCategoryTransformer
+
+val df = DatasetUtil.loadCsv(spark, new File("csv/AutoNA.csv"))
+
+//DatasetUtil.storeSchema(df, new File("schema/AutoNA.json"))
+
+val cat_cols = Array("cylinders", "model_year", "origin")
+val cont_cols = Array("acceleration", "displacement", "horsepower", "weight")
+
+// Categorical columns are indexed with handleInvalid = "keep"; the indexed columns are then
+// passed through InvalidCategoryTransformer.
+// NOTE(review): presumably this turns the extra "invalid" index into a missing value - confirm.
+val indexer = new StringIndexer().setInputCols(cat_cols).setOutputCols(cat_cols.map(cat_col => "idx_" + cat_col)).setHandleInvalid("keep")
+val indexTransformer = new InvalidCategoryTransformer().setInputCols(indexer.getOutputCols).setOutputCols(cat_cols.map(cat_col => "idxTransformed_" + cat_col))
+
+val assembler = new VectorAssembler().setInputCols(indexTransformer.getOutputCols ++ cont_cols).setOutputCol("featureVector").setHandleInvalid("keep")
+
+val regressor = new LightGBMRegressor().setNumIterations(101).setLabelCol("mpg").setFeaturesCol(assembler.getOutputCol)
+
+val pipeline = new Pipeline().setStages(Array(indexer, indexTransformer, assembler, regressor))
+val pipelineModel = pipeline.fit(df)
+
+//PipelineModelUtil.storeZip(pipelineModel, new File("pipeline/LightGBMAutoNA.zip"))
+
+new PMMLBuilder(df.schema, pipelineModel).buildFile(new File("pmml/LightGBMAutoNA.pmml"))
+
+// Keep only the prediction, exposed under the label column name "mpg".
+val lgbDf = pipelineModel.transform(df).selectExpr("prediction as mpg")
+
+DatasetUtil.storeCsv(lgbDf, new File("csv/LightGBMAutoNA.csv"))