Skip to content

Commit

Permalink
Merged version 2.0.3
Browse files Browse the repository at this point in the history
  • Loading branch information
vruusmann committed Dec 27, 2023
2 parents 8636f12 + acafc4c commit 39847ee
Show file tree
Hide file tree
Showing 53 changed files with 30,343 additions and 2,580 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/maven.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,4 @@ jobs:
path: ~/.m2
key: ${{ runner.os }}-m2-${{ hashFiles('**/pom.xml') }}
restore-keys: ${{ runner.os }}-m2
- run: mvn -B package --file pom.xml
- run: mvn -Dxgboost4j-spark.version=2.0.1 -B package --file pom.xml
11 changes: 9 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,14 @@ Java library and command-line application for converting Apache Spark ML pipelin
* [`tuning.TrainValidationSplitModel`](https://spark.apache.org/docs/latest/api/java/org/apache/spark/ml/tuning/TrainValidationSplitModel.html)
</details>

<details>
<summary>JPMML-SparkML</summary>

* Feature transformers:
* `org.jpmml.sparkml.feature.InvalidCategoryTransformer`
* `org.jpmml.sparkml.feature.SparseToDenseTransformer`
</details>

<details>
<summary>LightGBM</summary>

Expand All @@ -120,8 +128,6 @@ Java library and command-line application for converting Apache Spark ML pipelin
<details>
<summary>XGBoost</summary>

* Feature transformers:
* `org.jpmml.sparkml.xgboost.SparseToDenseTransformer`
* Prediction models:
* [`ml.dmlc.xgboost4j.scala.spark.XGBoostClassificationModel`](https://xgboost.readthedocs.io/en/latest/jvm/scaladocs/xgboost4j-spark/ml/dmlc/xgboost4j/scala/spark/XGBoostClassificationModel.html)
* [`ml.dmlc.xgboost4j.scala.spark.XGBoostRegressionModel`](https://xgboost.readthedocs.io/en/latest/jvm/scaladocs/xgboost4j-spark/ml/dmlc/xgboost4j/scala/spark/XGBoostRegressionModel.html)
Expand Down Expand Up @@ -247,6 +253,7 @@ spark-submit --master local --class org.jpmml.sparkml.example.Main pmml-sparkml-

# Documentation #

* [Training PySpark LightGBM pipelines](https://openscoring.io/blog/2023/05/26/pyspark_lightgbm_pipeline/)
* [Converting logistic regression models to PMML documents](https://openscoring.io/blog/2020/01/19/converting_logistic_regression_pmml/#apache-spark)
* [Deploying Apache Spark ML pipeline models on Openscoring REST web service](https://openscoring.io/blog/2020/02/16/deploying_sparkml_pipeline_openscoring_rest/)
* [Converting Apache Spark ML pipeline models to PMML documents](https://openscoring.io/blog/2018/07/09/converting_sparkml_pipeline_pmml/)
Expand Down
2 changes: 1 addition & 1 deletion pmml-sparkml-example/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<version>3.4.1</version>
<version>3.5.1</version>
<executions>
<execution>
<phase>package</phase>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
import org.apache.spark.sql.types.StructType;
import org.dmg.pmml.PMML;
import org.jpmml.model.metro.MetroJAXBUtil;
import org.jpmml.sparkml.ArchiveUtil;
import org.jpmml.sparkml.PMMLBuilder;
import org.jpmml.sparkml.PipelineModelUtil;
import org.jpmml.sparkml.model.HasPredictionModelOptions;
Expand Down Expand Up @@ -187,7 +188,7 @@ private void run() throws Exception {
logger.info("Loading pipeline model..");

if(this.pipelineInput.isFile()){
this.pipelineInput = PipelineModelUtil.uncompress(this.pipelineInput);
this.pipelineInput = ArchiveUtil.uncompress(this.pipelineInput);
}

long begin = System.currentTimeMillis();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,15 @@
import java.util.function.Predicate;

import com.google.common.base.Equivalence;
import org.jpmml.converter.testing.Datasets;
import org.jpmml.evaluator.ResultField;
import org.jpmml.evaluator.testing.IntegrationTest;
import org.jpmml.evaluator.testing.PMMLEquivalence;
import org.jpmml.evaluator.testing.SimpleArchiveBatch;
import org.jpmml.sparkml.testing.SparkMLEncoderBatchTest;
import org.junit.Test;

public class LightGBMTest extends IntegrationTest {
public class LightGBMTest extends IntegrationTest implements Datasets {

public LightGBMTest(){
super(new PMMLEquivalence(1e-14, 1e-14));
Expand All @@ -43,16 +44,26 @@ public SimpleArchiveBatch createBatch(String algorithm, String dataset, Predicat

@Test
public void evaluateLightGBMAudit() throws Exception {
evaluate("LightGBM", "Audit");
evaluate("LightGBM", AUDIT);
}

@Test
public void evaluateLightGBMAuditNA() throws Exception {
evaluate("LightGBM", AUDIT_NA);
}

@Test
public void evaluateLightGBMAuto() throws Exception {
evaluate("LightGBM", "Auto");
evaluate("LightGBM", AUTO);
}

@Test
public void evaluateLightGBMAutoNA() throws Exception {
evaluate("LightGBM", AUTO_NA);
}

@Test
public void evaluateLightGBMIris() throws Exception {
evaluate("LightGBM", "Iris");
evaluate("LightGBM", IRIS);
}
}
44 changes: 44 additions & 0 deletions pmml-sparkml-lightgbm/src/test/resources/LightGBMAuditNA.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import java.io.File

import com.microsoft.azure.synapse.ml.lightgbm.LightGBMClassifier
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature._
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.sql.functions.{lit, udf}
import org.apache.spark.sql.types.StringType
import org.jpmml.sparkml.{DatasetUtil, PipelineModelUtil, PMMLBuilder}
import org.jpmml.sparkml.feature.InvalidCategoryTransformer

// Fits a LightGBM binary classification pipeline on the AuditNA dataset, exports it as a
// PMML document, and stores the pipeline's own predictions as a CSV file.
// NOTE(review): `spark` is not defined in this script; presumably it is the session that
// spark-shell provides - confirm how the script is launched.
// Multi-line call chains are wrapped in parentheses so that every statement stays
// syntactically open until its last line.
val rawDf = DatasetUtil.loadCsv(spark, new File("csv/AuditNA.csv"))
// The label column is cast to string before it is handed to the label StringIndexer.
val df = DatasetUtil.castColumn(rawDf, "Adjusted", StringType)

// Uncomment to also write the dataset schema.
//DatasetUtil.storeSchema(df, new File("schema/AuditNA.json"))

val cat_cols = Array("Education", "Employment", "Gender", "Marital", "Occupation")
val cont_cols = Array("Age", "Hours", "Income")

val labelIndexer = new StringIndexer().setInputCol("Adjusted").setOutputCol("idx_Adjusted")

// Categorical inputs are indexed into "idx_*" columns, which InvalidCategoryTransformer
// then maps to "idxTransformed_*" columns.
val indexer = (new StringIndexer()
  .setInputCols(cat_cols)
  .setOutputCols(cat_cols.map(cat_col => "idx_" + cat_col))
  .setHandleInvalid("keep"))
val indexTransformer = (new InvalidCategoryTransformer()
  .setInputCols(indexer.getOutputCols)
  .setOutputCols(cat_cols.map(cat_col => "idxTransformed_" + cat_col)))

// Feature vector layout: transformed categorical columns first, continuous columns last.
val assembler = (new VectorAssembler()
  .setInputCols(indexTransformer.getOutputCols ++ cont_cols)
  .setOutputCol("featureVector")
  .setHandleInvalid("keep"))

val classifier = (new LightGBMClassifier()
  .setObjective("binary")
  .setNumIterations(101)
  .setLabelCol(labelIndexer.getOutputCol)
  .setFeaturesCol(assembler.getOutputCol))

val pipeline = new Pipeline().setStages(Array(labelIndexer, indexer, indexTransformer, assembler, classifier))
val pipelineModel = pipeline.fit(df)

// Uncomment to also store the fitted pipeline as a ZIP archive.
//PipelineModelUtil.storeZip(pipelineModel, new File("pipeline/LightGBMAuditNA.zip"))

new PMMLBuilder(df.schema, pipelineModel).buildFile(new File("pmml/LightGBMAuditNA.pmml"))

// Renders a numeric prediction as an integer-valued string (e.g. 1.0 -> "1").
// NOTE(review): this presumes the predicted label index coincides with the original
// "Adjusted" value - confirm against the fitted label indexer.
val predLabel = udf{ (value: Float) => value.toInt.toString }
// Extracts the element at the given position of a vector column.
val vectorToColumn = udf{ (vec: Vector, index: Int) => vec(index) }

// Keep only the model outputs, then reshape them into the columns of the result CSV:
// "Adjusted", "probability(0)" and "probability(1)".
val scoredDf = pipelineModel.transform(df).selectExpr("prediction", "probability")
val labelledDf = scoredDf.withColumn("Adjusted", predLabel(scoredDf("prediction"))).drop("prediction")
// The vector column is dropped exactly once; the original dropped it twice, and the
// second drop was a no-op.
val lgbDf = (labelledDf
  .withColumn("probability(0)", vectorToColumn(labelledDf("probability"), lit(0)))
  .withColumn("probability(1)", vectorToColumn(labelledDf("probability"), lit(1)))
  .drop("probability"))

DatasetUtil.storeCsv(lgbDf, new File("csv/LightGBMAuditNA.csv"))
33 changes: 33 additions & 0 deletions pmml-sparkml-lightgbm/src/test/resources/LightGBMAutoNA.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import java.io.File

import com.microsoft.azure.synapse.ml.lightgbm.LightGBMRegressor
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature._
import org.jpmml.sparkml.{DatasetUtil, PipelineModelUtil, PMMLBuilder}
import org.jpmml.sparkml.feature.InvalidCategoryTransformer

// Fits a LightGBM regression pipeline on the AutoNA dataset, exports it as a PMML
// document, and stores the pipeline's own predictions as a CSV file.
// NOTE(review): `spark` is not defined in this script; presumably it is the session that
// spark-shell provides - confirm how the script is launched.
// Multi-line call chains are wrapped in parentheses so that every statement stays
// syntactically open until its last line.
val df = DatasetUtil.loadCsv(spark, new File("csv/AutoNA.csv"))

// Uncomment to also write the dataset schema.
//DatasetUtil.storeSchema(df, new File("schema/AutoNA.json"))

val categoricalCols = Array("cylinders", "model_year", "origin")
val continuousCols = Array("acceleration", "displacement", "horsepower", "weight")

// Derived column names: "idx_*" for the indexer outputs and "idxTransformed_*" for the
// outputs of the follow-up InvalidCategoryTransformer.
val indexedCols = categoricalCols.map(name => s"idx_$name")
val transformedCols = categoricalCols.map(name => s"idxTransformed_$name")

val indexer = (new StringIndexer()
  .setInputCols(categoricalCols)
  .setOutputCols(indexedCols)
  .setHandleInvalid("keep"))
val indexTransformer = (new InvalidCategoryTransformer()
  .setInputCols(indexer.getOutputCols)
  .setOutputCols(transformedCols))

// Feature vector layout: transformed categorical columns first, continuous columns last.
val assembler = (new VectorAssembler()
  .setInputCols(indexTransformer.getOutputCols ++ continuousCols)
  .setOutputCol("featureVector")
  .setHandleInvalid("keep"))

val regressor = (new LightGBMRegressor()
  .setNumIterations(101)
  .setLabelCol("mpg")
  .setFeaturesCol(assembler.getOutputCol))

val pipeline = new Pipeline().setStages(Array(indexer, indexTransformer, assembler, regressor))
val pipelineModel = pipeline.fit(df)

// Uncomment to also store the fitted pipeline as a ZIP archive.
//PipelineModelUtil.storeZip(pipelineModel, new File("pipeline/LightGBMAutoNA.zip"))

new PMMLBuilder(df.schema, pipelineModel).buildFile(new File("pmml/LightGBMAutoNA.pmml"))

// Only the prediction is kept, renamed to the label column name "mpg".
val predictionDf = pipelineModel.transform(df).selectExpr("prediction as mpg")

DatasetUtil.storeCsv(predictionDf, new File("csv/LightGBMAutoNA.csv"))
Loading

0 comments on commit 39847ee

Please sign in to comment.