Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Luca/fix raw predict issue 635 #637

Merged
merged 8 commits into from
Feb 24, 2020
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ case class XGBoostBinaryClassificationModel(override val booster: Booster,

// Raw (margin) prediction for the binary classifier: the booster's margin m
// for the positive class, returned as the two-class vector (-m, m).
// NOTE(review): this span is a GitHub diff fragment — the `1 - m` line below
// is the REMOVED (buggy) version and the `- m` line is its replacement; only
// one of the two exists in the real file. (1 - m) is only valid for
// probabilities; for margins the negative-class raw score is -m.
def predictRaw(data: DMatrix): Vector = {
// head(0): single-row DMatrix, so take the first (only) row's first output.
val m = booster.predict(data, outPutMargin = true, treeLimit = treeLimit).head(0)
Vectors.dense(1 - m, m)
Vectors.dense(- m, m)
}

override def rawToProbabilityInPlace(raw: Vector): Vector = {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,10 @@ class XGBoostClassificationOp extends MleapOp[XGBoostClassification, XGBoostClas
(implicit context: BundleContext[MleapContext]): Model = {
val out = Files.newOutputStream(context.file("xgboost.model"))
obj.booster.saveModel(out)
model.withValue("num_features", Value.int(obj.numFeatures)).
withValue("num_classes", Value.int(obj.numClasses))
model
.withValue("num_features", Value.int(obj.numFeatures))
.withValue("num_classes", Value.int(obj.numClasses))
.withValue("tree_limit", Value.int(obj.treeLimit))
}

override def load(model: Model)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,10 @@ class XGBoostRegressionOp extends MleapOp[XGBoostRegression, XGBoostRegressionMo
(implicit context: BundleContext[MleapContext]): Model = {
val out = Files.newOutputStream(context.file("xgboost.model"))
obj.booster.saveModel(out)
model.withValue("num_features", Value.int(obj.numFeatures))

model
.withValue("num_features", Value.int(obj.numFeatures))
.withValue("tree_limit", Value.int(obj.treeLimit))
}

override def load(model: Model)
Expand Down
1,611 changes: 1,611 additions & 0 deletions mleap-xgboost-runtime/src/test/resources/datasources/agaricus.test

Large diffs are not rendered by default.

6,513 changes: 6,513 additions & 0 deletions mleap-xgboost-runtime/src/test/resources/datasources/agaricus.train

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
package ml.combust.mleap.xgboost.runtime

import ml.combust.mleap.core.types.{BasicType, NodeShape, ScalarType, StructField, TensorType}
import ml.combust.mleap.runtime.frame.{DefaultLeapFrame, Transformer}
import ml.combust.mleap.tensor.SparseTensor
import ml.combust.mleap.xgboost.runtime.testing.{BoosterUtils, BundleSerializationUtils, CachedDatasetUtils, FloatingPointApproximations}
import ml.dmlc.xgboost4j.scala.Booster
import org.apache.spark.ml.linalg.Vectors
import org.scalatest.FunSpec


class XGBoostClassificationModelParitySpec extends FunSpec
  with BoosterUtils
  with CachedDatasetUtils
  with BundleSerializationUtils
  with FloatingPointApproximations {

  /** Wraps a trained booster in an MLeap classification transformer exposing the
    * prediction, raw_prediction and probability output columns. treeLimit = 0
    * means "use all trees", matching the booster-side predict calls below.
    */
  def createBoosterClassifier(booster: Booster): Transformer = {
    XGBoostClassification(
      "xgboostSingleThread",
      NodeShape.probabilisticClassifier(
        rawPredictionCol = Some("raw_prediction"),
        probabilityCol = Some("probability")),
      XGBoostClassificationModel(XGBoostBinaryClassificationModel(booster, numFeatures, 0))
    )
  }

  /** Asserts, row by row over the test leap frame, that the MLeap transformer
    * reproduces the XGBoost4j booster's prediction, probability and raw
    * (margin) prediction for the same single-row input.
    */
  def equalityTestRowByRow(booster: Booster, mleapTransformer: Transformer) = {

    import XgbConverters._

    leapFrameLibSVMtest.dataset.foreach { r =>
      val mleapResult = mleapTransformer.transform(DefaultLeapFrame(mleapSchema.get, Seq(r))).get

      val mleapPredictionColIndex = mleapResult.schema.indexOf("prediction").get
      val mleapRawPredictionColIndex = mleapResult.schema.indexOf("raw_prediction").get
      val mleapProbabilityColIndex = mleapResult.schema.indexOf("probability").get

      // Column 1 of the row holds the features tensor; convert it to a DMatrix.
      val singleRowDMatrix = r(1).asInstanceOf[SparseTensor[Double]].asXGB

      // For binary:logistic the booster outputs P(class = 1); the two-class
      // probability vector is therefore (1 - p, p).
      val boosterResult = booster.predict(singleRowDMatrix, false, 0).head(0)
      val boosterProbability = Vectors.dense(1 - boosterResult, boosterResult).toDense
      val boosterPrediction = Math.round(boosterResult)

      assert(boosterPrediction == mleapResult.dataset.head.getDouble(mleapPredictionColIndex))

      assert(
        almostEqualSequences(
          Seq(boosterProbability.values),
          Seq(mleapResult.dataset.head.getTensor[Double](mleapProbabilityColIndex).toArray)
        )
      )

      // With outputMargin = true the booster returns the raw margin m; the
      // MLeap raw prediction for a binary model is the vector (-m, m).
      val boosterResultWithMargin = booster.predict(singleRowDMatrix, true, 0).head(0)
      val boosterRawPrediction = Vectors.dense(-boosterResultWithMargin, boosterResultWithMargin).toDense

      assert(almostEqualSequences(
        Seq(boosterRawPrediction.values),
        Seq(mleapResult.dataset.head.getTensor[Double](mleapRawPredictionColIndex).toArray)
      ))
    }
  }

  it("Results between the XGBoost4j booster and the MLeap Transformer are the same") {
    // Train ONE booster and use it on both sides of the comparison. The
    // previous version trained a second booster for the transformer, so the
    // parity check compared two distinct models and only passed by relying on
    // training determinism.
    val booster = trainBooster(xgboostParams, denseDataset)
    val xgboostTransformer = createBoosterClassifier(booster)

    val mleapBundle = serializeModelToMleapBundle(xgboostTransformer)
    val deserializedTransformer: Transformer = loadMleapTransformerFromBundle(mleapBundle)

    equalityTestRowByRow(booster, deserializedTransformer)
  }

  it("has the correct inputs and outputs with columns: prediction, probability and raw_prediction") {

    val booster = trainBooster(xgboostParams, denseDataset)
    val transformer = createBoosterClassifier(booster)

    assert(transformer.schema.fields ==
      Seq(StructField("features", TensorType(BasicType.Double, Seq(numFeatures))),
        StructField("raw_prediction", TensorType(BasicType.Double, Seq(2))),
        StructField("probability", TensorType(BasicType.Double, Seq(2))),
        StructField("prediction", ScalarType.Double.nonNullable)))
  }

  it("Results are the same pre and post serialization") {
    val booster = trainBooster(xgboostParams, denseDataset)
    val xgboostTransformer = createBoosterClassifier(booster)

    val preSerializationResult = xgboostTransformer.transform(leapFrameLibSVMtrain)

    val mleapBundle = serializeModelToMleapBundle(xgboostTransformer)

    val deserializedTransformer: Transformer = loadMleapTransformerFromBundle(mleapBundle)
    val deserializedModelResult = deserializedTransformer.transform(leapFrameLibSVMtrain).get

    assert(preSerializationResult.get.dataset == deserializedModelResult.dataset)
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
package ml.combust.mleap.xgboost.runtime

import ml.combust.mleap.core.types._
import ml.combust.mleap.runtime.frame.{DefaultLeapFrame, Transformer}
import ml.combust.mleap.tensor.SparseTensor
import ml.combust.mleap.xgboost.runtime.testing.{BoosterUtils, BundleSerializationUtils, CachedDatasetUtils, FloatingPointApproximations}
import ml.dmlc.xgboost4j.scala.Booster
import org.scalatest.FunSpec


class XGBoostRegressionModelParitySpec extends FunSpec
  with BoosterUtils
  with CachedDatasetUtils
  with BundleSerializationUtils
  with FloatingPointApproximations {

  /** Wraps a trained booster in an MLeap regression transformer.
    * treeLimit = 0 means "use all trees", matching the booster-side predicts.
    */
  def createBoosterRegressor(booster: Booster): Transformer = {
    XGBoostRegression(
      "xgboostSingleThread",
      NodeShape.regression(),
      XGBoostRegressionModel(booster, numFeatures, 0)
    )
  }

  /** Asserts, row by row over the test leap frame, that the MLeap transformer
    * reproduces the XGBoost4j booster's prediction for the same input row.
    */
  def equalityTestRowByRow(booster: Booster, mleapTransformer: Transformer) = {

    import XgbConverters._

    leapFrameLibSVMtest.dataset.foreach { r =>
      val mleapResult = mleapTransformer.transform(DefaultLeapFrame(mleapSchema.get, Seq(r))).get
      val mleapPredictionColIndex = mleapResult.schema.indexOf("prediction").get

      // Column 1 of the row holds the features tensor; convert it to a DMatrix.
      val singleRowDMatrix = r(1).asInstanceOf[SparseTensor[Double]].asXGB
      val boosterResult = booster.predict(singleRowDMatrix, false, 0).head(0)

      assert(boosterResult == mleapResult.dataset.head.getDouble(mleapPredictionColIndex))
    }
  }

  it("Results between the XGBoost4j booster and the MLeap Transformer are the same") {
    // Train ONE booster and use it on both sides of the comparison. The
    // previous version trained a second booster for the transformer, so the
    // parity check compared two distinct models and only passed by relying on
    // training determinism.
    val booster = trainBooster(xgboostParams, denseDataset)
    val xgboostTransformer = createBoosterRegressor(booster)

    val mleapBundle = serializeModelToMleapBundle(xgboostTransformer)
    val deserializedTransformer: Transformer = loadMleapTransformerFromBundle(mleapBundle)

    equalityTestRowByRow(booster, deserializedTransformer)
  }

  // Test name previously claimed "probability and raw_prediction" columns,
  // which a regression transformer does not produce; the assertion below only
  // ever checked features + prediction.
  it("has the correct inputs and outputs with columns: prediction") {

    val booster = trainBooster(xgboostParams, denseDataset)
    val transformer = createBoosterRegressor(booster)

    assert(transformer.schema.fields ==
      Seq(StructField("features", TensorType(BasicType.Double, Seq(numFeatures))),
        StructField("prediction", ScalarType.Double.nonNullable)))
  }

  it("Results are the same pre and post serialization") {
    val booster = trainBooster(xgboostParams, denseDataset)
    val xgboostTransformer = createBoosterRegressor(booster)

    val preSerializationResult = xgboostTransformer.transform(leapFrameLibSVMtrain)

    val mleapBundle = serializeModelToMleapBundle(xgboostTransformer)

    val deserializedTransformer: Transformer = loadMleapTransformerFromBundle(mleapBundle)
    val deserializedModelResult = deserializedTransformer.transform(leapFrameLibSVMtrain).get

    assert(preSerializationResult.get.dataset == deserializedModelResult.dataset)
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
package ml.combust.mleap.xgboost.runtime.testing

import ml.dmlc.xgboost4j.scala.{Booster, DMatrix, XGBoost}

trait BoosterUtils {

  /** Training parameters shared by the XGBoost parity specs.
    *
    * NOTE(review): XGBoost's native multiclass parameter is spelled
    * "num_class"; "num_classes" is presumably ignored by the library (and is
    * unnecessary for binary:logistic anyway) — confirm. "num_round" is not a
    * native booster parameter either: it is read back out of this map by
    * trainBooster below to set the number of boosting iterations.
    */
  final val xgboostParams: Map[String, Any] = Map(
    "eta" -> 0.3,
    "max_depth" -> 2,
    "objective" -> "binary:logistic",
    "num_round" -> 15,
    "num_classes" -> 2
  )

  /** Trains a booster on `dataset`, running the number of boosting rounds
    * given by the "num_round" entry of `xgboostParams` (throws if absent).
    */
  def trainBooster(xgboostParams: Map[String, Any], dataset: DMatrix): Booster = {
    val rounds = xgboostParams("num_round").asInstanceOf[Int]
    XGBoost.train(dataset, xgboostParams, rounds)
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
package ml.combust.mleap.xgboost.runtime.testing

import java.io.File

import ml.combust.bundle.BundleFile
import ml.combust.bundle.serializer.SerializationFormat
import ml.combust.mleap.runtime.{MleapContext, frame}
import ml.combust.mleap.runtime.frame.Transformer
import resource.managed

trait BundleSerializationUtils {

  /** Serializes an MLeap transformer to a JSON-format bundle zip under
    * /tmp/mleap/spark-parity and returns the bundle file. Any previous bundle
    * written for this concrete class is deleted first.
    */
  def serializeModelToMleapBundle(transformer: Transformer): File = {
    import java.nio.file.Files
    import ml.combust.mleap.runtime.MleapSupport._

    // createDirectories is a no-op when the path already exists and fails
    // loudly when creation is impossible, unlike the previously ignored
    // File.mkdirs() boolean result.
    Files.createDirectories(new File("/tmp/mleap/spark-parity").toPath)
    // getClass.getName keeps bundles from different spec classes separate.
    val file = new File(s"/tmp/mleap/spark-parity/${this.getClass.getName}.zip")
    file.delete()

    for (bf <- managed(BundleFile(file))) {
      // .get: fail the test immediately if serialization did not succeed.
      transformer.writeBundle.format(SerializationFormat.Json).save(bf).get
    }
    file
  }

  /** Loads an MLeap transformer back from a serialized bundle file,
    * propagating any deserialization failure via .get.
    */
  def loadMleapTransformerFromBundle(bundleFile: File)
                                    (implicit context: MleapContext): frame.Transformer = {
    import ml.combust.mleap.runtime.MleapSupport._

    (for (bf <- managed(BundleFile(bundleFile))) yield {
      bf.loadMleapBundle().get.root
    }).tried.get
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
package ml.combust.mleap.xgboost.runtime.testing

import ml.combust.mleap.core.types.{StructType, TensorType}
import ml.combust.mleap.core.util.VectorConverters
import ml.combust.mleap.runtime.frame.{ArrayRow, DefaultLeapFrame}
import ml.dmlc.xgboost4j.scala.DMatrix
import org.apache.spark.ml.linalg.SparseVector
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.mleap.TypeConverters

// Test fixture providing the agaricus train/test datasets both as native
// XGBoost DMatrix and as MLeap DefaultLeapFrame (loaded via Spark's libsvm
// reader), cached lazily so each spec pays the loading cost at most once.
trait CachedDatasetUtils {

private final val TrainDataFilePath = "datasources/agaricus.train"
private final val TestDataFilePath = "datasources/agaricus.test"
// Populated as a side effect of leapFrameFromLibSVMFile below; callers must
// force one of the lazy leap frames before reading it (None until then).
var mleapSchema: Option[StructType] = None

// Native XGBoost view of the training data, parsed directly from the libsvm file.
lazy val denseDataset: DMatrix =
new DMatrix(this.getClass.getClassLoader.getResource(TrainDataFilePath).getFile)

lazy val leapFrameLibSVMtrain: DefaultLeapFrame = leapFrameFromLibSVMFile(TrainDataFilePath)
lazy val leapFrameLibSVMtest: DefaultLeapFrame = leapFrameFromLibSVMFile(TestDataFilePath)
// Feature count derived from the "features" tensor column of the train frame.
lazy val numFeatures: Int =
leapFrameLibSVMtrain.schema.getField("features").get.dataType.asInstanceOf[TensorType].dimensions.get.head

// Loads a libsvm file into a DefaultLeapFrame and, as a side effect, records
// its MLeap schema in mleapSchema.
private def leapFrameFromLibSVMFile(filePath: String): DefaultLeapFrame = {

// Use Spark utils to load libsvm from disk
val spark = SparkSession.builder()
.master("local[2]")
.appName("XGBoostRuntimeClassificationModelParitySpec")
.getOrCreate()

// This is the dataset used by dmlc-XGBoost https://github.com/dmlc/xgboost/blob/master/demo/data/agaricus.txt.train
val dataFrame = spark.read.format("libsvm")
.load(this.getClass.getClassLoader.getResource(filePath).getFile)

mleapSchema = Option(TypeConverters.sparkSchemaToMleapSchema(dataFrame))

// Row layout: column 0 = label, column 1 = sparse feature vector converted
// to an MLeap tensor.
val mleapMatrix: Array[ArrayRow] = dataFrame.collect().map {
r => ArrayRow(
Seq(
r.get(0),
VectorConverters.sparkVectorToMleapTensor(r.get(1).asInstanceOf[SparseVector])
))
}

DefaultLeapFrame(mleapSchema.get, mleapMatrix)
}

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
package ml.combust.mleap.xgboost.runtime.testing


trait FloatingPointApproximations {

  /** Absolute tolerance used when no explicit precision is supplied. */
  val DefaultMinimumPrecision = 1e-7

  /** True when x and y differ by strictly less than `precision` (absolute
    * difference). Note this is not a relative/ULP comparison.
    */
  def almostEqual(x: Double, y: Double, precision: Double = DefaultMinimumPrecision): Boolean =
    (x - y).abs < precision

  /** Element-wise approximate equality of two sequences of double arrays.
    * Mirrors `zip` semantics (as the original did): extra elements in the
    * longer sequence or array are ignored, so inputs of different lengths can
    * still compare equal.
    */
  def almostEqualSequences(
    sequenceA: Seq[Array[Double]],
    SequenceB: Seq[Array[Double]],
    precision: Double = DefaultMinimumPrecision
  ): Boolean =
    // forall replaces the previous nonlocal `return` inside nested loops.
    (sequenceA zip SequenceB).forall { case (arrayA, arrayB) =>
      (arrayA zip arrayB).forall { case (a, b) => almostEqual(a, b, precision) }
    }
}
3 changes: 2 additions & 1 deletion project/Dependencies.scala
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ object Dependencies {
val akkaStreamTestKit = "com.typesafe.akka" %% "akka-stream-testkit" % akkaVersion % "test"
val junit = "junit" % "junit" % "4.12" % "test"
val junitInterface = "com.novocode" % "junit-interface" % "0.10" % "test"
val spark = Compile.spark.map(_ % "test")
}

object Provided {
Expand Down Expand Up @@ -114,7 +115,7 @@ object Dependencies {

val tensorflow = l ++= tensorflowDeps ++ Seq(Test.scalaTest)

val xgboostRuntime = l ++= Seq(xgboostDep) ++ Seq(Test.scalaTest)
val xgboostRuntime = l ++= Seq(xgboostDep) ++ Test.spark ++ Seq(Test.scalaTest)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

are you sure Provided.spark isn't enough? it seems like that for the other parity tests?

Copy link
Member Author

@lucagiovagnoli lucagiovagnoli Feb 12, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So I actually meant to get your input on these dependency changes.
Both Provided.spark or Test.spark achieve the same thing here because I am using spark things only in tests.

I think that putting those dependencies with a test scope makes the build faster, since the dependencies are only brought in when running tests, not when building. I read online:

  • test:

Dependencies with maven dependency scope test are not needed to build and run the project. They are needed to compile and run the unit tests.

  • provided:

Maven dependency scope provided is used during build and test the project.

so I assume a test scope is slightly lighter than provided? Again, the xgboost-runtime code has no dependencies on spark stuff, but its tests do.

Another option would be to avoid spark things in tests but I found spark has the best libsvm reader that lets me do
val dataFrame = spark.read.format("libsvm").load(this.getClass.getClassLoader.getResource(filePath).getFile)

and I can use the Dataframe -> to mleapFrame utils.

Let me know what you think!

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That makes sense, thank you!


val xgboostSpark = l ++= Seq(xgboostSparkDep) ++ Provided.spark

Expand Down
4 changes: 3 additions & 1 deletion project/MleapProject.scala
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,9 @@ object MleapProject {
lazy val xgboostRuntime = Project(
id = "mleap-xgboost-runtime",
base = file("mleap-xgboost-runtime"),
dependencies = Seq(runtime)
dependencies = Seq(
runtime,
sparkTestkit % "test")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

where do we use sparkTestkit?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If I don't bring that in, I cannot use:
import org.apache.spark.sql.mleap.TypeConverters

Do you think it is ok to use that in tests? (%"test" should not be included in the build)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think that comes from mleap-spark-base, but spark-testkit should be ok too.

)

lazy val xgboostSpark = Project(
Expand Down