Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Luca/fix raw predict issue 635 #637

Merged
merged 8 commits into from
Feb 24, 2020
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ case class XGBoostBinaryClassificationModel(override val booster: Booster,

// Raw (margin) prediction for the binary classifier: the booster's margin m
// for the positive class, returned as the two-class vector (-m, m).
// NOTE(review): this span is a GitHub diff fragment — the `1 - m` line below
// is the REMOVED (buggy) version and the `- m` line is its replacement; only
// one of the two exists in the real file. (1 - m) is only valid for
// probabilities; for margins the negative-class raw score is -m.
def predictRaw(data: DMatrix): Vector = {
// head(0): single-row DMatrix, so take the first (only) row's first output.
val m = booster.predict(data, outPutMargin = true, treeLimit = treeLimit).head(0)
Vectors.dense(1 - m, m)
Vectors.dense(- m, m)
}

override def rawToProbabilityInPlace(raw: Vector): Vector = {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,10 @@ class XGBoostClassificationOp extends MleapOp[XGBoostClassification, XGBoostClas
(implicit context: BundleContext[MleapContext]): Model = {
val out = Files.newOutputStream(context.file("xgboost.model"))
obj.booster.saveModel(out)
model.withValue("num_features", Value.int(obj.numFeatures)).
withValue("num_classes", Value.int(obj.numClasses))
model
.withValue("num_features", Value.int(obj.numFeatures))
.withValue("num_classes", Value.int(obj.numClasses))
.withValue("tree_limit", Value.int(obj.treeLimit))
}

override def load(model: Model)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,10 @@ class XGBoostRegressionOp extends MleapOp[XGBoostRegression, XGBoostRegressionMo
(implicit context: BundleContext[MleapContext]): Model = {
val out = Files.newOutputStream(context.file("xgboost.model"))
obj.booster.saveModel(out)
model.withValue("num_features", Value.int(obj.numFeatures))

model
.withValue("num_features", Value.int(obj.numFeatures))
.withValue("tree_limit", Value.int(obj.treeLimit))
}

override def load(model: Model)
Expand Down
1,611 changes: 1,611 additions & 0 deletions mleap-xgboost-runtime/src/test/resources/datasources/agaricus.test

Large diffs are not rendered by default.

6,513 changes: 6,513 additions & 0 deletions mleap-xgboost-runtime/src/test/resources/datasources/agaricus.train

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
package ml.combust.mleap.xgboost.runtime

import ml.combust.mleap.core.types.{BasicType, NodeShape, ScalarType, StructField, TensorType}
import ml.combust.mleap.runtime.frame.{DefaultLeapFrame, Transformer}
import ml.combust.mleap.tensor.SparseTensor
import ml.combust.mleap.xgboost.runtime.testing.{BoosterUtils, BundleSerializationUtils, CachedDatasetUtils, FloatingPointApproximations}
import ml.dmlc.xgboost4j.scala.Booster
import org.apache.spark.ml.linalg.Vectors
import org.scalatest.FunSpec


class XGBoostClassificationModelParitySpec extends FunSpec
  with BoosterUtils
  with CachedDatasetUtils
  with BundleSerializationUtils
  with FloatingPointApproximations {

  /** Wraps a trained booster in an MLeap classification transformer exposing the
    * prediction, raw_prediction and probability output columns. treeLimit = 0
    * means "use all trees", matching the booster-side predict calls below.
    */
  def createBoosterClassifier(booster: Booster): Transformer = {
    XGBoostClassification(
      "xgboostSingleThread",
      NodeShape.probabilisticClassifier(
        rawPredictionCol = Some("raw_prediction"),
        probabilityCol = Some("probability")),
      XGBoostClassificationModel(XGBoostBinaryClassificationModel(booster, numFeatures, 0))
    )
  }

  /** Asserts, row by row over the test leap frame, that the MLeap transformer
    * reproduces the XGBoost4j booster's prediction, probability and raw
    * (margin) prediction for the same single-row input.
    */
  def equalityTestRowByRow(booster: Booster, mleapTransformer: Transformer) = {

    import XgbConverters._

    leapFrameLibSVMtest.dataset.foreach { r =>
      val mleapResult = mleapTransformer.transform(DefaultLeapFrame(mleapSchema.get, Seq(r))).get

      val mleapPredictionColIndex = mleapResult.schema.indexOf("prediction").get
      val mleapRawPredictionColIndex = mleapResult.schema.indexOf("raw_prediction").get
      val mleapProbabilityColIndex = mleapResult.schema.indexOf("probability").get

      // Column 1 of the row holds the features tensor; convert it to a DMatrix.
      val singleRowDMatrix = r(1).asInstanceOf[SparseTensor[Double]].asXGB

      // For binary:logistic the booster outputs P(class = 1); the two-class
      // probability vector is therefore (1 - p, p).
      val boosterResult = booster.predict(singleRowDMatrix, false, 0).head(0)
      val boosterProbability = Vectors.dense(1 - boosterResult, boosterResult).toDense
      val boosterPrediction = Math.round(boosterResult)

      assert(boosterPrediction == mleapResult.dataset.head.getDouble(mleapPredictionColIndex))

      assert(
        almostEqualSequences(
          Seq(boosterProbability.values),
          Seq(mleapResult.dataset.head.getTensor[Double](mleapProbabilityColIndex).toArray)
        )
      )

      // With outputMargin = true the booster returns the raw margin m; the
      // MLeap raw prediction for a binary model is the vector (-m, m).
      val boosterResultWithMargin = booster.predict(singleRowDMatrix, true, 0).head(0)
      val boosterRawPrediction = Vectors.dense(-boosterResultWithMargin, boosterResultWithMargin).toDense

      assert(almostEqualSequences(
        Seq(boosterRawPrediction.values),
        Seq(mleapResult.dataset.head.getTensor[Double](mleapRawPredictionColIndex).toArray)
      ))
    }
  }

  it("Results between the XGBoost4j booster and the MLeap Transformer are the same") {
    // Train ONE booster and use it on both sides of the comparison. The
    // previous version trained a second booster for the transformer, so the
    // parity check compared two distinct models and only passed by relying on
    // training determinism.
    val booster = trainBooster(xgboostParams, denseDataset)
    val xgboostTransformer = createBoosterClassifier(booster)

    val mleapBundle = serializeModelToMleapBundle(xgboostTransformer)
    val deserializedTransformer: Transformer = loadMleapTransformerFromBundle(mleapBundle)

    equalityTestRowByRow(booster, deserializedTransformer)
  }

  it("has the correct inputs and outputs with columns: prediction, probability and raw_prediction") {

    val booster = trainBooster(xgboostParams, denseDataset)
    val transformer = createBoosterClassifier(booster)

    assert(transformer.schema.fields ==
      Seq(StructField("features", TensorType(BasicType.Double, Seq(numFeatures))),
        StructField("raw_prediction", TensorType(BasicType.Double, Seq(2))),
        StructField("probability", TensorType(BasicType.Double, Seq(2))),
        StructField("prediction", ScalarType.Double.nonNullable)))
  }

  it("Results are the same pre and post serialization") {
    val booster = trainBooster(xgboostParams, denseDataset)
    val xgboostTransformer = createBoosterClassifier(booster)

    val preSerializationResult = xgboostTransformer.transform(leapFrameLibSVMtrain)

    val mleapBundle = serializeModelToMleapBundle(xgboostTransformer)

    val deserializedTransformer: Transformer = loadMleapTransformerFromBundle(mleapBundle)
    val deserializedModelResult = deserializedTransformer.transform(leapFrameLibSVMtrain).get

    assert(preSerializationResult.get.dataset == deserializedModelResult.dataset)
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
package ml.combust.mleap.xgboost.runtime

import ml.combust.mleap.core.types._
import ml.combust.mleap.runtime.frame.{DefaultLeapFrame, Transformer}
import ml.combust.mleap.tensor.SparseTensor
import ml.combust.mleap.xgboost.runtime.testing.{BoosterUtils, BundleSerializationUtils, CachedDatasetUtils, FloatingPointApproximations}
import ml.dmlc.xgboost4j.scala.Booster
import org.scalatest.FunSpec


class XGBoostRegressionModelParitySpec extends FunSpec
  with BoosterUtils
  with CachedDatasetUtils
  with BundleSerializationUtils
  with FloatingPointApproximations {

  /** Wraps a trained booster in an MLeap regression transformer.
    * treeLimit = 0 means "use all trees", matching the booster-side predicts.
    */
  def createBoosterRegressor(booster: Booster): Transformer = {
    XGBoostRegression(
      "xgboostSingleThread",
      NodeShape.regression(),
      XGBoostRegressionModel(booster, numFeatures, 0)
    )
  }

  /** Asserts, row by row over the test leap frame, that the MLeap transformer
    * reproduces the XGBoost4j booster's prediction for the same input row.
    */
  def equalityTestRowByRow(booster: Booster, mleapTransformer: Transformer) = {

    import XgbConverters._

    leapFrameLibSVMtest.dataset.foreach { r =>
      val mleapResult = mleapTransformer.transform(DefaultLeapFrame(mleapSchema.get, Seq(r))).get
      val mleapPredictionColIndex = mleapResult.schema.indexOf("prediction").get

      // Column 1 of the row holds the features tensor; convert it to a DMatrix.
      val singleRowDMatrix = r(1).asInstanceOf[SparseTensor[Double]].asXGB
      val boosterResult = booster.predict(singleRowDMatrix, false, 0).head(0)

      assert(boosterResult == mleapResult.dataset.head.getDouble(mleapPredictionColIndex))
    }
  }

  it("Results between the XGBoost4j booster and the MLeap Transformer are the same") {
    // Train ONE booster and use it on both sides of the comparison. The
    // previous version trained a second booster for the transformer, so the
    // parity check compared two distinct models and only passed by relying on
    // training determinism.
    val booster = trainBooster(xgboostParams, denseDataset)
    val xgboostTransformer = createBoosterRegressor(booster)

    val mleapBundle = serializeModelToMleapBundle(xgboostTransformer)
    val deserializedTransformer: Transformer = loadMleapTransformerFromBundle(mleapBundle)

    equalityTestRowByRow(booster, deserializedTransformer)
  }

  // Test name previously claimed "probability and raw_prediction" columns,
  // which a regression transformer does not produce; the assertion below only
  // ever checked features + prediction.
  it("has the correct inputs and outputs with columns: prediction") {

    val booster = trainBooster(xgboostParams, denseDataset)
    val transformer = createBoosterRegressor(booster)

    assert(transformer.schema.fields ==
      Seq(StructField("features", TensorType(BasicType.Double, Seq(numFeatures))),
        StructField("prediction", ScalarType.Double.nonNullable)))
  }

  it("Results are the same pre and post serialization") {
    val booster = trainBooster(xgboostParams, denseDataset)
    val xgboostTransformer = createBoosterRegressor(booster)

    val preSerializationResult = xgboostTransformer.transform(leapFrameLibSVMtrain)

    val mleapBundle = serializeModelToMleapBundle(xgboostTransformer)

    val deserializedTransformer: Transformer = loadMleapTransformerFromBundle(mleapBundle)
    val deserializedModelResult = deserializedTransformer.transform(leapFrameLibSVMtrain).get

    assert(preSerializationResult.get.dataset == deserializedModelResult.dataset)
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
package ml.combust.mleap.xgboost.runtime.testing

import ml.dmlc.xgboost4j.scala.{Booster, DMatrix, XGBoost}

trait BoosterUtils {

  /** Training parameters shared by the XGBoost parity specs.
    *
    * NOTE(review): XGBoost's native multiclass parameter is spelled
    * "num_class"; "num_classes" is presumably ignored by the library (and is
    * unnecessary for binary:logistic anyway) — confirm. "num_round" is not a
    * native booster parameter either: it is read back out of this map by
    * trainBooster below to set the number of boosting iterations.
    */
  final val xgboostParams: Map[String, Any] = Map(
    "eta" -> 0.3,
    "max_depth" -> 2,
    "objective" -> "binary:logistic",
    "num_round" -> 15,
    "num_classes" -> 2
  )

  /** Trains a booster on `dataset`, running the number of boosting rounds
    * given by the "num_round" entry of `xgboostParams` (throws if absent).
    */
  def trainBooster(xgboostParams: Map[String, Any], dataset: DMatrix): Booster = {
    val rounds = xgboostParams("num_round").asInstanceOf[Int]
    XGBoost.train(dataset, xgboostParams, rounds)
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
package ml.combust.mleap.xgboost.runtime.testing

import java.io.File

import ml.combust.bundle.BundleFile
import ml.combust.bundle.serializer.SerializationFormat
import ml.combust.mleap.runtime.{MleapContext, frame}
import ml.combust.mleap.runtime.frame.Transformer
import resource.managed

trait BundleSerializationUtils {

  /** Serializes an MLeap transformer to a JSON-format bundle zip under
    * /tmp/mleap/spark-parity and returns the bundle file. Any previous bundle
    * written for this concrete class is deleted first.
    */
  def serializeModelToMleapBundle(transformer: Transformer): File = {
    import java.nio.file.Files
    import ml.combust.mleap.runtime.MleapSupport._

    // createDirectories is a no-op when the path already exists and fails
    // loudly when creation is impossible, unlike the previously ignored
    // File.mkdirs() boolean result.
    Files.createDirectories(new File("/tmp/mleap/spark-parity").toPath)
    // getClass.getName keeps bundles from different spec classes separate.
    val file = new File(s"/tmp/mleap/spark-parity/${this.getClass.getName}.zip")
    file.delete()

    for (bf <- managed(BundleFile(file))) {
      // .get: fail the test immediately if serialization did not succeed.
      transformer.writeBundle.format(SerializationFormat.Json).save(bf).get
    }
    file
  }

  /** Loads an MLeap transformer back from a serialized bundle file,
    * propagating any deserialization failure via .get.
    */
  def loadMleapTransformerFromBundle(bundleFile: File)
                                    (implicit context: MleapContext): frame.Transformer = {
    import ml.combust.mleap.runtime.MleapSupport._

    (for (bf <- managed(BundleFile(bundleFile))) yield {
      bf.loadMleapBundle().get.root
    }).tried.get
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
package ml.combust.mleap.xgboost.runtime.testing

import ml.combust.mleap.core.types.{StructType, TensorType}
import ml.combust.mleap.core.util.VectorConverters
import ml.combust.mleap.runtime.frame.{ArrayRow, DefaultLeapFrame}
import ml.dmlc.xgboost4j.scala.DMatrix
import org.apache.spark.ml.linalg.SparseVector
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.mleap.TypeConverters

// Test fixture providing the agaricus train/test datasets both as native
// XGBoost DMatrix and as MLeap DefaultLeapFrame (loaded via Spark's libsvm
// reader), cached lazily so each spec pays the loading cost at most once.
trait CachedDatasetUtils {

private final val TrainDataFilePath = "datasources/agaricus.train"
private final val TestDataFilePath = "datasources/agaricus.test"
// Populated as a side effect of leapFrameFromLibSVMFile below; callers must
// force one of the lazy leap frames before reading it (None until then).
var mleapSchema: Option[StructType] = None

// Native XGBoost view of the training data, parsed directly from the libsvm file.
lazy val denseDataset: DMatrix =
new DMatrix(this.getClass.getClassLoader.getResource(TrainDataFilePath).getFile)

lazy val leapFrameLibSVMtrain: DefaultLeapFrame = leapFrameFromLibSVMFile(TrainDataFilePath)
lazy val leapFrameLibSVMtest: DefaultLeapFrame = leapFrameFromLibSVMFile(TestDataFilePath)
// Feature count derived from the "features" tensor column of the train frame.
lazy val numFeatures: Int =
leapFrameLibSVMtrain.schema.getField("features").get.dataType.asInstanceOf[TensorType].dimensions.get.head

// Loads a libsvm file into a DefaultLeapFrame and, as a side effect, records
// its MLeap schema in mleapSchema.
private def leapFrameFromLibSVMFile(filePath: String): DefaultLeapFrame = {

// Use Spark utils to load libsvm from disk
val spark = SparkSession.builder()
.master("local[2]")
.appName("XGBoostRuntimeClassificationModelParitySpec")
.getOrCreate()

// This is the dataset used by dmlc-XGBoost https://github.com/dmlc/xgboost/blob/master/demo/data/agaricus.txt.train
val dataFrame = spark.read.format("libsvm")
.load(this.getClass.getClassLoader.getResource(filePath).getFile)

mleapSchema = Option(TypeConverters.sparkSchemaToMleapSchema(dataFrame))

// Row layout: column 0 = label, column 1 = sparse feature vector converted
// to an MLeap tensor.
val mleapMatrix: Array[ArrayRow] = dataFrame.collect().map {
r => ArrayRow(
Seq(
r.get(0),
VectorConverters.sparkVectorToMleapTensor(r.get(1).asInstanceOf[SparseVector])
))
}

DefaultLeapFrame(mleapSchema.get, mleapMatrix)
}

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
package ml.combust.mleap.xgboost.runtime.testing


trait FloatingPointApproximations {

  /** Absolute tolerance used when no explicit precision is supplied. */
  val DefaultMinimumPrecision = 1e-7

  /** True when x and y differ by strictly less than `precision` (absolute
    * difference). Note this is not a relative/ULP comparison.
    */
  def almostEqual(x: Double, y: Double, precision: Double = DefaultMinimumPrecision): Boolean =
    (x - y).abs < precision

  /** Element-wise approximate equality of two sequences of double arrays.
    * Mirrors `zip` semantics (as the original did): extra elements in the
    * longer sequence or array are ignored, so inputs of different lengths can
    * still compare equal.
    */
  def almostEqualSequences(
    sequenceA: Seq[Array[Double]],
    SequenceB: Seq[Array[Double]],
    precision: Double = DefaultMinimumPrecision
  ): Boolean =
    // forall replaces the previous nonlocal `return` inside nested loops.
    (sequenceA zip SequenceB).forall { case (arrayA, arrayB) =>
      (arrayA zip arrayB).forall { case (a, b) => almostEqual(a, b, precision) }
    }
}
3 changes: 2 additions & 1 deletion project/Dependencies.scala
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ object Dependencies {
val akkaStreamTestKit = "com.typesafe.akka" %% "akka-stream-testkit" % akkaVersion % "test"
val junit = "junit" % "junit" % "4.12" % "test"
val junitInterface = "com.novocode" % "junit-interface" % "0.10" % "test"
val spark = Compile.spark.map(_ % "test")
}

object Provided {
Expand Down Expand Up @@ -114,7 +115,7 @@ object Dependencies {

val tensorflow = l ++= tensorflowDeps ++ Seq(Test.scalaTest)

val xgboostRuntime = l ++= Seq(xgboostDep) ++ Seq(Test.scalaTest)
val xgboostRuntime = l ++= Seq(xgboostDep) ++ Test.spark ++ Seq(Test.scalaTest)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

are you sure Provided.spark isn't enough? it seems like that for the other parity tests?

Copy link
Member Author

@lucagiovagnoli lucagiovagnoli Feb 12, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So I actually meant to get your input on these dependency changes.
Both Provided.spark or Test.spark achieve the same thing here because I am using spark things only in tests.

I think that putting those dependencies with a test scope makes the build faster, since the dependencies are only brought in when running tests, not when building. I read online:

  • test:

Dependencies with maven dependency scope test are not needed to build and run the project. They are needed to compile and run the unit tests.

  • provided:

Maven dependency scope provided is used during build and test the project.

so I assume a test scope is slightly lighter than provided? Again, the xgboost-runtime code has no dependencies on spark stuff, but its tests do.

Another option would be to avoid spark things in tests but I found spark has the best libsvm reader that lets me do
val dataFrame = spark.read.format("libsvm").load(this.getClass.getClassLoader.getResource(filePath).getFile)

and I can use the Dataframe -> to mleapFrame utils.

Let me know what you think!

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That makes sense, thank you!


val xgboostSpark = l ++= Seq(xgboostSparkDep) ++ Provided.spark

Expand Down
4 changes: 3 additions & 1 deletion project/MleapProject.scala
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,9 @@ object MleapProject {
lazy val xgboostRuntime = Project(
id = "mleap-xgboost-runtime",
base = file("mleap-xgboost-runtime"),
dependencies = Seq(runtime)
dependencies = Seq(
runtime,
sparkTestkit % "test")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

where do we use sparkTestkit?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If I don't bring that in, I cannot use:
import org.apache.spark.sql.mleap.TypeConverters

Do you think it is ok to use that in tests? (%"test" should not be included in the build)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think that comes from mleap-spark-base, but spark-testkit should be ok too.

)

lazy val xgboostSpark = Project(
Expand Down