Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Luca/fix raw predict issue 635 #637

Merged
merged 8 commits
Feb 24, 2020
2 changes: 1 addition & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ dist: trusty
services:
- docker

language: scala
scala:
- 2.11.8
jdk:
Expand All @@ -14,7 +15,6 @@ jobs:
include:
- stage: "Mleap tests"
name: "Scala Tests"
language: scala
script:
- travis/travis.sh

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ case class XGBoostBinaryClassificationModel(override val booster: Booster,

def predictRaw(data: DMatrix): Vector = {
val m = booster.predict(data, outPutMargin = true, treeLimit = treeLimit).head(0)
Vectors.dense(1 - m, m)
Vectors.dense(- m, m)
}

override def rawToProbabilityInPlace(raw: Vector): Vector = {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,10 @@ class XGBoostClassificationOp extends MleapOp[XGBoostClassification, XGBoostClas
(implicit context: BundleContext[MleapContext]): Model = {
val out = Files.newOutputStream(context.file("xgboost.model"))
obj.booster.saveModel(out)
model.withValue("num_features", Value.int(obj.numFeatures)).
withValue("num_classes", Value.int(obj.numClasses))
model
.withValue("num_features", Value.int(obj.numFeatures))
.withValue("num_classes", Value.int(obj.numClasses))
.withValue("tree_limit", Value.int(obj.treeLimit))
}

override def load(model: Model)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,10 @@ class XGBoostRegressionOp extends MleapOp[XGBoostRegression, XGBoostRegressionMo
(implicit context: BundleContext[MleapContext]): Model = {
val out = Files.newOutputStream(context.file("xgboost.model"))
obj.booster.saveModel(out)
model.withValue("num_features", Value.int(obj.numFeatures))

model
.withValue("num_features", Value.int(obj.numFeatures))
.withValue("tree_limit", Value.int(obj.treeLimit))
}

override def load(model: Model)
Expand Down
1,611 changes: 1,611 additions & 0 deletions mleap-xgboost-runtime/src/test/resources/datasources/agaricus.test

Large diffs are not rendered by default.

6,513 changes: 6,513 additions & 0 deletions mleap-xgboost-runtime/src/test/resources/datasources/agaricus.train

Large diffs are not rendered by default.

150 changes: 150 additions & 0 deletions mleap-xgboost-runtime/src/test/resources/datasources/iris.scale.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
0 1:-0.555556 2:0.25 3:-0.864407 4:-0.916667
0 1:-0.666667 2:-0.166667 3:-0.864407 4:-0.916667
0 1:-0.777778 3:-0.898305 4:-0.916667
0 1:-0.833333 2:-0.0833334 3:-0.830508 4:-0.916667
0 1:-0.611111 2:0.333333 3:-0.864407 4:-0.916667
0 1:-0.388889 2:0.583333 3:-0.762712 4:-0.75
0 1:-0.833333 2:0.166667 3:-0.864407 4:-0.833333
0 1:-0.611111 2:0.166667 3:-0.830508 4:-0.916667
0 1:-0.944444 2:-0.25 3:-0.864407 4:-0.916667
0 1:-0.666667 2:-0.0833334 3:-0.830508 4:-1
0 1:-0.388889 2:0.416667 3:-0.830508 4:-0.916667
0 1:-0.722222 2:0.166667 3:-0.79661 4:-0.916667
0 1:-0.722222 2:-0.166667 3:-0.864407 4:-1
0 1:-1 2:-0.166667 3:-0.966102 4:-1
0 1:-0.166667 2:0.666667 3:-0.932203 4:-0.916667
0 1:-0.222222 2:1 3:-0.830508 4:-0.75
0 1:-0.388889 2:0.583333 3:-0.898305 4:-0.75
0 1:-0.555556 2:0.25 3:-0.864407 4:-0.833333
0 1:-0.222222 2:0.5 3:-0.762712 4:-0.833333
0 1:-0.555556 2:0.5 3:-0.830508 4:-0.833333
0 1:-0.388889 2:0.166667 3:-0.762712 4:-0.916667
0 1:-0.555556 2:0.416667 3:-0.830508 4:-0.75
0 1:-0.833333 2:0.333333 3:-1 4:-0.916667
0 1:-0.555556 2:0.0833333 3:-0.762712 4:-0.666667
0 1:-0.722222 2:0.166667 3:-0.694915 4:-0.916667
0 1:-0.611111 2:-0.166667 3:-0.79661 4:-0.916667
0 1:-0.611111 2:0.166667 3:-0.79661 4:-0.75
0 1:-0.5 2:0.25 3:-0.830508 4:-0.916667
0 1:-0.5 2:0.166667 3:-0.864407 4:-0.916667
0 1:-0.777778 3:-0.79661 4:-0.916667
0 1:-0.722222 2:-0.0833334 3:-0.79661 4:-0.916667
0 1:-0.388889 2:0.166667 3:-0.830508 4:-0.75
0 1:-0.5 2:0.75 3:-0.830508 4:-1
0 1:-0.333333 2:0.833333 3:-0.864407 4:-0.916667
0 1:-0.666667 2:-0.0833334 3:-0.830508 4:-1
0 1:-0.611111 3:-0.932203 4:-0.916667
0 1:-0.333333 2:0.25 3:-0.898305 4:-0.916667
0 1:-0.666667 2:-0.0833334 3:-0.830508 4:-1
0 1:-0.944444 2:-0.166667 3:-0.898305 4:-0.916667
0 1:-0.555556 2:0.166667 3:-0.830508 4:-0.916667
0 1:-0.611111 2:0.25 3:-0.898305 4:-0.833333
0 1:-0.888889 2:-0.75 3:-0.898305 4:-0.833333
0 1:-0.944444 3:-0.898305 4:-0.916667
0 1:-0.611111 2:0.25 3:-0.79661 4:-0.583333
0 1:-0.555556 2:0.5 3:-0.694915 4:-0.75
0 1:-0.722222 2:-0.166667 3:-0.864407 4:-0.833333
0 1:-0.555556 2:0.5 3:-0.79661 4:-0.916667
0 1:-0.833333 3:-0.864407 4:-0.916667
0 1:-0.444444 2:0.416667 3:-0.830508 4:-0.916667
0 1:-0.611111 2:0.0833333 3:-0.864407 4:-0.916667
1 1:0.5 3:0.254237 4:0.0833333
1 1:0.166667 3:0.186441 4:0.166667
1 1:0.444444 2:-0.0833334 3:0.322034 4:0.166667
1 1:-0.333333 2:-0.75 3:0.0169491 4:-4.03573e-08
1 1:0.222222 2:-0.333333 3:0.220339 4:0.166667
1 1:-0.222222 2:-0.333333 3:0.186441 4:-4.03573e-08
1 1:0.111111 2:0.0833333 3:0.254237 4:0.25
1 1:-0.666667 2:-0.666667 3:-0.220339 4:-0.25
1 1:0.277778 2:-0.25 3:0.220339 4:-4.03573e-08
1 1:-0.5 2:-0.416667 3:-0.0169491 4:0.0833333
1 1:-0.611111 2:-1 3:-0.152542 4:-0.25
1 1:-0.111111 2:-0.166667 3:0.0847457 4:0.166667
1 1:-0.0555556 2:-0.833333 3:0.0169491 4:-0.25
1 1:-1.32455e-07 2:-0.25 3:0.254237 4:0.0833333
1 1:-0.277778 2:-0.25 3:-0.118644 4:-4.03573e-08
1 1:0.333333 2:-0.0833334 3:0.152542 4:0.0833333
1 1:-0.277778 2:-0.166667 3:0.186441 4:0.166667
1 1:-0.166667 2:-0.416667 3:0.0508474 4:-0.25
1 1:0.0555554 2:-0.833333 3:0.186441 4:0.166667
1 1:-0.277778 2:-0.583333 3:-0.0169491 4:-0.166667
1 1:-0.111111 3:0.288136 4:0.416667
1 1:-1.32455e-07 2:-0.333333 3:0.0169491 4:-4.03573e-08
1 1:0.111111 2:-0.583333 3:0.322034 4:0.166667
1 1:-1.32455e-07 2:-0.333333 3:0.254237 4:-0.0833333
1 1:0.166667 2:-0.25 3:0.118644 4:-4.03573e-08
1 1:0.277778 2:-0.166667 3:0.152542 4:0.0833333
1 1:0.388889 2:-0.333333 3:0.288136 4:0.0833333
1 1:0.333333 2:-0.166667 3:0.355932 4:0.333333
1 1:-0.0555556 2:-0.25 3:0.186441 4:0.166667
1 1:-0.222222 2:-0.5 3:-0.152542 4:-0.25
1 1:-0.333333 2:-0.666667 3:-0.0508475 4:-0.166667
1 1:-0.333333 2:-0.666667 3:-0.0847458 4:-0.25
1 1:-0.166667 2:-0.416667 3:-0.0169491 4:-0.0833333
1 1:-0.0555556 2:-0.416667 3:0.38983 4:0.25
1 1:-0.388889 2:-0.166667 3:0.186441 4:0.166667
1 1:-0.0555556 2:0.166667 3:0.186441 4:0.25
1 1:0.333333 2:-0.0833334 3:0.254237 4:0.166667
1 1:0.111111 2:-0.75 3:0.152542 4:-4.03573e-08
1 1:-0.277778 2:-0.166667 3:0.0508474 4:-4.03573e-08
1 1:-0.333333 2:-0.583333 3:0.0169491 4:-4.03573e-08
1 1:-0.333333 2:-0.5 3:0.152542 4:-0.0833333
1 1:-1.32455e-07 2:-0.166667 3:0.220339 4:0.0833333
1 1:-0.166667 2:-0.5 3:0.0169491 4:-0.0833333
1 1:-0.611111 2:-0.75 3:-0.220339 4:-0.25
1 1:-0.277778 2:-0.416667 3:0.0847457 4:-4.03573e-08
1 1:-0.222222 2:-0.166667 3:0.0847457 4:-0.0833333
1 1:-0.222222 2:-0.25 3:0.0847457 4:-4.03573e-08
1 1:0.0555554 2:-0.25 3:0.118644 4:-4.03573e-08
1 1:-0.555556 2:-0.583333 3:-0.322034 4:-0.166667
1 1:-0.222222 2:-0.333333 3:0.0508474 4:-4.03573e-08
2 1:0.111111 2:0.0833333 3:0.694915 4:1
2 1:-0.166667 2:-0.416667 3:0.38983 4:0.5
2 1:0.555555 2:-0.166667 3:0.661017 4:0.666667
2 1:0.111111 2:-0.25 3:0.559322 4:0.416667
2 1:0.222222 2:-0.166667 3:0.627119 4:0.75
2 1:0.833333 2:-0.166667 3:0.898305 4:0.666667
2 1:-0.666667 2:-0.583333 3:0.186441 4:0.333333
2 1:0.666667 2:-0.25 3:0.79661 4:0.416667
2 1:0.333333 2:-0.583333 3:0.627119 4:0.416667
2 1:0.611111 2:0.333333 3:0.728813 4:1
2 1:0.222222 3:0.38983 4:0.583333
2 1:0.166667 2:-0.416667 3:0.457627 4:0.5
2 1:0.388889 2:-0.166667 3:0.525424 4:0.666667
2 1:-0.222222 2:-0.583333 3:0.355932 4:0.583333
2 1:-0.166667 2:-0.333333 3:0.38983 4:0.916667
2 1:0.166667 3:0.457627 4:0.833333
2 1:0.222222 2:-0.166667 3:0.525424 4:0.416667
2 1:0.888889 2:0.5 3:0.932203 4:0.75
2 1:0.888889 2:-0.5 3:1 4:0.833333
2 1:-0.0555556 2:-0.833333 3:0.355932 4:0.166667
2 1:0.444444 3:0.59322 4:0.833333
2 1:-0.277778 2:-0.333333 3:0.322034 4:0.583333
2 1:0.888889 2:-0.333333 3:0.932203 4:0.583333
2 1:0.111111 2:-0.416667 3:0.322034 4:0.416667
2 1:0.333333 2:0.0833333 3:0.59322 4:0.666667
2 1:0.611111 3:0.694915 4:0.416667
2 1:0.0555554 2:-0.333333 3:0.288136 4:0.416667
2 1:-1.32455e-07 2:-0.166667 3:0.322034 4:0.416667
2 1:0.166667 2:-0.333333 3:0.559322 4:0.666667
2 1:0.611111 2:-0.166667 3:0.627119 4:0.25
2 1:0.722222 2:-0.333333 3:0.728813 4:0.5
2 1:1 2:0.5 3:0.830508 4:0.583333
2 1:0.166667 2:-0.333333 3:0.559322 4:0.75
2 1:0.111111 2:-0.333333 3:0.38983 4:0.166667
2 1:-1.32455e-07 2:-0.5 3:0.559322 4:0.0833333
2 1:0.888889 2:-0.166667 3:0.728813 4:0.833333
2 1:0.111111 2:0.166667 3:0.559322 4:0.916667
2 1:0.166667 2:-0.0833334 3:0.525424 4:0.416667
2 1:-0.0555556 2:-0.166667 3:0.288136 4:0.416667
2 1:0.444444 2:-0.0833334 3:0.491525 4:0.666667
2 1:0.333333 2:-0.0833334 3:0.559322 4:0.916667
2 1:0.444444 2:-0.0833334 3:0.38983 4:0.833333
2 1:-0.166667 2:-0.416667 3:0.38983 4:0.5
2 1:0.388889 3:0.661017 4:0.833333
2 1:0.333333 2:0.0833333 3:0.59322 4:1
2 1:0.333333 2:-0.166667 3:0.423729 4:0.833333
2 1:0.111111 2:-0.583333 3:0.355932 4:0.5
2 1:0.222222 2:-0.166667 3:0.423729 4:0.583333
2 1:0.0555554 2:0.166667 3:0.491525 4:0.833333
2 1:-0.111111 2:-0.166667 3:0.38983 4:0.416667
Original file line number Diff line number Diff line change
@@ -0,0 +1,193 @@
package ml.combust.mleap.xgboost.runtime

import ml.combust.mleap.core.types.{BasicType, NodeShape, ScalarType, StructField, TensorType}
import ml.combust.mleap.runtime.frame.{DefaultLeapFrame, Transformer}
import ml.combust.mleap.tensor.SparseTensor
import ml.combust.mleap.xgboost.runtime.testing.{BoosterUtils, BundleSerializationUtils, CachedDatasetUtils, FloatingPointApproximations}
import ml.dmlc.xgboost4j.scala.Booster
import org.apache.spark.ml.linalg.Vectors
import org.scalatest.FunSpec
import XgbConverters._


/**
  * Parity tests between a raw XGBoost4j [[Booster]] and the MLeap XGBoost
  * classification transformers (binary and multinomial).
  *
  * For each row of a leap frame, the booster is queried directly (with and
  * without the output margin) and the results are compared against the
  * prediction / probability / raw_prediction columns produced by the MLeap
  * transformer. Serialization round-trips are also checked for both models.
  */
class XGBoostClassificationModelParitySpec extends FunSpec
  with BoosterUtils
  with CachedDatasetUtils
  with BundleSerializationUtils
  with FloatingPointApproximations {

  /** Builds a binary-classification MLeap transformer around a freshly trained booster. */
  def createBoosterClassifier: Transformer = {
    val trainedBooster: Booster = trainBooster(binomialDataset)
    val shape = NodeShape.probabilisticClassifier(
      rawPredictionCol = Some("raw_prediction"),
      probabilityCol = Some("probability"))

    XGBoostClassification(
      "xgboostSingleThread",
      shape,
      XGBoostClassificationModel(
        XGBoostBinaryClassificationModel(trainedBooster, numFeatures(leapFrameLibSVMtrain), 0))
    )
  }

  /** Builds a multinomial-classification MLeap transformer around a freshly trained booster. */
  def createMultinomialBoosterClassifier: Transformer = {
    val trainedBooster: Booster = trainMultinomialBooster(multinomialDataset)
    val shape = NodeShape.probabilisticClassifier(
      rawPredictionCol = Some("raw_prediction"),
      probabilityCol = Some("probability"))
    val numClasses = xgboostMultinomialParams("num_class").asInstanceOf[Int]

    XGBoostClassification(
      "xgboostSingleThread",
      shape,
      XGBoostClassificationModel(
        XGBoostMultinomialClassificationModel(
          trainedBooster, numClasses, numFeatures(leapFrameIrisTrain), 0))
    )
  }

  /**
    * Asserts, row by row, that the MLeap transformer and the raw booster agree on
    * prediction, probability, and raw prediction for a binary model.
    */
  def equalityTestRowByRow(
    booster: Booster,
    mleapTransformer: Transformer,
    leapFrameDataset: DefaultLeapFrame) = {

    val featuresIdx = leapFrameDataset.schema.indexOf("features").get

    for (row <- leapFrameDataset.dataset) {
      // Run the MLeap transformer on a single-row frame.
      val singleRowFrame = DefaultLeapFrame(leapFrameDataset.schema, Seq(row))
      val mleapResult = mleapTransformer.transform(singleRowFrame).get
      val mleapRow = mleapResult.dataset.head

      val predictionIdx = mleapResult.schema.indexOf("prediction").get
      val rawPredictionIdx = mleapResult.schema.indexOf("raw_prediction").get
      val probabilityIdx = mleapResult.schema.indexOf("probability").get

      // Query the booster directly on the same row.
      val singleRowDMatrix = row(featuresIdx).asInstanceOf[SparseTensor[Double]].asXGB
      val probabilityOfOne = booster.predict(singleRowDMatrix, false, 0).head(0)

      // Binary booster emits P(class = 1); expand into [P(0), P(1)].
      val expectedProbability = Vectors.dense(1 - probabilityOfOne, probabilityOfOne).toDense
      val expectedPrediction = Math.round(probabilityOfOne)

      assert(expectedPrediction == mleapRow.getDouble(predictionIdx))

      assert(
        almostEqualSequences(
          Seq(expectedProbability.values),
          Seq(mleapRow.getTensor[Double](probabilityIdx).toArray)
        )
      )

      // Raw prediction is the margin, mirrored as [-margin, margin].
      val margin = booster.predict(singleRowDMatrix, true, 0).head(0)
      val expectedRawPrediction = Vectors.dense(-margin, margin).toDense

      assert(
        almostEqualSequences(
          Seq(expectedRawPrediction.values),
          Seq(mleapRow.getTensor[Double](rawPredictionIdx).toArray)
        )
      )
    }
  }

  /**
    * Asserts, row by row, that the MLeap transformer and the raw booster agree on
    * prediction, probability, and raw prediction for a multinomial model.
    */
  def equalityTestRowByRowMultinomial(
    booster: Booster,
    mleapTransformer: Transformer,
    leapFrameDataset: DefaultLeapFrame) = {

    val featuresIdx = leapFrameDataset.schema.indexOf("features").get

    for (row <- leapFrameDataset.dataset) {
      val singleRowFrame = DefaultLeapFrame(leapFrameDataset.schema, Seq(row))
      val mleapResult = mleapTransformer.transform(singleRowFrame).get
      val mleapRow = mleapResult.dataset.head

      val predictionIdx = mleapResult.schema.indexOf("prediction").get
      val rawPredictionIdx = mleapResult.schema.indexOf("raw_prediction").get
      val probabilityIdx = mleapResult.schema.indexOf("probability").get

      val singleRowDMatrix = row(featuresIdx).asInstanceOf[SparseTensor[Double]].asXGB

      // Multinomial booster emits one probability per class; prediction is the argmax.
      val classProbabilities = booster.predict(singleRowDMatrix, false, 0).head
      val expectedProbability = Vectors.dense(classProbabilities.map(_.toDouble)).toDense
      val expectedPrediction = expectedProbability.argmax

      assert(expectedPrediction == mleapRow.getDouble(predictionIdx))

      assert(
        almostEqualSequences(
          Seq(expectedProbability.values),
          Seq(mleapRow.getTensor[Double](probabilityIdx).toArray)
        )
      )

      // Raw prediction is the per-class margin vector.
      val classMargins = booster.predict(singleRowDMatrix, true, 0).head
      val expectedRawPrediction = Vectors.dense(classMargins.map(_.toDouble)).toDense

      assert(
        almostEqualSequences(
          Seq(expectedRawPrediction.values),
          Seq(mleapRow.getTensor[Double](rawPredictionIdx).toArray)
        )
      )
    }
  }

  it("Results between the XGBoost4j booster and the MLeap Transformer are the same") {
    val booster = trainBooster(binomialDataset)
    val xgboostTransformer = createBoosterClassifier

    equalityTestRowByRow(booster, xgboostTransformer, leapFrameLibSVMtrain)
  }

  it("has the correct inputs and outputs with columns: prediction, probability and raw_prediction") {
    val transformer = createBoosterClassifier
    val numFeatures = transformer.asInstanceOf[XGBoostClassification].model.numFeatures

    val expectedFields = Seq(
      StructField("features", TensorType(BasicType.Double, Seq(numFeatures))),
      StructField("raw_prediction", TensorType(BasicType.Double, Seq(2))),
      StructField("probability", TensorType(BasicType.Double, Seq(2))),
      StructField("prediction", ScalarType.Double.nonNullable))

    assert(transformer.schema.fields == expectedFields)
  }

  it("Results are the same pre and post serialization") {
    val xgboostTransformer = createBoosterClassifier

    // Round-trip through an MLeap bundle and compare full transform outputs.
    val mleapBundle = serializeModelToMleapBundle(xgboostTransformer)
    val deserializedTransformer: Transformer = loadMleapTransformerFromBundle(mleapBundle)

    val preSerializationResult = xgboostTransformer.transform(leapFrameLibSVMtrain).get
    val deserializedModelResult = deserializedTransformer.transform(leapFrameLibSVMtrain).get

    assert(preSerializationResult.dataset == deserializedModelResult.dataset)
  }

  it("Results between the XGBoost4j multinomial booster and the MLeap XGBoostMultinomialClassificationModel are the same") {
    val multiBooster = trainMultinomialBooster(multinomialDataset)
    val xgboostTransformer = createMultinomialBoosterClassifier

    equalityTestRowByRowMultinomial(multiBooster, xgboostTransformer, leapFrameIrisTrain)
  }

  it("XGBoostMultinomialClassificationModel results are the same pre and post serialization") {
    val xgboostTransformer = createMultinomialBoosterClassifier

    val mleapBundle = serializeModelToMleapBundle(xgboostTransformer)
    val deserializedTransformer: Transformer = loadMleapTransformerFromBundle(mleapBundle)

    val preSerializationResult = xgboostTransformer.transform(leapFrameIrisTrain).get
    val deserializedResult = deserializedTransformer.transform(leapFrameIrisTrain).get

    assert(preSerializationResult.dataset == deserializedResult.dataset)
  }

  it("Results pre and post serialization are the same when using a dense dataset") {
    val xgboostTransformer = createBoosterClassifier

    val mleapBundle = serializeModelToMleapBundle(xgboostTransformer)
    val deserializedTransformer: Transformer = loadMleapTransformerFromBundle(mleapBundle)

    // Same comparison as the sparse case, but on densified features.
    val denseLeapFrame = toDenseFeaturesLeapFrame(leapFrameLibSVMtrain)

    val preSerializationResult = xgboostTransformer.transform(denseLeapFrame).get
    val deserializedResult = deserializedTransformer.transform(denseLeapFrame).get

    assert(preSerializationResult.dataset == deserializedResult.dataset)
  }
}
Loading