Skip to content

Commit

Permalink
Adding tests with dense dataset for xgboost-runtime
Browse files Browse the repository at this point in the history
  • Loading branch information
lucagiovagnoli committed Feb 15, 2020
1 parent e7877bd commit ae22247
Show file tree
Hide file tree
Showing 3 changed files with 66 additions and 23 deletions.
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
package ml.combust.mleap.xgboost.runtime

import ml.combust.mleap.core.types.{BasicType, NodeShape, ScalarType, StructField, StructType, TensorType}
import ml.combust.mleap.core.types.{BasicType, NodeShape, ScalarType, StructField, TensorType}
import ml.combust.mleap.runtime.frame.{DefaultLeapFrame, Transformer}
import ml.combust.mleap.tensor.SparseTensor
import ml.combust.mleap.xgboost.runtime.testing.{BoosterUtils, BundleSerializationUtils, CachedDatasetUtils, FloatingPointApproximations}
Expand All @@ -17,7 +17,7 @@ class XGBoostClassificationModelParitySpec extends FunSpec

def createBoosterClassifier: Transformer = {

val booster: Booster = trainBooster(denseDataset)
val booster: Booster = trainBooster(binomialDataset)

XGBoostClassification(
"xgboostSingleThread",
Expand All @@ -31,7 +31,7 @@ class XGBoostClassificationModelParitySpec extends FunSpec

def createMultinomialBoosterClassifier: Transformer ={

val booster: Booster = trainMultinomialBooster(denseMultinomialDataset)
val booster: Booster = trainMultinomialBooster(multinomialDataset)

XGBoostClassification(
"xgboostSingleThread",
Expand Down Expand Up @@ -132,10 +132,10 @@ class XGBoostClassificationModelParitySpec extends FunSpec
}

it("Results between the XGBoost4j booster and the MLeap Transformer are the same") {
val booster = trainBooster(denseDataset)
val booster = trainBooster(binomialDataset)
val xgboostTransformer = createBoosterClassifier

equalityTestRowByRow(booster, xgboostTransformer, leapFrameLibSVMtest)
equalityTestRowByRow(booster, xgboostTransformer, leapFrameLibSVMtrain)
}

it("has the correct inputs and outputs with columns: prediction, probability and raw_prediction") {
Expand All @@ -155,14 +155,14 @@ class XGBoostClassificationModelParitySpec extends FunSpec
val mleapBundle = serializeModelToMleapBundle(xgboostTransformer)
val deserializedTransformer: Transformer = loadMleapTransformerFromBundle(mleapBundle)

val preSerializationResult = xgboostTransformer.transform(leapFrameLibSVMtest)
val deserializedModelResult = deserializedTransformer.transform(leapFrameLibSVMtest).get
val preSerializationResult = xgboostTransformer.transform(leapFrameLibSVMtrain).get
val deserializedModelResult = deserializedTransformer.transform(leapFrameLibSVMtrain).get

assert(preSerializationResult.get.dataset == deserializedModelResult.dataset)
assert(preSerializationResult.dataset == deserializedModelResult.dataset)
}

it("Results between the XGBoost4j multinomial booster and the MLeap XGBoostMultinomialClassificationModel are the same") {
val multiBooster = trainMultinomialBooster(denseMultinomialDataset)
val multiBooster = trainMultinomialBooster(multinomialDataset)
val xgboostTransformer = createMultinomialBoosterClassifier

equalityTestRowByRowMultinomial(multiBooster, xgboostTransformer, leapFrameIrisTrain)
Expand All @@ -174,9 +174,23 @@ class XGBoostClassificationModelParitySpec extends FunSpec
val mleapBundle = serializeModelToMleapBundle(xgboostTransformer)
val deserializedTransformer: Transformer = loadMleapTransformerFromBundle(mleapBundle)

val preSerializationResult = xgboostTransformer.transform(leapFrameIrisTrain)
val deserializedModelResult = deserializedTransformer.transform(leapFrameIrisTrain).get
val preSerializationResult = xgboostTransformer.transform(leapFrameIrisTrain).get
val deserializedResult = deserializedTransformer.transform(leapFrameIrisTrain).get

assert(preSerializationResult.get.dataset == deserializedModelResult.dataset)
assert(preSerializationResult.dataset == deserializedResult.dataset)
}

it("Results pre and post serialization are the same when using a dense dataset") {
  // Build the classifier, then round-trip it through an MLeap bundle.
  val originalTransformer = createBoosterClassifier
  val roundTrippedTransformer: Transformer =
    loadMleapTransformerFromBundle(serializeModelToMleapBundle(originalTransformer))

  // Same rows as the LibSVM training frame, with the features column converted to dense tensors.
  val denseFrame = toDenseFeaturesLeapFrame(leapFrameLibSVMtrain)

  val expected = originalTransformer.transform(denseFrame).get
  val actual = roundTrippedTransformer.transform(denseFrame).get

  // Both transformers must produce identical datasets for the dense input.
  assert(expected.dataset == actual.dataset)
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ class XGBoostRegressionModelParitySpec extends FunSpec

def trainRegressor: Transformer ={

val booster: Booster = trainBooster(denseDataset)
val booster: Booster = trainBooster(binomialDataset)

XGBoostRegression(
"xgboostSingleThread",
Expand Down Expand Up @@ -45,13 +45,13 @@ class XGBoostRegressionModelParitySpec extends FunSpec
}

it("Results between the XGBoost4j booster and the MLeap Transformer are the same") {
val booster = trainBooster(denseDataset)
val booster = trainBooster(binomialDataset)
val xgboostTransformer = trainRegressor

val mleapBundle = serializeModelToMleapBundle(xgboostTransformer)
val deserializedTransformer: Transformer = loadMleapTransformerFromBundle(mleapBundle)

equalityTestRowByRow(booster, deserializedTransformer, leapFrameLibSVMtest)
equalityTestRowByRow(booster, deserializedTransformer, leapFrameLibSVMtrain)
}

it("has the correct inputs and outputs with columns: prediction, probability and raw_prediction") {
Expand All @@ -70,9 +70,23 @@ class XGBoostRegressionModelParitySpec extends FunSpec
val mleapBundle = serializeModelToMleapBundle(xgboostTransformer)
val deserializedTransformer: Transformer = loadMleapTransformerFromBundle(mleapBundle)

val preSerializationResult = xgboostTransformer.transform(leapFrameLibSVMtest)
val deserializedModelResult = deserializedTransformer.transform(leapFrameLibSVMtest).get
val preSerializationResult = xgboostTransformer.transform(leapFrameLibSVMtrain).get
val deserializedModelResult = deserializedTransformer.transform(leapFrameLibSVMtrain).get

assert(preSerializationResult.get.dataset == deserializedModelResult.dataset)
assert(preSerializationResult.dataset == deserializedModelResult.dataset)
}

it("Test results are the same when using a dense dataset") {
  // Train the regressor and obtain a second copy by serializing to, and loading from, a bundle.
  val trainedTransformer = trainRegressor
  val bundle = serializeModelToMleapBundle(trainedTransformer)
  val reloadedTransformer: Transformer = loadMleapTransformerFromBundle(bundle)

  // Feed both transformers the training frame with its features column made dense.
  val denseInput = toDenseFeaturesLeapFrame(leapFrameLibSVMtrain)
  val beforeSerialization = trainedTransformer.transform(denseInput).get
  val afterSerialization = reloadedTransformer.transform(denseInput).get

  assert(beforeSerialization.dataset == afterSerialization.dataset)
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ package ml.combust.mleap.xgboost.runtime.testing

import ml.combust.mleap.core.types.{StructType, TensorType}
import ml.combust.mleap.core.util.VectorConverters
import ml.combust.mleap.runtime.frame.{ArrayRow, DefaultLeapFrame}
import ml.combust.mleap.runtime.frame.{ArrayRow, DefaultLeapFrame, Row}
import ml.dmlc.xgboost4j.scala.DMatrix
import org.apache.spark.ml.linalg.SparseVector
import org.apache.spark.sql.SparkSession
Expand All @@ -12,17 +12,15 @@ trait CachedDatasetUtils {

private final val TrainDataFilePath = "datasources/agaricus.train"
private final val TrainDataMultinomialFilePath = "datasources/iris.scale.txt"
private final val TestDataFilePath = "datasources/agaricus.test"

val denseDataset: DMatrix =
val binomialDataset: DMatrix =
new DMatrix(this.getClass.getClassLoader.getResource(TrainDataFilePath).getFile)

val denseMultinomialDataset: DMatrix =
val multinomialDataset: DMatrix =
new DMatrix(this.getClass.getClassLoader.getResource(TrainDataMultinomialFilePath).getFile)

lazy val leapFrameLibSVMtrain: DefaultLeapFrame = leapFrameFromLibSVMFile(TrainDataFilePath)
lazy val leapFrameIrisTrain: DefaultLeapFrame = leapFrameFromLibSVMFile(TrainDataMultinomialFilePath)
lazy val leapFrameLibSVMtest: DefaultLeapFrame = leapFrameFromLibSVMFile(TestDataFilePath)

def numFeatures(dataset: DefaultLeapFrame): Int =
dataset.schema.getField("features").get.dataType.asInstanceOf[TensorType].dimensions.get.head
Expand Down Expand Up @@ -51,4 +49,21 @@ trait CachedDatasetUtils {

DefaultLeapFrame(mleapSchema.get, mleapMatrix)
}

/** Builds a copy of a LeapFrame whose "features" column holds dense tensors.
  *
  * Every row is copied value-for-value; only the entry at the "features" index is replaced by
  * the result of calling `toDense` on it. The input frame's schema is reused unchanged.
  *
  * Columns other than "features" are carried over untouched, so the frame may contain any
  * number of columns in any order (the row width is not assumed to be exactly two).
  *
  * @param sparseLeapFrame frame with a "features" column readable as a tensor of doubles
  * @return a new frame with the same schema and densified features
  */
def toDenseFeaturesLeapFrame(sparseLeapFrame: DefaultLeapFrame): DefaultLeapFrame = {
  // `.get` fails fast here when the schema has no "features" column.
  val featureColumnIndex = sparseLeapFrame.schema.indexOf("features").get

  val denseDataset: Seq[Row] = sparseLeapFrame.dataset.map { row =>
    val denseFeatures = row.getTensor[Double](featureColumnIndex).toDense
    // `updated` builds a new sequence, so the source row's values are left unmodified.
    ArrayRow(row.toSeq.updated(featureColumnIndex, denseFeatures))
  }

  DefaultLeapFrame(sparseLeapFrame.schema, denseDataset)
}
}

0 comments on commit ae22247

Please sign in to comment.