Commit 218850b
avoid hardcoded default evaluators but instead set them as defaults; backward compatible by prepending these defaults if necessary
nicodv committed Jul 23, 2020
1 parent e42214d commit 218850b
Showing 7 changed files with 73 additions and 32 deletions.
BinaryClassificationModelSelector.scala
@@ -148,8 +148,8 @@ case object BinaryClassificationModelSelector extends ModelSelectorFactory {
    * @param splitter instance that will balance and split the data
    * @param numFolds number of folds for cross validation (>= 2)
    * @param validationMetric metric name in evaluation: AuROC or AuPR
-   * @param trainTestEvaluators List of evaluators applied on training + holdout data for evaluation. Default is empty
-   *        and default evaluator is added to this list (here Evaluators.BinaryClassification)
+   * @param trainTestEvaluators List of evaluators applied on training + holdout data for evaluation. Default is
+   *        the standard OpBinaryClassificationEvaluator and OpBinScoreEvaluator.
    * @param seed random seed
    * @param stratify whether or not stratify cross validation. Caution : setting that param to true might
    *        impact the runtime
@@ -170,7 +170,8 @@ case object BinaryClassificationModelSelector extends ModelSelectorFactory {
     numFolds: Int = ValidatorParamDefaults.NumFolds,
     validationMetric: OpBinaryClassificationEvaluatorBase[_ <: EvaluationMetrics] =
       Evaluators.BinaryClassification.auPR(),
-    trainTestEvaluators: Seq[OpBinaryClassificationEvaluatorBase[_ <: EvaluationMetrics]] = Seq.empty,
+    trainTestEvaluators: Seq[OpBinaryClassificationEvaluatorBase[_ <: EvaluationMetrics]] = Seq(
+      new OpBinaryClassificationEvaluator, new OpBinScoreEvaluator),
     seed: Long = ValidatorParamDefaults.Seed,
     stratify: Boolean = ValidatorParamDefaults.Stratify,
     parallelism: Int = ValidatorParamDefaults.Parallelism,
@@ -182,9 +183,13 @@ case object BinaryClassificationModelSelector extends ModelSelectorFactory {
       numFolds = numFolds, seed = seed, evaluator = validationMetric, stratify = stratify,
       parallelism = parallelism, maxWait = maxWait
     )
+    // For backwards compatibility, make sure evaluators always include the defaults
+    val allEvaluators = addDefaultEvaluators(trainTestEvaluators, Seq(
+      new OpBinaryClassificationEvaluator, new OpBinScoreEvaluator))
+
     selector(cv,
       splitter = splitter,
-      trainTestEvaluators = Seq(new OpBinaryClassificationEvaluator, new OpBinScoreEvaluator) ++ trainTestEvaluators,
+      trainTestEvaluators = allEvaluators,
       modelTypesToUse = modelTypesToUse,
       modelsAndParameters = modelsAndParameters,
       modelDefaults = Defaults
@@ -197,8 +202,8 @@ case object BinaryClassificationModelSelector extends ModelSelectorFactory {
    * @param splitter instance that will balance and split the data
    * @param trainRatio ratio between training set and validation set (>= 0 && <= 1)
    * @param validationMetric metric name in evaluation: AuROC or AuPR
-   * @param trainTestEvaluators List of evaluators applied on training + holdout data for evaluation. Default is empty
-   *        and default evaluator is added to this list (here Evaluators.BinaryClassification)
+   * @param trainTestEvaluators List of evaluators applied on training + holdout data for evaluation. Default is
+   *        the standard OpBinaryClassificationEvaluator and OpBinScoreEvaluator.
    * @param seed random seed
    * @param stratify whether or not stratify train validation split.
    *        Caution : setting that param to true might impact the runtime
@@ -219,7 +224,8 @@ case object BinaryClassificationModelSelector extends ModelSelectorFactory {
     trainRatio: Double = ValidatorParamDefaults.TrainRatio,
     validationMetric: OpBinaryClassificationEvaluatorBase[_ <: EvaluationMetrics] =
       Evaluators.BinaryClassification.auPR(),
-    trainTestEvaluators: Seq[OpBinaryClassificationEvaluatorBase[_ <: EvaluationMetrics]] = Seq.empty,
+    trainTestEvaluators: Seq[OpBinaryClassificationEvaluatorBase[_ <: EvaluationMetrics]] = Seq(
+      new OpBinaryClassificationEvaluator, new OpBinScoreEvaluator),
     seed: Long = ValidatorParamDefaults.Seed,
     stratify: Boolean = ValidatorParamDefaults.Stratify,
     parallelism: Int = ValidatorParamDefaults.Parallelism,
@@ -231,9 +237,13 @@ case object BinaryClassificationModelSelector extends ModelSelectorFactory {
       trainRatio = trainRatio, seed = seed, validationMetric, stratify = stratify, parallelism = parallelism,
       maxWait = maxWait
     )
+    // For backwards compatibility, make sure evaluators always include the defaults
+    val allEvaluators = addDefaultEvaluators(trainTestEvaluators, Seq(
+      new OpBinaryClassificationEvaluator, new OpBinScoreEvaluator))
+
     selector(ts,
       splitter = splitter,
-      trainTestEvaluators = Seq(new OpBinaryClassificationEvaluator) ++ trainTestEvaluators,
+      trainTestEvaluators = allEvaluators,
       modelTypesToUse = modelTypesToUse,
       modelsAndParameters = modelsAndParameters,
       modelDefaults = Defaults
MultiClassificationModelSelector.scala
@@ -121,8 +121,8 @@ case object MultiClassificationModelSelector extends ModelSelectorFactory {
    * @param splitter instance that will split the data
    * @param numFolds number of folds for cross validation (>= 2)
    * @param validationMetric metric name in evaluation: Accuracy, Precision, Recall or F1
-   * @param trainTestEvaluators List of evaluators applied on training + holdout data for evaluation. Default is empty
-   *        and default evaluator is added to this list (here OpMultiClassificationEvaluator)
+   * @param trainTestEvaluators List of evaluators applied on training + holdout data for evaluation. Default is
+   *        the standard OpMultiClassificationEvaluator.
    * @param seed random seed
    * @param stratify whether or not stratify cross validation. Caution : setting that param to true might
    *        impact the runtime
@@ -143,7 +143,8 @@ case object MultiClassificationModelSelector extends ModelSelectorFactory {
     numFolds: Int = ValidatorParamDefaults.NumFolds,
     validationMetric: OpMultiClassificationEvaluatorBase[_ <: EvaluationMetrics] =
       Evaluators.MultiClassification.error(),
-    trainTestEvaluators: Seq[OpMultiClassificationEvaluatorBase[_ <: EvaluationMetrics]] = Seq.empty,
+    trainTestEvaluators: Seq[OpMultiClassificationEvaluatorBase[_ <: EvaluationMetrics]] = Seq(
+      new OpMultiClassificationEvaluator),
     seed: Long = ValidatorParamDefaults.Seed,
     stratify: Boolean = ValidatorParamDefaults.Stratify,
     parallelism: Int = ValidatorParamDefaults.Parallelism,
@@ -155,9 +156,12 @@ case object MultiClassificationModelSelector extends ModelSelectorFactory {
       numFolds = numFolds, seed = seed, evaluator = validationMetric, stratify = stratify, parallelism = parallelism,
       maxWait = maxWait
     )
+    // For backwards compatibility, make sure evaluators always include the defaults
+    val allEvaluators = addDefaultEvaluators(trainTestEvaluators, Seq(new OpMultiClassificationEvaluator))
+
     selector(cv,
       splitter = splitter,
-      trainTestEvaluators = Seq(new OpMultiClassificationEvaluator) ++ trainTestEvaluators,
+      trainTestEvaluators = allEvaluators,
       modelTypesToUse = modelTypesToUse,
       modelsAndParameters = modelsAndParameters,
       modelDefaults = Defaults
@@ -170,8 +174,8 @@ case object MultiClassificationModelSelector extends ModelSelectorFactory {
    * @param splitter instance that will split the data
    * @param trainRatio ratio between training set and validation set (>= 0 && <= 1)
    * @param validationMetric metric name in evaluation: AuROC or AuPR
-   * @param trainTestEvaluators List of evaluators applied on training + holdout data for evaluation. Default is empty
-   *        and default evaluator is added to this list (here OpMultiClassificationEvaluator)
+   * @param trainTestEvaluators List of evaluators applied on training + holdout data for evaluation. Default is
+   *        the standard OpMultiClassificationEvaluator.
    * @param seed random seed
    * @param stratify whether or not stratify train validation split.
    *        Caution : setting that param to true might impact the runtime
@@ -192,7 +196,8 @@ case object MultiClassificationModelSelector extends ModelSelectorFactory {
     trainRatio: Double = ValidatorParamDefaults.TrainRatio,
     validationMetric: OpMultiClassificationEvaluatorBase[_ <: EvaluationMetrics] =
       Evaluators.MultiClassification.error(),
-    trainTestEvaluators: Seq[OpMultiClassificationEvaluatorBase[_ <: EvaluationMetrics]] = Seq.empty,
+    trainTestEvaluators: Seq[OpMultiClassificationEvaluatorBase[_ <: EvaluationMetrics]] = Seq(
+      new OpMultiClassificationEvaluator),
     seed: Long = ValidatorParamDefaults.Seed,
     stratify: Boolean = ValidatorParamDefaults.Stratify,
     parallelism: Int = ValidatorParamDefaults.Parallelism,
@@ -203,9 +208,12 @@ case object MultiClassificationModelSelector extends ModelSelectorFactory {
     val ts = new OpTrainValidationSplit[ModelType, EstimatorType](
       trainRatio = trainRatio, seed = seed, validationMetric, stratify = stratify, parallelism = parallelism
     )
+    // For backwards compatibility, make sure evaluators always include the defaults
+    val allEvaluators = addDefaultEvaluators(trainTestEvaluators, Seq(new OpMultiClassificationEvaluator))
+
     selector(ts,
       splitter = splitter,
-      trainTestEvaluators = Seq(new OpMultiClassificationEvaluator) ++ trainTestEvaluators,
+      trainTestEvaluators = allEvaluators,
       modelTypesToUse = modelTypesToUse,
       modelsAndParameters = modelsAndParameters,
       modelDefaults = Defaults
RegressionModelSelector.scala
@@ -136,8 +136,8 @@ case object RegressionModelSelector extends ModelSelectorFactory {
    * @param dataSplitter instance that will split the data into training set and test set
    * @param numFolds number of folds for cross validation (>= 2)
    * @param validationMetric metric name in evaluation: RMSE, R2 etc
-   * @param trainTestEvaluators List of evaluators applied on training + holdout data for evaluation. Default is empty
-   *        and default evaluator is added to this list (here Evaluators.Regression)
+   * @param trainTestEvaluators List of evaluators applied on training + holdout data for evaluation. Default is
+   *        the standard OpRegressionEvaluator.
    * @param seed random seed
    * @param parallelism level of parallelism used to schedule a number of models to be trained/evaluated
    *        so that the jobs can be run concurrently
@@ -155,7 +155,7 @@ case object RegressionModelSelector extends ModelSelectorFactory {
     dataSplitter: Option[DataSplitter] = Option(DataSplitter()),
     numFolds: Int = ValidatorParamDefaults.NumFolds,
     validationMetric: OpRegressionEvaluatorBase[_ <: EvaluationMetrics] = Evaluators.Regression.rmse(),
-    trainTestEvaluators: Seq[OpRegressionEvaluatorBase[_ <: EvaluationMetrics]] = Seq.empty,
+    trainTestEvaluators: Seq[OpRegressionEvaluatorBase[_ <: EvaluationMetrics]] = Seq(new OpRegressionEvaluator),
     seed: Long = ValidatorParamDefaults.Seed,
     parallelism: Int = ValidatorParamDefaults.Parallelism,
     modelTypesToUse: Seq[RegressionModelsToTry] = Defaults.modelTypesToUse,
@@ -165,9 +165,12 @@ case object RegressionModelSelector extends ModelSelectorFactory {
     val cv = new OpCrossValidation[ModelType, EstimatorType](
       numFolds = numFolds, seed = seed, evaluator = validationMetric, parallelism = parallelism, maxWait = maxWait
     )
+    // For backwards compatibility, make sure evaluators always include the defaults
+    val allEvaluators = addDefaultEvaluators(trainTestEvaluators, Seq(new OpRegressionEvaluator))
+
     selector(cv,
       splitter = dataSplitter,
-      trainTestEvaluators = Seq(new OpRegressionEvaluator) ++ trainTestEvaluators,
+      trainTestEvaluators = allEvaluators,
       modelTypesToUse = modelTypesToUse,
       modelsAndParameters = modelsAndParameters,
       modelDefaults = Defaults
@@ -181,8 +184,8 @@ case object RegressionModelSelector extends ModelSelectorFactory {
    * @param dataSplitter instance that will split the data into training set and test set
    * @param trainRatio ratio between training set and validation set (>= 0 && <= 1)
    * @param validationMetric metric name in evaluation: RMSE, R2 etc
-   * @param trainTestEvaluators List of evaluators applied on training + holdout data for evaluation. Default is empty
-   *        and default evaluator is added to this list (here Evaluators.Regression)
+   * @param trainTestEvaluators List of evaluators applied on training + holdout data for evaluation. Default is
+   *        the standard OpRegressionEvaluator.
    * @param seed random seed
    * @param parallelism level of parallelism used to schedule a number of models to be trained/evaluated
    *        so that the jobs can be run concurrently
@@ -200,7 +203,7 @@ case object RegressionModelSelector extends ModelSelectorFactory {
     dataSplitter: Option[DataSplitter] = Option(DataSplitter()),
     trainRatio: Double = ValidatorParamDefaults.TrainRatio,
     validationMetric: OpRegressionEvaluatorBase[_ <: EvaluationMetrics] = Evaluators.Regression.rmse(),
-    trainTestEvaluators: Seq[OpRegressionEvaluatorBase[_ <: EvaluationMetrics]] = Seq.empty,
+    trainTestEvaluators: Seq[OpRegressionEvaluatorBase[_ <: EvaluationMetrics]] = Seq(new OpRegressionEvaluator),
     seed: Long = ValidatorParamDefaults.Seed,
     parallelism: Int = ValidatorParamDefaults.Parallelism,
     modelTypesToUse: Seq[RegressionModelsToTry] = Defaults.modelTypesToUse,
@@ -210,9 +213,12 @@ case object RegressionModelSelector extends ModelSelectorFactory {
     val ts = new OpTrainValidationSplit[ModelType, EstimatorType](
       trainRatio = trainRatio, seed = seed, validationMetric, parallelism = parallelism
     )
+    // For backwards compatibility, make sure evaluators always include the defaults
+    val allEvaluators = addDefaultEvaluators(trainTestEvaluators, Seq(new OpRegressionEvaluator))
+
     selector(ts,
       splitter = dataSplitter,
-      trainTestEvaluators = Seq(new OpRegressionEvaluator) ++ trainTestEvaluators,
+      trainTestEvaluators = allEvaluators,
       modelTypesToUse = modelTypesToUse,
       modelsAndParameters = modelsAndParameters,
       modelDefaults = Defaults
ModelSelectorFactory.scala
@@ -33,12 +33,10 @@ package com.salesforce.op.stages.impl.selector
 import com.salesforce.op.evaluators.{EvaluationMetrics, OpEvaluatorBase}
 import com.salesforce.op.stages.impl.ModelsToTry
 import com.salesforce.op.stages.impl.selector.ModelSelectorNames.{EstimatorType, ModelType}
-import com.salesforce.op.stages.impl.tuning.{OpValidator, Splitter, ValidatorParamDefaults}
+import com.salesforce.op.stages.impl.tuning.{OpValidator, Splitter}
 import org.apache.spark.ml.param.ParamMap
 import org.apache.spark.ml.tuning.ParamGridBuilder
 
-import scala.concurrent.duration.Duration
-
 /**
  * Creates the model selector class
  */
@@ -105,4 +103,13 @@ trait ModelSelectorFactory {
     )
   }
 
+  protected def addDefaultEvaluators(
+    evaluators: Seq[OpEvaluatorBase[_ <: EvaluationMetrics]],
+    defaultEvaluators: Seq[OpEvaluatorBase[_ <: EvaluationMetrics]]
+  ): Seq[OpEvaluatorBase[_ <: EvaluationMetrics]] = {
+    defaultEvaluators.filter {
+      e => !evaluators.exists(_.isInstanceOf[e.type])
+    } ++ evaluators
+  }
+
 }
BinaryClassificationModelSelectorTest.scala
@@ -307,14 +307,14 @@ class BinaryClassificationModelSelectorTest extends FlatSpec with TestSparkContext
       largerBetter = false,
       evaluateFn = crossEntropyFun
     )
-
+    val customEvaluators = Seq(crossEntropy)
     val testEstimator =
       BinaryClassificationModelSelector
         .withCrossValidation(
           Option(DataSplitter(reserveTestFraction = 0.2)),
           numFolds = 4,
           validationMetric = Evaluators.BinaryClassification.recall(),
-          trainTestEvaluators = Seq(crossEntropy),
+          trainTestEvaluators = customEvaluators,
           seed = 10L,
           modelsAndParameters = models
         )
@@ -331,6 +331,9 @@ class BinaryClassificationModelSelectorTest extends FlatSpec with TestSparkContext
     val trainMetaData = metaData.trainEvaluation
     val holdOutMetaData = metaData.holdoutEvaluation.get
 
+    // check that the default evaluator(s) got added to the list of evaluators
+    testEstimator.evaluators.length should be > customEvaluators.size
+
     testEstimator.evaluators.foreach {
       case evaluator: OpBinaryClassificationEvaluator => {
         BinaryClassEvalMetrics.values.foreach(metric =>
MultiClassificationModelSelectorTest.scala
@@ -306,14 +306,14 @@ class MultiClassificationModelSelectorTest extends FlatSpec with TestSparkContext
       largerBetter = false,
       evaluateFn = crossEntropyFun
     )
-
+    val customEvaluators = Seq(crossEntropy)
     val testEstimator =
       MultiClassificationModelSelector
         .withCrossValidation(
           Option(DataCutter(42, reserveTestFraction = 0.2, maxLabelCategories = 1000000, minLabelFraction = 0.0)),
           numFolds = 4,
           validationMetric = Evaluators.MultiClassification.precision(),
-          trainTestEvaluators = Seq(crossEntropy),
+          trainTestEvaluators = customEvaluators,
           seed = 10L,
           modelsAndParameters = models
         )
@@ -330,6 +330,9 @@ class MultiClassificationModelSelectorTest extends FlatSpec with TestSparkContext
     val trainMetaData = metaData.trainEvaluation.toJson(false)
     val holdOutMetaData = metaData.holdoutEvaluation.get.toJson(false)
 
+    // check that the default evaluator(s) got added to the list of evaluators
+    testEstimator.evaluators.length should be > customEvaluators.size
+
     testEstimator.evaluators.foreach {
       case _: OpMultiClassificationEvaluator => {
         MultiClassEvalMetrics.values.foreach(metric =>
RegressionModelSelectorTest.scala
@@ -398,10 +398,11 @@ class RegressionModelSelectorTest extends FlatSpec with TestSparkContext
         median.head
       }
     )
+    val customEvaluators = Seq(medianAbsoluteError)
     val testEstimator =
       RegressionModelSelector
         .withCrossValidation(numFolds = 4, validationMetric = medianAbsoluteError,
-          trainTestEvaluators = Seq(medianAbsoluteError), seed = 11L, modelsAndParameters = models)
+          trainTestEvaluators = customEvaluators, seed = 11L, modelsAndParameters = models)
         .setInput(label, features)
     val model = testEstimator.fit(data)
 
@@ -411,6 +412,9 @@ class RegressionModelSelectorTest extends FlatSpec with TestSparkContext
     val trainMetaData = metaData.trainEvaluation
     val holdOutMetaData = metaData.holdoutEvaluation.get
 
+    // check that the default evaluator(s) got added to the list of evaluators
+    testEstimator.evaluators.length should be > customEvaluators.size
+
     testEstimator.evaluators.foreach {
       case evaluator: OpRegressionEvaluator => {
         RegressionEvalMetrics.values.foreach(metric =>
