Skip to content

Commit

Permalink
[SPARK-15442][ML][PYSPARK] Add 'relativeError' param to PySpark Quant…
Browse files Browse the repository at this point in the history
…ileDiscretizer

This PR adds the `relativeError` param to PySpark's `QuantileDiscretizer` to match Scala.

Also cleaned up a duplication of `numBuckets` where the param is both a class and instance attribute (I removed the instance attr to match the style of params throughout `ml`).

Finally, cleaned up the docs for `QuantileDiscretizer` to reflect that it now uses `approxQuantile`.

## How was this patch tested?

Added a small doctest and built the API docs locally to check HTML doc generation.

Author: Nick Pentreath <nickp@za.ibm.com>

Closes #13228 from MLnick/SPARK-15442-py-relerror-param.

(cherry picked from commit 6075f5b)
Signed-off-by: Nick Pentreath <nickp@za.ibm.com>
  • Loading branch information
Nick Pentreath committed May 24, 2016
1 parent 6adbc06 commit 6ee1583
Show file tree
Hide file tree
Showing 2 changed files with 44 additions and 20 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -50,13 +50,13 @@ private[feature] trait QuantileDiscretizerBase extends Params
/**
* Relative error (see documentation for
* [[org.apache.spark.sql.DataFrameStatFunctions.approxQuantile approxQuantile]] for description)
* Must be a number in [0, 1].
* Must be in the range [0, 1].
* default: 0.001
* @group param
*/
val relativeError = new DoubleParam(this, "relativeError", "The relative target precision " +
"for approxQuantile",
ParamValidators.inRange(0.0, 1.0))
"for the approximate quantile algorithm used to generate buckets. " +
"Must be in the range [0, 1].", ParamValidators.inRange(0.0, 1.0))
setDefault(relativeError -> 0.001)

/** @group getParam */
Expand All @@ -66,8 +66,11 @@ private[feature] trait QuantileDiscretizerBase extends Params
/**
* :: Experimental ::
* `QuantileDiscretizer` takes a column with continuous features and outputs a column with binned
* categorical features. The bin ranges are chosen by taking a sample of the data and dividing it
* into roughly equal parts. The lower and upper bin bounds will be -Infinity and +Infinity,
* categorical features. The number of bins can be set using the `numBuckets` parameter.
* The bin ranges are chosen using an approximate algorithm (see the documentation for
* [[org.apache.spark.sql.DataFrameStatFunctions.approxQuantile approxQuantile]]
* for a detailed description). The precision of the approximation can be controlled with the
* `relativeError` parameter. The lower and upper bin bounds will be `-Infinity` and `+Infinity`,
* covering all real values.
*/
@Experimental
Expand Down
51 changes: 36 additions & 15 deletions python/pyspark/ml/feature.py
Original file line number Diff line number Diff line change
Expand Up @@ -1177,16 +1177,20 @@ class QuantileDiscretizer(JavaEstimator, HasInputCol, HasOutputCol, HasSeed, Jav
.. note:: Experimental
`QuantileDiscretizer` takes a column with continuous features and outputs a column with binned
categorical features. The bin ranges are chosen by taking a sample of the data and dividing it
into roughly equal parts. The lower and upper bin bounds will be -Infinity and +Infinity,
covering all real values. This attempts to find numBuckets partitions based on a sample of data,
but it may find fewer depending on the data sample values.
categorical features. The number of bins can be set using the :py:attr:`numBuckets` parameter.
The bin ranges are chosen using an approximate algorithm (see the documentation for
:py:meth:`~.DataFrameStatFunctions.approxQuantile` for a detailed description).
The precision of the approximation can be controlled with the
:py:attr:`relativeError` parameter.
The lower and upper bin bounds will be `-Infinity` and `+Infinity`, covering all real values.
>>> df = spark.createDataFrame([(0.1,), (0.4,), (1.2,), (1.5,)], ["values"])
>>> qds = QuantileDiscretizer(numBuckets=2,
... inputCol="values", outputCol="buckets", seed=123)
... inputCol="values", outputCol="buckets", seed=123, relativeError=0.01)
>>> qds.getSeed()
123
>>> qds.getRelativeError()
0.01
>>> bucketizer = qds.fit(df)
>>> splits = bucketizer.getSplits()
>>> splits[0]
Expand All @@ -1205,32 +1209,35 @@ class QuantileDiscretizer(JavaEstimator, HasInputCol, HasOutputCol, HasSeed, Jav
.. versionadded:: 2.0.0
"""

# a placeholder to make it appear in the generated doc
numBuckets = Param(Params._dummy(), "numBuckets",
"Maximum number of buckets (quantiles, or " +
"categories) into which data points are grouped. Must be >= 2. Default 2.",
"categories) into which data points are grouped. Must be >= 2.",
typeConverter=TypeConverters.toInt)

# Class-level declaration of the `relativeError` param (dummy parent via Params._dummy());
# supplied values are converted with TypeConverters.toFloat.
relativeError = Param(Params._dummy(), "relativeError", "The relative target precision for " +
"the approximate quantile algorithm used to generate buckets. " +
"Must be in the range [0, 1].",
typeConverter=TypeConverters.toFloat)

@keyword_only
def __init__(self, numBuckets=2, inputCol=None, outputCol=None, seed=None):
def __init__(self, numBuckets=2, inputCol=None, outputCol=None, seed=None, relativeError=0.001):
"""
__init__(self, numBuckets=2, inputCol=None, outputCol=None, seed=None)
__init__(self, numBuckets=2, inputCol=None, outputCol=None, seed=None, relativeError=0.001)
"""
super(QuantileDiscretizer, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.QuantileDiscretizer",
self.uid)
self.numBuckets = Param(self, "numBuckets",
"Maximum number of buckets (quantiles, or " +
"categories) into which data points are grouped. Must be >= 2.")
self._setDefault(numBuckets=2)
self._setDefault(numBuckets=2, relativeError=0.001)
kwargs = self.__init__._input_kwargs
self.setParams(**kwargs)

@keyword_only
@since("2.0.0")
def setParams(self, numBuckets=2, inputCol=None, outputCol=None, seed=None):
def setParams(self, numBuckets=2, inputCol=None, outputCol=None, seed=None,
relativeError=0.001):
"""
setParams(self, numBuckets=2, inputCol=None, outputCol=None, seed=None)
setParams(self, numBuckets=2, inputCol=None, outputCol=None, seed=None, \
relativeError=0.001)
Set the params for the QuantileDiscretizer
"""
kwargs = self.setParams._input_kwargs
Expand All @@ -1250,6 +1257,20 @@ def getNumBuckets(self):
"""
return self.getOrDefault(self.numBuckets)

@since("2.0.0")
def setRelativeError(self, value):
    """
    Sets the value of :py:attr:`relativeError`.

    :param value: new value for the ``relativeError`` param.
    :return: the result of the underlying ``_set`` call.
    """
    result = self._set(relativeError=value)
    return result

@since("2.0.0")
def getRelativeError(self):
    """
    Gets the value of :py:attr:`relativeError` or its default value.
    """
    param = self.relativeError
    return self.getOrDefault(param)

def _create_model(self, java_model):
"""
Private method to convert the java_model to a Python model.
Expand Down

0 comments on commit 6ee1583

Please sign in to comment.