apache · rebo16v · Oct 4, 2024 · Oct 4, 2024 · Oct 8, 2024 · Oct 8, 2024
diff --git a/docs/ml-features.md b/docs/ml-features.md
@@ -855,6 +855,46 @@ for more details on the API.
 
 </div>
 
+## TargetEncoder
+
+Target Encoding maps a column of categorical indices into a numerical feature derived from the target. For string type input data, it is common to encode categorical features using [StringIndexer](ml-features.html#stringindexer) first.
+
+`TargetEncoder` can transform multiple columns, returning a target-encoded output column for each input column.
+
+`TargetEncoder` supports the `handleInvalid` parameter to choose how to handle invalid input when transforming data. Available options include 'keep' (any invalid inputs are assigned to an extra categorical index) and 'error' (throw an error).
+
+`TargetEncoder` supports the `targetType` parameter to choose the label type when fitting data, affecting how statistics are calculated. Available options include 'binary' (bin-counting) and 'continuous' (mean-encoding).
+
+`TargetEncoder` supports the `smoothing` parameter to tune how in-category statistics and overall statistics are weighted.
+
+**Examples**
+
+<div class="codetabs">
+
+<div data-lang="python" markdown="1">
+
+Refer to the [TargetEncoder Python docs](api/python/reference/api/pyspark.ml.feature.TargetEncoder.html) for more details on the API.
+
+{% include_example python/ml/target_encoder_example.py %}
+</div>
+
+<div data-lang="scala" markdown="1">
+
+Refer to the [TargetEncoder Scala docs](api/scala/org/apache/spark/ml/feature/TargetEncoder.html) for more details on the API.
+
+{% include_example scala/org/apache/spark/examples/ml/TargetEncoderExample.scala %}
+</div>
+
+<div data-lang="java" markdown="1">
+
+Refer to the [TargetEncoder Java docs](api/java/org/apache/spark/ml/feature/TargetEncoder.html)
+for more details on the API.
+
+{% include_example java/org/apache/spark/examples/ml/JavaTargetEncoderExample.java %}
+</div>
+
+</div>
+
 ## VectorIndexer
 
 `VectorIndexer` helps index categorical features in datasets of `Vector`s.

diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaTargetEncoderExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaTargetEncoderExample.java
@@ -0,0 +1,90 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+import org.apache.spark.sql.SparkSession;
+
+// $example on$
+import org.apache.spark.ml.feature.TargetEncoder;
+import org.apache.spark.ml.feature.TargetEncoderModel;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.types.DataTypes;
+import org.apache.spark.sql.types.Metadata;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+
+import java.util.Arrays;
+import java.util.List;
+// $example off$
+
+/** Fits and applies TargetEncoder for a binary and then a continuous label column. */
+public class JavaTargetEncoderExample {
+  public static void main(String[] args) {
+    SparkSession spark = SparkSession
+      .builder()
+      .appName("JavaTargetEncoderExample")
+      .getOrCreate();
+
+    // Note: categorical features are usually first encoded with StringIndexer
+    // $example on$
+    // Labels are written as doubles so every value matches the DoubleType schema below
+    List<Row> data = Arrays.asList(
+      RowFactory.create(0.0, 1.0, 0.0, 10.0),
+      RowFactory.create(1.0, 0.0, 1.0, 20.0),
+      RowFactory.create(2.0, 1.0, 0.0, 30.0),
+      RowFactory.create(0.0, 2.0, 1.0, 40.0),
+      RowFactory.create(0.0, 1.0, 0.0, 50.0),
+      RowFactory.create(2.0, 0.0, 1.0, 60.0)
+    );
+
+    StructType schema = new StructType(new StructField[]{
+      new StructField("categoryIndex1", DataTypes.DoubleType, false, Metadata.empty()),
+      new StructField("categoryIndex2", DataTypes.DoubleType, false, Metadata.empty()),
+      new StructField("binaryLabel", DataTypes.DoubleType, false, Metadata.empty()),
+      new StructField("continuousLabel", DataTypes.DoubleType, false, Metadata.empty())
+    });
+
+    Dataset<Row> df = spark.createDataFrame(data, schema);
+
+    // binary target
+    TargetEncoder binEncoder = new TargetEncoder()
+      .setInputCols(new String[] {"categoryIndex1", "categoryIndex2"})
+      .setOutputCols(new String[] {"categoryIndex1Target", "categoryIndex2Target"})
+      .setLabelCol("binaryLabel")
+      .setTargetType("binary");
+    TargetEncoderModel binModel = binEncoder.fit(df);
+    Dataset<Row> binEncoded = binModel.transform(df);
+    binEncoded.show();
+
+    // continuous target
+    TargetEncoder contEncoder = new TargetEncoder()
+      .setInputCols(new String[] {"categoryIndex1", "categoryIndex2"})
+      .setOutputCols(new String[] {"categoryIndex1Target", "categoryIndex2Target"})
+      .setLabelCol("continuousLabel")
+      .setTargetType("continuous");
+    TargetEncoderModel contModel = contEncoder.fit(df);
+    Dataset<Row> contEncoded = contModel.transform(df);
+    contEncoded.show();
+    // $example off$
+
+    spark.stop();
+  }
+}
+
diff --git a/examples/src/main/python/ml/target_encoder_example.py b/examples/src/main/python/ml/target_encoder_example.py
@@ -0,0 +1,65 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# $example on$
+from pyspark.ml.feature import TargetEncoder
+
+# $example off$
+from pyspark.sql import SparkSession
+
+if __name__ == "__main__":
+    spark = SparkSession.builder.appName("TargetEncoderExample").getOrCreate()
+
+    # Note: categorical features are usually first encoded with StringIndexer
+    # $example on$
+    df = spark.createDataFrame(
+        [
+            (0.0, 1.0, 0, 10.0),
+            (1.0, 0.0, 1, 20.0),
+            (2.0, 1.0, 0, 30.0),
+            (0.0, 2.0, 1, 40.0),
+            (0.0, 1.0, 0, 50.0),
+            (2.0, 0.0, 1, 60.0),
+        ],
+        ["categoryIndex1", "categoryIndex2", "binaryLabel", "continuousLabel"],
+    )
+
+    # binary target
+    encoder = TargetEncoder(
+        inputCols=["categoryIndex1", "categoryIndex2"],
+        outputCols=["categoryIndex1Target", "categoryIndex2Target"],
+        labelCol="binaryLabel",
+        targetType="binary"
+    )
+    model = encoder.fit(df)
+    encoded = model.transform(df)
+    encoded.show()
+
+    # continuous target
+    encoder = TargetEncoder(
+        inputCols=["categoryIndex1", "categoryIndex2"],
+        outputCols=["categoryIndex1Target", "categoryIndex2Target"],
+        labelCol="continuousLabel",
+        targetType="continuous"
+    )
+
+    model = encoder.fit(df)
+    encoded = model.transform(df)
+    encoded.show()
+    # $example off$
+
+    spark.stop()
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/TargetEncoderExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/TargetEncoderExample.scala
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.ml
+
+// $example on$
+import org.apache.spark.ml.feature.TargetEncoder
+// $example off$
+import org.apache.spark.sql.SparkSession
+
+/** Fits and applies TargetEncoder for a binary and then a continuous label column. */
+object TargetEncoderExample {
+  def main(args: Array[String]): Unit = {
+    val spark = SparkSession
+      .builder()
+      .appName("TargetEncoderExample")
+      .getOrCreate()
+
+    // Note: categorical features are usually first encoded with StringIndexer
+    // $example on$
+    val df = spark.createDataFrame(Seq(
+      (0.0, 1.0, 0, 10.0),
+      (1.0, 0.0, 1, 20.0),
+      (2.0, 1.0, 0, 30.0),
+      (0.0, 2.0, 1, 40.0),
+      (0.0, 1.0, 0, 50.0),
+      (2.0, 0.0, 1, 60.0)
+    )).toDF("categoryIndex1", "categoryIndex2", "binaryLabel", "continuousLabel")
+
+    // binary target
+    val binEncoder = new TargetEncoder()
+      .setInputCols(Array("categoryIndex1", "categoryIndex2"))
+      .setOutputCols(Array("categoryIndex1Target", "categoryIndex2Target"))
+      .setLabelCol("binaryLabel")
+      .setTargetType("binary")
+
+    val binModel = binEncoder.fit(df)
+    val binEncoded = binModel.transform(df)
+    binEncoded.show()
+
+    // continuous target
+    val contEncoder = new TargetEncoder()
+      .setInputCols(Array("categoryIndex1", "categoryIndex2"))
+      .setOutputCols(Array("categoryIndex1Target", "categoryIndex2Target"))
+      .setLabelCol("continuousLabel")
+      .setTargetType("continuous")
+
+    val contModel = contEncoder.fit(df)
+    val contEncoded = contModel.transform(df)
+    contEncoded.show()
+    // $example off$
+
+    spark.stop()
+  }
+}
+// scalastyle:on println