From e037953d4c5879f927c527685c2d027fe2fc08c1 Mon Sep 17 00:00:00 2001
From: panbingkun
Date: Wed, 11 Sep 2024 20:58:51 +0200
Subject: [PATCH] [SPARK-48549][SQL][PYTHON] Improve SQL function `sentences`

### What changes were proposed in this pull request?
This PR aims to improve the SQL function `sentences`. It includes:
- updating the description of the `sentences` expression to make it more accurate.
- adding `def sentences(string: Column, language: Column): Column` to the SQL functions API.
- `codegen` support for `sentences`.

### Why are the changes needed?
To fix the inconsistency in how the function `sentences` can be invoked in the following scenarios (screenshot omitted):
- According to the definition of the function `sentences`, only the following two kinds of calls should be allowed:
  A. `sentences(str)`
  B. `sentences(str, language, country)` (the parameters `language` and `country` either coexist or are both absent)

  **In the file `sql/core/src/main/scala/org/apache/spark/sql/functions.scala`, only the following two functions are defined**:
  https://github.com/apache/spark/blob/f4434c36cc4f7b0147e0e8fe26ac0f177a5199cd/sql/core/src/main/scala/org/apache/spark/sql/functions.scala#L4273-L4282

- When we call the expression `sentences` directly, it actually supports the following:
  A. `df.select(sentences($"str", $"language", $"country"))`
  B. `df.select(sentences($"str", $"language"))`
  C. `df.select(sentences($"str"))`

## Let's align it

### Does this PR introduce _any_ user-facing change?
Yes, the SQL function `sentences` can now be called with the parameters (`str`, `language`).

### How was this patch tested?
- Added new UTs & updated existing UTs.
- Passed GA.
- Manual check:
```scala
scala> val df = Seq(("Hi there! The price was $1,234.56.... But, not now.", "en", "US")).toDF("str", "language", "country");
val df: org.apache.spark.sql.DataFrame = [str: string, language: string ... 1 more field]

scala> df.select(sentences($"str", $"language", $"country"));
val res0: org.apache.spark.sql.DataFrame = [sentences(str, language, country): array<array<string>>]

scala> df.select(sentences($"str", $"language"));
val res1: org.apache.spark.sql.DataFrame = [sentences(str, language, ): array<array<string>>]

scala> df.select(sentences($"str"));
val res2: org.apache.spark.sql.DataFrame = [sentences(str, , ): array<array<string>>]

scala> df.selectExpr("sentences(str, language, country)");
val res3: org.apache.spark.sql.DataFrame = [sentences(str, language, country): array<array<string>>]

scala> df.selectExpr("sentences(str, language)");
val res4: org.apache.spark.sql.DataFrame = [sentences(str, language, ): array<array<string>>]

scala> df.selectExpr("sentences(str)");
val res5: org.apache.spark.sql.DataFrame = [sentences(str, , ): array<array<string>>]
```

### Was this patch authored or co-authored using generative AI tooling?
No.

Closes #46880 from panbingkun/sentences_improve.
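For completeness, a minimal, self-contained PySpark sketch of the new two-argument call (illustrative only, not part of the patch; it mirrors the doctest added to `python/pyspark/sql/functions/builtin.py`):

```python
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit, sentences

spark = SparkSession.builder.getOrCreate()

# Data mirrors the doctest added in this PR.
df = spark.createDataFrame([("This is an example sentence.",)], ["string"])

# New two-argument form: only the language is supplied; the country defaults to ''.
df.select(sentences(df.string, lit("en"))).show(truncate=False)
# Expected result column `sentences(string, en, )`:
# [[This, is, an, example, sentence]]
```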
Authored-by: panbingkun Signed-off-by: Max Gekk --- .../spark/sql/PlanGenerationTestSuite.scala | 6 +- python/pyspark/sql/functions/builtin.py | 22 ++++- .../org/apache/spark/sql/functions.scala | 9 ++ .../expressions/ExpressionImplUtils.java | 61 +++++++++++-- .../expressions/stringExpressions.scala | 80 ++++++++---------- .../expressions/StringExpressionsSuite.scala | 2 +- .../function_sentences.explain | 2 +- .../function_sentences_with_language.explain | 2 + ...entences_with_language_and_country.explain | 2 + .../function_sentences_with_locale.explain | 2 - .../function_sentences_with_language.json | 29 +++++++ ...function_sentences_with_language.proto.bin | Bin 0 -> 186 bytes ..._sentences_with_language_and_country.json} | 0 ...ences_with_language_and_country.proto.bin} | Bin .../spark/sql/StringFunctionsSuite.scala | 28 ++++++ 15 files changed, 187 insertions(+), 58 deletions(-) create mode 100644 sql/connect/common/src/test/resources/query-tests/explain-results/function_sentences_with_language.explain create mode 100644 sql/connect/common/src/test/resources/query-tests/explain-results/function_sentences_with_language_and_country.explain delete mode 100644 sql/connect/common/src/test/resources/query-tests/explain-results/function_sentences_with_locale.explain create mode 100644 sql/connect/common/src/test/resources/query-tests/queries/function_sentences_with_language.json create mode 100644 sql/connect/common/src/test/resources/query-tests/queries/function_sentences_with_language.proto.bin rename sql/connect/common/src/test/resources/query-tests/queries/{function_sentences_with_locale.json => function_sentences_with_language_and_country.json} (100%) rename sql/connect/common/src/test/resources/query-tests/queries/{function_sentences_with_locale.proto.bin => function_sentences_with_language_and_country.proto.bin} (100%) diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala index ee91f3aa6c00a..315f80e13eff7 100644 --- a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala +++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala @@ -1809,7 +1809,11 @@ class PlanGenerationTestSuite fn.sentences(fn.col("g")) } - functionTest("sentences with locale") { + functionTest("sentences with language") { + fn.sentences(fn.col("g"), lit("en")) + } + + functionTest("sentences with language and country") { fn.sentences(fn.col("g"), lit("en"), lit("US")) } diff --git a/python/pyspark/sql/functions/builtin.py b/python/pyspark/sql/functions/builtin.py index b6499eb1546e7..781bf3d9f83a2 100644 --- a/python/pyspark/sql/functions/builtin.py +++ b/python/pyspark/sql/functions/builtin.py @@ -11241,13 +11241,27 @@ def sentences( ) -> Column: """ Splits a string into arrays of sentences, where each sentence is an array of words. - The 'language' and 'country' arguments are optional, and if omitted, the default locale is used. + The `language` and `country` arguments are optional, + When they are omitted: + 1.If they are both omitted, the `Locale.ROOT - locale(language='', country='')` is used. + The `Locale.ROOT` is regarded as the base locale of all locales, and is used as the + language/country neutral locale for the locale sensitive operations. + 2.If the `country` is omitted, the `locale(language, country='')` is used. 
+ When they are null: + 1.If they are both `null`, the `Locale.US - locale(language='en', country='US')` is used. + 2.If the `language` is null and the `country` is not null, + the `Locale.US - locale(language='en', country='US')` is used. + 3.If the `language` is not null and the `country` is null, the `locale(language)` is used. + 4.If neither is `null`, the `locale(language, country)` is used. .. versionadded:: 3.2.0 .. versionchanged:: 3.4.0 Supports Spark Connect. + .. versionchanged:: 4.0.0 + Supports `sentences(string, language)`. + Parameters ---------- string : :class:`~pyspark.sql.Column` or str @@ -11271,6 +11285,12 @@ def sentences( +-----------------------------------+ |[[This, is, an, example, sentence]]| +-----------------------------------+ + >>> df.select(sentences(df.string, lit("en"))).show(truncate=False) + +-----------------------------------+ + |sentences(string, en, ) | + +-----------------------------------+ + |[[This, is, an, example, sentence]]| + +-----------------------------------+ >>> df = spark.createDataFrame([["Hello world. How are you?"]], ["s"]) >>> df.select(sentences("s")).show(truncate=False) +---------------------------------+ diff --git a/sql/api/src/main/scala/org/apache/spark/sql/functions.scala b/sql/api/src/main/scala/org/apache/spark/sql/functions.scala index 1ee86ae1a113d..86f8923f36b40 100644 --- a/sql/api/src/main/scala/org/apache/spark/sql/functions.scala +++ b/sql/api/src/main/scala/org/apache/spark/sql/functions.scala @@ -4349,6 +4349,15 @@ object functions { def sentences(string: Column, language: Column, country: Column): Column = Column.fn("sentences", string, language, country) + /** + * Splits a string into arrays of sentences, where each sentence is an array of words. The + * default `country`('') is used. + * @group string_funcs + * @since 4.0.0 + */ + def sentences(string: Column, language: Column): Column = + Column.fn("sentences", string, language) + /** * Splits a string into arrays of sentences, where each sentence is an array of words. The * default locale is used. 
diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionImplUtils.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionImplUtils.java index 07a9409bc57a2..18646f67975c0 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionImplUtils.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionImplUtils.java @@ -17,20 +17,25 @@ package org.apache.spark.sql.catalyst.expressions; -import org.apache.spark.SparkBuildInfo; -import org.apache.spark.sql.errors.QueryExecutionErrors; -import org.apache.spark.unsafe.types.UTF8String; -import org.apache.spark.util.VersionUtils; - -import javax.crypto.Cipher; -import javax.crypto.spec.GCMParameterSpec; -import javax.crypto.spec.IvParameterSpec; -import javax.crypto.spec.SecretKeySpec; import java.nio.ByteBuffer; import java.security.GeneralSecurityException; import java.security.SecureRandom; import java.security.spec.AlgorithmParameterSpec; +import java.text.BreakIterator; +import java.util.ArrayList; +import java.util.List; +import java.util.Locale; +import javax.crypto.Cipher; +import javax.crypto.spec.GCMParameterSpec; +import javax.crypto.spec.IvParameterSpec; +import javax.crypto.spec.SecretKeySpec; +import org.apache.spark.SparkBuildInfo; +import org.apache.spark.sql.catalyst.util.ArrayData; +import org.apache.spark.sql.catalyst.util.GenericArrayData; +import org.apache.spark.sql.errors.QueryExecutionErrors; +import org.apache.spark.unsafe.types.UTF8String; +import org.apache.spark.util.VersionUtils; /** * A utility class for constructing expressions. @@ -272,4 +277,42 @@ private static byte[] aesInternal( throw QueryExecutionErrors.aesCryptoError(e.getMessage()); } } + + public static ArrayData getSentences( + UTF8String str, + UTF8String language, + UTF8String country) { + if (str == null) return null; + Locale locale; + if (language != null && country != null) { + locale = new Locale(language.toString(), country.toString()); + } else if (language != null) { + locale = new Locale(language.toString()); + } else { + locale = Locale.US; + } + String sentences = str.toString(); + BreakIterator sentenceInstance = BreakIterator.getSentenceInstance(locale); + sentenceInstance.setText(sentences); + + int sentenceIndex = 0; + List<GenericArrayData> res = new ArrayList<>(); + while (sentenceInstance.next() != BreakIterator.DONE) { + String sentence = sentences.substring(sentenceIndex, sentenceInstance.current()); + sentenceIndex = sentenceInstance.current(); + BreakIterator wordInstance = BreakIterator.getWordInstance(locale); + wordInstance.setText(sentence); + int wordIndex = 0; + List<UTF8String> words = new ArrayList<>(); + while (wordInstance.next() != BreakIterator.DONE) { + String word = sentence.substring(wordIndex, wordInstance.current()); + wordIndex = wordInstance.current(); + if (Character.isLetterOrDigit(word.charAt(0))) { + words.add(UTF8String.fromString(word)); + } + } + res.add(new GenericArrayData(words.toArray(new UTF8String[0]))); + } + return new GenericArrayData(res.toArray(new GenericArrayData[0])); + } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala index f211da52e4570..e75df87994f0e 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala +++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql.catalyst.expressions import java.nio.{ByteBuffer, CharBuffer} import java.nio.charset.CharacterCodingException -import java.text.{BreakIterator, DecimalFormat, DecimalFormatSymbols} +import java.text.{DecimalFormat, DecimalFormatSymbols} import java.util.{Base64 => JBase64, HashMap, Locale, Map => JMap} import scala.collection.mutable.ArrayBuffer @@ -3327,14 +3327,37 @@ case class FormatNumber(x: Expression, d: Expression) /** * Splits a string into arrays of sentences, where each sentence is an array of words. - * The 'lang' and 'country' arguments are optional, and if omitted, the default locale is used. + * The `lang` and `country` arguments are optional, their default values are all '', + * - When they are omitted: + * 1. If they are both omitted, the `Locale.ROOT - locale(language='', country='')` is used. + * The `Locale.ROOT` is regarded as the base locale of all locales, and is used as the + * language/country neutral locale for the locale sensitive operations. + * 2. If the `country` is omitted, the `locale(language, country='')` is used. + * - When they are null: + * 1. If they are both `null`, the `Locale.US - locale(language='en', country='US')` is used. + * 2. If the `language` is null and the `country` is not null, + * the `Locale.US - locale(language='en', country='US')` is used. + * 3. If the `language` is not null and the `country` is null, the `locale(language)` is used. + * 4. If neither is `null`, the `locale(language, country)` is used. */ @ExpressionDescription( - usage = "_FUNC_(str[, lang, country]) - Splits `str` into an array of array of words.", + usage = "_FUNC_(str[, lang[, country]]) - Splits `str` into an array of array of words.", + arguments = """ + Arguments: + * str - A STRING expression to be parsed. + * lang - An optional STRING expression with a language code from ISO 639 Alpha-2 (e.g. 'DE'), + Alpha-3, or a language subtag of up to 8 characters. + * country - An optional STRING expression with a country code from ISO 3166 alpha-2 country + code or a UN M.49 numeric-3 area code. + """, examples = """ Examples: > SELECT _FUNC_('Hi there! Good morning.'); [["Hi","there"],["Good","morning"]] + > SELECT _FUNC_('Hi there! Good morning.', 'en'); + [["Hi","there"],["Good","morning"]] + > SELECT _FUNC_('Hi there! 
Good morning.', 'en', 'US'); + [["Hi","there"],["Good","morning"]] """, since = "2.0.0", group = "string_funcs") @@ -3342,7 +3365,9 @@ case class Sentences( str: Expression, language: Expression = Literal(""), country: Expression = Literal("")) - extends TernaryExpression with ImplicitCastInputTypes with CodegenFallback { + extends TernaryExpression + with ImplicitCastInputTypes + with RuntimeReplaceable { def this(str: Expression) = this(str, Literal(""), Literal("")) def this(str: Expression, language: Expression) = this(str, language, Literal("")) @@ -3356,49 +3381,18 @@ case class Sentences( override def second: Expression = language override def third: Expression = country - override def eval(input: InternalRow): Any = { - val string = str.eval(input) - if (string == null) { - null - } else { - val languageStr = language.eval(input).asInstanceOf[UTF8String] - val countryStr = country.eval(input).asInstanceOf[UTF8String] - val locale = if (languageStr != null && countryStr != null) { - new Locale(languageStr.toString, countryStr.toString) - } else { - Locale.US - } - getSentences(string.asInstanceOf[UTF8String].toString, locale) - } - } - - private def getSentences(sentences: String, locale: Locale) = { - val bi = BreakIterator.getSentenceInstance(locale) - bi.setText(sentences) - var idx = 0 - val result = new ArrayBuffer[GenericArrayData] - while (bi.next != BreakIterator.DONE) { - val sentence = sentences.substring(idx, bi.current) - idx = bi.current - - val wi = BreakIterator.getWordInstance(locale) - var widx = 0 - wi.setText(sentence) - val words = new ArrayBuffer[UTF8String] - while (wi.next != BreakIterator.DONE) { - val word = sentence.substring(widx, wi.current) - widx = wi.current - if (Character.isLetterOrDigit(word.charAt(0))) words += UTF8String.fromString(word) - } - result += new GenericArrayData(words) - } - new GenericArrayData(result) - } + override def replacement: Expression = + StaticInvoke( + classOf[ExpressionImplUtils], + dataType, + "getSentences", + Seq(str, language, country), + inputTypes, + propagateNull = false) override protected def withNewChildrenInternal( newFirst: Expression, newSecond: Expression, newThird: Expression): Sentences = copy(str = newFirst, language = newSecond, country = newThird) - } /** diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala index beefabd981089..29b878230472d 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala @@ -1987,7 +1987,7 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { // Test escaping of arguments GenerateUnsafeProjection.generate( - Sentences(Literal("\"quote"), Literal("\"quote"), Literal("\"quote")) :: Nil) + Sentences(Literal("\"quote"), Literal("\"quote"), Literal("\"quote")).replacement :: Nil) } test("SPARK-33386: elt ArrayIndexOutOfBoundsException") { diff --git a/sql/connect/common/src/test/resources/query-tests/explain-results/function_sentences.explain b/sql/connect/common/src/test/resources/query-tests/explain-results/function_sentences.explain index 5c88a1f7b3abd..f4532e70675ae 100644 --- a/sql/connect/common/src/test/resources/query-tests/explain-results/function_sentences.explain +++ 
b/sql/connect/common/src/test/resources/query-tests/explain-results/function_sentences.explain @@ -1,2 +1,2 @@ -Project [sentences(g#0, , ) AS sentences(g, , )#0] +Project [static_invoke(ExpressionImplUtils.getSentences(g#0, , )) AS sentences(g, , )#0] +- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/sql/connect/common/src/test/resources/query-tests/explain-results/function_sentences_with_language.explain b/sql/connect/common/src/test/resources/query-tests/explain-results/function_sentences_with_language.explain new file mode 100644 index 0000000000000..37bcbf9a319b5 --- /dev/null +++ b/sql/connect/common/src/test/resources/query-tests/explain-results/function_sentences_with_language.explain @@ -0,0 +1,2 @@ +Project [static_invoke(ExpressionImplUtils.getSentences(g#0, en, )) AS sentences(g, en, )#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/sql/connect/common/src/test/resources/query-tests/explain-results/function_sentences_with_language_and_country.explain b/sql/connect/common/src/test/resources/query-tests/explain-results/function_sentences_with_language_and_country.explain new file mode 100644 index 0000000000000..8a8d54cfa0d10 --- /dev/null +++ b/sql/connect/common/src/test/resources/query-tests/explain-results/function_sentences_with_language_and_country.explain @@ -0,0 +1,2 @@ +Project [static_invoke(ExpressionImplUtils.getSentences(g#0, en, US)) AS sentences(g, en, US)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/sql/connect/common/src/test/resources/query-tests/explain-results/function_sentences_with_locale.explain b/sql/connect/common/src/test/resources/query-tests/explain-results/function_sentences_with_locale.explain deleted file mode 100644 index 7819f9b542340..0000000000000 --- a/sql/connect/common/src/test/resources/query-tests/explain-results/function_sentences_with_locale.explain +++ /dev/null @@ -1,2 +0,0 @@ -Project [sentences(g#0, en, US) AS sentences(g, en, US)#0] -+- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_sentences_with_language.json b/sql/connect/common/src/test/resources/query-tests/queries/function_sentences_with_language.json new file mode 100644 index 0000000000000..869e074ccd604 --- /dev/null +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_sentences_with_language.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "sentences", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "g" + } + }, { + "literal": { + "string": "en" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_sentences_with_language.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_sentences_with_language.proto.bin new file mode 100644 index 0000000000000000000000000000000000000000..7514b380a1c82227de4fbdae32194d25a5b9deab GIT binary patch literal 186 zcmd;L5@3|t%*Zu~k&8)yA*!2EsDrV%q^LBx#3nPvDk(EPGp|G^(F#N+S*7HcCgr5+ zq*xJ9VW*R7l~`1iSZM>)XQz{9m77>#1Jsk5m##xdtDR0d$atVqJ1H3{DK5_9)Vz|^ XyyVnkAyy$~F2-~rHZGPdrqnzD=FU5M 
literal 0 HcmV?d00001 diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_sentences_with_locale.json b/sql/connect/common/src/test/resources/query-tests/queries/function_sentences_with_language_and_country.json similarity index 100% rename from sql/connect/common/src/test/resources/query-tests/queries/function_sentences_with_locale.json rename to sql/connect/common/src/test/resources/query-tests/queries/function_sentences_with_language_and_country.json diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_sentences_with_locale.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_sentences_with_language_and_country.proto.bin similarity index 100% rename from sql/connect/common/src/test/resources/query-tests/queries/function_sentences_with_locale.proto.bin rename to sql/connect/common/src/test/resources/query-tests/queries/function_sentences_with_language_and_country.proto.bin diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala index c98dddbfe8e9f..ec240d71b851f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala @@ -714,6 +714,34 @@ class StringFunctionsSuite extends QueryTest with SharedSparkSession { df.select(sentences($"str", $"language", $"country")), Row(Seq(Seq("Hi", "there"), Seq("The", "price", "was"), Seq("But", "not", "now")))) + checkAnswer( + df.selectExpr("sentences(str, language)"), + Row(Seq(Seq("Hi", "there"), Seq("The", "price", "was"), Seq("But", "not", "now")))) + + checkAnswer( + df.select(sentences($"str", $"language")), + Row(Seq(Seq("Hi", "there"), Seq("The", "price", "was"), Seq("But", "not", "now")))) + + checkAnswer( + df.selectExpr("sentences(str)"), + Row(Seq(Seq("Hi", "there"), Seq("The", "price", "was"), Seq("But", "not", "now")))) + + checkAnswer( + df.select(sentences($"str")), + Row(Seq(Seq("Hi", "there"), Seq("The", "price", "was"), Seq("But", "not", "now")))) + + checkAnswer( + df.selectExpr("sentences(str, null, null)"), + Row(Seq(Seq("Hi", "there"), Seq("The", "price", "was"), Seq("But", "not", "now")))) + + checkAnswer( + df.selectExpr("sentences(str, '', null)"), + Row(Seq(Seq("Hi", "there"), Seq("The", "price", "was"), Seq("But", "not", "now")))) + + checkAnswer( + df.selectExpr("sentences(str, null)"), + Row(Seq(Seq("Hi", "there"), Seq("The", "price", "was"), Seq("But", "not", "now")))) + // Type coercion checkAnswer( df.selectExpr("sentences(null)", "sentences(10)", "sentences(3.14)"),