From e037953d4c5879f927c527685c2d027fe2fc08c1 Mon Sep 17 00:00:00 2001
From: panbingkun
Date: Wed, 11 Sep 2024 20:58:51 +0200
Subject: [PATCH] [SPARK-48549][SQL][PYTHON] Improve SQL function `sentences`

### What changes were proposed in this pull request?
This PR aims to improve the SQL function `sentences`. It includes:
- updating the description of the `sentences` expression to make it more accurate.
- adding `def sentences(string: Column, language: Column): Column` to the SQL functions API.
- `codegen` support for `sentences`.

### Why are the changes needed?
To fix the inconsistency in how the function `sentences` can be invoked in the following scenarios (screenshot omitted):
- According to the definition of the function `sentences`, only the following two kinds of calls should be allowed:
  A. `sentences(str)`
  B. `sentences(str, language, country)` (the parameters `language` and `country` either coexist or are both absent)

  **In the file `sql/core/src/main/scala/org/apache/spark/sql/functions.scala`, only the following two functions are defined**:
  https://github.com/apache/spark/blob/f4434c36cc4f7b0147e0e8fe26ac0f177a5199cd/sql/core/src/main/scala/org/apache/spark/sql/functions.scala#L4273-L4282

- When we call the expression `sentences` directly, it actually supports the following:
  A. `df.select(sentences($"str", $"language", $"country"))`
  B. `df.select(sentences($"str", $"language"))`
  C. `df.select(sentences($"str"))`

## Let's align it

### Does this PR introduce _any_ user-facing change?
Yes, the SQL function `sentences` can now be called with the parameters (`str`, `language`).

### How was this patch tested?
- Added new UTs & updated existing UTs.
- Passed GA.
- Manual check:
```scala
scala> val df = Seq(("Hi there! The price was $1,234.56.... But, not now.", "en", "US")).toDF("str", "language", "country");
val df: org.apache.spark.sql.DataFrame = [str: string, language: string ... 1 more field]

scala> df.select(sentences($"str", $"language", $"country"));
val res0: org.apache.spark.sql.DataFrame = [sentences(str, language, country): array<array<string>>]

scala> df.select(sentences($"str", $"language"));
val res1: org.apache.spark.sql.DataFrame = [sentences(str, language, ): array<array<string>>]

scala> df.select(sentences($"str"));
val res2: org.apache.spark.sql.DataFrame = [sentences(str, , ): array<array<string>>]

scala> df.selectExpr("sentences(str, language, country)");
val res3: org.apache.spark.sql.DataFrame = [sentences(str, language, country): array<array<string>>]

scala> df.selectExpr("sentences(str, language)");
val res4: org.apache.spark.sql.DataFrame = [sentences(str, language, ): array<array<string>>]

scala> df.selectExpr("sentences(str)");
val res5: org.apache.spark.sql.DataFrame = [sentences(str, , ): array<array<string>>]
```

### Was this patch authored or co-authored using generative AI tooling?
No.

Closes #46880 from panbingkun/sentences_improve.
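For completeness, a minimal, self-contained PySpark sketch of the new two-argument call (illustrative only, not part of the patch; it mirrors the doctest added to `python/pyspark/sql/functions/builtin.py`):

```python
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit, sentences

spark = SparkSession.builder.getOrCreate()

# Data mirrors the doctest added in this PR.
df = spark.createDataFrame([("This is an example sentence.",)], ["string"])

# New two-argument form: only the language is supplied; the country defaults to ''.
df.select(sentences(df.string, lit("en"))).show(truncate=False)
# Expected result column `sentences(string, en, )`:
# [[This, is, an, example, sentence]]
```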
Authored-by: panbingkun Signed-off-by: Max Gekk --- .../spark/sql/PlanGenerationTestSuite.scala | 6 +- python/pyspark/sql/functions/builtin.py | 22 ++++- .../org/apache/spark/sql/functions.scala | 9 ++ .../expressions/ExpressionImplUtils.java | 61 +++++++++++-- .../expressions/stringExpressions.scala | 80 ++++++++---------- .../expressions/StringExpressionsSuite.scala | 2 +- .../function_sentences.explain | 2 +- .../function_sentences_with_language.explain | 2 + ...entences_with_language_and_country.explain | 2 + .../function_sentences_with_locale.explain | 2 - .../function_sentences_with_language.json | 29 +++++++ ...function_sentences_with_language.proto.bin | Bin 0 -> 186 bytes ..._sentences_with_language_and_country.json} | 0 ...ences_with_language_and_country.proto.bin} | Bin .../spark/sql/StringFunctionsSuite.scala | 28 ++++++ 15 files changed, 187 insertions(+), 58 deletions(-) create mode 100644 sql/connect/common/src/test/resources/query-tests/explain-results/function_sentences_with_language.explain create mode 100644 sql/connect/common/src/test/resources/query-tests/explain-results/function_sentences_with_language_and_country.explain delete mode 100644 sql/connect/common/src/test/resources/query-tests/explain-results/function_sentences_with_locale.explain create mode 100644 sql/connect/common/src/test/resources/query-tests/queries/function_sentences_with_language.json create mode 100644 sql/connect/common/src/test/resources/query-tests/queries/function_sentences_with_language.proto.bin rename sql/connect/common/src/test/resources/query-tests/queries/{function_sentences_with_locale.json => function_sentences_with_language_and_country.json} (100%) rename sql/connect/common/src/test/resources/query-tests/queries/{function_sentences_with_locale.proto.bin => function_sentences_with_language_and_country.proto.bin} (100%) diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala index ee91f3aa6c00a..315f80e13eff7 100644 --- a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala +++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala @@ -1809,7 +1809,11 @@ class PlanGenerationTestSuite fn.sentences(fn.col("g")) } - functionTest("sentences with locale") { + functionTest("sentences with language") { + fn.sentences(fn.col("g"), lit("en")) + } + + functionTest("sentences with language and country") { fn.sentences(fn.col("g"), lit("en"), lit("US")) } diff --git a/python/pyspark/sql/functions/builtin.py b/python/pyspark/sql/functions/builtin.py index b6499eb1546e7..781bf3d9f83a2 100644 --- a/python/pyspark/sql/functions/builtin.py +++ b/python/pyspark/sql/functions/builtin.py @@ -11241,13 +11241,27 @@ def sentences( ) -> Column: """ Splits a string into arrays of sentences, where each sentence is an array of words. - The 'language' and 'country' arguments are optional, and if omitted, the default locale is used. + The `language` and `country` arguments are optional, + When they are omitted: + 1.If they are both omitted, the `Locale.ROOT - locale(language='', country='')` is used. + The `Locale.ROOT` is regarded as the base locale of all locales, and is used as the + language/country neutral locale for the locale sensitive operations. + 2.If the `country` is omitted, the `locale(language, country='')` is used. 
+ When they are null: + 1.If they are both `null`, the `Locale.US - locale(language='en', country='US')` is used. + 2.If the `language` is null and the `country` is not null, + the `Locale.US - locale(language='en', country='US')` is used. + 3.If the `language` is not null and the `country` is null, the `locale(language)` is used. + 4.If neither is `null`, the `locale(language, country)` is used. .. versionadded:: 3.2.0 .. versionchanged:: 3.4.0 Supports Spark Connect. + .. versionchanged:: 4.0.0 + Supports `sentences(string, language)`. + Parameters ---------- string : :class:`~pyspark.sql.Column` or str @@ -11271,6 +11285,12 @@ def sentences( +-----------------------------------+ |[[This, is, an, example, sentence]]| +-----------------------------------+ + >>> df.select(sentences(df.string, lit("en"))).show(truncate=False) + +-----------------------------------+ + |sentences(string, en, ) | + +-----------------------------------+ + |[[This, is, an, example, sentence]]| + +-----------------------------------+ >>> df = spark.createDataFrame([["Hello world. How are you?"]], ["s"]) >>> df.select(sentences("s")).show(truncate=False) +---------------------------------+ diff --git a/sql/api/src/main/scala/org/apache/spark/sql/functions.scala b/sql/api/src/main/scala/org/apache/spark/sql/functions.scala index 1ee86ae1a113d..86f8923f36b40 100644 --- a/sql/api/src/main/scala/org/apache/spark/sql/functions.scala +++ b/sql/api/src/main/scala/org/apache/spark/sql/functions.scala @@ -4349,6 +4349,15 @@ object functions { def sentences(string: Column, language: Column, country: Column): Column = Column.fn("sentences", string, language, country) + /** + * Splits a string into arrays of sentences, where each sentence is an array of words. The + * default `country`('') is used. + * @group string_funcs + * @since 4.0.0 + */ + def sentences(string: Column, language: Column): Column = + Column.fn("sentences", string, language) + /** * Splits a string into arrays of sentences, where each sentence is an array of words. The * default locale is used. 
diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionImplUtils.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionImplUtils.java index 07a9409bc57a2..18646f67975c0 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionImplUtils.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionImplUtils.java @@ -17,20 +17,25 @@ package org.apache.spark.sql.catalyst.expressions; -import org.apache.spark.SparkBuildInfo; -import org.apache.spark.sql.errors.QueryExecutionErrors; -import org.apache.spark.unsafe.types.UTF8String; -import org.apache.spark.util.VersionUtils; - -import javax.crypto.Cipher; -import javax.crypto.spec.GCMParameterSpec; -import javax.crypto.spec.IvParameterSpec; -import javax.crypto.spec.SecretKeySpec; import java.nio.ByteBuffer; import java.security.GeneralSecurityException; import java.security.SecureRandom; import java.security.spec.AlgorithmParameterSpec; +import java.text.BreakIterator; +import java.util.ArrayList; +import java.util.List; +import java.util.Locale; +import javax.crypto.Cipher; +import javax.crypto.spec.GCMParameterSpec; +import javax.crypto.spec.IvParameterSpec; +import javax.crypto.spec.SecretKeySpec; +import org.apache.spark.SparkBuildInfo; +import org.apache.spark.sql.catalyst.util.ArrayData; +import org.apache.spark.sql.catalyst.util.GenericArrayData; +import org.apache.spark.sql.errors.QueryExecutionErrors; +import org.apache.spark.unsafe.types.UTF8String; +import org.apache.spark.util.VersionUtils; /** * A utility class for constructing expressions. @@ -272,4 +277,42 @@ private static byte[] aesInternal( throw QueryExecutionErrors.aesCryptoError(e.getMessage()); } } + + public static ArrayData getSentences( + UTF8String str, + UTF8String language, + UTF8String country) { + if (str == null) return null; + Locale locale; + if (language != null && country != null) { + locale = new Locale(language.toString(), country.toString()); + } else if (language != null) { + locale = new Locale(language.toString()); + } else { + locale = Locale.US; + } + String sentences = str.toString(); + BreakIterator sentenceInstance = BreakIterator.getSentenceInstance(locale); + sentenceInstance.setText(sentences); + + int sentenceIndex = 0; + List<GenericArrayData> res = new ArrayList<>(); + while (sentenceInstance.next() != BreakIterator.DONE) { + String sentence = sentences.substring(sentenceIndex, sentenceInstance.current()); + sentenceIndex = sentenceInstance.current(); + BreakIterator wordInstance = BreakIterator.getWordInstance(locale); + wordInstance.setText(sentence); + int wordIndex = 0; + List<UTF8String> words = new ArrayList<>(); + while (wordInstance.next() != BreakIterator.DONE) { + String word = sentence.substring(wordIndex, wordInstance.current()); + wordIndex = wordInstance.current(); + if (Character.isLetterOrDigit(word.charAt(0))) { + words.add(UTF8String.fromString(word)); + } + } + res.add(new GenericArrayData(words.toArray(new UTF8String[0]))); + } + return new GenericArrayData(res.toArray(new GenericArrayData[0])); + } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala index f211da52e4570..e75df87994f0e 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala +++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql.catalyst.expressions import java.nio.{ByteBuffer, CharBuffer} import java.nio.charset.CharacterCodingException -import java.text.{BreakIterator, DecimalFormat, DecimalFormatSymbols} +import java.text.{DecimalFormat, DecimalFormatSymbols} import java.util.{Base64 => JBase64, HashMap, Locale, Map => JMap} import scala.collection.mutable.ArrayBuffer @@ -3327,14 +3327,37 @@ case class FormatNumber(x: Expression, d: Expression) /** * Splits a string into arrays of sentences, where each sentence is an array of words. - * The 'lang' and 'country' arguments are optional, and if omitted, the default locale is used. + * The `lang` and `country` arguments are optional, their default values are all '', + * - When they are omitted: + * 1. If they are both omitted, the `Locale.ROOT - locale(language='', country='')` is used. + * The `Locale.ROOT` is regarded as the base locale of all locales, and is used as the + * language/country neutral locale for the locale sensitive operations. + * 2. If the `country` is omitted, the `locale(language, country='')` is used. + * - When they are null: + * 1. If they are both `null`, the `Locale.US - locale(language='en', country='US')` is used. + * 2. If the `language` is null and the `country` is not null, + * the `Locale.US - locale(language='en', country='US')` is used. + * 3. If the `language` is not null and the `country` is null, the `locale(language)` is used. + * 4. If neither is `null`, the `locale(language, country)` is used. */ @ExpressionDescription( - usage = "_FUNC_(str[, lang, country]) - Splits `str` into an array of array of words.", + usage = "_FUNC_(str[, lang[, country]]) - Splits `str` into an array of array of words.", + arguments = """ + Arguments: + * str - A STRING expression to be parsed. + * lang - An optional STRING expression with a language code from ISO 639 Alpha-2 (e.g. 'DE'), + Alpha-3, or a language subtag of up to 8 characters. + * country - An optional STRING expression with a country code from ISO 3166 alpha-2 country + code or a UN M.49 numeric-3 area code. + """, examples = """ Examples: > SELECT _FUNC_('Hi there! Good morning.'); [["Hi","there"],["Good","morning"]] + > SELECT _FUNC_('Hi there! Good morning.', 'en'); + [["Hi","there"],["Good","morning"]] + > SELECT _FUNC_('Hi there! 
Good morning.', 'en', 'US'); + [["Hi","there"],["Good","morning"]] """, since = "2.0.0", group = "string_funcs") @@ -3342,7 +3365,9 @@ case class Sentences( str: Expression, language: Expression = Literal(""), country: Expression = Literal("")) - extends TernaryExpression with ImplicitCastInputTypes with CodegenFallback { + extends TernaryExpression + with ImplicitCastInputTypes + with RuntimeReplaceable { def this(str: Expression) = this(str, Literal(""), Literal("")) def this(str: Expression, language: Expression) = this(str, language, Literal("")) @@ -3356,49 +3381,18 @@ case class Sentences( override def second: Expression = language override def third: Expression = country - override def eval(input: InternalRow): Any = { - val string = str.eval(input) - if (string == null) { - null - } else { - val languageStr = language.eval(input).asInstanceOf[UTF8String] - val countryStr = country.eval(input).asInstanceOf[UTF8String] - val locale = if (languageStr != null && countryStr != null) { - new Locale(languageStr.toString, countryStr.toString) - } else { - Locale.US - } - getSentences(string.asInstanceOf[UTF8String].toString, locale) - } - } - - private def getSentences(sentences: String, locale: Locale) = { - val bi = BreakIterator.getSentenceInstance(locale) - bi.setText(sentences) - var idx = 0 - val result = new ArrayBuffer[GenericArrayData] - while (bi.next != BreakIterator.DONE) { - val sentence = sentences.substring(idx, bi.current) - idx = bi.current - - val wi = BreakIterator.getWordInstance(locale) - var widx = 0 - wi.setText(sentence) - val words = new ArrayBuffer[UTF8String] - while (wi.next != BreakIterator.DONE) { - val word = sentence.substring(widx, wi.current) - widx = wi.current - if (Character.isLetterOrDigit(word.charAt(0))) words += UTF8String.fromString(word) - } - result += new GenericArrayData(words) - } - new GenericArrayData(result) - } + override def replacement: Expression = + StaticInvoke( + classOf[ExpressionImplUtils], + dataType, + "getSentences", + Seq(str, language, country), + inputTypes, + propagateNull = false) override protected def withNewChildrenInternal( newFirst: Expression, newSecond: Expression, newThird: Expression): Sentences = copy(str = newFirst, language = newSecond, country = newThird) - } /** diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala index beefabd981089..29b878230472d 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala @@ -1987,7 +1987,7 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { // Test escaping of arguments GenerateUnsafeProjection.generate( - Sentences(Literal("\"quote"), Literal("\"quote"), Literal("\"quote")) :: Nil) + Sentences(Literal("\"quote"), Literal("\"quote"), Literal("\"quote")).replacement :: Nil) } test("SPARK-33386: elt ArrayIndexOutOfBoundsException") { diff --git a/sql/connect/common/src/test/resources/query-tests/explain-results/function_sentences.explain b/sql/connect/common/src/test/resources/query-tests/explain-results/function_sentences.explain index 5c88a1f7b3abd..f4532e70675ae 100644 --- a/sql/connect/common/src/test/resources/query-tests/explain-results/function_sentences.explain +++ 
b/sql/connect/common/src/test/resources/query-tests/explain-results/function_sentences.explain @@ -1,2 +1,2 @@ -Project [sentences(g#0, , ) AS sentences(g, , )#0] +Project [static_invoke(ExpressionImplUtils.getSentences(g#0, , )) AS sentences(g, , )#0] +- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/sql/connect/common/src/test/resources/query-tests/explain-results/function_sentences_with_language.explain b/sql/connect/common/src/test/resources/query-tests/explain-results/function_sentences_with_language.explain new file mode 100644 index 0000000000000..37bcbf9a319b5 --- /dev/null +++ b/sql/connect/common/src/test/resources/query-tests/explain-results/function_sentences_with_language.explain @@ -0,0 +1,2 @@ +Project [static_invoke(ExpressionImplUtils.getSentences(g#0, en, )) AS sentences(g, en, )#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/sql/connect/common/src/test/resources/query-tests/explain-results/function_sentences_with_language_and_country.explain b/sql/connect/common/src/test/resources/query-tests/explain-results/function_sentences_with_language_and_country.explain new file mode 100644 index 0000000000000..8a8d54cfa0d10 --- /dev/null +++ b/sql/connect/common/src/test/resources/query-tests/explain-results/function_sentences_with_language_and_country.explain @@ -0,0 +1,2 @@ +Project [static_invoke(ExpressionImplUtils.getSentences(g#0, en, US)) AS sentences(g, en, US)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/sql/connect/common/src/test/resources/query-tests/explain-results/function_sentences_with_locale.explain b/sql/connect/common/src/test/resources/query-tests/explain-results/function_sentences_with_locale.explain deleted file mode 100644 index 7819f9b542340..0000000000000 --- a/sql/connect/common/src/test/resources/query-tests/explain-results/function_sentences_with_locale.explain +++ /dev/null @@ -1,2 +0,0 @@ -Project [sentences(g#0, en, US) AS sentences(g, en, US)#0] -+- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_sentences_with_language.json b/sql/connect/common/src/test/resources/query-tests/queries/function_sentences_with_language.json new file mode 100644 index 0000000000000..869e074ccd604 --- /dev/null +++ b/sql/connect/common/src/test/resources/query-tests/queries/function_sentences_with_language.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "sentences", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "g" + } + }, { + "literal": { + "string": "en" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_sentences_with_language.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_sentences_with_language.proto.bin new file mode 100644 index 0000000000000000000000000000000000000000..7514b380a1c82227de4fbdae32194d25a5b9deab GIT binary patch literal 186 zcmd;L5@3|t%*Zu~k&8)yA*!2EsDrV%q^LBx#3nPvDk(EPGp|G^(F#N+S*7HcCgr5+ zq*xJ9VW*R7l~`1iSZM>)XQz{9m77>#1Jsk5m##xdtDR0d$atVqJ1H3{DK5_9)Vz|^ XyyVnkAyy$~F2-~rHZGPdrqnzD=FU5M 
literal 0 HcmV?d00001 diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_sentences_with_locale.json b/sql/connect/common/src/test/resources/query-tests/queries/function_sentences_with_language_and_country.json similarity index 100% rename from sql/connect/common/src/test/resources/query-tests/queries/function_sentences_with_locale.json rename to sql/connect/common/src/test/resources/query-tests/queries/function_sentences_with_language_and_country.json diff --git a/sql/connect/common/src/test/resources/query-tests/queries/function_sentences_with_locale.proto.bin b/sql/connect/common/src/test/resources/query-tests/queries/function_sentences_with_language_and_country.proto.bin similarity index 100% rename from sql/connect/common/src/test/resources/query-tests/queries/function_sentences_with_locale.proto.bin rename to sql/connect/common/src/test/resources/query-tests/queries/function_sentences_with_language_and_country.proto.bin diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala index c98dddbfe8e9f..ec240d71b851f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala @@ -714,6 +714,34 @@ class StringFunctionsSuite extends QueryTest with SharedSparkSession { df.select(sentences($"str", $"language", $"country")), Row(Seq(Seq("Hi", "there"), Seq("The", "price", "was"), Seq("But", "not", "now")))) + checkAnswer( + df.selectExpr("sentences(str, language)"), + Row(Seq(Seq("Hi", "there"), Seq("The", "price", "was"), Seq("But", "not", "now")))) + + checkAnswer( + df.select(sentences($"str", $"language")), + Row(Seq(Seq("Hi", "there"), Seq("The", "price", "was"), Seq("But", "not", "now")))) + + checkAnswer( + df.selectExpr("sentences(str)"), + Row(Seq(Seq("Hi", "there"), Seq("The", "price", "was"), Seq("But", "not", "now")))) + + checkAnswer( + df.select(sentences($"str")), + Row(Seq(Seq("Hi", "there"), Seq("The", "price", "was"), Seq("But", "not", "now")))) + + checkAnswer( + df.selectExpr("sentences(str, null, null)"), + Row(Seq(Seq("Hi", "there"), Seq("The", "price", "was"), Seq("But", "not", "now")))) + + checkAnswer( + df.selectExpr("sentences(str, '', null)"), + Row(Seq(Seq("Hi", "there"), Seq("The", "price", "was"), Seq("But", "not", "now")))) + + checkAnswer( + df.selectExpr("sentences(str, null)"), + Row(Seq(Seq("Hi", "there"), Seq("The", "price", "was"), Seq("But", "not", "now")))) + // Type coercion checkAnswer( df.selectExpr("sentences(null)", "sentences(10)", "sentences(3.14)"),