apache · miland-db · Mar 21, 2024 · Mar 21, 2024 · Mar 21, 2024 · Mar 21, 2024
diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
@@ -549,6 +549,51 @@ public int findInSet(UTF8String match) {
     return 0;
   }
 
+  /**
+   * Collation-aware variant of {@code findInSet(UTF8String)}: selects the comparison strategy
+   * from {@code collationId} and delegates to it.
+   *
+   * @param match the word to look for in this comma separated list
+   * @param collationId id of the collation used to compare the list items with {@code match}
+   * @return whatever the selected strategy returns (1-based item position, 0 for no match)
+   */
+  public int findInSet(UTF8String match, int collationId) {
+    boolean isLowercaseCollation =
+      collationId == CollationFactory.UTF8_BINARY_LCASE_COLLATION_ID;
+    if (CollationFactory.fetchCollation(collationId).supportsBinaryEquality) {
+      // Byte-wise equality is enough, so the collation-unaware overload can be reused.
+      return this.findInSet(match);
+    } else if (isLowercaseCollation) {
+      // Lower-case both sides and fall back to the byte-wise comparison.
+      return this.toLowerCase().findInSet(match.toLowerCase());
+    } else {
+      return collationAwareFindInSet(match, collationId);
+    }
+  }
+
+  /**
+   * Looks up {@code match} in this string, which is treated as a comma separated list, using
+   * the string search created for {@code collationId}. Meant for collations other than
+   * UTF8_BINARY.
+   *
+   * @param match the word to look for; a word that contains a comma never matches
+   * @param collationId id of the collation handed to the string search
+   * @return the 1-based position of the first list item matched by {@code match}, or 0 when
+   *         no item matches
+   */
+  private int collationAwareFindInSet(UTF8String match, int collationId) {
+    if (match.contains(COMMA_UTF8)) {
+      return 0;
+    }
+
+    StringSearch search = CollationFactory.getStringSearch(this, match, collationId);
+    String list = this.toString();
+
+    for (int hit = search.next(); hit != StringSearch.DONE; hit = search.next()) {
+      // A hit only counts when it covers a whole item, i.e. it is delimited on both sides
+      // by a comma or by the boundary of the list.
+      if (hit != 0 && list.charAt(hit - 1) != ',') {
+        continue;
+      }
+      int hitEnd = hit + search.getMatchLength();
+      if (hitEnd != list.length() && list.charAt(hitEnd) != ',') {
+        continue;
+      }
+
+      // The item position is one more than the number of commas in front of the hit.
+      int commasBefore = 0;
+      for (int i = 0; i < hit; i++) {
+        if (list.charAt(i) == ',') {
+          commasBefore++;
+        }
+      }
+      return commasBefore + 1;
+    }
+
+    return 0;
+  }
+
   /**
    * Copy the bytes from the current UTF8String, and make a new UTF8String.
    * @param start the start position of the current UTF8String in bytes.
@@ -835,6 +880,27 @@ public int indexOf(UTF8String v, int start) {
     return -1;
   }
 
+  /**
+   * Collation-aware variant of {@code indexOf(UTF8String, int)}: selects the search strategy
+   * from {@code collationId} and delegates to it.
+   *
+   * @param substring the string to search for
+   * @param start the position the search starts from
+   * @param collationId id of the collation used for the comparison
+   * @return whatever the selected strategy returns
+   */
+  public int indexOf(UTF8String substring, int start, int collationId) {
+    boolean isLowercaseCollation =
+      collationId == CollationFactory.UTF8_BINARY_LCASE_COLLATION_ID;
+    if (CollationFactory.fetchCollation(collationId).supportsBinaryEquality) {
+      // Byte-wise equality is enough, so the collation-unaware overload can be reused.
+      return this.indexOf(substring, start);
+    } else if (isLowercaseCollation) {
+      // NOTE(review): the index returned here refers to the lower-cased copy; presumably
+      // lower-casing never changes the number of characters before the match -- confirm.
+      return this.toLowerCase().indexOf(substring.toLowerCase(), start);
+    } else {
+      return collationAwareIndexOf(substring, start, collationId);
+    }
+  }
+
+  /**
+   * Searches this string for {@code substring} with the string search created for
+   * {@code collationId}.
+   *
+   * @param substring the string to search for; an empty substring yields 0
+   * @param start character position the search starts from; negative values are treated as 0
+   * @param collationId id of the collation handed to the string search
+   * @return the 0-based character position of the first match at or after {@code start},
+   *         or -1 when there is none
+   */
+  private int collationAwareIndexOf(UTF8String substring, int start, int collationId) {
+    if (substring.numBytes == 0) {
+      return 0;
+    }
+
+    String text = this.toString();
+    int numCodePoints = text.codePointCount(0, text.length());
+    if (start >= numCodePoints) {
+      // Nothing is left to search. Returning early also avoids positioning the search
+      // outside of the text, or creating one over an empty text, both of which the search
+      // rejects with an exception instead of reporting "not found".
+      return -1;
+    }
+
+    StringSearch stringSearch = CollationFactory.getStringSearch(this, substring, collationId);
+    // The search addresses the text in UTF-16 code units, whereas `start` and the result are
+    // character (code point) positions; translate in both directions so that supplementary
+    // characters do not skew the index.
+    stringSearch.setIndex(text.offsetByCodePoints(0, Math.max(start, 0)));
+
+    int matchStart = stringSearch.next();
+    return matchStart == StringSearch.DONE ? -1 : text.codePointCount(0, matchStart);
+  }
+
   /**
    * Find the `str` from left to right.
    */

diff --git a/...catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/...catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
@@ -994,15 +994,25 @@ case class StringTranslate(srcExpr: Expression, matchingExpr: Expression, replac
 case class FindInSet(left: Expression, right: Expression) extends BinaryExpression
     with ImplicitCastInputTypes with NullIntolerant {
 
-  override def inputTypes: Seq[AbstractDataType] = Seq(StringType, StringType)
+  // Collation id read lazily from the data type of the `left` child. The cast presumes that
+  // `left` has a StringType by the time this value is first used.
+  // NOTE(review): only the left operand's collation is consulted; presumably both operands
+  // are expected to share one collation -- confirm.
+  final lazy val collationId: Int = left.dataType.asInstanceOf[StringType].collationId
+
+  // Both operands are declared as StringTypeAnyCollation.
+  // NOTE(review): presumably this admits strings of any collation, not only the default
+  // one -- confirm against the definition of StringTypeAnyCollation.
+  override def inputTypes: Seq[AbstractDataType] =
+    Seq(StringTypeAnyCollation, StringTypeAnyCollation)
 
-  override protected def nullSafeEval(word: Any, set: Any): Any =
-    set.asInstanceOf[UTF8String].findInSet(word.asInstanceOf[UTF8String])
+  // Interpreted evaluation: looks `word` up in the comma separated `set`, using the
+  // collation-aware overload only when the collation cannot be compared byte-wise.
+  override protected def nullSafeEval(word: Any, set: Any): Any = {
+    val list = set.asInstanceOf[UTF8String]
+    val item = word.asInstanceOf[UTF8String]
+    val binaryEquality = CollationFactory.fetchCollation(collationId).supportsBinaryEquality
+    if (binaryEquality) list.findInSet(item) else list.findInSet(item, collationId)
+  }
 
   override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
-    nullSafeCodeGen(ctx, ev, (word, set) =>
-      s"${ev.value} = $set.findInSet($word);"
-    )
+    // The collation is known while the code is generated, so the overload is chosen here:
+    // the extra collation argument is only emitted when byte-wise equality is not supported.
+    val binaryEquality = CollationFactory.fetchCollation(collationId).supportsBinaryEquality
+    val collationArg = if (binaryEquality) "" else s", $collationId"
+    nullSafeCodeGen(ctx, ev, (word, set) => s"${ev.value} = $set.findInSet($word$collationArg);")
   }
 
   override def dataType: DataType = IntegerType
@@ -1366,20 +1376,30 @@ case class StringTrimRight(srcStr: Expression, trimStr: Option[Expression] = Non
 case class StringInstr(str: Expression, substr: Expression)
   extends BinaryExpression with ImplicitCastInputTypes with NullIntolerant {
 
+  // Collation id read lazily from the data type of `left` (the `str` operand). The cast
+  // presumes that `left` has a StringType by the time this value is first used.
+  // NOTE(review): only the left operand's collation is consulted; presumably both operands
+  // are expected to share one collation -- confirm.
+  final lazy val collationId: Int = left.dataType.asInstanceOf[StringType].collationId
+
   override def left: Expression = str
   override def right: Expression = substr
   override def dataType: DataType = IntegerType
-  override def inputTypes: Seq[DataType] = Seq(StringType, StringType)
+  // Both operands are declared as StringTypeAnyCollation.
+  // NOTE(review): presumably this admits strings of any collation, not only the default
+  // one -- confirm against the definition of StringTypeAnyCollation.
+  override def inputTypes: Seq[AbstractDataType] =
+    Seq(StringTypeAnyCollation, StringTypeAnyCollation)
 
   override def nullSafeEval(string: Any, sub: Any): Any = {
-    string.asInstanceOf[UTF8String].indexOf(sub.asInstanceOf[UTF8String], 0) + 1
+    val text = string.asInstanceOf[UTF8String]
+    val pattern = sub.asInstanceOf[UTF8String]
+    // The collation-aware overload is only needed when byte-wise equality is not supported.
+    val zeroBased =
+      if (CollationFactory.fetchCollation(collationId).supportsBinaryEquality) {
+        text.indexOf(pattern, 0)
+      } else {
+        text.indexOf(pattern, 0, collationId)
+      }
+    // Shift the 0-based result of indexOf to the 1-based position reported by instr.
+    zeroBased + 1
   }
 
   override def prettyName: String = "instr"
 
   override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
-    defineCodeGen(ctx, ev, (l, r) =>
-      s"($l).indexOf($r, 0) + 1")
+    // The collation is known while the code is generated, so the overload is chosen here:
+    // the extra collation argument is only emitted when byte-wise equality is not supported.
+    val binaryEquality = CollationFactory.fetchCollation(collationId).supportsBinaryEquality
+    val collationArg = if (binaryEquality) "" else s", $collationId"
+    defineCodeGen(ctx, ev, (l, r) => s"($l).indexOf($r, 0$collationArg) + 1")
   }
 
   override protected def withNewChildrenInternal(

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala
@@ -21,7 +21,7 @@ import scala.collection.immutable.Seq
 
 import org.apache.spark.SparkConf
 import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.DataTypeMismatch
-import org.apache.spark.sql.catalyst.expressions.{Collation, ConcatWs, ExpressionEvalHelper, Literal, StringRepeat}
+import org.apache.spark.sql.catalyst.expressions.{Collation, ConcatWs, ExpressionEvalHelper, FindInSet, Literal, StringInstr, StringRepeat}
 import org.apache.spark.sql.catalyst.util.CollationFactory
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.test.SharedSparkSession
@@ -76,17 +76,128 @@ class CollationStringExpressionsSuite
     )
   }
 
+  test("INSTR check result on explicitly collated strings") {
+    // Evaluates instr(str, substr) for every (str, substr, expected) triple, with both
+    // literals typed with the collation named `collationName`.
+    def checkInstr(collationName: String)(cases: (String, String, Int)*): Unit = {
+      val collationId = CollationFactory.collationNameToId(collationName)
+      cases.foreach { case (str, substr, expected) =>
+        val string = Literal.create(str, StringType(collationId))
+        val substring = Literal.create(substr, StringType(collationId))
+        checkEvaluation(StringInstr(string, substring), expected)
+      }
+    }
+
+    checkInstr("UTF8_BINARY")(
+      ("aaads", "Aa", 0),
+      ("aaaDs", "de", 0),
+      ("aaads", "ds", 4),
+      ("xxxx", "", 1),
+      ("", "xxxx", 0),
+      // scalastyle:off
+      ("test大千世界X大千世界", "大千", 5),
+      ("test大千世界X大千世界", "界X", 8)
+      // scalastyle:on
+    )
+
+    checkInstr("UTF8_BINARY_LCASE")(
+      ("aaads", "Aa", 1),
+      ("aaaDs", "de", 0),
+      ("aaaDs", "ds", 4),
+      ("xxxx", "", 1),
+      ("", "xxxx", 0),
+      // scalastyle:off
+      ("test大千世界X大千世界", "大千", 5),
+      ("test大千世界X大千世界", "界x", 8)
+      // scalastyle:on
+    )
+
+    checkInstr("UNICODE")(
+      ("aaads", "Aa", 0),
+      ("aaads", "aa", 1),
+      ("aaads", "de", 0),
+      ("xxxx", "", 1),
+      ("", "xxxx", 0),
+      // scalastyle:off
+      ("test大千世界X大千世界", "界x", 0),
+      ("test大千世界X大千世界", "界X", 8)
+      // scalastyle:on
+    )
+
+    checkInstr("UNICODE_CI")(
+      ("aaads", "AD", 3),
+      ("aaads", "dS", 4),
+      // scalastyle:off
+      ("test大千世界X大千世界", "界x", 8)
+      // scalastyle:on
+    )
+  }
+
+  test("FIND_IN_SET check result on explicitly collated strings") {
+    // Evaluates find_in_set(word, set) for every (word, set, expected) triple, with both
+    // literals typed with the collation named `collationName`.
+    def checkFindInSet(collationName: String)(cases: (String, String, Int)*): Unit = {
+      val collationId = CollationFactory.collationNameToId(collationName)
+      cases.foreach { case (word, set, expected) =>
+        val w = Literal.create(word, StringType(collationId))
+        val s = Literal.create(set, StringType(collationId))
+        checkEvaluation(FindInSet(w, s), expected)
+      }
+    }
+
+    checkFindInSet("UTF8_BINARY")(
+      ("AB", "abc,b,ab,c,def", 0),
+      ("abc", "abc,b,ab,c,def", 1),
+      ("def", "abc,b,ab,c,def", 5),
+      ("d,ef", "abc,b,ab,c,def", 0),
+      ("", "abc,b,ab,c,def", 0)
+    )
+
+    checkFindInSet("UTF8_BINARY_LCASE")(
+      ("a", "abc,b,ab,c,def", 0),
+      ("c", "abc,b,ab,c,def", 4),
+      ("AB", "abc,b,ab,c,def", 3),
+      ("AbC", "abc,b,ab,c,def", 1),
+      ("abcd", "abc,b,ab,c,def", 0),
+      ("d,ef", "abc,b,ab,c,def", 0),
+      ("XX", "xx", 1),
+      ("", "abc,b,ab,c,def", 0),
+      // scalastyle:off
+      ("界x", "test,大千,世,界X,大,千,世界", 4)
+      // scalastyle:on
+    )
+
+    checkFindInSet("UNICODE")(
+      ("a", "abc,b,ab,c,def", 0),
+      ("ab", "abc,b,ab,c,def", 3),
+      ("Ab", "abc,b,ab,c,def", 0),
+      ("d,ef", "abc,b,ab,c,def", 0),
+      ("xx", "xx", 1),
+      // scalastyle:off
+      ("界x", "test,大千,世,界X,大,千,世界", 0),
+      ("大", "test,大千,世,界X,大,千,世界", 5)
+      // scalastyle:on
+    )
+
+    checkFindInSet("UNICODE_CI")(
+      ("a", "abc,b,ab,c,def", 0),
+      ("C", "abc,b,ab,c,def", 4),
+      ("DeF", "abc,b,ab,c,dEf", 5),
+      ("DEFG", "abc,b,ab,c,def", 0),
+      ("XX", "xx", 1),
+      // scalastyle:off
+      ("界x", "test,大千,世,界X,大,千,世界", 4),
+      ("界x", "test,大千,界Xx,世,界X,大,千,世界", 5),
+      ("大", "test,大千,世,界X,大,千,世界", 5)
+      // scalastyle:on
+    )
+  }
+
   test("REPEAT check output type on explicitly collated string") {
-    def testRepeat(expected: String, collationId: Int, input: String, n: Int): Unit = {
+    def testRepeat(input: String, n: Int, collationId: Int, expected: String): Unit = {
       val s = Literal.create(input, StringType(collationId))
 
       checkEvaluation(Collation(StringRepeat(s, Literal.create(n))).replacement, expected)
     }
 
-    testRepeat("UTF8_BINARY", 0, "abc", 2)
-    testRepeat("UTF8_BINARY_LCASE", 1, "abc", 2)
-    testRepeat("UNICODE", 2, "abc", 2)
-    testRepeat("UNICODE_CI", 3, "abc", 2)
+    // Arbitrary repeat count; its value is not important for this test.
+    val repeatNum = 2
+
+    // The collation name expected for the result is the one the input literal is typed with.
+    Seq("UTF8_BINARY", "UTF8_BINARY_LCASE", "UNICODE", "UNICODE_CI").foreach { collationName =>
+      val collationId = CollationFactory.collationNameToId(collationName)
+      testRepeat("abc", repeatNum, collationId, collationName)
+    }
   }
 
   // TODO: Add more tests for other string expressions