Skip to content

Commit

Permalink
address comments
Browse files Browse the repository at this point in the history
  • Loading branch information
zhengruifeng committed Jul 16, 2024
1 parent cacf9d6 commit 3aae4f0
Showing 1 changed file with 8 additions and 17 deletions.
25 changes: 8 additions & 17 deletions mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala
Original file line number Diff line number Diff line change
Expand Up @@ -143,25 +143,16 @@ class RegexTokenizer @Since("1.4.0") (@Since("1.4.0") override val uid: String)

// Builds the String => Seq[String] function that tokenizes one input string using the
// `pattern`, `toLowercase`, `gaps` and `minTokenLength` params.
// NOTE(review): this listing looks like a diff rendered without +/- markers, so the
// `match` version and the single-lambda version both appear below; per the commit header
// (8 additions, 17 deletions) the `match` block is presumably the removed side. Confirm
// against the actual file before reading this as one compilable body.
override protected def createTransformFunc: String => Seq[String] = {
// Compile the `pattern` param into a Regex once, outside the returned function.
val re = $(pattern).r
// Read the params into local vals up front; presumably so the returned closure captures
// plain values rather than the enclosing instance. TODO confirm.
val localToLowercase = $(toLowercase)
val localGaps = $(gaps)
val localMinTokenLength = $(minTokenLength)

// Four-way dispatch on (toLowercase, gaps); every branch ends with the same
// minimum-length filter.
($(toLowercase), $(gaps)) match {
// Lowercase the input, then split it on the regex.
case (true, true) =>
(originStr: String) =>
re.split(originStr.toLowerCase()).toImmutableArraySeq
.filter(_.length >= localMinTokenLength)

// Lowercase the input, then take the regex matches as tokens.
case (true, false) =>
(originStr: String) => re.findAllIn(originStr.toLowerCase()).toSeq
.filter(_.length >= localMinTokenLength)

// Keep the original case and split on the regex.
case (false, true) =>
(originStr: String) => re.split(originStr).toImmutableArraySeq
.filter(_.length >= localMinTokenLength)

// Keep the original case and take the regex matches as tokens.
case (false, false) =>
(originStr: String) => re.findAllIn(originStr).toSeq
.filter(_.length >= localMinTokenLength)
// Single lambda covering all four flag combinations using the local copies read above.
(originStr: String) => {
// scalastyle:off caselocale
val str = if (localToLowercase) originStr.toLowerCase() else originStr
// scalastyle:on caselocale
// With gaps enabled the regex is used as the separator (split); otherwise the regex
// matches are themselves the tokens (findAllIn).
val tokens = if (localGaps) re.split(str).toImmutableArraySeq else re.findAllIn(str).toSeq
// Drop tokens shorter than the configured minimum length.
tokens.filter(_.length >= localMinTokenLength)
}
}

Expand Down

0 comments on commit 3aae4f0

Please sign in to comment.