titlecase: all non-letters are considered word-separators

JuliaLang · Jan 8, 2018 · f94ab0a · f94ab0a
1 parent 7032e46
commit f94ab0a
Show file tree

Hide file tree

Showing 4 changed files with 34 additions and 8 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -365,9 +365,13 @@ This section lists changes that do not have deprecation warnings.
   * `findn(x::AbstractVector)` now returns a 1-tuple with the vector of indices, to be
     consistent with higher order arrays ([#25365]).
 
-  * the default behavior of `titlecase` is changed such that characters not starting
-    a word are converted to lowercase; a new keyword argument `strict` is added which
-    allows to get the old behavior when it's `false`.
+  * the default behavior of `titlecase` is changed in two ways ([#23393]):
+    + characters not starting a word are converted to lowercase;
+      a new keyword argument `strict` is added, which
+      restores the old behavior when set to `false`.
+    + any non-letter character is considered as a word separator;
+      to get the old behavior (only "space" characters are considered as
+      word separators), use the keyword `wordsep=isspace`.
 
 
 Library improvements

diff --git a/base/strings/unicode.jl b/base/strings/unicode.jl
@@ -384,6 +384,19 @@ function isupper(c::Char)
     cat == UTF8PROC_CATEGORY_LU || cat == UTF8PROC_CATEGORY_LT
 end
 
+"""
+    iscased(c::Char) -> Bool
+
+Tests whether a character is cased, i.e. is lower-, upper- or title-cased.
+"""
+function iscased(c::Char)  # predicate: true iff the category of `c` is one of the three compared below
+    cat = category_code(c)  # look the category up once; it is compared three times below
+    return cat == UTF8PROC_CATEGORY_LU ||  # LU and LT are the same two categories `isupper` accepts
+           cat == UTF8PROC_CATEGORY_LT ||
+           cat == UTF8PROC_CATEGORY_LL  # presumably the lowercase-letter category -- TODO confirm against utf8proc
+end  # NOTE(review): any other category yields false, so `!iscased` treats such characters as word separators -- confirm this is intended for letters without case
+
+
 """
     isdigit(c::Char) -> Bool
 
@@ -649,11 +662,14 @@ julia> lowercase("STRINGS AND THINGS")
 lowercase(s::AbstractString) = map(lowercase, s)
 
 """
-    titlecase(s::AbstractString; strict::Bool=true) -> String
+    titlecase(s::AbstractString; [wordsep::Function], strict::Bool=true) -> String
 
 Capitalize the first character of each word in `s`;
 if `strict` is true, every other character is
 converted to lowercase, otherwise they are left unchanged.
+By default, all non-letters are considered as word separators;
+a predicate can be passed as the `wordsep` keyword to determine
+which characters should be considered as word separators.
 See also [`ucfirst`](@ref) to capitalize only the first
 character in `s`.
 
@@ -664,13 +680,16 @@ julia> titlecase("the JULIA programming language")
 
 julia> titlecase("ISS - international space station", strict=false)
 "ISS - International Space Station"
+
+julia> titlecase("a-a b-b", wordsep = c->c==' ')
+"A-a B-b"
 ```
 """
-function titlecase(s::AbstractString; strict::Bool=true)
+function titlecase(s::AbstractString; wordsep::Function = !iscased, strict::Bool=true)
     startword = true
     b = IOBuffer()
     for c in s
-        if isspace(c)
+        if wordsep(c)
             print(b, c)
             startword = true
         else

diff --git a/stdlib/Unicode/src/Unicode.jl b/stdlib/Unicode/src/Unicode.jl
@@ -7,7 +7,7 @@ module Unicode
 using Base.Unicode: normalize, graphemes, isassigned, textwidth, isvalid,
                     islower, isupper, isalpha, isdigit, isxdigit, isnumeric, isalnum,
                     iscntrl, ispunct, isspace, isprint, isgraph,
-                    lowercase, uppercase, titlecase, lcfirst, ucfirst
+                    lowercase, uppercase, titlecase, lcfirst, ucfirst, iscased
 
 export graphemes, textwidth, isvalid,
        islower, isupper, isalpha, isdigit, isxdigit, isnumeric, isalnum,

diff --git a/stdlib/Unicode/test/runtests.jl b/stdlib/Unicode/test/runtests.jl
@@ -2,7 +2,7 @@
 
 using Test
 using Unicode
-using Unicode: normalize, isassigned
+using Unicode: normalize, isassigned, iscased
 
 @testset "string normalization" begin
     # normalize (Unicode normalization etc.):
@@ -371,6 +371,9 @@ end
         @test titlecase("aBc ABC", strict=false) == "ABc ABC"
         @test titlecase("abcD   EFG\n\thij", strict=true)  == "Abcd   Efg\n\tHij"
         @test titlecase("abcD   EFG\n\thij", strict=false) == "AbcD   EFG\n\tHij"
+        @test titlecase("abc-def")                     == "Abc-Def"
+        @test titlecase("abc-def", wordsep = !iscased) == "Abc-Def"
+        @test titlecase("abc-def", wordsep = isspace)  == "Abc-def"
     end
 end