From 7032e461689ad577154928b7886417f9f2672def Mon Sep 17 00:00:00 2001 From: Rafael Fourquet Date: Tue, 22 Aug 2017 09:44:53 +0200 Subject: [PATCH 1/2] titlecase: chars not starting a word are converted to lowercase A keyword argument `strict` is added to `titlecase` to control whether to convert those chars to lowercase. The default value is `true`, which makes this change breaking. This is how some languages (e.g. Python) implement this function, and is compatible with http://www.unicode.org/L2/L1999/99190.htm. --- NEWS.md | 6 ++++++ base/strings/unicode.jl | 15 ++++++++++----- stdlib/Unicode/test/runtests.jl | 7 +++++-- 3 files changed, 21 insertions(+), 7 deletions(-) diff --git a/NEWS.md b/NEWS.md index b512e2c1b363b..e0a45e111e003 100644 --- a/NEWS.md +++ b/NEWS.md @@ -365,6 +365,11 @@ This section lists changes that do not have deprecation warnings. * `findn(x::AbstractVector)` now return a 1-tuple with the vector of indices, to be consistent with higher order arrays ([#25365]). + * the default behavior of `titlecase` is changed such that characters not starting + a word are converted to lowercase; a new keyword argument `strict` is added which + allows to get the old behavior when it's `false`. + + Library improvements -------------------- @@ -918,6 +923,7 @@ Deprecated or removed * `findin(a, b)` has been deprecated in favor of `find(occursin(b), a)` ([#24673]). + Command-line option changes --------------------------- diff --git a/base/strings/unicode.jl b/base/strings/unicode.jl index 96eaa0d65342d..bdc1b54a1d5f7 100644 --- a/base/strings/unicode.jl +++ b/base/strings/unicode.jl @@ -649,19 +649,24 @@ julia> lowercase("STRINGS AND THINGS") lowercase(s::AbstractString) = map(lowercase, s) """ - titlecase(s::AbstractString) -> String + titlecase(s::AbstractString; strict::Bool=true) -> String -Capitalize the first character of each word in `s`. 
+Capitalize the first character of each word in `s`; +if `strict` is true, every other character is +converted to lowercase, otherwise they are left unchanged. See also [`ucfirst`](@ref) to capitalize only the first character in `s`. # Examples ```jldoctest -julia> titlecase("the Julia programming language") +julia> titlecase("the JULIA programming language") "The Julia Programming Language" + +julia> titlecase("ISS - international space station", strict=false) +"ISS - International Space Station" ``` """ -function titlecase(s::AbstractString) +function titlecase(s::AbstractString; strict::Bool=true) startword = true b = IOBuffer() for c in s @@ -669,7 +674,7 @@ function titlecase(s::AbstractString) print(b, c) startword = true else - print(b, startword ? titlecase(c) : c) + print(b, startword ? titlecase(c) : strict ? lowercase(c) : c) startword = false end end diff --git a/stdlib/Unicode/test/runtests.jl b/stdlib/Unicode/test/runtests.jl index dacf266ccbaef..00f15d97925ba 100644 --- a/stdlib/Unicode/test/runtests.jl +++ b/stdlib/Unicode/test/runtests.jl @@ -366,8 +366,11 @@ end @testset "titlecase" begin @test titlecase('lj') == 'Lj' @test titlecase("ljubljana") == "Ljubljana" - @test titlecase("aBc ABC") == "ABc ABC" - @test titlecase("abcD EFG\n\thij") == "AbcD EFG\n\tHij" + @test titlecase("aBc ABC") == "Abc Abc" + @test titlecase("aBc ABC", strict=true) == "Abc Abc" + @test titlecase("aBc ABC", strict=false) == "ABc ABC" + @test titlecase("abcD EFG\n\thij", strict=true) == "Abcd Efg\n\tHij" + @test titlecase("abcD EFG\n\thij", strict=false) == "AbcD EFG\n\tHij" end end From f94ab0a73d62748cc99b9b23b78bb320fe10af99 Mon Sep 17 00:00:00 2001 From: Rafael Fourquet Date: Tue, 22 Aug 2017 18:55:55 +0200 Subject: [PATCH 2/2] titlecase: all non-letters are considered word-separators --- NEWS.md | 10 +++++++--- base/strings/unicode.jl | 25 ++++++++++++++++++++++--- stdlib/Unicode/src/Unicode.jl | 2 +- stdlib/Unicode/test/runtests.jl | 5 ++++- 4 files changed, 34 
insertions(+), 8 deletions(-) diff --git a/NEWS.md b/NEWS.md index e0a45e111e003..dfe7f1e3d4e76 100644 --- a/NEWS.md +++ b/NEWS.md @@ -365,9 +365,13 @@ This section lists changes that do not have deprecation warnings. * `findn(x::AbstractVector)` now return a 1-tuple with the vector of indices, to be consistent with higher order arrays ([#25365]). - * the default behavior of `titlecase` is changed such that characters not starting - a word are converted to lowercase; a new keyword argument `strict` is added which - allows to get the old behavior when it's `false`. + * the default behavior of `titlecase` is changed in two ways ([#23393]): + + characters not starting a word are converted to lowercase; + a new keyword argument `strict` is added which + restores the old behavior when set to `false`. + + any character that is not a cased letter is considered a word separator; + to get the old behavior (only "space" characters are considered + word separators), use the keyword `wordsep=isspace`. Library improvements diff --git a/base/strings/unicode.jl b/base/strings/unicode.jl index bdc1b54a1d5f7..8447a125601b1 100644 --- a/base/strings/unicode.jl +++ b/base/strings/unicode.jl @@ -384,6 +384,19 @@ function isupper(c::Char) cat == UTF8PROC_CATEGORY_LU || cat == UTF8PROC_CATEGORY_LT end +""" + iscased(c::Char) -> Bool + +Tests whether a character is cased, i.e. is lower-, upper- or title-cased. 
+""" +function iscased(c::Char) + cat = category_code(c) + return cat == UTF8PROC_CATEGORY_LU || + cat == UTF8PROC_CATEGORY_LT || + cat == UTF8PROC_CATEGORY_LL +end + + """ isdigit(c::Char) -> Bool @@ -649,11 +662,14 @@ julia> lowercase("STRINGS AND THINGS") lowercase(s::AbstractString) = map(lowercase, s) """ - titlecase(s::AbstractString; strict::Bool=true) -> String + titlecase(s::AbstractString; [wordsep::Function], strict::Bool=true) -> String Capitalize the first character of each word in `s`; if `strict` is true, every other character is converted to lowercase, otherwise they are left unchanged. +By default, every character that is not a cased letter is a word separator; +a predicate can be passed as the `wordsep` keyword to determine +which characters should be considered word separators. See also [`ucfirst`](@ref) to capitalize only the first character in `s`. @@ -664,13 +680,16 @@ julia> titlecase("the JULIA programming language") julia> titlecase("ISS - international space station", strict=false) "ISS - International Space Station" + +julia> titlecase("a-a b-b", wordsep = c->c==' ') +"A-a B-b" ``` """ -function titlecase(s::AbstractString; strict::Bool=true) +function titlecase(s::AbstractString; wordsep::Function = !iscased, strict::Bool=true) startword = true b = IOBuffer() for c in s - if isspace(c) + if wordsep(c) print(b, c) startword = true else diff --git a/stdlib/Unicode/src/Unicode.jl b/stdlib/Unicode/src/Unicode.jl index 59077acb6a79c..e55a8f6cc39ef 100644 --- a/stdlib/Unicode/src/Unicode.jl +++ b/stdlib/Unicode/src/Unicode.jl @@ -7,7 +7,7 @@ module Unicode using Base.Unicode: normalize, graphemes, isassigned, textwidth, isvalid, islower, isupper, isalpha, isdigit, isxdigit, isnumeric, isalnum, iscntrl, ispunct, isspace, isprint, isgraph, - lowercase, uppercase, titlecase, lcfirst, ucfirst + lowercase, uppercase, titlecase, lcfirst, ucfirst, iscased export graphemes, textwidth, isvalid, islower, isupper, isalpha, isdigit, isxdigit, isnumeric, isalnum, 
diff --git a/stdlib/Unicode/test/runtests.jl b/stdlib/Unicode/test/runtests.jl index 00f15d97925ba..5a5b83eb12b87 100644 --- a/stdlib/Unicode/test/runtests.jl +++ b/stdlib/Unicode/test/runtests.jl @@ -2,7 +2,7 @@ using Test using Unicode -using Unicode: normalize, isassigned +using Unicode: normalize, isassigned, iscased @testset "string normalization" begin # normalize (Unicode normalization etc.): @@ -371,6 +371,9 @@ end @test titlecase("aBc ABC", strict=false) == "ABc ABC" @test titlecase("abcD EFG\n\thij", strict=true) == "Abcd Efg\n\tHij" @test titlecase("abcD EFG\n\thij", strict=false) == "AbcD EFG\n\tHij" + @test titlecase("abc-def") == "Abc-Def" + @test titlecase("abc-def", wordsep = !iscased) == "Abc-Def" + @test titlecase("abc-def", wordsep = isspace) == "Abc-def" end end