Merge pull request #23393 from JuliaLang/rf/titlecase

JeffBezanson · web-flow · commit 82453563fedb · 2018-01-08T15:50:37.000-05:00
titlecase: chars not starting a word can be converted to lowercase
diff --git a/NEWS.md b/NEWS.md
@@ -365,6 +365,15 @@ This section lists changes that do not have deprecation warnings.
   * `findn(x::AbstractVector)` now return a 1-tuple with the vector of indices, to be
     consistent with higher order arrays ([#25365]).
 
+  * the default behavior of `titlecase` is changed in two ways ([#23393]):
+    + characters not starting a word are converted to lowercase;
+      a new keyword argument `strict` is added, which
+      restores the old behavior when set to `false`.
+    + any character that is not a cased letter is considered a word separator;
+      to get the old behavior (only "space" characters are considered as
+      word separators), use the keyword `wordsep=isspace`.
+
+
 Library improvements
 --------------------
 
@@ -918,6 +927,7 @@ Deprecated or removed
 
   * `findin(a, b)` has been deprecated in favor of `find(occursin(b), a)` ([#24673]).
 
+
 Command-line option changes
 ---------------------------
 
diff --git a/base/strings/unicode.jl b/base/strings/unicode.jl
@@ -384,6 +384,19 @@ function isupper(c::Char)
     cat == UTF8PROC_CATEGORY_LU || cat == UTF8PROC_CATEGORY_LT
 end
 
+"""
+    iscased(c::Char) -> Bool
+
+Tests whether a character is cased, i.e. is lower-, upper- or title-cased.
+"""
+function iscased(c::Char)
+    # cased means the category code is one of the LU, LT or LL constants
+    return category_code(c) in (UTF8PROC_CATEGORY_LU,
+                                UTF8PROC_CATEGORY_LT,
+                                UTF8PROC_CATEGORY_LL)
+end
+
+
 """
     isdigit(c::Char) -> Bool
 
@@ -649,27 +662,38 @@ julia> lowercase("STRINGS AND THINGS")
 lowercase(s::AbstractString) = map(lowercase, s)
 
 """
-    titlecase(s::AbstractString) -> String
+    titlecase(s::AbstractString; [wordsep::Function], strict::Bool=true) -> String
 
-Capitalize the first character of each word in `s`.
+Capitalize the first character of each word in `s`;
+if `strict` is true, every other character is
+converted to lowercase, otherwise they are left unchanged.
+By default, all characters that are not cased letters are treated as word separators;
+a predicate can be passed as the `wordsep` keyword to determine
+which characters should be considered as word separators.
 See also [`ucfirst`](@ref) to capitalize only the first
 character in `s`.
 
 # Examples
 ```jldoctest
-julia> titlecase("the Julia programming language")
+julia> titlecase("the JULIA programming language")
 "The Julia Programming Language"
+
+julia> titlecase("ISS - international space station", strict=false)
+"ISS - International Space Station"
+
+julia> titlecase("a-a b-b", wordsep = c->c==' ')
+"A-a B-b"
 ```
 """
-function titlecase(s::AbstractString)
+function titlecase(s::AbstractString; wordsep::Function = !iscased, strict::Bool=true)
     startword = true
     b = IOBuffer()
     for c in s
-        if isspace(c)
+        if wordsep(c)
             print(b, c)
             startword = true
         else
-            print(b, startword ? titlecase(c) : c)
+            print(b, startword ? titlecase(c) : strict ? lowercase(c) : c)
             startword = false
         end
     end
diff --git a/stdlib/Unicode/src/Unicode.jl b/stdlib/Unicode/src/Unicode.jl
@@ -7,7 +7,7 @@ module Unicode
 using Base.Unicode: normalize, graphemes, isassigned, textwidth, isvalid,
                     islower, isupper, isalpha, isdigit, isxdigit, isnumeric, isalnum,
                     iscntrl, ispunct, isspace, isprint, isgraph,
-                    lowercase, uppercase, titlecase, lcfirst, ucfirst
+                    lowercase, uppercase, titlecase, lcfirst, ucfirst, iscased
 
 export graphemes, textwidth, isvalid,
        islower, isupper, isalpha, isdigit, isxdigit, isnumeric, isalnum,
diff --git a/stdlib/Unicode/test/runtests.jl b/stdlib/Unicode/test/runtests.jl
@@ -2,7 +2,7 @@
 
 using Test
 using Unicode
-using Unicode: normalize, isassigned
+using Unicode: normalize, isassigned, iscased
 
 @testset "string normalization" begin
     # normalize (Unicode normalization etc.):
@@ -366,8 +366,14 @@ end
     @testset "titlecase" begin
         @test titlecase('ǉ') == 'ǈ'
         @test titlecase("ǉubljana") == "ǈubljana"
-        @test titlecase("aBc ABC") == "ABc ABC"
-        @test titlecase("abcD   EFG\n\thij") == "AbcD   EFG\n\tHij"
+        @test titlecase("aBc ABC")               == "Abc Abc"  # strict defaults to true
+        @test titlecase("aBc ABC", strict=true)  == "Abc Abc"
+        @test titlecase("aBc ABC", strict=false) == "ABc ABC"  # non-initial chars left unchanged
+        @test titlecase("abcD   EFG\n\thij", strict=true)  == "Abcd   Efg\n\tHij"
+        @test titlecase("abcD   EFG\n\thij", strict=false) == "AbcD   EFG\n\tHij"
+        @test titlecase("abc-def")                     == "Abc-Def"  # '-' separates words by default
+        @test titlecase("abc-def", wordsep = !iscased) == "Abc-Def"  # same predicate as the default
+        @test titlecase("abc-def", wordsep = isspace)  == "Abc-def"  # '-' no longer starts a new word
     end
 end