From 0a4c54c654d2eba91ca545b29d8c63ea98d6033f Mon Sep 17 00:00:00 2001 From: "Steven G. Johnson" Date: Tue, 24 Nov 2020 23:14:13 -0500 Subject: [PATCH 1/4] Unicode-compliant islower/uppercase --- base/strings/unicode.jl | 17 ++++++----------- stdlib/Unicode/test/runtests.jl | 7 +++++-- 2 files changed, 11 insertions(+), 13 deletions(-) diff --git a/base/strings/unicode.jl b/base/strings/unicode.jl index 60e6aa0e70a64..38ffacd8aa572 100644 --- a/base/strings/unicode.jl +++ b/base/strings/unicode.jl @@ -280,9 +280,8 @@ isassigned(c) = UTF8PROC_CATEGORY_CN < category_code(c) <= UTF8PROC_CATEGORY_CO """ islowercase(c::AbstractChar) -> Bool -Tests whether a character is a lowercase letter. -A character is classified as lowercase if it belongs to Unicode category Ll, -Letter: Lowercase. +Tests whether a character is a lowercase letter (according to the Unicode +standard's `Lowercase` derived property). See also: [`isuppercase`](@ref). @@ -298,16 +297,15 @@ julia> islowercase('❤') false ``` """ -islowercase(c::AbstractChar) = category_code(c) == UTF8PROC_CATEGORY_LL +islowercase(c::AbstractChar) = ismalformed(c) ? false : Bool(ccall(:utf8proc_islower, Cint, (UInt32,), UInt32(c))) # true for Unicode upper and mixed case """ isuppercase(c::AbstractChar) -> Bool -Tests whether a character is an uppercase letter. -A character is classified as uppercase if it belongs to Unicode category Lu, -Letter: Uppercase, or Lt, Letter: Titlecase. +Tests whether a character is an uppercase letter (according to the Unicode +standard's `Uppercase` derived property). See also: [`islowercase`](@ref). @@ -323,10 +321,7 @@ julia> isuppercase('❤') false ``` """ -function isuppercase(c::AbstractChar) - cat = category_code(c) - cat == UTF8PROC_CATEGORY_LU || cat == UTF8PROC_CATEGORY_LT -end +isuppercase(c::AbstractChar) = ismalformed(c) ? 
false : Bool(ccall(:utf8proc_isupper, Cint, (UInt32,), UInt32(c))) """ iscased(c::AbstractChar) -> Bool diff --git a/stdlib/Unicode/test/runtests.jl b/stdlib/Unicode/test/runtests.jl index e5d667a976079..9fe2540735ec4 100644 --- a/stdlib/Unicode/test/runtests.jl +++ b/stdlib/Unicode/test/runtests.jl @@ -92,7 +92,7 @@ end @testset "#5939 uft8proc character predicates" begin alower=['a', 'd', 'j', 'y', 'z'] - ulower=['α', 'β', 'γ', 'δ', 'ф', 'я'] + ulower=['α', 'β', 'γ', 'δ', 'ф', 'я', 'ª'] for c in vcat(alower,ulower) @test islowercase(c) == true @test isuppercase(c) == false @@ -101,7 +101,7 @@ end end aupper=['A', 'D', 'J', 'Y', 'Z'] - uupper= ['Δ', 'Γ', 'Π', 'Ψ', 'Dž', 'Ж', 'Д'] + uupper= ['Δ', 'Γ', 'Π', 'Ψ', 'Ж', 'Д', 'Ⓐ'] for c in vcat(aupper,uupper) @test islowercase(c) == false @@ -110,6 +110,9 @@ end @test isnumeric(c) == false end + @test !isuppercase('Dž') # titlecase is not uppercase + @test Base.Unicode.iscased('Dž') # but is "cased" + nocase=['א','ﺵ'] alphas=vcat(alower,ulower,aupper,uupper,nocase) From a9927872e7f54f80870bc28c16a0c34acee3029e Mon Sep 17 00:00:00 2001 From: "Steven G. 
Johnson" Date: Tue, 24 Nov 2020 23:33:24 -0500 Subject: [PATCH 2/4] don't test isletter for non-L* letters --- stdlib/Unicode/test/runtests.jl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/stdlib/Unicode/test/runtests.jl b/stdlib/Unicode/test/runtests.jl index 9fe2540735ec4..f5f322e2fbf93 100644 --- a/stdlib/Unicode/test/runtests.jl +++ b/stdlib/Unicode/test/runtests.jl @@ -92,8 +92,8 @@ end @testset "#5939 uft8proc character predicates" begin alower=['a', 'd', 'j', 'y', 'z'] - ulower=['α', 'β', 'γ', 'δ', 'ф', 'я', 'ª'] - for c in vcat(alower,ulower) + ulower=['α', 'β', 'γ', 'δ', 'ф', 'я'] + for c in vcat(alower,ulower,['ª']) @test islowercase(c) == true @test isuppercase(c) == false @test isdigit(c) == false @@ -101,9 +101,9 @@ end end aupper=['A', 'D', 'J', 'Y', 'Z'] - uupper= ['Δ', 'Γ', 'Π', 'Ψ', 'Ж', 'Д', 'Ⓐ'] + uupper= ['Δ', 'Γ', 'Π', 'Ψ', 'Ж', 'Д'] - for c in vcat(aupper,uupper) + for c in vcat(aupper,uupper,['Ⓐ']) @test islowercase(c) == false @test isuppercase(c) == true @test isdigit(c) == false From adc91259d3172f310712fdcb6361344499535084 Mon Sep 17 00:00:00 2001 From: "Steven G. Johnson" Date: Tue, 24 Nov 2020 23:38:49 -0500 Subject: [PATCH 3/4] include titlecase in alphas test --- stdlib/Unicode/test/runtests.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stdlib/Unicode/test/runtests.jl b/stdlib/Unicode/test/runtests.jl index f5f322e2fbf93..6888fa2d9ba40 100644 --- a/stdlib/Unicode/test/runtests.jl +++ b/stdlib/Unicode/test/runtests.jl @@ -114,7 +114,7 @@ end @test Base.Unicode.iscased('Dž') # but is "cased" nocase=['א','ﺵ'] - alphas=vcat(alower,ulower,aupper,uupper,nocase) + alphas=vcat(alower,ulower,aupper,uupper,nocase,['Dž']) for c in alphas @test isletter(c) == true From 0f2d140534c9c935a22f299f8ce2531c42a5f198 Mon Sep 17 00:00:00 2001 From: "Steven G. 
Johnson" Date: Wed, 16 Dec 2020 12:54:08 -0500 Subject: [PATCH 4/4] add news --- NEWS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/NEWS.md b/NEWS.md index 48aebc9344980..ea50e8c3c9b93 100644 --- a/NEWS.md +++ b/NEWS.md @@ -36,6 +36,7 @@ New library features Standard library changes ------------------------ +* `islowercase` and `isuppercase` now follow the Unicode standard's `Lowercase` and `Uppercase` derived properties instead of the general categories Ll and Lu/Lt; in particular, titlecase characters are no longer classified as uppercase ([#38574]). #### Package Manager