From e62ab285541d7094c374c49ce41cdf87f62a20b7 Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Sun, 10 Dec 2017 21:31:48 +0100 Subject: [PATCH 1/3] Move Unicode-related functions to new Unicode stdlib package Keep them under Base.Unicode since they are needed inside Base, but stop exporting them to Base since they would conflict with the deprecations. Base.Unicode is the new name for Base.UTF8proc, but including a few more functions. --- CONTRIBUTING.md | 2 +- NEWS.md | 7 + base/arrayshow.jl | 2 +- base/char.jl | 6 +- base/deprecated.jl | 31 ++- base/dict.jl | 4 +- base/distributed/Distributed.jl | 1 + base/docs/utils.jl | 4 +- base/exports.jl | 22 -- base/interactiveutil.jl | 2 +- base/io.jl | 4 +- base/libuv.jl | 4 +- base/loading.jl | 4 +- base/markdown/Markdown.jl | 1 + base/mpfr.jl | 2 +- base/operators.jl | 2 + base/parse.jl | 18 +- base/pkg/entry.jl | 4 +- base/pkg/reqs.jl | 2 +- base/precompile.jl | 6 +- base/printf.jl | 1 + base/process.jl | 2 +- base/random/misc.jl | 2 +- base/regex.jl | 6 +- base/repl/LineEdit.jl | 2 + base/repl/REPL.jl | 2 +- base/repl/latex_symbols.jl | 2 +- base/shell.jl | 8 +- base/show.jl | 4 +- base/socket.jl | 2 +- base/stream.jl | 2 +- base/strings/basic.jl | 133 ------------- base/strings/io.jl | 21 +- base/strings/string.jl | 4 +- base/strings/strings.jl | 3 +- base/strings/substring.jl | 7 +- base/strings/utf8proc.jl | 188 +++++++++++++++++- base/strings/util.jl | 8 +- base/sysimg.jl | 1 + base/util.jl | 2 +- doc/make.jl | 7 +- doc/src/index.md | 1 + doc/src/manual/unicode-input.md | 7 +- doc/src/stdlib/.gitignore | 2 +- doc/src/stdlib/strings.md | 22 -- stdlib/Dates/src/parse.jl | 4 +- stdlib/Dates/src/periods.jl | 2 +- stdlib/Dates/src/query.jl | 4 +- stdlib/Dates/test/io.jl | 2 +- stdlib/SuiteSparse/src/cholmod.jl | 2 +- stdlib/Unicode/docs/src/index.md | 25 +++ stdlib/Unicode/src/Unicode.jl | 17 ++ .../Unicode/test/runtests.jl | 76 ++++++- test/arrayops.jl | 4 +- test/choosetests.jl | 2 +- test/compile.jl | 2 +- 
test/dict.jl | 4 +- test/distributed_exec.jl | 1 + test/docs.jl | 6 +- test/iobuffer.jl | 3 +- test/iostream.jl | 3 +- test/libgit2.jl | 1 + test/operators.jl | 6 +- test/perf/kernel/json.jl | 2 +- test/perf/perfgeneric.jl | 2 +- test/perf/shootout/k_nucleotide.jl | 2 +- test/perf/spell/perf.jl | 2 +- test/strings/basic.jl | 71 +------ test/strings/io.jl | 4 +- test/strings/util.jl | 2 +- 70 files changed, 451 insertions(+), 365 deletions(-) create mode 100644 stdlib/Unicode/docs/src/index.md create mode 100644 stdlib/Unicode/src/Unicode.jl rename test/unicode/utf8proc.jl => stdlib/Unicode/test/runtests.jl (80%) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index b5f41ab43798d..ba5c8f78a6359 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -174,7 +174,7 @@ The steps required to add a new docstring are listed below: Examples written within docstrings can be used as testcases known as "doctests" by annotating code blocks with `jldoctest`. ```jldoctest - julia> uppercase("Docstring test") + julia> Unicode.uppercase("Docstring test") "DOCSTRING TEST" ``` diff --git a/NEWS.md b/NEWS.md index 7d1277a02dc3a..4d90d1b4d78dc 100644 --- a/NEWS.md +++ b/NEWS.md @@ -738,6 +738,12 @@ Deprecated or removed * The `sum_kbn` and `cumsum_kbn` functions have been moved to the [KahanSummation](https://github.com/JuliaMath/KahanSummation.jl) package ([#24869]). + * Unicode-related string functions have been moved to the new `Unicode` standard + library module ([#25021]). This applies to `normalize_string`, `graphemes`, + `is_assigned_char`, `textwidth`, `isascii`, `islower`, `isupper`, `isalpha`, + `isdigit`, `isxdigit`, `isnumber`, `isalnum`, `iscntrl`, `ispunct`, `isspace`, + `isprint`, `isgraph`, `lowercase`, `uppercase`, `titlecase`, `lcfirst` and `ucfirst`. 
+ Command-line option changes --------------------------- @@ -1711,3 +1717,4 @@ Command-line option changes [#24413]: https://github.com/JuliaLang/julia/issues/24413 [#24653]: https://github.com/JuliaLang/julia/issues/24653 [#24869]: https://github.com/JuliaLang/julia/issues/24869 +[#25021]: https://github.com/JuliaLang/julia/issues/25021 \ No newline at end of file diff --git a/base/arrayshow.jl b/base/arrayshow.jl index 9f16782f76e88..d1f9a16d06695 100644 --- a/base/arrayshow.jl +++ b/base/arrayshow.jl @@ -166,7 +166,7 @@ function print_matrix(io::IO, X::AbstractVecOrMat, screenwidth -= length(pre) + length(post) presp = repeat(" ", length(pre)) # indent each row to match pre string postsp = "" - @assert textwidth(hdots) == textwidth(ddots) + @assert Unicode.textwidth(hdots) == Unicode.textwidth(ddots) sepsize = length(sep) rowsA, colsA = indices(X,1), indices(X,2) m, n = length(rowsA), length(colsA) diff --git a/base/char.jl b/base/char.jl index ea7334eb0679e..9b99bb50b086f 100644 --- a/base/char.jl +++ b/base/char.jl @@ -64,7 +64,7 @@ function show(io::IO, c::Char) return end end - if isprint(c) + if Unicode.isprint(c) write(io, 0x27, c, 0x27) else u = UInt32(c) @@ -81,6 +81,6 @@ end function show(io::IO, ::MIME"text/plain", c::Char) show(io, c) u = UInt32(c) - print(io, ": ", isascii(c) ? "ASCII/" : "", "Unicode U+", hex(u, u > 0xffff ? 6 : 4)) - print(io, " (category ", UTF8proc.category_abbrev(c), ": ", UTF8proc.category_string(c), ")") + print(io, ": ", Unicode.isascii(c) ? "ASCII/" : "", "Unicode U+", hex(u, u > 0xffff ? 
6 : 4)) + print(io, " (category ", Unicode.category_abbrev(c), ": ", Unicode.category_string(c), ")") end diff --git a/base/deprecated.jl b/base/deprecated.jl index 68960bb291104..99c371e1fb638 100644 --- a/base/deprecated.jl +++ b/base/deprecated.jl @@ -1072,13 +1072,6 @@ function Matrix() return Matrix(uninitialized, 0, 0) end -for name in ("alnum", "alpha", "cntrl", "digit", "number", "graph", - "lower", "print", "punct", "space", "upper", "xdigit") - f = Symbol("is",name) - @eval import .UTF8proc: $f - @eval @deprecate ($f)(s::AbstractString) all($f, s) -end - # TODO: remove warning for using `_` in parse_input_line in base/client.jl # Special functions have been moved to a package @@ -1512,7 +1505,7 @@ export hex2num @deprecate convert(::Type{Symbol}, s::AbstractString) Symbol(s) @deprecate convert(::Type{String}, s::Symbol) String(s) @deprecate convert(::Type{String}, v::Vector{UInt8}) String(v) -@deprecate convert(::Type{S}, g::UTF8proc.GraphemeIterator) where {S<:AbstractString} convert(S, g.s) +@deprecate convert(::Type{S}, g::Unicode.GraphemeIterator) where {S<:AbstractString} convert(S, g.s) # Issue #19923 @deprecate ror circshift @@ -2186,6 +2179,28 @@ end @deprecate_moved sum_kbn "KahanSummation" @deprecate_moved cumsum_kbn "KahanSummation" +# PR #25021 +@deprecate_moved normalize_string "Unicode" true true +@deprecate_moved graphemes "Unicode" true true +@deprecate_moved is_assigned_char "Unicode" true true +@deprecate_moved textwidth "Unicode" true true +@deprecate_moved islower "Unicode" true true +@deprecate_moved isupper "Unicode" true true +@deprecate_moved isalpha "Unicode" true true +@deprecate_moved isdigit "Unicode" true true +@deprecate_moved isnumber "Unicode" true true +@deprecate_moved isalnum "Unicode" true true +@deprecate_moved iscntrl "Unicode" true true +@deprecate_moved ispunct "Unicode" true true +@deprecate_moved isspace "Unicode" true true +@deprecate_moved isprint "Unicode" true true +@deprecate_moved isgraph "Unicode" true true 
+@deprecate_moved lowercase "Unicode" true true +@deprecate_moved uppercase "Unicode" true true +@deprecate_moved titlecase "Unicode" true true +@deprecate_moved lcfirst "Unicode" true true +@deprecate_moved ucfirst "Unicode" true true + # END 0.7 deprecations # BEGIN 1.0 deprecations diff --git a/base/dict.jl b/base/dict.jl index bd3ac41287e55..1834decfd1b6e 100644 --- a/base/dict.jl +++ b/base/dict.jl @@ -1,7 +1,7 @@ # This file is a part of Julia. License is MIT: https://julialang.org/license function _truncate_at_width_or_chars(str, width, chars="", truncmark="…") - truncwidth = textwidth(truncmark) + truncwidth = Unicode.textwidth(truncmark) (width <= 0 || width < truncwidth) && return "" wid = truncidx = lastidx = 0 @@ -9,7 +9,7 @@ function _truncate_at_width_or_chars(str, width, chars="", truncmark="…") while !done(str, idx) lastidx = idx c, idx = next(str, idx) - wid += textwidth(c) + wid += Unicode.textwidth(c) wid >= width - truncwidth && truncidx == 0 && (truncidx = lastidx) (wid >= width || c in chars) && break end diff --git a/base/distributed/Distributed.jl b/base/distributed/Distributed.jl index 0545c714fafc7..609f88acf9cad 100644 --- a/base/distributed/Distributed.jl +++ b/base/distributed/Distributed.jl @@ -15,6 +15,7 @@ using Base: Process, Semaphore, JLOptions, AnyDict, buffer_writes, wait_connecte binding_module, notify_error, atexit, julia_exename, julia_cmd, AsyncGenerator, display_error, acquire, release, invokelatest, warn_once, shell_escape_posixly, uv_error +using Base.Unicode: isascii, isdigit, isnumber # NOTE: clusterserialize.jl imports additional symbols from Base.Serializer for use diff --git a/base/docs/utils.jl b/base/docs/utils.jl index b4a1f4eb947ec..d4aadefd9430e 100644 --- a/base/docs/utils.jl +++ b/base/docs/utils.jl @@ -3,6 +3,7 @@ # Text / HTML objects import Base: print, show, ==, hash +using Base.Unicode export HTML, @html_str @@ -231,7 +232,8 @@ function matchinds(needle, haystack; acronym = false) for (i, char) in 
enumerate(haystack) isempty(chars) && break while chars[1] == ' ' shift!(chars) end # skip spaces - if lowercase(char) == lowercase(chars[1]) && (!acronym || !isalpha(lastc)) + if Unicode.lowercase(char) == Unicode.lowercase(chars[1]) && + (!acronym || !Unicode.isalpha(lastc)) push!(is, i) shift!(chars) end diff --git a/base/exports.jl b/base/exports.jl index ed3f5d2b34b49..7fe632fff0ada 100644 --- a/base/exports.jl +++ b/base/exports.jl @@ -725,32 +725,15 @@ export eachmatch, endswith, escape_string, - graphemes, hex, hex2bytes, hex2bytes!, ind2chr, info, - is_assigned_char, - isalnum, - isalpha, - isascii, - iscntrl, - isdigit, - isgraph, - islower, ismatch, - isnumber, - isprint, - ispunct, - isspace, - isupper, isvalid, - isxdigit, join, - lcfirst, logging, - lowercase, lpad, lstrip, match, @@ -758,7 +741,6 @@ export ncodeunits, ndigits, nextind, - normalize_string, oct, prevind, print, @@ -785,13 +767,9 @@ export string, strip, summary, - textwidth, thisind, - titlecase, transcode, - ucfirst, unescape_string, - uppercase, warn, # random numbers diff --git a/base/interactiveutil.jl b/base/interactiveutil.jl index fdad979f6b664..3616ff88c3284 100644 --- a/base/interactiveutil.jl +++ b/base/interactiveutil.jl @@ -51,7 +51,7 @@ function edit(path::AbstractString, line::Integer=0) cmd = line != 0 ? `$command $path -l $line` : `$command $path` elseif startswith(name, "subl") || startswith(name, "atom") cmd = line != 0 ? `$command $path:$line` : `$command $path` - elseif name == "code" || (Sys.iswindows() && uppercase(name) == "CODE.EXE") + elseif name == "code" || (Sys.iswindows() && Unicode.uppercase(name) == "CODE.EXE") cmd = line != 0 ? `$command -g $path:$line` : `$command -g $path` elseif startswith(name, "notepad++") cmd = line != 0 ? 
`$command $path -n$line` : `$command $path` diff --git a/base/io.jl b/base/io.jl index 0f2e7da1d578d..c37653d8fada3 100644 --- a/base/io.jl +++ b/base/io.jl @@ -845,6 +845,8 @@ characters from that character until the start of the next line are ignored. julia> buf = IOBuffer(" text") IOBuffer(data=UInt8[...], readable=true, writable=false, seekable=true, append=false, size=8, maxsize=Inf, ptr=1, mark=-1) +julia> using Unicode + julia> skipchars(buf, isspace) IOBuffer(data=UInt8[...], readable=true, writable=false, seekable=true, append=false, size=8, maxsize=Inf, ptr=5, mark=-1) @@ -889,7 +891,7 @@ julia> countlines(io, '.') ``` """ function countlines(io::IO, eol::Char='\n') - isascii(eol) || throw(ArgumentError("only ASCII line terminators are supported")) + Unicode.isascii(eol) || throw(ArgumentError("only ASCII line terminators are supported")) aeol = UInt8(eol) a = Vector{UInt8}(uninitialized, 8192) nl = 0 diff --git a/base/libuv.jl b/base/libuv.jl index 29c0a044cdf17..4119ce4068c68 100644 --- a/base/libuv.jl +++ b/base/libuv.jl @@ -20,10 +20,10 @@ function uv_sizeof_req(req) end for h in uv_handle_types -@eval const $(Symbol("_sizeof_",lowercase(string(h)))) = uv_sizeof_handle($h) +@eval const $(Symbol("_sizeof_",Unicode.lowercase(string(h)))) = uv_sizeof_handle($h) end for r in uv_req_types -@eval const $(Symbol("_sizeof_",lowercase(string(r)))) = uv_sizeof_req($r) +@eval const $(Symbol("_sizeof_",Unicode.lowercase(string(r)))) = uv_sizeof_req($r) end uv_handle_data(handle) = ccall(:jl_uv_handle_data,Ptr{Void},(Ptr{Void},),handle) diff --git a/base/loading.jl b/base/loading.jl index 8206da5df4bef..8e15004b2bd5c 100644 --- a/base/loading.jl +++ b/base/loading.jl @@ -68,8 +68,8 @@ elseif Sys.isapple() # If there is no match, it's possible that the file does exist but HFS+ # performed unicode normalization. See https://developer.apple.com/library/mac/qa/qa1235/_index.html. 
- isascii(path_basename) && return false - Vector{UInt8}(normalize_string(path_basename, :NFD)) == casepreserved_basename + Unicode.isascii(path_basename) && return false + Vector{UInt8}(Unicode.normalize_string(path_basename, :NFD)) == casepreserved_basename end else # Generic fallback that performs a slow directory listing. diff --git a/base/markdown/Markdown.jl b/base/markdown/Markdown.jl index 52df1347788f4..3794d77d13bd2 100644 --- a/base/markdown/Markdown.jl +++ b/base/markdown/Markdown.jl @@ -7,6 +7,7 @@ module Markdown import Base: show, == import Core: @doc_str +using Base.Unicode: lowercase, ucfirst, isspace include(joinpath("parse", "config.jl")) include(joinpath("parse", "util.jl")) diff --git a/base/mpfr.jl b/base/mpfr.jl index 8bff353fdca7f..61f21ab69ac06 100644 --- a/base/mpfr.jl +++ b/base/mpfr.jl @@ -125,7 +125,7 @@ convert(::Type{BigFloat}, x::Union{Float16,Float32}) = BigFloat(Float64(x)) convert(::Type{BigFloat}, x::Rational) = BigFloat(numerator(x)) / BigFloat(denominator(x)) function tryparse(::Type{BigFloat}, s::AbstractString, base::Int=0) - !isempty(s) && isspace(s[end]) && return tryparse(BigFloat, rstrip(s), base) + !isempty(s) && Base.Unicode.isspace(s[end]) && return tryparse(BigFloat, rstrip(s), base) z = BigFloat() err = ccall((:mpfr_set_str, :libmpfr), Int32, (Ref{BigFloat}, Cstring, Int32, Int32), z, s, base, ROUNDING_MODE[]) err == 0 ? 
Nullable(z) : Nullable{BigFloat}() diff --git a/base/operators.jl b/base/operators.jl index d40cf227410e1..887f04d19ca30 100644 --- a/base/operators.jl +++ b/base/operators.jl @@ -942,6 +942,8 @@ entered in the Julia REPL (and most editors, appropriately configured) by typing # Examples ```jldoctest +julia> using Unicode + julia> map(uppercase∘hex, 250:255) 6-element Array{String,1}: "FA" diff --git a/base/parse.jl b/base/parse.jl index 87447ba0a0a90..ddbf833cb162f 100644 --- a/base/parse.jl +++ b/base/parse.jl @@ -53,7 +53,7 @@ end function parseint_preamble(signed::Bool, base::Int, s::AbstractString, startpos::Int, endpos::Int) c, i, j = parseint_next(s, startpos, endpos) - while isspace(c) + while Unicode.isspace(c) c, i, j = parseint_next(s,i,endpos) end (j == 0) && (return 0, 0, 0) @@ -66,7 +66,7 @@ function parseint_preamble(signed::Bool, base::Int, s::AbstractString, startpos: end end - while isspace(c) + while Unicode.isspace(c) c, i, j = parseint_next(s,i,endpos) end (j == 0) && (return 0, 0, 0) @@ -125,10 +125,10 @@ function tryparse_internal(::Type{T}, s::AbstractString, startpos::Int, endpos:: return Nullable{T}(n) end c, i = next(s,i) - isspace(c) && break + Unicode.isspace(c) && break end (T <: Signed) && (n *= sgn) - while !isspace(c) + while !Unicode.isspace(c) d::T = '0' <= c <= '9' ? c-'0' : 'A' <= c <= 'Z' ? c-'A'+10 : 'a' <= c <= 'z' ? 
c-'a'+a : base @@ -149,7 +149,7 @@ function tryparse_internal(::Type{T}, s::AbstractString, startpos::Int, endpos:: end while i <= endpos c, i = next(s,i) - if !isspace(c) + if !Unicode.isspace(c) raise && throw(ArgumentError("extra characters after whitespace in $(repr(SubString(s,startpos,endpos)))")) return _n end @@ -168,10 +168,10 @@ function tryparse_internal(::Type{Bool}, sbuff::Union{String,SubString{String}}, orig_end = endpos # Ignore leading and trailing whitespace - while isspace(sbuff[startpos]) && startpos <= endpos + while Unicode.isspace(sbuff[startpos]) && startpos <= endpos startpos = nextind(sbuff, startpos) end - while isspace(sbuff[endpos]) && endpos >= startpos + while Unicode.isspace(sbuff[endpos]) && endpos >= startpos endpos = prevind(sbuff, endpos) end @@ -186,7 +186,7 @@ function tryparse_internal(::Type{Bool}, sbuff::Union{String,SubString{String}}, if raise substr = SubString(sbuff, orig_start, orig_end) # show input string in the error to avoid confusion - if all(isspace, substr) + if all(Unicode.isspace, substr) throw(ArgumentError("input string only contains whitespace")) else throw(ArgumentError("invalid Bool representation: $(repr(substr))")) @@ -243,7 +243,7 @@ tryparse_internal(::Type{Float16}, s::AbstractString, startpos::Int, endpos::Int function tryparse_internal(::Type{Complex{T}}, s::Union{String,SubString{String}}, i::Int, e::Int, raise::Bool) where {T<:Real} # skip initial whitespace - while i ≤ e && isspace(s[i]) + while i ≤ e && Unicode.isspace(s[i]) i = nextind(s, i) end if i > e diff --git a/base/pkg/entry.jl b/base/pkg/entry.jl index a0888bb82faa7..6f6f5482e2752 100644 --- a/base/pkg/entry.jl +++ b/base/pkg/entry.jl @@ -89,7 +89,7 @@ function available() for (pkg, vers) in all_avail any(x->Types.satisfies("julia", VERSION, x[2].requires), vers) && push!(avail, pkg) end - sort!(avail, by=lowercase) + sort!(avail, by=Base.Unicode.lowercase) end function available(pkg::AbstractString) @@ -572,7 +572,7 @@ end function 
warnbanner(msg...; label="[ WARNING ]", prefix="") cols = Base.displaysize(STDERR)[2] - str = rpad(lpad(label, div(cols+textwidth(label), 2), "="), cols, "=") + str = rpad(lpad(label, div(cols+Base.Unicode.textwidth(label), 2), "="), cols, "=") warn(prefix="", str) println(STDERR) warn(prefix=prefix, msg...) diff --git a/base/pkg/reqs.jl b/base/pkg/reqs.jl index e424bca9fcd30..b84ce5614993d 100644 --- a/base/pkg/reqs.jl +++ b/base/pkg/reqs.jl @@ -78,7 +78,7 @@ function write(io::IO, lines::Vector{Line}) end end function write(io::IO, reqs::Requires) - for pkg in sort!(collect(keys(reqs)), by=lowercase) + for pkg in sort!(collect(keys(reqs)), by=Unicode.lowercase) println(io, Requirement(pkg, reqs[pkg]).content) end end diff --git a/base/precompile.jl b/base/precompile.jl index f0be54c9270af..092c24045c0d7 100644 --- a/base/precompile.jl +++ b/base/precompile.jl @@ -68,7 +68,7 @@ precompile(Tuple{typeof(Base.lstrip), Base.SubString{String}, Array{Char, 1}}) precompile(Tuple{getfield(Base, Symbol("#kw##split")), Array{Any, 1}, typeof(Base.split), String, Char}) precompile(Tuple{getfield(Base, Symbol("#kw##split")), Array{Any, 1}, typeof(Base.split), Base.SubString{String}, Char}) precompile(Tuple{typeof(Base.map!), typeof(Base.strip), Array{Base.SubString{String}, 1}, Array{Base.SubString{String}, 1}}) -precompile(Tuple{typeof(Base.UTF8proc.isnumber), Base.SubString{String}}) +precompile(Tuple{typeof(Base.Unicode.isnumber), Base.SubString{String}}) precompile(Tuple{Type{Core.Inference.Generator{I, F} where F where I}, Type{Core.Inference.Const}, Tuple{Tuple{Base.DevNullStream, Base.DevNullStream, Base.DevNullStream}}}) precompile(Tuple{Type{Core.Inference.Generator{Tuple{Tuple{Base.DevNullStream, Base.DevNullStream, Base.DevNullStream}}, Type{Core.Inference.Const}}}, Type{Core.Inference.Const}, Tuple{Tuple{Base.DevNullStream, Base.DevNullStream, Base.DevNullStream}}}) precompile(Tuple{typeof(Core.Inference.convert), Type{Tuple{Tuple{Base.DevNullStream, 
Base.DevNullStream, Base.DevNullStream}}}, Tuple{Tuple{Base.DevNullStream, Base.DevNullStream, Base.DevNullStream}}}) @@ -805,7 +805,7 @@ precompile(Tuple{typeof(Base.iteratorsize), Type{Base.Generator{Array{Any, 1}, t precompile(Tuple{typeof(Base._collect), Array{Any, 1}, Base.Generator{Array{Any, 1}, typeof(Base.string)}, Base.EltypeUnknown, Base.HasShape}) precompile(Tuple{typeof(Base.similar), Array{Any, 1}, Type{String}, Tuple{Base.OneTo{Int64}}}) precompile(Tuple{typeof(Base.collect_to!), Array{String, 1}, Base.Generator{Array{Any, 1}, typeof(Base.string)}, Int64, Int64}) -precompile(Tuple{typeof(Base.UTF8proc.isalpha), Char}) +precompile(Tuple{typeof(Base.Unicode.isalpha), Char}) precompile(Tuple{getfield(Base.Docs, Symbol("#kw##matchinds")), Array{Any, 1}, typeof(Base.Docs.matchinds), String, String}) precompile(Tuple{typeof(Base.Docs.bestmatch), String, String}) precompile(Tuple{typeof(Base.length), Tuple{DataType, DataType}}) @@ -973,7 +973,7 @@ precompile(Tuple{typeof(Base.lstrip), String, Char}) precompile(Tuple{typeof(Base.Markdown.blockquote), Base.GenericIOBuffer{Array{UInt8, 1}}, Base.Markdown.MD}) precompile(Tuple{getfield(Base.Markdown, Symbol("#kw##parse")), Array{Any, 1}, typeof(Base.Markdown.parse), String}) precompile(Tuple{typeof(Base.Markdown.admonition), Base.GenericIOBuffer{Array{UInt8, 1}}, Base.Markdown.MD}) -precompile(Tuple{typeof(Base.UTF8proc.isupper), Char}) +precompile(Tuple{typeof(Base.Unicode.isupper), Char}) precompile(Tuple{getfield(Base.Markdown, Symbol("#kw##linecontains")), Array{Any, 1}, typeof(Base.Markdown.linecontains), Base.GenericIOBuffer{Array{UInt8, 1}}, String}) precompile(Tuple{typeof(Base.ucfirst), Base.SubString{String}}) precompile(Tuple{typeof(Base.Markdown.blocktex), Base.GenericIOBuffer{Array{UInt8, 1}}, Base.Markdown.MD}) diff --git a/base/printf.jl b/base/printf.jl index c6e121405f02f..21ecc0cfea3ed 100644 --- a/base/printf.jl +++ b/base/printf.jl @@ -2,6 +2,7 @@ module Printf using Base: Grisu, GMP +using 
Base.Unicode: lowercase, textwidth, isupper export @printf, @sprintf ### printf formatter generation ### diff --git a/base/process.jl b/base/process.jl index 0d56aaee5b2f1..a220da433462b 100644 --- a/base/process.jl +++ b/base/process.jl @@ -138,7 +138,7 @@ struct FileRedirect filename::AbstractString append::Bool function FileRedirect(filename, append) - if lowercase(filename) == (@static Sys.iswindows() ? "nul" : "/dev/null") + if Unicode.lowercase(filename) == (@static Sys.iswindows() ? "nul" : "/dev/null") warn_once("for portability use DevNull instead of a file redirect") end new(filename, append) diff --git a/base/random/misc.jl b/base/random/misc.jl index 0c0fb71da2b8e..a9718cd11d7f5 100644 --- a/base/random/misc.jl +++ b/base/random/misc.jl @@ -436,7 +436,7 @@ Base.convert(::Type{UInt128}, u::UUID) = u.value let groupings = [1:8; 10:13; 15:18; 20:23; 25:36] function Base.convert(::Type{UUID}, s::AbstractString) - s = lowercase(s) + s = Base.Unicode.lowercase(s) if !ismatch(r"^[0-9a-f]{8}(?:-[0-9a-f]{4}){3}-[0-9a-f]{12}$", s) throw(ArgumentError("Malformed UUID string")) diff --git a/base/regex.jl b/base/regex.jl index 344730007d7ec..6ad92f15cba20 100644 --- a/base/regex.jl +++ b/base/regex.jl @@ -338,11 +338,11 @@ function _replace(io, repl_s::SubstitutionString, str, r, re) if repl[next_i] == SUB_CHAR write(io, SUB_CHAR) i = nextind(repl, next_i) - elseif isnumber(repl[next_i]) + elseif Unicode.isnumber(repl[next_i]) group = parse(Int, repl[next_i]) i = nextind(repl, next_i) while i <= e - if isnumber(repl[i]) + if Unicode.isnumber(repl[i]) group = 10group + parse(Int, repl[i]) i = nextind(repl, i) else @@ -364,7 +364,7 @@ function _replace(io, repl_s::SubstitutionString, str, r, re) end # TODO: avoid this allocation groupname = SubString(repl, groupstart, prevind(repl, i)) - if all(isnumber,groupname) + if all(Unicode.isnumber,groupname) _write_capture(io, re, parse(Int, groupname)) else group = PCRE.substring_number_from_name(re.regex, groupname) diff 
--git a/base/repl/LineEdit.jl b/base/repl/LineEdit.jl index 5e09ce730b6c6..607740573b0e9 100644 --- a/base/repl/LineEdit.jl +++ b/base/repl/LineEdit.jl @@ -9,6 +9,8 @@ import ..Terminals: raw!, width, height, cmove, getX, import Base: ensureroom, peek, show, AnyDict, position +using Base.Unicode: lowercase, uppercase, ucfirst, textwidth, isspace + abstract type TextInterface end abstract type ModeState end diff --git a/base/repl/REPL.jl b/base/repl/REPL.jl index c291bd7de8ca4..f7585feaa2a05 100644 --- a/base/repl/REPL.jl +++ b/base/repl/REPL.jl @@ -1092,7 +1092,7 @@ function ends_with_semicolon(line::AbstractString) else # outside of a comment, encountering anything but whitespace # means the semi-colon was internal to the expression - isspace(c) || return false + Base.Unicode.isspace(c) || return false end end return true diff --git a/base/repl/latex_symbols.jl b/base/repl/latex_symbols.jl index e268b389a644b..e62f6819fca0e 100644 --- a/base/repl/latex_symbols.jl +++ b/base/repl/latex_symbols.jl @@ -56,7 +56,7 @@ open(fname) do f split(replace(L, r"[{}\"]+", "\t"), "\t")) c = Char(parse(Int, x[2], 16)) if (Base.is_id_char(c) || Base.isoperator(Symbol(c))) && - string(c) ∉ latex_strings && !isascii(c) + string(c) ∉ latex_strings && !Unicode.isascii(c) tabcomname = escape_string(x[3]) if startswith(tabcomname, "\\\\math") tabcomname = string("\\\\", tabcomname[7:end]) diff --git a/base/shell.jl b/base/shell.jl index 72ffd23d9a944..0ab8a09f80083 100644 --- a/base/shell.jl +++ b/base/shell.jl @@ -54,13 +54,13 @@ function shell_parse(str::AbstractString, interpolate::Bool=true; while !done(s,j) c, k = next(s,j) - if !in_single_quotes && !in_double_quotes && isspace(c) + if !in_single_quotes && !in_double_quotes && Unicode.isspace(c) update_arg(s[i:prevind(s, j)]) append_arg() j = k while !done(s,j) c, k = next(s,j) - if !isspace(c) + if !Unicode.isspace(c) i = j break end @@ -71,7 +71,7 @@ function shell_parse(str::AbstractString, interpolate::Bool=true; if done(s,k) 
error("\$ right before end of command") end - if isspace(s[k]) + if Unicode.isspace(s[k]) error("space not allowed right after \$") end stpos = j @@ -140,7 +140,7 @@ function print_shell_word(io::IO, word::AbstractString, special::AbstractString has_single = false has_special = false for c in word - if isspace(c) || c=='\\' || c=='\'' || c=='"' || c=='$' || c in special + if Unicode.isspace(c) || c=='\\' || c=='\'' || c=='"' || c=='$' || c in special has_special = true if c == '\'' has_single = true diff --git a/base/show.jl b/base/show.jl index 15ab70c465584..8c81eece49d07 100644 --- a/base/show.jl +++ b/base/show.jl @@ -711,7 +711,9 @@ function show_expr_type(io::IO, @nospecialize(ty), emph::Bool) end end -emphasize(io, str::AbstractString) = have_color ? print_with_color(Base.error_color(), io, str; bold = true) : print(io, uppercase(str)) +emphasize(io, str::AbstractString) = have_color ? + print_with_color(Base.error_color(), io, str; bold = true) : + print(io, Unicode.uppercase(str)) show_linenumber(io::IO, line) = print(io, "#= line ", line, " =#") show_linenumber(io::IO, line, file) = print(io, "#= ", file, ":", line, " =#") diff --git a/base/socket.jl b/base/socket.jl index 19aa7b0a2108c..a0298ec1f05dc 100644 --- a/base/socket.jl +++ b/base/socket.jl @@ -636,7 +636,7 @@ Gets all of the IP addresses of the `host`. Uses the operating system's underlying getaddrinfo implementation, which may do a DNS lookup. 
""" function getalladdrinfo(host::String) - isascii(host) || error("non-ASCII hostname: $host") + Unicode.isascii(host) || error("non-ASCII hostname: $host") req = Libc.malloc(_sizeof_uv_getaddrinfo) uv_req_set_data(req, C_NULL) # in case we get interrupted before arriving at the wait call status = ccall(:jl_getaddrinfo, Int32, (Ptr{Void}, Ptr{Void}, Cstring, Ptr{Void}, Ptr{Void}), diff --git a/base/stream.jl b/base/stream.jl index 4cf2d753f67ef..24831e3ce83da 100644 --- a/base/stream.jl +++ b/base/stream.jl @@ -1029,7 +1029,7 @@ for (x, writable, unix_fd, c_symbol) in ((:STDIN, false, 0, :jl_uv_stdin), (:STDOUT, true, 1, :jl_uv_stdout), (:STDERR, true, 2, :jl_uv_stderr)) - f = Symbol("redirect_",lowercase(string(x))) + f = Symbol("redirect_",Unicode.lowercase(string(x))) _f = Symbol("_",f) @eval begin function ($_f)(stream) diff --git a/base/strings/basic.jl b/base/strings/basic.jl index 734f1cc6f9041..0c3044573a215 100644 --- a/base/strings/basic.jl +++ b/base/strings/basic.jl @@ -469,143 +469,10 @@ next(e::EachStringIndex, state) = (state, nextind(e.s, state)) done(e::EachStringIndex, state) = done(e.s, state) eltype(::Type{EachStringIndex}) = Int -""" - isascii(c::Union{Char,AbstractString}) -> Bool - -Test whether a character belongs to the ASCII character set, or whether this is true for -all elements of a string. - -# Examples -```jldoctest -julia> isascii('a') -true - -julia> isascii('α') -false - -julia> isascii("abc") -true - -julia> isascii("αβγ") -false -``` -""" -isascii(c::Char) = c < Char(0x80) -isascii(s::AbstractString) = all(isascii, s) - ## string promotion rules ## promote_rule(::Type{<:AbstractString}, ::Type{<:AbstractString}) = String -""" - isxdigit(c::Char) -> Bool - -Test whether a character is a valid hexadecimal digit. Note that this does not -include `x` (as in the standard `0x` prefix). 
- -# Examples -```jldoctest -julia> isxdigit('a') -true - -julia> isxdigit('x') -false -``` -""" -isxdigit(c::Char) = '0'<=c<='9' || 'a'<=c<='f' || 'A'<=c<='F' - -## uppercase, lowercase, and titlecase transformations ## - -""" - uppercase(s::AbstractString) - -Return `s` with all characters converted to uppercase. - -# Examples -```jldoctest -julia> uppercase("Julia") -"JULIA" -``` -""" -uppercase(s::AbstractString) = map(uppercase, s) - -""" - lowercase(s::AbstractString) - -Return `s` with all characters converted to lowercase. - -# Examples -```jldoctest -julia> lowercase("STRINGS AND THINGS") -"strings and things" -``` -""" -lowercase(s::AbstractString) = map(lowercase, s) - -""" - titlecase(s::AbstractString) - -Capitalize the first character of each word in `s`. -See also [`ucfirst`](@ref) to capitalize only the first -character in `s`. - -# Examples -```jldoctest -julia> titlecase("the julia programming language") -"The Julia Programming Language" -``` -""" -function titlecase(s::AbstractString) - startword = true - b = IOBuffer() - for c in s - if isspace(c) - print(b, c) - startword = true - else - print(b, startword ? titlecase(c) : c) - startword = false - end - end - return String(take!(b)) -end - -""" - ucfirst(s::AbstractString) - -Return `string` with the first character converted to uppercase -(technically "title case" for Unicode). -See also [`titlecase`](@ref) to capitalize the first character of -every word in `s`. - -# Examples -```jldoctest -julia> ucfirst("python") -"Python" -``` -""" -function ucfirst(s::AbstractString) - isempty(s) && return s - c = s[1] - tc = titlecase(c) - return c==tc ? s : string(tc,s[nextind(s,1):end]) -end - -""" - lcfirst(s::AbstractString) - -Return `string` with the first character converted to lowercase. - -# Examples -```jldoctest -julia> lcfirst("Julia") -"julia" -``` -""" -function lcfirst(s::AbstractString) - isempty(s) || islower(s[1]) ? 
s : string(lowercase(s[1]),s[nextind(s,1):end]) -end - ## string map, filter, has ## function map(f, s::AbstractString) diff --git a/base/strings/io.jl b/base/strings/io.jl index a346c3d10f400..49d223111041b 100644 --- a/base/strings/io.jl +++ b/base/strings/io.jl @@ -2,7 +2,6 @@ ## core text I/O ## - """ print([io::IO], xs...) @@ -248,7 +247,7 @@ join(strings, delim, last) = sprint(join, strings, delim, last) ## string escaping & unescaping ## -need_full_hex(s::AbstractString, i::Int) = !done(s,i) && isxdigit(next(s,i)[1]) +need_full_hex(s::AbstractString, i::Int) = !done(s,i) && Unicode.isxdigit(next(s,i)[1]) escape_nul(s::AbstractString, i::Int) = !done(s,i) && '0' <= next(s,i)[1] <= '7' ? "\\x00" : "\\0" @@ -272,15 +271,15 @@ function escape_string(io, s::AbstractString, esc::AbstractString="") i = start(s) while !done(s,i) c, j = next(s,i) - c == '\0' ? print(io, escape_nul(s,j)) : - c == '\e' ? print(io, "\\e") : - c == '\\' ? print(io, "\\\\") : - c in esc ? print(io, '\\', c) : - '\a' <= c <= '\r' ? print(io, '\\', "abtnvfr"[Int(c)-6]) : - isprint(c) ? print(io, c) : - c <= '\x7f' ? print(io, "\\x", hex(c, 2)) : - c <= '\uffff' ? print(io, "\\u", hex(c, need_full_hex(s,j) ? 4 : 2)) : - print(io, "\\U", hex(c, need_full_hex(s,j) ? 8 : 4)) + c == '\0' ? print(io, escape_nul(s,j)) : + c == '\e' ? print(io, "\\e") : + c == '\\' ? print(io, "\\\\") : + c in esc ? print(io, '\\', c) : + '\a' <= c <= '\r' ? print(io, '\\', "abtnvfr"[Int(c)-6]) : + Unicode.isprint(c) ? print(io, c) : + c <= '\x7f' ? print(io, "\\x", hex(c, 2)) : + c <= '\uffff' ? print(io, "\\u", hex(c, need_full_hex(s,j) ? 4 : 2)) : + print(io, "\\U", hex(c, need_full_hex(s,j) ? 
8 : 4)) i = j end end diff --git a/base/strings/string.jl b/base/strings/string.jl index e66f876a5f77d..67c238358486f 100644 --- a/base/strings/string.jl +++ b/base/strings/string.jl @@ -353,7 +353,7 @@ function search(a::Union{String,ByteArray}, b::Union{Int8,UInt8}, i::Integer = 1 end function search(a::ByteArray, b::Char, i::Integer = 1) - if isascii(b) + if Unicode.isascii(b) search(a,UInt8(b),i) else search(a,Vector{UInt8}(string(b)),i).start @@ -384,7 +384,7 @@ function rsearch(a::Union{String,ByteArray}, b::Union{Int8,UInt8}, i::Integer = end function rsearch(a::ByteArray, b::Char, i::Integer = length(a)) - if isascii(b) + if Unicode.isascii(b) rsearch(a,UInt8(b),i) else rsearch(a,Vector{UInt8}(string(b)),i).start diff --git a/base/strings/strings.jl b/base/strings/strings.jl index 1e442a09e406f..62cb030c9f84c 100644 --- a/base/strings/strings.jl +++ b/base/strings/strings.jl @@ -4,7 +4,6 @@ include("strings/errors.jl") include("strings/substring.jl") include("strings/basic.jl") include("strings/search.jl") +include("strings/utf8proc.jl") include("strings/util.jl") include("strings/io.jl") -include("strings/utf8proc.jl") -using .UTF8proc diff --git a/base/strings/substring.jl b/base/strings/substring.jl index d1bf33e4123fb..b5fabef1788dc 100644 --- a/base/strings/substring.jl +++ b/base/strings/substring.jl @@ -128,8 +128,9 @@ end Reverses a string. Technically, this function reverses the codepoints in a string and its main utility is for reversed-order string processing, especially for reversed regular-expression searches. See also [`reverseind`](@ref) to convert indices in `s` to -indices in `reverse(s)` and vice-versa, and [`graphemes`](@ref) to operate on user-visible -"characters" (graphemes) rather than codepoints. See also [`Iterators.reverse`](@ref) for +indices in `reverse(s)` and vice-versa, and [`Unicode.graphemes`](@ref Base.Unicode.graphemes) to +operate on user-visible "characters" (graphemes) rather than codepoints. 
+See also [`Iterators.reverse`](@ref) for reverse-order iteration without making a copy. Custom string types must implement the `reverse` function themselves and should typically return a string with the same type and encoding. If they return a string with a different encoding, they must also override @@ -143,6 +144,8 @@ julia> reverse("JuliaLang") julia> reverse("ax̂e") # combining characters can lead to surprising results "êxa" +julia> using Unicode + julia> join(reverse(collect(graphemes("ax̂e")))) # reverses graphemes "ex̂a" ``` diff --git a/base/strings/utf8proc.jl b/base/strings/utf8proc.jl index cf30ec5b3aa6f..df2b68eacd81e 100644 --- a/base/strings/utf8proc.jl +++ b/base/strings/utf8proc.jl @@ -1,16 +1,9 @@ # This file is a part of Julia. License is MIT: https://julialang.org/license # Various Unicode functionality from the utf8proc library -module UTF8proc +module Unicode -import Base: show, ==, hash, string, Symbol, isless, length, eltype, start, next, done, convert, isvalid, lowercase, uppercase, titlecase - -export isgraphemebreak, category_code, category_abbrev, category_string - -# also exported by Base: -export normalize_string, graphemes, is_assigned_char, textwidth, isvalid, - islower, isupper, isalpha, isdigit, isnumber, isalnum, - iscntrl, ispunct, isspace, isprint, isgraph +import Base: show, ==, hash, string, Symbol, isless, length, eltype, start, next, done, convert, isvalid # whether codepoints are valid Unicode scalar values, i.e. 0-0xd7ff, 0xe000-0x10ffff @@ -216,6 +209,8 @@ For example, NFKC corresponds to the options `compose=true, compat=true, stable= # Examples ```jldoctest +julia> using Unicode + julia> "μ" == normalize_string("µ", compat=true) #LHS: Unicode U+03bc, RHS: Unicode U+00b5 true @@ -246,6 +241,8 @@ Give the number of columns needed to print a character. # Examples ```jldoctest +julia> using Unicode + julia> textwidth('α') 1 @@ -262,6 +259,8 @@ Give the number of columns needed to print a string. 
# Examples ```jldoctest +julia> using Unicode + julia> textwidth("March") 5 ``` @@ -288,6 +287,8 @@ Returns `true` if the given char or integer is an assigned Unicode code point. # Examples ```jldoctest +julia> using Unicode + julia> is_assigned_char(101) true @@ -308,6 +309,8 @@ Letter: Lowercase. # Examples ```jldoctest +julia> using Unicode + julia> islower('α') true @@ -331,6 +334,8 @@ Letter: Uppercase, or Lt, Letter: Titlecase. # Examples ```jldoctest +julia> using Unicode + julia> isupper('γ') false @@ -353,6 +358,8 @@ Tests whether a character is a numeric digit (0-9). # Examples ```jldoctest +julia> using Unicode + julia> isdigit('❤') false @@ -374,6 +381,8 @@ category Letter, i.e. a character whose category code begins with 'L'. # Examples ```jldoctest +julia> using Unicode + julia> isalpha('❤') false @@ -395,6 +404,8 @@ i.e. a character whose category code begins with 'N'. # Examples ```jldoctest +julia> using Unicode + julia> isnumber('9') true @@ -416,6 +427,8 @@ category Letter or Number, i.e. a character whose category code begins with 'L' # Examples ```jldoctest +julia> using Unicode + julia> isalnum('❤') false @@ -442,6 +455,8 @@ Control characters are the non-printing characters of the Latin-1 subset of Unic # Examples ```jldoctest +julia> using Unicode + julia> iscntrl('\\x01') true @@ -459,6 +474,8 @@ character whose category code begins with 'P'. # Examples ```jldoctest +julia> using Unicode + julia> ispunct('α') false @@ -482,6 +499,8 @@ category Zs. # Examples ```jldoctest +julia> using Unicode + julia> isspace('\\n') true @@ -504,6 +523,8 @@ Tests whether a character is printable, including spaces, but not a control char # Examples ```jldoctest +julia> using Unicode + julia> isprint('\\x01') false @@ -524,6 +545,8 @@ classified with `isgraph(c)==true`. 
# Examples ```jldoctest +julia> using Unicode + julia> isgraph('\\x01') false @@ -533,6 +556,153 @@ true """ isgraph(c::Char) = (UTF8PROC_CATEGORY_LU <= category_code(c) <= UTF8PROC_CATEGORY_SO) +""" + isascii(c::Union{Char,AbstractString}) -> Bool + +Test whether a character belongs to the ASCII character set, or whether this is true for +all elements of a string. + +# Examples +```jldoctest +julia> using Unicode + +julia> isascii('a') +true + +julia> isascii('α') +false + +julia> isascii("abc") +true + +julia> isascii("αβγ") +false +``` +""" +isascii(c::Char) = c < Char(0x80) +isascii(s::AbstractString) = all(isascii, s) + +""" + isxdigit(c::Char) -> Bool + +Test whether a character is a valid hexadecimal digit. Note that this does not +include `x` (as in the standard `0x` prefix). + +# Examples +```jldoctest +julia> using Unicode + +julia> isxdigit('a') +true + +julia> isxdigit('x') +false +``` +""" +isxdigit(c::Char) = '0'<=c<='9' || 'a'<=c<='f' || 'A'<=c<='F' + +## uppercase, lowercase, and titlecase transformations ## + +""" + uppercase(s::AbstractString) + +Return `s` with all characters converted to uppercase. + +# Examples +```jldoctest +julia> using Unicode + +julia> uppercase("Julia") +"JULIA" +``` +""" +uppercase(s::AbstractString) = map(uppercase, s) + +""" + lowercase(s::AbstractString) + +Return `s` with all characters converted to lowercase. + +# Examples +```jldoctest +julia> using Unicode + +julia> lowercase("STRINGS AND THINGS") +"strings and things" +``` +""" +lowercase(s::AbstractString) = map(lowercase, s) + +""" + titlecase(s::AbstractString) + +Capitalize the first character of each word in `s`. +See also [`ucfirst`](@ref) to capitalize only the first +character in `s`. 
+ +# Examples +```jldoctest +julia> using Unicode + +julia> titlecase("the julia programming language") +"The Julia Programming Language" +``` +""" +function titlecase(s::AbstractString) + startword = true + b = IOBuffer() + for c in s + if isspace(c) + print(b, c) + startword = true + else + print(b, startword ? titlecase(c) : c) + startword = false + end + end + return String(take!(b)) +end + +""" + ucfirst(s::AbstractString) + +Return `string` with the first character converted to uppercase +(technically "title case" for Unicode). +See also [`titlecase`](@ref) to capitalize the first character of +every word in `s`. + +# Examples +```jldoctest +julia> using Unicode + +julia> ucfirst("python") +"Python" +``` +""" +function ucfirst(s::AbstractString) + isempty(s) && return s + c = s[1] + tc = titlecase(c) + return c==tc ? s : string(tc,s[nextind(s,1):end]) +end + +""" + lcfirst(s::AbstractString) + +Return `string` with the first character converted to lowercase. + +# Examples +```jldoctest +julia> using Unicode + +julia> lcfirst("Julia") +"julia" +``` +""" +function lcfirst(s::AbstractString) + isempty(s) || islower(s[1]) ? 
s : string(lowercase(s[1]),s[nextind(s,1):end]) +end + ############################################################################ # iterators for grapheme segmentation diff --git a/base/strings/util.jl b/base/strings/util.jl index db230a16da0c6..1f6777e7c6c0f 100644 --- a/base/strings/util.jl +++ b/base/strings/util.jl @@ -220,9 +220,9 @@ strip(s::AbstractString, chars::Chars) = lstrip(rstrip(s, chars), chars) ## string padding functions ## function lpad(s::AbstractString, n::Integer, p::AbstractString=" ") - m = n - textwidth(s) + m = n - Unicode.textwidth(s) (m <= 0) && (return s) - l = textwidth(p) + l = Unicode.textwidth(p) if l==1 return string(p^m, s) end @@ -233,9 +233,9 @@ function lpad(s::AbstractString, n::Integer, p::AbstractString=" ") end function rpad(s::AbstractString, n::Integer, p::AbstractString=" ") - m = n - textwidth(s) + m = n - Unicode.textwidth(s) (m <= 0) && (return s) - l = textwidth(p) + l = Unicode.textwidth(p) if l==1 return string(s, p^m) end diff --git a/base/sysimg.jl b/base/sysimg.jl index 7204750b00d85..bb6a84002469c 100644 --- a/base/sysimg.jl +++ b/base/sysimg.jl @@ -485,6 +485,7 @@ Base.require(:Profile) Base.require(:SharedArrays) Base.require(:SuiteSparse) Base.require(:Test) +Base.require(:Unicode) @eval Base begin @deprecate_binding Test root_module(:Test) true ", run `using Test` instead" diff --git a/base/util.jl b/base/util.jl index 1b380318bcc99..35e89b9ae2645 100644 --- a/base/util.jl +++ b/base/util.jl @@ -591,7 +591,7 @@ function getpass(prompt::AbstractString) ccall(:_getch, UInt8, ()) # ignore function/arrow keys elseif c == UInt8('\b') && plen > 0 plen -= 1 # delete last character on backspace - elseif !iscntrl(Char(c)) && plen < 128 + elseif !Unicode.iscntrl(Char(c)) && plen < 128 p[plen += 1] = c end end diff --git a/doc/make.jl b/doc/make.jl index ed52e23f91538..ccdb03cd3fb4c 100644 --- a/doc/make.jl +++ b/doc/make.jl @@ -31,6 +31,7 @@ if Sys.iswindows() cp_q("../stdlib/CRC32c/docs/src/index.md", 
"src/stdlib/crc32c.md") cp_q("../stdlib/Dates/docs/src/index.md", "src/stdlib/dates.md") cp_q("../stdlib/IterativeEigenSolvers/docs/src/index.md", "src/stdlib/iterativeeigensolvers.md") + cp_q("../stdlib/Unicode/docs/src/index.md", "src/stdlib/unicode.md") else symlink_q("../../../stdlib/DelimitedFiles/docs/src/index.md", "src/stdlib/delimitedfiles.md") symlink_q("../../../stdlib/Test/docs/src/index.md", "src/stdlib/test.md") @@ -42,6 +43,7 @@ else symlink_q("../../../stdlib/CRC32c/docs/src/index.md", "src/stdlib/crc32c.md") symlink_q("../../../stdlib/Dates/docs/src/index.md", "src/stdlib/dates.md") symlink_q("../../../stdlib/IterativeEigenSolvers/docs/src/index.md", "src/stdlib/iterativeeigensolvers.md") + symlink_q("../../../stdlib/Unicode/docs/src/index.md", "src/stdlib/unicode.md") end const PAGES = [ @@ -118,6 +120,7 @@ const PAGES = [ "stdlib/filewatching.md", "stdlib/crc32c.md", "stdlib/iterativeeigensolvers.md", + "stdlib/unicode.md", ], "Developer Documentation" => [ "devdocs/reflection.md", @@ -153,12 +156,12 @@ const PAGES = [ ] using DelimitedFiles, Test, Mmap, SharedArrays, Profile, Base64, FileWatching, CRC32c, - Dates, IterativeEigenSolvers + Dates, IterativeEigenSolvers, Unicode makedocs( build = joinpath(pwd(), "_build/html/en"), modules = [Base, Core, BuildSysImg, DelimitedFiles, Test, Mmap, SharedArrays, Profile, - Base64, FileWatching, Dates, IterativeEigenSolvers], + Base64, FileWatching, Dates, IterativeEigenSolvers, Unicode], clean = false, doctest = "doctest" in ARGS, linkcheck = "linkcheck" in ARGS, diff --git a/doc/src/index.md b/doc/src/index.md index 8faee6bf9f939..1d919ab4d7230 100644 --- a/doc/src/index.md +++ b/doc/src/index.md @@ -73,6 +73,7 @@ * [Base64](@ref) * [File Events](@ref lib-filewatching) * [Iterative Eigensolvers](@ref lib-itereigen) + * [Unicode](@ref) ## Developer Documentation diff --git a/doc/src/manual/unicode-input.md b/doc/src/manual/unicode-input.md index bb0ab649e70b7..6632c169c784e 100644 --- 
a/doc/src/manual/unicode-input.md +++ b/doc/src/manual/unicode-input.md @@ -37,7 +37,8 @@ function unicode_data() for line in readlines(unidata) id, name, desc = split(line, ";")[[1, 2, 11]] codepoint = parse(UInt32, "0x$id") - names[codepoint] = titlecase(lowercase(name == "" ? desc : desc == "" ? name : "$name / $desc")) + names[codepoint] = Base.Unicode.titlecase(Base.Unicode.lowercase( + name == "" ? desc : desc == "" ? name : "$name / $desc")) end end return names @@ -47,7 +48,7 @@ end # for how unicode is displayed on the unicode.org website: # http://unicode.org/cldr/utility/character.jsp?a=0300 function fix_combining_chars(char) - cat = Base.UTF8proc.category_code(char) + cat = Base.Unicode.category_code(char) return cat == 6 || cat == 8 ? "$NBSP$char$NBSP" : "$char" end @@ -60,7 +61,7 @@ function table_entries(completions, unicode_dict) for (chars, inputs) in sort!(collect(completions), by = first) code_points, unicode_names, characters = String[], String[], String[] for char in chars - push!(code_points, "U+$(uppercase(hex(char, 5)))") + push!(code_points, "U+$(Base.Unicode.uppercase(hex(char, 5)))") push!(unicode_names, get(unicode_dict, UInt32(char), "(No Unicode name)")) push!(characters, isempty(characters) ? 
fix_combining_chars(char) : "$char") end diff --git a/doc/src/stdlib/.gitignore b/doc/src/stdlib/.gitignore index e04ac81b650d4..7096169fbd716 100644 --- a/doc/src/stdlib/.gitignore +++ b/doc/src/stdlib/.gitignore @@ -7,4 +7,4 @@ base64.md filewatching.md crc32c.md dates.md - +unicode.md diff --git a/doc/src/stdlib/strings.md b/doc/src/stdlib/strings.md index ec561b8a810a6..39f2f15a06b3b 100644 --- a/doc/src/stdlib/strings.md +++ b/doc/src/stdlib/strings.md @@ -19,12 +19,9 @@ Base.@r_str Base.@raw_str Base.Docs.@html_str Base.Docs.@text_str -Base.UTF8proc.normalize_string -Base.UTF8proc.graphemes Base.isvalid(::Any) Base.isvalid(::Any, ::Any) Base.isvalid(::AbstractString, ::Integer) -Base.UTF8proc.is_assigned_char Base.ismatch Base.match Base.eachmatch @@ -50,11 +47,6 @@ Base.startswith Base.endswith Base.first(::AbstractString, ::Integer) Base.last(::AbstractString, ::Integer) -Base.uppercase -Base.lowercase -Base.titlecase -Base.ucfirst -Base.lcfirst Base.join Base.chop Base.chomp @@ -64,20 +56,6 @@ Base.thisind Base.nextind Base.prevind Base.Random.randstring -Base.UTF8proc.textwidth -Base.UTF8proc.isalnum -Base.UTF8proc.isalpha -Base.isascii -Base.UTF8proc.iscntrl -Base.UTF8proc.isdigit -Base.UTF8proc.isgraph -Base.UTF8proc.islower -Base.UTF8proc.isnumber -Base.UTF8proc.isprint -Base.UTF8proc.ispunct -Base.UTF8proc.isspace -Base.UTF8proc.isupper -Base.isxdigit Core.Symbol Base.escape_string Base.unescape_string diff --git a/stdlib/Dates/src/parse.jl b/stdlib/Dates/src/parse.jl index eb9aa76bbc449..4f4cb0d7891ea 100644 --- a/stdlib/Dates/src/parse.jl +++ b/stdlib/Dates/src/parse.jl @@ -16,7 +16,7 @@ function character_codes(directives::SimpleVector) return letters end -genvar(t::DataType) = Symbol(lowercase(string(Base.datatype_name(t)))) +genvar(t::DataType) = Symbol(Base.Unicode.lowercase(string(Base.datatype_name(t)))) """ tryparsenext_core(str::AbstractString, pos::Int, len::Int, df::DateFormat, raise=false) @@ -184,7 +184,7 @@ end max_pos = maxchars <= 0 ? 
len : min(chr2ind(str, ind2chr(str,i) + maxchars - 1), len) @inbounds while i <= max_pos c, ii = next(str, i) - if isalpha(c) + if Base.Unicode.isalpha(c) word_end = i else break diff --git a/stdlib/Dates/src/periods.jl b/stdlib/Dates/src/periods.jl index 6f30613c31595..1a74892e23a6d 100644 --- a/stdlib/Dates/src/periods.jl +++ b/stdlib/Dates/src/periods.jl @@ -8,7 +8,7 @@ value(x::Period) = x.value # The following definitions are for Period-specific safety for period in (:Year, :Month, :Week, :Day, :Hour, :Minute, :Second, :Millisecond, :Microsecond, :Nanosecond) period_str = string(period) - accessor_str = lowercase(period_str) + accessor_str = Base.Unicode.lowercase(period_str) # Convenience method for show() @eval _units(x::$period) = " " * $accessor_str * (abs(value(x)) == 1 ? "" : "s") # periodisless diff --git a/stdlib/Dates/src/query.jl b/stdlib/Dates/src/query.jl index 88806673cc157..801431329524b 100644 --- a/stdlib/Dates/src/query.jl +++ b/stdlib/Dates/src/query.jl @@ -22,7 +22,7 @@ function locale_dict(names::Vector{<:AbstractString}) for i in 1:length(names) name = names[i] result[name] = i - result[lowercase(name)] = i + result[Base.Unicode.lowercase(name)] = i end return result end @@ -72,7 +72,7 @@ for (fn, field) in zip( # a case-sensitive lookup first value = get(locale.$field, word, 0) if value == 0 - value = get(locale.$field, lowercase(word), 0) + value = get(locale.$field, Base.Unicode.lowercase(word), 0) end value end diff --git a/stdlib/Dates/test/io.jl b/stdlib/Dates/test/io.jl index 63620a5c31505..d6e0a2ad774d9 100644 --- a/stdlib/Dates/test/io.jl +++ b/stdlib/Dates/test/io.jl @@ -297,7 +297,7 @@ end @test Dates.format(Dates.Date(2009, 12, 1), f) == "01Dec2009" f = "duy" globex = ["f", "g", "h", "j", "k", "m", "n", "q", "u", "v", "x", "z"] - locale = Dates.DateLocale(globex, map(uppercase, globex), globex[1:7], globex[1:7]) + locale = Dates.DateLocale(globex, map(Base.Unicode.uppercase, globex), globex[1:7], globex[1:7]) @test 
Dates.Date("1F4", f; locale=locale) + Dates.Year(2010) == Dates.Date(2014, 1, 1) @test Dates.format(Dates.Date(2014, 1, 1), f; locale=locale) == "1F4" diff --git a/stdlib/SuiteSparse/src/cholmod.jl b/stdlib/SuiteSparse/src/cholmod.jl index b0d535d6dd893..6b53109eb2841 100644 --- a/stdlib/SuiteSparse/src/cholmod.jl +++ b/stdlib/SuiteSparse/src/cholmod.jl @@ -614,7 +614,7 @@ end ### cholmod_check.h ### function print_sparse(A::Sparse{Tv}, name::String) where Tv<:VTypes - isascii(name) || error("non-ASCII name: $name") + Unicode.isascii(name) || error("non-ASCII name: $name") set_print_level(common_struct, 3) @isok ccall((@cholmod_name("print_sparse", SuiteSparse_long),:libcholmod), Cint, (Ptr{C_Sparse{Tv}}, Ptr{UInt8}, Ptr{UInt8}), diff --git a/stdlib/Unicode/docs/src/index.md b/stdlib/Unicode/docs/src/index.md new file mode 100644 index 0000000000000..86ab0d7383256 --- /dev/null +++ b/stdlib/Unicode/docs/src/index.md @@ -0,0 +1,25 @@ +# Unicode + +```@docs +Unicode.is_assigned_char +Unicode.normalize_string +Unicode.graphemes +Unicode.uppercase +Unicode.lowercase +Unicode.titlecase +Unicode.ucfirst +Unicode.lcfirst +Unicode.textwidth +Unicode.isalnum +Unicode.isalpha +Unicode.iscntrl +Unicode.isdigit +Unicode.isgraph +Unicode.islower +Unicode.isnumber +Unicode.isprint +Unicode.ispunct +Unicode.isspace +Unicode.isupper +Unicode.isxdigit +``` diff --git a/stdlib/Unicode/src/Unicode.jl b/stdlib/Unicode/src/Unicode.jl new file mode 100644 index 0000000000000..a320c565bcd91 --- /dev/null +++ b/stdlib/Unicode/src/Unicode.jl @@ -0,0 +1,17 @@ +# This file is a part of Julia. 
License is MIT: https://julialang.org/license + +__precompile__(true) + +module Unicode + +using Base.Unicode: normalize_string, graphemes, is_assigned_char, textwidth, isvalid, + islower, isupper, isalpha, isdigit, isxdigit, isnumber, isalnum, + iscntrl, ispunct, isspace, isprint, isgraph, + lowercase, uppercase, titlecase, lcfirst, ucfirst + +export normalize_string, graphemes, is_assigned_char, textwidth, isvalid, + islower, isupper, isalpha, isdigit, isxdigit, isnumber, isalnum, + iscntrl, ispunct, isspace, isprint, isgraph, + lowercase, uppercase, titlecase, lcfirst, ucfirst + +end diff --git a/test/unicode/utf8proc.jl b/stdlib/Unicode/test/runtests.jl similarity index 80% rename from test/unicode/utf8proc.jl rename to stdlib/Unicode/test/runtests.jl index ef05772064913..402807d075061 100644 --- a/test/unicode/utf8proc.jl +++ b/stdlib/Unicode/test/runtests.jl @@ -1,5 +1,8 @@ # This file is a part of Julia. License is MIT: https://julialang.org/license +using Test +using Unicode + @testset "string normalization" begin # normalize_string (Unicode normalization etc.): @test normalize_string("\u006e\u0303", :NFC) == "\u00f1" @@ -226,9 +229,9 @@ end @testset "utf8proc" begin # check utf8proc handling of CN category constants let c_ll = 'β', c_cn = '\u038B' - @test Base.UTF8proc.category_code(c_ll) == Base.UTF8proc.UTF8PROC_CATEGORY_LL + @test Base.Unicode.category_code(c_ll) == Base.Unicode.UTF8PROC_CATEGORY_LL # check codepoint with category code CN - @test Base.UTF8proc.category_code(c_cn) == Base.UTF8proc.UTF8PROC_CATEGORY_CN + @test Base.Unicode.category_code(c_cn) == Base.Unicode.UTF8PROC_CATEGORY_CN end end @@ -320,3 +323,72 @@ end @test eltype(g) == SubString{String} @test collect(g) == ["1","2","3","α","5"] end + +@testset "ucfirst/lcfirst" begin + @test ucfirst("Hola")=="Hola" + @test ucfirst("hola")=="Hola" + @test ucfirst("")=="" + @test ucfirst("*")=="*" + @test ucfirst("DŽxx") == ucfirst("džxx") == "Džxx" + + @test lcfirst("Hola")=="hola" + @test 
lcfirst("hola")=="hola" + @test lcfirst("")=="" + @test lcfirst("*")=="*" +end + +@testset "issue #11482" begin + @testset "uppercase/lowercase" begin + @test uppercase("aBc") == "ABC" + @test uppercase('A') == 'A' + @test uppercase('a') == 'A' + @test lowercase("AbC") == "abc" + @test lowercase('A') == 'a' + @test lowercase('a') == 'a' + @test uppercase('α') == '\u0391' + @test lowercase('Δ') == 'δ' + @test lowercase('\U118bf') == '\U118df' + @test uppercase('\U1044d') == '\U10425' + end + @testset "ucfirst/lcfirst" begin + @test ucfirst("Abc") == "Abc" + @test ucfirst("abc") == "Abc" + @test lcfirst("ABC") == "aBC" + @test lcfirst("aBC") == "aBC" + @test ucfirst(GenericString("")) == "" + @test lcfirst(GenericString("")) == "" + @test ucfirst(GenericString("a")) == "A" + @test lcfirst(GenericString("A")) == "a" + @test lcfirst(GenericString("a")) == "a" + @test ucfirst(GenericString("A")) == "A" + end + @testset "titlecase" begin + @test titlecase('lj') == 'Lj' + @test titlecase("ljubljana") == "Ljubljana" + @test titlecase("aBc ABC") == "ABc ABC" + @test titlecase("abcD EFG\n\thij") == "AbcD EFG\n\tHij" + end +end + +@testset "issue # 11464: uppercase/lowercase of GenericString becomes a String" begin + str = "abcdef\uff\uffff\u10ffffABCDEF" + @test typeof(uppercase("abcdef")) == String + @test typeof(uppercase(GenericString(str))) == String + @test typeof(lowercase("ABCDEF")) == String + @test typeof(lowercase(GenericString(str))) == String + + foomap(ch) = (ch > Char(65)) + foobar(ch) = Char(0xd800) + foobaz(ch) = reinterpret(Char, typemax(UInt32)) + @test_throws ArgumentError map(foomap, GenericString(str)) + @test map(foobar, GenericString(str)) == String(repeat(b"\ud800", outer=[17])) + @test map(foobaz, GenericString(str)) == String(repeat(b"\ufffd", outer=[17])) + + @test "a".*["b","c"] == ["ab","ac"] + @test ["b","c"].*"a" == ["ba","ca"] + @test ["a","b"].*["c" "d"] == ["ac" "ad"; "bc" "bd"] + + @test one(String) == "" + @test prod(["*" for i in 1:3]) == 
"***" + @test prod(["*" for i in 1:0]) == "" +end \ No newline at end of file diff --git a/test/arrayops.jl b/test/arrayops.jl index f3a4a0015d1f7..b8184addfbbf8 100644 --- a/test/arrayops.jl +++ b/test/arrayops.jl @@ -467,8 +467,8 @@ end @testset "find with general iterables" begin s = "julia" @test find(c -> c == 'l', s) == [3] - g = graphemes("日本語") - @test find(isascii, g) == Int[] + g = Base.Unicode.graphemes("日本語") + @test find(Base.Unicode.isascii, g) == Int[] @test find(!iszero, (i % 2 for i in 1:10)) == collect(1:2:9) end @testset "findn" begin diff --git a/test/choosetests.jl b/test/choosetests.jl index ae0ed781549eb..1594676cc9786 100644 --- a/test/choosetests.jl +++ b/test/choosetests.jl @@ -82,7 +82,7 @@ function choosetests(choices = []) end - unicodetests = ["unicode/UnicodeError", "unicode/utf8proc", "unicode/utf8"] + unicodetests = ["unicode/UnicodeError", "unicode/utf8"] if "unicode" in skip_tests filter!(x -> (x != "unicode" && !(x in unicodetests)), tests) elseif "unicode" in tests diff --git a/test/compile.jl b/test/compile.jl index 0f6c97801bf07..096433f424257 100644 --- a/test/compile.jl +++ b/test/compile.jl @@ -220,7 +220,7 @@ try Dict(s => Base.module_uuid(Base.root_module(s)) for s in [:Base64, :CRC32c, :Dates, :DelimitedFiles, :FileWatching, :IterativeEigenSolvers, :Mmap, :Profile, :SharedArrays, - :SuiteSparse, :Test])) + :SuiteSparse, :Test, :Unicode])) @test discard_module.(deps) == deps1 @test current_task()(0x01, 0x4000, 0x30031234) == 2 diff --git a/test/dict.jl b/test/dict.jl index 7212022e51ea4..35c1f463258fa 100644 --- a/test/dict.jl +++ b/test/dict.jl @@ -282,7 +282,7 @@ end Base.show(io, MIME("text/plain"), d) out = split(String(take!(s)),'\n') for line in out[2:end] - @test textwidth(line) <= cols + @test Base.Unicode.textwidth(line) <= cols end @test length(out) <= rows @@ -292,7 +292,7 @@ end Base.show(io, MIME("text/plain"), f(d)) out = split(String(take!(s)),'\n') for line in out[2:end] - @test textwidth(line) <= cols + 
@test Base.Unicode.textwidth(line) <= cols end @test length(out) <= rows end diff --git a/test/distributed_exec.jl b/test/distributed_exec.jl index 40fcf47cf142e..20ba2c64cb5a7 100644 --- a/test/distributed_exec.jl +++ b/test/distributed_exec.jl @@ -632,6 +632,7 @@ testmap_equivalence(x->x ? false : true, BitMatrix(uninitialized, 10,10)) testmap_equivalence(x->"foobar", BitMatrix(uninitialized, 10,10)) testmap_equivalence((x,y,z)->string(x,y,z), BitVector(uninitialized, 10), ones(10), "1234567890") +using Base.Unicode: uppercase @test asyncmap(uppercase, "Hello World!") == map(uppercase, "Hello World!") @test pmap(uppercase, "Hello World!") == map(uppercase, "Hello World!") diff --git a/test/docs.jl b/test/docs.jl index 308849a6a21e1..89a2d323a056a 100644 --- a/test/docs.jl +++ b/test/docs.jl @@ -952,7 +952,7 @@ for (line, expr) in Pair[ "\"...\"" => "...", "r\"...\"" => Expr(:macrocall, Symbol("@r_str"), LineNumberNode(1, :none), "...") ] - @test Docs.helpmode(line) == Expr(:macrocall, Expr(:., Expr(:., :Base, QuoteNode(:Docs)), QuoteNode(Symbol("@repl"))), LineNumberNode(117, doc_util_path), STDOUT, expr) + @test Docs.helpmode(line) == Expr(:macrocall, Expr(:., Expr(:., :Base, QuoteNode(:Docs)), QuoteNode(Symbol("@repl"))), LineNumberNode(118, doc_util_path), STDOUT, expr) buf = IOBuffer() @test eval(Base, Docs.helpmode(buf, line)) isa Union{Base.Markdown.MD,Void} end @@ -991,8 +991,8 @@ dynamic_test.x = "test 2" @test @doc(dynamic_test) == "test 2 Union{}" @test @doc(dynamic_test(::String)) == "test 2 Tuple{String}" -@test Docs._repl(:(dynamic_test(1.0))) == Expr(:escape, Expr(:macrocall, Symbol("@doc"), LineNumberNode(206, doc_util_path), :(dynamic_test(::typeof(1.0))))) -@test Docs._repl(:(dynamic_test(::String))) == Expr(:escape, Expr(:macrocall, Symbol("@doc"), LineNumberNode(206, doc_util_path), :(dynamic_test(::String)))) +@test Docs._repl(:(dynamic_test(1.0))) == Expr(:escape, Expr(:macrocall, Symbol("@doc"), LineNumberNode(207, doc_util_path), 
:(dynamic_test(::typeof(1.0))))) +@test Docs._repl(:(dynamic_test(::String))) == Expr(:escape, Expr(:macrocall, Symbol("@doc"), LineNumberNode(207, doc_util_path), :(dynamic_test(::String)))) # Equality testing diff --git a/test/iobuffer.jl b/test/iobuffer.jl index f11229c1f8beb..00de2463ad727 100644 --- a/test/iobuffer.jl +++ b/test/iobuffer.jl @@ -258,6 +258,7 @@ let io = IOBuffer() end # skipchars +using Base.Unicode: isspace let io = IOBuffer("") @test eof(skipchars(io, isspace)) @@ -278,7 +279,7 @@ let for char in ['@','߷','࿊','𐋺'] io = IOBuffer("alphabeticalstuff$char") - @test !eof(skipchars(io, isalpha)) + @test !eof(skipchars(io, Base.Unicode.isalpha)) @test read(io, Char) == char end end diff --git a/test/iostream.jl b/test/iostream.jl index 31026b753d173..67a0e7d90af0a 100644 --- a/test/iostream.jl +++ b/test/iostream.jl @@ -1,6 +1,7 @@ # This file is a part of Julia. License is MIT: https://julialang.org/license # Test skipchars for IOStreams +using Base.Unicode: isspace mktemp() do path, file function append_to_file(str) mark(file) @@ -34,7 +35,7 @@ mktemp() do path, file for (byte,char) in zip(1:4, ('@','߷','࿊','𐋺')) append_to_file("abcdef$char") @test Base.codelen(char) == byte - @test !eof(skipchars(file, isalpha)) + @test !eof(skipchars(file, Base.Unicode.isalpha)) @test read(file, Char) == char end end diff --git a/test/libgit2.jl b/test/libgit2.jl index 4f1bf064fcedf..ba8669eebe563 100644 --- a/test/libgit2.jl +++ b/test/libgit2.jl @@ -2,6 +2,7 @@ isdefined(Main, :TestHelpers) || @eval Main include(joinpath(@__DIR__, "TestHelpers.jl")) import Main.TestHelpers: challenge_prompt +using Base.Unicode: lowercase const LIBGIT2_MIN_VER = v"0.23.0" const LIBGIT2_HELPER_PATH = joinpath(@__DIR__, "libgit2-helpers.jl") diff --git a/test/operators.jl b/test/operators.jl index a349269d7e818..9a883c588d43e 100644 --- a/test/operators.jl +++ b/test/operators.jl @@ -106,12 +106,12 @@ Base.promote_rule(::Type{T19714}, ::Type{Int}) = T19714 # pr #17155 @testset 
"function composition" begin - @test (uppercase∘hex)(239487) == "3A77F" + @test (Base.Unicode.uppercase∘hex)(239487) == "3A77F" end @testset "function negation" begin str = randstring(20) - @test filter(!isupper, str) == replace(str, r"[A-Z]", "") - @test filter(!islower, str) == replace(str, r"[a-z]", "") + @test filter(!Base.Unicode.isupper, str) == replace(str, r"[A-Z]", "") + @test filter(!Base.Unicode.islower, str) == replace(str, r"[a-z]", "") end # issue #19891 diff --git a/test/perf/kernel/json.jl b/test/perf/kernel/json.jl index 8f992508614b3..44917d6b86d44 100644 --- a/test/perf/kernel/json.jl +++ b/test/perf/kernel/json.jl @@ -69,7 +69,7 @@ function parse_json(strng::AbstractString) end function skip_whitespace() - while pos <= len && isspace(strng[pos]) + while pos <= len && Base.Unicode.isspace(strng[pos]) pos = pos + 1 end end diff --git a/test/perf/perfgeneric.jl b/test/perf/perfgeneric.jl index 4e7dba0fa8051..4fef6051e42ae 100644 --- a/test/perf/perfgeneric.jl +++ b/test/perf/perfgeneric.jl @@ -3,6 +3,6 @@ #Generic benchmark driver for (testfunc, testname, longtestname, problem_sizes) in testdata for (n, t, size) in problem_sizes - @timeit testfunc(n, t) string(testname,"_",size) string(uppercase(size[1]),size[2:end]," ",longtestname," test") + @timeit testfunc(n, t) string(testname,"_",size) string(Base.Unicode.uppercase(size[1]),size[2:end]," ",longtestname," test") end end diff --git a/test/perf/shootout/k_nucleotide.jl b/test/perf/shootout/k_nucleotide.jl index d132efe210049..a070e79c97719 100644 --- a/test/perf/shootout/k_nucleotide.jl +++ b/test/perf/shootout/k_nucleotide.jl @@ -70,7 +70,7 @@ function k_nucleotide(infile="knucleotide-input.txt") i, j = 1, 1 while i <= length(data) if data[i] != '\n' - data[j] = uppercase(data[i]) + data[j] = Base.Unicode.uppercase(data[i]) j += 1 end i += 1 diff --git a/test/perf/spell/perf.jl b/test/perf/spell/perf.jl index 8fa1e8426f32b..687b9243ac99a 100644 --- a/test/perf/spell/perf.jl +++ 
b/test/perf/spell/perf.jl @@ -15,7 +15,7 @@ include("../perfutil.jl") -words(text) = eachmatch(r"[a-z]+", lowercase(text)) +words(text) = eachmatch(r"[a-z]+", Base.Unicode.lowercase(text)) function train(features) model = Dict{AbstractString, Int}() diff --git a/test/strings/basic.jl b/test/strings/basic.jl index 384da5d8a70f3..8e4073a138cac 100644 --- a/test/strings/basic.jl +++ b/test/strings/basic.jl @@ -186,17 +186,6 @@ end @test parse(Float32,"1\n") == 1.0 @test [parse(Float32,x) for x in split("0,1\n",",")][2] == 1.0 @test_throws ArgumentError parse(Float32,split("0,1 X\n",",")[2]) - - @test ucfirst("Hola")=="Hola" - @test ucfirst("hola")=="Hola" - @test ucfirst("")=="" - @test ucfirst("*")=="*" - @test ucfirst("DŽxx") == ucfirst("džxx") == "Džxx" - - @test lcfirst("Hola")=="hola" - @test lcfirst("hola")=="hola" - @test lcfirst("")=="" - @test lcfirst("*")=="*" end # test AbstractString functions at beginning of string.jl struct tstStringType <: AbstractString @@ -223,7 +212,7 @@ end @test done(eachindex("foobar"),7) @test eltype(Base.EachStringIndex) == Int - @test map(uppercase, "foó") == "FOÓ" + @test map(Base.Unicode.uppercase, "foó") == "FOÓ" @test chr2ind("fóobar",3) == 4 @test Symbol(gstr)==Symbol("12") @@ -289,7 +278,7 @@ end for T in [BigInt, Int8, UInt8, Int16, UInt16, Int32, UInt32, Int64, UInt64, Int128, UInt128, Float64, Float32] @test isnull(tryparse(T, "1\0")) end - let s = normalize_string("tést",:NFKC) + let s = Base.Unicode.normalize_string("tést",:NFKC) @test unsafe_string(Base.unsafe_convert(Cstring, Base.cconvert(Cstring, s))) == s @test unsafe_string(convert(Cstring, Symbol(s))) == s end @@ -419,62 +408,6 @@ end @test isvalid(String, UInt8[0xfe, 0x80, 0x80, 0x80, 0x80, 0x80]) == false end -@testset "issue #11482" begin - @testset "uppercase/lowercase" begin - @test uppercase("aBc") == "ABC" - @test uppercase('A') == 'A' - @test uppercase('a') == 'A' - @test lowercase("AbC") == "abc" - @test lowercase('A') == 'a' - @test lowercase('a') == 
'a' - @test uppercase('α') == '\u0391' - @test lowercase('Δ') == 'δ' - @test lowercase('\U118bf') == '\U118df' - @test uppercase('\U1044d') == '\U10425' - end - @testset "ucfirst/lcfirst" begin - @test ucfirst("Abc") == "Abc" - @test ucfirst("abc") == "Abc" - @test lcfirst("ABC") == "aBC" - @test lcfirst("aBC") == "aBC" - @test ucfirst(GenericString("")) == "" - @test lcfirst(GenericString("")) == "" - @test ucfirst(GenericString("a")) == "A" - @test lcfirst(GenericString("A")) == "a" - @test lcfirst(GenericString("a")) == "a" - @test ucfirst(GenericString("A")) == "A" - end - @testset "titlecase" begin - @test titlecase('lj') == 'Lj' - @test titlecase("ljubljana") == "Ljubljana" - @test titlecase("aBc ABC") == "ABc ABC" - @test titlecase("abcD EFG\n\thij") == "AbcD EFG\n\tHij" - end -end - -@testset "issue # 11464: uppercase/lowercase of GenericString becomes a String" begin - str = "abcdef\uff\uffff\u10ffffABCDEF" - @test typeof(uppercase("abcdef")) == String - @test typeof(uppercase(GenericString(str))) == String - @test typeof(lowercase("ABCDEF")) == String - @test typeof(lowercase(GenericString(str))) == String - - foomap(ch) = (ch > Char(65)) - foobar(ch) = Char(0xd800) - foobaz(ch) = reinterpret(Char, typemax(UInt32)) - @test_throws ArgumentError map(foomap, GenericString(str)) - @test map(foobar, GenericString(str)) == String(repeat(b"\ud800", outer=[17])) - @test map(foobaz, GenericString(str)) == String(repeat(b"\ufffd", outer=[17])) - - @test "a".*["b","c"] == ["ab","ac"] - @test ["b","c"].*"a" == ["ba","ca"] - @test ["a","b"].*["c" "d"] == ["ac" "ad"; "bc" "bd"] - - @test one(String) == "" - @test prod(["*" for i in 1:3]) == "***" - @test prod(["*" for i in 1:0]) == "" -end - @testset "NULL pointers are handled consistently by String" begin @test_throws ArgumentError unsafe_string(Ptr{UInt8}(0)) @test_throws ArgumentError unsafe_string(Ptr{UInt8}(0), 10) diff --git a/test/strings/io.jl b/test/strings/io.jl index 7ee325c252c11..ad770804d8e21 100644 --- 
a/test/strings/io.jl +++ b/test/strings/io.jl @@ -66,14 +66,14 @@ cp, ch, st = cx[i,:] @test cp == convert(UInt32, ch) @test string(ch) == unescape_string(st) - if isascii(ch) || !isprint(ch) + if Base.Unicode.isascii(ch) || !Base.Unicode.isprint(ch) @test st == escape_string(string(ch)) end for j = 1:size(cx,1) local str = string(ch, cx[j,2]) @test str == unescape_string(escape_string(str)) end - @test repr(ch) == "'$(isprint(ch) ? ch : st)'" + @test repr(ch) == "'$(Base.Unicode.isprint(ch) ? ch : st)'" end for i = 0:0x7f, p = ["","\0","x","xxx","\x7f","\uFF","\uFFF", diff --git a/test/strings/util.jl b/test/strings/util.jl index 0315db7c9b62b..cf8865157245a 100644 --- a/test/strings/util.jl +++ b/test/strings/util.jl @@ -210,7 +210,7 @@ end @test replace("ḟøøƀäṙḟøø", r"(ḟø|ƀä)", "xx") == "xxøxxṙxxø" @test replace("ḟøøƀäṙḟøø", r"(ḟøø|ƀä)", "ƀäṙ") == "ƀäṙƀäṙṙƀäṙ" - @test replace("foo", "oo", uppercase) == "fOO" + @test replace("foo", "oo", Base.Unicode.uppercase) == "fOO" # Issue 13332 @test replace("abc", 'b', 2.1) == "a2.1c" From 635826b25fff4c8edd224e0ad9509133aa4b0fd0 Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Tue, 12 Dec 2017 22:18:45 +0100 Subject: [PATCH 2/3] Rename base/strings/utf8proc.jl to base/strings/unicode.jl To match the new name of the module. 
--- base/strings/strings.jl | 2 +- base/strings/{utf8proc.jl => unicode.jl} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename base/strings/{utf8proc.jl => unicode.jl} (100%) diff --git a/base/strings/strings.jl b/base/strings/strings.jl index 62cb030c9f84c..961f05cdc675a 100644 --- a/base/strings/strings.jl +++ b/base/strings/strings.jl @@ -4,6 +4,6 @@ include("strings/errors.jl") include("strings/substring.jl") include("strings/basic.jl") include("strings/search.jl") -include("strings/utf8proc.jl") +include("strings/unicode.jl") include("strings/util.jl") include("strings/io.jl") diff --git a/base/strings/utf8proc.jl b/base/strings/unicode.jl similarity index 100% rename from base/strings/utf8proc.jl rename to base/strings/unicode.jl From 756936abe846c1e46a9751a238ba0dcac7f45160 Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Sat, 9 Dec 2017 21:54:56 +0100 Subject: [PATCH 3/3] Deprecate isnumber(), is_assigned_char() and normalize_string() isnumeric() is consistent with Python and Rust (but not Go), and less easy to confuse with isdigit(). Improve documentation to make confusion less easy. Also fix a few uses where isdigit() is more appropriate than isnumber(). --- NEWS.md | 4 ++ base/client.jl | 2 +- base/distributed/Distributed.jl | 2 +- base/loading.jl | 2 +- base/precompile.jl | 2 +- base/regex.jl | 6 +- base/strings/unicode.jl | 40 ++++++----- doc/src/manual/faq.md | 6 +- stdlib/Unicode/docs/src/index.md | 6 +- stdlib/Unicode/src/Unicode.jl | 17 +++-- stdlib/Unicode/test/runtests.jl | 119 ++++++++++++++++--------------- test/strings/basic.jl | 2 +- 12 files changed, 114 insertions(+), 94 deletions(-) diff --git a/NEWS.md b/NEWS.md index 4d90d1b4d78dc..672f6d4021048 100644 --- a/NEWS.md +++ b/NEWS.md @@ -744,6 +744,10 @@ Deprecated or removed `isdigit`, `isxdigit`, `isnumber`, `isalnum`, `iscntrl`, `ispunct`, `isspace`, `isprint`, `isgraph`, `lowercase`, `uppercase`, `titlecase`, `lcfirst` and `ucfirst`. 
+ * `isnumber` has been deprecated in favor of `isnumeric`, `is_assigned_char` + in favor of `isassigned` and `normalize_string` in favor of `normalize`, all three + in the new `Unicode` standard library module ([#25021]). + Command-line option changes --------------------------- diff --git a/base/client.jl b/base/client.jl index df6f5df0c9d3d..88914e1d49f2f 100644 --- a/base/client.jl +++ b/base/client.jl @@ -361,7 +361,7 @@ function load_machine_file(path::AbstractString) s = split(line, '*'; keep = false) map!(strip, s, s) if length(s) > 1 - cnt = isnumber(s[1]) ? parse(Int,s[1]) : Symbol(s[1]) + cnt = all(isdigit, s[1]) ? parse(Int,s[1]) : Symbol(s[1]) push!(machines,(s[2], cnt)) else push!(machines,line) diff --git a/base/distributed/Distributed.jl b/base/distributed/Distributed.jl index 609f88acf9cad..a88d5ecac4ee0 100644 --- a/base/distributed/Distributed.jl +++ b/base/distributed/Distributed.jl @@ -15,7 +15,7 @@ using Base: Process, Semaphore, JLOptions, AnyDict, buffer_writes, wait_connecte binding_module, notify_error, atexit, julia_exename, julia_cmd, AsyncGenerator, display_error, acquire, release, invokelatest, warn_once, shell_escape_posixly, uv_error -using Base.Unicode: isascii, isdigit, isnumber +using Base.Unicode: isascii, isdigit, isnumeric # NOTE: clusterserialize.jl imports additional symbols from Base.Serializer for use diff --git a/base/loading.jl b/base/loading.jl index 8e15004b2bd5c..3576de4698c3f 100644 --- a/base/loading.jl +++ b/base/loading.jl @@ -69,7 +69,7 @@ elseif Sys.isapple() # If there is no match, it's possible that the file does exist but HFS+ # performed unicode normalization. See https://developer.apple.com/library/mac/qa/qa1235/_index.html. Unicode.isascii(path_basename) && return false - Vector{UInt8}(Unicode.normalize_string(path_basename, :NFD)) == casepreserved_basename + Vector{UInt8}(Unicode.normalize(path_basename, :NFD)) == casepreserved_basename end else # Generic fallback that performs a slow directory listing. 
diff --git a/base/precompile.jl b/base/precompile.jl index 092c24045c0d7..051f6ae91ed46 100644 --- a/base/precompile.jl +++ b/base/precompile.jl @@ -68,7 +68,7 @@ precompile(Tuple{typeof(Base.lstrip), Base.SubString{String}, Array{Char, 1}}) precompile(Tuple{getfield(Base, Symbol("#kw##split")), Array{Any, 1}, typeof(Base.split), String, Char}) precompile(Tuple{getfield(Base, Symbol("#kw##split")), Array{Any, 1}, typeof(Base.split), Base.SubString{String}, Char}) precompile(Tuple{typeof(Base.map!), typeof(Base.strip), Array{Base.SubString{String}, 1}, Array{Base.SubString{String}, 1}}) -precompile(Tuple{typeof(Base.Unicode.isnumber), Base.SubString{String}}) +precompile(Tuple{typeof(Base.Unicode.isnumeric), Base.SubString{String}}) precompile(Tuple{Type{Core.Inference.Generator{I, F} where F where I}, Type{Core.Inference.Const}, Tuple{Tuple{Base.DevNullStream, Base.DevNullStream, Base.DevNullStream}}}) precompile(Tuple{Type{Core.Inference.Generator{Tuple{Tuple{Base.DevNullStream, Base.DevNullStream, Base.DevNullStream}}, Type{Core.Inference.Const}}}, Type{Core.Inference.Const}, Tuple{Tuple{Base.DevNullStream, Base.DevNullStream, Base.DevNullStream}}}) precompile(Tuple{typeof(Core.Inference.convert), Type{Tuple{Tuple{Base.DevNullStream, Base.DevNullStream, Base.DevNullStream}}}, Tuple{Tuple{Base.DevNullStream, Base.DevNullStream, Base.DevNullStream}}}) diff --git a/base/regex.jl b/base/regex.jl index 6ad92f15cba20..555032ef30d23 100644 --- a/base/regex.jl +++ b/base/regex.jl @@ -338,11 +338,11 @@ function _replace(io, repl_s::SubstitutionString, str, r, re) if repl[next_i] == SUB_CHAR write(io, SUB_CHAR) i = nextind(repl, next_i) - elseif Unicode.isnumber(repl[next_i]) + elseif Unicode.isdigit(repl[next_i]) group = parse(Int, repl[next_i]) i = nextind(repl, next_i) while i <= e - if Unicode.isnumber(repl[i]) + if Unicode.isdigit(repl[i]) group = 10group + parse(Int, repl[i]) i = nextind(repl, i) else @@ -364,7 +364,7 @@ function _replace(io, 
repl_s::SubstitutionString, str, r, re) end # TODO: avoid this allocation groupname = SubString(repl, groupstart, prevind(repl, i)) - if all(Unicode.isnumber,groupname) + if all(Unicode.isdigit, groupname) _write_capture(io, re, parse(Int, groupname)) else group = PCRE.substring_number_from_name(re.regex, groupname) diff --git a/base/strings/unicode.jl b/base/strings/unicode.jl index df2b68eacd81e..67859b41b54c9 100644 --- a/base/strings/unicode.jl +++ b/base/strings/unicode.jl @@ -148,7 +148,7 @@ end utf8proc_map(s::AbstractString, flags::Integer) = utf8proc_map(String(s), flags) -function normalize_string(s::AbstractString; stable::Bool=false, compat::Bool=false, compose::Bool=true, decompose::Bool=false, stripignore::Bool=false, rejectna::Bool=false, newline2ls::Bool=false, newline2ps::Bool=false, newline2lf::Bool=false, stripcc::Bool=false, casefold::Bool=false, lump::Bool=false, stripmark::Bool=false) +function normalize(s::AbstractString; stable::Bool=false, compat::Bool=false, compose::Bool=true, decompose::Bool=false, stripignore::Bool=false, rejectna::Bool=false, newline2ls::Bool=false, newline2ps::Bool=false, newline2lf::Bool=false, stripcc::Bool=false, casefold::Bool=false, lump::Bool=false, stripmark::Bool=false) flags = 0 stable && (flags = flags | UTF8PROC_STABLE) compat && (flags = flags | UTF8PROC_COMPAT) @@ -173,7 +173,7 @@ function normalize_string(s::AbstractString; stable::Bool=false, compat::Bool=fa end """ - normalize_string(s::AbstractString, normalform::Symbol) + Unicode.normalize(s::AbstractString, normalform::Symbol) Normalize the string `s` according to one of the four "normal forms" of the Unicode standard: `normalform` can be `:NFC`, `:NFD`, `:NFKC`, or `:NFKD`. Normal forms C @@ -185,7 +185,7 @@ canonical choice (e.g. they expand ligatures into the individual characters), wi being more compact. 
Alternatively, finer control and additional transformations may be be obtained by calling -`normalize_string(s; keywords...)`, where any number of the following boolean keywords +`Unicode.normalize(s; keywords...)`, where any number of the following boolean keywords options (which all default to `false` except for `compose`) are specified: * `compose=false`: do not perform canonical composition @@ -211,17 +211,17 @@ For example, NFKC corresponds to the options `compose=true, compat=true, stable= ```jldoctest julia> using Unicode -julia> "μ" == normalize_string("µ", compat=true) #LHS: Unicode U+03bc, RHS: Unicode U+00b5 +julia> "μ" == normalize("µ", compat=true) #LHS: Unicode U+03bc, RHS: Unicode U+00b5 true -julia> normalize_string("JuLiA", casefold=true) +julia> normalize("JuLiA", casefold=true) "julia" -julia> normalize_string("JúLiA", stripmark=true) +julia> normalize("JúLiA", stripmark=true) "JuLiA" ``` """ -function normalize_string(s::AbstractString, nf::Symbol) +function normalize(s::AbstractString, nf::Symbol) utf8proc_map(s, nf == :NFC ? (UTF8PROC_STABLE | UTF8PROC_COMPOSE) : nf == :NFD ? (UTF8PROC_STABLE | UTF8PROC_DECOMPOSE) : nf == :NFKC ? (UTF8PROC_STABLE | UTF8PROC_COMPOSE @@ -281,7 +281,7 @@ category_abbrev(c) = unsafe_string(ccall(:utf8proc_category_string, Cstring, (UI category_string(c) = category_strings[category_code(c)+1] """ - is_assigned_char(c) -> Bool + Unicode.isassigned(c) -> Bool Returns `true` if the given char or integer is an assigned Unicode code point. @@ -289,14 +289,14 @@ Returns `true` if the given char or integer is an assigned Unicode code point. 
```jldoctest julia> using Unicode -julia> is_assigned_char(101) +julia> isassigned(101) true -julia> is_assigned_char('\\x01') +julia> isassigned('\\x01') true ``` """ -is_assigned_char(c) = category_code(c) != UTF8PROC_CATEGORY_CN +isassigned(c) = category_code(c) != UTF8PROC_CATEGORY_CN ## libc character class predicates ## @@ -354,7 +354,7 @@ end """ isdigit(c::Char) -> Bool -Tests whether a character is a numeric digit (0-9). +Tests whether a character is a decimal digit (0-9). # Examples ```jldoctest @@ -396,27 +396,33 @@ false isalpha(c::Char) = (UTF8PROC_CATEGORY_LU <= category_code(c) <= UTF8PROC_CATEGORY_LO) """ - isnumber(c::Char) -> Bool + isnumeric(c::Char) -> Bool Tests whether a character is numeric. A character is classified as numeric if it belongs to the Unicode general category Number, i.e. a character whose category code begins with 'N'. +Note that this broad category includes characters such as ¾ and ௰. +Use [`isdigit`](@ref) to check whether a character is a decimal digit between 0 and 9. + # Examples ```jldoctest julia> using Unicode -julia> isnumber('9') +julia> isnumeric('௰') +true + +julia> isnumeric('9') true -julia> isnumber('α') +julia> isnumeric('α') false -julia> isnumber('❤') +julia> isnumeric('❤') false ``` """ -isnumber(c::Char) = (UTF8PROC_CATEGORY_ND <= category_code(c) <= UTF8PROC_CATEGORY_NO) +isnumeric(c::Char) = (UTF8PROC_CATEGORY_ND <= category_code(c) <= UTF8PROC_CATEGORY_NO) """ isalnum(c::Char) -> Bool diff --git a/doc/src/manual/faq.md b/doc/src/manual/faq.md index ebcae376c8446..119777b343476 100644 --- a/doc/src/manual/faq.md +++ b/doc/src/manual/faq.md @@ -617,8 +617,8 @@ all/many future usages of the other functions in module Foo that depend on calli Unlike many languages (for example, C and Java), Julia does not have a "null" value. When a reference (variable, object field, or array element) is uninitialized, accessing it will immediately throw -an error.
This situation can be detected using the [`isdefined`](@ref) or [`isassigned`](@ref) -functions. +an error. This situation can be detected using the [`isdefined`](@ref) or +[`isassigned`](@ref Base.isassigned) functions. Some functions are used only for their side effects, and do not need to return a value. In these cases, the convention is to return the value `nothing`, which is just a singleton object of type @@ -627,7 +627,7 @@ this convention, and that the REPL does not print anything for it. Some language would not otherwise have a value also yield `nothing`, for example `if false; end`. To represent missing data in the statistical sense (`NA` in R or `NULL` in SQL), use the -[`missing`](@ref) object. See the [`Missing Values|](@ref missing) section for more details. +[`missing`](@ref) object. See the [`Missing Values`](@ref missing) section for more details. The empty tuple (`()`) is another form of nothingness. But, it should not really be thought of as nothing but rather a tuple of zero values. 
diff --git a/stdlib/Unicode/docs/src/index.md b/stdlib/Unicode/docs/src/index.md index 86ab0d7383256..4f519aa0bc05e 100644 --- a/stdlib/Unicode/docs/src/index.md +++ b/stdlib/Unicode/docs/src/index.md @@ -1,8 +1,8 @@ # Unicode ```@docs -Unicode.is_assigned_char -Unicode.normalize_string +Unicode.isassigned +Unicode.normalize Unicode.graphemes Unicode.uppercase Unicode.lowercase @@ -16,7 +16,7 @@ Unicode.iscntrl Unicode.isdigit Unicode.isgraph Unicode.islower -Unicode.isnumber +Unicode.isnumeric Unicode.isprint Unicode.ispunct Unicode.isspace diff --git a/stdlib/Unicode/src/Unicode.jl b/stdlib/Unicode/src/Unicode.jl index a320c565bcd91..82e2801eb32b2 100644 --- a/stdlib/Unicode/src/Unicode.jl +++ b/stdlib/Unicode/src/Unicode.jl @@ -4,14 +4,23 @@ __precompile__(true) module Unicode -using Base.Unicode: normalize_string, graphemes, is_assigned_char, textwidth, isvalid, - islower, isupper, isalpha, isdigit, isxdigit, isnumber, isalnum, +using Base.Unicode: normalize, graphemes, isassigned, textwidth, isvalid, + islower, isupper, isalpha, isdigit, isxdigit, isnumeric, isalnum, iscntrl, ispunct, isspace, isprint, isgraph, lowercase, uppercase, titlecase, lcfirst, ucfirst -export normalize_string, graphemes, is_assigned_char, textwidth, isvalid, - islower, isupper, isalpha, isdigit, isxdigit, isnumber, isalnum, +export normalize, graphemes, isassigned, textwidth, isvalid, + islower, isupper, isalpha, isdigit, isxdigit, isnumeric, isalnum, iscntrl, ispunct, isspace, isprint, isgraph, lowercase, uppercase, titlecase, lcfirst, ucfirst +# BEGIN 0.7 deprecations + +@deprecate isnumber(c::Char) Unicode.isnumeric(c) +@deprecate is_assigned_char(c::Char) Unicode.isassigned(c) +@deprecate normalize_string(s::AbstractString, nf::Symbol; kwargs...) Unicode.normalize(s, nf; kwargs...) +@deprecate normalize_string(s::AbstractString; kwargs...) Unicode.normalize(s; kwargs...) 
+ +# END 0.7 deprecations + end diff --git a/stdlib/Unicode/test/runtests.jl b/stdlib/Unicode/test/runtests.jl index 402807d075061..1a332e638a971 100644 --- a/stdlib/Unicode/test/runtests.jl +++ b/stdlib/Unicode/test/runtests.jl @@ -2,27 +2,28 @@ using Test using Unicode +using Unicode: normalize, isassigned @testset "string normalization" begin - # normalize_string (Unicode normalization etc.): - @test normalize_string("\u006e\u0303", :NFC) == "\u00f1" - @test "\u006e\u0303" == normalize_string("\u00f1", :NFD) - @test normalize_string("\ufb00", :NFC) != "ff" - @test normalize_string("\ufb00", :NFKC) == "ff" - @test normalize_string("\u006e\u0303\ufb00", :NFKC) == "\u00f1"*"ff" - @test normalize_string("\u00f1\ufb00", :NFKD) == "\u006e\u0303"*"ff" - @test normalize_string("\u006e\u0303", compose=true) == "\u00f1" - @test "\u006e\u0303" == normalize_string("\u00f1", decompose=true) - @test normalize_string("\u006e\u0303\u00b5",compat=true) == "\u00f1\u03bc" - @test normalize_string("Σσς",casefold=true) == "σσσ" - @test normalize_string("∕⁄", lump=true) == "//" - @test normalize_string("\ua\n\r\r\ua", newline2lf=true) == "\ua\ua\ua\ua" - @test normalize_string("\ua\n\r\r\ua", newline2ls=true) == "\u2028\u2028\u2028\u2028" - @test normalize_string("\ua\n\r\r\ua", newline2ps=true) == "\u2029\u2029\u2029\u2029" - @test normalize_string("\u00f1", stripmark=true) == "n" - @test isempty(normalize_string("\u00ad", stripignore=true)) - @test normalize_string("\t\r", stripcc=true) == " " - @test normalize_string("\t\r", stripcc=true, newline2ls=true) == " \u2028" + # normalize (Unicode normalization etc.): + @test normalize("\u006e\u0303", :NFC) == "\u00f1" + @test "\u006e\u0303" == normalize("\u00f1", :NFD) + @test normalize("\ufb00", :NFC) != "ff" + @test normalize("\ufb00", :NFKC) == "ff" + @test normalize("\u006e\u0303\ufb00", :NFKC) == "\u00f1"*"ff" + @test normalize("\u00f1\ufb00", :NFKD) == "\u006e\u0303"*"ff" + @test normalize("\u006e\u0303", compose=true) == "\u00f1" 
+ @test "\u006e\u0303" == normalize("\u00f1", decompose=true) + @test normalize("\u006e\u0303\u00b5",compat=true) == "\u00f1\u03bc" + @test normalize("Σσς",casefold=true) == "σσσ" + @test normalize("∕⁄", lump=true) == "//" + @test normalize("\ua\n\r\r\ua", newline2lf=true) == "\ua\ua\ua\ua" + @test normalize("\ua\n\r\r\ua", newline2ls=true) == "\u2028\u2028\u2028\u2028" + @test normalize("\ua\n\r\r\ua", newline2ps=true) == "\u2029\u2029\u2029\u2029" + @test normalize("\u00f1", stripmark=true) == "n" + @test isempty(normalize("\u00ad", stripignore=true)) + @test normalize("\t\r", stripcc=true) == " " + @test normalize("\t\r", stripcc=true, newline2ls=true) == " \u2028" end @testset "unicode sa#15" begin @@ -30,7 +31,7 @@ end #http://www.unicode.org/reports/tr15/ @testset "canonical equivalence" begin - let ==(a::Array{Char},b::Array{Char}) = normalize_string(string(a...), :NFC)==normalize_string(string(b...), :NFC) + let ==(a::Array{Char},b::Array{Char}) = normalize(string(a...), :NFC)==normalize(string(b...), :NFC) ==(a,b) = Base.:(==)(a,b) @test ['C', '̧'] == ['Ç'] @test ['q', '̇', '̣'] == ['q', '̣', '̇'] @@ -40,7 +41,7 @@ end end @testset "compatibility equivalence" begin - let ==(a::Array{Char},b::Array{Char}) = normalize_string(string(a...), :NFKC)==normalize_string(string(b...), :NFKC) + let ==(a::Array{Char},b::Array{Char}) = normalize(string(a...), :NFKC)==normalize(string(b...), :NFKC) ==(a,b) = Base.:(==)(a,b) @test ['ℌ'] == ['ℍ'] == ['H'] @test ['ﻨ'] == ['ﻧ'] == ['ﻦ'] == ['ﻥ'] @@ -55,36 +56,36 @@ end end @testset "singletons" begin - @test normalize_string("\U212b", :NFD) == "A\U030a" - @test normalize_string("\U212b", :NFC) == "\U00c5" - @test normalize_string("\U2126", :NFC) == normalize_string("\U2126", :NFD) == "\U03a9" + @test normalize("\U212b", :NFD) == "A\U030a" + @test normalize("\U212b", :NFC) == "\U00c5" + @test normalize("\U2126", :NFC) == normalize("\U2126", :NFD) == "\U03a9" end @testset "canonical composites" begin - @test 
normalize_string("\U00c5", :NFC) == "\U00c5" - @test normalize_string("\U00c5", :NFD) == "A\U030a" - @test normalize_string("\U00f4", :NFC) == "\U00f4" - @test normalize_string("\U00f4", :NFD) == "o\U0302" + @test normalize("\U00c5", :NFC) == "\U00c5" + @test normalize("\U00c5", :NFD) == "A\U030a" + @test normalize("\U00f4", :NFC) == "\U00f4" + @test normalize("\U00f4", :NFD) == "o\U0302" end @testset "multiple combining marks" begin - @test normalize_string("\U1e69", :NFD) == "s\U0323\U0307" - @test normalize_string("\U1e69", :NFC) == "\U1e69" - @test normalize_string("\U1e0b\U0323", :NFD) == "d\U0323\U0307" - @test normalize_string("\U1e0b\U0323", :NFC) == "\U1e0d\U0307" - @test normalize_string("q\U0307\U0323", :NFC) == "q\U0323\U0307" - @test normalize_string("q\U0307\U0323", :NFD) == "q\U0323\U0307" + @test normalize("\U1e69", :NFD) == "s\U0323\U0307" + @test normalize("\U1e69", :NFC) == "\U1e69" + @test normalize("\U1e0b\U0323", :NFD) == "d\U0323\U0307" + @test normalize("\U1e0b\U0323", :NFC) == "\U1e0d\U0307" + @test normalize("q\U0307\U0323", :NFC) == "q\U0323\U0307" + @test normalize("q\U0307\U0323", :NFD) == "q\U0323\U0307" end @testset "compatibility composites" begin - @test normalize_string("\Ufb01", :NFD) == normalize_string("\Ufb01", :NFC) == "\Ufb01" - @test normalize_string("\Ufb01", :NFKD) == normalize_string("\Ufb01", :NFKC) == "fi" - @test normalize_string("2\U2075", :NFD) == normalize_string("2\U2075", :NFC) == "2\U2075" - @test normalize_string("2\U2075", :NFKD) == normalize_string("2\U2075", :NFKC) == "25" - @test normalize_string("\U1e9b\U0323", :NFD) == "\U017f\U0323\U0307" - @test normalize_string("\U1e9b\U0323", :NFC) == "\U1e9b\U0323" - @test normalize_string("\U1e9b\U0323", :NFKD) == "s\U0323\U0307" - @test normalize_string("\U1e9b\U0323", :NFKC) == "\U1e69" + @test normalize("\Ufb01", :NFD) == normalize("\Ufb01", :NFC) == "\Ufb01" + @test normalize("\Ufb01", :NFKD) == normalize("\Ufb01", :NFKC) == "fi" + @test normalize("2\U2075", 
:NFD) == normalize("2\U2075", :NFC) == "2\U2075" + @test normalize("2\U2075", :NFKD) == normalize("2\U2075", :NFKC) == "25" + @test normalize("\U1e9b\U0323", :NFD) == "\U017f\U0323\U0307" + @test normalize("\U1e9b\U0323", :NFC) == "\U1e9b\U0323" + @test normalize("\U1e9b\U0323", :NFKD) == "s\U0323\U0307" + @test normalize("\U1e9b\U0323", :NFKC) == "\U1e69" end end @@ -95,7 +96,7 @@ end @test islower(c) == true @test isupper(c) == false @test isdigit(c) == false - @test isnumber(c) == false + @test isnumeric(c) == false end aupper=['A', 'D', 'J', 'Y', 'Z'] @@ -105,7 +106,7 @@ end @test islower(c) == false @test isupper(c) == true @test isdigit(c) == false - @test isnumber(c) == false + @test isnumeric(c) == false end nocase=['א','ﺵ'] @@ -113,7 +114,7 @@ end for c in alphas @test isalpha(c) == true - @test isnumber(c) == false + @test isnumeric(c) == false end anumber=['0', '1', '5', '9'] @@ -121,11 +122,11 @@ end for c in anumber @test isdigit(c) == true - @test isnumber(c) == true + @test isnumeric(c) == true end for c in unumber @test isdigit(c) == false - @test isnumber(c) == true + @test isnumeric(c) == true end alnums=vcat(alphas,anumber,unumber) @@ -200,7 +201,7 @@ end @test !all(isgraph," \t \n \r ") @test !all(isprint," \t \n \r ") @test !all(isalpha," \t \n \r ") - @test !all(isnumber," \t \n \r ") + @test !all(isnumeric," \t \n \r ") @test !all(ispunct," \t \n \r ") @test !all(isspace,"ΣβΣβ") @@ -209,11 +210,11 @@ end @test all(isprint,"ΣβΣβ") @test !all(isupper,"ΣβΣβ") @test !all(islower,"ΣβΣβ") - @test !all(isnumber,"ΣβΣβ") + @test !all(isnumeric,"ΣβΣβ") @test !all(iscntrl,"ΣβΣβ") @test !all(ispunct,"ΣβΣβ") - @test all(isnumber,"23435") + @test all(isnumeric,"23435") @test all(isdigit,"23435") @test all(isalnum,"23435") @test !all(isalpha,"23435") @@ -249,8 +250,8 @@ end for T in (String,GenericString) for nf in (:NFC, :NFD) for (s, g) in grphtest - s_ = T(normalize_string(s, nf)) - g_ = map(s -> normalize_string(s, nf), g) + s_ = T(normalize(s, nf)) + 
g_ = map(s -> normalize(s, nf), g) # #9261 if length(s_) > 0 @test typeof(first(graphemes(s_))) == SubString{typeof(s_)} @@ -260,7 +261,7 @@ end @test grph == g_ @test length(graphemes(s_)) == length(grph) end - S = [T(normalize_string(s)) for (s,g) in grphtest] + S = [T(normalize(s)) for (s,g) in grphtest] G = map(graphemes, S) @test map(graphemes, sort!(S)) == sort!(G) end @@ -280,23 +281,23 @@ end @testset "#10958 handling of embedded NUL chars" begin @test length("\0w") == length("\0α") == 2 @test textwidth("\0w") == textwidth("\0α") == 1 - @test normalize_string("\0W", casefold=true) == "\0w" + @test normalize("\0W", casefold=true) == "\0w" end @testset "ut8proc_map with GenericString" begin - @test normalize_string(GenericString("\u006e\u0303"), :NFC) == "\u00f1" + @test normalize(GenericString("\u006e\u0303"), :NFC) == "\u00f1" end -@testset "normalize_string keywords" begin - @test_throws ArgumentError normalize_string("\u006e\u0303", compose=false, compat=true) - @test_throws ArgumentError normalize_string("\u006e\u0303", compose=false, stripmark=true) +@testset "normalize keywords" begin + @test_throws ArgumentError normalize("\u006e\u0303", compose=false, compat=true) + @test_throws ArgumentError normalize("\u006e\u0303", compose=false, stripmark=true) end @testset "fastplus" begin @test lowercase('A') == 'a' @test uppercase('a') == 'A' - @test is_assigned_char('A') + @test isassigned('A') end @testset "isspace" begin diff --git a/test/strings/basic.jl b/test/strings/basic.jl index 8e4073a138cac..1ad8ca9184e74 100644 --- a/test/strings/basic.jl +++ b/test/strings/basic.jl @@ -278,7 +278,7 @@ end for T in [BigInt, Int8, UInt8, Int16, UInt16, Int32, UInt32, Int64, UInt64, Int128, UInt128, Float64, Float32] @test isnull(tryparse(T, "1\0")) end - let s = Base.Unicode.normalize_string("tést",:NFKC) + let s = Base.Unicode.normalize("tést",:NFKC) @test unsafe_string(Base.unsafe_convert(Cstring, Base.cconvert(Cstring, s))) == s @test 
unsafe_string(convert(Cstring, Symbol(s))) == s end