Skip to content

Commit d62b2e7

Browse files
committed
Move docstrings to Unicode stdlib module
Functions need to be defined in Unicode module, as reexporting Base.Unicode functions makes them appear with the latter prefix in the manual.
1 parent a1bec2a commit d62b2e7

File tree

3 files changed

+92
-81
lines changed

3 files changed

+92
-81
lines changed

base/strings/substring.jl

+1-1
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ pointer(x::SubString{String}, i::Integer) = pointer(x.string) + x.offset + (i-1)
107107
Reverses a string. Technically, this function reverses the codepoints in a string and its
108108
main utility is for reversed-order string processing, especially for reversed
109109
regular-expression searches. See also [`reverseind`](@ref) to convert indices in `s` to
110-
indices in `reverse(s)` and vice-versa, and [`Unicode.graphemes`](@ref Base.Unicode.graphemes) to
110+
indices in `reverse(s)` and vice-versa, and `graphemes` from module `Unicode` to
111111
operate on user-visible "characters" (graphemes) rather than codepoints.
112112
See also [`Iterators.reverse`](@ref) for
113113
reverse-order iteration without making a copy. Custom string types must implement the

base/strings/unicode.jl

+3-78
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,7 @@ end
150150

151151
utf8proc_map(s::AbstractString, flags::Integer) = utf8proc_map(String(s), flags)
152152

153+
# Documented in Unicode module
153154
function normalize(
154155
s::AbstractString;
155156
stable::Bool=false,
@@ -190,55 +191,6 @@ function normalize(
190191
utf8proc_map(s, flags)
191192
end
192193

193-
"""
194-
Unicode.normalize(s::AbstractString, normalform::Symbol)
195-
196-
Normalize the string `s` according to one of the four "normal forms" of the Unicode
197-
standard: `normalform` can be `:NFC`, `:NFD`, `:NFKC`, or `:NFKD`. Normal forms C
198-
(canonical composition) and D (canonical decomposition) convert different visually identical
199-
representations of the same abstract string into a single canonical form, with form C being
200-
more compact. Normal forms KC and KD additionally canonicalize "compatibility equivalents":
201-
they convert characters that are abstractly similar but visually distinct into a single
202-
canonical choice (e.g. they expand ligatures into the individual characters), with form KC
203-
being more compact.
204-
205-
Alternatively, finer control and additional transformations may be be obtained by calling
206-
`Unicode.normalize(s; keywords...)`, where any number of the following boolean keywords
207-
options (which all default to `false` except for `compose`) are specified:
208-
209-
* `compose=false`: do not perform canonical composition
210-
* `decompose=true`: do canonical decomposition instead of canonical composition
211-
(`compose=true` is ignored if present)
212-
* `compat=true`: compatibility equivalents are canonicalized
213-
* `casefold=true`: perform Unicode case folding, e.g. for case-insensitive string comparison
214-
* `newline2lf=true`, `newline2ls=true`, or `newline2ps=true`: convert various newline
215-
sequences (LF, CRLF, CR, NEL) into a linefeed (LF), line-separation (LS), or
216-
paragraph-separation (PS) character, respectively
217-
* `stripmark=true`: strip diacritical marks (e.g. accents)
218-
* `stripignore=true`: strip Unicode's "default ignorable" characters (e.g. the soft hyphen
219-
or the left-to-right marker)
220-
* `stripcc=true`: strip control characters; horizontal tabs and form feeds are converted to
221-
spaces; newlines are also converted to spaces unless a newline-conversion flag was
222-
specified
223-
* `rejectna=true`: throw an error if unassigned code points are found
224-
* `stable=true`: enforce Unicode Versioning Stability
225-
226-
For example, NFKC corresponds to the options `compose=true, compat=true, stable=true`.
227-
228-
# Examples
229-
```jldoctest
230-
julia> using Unicode
231-
232-
julia> "μ" == normalize("µ", compat=true) #LHS: Unicode U+03bc, RHS: Unicode U+00b5
233-
true
234-
235-
julia> normalize("JuLiA", casefold=true)
236-
"julia"
237-
238-
julia> normalize("JúLiA", stripmark=true)
239-
"JuLiA"
240-
```
241-
"""
242194
function normalize(s::AbstractString, nf::Symbol)
243195
utf8proc_map(s, nf == :NFC ? (UTF8PROC_STABLE | UTF8PROC_COMPOSE) :
244196
nf == :NFD ? (UTF8PROC_STABLE | UTF8PROC_DECOMPOSE) :
@@ -311,22 +263,6 @@ end
311263

312264
category_string(c) = category_strings[category_code(c)+1]
313265

314-
"""
315-
Unicode.isassigned(c) -> Bool
316-
317-
Returns `true` if the given char or integer is an assigned Unicode code point.
318-
319-
# Examples
320-
```jldoctest
321-
julia> using Unicode
322-
323-
julia> Unicode.isassigned(101)
324-
true
325-
326-
julia> Unicode.isassigned('\\x01')
327-
true
328-
```
329-
"""
330266
isassigned(c) = UTF8PROC_CATEGORY_CN < category_code(c) <= UTF8PROC_CATEGORY_CO
331267

332268
## libc character class predicates ##
@@ -378,11 +314,7 @@ function isupper(c::Char)
378314
cat == UTF8PROC_CATEGORY_LU || cat == UTF8PROC_CATEGORY_LT
379315
end
380316

381-
"""
382-
iscased(c::Char) -> Bool
383-
384-
Tests whether a character is cased, i.e. is lower-, upper- or title-cased.
385-
"""
317+
# Documented in Unicode module
386318
function iscased(c::Char)
387319
cat = category_code(c)
388320
return cat == UTF8PROC_CATEGORY_LU ||
@@ -696,14 +628,7 @@ struct GraphemeIterator{S<:AbstractString}
696628
s::S # original string (for generation of SubStrings)
697629
end
698630

699-
"""
700-
graphemes(s::AbstractString) -> GraphemeIterator
701-
702-
Returns an iterator over substrings of `s` that correspond to the extended graphemes in the
703-
string, as defined by Unicode UAX #29. (Roughly, these are what users would perceive as
704-
single characters, even though they may contain more than one codepoint; for example a
705-
letter combined with an accent mark is a single grapheme.)
706-
"""
631+
# Documented in Unicode module
707632
graphemes(s::AbstractString) = GraphemeIterator{typeof(s)}(s)
708633

709634
eltype(::Type{GraphemeIterator{S}}) where {S} = SubString{S}

stdlib/Unicode/src/Unicode.jl

+88-2
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,96 @@ __precompile__(true)
44

55
module Unicode
66

7-
using Base.Unicode: normalize, graphemes, isassigned, iscased
8-
97
export graphemes
108

9+
"""
10+
Unicode.normalize(s::AbstractString, normalform::Symbol)
11+
12+
Normalize the string `s` according to one of the four "normal forms" of the Unicode
13+
standard: `normalform` can be `:NFC`, `:NFD`, `:NFKC`, or `:NFKD`. Normal forms C
14+
(canonical composition) and D (canonical decomposition) convert different visually identical
15+
representations of the same abstract string into a single canonical form, with form C being
16+
more compact. Normal forms KC and KD additionally canonicalize "compatibility equivalents":
17+
they convert characters that are abstractly similar but visually distinct into a single
18+
canonical choice (e.g. they expand ligatures into the individual characters), with form KC
19+
being more compact.
20+
21+
Alternatively, finer control and additional transformations may be obtained by calling
22+
`Unicode.normalize(s; keywords...)`, where any number of the following boolean keyword
23+
options (which all default to `false` except for `compose`) are specified:
24+
25+
* `compose=false`: do not perform canonical composition
26+
* `decompose=true`: do canonical decomposition instead of canonical composition
27+
(`compose=true` is ignored if present)
28+
* `compat=true`: compatibility equivalents are canonicalized
29+
* `casefold=true`: perform Unicode case folding, e.g. for case-insensitive string comparison
30+
* `newline2lf=true`, `newline2ls=true`, or `newline2ps=true`: convert various newline
31+
sequences (LF, CRLF, CR, NEL) into a linefeed (LF), line-separation (LS), or
32+
paragraph-separation (PS) character, respectively
33+
* `stripmark=true`: strip diacritical marks (e.g. accents)
34+
* `stripignore=true`: strip Unicode's "default ignorable" characters (e.g. the soft hyphen
35+
or the left-to-right marker)
36+
* `stripcc=true`: strip control characters; horizontal tabs and form feeds are converted to
37+
spaces; newlines are also converted to spaces unless a newline-conversion flag was
38+
specified
39+
* `rejectna=true`: throw an error if unassigned code points are found
40+
* `stable=true`: enforce Unicode Versioning Stability
41+
42+
For example, NFKC corresponds to the options `compose=true, compat=true, stable=true`.
43+
44+
# Examples
45+
```jldoctest
46+
julia> using Unicode
47+
48+
julia> "μ" == normalize("µ", compat=true) #LHS: Unicode U+03bc, RHS: Unicode U+00b5
49+
true
50+
51+
julia> normalize("JuLiA", casefold=true)
52+
"julia"
53+
54+
julia> normalize("JúLiA", stripmark=true)
55+
"JuLiA"
56+
```
57+
"""
58+
function normalize end
59+
# Forward to the Base implementation; the method lives here so the docstring above is attached to `Unicode.normalize`.
normalize(s::AbstractString, nf::Symbol) = Base.Unicode.normalize(s, nf)
60+
# Keyword form: pass every option through unchanged to the Base implementation.
normalize(s::AbstractString; kwargs...) = Base.Unicode.normalize(s; kwargs...)
61+
62+
"""
63+
Unicode.isassigned(c) -> Bool
64+
65+
Returns `true` if the given char or integer is an assigned Unicode code point.
66+
67+
# Examples
68+
```jldoctest
69+
julia> using Unicode
70+
71+
julia> Unicode.isassigned(101)
72+
true
73+
74+
julia> Unicode.isassigned('\\x01')
75+
true
76+
```
77+
"""
78+
# Thin forwarder to `Base.Unicode.isassigned`, defined here so the manual shows it under the `Unicode` prefix.
isassigned(c) = Base.Unicode.isassigned(c)
79+
80+
"""
81+
iscased(c::Char) -> Bool
82+
83+
Tests whether a character is cased, i.e. is lower-, upper- or title-cased.
84+
"""
85+
# Thin forwarder to `Base.Unicode.iscased`, defined here so the manual shows it under the `Unicode` prefix.
iscased(c::Char) = Base.Unicode.iscased(c)
86+
87+
"""
88+
graphemes(s::AbstractString) -> GraphemeIterator
89+
90+
Returns an iterator over substrings of `s` that correspond to the extended graphemes in the
91+
string, as defined by Unicode UAX #29. (Roughly, these are what users would perceive as
92+
single characters, even though they may contain more than one codepoint; for example a
93+
letter combined with an accent mark is a single grapheme.)
94+
"""
95+
# Builds the iterator type owned by `Base.Unicode` directly; defined here so the docstring belongs to `Unicode.graphemes`.
graphemes(s::AbstractString) = Base.Unicode.GraphemeIterator{typeof(s)}(s)
96+
1197
# BEGIN 0.7 deprecations
1298

1399
@deprecate is_assigned_char(c::Char) Unicode.isassigned(c)

0 commit comments

Comments
 (0)