|
150 | 150 |
|
151 | 151 | utf8proc_map(s::AbstractString, flags::Integer) = utf8proc_map(String(s), flags)
|
152 | 152 |
|
| 153 | +# Documented in Unicode module |
153 | 154 | function normalize(
|
154 | 155 | s::AbstractString;
|
155 | 156 | stable::Bool=false,
|
@@ -190,55 +191,6 @@ function normalize(
|
190 | 191 | utf8proc_map(s, flags)
|
191 | 192 | end
|
192 | 193 |
|
193 |
| -""" |
194 |
| - Unicode.normalize(s::AbstractString, normalform::Symbol) |
195 |
| -
|
196 |
| -Normalize the string `s` according to one of the four "normal forms" of the Unicode |
197 |
| -standard: `normalform` can be `:NFC`, `:NFD`, `:NFKC`, or `:NFKD`. Normal forms C |
198 |
| -(canonical composition) and D (canonical decomposition) convert different visually identical |
199 |
| -representations of the same abstract string into a single canonical form, with form C being |
200 |
| -more compact. Normal forms KC and KD additionally canonicalize "compatibility equivalents": |
201 |
| -they convert characters that are abstractly similar but visually distinct into a single |
202 |
| -canonical choice (e.g. they expand ligatures into the individual characters), with form KC |
203 |
| -being more compact. |
204 |
| -
|
205 |
| -Alternatively, finer control and additional transformations may be obtained by calling |
206 |
| -`Unicode.normalize(s; keywords...)`, where any number of the following boolean keyword |
207 |
| -options (which all default to `false` except for `compose`) are specified: |
208 |
| -
|
209 |
| -* `compose=false`: do not perform canonical composition |
210 |
| -* `decompose=true`: do canonical decomposition instead of canonical composition |
211 |
| - (`compose=true` is ignored if present) |
212 |
| -* `compat=true`: compatibility equivalents are canonicalized |
213 |
| -* `casefold=true`: perform Unicode case folding, e.g. for case-insensitive string comparison |
214 |
| -* `newline2lf=true`, `newline2ls=true`, or `newline2ps=true`: convert various newline |
215 |
| - sequences (LF, CRLF, CR, NEL) into a linefeed (LF), line-separation (LS), or |
216 |
| - paragraph-separation (PS) character, respectively |
217 |
| -* `stripmark=true`: strip diacritical marks (e.g. accents) |
218 |
| -* `stripignore=true`: strip Unicode's "default ignorable" characters (e.g. the soft hyphen |
219 |
| - or the left-to-right marker) |
220 |
| -* `stripcc=true`: strip control characters; horizontal tabs and form feeds are converted to |
221 |
| - spaces; newlines are also converted to spaces unless a newline-conversion flag was |
222 |
| - specified |
223 |
| -* `rejectna=true`: throw an error if unassigned code points are found |
224 |
| -* `stable=true`: enforce Unicode Versioning Stability |
225 |
| -
|
226 |
| -For example, NFKC corresponds to the options `compose=true, compat=true, stable=true`. |
227 |
| -
|
228 |
| -# Examples |
229 |
| -```jldoctest |
230 |
| -julia> using Unicode |
231 |
| -
|
232 |
| -julia> "μ" == normalize("µ", compat=true) #LHS: Unicode U+03bc, RHS: Unicode U+00b5 |
233 |
| -true |
234 |
| -
|
235 |
| -julia> normalize("JuLiA", casefold=true) |
236 |
| -"julia" |
237 |
| -
|
238 |
| -julia> normalize("JúLiA", stripmark=true) |
239 |
| -"JuLiA" |
240 |
| -``` |
241 |
| -""" |
242 | 194 | function normalize(s::AbstractString, nf::Symbol)
|
243 | 195 | utf8proc_map(s, nf == :NFC ? (UTF8PROC_STABLE | UTF8PROC_COMPOSE) :
|
244 | 196 | nf == :NFD ? (UTF8PROC_STABLE | UTF8PROC_DECOMPOSE) :
|
|
311 | 263 |
|
312 | 264 | category_string(c) = category_strings[category_code(c)+1]
|
313 | 265 |
|
314 |
| -""" |
315 |
| - Unicode.isassigned(c) -> Bool |
316 |
| -
|
317 |
| -Returns `true` if the given char or integer is an assigned Unicode code point. |
318 |
| -
|
319 |
| -# Examples |
320 |
| -```jldoctest |
321 |
| -julia> using Unicode |
322 |
| -
|
323 |
| -julia> Unicode.isassigned(101) |
324 |
| -true |
325 |
| -
|
326 |
| -julia> Unicode.isassigned('\\x01') |
327 |
| -true |
328 |
| -``` |
329 |
| -""" |
330 | 266 | isassigned(c) = UTF8PROC_CATEGORY_CN < category_code(c) <= UTF8PROC_CATEGORY_CO
|
331 | 267 |
|
332 | 268 | ## libc character class predicates ##
|
@@ -378,11 +314,7 @@ function isupper(c::Char)
|
378 | 314 | cat == UTF8PROC_CATEGORY_LU || cat == UTF8PROC_CATEGORY_LT
|
379 | 315 | end
|
380 | 316 |
|
381 |
| -""" |
382 |
| - iscased(c::Char) -> Bool |
383 |
| -
|
384 |
| -Tests whether a character is cased, i.e. is lower-, upper- or title-cased. |
385 |
| -""" |
| 317 | +# Documented in Unicode module |
386 | 318 | function iscased(c::Char)
|
387 | 319 | cat = category_code(c)
|
388 | 320 | return cat == UTF8PROC_CATEGORY_LU ||
|
@@ -696,14 +628,7 @@ struct GraphemeIterator{S<:AbstractString}
|
696 | 628 | s::S # original string (for generation of SubStrings)
|
697 | 629 | end
|
698 | 630 |
|
699 |
| -""" |
700 |
| - graphemes(s::AbstractString) -> GraphemeIterator |
701 |
| -
|
702 |
| -Returns an iterator over substrings of `s` that correspond to the extended graphemes in the |
703 |
| -string, as defined by Unicode UAX #29. (Roughly, these are what users would perceive as |
704 |
| -single characters, even though they may contain more than one codepoint; for example a |
705 |
| -letter combined with an accent mark is a single grapheme.) |
706 |
| -""" |
| 631 | +# Documented in Unicode module |
707 | 632 | graphemes(s::AbstractString) = GraphemeIterator{typeof(s)}(s)
|
708 | 633 |
|
709 | 634 | eltype(::Type{GraphemeIterator{S}}) where {S} = SubString{S}
|
|
0 commit comments