From 3ad378439260b64c7ff15ce2c44fa2dc052b9460 Mon Sep 17 00:00:00 2001 From: Stefan Karpinski Date: Wed, 25 May 2016 17:24:42 -0400 Subject: [PATCH 1/7] remove UTF-16 and UTF-32 string types and functions --- base/deprecated.jl | 10 -- base/docs/helpdb/Base.jl | 52 ------- base/exports.jl | 4 - base/replutil.jl | 12 +- base/serialize.jl | 5 +- base/strings/io.jl | 19 +++ base/strings/string.jl | 103 +------------- base/sysimg.jl | 1 - base/test.jl | 1 + base/unicode/checkstring.jl | 238 ------------------------------- base/unicode/types.jl | 34 ----- base/unicode/unicode.jl | 6 - base/unicode/utf16.jl | 275 ------------------------------------ base/unicode/utf32.jl | 195 ------------------------- test/serialize.jl | 4 +- test/strings/basic.jl | 68 +-------- test/strings/io.jl | 70 --------- test/strings/types.jl | 44 +++--- test/strings/util.jl | 2 +- test/unicode.jl | 4 - test/unicode/utf16.jl | 23 --- test/unicode/utf32.jl | 258 --------------------------------- test/unicode/utf8.jl | 6 +- test/unicode/utf8proc.jl | 2 +- 24 files changed, 69 insertions(+), 1367 deletions(-) delete mode 100644 base/unicode/checkstring.jl delete mode 100644 base/unicode/types.jl delete mode 100644 base/unicode/unicode.jl delete mode 100644 base/unicode/utf16.jl delete mode 100644 base/unicode/utf32.jl delete mode 100644 test/unicode/utf16.jl delete mode 100644 test/unicode/utf32.jl diff --git a/base/deprecated.jl b/base/deprecated.jl index b22c1a3cc6523..dc862dc46eca6 100644 --- a/base/deprecated.jl +++ b/base/deprecated.jl @@ -488,16 +488,6 @@ end end ) -if sizeof(Cwchar_t) == 2 - @deprecate_binding WString UTF16String - @deprecate_binding wstring utf16 - utf16(s::Cwstring) = utf16(convert(Ptr{Cwchar_t}, s)) -elseif sizeof(Cwchar_t) == 4 - @deprecate_binding WString UTF32String - @deprecate_binding wstring utf32 - utf32(s::Cwstring) = utf32(convert(Ptr{Cwchar_t}, s)) -end - @deprecate ==(x::Char, y::Integer) UInt32(x) == y @deprecate ==(x::Integer, y::Char) x == UInt32(y) 
@deprecate isless(x::Char, y::Integer) UInt32(x) < y diff --git a/base/docs/helpdb/Base.jl b/base/docs/helpdb/Base.jl index 70ffd3ca1c168..f8af1543b5d68 100644 --- a/base/docs/helpdb/Base.jl +++ b/base/docs/helpdb/Base.jl @@ -95,32 +95,6 @@ Get the step size of a [`Range`](:obj:`Range`) object. """ step -""" - utf32(s) - -Create a UTF-32 string from a byte array, array of `Char` or `UInt32`, or any other string -type. (Conversions of byte arrays check for a byte-order marker in the first four bytes, and -do not include it in the resulting string.) - -Note that the resulting `UTF32String` data is terminated by the NUL codepoint (32-bit zero), -which is not treated as a character in the string (so that it is mostly invisible in Julia); -this allows the string to be passed directly to external functions requiring NUL-terminated -data. This NUL is appended automatically by the `utf32(s)` conversion function. If you have -a `Char` or `UInt32` array `A` that is already NUL-terminated UTF-32 data, then you can -instead use `UTF32String(A)` to construct the string without making a copy of the data and -treating the NUL as a terminator rather than as part of the string. -""" -utf32(s) - -""" - utf32(::Union{Ptr{Char},Ptr{UInt32},Ptr{Int32}} [, length]) - -Create a string from the address of a NUL-terminated UTF-32 string. A copy is made; the -pointer can be safely freed. If `length` is specified, the string does not have to be -NUL-terminated. -""" -utf32(::Union{Ptr{Char},Ptr{UInt32},Ptr{Int32}}, length=?) - """ takebuf_array(b::IOBuffer) @@ -3620,32 +3594,6 @@ Compute ``\\sin(\\pi x) / (\\pi x)`` if ``x \\neq 0``, and ``1`` if ``x = 0``. """ sinc -""" - utf16(s) - -Create a UTF-16 string from a byte array, array of `UInt16`, or any other string type. (Data -must be valid UTF-16. Conversions of byte arrays check for a byte-order marker in the first -two bytes, and do not include it in the resulting string.) 
- -Note that the resulting `UTF16String` data is terminated by the NUL codepoint (16-bit zero), -which is not treated as a character in the string (so that it is mostly invisible in Julia); -this allows the string to be passed directly to external functions requiring NUL-terminated -data. This NUL is appended automatically by the `utf16(s)` conversion function. If you have -a `UInt16` array `A` that is already NUL-terminated valid UTF-16 data, then you can instead -use `UTF16String(A)` to construct the string without making a copy of the data and treating -the NUL as a terminator rather than as part of the string. -""" -utf16(s) - -""" - utf16(::Union{Ptr{UInt16},Ptr{Int16}} [, length]) - -Create a string from the address of a NUL-terminated UTF-16 string. A copy is made; the -pointer can be safely freed. If `length` is specified, the string does not have to be -NUL-terminated. -""" -utf16(::Union{Ptr{UInt16},Ptr{Int16}}, length=?) - """ median(v[, region]) diff --git a/base/exports.jl b/base/exports.jl index 000d2b4a96d2f..5af15b345eb31 100644 --- a/base/exports.jl +++ b/base/exports.jl @@ -119,8 +119,6 @@ export Tridiagonal, UnitRange, UpperTriangular, - UTF16String, - UTF32String, Val, VecOrMat, Vector, @@ -878,8 +876,6 @@ export ucfirst, unescape_string, uppercase, - utf16, - utf32, warn, # random numbers diff --git a/base/replutil.jl b/base/replutil.jl index 229aa877f657c..edbc2863e53a9 100644 --- a/base/replutil.jl +++ b/base/replutil.jl @@ -233,7 +233,6 @@ end showerror(io::IO, ::DivideError) = print(io, "DivideError: integer division error") showerror(io::IO, ::StackOverflowError) = print(io, "StackOverflowError:") showerror(io::IO, ::UndefRefError) = print(io, "UndefRefError: access to undefined reference") -showerror(io::IO, ex::UndefVarError) = print(io, "UndefVarError: $(ex.var) not defined") showerror(io::IO, ::EOFError) = print(io, "EOFError: read end of file") showerror(io::IO, ex::ErrorException) = print(io, ex.msg) showerror(io::IO, ex::KeyError) = 
print(io, "KeyError: key $(repr(ex.key)) not found") @@ -241,6 +240,17 @@ showerror(io::IO, ex::InterruptException) = print(io, "InterruptException:") showerror(io::IO, ex::ArgumentError) = print(io, "ArgumentError: $(ex.msg)") showerror(io::IO, ex::AssertionError) = print(io, "AssertionError: $(ex.msg)") +function showerror(io::IO, ex::UndefVarError) + if ex.var in [:UTF16String, :UTF32String, :WString, :utf16, :utf32, :wstring] + return showerror(io, ErrorException(""" + `$(ex.var)` has been moved to the package LegacyStrings.jl: + Run Pkg.add("LegacyStrings") to install LegacyStrings on Julia v0.5-; + Then do `using LegacyStrings` to get `$(ex.var)`. + """)) + end + print(io, "UndefVarError: $(ex.var) not defined") +end + function showerror(io::IO, ex::MethodError) # ex.args is a tuple type if it was thrown from `invoke` and is # a tuple of the arguments otherwise. diff --git a/base/serialize.jl b/base/serialize.jl index adb0048310034..debf09797391f 100644 --- a/base/serialize.jl +++ b/base/serialize.jl @@ -21,8 +21,7 @@ const TAGS = Any[ Symbol, Tuple, Expr, # dummy entries, intentionally shadowed by earlier ones LineNumberNode, Slot, LabelNode, GotoNode, QuoteNode, :reserved23 #=was TopNode=#, TypeVar, Core.Box, LambdaInfo, - Module, #=UndefRefTag=#Symbol, Task, String, - UTF16String, UTF32String, Float16, + Module, #=UndefRefTag=#Symbol, Task, String, Float16, SimpleVector, #=BackrefTag=#Symbol, Method, GlobalRef, (), Bool, Any, :Any, Bottom, :reserved21, :reserved22, Type, @@ -42,7 +41,7 @@ const TAGS = Any[ 28, 29, 30, 31, 32 ] -const ser_version = 3 # do not make changes without bumping the version #! +const ser_version = 4 # do not make changes without bumping the version #! 
const NTAGS = length(TAGS) diff --git a/base/strings/io.jl b/base/strings/io.jl index 199fe0eefc785..151909d6405e4 100644 --- a/base/strings/io.jl +++ b/base/strings/io.jl @@ -324,3 +324,22 @@ function unindent(str::AbstractString, indent::Int; tabwidth=8) end takebuf_string(buf) end + +function convert(::Type{String}, chars::AbstractVector{Char}) + sprint(length(chars), io->begin + state = start(chars) + while !done(chars, state) + c, state = next(chars, state) + if '\ud7ff' < c && c + 1024 < '\ue000' + d, state = next(chars, state) + if '\ud7ff' < d - 1024 && d < '\ue000' + c = Char(0x10000 + ((UInt32(c) & 0x03ff) << 10) | (UInt32(d) & 0x03ff)) + else + write(io, c) + c = d + end + end + write(io, c) + end + end) +end diff --git a/base/strings/string.jl b/base/strings/string.jl index 47eb243b21647..c9637b746ef91 100644 --- a/base/strings/string.jl +++ b/base/strings/string.jl @@ -33,6 +33,8 @@ const utf8_trailing = [ ## required core functionality ## +is_valid_continuation(c) = ((c & 0xc0) == 0x80) + function endof(s::String) d = s.data i = length(d) @@ -239,109 +241,10 @@ function reverse(s::String) String(buf) end -## outputting UTF-8 strings ## - write(io::IO, s::String) = write(io, s.data) pointer(x::String) = pointer(x.data) pointer(x::String, i::Integer) = pointer(x.data)+(i-1) -## transcoding to UTF-8 ## - convert(::Type{String}, s::String) = s - -function convert(::Type{String}, dat::Vector{UInt8}) - # handle zero length string quickly - isempty(dat) && return empty_utf8 - # get number of bytes to allocate - len, flags, num4byte, num3byte, num2byte = unsafe_checkstring(dat) - if (flags & (UTF_LONG | UTF_SURROGATE)) == 0 - len = sizeof(dat) - @inbounds return String(copy!(Vector{UInt8}(len), 1, dat, 1, len)) - end - # Copy, but eliminate over-long encodings and surrogate pairs - len += num2byte + num3byte*2 + num4byte*3 - buf = Vector{UInt8}(len) - out = 0 - pos = 0 - @inbounds while out < len - ch::UInt32 = dat[pos += 1] - # Handle ASCII characters - if 
ch <= 0x7f - buf[out += 1] = ch - # Handle overlong < 0x100 - elseif ch < 0xc2 - buf[out += 1] = ((ch & 3) << 6) | (dat[pos += 1] & 0x3f) - # Handle 0x100-0x7ff - elseif ch < 0xe0 - buf[out += 1] = ch - buf[out += 1] = dat[pos += 1] - elseif ch != 0xed - buf[out += 1] = ch - buf[out += 1] = dat[pos += 1] - buf[out += 1] = dat[pos += 1] - # Copy 4-byte encoded value - ch >= 0xf0 && (buf[out += 1] = dat[pos += 1]) - # Handle surrogate pairs - else - ch = dat[pos += 1] - if ch < 0xa0 # not surrogate pairs - buf[out += 1] = 0xed - buf[out += 1] = ch - buf[out += 1] = dat[pos += 1] - else - # Pick up surrogate pairs (CESU-8 format) - ch = ((((((ch & 0x3f) << 6) | (dat[pos + 1] & 0x3f)) << 10) - + (((dat[pos + 3] & 0x3f)%UInt32 << 6) | (dat[pos + 4] & 0x3f))) - - 0x01f0c00) - pos += 4 - output_utf8_4byte!(buf, out, ch) - out += 4 - end - end - end - String(buf) -end - -""" -Converts an already validated vector of `UInt16` or `UInt32` to a `String` - -Input Arguments: - -* `dat` Vector of code units (`UInt16` or `UInt32`), explicit `\0` is not converted -* `len` length of output in bytes - -Returns: - -* `String` -""" -function encode_to_utf8{T<:Union{UInt16, UInt32}}(::Type{T}, dat, len) - buf = Vector{UInt8}(len) - out = 0 - pos = 0 - @inbounds while out < len - ch::UInt32 = dat[pos += 1] - # Handle ASCII characters - if ch <= 0x7f - buf[out += 1] = ch - # Handle 0x80-0x7ff - elseif ch < 0x800 - buf[out += 1] = 0xc0 | (ch >>> 6) - buf[out += 1] = 0x80 | (ch & 0x3f) - # Handle 0x10000-0x10ffff (if input is UInt32) - elseif ch > 0xffff # this is only for T == UInt32, should not be generated for UInt16 - output_utf8_4byte!(buf, out, ch) - out += 4 - # Handle surrogate pairs - elseif is_surrogate_codeunit(ch) - output_utf8_4byte!(buf, out, get_supplementary(ch, dat[pos += 1])) - out += 4 - # Handle 0x800-0xd7ff, 0xe000-0xffff UCS-2 characters - else - buf[out += 1] = 0xe0 | ((ch >>> 12) & 0x3f) - buf[out += 1] = 0x80 | ((ch >>> 6) & 0x3f) - buf[out += 1] = 0x80 | (ch & 
0x3f) - end - end - String(buf) -end +convert(::Type{String}, v::Vector{UInt8}) = String(v) diff --git a/base/sysimg.jl b/base/sysimg.jl index 022c4a9dac16c..85dfeaf5ef75e 100644 --- a/base/sysimg.jl +++ b/base/sysimg.jl @@ -145,7 +145,6 @@ include("iobuffer.jl") include("char.jl") include("intfuncs.jl") include("strings/strings.jl") -include("unicode/unicode.jl") include("parse.jl") include("shell.jl") include("regex.jl") diff --git a/base/test.jl b/base/test.jl index b0382f3284f5c..f9790636f6906 100644 --- a/base/test.jl +++ b/base/test.jl @@ -1009,5 +1009,6 @@ end Base.convert(::Type{GenericString}, s::AbstractString) = GenericString(s) Base.endof(s::GenericString) = endof(s.string) Base.next(s::GenericString, i::Int) = next(s.string, i) +Base.reverseind(s::GenericString, i::Integer) = reverseind(s.string, i) end # module diff --git a/base/unicode/checkstring.jl b/base/unicode/checkstring.jl deleted file mode 100644 index 72e9eb9d31062..0000000000000 --- a/base/unicode/checkstring.jl +++ /dev/null @@ -1,238 +0,0 @@ -# This file is a part of Julia. 
License is MIT: http://julialang.org/license - -## Functions to check validity of UTF-8, UTF-16, and UTF-32 encoded strings, -# and also to return information necessary to convert to other encodings - -## Return flags for check_string function - -const UTF_LONG = 1 ##< Long encodings are present -const UTF_LATIN1 = 2 ##< characters in range 0x80-0xFF present -const UTF_UNICODE2 = 4 ##< characters in range 0x100-0x7ff present -const UTF_UNICODE3 = 8 ##< characters in range 0x800-0xd7ff, 0xe000-0xffff -const UTF_UNICODE4 = 16 ##< non-BMP characters present -const UTF_SURROGATE = 32 ##< surrogate pairs present - -## Get a UTF-8 continuation byte, give error if invalid, return updated character value -@inline function get_continuation(ch::UInt32, byt::UInt8, pos) - if !is_valid_continuation(byt) - throw(UnicodeError(UTF_ERR_CONT, pos, byt)) - end - (ch << 6) | (byt & 0x3f) -end - -""" -Validates and calculates number of characters in a UTF-8,UTF-16 or UTF-32 encoded vector/string - -Warning: this function does not check the bounds of the start or end positions -Use `checkstring` to make sure the bounds are checked - -Input Arguments: - -* `dat` UTF-8 (`Vector{UInt8}`), UTF-16 (`Vector{UInt16}`) or UTF-32 (`Vector{UInt32}`, `AbstractString`) encoded string - -Optional Input Arguments: - -* `pos` start position (defaults to 1) -* `endpos` end position (defaults to `endof(dat)`) - -Keyword Arguments: - -* `accept_long_null` = `true` # Modified UTF-8 (`\\0` represented as `b\"\\xc0\\x80\"`) -* `accept_surrogates` = `true` # `CESU-8` -* `accept_long_char` = `false` # Accept arbitrary long encodings - -Returns: - -* (total characters, flags, 4-byte, 3-byte, 2-byte) - -Throws: - -* `UnicodeError` -""" -function unsafe_checkstring end - -function unsafe_checkstring(dat::AbstractVector{UInt8}, - pos = 1, - endpos = endof(dat) - ; - accept_long_null = true, - accept_surrogates = true, - accept_long_char = false) - local byt::UInt8, ch::UInt32, surr::UInt32 - flags::UInt = 0 - 
totalchar = num2byte = num3byte = num4byte = 0 - @inbounds while pos <= endpos - ch, pos = next(dat, pos) - totalchar += 1 - if ch > 0x7f - # Check UTF-8 encoding - if ch < 0xe0 - # 2-byte UTF-8 sequence (i.e. characters 0x80-0x7ff) - (pos > endpos) && throw(UnicodeError(UTF_ERR_SHORT, pos, ch)) - byt, pos = next(dat, pos) - ch = get_continuation(ch & 0x3f, byt, pos) - if ch > 0x7f - num2byte += 1 - flags |= (ch > 0xff) ? UTF_UNICODE2 : UTF_LATIN1 - elseif accept_long_char - flags |= UTF_LONG - elseif (ch == 0) && accept_long_null - flags |= UTF_LONG - else - throw(UnicodeError(UTF_ERR_LONG, pos, ch)) - end - elseif ch < 0xf0 - # 3-byte UTF-8 sequence (i.e. characters 0x800-0xffff) - (pos + 1 > endpos) && throw(UnicodeError(UTF_ERR_SHORT, pos, ch)) - byt, pos = next(dat, pos) - ch = get_continuation(ch & 0x0f, byt, pos) - byt, pos = next(dat, pos) - ch = get_continuation(ch, byt, pos) - # check for surrogate pairs, make sure correct - if is_surrogate_codeunit(ch) - !is_surrogate_lead(ch) && throw(UnicodeError(UTF_ERR_NOT_LEAD, pos-2, ch)) - # next character *must* be a trailing surrogate character - (pos + 2 > endpos) && throw(UnicodeError(UTF_ERR_MISSING_SURROGATE, pos-2, ch)) - byt, pos = next(dat, pos) - (byt != 0xed) && throw(UnicodeError(UTF_ERR_NOT_TRAIL, pos, byt)) - byt, pos = next(dat, pos) - surr = get_continuation(0x0000d, byt, pos) - byt, pos = next(dat, pos) - surr = get_continuation(surr, byt, pos) - !is_surrogate_trail(surr) && throw(UnicodeError(UTF_ERR_NOT_TRAIL, pos-2, surr)) - !accept_surrogates && throw(UnicodeError(UTF_ERR_SURROGATE, pos-2, surr)) - flags |= UTF_SURROGATE - num4byte += 1 - elseif ch > 0x07ff - num3byte += 1 - elseif accept_long_char - flags |= UTF_LONG - num2byte += 1 - else - throw(UnicodeError(UTF_ERR_LONG, pos-2, ch)) - end - elseif ch < 0xf5 - # 4-byte UTF-8 sequence (i.e. 
characters > 0xffff) - (pos + 2 > endpos) && throw(UnicodeError(UTF_ERR_SHORT, pos, ch)) - byt, pos = next(dat, pos) - ch = get_continuation(ch & 0x07, byt, pos) - byt, pos = next(dat, pos) - ch = get_continuation(ch, byt, pos) - byt, pos = next(dat, pos) - ch = get_continuation(ch, byt, pos) - if ch > 0x10ffff - throw(UnicodeError(UTF_ERR_INVALID, pos-3, ch)) - elseif ch > 0xffff - num4byte += 1 - elseif is_surrogate_codeunit(ch) - throw(UnicodeError(UTF_ERR_SURROGATE, pos-3, ch)) - elseif accept_long_char - # This is an overly long encoded character - flags |= UTF_LONG - if ch > 0x7ff - num3byte += 1 - elseif ch > 0x7f - num2byte += 1 - end - else - throw(UnicodeError(UTF_ERR_LONG, pos-2, ch)) - end - else - throw(UnicodeError(UTF_ERR_INVALID, pos, ch)) - end - end - end - num3byte != 0 && (flags |= UTF_UNICODE3) - num4byte != 0 && (flags |= UTF_UNICODE4) - return totalchar, flags, num4byte, num3byte, num2byte -end - -typealias AbstractString1632{Tel<:Union{UInt16,UInt32}} Union{AbstractVector{Tel}, AbstractString} - -function unsafe_checkstring( - dat::AbstractString1632, - pos = 1, - endpos = endof(dat) - ; - accept_long_null = true, - accept_surrogates = true, - accept_long_char = false) - local ch::UInt32 - flags::UInt = 0 - totalchar = num2byte = num3byte = num4byte = 0 - @inbounds while pos <= endpos - ch, pos = next(dat, pos) - totalchar += 1 - if ch > 0x7f - if ch < 0x100 - num2byte += 1 - flags |= UTF_LATIN1 - elseif ch < 0x800 - num2byte += 1 - flags |= UTF_UNICODE2 - elseif ch > 0x0ffff - (ch > 0x10ffff) && throw(UnicodeError(UTF_ERR_INVALID, pos, ch)) - num4byte += 1 - elseif !is_surrogate_codeunit(ch) - num3byte += 1 - elseif is_surrogate_lead(ch) - pos > endpos && throw(UnicodeError(UTF_ERR_MISSING_SURROGATE, pos, ch)) - # next character *must* be a trailing surrogate character - ch, pos = next(dat, pos) - !is_surrogate_trail(ch) && throw(UnicodeError(UTF_ERR_NOT_TRAIL, pos, ch)) - num4byte += 1 - if !(typeof(dat) <: AbstractVector{UInt16}) - 
!accept_surrogates && throw(UnicodeError(UTF_ERR_SURROGATE, pos, ch)) - flags |= UTF_SURROGATE - end - else - throw(UnicodeError(UTF_ERR_NOT_LEAD, pos, ch)) - end - end - end - num3byte != 0 && (flags |= UTF_UNICODE3) - num4byte != 0 && (flags |= UTF_UNICODE4) - return totalchar, flags, num4byte, num3byte, num2byte -end - -""" -Validates and calculates number of characters in a UTF-8,UTF-16 or UTF-32 encoded vector/string - -This function checks the bounds of the start and end positions -Use `unsafe_checkstring` to avoid that overhead if the bounds have already been checked - -Input Arguments: - -* `dat` UTF-8 (`Vector{UInt8}`), UTF-16 (`Vector{UInt16}`) or UTF-32 (`Vector{UInt32}`, `AbstractString`) encoded string - -Optional Input Arguments: - -* `startpos` start position (defaults to 1) -* `endpos` end position (defaults to `endof(dat)`) - -Keyword Arguments: - -* `accept_long_null` = `true` # Modified UTF-8 (`\\0` represented as `b\"\\xc0\\x80\"`) -* `accept_surrogates` = `true` # `CESU-8` -* `accept_long_char` = `false` # Accept arbitrary long encodings - -Returns: - -* (total characters, flags, 4-byte, 3-byte, 2-byte) - -Throws: - -* `UnicodeError` -""" -function checkstring end - -# No need to check bounds if using defaults -checkstring(dat; kwargs...) = unsafe_checkstring(dat, 1, endof(dat); kwargs...) - -# Make sure that beginning and end positions are bounds checked -function checkstring(dat, startpos, endpos = endof(dat); kwargs...) - checkbounds(dat,startpos) - checkbounds(dat,endpos) - endpos < startpos && throw(ArgumentError("End position ($endpos) is less than start position ($startpos)")) - unsafe_checkstring(dat, startpos, endpos; kwargs...) -end diff --git a/base/unicode/types.jl b/base/unicode/types.jl deleted file mode 100644 index 61f69d6a6638d..0000000000000 --- a/base/unicode/types.jl +++ /dev/null @@ -1,34 +0,0 @@ -# This file is a part of Julia. 
License is MIT: http://julialang.org/license - -##\brief Base UTF16String type, has 16-bit NULL termination word after data, native byte order -# -# \throws UnicodeError - -immutable UTF16String <: AbstractString - data::Vector{UInt16} # includes 16-bit NULL termination after string chars - function UTF16String(data::Vector{UInt16}) - if length(data) < 1 || data[end] != 0 - throw(UnicodeError(UTF_ERR_NULL_16_TERMINATE, 0, 0)) - end - new(data) - end -end - -##\brief Base UTF32String type, has 32-bit NULL termination word after data, native byte order -# -# \throws UnicodeError - -immutable UTF32String <: DirectIndexString - data::Vector{UInt32} # includes 32-bit NULL termination after string chars - - function UTF32String(data::Vector{UInt32}) - if length(data) < 1 || data[end] != 0 - throw(UnicodeError(UTF_ERR_NULL_32_TERMINATE, 0, 0)) - end - new(data) - end -end -UTF32String(data::Vector{Char}) = UTF32String(reinterpret(UInt32, data)) - -isvalid{T<:Union{String,UTF16String,UTF32String}}(str::T) = isvalid(T, str.data) -isvalid{T<:Union{String,UTF16String,UTF32String}}(::Type{T}, str::T) = isvalid(T, str.data) diff --git a/base/unicode/unicode.jl b/base/unicode/unicode.jl deleted file mode 100644 index a8bd91e1a5a81..0000000000000 --- a/base/unicode/unicode.jl +++ /dev/null @@ -1,6 +0,0 @@ -# This file is a part of Julia. License is MIT: http://julialang.org/license - -include("unicode/types.jl") -include("unicode/checkstring.jl") -include("unicode/utf16.jl") -include("unicode/utf32.jl") diff --git a/base/unicode/utf16.jl b/base/unicode/utf16.jl deleted file mode 100644 index 21cb4059056e5..0000000000000 --- a/base/unicode/utf16.jl +++ /dev/null @@ -1,275 +0,0 @@ -# This file is a part of Julia. 
License is MIT: http://julialang.org/license - -# Quickly copy and set trailing \0 -@inline function fast_utf_copy{S <: Union{UTF16String, UTF32String}, T <: Union{UInt16, UInt32}}( - ::Type{S}, ::Type{T}, len, dat, flag::Bool=false) - S(setindex!(copy!(Vector{T}(len+1), 1, dat, 1, flag ? len : len+1), 0, len+1)) -end - -# Get rest of character ch from 3-byte UTF-8 sequence in dat -@inline function get_utf8_3byte(dat, pos, ch) - @inbounds return ((ch & 0xf) << 12) | (UInt32(dat[pos-1] & 0x3f) << 6) | (dat[pos] & 0x3f) -end -# Get rest of character ch from 4-byte UTF-8 sequence in dat -@inline function get_utf8_4byte(dat, pos, ch) - @inbounds return (((ch & 0x7) << 18) - | (UInt32(dat[pos-2] & 0x3f) << 12) - | (UInt32(dat[pos-1] & 0x3f) << 6) - | (dat[pos] & 0x3f)) -end - -# Output a character as a 4-byte UTF-8 sequence -@inline function output_utf8_4byte!(buf, out, ch) - @inbounds begin - buf[out + 1] = 0xf0 | (ch >>> 18) - buf[out + 2] = 0x80 | ((ch >>> 12) & 0x3f) - buf[out + 3] = 0x80 | ((ch >>> 6) & 0x3f) - buf[out + 4] = 0x80 | (ch & 0x3f) - end -end - -const empty_utf16 = UTF16String(UInt16[0]) - -function length(s::UTF16String) - d = s.data - len = length(d) - 1 - len == 0 && return 0 - cnum = 0 - for i = 1:len - @inbounds cnum += !is_surrogate_trail(d[i]) - end - cnum -end - -function endof(s::UTF16String) - d = s.data - i = length(d) - 1 - i == 0 && return i - return is_surrogate_codeunit(d[i]) ? 
i-1 : i -end - -get_supplementary(lead::Unsigned, trail::Unsigned) = (UInt32(lead-0xd7f7)<<10 + trail) - -function next(s::UTF16String, i::Int) - ch = s.data[i] - !is_surrogate_codeunit(ch) && return (Char(ch), i+1) - # check length, account for terminating \0 - i >= (length(s.data)-1) && throw(UnicodeError(UTF_ERR_MISSING_SURROGATE, i, UInt32(ch))) - !is_surrogate_lead(ch) && throw(UnicodeError(UTF_ERR_NOT_LEAD, i, ch)) - ct = s.data[i+1] - !is_surrogate_trail(ct) && throw((UTF_ERR_NOT_TRAIL, i, ch)) - Char(get_supplementary(ch, ct)), i+2 -end - -function reverseind(s::UTF16String, i::Integer) - j = length(s.data) - i - return is_surrogate_trail(s.data[j]) ? j-1 : j -end - -lastidx(s::UTF16String) = length(s.data) - 1 # s.data includes NULL terminator - -function reverse(s::UTF16String) - d = s.data - out = similar(d) - out[end] = 0 # NULL termination - n = length(d) - @inbounds for i = 1:n-1 - ch = d[n-i] - if is_surrogate_lead(ch) - out[i],out[i-1] = out[i-1],ch - else - out[i] = ch - end - end - UTF16String(out) -end - -sizeof(s::UTF16String) = sizeof(s.data) - sizeof(UInt16) - -function isvalid(::Type{UTF16String}, data::AbstractArray{UInt16}) - i = 1 - n = length(data) # this may include NULL termination; that's okay - @inbounds while i < n # check for unpaired surrogates - if is_surrogate_lead(data[i]) && is_surrogate_trail(data[i+1]) - i += 2 - elseif is_surrogate_codeunit(data[i]) - return false - else - i += 1 - end - end - return i > n || !is_surrogate_codeunit(data[i]) -end - -function convert(::Type{UTF16String}, str::AbstractString) - len, flags, num4byte = unsafe_checkstring(str) - buf = Vector{UInt16}(len+num4byte+1) - out = 0 - @inbounds for ch in str - c = UInt32(ch) - if c < 0x10000 - buf[out += 1] = UInt16(c) - else - # output surrogate pair - buf[out += 1] = UInt16(0xd7c0 + (c >>> 10)) - buf[out += 1] = UInt16(0xdc00 + (c & 0x3ff)) - end - end - @inbounds buf[out + 1] = 0 # NULL termination - UTF16String(buf) -end - -function 
convert(::Type{UTF16String}, str::String) - dat = str.data - # handle zero length string quickly - sizeof(dat) == 0 && return empty_utf16 - # Check that is correct UTF-8 encoding and get number of words needed - len, flags, num4byte = unsafe_checkstring(dat) - len += num4byte - buf = Vector{UInt16}(len+1) - @inbounds buf[len+1] = 0 - # Optimize case where no characters > 0x7f - flags == 0 && @inbounds return UTF16String(copy!(buf, dat)) - out = 0 - pos = 0 - @inbounds while out < len - ch::UInt32 = dat[pos += 1] - # Handle ASCII characters - if ch <= 0x7f - buf[out += 1] = ch - # Handle range 0x80-0x7ff - elseif ch < 0xe0 - buf[out += 1] = ((ch & 0x1f) << 6) | (dat[pos += 1] & 0x3f) - # Handle range 0x800-0xffff - elseif ch < 0xf0 - pos += 2 - buf[out += 1] = get_utf8_3byte(dat, pos, ch) - # Handle range 0x10000-0x10ffff - else - pos += 3 - ch = get_utf8_4byte(dat, pos, ch) - # output surrogate pair - buf[out += 1] = UInt16(0xd7c0 + (ch >>> 10)) - buf[out += 1] = UInt16(0xdc00 + (ch & 0x3ff)) - end - end - UTF16String(buf) -end - -function convert(::Type{String}, str::UTF16String) - dat = str.data - len = sizeof(dat) >>> 1 - # handle zero length string quickly - len <= 1 && return empty_utf8 - # get number of bytes to allocate - len, flags, num4byte, num3byte, num2byte = unsafe_checkstring(dat, 1, len-1) - flags == 0 && @inbounds return String(copy!(Vector{UInt8}(len), 1, dat, 1, len)) - return encode_to_utf8(UInt16, dat, len + num2byte + num3byte*2 + num4byte*3) -end - -""" -Converts an already validated UTF-32 encoded vector of `UInt32` to a `UTF16String` - -Input Arguments: - -* `dat` `Vector{UInt32}` of UTF-32 encoded data -* `len` length of output in 16-bit words - -Returns: - -* `UTF16String` -""" -function encode_to_utf16(dat, len) - buf = Vector{UInt16}(len) - @inbounds buf[len] = 0 # NULL termination - out = 0 - pos = 0 - @inbounds while out < len - ch = UInt32(dat[pos += 1]) - if ch > 0xffff - # Output surrogate pair for 0x10000-0x10ffff - buf[out += 1] = 
0xd7c0 + (ch >>> 10) - ch = 0xdc00 + (ch & 0x3ff) - end - buf[out += 1] = ch - end - UTF16String(buf) -end - -convert(::Type{Vector{UInt16}}, str::UTF16String) = str.data -convert(::Type{Array{UInt16}}, str::UTF16String) = str.data - -convert(::Type{UTF16String}, str::UTF16String) = str - -unsafe_convert{T<:Union{Int16,UInt16}}(::Type{Ptr{T}}, s::UTF16String) = - convert(Ptr{T}, pointer(s)) - -convert(T::Type{UTF16String}, data::AbstractArray{UInt16}) = - convert(T, reshape(data, length(data))) - -convert(T::Type{UTF16String}, data::AbstractArray{Int16}) = - convert(T, reinterpret(UInt16, data)) - -function convert(::Type{UTF16String}, dat::AbstractVector{UInt16}) - len, flags, num4byte = unsafe_checkstring(dat) - @inbounds return fast_utf_copy(UTF16String, UInt16, len+num4byte, dat, true) -end - -function convert(T::Type{UTF16String}, bytes::AbstractArray{UInt8}) - isempty(bytes) && return UTF16String(UInt16[0]) - isodd(length(bytes)) && throw(UnicodeError(UTF_ERR_ODD_BYTES_16, length(bytes), 0)) - data = reinterpret(UInt16, bytes) - # check for byte-order mark (BOM): - if data[1] == 0xfeff # native byte order - d = Array{UInt16}(length(data)) - copy!(d,1, data,2, length(data)-1) - elseif data[1] == 0xfffe # byte-swapped - d = Array{UInt16}(length(data)) - for i = 2:length(data) - d[i-1] = bswap(data[i]) - end - else - d = Array{UInt16}(length(data) + 1) - copy!(d,1, data,1, length(data)) # assume native byte order - end - d[end] = 0 # NULL terminate - !isvalid(UTF16String, d) && throw(UnicodeError(UTF_ERR_INVALID_16,0,0)) - UTF16String(d) -end - -utf16(x) = convert(UTF16String, x) -utf16(p::Ptr{UInt16}, len::Integer) = utf16(unsafe_wrap(Array, p, len)) -utf16(p::Ptr{Int16}, len::Integer) = utf16(convert(Ptr{UInt16}, p), len) -function utf16(p::Union{Ptr{UInt16}, Ptr{Int16}}) - len = 0 - while unsafe_load(p, len+1) != 0; len += 1; end - utf16(p, len) -end - -function map(fun, str::UTF16String) - buf = UInt16[] - sizehint!(buf, length(str.data)) - for ch in str - 
c2 = fun(ch) - if !isa(c2, Char) - throw(UnicodeError(UTF_ERR_MAP_CHAR, 0, 0)) - end - uc = UInt32(c2) - if uc < 0x10000 - if is_surrogate_codeunit(UInt16(uc)) - throw(UnicodeError(UTF_ERR_INVALID_CHAR, 0, uc)) - end - push!(buf, UInt16(uc)) - elseif uc <= 0x10ffff - push!(buf, UInt16(0xd7c0 + (uc >> 10))) - push!(buf, UInt16(0xdc00 + (uc & 0x3ff))) - else - throw(UnicodeError(UTF_ERR_INVALID_CHAR, 0, uc)) - end - end - push!(buf, 0) - UTF16String(buf) -end - -cconvert(::Type{Cwstring}, v::Vector{UInt16}) = transcode(Cwchar_t, v) -cconvert(::Type{Cwstring}, s::UTF16String) = transcode(Cwchar_t, s.data) diff --git a/base/unicode/utf32.jl b/base/unicode/utf32.jl deleted file mode 100644 index 9250e9310909e..0000000000000 --- a/base/unicode/utf32.jl +++ /dev/null @@ -1,195 +0,0 @@ -# This file is a part of Julia. License is MIT: http://julialang.org/license - -# UTF-32 basic functions -next(s::UTF32String, i::Int) = (Char(s.data[i]), i+1) -endof(s::UTF32String) = length(s.data) - 1 -length(s::UTF32String) = length(s.data) - 1 - -reverse(s::UTF32String) = UTF32String(reverse!(copy(s.data), 1, length(s))) - -sizeof(s::UTF32String) = sizeof(s.data) - sizeof(UInt32) - -const empty_utf32 = UTF32String(UInt32[0]) - -convert(::Type{UTF32String}, c::Char) = UTF32String(UInt32[c, 0]) -convert(::Type{UTF32String}, s::UTF32String) = s - -function convert(::Type{UTF32String}, str::AbstractString) - len, flags = unsafe_checkstring(str) - buf = Vector{UInt32}(len+1) - out = 0 - @inbounds for ch in str ; buf[out += 1] = ch ; end - @inbounds buf[out + 1] = 0 # NULL termination - UTF32String(buf) -end - -function convert(::Type{String}, str::UTF32String) - dat = str.data - len = sizeof(dat) >>> 2 - # handle zero length string quickly - len <= 1 && return empty_utf8 - # get number of bytes to allocate - len, flags, num4byte, num3byte, num2byte = unsafe_checkstring(dat, 1, len-1) - flags == 0 && @inbounds return String(copy!(Vector{UInt8}(len), 1, dat, 1, len)) - return 
encode_to_utf8(UInt32, dat, len + num2byte + num3byte*2 + num4byte*3) -end - -function convert(::Type{UTF32String}, str::String) - dat = str.data - # handle zero length string quickly - sizeof(dat) == 0 && return empty_utf32 - # Validate UTF-8 encoding, and get number of words to create - len, flags = unsafe_checkstring(dat) - # Optimize case where no characters > 0x7f - flags == 0 && @inbounds return fast_utf_copy(UTF32String, UInt32, len, dat, true) - # has multi-byte UTF-8 sequences - buf = Vector{UInt32}(len+1) - @inbounds buf[len+1] = 0 # NULL termination - local ch::UInt32, surr::UInt32 - out = 0 - pos = 0 - @inbounds while out < len - ch = dat[pos += 1] - # Handle ASCII characters - if ch <= 0x7f - buf[out += 1] = ch - # Handle range 0x80-0x7ff - elseif ch < 0xe0 - buf[out += 1] = ((ch & 0x1f) << 6) | (dat[pos += 1] & 0x3f) - # Handle range 0x800-0xffff - elseif ch < 0xf0 - pos += 2 - ch = get_utf8_3byte(dat, pos, ch) - # Handle surrogate pairs (should have been encoded in 4 bytes) - if is_surrogate_lead(ch) - # Build up 32-bit character from ch and trailing surrogate in next 3 bytes - pos += 3 - surr = ((UInt32(dat[pos-2] & 0xf) << 12) - | (UInt32(dat[pos-1] & 0x3f) << 6) - | (dat[pos] & 0x3f)) - ch = get_supplementary(ch, surr) - end - buf[out += 1] = ch - # Handle range 0x10000-0x10ffff - else - pos += 3 - buf[out += 1] = get_utf8_4byte(dat, pos, ch) - end - end - UTF32String(buf) -end - -function convert(::Type{UTF32String}, str::UTF16String) - dat = str.data - len = sizeof(dat) - # handle zero length string quickly (account for trailing \0) - len <= 2 && return empty_utf32 - # get number of words to create - len, flags, num4byte = unsafe_checkstring(dat, 1, len>>>1) - # No surrogate pairs, do optimized copy - (flags & UTF_UNICODE4) == 0 && @inbounds return UTF32String(copy!(Vector{Char}(len), dat)) - local ch::UInt32 - buf = Vector{UInt32}(len) - out = 0 - pos = 0 - @inbounds while out < len - ch = dat[pos += 1] - # check for surrogate pair - if 
is_surrogate_lead(ch) ; ch = get_supplementary(ch, dat[pos += 1]) ; end - buf[out += 1] = ch - end - UTF32String(buf) -end - -function convert(::Type{UTF16String}, str::UTF32String) - dat = str.data - len = sizeof(dat) - # handle zero length string quickly - len <= 4 && return empty_utf16 - # get number of words to allocate - len, flags, num4byte = unsafe_checkstring(dat, 1, len>>>2) - # optimized path, no surrogates - num4byte == 0 && @inbounds return UTF16String(copy!(Vector{UInt16}(len), dat)) - return encode_to_utf16(dat, len + num4byte) -end - -function convert(::Type{UTF32String}, dat::AbstractVector{UInt32}) - @inbounds return fast_utf_copy(UTF32String, UInt32, length(dat), dat, true) -end - -convert(::Type{UTF32String}, data::AbstractVector{Int32}) = - convert(UTF32String, reinterpret(UInt32, convert(Vector{T}, data))) - -convert(::Type{UTF32String}, data::AbstractVector{Char}) = - convert(UTF32String, map(UInt32, data)) - -convert{T<:AbstractString, S<:Union{UInt32,Char,Int32}}(::Type{T}, v::AbstractVector{S}) = - convert(T, utf32(v)) - -convert(::Type{Vector{UInt32}}, str::UTF32String) = str.data -convert(::Type{Array{UInt32}}, str::UTF32String) = str.data - -unsafe_convert{T<:Union{UInt32,Int32,Char}}(::Type{Ptr{T}}, s::UTF32String) = - convert(Ptr{T}, pointer(s)) - -function convert(T::Type{UTF32String}, bytes::AbstractArray{UInt8}) - isempty(bytes) && return empty_utf32 - length(bytes) & 3 != 0 && throw(UnicodeError(UTF_ERR_ODD_BYTES_32,0,0)) - data = reinterpret(UInt32, bytes) - # check for byte-order mark (BOM): - if data[1] == 0x0000feff # native byte order - d = Array{UInt32}(length(data)) - copy!(d,1, data, 2, length(data)-1) - elseif data[1] == 0xfffe0000 # byte-swapped - d = Array{UInt32}(length(data)) - for i = 2:length(data) - @inbounds d[i-1] = bswap(data[i]) - end - else - d = Array{UInt32}(length(data) + 1) - copy!(d, 1, data, 1, length(data)) # assume native byte order - end - d[end] = 0 # NULL terminate - UTF32String(d) -end - 
-cconvert(::Type{Cwstring}, v::Vector{UInt32}) = transcode(Cwchar_t, v) -cconvert(::Type{Cwstring}, s::UTF32String) = transcode(Cwchar_t, s.data) - -function isvalid(::Type{UTF32String}, str::Union{Vector{UInt32}, Vector{Char}}) - for c in str - @inbounds if !isvalid(Char, UInt32(c)) ; return false ; end - end - return true -end -isvalid(str::Vector{Char}) = isvalid(UTF32String, str) - -utf32(x) = convert(UTF32String, x) - -utf32(p::Ptr{UInt32}, len::Integer) = utf32(unsafe_wrap(Array, p, len)) -utf32(p::Union{Ptr{Char}, Ptr{Int32}}, len::Integer) = utf32(convert(Ptr{UInt32}, p), len) -function utf32(p::Union{Ptr{UInt32}, Ptr{Char}, Ptr{Int32}}) - len = 0 - while unsafe_load(p, len+1) != 0; len += 1; end - utf32(p, len) -end - -function map(f, s::UTF32String) - d = s.data - out = similar(d) - out[end] = 0 - - @inbounds for i = 1:(length(d)-1) - c2 = f(Char(d[i])) - if !isa(c2, Char) - throw(UnicodeError(UTF_ERR_MAP_CHAR, 0, 0)) - end - out[i] = (c2::Char) - end - UTF32String(out) -end - -pointer(x::Union{UTF16String,UTF32String}) = pointer(x.data) -pointer(x::Union{UTF16String,UTF32String}, i::Integer) = pointer(x)+(i-1)*sizeof(eltype(x.data)) -pointer{T<:Union{UTF16String,UTF32String}}(x::SubString{T}) = pointer(x.string.data) + x.offset*sizeof(eltype(x.string.data)) -pointer{T<:Union{UTF16String,UTF32String}}(x::SubString{T}, i::Integer) = pointer(x.string.data) + (x.offset + (i-1))*sizeof(eltype(x.string.data)) diff --git a/test/serialize.jl b/test/serialize.jl index c7c9e7e3d3c34..67a7249c2b91c 100644 --- a/test/serialize.jl +++ b/test/serialize.jl @@ -4,8 +4,8 @@ using Base.Test # Check that serializer hasn't gone out-of-frame @test Serializer.sertag(Symbol) == 2 -@test Serializer.sertag(()) == 46 -@test Serializer.sertag(false) == 122 +@test Serializer.sertag(()) == 44 +@test Serializer.sertag(false) == 120 function create_serialization_stream(f::Function) s = IOBuffer() diff --git a/test/strings/basic.jl b/test/strings/basic.jl index 
f8a63850a0253..5d62ddb81a842 100644 --- a/test/strings/basic.jl +++ b/test/strings/basic.jl @@ -246,28 +246,11 @@ let p = cstrdup("hello") Libc.free(p) end -# issue # 11389: Vector{UInt32} was copied with UTF32String, unlike Vector{Char} -a = UInt32[48,0] -b = UTF32String(a) -@test b == "0" -a[1] = 65 -@test b == "A" -c = Char['0','\0'] -d = UTF32String(c) -@test d == "0" -c[1] = 'A' -@test d == "A" - # iteration @test [c for c in "ḟøøƀäṙ"] == ['ḟ', 'ø', 'ø', 'ƀ', 'ä', 'ṙ'] @test [i for i in eachindex("ḟøøƀäṙ")] == [1, 4, 6, 8, 10, 12] @test [x for x in enumerate("ḟøøƀäṙ")] == [(1, 'ḟ'), (2, 'ø'), (3, 'ø'), (4, 'ƀ'), (5, 'ä'), (6, 'ṙ')] -# Issue #11140 -@test isvalid(utf32("a")) == true -@test isvalid(utf32("\x00")) == true -@test isvalid(UTF32String, UInt32[0xd800,0]) == false - # test all edge conditions for (val, pass) in ( (0, true), (0xd7ff, true), @@ -306,39 +289,9 @@ for (val, pass) in ( ) @test isvalid(String, val) == pass end -for (val, pass) in ( - (UInt16[0x0000], true), - (UInt16[0xd7ff,0], true), - (UInt16[0xd800,0], false), - (UInt16[0xdfff,0], false), - (UInt16[0xe000,0], true), - (UInt16[0xffff,0], true), - (UInt16[0xd800,0xdc00,0], true), - (UInt16[0xdbff,0xdfff,0], true), - (UInt16[0xd800,0x0100,0], false), - (UInt16[0xdc00,0x0100,0], false), - (UInt16[0xdc00,0xd800,0], false) - ) - @test isvalid(UTF16String, val) == pass -end -for (val, pass) in ( - (UInt32[0x0000], true), - (UInt32[0xd7ff,0], true), - (UInt32[0xd800,0], false), - (UInt32[0xdfff,0], false), - (UInt32[0xe000,0], true), - (UInt32[0xffff,0], true), - (UInt32[0x100000,0], true), - (UInt32[0x10ffff,0], true), - (UInt32[0x110000,0], false), - ) - @test isvalid(UTF32String, val) == pass -end # Issue #11203 @test isvalid(String, UInt8[]) == true -@test isvalid(UTF16String,UInt16[]) == true -@test isvalid(UTF32String,UInt32[]) == true # Check UTF-8 characters # Check ASCII range (true), @@ -411,21 +364,6 @@ end # 11482 -# isvalid -let s = "abcdef", u8 = "abcdef\uff", u16 = utf16(u8), u32 
= utf32(u8), - bad32 = utf32(UInt32[65,0x110000]), badch = Char[0x110000][1] - - @test !isvalid(bad32) - @test !isvalid(badch) - @test isvalid(s) - @test isvalid(u8) - @test isvalid(u16) - @test isvalid(u32) - @test isvalid(String, u8) - @test isvalid(UTF16String, u16) - @test isvalid(UTF32String, u32) -end - # lower and upper @test uppercase("aBc") == "ABC" @test uppercase('A') == 'A' @@ -458,9 +396,9 @@ str = "abcdef\uff\uffff\u10ffffABCDEF" foomap(ch) = (ch > Char(65)) foobar(ch) = Char(0xd800) foobaz(ch) = reinterpret(Char, typemax(UInt32)) -@test_throws UnicodeError map(foomap, utf16(str)) -@test_throws UnicodeError map(foobar, utf16(str)) -@test_throws UnicodeError map(foobaz, utf16(str)) +@test_throws ArgumentError map(foomap, GenericString(str)) +@test map(foobar, GenericString(str)) == String(repeat(b"\ud800", outer=[17])) +@test map(foobaz, GenericString(str)) == String(repeat(b"\ufffd", outer=[17])) @test "a".*["b","c"] == ["ab","ac"] @test ["b","c"].*"a" == ["ba","ca"] diff --git a/test/strings/io.jl b/test/strings/io.jl index 41df929335f39..1ab6be0035c44 100644 --- a/test/strings/io.jl +++ b/test/strings/io.jl @@ -136,76 +136,6 @@ end @test "\x0f" == unescape_string("\\x0f") @test "\x0F" == unescape_string("\\x0F") -extrapath = is_windows() ? 
joinpath(JULIA_HOME,"..","Git","usr","bin")*";" : "" -withenv("PATH" => extrapath * ENV["PATH"]) do -if !success(`iconv --version`) - warn("iconv not found, skipping unicode tests!") - is_windows() && warn("Use WinRPM.install(\"win_iconv\") to run these tests") -else - # Create unicode test data directory - unicodedir = mktempdir() - - # Use perl to generate the primary data - primary_encoding = "UTF-32BE" - primary_path = replace(joinpath(unicodedir, primary_encoding*".unicode"),"\\","\\\\\\\\") - run(`perl -e " - $$fname = \"$primary_path\"; - open(UNICODEF, \">\", \"$$fname\") or die \"can\'t open $$fname: $$!\"; - binmode(UNICODEF); - print UNICODEF pack \"N*\", 0xfeff, 0..0xd7ff, 0xe000..0x10ffff; - close(UNICODEF);"` ) - - # Use iconv to generate the other data - for encoding in ["UTF-32LE", "UTF-16BE", "UTF-16LE", "UTF-8"] - output_path = joinpath(unicodedir, encoding*".unicode") - f = Base.Filesystem.open(output_path,Base.JL_O_WRONLY|Base.JL_O_CREAT,Base.S_IRUSR | Base.S_IWUSR | Base.S_IRGRP | Base.S_IROTH) - run(pipeline(`iconv -f $primary_encoding -t $encoding $primary_path`, f)) - Base.Filesystem.close(f) - end - - f=open(joinpath(unicodedir,"UTF-32LE.unicode")) - str1 = utf32(read(f, UInt32, 1112065)[2:end]) - close(f) - - f=open(joinpath(unicodedir,"UTF-8.unicode")) - str2 = String(read(f, UInt8, 4382595)[4:end]) - close(f) - @test str1 == str2 - - @test str1 == utf16(read(joinpath(unicodedir,"UTF-16LE.unicode"), - UInt16, 2160641)[2:end]) - - @test str1 == utf16(read(joinpath(unicodedir,"UTF-16LE.unicode"), - UInt8, 2160641*2)) - - @test str1 == utf16(read(joinpath(unicodedir,"UTF-16BE.unicode"), - UInt8, 2160641*2)) - - - @test str1 == utf32(read(joinpath(unicodedir,"UTF-32LE.unicode"), - UInt8, 1112065*4)) - - @test str1 == utf32(read(joinpath(unicodedir,"UTF-32BE.unicode"), - UInt8, 1112065*4)) - - - str1 = "∀ ε > 0, ∃ δ > 0: |x-y| < δ ⇒ |f(x)-f(y)| < ε" - str2 = UTF32String(UInt32[ - 8704, 32, 949, 32, 62, 32, 48, 44, 32, 8707, 32, - 948, 32, 62, 
32, 48, 58, 32, 124, 120, 45, 121, 124, - 32, 60, 32, 948, 32, 8658, 32, 124, 102, 40, 120, - 41, 45, 102, 40, 121, 41, 124, 32, 60, 32, 949 - ,0]) - @test str1 == str2 - - # Cleanup unicode data - for encoding in ["UTF-32BE", "UTF-32LE", "UTF-16BE", "UTF-16LE", "UTF-8"] - rm(joinpath(unicodedir,encoding*".unicode")) - end - rm(unicodedir) -end -end - # Tests of join() @test join([]) == "" @test join(["a"],"?") == "a" diff --git a/test/strings/types.jl b/test/strings/types.jl index 48991654a7a81..0a5aba56d48ad 100644 --- a/test/strings/types.jl +++ b/test/strings/types.jl @@ -163,27 +163,29 @@ rs = RevString("foobar") @test rsplit(RevString("ailuj"),'l') == ["ju","ia"] @test parse(Float64,RevString("64")) === 46.0 -# reverseind -for T in (String, UTF16String, UTF32String) - for prefix in ("", "abcd", "\U0001d6a4\U0001d4c1", "\U0001d6a4\U0001d4c1c", " \U0001d6a4\U0001d4c1") - for suffix in ("", "abcde", "\U0001d4c1β\U0001d6a4", "\U0001d4c1β\U0001d6a4c", " \U0001d4c1β\U0001d6a4") - for c in ('X', 'δ', '\U0001d6a5') - s = convert(T, string(prefix, c, suffix)) - ri = search(reverse(s), c) - @test reverse(s) == RevString(s) - @test c == s[reverseind(s, ri)] == reverse(s)[ri] - s = RevString(s) - ri = search(reverse(s), c) - @test c == s[reverseind(s, ri)] == reverse(s)[ri] - s = convert(T, string(prefix, prefix, c, suffix, suffix)) - pre = convert(T, prefix) - sb = SubString(s, nextind(pre, endof(pre)), endof(convert(T, string(prefix, prefix, c, suffix)))) - ri = search(reverse(sb), c) - @test c == sb[reverseind(sb, ri)] == reverse(sb)[ri] - end - end - end -end +# # reverseind +# for T in (String, GenericString) +# for prefix in ("", "abcd", "\U0001d6a4\U0001d4c1", "\U0001d6a4\U0001d4c1c", " \U0001d6a4\U0001d4c1") +# for suffix in ("", "abcde", "\U0001d4c1β\U0001d6a4", "\U0001d4c1β\U0001d6a4c", " \U0001d4c1β\U0001d6a4") +# for c in ('X', 'δ', '\U0001d6a5') +# @show (T,prefix,suffix,c) +# s = convert(T, string(prefix, c, suffix)) +# r = convert(T, String(reverse(s))) +# 
ri = search(r, c) +# @test r == RevString(s) +# @test c == s[reverseind(s, ri)] == r[ri] +# s = RevString(s) +# ri = search(r, c) +# @test c == s[reverseind(s, ri)] == r[ri] +# s = convert(T, string(prefix, prefix, c, suffix, suffix)) +# pre = convert(T, prefix) +# sb = SubString(s, nextind(pre, endof(pre)), endof(convert(T, string(prefix, prefix, c, suffix)))) +# ri = search(reverse(sb), c) +# @test c == sb[reverseind(sb, ri)] == reverse(sb)[ri] +# end +# end +# end +# end ## Repeat strings ## diff --git a/test/strings/util.jl b/test/strings/util.jl index d52189554f320..1aeb5b86e143d 100644 --- a/test/strings/util.jl +++ b/test/strings/util.jl @@ -21,7 +21,7 @@ for s in ("", " ", " abc", "abc ", " abc "), f in (lstrip, rstrip, strip) fs = f(s) - for T = (String, UTF16String, UTF32String) + for T = (String, GenericString) t = convert(T,s) ft = f(t) @test s == t diff --git a/test/unicode.jl b/test/unicode.jl index 21f3dd7d48fb4..18666448ef169 100644 --- a/test/unicode.jl +++ b/test/unicode.jl @@ -1,9 +1,5 @@ # This file is a part of Julia. License is MIT: http://julialang.org/license include("unicode/UnicodeError.jl") -include("unicode/types.jl") -include("unicode/checkstring.jl") include("unicode/utf8.jl") -include("unicode/utf16.jl") -include("unicode/utf32.jl") include("unicode/utf8proc.jl") diff --git a/test/unicode/utf16.jl b/test/unicode/utf16.jl deleted file mode 100644 index 1c8e31cdece98..0000000000000 --- a/test/unicode/utf16.jl +++ /dev/null @@ -1,23 +0,0 @@ -# This file is a part of Julia. 
License is MIT: http://julialang.org/license - -# UTF16 -u8 = "\U10ffff\U1d565\U1d7f6\U00066\U2008a" -u16 = utf16(u8) -@test sizeof(u16) == 18 -@test length(u16.data) == 10 && u16.data[end] == 0 -@test length(u16) == 5 -@test String(u16) == u8 -@test collect(u8) == collect(u16) -@test u8 == utf16(u16.data[1:end-1]) == utf16(copy!(Array(UInt8, 18), 1, reinterpret(UInt8, u16.data), 1, 18)) -@test u8 == utf16(pointer(u16)) == utf16(convert(Ptr{Int16}, pointer(u16))) -@test_throws UnicodeError utf16(utf32(Char(0x120000))) -@test_throws UnicodeError utf16(UInt8[1,2,3]) - -# Add tests for full coverage -@test convert(UTF16String, "test") == "test" -@test convert(UTF16String, u16) == u16 -@test convert(UTF16String, UInt16[[0x65, 0x66] [0x67, 0x68]]) == "efgh" -@test convert(UTF16String, Int16[[0x65, 0x66] [0x67, 0x68]]) == "efgh" -@test map(lowercase, utf16("TEST\U1f596")) == "test\U1f596" -@test typeof(Base.unsafe_convert(Ptr{UInt16}, utf16("test"))) == Ptr{UInt16} - diff --git a/test/unicode/utf32.jl b/test/unicode/utf32.jl deleted file mode 100644 index f4455271be054..0000000000000 --- a/test/unicode/utf32.jl +++ /dev/null @@ -1,258 +0,0 @@ -# This file is a part of Julia. 
License is MIT: http://julialang.org/license - -# UTF32 -u8 = "\U10ffff\U1d565\U1d7f6\U00066\U2008a" -u32 = utf32(u8) -@test sizeof(u32) == 20 -@test length(u32.data) == 6 && u32.data[end] == 0 -@test length(u32) == 5 -@test String(u32) == u8 -@test collect(u8) == collect(u32) -@test u8 == utf32(u32.data[1:end-1]) == utf32(copy!(Array{UInt8}(20), 1, reinterpret(UInt8, u32.data), 1, 20)) -@test u8 == utf32(pointer(u32)) == utf32(convert(Ptr{Int32}, pointer(u32))) -@test_throws UnicodeError utf32(UInt8[1,2,3]) - -# issue #11551 (#11004,#10959) -function tstcvt(strUTF8::String, strUTF16::UTF16String, strUTF32::UTF32String) - @test utf16(strUTF8) == strUTF16 - @test utf32(strUTF8) == strUTF32 - @test String(strUTF16) == strUTF8 - @test utf32(strUTF16) == strUTF32 - @test String(strUTF32) == strUTF8 - @test utf16(strUTF32) == strUTF16 -end - -# Create some ASCII, UTF8, UTF16, and UTF32 strings - -strAscii = "abcdefgh" -strA_UTF8 = ("abcdefgh\uff")[1:8] -strL_UTF8 = "abcdef\uff\uff" -str2_UTF8 = "abcd\uff\uff\u7ff\u7ff" -str3_UTF8 = "abcd\uff\uff\u7fff\u7fff" -str4_UTF8 = "abcd\uff\u7ff\u7fff\U7ffff" -strS_UTF8 = String(b"abcd\xc3\xbf\xdf\xbf\xe7\xbf\xbf\xed\xa0\x80\xed\xb0\x80") -strC_UTF8 = String(b"abcd\xc3\xbf\xdf\xbf\xe7\xbf\xbf\U10000") -strz_UTF8 = String(b"abcd\xc3\xbf\xdf\xbf\xe7\xbf\xbf\0") -strZ = b"abcd\xc3\xbf\xdf\xbf\xe7\xbf\xbf\xc0\x80" - -strA_UTF16 = utf16(strA_UTF8) -strL_UTF16 = utf16(strL_UTF8) -str2_UTF16 = utf16(str2_UTF8) -str3_UTF16 = utf16(str3_UTF8) -str4_UTF16 = utf16(str4_UTF8) -strS_UTF16 = utf16(strS_UTF8) - -strA_UTF32 = utf32(strA_UTF8) -strL_UTF32 = utf32(strL_UTF8) -str2_UTF32 = utf32(str2_UTF8) -str3_UTF32 = utf32(str3_UTF8) -str4_UTF32 = utf32(str4_UTF8) -strS_UTF32 = utf32(strS_UTF8) - -@test String(strAscii) == strAscii -@test utf16(strAscii) == strAscii -@test utf32(strAscii) == strAscii - -tstcvt(strA_UTF8,strA_UTF16,strA_UTF32) -tstcvt(strL_UTF8,strL_UTF16,strL_UTF32) -tstcvt(str2_UTF8,str2_UTF16,str2_UTF32) 
-tstcvt(str3_UTF8,str3_UTF16,str3_UTF32) -tstcvt(str4_UTF8,str4_UTF16,str4_UTF32) - -# Test converting surrogate pairs -@test utf16(strS_UTF8) == strC_UTF8 -@test utf32(strS_UTF8) == strC_UTF8 -@test String(strS_UTF16) == strC_UTF8 -@test utf32(strS_UTF16) == strC_UTF8 -@test String(strS_UTF32) == strC_UTF8 -@test utf16(strS_UTF32) == strC_UTF8 - -# Test converting overlong \0 -@test convert(String, strZ) == strz_UTF8 -@test utf16(String(strZ)) == strz_UTF8 -@test utf32(String(strZ)) == strz_UTF8 - -# Test invalid sequences - -strval(::Type{String}, dat) = dat -strval(::Union{Type{UTF16String},Type{UTF32String}}, dat) = String(dat) - -byt = 0x0 -for T in (String, UTF16String, UTF32String) - try - # Continuation byte not after lead - for byt in 0x80:0xbf - @test_throws UnicodeError convert(T, strval(T, UInt8[byt])) - end - - # Test lead bytes - for byt in 0xc0:0xff - # Single lead byte at end of string - @test_throws UnicodeError convert(T, strval(T, UInt8[byt])) - # Lead followed by non-continuation character < 0x80 - @test_throws UnicodeError convert(T, strval(T, UInt8[byt,0])) - # Lead followed by non-continuation character > 0xbf - @test_throws UnicodeError convert(T, strval(T, UInt8[byt,0xc0])) - end - - # Test overlong 2-byte - for byt in 0x81:0xbf - @test_throws UnicodeError convert(T, strval(T, UInt8[0xc0,byt])) - end - for byt in 0x80:0xbf - @test_throws UnicodeError convert(T, strval(T, UInt8[0xc1,byt])) - end - - # Test overlong 3-byte - for byt in 0x80:0x9f - @test_throws UnicodeError convert(T, strval(T, UInt8[0xe0,byt,0x80])) - end - - # Test overlong 4-byte - for byt in 0x80:0x8f - @test_throws UnicodeError convert(T, strval(T, UInt8[0xef,byt,0x80,0x80])) - end - - # Test 4-byte > 0x10ffff - for byt in 0x90:0xbf - @test_throws UnicodeError convert(T, strval(T, UInt8[0xf4,byt,0x80,0x80])) - end - for byt in 0xf5:0xf7 - @test_throws UnicodeError convert(T, strval(T, UInt8[byt,0x80,0x80,0x80])) - end - - # Test 5-byte - for byt in 0xf8:0xfb - 
@test_throws UnicodeError convert(T, strval(T, UInt8[byt,0x80,0x80,0x80,0x80])) - end - - # Test 6-byte - for byt in 0xfc:0xfd - @test_throws UnicodeError convert(T, strval(T, UInt8[byt,0x80,0x80,0x80,0x80,0x80])) - end - - # Test 7-byte - @test_throws UnicodeError convert(T, strval(T, UInt8[0xfe,0x80,0x80,0x80,0x80,0x80,0x80])) - - # Three and above byte sequences - for byt in 0xe0:0xef - # Lead followed by only 1 continuation byte - @test_throws UnicodeError convert(T, strval(T, UInt8[byt,0x80])) - # Lead ended by non-continuation character < 0x80 - @test_throws UnicodeError convert(T, strval(T, UInt8[byt,0x80,0])) - # Lead ended by non-continuation character > 0xbf - @test_throws UnicodeError convert(T, strval(T, UInt8[byt,0x80,0xc0])) - end - - # 3-byte encoded surrogate character(s) - # Single surrogate - @test_throws UnicodeError convert(T, strval(T, UInt8[0xed,0xa0,0x80])) - # Not followed by surrogate - @test_throws UnicodeError convert(T, strval(T, UInt8[0xed,0xa0,0x80,0xed,0x80,0x80])) - # Trailing surrogate first - @test_throws UnicodeError convert(T, strval(T, UInt8[0xed,0xb0,0x80,0xed,0xb0,0x80])) - # Followed by lead surrogate - @test_throws UnicodeError convert(T, strval(T, UInt8[0xed,0xa0,0x80,0xed,0xa0,0x80])) - - # Four byte sequences - for byt in 0xf0:0xf4 - # Lead followed by only 2 continuation bytes - @test_throws UnicodeError convert(T, strval(T, UInt8[byt,0x80,0x80])) - # Lead followed by non-continuation character < 0x80 - @test_throws UnicodeError convert(T, strval(T, UInt8[byt,0x80,0x80,0])) - # Lead followed by non-continuation character > 0xbf - @test_throws UnicodeError convert(T, strval(T, UInt8[byt,0x80,0x80,0xc0])) - end - catch exp - println("Error checking $T: $byt") - throw(exp) - end -end - -# 12268 -for (fun, S, T) in ((utf16, UInt16, UTF16String), (utf32, UInt32, UTF32String)) - # AbstractString - str = "abcd\0\uff\u7ff\u7fff\U7ffff" - tst = SubString(convert(T,str),4) - cmp = Char['d','\0','\uff','\u7ff','\u7fff','\U7ffff'] - 
cmp32 = UInt32['d','\0','\uff','\u7ff','\u7fff','\U7ffff','\0'] - cmp16 = UInt16[0x0064,0x0000,0x00ff,0x07ff,0x7fff,0xd9bf,0xdfff,0x0000] - x = fun(tst) - cmpx = (S == UInt16 ? cmp16 : cmp32) - @test typeof(tst) == SubString{T} - @test convert(T, tst) == str[4:end] - @test convert(Vector{Char}, x) == cmp - # Vector{T} / Array{T} - @test convert(Vector{S}, x) == cmpx - @test convert(Array{S}, x) == cmpx - # Embedded nul checking - @test Base.containsnul(x) - @test Base.containsnul(tst) - # map - @test_throws UnicodeError map(islower, x) - @test_throws ArgumentError map(islower, tst) - # SubArray conversion - subarr = view(cmp, 1:6) - @test convert(T, subarr) == str[4:end] -end - -# Char to UTF32String -@test utf32('\U7ffff') == utf32("\U7ffff") -@test convert(UTF32String, '\U7ffff') == utf32("\U7ffff") - -@test isvalid(UTF32String, Char['d','\uff','\u7ff','\u7fff','\U7ffff']) -@test reverse(utf32("abcd \uff\u7ff\u7fff\U7ffff")) == utf32("\U7ffff\u7fff\u7ff\uff dcba") - -# Test pointer() functions -let str = ascii("this ") - u8 = String(str) - u16 = utf16(str) - u32 = utf32(str) - pa = pointer(str) - p8 = pointer(u8) - p16 = pointer(u16) - p32 = pointer(u32) - @test typeof(pa) == Ptr{UInt8} - @test unsafe_load(pa,1) == 0x74 - @test typeof(p8) == Ptr{UInt8} - @test unsafe_load(p8,1) == 0x74 - @test typeof(p16) == Ptr{UInt16} - @test unsafe_load(p16,1) == 0x74 - @test typeof(p32) == Ptr{UInt32} - @test unsafe_load(p32,1) == 0x74 - pa = pointer(str, 2) - p8 = pointer(u8, 2) - p16 = pointer(u16, 2) - p32 = pointer(u32, 2) - @test typeof(pa) == Ptr{UInt8} - @test unsafe_load(pa,1) == 0x68 - @test typeof(p8) == Ptr{UInt8} - @test unsafe_load(p8,1) == 0x68 - @test typeof(p16) == Ptr{UInt16} - @test unsafe_load(p16,1) == 0x68 - @test typeof(p32) == Ptr{UInt32} - @test unsafe_load(p32,1) == 0x68 - s8 = SubString{String}(u8, 3, 5) - s16 = SubString{UTF16String}(u16, 3, 5) - s32 = SubString{UTF32String}(u32, 3, 5) - p8 = pointer(s8) - p16 = pointer(s16) - p32 = pointer(s32) - 
@test typeof(p8) == Ptr{UInt8} - @test unsafe_load(p8,1) == 0x69 - @test typeof(p16) == Ptr{UInt16} - @test unsafe_load(p16,1) == 0x69 - @test typeof(p32) == Ptr{UInt32} - @test unsafe_load(p32,1) == 0x69 - p8 = pointer(s8, 2) - p16 = pointer(s16, 2) - p32 = pointer(s32, 2) - @test typeof(p8) == Ptr{UInt8} - @test unsafe_load(p8,1) == 0x73 - @test typeof(p16) == Ptr{UInt16} - @test unsafe_load(p16,1) == 0x73 - @test typeof(p32) == Ptr{UInt32} - @test unsafe_load(p32,1) == 0x73 -end - -@test isvalid(Char['f','o','o','b','a','r']) diff --git a/test/unicode/utf8.jl b/test/unicode/utf8.jl index 8827642ba9c3a..0a4665146b697 100644 --- a/test/unicode/utf8.jl +++ b/test/unicode/utf8.jl @@ -3,9 +3,9 @@ ## Test for CESU-8 sequences let ch = 0x10000 - for hichar = 0xd800:0xdbff - for lochar = 0xdc00:0xdfff - @test convert(String, String(Char[hichar, lochar]).data) == string(Char(ch)) + for hi = 0xd800:0xdbff + for lo = 0xdc00:0xdfff + @test convert(String, String(Char[hi, lo]).data) == string(Char(ch)) ch += 1 end end diff --git a/test/unicode/utf8proc.jl b/test/unicode/utf8proc.jl index 0f1e3b0727e26..3078f05fce080 100644 --- a/test/unicode/utf8proc.jl +++ b/test/unicode/utf8proc.jl @@ -234,7 +234,7 @@ let grphtest = (("b\u0300lahβlahb\u0302láh", ["b\u0300","l","a","h", "\U1d4c1\u0300"]), ("x",["x"]), ("abc",["a","b","c"])) - for T in (String,utf16,utf32) + for T in (String,GenericString) for nf in (:NFC, :NFD) for (s, g) in grphtest s_ = T(normalize_string(s, nf)) From 33be76907eeba40a51c2e80fb074057aadf26ece Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Sat, 2 Jul 2016 23:12:39 +0200 Subject: [PATCH 2/7] Add reverseind() for AbstractString, re-enable tests with GenericString reverse() for GenericString/AbstractString returns a RevString, whose indexing behavior is very different from a reverse()'d String which is returned for String. Thus, calling reverseind() on the underlying String object is not correct for GenericString. 
Add a generic but O(n) method for AbstractString and use it for GenericString. --- base/strings/string.jl | 4 +--- base/strings/types.jl | 7 +++---- base/test.jl | 1 - test/strings/types.jl | 47 +++++++++++++++++++++--------------------- 4 files changed, 28 insertions(+), 31 deletions(-) diff --git a/base/strings/string.jl b/base/strings/string.jl index c9637b746ef91..6395c03eac5c7 100644 --- a/base/strings/string.jl +++ b/base/strings/string.jl @@ -104,7 +104,7 @@ function first_utf8_byte(ch::Char) end function reverseind(s::String, i::Integer) - j = lastidx(s) + 1 - i + j = length(s.data) + 1 - i d = s.data while is_valid_continuation(d[j]) j -= 1 @@ -116,8 +116,6 @@ end sizeof(s::String) = sizeof(s.data) -lastidx(s::String) = length(s.data) - isvalid(s::String, i::Integer) = (1 <= i <= endof(s.data)) && !is_valid_continuation(s.data[i]) diff --git a/base/strings/types.jl b/base/strings/types.jl index fe3470bac5721..b98ea229f57ab 100644 --- a/base/strings/types.jl +++ b/base/strings/types.jl @@ -118,12 +118,11 @@ reverse(s::RevString) = s.string ## reverse an index i so that reverse(s)[i] == s[reverseind(s,i)] +reverseind(s::AbstractString, i) = chr2ind(s, length(s) + 1 - ind2chr(reverse(s), i)) reverseind(s::Union{DirectIndexString,SubString{DirectIndexString}}, i::Integer) = length(s) + 1 - i reverseind(s::RevString, i::Integer) = endof(s) - i + 1 -lastidx(s::AbstractString) = nextind(s, endof(s)) - 1 -lastidx(s::DirectIndexString) = length(s) -reverseind(s::SubString, i::Integer) = - reverseind(s.string, lastidx(s.string)-s.offset-s.endof+i) - s.offset +reverseind(s::SubString{String}, i::Integer) = + reverseind(s.string, nextind(s.string, endof(s.string))-s.offset-s.endof+i-1) - s.offset ## efficient representation of repeated strings ## diff --git a/base/test.jl b/base/test.jl index f9790636f6906..b0382f3284f5c 100644 --- a/base/test.jl +++ b/base/test.jl @@ -1009,6 +1009,5 @@ end Base.convert(::Type{GenericString}, s::AbstractString) = GenericString(s) 
Base.endof(s::GenericString) = endof(s.string) Base.next(s::GenericString, i::Int) = next(s.string, i) -Base.reverseind(s::GenericString, i::Integer) = reverseind(s.string, i) end # module diff --git a/test/strings/types.jl b/test/strings/types.jl index 0a5aba56d48ad..10e8ca406682b 100644 --- a/test/strings/types.jl +++ b/test/strings/types.jl @@ -163,29 +163,30 @@ rs = RevString("foobar") @test rsplit(RevString("ailuj"),'l') == ["ju","ia"] @test parse(Float64,RevString("64")) === 46.0 -# # reverseind -# for T in (String, GenericString) -# for prefix in ("", "abcd", "\U0001d6a4\U0001d4c1", "\U0001d6a4\U0001d4c1c", " \U0001d6a4\U0001d4c1") -# for suffix in ("", "abcde", "\U0001d4c1β\U0001d6a4", "\U0001d4c1β\U0001d6a4c", " \U0001d4c1β\U0001d6a4") -# for c in ('X', 'δ', '\U0001d6a5') -# @show (T,prefix,suffix,c) -# s = convert(T, string(prefix, c, suffix)) -# r = convert(T, String(reverse(s))) -# ri = search(r, c) -# @test r == RevString(s) -# @test c == s[reverseind(s, ri)] == r[ri] -# s = RevString(s) -# ri = search(r, c) -# @test c == s[reverseind(s, ri)] == r[ri] -# s = convert(T, string(prefix, prefix, c, suffix, suffix)) -# pre = convert(T, prefix) -# sb = SubString(s, nextind(pre, endof(pre)), endof(convert(T, string(prefix, prefix, c, suffix)))) -# ri = search(reverse(sb), c) -# @test c == sb[reverseind(sb, ri)] == reverse(sb)[ri] -# end -# end -# end -# end +# reverseind +for T in (String, GenericString) + for prefix in ("", "abcd", "\U0001d6a4\U0001d4c1", "\U0001d6a4\U0001d4c1c", " \U0001d6a4\U0001d4c1") + for suffix in ("", "abcde", "\U0001d4c1β\U0001d6a4", "\U0001d4c1β\U0001d6a4c", " \U0001d4c1β\U0001d6a4") + for c in ('X', 'δ', '\U0001d6a5') + s = convert(T, string(prefix, c, suffix)) + r = reverse(s) + ri = search(r, c) + @test r == RevString(s) + @test c == s[reverseind(s, ri)] == r[ri] + s = RevString(s) + r = reverse(s) + ri = search(r, c) + @test c == s[reverseind(s, ri)] == r[ri] + s = convert(T, string(prefix, prefix, c, suffix, suffix)) + pre = 
convert(T, prefix) + sb = SubString(s, nextind(pre, endof(pre)), endof(convert(T, string(prefix, prefix, c, suffix)))) + r = reverse(sb) + ri = search(r, c) + @test c == sb[reverseind(sb, ri)] == r[ri] + end + end + end +end ## Repeat strings ## From 593c5de621cf39b930073415dd6c0d66d233baf5 Mon Sep 17 00:00:00 2001 From: Tony Kelman Date: Tue, 5 Jul 2016 20:24:24 -0700 Subject: [PATCH 3/7] Delete the now-unused UTF_ERR constants --- base/strings/errors.jl | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/base/strings/errors.jl b/base/strings/errors.jl index b1b694df7fdf2..c0ca38ff28db2 100644 --- a/base/strings/errors.jl +++ b/base/strings/errors.jl @@ -3,23 +3,7 @@ ## Error messages for Unicode / UTF support const UTF_ERR_SHORT = "invalid UTF-8 sequence starting at index <<1>> (0x<<2>> missing one or more continuation bytes)" -const UTF_ERR_CONT = "invalid UTF-8 sequence starting at index <<1>> (0x<<2>> is not a continuation byte)" -const UTF_ERR_LONG = "invalid UTF-8 sequence, overlong encoding starting at index <<1>> (0x<<2>>)" -const UTF_ERR_NOT_LEAD = "not a leading Unicode surrogate code unit at index <<1>> (0x<<2>>)" -const UTF_ERR_NOT_TRAIL = "not a trailing Unicode surrogate code unit at index <<1>> (0x<<2>>)" -const UTF_ERR_NOT_SURROGATE = "not a valid Unicode surrogate code unit at index <<1>> (0x<<2>>)" -const UTF_ERR_MISSING_SURROGATE = "missing trailing Unicode surrogate code unit after index <<1>> (0x<<2>>)" -const UTF_ERR_INVALID = "invalid Unicode character starting at index <<1>> (0x<<2>> > 0x10ffff)" -const UTF_ERR_SURROGATE = "surrogate encoding not allowed in UTF-8 or UTF-32, at index <<1>> (0x<<2>>)" -const UTF_ERR_NULL_16_TERMINATE = "UTF16String data must be NULL-terminated" -const UTF_ERR_NULL_32_TERMINATE = "UTF32String data must be NULL-terminated" -const UTF_ERR_ODD_BYTES_16 = "UTF16String can't have odd number of bytes <<1>>" -const UTF_ERR_ODD_BYTES_32 = "UTF32String must have multiple of 4 bytes <<1>>" -const 
UTF_ERR_INVALID_CHAR = "invalid Unicode character (0x<<2>> > 0x10ffff)" -const UTF_ERR_INVALID_8 = "invalid UTF-8 data" -const UTF_ERR_INVALID_16 = "invalid UTF-16 data" const UTF_ERR_INVALID_INDEX = "invalid character index" -const UTF_ERR_MAP_CHAR = "map(f,s::AbstractString) requires f to return Char; try map(f,collect(s)) or a comprehension instead" type UnicodeError <: Exception errmsg::AbstractString ##< A UTF_ERR_ message From 1c2e5b555ea5c94d7d883a09d00e485fa8fbd4e9 Mon Sep 17 00:00:00 2001 From: Tony Kelman Date: Tue, 5 Jul 2016 20:31:31 -0700 Subject: [PATCH 4/7] Doc update for utf16 and utf32 removal --- base/docs/helpdb/Base.jl | 12 +++++------- doc/manual/strings.rst | 9 +++------ doc/stdlib/strings.rst | 33 ++------------------------------- 3 files changed, 10 insertions(+), 44 deletions(-) diff --git a/base/docs/helpdb/Base.jl b/base/docs/helpdb/Base.jl index f8af1543b5d68..a45a26f0c5225 100644 --- a/base/docs/helpdb/Base.jl +++ b/base/docs/helpdb/Base.jl @@ -8856,19 +8856,17 @@ vecnorm """ isvalid(value) -> Bool -Returns `true` if the given value is valid for its type, which currently can be one of -`Char`, `String`, `UTF16String`, or `UTF32String`. +Returns `true` if the given value is valid for its type, which currently can be either +`Char` or `String`. """ isvalid(value) """ isvalid(T, value) -> Bool -Returns `true` if the given value is valid for that type. Types currently can be `Char`, -`String`, `UTF16String`, or `UTF32String` Values for `Char` can be of -type `Char` or `UInt32` Values for `String` can be of that type, or -`Vector{UInt8}` Values for `UTF16String` can be `UTF16String` or `Vector{UInt16}` Values for -`UTF32String` can be `UTF32String`, `Vector{Char}` or `Vector{UInt32}` +Returns `true` if the given value is valid for that type. Types currently can +be either `Char` or `String`. Values for `Char` can be of type `Char` or `UInt32`. +Values for `String` can be of that type, or `Vector{UInt8}`. 
""" isvalid(T,value) diff --git a/doc/manual/strings.rst b/doc/manual/strings.rst index 36180463ba46a..45e556f45f99a 100644 --- a/doc/manual/strings.rst +++ b/doc/manual/strings.rst @@ -349,12 +349,9 @@ exception handling required: y -UTF-8 is not the only encoding that Julia supports, and adding support -for new encodings is quite easy. In particular, Julia also provides -:obj:`UTF16String` and :obj:`UTF32String` types, constructed by -:func:`utf16` and :func:`utf32` respectively, for UTF-16 and -UTF-32 encodings. Additional discussion of other encodings and how to -implement support for them is beyond the scope of this document for +Julia uses UTF-8 encoding by default, and support for new encodings can +be added by packages. Additional discussion of other encodings and how +to implement support for them is beyond the scope of this document for the time being. For further discussion of UTF-8 encoding issues, see the section below on `byte array literals <#Byte+Array+Literals>`_, which goes into some greater detail. diff --git a/doc/stdlib/strings.rst b/doc/stdlib/strings.rst index 0817c94f0fa56..40ec3e2d69347 100644 --- a/doc/stdlib/strings.rst +++ b/doc/stdlib/strings.rst @@ -143,13 +143,13 @@ .. Docstring generated from Julia source - Returns ``true`` if the given value is valid for its type, which currently can be one of ``Char``\ , ``String``\ , ``UTF16String``\ , or ``UTF32String``\ . + Returns ``true`` if the given value is valid for its type, which currently can be either ``Char`` or ``String``\ . .. function:: isvalid(T, value) -> Bool .. Docstring generated from Julia source - Returns ``true`` if the given value is valid for that type. 
Types currently can be ``Char``\ , ``String``\ , ``UTF16String``\ , or ``UTF32String`` Values for ``Char`` can be of type ``Char`` or ``UInt32`` Values for ``String`` can be of that type, or ``Vector{UInt8}`` Values for ``UTF16String`` can be ``UTF16String`` or ``Vector{UInt16}`` Values for ``UTF32String`` can be ``UTF32String``\ , ``Vector{Char}`` or ``Vector{UInt32}`` + Returns ``true`` if the given value is valid for that type. Types currently can be either ``Char`` or ``String``\ . Values for ``Char`` can be of type ``Char`` or ``UInt32``\ . Values for ``String`` can be of that type, or ``Vector{UInt8}``\ . .. function:: isvalid(str, i) @@ -472,32 +472,3 @@ .. Docstring generated from Julia source General unescaping of traditional C and Unicode escape sequences. Reverse of :func:`escape_string`\ . See also :func:`unescape_string`\ . - -.. function:: utf16(s) - - .. Docstring generated from Julia source - - Create a UTF-16 string from a byte array, array of ``UInt16``\ , or any other string type. (Data must be valid UTF-16. Conversions of byte arrays check for a byte-order marker in the first two bytes, and do not include it in the resulting string.) - - Note that the resulting ``UTF16String`` data is terminated by the NUL codepoint (16-bit zero), which is not treated as a character in the string (so that it is mostly invisible in Julia); this allows the string to be passed directly to external functions requiring NUL-terminated data. This NUL is appended automatically by the ``utf16(s)`` conversion function. If you have a ``UInt16`` array ``A`` that is already NUL-terminated valid UTF-16 data, then you can instead use ``UTF16String(A)`` to construct the string without making a copy of the data and treating the NUL as a terminator rather than as part of the string. - -.. function:: utf16(::Union{Ptr{UInt16},Ptr{Int16}} [, length]) - - .. Docstring generated from Julia source - - Create a string from the address of a NUL-terminated UTF-16 string. 
A copy is made; the pointer can be safely freed. If ``length`` is specified, the string does not have to be NUL-terminated. - -.. function:: utf32(s) - - .. Docstring generated from Julia source - - Create a UTF-32 string from a byte array, array of ``Char`` or ``UInt32``\ , or any other string type. (Conversions of byte arrays check for a byte-order marker in the first four bytes, and do not include it in the resulting string.) - - Note that the resulting ``UTF32String`` data is terminated by the NUL codepoint (32-bit zero), which is not treated as a character in the string (so that it is mostly invisible in Julia); this allows the string to be passed directly to external functions requiring NUL-terminated data. This NUL is appended automatically by the ``utf32(s)`` conversion function. If you have a ``Char`` or ``UInt32`` array ``A`` that is already NUL-terminated UTF-32 data, then you can instead use ``UTF32String(A)`` to construct the string without making a copy of the data and treating the NUL as a terminator rather than as part of the string. - -.. function:: utf32(::Union{Ptr{Char},Ptr{UInt32},Ptr{Int32}} [, length]) - - .. Docstring generated from Julia source - - Create a string from the address of a NUL-terminated UTF-32 string. A copy is made; the pointer can be safely freed. If ``length`` is specified, the string does not have to be NUL-terminated. 
- From d7b8361754863e5a7c6bb907b0e14eeb24465b19 Mon Sep 17 00:00:00 2001 From: Tony Kelman Date: Tue, 5 Jul 2016 20:21:50 -0700 Subject: [PATCH 5/7] Delete test/unicode/types.jl and test/unicode/checkstring.jl since their code in base has been removed --- test/unicode/checkstring.jl | 175 ------------------------------------ test/unicode/types.jl | 11 --- 2 files changed, 186 deletions(-) delete mode 100644 test/unicode/checkstring.jl delete mode 100644 test/unicode/types.jl diff --git a/test/unicode/checkstring.jl b/test/unicode/checkstring.jl deleted file mode 100644 index 8fd9c81554b7a..0000000000000 --- a/test/unicode/checkstring.jl +++ /dev/null @@ -1,175 +0,0 @@ -# This file is a part of Julia. License is MIT: http://julialang.org/license - -# 11575 -# Test invalid sequences - -byt = 0x0 # Needs to be defined outside the try block! -try - # Continuation byte not after lead - for byt in 0x80:0xbf - @test_throws UnicodeError Base.checkstring(UInt8[byt]) - end - - # Test lead bytes - for byt in 0xc0:0xff - # Single lead byte at end of string - @test_throws UnicodeError Base.checkstring(UInt8[byt]) - # Lead followed by non-continuation character < 0x80 - @test_throws UnicodeError Base.checkstring(UInt8[byt,0]) - # Lead followed by non-continuation character > 0xbf - @test_throws UnicodeError Base.checkstring(UInt8[byt,0xc0]) - end - - # Test overlong 2-byte - for byt in 0x81:0xbf - @test_throws UnicodeError Base.checkstring(UInt8[0xc0,byt]) - end - for byt in 0x80:0xbf - @test_throws UnicodeError Base.checkstring(UInt8[0xc1,byt]) - end - - # Test overlong 3-byte - for byt in 0x80:0x9f - @test_throws UnicodeError Base.checkstring(UInt8[0xe0,byt,0x80]) - end - - # Test overlong 4-byte - for byt in 0x80:0x8f - @test_throws UnicodeError Base.checkstring(UInt8[0xef,byt,0x80,0x80]) - end - - # Test 4-byte > 0x10ffff - for byt in 0x90:0xbf - @test_throws UnicodeError Base.checkstring(UInt8[0xf4,byt,0x80,0x80]) - end - for byt in 0xf5:0xf7 - @test_throws UnicodeError 
Base.checkstring(UInt8[byt,0x80,0x80,0x80]) - end - - # Test 5-byte - for byt in 0xf8:0xfb - @test_throws UnicodeError Base.checkstring(UInt8[byt,0x80,0x80,0x80,0x80]) - end - - # Test 6-byte - for byt in 0xfc:0xfd - @test_throws UnicodeError Base.checkstring(UInt8[byt,0x80,0x80,0x80,0x80,0x80]) - end - - # Test 7-byte - @test_throws UnicodeError Base.checkstring(UInt8[0xfe,0x80,0x80,0x80,0x80,0x80,0x80]) - - # Three and above byte sequences - for byt in 0xe0:0xef - # Lead followed by only 1 continuation byte - @test_throws UnicodeError Base.checkstring(UInt8[byt,0x80]) - # Lead ended by non-continuation character < 0x80 - @test_throws UnicodeError Base.checkstring(UInt8[byt,0x80,0]) - # Lead ended by non-continuation character > 0xbf - @test_throws UnicodeError Base.checkstring(UInt8[byt,0x80,0xc0]) - end - - # 3-byte encoded surrogate character(s) - # Single surrogate - @test_throws UnicodeError Base.checkstring(UInt8[0xed,0xa0,0x80]) - # Not followed by surrogate - @test_throws UnicodeError Base.checkstring(UInt8[0xed,0xa0,0x80,0xed,0x80,0x80]) - # Trailing surrogate first - @test_throws UnicodeError Base.checkstring(UInt8[0xed,0xb0,0x80,0xed,0xb0,0x80]) - # Followed by lead surrogate - @test_throws UnicodeError Base.checkstring(UInt8[0xed,0xa0,0x80,0xed,0xa0,0x80]) - - # Four byte sequences - for byt in 0xf0:0xf4 - # Lead followed by only 2 continuation bytes - @test_throws UnicodeError Base.checkstring(UInt8[byt,0x80,0x80]) - # Lead followed by non-continuation character < 0x80 - @test_throws UnicodeError Base.checkstring(UInt8[byt,0x80,0x80,0]) - # Lead followed by non-continuation character > 0xbf - @test_throws UnicodeError Base.checkstring(UInt8[byt,0x80,0x80,0xc0]) - end - - # Long encoding of 0x01 - @test_throws UnicodeError convert(String, b"\xf0\x80\x80\x80") - # Test ends of long encoded surrogates - @test_throws UnicodeError convert(String, b"\xf0\x8d\xa0\x80") - @test_throws UnicodeError convert(String, b"\xf0\x8d\xbf\xbf") - # Long encodings - 
@test_throws UnicodeError Base.checkstring(b"\xf0\x80\x80\x80") - @test Base.checkstring(b"\xc0\x81"; accept_long_char=true) == (1,0x1,0,0,0) - @test Base.checkstring(b"\xf0\x80\x80\x80"; accept_long_char=true) == (1,0x1,0,0,0) -catch exp; - println("Error testing checkstring: $byt, $exp") - throw(exp) -end - -# Surrogates -@test_throws UnicodeError Base.checkstring(UInt16[0xd800]) -@test_throws UnicodeError Base.checkstring(UInt16[0xdc00]) -@test_throws UnicodeError Base.checkstring(UInt16[0xdc00,0xd800]) - -# Surrogates in UTF-32 -@test_throws UnicodeError Base.checkstring(UInt32[0xd800]) -@test_throws UnicodeError Base.checkstring(UInt32[0xdc00]) -@test_throws UnicodeError Base.checkstring(UInt32[0xdc00,0xd800]) - -# Characters > 0x10ffff -@test_throws UnicodeError Base.checkstring(UInt32[0x110000]) - -# Test starting and different position -@test Base.checkstring(UInt32[0x110000, 0x1f596], 2) == (1,0x10,1,0,0) - -# Test valid sequences -for (seq, res) in ( - (UInt8[0x0], (1,0,0,0,0)), # Nul byte, beginning of ASCII range - (UInt8[0x7f], (1,0,0,0,0)), # End of ASCII range - (UInt8[0xc0,0x80], (1,1,0,0,0)), # Long encoded Nul byte (Modified UTF-8, Java) - (UInt8[0xc2,0x80], (1,2,0,0,1)), # \u80, beginning of Latin1 range - (UInt8[0xc3,0xbf], (1,2,0,0,1)), # \uff, end of Latin1 range - (UInt8[0xc4,0x80], (1,4,0,0,1)), # \u100, beginning of non-Latin1 2-byte range - (UInt8[0xdf,0xbf], (1,4,0,0,1)), # \u7ff, end of non-Latin1 2-byte range - (UInt8[0xe0,0xa0,0x80], (1,8,0,1,0)), # \u800, beginning of 3-byte range - (UInt8[0xed,0x9f,0xbf], (1,8,0,1,0)), # \ud7ff, end of first part of 3-byte range - (UInt8[0xee,0x80,0x80], (1,8,0,1,0)), # \ue000, beginning of second part of 3-byte range - (UInt8[0xef,0xbf,0xbf], (1,8,0,1,0)), # \uffff, end of 3-byte range - (UInt8[0xf0,0x90,0x80,0x80],(1,16,1,0,0)), # \U10000, beginning of 4-byte range - (UInt8[0xf4,0x8f,0xbf,0xbf],(1,16,1,0,0)), # \U10ffff, end of 4-byte range - (UInt8[0xed,0xa0,0x80,0xed,0xb0,0x80], (1,0x30,1,0,0)), 
# Overlong \U10000, (CESU-8) - (UInt8[0xed,0xaf,0xbf,0xed,0xbf,0xbf], (1,0x30,1,0,0)), # Overlong \U10ffff, (CESU-8) - (UInt16[0x0000], (1,0,0,0,0)), # Nul byte, beginning of ASCII range - (UInt16[0x007f], (1,0,0,0,0)), # End of ASCII range - (UInt16[0x0080], (1,2,0,0,1)), # Beginning of Latin1 range - (UInt16[0x00ff], (1,2,0,0,1)), # End of Latin1 range - (UInt16[0x0100], (1,4,0,0,1)), # Beginning of non-Latin1 2-byte range - (UInt16[0x07ff], (1,4,0,0,1)), # End of non-Latin1 2-byte range - (UInt16[0x0800], (1,8,0,1,0)), # Beginning of 3-byte range - (UInt16[0xd7ff], (1,8,0,1,0)), # End of first part of 3-byte range - (UInt16[0xe000], (1,8,0,1,0)), # Beginning of second part of 3-byte range - (UInt16[0xffff], (1,8,0,1,0)), # End of 3-byte range - (UInt16[0xd800,0xdc00], (1,16,1,0,0)), # \U10000, beginning of 4-byte range - (UInt16[0xdbff,0xdfff], (1,16,1,0,0)), # \U10ffff, end of 4-byte range - (UInt32[0x0000], (1,0,0,0,0)), # Nul byte, beginning of ASCII range - (UInt32[0x007f], (1,0,0,0,0)), # End of ASCII range - (UInt32[0x0080], (1,2,0,0,1)), # Beginning of Latin1 range - (UInt32[0x00ff], (1,2,0,0,1)), # End of Latin1 range - (UInt32[0x0100], (1,4,0,0,1)), # Beginning of non-Latin1 2-byte range - (UInt32[0x07ff], (1,4,0,0,1)), # End of non-Latin1 2-byte range - (UInt32[0x0800], (1,8,0,1,0)), # Beginning of 3-byte range - (UInt32[0xd7ff], (1,8,0,1,0)), # End of first part of 3-byte range - (UInt32[0xe000], (1,8,0,1,0)), # Beginning of second part of 3-byte range - (UInt32[0xffff], (1,8,0,1,0)), # End of 3-byte range - (UInt32[0x10000], (1,16,1,0,0)), # \U10000, beginning of 4-byte range - (UInt32[0x10ffff], (1,16,1,0,0)), # \U10ffff, end of 4-byte range - (UInt32[0xd800,0xdc00], (1,0x30,1,0,0)),# Overlong \U10000, (CESU-8) - (UInt32[0xdbff,0xdfff], (1,0x30,1,0,0)))# Overlong \U10ffff, (CESU-8) - @test Base.checkstring(seq) == res -end - -# Test bounds checking -@test_throws BoundsError Base.checkstring(b"abcdef", -10) -@test_throws BoundsError 
Base.checkstring(b"abcdef", 0) -@test_throws BoundsError Base.checkstring(b"abcdef", 7) -@test_throws BoundsError Base.checkstring(b"abcdef", 3, -10) -@test_throws BoundsError Base.checkstring(b"abcdef", 3, 0) -@test_throws BoundsError Base.checkstring(b"abcdef", 3, 7) -@test_throws ArgumentError Base.checkstring(b"abcdef", 3, 1) diff --git a/test/unicode/types.jl b/test/unicode/types.jl deleted file mode 100644 index 919de34cc1786..0000000000000 --- a/test/unicode/types.jl +++ /dev/null @@ -1,11 +0,0 @@ -# This file is a part of Julia. License is MIT: http://julialang.org/license - -nullstring16 = UInt16[] -badstring16 = UInt16[0x0065] -@test_throws UnicodeError UTF16String(nullstring16) -@test_throws UnicodeError UTF16String(badstring16) - -nullstring32 = UInt32[] -badstring32 = UInt32['a'] -@test_throws UnicodeError UTF32String(nullstring32) -@test_throws UnicodeError UTF32String(badstring32) From 145dd586a91d7f6a6ed0f197e543069ca4bf7f17 Mon Sep 17 00:00:00 2001 From: Tony Kelman Date: Tue, 5 Jul 2016 20:41:30 -0700 Subject: [PATCH 6/7] One more doc update about utf-16 etc in calling-c-and-fortran-code --- doc/manual/calling-c-and-fortran-code.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/manual/calling-c-and-fortran-code.rst b/doc/manual/calling-c-and-fortran-code.rst index 84e3b35ae4010..cc4fdea8a935e 100644 --- a/doc/manual/calling-c-and-fortran-code.rst +++ b/doc/manual/calling-c-and-fortran-code.rst @@ -487,10 +487,10 @@ C name Standard Julia Alias Julia Base Type For ``wchar_t*`` arguments, the Julia type should be ``Cwstring`` (if the C routine expects a NUL-terminated string) or ``Ptr{Cwchar_t}`` otherwise. Note - also that ASCII, UTF-8, UTF-16, and UTF-32 string data in Julia is internally - NUL-terminated, so it can be passed to C functions expecting NUL-terminated - data without making a copy (but using the ``Cwstring`` type will cause an - error to be thrown if the string itself contains NUL characters). 
+ also that UTF-8 string data in Julia is internally NUL-terminated, so it can + be passed to C functions expecting NUL-terminated data without making a copy + (but using the ``Cwstring`` type will cause an error to be thrown if the string + itself contains NUL characters). .. note:: From 3098e6cd15f9d271ef3d70612e026c4673192600 Mon Sep 17 00:00:00 2001 From: Tony Kelman Date: Thu, 7 Jul 2016 10:24:25 -0700 Subject: [PATCH 7/7] Add link to LegacyStrings.jl in doc/manual/strings.rst --- doc/manual/strings.rst | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/doc/manual/strings.rst b/doc/manual/strings.rst index 45e556f45f99a..a5a89f554425c 100644 --- a/doc/manual/strings.rst +++ b/doc/manual/strings.rst @@ -54,9 +54,8 @@ There are a few noteworthy high-level features about Julia's strings: strings. - Julia supports the full range of `Unicode <https://en.wikipedia.org/wiki/Unicode>`_ characters: literal - strings are always `ASCII <https://en.wikipedia.org/wiki/ASCII>`_ or - `UTF-8 <https://en.wikipedia.org/wiki/UTF-8>`_ but other encodings for - strings from external sources can be supported. + strings are always `UTF-8 <https://en.wikipedia.org/wiki/UTF-8>`_ but + other encodings for strings from external sources can be supported. .. _man-characters: @@ -272,8 +271,8 @@ string literals: Whether these Unicode characters are displayed as escapes or shown as special characters depends on your terminal's locale settings and its -support for Unicode. Non-ASCII string literals are encoded using the -UTF-8 encoding. UTF-8 is a variable-width encoding, meaning that not all +support for Unicode. String literals are encoded using the UTF-8 +encoding. UTF-8 is a variable-width encoding, meaning that not all characters are encoded in the same number of bytes. In UTF-8, ASCII characters — i.e. those with code points less than 0x80 (128) — are encoded as they are in ASCII, using a single byte, while code points @@ -317,11 +316,11 @@ inefficient and verbose way to iterate through the characters of ``s``: .. 
doctest:: julia> for i = 1:endof(s) - try - println(s[i]) - catch - # ignore the index error - end + try + println(s[i]) + catch + # ignore the index error + end end ∀ @@ -339,7 +338,7 @@ exception handling required: .. doctest:: julia> for c in s - println(c) + println(c) end ∀ @@ -350,10 +349,12 @@ exception handling required: y Julia uses UTF-8 encoding by default, and support for new encodings can -be added by packages. Additional discussion of other encodings and how -to implement support for them is beyond the scope of this document for -the time being. For further discussion of UTF-8 encoding issues, see -the section below on `byte array literals <#Byte+Array+Literals>`_, +be added by packages. For example, the `LegacyStrings.jl +<https://github.com/JuliaArchive/LegacyStrings.jl>`_ package implements +``UTF16String`` and ``UTF32String`` types. Additional discussion of other +encodings and how to implement support for them is beyond the scope of this +document for the time being. For further discussion of UTF-8 encoding issues, +see the section below on `byte array literals <#Byte+Array+Literals>`_, which goes into some greater detail. .. _man-string-interpolation: @@ -903,10 +904,9 @@ encodings. If this is all extremely confusing, try reading `"The Absolute Minimum Every Software Developer Absolutely, Positively Must Know About Unicode -and Character -Sets" <http://www.joelonsoftware.com/articles/Unicode.html>`_. It's an -excellent introduction to Unicode and UTF-8, and may help alleviate some -confusion regarding the matter. +and Character Sets" <http://www.joelonsoftware.com/articles/Unicode.html>`_. +It's an excellent introduction to Unicode and UTF-8, and may help alleviate +some confusion regarding the matter. .. _man-version-number-literals: