-
-
Notifications
You must be signed in to change notification settings - Fork 5.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
remove UTF-16 and UTF-32 stuff #16590
Changes from all commits
3ad3784
33be769
593c5de
1c2e5b5
d7b8361
145dd58
3098e6c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -21,8 +21,7 @@ const TAGS = Any[ | |
Symbol, Tuple, Expr, # dummy entries, intentionally shadowed by earlier ones | ||
LineNumberNode, Slot, LabelNode, GotoNode, | ||
QuoteNode, :reserved23 #=was TopNode=#, TypeVar, Core.Box, LambdaInfo, | ||
Module, #=UndefRefTag=#Symbol, Task, String, | ||
UTF16String, UTF32String, Float16, | ||
Module, #=UndefRefTag=#Symbol, Task, String, Float16, | ||
SimpleVector, #=BackrefTag=#Symbol, Method, GlobalRef, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. add There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please make it so by pushing the appropriate change to the branch, @vtjnash. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. done. looks good to merge to me. |
||
|
||
(), Bool, Any, :Any, Bottom, :reserved21, :reserved22, Type, | ||
|
@@ -42,7 +41,7 @@ const TAGS = Any[ | |
28, 29, 30, 31, 32 | ||
] | ||
|
||
const ser_version = 3 # do not make changes without bumping the version #! | ||
const ser_version = 4 # do not make changes without bumping the version #! | ||
|
||
const NTAGS = length(TAGS) | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -33,6 +33,8 @@ const utf8_trailing = [ | |
|
||
## required core functionality ## | ||
|
||
is_valid_continuation(c) = ((c & 0xc0) == 0x80) | ||
|
||
function endof(s::String) | ||
d = s.data | ||
i = length(d) | ||
|
@@ -102,7 +104,7 @@ function first_utf8_byte(ch::Char) | |
end | ||
|
||
function reverseind(s::String, i::Integer) | ||
j = lastidx(s) + 1 - i | ||
j = length(s.data) + 1 - i | ||
d = s.data | ||
while is_valid_continuation(d[j]) | ||
j -= 1 | ||
|
@@ -114,8 +116,6 @@ end | |
|
||
sizeof(s::String) = sizeof(s.data) | ||
|
||
lastidx(s::String) = length(s.data) | ||
|
||
isvalid(s::String, i::Integer) = | ||
(1 <= i <= endof(s.data)) && !is_valid_continuation(s.data[i]) | ||
|
||
|
@@ -239,109 +239,10 @@ function reverse(s::String) | |
String(buf) | ||
end | ||
|
||
## outputting UTF-8 strings ## | ||
|
||
write(io::IO, s::String) = write(io, s.data) | ||
|
||
pointer(x::String) = pointer(x.data) | ||
pointer(x::String, i::Integer) = pointer(x.data)+(i-1) | ||
|
||
## transcoding to UTF-8 ## | ||
|
||
convert(::Type{String}, s::String) = s | ||
|
||
function convert(::Type{String}, dat::Vector{UInt8}) | ||
# handle zero length string quickly | ||
isempty(dat) && return empty_utf8 | ||
# get number of bytes to allocate | ||
len, flags, num4byte, num3byte, num2byte = unsafe_checkstring(dat) | ||
if (flags & (UTF_LONG | UTF_SURROGATE)) == 0 | ||
len = sizeof(dat) | ||
@inbounds return String(copy!(Vector{UInt8}(len), 1, dat, 1, len)) | ||
end | ||
# Copy, but eliminate over-long encodings and surrogate pairs | ||
len += num2byte + num3byte*2 + num4byte*3 | ||
buf = Vector{UInt8}(len) | ||
out = 0 | ||
pos = 0 | ||
@inbounds while out < len | ||
ch::UInt32 = dat[pos += 1] | ||
# Handle ASCII characters | ||
if ch <= 0x7f | ||
buf[out += 1] = ch | ||
# Handle overlong < 0x100 | ||
elseif ch < 0xc2 | ||
buf[out += 1] = ((ch & 3) << 6) | (dat[pos += 1] & 0x3f) | ||
# Handle 0x100-0x7ff | ||
elseif ch < 0xe0 | ||
buf[out += 1] = ch | ||
buf[out += 1] = dat[pos += 1] | ||
elseif ch != 0xed | ||
buf[out += 1] = ch | ||
buf[out += 1] = dat[pos += 1] | ||
buf[out += 1] = dat[pos += 1] | ||
# Copy 4-byte encoded value | ||
ch >= 0xf0 && (buf[out += 1] = dat[pos += 1]) | ||
# Handle surrogate pairs | ||
else | ||
ch = dat[pos += 1] | ||
if ch < 0xa0 # not surrogate pairs | ||
buf[out += 1] = 0xed | ||
buf[out += 1] = ch | ||
buf[out += 1] = dat[pos += 1] | ||
else | ||
# Pick up surrogate pairs (CESU-8 format) | ||
ch = ((((((ch & 0x3f) << 6) | (dat[pos + 1] & 0x3f)) << 10) | ||
+ (((dat[pos + 3] & 0x3f)%UInt32 << 6) | (dat[pos + 4] & 0x3f))) | ||
- 0x01f0c00) | ||
pos += 4 | ||
output_utf8_4byte!(buf, out, ch) | ||
out += 4 | ||
end | ||
end | ||
end | ||
String(buf) | ||
end | ||
|
||
""" | ||
Converts an already validated vector of `UInt16` or `UInt32` to a `String` | ||
|
||
Input Arguments: | ||
|
||
* `dat` Vector of code units (`UInt16` or `UInt32`), explicit `\0` is not converted | ||
* `len` length of output in bytes | ||
|
||
Returns: | ||
|
||
* `String` | ||
""" | ||
function encode_to_utf8{T<:Union{UInt16, UInt32}}(::Type{T}, dat, len) | ||
buf = Vector{UInt8}(len) | ||
out = 0 | ||
pos = 0 | ||
@inbounds while out < len | ||
ch::UInt32 = dat[pos += 1] | ||
# Handle ASCII characters | ||
if ch <= 0x7f | ||
buf[out += 1] = ch | ||
# Handle 0x80-0x7ff | ||
elseif ch < 0x800 | ||
buf[out += 1] = 0xc0 | (ch >>> 6) | ||
buf[out += 1] = 0x80 | (ch & 0x3f) | ||
# Handle 0x10000-0x10ffff (if input is UInt32) | ||
elseif ch > 0xffff # this is only for T == UInt32, should not be generated for UInt16 | ||
output_utf8_4byte!(buf, out, ch) | ||
out += 4 | ||
# Handle surrogate pairs | ||
elseif is_surrogate_codeunit(ch) | ||
output_utf8_4byte!(buf, out, get_supplementary(ch, dat[pos += 1])) | ||
out += 4 | ||
# Handle 0x800-0xd7ff, 0xe000-0xffff UCS-2 characters | ||
else | ||
buf[out += 1] = 0xe0 | ((ch >>> 12) & 0x3f) | ||
buf[out += 1] = 0x80 | ((ch >>> 6) & 0x3f) | ||
buf[out += 1] = 0x80 | (ch & 0x3f) | ||
end | ||
end | ||
String(buf) | ||
end | ||
convert(::Type{String}, v::Vector{UInt8}) = String(v) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. should this be implicit from the constructor? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Perhaps, but it doesn't hurt either. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. it had been causing a method overwrite warning during bootstrap There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Actually, this is necessary for some reason this seems to be necessary, probably because of the fact that String has an inner constructor. |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -118,12 +118,11 @@ reverse(s::RevString) = s.string | |
|
||
## reverse an index i so that reverse(s)[i] == s[reverseind(s,i)] | ||
|
||
reverseind(s::AbstractString, i) = chr2ind(s, length(s) + 1 - ind2chr(reverse(s), i)) | ||
reverseind(s::Union{DirectIndexString,SubString{DirectIndexString}}, i::Integer) = length(s) + 1 - i | ||
reverseind(s::RevString, i::Integer) = endof(s) - i + 1 | ||
lastidx(s::AbstractString) = nextind(s, endof(s)) - 1 | ||
lastidx(s::DirectIndexString) = length(s) | ||
reverseind(s::SubString, i::Integer) = | ||
reverseind(s.string, lastidx(s.string)-s.offset-s.endof+i) - s.offset | ||
reverseind(s::SubString{String}, i::Integer) = | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
nevermind, missed the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. While technically this works, I think you're right that removing |
||
reverseind(s.string, nextind(s.string, endof(s.string))-s.offset-s.endof+i-1) - s.offset | ||
|
||
## efficient representation of repeated strings ## | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This method is deleted for
String
which breaks IJulia. Is it intentional? (If it is meant to be removed, the doc should be too.)There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If IJulia needs UTF-8 validation, it may need to use LegacyStrings. Otherwise we could have an
isvalid(String)
method that returns true if theString
is valid UTF-8 and false otherwise. The tricky thing is that there are a few different versions of what could be considered valid:And of course, these are not mutually exclusive – each string can exhibit any subset of these issues.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@StefanKarpinski,
Base
still includes UTF-8 validation:isvalid(String, "foo")
works, and calls the C functionu8_isvalid
.You just removed
isvalid(::String)
. This seems a bit arbitrary and probably a mistake? Maybe we should have a fallback methodisvalid{T}(x::T) = isvalid(T, x)
?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, let's just put that method back then.