From 3ad378439260b64c7ff15ce2c44fa2dc052b9460 Mon Sep 17 00:00:00 2001
From: Stefan Karpinski <stefan@karpinski.org>
Date: Wed, 25 May 2016 17:24:42 -0400
Subject: [PATCH 1/7] remove UTF-16 and UTF-32 string types and functions

---
 base/deprecated.jl          |  10 --
 base/docs/helpdb/Base.jl    |  52 -------
 base/exports.jl             |   4 -
 base/replutil.jl            |  12 +-
 base/serialize.jl           |   5 +-
 base/strings/io.jl          |  19 +++
 base/strings/string.jl      | 103 +-------------
 base/sysimg.jl              |   1 -
 base/test.jl                |   1 +
 base/unicode/checkstring.jl | 238 -------------------------------
 base/unicode/types.jl       |  34 -----
 base/unicode/unicode.jl     |   6 -
 base/unicode/utf16.jl       | 275 ------------------------------------
 base/unicode/utf32.jl       | 195 -------------------------
 test/serialize.jl           |   4 +-
 test/strings/basic.jl       |  68 +--------
 test/strings/io.jl          |  70 ---------
 test/strings/types.jl       |  44 +++---
 test/strings/util.jl        |   2 +-
 test/unicode.jl             |   4 -
 test/unicode/utf16.jl       |  23 ---
 test/unicode/utf32.jl       | 258 ---------------------------------
 test/unicode/utf8.jl        |   6 +-
 test/unicode/utf8proc.jl    |   2 +-
 24 files changed, 69 insertions(+), 1367 deletions(-)
 delete mode 100644 base/unicode/checkstring.jl
 delete mode 100644 base/unicode/types.jl
 delete mode 100644 base/unicode/unicode.jl
 delete mode 100644 base/unicode/utf16.jl
 delete mode 100644 base/unicode/utf32.jl
 delete mode 100644 test/unicode/utf16.jl
 delete mode 100644 test/unicode/utf32.jl

diff --git a/base/deprecated.jl b/base/deprecated.jl
index b22c1a3cc6523..dc862dc46eca6 100644
--- a/base/deprecated.jl
+++ b/base/deprecated.jl
@@ -488,16 +488,6 @@ end
     end
 )
 
-if sizeof(Cwchar_t) == 2
-    @deprecate_binding WString UTF16String
-    @deprecate_binding wstring utf16
-    utf16(s::Cwstring) = utf16(convert(Ptr{Cwchar_t}, s))
-elseif sizeof(Cwchar_t) == 4
-    @deprecate_binding WString UTF32String
-    @deprecate_binding wstring utf32
-    utf32(s::Cwstring) = utf32(convert(Ptr{Cwchar_t}, s))
-end
-
 @deprecate ==(x::Char, y::Integer) UInt32(x) == y
 @deprecate ==(x::Integer, y::Char) x == UInt32(y)
 @deprecate isless(x::Char, y::Integer) UInt32(x) < y
diff --git a/base/docs/helpdb/Base.jl b/base/docs/helpdb/Base.jl
index 70ffd3ca1c168..f8af1543b5d68 100644
--- a/base/docs/helpdb/Base.jl
+++ b/base/docs/helpdb/Base.jl
@@ -95,32 +95,6 @@ Get the step size of a [`Range`](:obj:`Range`) object.
 """
 step
 
-"""
-    utf32(s)
-
-Create a UTF-32 string from a byte array, array of `Char` or `UInt32`, or any other string
-type. (Conversions of byte arrays check for a byte-order marker in the first four bytes, and
-do not include it in the resulting string.)
-
-Note that the resulting `UTF32String` data is terminated by the NUL codepoint (32-bit zero),
-which is not treated as a character in the string (so that it is mostly invisible in Julia);
-this allows the string to be passed directly to external functions requiring NUL-terminated
-data. This NUL is appended automatically by the `utf32(s)` conversion function. If you have
-a `Char` or `UInt32` array `A` that is already NUL-terminated UTF-32 data, then you can
-instead use `UTF32String(A)` to construct the string without making a copy of the data and
-treating the NUL as a terminator rather than as part of the string.
-"""
-utf32(s)
-
-"""
-    utf32(::Union{Ptr{Char},Ptr{UInt32},Ptr{Int32}} [, length])
-
-Create a string from the address of a NUL-terminated UTF-32 string. A copy is made; the
-pointer can be safely freed. If `length` is specified, the string does not have to be
-NUL-terminated.
-"""
-utf32(::Union{Ptr{Char},Ptr{UInt32},Ptr{Int32}}, length=?)
-
 """
     takebuf_array(b::IOBuffer)
 
@@ -3620,32 +3594,6 @@ Compute ``\\sin(\\pi x) / (\\pi x)`` if ``x \\neq 0``, and ``1`` if ``x = 0``.
 """
 sinc
 
-"""
-    utf16(s)
-
-Create a UTF-16 string from a byte array, array of `UInt16`, or any other string type. (Data
-must be valid UTF-16. Conversions of byte arrays check for a byte-order marker in the first
-two bytes, and do not include it in the resulting string.)
-
-Note that the resulting `UTF16String` data is terminated by the NUL codepoint (16-bit zero),
-which is not treated as a character in the string (so that it is mostly invisible in Julia);
-this allows the string to be passed directly to external functions requiring NUL-terminated
-data. This NUL is appended automatically by the `utf16(s)` conversion function. If you have
-a `UInt16` array `A` that is already NUL-terminated valid UTF-16 data, then you can instead
-use `UTF16String(A)` to construct the string without making a copy of the data and treating
-the NUL as a terminator rather than as part of the string.
-"""
-utf16(s)
-
-"""
-    utf16(::Union{Ptr{UInt16},Ptr{Int16}} [, length])
-
-Create a string from the address of a NUL-terminated UTF-16 string. A copy is made; the
-pointer can be safely freed. If `length` is specified, the string does not have to be
-NUL-terminated.
-"""
-utf16(::Union{Ptr{UInt16},Ptr{Int16}}, length=?)
-
 """
     median(v[, region])
 
diff --git a/base/exports.jl b/base/exports.jl
index 000d2b4a96d2f..5af15b345eb31 100644
--- a/base/exports.jl
+++ b/base/exports.jl
@@ -119,8 +119,6 @@ export
     Tridiagonal,
     UnitRange,
     UpperTriangular,
-    UTF16String,
-    UTF32String,
     Val,
     VecOrMat,
     Vector,
@@ -878,8 +876,6 @@ export
     ucfirst,
     unescape_string,
     uppercase,
-    utf16,
-    utf32,
     warn,
 
 # random numbers
diff --git a/base/replutil.jl b/base/replutil.jl
index 229aa877f657c..edbc2863e53a9 100644
--- a/base/replutil.jl
+++ b/base/replutil.jl
@@ -233,7 +233,6 @@ end
 showerror(io::IO, ::DivideError) = print(io, "DivideError: integer division error")
 showerror(io::IO, ::StackOverflowError) = print(io, "StackOverflowError:")
 showerror(io::IO, ::UndefRefError) = print(io, "UndefRefError: access to undefined reference")
-showerror(io::IO, ex::UndefVarError) = print(io, "UndefVarError: $(ex.var) not defined")
 showerror(io::IO, ::EOFError) = print(io, "EOFError: read end of file")
 showerror(io::IO, ex::ErrorException) = print(io, ex.msg)
 showerror(io::IO, ex::KeyError) = print(io, "KeyError: key $(repr(ex.key)) not found")
@@ -241,6 +240,17 @@ showerror(io::IO, ex::InterruptException) = print(io, "InterruptException:")
 showerror(io::IO, ex::ArgumentError) = print(io, "ArgumentError: $(ex.msg)")
 showerror(io::IO, ex::AssertionError) = print(io, "AssertionError: $(ex.msg)")
 
+function showerror(io::IO, ex::UndefVarError)
+    if ex.var in [:UTF16String, :UTF32String, :WString, :utf16, :utf32, :wstring]
+        return showerror(io, ErrorException("""
+        `$(ex.var)` has been moved to the package LegacyStrings.jl:
+        Run Pkg.add("LegacyStrings") to install LegacyStrings on Julia v0.5-;
+        Then do `using LegacyStrings` to get `$(ex.var)`.
+        """))
+    end
+    print(io, "UndefVarError: $(ex.var) not defined")
+end
+
 function showerror(io::IO, ex::MethodError)
     # ex.args is a tuple type if it was thrown from `invoke` and is
     # a tuple of the arguments otherwise.
diff --git a/base/serialize.jl b/base/serialize.jl
index adb0048310034..debf09797391f 100644
--- a/base/serialize.jl
+++ b/base/serialize.jl
@@ -21,8 +21,7 @@ const TAGS = Any[
     Symbol, Tuple, Expr,  # dummy entries, intentionally shadowed by earlier ones
     LineNumberNode, Slot, LabelNode, GotoNode,
     QuoteNode, :reserved23 #=was TopNode=#, TypeVar, Core.Box, LambdaInfo,
-    Module, #=UndefRefTag=#Symbol, Task, String,
-    UTF16String, UTF32String, Float16,
+    Module, #=UndefRefTag=#Symbol, Task, String, Float16,
     SimpleVector, #=BackrefTag=#Symbol, Method, GlobalRef,
 
     (), Bool, Any, :Any, Bottom, :reserved21, :reserved22, Type,
@@ -42,7 +41,7 @@ const TAGS = Any[
     28, 29, 30, 31, 32
 ]
 
-const ser_version = 3 # do not make changes without bumping the version #!
+const ser_version = 4 # do not make changes without bumping the version #!
 
 const NTAGS = length(TAGS)
 
diff --git a/base/strings/io.jl b/base/strings/io.jl
index 199fe0eefc785..151909d6405e4 100644
--- a/base/strings/io.jl
+++ b/base/strings/io.jl
@@ -324,3 +324,22 @@ function unindent(str::AbstractString, indent::Int; tabwidth=8)
     end
     takebuf_string(buf)
 end
+
+function convert(::Type{String}, chars::AbstractVector{Char})
+    sprint(length(chars), io->begin
+        state = start(chars)
+        while !done(chars, state)
+            c, state = next(chars, state)
+            if '\ud7ff' < c && c + 1024 < '\ue000'
+                d, state = next(chars, state)
+                if '\ud7ff' < d - 1024 && d < '\ue000'
+                    c = Char(0x10000 + ((UInt32(c) & 0x03ff) << 10) | (UInt32(d) & 0x03ff))
+                else
+                    write(io, c)
+                    c = d
+                end
+            end
+            write(io, c)
+        end
+    end)
+end
diff --git a/base/strings/string.jl b/base/strings/string.jl
index 47eb243b21647..c9637b746ef91 100644
--- a/base/strings/string.jl
+++ b/base/strings/string.jl
@@ -33,6 +33,8 @@ const utf8_trailing = [
 
 ## required core functionality ##
 
+is_valid_continuation(c) = ((c & 0xc0) == 0x80)
+
 function endof(s::String)
     d = s.data
     i = length(d)
@@ -239,109 +241,10 @@ function reverse(s::String)
     String(buf)
 end
 
-## outputting UTF-8 strings ##
-
 write(io::IO, s::String) = write(io, s.data)
 
 pointer(x::String) = pointer(x.data)
 pointer(x::String, i::Integer) = pointer(x.data)+(i-1)
 
-## transcoding to UTF-8 ##
-
 convert(::Type{String}, s::String) = s
-
-function convert(::Type{String}, dat::Vector{UInt8})
-    # handle zero length string quickly
-    isempty(dat) && return empty_utf8
-    # get number of bytes to allocate
-    len, flags, num4byte, num3byte, num2byte = unsafe_checkstring(dat)
-    if (flags & (UTF_LONG | UTF_SURROGATE)) == 0
-        len = sizeof(dat)
-        @inbounds return String(copy!(Vector{UInt8}(len), 1, dat, 1, len))
-    end
-    # Copy, but eliminate over-long encodings and surrogate pairs
-    len += num2byte + num3byte*2 + num4byte*3
-    buf = Vector{UInt8}(len)
-    out = 0
-    pos = 0
-    @inbounds while out < len
-        ch::UInt32 = dat[pos += 1]
-        # Handle ASCII characters
-        if ch <= 0x7f
-            buf[out += 1] = ch
-        # Handle overlong < 0x100
-        elseif ch < 0xc2
-            buf[out += 1] = ((ch & 3) << 6) | (dat[pos += 1] & 0x3f)
-        # Handle 0x100-0x7ff
-        elseif ch < 0xe0
-            buf[out += 1] = ch
-            buf[out += 1] = dat[pos += 1]
-        elseif ch != 0xed
-            buf[out += 1] = ch
-            buf[out += 1] = dat[pos += 1]
-            buf[out += 1] = dat[pos += 1]
-            # Copy 4-byte encoded value
-            ch >= 0xf0 && (buf[out += 1] = dat[pos += 1])
-        # Handle surrogate pairs
-        else
-            ch = dat[pos += 1]
-            if ch < 0xa0 # not surrogate pairs
-                buf[out += 1] = 0xed
-                buf[out += 1] = ch
-                buf[out += 1] = dat[pos += 1]
-            else
-                # Pick up surrogate pairs (CESU-8 format)
-                ch = ((((((ch & 0x3f) << 6) | (dat[pos + 1] & 0x3f)) << 10)
-                       + (((dat[pos + 3] & 0x3f)%UInt32 << 6) | (dat[pos + 4] & 0x3f)))
-                      - 0x01f0c00)
-                pos += 4
-                output_utf8_4byte!(buf, out, ch)
-                out += 4
-            end
-        end
-    end
-    String(buf)
-end
-
-"""
-Converts an already validated vector of `UInt16` or `UInt32` to a `String`
-
-Input Arguments:
-
-* `dat` Vector of code units (`UInt16` or `UInt32`), explicit `\0` is not converted
-* `len` length of output in bytes
-
-Returns:
-
-* `String`
-"""
-function encode_to_utf8{T<:Union{UInt16, UInt32}}(::Type{T}, dat, len)
-    buf = Vector{UInt8}(len)
-    out = 0
-    pos = 0
-    @inbounds while out < len
-        ch::UInt32 = dat[pos += 1]
-        # Handle ASCII characters
-        if ch <= 0x7f
-            buf[out += 1] = ch
-        # Handle 0x80-0x7ff
-        elseif ch < 0x800
-            buf[out += 1] = 0xc0 | (ch >>> 6)
-            buf[out += 1] = 0x80 | (ch & 0x3f)
-        # Handle 0x10000-0x10ffff (if input is UInt32)
-        elseif ch > 0xffff # this is only for T == UInt32, should not be generated for UInt16
-            output_utf8_4byte!(buf, out, ch)
-            out += 4
-        # Handle surrogate pairs
-        elseif is_surrogate_codeunit(ch)
-            output_utf8_4byte!(buf, out, get_supplementary(ch, dat[pos += 1]))
-            out += 4
-        # Handle 0x800-0xd7ff, 0xe000-0xffff UCS-2 characters
-        else
-            buf[out += 1] = 0xe0 | ((ch >>> 12) & 0x3f)
-            buf[out += 1] = 0x80 | ((ch >>> 6) & 0x3f)
-            buf[out += 1] = 0x80 | (ch & 0x3f)
-        end
-    end
-    String(buf)
-end
+convert(::Type{String}, v::Vector{UInt8}) = String(v)
diff --git a/base/sysimg.jl b/base/sysimg.jl
index 022c4a9dac16c..85dfeaf5ef75e 100644
--- a/base/sysimg.jl
+++ b/base/sysimg.jl
@@ -145,7 +145,6 @@ include("iobuffer.jl")
 include("char.jl")
 include("intfuncs.jl")
 include("strings/strings.jl")
-include("unicode/unicode.jl")
 include("parse.jl")
 include("shell.jl")
 include("regex.jl")
diff --git a/base/test.jl b/base/test.jl
index b0382f3284f5c..f9790636f6906 100644
--- a/base/test.jl
+++ b/base/test.jl
@@ -1009,5 +1009,6 @@ end
 Base.convert(::Type{GenericString}, s::AbstractString) = GenericString(s)
 Base.endof(s::GenericString) = endof(s.string)
 Base.next(s::GenericString, i::Int) = next(s.string, i)
+Base.reverseind(s::GenericString, i::Integer) = reverseind(s.string, i)
 
 end # module
diff --git a/base/unicode/checkstring.jl b/base/unicode/checkstring.jl
deleted file mode 100644
index 72e9eb9d31062..0000000000000
--- a/base/unicode/checkstring.jl
+++ /dev/null
@@ -1,238 +0,0 @@
-# This file is a part of Julia. License is MIT: http://julialang.org/license
-
-## Functions to check validity of UTF-8, UTF-16, and UTF-32 encoded strings,
-#  and also to return information necessary to convert to other encodings
-
-## Return flags for check_string function
-
-const UTF_LONG = 1              ##< Long encodings are present
-const UTF_LATIN1 = 2            ##< characters in range 0x80-0xFF present
-const UTF_UNICODE2 = 4          ##< characters in range 0x100-0x7ff present
-const UTF_UNICODE3 = 8          ##< characters in range 0x800-0xd7ff, 0xe000-0xffff
-const UTF_UNICODE4 = 16         ##< non-BMP characters present
-const UTF_SURROGATE = 32        ##< surrogate pairs present
-
-## Get a UTF-8 continuation byte, give error if invalid, return updated character value
-@inline function get_continuation(ch::UInt32, byt::UInt8, pos)
-    if !is_valid_continuation(byt)
-        throw(UnicodeError(UTF_ERR_CONT, pos, byt))
-    end
-    (ch << 6) | (byt & 0x3f)
-end
-
-"""
-Validates and calculates number of characters in a UTF-8,UTF-16 or UTF-32 encoded vector/string
-
-Warning: this function does not check the bounds of the start or end positions
-Use `checkstring` to make sure the bounds are checked
-
-Input Arguments:
-
-* `dat`    UTF-8 (`Vector{UInt8}`), UTF-16 (`Vector{UInt16}`) or UTF-32 (`Vector{UInt32}`, `AbstractString`) encoded string
-
-Optional Input Arguments:
-
-* `pos`    start position (defaults to 1)
-* `endpos` end position   (defaults to `endof(dat)`)
-
-Keyword Arguments:
-
-* `accept_long_null`  = `true`  # Modified UTF-8 (`\\0` represented as `b\"\\xc0\\x80\"`)
-* `accept_surrogates` = `true`  # `CESU-8`
-* `accept_long_char`  = `false` # Accept arbitrary long encodings
-
-Returns:
-
-* (total characters, flags, 4-byte, 3-byte, 2-byte)
-
-Throws:
-
-* `UnicodeError`
-"""
-function unsafe_checkstring end
-
-function unsafe_checkstring(dat::AbstractVector{UInt8},
-                      pos = 1,
-                      endpos = endof(dat)
-                      ;
-                      accept_long_null  = true,
-                      accept_surrogates = true,
-                      accept_long_char  = false)
-    local byt::UInt8, ch::UInt32, surr::UInt32
-    flags::UInt = 0
-    totalchar = num2byte = num3byte = num4byte = 0
-    @inbounds while pos <= endpos
-        ch, pos = next(dat, pos)
-        totalchar += 1
-        if ch > 0x7f
-            # Check UTF-8 encoding
-            if ch < 0xe0
-                # 2-byte UTF-8 sequence (i.e. characters 0x80-0x7ff)
-                (pos > endpos) && throw(UnicodeError(UTF_ERR_SHORT, pos, ch))
-                byt, pos = next(dat, pos)
-                ch = get_continuation(ch & 0x3f, byt, pos)
-                if ch > 0x7f
-                    num2byte += 1
-                    flags |= (ch > 0xff) ? UTF_UNICODE2 : UTF_LATIN1
-                elseif accept_long_char
-                    flags |= UTF_LONG
-                elseif (ch == 0) && accept_long_null
-                    flags |= UTF_LONG
-                else
-                    throw(UnicodeError(UTF_ERR_LONG, pos, ch))
-                end
-             elseif ch < 0xf0
-                # 3-byte UTF-8 sequence (i.e. characters 0x800-0xffff)
-                (pos + 1 > endpos) && throw(UnicodeError(UTF_ERR_SHORT, pos, ch))
-                byt, pos = next(dat, pos)
-                ch = get_continuation(ch & 0x0f, byt, pos)
-                byt, pos = next(dat, pos)
-                ch = get_continuation(ch, byt, pos)
-                # check for surrogate pairs, make sure correct
-                if is_surrogate_codeunit(ch)
-                    !is_surrogate_lead(ch) && throw(UnicodeError(UTF_ERR_NOT_LEAD, pos-2, ch))
-                    # next character *must* be a trailing surrogate character
-                    (pos + 2 > endpos) && throw(UnicodeError(UTF_ERR_MISSING_SURROGATE, pos-2, ch))
-                    byt, pos = next(dat, pos)
-                    (byt != 0xed) && throw(UnicodeError(UTF_ERR_NOT_TRAIL, pos, byt))
-                    byt, pos = next(dat, pos)
-                    surr = get_continuation(0x0000d, byt, pos)
-                    byt, pos = next(dat, pos)
-                    surr = get_continuation(surr, byt, pos)
-                    !is_surrogate_trail(surr) && throw(UnicodeError(UTF_ERR_NOT_TRAIL, pos-2, surr))
-                    !accept_surrogates && throw(UnicodeError(UTF_ERR_SURROGATE, pos-2, surr))
-                    flags |= UTF_SURROGATE
-                    num4byte += 1
-                elseif ch > 0x07ff
-                    num3byte += 1
-                elseif accept_long_char
-                    flags |= UTF_LONG
-                    num2byte += 1
-                else
-                    throw(UnicodeError(UTF_ERR_LONG, pos-2, ch))
-                end
-            elseif ch < 0xf5
-                # 4-byte UTF-8 sequence (i.e. characters > 0xffff)
-                (pos + 2 > endpos) && throw(UnicodeError(UTF_ERR_SHORT, pos, ch))
-                byt, pos = next(dat, pos)
-                ch = get_continuation(ch & 0x07, byt, pos)
-                byt, pos = next(dat, pos)
-                ch = get_continuation(ch, byt, pos)
-                byt, pos = next(dat, pos)
-                ch = get_continuation(ch, byt, pos)
-                if ch > 0x10ffff
-                    throw(UnicodeError(UTF_ERR_INVALID, pos-3, ch))
-                elseif ch > 0xffff
-                    num4byte += 1
-                elseif is_surrogate_codeunit(ch)
-                    throw(UnicodeError(UTF_ERR_SURROGATE, pos-3, ch))
-                elseif accept_long_char
-                    # This is an overly long encoded character
-                    flags |= UTF_LONG
-                    if ch > 0x7ff
-                        num3byte += 1
-                    elseif ch > 0x7f
-                        num2byte += 1
-                    end
-                else
-                    throw(UnicodeError(UTF_ERR_LONG, pos-2, ch))
-                end
-            else
-                throw(UnicodeError(UTF_ERR_INVALID, pos, ch))
-            end
-        end
-    end
-    num3byte != 0 && (flags |= UTF_UNICODE3)
-    num4byte != 0 && (flags |= UTF_UNICODE4)
-    return totalchar, flags, num4byte, num3byte, num2byte
-end
-
-typealias AbstractString1632{Tel<:Union{UInt16,UInt32}} Union{AbstractVector{Tel}, AbstractString}
-
-function unsafe_checkstring(
-                      dat::AbstractString1632,
-                      pos = 1,
-                      endpos = endof(dat)
-                      ;
-                      accept_long_null  = true,
-                      accept_surrogates = true,
-                      accept_long_char  = false)
-    local ch::UInt32
-    flags::UInt = 0
-    totalchar = num2byte = num3byte = num4byte = 0
-    @inbounds while pos <= endpos
-        ch, pos = next(dat, pos)
-        totalchar += 1
-        if ch > 0x7f
-            if ch < 0x100
-                num2byte += 1
-                flags |= UTF_LATIN1
-            elseif ch < 0x800
-                num2byte += 1
-                flags |= UTF_UNICODE2
-            elseif ch > 0x0ffff
-                (ch > 0x10ffff) && throw(UnicodeError(UTF_ERR_INVALID, pos, ch))
-                num4byte += 1
-            elseif !is_surrogate_codeunit(ch)
-                num3byte += 1
-            elseif is_surrogate_lead(ch)
-                pos > endpos && throw(UnicodeError(UTF_ERR_MISSING_SURROGATE, pos, ch))
-                # next character *must* be a trailing surrogate character
-                ch, pos = next(dat, pos)
-                !is_surrogate_trail(ch) && throw(UnicodeError(UTF_ERR_NOT_TRAIL, pos, ch))
-                num4byte += 1
-                if !(typeof(dat) <: AbstractVector{UInt16})
-                    !accept_surrogates && throw(UnicodeError(UTF_ERR_SURROGATE, pos, ch))
-                    flags |= UTF_SURROGATE
-                end
-            else
-                throw(UnicodeError(UTF_ERR_NOT_LEAD, pos, ch))
-            end
-        end
-    end
-    num3byte != 0 && (flags |= UTF_UNICODE3)
-    num4byte != 0 && (flags |= UTF_UNICODE4)
-    return totalchar, flags, num4byte, num3byte, num2byte
-end
-
-"""
-Validates and calculates number of characters in a UTF-8,UTF-16 or UTF-32 encoded vector/string
-
-This function checks the bounds of the start and end positions
-Use `unsafe_checkstring` to avoid that overhead if the bounds have already been checked
-
-Input Arguments:
-
-* `dat`    UTF-8 (`Vector{UInt8}`), UTF-16 (`Vector{UInt16}`) or UTF-32 (`Vector{UInt32}`, `AbstractString`) encoded string
-
-Optional Input Arguments:
-
-* `startpos` start position (defaults to 1)
-* `endpos`   end position   (defaults to `endof(dat)`)
-
-Keyword Arguments:
-
-* `accept_long_null`  = `true`  # Modified UTF-8 (`\\0` represented as `b\"\\xc0\\x80\"`)
-* `accept_surrogates` = `true`  # `CESU-8`
-* `accept_long_char`  = `false` # Accept arbitrary long encodings
-
-Returns:
-
-* (total characters, flags, 4-byte, 3-byte, 2-byte)
-
-Throws:
-
-* `UnicodeError`
-"""
-function checkstring end
-
-# No need to check bounds if using defaults
-checkstring(dat; kwargs...) = unsafe_checkstring(dat, 1, endof(dat); kwargs...)
-
-# Make sure that beginning and end positions are bounds checked
-function checkstring(dat, startpos, endpos = endof(dat); kwargs...)
-    checkbounds(dat,startpos)
-    checkbounds(dat,endpos)
-    endpos < startpos && throw(ArgumentError("End position ($endpos) is less than start position ($startpos)"))
-    unsafe_checkstring(dat, startpos, endpos; kwargs...)
-end
diff --git a/base/unicode/types.jl b/base/unicode/types.jl
deleted file mode 100644
index 61f69d6a6638d..0000000000000
--- a/base/unicode/types.jl
+++ /dev/null
@@ -1,34 +0,0 @@
-# This file is a part of Julia. License is MIT: http://julialang.org/license
-
-##\brief      Base UTF16String type, has 16-bit NULL termination word after data, native byte order
-#
-# \throws     UnicodeError
-
-immutable UTF16String <: AbstractString
-    data::Vector{UInt16} # includes 16-bit NULL termination after string chars
-    function UTF16String(data::Vector{UInt16})
-        if length(data) < 1 || data[end] != 0
-            throw(UnicodeError(UTF_ERR_NULL_16_TERMINATE, 0, 0))
-        end
-        new(data)
-    end
-end
-
-##\brief      Base UTF32String type, has 32-bit NULL termination word after data, native byte order
-#
-# \throws     UnicodeError
-
-immutable UTF32String <: DirectIndexString
-    data::Vector{UInt32} # includes 32-bit NULL termination after string chars
-
-    function UTF32String(data::Vector{UInt32})
-        if length(data) < 1 || data[end] != 0
-            throw(UnicodeError(UTF_ERR_NULL_32_TERMINATE, 0, 0))
-        end
-        new(data)
-    end
-end
-UTF32String(data::Vector{Char}) = UTF32String(reinterpret(UInt32, data))
-
-isvalid{T<:Union{String,UTF16String,UTF32String}}(str::T) = isvalid(T, str.data)
-isvalid{T<:Union{String,UTF16String,UTF32String}}(::Type{T}, str::T) = isvalid(T, str.data)
diff --git a/base/unicode/unicode.jl b/base/unicode/unicode.jl
deleted file mode 100644
index a8bd91e1a5a81..0000000000000
--- a/base/unicode/unicode.jl
+++ /dev/null
@@ -1,6 +0,0 @@
-# This file is a part of Julia. License is MIT: http://julialang.org/license
-
-include("unicode/types.jl")
-include("unicode/checkstring.jl")
-include("unicode/utf16.jl")
-include("unicode/utf32.jl")
diff --git a/base/unicode/utf16.jl b/base/unicode/utf16.jl
deleted file mode 100644
index 21cb4059056e5..0000000000000
--- a/base/unicode/utf16.jl
+++ /dev/null
@@ -1,275 +0,0 @@
-# This file is a part of Julia. License is MIT: http://julialang.org/license
-
-# Quickly copy and set trailing \0
-@inline function fast_utf_copy{S <: Union{UTF16String, UTF32String}, T <: Union{UInt16, UInt32}}(
-                              ::Type{S}, ::Type{T}, len, dat, flag::Bool=false)
-    S(setindex!(copy!(Vector{T}(len+1), 1, dat, 1, flag ? len : len+1), 0, len+1))
-end
-
-# Get rest of character ch from 3-byte UTF-8 sequence in dat
-@inline function get_utf8_3byte(dat, pos, ch)
-    @inbounds return ((ch & 0xf) << 12) | (UInt32(dat[pos-1] & 0x3f) << 6) | (dat[pos] & 0x3f)
-end
-# Get rest of character ch from 4-byte UTF-8 sequence in dat
-@inline function get_utf8_4byte(dat, pos, ch)
-    @inbounds return (((ch & 0x7) << 18)
-                        | (UInt32(dat[pos-2] & 0x3f) << 12)
-                        | (UInt32(dat[pos-1] & 0x3f) << 6)
-                        | (dat[pos] & 0x3f))
-end
-
-# Output a character as a 4-byte UTF-8 sequence
-@inline function output_utf8_4byte!(buf, out, ch)
-    @inbounds begin
-        buf[out + 1] = 0xf0 | (ch >>> 18)
-        buf[out + 2] = 0x80 | ((ch >>> 12) & 0x3f)
-        buf[out + 3] = 0x80 | ((ch >>> 6) & 0x3f)
-        buf[out + 4] = 0x80 | (ch & 0x3f)
-    end
-end
-
-const empty_utf16 = UTF16String(UInt16[0])
-
-function length(s::UTF16String)
-    d = s.data
-    len = length(d) - 1
-    len == 0 && return 0
-    cnum = 0
-    for i = 1:len
-        @inbounds cnum += !is_surrogate_trail(d[i])
-    end
-    cnum
-end
-
-function endof(s::UTF16String)
-    d = s.data
-    i = length(d) - 1
-    i == 0 && return i
-    return is_surrogate_codeunit(d[i]) ? i-1 : i
-end
-
-get_supplementary(lead::Unsigned, trail::Unsigned) = (UInt32(lead-0xd7f7)<<10 + trail)
-
-function next(s::UTF16String, i::Int)
-    ch = s.data[i]
-    !is_surrogate_codeunit(ch) && return (Char(ch), i+1)
-    # check length, account for terminating \0
-    i >= (length(s.data)-1) && throw(UnicodeError(UTF_ERR_MISSING_SURROGATE, i, UInt32(ch)))
-    !is_surrogate_lead(ch) && throw(UnicodeError(UTF_ERR_NOT_LEAD, i, ch))
-    ct = s.data[i+1]
-    !is_surrogate_trail(ct) && throw((UTF_ERR_NOT_TRAIL, i, ch))
-    Char(get_supplementary(ch, ct)), i+2
-end
-
-function reverseind(s::UTF16String, i::Integer)
-    j = length(s.data) - i
-    return is_surrogate_trail(s.data[j]) ? j-1 : j
-end
-
-lastidx(s::UTF16String) = length(s.data) - 1 # s.data includes NULL terminator
-
-function reverse(s::UTF16String)
-    d = s.data
-    out = similar(d)
-    out[end] = 0 # NULL termination
-    n = length(d)
-    @inbounds for i = 1:n-1
-        ch = d[n-i]
-        if is_surrogate_lead(ch)
-            out[i],out[i-1] = out[i-1],ch
-        else
-            out[i] = ch
-        end
-    end
-    UTF16String(out)
-end
-
-sizeof(s::UTF16String) = sizeof(s.data) - sizeof(UInt16)
-
-function isvalid(::Type{UTF16String}, data::AbstractArray{UInt16})
-    i = 1
-    n = length(data) # this may include NULL termination; that's okay
-    @inbounds while i < n # check for unpaired surrogates
-        if is_surrogate_lead(data[i]) && is_surrogate_trail(data[i+1])
-            i += 2
-        elseif is_surrogate_codeunit(data[i])
-            return false
-        else
-            i += 1
-        end
-    end
-    return i > n || !is_surrogate_codeunit(data[i])
-end
-
-function convert(::Type{UTF16String}, str::AbstractString)
-    len, flags, num4byte = unsafe_checkstring(str)
-    buf = Vector{UInt16}(len+num4byte+1)
-    out = 0
-    @inbounds for ch in str
-        c = UInt32(ch)
-        if c < 0x10000
-            buf[out += 1] = UInt16(c)
-        else
-            # output surrogate pair
-            buf[out += 1] = UInt16(0xd7c0 + (c >>> 10))
-            buf[out += 1] = UInt16(0xdc00 + (c & 0x3ff))
-        end
-    end
-    @inbounds buf[out + 1] = 0 # NULL termination
-    UTF16String(buf)
-end
-
-function convert(::Type{UTF16String}, str::String)
-    dat = str.data
-    # handle zero length string quickly
-    sizeof(dat) == 0 && return empty_utf16
-    # Check that is correct UTF-8 encoding and get number of words needed
-    len, flags, num4byte = unsafe_checkstring(dat)
-    len += num4byte
-    buf = Vector{UInt16}(len+1)
-    @inbounds buf[len+1] = 0
-    # Optimize case where no characters > 0x7f
-    flags == 0 && @inbounds return UTF16String(copy!(buf, dat))
-    out = 0
-    pos = 0
-    @inbounds while out < len
-        ch::UInt32 = dat[pos += 1]
-        # Handle ASCII characters
-        if ch <= 0x7f
-            buf[out += 1] = ch
-        # Handle range 0x80-0x7ff
-        elseif ch < 0xe0
-            buf[out += 1] = ((ch & 0x1f) << 6) | (dat[pos += 1] & 0x3f)
-        # Handle range 0x800-0xffff
-        elseif ch < 0xf0
-            pos += 2
-            buf[out += 1] = get_utf8_3byte(dat, pos, ch)
-        # Handle range 0x10000-0x10ffff
-        else
-            pos += 3
-            ch = get_utf8_4byte(dat, pos, ch)
-            # output surrogate pair
-            buf[out += 1] = UInt16(0xd7c0 + (ch >>> 10))
-            buf[out += 1] = UInt16(0xdc00 + (ch & 0x3ff))
-        end
-    end
-    UTF16String(buf)
-end
-
-function convert(::Type{String}, str::UTF16String)
-    dat = str.data
-    len = sizeof(dat) >>> 1
-    # handle zero length string quickly
-    len <= 1 && return empty_utf8
-    # get number of bytes to allocate
-    len, flags, num4byte, num3byte, num2byte = unsafe_checkstring(dat, 1, len-1)
-    flags == 0 && @inbounds return String(copy!(Vector{UInt8}(len), 1, dat, 1, len))
-    return encode_to_utf8(UInt16, dat, len + num2byte + num3byte*2 + num4byte*3)
-end
-
-"""
-Converts an already validated UTF-32 encoded vector of `UInt32` to a `UTF16String`
-
-Input Arguments:
-
-*   `dat` `Vector{UInt32}` of UTF-32 encoded data
-*   `len` length of output in 16-bit words
-
-Returns:
-
-*   `UTF16String`
-"""
-function encode_to_utf16(dat, len)
-    buf = Vector{UInt16}(len)
-    @inbounds buf[len] = 0 # NULL termination
-    out = 0
-    pos = 0
-    @inbounds while out < len
-        ch = UInt32(dat[pos += 1])
-        if ch > 0xffff
-            # Output surrogate pair for 0x10000-0x10ffff
-            buf[out += 1] = 0xd7c0 + (ch >>> 10)
-            ch = 0xdc00 + (ch & 0x3ff)
-        end
-        buf[out += 1] = ch
-    end
-    UTF16String(buf)
-end
-
-convert(::Type{Vector{UInt16}}, str::UTF16String) = str.data
-convert(::Type{Array{UInt16}},  str::UTF16String) = str.data
-
-convert(::Type{UTF16String}, str::UTF16String)    = str
-
-unsafe_convert{T<:Union{Int16,UInt16}}(::Type{Ptr{T}}, s::UTF16String) =
-    convert(Ptr{T}, pointer(s))
-
-convert(T::Type{UTF16String}, data::AbstractArray{UInt16}) =
-    convert(T, reshape(data, length(data)))
-
-convert(T::Type{UTF16String}, data::AbstractArray{Int16}) =
-    convert(T, reinterpret(UInt16, data))
-
-function convert(::Type{UTF16String}, dat::AbstractVector{UInt16})
-    len, flags, num4byte = unsafe_checkstring(dat)
-    @inbounds return fast_utf_copy(UTF16String, UInt16, len+num4byte, dat, true)
-end
-
-function convert(T::Type{UTF16String}, bytes::AbstractArray{UInt8})
-    isempty(bytes) && return UTF16String(UInt16[0])
-    isodd(length(bytes)) && throw(UnicodeError(UTF_ERR_ODD_BYTES_16, length(bytes), 0))
-    data = reinterpret(UInt16, bytes)
-    # check for byte-order mark (BOM):
-    if data[1] == 0xfeff        # native byte order
-        d = Array{UInt16}(length(data))
-        copy!(d,1, data,2, length(data)-1)
-    elseif data[1] == 0xfffe    # byte-swapped
-        d = Array{UInt16}(length(data))
-        for i = 2:length(data)
-            d[i-1] = bswap(data[i])
-        end
-    else
-        d = Array{UInt16}(length(data) + 1)
-        copy!(d,1, data,1, length(data)) # assume native byte order
-    end
-    d[end] = 0 # NULL terminate
-    !isvalid(UTF16String, d) && throw(UnicodeError(UTF_ERR_INVALID_16,0,0))
-    UTF16String(d)
-end
-
-utf16(x) = convert(UTF16String, x)
-utf16(p::Ptr{UInt16}, len::Integer) = utf16(unsafe_wrap(Array, p, len))
-utf16(p::Ptr{Int16}, len::Integer) = utf16(convert(Ptr{UInt16}, p), len)
-function utf16(p::Union{Ptr{UInt16}, Ptr{Int16}})
-    len = 0
-    while unsafe_load(p, len+1) != 0; len += 1; end
-    utf16(p, len)
-end
-
-function map(fun, str::UTF16String)
-    buf = UInt16[]
-    sizehint!(buf, length(str.data))
-    for ch in str
-        c2 = fun(ch)
-        if !isa(c2, Char)
-            throw(UnicodeError(UTF_ERR_MAP_CHAR, 0, 0))
-        end
-        uc = UInt32(c2)
-        if uc < 0x10000
-            if is_surrogate_codeunit(UInt16(uc))
-                throw(UnicodeError(UTF_ERR_INVALID_CHAR, 0, uc))
-            end
-            push!(buf, UInt16(uc))
-        elseif uc <= 0x10ffff
-            push!(buf, UInt16(0xd7c0 + (uc >> 10)))
-            push!(buf, UInt16(0xdc00 + (uc & 0x3ff)))
-        else
-            throw(UnicodeError(UTF_ERR_INVALID_CHAR, 0, uc))
-        end
-    end
-    push!(buf, 0)
-    UTF16String(buf)
-end
-
-cconvert(::Type{Cwstring}, v::Vector{UInt16}) = transcode(Cwchar_t, v)
-cconvert(::Type{Cwstring}, s::UTF16String) = transcode(Cwchar_t, s.data)
diff --git a/base/unicode/utf32.jl b/base/unicode/utf32.jl
deleted file mode 100644
index 9250e9310909e..0000000000000
--- a/base/unicode/utf32.jl
+++ /dev/null
@@ -1,195 +0,0 @@
-# This file is a part of Julia. License is MIT: http://julialang.org/license
-
-# UTF-32 basic functions
-next(s::UTF32String, i::Int) = (Char(s.data[i]), i+1)
-endof(s::UTF32String) = length(s.data) - 1
-length(s::UTF32String) = length(s.data) - 1
-
-reverse(s::UTF32String) = UTF32String(reverse!(copy(s.data), 1, length(s)))
-
-sizeof(s::UTF32String) = sizeof(s.data) - sizeof(UInt32)
-
-const empty_utf32 = UTF32String(UInt32[0])
-
-convert(::Type{UTF32String}, c::Char) = UTF32String(UInt32[c, 0])
-convert(::Type{UTF32String}, s::UTF32String) = s
-
-function convert(::Type{UTF32String}, str::AbstractString)
-    len, flags = unsafe_checkstring(str)
-    buf = Vector{UInt32}(len+1)
-    out = 0
-    @inbounds for ch in str ; buf[out += 1] = ch ; end
-    @inbounds buf[out + 1] = 0 # NULL termination
-    UTF32String(buf)
-end
-
-function convert(::Type{String},  str::UTF32String)
-    dat = str.data
-    len = sizeof(dat) >>> 2
-    # handle zero length string quickly
-    len <= 1 && return empty_utf8
-    # get number of bytes to allocate
-    len, flags, num4byte, num3byte, num2byte = unsafe_checkstring(dat, 1, len-1)
-    flags == 0 && @inbounds return String(copy!(Vector{UInt8}(len), 1, dat, 1, len))
-    return encode_to_utf8(UInt32, dat, len + num2byte + num3byte*2 + num4byte*3)
-end
-
-function convert(::Type{UTF32String}, str::String)
-    dat = str.data
-    # handle zero length string quickly
-    sizeof(dat) == 0 && return empty_utf32
-    # Validate UTF-8 encoding, and get number of words to create
-    len, flags = unsafe_checkstring(dat)
-    # Optimize case where no characters > 0x7f
-    flags == 0 && @inbounds return fast_utf_copy(UTF32String, UInt32, len, dat, true)
-    # has multi-byte UTF-8 sequences
-    buf = Vector{UInt32}(len+1)
-    @inbounds buf[len+1] = 0 # NULL termination
-    local ch::UInt32, surr::UInt32
-    out = 0
-    pos = 0
-    @inbounds while out < len
-        ch = dat[pos += 1]
-        # Handle ASCII characters
-        if ch <= 0x7f
-            buf[out += 1] = ch
-        # Handle range 0x80-0x7ff
-        elseif ch < 0xe0
-            buf[out += 1] = ((ch & 0x1f) << 6) | (dat[pos += 1] & 0x3f)
-        # Handle range 0x800-0xffff
-        elseif ch < 0xf0
-            pos += 2
-            ch = get_utf8_3byte(dat, pos, ch)
-            # Handle surrogate pairs (should have been encoded in 4 bytes)
-            if is_surrogate_lead(ch)
-                # Build up 32-bit character from ch and trailing surrogate in next 3 bytes
-                pos += 3
-                surr = ((UInt32(dat[pos-2] & 0xf) << 12)
-                        | (UInt32(dat[pos-1] & 0x3f) << 6)
-                        | (dat[pos] & 0x3f))
-                ch = get_supplementary(ch, surr)
-            end
-            buf[out += 1] = ch
-        # Handle range 0x10000-0x10ffff
-        else
-            pos += 3
-            buf[out += 1] = get_utf8_4byte(dat, pos, ch)
-        end
-    end
-    UTF32String(buf)
-end
-
-function convert(::Type{UTF32String}, str::UTF16String)
-    dat = str.data
-    len = sizeof(dat)
-    # handle zero length string quickly (account for trailing \0)
-    len <= 2 && return empty_utf32
-    # get number of words to create
-    len, flags, num4byte = unsafe_checkstring(dat, 1, len>>>1)
-    # No surrogate pairs, do optimized copy
-    (flags & UTF_UNICODE4) == 0 && @inbounds return UTF32String(copy!(Vector{Char}(len), dat))
-    local ch::UInt32
-    buf = Vector{UInt32}(len)
-    out = 0
-    pos = 0
-    @inbounds while out < len
-        ch = dat[pos += 1]
-        # check for surrogate pair
-        if is_surrogate_lead(ch) ; ch = get_supplementary(ch, dat[pos += 1]) ; end
-        buf[out += 1] = ch
-    end
-    UTF32String(buf)
-end
-
-function convert(::Type{UTF16String}, str::UTF32String)
-    dat = str.data
-    len = sizeof(dat)
-    # handle zero length string quickly
-    len <= 4 && return empty_utf16
-    # get number of words to allocate
-    len, flags, num4byte = unsafe_checkstring(dat, 1, len>>>2)
-    # optimized path, no surrogates
-    num4byte == 0 && @inbounds return UTF16String(copy!(Vector{UInt16}(len), dat))
-    return encode_to_utf16(dat, len + num4byte)
-end
-
-function convert(::Type{UTF32String}, dat::AbstractVector{UInt32})
-    @inbounds return fast_utf_copy(UTF32String, UInt32, length(dat), dat, true)
-end
-
-convert(::Type{UTF32String}, data::AbstractVector{Int32}) =
-    convert(UTF32String, reinterpret(UInt32, convert(Vector{T}, data)))
-
-convert(::Type{UTF32String}, data::AbstractVector{Char}) =
-    convert(UTF32String, map(UInt32, data))
-
-convert{T<:AbstractString, S<:Union{UInt32,Char,Int32}}(::Type{T}, v::AbstractVector{S}) =
-    convert(T, utf32(v))
-
-convert(::Type{Vector{UInt32}}, str::UTF32String) = str.data
-convert(::Type{Array{UInt32}},  str::UTF32String) = str.data
-
-unsafe_convert{T<:Union{UInt32,Int32,Char}}(::Type{Ptr{T}}, s::UTF32String) =
-    convert(Ptr{T}, pointer(s))
-
-function convert(T::Type{UTF32String}, bytes::AbstractArray{UInt8})
-    isempty(bytes) && return empty_utf32
-    length(bytes) & 3 != 0 && throw(UnicodeError(UTF_ERR_ODD_BYTES_32,0,0))
-    data = reinterpret(UInt32, bytes)
-    # check for byte-order mark (BOM):
-    if data[1] == 0x0000feff # native byte order
-        d = Array{UInt32}(length(data))
-        copy!(d,1, data, 2, length(data)-1)
-    elseif data[1] == 0xfffe0000 # byte-swapped
-        d = Array{UInt32}(length(data))
-        for i = 2:length(data)
-            @inbounds d[i-1] = bswap(data[i])
-        end
-    else
-        d = Array{UInt32}(length(data) + 1)
-        copy!(d, 1, data, 1, length(data)) # assume native byte order
-    end
-    d[end] = 0 # NULL terminate
-    UTF32String(d)
-end
-
-cconvert(::Type{Cwstring}, v::Vector{UInt32}) = transcode(Cwchar_t, v)
-cconvert(::Type{Cwstring}, s::UTF32String) = transcode(Cwchar_t, s.data)
-
-function isvalid(::Type{UTF32String}, str::Union{Vector{UInt32}, Vector{Char}})
-    for c in str
-        @inbounds if !isvalid(Char, UInt32(c)) ; return false ; end
-    end
-    return true
-end
-isvalid(str::Vector{Char}) = isvalid(UTF32String, str)
-
-utf32(x) = convert(UTF32String, x)
-
-utf32(p::Ptr{UInt32}, len::Integer) = utf32(unsafe_wrap(Array, p, len))
-utf32(p::Union{Ptr{Char}, Ptr{Int32}}, len::Integer) = utf32(convert(Ptr{UInt32}, p), len)
-function utf32(p::Union{Ptr{UInt32}, Ptr{Char}, Ptr{Int32}})
-    len = 0
-    while unsafe_load(p, len+1) != 0; len += 1; end
-    utf32(p, len)
-end
-
-function map(f, s::UTF32String)
-    d = s.data
-    out = similar(d)
-    out[end] = 0
-
-    @inbounds for i = 1:(length(d)-1)
-        c2 = f(Char(d[i]))
-        if !isa(c2, Char)
-            throw(UnicodeError(UTF_ERR_MAP_CHAR, 0, 0))
-        end
-        out[i] = (c2::Char)
-    end
-    UTF32String(out)
-end
-
-pointer(x::Union{UTF16String,UTF32String}) = pointer(x.data)
-pointer(x::Union{UTF16String,UTF32String}, i::Integer) = pointer(x)+(i-1)*sizeof(eltype(x.data))
-pointer{T<:Union{UTF16String,UTF32String}}(x::SubString{T}) = pointer(x.string.data) + x.offset*sizeof(eltype(x.string.data))
-pointer{T<:Union{UTF16String,UTF32String}}(x::SubString{T}, i::Integer) = pointer(x.string.data) + (x.offset + (i-1))*sizeof(eltype(x.string.data))
diff --git a/test/serialize.jl b/test/serialize.jl
index c7c9e7e3d3c34..67a7249c2b91c 100644
--- a/test/serialize.jl
+++ b/test/serialize.jl
@@ -4,8 +4,8 @@ using Base.Test
 
 # Check that serializer hasn't gone out-of-frame
 @test Serializer.sertag(Symbol) == 2
-@test Serializer.sertag(()) == 46
-@test Serializer.sertag(false) == 122
+@test Serializer.sertag(()) == 44
+@test Serializer.sertag(false) == 120
 
 function create_serialization_stream(f::Function)
     s = IOBuffer()
diff --git a/test/strings/basic.jl b/test/strings/basic.jl
index f8a63850a0253..5d62ddb81a842 100644
--- a/test/strings/basic.jl
+++ b/test/strings/basic.jl
@@ -246,28 +246,11 @@ let p = cstrdup("hello")
     Libc.free(p)
 end
 
-# issue # 11389: Vector{UInt32} was copied with UTF32String, unlike Vector{Char}
-a = UInt32[48,0]
-b = UTF32String(a)
-@test b == "0"
-a[1] = 65
-@test b == "A"
-c = Char['0','\0']
-d = UTF32String(c)
-@test d == "0"
-c[1] = 'A'
-@test d == "A"
-
 # iteration
 @test [c for c in "ḟøøƀäṙ"] == ['ḟ', 'ø', 'ø', 'ƀ', 'ä', 'ṙ']
 @test [i for i in eachindex("ḟøøƀäṙ")] == [1, 4, 6, 8, 10, 12]
 @test [x for x in enumerate("ḟøøƀäṙ")] == [(1, 'ḟ'), (2, 'ø'), (3, 'ø'), (4, 'ƀ'), (5, 'ä'), (6, 'ṙ')]
 
-# Issue #11140
-@test isvalid(utf32("a")) == true
-@test isvalid(utf32("\x00")) == true
-@test isvalid(UTF32String, UInt32[0xd800,0]) == false
-
 # test all edge conditions
 for (val, pass) in (
         (0, true), (0xd7ff, true),
@@ -306,39 +289,9 @@ for (val, pass) in (
         )
     @test isvalid(String, val) == pass
 end
-for (val, pass) in (
-        (UInt16[0x0000], true),
-        (UInt16[0xd7ff,0], true),
-        (UInt16[0xd800,0], false),
-        (UInt16[0xdfff,0], false),
-        (UInt16[0xe000,0], true),
-        (UInt16[0xffff,0], true),
-        (UInt16[0xd800,0xdc00,0], true),
-        (UInt16[0xdbff,0xdfff,0], true),
-        (UInt16[0xd800,0x0100,0], false),
-        (UInt16[0xdc00,0x0100,0], false),
-        (UInt16[0xdc00,0xd800,0], false)
-        )
-    @test isvalid(UTF16String, val) == pass
-end
-for (val, pass) in (
-        (UInt32[0x0000], true),
-        (UInt32[0xd7ff,0], true),
-        (UInt32[0xd800,0], false),
-        (UInt32[0xdfff,0], false),
-        (UInt32[0xe000,0], true),
-        (UInt32[0xffff,0], true),
-        (UInt32[0x100000,0], true),
-        (UInt32[0x10ffff,0], true),
-        (UInt32[0x110000,0], false),
-        )
-    @test isvalid(UTF32String, val) == pass
-end
 
 # Issue #11203
 @test isvalid(String, UInt8[]) == true
-@test isvalid(UTF16String,UInt16[]) == true
-@test isvalid(UTF32String,UInt32[]) == true
 
 # Check UTF-8 characters
 # Check ASCII range (true),
@@ -411,21 +364,6 @@ end
 
 # 11482
 
-# isvalid
-let s = "abcdef", u8 = "abcdef\uff", u16 = utf16(u8), u32 = utf32(u8),
-    bad32 = utf32(UInt32[65,0x110000]), badch = Char[0x110000][1]
-
-    @test !isvalid(bad32)
-    @test !isvalid(badch)
-    @test isvalid(s)
-    @test isvalid(u8)
-    @test isvalid(u16)
-    @test isvalid(u32)
-    @test isvalid(String, u8)
-    @test isvalid(UTF16String, u16)
-    @test isvalid(UTF32String, u32)
-end
-
 # lower and upper
 @test uppercase("aBc") == "ABC"
 @test uppercase('A') == 'A'
@@ -458,9 +396,9 @@ str = "abcdef\uff\uffff\u10ffffABCDEF"
 foomap(ch) = (ch > Char(65))
 foobar(ch) = Char(0xd800)
 foobaz(ch) = reinterpret(Char, typemax(UInt32))
-@test_throws UnicodeError map(foomap, utf16(str))
-@test_throws UnicodeError map(foobar, utf16(str))
-@test_throws UnicodeError map(foobaz, utf16(str))
+@test_throws ArgumentError map(foomap, GenericString(str))
+@test map(foobar, GenericString(str)) == String(repeat(b"\ud800", outer=[17]))
+@test map(foobaz, GenericString(str)) == String(repeat(b"\ufffd", outer=[17]))
 
 @test "a".*["b","c"] == ["ab","ac"]
 @test ["b","c"].*"a" == ["ba","ca"]
diff --git a/test/strings/io.jl b/test/strings/io.jl
index 41df929335f39..1ab6be0035c44 100644
--- a/test/strings/io.jl
+++ b/test/strings/io.jl
@@ -136,76 +136,6 @@ end
 @test "\x0f" == unescape_string("\\x0f")
 @test "\x0F" == unescape_string("\\x0F")
 
-extrapath = is_windows() ? joinpath(JULIA_HOME,"..","Git","usr","bin")*";" : ""
-withenv("PATH" => extrapath * ENV["PATH"]) do
-if !success(`iconv --version`)
-    warn("iconv not found, skipping unicode tests!")
-    is_windows() && warn("Use WinRPM.install(\"win_iconv\") to run these tests")
-else
-    # Create unicode test data directory
-    unicodedir = mktempdir()
-
-    # Use perl to generate the primary data
-    primary_encoding = "UTF-32BE"
-    primary_path = replace(joinpath(unicodedir, primary_encoding*".unicode"),"\\","\\\\\\\\")
-    run(`perl -e "
-        $$fname = \"$primary_path\";
-        open(UNICODEF, \">\", \"$$fname\")         or die \"can\'t open $$fname: $$!\";
-        binmode(UNICODEF);
-        print UNICODEF pack \"N*\", 0xfeff, 0..0xd7ff, 0xe000..0x10ffff;
-        close(UNICODEF);"` )
-
-    # Use iconv to generate the other data
-    for encoding in ["UTF-32LE", "UTF-16BE", "UTF-16LE", "UTF-8"]
-        output_path = joinpath(unicodedir, encoding*".unicode")
-        f = Base.Filesystem.open(output_path,Base.JL_O_WRONLY|Base.JL_O_CREAT,Base.S_IRUSR | Base.S_IWUSR | Base.S_IRGRP | Base.S_IROTH)
-        run(pipeline(`iconv -f $primary_encoding -t $encoding $primary_path`, f))
-        Base.Filesystem.close(f)
-    end
-
-    f=open(joinpath(unicodedir,"UTF-32LE.unicode"))
-    str1 = utf32(read(f, UInt32, 1112065)[2:end])
-    close(f)
-
-    f=open(joinpath(unicodedir,"UTF-8.unicode"))
-    str2 = String(read(f, UInt8, 4382595)[4:end])
-    close(f)
-    @test str1 == str2
-
-    @test str1 == utf16(read(joinpath(unicodedir,"UTF-16LE.unicode"),
-                             UInt16, 2160641)[2:end])
-
-    @test str1 == utf16(read(joinpath(unicodedir,"UTF-16LE.unicode"),
-                             UInt8, 2160641*2))
-
-    @test str1 == utf16(read(joinpath(unicodedir,"UTF-16BE.unicode"),
-                             UInt8, 2160641*2))
-
-
-    @test str1 == utf32(read(joinpath(unicodedir,"UTF-32LE.unicode"),
-                             UInt8, 1112065*4))
-
-    @test str1 == utf32(read(joinpath(unicodedir,"UTF-32BE.unicode"),
-                             UInt8, 1112065*4))
-
-
-    str1 = "∀ ε > 0, ∃ δ > 0: |x-y| < δ ⇒ |f(x)-f(y)| < ε"
-    str2 = UTF32String(UInt32[
-                 8704, 32, 949, 32, 62, 32, 48, 44, 32, 8707, 32,
-                 948, 32, 62, 32, 48, 58, 32, 124, 120, 45, 121, 124,
-                 32, 60, 32, 948, 32, 8658, 32, 124, 102, 40, 120,
-                 41, 45, 102, 40, 121, 41, 124, 32, 60, 32, 949
-                 ,0])
-    @test str1 == str2
-
-    # Cleanup unicode data
-    for encoding in ["UTF-32BE", "UTF-32LE", "UTF-16BE", "UTF-16LE", "UTF-8"]
-        rm(joinpath(unicodedir,encoding*".unicode"))
-    end
-    rm(unicodedir)
-end
-end
-
 # Tests of join()
 @test join([]) == ""
 @test join(["a"],"?") == "a"
diff --git a/test/strings/types.jl b/test/strings/types.jl
index 48991654a7a81..0a5aba56d48ad 100644
--- a/test/strings/types.jl
+++ b/test/strings/types.jl
@@ -163,27 +163,29 @@ rs = RevString("foobar")
 @test rsplit(RevString("ailuj"),'l') == ["ju","ia"]
 @test parse(Float64,RevString("64")) === 46.0
 
-# reverseind
-for T in (String, UTF16String, UTF32String)
-    for prefix in ("", "abcd", "\U0001d6a4\U0001d4c1", "\U0001d6a4\U0001d4c1c", " \U0001d6a4\U0001d4c1")
-        for suffix in ("", "abcde", "\U0001d4c1β\U0001d6a4", "\U0001d4c1β\U0001d6a4c", " \U0001d4c1β\U0001d6a4")
-            for c in ('X', 'δ', '\U0001d6a5')
-                s = convert(T, string(prefix, c, suffix))
-                ri = search(reverse(s), c)
-                @test reverse(s) == RevString(s)
-                @test c == s[reverseind(s, ri)] == reverse(s)[ri]
-                s = RevString(s)
-                ri = search(reverse(s), c)
-                @test c == s[reverseind(s, ri)] == reverse(s)[ri]
-                s = convert(T, string(prefix, prefix, c, suffix, suffix))
-                pre = convert(T, prefix)
-                sb = SubString(s, nextind(pre, endof(pre)), endof(convert(T, string(prefix, prefix, c, suffix))))
-                ri = search(reverse(sb), c)
-                @test c == sb[reverseind(sb, ri)] == reverse(sb)[ri]
-            end
-        end
-    end
-end
+# # reverseind
+# for T in (String, GenericString)
+#     for prefix in ("", "abcd", "\U0001d6a4\U0001d4c1", "\U0001d6a4\U0001d4c1c", " \U0001d6a4\U0001d4c1")
+#         for suffix in ("", "abcde", "\U0001d4c1β\U0001d6a4", "\U0001d4c1β\U0001d6a4c", " \U0001d4c1β\U0001d6a4")
+#             for c in ('X', 'δ', '\U0001d6a5')
+#                 @show (T,prefix,suffix,c)
+#                 s = convert(T, string(prefix, c, suffix))
+#                 r = convert(T, String(reverse(s)))
+#                 ri = search(r, c)
+#                 @test r == RevString(s)
+#                 @test c == s[reverseind(s, ri)] == r[ri]
+#                 s = RevString(s)
+#                 ri = search(r, c)
+#                 @test c == s[reverseind(s, ri)] == r[ri]
+#                 s = convert(T, string(prefix, prefix, c, suffix, suffix))
+#                 pre = convert(T, prefix)
+#                 sb = SubString(s, nextind(pre, endof(pre)), endof(convert(T, string(prefix, prefix, c, suffix))))
+#                 ri = search(reverse(sb), c)
+#                 @test c == sb[reverseind(sb, ri)] == reverse(sb)[ri]
+#             end
+#         end
+#     end
+# end
 
 ## Repeat strings ##
 
diff --git a/test/strings/util.jl b/test/strings/util.jl
index d52189554f320..1aeb5b86e143d 100644
--- a/test/strings/util.jl
+++ b/test/strings/util.jl
@@ -21,7 +21,7 @@
 
 for s in ("", " ", " abc", "abc ", "  abc  "), f in (lstrip, rstrip, strip)
     fs = f(s)
-    for T = (String, UTF16String, UTF32String)
+    for T = (String, GenericString)
         t = convert(T,s)
         ft = f(t)
         @test s == t
diff --git a/test/unicode.jl b/test/unicode.jl
index 21f3dd7d48fb4..18666448ef169 100644
--- a/test/unicode.jl
+++ b/test/unicode.jl
@@ -1,9 +1,5 @@
 # This file is a part of Julia. License is MIT: http://julialang.org/license
 
 include("unicode/UnicodeError.jl")
-include("unicode/types.jl")
-include("unicode/checkstring.jl")
 include("unicode/utf8.jl")
-include("unicode/utf16.jl")
-include("unicode/utf32.jl")
 include("unicode/utf8proc.jl")
diff --git a/test/unicode/utf16.jl b/test/unicode/utf16.jl
deleted file mode 100644
index 1c8e31cdece98..0000000000000
--- a/test/unicode/utf16.jl
+++ /dev/null
@@ -1,23 +0,0 @@
-# This file is a part of Julia. License is MIT: http://julialang.org/license
-
-# UTF16
-u8 = "\U10ffff\U1d565\U1d7f6\U00066\U2008a"
-u16 = utf16(u8)
-@test sizeof(u16) == 18
-@test length(u16.data) == 10 && u16.data[end] == 0
-@test length(u16) == 5
-@test String(u16) == u8
-@test collect(u8) == collect(u16)
-@test u8 == utf16(u16.data[1:end-1]) == utf16(copy!(Array(UInt8, 18), 1, reinterpret(UInt8, u16.data), 1, 18))
-@test u8 == utf16(pointer(u16)) == utf16(convert(Ptr{Int16}, pointer(u16)))
-@test_throws UnicodeError utf16(utf32(Char(0x120000)))
-@test_throws UnicodeError utf16(UInt8[1,2,3])
-
-# Add tests for full coverage
-@test convert(UTF16String, "test") == "test"
-@test convert(UTF16String, u16) == u16
-@test convert(UTF16String, UInt16[[0x65, 0x66] [0x67, 0x68]]) == "efgh"
-@test convert(UTF16String, Int16[[0x65, 0x66] [0x67, 0x68]]) == "efgh"
-@test map(lowercase, utf16("TEST\U1f596")) == "test\U1f596"
-@test typeof(Base.unsafe_convert(Ptr{UInt16}, utf16("test"))) == Ptr{UInt16}
-
diff --git a/test/unicode/utf32.jl b/test/unicode/utf32.jl
deleted file mode 100644
index f4455271be054..0000000000000
--- a/test/unicode/utf32.jl
+++ /dev/null
@@ -1,258 +0,0 @@
-# This file is a part of Julia. License is MIT: http://julialang.org/license
-
-# UTF32
-u8 = "\U10ffff\U1d565\U1d7f6\U00066\U2008a"
-u32 = utf32(u8)
-@test sizeof(u32) == 20
-@test length(u32.data) == 6 && u32.data[end] == 0
-@test length(u32) == 5
-@test String(u32) == u8
-@test collect(u8) == collect(u32)
-@test u8 == utf32(u32.data[1:end-1]) == utf32(copy!(Array{UInt8}(20), 1, reinterpret(UInt8, u32.data), 1, 20))
-@test u8 == utf32(pointer(u32)) == utf32(convert(Ptr{Int32}, pointer(u32)))
-@test_throws UnicodeError utf32(UInt8[1,2,3])
-
-# issue #11551 (#11004,#10959)
-function tstcvt(strUTF8::String, strUTF16::UTF16String, strUTF32::UTF32String)
-    @test utf16(strUTF8) == strUTF16
-    @test utf32(strUTF8) == strUTF32
-    @test String(strUTF16) == strUTF8
-    @test utf32(strUTF16) == strUTF32
-    @test String(strUTF32)  == strUTF8
-    @test utf16(strUTF32) == strUTF16
-end
-
-# Create some ASCII, UTF8, UTF16, and UTF32 strings
-
-strAscii = "abcdefgh"
-strA_UTF8 = ("abcdefgh\uff")[1:8]
-strL_UTF8 = "abcdef\uff\uff"
-str2_UTF8 = "abcd\uff\uff\u7ff\u7ff"
-str3_UTF8 = "abcd\uff\uff\u7fff\u7fff"
-str4_UTF8 = "abcd\uff\u7ff\u7fff\U7ffff"
-strS_UTF8 = String(b"abcd\xc3\xbf\xdf\xbf\xe7\xbf\xbf\xed\xa0\x80\xed\xb0\x80")
-strC_UTF8 = String(b"abcd\xc3\xbf\xdf\xbf\xe7\xbf\xbf\U10000")
-strz_UTF8 = String(b"abcd\xc3\xbf\xdf\xbf\xe7\xbf\xbf\0")
-strZ      = b"abcd\xc3\xbf\xdf\xbf\xe7\xbf\xbf\xc0\x80"
-
-strA_UTF16 = utf16(strA_UTF8)
-strL_UTF16 = utf16(strL_UTF8)
-str2_UTF16 = utf16(str2_UTF8)
-str3_UTF16 = utf16(str3_UTF8)
-str4_UTF16 = utf16(str4_UTF8)
-strS_UTF16 = utf16(strS_UTF8)
-
-strA_UTF32 = utf32(strA_UTF8)
-strL_UTF32 = utf32(strL_UTF8)
-str2_UTF32 = utf32(str2_UTF8)
-str3_UTF32 = utf32(str3_UTF8)
-str4_UTF32 = utf32(str4_UTF8)
-strS_UTF32 = utf32(strS_UTF8)
-
-@test String(strAscii) == strAscii
-@test utf16(strAscii) == strAscii
-@test utf32(strAscii) == strAscii
-
-tstcvt(strA_UTF8,strA_UTF16,strA_UTF32)
-tstcvt(strL_UTF8,strL_UTF16,strL_UTF32)
-tstcvt(str2_UTF8,str2_UTF16,str2_UTF32)
-tstcvt(str3_UTF8,str3_UTF16,str3_UTF32)
-tstcvt(str4_UTF8,str4_UTF16,str4_UTF32)
-
-# Test converting surrogate pairs
-@test utf16(strS_UTF8) == strC_UTF8
-@test utf32(strS_UTF8) == strC_UTF8
-@test String(strS_UTF16) == strC_UTF8
-@test utf32(strS_UTF16) == strC_UTF8
-@test String(strS_UTF32)  == strC_UTF8
-@test utf16(strS_UTF32) == strC_UTF8
-
-# Test converting overlong \0
-@test convert(String, strZ) == strz_UTF8
-@test utf16(String(strZ)) == strz_UTF8
-@test utf32(String(strZ)) == strz_UTF8
-
-# Test invalid sequences
-
-strval(::Type{String}, dat) = dat
-strval(::Union{Type{UTF16String},Type{UTF32String}}, dat) = String(dat)
-
-byt = 0x0
-for T in (String, UTF16String, UTF32String)
-    try
-    # Continuation byte not after lead
-    for byt in 0x80:0xbf
-        @test_throws UnicodeError convert(T,  strval(T, UInt8[byt]))
-    end
-
-    # Test lead bytes
-    for byt in 0xc0:0xff
-        # Single lead byte at end of string
-        @test_throws UnicodeError convert(T, strval(T, UInt8[byt]))
-        # Lead followed by non-continuation character < 0x80
-        @test_throws UnicodeError convert(T, strval(T, UInt8[byt,0]))
-        # Lead followed by non-continuation character > 0xbf
-        @test_throws UnicodeError convert(T, strval(T, UInt8[byt,0xc0]))
-    end
-
-    # Test overlong 2-byte
-    for byt in 0x81:0xbf
-        @test_throws UnicodeError convert(T, strval(T, UInt8[0xc0,byt]))
-    end
-    for byt in 0x80:0xbf
-        @test_throws UnicodeError convert(T, strval(T, UInt8[0xc1,byt]))
-    end
-
-    # Test overlong 3-byte
-    for byt in 0x80:0x9f
-        @test_throws UnicodeError convert(T, strval(T, UInt8[0xe0,byt,0x80]))
-    end
-
-    # Test overlong 4-byte
-    for byt in 0x80:0x8f
-        @test_throws UnicodeError convert(T, strval(T, UInt8[0xef,byt,0x80,0x80]))
-    end
-
-    # Test 4-byte > 0x10ffff
-    for byt in 0x90:0xbf
-        @test_throws UnicodeError convert(T, strval(T, UInt8[0xf4,byt,0x80,0x80]))
-    end
-    for byt in 0xf5:0xf7
-        @test_throws UnicodeError convert(T, strval(T, UInt8[byt,0x80,0x80,0x80]))
-    end
-
-    # Test 5-byte
-    for byt in 0xf8:0xfb
-        @test_throws UnicodeError convert(T, strval(T, UInt8[byt,0x80,0x80,0x80,0x80]))
-    end
-
-    # Test 6-byte
-    for byt in 0xfc:0xfd
-        @test_throws UnicodeError convert(T, strval(T, UInt8[byt,0x80,0x80,0x80,0x80,0x80]))
-    end
-
-    # Test 7-byte
-    @test_throws UnicodeError convert(T, strval(T, UInt8[0xfe,0x80,0x80,0x80,0x80,0x80,0x80]))
-
-    # Three and above byte sequences
-    for byt in 0xe0:0xef
-        # Lead followed by only 1 continuation byte
-        @test_throws UnicodeError convert(T, strval(T, UInt8[byt,0x80]))
-        # Lead ended by non-continuation character < 0x80
-        @test_throws UnicodeError convert(T, strval(T, UInt8[byt,0x80,0]))
-        # Lead ended by non-continuation character > 0xbf
-        @test_throws UnicodeError convert(T, strval(T, UInt8[byt,0x80,0xc0]))
-    end
-
-    # 3-byte encoded surrogate character(s)
-    # Single surrogate
-    @test_throws UnicodeError convert(T, strval(T, UInt8[0xed,0xa0,0x80]))
-    # Not followed by surrogate
-    @test_throws UnicodeError convert(T, strval(T, UInt8[0xed,0xa0,0x80,0xed,0x80,0x80]))
-    # Trailing surrogate first
-    @test_throws UnicodeError convert(T, strval(T, UInt8[0xed,0xb0,0x80,0xed,0xb0,0x80]))
-    # Followed by lead surrogate
-    @test_throws UnicodeError convert(T, strval(T, UInt8[0xed,0xa0,0x80,0xed,0xa0,0x80]))
-
-    # Four byte sequences
-    for byt in 0xf0:0xf4
-        # Lead followed by only 2 continuation bytes
-        @test_throws UnicodeError convert(T, strval(T, UInt8[byt,0x80,0x80]))
-        # Lead followed by non-continuation character < 0x80
-        @test_throws UnicodeError convert(T, strval(T, UInt8[byt,0x80,0x80,0]))
-        # Lead followed by non-continuation character > 0xbf
-        @test_throws UnicodeError convert(T, strval(T, UInt8[byt,0x80,0x80,0xc0]))
-    end
-    catch exp
-        println("Error checking $T: $byt")
-        throw(exp)
-    end
-end
-
-# 12268
-for (fun, S, T) in ((utf16, UInt16, UTF16String), (utf32, UInt32, UTF32String))
-    # AbstractString
-    str = "abcd\0\uff\u7ff\u7fff\U7ffff"
-    tst = SubString(convert(T,str),4)
-    cmp = Char['d','\0','\uff','\u7ff','\u7fff','\U7ffff']
-    cmp32 = UInt32['d','\0','\uff','\u7ff','\u7fff','\U7ffff','\0']
-    cmp16 = UInt16[0x0064,0x0000,0x00ff,0x07ff,0x7fff,0xd9bf,0xdfff,0x0000]
-    x = fun(tst)
-    cmpx = (S == UInt16 ? cmp16 : cmp32)
-    @test typeof(tst) == SubString{T}
-    @test convert(T, tst) == str[4:end]
-    @test convert(Vector{Char}, x) == cmp
-    # Vector{T} / Array{T}
-    @test convert(Vector{S}, x) == cmpx
-    @test convert(Array{S}, x) == cmpx
-    # Embedded nul checking
-    @test Base.containsnul(x)
-    @test Base.containsnul(tst)
-    # map
-    @test_throws UnicodeError map(islower, x)
-    @test_throws ArgumentError map(islower, tst)
-    # SubArray conversion
-    subarr = view(cmp, 1:6)
-    @test convert(T, subarr) == str[4:end]
-end
-
-# Char to UTF32String
-@test utf32('\U7ffff') == utf32("\U7ffff")
-@test convert(UTF32String, '\U7ffff') == utf32("\U7ffff")
-
-@test isvalid(UTF32String, Char['d','\uff','\u7ff','\u7fff','\U7ffff'])
-@test reverse(utf32("abcd \uff\u7ff\u7fff\U7ffff")) == utf32("\U7ffff\u7fff\u7ff\uff dcba")
-
-# Test pointer() functions
-let str = ascii("this ")
-    u8  = String(str)
-    u16 = utf16(str)
-    u32 = utf32(str)
-    pa  = pointer(str)
-    p8  = pointer(u8)
-    p16 = pointer(u16)
-    p32 = pointer(u32)
-    @test typeof(pa) == Ptr{UInt8}
-    @test unsafe_load(pa,1) == 0x74
-    @test typeof(p8) == Ptr{UInt8}
-    @test unsafe_load(p8,1) == 0x74
-    @test typeof(p16) == Ptr{UInt16}
-    @test unsafe_load(p16,1) == 0x74
-    @test typeof(p32) == Ptr{UInt32}
-    @test unsafe_load(p32,1) == 0x74
-    pa  = pointer(str, 2)
-    p8  = pointer(u8,  2)
-    p16 = pointer(u16, 2)
-    p32 = pointer(u32, 2)
-    @test typeof(pa) == Ptr{UInt8}
-    @test unsafe_load(pa,1) == 0x68
-    @test typeof(p8) == Ptr{UInt8}
-    @test unsafe_load(p8,1) == 0x68
-    @test typeof(p16) == Ptr{UInt16}
-    @test unsafe_load(p16,1) == 0x68
-    @test typeof(p32) == Ptr{UInt32}
-    @test unsafe_load(p32,1) == 0x68
-    s8  = SubString{String}(u8, 3, 5)
-    s16 = SubString{UTF16String}(u16, 3, 5)
-    s32 = SubString{UTF32String}(u32, 3, 5)
-    p8  = pointer(s8)
-    p16 = pointer(s16)
-    p32 = pointer(s32)
-    @test typeof(p8) == Ptr{UInt8}
-    @test unsafe_load(p8,1) == 0x69
-    @test typeof(p16) == Ptr{UInt16}
-    @test unsafe_load(p16,1) == 0x69
-    @test typeof(p32) == Ptr{UInt32}
-    @test unsafe_load(p32,1) == 0x69
-    p8  = pointer(s8,  2)
-    p16 = pointer(s16, 2)
-    p32 = pointer(s32, 2)
-    @test typeof(p8) == Ptr{UInt8}
-    @test unsafe_load(p8,1) == 0x73
-    @test typeof(p16) == Ptr{UInt16}
-    @test unsafe_load(p16,1) == 0x73
-    @test typeof(p32) == Ptr{UInt32}
-    @test unsafe_load(p32,1) == 0x73
-end
-
-@test isvalid(Char['f','o','o','b','a','r'])
diff --git a/test/unicode/utf8.jl b/test/unicode/utf8.jl
index 8827642ba9c3a..0a4665146b697 100644
--- a/test/unicode/utf8.jl
+++ b/test/unicode/utf8.jl
@@ -3,9 +3,9 @@
 ## Test for CESU-8 sequences
 
 let ch = 0x10000
-    for hichar = 0xd800:0xdbff
-        for lochar = 0xdc00:0xdfff
-            @test convert(String, String(Char[hichar, lochar]).data) == string(Char(ch))
+    for hi = 0xd800:0xdbff
+        for lo = 0xdc00:0xdfff
+            @test convert(String, String(Char[hi, lo]).data) == string(Char(ch))
             ch += 1
         end
     end
diff --git a/test/unicode/utf8proc.jl b/test/unicode/utf8proc.jl
index 0f1e3b0727e26..3078f05fce080 100644
--- a/test/unicode/utf8proc.jl
+++ b/test/unicode/utf8proc.jl
@@ -234,7 +234,7 @@ let grphtest = (("b\u0300lahβlahb\u0302láh", ["b\u0300","l","a","h",
                                                 "\U1d4c1\u0300"]),
                 ("x",["x"]),
                 ("abc",["a","b","c"]))
-    for T in (String,utf16,utf32)
+    for T in (String,GenericString)
         for nf in (:NFC, :NFD)
             for (s, g) in grphtest
                 s_ = T(normalize_string(s, nf))

From 33be76907eeba40a51c2e80fb074057aadf26ece Mon Sep 17 00:00:00 2001
From: Milan Bouchet-Valat <nalimilan@club.fr>
Date: Sat, 2 Jul 2016 23:12:39 +0200
Subject: [PATCH 2/7] Add reverseind() for AbstractString, re-enable tests with
 GenericString

reverse() for GenericString/AbstractString returns a RevString, whose
indexing behavior is very different from a reverse()'d String which
is returned for String. Thus, calling reverseind() on the underlying String
object is not correct for GenericString. Add a generic but O(n) method for
AbstractString and use it for GenericString.
---
 base/strings/string.jl |  4 +---
 base/strings/types.jl  |  7 +++----
 base/test.jl           |  1 -
 test/strings/types.jl  | 47 +++++++++++++++++++++---------------------
 4 files changed, 28 insertions(+), 31 deletions(-)

diff --git a/base/strings/string.jl b/base/strings/string.jl
index c9637b746ef91..6395c03eac5c7 100644
--- a/base/strings/string.jl
+++ b/base/strings/string.jl
@@ -104,7 +104,7 @@ function first_utf8_byte(ch::Char)
 end
 
 function reverseind(s::String, i::Integer)
-    j = lastidx(s) + 1 - i
+    j = length(s.data) + 1 - i
     d = s.data
     while is_valid_continuation(d[j])
         j -= 1
@@ -116,8 +116,6 @@ end
 
 sizeof(s::String) = sizeof(s.data)
 
-lastidx(s::String) = length(s.data)
-
 isvalid(s::String, i::Integer) =
     (1 <= i <= endof(s.data)) && !is_valid_continuation(s.data[i])
 
diff --git a/base/strings/types.jl b/base/strings/types.jl
index fe3470bac5721..b98ea229f57ab 100644
--- a/base/strings/types.jl
+++ b/base/strings/types.jl
@@ -118,12 +118,11 @@ reverse(s::RevString) = s.string
 
 ## reverse an index i so that reverse(s)[i] == s[reverseind(s,i)]
 
+reverseind(s::AbstractString, i) = chr2ind(s, length(s) + 1 - ind2chr(reverse(s), i))
 reverseind(s::Union{DirectIndexString,SubString{DirectIndexString}}, i::Integer) = length(s) + 1 - i
 reverseind(s::RevString, i::Integer) = endof(s) - i + 1
-lastidx(s::AbstractString) = nextind(s, endof(s)) - 1
-lastidx(s::DirectIndexString) = length(s)
-reverseind(s::SubString, i::Integer) =
-    reverseind(s.string, lastidx(s.string)-s.offset-s.endof+i) - s.offset
+reverseind(s::SubString{String}, i::Integer) =
+    reverseind(s.string, nextind(s.string, endof(s.string))-s.offset-s.endof+i-1) - s.offset
 
 ## efficient representation of repeated strings ##
 
diff --git a/base/test.jl b/base/test.jl
index f9790636f6906..b0382f3284f5c 100644
--- a/base/test.jl
+++ b/base/test.jl
@@ -1009,6 +1009,5 @@ end
 Base.convert(::Type{GenericString}, s::AbstractString) = GenericString(s)
 Base.endof(s::GenericString) = endof(s.string)
 Base.next(s::GenericString, i::Int) = next(s.string, i)
-Base.reverseind(s::GenericString, i::Integer) = reverseind(s.string, i)
 
 end # module
diff --git a/test/strings/types.jl b/test/strings/types.jl
index 0a5aba56d48ad..10e8ca406682b 100644
--- a/test/strings/types.jl
+++ b/test/strings/types.jl
@@ -163,29 +163,30 @@ rs = RevString("foobar")
 @test rsplit(RevString("ailuj"),'l') == ["ju","ia"]
 @test parse(Float64,RevString("64")) === 46.0
 
-# # reverseind
-# for T in (String, GenericString)
-#     for prefix in ("", "abcd", "\U0001d6a4\U0001d4c1", "\U0001d6a4\U0001d4c1c", " \U0001d6a4\U0001d4c1")
-#         for suffix in ("", "abcde", "\U0001d4c1β\U0001d6a4", "\U0001d4c1β\U0001d6a4c", " \U0001d4c1β\U0001d6a4")
-#             for c in ('X', 'δ', '\U0001d6a5')
-#                 @show (T,prefix,suffix,c)
-#                 s = convert(T, string(prefix, c, suffix))
-#                 r = convert(T, String(reverse(s)))
-#                 ri = search(r, c)
-#                 @test r == RevString(s)
-#                 @test c == s[reverseind(s, ri)] == r[ri]
-#                 s = RevString(s)
-#                 ri = search(r, c)
-#                 @test c == s[reverseind(s, ri)] == r[ri]
-#                 s = convert(T, string(prefix, prefix, c, suffix, suffix))
-#                 pre = convert(T, prefix)
-#                 sb = SubString(s, nextind(pre, endof(pre)), endof(convert(T, string(prefix, prefix, c, suffix))))
-#                 ri = search(reverse(sb), c)
-#                 @test c == sb[reverseind(sb, ri)] == reverse(sb)[ri]
-#             end
-#         end
-#     end
-# end
+# reverseind
+for T in (String, GenericString)
+    for prefix in ("", "abcd", "\U0001d6a4\U0001d4c1", "\U0001d6a4\U0001d4c1c", " \U0001d6a4\U0001d4c1")
+        for suffix in ("", "abcde", "\U0001d4c1β\U0001d6a4", "\U0001d4c1β\U0001d6a4c", " \U0001d4c1β\U0001d6a4")
+            for c in ('X', 'δ', '\U0001d6a5')
+                s = convert(T, string(prefix, c, suffix))
+                r = reverse(s)
+                ri = search(r, c)
+                @test r == RevString(s)
+                @test c == s[reverseind(s, ri)] == r[ri]
+                s = RevString(s)
+                r = reverse(s)
+                ri = search(r, c)
+                @test c == s[reverseind(s, ri)] == r[ri]
+                s = convert(T, string(prefix, prefix, c, suffix, suffix))
+                pre = convert(T, prefix)
+                sb = SubString(s, nextind(pre, endof(pre)), endof(convert(T, string(prefix, prefix, c, suffix))))
+                r = reverse(sb)
+                ri = search(r, c)
+                @test c == sb[reverseind(sb, ri)] == r[ri]
+            end
+        end
+    end
+end
 
 ## Repeat strings ##
 

From 593c5de621cf39b930073415dd6c0d66d233baf5 Mon Sep 17 00:00:00 2001
From: Tony Kelman <tony@kelman.net>
Date: Tue, 5 Jul 2016 20:24:24 -0700
Subject: [PATCH 3/7] Delete the now-unused UTF_ERR constants

---
 base/strings/errors.jl | 16 ----------------
 1 file changed, 16 deletions(-)

diff --git a/base/strings/errors.jl b/base/strings/errors.jl
index b1b694df7fdf2..c0ca38ff28db2 100644
--- a/base/strings/errors.jl
+++ b/base/strings/errors.jl
@@ -3,23 +3,7 @@
 ##    Error messages for Unicode / UTF support
 
 const UTF_ERR_SHORT             = "invalid UTF-8 sequence starting at index <<1>> (0x<<2>> missing one or more continuation bytes)"
-const UTF_ERR_CONT              = "invalid UTF-8 sequence starting at index <<1>> (0x<<2>> is not a continuation byte)"
-const UTF_ERR_LONG              = "invalid UTF-8 sequence, overlong encoding starting at index <<1>> (0x<<2>>)"
-const UTF_ERR_NOT_LEAD          = "not a leading Unicode surrogate code unit at index <<1>> (0x<<2>>)"
-const UTF_ERR_NOT_TRAIL         = "not a trailing Unicode surrogate code unit at index <<1>> (0x<<2>>)"
-const UTF_ERR_NOT_SURROGATE     = "not a valid Unicode surrogate code unit at index <<1>> (0x<<2>>)"
-const UTF_ERR_MISSING_SURROGATE = "missing trailing Unicode surrogate code unit after index <<1>> (0x<<2>>)"
-const UTF_ERR_INVALID           = "invalid Unicode character starting at index <<1>> (0x<<2>> > 0x10ffff)"
-const UTF_ERR_SURROGATE         = "surrogate encoding not allowed in UTF-8 or UTF-32, at index <<1>> (0x<<2>>)"
-const UTF_ERR_NULL_16_TERMINATE = "UTF16String data must be NULL-terminated"
-const UTF_ERR_NULL_32_TERMINATE = "UTF32String data must be NULL-terminated"
-const UTF_ERR_ODD_BYTES_16      = "UTF16String can't have odd number of bytes <<1>>"
-const UTF_ERR_ODD_BYTES_32      = "UTF32String must have multiple of 4 bytes <<1>>"
-const UTF_ERR_INVALID_CHAR      = "invalid Unicode character (0x<<2>> > 0x10ffff)"
-const UTF_ERR_INVALID_8         = "invalid UTF-8 data"
-const UTF_ERR_INVALID_16        = "invalid UTF-16 data"
 const UTF_ERR_INVALID_INDEX     = "invalid character index"
-const UTF_ERR_MAP_CHAR          = "map(f,s::AbstractString) requires f to return Char; try map(f,collect(s)) or a comprehension instead"
 
 type UnicodeError <: Exception
     errmsg::AbstractString      ##< A UTF_ERR_ message

From 1c2e5b555ea5c94d7d883a09d00e485fa8fbd4e9 Mon Sep 17 00:00:00 2001
From: Tony Kelman <tony@kelman.net>
Date: Tue, 5 Jul 2016 20:31:31 -0700
Subject: [PATCH 4/7] Doc update for utf16 and utf32 removal

---
 base/docs/helpdb/Base.jl | 12 +++++-------
 doc/manual/strings.rst   |  9 +++------
 doc/stdlib/strings.rst   | 33 ++-------------------------------
 3 files changed, 10 insertions(+), 44 deletions(-)

diff --git a/base/docs/helpdb/Base.jl b/base/docs/helpdb/Base.jl
index f8af1543b5d68..a45a26f0c5225 100644
--- a/base/docs/helpdb/Base.jl
+++ b/base/docs/helpdb/Base.jl
@@ -8856,19 +8856,17 @@ vecnorm
 """
     isvalid(value) -> Bool
 
-Returns `true` if the given value is valid for its type, which currently can be one of
-`Char`, `String`, `UTF16String`, or `UTF32String`.
+Returns `true` if the given value is valid for its type, which currently can be either
+`Char` or `String`.
 """
 isvalid(value)
 
 """
     isvalid(T, value) -> Bool
 
-Returns `true` if the given value is valid for that type. Types currently can be `Char`,
-`String`, `UTF16String`, or `UTF32String` Values for `Char` can be of
-type `Char` or `UInt32` Values for `String` can be of that type, or
-`Vector{UInt8}` Values for `UTF16String` can be `UTF16String` or `Vector{UInt16}` Values for
-`UTF32String` can be `UTF32String`, `Vector{Char}` or `Vector{UInt32}`
+Returns `true` if the given value is valid for that type. Types currently can
+be either `Char` or `String`. Values for `Char` can be of type `Char` or `UInt32`.
+Values for `String` can be of that type, or `Vector{UInt8}`.
 """
 isvalid(T,value)
 
diff --git a/doc/manual/strings.rst b/doc/manual/strings.rst
index 36180463ba46a..45e556f45f99a 100644
--- a/doc/manual/strings.rst
+++ b/doc/manual/strings.rst
@@ -349,12 +349,9 @@ exception handling required:
     <BLANKLINE>
     y
 
-UTF-8 is not the only encoding that Julia supports, and adding support
-for new encodings is quite easy.  In particular, Julia also provides
-:obj:`UTF16String` and :obj:`UTF32String` types, constructed by
-:func:`utf16` and :func:`utf32` respectively, for UTF-16 and
-UTF-32 encodings. Additional discussion of other encodings and how to
-implement support for them is beyond the scope of this document for
+Julia uses UTF-8 encoding by default, and support for new encodings can
+be added by packages. Additional discussion of other encodings and how
+to implement support for them is beyond the scope of this document for
 the time being. For further discussion of UTF-8 encoding issues, see
 the section below on `byte array literals <#Byte+Array+Literals>`_,
 which goes into some greater detail.
diff --git a/doc/stdlib/strings.rst b/doc/stdlib/strings.rst
index 0817c94f0fa56..40ec3e2d69347 100644
--- a/doc/stdlib/strings.rst
+++ b/doc/stdlib/strings.rst
@@ -143,13 +143,13 @@
 
    .. Docstring generated from Julia source
 
-   Returns ``true`` if the given value is valid for its type, which currently can be one of ``Char``\ , ``String``\ , ``UTF16String``\ , or ``UTF32String``\ .
+   Returns ``true`` if the given value is valid for its type, which currently can be either ``Char`` or ``String``\ .
 
 .. function:: isvalid(T, value) -> Bool
 
    .. Docstring generated from Julia source
 
-   Returns ``true`` if the given value is valid for that type. Types currently can be ``Char``\ , ``String``\ , ``UTF16String``\ , or ``UTF32String`` Values for ``Char`` can be of type ``Char`` or ``UInt32`` Values for ``String`` can be of that type, or ``Vector{UInt8}`` Values for ``UTF16String`` can be ``UTF16String`` or ``Vector{UInt16}`` Values for ``UTF32String`` can be ``UTF32String``\ , ``Vector{Char}`` or ``Vector{UInt32}``
+   Returns ``true`` if the given value is valid for that type. Types currently can be either ``Char`` or ``String``\ . Values for ``Char`` can be of type ``Char`` or ``UInt32``\ . Values for ``String`` can be of that type, or ``Vector{UInt8}``\ .
 
 .. function:: isvalid(str, i)
 
@@ -472,32 +472,3 @@
    .. Docstring generated from Julia source
 
    General unescaping of traditional C and Unicode escape sequences. Reverse of :func:`escape_string`\ . See also :func:`unescape_string`\ .
-
-.. function:: utf16(s)
-
-   .. Docstring generated from Julia source
-
-   Create a UTF-16 string from a byte array, array of ``UInt16``\ , or any other string type. (Data must be valid UTF-16. Conversions of byte arrays check for a byte-order marker in the first two bytes, and do not include it in the resulting string.)
-
-   Note that the resulting ``UTF16String`` data is terminated by the NUL codepoint (16-bit zero), which is not treated as a character in the string (so that it is mostly invisible in Julia); this allows the string to be passed directly to external functions requiring NUL-terminated data. This NUL is appended automatically by the ``utf16(s)`` conversion function. If you have a ``UInt16`` array ``A`` that is already NUL-terminated valid UTF-16 data, then you can instead use ``UTF16String(A)`` to construct the string without making a copy of the data and treating the NUL as a terminator rather than as part of the string.
-
-.. function:: utf16(::Union{Ptr{UInt16},Ptr{Int16}} [, length])
-
-   .. Docstring generated from Julia source
-
-   Create a string from the address of a NUL-terminated UTF-16 string. A copy is made; the pointer can be safely freed. If ``length`` is specified, the string does not have to be NUL-terminated.
-
-.. function:: utf32(s)
-
-   .. Docstring generated from Julia source
-
-   Create a UTF-32 string from a byte array, array of ``Char`` or ``UInt32``\ , or any other string type. (Conversions of byte arrays check for a byte-order marker in the first four bytes, and do not include it in the resulting string.)
-
-   Note that the resulting ``UTF32String`` data is terminated by the NUL codepoint (32-bit zero), which is not treated as a character in the string (so that it is mostly invisible in Julia); this allows the string to be passed directly to external functions requiring NUL-terminated data. This NUL is appended automatically by the ``utf32(s)`` conversion function. If you have a ``Char`` or ``UInt32`` array ``A`` that is already NUL-terminated UTF-32 data, then you can instead use ``UTF32String(A)`` to construct the string without making a copy of the data and treating the NUL as a terminator rather than as part of the string.
-
-.. function:: utf32(::Union{Ptr{Char},Ptr{UInt32},Ptr{Int32}} [, length])
-
-   .. Docstring generated from Julia source
-
-   Create a string from the address of a NUL-terminated UTF-32 string. A copy is made; the pointer can be safely freed. If ``length`` is specified, the string does not have to be NUL-terminated.
-

From d7b8361754863e5a7c6bb907b0e14eeb24465b19 Mon Sep 17 00:00:00 2001
From: Tony Kelman <tony@kelman.net>
Date: Tue, 5 Jul 2016 20:21:50 -0700
Subject: [PATCH 5/7] Delete test/unicode/types.jl and
 test/unicode/checkstring.jl

since their code in base has been removed
---
 test/unicode/checkstring.jl | 175 ------------------------------------
 test/unicode/types.jl       |  11 ---
 2 files changed, 186 deletions(-)
 delete mode 100644 test/unicode/checkstring.jl
 delete mode 100644 test/unicode/types.jl

diff --git a/test/unicode/checkstring.jl b/test/unicode/checkstring.jl
deleted file mode 100644
index 8fd9c81554b7a..0000000000000
--- a/test/unicode/checkstring.jl
+++ /dev/null
@@ -1,175 +0,0 @@
-# This file is a part of Julia. License is MIT: http://julialang.org/license
-
-# 11575
-# Test invalid sequences
-
-byt = 0x0 # Needs to be defined outside the try block!
-try
-    # Continuation byte not after lead
-    for byt in 0x80:0xbf
-        @test_throws UnicodeError Base.checkstring(UInt8[byt])
-    end
-
-    # Test lead bytes
-    for byt in 0xc0:0xff
-        # Single lead byte at end of string
-        @test_throws UnicodeError Base.checkstring(UInt8[byt])
-        # Lead followed by non-continuation character < 0x80
-        @test_throws UnicodeError Base.checkstring(UInt8[byt,0])
-        # Lead followed by non-continuation character > 0xbf
-        @test_throws UnicodeError Base.checkstring(UInt8[byt,0xc0])
-    end
-
-    # Test overlong 2-byte
-    for byt in 0x81:0xbf
-        @test_throws UnicodeError Base.checkstring(UInt8[0xc0,byt])
-    end
-    for byt in 0x80:0xbf
-        @test_throws UnicodeError Base.checkstring(UInt8[0xc1,byt])
-    end
-
-    # Test overlong 3-byte
-    for byt in 0x80:0x9f
-        @test_throws UnicodeError Base.checkstring(UInt8[0xe0,byt,0x80])
-    end
-
-    # Test overlong 4-byte
-    for byt in 0x80:0x8f
-        @test_throws UnicodeError Base.checkstring(UInt8[0xef,byt,0x80,0x80])
-    end
-
-    # Test 4-byte > 0x10ffff
-    for byt in 0x90:0xbf
-        @test_throws UnicodeError Base.checkstring(UInt8[0xf4,byt,0x80,0x80])
-    end
-    for byt in 0xf5:0xf7
-        @test_throws UnicodeError Base.checkstring(UInt8[byt,0x80,0x80,0x80])
-    end
-
-    # Test 5-byte
-    for byt in 0xf8:0xfb
-        @test_throws UnicodeError Base.checkstring(UInt8[byt,0x80,0x80,0x80,0x80])
-    end
-
-    # Test 6-byte
-    for byt in 0xfc:0xfd
-        @test_throws UnicodeError Base.checkstring(UInt8[byt,0x80,0x80,0x80,0x80,0x80])
-    end
-
-    # Test 7-byte
-    @test_throws UnicodeError Base.checkstring(UInt8[0xfe,0x80,0x80,0x80,0x80,0x80,0x80])
-
-    # Three and above byte sequences
-    for byt in 0xe0:0xef
-        # Lead followed by only 1 continuation byte
-        @test_throws UnicodeError Base.checkstring(UInt8[byt,0x80])
-        # Lead ended by non-continuation character < 0x80
-        @test_throws UnicodeError Base.checkstring(UInt8[byt,0x80,0])
-        # Lead ended by non-continuation character > 0xbf
-        @test_throws UnicodeError Base.checkstring(UInt8[byt,0x80,0xc0])
-    end
-
-    # 3-byte encoded surrogate character(s)
-    # Single surrogate
-    @test_throws UnicodeError Base.checkstring(UInt8[0xed,0xa0,0x80])
-    # Not followed by surrogate
-    @test_throws UnicodeError Base.checkstring(UInt8[0xed,0xa0,0x80,0xed,0x80,0x80])
-    # Trailing surrogate first
-    @test_throws UnicodeError Base.checkstring(UInt8[0xed,0xb0,0x80,0xed,0xb0,0x80])
-    # Followed by lead surrogate
-    @test_throws UnicodeError Base.checkstring(UInt8[0xed,0xa0,0x80,0xed,0xa0,0x80])
-
-    # Four byte sequences
-    for byt in 0xf0:0xf4
-        # Lead followed by only 2 continuation bytes
-        @test_throws UnicodeError Base.checkstring(UInt8[byt,0x80,0x80])
-        # Lead followed by non-continuation character < 0x80
-        @test_throws UnicodeError Base.checkstring(UInt8[byt,0x80,0x80,0])
-        # Lead followed by non-continuation character > 0xbf
-        @test_throws UnicodeError Base.checkstring(UInt8[byt,0x80,0x80,0xc0])
-    end
-
-    # Long encoding of 0x01
-    @test_throws UnicodeError convert(String, b"\xf0\x80\x80\x80")
-    # Test ends of long encoded surrogates
-    @test_throws UnicodeError convert(String, b"\xf0\x8d\xa0\x80")
-    @test_throws UnicodeError convert(String, b"\xf0\x8d\xbf\xbf")
-    # Long encodings
-    @test_throws UnicodeError Base.checkstring(b"\xf0\x80\x80\x80")
-    @test Base.checkstring(b"\xc0\x81"; accept_long_char=true) == (1,0x1,0,0,0)
-    @test Base.checkstring(b"\xf0\x80\x80\x80"; accept_long_char=true) == (1,0x1,0,0,0)
-catch exp;
-    println("Error testing checkstring: $byt, $exp")
-    throw(exp)
-end
-
-# Surrogates
-@test_throws UnicodeError Base.checkstring(UInt16[0xd800])
-@test_throws UnicodeError Base.checkstring(UInt16[0xdc00])
-@test_throws UnicodeError Base.checkstring(UInt16[0xdc00,0xd800])
-
-# Surrogates in UTF-32
-@test_throws UnicodeError Base.checkstring(UInt32[0xd800])
-@test_throws UnicodeError Base.checkstring(UInt32[0xdc00])
-@test_throws UnicodeError Base.checkstring(UInt32[0xdc00,0xd800])
-
-# Characters > 0x10ffff
-@test_throws UnicodeError Base.checkstring(UInt32[0x110000])
-
-# Test starting and different position
-@test Base.checkstring(UInt32[0x110000, 0x1f596], 2) == (1,0x10,1,0,0)
-
-# Test valid sequences
-for (seq, res) in (
-    (UInt8[0x0],                (1,0,0,0,0)),   # Nul byte, beginning of ASCII range
-    (UInt8[0x7f],               (1,0,0,0,0)),   # End of ASCII range
-    (UInt8[0xc0,0x80],          (1,1,0,0,0)),   # Long encoded Nul byte (Modified UTF-8, Java)
-    (UInt8[0xc2,0x80],          (1,2,0,0,1)),   # \u80, beginning of Latin1 range
-    (UInt8[0xc3,0xbf],          (1,2,0,0,1)),   # \uff, end of Latin1 range
-    (UInt8[0xc4,0x80],          (1,4,0,0,1)),   # \u100, beginning of non-Latin1 2-byte range
-    (UInt8[0xdf,0xbf],          (1,4,0,0,1)),   # \u7ff, end of non-Latin1 2-byte range
-    (UInt8[0xe0,0xa0,0x80],     (1,8,0,1,0)),   # \u800, beginning of 3-byte range
-    (UInt8[0xed,0x9f,0xbf],     (1,8,0,1,0)),   # \ud7ff, end of first part of 3-byte range
-    (UInt8[0xee,0x80,0x80],     (1,8,0,1,0)),   # \ue000, beginning of second part of 3-byte range
-    (UInt8[0xef,0xbf,0xbf],     (1,8,0,1,0)),   # \uffff, end of 3-byte range
-    (UInt8[0xf0,0x90,0x80,0x80],(1,16,1,0,0)),  # \U10000, beginning of 4-byte range
-    (UInt8[0xf4,0x8f,0xbf,0xbf],(1,16,1,0,0)),  # \U10ffff, end of 4-byte range
-    (UInt8[0xed,0xa0,0x80,0xed,0xb0,0x80], (1,0x30,1,0,0)), # Overlong \U10000, (CESU-8)
-    (UInt8[0xed,0xaf,0xbf,0xed,0xbf,0xbf], (1,0x30,1,0,0)), # Overlong \U10ffff, (CESU-8)
-    (UInt16[0x0000],            (1,0,0,0,0)),   # Nul byte, beginning of ASCII range
-    (UInt16[0x007f],            (1,0,0,0,0)),   # End of ASCII range
-    (UInt16[0x0080],            (1,2,0,0,1)),   # Beginning of Latin1 range
-    (UInt16[0x00ff],            (1,2,0,0,1)),   # End of Latin1 range
-    (UInt16[0x0100],            (1,4,0,0,1)),   # Beginning of non-Latin1 2-byte range
-    (UInt16[0x07ff],            (1,4,0,0,1)),   # End of non-Latin1 2-byte range
-    (UInt16[0x0800],            (1,8,0,1,0)),   # Beginning of 3-byte range
-    (UInt16[0xd7ff],            (1,8,0,1,0)),   # End of first part of 3-byte range
-    (UInt16[0xe000],            (1,8,0,1,0)),   # Beginning of second part of 3-byte range
-    (UInt16[0xffff],            (1,8,0,1,0)),   # End of 3-byte range
-    (UInt16[0xd800,0xdc00],     (1,16,1,0,0)),  # \U10000, beginning of 4-byte range
-    (UInt16[0xdbff,0xdfff],     (1,16,1,0,0)),  # \U10ffff, end of 4-byte range
-    (UInt32[0x0000],            (1,0,0,0,0)),   # Nul byte, beginning of ASCII range
-    (UInt32[0x007f],            (1,0,0,0,0)),   # End of ASCII range
-    (UInt32[0x0080],            (1,2,0,0,1)),   # Beginning of Latin1 range
-    (UInt32[0x00ff],            (1,2,0,0,1)),   # End of Latin1 range
-    (UInt32[0x0100],            (1,4,0,0,1)),   # Beginning of non-Latin1 2-byte range
-    (UInt32[0x07ff],            (1,4,0,0,1)),   # End of non-Latin1 2-byte range
-    (UInt32[0x0800],            (1,8,0,1,0)),   # Beginning of 3-byte range
-    (UInt32[0xd7ff],            (1,8,0,1,0)),   # End of first part of 3-byte range
-    (UInt32[0xe000],            (1,8,0,1,0)),   # Beginning of second part of 3-byte range
-    (UInt32[0xffff],            (1,8,0,1,0)),   # End of 3-byte range
-    (UInt32[0x10000],           (1,16,1,0,0)),  # \U10000, beginning of 4-byte range
-    (UInt32[0x10ffff],          (1,16,1,0,0)),  # \U10ffff, end of 4-byte range
-    (UInt32[0xd800,0xdc00],     (1,0x30,1,0,0)),# Overlong \U10000, (CESU-8)
-    (UInt32[0xdbff,0xdfff],     (1,0x30,1,0,0)))# Overlong \U10ffff, (CESU-8)
-    @test Base.checkstring(seq) == res
-end
-
-# Test bounds checking
-@test_throws BoundsError Base.checkstring(b"abcdef", -10)
-@test_throws BoundsError Base.checkstring(b"abcdef", 0)
-@test_throws BoundsError Base.checkstring(b"abcdef", 7)
-@test_throws BoundsError Base.checkstring(b"abcdef", 3, -10)
-@test_throws BoundsError Base.checkstring(b"abcdef", 3, 0)
-@test_throws BoundsError Base.checkstring(b"abcdef", 3, 7)
-@test_throws ArgumentError Base.checkstring(b"abcdef", 3, 1)
diff --git a/test/unicode/types.jl b/test/unicode/types.jl
deleted file mode 100644
index 919de34cc1786..0000000000000
--- a/test/unicode/types.jl
+++ /dev/null
@@ -1,11 +0,0 @@
-# This file is a part of Julia. License is MIT: http://julialang.org/license
-
-nullstring16 = UInt16[]
-badstring16  = UInt16[0x0065]
-@test_throws UnicodeError UTF16String(nullstring16)
-@test_throws UnicodeError UTF16String(badstring16)
-
-nullstring32 = UInt32[]
-badstring32  = UInt32['a']
-@test_throws UnicodeError UTF32String(nullstring32)
-@test_throws UnicodeError UTF32String(badstring32)

From 145dd586a91d7f6a6ed0f197e543069ca4bf7f17 Mon Sep 17 00:00:00 2001
From: Tony Kelman <tony@kelman.net>
Date: Tue, 5 Jul 2016 20:41:30 -0700
Subject: [PATCH 6/7] One more doc update about utf-16 etc in
 calling-c-and-fortran-code

---
 doc/manual/calling-c-and-fortran-code.rst | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/doc/manual/calling-c-and-fortran-code.rst b/doc/manual/calling-c-and-fortran-code.rst
index 84e3b35ae4010..cc4fdea8a935e 100644
--- a/doc/manual/calling-c-and-fortran-code.rst
+++ b/doc/manual/calling-c-and-fortran-code.rst
@@ -487,10 +487,10 @@ C name                  Standard Julia Alias    Julia Base Type
 
     For ``wchar_t*`` arguments, the Julia type should be ``Cwstring`` (if the C
     routine expects a NUL-terminated string) or ``Ptr{Cwchar_t}`` otherwise. Note
-    also that ASCII, UTF-8, UTF-16, and UTF-32 string data in Julia is internally
-    NUL-terminated, so it can be passed to C functions expecting NUL-terminated
-    data without making a copy (but using the ``Cwstring`` type will cause an
-    error to be thrown if the string itself contains NUL characters).
+    also that UTF-8 string data in Julia is internally NUL-terminated, so it can
+    be passed to C functions expecting NUL-terminated data without making a copy
+    (but using the ``Cwstring`` type will cause an error to be thrown if the string
+    itself contains NUL characters).
 
 .. note::
 

From 3098e6cd15f9d271ef3d70612e026c4673192600 Mon Sep 17 00:00:00 2001
From: Tony Kelman <tony@kelman.net>
Date: Thu, 7 Jul 2016 10:24:25 -0700
Subject: [PATCH 7/7] Add link to LegacyStrings.jl in doc/manual/strings.rst

---
 doc/manual/strings.rst | 38 +++++++++++++++++++-------------------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/doc/manual/strings.rst b/doc/manual/strings.rst
index 45e556f45f99a..a5a89f554425c 100644
--- a/doc/manual/strings.rst
+++ b/doc/manual/strings.rst
@@ -54,9 +54,8 @@ There are a few noteworthy high-level features about Julia's strings:
    strings.
 -  Julia supports the full range of
    `Unicode <https://en.wikipedia.org/wiki/Unicode>`_ characters: literal
-   strings are always `ASCII <https://en.wikipedia.org/wiki/ASCII>`_ or
-   `UTF-8 <https://en.wikipedia.org/wiki/UTF-8>`_ but other encodings for
-   strings from external sources can be supported.
+   strings are always `UTF-8 <https://en.wikipedia.org/wiki/UTF-8>`_ but
+   other encodings for strings from external sources can be supported.
 
 .. _man-characters:
 
@@ -272,8 +271,8 @@ string literals:
 
 Whether these Unicode characters are displayed as escapes or shown as
 special characters depends on your terminal's locale settings and its
-support for Unicode. Non-ASCII string literals are encoded using the
-UTF-8 encoding. UTF-8 is a variable-width encoding, meaning that not all
+support for Unicode. String literals are encoded using the UTF-8
+encoding. UTF-8 is a variable-width encoding, meaning that not all
 characters are encoded in the same number of bytes. In UTF-8, ASCII
 characters — i.e. those with code points less than 0x80 (128) — are
 encoded as they are in ASCII, using a single byte, while code points
@@ -317,11 +316,11 @@ inefficient and verbose way to iterate through the characters of ``s``:
 .. doctest::
 
     julia> for i = 1:endof(s)
-             try
-               println(s[i])
-             catch
-               # ignore the index error
-             end
+               try
+                   println(s[i])
+               catch
+                   # ignore the index error
+               end
            end
     ∀
     <BLANKLINE>
@@ -339,7 +338,7 @@ exception handling required:
 .. doctest::
 
     julia> for c in s
-             println(c)
+               println(c)
            end
     ∀
     <BLANKLINE>
@@ -350,10 +349,12 @@ exception handling required:
     y
 
 Julia uses UTF-8 encoding by default, and support for new encodings can
-be added by packages. Additional discussion of other encodings and how
-to implement support for them is beyond the scope of this document for
-the time being. For further discussion of UTF-8 encoding issues, see
-the section below on `byte array literals <#Byte+Array+Literals>`_,
+be added by packages. For example, the `LegacyStrings.jl
+<https://github.com/JuliaArchive/LegacyStrings.jl>`_ package implements
+``UTF16String`` and ``UTF32String`` types. Additional discussion of other
+encodings and how to implement support for them is beyond the scope of this
+document for the time being. For further discussion of UTF-8 encoding issues,
+see the section below on `byte array literals <#Byte+Array+Literals>`_,
 which goes into some greater detail.
 
 .. _man-string-interpolation:
@@ -903,10 +904,9 @@ encodings.
 
 If this is all extremely confusing, try reading `"The Absolute Minimum
 Every Software Developer Absolutely, Positively Must Know About Unicode
-and Character
-Sets" <http://www.joelonsoftware.com/articles/Unicode.html>`_. It's an
-excellent introduction to Unicode and UTF-8, and may help alleviate some
-confusion regarding the matter.
+and Character Sets" <http://www.joelonsoftware.com/articles/Unicode.html>`_.
+It's an excellent introduction to Unicode and UTF-8, and may help alleviate
+some confusion regarding the matter.
 
 .. _man-version-number-literals: