diff --git a/.travis.yml b/.travis.yml
index 39a1783..b5d6676 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -3,14 +3,9 @@ os:
   - osx
   - linux
 julia:
-  - 0.7
-  - 1.0
-  - 1.2
+  - 1.4
   - nightly
 notifications:
   email: false
-script:
-  - if [ -a .git/shallow ]; then git fetch --unshallow; fi
-  - julia -e 'using Pkg; Pkg.build(); Pkg.test(coverage=true)';
 after_success:
-  - julia -e 'cd(Pkg.dir("SIMD")); Pkg.add("Coverage"); using Coverage; Coveralls.submit(Coveralls.process_folder())';
+  - julia -e 'Pkg.add("Coverage"); using Coverage; Coveralls.submit(Coveralls.process_folder())';
diff --git a/LICENSE.md b/LICENSE.md
index 6787457..546253e 100644
--- a/LICENSE.md
+++ b/LICENSE.md
@@ -1,6 +1,6 @@
 The SIMD.jl package is licensed under the Simplified "2-clause" BSD License:
 
-> Copyright (c) 2016: Erik Schnetter.
+> Copyright (c) 2016-2020: Erik Schnetter, Kristoffer Carlsson, Julia Computing
 > All rights reserved.
 >
 > Redistribution and use in source and binary forms, with or without
diff --git a/Project.toml b/Project.toml
index 03faf2f..bdf8379 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,10 +1,10 @@
 name = "SIMD"
 uuid = "fdea26ae-647d-5447-a871-4b548cad5224"
-authors = ["Erik Schnetter "]
-version = "2.8.0"
+authors = ["Erik Schnetter ", "Kristoffer Carlsson "]
+version = "2.9.0"
 
 [compat]
-julia = "1"
+julia = "1.4"
 
 [extras]
 InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
diff --git a/README.md b/README.md
index 760138c..d12f5b3 100644
--- a/README.md
+++ b/README.md
@@ -32,7 +32,12 @@ function vadd!(xs::Vector{T}, ys::Vector{T}, ::Type{Vec{N,T}}) where {N, T}
     end
 end
 ```
+
 To simplify this example code, the vector type that should be used (`Vec{N,T}`) is passed in explicitly as an additional type argument. This routine is e.g. called as `vadd!(xs, ys, Vec{8,Float64})`.
+Note that this code is not expected to outperform the standard scalar way of
+doing this operation, since the Julia optimizer will easily rewrite it to use
+SIMD under the hood. It is merely shown as an illustration of how to load and
+store data into `Vector`s using SIMD.jl.
 
 ## SIMD vector operations
 
@@ -46,14 +51,13 @@ The SIMD package provides the usual arithmetic and logical operations for SIMD v
 `abs cbrt ceil copysign cos div exp exp10 exp2 flipsign floor fma inv isfinite isinf isnan issubnormal log log10 log2 muladd rem round sign signbit sin sqrt trunc vifelse`
 
-(Currently missing: `count_ones count_zeros exponent ldexp leading_ones leading_zeros significand trailing_ones trailing_zeros`, many trigonometric functions)
-
-(Also currently missing: Type conversions, reinterpretation that changes the vector size)
+(Currently missing: `exponent ldexp significand`, many trigonometric functions)
 
 These operators and functions are always applied element-wise, i.e. they are applied to each element in parallel, yielding again a SIMD vector as result. This means that e.g. multiplying two vectors yields a vector, and comparing two vectors yields a vector of booleans. This behaviour might seem strange and slightly unusual, but corresponds to the machine instructions provided by the hardware. It is also what is usually needed to vectorize loops.
 
 The SIMD package also provides conversion operators from scalars and tuples to SIMD vectors and from SIMD vectors to tuples. Additionally, there are `getindex` and `setindex` functions to access individual vector elements. SIMD vectors are immutable (like tuples), and `setindex` (note there is no exclamation mark at the end of the name) thus returns the modified vector. 
-```Julia
+
+```julia
 # Create a vector where all elements are Float64(1):
 xs = Vec{4,Float64}(1)
 
@@ -63,7 +67,7 @@ ys1 = NTuple{4,Float32}(ys)
 y2 = ys[2] # getindex
 
 # Update one element of a vector:
-ys = setindex(ys, 5, 3) # cannot use ys[3] = 5
+ys = Base.setindex(ys, 5, 3) # cannot use ys[3] = 5
 ```
 
 ## Reduction operations
@@ -73,12 +77,87 @@ Reduction operations reduce a SIMD vector to a scalar. The following reduction o
 `all any maximum minimum sum prod`
 
 Example:
-```Julia
+
+```julia
 v = Vec{4,Float64}((1,2,3,4))
 sum(v)
 10.0
 ```
 
+It is also possible to use `reduce` with bit operations:
+
+```julia
+julia> v = Vec{4,UInt16}((1,2,3,4))
+<4 x UInt16>[0x0001, 0x0002, 0x0003, 0x0004]
+
+julia> reduce(|, v)
+0x0007
+
+julia> reduce(&, v)
+0x0000
+```
+
+## Overflow operations
+
+Overflow operations perform the operation but also return a flag that indicates
+whether the result of the operation overflowed.
+The functions `Base.Checked.add_with_overflow`, `Base.Checked.sub_with_overflow`, and
+`Base.Checked.mul_with_overflow` are extended to work on `Vec`.
+Note that these only work on Julia with LLVM 9 or higher (Julia 1.5 or higher):
+
+```julia
+julia> v = Vec{4, Int8}((40, -80, 70, -10))
+<4 x Int8>[40, -80, 70, -10]
+
+julia> Base.Checked.add_with_overflow(v, v)
+(<4 x Int8>[80, 96, -116, -20], <4 x Bool>[0, 1, 1, 0])
+
+julia> Base.Checked.add_with_overflow(Int8(-80), Int8(-80))
+(96, true)
+
+julia> Base.Checked.sub_with_overflow(v, 120)
+(<4 x Int8>[-80, 56, -50, 126], <4 x Bool>[0, 1, 0, 1])
+
+julia> Base.Checked.mul_with_overflow(v, 2)
+(<4 x Int8>[80, 96, -116, -20], <4 x Bool>[0, 1, 1, 0])
+```
+
+## Saturation arithmetic
+
+Saturation arithmetic is a version of arithmetic in which operations are limited
+to a fixed range between a minimum and maximum value. If the result of an
+operation is greater than the maximum value, the result is set (or “clamped”) to
+this maximum. If it is below the minimum, it is clamped to this minimum.
+
+```julia
+julia> v = Vec{4, Int8}((40, -80, 70, -10))
+<4 x Int8>[40, -80, 70, -10]
+
+julia> SIMD.add_saturate(v, v)
+<4 x Int8>[80, -128, 127, -20]
+
+julia> SIMD.sub_saturate(v, 120)
+<4 x Int8>[-80, -128, -50, -128]
+```
+
+## Fastmath
+
+SIMD.jl hooks into the `@fastmath` macro so that operations in a
+`@fastmath` block set the `fast` flag on the floating point intrinsics
+that support it. Compare, for example, the generated code for the
+following two functions:
+
+```julia
+f1(a, b, c) = a * b - c * 2.0
+f2(a, b, c) = @fastmath a * b - c * 2.0
+V = Vec{4, Float64}
+code_native(f1, Tuple{V, V, V}, debuginfo=:none)
+code_native(f2, Tuple{V, V, V}, debuginfo=:none)
+```
+
+The normal caveats for using `@fastmath` naturally apply.
+
 ## Accessing arrays
 
 When using explicit SIMD vectorization, it is convenient to allocate arrays still as arrays of scalars, not as arrays of vectors. The `vload` and `vstore` functions allow reading vectors from and writing vectors into arrays, accessing several contiguous array elements. 
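+
+For example, a minimal sketch (assuming `vload(Vec{N,T}, arr, i)` reads the
+elements `arr[i:i+N-1]` and `vstore(v, arr, i)` writes them back to the same
+positions, as in the `vadd!` example above):
+
+```julia
+arr = Vector{Float64}(1:16)
+xs = vload(Vec{4,Float64}, arr, 5)  # read arr[5:8] into a Vec{4,Float64}
+vstore(xs + 1, arr, 5)              # write the incremented values back to arr[5:8]
+```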
diff --git a/REQUIRE b/REQUIRE
deleted file mode 100644
index 859ad46..0000000
--- a/REQUIRE
+++ /dev/null
@@ -1 +0,0 @@
-julia 0.7
diff --git a/appveyor.yml b/appveyor.yml
index 5fe5251..63410c2 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -1,8 +1,6 @@
 environment:
   matrix:
-  - julia_version: 0.7
-  - julia_version: 1.0
-  - julia_version: 1.2
+  - julia_version: 1.4
   - julia_version: nightly
 
 platform:
@@ -42,3 +40,4 @@ test_script:
 on_success:
   - echo "%JL_CODECOV_SCRIPT%"
   - C:\julia\bin\julia -e "%JL_CODECOV_SCRIPT%"
+
diff --git a/src/LLVM_intrinsics.jl b/src/LLVM_intrinsics.jl
new file mode 100644
index 0000000..405e0d5
--- /dev/null
+++ b/src/LLVM_intrinsics.jl
@@ -0,0 +1,703 @@
+# LLVM operations and intrinsics
+module Intrinsics
+
+# Note that in the functions below, some care needs to be taken when passing
+# Julia Bools to LLVM. Julia passes Bools as LLVM i8, but expects them to only
+# have the last bit as non-zero. Failure to comply with this can give weird errors
+# like false !== false, where the first false is the result of some computation.
+
+# Note that no difference is made between Julia unsigned integers and signed
+# integers when passed to LLVM. It is up to the caller to make sure that the
+# correct intrinsic is called (e.g. uitofp vs. sitofp).
+
+import ..SIMD: SIMD, VE, LVec, FloatingTypes
+# Include Bool in IntegerTypes
+const IntegerTypes = Union{SIMD.IntegerTypes, Bool}
+
+const d = Dict{DataType, String}(
+    Bool    => "i8",
+    Int8    => "i8",
+    Int16   => "i16",
+    Int32   => "i32",
+    Int64   => "i64",
+    Int128  => "i128",
+
+    UInt8   => "i8",
+    UInt16  => "i16",
+    UInt32  => "i32",
+    UInt64  => "i64",
+    UInt128 => "i128",
+
+    #Float16 => "half",
+    Float32 => "float",
+    Float64 => "double",
+)
+# Add the Ptr translations
+foreach(x -> (d[Ptr{x}] = d[Int]), collect(keys(d)))
+
+# LT = LLVM Type (scalar and vectors), we keep type names intentionally short
+# to make the signatures smaller
+const LT{T} = Union{LVec{<:Any, T}, T}
+
+suffix(N::Integer, ::Type{Ptr{T}}) where {T} = "v$(N)p0$(T<:IntegerTypes ? "i" : "f")$(8*sizeof(T))"
+suffix(N::Integer, ::Type{T}) where {T} = "v$(N)$(T<:IntegerTypes ? "i" : "f")$(8*sizeof(T))"
+suffix(::Type{T}) where {T} = "$(T<:IntegerTypes ? "i" : "f")$(8*sizeof(T))"
+
+dotit(f) = replace(string(f), "_" => ".")
+llvm_name(llvmf, N, T) = string("llvm", ".", dotit(llvmf), ".", suffix(N, T))
+llvm_name(llvmf, ::Type{LVec{N, T}}) where {N,T} = string("llvm", ".", dotit(llvmf), ".", suffix(N, T))
+llvm_name(llvmf, ::Type{T}) where {T} = string("llvm", ".", dotit(llvmf), ".", suffix(T))
+
+llvm_type(::Type{T}) where {T} = d[T]
+llvm_type(::Type{LVec{N, T}}) where {N,T} = "< $N x $(d[T])>"
+
+############
+# FastMath #
+############
+
+module FastMath
+    const nnan     = 1 << 0
+    const ninf     = 1 << 1
+    const nsz      = 1 << 2
+    const arcp     = 1 << 3
+    const contract = 1 << 4
+    const afn      = 1 << 5
+    const reassoc  = 1 << 6
+    const fast     = 1 << 7
+end
+
+struct FastMathFlags{T} end
+Base.@pure FastMathFlags(T::Int) = FastMathFlags{T}()
+
+function fp_str(::Type{FastMathFlags{T}}) where {T}
+    flags = String[]
+    (T & FastMath.nnan     != 0) && push!(flags, "nnan")
+    (T & FastMath.ninf     != 0) && push!(flags, "ninf")
+    (T & FastMath.nsz      != 0) && push!(flags, "nsz")
+    (T & FastMath.arcp     != 0) && push!(flags, "arcp")
+    (T & FastMath.contract != 0) && push!(flags, "contract")
+    (T & FastMath.afn      != 0) && push!(flags, "afn")
+    (T & FastMath.reassoc  != 0) && push!(flags, "reassoc")
+    (T & FastMath.fast     != 0) && push!(flags, "fast")
+    return join(flags, " ")
+end
+fp_str(::Type{Nothing}) = ""
+
+const FPFlags{T} = Union{Nothing, FastMathFlags{T}}
+
+###################
+# Unary operators #
+###################
+
+const UNARY_INTRINSICS_FLOAT = [
+    :sqrt
+    :sin
+    :cos
+    :exp
+    :trunc
+    :exp2
+    :log
+    :log10
+    :log2
+    :fabs
+    :floor
+    :ceil
+    :rint
+    :nearbyint
+    :round
+]
+
+const UNARY_INTRINSICS_INT = [
+    :bitreverse
+    :bswap
+    :ctpop
+    :ctlz
+    :cttz
+    :fshl
+    :fshr
+]
+for (fs, c) in zip([UNARY_INTRINSICS_FLOAT, UNARY_INTRINSICS_INT],
+                   [FloatingTypes, IntegerTypes])
+    for f in fs
+        @eval begin
+            @generated function $(f)(x::T) where T<:LT{<:$c}
+                ff = llvm_name($(QuoteNode(f)), T)
+                return :(
+                    $(Expr(:meta, :inline));
+                    ccall($ff, llvmcall, T, (T,), x)
+                )
+            end
+        end
+    end
+end
+
+# fneg (not an intrinsic, so we cannot use `ccall`)
+@generated function fneg(x::T, ::F=nothing) where {T<:LT{<:FloatingTypes}, F<:FPFlags}
+    fpflags = fp_str(F)
+    s = """
+    %2 = fneg $fpflags $(llvm_type(T)) %0
+    ret $(llvm_type(T)) %2
+    """
+    return :(
+        $(Expr(:meta, :inline));
+        Base.llvmcall($s, T, Tuple{T}, x)
+    )
+end
+
+####################
+# Binary operators #
+####################
+
+const BINARY_OPS_FLOAT = [
+    :fadd
+    :fsub
+    :fmul
+    :fdiv
+    :frem
+]
+
+const BINARY_OPS_INT = [
+    :add
+    :sub
+    :mul
+    :sdiv
+    :udiv
+    :srem
+    :urem
+    :shl
+    :ashr
+    :lshr
+    :and
+    :or
+    :xor
+]
+
+for f in BINARY_OPS_FLOAT
+    @eval @generated function $f(x::T, y::T, ::F=nothing) where {T<:LT{<:FloatingTypes}, F<:FPFlags}
+        fpflags = fp_str(F)
+        ff = $(QuoteNode(f))
+        s = """
+        %3 = $ff $fpflags $(llvm_type(T)) %0, %1
+        ret $(llvm_type(T)) %3
+        """
+        return :(
+            $(Expr(:meta, :inline));
+            Base.llvmcall($s, T, Tuple{T, T}, x, y)
+        )
+    end
+end
+
+for f in BINARY_OPS_INT
+    @eval @generated function $f(x::T, y::T) where T<:LT{<:IntegerTypes}
+        ff = $(QuoteNode(f))
+        s = """
+        %3 = $ff $(llvm_type(T)) %0, %1
+        ret $(llvm_type(T)) %3
+        """
+        return :(
+            $(Expr(:meta, :inline));
+            Base.llvmcall($s, T, Tuple{T, T}, x, y)
+        )
+    end
+end
+
+const BINARY_INTRINSICS_FLOAT = [
+    :minnum
+    :maxnum
+    :minimum
+    :maximum
+    :copysign
+    :pow
+    :floor
+    :ceil
+    :trunc
+    :rint
+    :nearbyint
+    :round
+]
+
+const BINARY_INTRINSICS_INT = [
+    :sadd_sat
+    :uadd_sat
+    :ssub_sat
+    :usub_sat
+]
+
+for (fs, c) in 
zip([BINARY_INTRINSICS_FLOAT, BINARY_INTRINSICS_INT], + [FloatingTypes, IntegerTypes]) + for f in fs + @eval @generated function $(f)(x::T, y::T) where T<:LT{<:$c} + ff = llvm_name($(QuoteNode(f)), T,) + return :( + $(Expr(:meta, :inline)); + ccall($ff, llvmcall, T, (T, T), x, y) + ) + end + end +end + +# pow, powi +for (f, c) in [(:pow, FloatingTypes), (:powi, IntegerTypes)] + @eval @generated function $(f)(x::T, y::T2) where {T <: LT{<:FloatingTypes}, T2 <: $c} + ff = llvm_name($(QuoteNode(f)), T) + return :( + $(Expr(:meta, :inline)); + ccall($ff, llvmcall, T, (T, T2), x, y) + ) + end +end + +# Overflow +const OVERFLOW_INTRINSICS = [ + :sadd_with_overflow + :uadd_with_overflow + :ssub_with_overflow + :usub_with_overflow + :smul_with_overflow + :umul_with_overflow +] + +const SUPPORTS_VEC_OVERFLOW = Base.libllvm_version >= v"9" +for f in OVERFLOW_INTRINSICS + @eval @generated function $f(x::LVec{N, T}, y::LVec{N, T}) where {N, T <: IntegerTypes} + if !SUPPORTS_VEC_OVERFLOW + return :(error("LLVM version 9.0 or greater required (Julia 1.5 or greater)")) + end + ff = llvm_name($(QuoteNode(f)), N, T) + if $(QuoteNode(f)) == :smul_with_overflow && Sys.ARCH == :i686 && T == Int64 + str = "this intrinsic ($ff) is broken on i686" + return :(error($str)) + end + decl = "declare {<$N x $(d[T])>, <$N x i1>} @$ff(<$N x $(d[T])>, <$N x $(d[T])>)" + + # Julia passes Tuple{[U]Int8, Bool} as [2 x i8] so we need to special case that scenario + ret_type = sizeof(T) == 1 ? "[2 x <$N x i8>]" : "{<$N x $(d[T])>, <$N x i8>}" + + s = """ + %res = call {<$N x $(d[T])>, <$N x i1>} @$ff(<$N x $(d[T])> %0, <$N x $(d[T])> %1) + %plus = extractvalue {<$N x $(d[T])>, <$N x i1>} %res, 0 + %overflow = extractvalue {<$N x $(d[T])>, <$N x i1>} %res, 1 + %overflow_ext = zext <$(N) x i1> %overflow to <$(N) x i8> + %new_tuple = insertvalue $ret_type undef, <$N x $(d[T])> %plus, 0 + %new_tuple_2 = insertvalue $ret_type %new_tuple, <$N x i8> %overflow_ext, 1 + ret $ret_type %new_tuple_2 + """ + return :( + $(Expr(:meta, :inline)); + Base.llvmcall(($decl, $s), Tuple{LVec{N, T}, LVec{N, Bool}}, Tuple{LVec{N, T}, LVec{N, T}}, x, y) + ) + end +end + + +# Comparisons +const CMP_FLAGS_FLOAT = [ + :false + :oeq + :ogt + :oge + :olt + :ole + :one + :ord + :ueq + :ugt + :uge + :ult + :ule + :une + :uno + :true +] + +const CMP_FLAGS_INT = [ + :eq + :ne + :sgt + :sge + :slt + :sle + :ugt + :uge + :ult + :ule +] + +for flag in CMP_FLAGS_FLOAT + ftot = Symbol(string("fcmp_", flag)) + @eval @generated function $ftot(x::LVec{N, T}, y::LVec{N, T}, ::F=nothing) where {N, T <: FloatingTypes, F<:FPFlags} + fpflags = fp_str(F) + fflag = $(QuoteNode(flag)) + s = """ + %res = fcmp $(fpflags) $(fflag) <$(N) x $(d[T])> %0, %1 + %resb = zext <$(N) x i1> %res to <$(N) x i8> + ret <$(N) x i8> %resb + """ + return :( + $(Expr(:meta, :inline)); + Base.llvmcall($s, LVec{N, Bool}, Tuple{LVec{N, T}, LVec{N, T}}, x, y) + ) + end +end + +for flag in CMP_FLAGS_INT + ftot = Symbol(string("icmp_", flag)) + @eval @generated function $ftot(x::LVec{N, T}, y::LVec{N, T}) where {N, T <: IntegerTypes} + fflag = $(QuoteNode(flag)) + s = """ + %res = icmp $(fflag) <$(N) x $(d[T])> %0, %1 + %resb = zext <$(N) x i1> %res to <$(N) x i8> + ret <$(N) x i8> %resb + """ + return :( + $(Expr(:meta, :inline)); + Base.llvmcall($s, LVec{N, Bool}, Tuple{LVec{N, T}, LVec{N, T}}, x, y) + ) + end +end + + +##################### +# Ternary operators # +##################### + +@generated function select(cond::LVec{N, Bool}, x::LVec{N, T}, y::LVec{N, T}) where {N, T} + s = """ + %cond 
= trunc <$(N) x i8> %0 to <$(N) x i1>
+    %res = select <$N x i1> %cond, <$N x $(d[T])> %1, <$N x $(d[T])> %2
+    ret <$N x $(d[T])> %res
+    """
+    return :(
+        $(Expr(:meta, :inline));
+        Base.llvmcall($s, LVec{N, T}, Tuple{LVec{N, Bool}, LVec{N, T}, LVec{N, T}}, cond, x, y)
+    )
+end
+
+const MULADD_INTRINSICS = [
+    :fmuladd,
+    :fma,
+]
+
+for f in MULADD_INTRINSICS
+    @eval @generated function $(f)(a::LVec{N, T}, b::LVec{N, T}, c::LVec{N, T}) where {N, T<:FloatingTypes}
+        ff = llvm_name($(QuoteNode(f)), N, T)
+        return :(
+            $(Expr(:meta, :inline));
+            ccall($ff, llvmcall, LVec{N, T}, (LVec{N, T}, LVec{N, T}, LVec{N, T}), a, b, c)
+        )
+    end
+end
+
+
+################
+# Load / store #
+################
+
+# These alignment numbers feel a bit dubious
+n_align(align, N, T) = align ? N * sizeof(T) : sizeof(T)
+temporal_str(temporal) = temporal ? ", !nontemporal !{i32 1}" : ""
+
+@generated function load(x::Type{LVec{N, T}}, ptr::Ptr{T},
+                         ::Val{Al}=Val(false), ::Val{Te}=Val(false)) where {N, T, Al, Te}
+    s = """
+    %ptr = inttoptr $(d[Int]) %0 to <$N x $(d[T])>*
+    %res = load <$N x $(d[T])>, <$N x $(d[T])>* %ptr, align $(n_align(Al, N, T)) $(temporal_str(Te))
+    ret <$N x $(d[T])> %res
+    """
+    return :(
+        $(Expr(:meta, :inline));
+        Base.llvmcall($s, LVec{N, T}, Tuple{Ptr{T}}, ptr)
+    )
+end
+
+@generated function maskedload(ptr::Ptr{T}, mask::LVec{N,Bool},
+                               ::Val{Al}=Val(false), ::Val{Te}=Val(false)) where {N, T, Al, Te}
+    # TODO: Allow setting the passthru
+    decl = "declare <$N x $(d[T])> @llvm.masked.load.$(suffix(N, T))(<$N x $(d[T])>*, i32, <$N x i1>, <$N x $(d[T])>)"
+    s = """
+    %mask = trunc <$(N) x i8> %1 to <$(N) x i1>
+    %ptr = inttoptr $(d[Int]) %0 to <$N x $(d[T])>*
+    %res = call <$N x $(d[T])> @llvm.masked.load.$(suffix(N, T))(<$N x $(d[T])>* %ptr, i32 $(n_align(Al, N, T)), <$N x i1> %mask, <$N x $(d[T])> zeroinitializer)
+    ret <$N x $(d[T])> %res
+    """
+    return :(
+        $(Expr(:meta, :inline));
+        Base.llvmcall(($decl, $s), LVec{N, T}, Tuple{Ptr{T}, LVec{N,Bool}}, ptr, mask)
+    )
+end
+
+@generated function store(x::LVec{N, T}, ptr::Ptr{T},
+                          ::Val{Al}=Val(false), ::Val{Te}=Val(false)) where {N, T, Al, Te}
+    s = """
+    %ptr = inttoptr $(d[Int]) %1 to <$N x $(d[T])>*
+    store <$N x $(d[T])> %0, <$N x $(d[T])>* %ptr, align $(n_align(Al, N, T)) $(temporal_str(Te))
+    ret void
+    """
+    return :(
+        $(Expr(:meta, :inline));
+        Base.llvmcall($s, Cvoid, Tuple{LVec{N, T}, Ptr{T}}, x, ptr)
+    )
+end
+
+@generated function maskedstore(x::LVec{N, T}, ptr::Ptr{T}, mask::LVec{N,Bool},
+                                ::Val{Al}=Val(false), ::Val{Te}=Val(false)) where {N, T, Al, Te}
+    # llvm.masked.store returns void, so it is declared and called as such
+    decl = "declare void @llvm.masked.store.$(suffix(N, T))(<$N x $(d[T])>, <$N x $(d[T])>*, i32, <$N x i1>)"
+    s = """
+    %mask = trunc <$(N) x i8> %2 to <$(N) x i1>
+    %ptr = inttoptr $(d[Int]) %1 to <$N x $(d[T])>*
+    call void @llvm.masked.store.$(suffix(N, T))(<$N x $(d[T])> %0, <$N x $(d[T])>* %ptr, i32 $(n_align(Al, N, T)), <$N x i1> %mask)
+    ret void
+    """
+    return :(
+        $(Expr(:meta, :inline));
+        Base.llvmcall(($decl, $s), Cvoid, Tuple{LVec{N, T}, Ptr{T}, LVec{N,Bool}}, x, ptr, mask)
+    )
+end
+
+
+####################
+# Gather / Scatter #
+####################
+
+@generated function maskedgather(ptrs::LVec{N,Ptr{T}},
+                                 mask::LVec{N,Bool}, ::Val{Al}=Val(false)) where {N, T, Al}
+    # TODO: Allow setting the passthru
+    decl = "declare <$N x $(d[T])> @llvm.masked.gather.$(suffix(N, T))(<$N x $(d[T])*>, i32, <$N x i1>, <$N x $(d[T])>)"
+    s = """
+    %mask = trunc <$(N) x i8> %1 to <$(N) x i1>
+    %ptrs = inttoptr <$N x $(d[Int])> %0 to <$N x $(d[T])*>
+    %res = call <$N x $(d[T])> @llvm.masked.gather.$(suffix(N, T))(<$N x $(d[T])*> %ptrs, i32 $(n_align(Al, N, T)), <$N x i1> %mask, <$N x $(d[T])> zeroinitializer)
+    ret <$N x $(d[T])> %res
+    """
+    return :(
+        $(Expr(:meta, :inline));
+        Base.llvmcall(($decl, $s), LVec{N, T}, Tuple{LVec{N, Ptr{T}}, LVec{N, Bool}}, ptrs, mask)
+    )
+end
+
+@generated function maskedscatter(x::LVec{N, T}, ptrs::LVec{N, Ptr{T}},
+                                  mask::LVec{N,Bool}, ::Val{Al}=Val(false)) where {N, T, Al}
+    # llvm.masked.scatter returns void, so it is declared and called as such
+    decl = "declare void @llvm.masked.scatter.$(suffix(N, T))(<$N x $(d[T])>, <$N x $(d[T])*>, i32, <$N x i1>)"
+    s = """
+    %mask = trunc <$(N) x i8> %2 to <$(N) x i1>
+    %ptrs = inttoptr <$N x $(d[Int])> %1 to <$N x $(d[T])*>
+    call void @llvm.masked.scatter.$(suffix(N, T))(<$N x $(d[T])> %0, <$N x $(d[T])*> %ptrs, i32 $(n_align(Al, N, T)), <$N x i1> %mask)
+    ret void
+    """
+    return :(
+        $(Expr(:meta, :inline));
+        Base.llvmcall(($decl, $s), Cvoid, Tuple{LVec{N, T}, LVec{N, Ptr{T}}, LVec{N, Bool}}, x, ptrs, mask)
+    )
+end
+
+
+######################
+# LVector Operations #
+######################
+
+@generated function extractelement(x::LVec{N, T}, i::I) where {N, T, I <: IntegerTypes}
+    s = """
+    %3 = extractelement <$N x $(d[T])> %0, $(d[I]) %1
+    ret $(d[T]) %3
+    """
+    return :(
+        $(Expr(:meta, :inline));
+        Base.llvmcall($s, T, Tuple{LVec{N, T}, $i}, x, i)
+    )
+end
+
+@generated function insertelement(x::LVec{N, T}, v::T, i::IntegerTypes) where {N, T}
+    s = """
+    %4 = insertelement <$N x $(d[T])> %0, $(d[T]) %1, $(d[i]) %2
+    ret <$N x $(d[T])> %4
+    """
+    return :(
+        $(Expr(:meta, :inline));
+        Base.llvmcall($s, LVec{N, T}, Tuple{LVec{N, T}, T, typeof(i)}, x, v, i)
+    )
+end
+
+_shuffle_vec(I) = join((string("i32 ", i == :undef ? 
"undef" : Int32(i::Integer)) for i in I), ", ") +@generated function shufflevector(x::LVec{N, T}, y::LVec{N, T}, ::Val{I}) where {N, T, I} + shfl = _shuffle_vec(I) + M = length(I) + s = """ + %res = shufflevector <$N x $(d[T])> %0, <$N x $(d[T])> %1, <$M x i32> <$shfl> + ret <$M x $(d[T])> %res + """ + return :( + $(Expr(:meta, :inline)); + Base.llvmcall($s, LVec{$M, T}, Tuple{LVec{N, T}, LVec{N, T}}, x, y) + ) +end + +@generated function shufflevector(x::LVec{N, T}, ::Val{I}) where {N, T, I} + shfl = _shuffle_vec(I) + M = length(I) + s = """ + %res = shufflevector <$(N) x $(d[T])> %0, <$N x $(d[T])> undef, <$M x i32> <$shfl> + ret <$M x $(d[T])> %res + """ + return :( + $(Expr(:meta, :inline)); + Base.llvmcall($s, LVec{$M, T}, Tuple{LVec{N, T}}, x) + ) +end + +@generated function constantvector(v::T, y::Type{LVec{N, T}}) where {N, T} + s = """ + %2 = insertelement <$N x $(d[T])> undef, $(d[T]) %0, i32 0 + %res = shufflevector <$N x $(d[T])> %2, <$N x $(d[T])> undef, <$N x i32> zeroinitializer + ret <$N x $(d[T])> %res + """ + return :( + $(Expr(:meta, :inline)); + Base.llvmcall($s, LVec{N, T}, Tuple{T}, v) + ) +end + +######################### +# Conversion Operations # +######################### + +const CAST_SIZE_CHANGE_FLOAT = [ + (:fptrunc, >) + (:fpext, <) +] + +const CAST_SIZE_CHANGE_INT = [ + (:trunc, >) + (:zext, <) + (:sext, <) +] + +for (fs, c) in zip([CAST_SIZE_CHANGE_FLOAT, CAST_SIZE_CHANGE_INT], + [FloatingTypes, IntegerTypes]) + for (f, criteria) in fs + @eval @generated function $f(::Type{LVec{N, T2}}, x::LVec{N, T1}) where {N, T1 <: $c, T2 <: $c} + sT1, sT2 = sizeof(T1) * 8, sizeof(T2) * 8 + # Not changing size is not allowed + if !$criteria(sT1, sT2) + return :(error("size of conversion type ($T2: $sT2) must be $($criteria) than the element type ($T1: $sT1)")) + end + ff = $(QuoteNode(f)) + s = """ + %2 = $ff <$(N) x $(d[T1])> %0 to <$(N) x $(d[T2])> + ret <$(N) x $(d[T2])> %2 + """ + return :( + $(Expr(:meta, :inline)); + Base.llvmcall($s, LVec{N, T2}, Tuple{LVec{N, T1}}, x) + ) + end + end +end + +const CONVERSION_FLOAT_TO_INT = [ + :fptoui, + :fptosi +] + +const CONVERSION_INT_TO_FLOAT = [ + :uitofp, + :sitofp +] + +for (fs, (from, to)) in zip([CONVERSION_FLOAT_TO_INT, CONVERSION_INT_TO_FLOAT], + [(FloatingTypes, IntegerTypes), (IntegerTypes, FloatingTypes)]) + for f in fs + @eval @generated function $f(::Type{LVec{N, T2}}, x::LVec{N, T1}) where {N, T1 <: $from, T2 <: $to} + ff = $(QuoteNode(f)) + s = """ + %2 = $ff <$(N) x $(d[T1])> %0 to <$(N) x $(d[T2])> + ret <$(N) x $(d[T2])> %2 + """ + return :( + $(Expr(:meta, :inline)); + Base.llvmcall($s, LVec{N, T2}, Tuple{LVec{N, T1}}, x) + ) + end + end +end + + +########### +# Bitcast # +########### + +@generated function bitcast(::Type{T1}, x::T2) where {T1<:LT, T2<:LT} + sT1, sT2 = sizeof(T1), sizeof(T2) + if sT1 != sT2 + return :(error("size of conversion type ($T1: $sT1) must be equal to the vector type ($T2: $sT2)")) + end + s = """ + %2 = bitcast $(llvm_type(T2)) %0 to $(llvm_type(T1)) + ret $(llvm_type(T1)) %2 + """ + return :( + $(Expr(:meta, :inline)); + Base.llvmcall($s, T1, Tuple{T2}, x) + ) +end + +################################## +# Horizontal reductions (LLVM 9) # +################################## + +const HORZ_REDUCTION_OPS_FLOAT = [ + :fmax + :fmin +] + +const HORZ_REDUCTION_OPS_INT = [ + :and + :or + :mul + :add + :smax + :umax + :smin + :umin +] + +for (fs, c) in zip([HORZ_REDUCTION_OPS_FLOAT, HORZ_REDUCTION_OPS_INT], + [FloatingTypes, IntegerTypes]) + for f in fs + f_red = Symbol("reduce_", f) + 
@eval @generated function $f_red(x::LVec{N, T}) where {N,T<:$c} + ff = llvm_name(string("experimental.vector.reduce.", $(QuoteNode(f))), N, T) + decl = "declare $(d[T]) @$ff(<$N x $(d[T])>)" + s2 = """ + %res = call $(d[T]) @$ff(<$N x $(d[T])> %0) + ret $(d[T]) %res + """ + return quote + $(Expr(:meta, :inline)); + Base.llvmcall($(decl, s2), T, Tuple{LVec{N, T},}, x) + end + end + end +end + +# The fadd and fmul reductions take an initializer +horz_reduction_version = Base.libllvm_version < v"9" ? "" : "v2." +for (f, neutral) in [(:fadd, "0.0"), (:fmul, "1.0")] + f_red = Symbol("reduce_", f) + @eval @generated function $f_red(x::LVec{N, T}) where {N,T<:FloatingTypes} + ff = llvm_name(string("experimental.vector.reduce.$horz_reduction_version", $(QuoteNode(f))), N, T) + decl = "declare $(d[T]) @$ff($(d[T]), <$N x $(d[T])>)" + s2 = """ + %res = call $(d[T]) @$ff($(d[T]) $($neutral), <$N x $(d[T])> %0) + ret $(d[T]) %res + """ + return quote + $(Expr(:meta, :inline)); + Base.llvmcall($(decl, s2), T, Tuple{LVec{N, T},}, x) + end + end +end + +end diff --git a/src/SIMD.jl b/src/SIMD.jl index 9093f1f..e8bae1f 100644 --- a/src/SIMD.jl +++ b/src/SIMD.jl @@ -1,1972 +1,24 @@ module SIMD -# A note on Val{} vs. Val(): -# -# For historic reasoons, SIMD's API accepted compile-time constants as -# Val{N} instead of Val(N). The difference is that Val{N} is a type -# (Type{Val{N}}), whereas Val(N) is a value (of type Val{N}). This is -# against the intent of how Val is designed, and is also much slower -# at run time unless functions are either @inline'd or @generated. -# -# The API has now been cleaned up. To preserve backward compatibility, -# passing Val{N} instead of Val(N) is still supported. It might go -# away at the next major release. - - - -#= - -# Various boolean types - -# Idea (from ): Use Mask{N,T} instead of booleans -# with different sizes - -abstract Boolean <: Integer - -for sz in (8, 16, 32, 64, 128) - Intsz = Symbol(:Int, sz) - UIntsz = Symbol(:UInt, sz) - Boolsz = Symbol(:Bool, sz) - @eval begin - immutable $Boolsz <: Boolean - int::$UIntsz - $Boolsz(b::Bool) = - new(ifelse(b, typemax($UIntsz), typemin($UIntsz))) - end - booltype(::Val($sz)) = $Boolsz - inttype(::Val($sz)) = $Intsz - uinttype(::Val($sz)) = $UIntsz - - Base.convert(::Type{Bool}, b::$Boolsz) = b.int != 0 - - Base.:~(b::$Boolsz) = $Boolsz(~b.int) - Base.:!(b::$Boolsz) = ~b - Base.:&(b1::$Boolsz, b2::$Boolsz) = $Boolsz(b1.int & b2.int) - Base.:|(b1::$Boolsz, b2::$Boolsz) = $Boolsz(b1.int | b2.int) - Base.$(:$)(b1::$Boolsz, b2::$Boolsz) = $Boolsz(b1.int $ b2.int) - - Base.:==(b1::$Boolsz, b2::$Boolsz) = $Boolsz(b1.int == b2.int) - Base.:!=(b1::$Boolsz, b2::$Boolsz) = $Boolsz(b1.int != b2.int) - Base.:<(b1::$Boolsz, b2::$Boolsz) = $Boolsz(b1.int < b2.int) - Base.:<=(b1::$Boolsz, b2::$Boolsz) = $Boolsz(b1.int <= b2.int) - Base.:>(b1::$Boolsz, b2::$Boolsz) = $Boolsz(b1.int > b2.int) - Base.:>=(b1::$Boolsz, b2::$Boolsz) = $Boolsz(b1.int >= b2.int) - end -end -Base.convert(::Type{Bool}, b::Boolean) = error("impossible") -Base.convert{I<:Integer}(::Type{I}, b::Boolean) = I(Bool(b)) -Base.convert{B<:Boolean}(::Type{B}, b::Boolean) = B(Bool(b)) -Base.convert{B<:Boolean}(::Type{B}, i::Integer) = B(i!=0) - -booltype{T}(::Type{T}) = booltype(Val(8*sizeof(T))) -inttype{T}(::Type{T}) = inttype(Val(8*sizeof(T))) -uinttype{T}(::Type{T}) = uinttype(Val(8*sizeof(T))) - -=# - -# Array types for SIMD - -using Base: Slice, ScalarIndex - -""" - ContiguousSubArray{T,N,P,I,L} - -Like `Base.FastContiguousSubArray` but without requirement for 
linear -indexing (i.e., type parameter `L` can be `false`). - -# Examples -``` -julia> A = view(ones(5, 5), :, [1,3]); - -julia> A isa Base.FastContiguousSubArray -false - -julia> A isa SIMD.ContiguousSubArray -true -``` -""" -ContiguousSubArray{T,N,P, - I<:Union{Tuple{Union{Slice, AbstractUnitRange}, Vararg{Any}}, - Tuple{Vararg{ScalarIndex}}}, - L} = SubArray{T,N,P,I,L} - -""" - ContiguousArray{T,N} - -Array types with contiguous first dimension. -""" -ContiguousArray{T,N} = Union{DenseArray{T,N}, ContiguousSubArray{T,N}} - -""" - FastContiguousArray{T,N} - -This is the type of arrays that `pointer(A, i)` works. -""" -FastContiguousArray{T,N} = Union{DenseArray{T,N}, Base.FastContiguousSubArray{T,N}} -# https://github.com/eschnett/SIMD.jl/pull/40#discussion_r254131184 -# https://github.com/JuliaArrays/MappedArrays.jl/pull/24#issuecomment-460568978 - -# The Julia SIMD vector type - -const BoolTypes = Union{Bool} -const IntTypes = Union{Int8, Int16, Int32, Int64, Int128} -const UIntTypes = Union{UInt8, UInt16, UInt32, UInt64, UInt128} -const IntegerTypes = Union{BoolTypes, IntTypes, UIntTypes} -const IndexTypes = Union{IntegerTypes, Ptr} -const FloatingTypes = Union{Float16, Float32, Float64} -const ScalarTypes = Union{IndexTypes, FloatingTypes} - -const VE = Base.VecElement - -export Vec -struct Vec{N,T<:ScalarTypes} # <: Number - elts::NTuple{N,VE{T}} - @inline Vec{N,T}(elts::NTuple{N, VE{T}}) where {N,T} = new{N,T}(elts) -end - -function Base.show(io::IO, v::Vec{N,T}) where {N,T} - print(io, "<$N x $T>[") - join(io, [x.value for x in v.elts], ", ") - print(io, "]") -end - -# Type properties -Base.eltype(::Type{Vec{N,T}}) where {N,T} = T -Base.ndims( ::Type{Vec{N,T}}) where {N,T} = 1 -Base.length(::Type{Vec{N,T}}) where {N,T} = N -Base.size( ::Type{Vec{N,T}}) where {N,T} = (N,) -# TODO: This doesn't follow Base, e.g. 
`size([], 3) == 1` -Base.size(::Type{Vec{N,T}}, n::Integer) where {N,T} = (N,)[n] - -Base.eltype(V::Vec) = eltype(typeof(V)) -Base.ndims( V::Vec) = ndims(typeof(V)) -Base.length(V::Vec) = length(typeof(V)) -Base.size( V::Vec) = size(typeof(V)) -Base.size( V::Vec, n::Integer) = size(typeof(V), n) - -# Type conversion - -# Create vectors from scalars or tuples -@generated function (::Type{Vec{N,T}})(x::S) where {N,T,S<:ScalarTypes} - quote - $(Expr(:meta, :inline)) - Vec{N,T}(tuple($([:(VE{T}(T(x))) for i in 1:N]...))) - end -end -Vec{N,T}(xs::Tuple{}) where {N,T<:ScalarTypes} = error("illegal argument") -@generated function (::Type{Vec{N,T}})(xs::NTuple{N,S}) where {N,T,S<:ScalarTypes} - quote - $(Expr(:meta, :inline)) - Vec{N,T}(tuple($([:(VE{T}(T(xs[$i]))) for i in 1:N]...))) - end -end -Vec(xs::NTuple{N,T}) where {N,T<:ScalarTypes} = Vec{N,T}(xs) - -# Convert between vectors -@inline Base.convert(::Type{Vec{N,T}}, v::Vec{N,T}) where {N,T} = v - -@inline Base.convert(::Type{Vec{N,R}}, v::Vec{N}) where {N,R} = - Vec{N,R}(NTuple{N, R}(v)) - -@inline Tuple(v::Vec{N}) where {N} = ntuple(i -> v.elts[i].value, Val(N)) -@inline NTuple{N, T}(v::Vec{N}) where{N, T} = ntuple(i -> convert(T, v.elts[i].value), Val(N)) - -@generated function Base.:%(v::Vec{N,T}, ::Type{Vec{N,R}}) where {N,R,T} - quote - $(Expr(:meta, :inline)) - Vec{N,R}(tuple($([:(v.elts[$i].value % R) for i in 1:N]...))) - end -end - -# Convert vectors to tuples -@generated function Base.convert(::Type{NTuple{N,R}}, v::Vec{N,T}) where {N,R,T} - quote - $(Expr(:meta, :inline)) - tuple($([:(R(v.elts[$i].value)) for i in 1:N]...)) - end -end -@inline Base.convert(::Type{Tuple}, v::Vec{N,T}) where {N,T} = - Base.convert(NTuple{N,T}, v) - -# Promotion rules - -# Note: Type promotion only works for subtypes of Number -# Base.promote_rule{N,T<:ScalarTypes}(::Type{Vec{N,T}}, ::Type{T}) = Vec{N,T} - -Base.zero(::Type{Vec{N,T}}) where {N,T} = Vec{N,T}(zero(T)) -Base.one(::Type{Vec{N,T}}) where {N,T} = Vec{N,T}(one(T)) -Base.zero(::Vec{N,T}) where {N,T} = zero(Vec{N,T}) -Base.one(::Vec{N,T}) where {N,T} = one(Vec{N,T}) - -# Floating point formats - -int_type(::Type{Float16}) = Int16 -int_type(::Type{Float32}) = Int32 -int_type(::Type{Float64}) = Int64 -# int_type(::Type{Float128}) = Int128 -# int_type(::Type{Float256}) = Int256 - -uint_type(::Type{Float16}) = UInt16 -uint_type(::Type{Float32}) = UInt32 -uint_type(::Type{Float64}) = UInt64 -# uint_type(::Type{Float128}) = UInt128 -# uint_type(::Type{Float256}) = UInt256 - -significand_bits(::Type{Float16}) = 10 -significand_bits(::Type{Float32}) = 23 -significand_bits(::Type{Float64}) = 52 -# significand_bits(::Type{Float128}) = 112 -# significand_bits(::Type{Float256}) = 136 - -exponent_bits(::Type{T}) where {T<:FloatingTypes} = - 8*sizeof(T) - 1 - significand_bits(T) -sign_bits(::Type{T}) where {T<:FloatingTypes} = 1 - -significand_mask(::Type{T}) where {T<:FloatingTypes} = - uint_type(T)(uint_type(T)(1) << significand_bits(T) - 1) -exponent_mask(::Type{T}) where {T<:FloatingTypes} = - uint_type(T)(uint_type(T)(1) << exponent_bits(T) - 1) << significand_bits(T) -sign_mask(::Type{T}) where {T<:FloatingTypes} = - uint_type(T)(1) << (significand_bits(T) + exponent_bits(T)) - -for T in (Float16, Float32, Float64) - @assert sizeof(int_type(T)) == sizeof(T) - @assert sizeof(uint_type(T)) == sizeof(T) - @assert significand_bits(T) + exponent_bits(T) + sign_bits(T) == 8*sizeof(T) - @assert significand_mask(T) | exponent_mask(T) | sign_mask(T) == - typemax(uint_type(T)) - @assert significand_mask(T) ⊻ 
exponent_mask(T) ⊻ sign_mask(T) == - typemax(uint_type(T)) -end - -# Convert Julia types to LLVM types - -llvmtype(::Type{Bool}) = "i8" # Julia represents Tuple{Bool} as [1 x i8] - -# llvmtype(::Type{Bool8}) = "i8" -# llvmtype(::Type{Bool16}) = "i16" -# llvmtype(::Type{Bool32}) = "i32" -# llvmtype(::Type{Bool64}) = "i64" -# llvmtype(::Type{Bool128}) = "i128" - -llvmtype(::Type{Int8}) = "i8" -llvmtype(::Type{Int16}) = "i16" -llvmtype(::Type{Int32}) = "i32" -llvmtype(::Type{Int64}) = "i64" -llvmtype(::Type{Int128}) = "i128" -llvmtype(::Type{<:Ptr}) = llvmtype(Int) - -llvmtype(::Type{UInt8}) = "i8" -llvmtype(::Type{UInt16}) = "i16" -llvmtype(::Type{UInt32}) = "i32" -llvmtype(::Type{UInt64}) = "i64" -llvmtype(::Type{UInt128}) = "i128" - -llvmtype(::Type{Float16}) = "half" -llvmtype(::Type{Float32}) = "float" -llvmtype(::Type{Float64}) = "double" - -# Type-dependent optimization flags -# fastflags{T<:IntTypes}(::Type{T}) = "nsw" -# fastflags{T<:UIntTypes}(::Type{T}) = "nuw" -# fastflags{T<:FloatingTypes}(::Type{T}) = "fast" - -suffix(N::Integer, ::Type{T}) where {T<:IntegerTypes} = "v$(N)i$(8*sizeof(T))" -suffix(N::Integer, ::Type{T}) where {T<:FloatingTypes} = "v$(N)f$(8*sizeof(T))" - -# Type-dependent LLVM constants -function llvmconst(::Type{T}, val) where T - T(val) === T(0) && return "zeroinitializer" - typ = llvmtype(T) - "$typ $val" -end -function llvmconst(::Type{Bool}, val) - Bool(val) === false && return "zeroinitializer" - typ = "i1" - "$typ $(Int(val))" -end -function llvmconst(N::Integer, ::Type{T}, val) where T - T(val) === T(0) && return "zeroinitializer" - typ = llvmtype(T) - "<" * join(["$typ $val" for i in 1:N], ", ") * ">" -end -function llvmconst(N::Integer, ::Type{Bool}, val) - Bool(val) === false && return "zeroinitializer" - typ = "i1" - "<" * join(["$typ $(Int(val))" for i in 1:N], ", ") * ">" -end -function llvmtypedconst(::Type{T}, val) where T - typ = llvmtype(T) - T(val) === T(0) && return "$typ zeroinitializer" - "$typ $val" -end -function llvmtypedconst(::Type{Bool}, val) - typ = "i1" - Bool(val) === false && return "$typ zeroinitializer" - "$typ $(Int(val))" -end - -# Type-dependent LLVM intrinsics -llvmins(::Val{:+}, N, ::Type{T}) where {T <: IndexTypes} = "add" -llvmins(::Val{:-}, N, ::Type{T}) where {T <: IndexTypes} = "sub" -llvmins(::Val{:*}, N, ::Type{T}) where {T <: IntegerTypes} = "mul" -llvmins(::Val{:div}, N, ::Type{T}) where {T <: IntTypes} = "sdiv" -llvmins(::Val{:rem}, N, ::Type{T}) where {T <: IntTypes} = "srem" -llvmins(::Val{:div}, N, ::Type{T}) where {T <: UIntTypes} = "udiv" -llvmins(::Val{:rem}, N, ::Type{T}) where {T <: UIntTypes} = "urem" - -llvmins(::Val{:~}, N, ::Type{T}) where {T <: IntegerTypes} = "xor" -llvmins(::Val{:&}, N, ::Type{T}) where {T <: IntegerTypes} = "and" -llvmins(::Val{:|}, N, ::Type{T}) where {T <: IntegerTypes} = "or" -llvmins(::Val{:⊻}, N, ::Type{T}) where {T <: IntegerTypes} = "xor" - -llvmins(::Val{:<<}, N, ::Type{T}) where {T <: IntegerTypes} = "shl" -llvmins(::Val{:>>>}, N, ::Type{T}) where {T <: IntegerTypes} = "lshr" -llvmins(::Val{:>>}, N, ::Type{T}) where {T <: UIntTypes} = "lshr" -llvmins(::Val{:>>}, N, ::Type{T}) where {T <: IntTypes} = "ashr" - -llvmins(::Val{:(==)}, N, ::Type{T}) where {T <: IntegerTypes} = "icmp eq" -llvmins(::Val{:(!=)}, N, ::Type{T}) where {T <: IntegerTypes} = "icmp ne" -llvmins(::Val{:(>)}, N, ::Type{T}) where {T <: IntTypes} = "icmp sgt" -llvmins(::Val{:(>=)}, N, ::Type{T}) where {T <: IntTypes} = "icmp sge" -llvmins(::Val{:(<)}, N, ::Type{T}) where {T <: IntTypes} = "icmp slt" 
-llvmins(::Val{:(<=)}, N, ::Type{T}) where {T <: IntTypes} = "icmp sle" -llvmins(::Val{:(>)}, N, ::Type{T}) where {T <: UIntTypes} = "icmp ugt" -llvmins(::Val{:(>=)}, N, ::Type{T}) where {T <: UIntTypes} = "icmp uge" -llvmins(::Val{:(<)}, N, ::Type{T}) where {T <: UIntTypes} = "icmp ult" -llvmins(::Val{:(<=)}, N, ::Type{T}) where {T <: UIntTypes} = "icmp ule" - -llvmins(::Val{:vifelse}, N, ::Type{T}) where {T} = "select" - -llvmins(::Val{:+}, N, ::Type{T}) where {T <: FloatingTypes} = "fadd" -llvmins(::Val{:-}, N, ::Type{T}) where {T <: FloatingTypes} = "fsub" -llvmins(::Val{:*}, N, ::Type{T}) where {T <: FloatingTypes} = "fmul" -llvmins(::Val{:/}, N, ::Type{T}) where {T <: FloatingTypes} = "fdiv" -llvmins(::Val{:inv}, N, ::Type{T}) where {T <: FloatingTypes} = "fdiv" -llvmins(::Val{:rem}, N, ::Type{T}) where {T <: FloatingTypes} = "frem" - -llvmins(::Val{:(==)}, N, ::Type{T}) where {T <: FloatingTypes} = "fcmp oeq" -llvmins(::Val{:(!=)}, N, ::Type{T}) where {T <: FloatingTypes} = "fcmp une" -llvmins(::Val{:(>)}, N, ::Type{T}) where {T <: FloatingTypes} = "fcmp ogt" -llvmins(::Val{:(>=)}, N, ::Type{T}) where {T <: FloatingTypes} = "fcmp oge" -llvmins(::Val{:(<)}, N, ::Type{T}) where {T <: FloatingTypes} = "fcmp olt" -llvmins(::Val{:(<=)}, N, ::Type{T}) where {T <: FloatingTypes} = "fcmp ole" - -llvmins(::Val{:^}, N, ::Type{T}) where {T <: FloatingTypes} = - "@llvm.pow.$(suffix(N,T))" -llvmins(::Val{:abs}, N, ::Type{T}) where {T<:FloatingTypes} = - "@llvm.fabs.$(suffix(N,T))" -llvmins(::Val{:ceil}, N, ::Type{T}) where {T<:FloatingTypes} = - "@llvm.ceil.$(suffix(N,T))" -llvmins(::Val{:copysign}, N, ::Type{T}) where {T<:FloatingTypes} = - "@llvm.copysign.$(suffix(N,T))" -llvmins(::Val{:cos}, N, ::Type{T}) where {T<:FloatingTypes} = - "@llvm.cos.$(suffix(N,T))" -llvmins(::Val{:exp}, N, ::Type{T}) where {T<:FloatingTypes} = - "@llvm.exp.$(suffix(N,T))" -llvmins(::Val{:exp2}, N, ::Type{T}) where {T<:FloatingTypes} = - "@llvm.exp2.$(suffix(N,T))" -llvmins(::Val{:floor}, N, ::Type{T}) where {T<:FloatingTypes} = - "@llvm.floor.$(suffix(N,T))" -llvmins(::Val{:fma}, N, ::Type{T}) where {T<:FloatingTypes} = - "@llvm.fma.$(suffix(N,T))" -llvmins(::Val{:log}, N, ::Type{T}) where {T<:FloatingTypes} = - "@llvm.log.$(suffix(N,T))" -llvmins(::Val{:log10}, N, ::Type{T}) where {T<:FloatingTypes} = - "@llvm.log10.$(suffix(N,T))" -llvmins(::Val{:log2}, N, ::Type{T}) where {T<:FloatingTypes} = - "@llvm.log2.$(suffix(N,T))" -llvmins(::Val{:max}, N, ::Type{T}) where {T<:FloatingTypes} = - "@llvm.maxnum.$(suffix(N,T))" -llvmins(::Val{:min}, N, ::Type{T}) where {T<:FloatingTypes} = - "@llvm.minnum.$(suffix(N,T))" -# llvmins(::Val{:max}, N, ::Type{T}) where {T<:FloatingTypes} = -# "@llvm.maximum.$(suffix(N,T))" -# llvmins(::Val{:min}, N, ::Type{T}) where {T<:FloatingTypes} = -# "@llvm.minimum.$(suffix(N,T))" -llvmins(::Val{:muladd}, N, ::Type{T}) where {T<:FloatingTypes} = - "@llvm.fmuladd.$(suffix(N,T))" -llvmins(::Val{:powi}, N, ::Type{T}) where {T<:FloatingTypes} = - "@llvm.powi.$(suffix(N,T))" -llvmins(::Val{:round}, N, ::Type{T}) where {T<:FloatingTypes} = - "@llvm.rint.$(suffix(N,T))" -llvmins(::Val{:sin}, N, ::Type{T}) where {T<:FloatingTypes} = - "@llvm.sin.$(suffix(N,T))" -llvmins(::Val{:sqrt}, N, ::Type{T}) where {T<:FloatingTypes} = - "@llvm.sqrt.$(suffix(N,T))" -llvmins(::Val{:trunc}, N, ::Type{T}) where {T<:FloatingTypes} = - "@llvm.trunc.$(suffix(N,T))" - -# Convert between LLVM scalars, vectors, and arrays - -function scalar2vector(vec, siz, typ, sca) - instrs = [] - accum(nam, i) = i<0 ? 
"undef" : i==siz-1 ? nam : "$(nam)_iter$i" - for i in 0:siz-1 - push!(instrs, - "$(accum(vec,i)) = " * - "insertelement <$siz x $typ> $(accum(vec,i-1)), " * - "$typ $sca, i32 $i") - end - instrs -end - -function array2vector(vec, siz, typ, arr, tmp="$(arr)_av") - instrs = [] - accum(nam, i) = i<0 ? "undef" : i==siz-1 ? nam : "$(nam)_iter$i" - for i in 0:siz-1 - push!(instrs, "$(tmp)_elem$i = extractvalue [$siz x $typ] $arr, $i") - push!(instrs, - "$(accum(vec,i)) = " * - "insertelement <$siz x $typ> $(accum(vec,i-1)), " * - "$typ $(tmp)_elem$i, i32 $i") - end - instrs -end - -function vector2array(arr, siz, typ, vec, tmp="$(vec)_va") - instrs = [] - accum(nam, i) = i<0 ? "undef" : i==siz-1 ? nam : "$(nam)_iter$i" - for i in 0:siz-1 - push!(instrs, - "$(tmp)_elem$i = extractelement <$siz x $typ> $vec, i32 $i") - push!(instrs, - "$(accum(arr,i)) = "* - "insertvalue [$siz x $typ] $(accum(arr,i-1)), " * - "$typ $(tmp)_elem$i, $i") - end - instrs -end - -# TODO: change argument order -function subvector(vec, siz, typ, rvec, rsiz, roff, tmp="$(rvec)_sv") - instrs = [] - accum(nam, i) = i<0 ? "undef" : i==rsiz-1 ? nam : "$(nam)_iter$i" - @assert 0 <= roff - @assert roff + rsiz <= siz - for i in 0:rsiz-1 - push!(instrs, - "$(tmp)_elem$i = extractelement <$siz x $typ> $vec, i32 $(roff+i)") - push!(instrs, - "$(accum(rvec,i)) = " * - "insertelement <$rsiz x $typ> $(accum(rvec,i-1)), " * - "$typ $(tmp)_elem$i, i32 $i") - end - instrs -end - -function extendvector(vec, siz, typ, voff, vsiz, val, rvec, tmp="$(rvec)_ev") - instrs = [] - accum(nam, i) = i<0 ? "undef" : i==siz+vsiz-1 ? nam : "$(nam)_iter$i" - rsiz = siz + vsiz - for i in 0:siz-1 - push!(instrs, - "$(tmp)_elem$i = extractelement <$siz x $typ> $vec, i32 $i") - push!(instrs, - "$(accum(rvec,i)) = " * - "insertelement <$rsiz x $typ> $(accum(rvec,i-1)), " * - "$typ $(tmp)_elem$i, i32 $i") - end - for i in siz:siz+vsiz-1 - push!(instrs, - "$(accum(rvec,i)) = " * - "insertelement <$rsiz x $typ> $(accum(rvec,i-1)), $val, i32 $i") - end - instrs -end - -# Element-wise access - -export setindex -@generated function setindex(v::Vec{N,T}, x::Number, ::Val{I}) where {N,T,I} - @assert isa(I, Integer) - 1 <= I <= N || throw(BoundsError()) - typ = llvmtype(T) - ityp = llvmtype(Int) - vtyp = "<$N x $typ>" - decls = [] - instrs = [] - push!(instrs, "%res = insertelement $vtyp %0, $typ %1, $ityp $(I-1)") - push!(instrs, "ret $vtyp %res") - quote - $(Expr(:meta, :inline)) - Vec{N,T}(Base.llvmcall($((join(decls, "\n"), join(instrs, "\n"))), - NTuple{N,VE{T}}, Tuple{NTuple{N,VE{T}}, T}, v.elts, T(x))) - end -end -@inline function setindex(v::Vec{N,T}, x::Number, ::Type{Val{I}}) where {N,T,I} - setindex(v, x, Val(I)) -end - -@generated function setindex(v::Vec{N,T}, x::Number, i::Int) where {N,T} - typ = llvmtype(T) - ityp = llvmtype(Int) - vtyp = "<$N x $typ>" - decls = [] - instrs = [] - push!(instrs, "%res = insertelement $vtyp %0, $typ %2, $ityp %1") - push!(instrs, "ret $vtyp %res") - quote - $(Expr(:meta, :inline)) - @boundscheck 1 <= i <= N || throw(BoundsError()) - Vec{N,T}(Base.llvmcall($((join(decls, "\n"), join(instrs, "\n"))), - NTuple{N,VE{T}}, Tuple{NTuple{N,VE{T}}, Int, T}, - v.elts, i-1, T(x))) - end -end -setindex(v::Vec{N,T}, x::Number, i) where {N,T} = setindex(v, Int(i), x) - -Base.@propagate_inbounds Base.getindex(v::Vec{N,T}, ::Val{I}) where {N,T,I} = v.elts[I].value -Base.@propagate_inbounds Base.getindex(v::Vec{N,T}, ::Type{Val{I}}) where {N,T,I} = Base.getindex(v, Val(I)) -Base.@propagate_inbounds Base.getindex(v::Vec{N,T}, i) where 
{N,T} = v.elts[i].value - -# Type conversion - -@generated function Base.reinterpret(::Type{Vec{N,R}}, - v1::Vec{N1,T1}) where {N,R,N1,T1} - @assert N*sizeof(R) == N1*sizeof(T1) - typ1 = llvmtype(T1) - vtyp1 = "<$N1 x $typ1>" - typr = llvmtype(R) - vtypr = "<$N x $typr>" - decls = [] - instrs = [] - push!(instrs, "%res = bitcast $vtyp1 %0 to $vtypr") - push!(instrs, "ret $vtypr %res") - quote - $(Expr(:meta, :inline)) - Vec{N,R}(Base.llvmcall($((join(decls, "\n"), join(instrs, "\n"))), - NTuple{N,VE{R}}, Tuple{NTuple{N1,VE{T1}}}, v1.elts)) - end -end - -# Generic function wrappers - -# Functions taking one argument -@generated function llvmwrap(::Val{Op}, v1::Vec{N,T1}, - ::Type{R} = T1) where {Op,N,T1,R} - @assert isa(Op, Symbol) - typ1 = llvmtype(T1) - vtyp1 = "<$N x $typ1>" - typr = llvmtype(R) - vtypr = "<$N x $typr>" - ins = llvmins(Val(Op), N, T1) - decls = [] - instrs = [] - if ins[1] == '@' - push!(decls, "declare $vtypr $ins($vtyp1)") - push!(instrs, "%res = call $vtypr $ins($vtyp1 %0)") - else - if Op === :~ - @assert T1 <: IntegerTypes - otherval = -1 - elseif Op === :inv - @assert T1 <: FloatingTypes - otherval = 1.0 - else - otherval = 0 - end - otherarg = llvmconst(N, T1, otherval) - push!(instrs, "%res = $ins $vtyp1 $otherarg, %0") - end - push!(instrs, "ret $vtypr %res") - quote - $(Expr(:meta, :inline)) - Vec{N,R}(Base.llvmcall($((join(decls, "\n"), join(instrs, "\n"))), - NTuple{N,VE{R}}, Tuple{NTuple{N,VE{T1}}}, v1.elts)) - end -end - -# Functions taking one Bool argument -@generated function llvmwrap(::Val{Op}, v1::Vec{N,Bool}, - ::Type{Bool} = Bool) where {Op,N} - @assert isa(Op, Symbol) - btyp = llvmtype(Bool) - vbtyp = "<$N x $btyp>" - ins = llvmins(Val(Op), N, Bool) - decls = [] - instrs = [] - push!(instrs, "%arg1 = trunc $vbtyp %0 to <$N x i1>") - otherarg = llvmconst(N, Bool, true) - push!(instrs, "%res = $ins <$N x i1> $otherarg, %arg1") - push!(instrs, "%resb = zext <$N x i1> %res to $vbtyp") - push!(instrs, "ret $vbtyp %resb") - quote - $(Expr(:meta, :inline)) - Vec{N,Bool}(Base.llvmcall($((join(decls, "\n"), join(instrs, "\n"))), - NTuple{N,VE{Bool}}, Tuple{NTuple{N,VE{Bool}}}, v1.elts)) - end -end - -# Functions taking two arguments -@generated function llvmwrap(::Val{Op}, v1::Vec{N,T1}, v2::Vec{N,T2}, - ::Type{R} = T1) where {Op,N,T1,T2,R} - @assert isa(Op, Symbol) - typ1 = llvmtype(T1) - vtyp1 = "<$N x $typ1>" - typ2 = llvmtype(T2) - vtyp2 = "<$N x $typ2>" - typr = llvmtype(R) - vtypr = "<$N x $typr>" - ins = llvmins(Val(Op), N, T1) - decls = [] - instrs = [] - if ins[1] == '@' - push!(decls, "declare $vtypr $ins($vtyp1, $vtyp2)") - push!(instrs, "%res = call $vtypr $ins($vtyp1 %0, $vtyp2 %1)") - else - push!(instrs, "%res = $ins $vtyp1 %0, %1") - end - push!(instrs, "ret $vtypr %res") - quote - $(Expr(:meta, :inline)) - Vec{N,R}(Base.llvmcall($((join(decls, "\n"), join(instrs, "\n"))), - NTuple{N,VE{R}}, Tuple{NTuple{N,VE{T1}}, NTuple{N,VE{T2}}}, - v1.elts, v2.elts)) - end -end - -# Functions taking two arguments, second argument is a scalar -@generated function llvmwrap(::Val{Op}, v1::Vec{N,T1}, s2::ScalarTypes, - ::Type{R} = T1) where {Op,N,T1,R} - @assert isa(Op, Symbol) - typ1 = llvmtype(T1) - vtyp1 = "<$N x $typ1>" - typ2 = llvmtype(s2) - typr = llvmtype(R) - vtypr = "<$N x $typr>" - ins = llvmins(Val(Op), N, T1) - decls = [] - instrs = [] - if ins[1] == '@' - push!(decls, "declare $vtypr $ins($vtyp1, $typ2)") - push!(instrs, "%res = call $vtypr $ins($vtyp1 %0, $typ2 %1)") - else - push!(instrs, "%res = $ins $vtyp1 %0, %1") - end - push!(instrs, "ret 
$vtypr %res") - quote - $(Expr(:meta, :inline)) - Vec{N,R}(Base.llvmcall($((join(decls, "\n"), join(instrs, "\n"))), - NTuple{N,VE{R}}, Tuple{NTuple{N,VE{T1}}, $s2}, - v1.elts, s2)) - end -end - -# Functions taking two arguments, returning Bool -@generated function llvmwrap(::Val{Op}, v1::Vec{N,T1}, v2::Vec{N,T2}, - ::Type{Bool}) where {Op,N,T1,T2} - @assert isa(Op, Symbol) - btyp = llvmtype(Bool) - vbtyp = "<$N x $btyp>" - abtyp = "[$N x $btyp]" - typ1 = llvmtype(T1) - vtyp1 = "<$N x $typ1>" - atyp1 = "[$N x $typ1]" - typ2 = llvmtype(T2) - vtyp2 = "<$N x $typ2>" - atyp2 = "[$N x $typ2]" - ins = llvmins(Val(Op), N, T1) - decls = [] - instrs = [] - if false && N == 1 - append!(instrs, array2vector("%arg1", N, typ1, "%0", "%arg1arr")) - append!(instrs, array2vector("%arg2", N, typ2, "%1", "%arg2arr")) - push!(instrs, "%cond = $ins $vtyp1 %arg1, %arg2") - push!(instrs, "%res = zext <$N x i1> %cond to $vbtyp") - append!(instrs, vector2array("%resarr", N, btyp, "%res")) - push!(instrs, "ret $abtyp %resarr") - else - push!(instrs, "%res = $ins $vtyp1 %0, %1") - push!(instrs, "%resb = zext <$N x i1> %res to $vbtyp") - push!(instrs, "ret $vbtyp %resb") - end - quote - $(Expr(:meta, :inline)) - Vec{N,Bool}(Base.llvmcall($((join(decls, "\n"), join(instrs, "\n"))), - NTuple{N,VE{Bool}}, Tuple{NTuple{N,VE{T1}}, NTuple{N,VE{T2}}}, - v1.elts, v2.elts)) - end -end - -# Functions taking a vector and a scalar argument -# @generated function llvmwrap{Op,N,T1,T2,R}(::Val{Op}, v1::Vec{N,T1}, -# x2::T2, ::Type{R} = T1) -# @assert isa(Op, Symbol) -# typ1 = llvmtype(T1) -# atyp1 = "[$N x $typ1]" -# vtyp1 = "<$N x $typ1>" -# typ2 = llvmtype(T2) -# typr = llvmtype(R) -# atypr = "[$N x $typr]" -# vtypr = "<$N x $typr>" -# ins = llvmins(Val(Op), N, T1) -# decls = [] -# instrs = [] -# append!(instrs, array2vector("%arg1", N, typ1, "%0", "%arg1arr")) -# if ins[1] == '@' -# push!(decls, "declare $vtypr $ins($vtyp1, $typ2)") -# push!(instrs, "%res = call $vtypr $ins($vtyp1 %arg1, $typ2 %1)") -# else -# push!(instrs, "%res = $ins $vtyp1 %arg1, %1") -# end -# append!(instrs, vector2array("%resarr", N, typr, "%res")) -# push!(instrs, "ret $atypr %resarr") -# quote -# $(Expr(:meta, :inline)) -# Vec{N,R}(Base.llvmcall($((join(decls, "\n"), join(instrs, "\n"))), -# NTuple{N,R}, Tuple{NTuple{N,T1}, T2}, v1.elts, x2)) -# end -# end - -# Functions taking two Bool arguments, returning Bool -@generated function llvmwrap(::Val{Op}, v1::Vec{N,Bool}, v2::Vec{N,Bool}, - ::Type{Bool} = Bool) where {Op,N} - @assert isa(Op, Symbol) - btyp = llvmtype(Bool) - vbtyp = "<$N x $btyp>" - ins = llvmins(Val(Op), N, Bool) - decls = [] - instrs = [] - push!(instrs, "%arg1 = trunc $vbtyp %0 to <$N x i1>") - push!(instrs, "%arg2 = trunc $vbtyp %1 to <$N x i1>") - push!(instrs, "%res = $ins <$N x i1> %arg1, %arg2") - push!(instrs, "%resb = zext <$N x i1> %res to $vbtyp") - push!(instrs, "ret $vbtyp %resb") - quote - $(Expr(:meta, :inline)) - Vec{N,Bool}(Base.llvmcall($((join(decls, "\n"), join(instrs, "\n"))), - NTuple{N,VE{Bool}}, Tuple{NTuple{N,VE{Bool}}, NTuple{N,VE{Bool}}}, - v1.elts, v2.elts)) - end -end - -# Functions taking three arguments -@generated function llvmwrap(::Val{Op}, v1::Vec{N,T1}, v2::Vec{N,T2}, - v3::Vec{N,T3}, ::Type{R} = T1) where {Op,N,T1,T2,T3,R} - @assert isa(Op, Symbol) - typ1 = llvmtype(T1) - vtyp1 = "<$N x $typ1>" - typ2 = llvmtype(T2) - vtyp2 = "<$N x $typ2>" - typ3 = llvmtype(T3) - vtyp3 = "<$N x $typ3>" - typr = llvmtype(R) - vtypr = "<$N x $typr>" - ins = llvmins(Val(Op), N, T1) - decls = [] - instrs = [] - if ins[1] 
== '@' - push!(decls, "declare $vtypr $ins($vtyp1, $vtyp2, $vtyp3)") - push!(instrs, - "%res = call $vtypr $ins($vtyp1 %0, $vtyp2 %1, $vtyp3 %2)") - else - push!(instrs, "%res = $ins $vtyp1 %0, %1, %2") - end - push!(instrs, "ret $vtypr %res") - quote - $(Expr(:meta, :inline)) - Vec{N,R}(Base.llvmcall($((join(decls, "\n"), join(instrs, "\n"))), - NTuple{N,VE{R}}, - Tuple{NTuple{N,VE{T1}}, NTuple{N,VE{T2}}, NTuple{N,VE{T3}}}, - v1.elts, v2.elts, v3.elts)) - end -end - -@generated function llvmwrapshift(::Val{Op}, v1::Vec{N,T}, - ::Val{I}) where {Op,N,T,I} - @assert isa(Op, Symbol) - if I >= 0 - op = Op - i = I - else - if Op === :>> || Op === :>>> - op = :<< - else - @assert Op === :<< - if T <: Unsigned - op = :>>> - else - op = :>> - end - end - i = -I - end - @assert op in (:<<, :>>, :>>>) - @assert i >= 0 - typ = llvmtype(T) - vtyp = "<$N x $typ>" - ins = llvmins(Val(op), N, T) - decls = [] - instrs = [] - nbits = 8*sizeof(T) - if (op === :>> && T <: IntTypes) || i < nbits - count = llvmconst(N, T, min(nbits-1, i)) - push!(instrs, "%res = $ins $vtyp %0, $count") - push!(instrs, "ret $vtyp %res") - else - zero = llvmconst(N, T, 0) - push!(instrs, "return $vtyp $zero") - end - quote - $(Expr(:meta, :inline)) - Vec{N,T}(Base.llvmcall($((join(decls, "\n"), join(instrs, "\n"))), - NTuple{N,VE{T}}, Tuple{NTuple{N,VE{T}}}, v1.elts)) - end -end - -@generated function llvmwrapshift(::Val{Op}, v1::Vec{N,T}, - x2::Unsigned) where {Op,N,T} - @assert isa(Op, Symbol) - typ = llvmtype(T) - vtyp = "<$N x $typ>" - ins = llvmins(Val(Op), N, T) - decls = [] - instrs = [] - append!(instrs, scalar2vector("%count", N, typ, "%1")) - nbits = 8*sizeof(T) - push!(instrs, "%tmp = $ins $vtyp %0, %count") - push!(instrs, "%inbounds = icmp ult $typ %1, $nbits") - if Op === :>> && T <: IntTypes - nbits1 = llvmconst(N, T, 8*sizeof(T)-1) - push!(instrs, "%limit = $ins $vtyp %0, $nbits1") - push!(instrs, "%res = select i1 %inbounds, $vtyp %tmp, $vtyp %limit") - else - zero = llvmconst(N, T, 0) - push!(instrs, "%res = select i1 %inbounds, $vtyp %tmp, $vtyp $zero") - end - push!(instrs, "ret $vtyp %res") - quote - $(Expr(:meta, :inline)) - # Note that this function might be called with out-of-bounds - # values for x2, assuming that the results are then ignored - Vec{N,T}(Base.llvmcall($((join(decls, "\n"), join(instrs, "\n"))), - NTuple{N,VE{T}}, Tuple{NTuple{N,VE{T}}, T}, v1.elts, x2 % T)) - end -end - -@generated function llvmwrapshift(::Val{Op}, v1::Vec{N,T}, - x2::Integer) where {Op,N,T} - if Op === :>> || Op === :>>> - NegOp = :<< - else - @assert Op === :<< - if T <: Unsigned - NegOp = :>>> - else - NegOp = :>> - end - end - ValOp = Val(Op) - ValNegOp = Val(NegOp) - quote - $(Expr(:meta, :inline)) - ifelse(x2 >= 0, - llvmwrapshift($ValOp, v1, unsigned(x2)), - llvmwrapshift($ValNegOp, v1, unsigned(-x2))) - end -end - -@generated function llvmwrapshift(::Val{Op}, v1::Vec{N,T}, - v2::Vec{N,U}) where {Op,N,T,U<:UIntTypes} - @assert isa(Op, Symbol) - typ = llvmtype(T) - vtyp = "<$N x $typ>" - ins = llvmins(Val(Op), N, T) - decls = [] - instrs = [] - push!(instrs, "%tmp = $ins $vtyp %0, %1") - nbits = llvmconst(N, T, 8*sizeof(T)) - push!(instrs, "%inbounds = icmp ult $vtyp %1, $nbits") - if Op === :>> && T <: IntTypes - nbits1 = llvmconst(N, T, 8*sizeof(T)-1) - push!(instrs, "%limit = $ins $vtyp %0, $nbits1") - push!(instrs, - "%res = select <$N x i1> %inbounds, $vtyp %tmp, $vtyp %limit") - else - zero = llvmconst(N, T, 0) - push!(instrs, - "%res = select <$N x i1> %inbounds, $vtyp %tmp, $vtyp $zero") - end - 
push!(instrs, "ret $vtyp %res") - quote - $(Expr(:meta, :inline)) - Vec{N,T}(Base.llvmcall($((join(decls, "\n"), join(instrs, "\n"))), - NTuple{N,VE{T}}, Tuple{NTuple{N,VE{T}}, NTuple{N,VE{T}}}, - v1.elts, (v2 % Vec{N,T}).elts)) - end -end - -@generated function llvmwrapshift(::Val{Op}, v1::Vec{N,T}, - v2::Vec{N,U}) where {Op,N,T,U<:IntegerTypes} - if Op === :>> || Op === :>>> - NegOp = :<< - else - @assert Op === :<< - if T <: Unsigned - NegOp = :>>> - else - NegOp = :>> - end - end - ValOp = Val(Op) - ValNegOp = Val(NegOp) - quote - $(Expr(:meta, :inline)) - vifelse(v2 >= 0, - llvmwrapshift($ValOp, v1, v2 % Vec{N,unsigned(U)}), - llvmwrapshift($ValNegOp, v1, -v2 % Vec{N,unsigned(U)})) - end -end - -# Conditionals - -for op in (:(==), :(!=), :(<), :(<=), :(>), :(>=)) - @eval begin - @inline Base.$op(v1::Vec{N,T}, v2::Vec{N,T}) where {N,T} = - llvmwrap(Val($(QuoteNode(op))), v1, v2, Bool) - end -end -@inline function Base.cmp(v1::Vec{N,T}, v2::Vec{N,T}) where {N,T} - I = int_type(T) - vifelse(isequal(v1, v2), Vec{N,I}(0), - vifelse(isless(v1, v2), Vec{N,I}(-1), Vec{N,I}(1))) -end -@inline function Base.isfinite(v1::Vec{N,T}) where {N,T<:FloatingTypes} - U = uint_type(T) - em = Vec{N,U}(exponent_mask(T)) - iv = reinterpret(Vec{N,U}, v1) - iv & em != em -end -@inline Base.isinf(v1::Vec{N,T}) where {N,T<:FloatingTypes} = abs(v1) == Vec{N,T}(Inf) -@inline Base.isnan(v1::Vec{N,T}) where {N,T<:FloatingTypes} = v1 != v1 -@inline function Base.issubnormal(v1::Vec{N,T}) where {N,T<:FloatingTypes} - U = uint_type(T) - em = Vec{N,U}(exponent_mask(T)) - sm = Vec{N,U}(significand_mask(T)) - iv = reinterpret(Vec{N,U}, v1) - (iv & em == Vec{N,U}(0)) & (iv & sm != Vec{N,U}(0)) -end -@inline function Base.signbit(v1::Vec{N,T}) where {N,T<:FloatingTypes} - U = uint_type(T) - sm = Vec{N,U}(sign_mask(T)) - iv = reinterpret(Vec{N,U}, v1) - iv & sm != Vec{N,U}(0) -end - -export vifelse -vifelse(c::Bool, x, y) = ifelse(c, x, y) -@generated function vifelse(v1::Vec{N,Bool}, v2::Vec{N,T}, - v3::Vec{N,T}) where {N,T} - btyp = llvmtype(Bool) - vbtyp = "<$N x $btyp>" - abtyp = "[$N x $btyp]" - typ = llvmtype(T) - vtyp = "<$N x $typ>" - atyp = "[$N x $typ]" - decls = [] - instrs = [] - if false && N == 1 - append!(instrs, array2vector("%arg1", N, btyp, "%0", "%arg1arr")) - append!(instrs, array2vector("%arg2", N, typ, "%1", "%arg2arr")) - append!(instrs, array2vector("%arg3", N, typ, "%2", "%arg3arr")) - push!(instrs, "%cond = trunc $vbtyp %arg1 to <$N x i1>") - push!(instrs, "%res = select <$N x i1> %cond, $vtyp %arg2, $vtyp %arg3") - append!(instrs, vector2array("%resarr", N, typ, "%res")) - push!(instrs, "ret $atyp %resarr") - else - push!(instrs, "%cond = trunc $vbtyp %0 to <$N x i1>") - push!(instrs, "%res = select <$N x i1> %cond, $vtyp %1, $vtyp %2") - push!(instrs, "ret $vtyp %res") - end - quote - $(Expr(:meta, :inline)) - Vec{N,T}(Base.llvmcall($((join(decls, "\n"), join(instrs, "\n"))), - NTuple{N,VE{T}}, - Tuple{NTuple{N,VE{Bool}}, NTuple{N,VE{T}}, NTuple{N,VE{T}}}, - v1.elts, v2.elts, v3.elts)) - end -end - -# Integer arithmetic functions - -for op in (:~, :+, :-) - @eval begin - @inline Base.$op(v1::Vec{N,T}) where {N,T<:IntegerTypes} = - llvmwrap(Val($(QuoteNode(op))), v1) - end -end -@inline Base.:!(v1::Vec{N,Bool}) where {N} = ~v1 -@inline function Base.abs(v1::Vec{N,T}) where {N,T<:IntTypes} - # s = -Vec{N,T}(signbit(v1)) - s = v1 >> Val(8*sizeof(T)) - # Note: -v1 == ~v1 + 1 - (s ⊻ v1) - s -end -@inline Base.abs(v1::Vec{N,T}) where {N,T<:UIntTypes} = v1 -# TODO: Try T(v1>0) - T(v1<0) -# use a shift 
for v1<0 -# evaluate v1>0 as -v1<0 ? -@inline Base.sign(v1::Vec{N,T}) where {N,T<:IntTypes} = - vifelse(v1 == Vec{N,T}(0), Vec{N,T}(0), - vifelse(v1 < Vec{N,T}(0), Vec{N,T}(-1), Vec{N,T}(1))) -@inline Base.sign(v1::Vec{N,T}) where {N,T<:UIntTypes} = - vifelse(v1 == Vec{N,T}(0), Vec{N,T}(0), Vec{N,T}(1)) -@inline Base.signbit(v1::Vec{N,T}) where {N,T<:IntTypes} = v1 < Vec{N,T}(0) -@inline Base.signbit(v1::Vec{N,T}) where {N,T<:UIntTypes} = Vec{N,Bool}(false) - -for op in (:&, :|, :⊻, :+, :-, :*, :div, :rem) - @eval begin - @inline Base.$op(v1::Vec{N,T}, v2::Vec{N,T}) where {N,T<:IntegerTypes} = - llvmwrap(Val($(QuoteNode(op))), v1, v2) - end -end -@inline Base.copysign(v1::Vec{N,T}, v2::Vec{N,T}) where {N,T<:IntTypes} = - vifelse(signbit(v2), -abs(v1), abs(v1)) -@inline Base.copysign(v1::Vec{N,T}, v2::Vec{N,T}) where {N,T<:UIntTypes} = v1 -@inline Base.flipsign(v1::Vec{N,T}, v2::Vec{N,T}) where {N,T<:IntTypes} = - vifelse(signbit(v2), -v1, v1) -@inline Base.flipsign(v1::Vec{N,T}, v2::Vec{N,T}) where {N,T<:UIntTypes} = v1 -@inline Base.max(v1::Vec{N,T}, v2::Vec{N,T}) where {N,T<:IntegerTypes} = - vifelse(v1>=v2, v1, v2) -@inline Base.min(v1::Vec{N,T}, v2::Vec{N,T}) where {N,T<:IntegerTypes} = - vifelse(v1>=v2, v2, v1) - -@inline function Base.muladd(v1::Vec{N,T}, v2::Vec{N,T}, - v3::Vec{N,T}) where {N,T<:IntegerTypes} - v1*v2+v3 -end - -# TODO: Handle negative shift counts -# use vifelse -# ensure vifelse is efficient -for op in (:<<, :>>, :>>>) - @eval begin - @inline Base.$op(v1::Vec{N,T}, ::Val{I}) where {N,T<:IntegerTypes,I} = - llvmwrapshift(Val($(QuoteNode(op))), v1, Val(I)) - @inline Base.$op(v1::Vec{N,T}, ::Type{Val{I}}) where {N,T<:IntegerTypes,I} = - Base.$op(v1, Val(I)) - @inline Base.$op(v1::Vec{N,T}, x2::Unsigned) where {N,T<:IntegerTypes} = - llvmwrapshift(Val($(QuoteNode(op))), v1, x2) - @inline Base.$op(v1::Vec{N,T}, x2::Int) where {N,T<:IntegerTypes} = - llvmwrapshift(Val($(QuoteNode(op))), v1, x2) - @inline Base.$op(v1::Vec{N,T}, x2::Integer) where {N,T<:IntegerTypes} = - llvmwrapshift(Val($(QuoteNode(op))), v1, x2) - @inline Base.$op(v1::Vec{N,T}, - v2::Vec{N,U}) where {N,T<:IntegerTypes,U<:UIntTypes} = - llvmwrapshift(Val($(QuoteNode(op))), v1, v2) - @inline Base.$op(v1::Vec{N,T}, - v2::Vec{N,U}) where {N,T<:IntegerTypes,U<:IntegerTypes} = - llvmwrapshift(Val($(QuoteNode(op))), v1, v2) - @inline Base.$op(x1::T, v2::Vec{N,T}) where {N,T<:IntegerTypes} = - $op(Vec{N,T}(x1), v2) - end -end - -# Floating point arithmetic functions - -for op in ( - :+, :-, - :abs, :ceil, :cos, :exp, :exp2, :floor, :inv, :log, :log10, :log2, - :round, :sin, :sqrt, :trunc) - @eval begin - @inline Base.$op(v1::Vec{N,T}) where {N,T<:FloatingTypes} = - llvmwrap(Val($(QuoteNode(op))), v1) - end -end -@inline Base.exp10(v1::Vec{N,T}) where {N,T<:FloatingTypes} = Vec{N,T}(10)^v1 -@inline Base.sign(v1::Vec{N,T}) where {N,T<:FloatingTypes} = - vifelse(v1 == Vec{N,T}(0.0), Vec{N,T}(0.0), copysign(Vec{N,T}(1.0), v1)) - -for op in (:+, :-, :*, :/, :^, :copysign, :max, :min, :rem) - @eval begin - @inline Base.$op(v1::Vec{N,T}, v2::Vec{N,T}) where {N,T<:FloatingTypes} = - llvmwrap(Val($(QuoteNode(op))), v1, v2) - end -end -# Using `IntegerTypes` here so that this definition "wins" against -# `^(::ScalarTypes, v2::Vec)`. 
-@inline Base.:^(v1::Vec{N,T}, x2::IntegerTypes) where {N,T<:FloatingTypes} = - llvmwrap(Val(:powi), v1, Int(x2)) -@inline Base.:^(v1::Vec{N,T}, x2::Integer) where {N,T<:FloatingTypes} = - llvmwrap(Val(:powi), v1, Int(x2)) -@inline Base.flipsign(v1::Vec{N,T}, v2::Vec{N,T}) where {N,T<:FloatingTypes} = - vifelse(signbit(v2), -v1, v1) - -# Do what Base does for HWNumber: -@inline Base.literal_pow(::typeof(^), x::Vec, ::Val{0}) = one(typeof(x)) -@inline Base.literal_pow(::typeof(^), x::Vec, ::Val{1}) = x -@inline Base.literal_pow(::typeof(^), x::Vec, ::Val{2}) = x*x -@inline Base.literal_pow(::typeof(^), x::Vec, ::Val{3}) = x*x*x - -for op in (:fma, :muladd) - @eval begin - @inline function Base.$op(v1::Vec{N,T}, - v2::Vec{N,T}, v3::Vec{N,T}) where {N,T<:FloatingTypes} - llvmwrap(Val($(QuoteNode(op))), v1, v2, v3) - end - end -end - -# Type promotion - -# Promote scalars of all IntegerTypes to vectors of IntegerTypes, leaving the -# vector type unchanged - -for op in ( - :(==), :(!=), :(<), :(<=), :(>), :(>=), - :&, :|, :⊻, :+, :-, :*, :copysign, :div, :flipsign, :max, :min, :rem) - @eval begin - @inline Base.$op(s1::Bool, v2::Vec{N,Bool}) where {N} = - $op(Vec{N,Bool}(s1), v2) - @inline Base.$op(s1::IntegerTypes, v2::Vec{N,T}) where {N,T<:IntegerTypes} = - $op(Vec{N,T}(s1), v2) - @inline Base.$op(v1::Vec{N,T}, s2::IntegerTypes) where {N,T<:IntegerTypes} = - $op(v1, Vec{N,T}(s2)) - end -end -@inline vifelse(c::Vec{N,Bool}, s1::IntegerTypes, - v2::Vec{N,T}) where {N,T<:IntegerTypes} = - vifelse(c, Vec{N,T}(s1), v2) -@inline vifelse(c::Vec{N,Bool}, v1::Vec{N,T}, - s2::IntegerTypes) where {N,T<:IntegerTypes} = - vifelse(c, v1, Vec{N,T}(s2)) - -for op in (:muladd,) - @eval begin - @inline Base.$op(s1::IntegerTypes, v2::Vec{N,T}, - v3::Vec{N,T}) where {N,T<:IntegerTypes} = - $op(Vec{N,T}(s1), v2, v3) - @inline Base.$op(v1::Vec{N,T}, s2::IntegerTypes, - v3::Vec{N,T}) where {N,T<:IntegerTypes} = - $op(v1, Vec{N,T}(s2), v3) - @inline Base.$op(s1::IntegerTypes, s2::IntegerTypes, - v3::Vec{N,T}) where {N,T<:IntegerTypes} = - $op(Vec{N,T}(s1), Vec{N,T}(s2), v3) - @inline Base.$op(v1::Vec{N,T}, v2::Vec{N,T}, - s3::IntegerTypes) where {N,T<:IntegerTypes} = - $op(v1, v2, Vec{N,T}(s3)) - @inline Base.$op(s1::IntegerTypes, v2::Vec{N,T}, - s3::IntegerTypes) where {N,T<:IntegerTypes} = - $op(Vec{N,T}(s1), v2, Vec{N,T}(s3)) - @inline Base.$op(v1::Vec{N,T}, s2::IntegerTypes, - s3::IntegerTypes) where {N,T<:IntegerTypes} = - $op(v1, Vec{N,T}(s2), Vec{N,T}(s3)) - end -end - -# Promote scalars of all ScalarTypes to vectors of FloatingTypes, leaving the -# vector type unchanged - -for op in ( - :(==), :(!=), :(<), :(<=), :(>), :(>=), - :+, :-, :*, :/, :^, :copysign, :flipsign, :max, :min, :rem) - @eval begin - @inline Base.$op(s1::ScalarTypes, v2::Vec{N,T}) where {N,T<:FloatingTypes} = - $op(Vec{N,T}(s1), v2) - @inline Base.$op(v1::Vec{N,T}, s2::ScalarTypes) where {N,T<:FloatingTypes} = - $op(v1, Vec{N,T}(s2)) - end -end -@inline vifelse(c::Vec{N,Bool}, s1::ScalarTypes, - v2::Vec{N,T}) where {N,T<:FloatingTypes} = - vifelse(c, Vec{N,T}(s1), v2) -@inline vifelse(c::Vec{N,Bool}, v1::Vec{N,T}, - s2::ScalarTypes) where {N,T<:FloatingTypes} = - vifelse(c, v1, Vec{N,T}(s2)) - -for op in (:fma, :muladd) - @eval begin - @inline Base.$op(s1::ScalarTypes, v2::Vec{N,T}, - v3::Vec{N,T}) where {N,T<:FloatingTypes} = - $op(Vec{N,T}(s1), v2, v3) - @inline Base.$op(v1::Vec{N,T}, s2::ScalarTypes, - v3::Vec{N,T}) where {N,T<:FloatingTypes} = - $op(v1, Vec{N,T}(s2), v3) - @inline Base.$op(s1::ScalarTypes, s2::ScalarTypes, - 
v3::Vec{N,T}) where {N,T<:FloatingTypes} = - $op(Vec{N,T}(s1), Vec{N,T}(s2), v3) - @inline Base.$op(v1::Vec{N,T}, v2::Vec{N,T}, - s3::ScalarTypes) where {N,T<:FloatingTypes} = - $op(v1, v2, Vec{N,T}(s3)) - @inline Base.$op(s1::ScalarTypes, v2::Vec{N,T}, - s3::ScalarTypes) where {N,T<:FloatingTypes} = - $op(Vec{N,T}(s1), v2, Vec{N,T}(s3)) - @inline Base.$op(v1::Vec{N,T}, s2::ScalarTypes, - s3::ScalarTypes) where {N,T<:FloatingTypes} = - $op(v1, Vec{N,T}(s2), Vec{N,T}(s3)) - end -end - -# Poitner arithmetics between Ptr, IntegerTypes, and vectors of them. - -for op in (:+, :-) - @eval begin - @inline Base.$op(v1::Vec{N,<:Ptr}, v2::Vec{N,<:IntegerTypes}) where {N} = - llvmwrap(Val($(QuoteNode(op))), v1, v2) - @inline Base.$op(v1::Vec{N,<:IntegerTypes}, v2::Vec{N,<:Ptr}) where {N} = - llvmwrap(Val($(QuoteNode(op))), v1, v2) - @inline Base.$op(s1::P, v2::Vec{N,<:IntegerTypes}) where {N,P<:Ptr} = - $op(Vec{N,P}(s1), v2) - @inline Base.$op(v1::Vec{N,<:IntegerTypes}, s2::P) where {N,P<:Ptr} = - $op(v1, Vec{N,P}(s2)) - end -end - - -# Reduction operations - -# TODO: map, mapreduce - -function getneutral(op::Symbol, ::Type{T}) where T - zs = Dict{Symbol,T}() - if T <: IntegerTypes - zs[:&] = ~T(0) - zs[:|] = T(0) - end - zs[:max] = typemin(T) - zs[:min] = typemax(T) - zs[:+] = T(0) - zs[:*] = T(1) - zs[op] -end - -if VERSION >= v"0.7.0-beta2.195" - nextpow2(n) = nextpow(2, n) -end - -# We cannot pass in the neutral element via Val{}; if we try, Julia refuses to -# inline this function, which is then disastrous for performance -@generated function llvmwrapreduce(::Val{Op}, v::Vec{N,T}) where {Op,N,T} - @assert isa(Op, Symbol) - z = getneutral(Op, T) - typ = llvmtype(T) - decls = [] - instrs = [] - n = N - nam = "%0" - nold,n = n,nextpow2(n) - if n > nold - namold,nam = nam,"%vec_$n" - append!(instrs, - extendvector(namold, nold, typ, n, n-nold, - llvmtypedconst(T,z), nam)) - end - while n > 1 - nold,n = n, div(n, 2) - namold,nam = nam,"%vec_$n" - vtyp = "<$n x $typ>" - ins = llvmins(Val(Op), n, T) - append!(instrs, subvector(namold, nold, typ, "$(nam)_1", n, 0)) - append!(instrs, subvector(namold, nold, typ, "$(nam)_2", n, n)) - if ins[1] == '@' - push!(decls, "declare $vtyp $ins($vtyp, $vtyp)") - push!(instrs, - "$nam = call $vtyp $ins($vtyp $(nam)_1, $vtyp $(nam)_2)") - else - push!(instrs, "$nam = $ins $vtyp $(nam)_1, $(nam)_2") - end - end - push!(instrs, "%res = extractelement <$n x $typ> $nam, i32 0") - push!(instrs, "ret $typ %res") - quote - $(Expr(:meta, :inline)) - Base.llvmcall($((join(decls, "\n"), join(instrs, "\n"))), - T, Tuple{NTuple{N,VE{T}}}, v.elts) - end -end - -@inline Base.all(v::Vec{N,T}) where {N,T<:IntegerTypes} = llvmwrapreduce(Val(:&), v) -@inline Base.any(v::Vec{N,T}) where {N,T<:IntegerTypes} = llvmwrapreduce(Val(:|), v) -@inline Base.maximum(v::Vec{N,T}) where {N,T<:FloatingTypes} = - llvmwrapreduce(Val(:max), v) -@inline Base.minimum(v::Vec{N,T}) where {N,T<:FloatingTypes} = - llvmwrapreduce(Val(:min), v) -@inline Base.prod(v::Vec{N,T}) where {N,T} = llvmwrapreduce(Val(:*), v) -@inline Base.sum(v::Vec{N,T}) where {N,T} = llvmwrapreduce(Val(:+), v) - -@generated function Base.reduce(::Val{Op}, v::Vec{N,T}) where {Op,N,T} - @assert isa(Op, Symbol) - z = getneutral(Op, T) - stmts = [] - n = N - push!(stmts, :($(Symbol(:v,n)) = v)) - nold,n = n,nextpow2(n) - if n > nold - push!(stmts, - :($(Symbol(:v,n)) = Vec{$n,T}($(Expr(:tuple, - [:($(Symbol(:v,nold)).elts[$i]) for i in 1:nold]..., - [z for i in nold+1:n]...))))) - end - while n > 1 - nold,n = n, div(n, 2) - push!(stmts, 
- :($(Symbol(:v,n,"lo")) = Vec{$n,T}($(Expr(:tuple, - [:($(Symbol(:v,nold)).elts[$i]) for i in 1:n]...,))))) - push!(stmts, - :($(Symbol(:v,n,"hi")) = Vec{$n,T}($(Expr(:tuple, - [:($(Symbol(:v,nold)).elts[$i]) for i in n+1:nold]...))))) - push!(stmts, - :($(Symbol(:v,n)) = - $Op($(Symbol(:v,n,"lo")), $(Symbol(:v,n,"hi"))))) - end - push!(stmts, :(v1[1])) - Expr(:block, Expr(:meta, :inline), stmts...) -end -@inline function Base.reduce(::Type{Val{Op}}, v::Vec{N,T}) where {Op,N,T} - Base.reduce(Val(Op), v) -end - -@inline Base.maximum(v::Vec{N,T}) where {N,T<:IntegerTypes} = reduce(Val(:max), v) -@inline Base.minimum(v::Vec{N,T}) where {N,T<:IntegerTypes} = reduce(Val(:min), v) - -# Load and store functions - -export valloc -function valloc(::Type{T}, N::Int, sz::Int) where T - @assert N > 0 - @assert sz >= 0 - # We use padding to align the address of the first element, and - # also to ensure that we can access past the last element up to - # the next full vector width - padding = N-1 + mod(-sz, N) - mem = Vector{T}(undef, sz + padding) - addr = Int(pointer(mem)) - off = mod(-addr, N * sizeof(T)) - @assert mod(off, sizeof(T)) == 0 - off = fld(off, sizeof(T)) - @assert 0 <= off <= padding - res = view(mem, off+1 : off+sz) - addr2 = Int(pointer(res)) - @assert mod(addr2, N * sizeof(T)) == 0 - res -end -function valloc(f, ::Type{T}, N::Int, sz::Int) where T - mem = valloc(T, N, sz) - @inbounds for i in 1:sz - mem[i] = f(i) - end - mem -end - -export vload, vloada, vloadnt -@generated function vload(::Type{Vec{N,T}}, ptr::Ptr{T}, - ::Val{Aligned} = Val(false), - ::Val{Nontemporal} = Val(false)) where {N,T,Aligned,Nontemporal} - @assert isa(Aligned, Bool) - ptyp = llvmtype(Int) - typ = llvmtype(T) - vtyp = "<$N x $typ>" - decls = [] - instrs = [] - if Aligned - align = N * sizeof(T) - else - align = sizeof(T) # This is overly optimistic - end - flags = [""] - if align > 0 - push!(flags, "align $align") - end - if Nontemporal - push!(flags, "!nontemporal !{i32 1}") - end - if VERSION < v"v0.7.0-DEV" - push!(instrs, "%ptr = bitcast $typ* %0 to $vtyp*") - else - push!(instrs, "%ptr = inttoptr $ptyp %0 to $vtyp*") - end - push!(instrs, "%res = load $vtyp, $vtyp* %ptr" * join(flags, ", ")) - push!(instrs, "ret $vtyp %res") - quote - $(Expr(:meta, :inline)) - Vec{N,T}(Base.llvmcall($((join(decls, "\n"), join(instrs, "\n"))), - NTuple{N,VE{T}}, Tuple{Ptr{T}}, ptr)) - end -end -@inline function vload(::Type{Vec{N,T}}, ptr::Ptr{T}, - ::Type{Val{Aligned}}, - ::Type{Val{Nontemporal}} = Val{false}) where {N,T,Aligned,Nontemporal} - vload(Vec{N, T}, ptr, Val(Aligned), Val(Nontemporal)) -end - -@inline vloada(::Type{Vec{N,T}}, ptr::Ptr{T}) where {N,T} = - vload(Vec{N,T}, ptr, Val(true)) - -@inline vloadnt(::Type{Vec{N,T}}, ptr::Ptr{T}) where {N,T} = - vload(Vec{N,T}, ptr, Val(true), Val(true)) - -@inline function vload(::Type{Vec{N,T}}, - arr::FastContiguousArray{T,1}, - i::Integer, - ::Val{Aligned} = Val(false), - ::Val{Nontemporal} = Val(false)) where {N,T,Aligned,Nontemporal} - #TODO @boundscheck 1 <= i <= length(arr) - (N-1) || throw(BoundsError()) - vload(Vec{N,T}, pointer(arr, i), Val(Aligned), Val(Nontemporal)) -end -@inline function vload(::Type{Vec{N,T}}, - arr::FastContiguousArray{T,1}, - i::Integer, - ::Type{Val{Aligned}},k = Val{false}, - ::Type{Val{Nontemporal}} = Val{false}) where {N,T,Aligned,Nontemporal} - vload(Vec{N,T}, arr, i, Val(Aligned), Val(Nontemporal)) -end -@inline function vloada(::Type{Vec{N,T}}, - arr::FastContiguousArray{T,1}, - i::Integer) where {N,T} - vload(Vec{N,T}, arr, i, 
Val(true)) -end -@inline function vloadnt(::Type{Vec{N,T}}, - arr::Union{Array{T,1},SubArray{T,1}}, - i::Integer) where {N,T} - vload(Vec{N,T}, arr, i, Val(true), Val(true)) -end - -@inline vload(::Type{Vec{N,T}}, ptr::Ptr{T}, mask::Nothing, - ::Val{Aligned} = Val(false)) where {N,T,Aligned} = - vload(Vec{N,T}, ptr, Val(Aligned)) -@inline vload(::Type{Vec{N,T}}, ptr::Ptr{T}, mask::Nothing, - ::Type{Val{Aligned}}) where {N,T,Aligned} = - vload(Vec{N,T}, ptr, make, Val(Aligned)) - -@generated function vload(::Type{Vec{N,T}}, ptr::Ptr{T}, - mask::Vec{N,Bool}, - ::Val{Aligned} = Val(false)) where {N,T,Aligned} - @assert isa(Aligned, Bool) - ptyp = llvmtype(Int) - typ = llvmtype(T) - vtyp = "<$N x $typ>" - btyp = llvmtype(Bool) - vbtyp = "<$N x $btyp>" - decls = [] - instrs = [] - if Aligned - align = N * sizeof(T) - else - align = sizeof(T) # This is overly optimistic - end - - if VERSION < v"v0.7.0-DEV" - push!(instrs, "%ptr = bitcast $typ* %0 to $vtyp*") - else - push!(instrs, "%ptr = inttoptr $ptyp %0 to $vtyp*") - end - push!(instrs, "%mask = trunc $vbtyp %1 to <$N x i1>") - push!(decls, - "declare $vtyp @llvm.masked.load.$(suffix(N,T))($vtyp*, i32, " * - "<$N x i1>, $vtyp)") - push!(instrs, - "%res = call $vtyp @llvm.masked.load.$(suffix(N,T))($vtyp* %ptr, " * - "i32 $align, <$N x i1> %mask, $vtyp $(llvmconst(N, T, 0)))") - push!(instrs, "ret $vtyp %res") - quote - $(Expr(:meta, :inline)) - Vec{N,T}(Base.llvmcall($((join(decls, "\n"), join(instrs, "\n"))), - NTuple{N,VE{T}}, Tuple{Ptr{T}, NTuple{N,VE{Bool}}}, ptr, mask.elts)) - end -end -@inline function vload(::Type{Vec{N,T}}, ptr::Ptr{T}, - mask::Vec{N,Bool}, - ::Type{Val{Aligned}}) where {N,T,Aligned} - vload(Vec{N,T}, ptr, mask, Val(Aligned)) -end - -@inline vloada(::Type{Vec{N,T}}, ptr::Ptr{T}, - mask::Union{Vec{N,Bool}, Nothing}) where {N,T} = - vload(Vec{N,T}, ptr, mask, Val(true)) - -@inline function vload(::Type{Vec{N,T}}, - arr::FastContiguousArray{T,1}, - i::Integer, mask::Union{Vec{N,Bool}, Nothing}, - ::Val{Aligned} = Val(false)) where {N,T,Aligned} - #TODO @boundscheck 1 <= i <= length(arr) - (N-1) || throw(BoundsError()) - vload(Vec{N,T}, pointer(arr, i), mask, Val(Aligned)) -end -@inline function vload(::Type{Vec{N,T}}, - arr::FastContiguousArray{T,1}, - i::Integer, mask::Union{Vec{N,Bool}, Nothing}, - ::Type{Val{Aligned}}) where {N,T,Aligned} - vload(Vec{N,T}, arr, i, mask, Val(Aligned)) -end -@inline function vloada(::Type{Vec{N,T}}, - arr::FastContiguousArray{T,1}, i::Integer, - mask::Union{Vec{N,Bool}, Nothing}) where {N,T} - vload(Vec{N,T}, arr, i, mask, Val(true)) -end - -export vstore, vstorea, vstorent -@generated function vstore(v::Vec{N,T}, ptr::Ptr{T}, - ::Val{Aligned} = Val(false), - ::Val{Nontemporal} = Val(false)) where {N,T,Aligned,Nontemporal} - @assert isa(Aligned, Bool) - @assert isa(Nontemporal, Bool) - ptyp = llvmtype(Int) - typ = llvmtype(T) - vtyp = "<$N x $typ>" - decls = [] - instrs = [] - if Aligned - align = N * sizeof(T) - else - align = sizeof(T) # This is overly optimistic - end - flags = [""] - if align > 0 - push!(flags, "align $align") - end - if Nontemporal - push!(flags, "!nontemporal !{i32 1}") - end - if VERSION < v"v0.7.0-DEV" - push!(instrs, "%ptr = bitcast $typ* %1 to $vtyp*") - else - push!(instrs, "%ptr = inttoptr $ptyp %1 to $vtyp*") - end - push!(instrs, "store $vtyp %0, $vtyp* %ptr" * join(flags, ", ")) - push!(instrs, "ret void") - quote - $(Expr(:meta, :inline)) - Base.llvmcall($((join(decls, "\n"), join(instrs, "\n"))), - Cvoid, Tuple{NTuple{N,VE{T}}, Ptr{T}}, v.elts, ptr) - 
end -end -@inline function vstore(v::Vec{N,T}, ptr::Ptr{T}, - ::Type{Val{Aligned}}, - ::Type{Val{Nontemporal}} = Val{false}) where {N,T,Aligned,Nontemporal} - vstore(v, ptr, Val(Aligned), Val(Nontemporal)) -end - -@inline vstorea(v::Vec{N,T}, ptr::Ptr{T}) where {N,T} = vstore(v, ptr, Val{true}) - -@inline vstorent(v::Vec{N,T}, ptr::Ptr{T}) where {N,T} = vstore(v, ptr, Val{true}, Val{true}) - -@inline function vstore(v::Vec{N,T}, - arr::FastContiguousArray{T,1}, - i::Integer, - ::Val{Aligned} = Val(false), - ::Val{Nontemporal} = Val(false)) where {N,T,Aligned,Nontemporal} - @boundscheck 1 <= i <= length(arr) - (N-1) || throw(BoundsError()) - vstore(v, pointer(arr, i), Val{Aligned}, Val{Nontemporal}) -end -@inline function vstore(v::Vec{N,T}, - arr::FastContiguousArray{T,1}, - i::Integer, - ::Type{Val{Aligned}}, - ::Type{Val{Nontemporal}} = Val{false}) where {N,T,Aligned,Nontemporal} - vstore(v, arr, i, Val(Aligned), Val(Nontemporal)) -end -@inline function vstorea(v::Vec{N,T}, arr::FastContiguousArray{T,1}, - i::Integer) where {N,T} - vstore(v, arr, i, Val{true}) -end -@inline function vstorent(v::Vec{N,T}, arr::FastContiguousArray{T,1}, - i::Integer) where {N,T} - vstore(v, arr, i, Val{true}, Val{true}) -end - -@inline vstore(v::Vec{N,T}, ptr::Ptr{T}, mask::Nothing, - ::Val{Aligned} = Val(false)) where {N,T,Aligned} = - vstore(v, ptr, Val{Aligned}) -@inline vstore(v::Vec{N,T}, ptr::Ptr{T}, mask::Nothing, - ::Type{Val{Aligned}}) where {N,T,Aligned} = - vstore(v, ptr, mask, Val(Aligned)) - -@generated function vstore(v::Vec{N,T}, ptr::Ptr{T}, - mask::Vec{N,Bool}, - ::Val{Aligned} = Val(false)) where {N,T,Aligned} - @assert isa(Aligned, Bool) - ptyp = llvmtype(Int) - typ = llvmtype(T) - vtyp = "<$N x $typ>" - btyp = llvmtype(Bool) - vbtyp = "<$N x $btyp>" - decls = [] - instrs = [] - if Aligned - align = N * sizeof(T) - else - align = sizeof(T) # This is overly optimistic - end - if VERSION < v"v0.7.0-DEV" - push!(instrs, "%ptr = bitcast $typ* %1 to $vtyp*") - else - push!(instrs, "%ptr = inttoptr $ptyp %1 to $vtyp*") - end - push!(instrs, "%mask = trunc $vbtyp %2 to <$N x i1>") - push!(decls, - "declare void @llvm.masked.store.$(suffix(N,T))($vtyp, $vtyp*, i32, " * - "<$N x i1>)") - push!(instrs, - "call void @llvm.masked.store.$(suffix(N,T))($vtyp %0, $vtyp* %ptr, " * - "i32 $align, <$N x i1> %mask)") - push!(instrs, "ret void") - quote - $(Expr(:meta, :inline)) - Base.llvmcall($((join(decls, "\n"), join(instrs, "\n"))), - Cvoid, Tuple{NTuple{N,VE{T}}, Ptr{T}, NTuple{N,VE{Bool}}}, - v.elts, ptr, mask.elts) - end -end -@inline function vstore(v::Vec{N,T}, ptr::Ptr{T}, - mask::Vec{N,Bool}, - ::Type{Val{Aligned}}) where {N,T,Aligned} - vstore(v, ptr, mask, Val(Aligned)) -end - -@inline vstorea(v::Vec{N,T}, ptr::Ptr{T}, - mask::Union{Vec{N,Bool}, Nothing}) where {N,T} = - vstore(v, ptr, mask, Val{true}) - -@inline function vstore(v::Vec{N,T}, - arr::FastContiguousArray{T,1}, - i::Integer, - mask::Union{Vec{N,Bool}, Nothing}, - ::Val{Aligned} = Val(false), - ::Val{Nontemporal} = Val(false)) where {N,T,Aligned,Nontemporal} - #TODO @boundscheck 1 <= i <= length(arr) - (N-1) || throw(BoundsError()) - vstore(v, pointer(arr, i), mask, Val{Aligned}, Val{Nontemporal}) -end -@inline function vstore(v::Vec{N,T}, - arr::FastContiguousArray{T,1}, - i::Integer, - mask::Union{Vec{N,Bool}, Nothing}, - ::Type{Val{Aligned}}, - ::Type{Val{Nontemporal}} = Val{false}) where {N,T,Aligned,Nontemporal} - vstore(v, arr, i, mask, Val(Aligned), Val(Nontemporal)) -end -@inline function vstorea(v::Vec{N,T}, - 
arr::FastContiguousArray{T,1}, - i::Integer, - mask::Union{Vec{N,Bool}, Nothing}) where {N,T} - vstore(v, arr, i, mask, Val{true}) -end - -export vgather, vgathera - -@inline vgather( - ::Type{Vec{N,T}}, ptrs::Vec{N,Ptr{T}}, mask::Nothing, - ::Val{Aligned} = Val(false)) where {N,T,Aligned} = - vgather(Vec{N,T}, ptrs, Vec(ntuple(_ -> true, N)), Val(Aligned)) -@inline vgather( - ::Type{Vec{N,T}}, ptrs::Vec{N,Ptr{T}}, mask::Nothing, - ::Type{Val{Aligned}}) where {N,T,Aligned} = - vgather(Vec{N,T}, ptrs, mask, Val(Aligned)) - -@generated function vgather( - ::Type{Vec{N,T}}, ptrs::Vec{N,Ptr{T}}, mask::Vec{N,Bool}, - ::Val{Aligned} = Val(false)) where {N,T,Aligned} - @assert isa(Aligned, Bool) - ptyp = llvmtype(Int) - typ = llvmtype(T) - vptyp = "<$N x $typ*>" - vtyp = "<$N x $typ>" - btyp = llvmtype(Bool) - vbtyp = "<$N x $btyp>" - decls = [] - instrs = [] - if Aligned - align = N * sizeof(T) - else - align = sizeof(T) # This is overly optimistic - end - - if VERSION < v"v0.7.0-DEV" - push!(instrs, "%ptrs = bitcast <$N x $typ*> %0 to $vptyp") - else - push!(instrs, "%ptrs = inttoptr <$N x $ptyp> %0 to $vptyp") - end - push!(instrs, "%mask = trunc $vbtyp %1 to <$N x i1>") - push!(decls, - "declare $vtyp @llvm.masked.gather.$(suffix(N,T))($vptyp, i32, " * - "<$N x i1>, $vtyp)") - push!(instrs, - "%res = call $vtyp @llvm.masked.gather.$(suffix(N,T))($vptyp %ptrs, " * - "i32 $align, <$N x i1> %mask, $vtyp $(llvmconst(N, T, 0)))") - push!(instrs, "ret $vtyp %res") - quote - $(Expr(:meta, :inline)) - Vec{N,T}(Base.llvmcall($((join(decls, "\n"), join(instrs, "\n"))), - NTuple{N,VE{T}}, Tuple{NTuple{N,VE{Ptr{T}}}, NTuple{N,VE{Bool}}}, - ptrs.elts, mask.elts)) - end -end -@inline function vgather( - ::Type{Vec{N,T}}, ptrs::Vec{N,Ptr{T}}, mask::Vec{N,Bool}, - ::Type{Val{Aligned}}) where {N,T,Aligned} - vgather(Vec{N,T}, ptrs, mask, Val(Aligned)) -end - -@inline vgathera(::Type{Vec{N,T}}, ptrs::Vec{N,Ptr{T}}, - mask::Union{Vec{N,Bool}, Nothing}) where {N,T} = - vgather(Vec{N,T}, ptrs, mask, Val{true}) - -@inline vgather(arr::FastContiguousArray{T,1}, - idx::Vec{N,<:Integer}, - mask::Union{Vec{N,Bool}, Nothing} = nothing, - ::Val{Aligned} = Val(false)) where {N,T,Aligned} = - vgather(Vec{N,T}, - pointer(arr) + sizeof(T) * (idx - 1), - mask, Val{Aligned}) -@inline vgather(arr::FastContiguousArray{T,1}, - idx::Vec{N,<:Integer}, - mask::Union{Vec{N,Bool}, Nothing}, - ::Type{Val{Aligned}}) where {N,T,Aligned} = - vgather(arr, idx, mask, Val(Aligned)) - -@inline vgathera(arr::FastContiguousArray{T,1}, - idx::Vec{N,<:Integer}, - mask::Union{Vec{N,Bool}, Nothing} = nothing) where {N,T} = - vgather(arr, idx, mask, Val{true}) - -export vscatter, vscattera - -@inline vscatter( - v::Vec{N,T}, ptrs::Vec{N,Ptr{T}}, mask::Nothing, - ::Val{Aligned} = Val(false)) where {N,T,Aligned} = - vscatter(v, ptrs, Vec(ntuple(_ -> true, N)), Val(Aligned)) -@inline vscatter( - v::Vec{N,T}, ptrs::Vec{N,Ptr{T}}, mask::Nothing, - ::Type{Val{Aligned}}) where {N,T,Aligned} = - vscatter(v, ptrs, mask, Val(Aligned)) - -@generated function vscatter( - v::Vec{N,T}, ptrs::Vec{N,Ptr{T}}, mask::Vec{N,Bool}, - ::Val{Aligned} = Val(false)) where {N,T,Aligned} - @assert isa(Aligned, Bool) - ptyp = llvmtype(Int) - typ = llvmtype(T) - vptyp = "<$N x $typ*>" - vtyp = "<$N x $typ>" - btyp = llvmtype(Bool) - vbtyp = "<$N x $btyp>" - decls = [] - instrs = [] - if Aligned - align = N * sizeof(T) - else - align = sizeof(T) # This is overly optimistic - end - if VERSION < v"v0.7.0-DEV" - push!(instrs, "%ptrs = bitcast <$N x $typ*> %1 to $vptyp") - else - 
push!(instrs, "%ptrs = inttoptr <$N x $ptyp> %1 to $vptyp") - end - push!(instrs, "%mask = trunc $vbtyp %2 to <$N x i1>") - push!(decls, - "declare void @llvm.masked.scatter.$(suffix(N,T))" * - "($vtyp, $vptyp, i32, <$N x i1>)") - push!(instrs, - "call void @llvm.masked.scatter.$(suffix(N,T))" * - "($vtyp %0, $vptyp %ptrs, i32 $align, <$N x i1> %mask)") - push!(instrs, "ret void") - quote - $(Expr(:meta, :inline)) - Base.llvmcall($((join(decls, "\n"), join(instrs, "\n"))), - Cvoid, - Tuple{NTuple{N,VE{T}}, NTuple{N,VE{Ptr{T}}}, NTuple{N,VE{Bool}}}, - v.elts, ptrs.elts, mask.elts) - end -end -@inline function vscatter( - v::Vec{N,T}, ptrs::Vec{N,Ptr{T}}, mask::Vec{N,Bool}, - ::Type{Val{Aligned}}) where {N,T,Aligned} - vscatter(v, ptrs, mask, Val(Aligned)) -end - -@inline vscattera(v::Vec{N,T}, ptrs::Vec{N,Ptr{T}}, - mask::Union{Vec{N,Bool}, Nothing}) where {N,T} = - vscatter(v, ptrs, mask, Val{true}) - -@inline vscatter(v::Vec{N,T}, arr::FastContiguousArray{T,1}, - idx::Vec{N,<:Integer}, - mask::Union{Vec{N,Bool}, Nothing} = nothing, - ::Val{Aligned} = Val(false)) where {N,T,Aligned} = - vscatter(v, pointer(arr) + sizeof(T) * (idx - 1), mask, Val(Aligned)) -@inline vscatter(v::Vec{N,T}, arr::FastContiguousArray{T,1}, - idx::Vec{N,<:Integer}, - mask::Union{Vec{N,Bool}, Nothing}, - ::Type{Val{Aligned}}) where {N,T,Aligned} = - vscatter(v, arr, idx, mask, Val(Aligned)) - -@inline vscattera(v::Vec{N,T}, arr::FastContiguousArray{T,1}, - idx::Vec{N,<:Integer}, - mask::Union{Vec{N,Bool}, Nothing} = nothing) where {N,T} = - vscatter(v, arr, idx, mask, Val{true}) - -# Vector shuffles - -function shufflevector_instrs(N, T, I, two_operands) - typ = llvmtype(T) - vtyp2 = vtyp1 = "<$N x $typ>" - M = length(I) - vtyp3 = "<$M x i32>" - vtypr = "<$M x $typ>" - mask = "<" * join(map(x->string("i32 ", x), I), ", ") * ">" - instrs = [] - v2 = two_operands ? "%1" : "undef" - push!(instrs, "%res = shufflevector $vtyp1 %0, $vtyp2 $v2, $vtyp3 $mask") - push!(instrs, "ret $vtypr %res") - return M, [], instrs -end - -export shufflevector -@generated function shufflevector(v1::Vec{N,T}, v2::Vec{N,T}, - ::Val{I}) where {N,T,I} - M, decls, instrs = shufflevector_instrs(N, T, I, true) - quote - $(Expr(:meta, :inline)) - Vec{$M,T}(Base.llvmcall($((join(decls, "\n"), join(instrs, "\n"))), - NTuple{$M,VE{T}}, - Tuple{NTuple{N,VE{T}}, NTuple{N,VE{T}}}, - v1.elts, v2.elts)) - end -end -@inline function shufflevector(v1::Vec{N,T}, v2::Vec{N,T}, - ::Type{Val{I}}) where {N,T,I} - shufflevector(v1, v2, Val(I)) -end - -@generated function shufflevector(v1::Vec{N,T}, ::Val{I}) where {N,T,I} - M, decls, instrs = shufflevector_instrs(N, T, I, false) - quote - $(Expr(:meta, :inline)) - Vec{$M,T}(Base.llvmcall($((join(decls, "\n"), join(instrs, "\n"))), - NTuple{$M,VE{T}}, - Tuple{NTuple{N,VE{T}}}, - v1.elts)) - end -end -@inline function shufflevector(v1::Vec{N,T}, ::Type{Val{I}}) where {N,T,I} - shufflevector(v1, Val(I)) -end - -export VecRange - -""" - VecRange{N}(i::Int) - -Analogous to `UnitRange` but for loading SIMD vector of width `N` at -index `i`. 
- -# Examples -```jldoctest -julia> xs = ones(4); - -julia> xs[VecRange{4}(1)] # calls `vload(Vec{4,Float64}, xs, 1)` -<4 x Float64>[1.0, 1.0, 1.0, 1.0] -``` -""" -struct VecRange{N} - i::Int -end - -@inline Base.length(idx::VecRange{N}) where {N} = N -@inline Base.first(idx::VecRange) = idx.i -@inline Base.last(idx::VecRange) = idx.i + length(idx) - 1 - -@inline Base.:+(idx::VecRange{N}, j::Integer) where N = VecRange{N}(idx.i + j) -@inline Base.:+(j::Integer, idx::VecRange{N}) where N = VecRange{N}(idx.i + j) -@inline Base.:-(idx::VecRange{N}, j::Integer) where N = VecRange{N}(idx.i - j) - -Base.checkindex(::Type{Bool}, inds::AbstractUnitRange, idx::VecRange) = - (first(inds) <= first(idx)) && (last(idx) <= last(inds)) - -Base.checkindex(::Type{Bool}, inds::AbstractUnitRange, idx::Vec) = - all(first(inds) <= idx) && all(idx <= last(inds)) - -@inline _checkarity(::AbstractArray{<:Any,N}, ::Vararg{<:Any,N}) where {N} = - nothing - -@inline _checkarity(::T, ::Any) where {T <: AbstractArray} = - if IndexStyle(T) isa IndexLinear - nothing - else - throw(ArgumentError(""" - Array type $T does not support indexing with a single index. - Exactly $(ndims(T)) (non-mask) indices have to be specified. - """)) - end - -_checkarity(::AbstractArray{<:Any,N}, ::Vararg{<:Any,M}) where {N,M} = - throw(ArgumentError(""" - $M indices are given to $N-dimensional array. - Exactly $N (non-mask) indices have to be specified when using SIMD. - """)) - -# Combined with `_preprocessindices`, helper function `_extractmask` -# extracts `mask` in the tail position. As slicing tuple is not -# type-stable, we use reverse-of-tail-of-reverse hack to extract -# `mask` at the end of `args`. -@inline _extractmask(mask::Vec{N,Bool}, R::Vararg{Integer}) where N = - (reverse(R), mask) -@inline _extractmask(R::Vararg{Integer}) = (reverse(R), nothing) -@inline _extractmask(mask::Vec{N,Bool}) where {N} = ((), mask) -@inline _extractmask() = ((), nothing) - -@noinline _extractmask(rargs...) = - throw(ArgumentError(""" - Using SIMD indexing `array[idx, i2, ..., iN, mask]` for `N`-dimensional - array requires `i2` to `iN` to be all integers and `mask` to be optionally - a SIMD vector `Vec` of `Bool`s. Given `(i2, ..., iN, mask)` is - $(summary(reverse(rargs))) - """)) - -_maskedidx(idx, ::Nothing, ::Any) = idx -_maskedidx(idx::Vec, mask::Vec, fst) = vifelse(mask, idx, fst) -_maskedidx(idx::VecRange, mask::Vec, fst) = - _maskedidx(Vec(ntuple(i -> i - 1 + idx.i, length(mask))), mask, fst) - -Base.@propagate_inbounds function _preprocessindices(arr, idx, args) - I, mask = _extractmask(reverse(args)...) - _checkarity(arr, idx, I...) - @boundscheck checkbounds(arr, - _maskedidx(idx, mask, first(axes(arr, 1))), - I...) - return I, mask -end - -""" - _pointer(arr, i, I) - -Pointer to the element `arr[i, I...]`. 
-""" -Base.@propagate_inbounds _pointer(arr::Array, i, I) = - pointer(arr, LinearIndices(arr)[i, I...]) -Base.@propagate_inbounds _pointer(arr::Base.FastContiguousSubArray, i, I) = - pointer(arr, (i, I...)) -Base.@propagate_inbounds _pointer(arr::SubArray, i, I) = - pointer(Base.unsafe_view(arr, 1, I...), i) - -Base.@propagate_inbounds function Base.getindex( - arr::ContiguousArray{T}, idx::VecRange{N}, - args::Vararg{Union{Integer,Vec{N,Bool}}}) where {N,T} - I, mask = _preprocessindices(arr, idx, args) - return vload(Vec{N,T}, _pointer(arr, idx.i, I), mask) -end - -Base.@propagate_inbounds function Base.setindex!( - arr::ContiguousArray{T}, v::Vec{N,T}, idx::VecRange{N}, - args::Vararg{Union{Integer,Vec{N,Bool}}}) where {N,T} - I, mask = _preprocessindices(arr, idx, args) - vstore(v, _pointer(arr, idx.i, I), mask) - return arr -end - -Base.@propagate_inbounds function Base.getindex( - arr::ContiguousArray{T}, idx::Vec{N,<:Integer}, - args::Vararg{Union{Integer,Vec{N,Bool}}}) where {N,T} - I, mask = _preprocessindices(arr, idx, args) - ptrs = _pointer(arr, 1, I) - sizeof(T) + sizeof(T) * idx - return vgather(Vec{N,T}, ptrs, mask) -end - -Base.@propagate_inbounds function Base.setindex!( - arr::ContiguousArray{T}, v::Vec{N,T}, idx::Vec{N,<:Integer}, - args::Vararg{Union{Integer,Vec{N,Bool}}}) where {N,T} - I, mask = _preprocessindices(arr, idx, args) - ptrs = _pointer(arr, 1, I) - sizeof(T) + sizeof(T) * idx - vscatter(v, ptrs, mask) - return arr -end +using Base: @propagate_inbounds + +export Vec, vload, vloada, vloadnt, vstore, vstorea, vstorent, vgather, vgathera, + vscatter, vscattera, shufflevector, vifelse, valloc, VecRange + +const VE = Base.VecElement +const LVec{N, T} = NTuple{N, VE{T}} + +const IntTypes = Union{Int8, Int16, Int32, Int64} # Int128 and UInt128 does not get passed as LLVM vectors +const BIntTypes = Union{IntTypes, Bool} +const UIntTypes = Union{UInt8, UInt16, UInt32, UInt64} +const IntegerTypes = Union{IntTypes, UIntTypes} +const BIntegerTypes = Union{IntegerTypes, Bool} +const FloatingTypes = Union{Float32, Float64} # Float16 support is non-native in Julia and gets passed as an i16 +const ScalarTypes = Union{IntegerTypes, FloatingTypes} +const VecTypes = Union{ScalarTypes, Ptr, Bool} + +include("LLVM_intrinsics.jl") +include("simdvec.jl") +include("arrayops.jl") end diff --git a/src/arrayops.jl b/src/arrayops.jl new file mode 100644 index 0000000..4e2ccfa --- /dev/null +++ b/src/arrayops.jl @@ -0,0 +1,285 @@ +using Base: Slice, ScalarIndex + +""" + ContiguousSubArray{T,N,P,I,L} + +Like `Base.FastContiguousSubArray` but without requirement for linear +indexing (i.e., type parameter `L` can be `false`). + +# Examples + +``` +julia> A = view(ones(5, 5), :, [1,3]); + +julia> A isa Base.FastContiguousSubArray +false + +julia> A isa SIMD.ContiguousSubArray +true +``` +""" +ContiguousSubArray{T,N,P, + I<:Union{Tuple{Union{Slice, AbstractUnitRange}, Vararg{Any}}, + Tuple{Vararg{ScalarIndex}}}, + L} = SubArray{T,N,P,I,L} + +""" + ContiguousArray{T,N} + +Array types with contiguous first dimension. +""" +ContiguousArray{T,N} = Union{DenseArray{T,N}, ContiguousSubArray{T,N}} + +""" + FastContiguousArray{T,N} + +This is the type of arrays that `pointer(A, i)` works. 
+""" +FastContiguousArray{T,N} = Union{DenseArray{T,N}, Base.FastContiguousSubArray{T,N}} +# https://github.com/eschnett/SIMD.jl/pull/40#discussion_r254131184 +# https://github.com/JuliaArrays/MappedArrays.jl/pull/24#issuecomment-460568978 + +# vload +@propagate_inbounds function vload(::Type{Vec{N, T}}, ptr::Ptr{T}, mask::Union{Nothing, Vec{N, Bool}}=nothing, + ::Val{Aligned}=Val(false), ::Val{Nontemporal}=Val(false)) where {N, T, Aligned, Nontemporal} + if mask === nothing + Vec(Intrinsics.load(Intrinsics.LVec{N, T}, ptr, Val(Aligned), Val(Nontemporal))) + else + Vec(Intrinsics.maskedload(ptr, mask.data, Val(Aligned), Val(Nontemporal))) + end +end + +@propagate_inbounds function vload(::Type{Vec{N, T}}, a::FastContiguousArray{T,1}, i::Integer, mask=nothing, + ::Val{Aligned}=Val(false), ::Val{Nontemporal}=Val(false)) where {N, T, Aligned, Nontemporal} + @boundscheck checkbounds(a, i:(i+N-1)) + GC.@preserve a begin + ptr = pointer(a, i) + vload(Vec{N, T}, ptr, mask, Val(Aligned), Val(Nontemporal)) + end +end +@propagate_inbounds vloada(::Type{T}, a, i, mask=nothing) where {T<:Vec} = vload(T, a, i, mask, Val(true)) +@propagate_inbounds vloadnt(::Type{T}, a, i, mask=nothing) where {T<:Vec} = vload(T, a, i, mask, Val(true), Val(true)) + +# vstore +@propagate_inbounds function vstore(x::Vec{N, T}, ptr::Ptr{T}, mask::Union{Nothing, Vec{N, Bool}}=nothing, + ::Val{Aligned}=Val(false), ::Val{Nontemporal}=Val(false)) where {N, T, Aligned, Nontemporal} + if mask === nothing + Intrinsics.store(x.data, ptr, Val(Aligned), Val(Nontemporal)) + else + Intrinsics.maskedstore(x.data, ptr, mask.data, Val(Aligned), Val(Nontemporal)) + end +end +@propagate_inbounds function vstore(x::Vec{N, T}, a::FastContiguousArray{T,1}, i::Integer, mask=nothing, + ::Val{Aligned}=Val(false), ::Val{Nontemporal}=Val(false)) where {N, T, Aligned, Nontemporal} + @boundscheck checkbounds(a, i:(i+N-1)) + GC.@preserve a begin + ptr = pointer(a, i) + vstore(x, ptr, mask, Val(Aligned), Val(Nontemporal)) + end + return a +end +@propagate_inbounds vstorea(x::Vec, a, i, mask=nothing) = vstore(x, a, i, nothing, Val(true)) +@propagate_inbounds vstorent(x::Vec, a, i, mask=nothing) = vstore(x, a, i, nothing, Val(true), Val(true)) + +function valloc(::Type{T}, N::Int, sz::Int) where T + @assert N > 0 + @assert sz >= 0 + # We use padding to align the address of the first element, and + # also to ensure that we can access past the last element up to + # the next full vector width + padding = N-1 + mod(-sz, N) + mem = Vector{T}(undef, sz + padding) + addr = Int(pointer(mem)) + off = mod(-addr, N * sizeof(T)) + @assert mod(off, sizeof(T)) == 0 + off = fld(off, sizeof(T)) + @assert 0 <= off <= padding + res = view(mem, off+1 : off+sz) + addr2 = Int(pointer(res)) + @assert mod(addr2, N * sizeof(T)) == 0 + res +end + +function valloc(f, ::Type{T}, N::Int, sz::Int) where T + mem = valloc(T, N, sz) + @inbounds for i in 1:sz + mem[i] = f(i) + end + mem +end + +@inline function _get_vec_pointers(a, idx::Vec{N, Int}) where {N} + ptrs = pointer(a) + (idx - 1) * sizeof(eltype(a)) +end + +# Have to be careful with optional arguments and @boundscheck, +# see https://github.com/JuliaLang/julia/issues/30411, +# therefore use @propagate_inbounds +@inline vgather(ptrs::Vec{N,Ptr{T}}, + mask::Vec{N,Bool}=one(Vec{N,Bool}), + ::Val{Aligned}=Val(false)) where {N, T<:ScalarTypes, Aligned} = + return Vec(Intrinsics.maskedgather(ptrs.data, mask.data)) +@propagate_inbounds function vgather(a::FastContiguousArray{T,1}, idx::Vec{N, Int}, + 
mask::Vec{N,Bool}=one(Vec{N,Bool}), + ::Val{Aligned}=Val(false)) where {N, T<:ScalarTypes, Aligned} + @boundscheck for i in 1:N + checkbounds(a, @inbounds idx[i]) + end + GC.@preserve a begin + ptrs = _get_vec_pointers(a, idx) + return vgather(ptrs, mask, Val(Aligned)) + end +end +@propagate_inbounds vgathera(a, idx, mask) = vgather(a, idx, mask, Val(true)) +@propagate_inbounds vgathera(a, idx::Vec{N}) where {N} = vgather(a, idx, one(Vec{N,Bool}), Val(true)) + +@propagate_inbounds Base.getindex(a::FastContiguousArray{T,1}, idx::Vec{N,Int}) where {N,T} = + vgather(a, idx) + + +@propagate_inbounds vscatter(x::Vec{N,T}, ptrs::Vec{N,Ptr{T}}, + mask::Vec{N,Bool}, ::Val{Aligned}=Val(false)) where {N, T<:ScalarTypes, Aligned} = + Intrinsics.maskedscatter(x.data, ptrs.data, mask.data) +@propagate_inbounds function vscatter(x::Vec{N,T}, a::FastContiguousArray{T,1}, idx::Vec{N, Int}, + mask::Vec{N,Bool}=one(Vec{N, Bool}), + ::Val{Aligned}=Val(false)) where {N, T<:ScalarTypes, Aligned} + @boundscheck for i in 1:N + checkbounds(a, @inbounds idx[i]) + end + GC.@preserve a begin + ptrs = _get_vec_pointers(a, idx) + vscatter(x, ptrs, mask, Val(Aligned)) + end + return +end +@propagate_inbounds vscattera(x, a, idx, mask) = vscatter(x, a, idx, mask, Val(true)) +@propagate_inbounds vscattera(x, a, idx::Vec{N}) where {N} = vscatter(x, a, idx, one(Vec{N,Bool}), Val(true)) + +@propagate_inbounds Base.setindex!(a::FastContiguousArray{T,1}, v::Vec{N,T}, idx::Vec{N,Int}) where {N, T} = + vscatter(v, a, idx) + + +export VecRange + +""" + VecRange{N}(i::Int) +Analogous to `UnitRange` but for loading SIMD vector of width `N` at +index `i`. +# Examples +```jldoctest +julia> xs = ones(4); +julia> xs[VecRange{4}(1)] # calls `vload(Vec{4,Float64}, xs, 1)` +<4 x Float64>[1.0, 1.0, 1.0, 1.0] +``` +""" +struct VecRange{N} + i::Int +end + +@inline Base.length(idx::VecRange{N}) where {N} = N +@inline Base.first(idx::VecRange) = idx.i +@inline Base.last(idx::VecRange) = idx.i + length(idx) - 1 + +@inline Base.:+(idx::VecRange{N}, j::Integer) where N = VecRange{N}(idx.i + j) +@inline Base.:+(j::Integer, idx::VecRange{N}) where N = VecRange{N}(idx.i + j) +@inline Base.:-(idx::VecRange{N}, j::Integer) where N = VecRange{N}(idx.i - j) + +Base.checkindex(::Type{Bool}, inds::AbstractUnitRange, idx::VecRange) = + (first(inds) <= first(idx)) && (last(idx) <= last(inds)) + +Base.checkindex(::Type{Bool}, inds::AbstractUnitRange, idx::Vec) = + all(first(inds) <= idx) && all(idx <= last(inds)) + +@inline _checkarity(::AbstractArray{<:Any,N}, ::Vararg{<:Any,N}) where {N} = + nothing + +@inline _checkarity(::T, ::Any) where {T <: AbstractArray} = + if IndexStyle(T) isa IndexLinear + nothing + else + throw(ArgumentError(""" + Array type $T does not support indexing with a single index. + Exactly $(ndims(T)) (non-mask) indices have to be specified. + """)) + end + +_checkarity(::AbstractArray{<:Any,N}, ::Vararg{<:Any,M}) where {N,M} = + throw(ArgumentError(""" + $M indices are given to $N-dimensional array. + Exactly $N (non-mask) indices have to be specified when using SIMD. + """)) + +# Combined with `_preprocessindices`, helper function `_extractmask` +# extracts `mask` in the tail position. As slicing tuple is not +# type-stable, we use reverse-of-tail-of-reverse hack to extract +# `mask` at the end of `args`. 
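Concretely, these helpers are what let the `getindex`/`setindex!` methods defined below accept a trailing mask after the ordinary scalar indices. A rough sketch of the calling convention (hypothetical array and mask values, chosen only for illustration):

```julia
using SIMD

A = reshape(collect(Float64, 1:16), 4, 4)   # 4×4, column-major
m = Vec{4,Bool}((true, true, false, true))

A[VecRange{4}(1), 2]                          # SIMD load of A[1:4, 2]
A[VecRange{4}(1), 2, m]                       # masked load: lane 3 is never read
A[VecRange{4}(1), 2, m] = Vec{4,Float64}(0)   # masked store: A[3, 2] stays untouched
```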
+@inline _extractmask(mask::Vec{N,Bool}, R::Vararg{Integer}) where N = + (reverse(R), mask) +@inline _extractmask(R::Vararg{Integer}) = (reverse(R), nothing) +@inline _extractmask(mask::Vec{N,Bool}) where {N} = ((), mask) +@inline _extractmask() = ((), nothing) + +@noinline _extractmask(rargs...) = + throw(ArgumentError(""" + Using SIMD indexing `array[idx, i2, ..., iN, mask]` for `N`-dimensional + array requires `i2` to `iN` to be all integers and `mask` to be optionally + a SIMD vector `Vec` of `Bool`s. Given `(i2, ..., iN, mask)` is + $(summary(reverse(rargs))) + """)) + +_maskedidx(idx, ::Nothing, ::Any) = idx +_maskedidx(idx::Vec, mask::Vec, fst) = vifelse(mask, idx, fst) +_maskedidx(idx::VecRange, mask::Vec, fst) = + _maskedidx(Vec(ntuple(i -> i - 1 + idx.i, length(mask))), mask, fst) + +Base.@propagate_inbounds function _preprocessindices(arr, idx, args) + I, mask = _extractmask(reverse(args)...) + _checkarity(arr, idx, I...) + @boundscheck checkbounds(arr, + _maskedidx(idx, mask, first(axes(arr, 1))), + I...) + return I, mask +end + +""" + _pointer(arr, i, I) +Pointer to the element `arr[i, I...]`. +""" +Base.@propagate_inbounds _pointer(arr::Array, i, I) = + pointer(arr, LinearIndices(arr)[i, I...]) +Base.@propagate_inbounds _pointer(arr::Base.FastContiguousSubArray, i, I) = + pointer(arr, (i, I...)) +Base.@propagate_inbounds _pointer(arr::SubArray, i, I) = + pointer(Base.unsafe_view(arr, 1, I...), i) + +Base.@propagate_inbounds function Base.getindex( + arr::ContiguousArray{T}, idx::VecRange{N}, + args::Vararg{Union{Integer,Vec{N,Bool}}}) where {N,T} + I, mask = _preprocessindices(arr, idx, args) + return vload(Vec{N,T}, _pointer(arr, idx.i, I), mask) +end + +Base.@propagate_inbounds function Base.setindex!( + arr::ContiguousArray{T}, v::Vec{N,T}, idx::VecRange{N}, + args::Vararg{Union{Integer,Vec{N,Bool}}}) where {N,T} + I, mask = _preprocessindices(arr, idx, args) + vstore(v, _pointer(arr, idx.i, I), mask) + return arr +end + +Base.@propagate_inbounds function Base.getindex( + arr::ContiguousArray{T}, idx::Vec{N,<:Integer}, + args::Vararg{Union{Integer,Vec{N,Bool}}}) where {N,T} + I, mask = _preprocessindices(arr, idx, args) + ptrs = _pointer(arr, 1, I) - sizeof(T) + sizeof(T) * idx + return vgather(ptrs, mask) +end + +Base.@propagate_inbounds function Base.setindex!( + arr::ContiguousArray{T}, v::Vec{N,T}, idx::Vec{N,<:Integer}, + args::Vararg{Union{Integer,Vec{N,Bool}}}) where {N,T} + I, mask = _preprocessindices(arr, idx, args) + ptrs = _pointer(arr, 1, I) - sizeof(T) + sizeof(T) * idx + vscatter(v, ptrs, mask) + return arr +end diff --git a/src/simdvec.jl b/src/simdvec.jl new file mode 100644 index 0000000..a47ef3c --- /dev/null +++ b/src/simdvec.jl @@ -0,0 +1,479 @@ +struct Vec{N, T<:VecTypes} + data::LVec{N, T} +end + +# Constructors +@inline Vec(v::NTuple{N, T}) where {N, T<:VecTypes} = Vec(VE.(v)) +@inline Vec(v::Vararg{T, N}) where {N, T<:VecTypes} = Vec(v) +@inline Vec(v::Vec) = v +# Numbers defines this and it is needed in power_by_squaring... 
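To make the comment above concrete: integer-element `Vec`s have no dedicated `^` method, so a non-literal power dispatches to Base's generic `power_by_squaring` fallback, and that fallback returns `copy(x)` when the exponent is one, which is why the one-line `copy` definition follows. A hypothetical session (`Int` is `Int64` on a 64-bit machine):

```julia
julia> v = Vec{4,Int}((1, 2, 3, 4));

julia> n = 5;   # non-literal exponent, so this goes through Base.power_by_squaring

julia> v^n
<4 x Int64>[1, 32, 243, 1024]
```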
+Base.copy(v::Vec) = v + +# No throwing versions of convert +@inline _unsafe_convert(::Type{T}, v) where {T <: IntegerTypes} = v % T +@inline _unsafe_convert(::Type{T}, v) where {T <: VecTypes} = convert(T, v) +@inline constantvector(v::T1, ::Type{Vec{N, T2}}) where {N, T1, T2} = + Vec(Intrinsics.constantvector(_unsafe_convert(T2, v), Intrinsics.LVec{N, T2})) + +@inline Vec{N, T}(v::Vec{N, T}) where {N, T<:VecTypes} = v +@inline Vec{N, T}(v::Vec{N, T}) where {N, T<:FloatingTypes} = v +@inline Vec{N, T1}(v::T2) where {N, T1<:VecTypes, T2<:VecTypes} = constantvector(v, Vec{N, T1}) +@inline Vec{N, T1}(v::Vec{N, T2}) where {N, T1<:Union{IntegerTypes, Ptr}, T2<:Union{IntegerTypes, Ptr}} = + convert(Vec{N, T1}, v) + +@inline Base.convert(::Type{Vec{N,T}}, v::Vec{N,T}) where {N,T} = v +@inline function Base.convert(::Type{Vec{N, T1}}, v::Vec{N, T2}) where {T1, T2, N} + if T1 <: Union{IntegerTypes, Ptr} + if T2 <: Union{IntegerTypes, Ptr, Bool} + if sizeof(T1) < sizeof(T2) + return Vec(Intrinsics.trunc(Intrinsics.LVec{N, T1}, v.data)) + elseif sizeof(T1) == sizeof(T2) + return Vec(Intrinsics.bitcast(Intrinsics.LVec{N, T1}, v.data)) + else + if T2 <: UIntTypes + return Vec(Intrinsics.zext(Intrinsics.LVec{N, T1}, v.data)) + else + return Vec(Intrinsics.sext(Intrinsics.LVec{N, T1}, v.data)) + end + end + elseif T2 <: FloatingTypes + if T1 <: UIntTypes + return Vec(Intrinsics.fptoui(Intrinsics.LVec{N, T1}, v.data)) + elseif T1 <: IntTypes + return Vec(Intrinsics.fptosi(Intrinsics.LVec{N, T1}, v.data)) + end + end + end + if T1 <: FloatingTypes + if T2 <: UIntTypes + return Vec(Intrinsics.uitofp(Intrinsics.LVec{N, T1}, v.data)) + elseif T2 <: IntTypes + return Vec(Intrinsics.sitofp(Intrinsics.LVec{N, T1}, v.data)) + elseif T2 <: FloatingTypes + if sizeof(T1) < sizeof(T2) + return Vec(Intrinsics.fptrunc(Intrinsics.LVec{N, T1}, v.data)) + else + return Vec(Intrinsics.fpext(Intrinsics.LVec{N, T1}, v.data)) + end + end + end + _unreachable() +end +@noinline _unreachable() = error("unreachable") + +Base.Tuple(v::Vec) = map(i -> i.value, v.data) +Base.NTuple{N, T}(v::Vec{N}) where {T, N} = map(i -> convert(T, i.value), v.data) + +Base.eltype(::Type{Vec{N,T}}) where {N,T} = T +Base.ndims( ::Type{Vec{N,T}}) where {N,T} = 1 +Base.length(::Type{Vec{N,T}}) where {N,T} = N +Base.size( ::Type{Vec{N,T}}) where {N,T} = (N,) +Base.size( ::Type{Vec{N,T}}, n::Integer) where {N,T} = n > N ? 
1 : (N,)[n] + +Base.eltype(V::Vec) = eltype(typeof(V)) +Base.ndims(V::Vec) = ndims(typeof(V)) +Base.length(V::Vec) = length(typeof(V)) +Base.size(V::Vec) = size(typeof(V)) +Base.size(V::Vec, n::Integer) = size(typeof(V), n) + +if VERSION <= v"1.4.0-rc1.0" + function Base.show(io::IO, v::Vec{N,T}) where {N,T} + print(io, "<$N x $T>[") + join(io, [x.value for x in v.data], ", ") + print(io, "]") + end +else + # This crashes on pre 1.4-rc2 + function Base.show(io::IO, v::Vec{N,T}) where {N,T} + io = IOContext(io, :typeinfo => eltype(v)) + print(io, "<$N x $T>[") + join(io, [sprint(show, x.value; context=io) for x in v.data], ", ") + print(io, "]") + end +end + +@inline Base.checkbounds(v::Vec, i::IntegerTypes) = +(i < 1 || i > length(v.data)) && Base.throw_boundserror(v, i) + +function Base.getindex(v::Vec, i::IntegerTypes) + @boundscheck checkbounds(v, i) + return Intrinsics.extractelement(v.data, i-1) +end + +@inline function Base.setindex(v::Vec{N,T}, x, i::IntegerTypes) where {N,T} + @boundscheck checkbounds(v, i) + Vec(Intrinsics.insertelement(v.data, _unsafe_convert(T, x), i-1)) +end + +Base.zero(::Type{Vec{N,T}}) where {N, T} = Vec{N,T}(zero(T)) +Base.zero(::Vec{N,T}) where {N, T} = zero(Vec{N, T}) +Base.one(::Type{Vec{N,T}}) where {N, T} = Vec{N, T}(one(T)) +Base.one(::Vec{N,T}) where {N, T} = one(Vec{N, T}) + +Base.reinterpret(::Type{Vec{N, T}}, v::Vec) where {T, N} = Vec(Intrinsics.bitcast(Intrinsics.LVec{N, T}, v.data)) +Base.reinterpret(::Type{Vec{N, T}}, v::ScalarTypes) where {T, N} = Vec(Intrinsics.bitcast(Intrinsics.LVec{N, T}, v)) +Base.reinterpret(::Type{T}, v::Vec) where {T} = Intrinsics.bitcast(T, v.data) + +const FASTMATH = Intrinsics.FastMathFlags(Intrinsics.FastMath.fast) + +################### +# Unary operators # +################### + +const UNARY_OPS = [ + (:sqrt , FloatingTypes , Intrinsics.sqrt) , + (:sin , FloatingTypes , Intrinsics.sin) , + (:trunc , FloatingTypes , Intrinsics.trunc) , + (:cos , FloatingTypes , Intrinsics.cos) , + (:exp , FloatingTypes , Intrinsics.exp) , + (:exp2 , FloatingTypes , Intrinsics.exp2) , + (:log , FloatingTypes , Intrinsics.log) , + (:log10 , FloatingTypes , Intrinsics.log10) , + (:log2 , FloatingTypes , Intrinsics.log2) , + (:abs , FloatingTypes , Intrinsics.fabs) , + (:floor , FloatingTypes , Intrinsics.floor) , + (:ceil , FloatingTypes , Intrinsics.ceil) , + # (:rint , FloatingTypes , Intrinsics) , + # (:nearbyint , FloatingTypes , Intrinsics) , + (:round , FloatingTypes , Intrinsics.round) , + + (:bswap , IntegerTypes , Intrinsics.bswap) , + (:count_ones , IntegerTypes , Intrinsics.ctpop) , + (:leading_zeros , IntegerTypes , Intrinsics.ctlz) , + (:trailing_zeros , IntegerTypes , Intrinsics.cttz) , +] + +if isdefined(Base, :bitreverse) + push!(UNARY_OPS, + (:bitreverse , IntegerTypes , Intrinsics.bitreverse) + ) +end + +for (op, constraint, llvmop) in UNARY_OPS + @eval @inline (Base.$op)(x::Vec{<:Any, <:$constraint}) = + Vec($(llvmop)(x.data)) +end + +Base.:+(v::Vec{<:Any, <:ScalarTypes}) = v +Base.:-(v::Vec{<:Any, <:IntegerTypes}) = zero(v) - v +Base.:-(v::Vec{<:Any, <:FloatingTypes}) = Vec(Intrinsics.fneg(v.data)) +Base.FastMath.sub_fast(v::Vec{<:Any, <:FloatingTypes}) = Vec(Intrinsics.fneg(v.data, FASTMATH)) +Base.:~(v::Vec{N, T}) where {N, T<:IntegerTypes} = Vec(Intrinsics.xor(v.data, Vec{N, T}(-1).data)) +Base.:~(v::Vec{N, Bool}) where {N} = Vec(Intrinsics.xor(v.data, Vec{N, Bool}(true).data)) +Base.abs(v::Vec{N, T}) where {N, T} = Vec(vifelse(v < zero(T), -v, v)) +Base.:!(v1::Vec{N,Bool}) where {N} = ~v1 +Base.inv(v::Vec{N, 
T}) where {N, T<:FloatingTypes} = one(T) / v + +_unsigned(::Type{Float32}) = UInt32 +_unsigned(::Type{Float64}) = UInt64 +function Base.issubnormal(x::Vec{N, T}) where {N, T<:FloatingTypes} + y = reinterpret(Vec{N, _unsigned(T)}, x) + (y & Base.exponent_mask(T) == 0) & (y & Base.significand_mask(T) != 0) +end + +@inline Base.signbit(x::Vec{N, <:IntegerTypes}) where {N} = x < 0 + +@inline Base.leading_ones(x::Vec{<:Any, <:IntegerTypes}) = leading_zeros(~(x)) +@inline Base.trailing_ones(x::Vec{<:Any, <:IntegerTypes}) = trailing_zeros(~(x)) +@inline Base.count_zeros(x::Vec{<:Any, <:IntegerTypes}) = count_ones(~(x)) + +@inline Base.isnan(v::Vec{<:Any, <:FloatingTypes}) = v != v +@inline Base.isfinite(v::Vec{<:Any, <:FloatingTypes}) = v - v == zero(v) +@inline Base.isinf(v::Vec{<:Any, <:FloatingTypes}) = !isnan(v) & !isfinite(v) +@inline Base.sign(v1::Vec{N,T}) where {N,T} = + vifelse(v1 == zero(Vec{N,T}), zero(Vec{N,T}), + vifelse(v1 < zero(Vec{N,T}), -one(Vec{N,T}), one(Vec{N,T}))) + +@inline Base.isnan(v::Vec{N, <:IntegerTypes}) where {N} = zero(Vec{N,Bool}) +@inline Base.isfinite(v::Vec{N, <:IntegerTypes}) where {N} = one(Vec{N, Bool}) +@inline Base.isinf(v::Vec{N, <:IntegerTypes}) where {N} = zero(Vec{N, Bool}) + + +#################### +# Binary operators # +#################### + +const BINARY_OPS = [ + (:(Base.:+) , IntegerTypes , Intrinsics.add) + (:(Base.:-) , IntegerTypes , Intrinsics.sub) + (:(Base.:*) , IntegerTypes , Intrinsics.mul) + (:(Base.div) , UIntTypes , Intrinsics.udiv) + (:(Base.div) , IntTypes , Intrinsics.sdiv) + (:(Base.rem) , UIntTypes , Intrinsics.urem) + (:(Base.rem) , IntTypes , Intrinsics.srem) + + (:(add_saturate) , IntTypes , Intrinsics.sadd_sat) + (:(add_saturate) , UIntTypes , Intrinsics.uadd_sat) + (:(sub_saturate) , IntTypes , Intrinsics.ssub_sat) + (:(sub_saturate) , UIntTypes , Intrinsics.usub_sat) + + (:(Base.:+) , FloatingTypes , Intrinsics.fadd) + (:(Base.:-) , FloatingTypes , Intrinsics.fsub) + (:(Base.:*) , FloatingTypes , Intrinsics.fmul) + (:(Base.:^) , FloatingTypes , Intrinsics.pow) + (:(Base.:/) , FloatingTypes , Intrinsics.fdiv) + (:(Base.rem) , FloatingTypes , Intrinsics.frem) + (:(Base.min) , FloatingTypes , Intrinsics.minnum) + (:(Base.max) , FloatingTypes , Intrinsics.maxnum) + (:(Base.copysign) , FloatingTypes , Intrinsics.copysign) + (:(Base.:~) , BIntegerTypes , Intrinsics.xor) + (:(Base.:&) , BIntegerTypes , Intrinsics.and) + (:(Base.:|) , BIntegerTypes , Intrinsics.or) + (:(Base.:⊻) , BIntegerTypes , Intrinsics.xor) + + (:(Base.:(==)) , BIntegerTypes , Intrinsics.icmp_eq) + (:(Base.:!=) , BIntegerTypes , Intrinsics.icmp_ne) + (:(Base.:>) , BIntTypes , Intrinsics.icmp_sgt) + (:(Base.:>=) , BIntTypes , Intrinsics.icmp_sge) + (:(Base.:<) , BIntTypes , Intrinsics.icmp_slt) + (:(Base.:<=) , BIntTypes , Intrinsics.icmp_sle) + (:(Base.:>) , UIntTypes , Intrinsics.icmp_ugt) + (:(Base.:>=) , UIntTypes , Intrinsics.icmp_uge) + (:(Base.:<) , UIntTypes , Intrinsics.icmp_ult) + (:(Base.:<=) , UIntTypes , Intrinsics.icmp_ule) + + (:(Base.:(==)) , FloatingTypes , Intrinsics.fcmp_oeq) + (:(Base.:!=) , FloatingTypes , Intrinsics.fcmp_une) + (:(Base.:>) , FloatingTypes , Intrinsics.fcmp_ogt) + (:(Base.:>=) , FloatingTypes , Intrinsics.fcmp_oge) + (:(Base.:<) , FloatingTypes , Intrinsics.fcmp_olt) + (:(Base.:<=) , FloatingTypes , Intrinsics.fcmp_ole) +] + +function get_fastmath_function(op) + if op isa Expr && op.head == Symbol(".") && op.args[1] == :Base && + op.args[2].value in keys(Base.FastMath.fast_op) + return 
:(Base.FastMath.$(Base.FastMath.fast_op[op.args[2].value])) + end + return nothing +end + +for (op, constraint, llvmop) in BINARY_OPS + @eval @inline function $op(x::Vec{N, T}, y::Vec{N, T}) where {N, T <: $constraint} + Vec($(llvmop)(x.data, y.data)) + end + + # Add a fast math version if applicable + if (fast_op = get_fastmath_function(op)) !== nothing + @eval @inline function $(fast_op)(x::Vec{N, T}, y::Vec{N, T}) where {N, T <: $constraint} + Vec($(llvmop)(x.data, y.data, FASTMATH)) + end + end +end + +# overflow +const OVERFLOW_INTRINSICS = [ + (:(Base.Checked.add_with_overflow) , IntTypes , Intrinsics.sadd_with_overflow) + (:(Base.Checked.add_with_overflow) , UIntTypes , Intrinsics.uadd_with_overflow) + (:(Base.Checked.sub_with_overflow) , IntTypes , Intrinsics.ssub_with_overflow) + (:(Base.Checked.sub_with_overflow) , UIntTypes , Intrinsics.usub_with_overflow) + (:(Base.Checked.mul_with_overflow) , IntTypes , Intrinsics.smul_with_overflow) + (:(Base.Checked.mul_with_overflow) , UIntTypes , Intrinsics.umul_with_overflow) +] +for (op, constraint, llvmop) in OVERFLOW_INTRINSICS + @eval @inline function $op(x::Vec{N, T}, y::Vec{N, T}) where {N, T <: $constraint} + val, overflows = $(llvmop)(x.data, y.data) + return Vec(val), Vec(overflows) + end +end + +# max min +@inline Base.max(v1::Vec{N,T}, v2::Vec{N,T}) where {N,T<:IntegerTypes} = + Vec(vifelse(v1 >= v2, v1, v2)) +@inline Base.min(v1::Vec{N,T}, v2::Vec{N,T}) where {N,T<:IntegerTypes} = + Vec(vifelse(v1 >= v2, v2, v1)) + +# Pow +@inline Base.:^(x::Vec{N,T}, y::IntegerTypes) where {N,T<:FloatingTypes} = + Vec(Intrinsics.powi(x.data, y)) +# Do what Base does for HWNumber: +@inline Base.literal_pow(::typeof(^), x::Vec, ::Val{0}) = one(typeof(x)) +@inline Base.literal_pow(::typeof(^), x::Vec, ::Val{1}) = x +@inline Base.literal_pow(::typeof(^), x::Vec, ::Val{2}) = x*x +@inline Base.literal_pow(::typeof(^), x::Vec, ::Val{3}) = x*x*x + +# Sign +@inline Base.flipsign(v1::Vec{N,T}, v2::Vec{N,T}) where {N,T} = + vifelse(signbit(v2), -v1, v1) +@inline Base.copysign(v1::Vec{N,T}, v2::Vec{N,T}) where {N,T<:IntTypes} = + vifelse(signbit(v2), -abs(v1), abs(v1)) +_signed(::Type{Float32}) = Int32 +_signed(::Type{Float64}) = Int64 +@inline Base.signbit(x::Vec{N, T}) where {N, T <:FloatingTypes} = + signbit(reinterpret(Vec{N, _signed(T)}, x)) + +# Pointer arithmetic +for op in (:+, :-) + @eval begin + # Cast pointer to Int and back + @inline Base.$op(x::Vec{N,Ptr{T}}, y::Vec{N,Ptr{T}}) where {N,T} = + convert(Vec{N, Ptr{T}}, ($(op)(convert(Vec{N, Int}, x), convert(Vec{N, Int}, y)))) + @inline Base.$op(x::Vec{N,Ptr{T}}, y::Union{IntegerTypes}) where {N,T} = $(op)(x, Vec{N,Ptr{T}}(y)) + @inline Base.$op(x::IntegerTypes, y::Union{Vec{N,Ptr{T}}}) where {N,T} = $(op)(y, x) + + @inline Base.$op(x::Vec{N,<:IntegerTypes}, y::Ptr{T}) where {N,T} = $(op)(Vec{N,Ptr{T}}(x), Vec{N,Ptr{T}}(y)) + @inline Base.$op(x::Ptr{T}, y::Vec{N,<:IntegerTypes}) where {N,T} = $(op)(y, x) + end +end + +# Bitshifts +# See https://github.com/JuliaLang/julia/blob/7426625b5c07b0d93110293246089a259a0a677d/src/intrinsics.cpp#L1179-L1196 +# Shifting with a value larger than the number of bits in the type is undefined behavior +# so set to zero in those cases. 
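To make the intended semantics concrete before the definitions below: an out-of-range count yields the defined limit value (sign-fill for arithmetic right shift, zero otherwise) rather than poison, and a negative count shifts in the opposite direction. A hypothetical session, with values chosen only for illustration:

```julia
julia> v = Vec{4,Int32}((1, -8, 255, 4));

julia> v << 1
<4 x Int32>[2, -16, 510, 8]

julia> v >> 40   # count ≥ bit width: arithmetic shift sign-fills
<4 x Int32>[0, -1, 0, 0]

julia> v << -1   # negative count shifts the other way
<4 x Int32>[0, -4, 127, 2]
```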
+@inline function shl_int(x::Vec{N, T1}, y::Vec{N, T2}) where {N, T1<:IntegerTypes, T2<:IntegerTypes}
+    vifelse(y >= sizeof(T1) * 8,
+            zero(Vec{N, T1}),
+            Vec(Intrinsics.shl(x.data, convert(Vec{N,T1}, y).data)))
+end
+
+@inline function lshr_int(x::Vec{N, T1}, y::Vec{N, T2}) where {N, T1<:IntegerTypes, T2<:IntegerTypes}
+    vifelse(y >= sizeof(T1) * 8,
+            zero(Vec{N, T1}),
+            Vec(Intrinsics.lshr(x.data, convert(Vec{N,T1}, y).data)))
+end
+
+@inline function ashr_int(x::Vec{N, T1}, y::Vec{N, T2}) where {N, T1<:IntegerTypes, T2<:IntegerTypes}
+    vifelse(y >= sizeof(T1) * 8,
+            Vec(Intrinsics.ashr(x.data, Vec{N,T1}(sizeof(T1)*8-1).data)),
+            Vec(Intrinsics.ashr(x.data, Vec{N,T1}(y).data)))
+end
+
+# See https://github.com/JuliaLang/julia/blob/a211abcdfacc05cb93c15774a59ce8961c16dac4/base/int.jl#L422-L435
+@inline Base.:>>(x::Vec{N, <:IntTypes}, y::Vec{N, <:UIntTypes}) where {N} =
+    ashr_int(x, y)
+@inline Base.:>>(x::Vec{N, T1}, y::Vec{N, T2}) where {N, T1<:UIntTypes, T2<:UIntTypes} =
+    lshr_int(x, y)
+@inline Base.:<<(x::Vec{N, T1}, y::Vec{N, T2}) where {N, T1<:IntegerTypes, T2<:UIntTypes} =
+    shl_int(x, y)
+@inline Base.:>>>(x::Vec{N, T1}, y::Vec{N, T2}) where {N, T1<:IntegerTypes, T2<:UIntTypes} =
+    lshr_int(x, y)
+
+@inline unsigned(v::Vec{<:Any, <:UIntTypes}) = v
+@inline unsigned(v::Vec{N, Int32}) where {N} = convert(Vec{N, UInt32}, v)
+@inline unsigned(v::Vec{N, Int64}) where {N} = convert(Vec{N, UInt64}, v)
+
+@inline Base.:>>(x::Vec{N, T1}, y::Vec{N, Int}) where {N, T1<:IntegerTypes} =
+    vifelse(0 <= y, x >> unsigned(y), x << unsigned(-y))
+@inline Base.:<<(x::Vec{N, T1}, y::Vec{N, Int}) where {N, T1<:IntegerTypes} =
+    vifelse(0 <= y, x << unsigned(y), x >> unsigned(-y))
+@inline Base.:>>>(x::Vec{N, T1}, y::Vec{N, Int}) where {N, T1<:IntegerTypes} =
+    vifelse(0 <= y, x >>> unsigned(y), x << unsigned(-y))
+
+for v in (:<<, :>>, :>>>)
+    @eval begin
+        @inline Base.$v(x::Vec{N,T}, y::ScalarTypes) where {N, T} = $v(x, Vec{N,T}(y))
+        @inline Base.$v(x::Vec{N,T}, y::T2) where {N, T<:IntegerTypes, T2<:UIntTypes} = $v(x, Vec{N,T2}(y))
+        @inline Base.$v(x::ScalarTypes, y::Vec{N,T}) where {N, T} = $v(Vec{N,T}(x), y)
+        @inline Base.$v(x::Vec{N,T1}, y::Vec{N,T2}) where {N, T1<:IntegerTypes, T2<:IntegerTypes} =
+            $v(x, convert(Vec{N, Int}, y))
+    end
+end
+
+
+# Vectorize binary functions
+for (op, constraint) in [BINARY_OPS;
+                         (:(Base.flipsign)                  , ScalarTypes)
+                         (:(Base.copysign)                  , ScalarTypes)
+                         (:(Base.signbit)                   , ScalarTypes)
+                         (:(Base.min)                       , IntegerTypes)
+                         (:(Base.max)                       , IntegerTypes)
+                         (:(Base.:<<)                       , IntegerTypes)
+                         (:(Base.:>>)                       , IntegerTypes)
+                         (:(Base.:>>>)                      , IntegerTypes)
+                         (:(Base.Checked.add_with_overflow) , IntTypes)
+                         (:(Base.Checked.add_with_overflow) , UIntTypes)
+                         (:(Base.Checked.sub_with_overflow) , IntTypes)
+                         (:(Base.Checked.sub_with_overflow) , UIntTypes)
+                         (:(Base.Checked.mul_with_overflow) , IntTypes)
+                         (:(Base.Checked.mul_with_overflow) , UIntTypes)
+                        ]
+    ops = [op]
+    if (fast_op = get_fastmath_function(op)) !== nothing
+        push!(ops, fast_op)
+    end
+    for op in ops
+        @eval @inline function $op(x::T2, y::Vec{N, T}) where {N, T2<:ScalarTypes, T <: $constraint}
+            $op(Vec{N, T}(x), y)
+        end
+        @eval @inline function $op(x::Vec{N, T}, y::T2) where {N, T2 <:ScalarTypes, T <: $constraint}
+            $op(x, Vec{N, T}(y))
+        end
+    end
+end
+
+#####################
+# Ternary operators #
+#####################
+
+@inline vifelse(v::Bool, v1::Vec{N, T}, v2::Vec{N, T}) where {N, T} = ifelse(v, v1, v2)
+@inline vifelse(v::Bool, v1::Vec{N, T}, v2::ScalarTypes) where {N, T} = ifelse(v, v1, Vec{N,T}(v2))
+@inline
+
+
+# Vectorize binary functions
+for (op, constraint) in [BINARY_OPS;
+        (:(Base.flipsign)       , ScalarTypes)
+        (:(Base.copysign)       , ScalarTypes)
+        (:(Base.signbit)        , ScalarTypes)
+        (:(Base.min)            , IntegerTypes)
+        (:(Base.max)            , IntegerTypes)
+        (:(Base.:<<)            , IntegerTypes)
+        (:(Base.:>>)            , IntegerTypes)
+        (:(Base.:>>>)           , IntegerTypes)
+        (:(Base.Checked.add_with_overflow) , IntTypes)
+        (:(Base.Checked.add_with_overflow) , UIntTypes)
+        (:(Base.Checked.sub_with_overflow) , IntTypes)
+        (:(Base.Checked.sub_with_overflow) , UIntTypes)
+        (:(Base.Checked.mul_with_overflow) , IntTypes)
+        (:(Base.Checked.mul_with_overflow) , UIntTypes)
+    ]
+    ops = [op]
+    if (fast_op = get_fastmath_function(op)) !== nothing
+        push!(ops, fast_op)
+    end
+    for op in ops
+        @eval @inline function $op(x::T2, y::Vec{N, T}) where {N, T2<:ScalarTypes, T<:$constraint}
+            $op(Vec{N, T}(x), y)
+        end
+        @eval @inline function $op(x::Vec{N, T}, y::T2) where {N, T2<:ScalarTypes, T<:$constraint}
+            $op(x, Vec{N, T}(y))
+        end
+    end
+end
+
+#####################
+# Ternary operators #
+#####################
+
+@inline vifelse(v::Bool, v1::Vec{N, T}, v2::Vec{N, T}) where {N, T} = ifelse(v, v1, v2)
+@inline vifelse(v::Bool, v1::Vec{N, T}, v2::ScalarTypes) where {N, T} = ifelse(v, v1, Vec{N,T}(v2))
+@inline vifelse(v::Bool, v1::ScalarTypes, v2::Vec{N, T}) where {N, T} = ifelse(v, Vec{N,T}(v1), v2)
+
+@inline vifelse(v::Bool, v1::T, v2::T) where {T} = ifelse(v, v1, v2)
+# With a vector of conditions, select element-wise via the LLVM `select` intrinsic.
+@inline vifelse(v::Vec{N, Bool}, v1::Vec{N, T}, v2::Vec{N, T}) where {N, T} =
+    Vec(Intrinsics.select(v.data, v1.data, v2.data))
+@inline vifelse(v::Vec{N, Bool}, v1::T2, v2::Vec{N, T}) where {N, T, T2<:ScalarTypes} = vifelse(v, Vec{N, T}(v1), v2)
+@inline vifelse(v::Vec{N, Bool}, v1::Vec{N, T}, v2::T2) where {N, T, T2<:ScalarTypes} = vifelse(v, v1, Vec{N, T}(v2))
+
+# fma, muladd and vectorization of these; scalar arguments are promoted to vectors.
+for (op, llvmop) in [(:fma, Intrinsics.fma), (:muladd, Intrinsics.fmuladd)]
+    @eval begin
+        @inline Base.$op(a::Vec{N, T}, b::Vec{N, T}, c::Vec{N, T}) where {N,T<:FloatingTypes} =
+            Vec($llvmop(a.data, b.data, c.data))
+        @inline Base.$op(s1::ScalarTypes, v2::Vec{N,T}, v3::Vec{N,T}) where {N,T<:FloatingTypes} =
+            $op(Vec{N,T}(s1), v2, v3)
+        @inline Base.$op(v1::Vec{N,T}, s2::ScalarTypes, v3::Vec{N,T}) where {N,T<:FloatingTypes} =
+            $op(v1, Vec{N,T}(s2), v3)
+        @inline Base.$op(s1::ScalarTypes, s2::ScalarTypes, v3::Vec{N,T}) where {N,T<:FloatingTypes} =
+            $op(Vec{N,T}(s1), Vec{N,T}(s2), v3)
+        @inline Base.$op(v1::Vec{N,T}, v2::Vec{N,T}, s3::ScalarTypes) where {N,T<:FloatingTypes} =
+            $op(v1, v2, Vec{N,T}(s3))
+        @inline Base.$op(s1::ScalarTypes, v2::Vec{N,T}, s3::ScalarTypes) where {N,T<:FloatingTypes} =
+            $op(Vec{N,T}(s1), v2, Vec{N,T}(s3))
+        @inline Base.$op(v1::Vec{N,T}, s2::ScalarTypes, s3::ScalarTypes) where {N,T<:FloatingTypes} =
+            $op(v1, Vec{N,T}(s2), Vec{N,T}(s3))
+    end
+end
+
+
+##############
+# Reductions #
+##############
+
+# (operation, allowed element types, LLVM horizontal-reduction intrinsic)
+const HORZ_REDUCTION_OPS = [
+    (&   , Union{IntegerTypes, Bool} , Intrinsics.reduce_and)
+    (|   , Union{IntegerTypes, Bool} , Intrinsics.reduce_or)
+    (max , IntTypes                  , Intrinsics.reduce_smax)
+    (max , UIntTypes                 , Intrinsics.reduce_umax)
+    (max , FloatingTypes             , Intrinsics.reduce_fmax)
+    (min , IntTypes                  , Intrinsics.reduce_smin)
+    (min , UIntTypes                 , Intrinsics.reduce_umin)
+    (min , FloatingTypes             , Intrinsics.reduce_fmin)
+    (+   , IntegerTypes              , Intrinsics.reduce_add)
+    (*   , IntegerTypes              , Intrinsics.reduce_mul)
+    (+   , FloatingTypes             , Intrinsics.reduce_fadd)
+    (*   , FloatingTypes             , Intrinsics.reduce_fmul)
+]
+
+for (op, constraint, llvmop) in HORZ_REDUCTION_OPS
+    @eval @inline Base.reduce(::typeof($op), x::Vec{<:Any, <:$constraint}) =
+        $(llvmop)(x.data)
+end
+Base.reduce(F::Any, v::Vec) = error("reduction with $F is not defined for SIMD.Vec")
+
+@inline Base.all(v::Vec{<:Any,Bool}) = reduce(&, v)
+@inline Base.any(v::Vec{<:Any,Bool}) = reduce(|, v)
+@inline Base.maximum(v::Vec) = reduce(max, v)
+@inline Base.minimum(v::Vec) = reduce(min, v)
+@inline Base.prod(v::Vec) = reduce(*, v)
+@inline Base.sum(v::Vec) = reduce(+, v)
+
+############
+# Shuffles #
+############
+
+@inline function shufflevector(x::Vec{N, T}, ::Val{I}) where {N, T, I}
+    Vec(Intrinsics.shufflevector(x.data, Val(I)))
+end
+@inline function shufflevector(x::Vec{N, T}, y::Vec{N, T}, ::Val{I}) where {N, T, I}
+    Vec(Intrinsics.shufflevector(x.data, y.data, Val(I)))
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index 47f9b2b..6c43cd6 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,6 +1,8 @@
 using SIMD
 using Test, InteractiveUtils
 
+using Base: setindex
+
 """
     llvm_ir(f, args) :: String
 
@@ -8,20 +10,22 @@ Get LLVM IR of `f(args...)` as a string.
 """
 llvm_ir(f, args) = sprint(code_llvm, f, Base.typesof(args...))
 
-@testset "SIMD" begin
+#@testset "SIMD" begin
+
+    # The vector we are testing.
+ global const nbytes = 32 - # The vector we are testing. Ideally, we should be able to use any vector size - # anywhere, but LLVM codegen bugs prevent us from doing so -- thus we make this - # a parameter. - global const nbytes = 32 + global const L8 = nbytes÷4 + global const L4 = nbytes÷8 - global const L8 = nbytes÷4 - global const L4 = nbytes÷8 + global const V8I32 = Vec{L8,Int32} + global const V8I64 = Vec{L8,Int64} + global const V4F64 = Vec{L4,Float64} - global const V8I32 = Vec{L8,Int32} - global const V4F64 = Vec{L4,Float64} + global const v8i32 = ntuple(i->Int32(ifelse(isodd(i), i, -i)), L8) + global const v8i64 = ntuple(i->Int64(ifelse(isodd(i), i, -i)), L8) + global const v4f64 = ntuple(i->Float64(ifelse(isodd(i), i, -i)), L4) - is_checking_bounds = Core.Compiler.inbounds_option() == :on + is_checking_bounds = Core.Compiler.inbounds_option() == :on @testset "Type properties" begin @test eltype(V8I32) === Int32 @@ -37,10 +41,6 @@ llvm_ir(f, args) = sprint(code_llvm, f, Base.typesof(args...)) end @testset "Type conversion" begin - - global const v8i32 = ntuple(i->Int32(ifelse(isodd(i), i, -i)), L8) - global const v4f64 = ntuple(i->Float64(ifelse(isodd(i), i, -i)), L4) - @test string(V8I32(v8i32)) == "<8 x Int32>[" * string(v8i32)[2:end-1] * "]" @test string(V4F64(v4f64)) == "<4 x Float64>[" * string(v4f64)[2:end-1] * "]" @@ -54,43 +54,32 @@ llvm_ir(f, args) = sprint(code_llvm, f, Base.typesof(args...)) @test Tuple(V4F64(v4f64)) === Tuple(v4f64) end + @testset "Conversion and reinterpretation" begin + v = V8I32(v8i32) + V4I64 = reinterpret(Vec{4, Int64}, v) + @test sum(count_ones(v)) == sum(count_ones(V4I64)) + @test sum(count_zeros(v)) == sum(count_zeros(V4I64)) + x = Int64(123456789) + @test reinterpret(Int64, reinterpret(Vec{4, Int16}, x)) == x + + @test all(Tuple(convert(Vec{8, Float64}, v)) .== Tuple(v)) + end + @testset "Element-wise access" begin for i in 1:L8 - @test Tuple(setindex(V8I32(v8i32), 9.0, Val(i))) === - ntuple(j->Int32(ifelse(j==i, 9, v8i32[j])), L8) - @test Tuple(setindex(V8I32(v8i32), 9.0, Val{i})) === - ntuple(j->Int32(ifelse(j==i, 9, v8i32[j])), L8) - @test Tuple(setindex(V8I32(v8i32), 9.0, i)) === - ntuple(j->Int32(ifelse(j==i, 9, v8i32[j])), L8) - - @test V8I32(v8i32)[Val{i}] === v8i32[i] @test V8I32(v8i32)[i] === v8i32[i] end - @test_throws BoundsError setindex(V8I32(v8i32), 0, Val(0)) - @test_throws BoundsError setindex(V8I32(v8i32), 0, Val{0}) - @test_throws BoundsError setindex(V8I32(v8i32), 0, Val(L8+1)) - @test_throws BoundsError setindex(V8I32(v8i32), 0, Val{L8+1}) @test_throws BoundsError setindex(V8I32(v8i32), 0, 0) @test_throws BoundsError setindex(V8I32(v8i32), 0, L8+1) - @test_throws BoundsError V8I32(v8i32)[Val(0)] - @test_throws BoundsError V8I32(v8i32)[Val{0}] - @test_throws BoundsError V8I32(v8i32)[Val(L8+1)] - @test_throws BoundsError V8I32(v8i32)[Val{L8+1}] @test_throws BoundsError V8I32(v8i32)[0] @test_throws BoundsError V8I32(v8i32)[L8+1] for i in 1:L4 - @test Tuple(setindex(V4F64(v4f64), 9, Val(i))) === - ntuple(j->Float64(ifelse(j==i, 9.0, v4f64[j])), L4) - @test Tuple(setindex(V4F64(v4f64), 9, Val{i})) === - ntuple(j->Float64(ifelse(j==i, 9.0, v4f64[j])), L4) @test Tuple(setindex(V4F64(v4f64), 9, i)) === ntuple(j->Float64(ifelse(j==i, 9.0, v4f64[j])), L4) - @test V4F64(v4f64)[Val(i)] === v4f64[i] - @test V4F64(v4f64)[Val{i}] === v4f64[i] @test V4F64(v4f64)[i] === v4f64[i] end @@ -108,8 +97,9 @@ llvm_ir(f, args) = sprint(code_llvm, f, Base.typesof(args...)) global const v8i32c = map(x->Int32(x*2), v8i32) notbool(x) = 
!(x>=typeof(x)(0)) - for op in (~, +, -, abs, notbool, sign, signbit) - @test Tuple(op(V8I32(v8i32))) === map(op, v8i32) + for op in (~, +, -, abs, notbool, sign, signbit, count_ones, count_zeros, + leading_ones, leading_zeros, trailing_ones, trailing_zeros) + @test Tuple(op(V8I32(v8i32))) == map(op, v8i32) end for op in ( @@ -125,13 +115,13 @@ llvm_ir(f, args) = sprint(code_llvm, f, Base.typesof(args...)) end for op in (<<, >>, >>>) - @test Tuple(op(V8I32(v8i32), Val(3))) === map(x->op(x,3), v8i32) - @test Tuple(op(V8I32(v8i32), Val{3})) === map(x->op(x,3), v8i32) - @test Tuple(op(V8I32(v8i32), Val(-3))) === map(x->op(x,-3), v8i32) - @test Tuple(op(V8I32(v8i32), Val{-3})) === map(x->op(x,-3), v8i32) - @test Tuple(op(V8I32(v8i32), 3)) === map(x->op(x,3), v8i32) - @test Tuple(op(V8I32(v8i32), -3)) === map(x->op(x,-3), v8i32) - @test Tuple(op(V8I32(v8i32), V8I32(v8i32))) === map(op, v8i32, v8i32) + for v in (V8I32(v8i32), V8I64(v8i64)) + for z in (3, UInt(3), Int32(10000), UInt8(4)) + @test Tuple(op(v, z)) === map(x->op(x,z), Tuple(v)) + @test Tuple(op(v, -z)) === map(x->op(x,-z), Tuple(v)) + @test Tuple(op(v, v)) === map(op, Tuple(v), Tuple(v)) + end + end end @test Tuple(V8I32(v8i32)^0) === v8i32.^0 @@ -140,6 +130,33 @@ llvm_ir(f, args) = sprint(code_llvm, f, Base.typesof(args...)) @test Tuple(V8I32(v8i32)^3) === v8i32.^3 end + @testset "saturation" begin + v = Vec{4, UInt8}(UInt8.((150, 250, 125, 0))) + @test SIMD.add_saturate(v, UInt8(50)) === Vec{4, UInt8}(UInt8.((200, 255, 175, 50))) + @test SIMD.sub_saturate(v, UInt8(100)) === Vec{4, UInt8}(UInt8.((50, 150, 25, 0))) + v = Vec{4, Int8}(Int8.((100, -100, 20, -20))) + @test SIMD.add_saturate(v, Int8(50)) === Vec{4, Int8}(Int8.((127, -50, 70, 30))) + @test SIMD.sub_saturate(v, Int8(50)) === Vec{4, Int8}(Int8.((50, -128, -30, -70))) + end + + using Base.Checked: add_with_overflow, sub_with_overflow, mul_with_overflow + if Base.libllvm_version >= v"9" + @testset "overflow arithmetic" begin + for f in (add_with_overflow, sub_with_overflow, mul_with_overflow) + for T in [Int8, Int16, Int32, Int64, UInt8, UInt16, UInt32, UInt64] + t2 = div(typemax(T), T(2)) + one(T) + t1 = div(typemin(T), T(2)) - (T <: Unsigned ? 
zero(T) : one(T))
+                    v = Vec(t2, t1, T(0), t2 - one(T))
+                    if f == mul_with_overflow && Sys.ARCH == :i686 && T == Int64
+                        # Not supported on 32-bit x86; should throw.
+                        @test_throws ErrorException f(v,v)
+                        continue
+                    end
+                    # Compare the vector result element-wise against the scalar results.
+                    @test Tuple(zip(Tuple.(f(v,v))...)) === map(f, Tuple(v), Tuple(v))
+                end
+            end
+        end
+    end
+
     @testset "Floating point arithmetic functions" begin
         global const v4f64b = map(x->Float64(x+1), v4f64)
 
@@ -160,7 +177,7 @@ llvm_ir(f, args) = sprint(code_llvm, f, Base.typesof(args...))
             length(t1)==length(t2) &&
             all(Bool[isapprox(t1[i], t2[i]) for i in 1:length(t1)])
         end
-        for op in (cos, exp, exp10, exp2, logabs, log10abs, log2abs, sin)
+        for op in (cos, exp, exp2, logabs, log10abs, log2abs, sin)
             rvec = Tuple(op(V4F64(v4f64)))
             rsca = map(op, v4f64)
             @test typeof(rvec) === typeof(rsca)
@@ -300,8 +317,13 @@ llvm_ir(f, args) = sprint(code_llvm, f, Base.typesof(args...))
         for op in (maximum, minimum, sum, prod)
             @test op(V8I32(v8i32)) === op(v8i32)
         end
-        @test all(V8I32(v8i32)) == reduce(&, v8i32)
-        @test any(V8I32(v8i32)) == reduce(|, v8i32)
+        t = Vec(true, true, true, true)
+        tf = Vec(true, false, true, false)
+        f = Vec(false, false, false, false)
+        @test all(t) == reduce(&, t) == true
+        @test all(tf) == reduce(&, tf) == false
+        @test any(f) == reduce(|, f) == false
+        @test any(tf) == reduce(|, tf) == true
 
         for op in (maximum, minimum, sum, prod)
             @test op(V4F64(v4f64)) === op(v4f64)
@@ -340,6 +362,17 @@ llvm_ir(f, args) = sprint(code_llvm, f, Base.typesof(args...))
         end
     end
 
+    @testset "fastmath" begin
+        v = Vec(1.0,2.0,3.0,4.0)
+        @test all(Tuple(@fastmath v+v) .≈ Tuple(v+v))
+        @test all(Tuple(@fastmath v+1.0) .≈ Tuple(v+1.0))
+        @test all(Tuple(@fastmath 1.0+v) .≈ Tuple(1.0+v))
+        @test all(Tuple(@fastmath -v) .≈ Tuple(-v))
+        f = v -> @fastmath v + v
+        ir = llvm_ir(f, (v,))
+        # Test that v+v is rewritten as v * 2.0 (change test if optimization changes)
+        @test occursin(r"fmul fast <4 x double> %[0-9]*, ", ir)
         @test occursin(" fadd <4 x double>", ir)
-        @test occursin(r"( shufflevector <4 x double>.*){2}"s, ir)
+        # @test occursin(r"( shufflevector <4 x double>.*){2}"s, ir)
     end
 
     function vadd_masked!(xs::AbstractArray{T,1}, ys::AbstractArray{T,1},
@@ -635,7 +668,7 @@ llvm_ir(f, args) = sprint(code_llvm, f, Base.typesof(args...))
         ir = llvm_ir(vsum_masked, (xs, V4F64))
         @test occursin("masked.load.v4f64", ir)
         @test occursin(" fadd <4 x double>", ir)
-        @test occursin(r"( shufflevector <4 x double>.*){2}"s, ir)
+        # @test occursin(r"( shufflevector <4 x double>.*){2}"s, ir)
     end
 end
@@ -667,24 +700,24 @@ llvm_ir(f, args) = sprint(code_llvm, f, Base.typesof(args...))
     for T in (Int8,UInt8,Int16,UInt16,Int32,UInt32,Int64,UInt64,Float32,Float64)
        a = Vec{4,T}((1,2,3,4))
        b = Vec{4,T}((5,6,7,8))
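+        # (Illustrative note, not from the original test file: shuffle indices
+        # are 0-based, and in the two-vector form indices 0..3 select from `a`
+        # while 4..7 select from `b`; `:undef` leaves an element unspecified.
+        # Hence Val((0,1,4,5,2,3,6,7)) below interleaves pairs of `a` and `b`.)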
-        @test shufflevector(a, b, Val{(2,3,4,5)}) === Vec{4,T}((3,4,5,6))
-        @test shufflevector(a, b, Val{(1,7,5,5)}) === Vec{4,T}((2,8,6,6))
-        @test shufflevector(a, b, Val{0:3}) === a
-        @test shufflevector(a, b, Val{4:7}) === b
-        @test shufflevector(a, Val{(1,0,2,3)}) === Vec{4,T}((2,1,3,4))
-        @test shufflevector(a, b, Val{(0,1,4,5,2,3,6,7)}) === Vec{8,T}((1,2,5,6,3,4,7,8))
-        @test shufflevector(shufflevector(a, b, Val{(6,:undef,0,:undef)}), Val{(0,2)}) === Vec{2,T}((7,1))
-        @test isa(shufflevector(a, Val{(:undef,:undef,:undef,:undef)}), Vec{4,T})
+        @test shufflevector(a, b, Val((2,3,4,5))) === Vec{4,T}((3,4,5,6))
+        @test shufflevector(a, b, Val((1,7,5,5))) === Vec{4,T}((2,8,6,6))
+        @test shufflevector(a, b, Val(0:3)) === a
+        @test shufflevector(a, b, Val(4:7)) === b
+        @test shufflevector(a, Val((1,0,2,3))) === Vec{4,T}((2,1,3,4))
+        @test shufflevector(a, b, Val((0,1,4,5,2,3,6,7))) === Vec{8,T}((1,2,5,6,3,4,7,8))
+        @test shufflevector(shufflevector(a, b, Val((6,:undef,0,:undef))), Val((0,2))) === Vec{2,T}((7,1))
+        @test isa(shufflevector(a, Val((:undef,:undef,:undef,:undef))), Vec{4,T})
 
         c = Vec{8,T}((1:8...,))
         d = Vec{8,T}((9:16...,))
-        @test shufflevector(c, d, Val{(0,1,8,15)}) === Vec{4,T}((1,2,9,16))
-        @test shufflevector(c, d, Val{1:2:15}) === Vec{8,T}((2:2:16...,))
+        @test shufflevector(c, d, Val((0,1,8,15))) === Vec{4,T}((1,2,9,16))
+        @test shufflevector(c, d, Val(1:2:15)) === Vec{8,T}((2:2:16...,))
     end
 
     let
         a = Vec{4,Bool}((true,false,true,false))
         b = Vec{4,Bool}((false,false,true,true))
-        @test shufflevector(a, b, Val{(2,3,4,5)}) === Vec{4,Bool}((true,false,false,false))
+        @test shufflevector(a, b, Val((2,3,4,5))) === Vec{4,Bool}((true,false,false,false))
     end
 end
-end
+# end
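
For reference, a minimal usage sketch of the `Val`-instance `shufflevector` API that the tests above migrate to. This is illustrative only and assumes nothing beyond the exported `Vec` and `shufflevector`; indices are 0-based, and with two inputs, indices `N` and above select from the second vector:

```julia
using SIMD

a = Vec{4,Int32}((1, 2, 3, 4))
b = Vec{4,Int32}((5, 6, 7, 8))

# Note: pass `Val(...)` (an instance), not `Val{...}` (a type), as of this change.
shufflevector(a, Val((3, 2, 1, 0)))     # <4 x Int32>[4, 3, 2, 1]
shufflevector(a, b, Val((0, 4, 1, 5)))  # <4 x Int32>[1, 5, 2, 6]
shufflevector(a, b, Val(0:7))           # <8 x Int32>[1, 2, 3, 4, 5, 6, 7, 8]
```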