From 83a747199c4f51ee85bb1d0e39c9c4b241a857a9 Mon Sep 17 00:00:00 2001
From: KristofferC
Date: Tue, 11 Feb 2020 14:53:30 +0100
Subject: [PATCH 01/20] This PR pretty much rewrites the package from scratch (with the exception of some of the indexing implemented by tkf) while keeping the API intact.

The reason for this is that I felt the code could gain a lot of clarity by clearly separating out the parts that deal with LLVM/`llvmcall` and then building a `Vec` type on top of that. The number of lines of code has also been reduced from ~1600 to ~1000.

The code is structured as follows:

- `LLVM_intrinsics.jl` is pretty much a direct mapping of Julia vectors (`NTuple{N, VecElement{T}}`) to the operators and intrinsics defined in https://llvm.org/docs/LangRef.html. It contains almost no higher-level logic.
- `simdvec.jl` contains the `Vec` type (wrapping the tuple of `VecElement`s) with definitions on it that map to the intrinsics defined in `LLVM_intrinsics.jl`. In some cases this is fairly mechanical, but some cases require extra logic (like the bit shifts, partly to avoid undefined behavior, or the different conversions).
- `arrayops.jl` contains the functionality that deals with Julia `Array`s, such as `vload`, `vstore` and `vgather`.

Things that have been added to the API:

- The `count_ones, count_zeros, leading_ones, leading_zeros, trailing_ones, trailing_zeros` family of functions.
- Type conversions and different kinds of reinterprets: from scalars to vectors and back, and between vectors of different sizes:

```jl
julia> v = Vec((Int32(2), Int32(4)))
<2 x Int32>[2, 4]

julia> reinterpret(Int64, v)
17179869186

julia> reinterpret(Vec{4, Int16}, v)
<4 x Int16>[2, 0, 4, 0]

julia> reinterpret(Vec{2, Int32}, 4)
<2 x Int32>[4, 0]

julia> convert(Vec{2, Float32}, v)
<2 x Float32>[2.0, 4.0]
```

- Uses the LLVM vector reduction intrinsics (https://llvm.org/docs/LangRef.html#experimental-vector-reduction-intrinsics) instead of a hand-rolled reducer.
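As a small sketch of the additions above (assuming this branch of SIMD is loaded; the shown values follow from the element-wise and reduction semantics described here, not from a recorded session):

```julia
using SIMD  # this package, on this branch

v = Vec((1, 2, 3, 4))

# The new bit-counting functions apply element-wise, like the other operators:
count_ones(v)   # -> <4 x Int64>[1, 1, 2, 1]

# Horizontal reductions now lower to the LLVM vector-reduction intrinsics:
sum(v)          # -> 10
```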
Things that have been removed from the API:

- Removed the `Val` arguments from many functions (`setindex`, `>>`, etc.). Julia's constant propagation plus LLVM's optimizations are enough for these not to be needed; things are specialized on the constant just as well as if `Val` were used.
- Removed the `Val{}` arguments in favor of using `Val()` consistently everywhere.
- Removed `exp10`. This used to just call `10^v`, but the reason you would use `exp10` is that a more efficient implementation exists for it than the naive one. Providing `exp10` gives the false impression that it offers a benefit over the naive version.

Co-Authored-By: Valentin Churavy
---
 .travis.yml            |    9 +-
 LICENSE.md             |    2 +-
 Project.toml           |    6 +-
 README.md              |    6 +-
 REQUIRE                |    1 -
 appveyor.yml           |    5 +-
 src/LLVM_intrinsics.jl |  578 ++++++++++++
 src/SIMD.jl            | 1988 +---------------------------------------
 src/arrayops.jl        |  285 ++++++
 src/simdvec.jl         |  414 +++++++++
 test/runtests.jl       |  119 ++-
 11 files changed, 1364 insertions(+), 2049 deletions(-)
 delete mode 100644 REQUIRE
 create mode 100644 src/LLVM_intrinsics.jl
 create mode 100644 src/arrayops.jl
 create mode 100644 src/simdvec.jl

diff --git a/.travis.yml b/.travis.yml
index 39a1783..b5d6676 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -3,14 +3,9 @@ os:
   - osx
   - linux
 julia:
-  - 0.7
-  - 1.0
-  - 1.2
+  - 1.4
   - nightly
 notifications:
   email: false
-script:
-  - if [ -a .git/shallow ]; then git fetch --unshallow; fi
-  - julia -e 'using Pkg; Pkg.build(); Pkg.test(coverage=true)';
 after_success:
-  - julia -e 'cd(Pkg.dir("SIMD")); Pkg.add("Coverage"); using Coverage; Coveralls.submit(Coveralls.process_folder())';
+  - julia -e 'Pkg.add("Coverage"); using Coverage; Coveralls.submit(Coveralls.process_folder())';
diff --git a/LICENSE.md b/LICENSE.md
index 6787457..546253e 100644
--- a/LICENSE.md
+++ b/LICENSE.md
@@ -1,6 +1,6 @@
 The SIMD.jl package is licensed under the Simplified "2-clause" BSD License:
 
-> Copyright (c) 2016: Erik Schnetter.
+> Copyright (c) 2016-2020: Erik Schnetter, Kristoffer Carlsson, Julia Computing > All rights reserved. > > Redistribution and use in source and binary forms, with or without diff --git a/Project.toml b/Project.toml index 03faf2f..5788502 100644 --- a/Project.toml +++ b/Project.toml @@ -1,10 +1,10 @@ name = "SIMD" uuid = "fdea26ae-647d-5447-a871-4b548cad5224" -authors = ["Erik Schnetter "] -version = "2.8.0" +authors = ["Erik Schnetter ", "Kristoffer Carlsson "] +version = "3.0.0" [compat] -julia = "1" +julia = "1.4" [extras] InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240" diff --git a/README.md b/README.md index 760138c..7548e87 100644 --- a/README.md +++ b/README.md @@ -46,9 +46,7 @@ The SIMD package provides the usual arithmetic and logical operations for SIMD v `abs cbrt ceil copysign cos div exp exp10 exp2 flipsign floor fma inv isfinite isinf isnan issubnormal log log10 log2 muladd rem round sign signbit sin sqrt trunc vifelse` -(Currently missing: `count_ones count_zeros exponent ldexp leading_ones leading_zeros significand trailing_ones trailing_zeros`, many trigonometric functions) - -(Also currently missing: Type conversions, reinterpretation that changes the vector size) +(Currently missing: `exponent ldexp significand`, many trigonometric functions) These operators and functions are always applied element-wise, i.e. they are applied to each element in parallel, yielding again a SIMD vector as result. This means that e.g. multiplying two vectors yields a vector, and comparing two vectors yields a vector of booleans. This behaviour might seem strange and slightly unusual, but corresponds to the machine instructions provided by the hardware. It is also what is usually needed to vectorize loops. 
@@ -63,7 +61,7 @@ ys1 = NTuple{4,Float32}(ys)
 y2 = ys[2] # getindex
 
 # Update one element of a vector:
-ys = setindex(ys, 5, 3) # cannot use ys[3] = 5
+ys = Base.setindex(ys, 5, 3) # cannot use ys[3] = 5
 ```
 
 ## Reduction operations
diff --git a/REQUIRE b/REQUIRE
deleted file mode 100644
index 859ad46..0000000
--- a/REQUIRE
+++ /dev/null
@@ -1 +0,0 @@
-julia 0.7
diff --git a/appveyor.yml b/appveyor.yml
index 5fe5251..63410c2 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -1,8 +1,6 @@
 environment:
   matrix:
-  - julia_version: 0.7
-  - julia_version: 1.0
-  - julia_version: 1.2
+  - julia_version: 1.4
   - julia_version: nightly
 
 platform:
@@ -42,3 +40,4 @@ test_script:
 on_success:
   - echo "%JL_CODECOV_SCRIPT%"
   - C:\julia\bin\julia -e "%JL_CODECOV_SCRIPT%"
+
diff --git a/src/LLVM_intrinsics.jl b/src/LLVM_intrinsics.jl
new file mode 100644
index 0000000..9172dfa
--- /dev/null
+++ b/src/LLVM_intrinsics.jl
@@ -0,0 +1,578 @@
+# LLVM operations and intrinsics
+module Intrinsics
+
+# TODO: fastmath flags
+
+import ..SIMD: SIMD, VE, LVec, FloatingTypes
+# Include Bool in IntegerTypes
+const IntegerTypes = Union{SIMD.IntegerTypes, Bool}
+
+const d = Dict{DataType, String}(
+    Bool => "i8",
+    Int8 => "i8",
+    Int16 => "i16",
+    Int32 => "i32",
+    Int64 => "i64",
+    Int128 => "i128",
+
+    UInt8 => "i8",
+    UInt16 => "i16",
+    UInt32 => "i32",
+    UInt64 => "i64",
+    UInt128 => "i128",
+
+    #Float16 => "half",
+    Float32 => "float",
+    Float64 => "double",
+)
+# Add the Ptr translations
+foreach(x -> (d[Ptr{x}] = d[Int]), collect(keys(d)))
+
+# LT = LLVM Type (scalar and vectors), we keep type names intentionally short
+# to make the signatures smaller
+const LT{T} = Union{LVec{<:Any, T}, T}
+
+suffix(N::Integer, ::Type{Ptr{T}}) where {T} = "v$(N)p0$(T<:IntegerTypes ? "i" : "f")$(8*sizeof(T))"
+suffix(N::Integer, ::Type{T}) where {T} = "v$(N)$(T<:IntegerTypes ? "i" : "f")$(8*sizeof(T))"
+suffix(::Type{T}) where {T} = "$(T<:IntegerTypes ? "i" : "f")$(8*sizeof(T))"
+
+llvm_name(llvmf, N, T) = string("llvm", ".", llvmf, ".", suffix(N, T))
+llvm_name(llvmf, ::Type{LVec{N, T}}) where {N,T} = string("llvm", ".", llvmf, ".", suffix(N, T))
+llvm_name(llvmf, ::Type{T}) where {T} = string("llvm", ".", llvmf, ".", suffix(T))
+
+llvm_type(::Type{T}) where {T} = d[T]
+llvm_type(::Type{LVec{N, T}}) where {N,T} = "< $N x $(d[T])>"
+
+
+####################
+# Unary operators  #
+####################
+
+const UNARY_INTRINSICS_FLOAT = [
+    :sqrt
+    :sin
+    :cos
+    :exp
+    :trunc
+    :exp2
+    :log
+    :log10
+    :log2
+    :fabs
+    :floor
+    :ceil
+    :rint
+    :nearbyint
+    :round
+]
+
+const UNARY_INTRINSICS_INT = [
+    :bitreverse
+    :bswap
+    :ctpop
+    :ctlz
+    :cttz
+    :fshl
+    :fshr
+]
+for (fs, c) in zip([UNARY_INTRINSICS_FLOAT, UNARY_INTRINSICS_INT],
+                   [FloatingTypes, IntegerTypes])
+    for f in fs
+        @eval begin
+            @generated function $(f)(x::T) where T<:LT{<:$c}
+                ff = llvm_name($(QuoteNode(f)), T)
+                return :(
+                    $(Expr(:meta, :inline));
+                    ccall($ff, llvmcall, T, (T,), x)
+                )
+            end
+        end
+    end
+end
+
+# fneg (not an intrinsic so cannot use `ccall`)
+@generated function fneg(x::T) where T<:LT{<:FloatingTypes}
+    s = """
+    %2 = fneg $(llvm_type(T)) %0
+    ret $(llvm_type(T)) %2
+    """
+    return :(
+        $(Expr(:meta, :inline));
+        Base.llvmcall($s, T, Tuple{T}, x)
+    )
+end
+
+#####################
+# Binary operators  #
+#####################
+
+const BINARY_OPS_FLOAT = [
+    :fadd
+    :fsub
+    :fmul
+    :fdiv
+    :frem
+]
+
+const BINARY_OPS_INT = [
+    :add
+    :sub
+    :mul
+    :sdiv
+    :udiv
+    :srem
+    :urem
+    :shl
+    :ashr
+    :lshr
+    :and
+    :or
+    :xor
+]
+
+for (fs, c) in zip([BINARY_OPS_FLOAT, BINARY_OPS_INT],
+                   [FloatingTypes, IntegerTypes])
+    for f in fs
+        @eval @generated function $f(x::T, y::T) where T<:LT{<:$c}
+            ff = $(QuoteNode(f))
+            s = """
+            %3 = $ff $(llvm_type(T)) %0, %1
+            ret $(llvm_type(T)) %3
+            """
+            return :(
+                $(Expr(:meta, :inline));
+                Base.llvmcall($s, T, Tuple{T, T}, x, y)
+            )
+        end
+    end
+end
+
+const BINARY_INTRINSICS_FLOAT = [
+    :minnum
+    :maxnum
+
:minimum + :maximum + :copysign + :pow + :floor + :ceil + :trunc + :rint + :nearbyint + :round +] + +for f in BINARY_INTRINSICS_FLOAT + @eval @generated function $(f)(x::T, y::T) where T<:LT{<:FloatingTypes} + ff = llvm_name($(QuoteNode(f)), T,) + return :( + $(Expr(:meta, :inline)); + ccall($ff, llvmcall, T, (T, T), x, y) + ) + end +end + +# pow, powi +for (f, c) in [(:pow, FloatingTypes), (:powi, IntegerTypes)] + @eval @generated function $(f)(x::T, y::T2) where {T <: LT{<:FloatingTypes}, T2 <: $c} + ff = llvm_name($(QuoteNode(f)), T) + return :( + $(Expr(:meta, :inline)); + ccall($ff, llvmcall, T, (T, T2), x, y) + ) + end +end + +# Comparisons +const CMP_FLAGS_FLOAT = [ + :false + :oeq + :ogt + :oge + :olt + :ole + :one + :ord + :ueq + :ugt + :uge + :ult + :ule + :une + :uno + :true +] + +const CMP_FLAGS_INT = [ + :eq + :ne + :sgt + :sge + :slt + :sle + :ugt + :uge + :ult + :ule +] + +for (f, c, flags) in zip(["fcmp", "icmp"], + [FloatingTypes, IntegerTypes], + [CMP_FLAGS_FLOAT, CMP_FLAGS_INT]) + for flag in flags + ftot = Symbol(string(f, "_", flag)) + @eval @generated function $ftot(x::LVec{N, T}, y::LVec{N, T}) where {N, T <: $c} + fflag = $(QuoteNode(flag)) + ff = $(QuoteNode(f)) + s = """ + %res = $ff $(fflag) <$(N) x $(d[T])> %0, %1 + %resb = zext <$(N) x i1> %res to <$(N) x i8> + ret <$(N) x i8> %resb + """ + return :( + $(Expr(:meta, :inline)); + Base.llvmcall($s, LVec{N, Bool}, Tuple{LVec{N, T}, LVec{N, T}}, x, y) + ) + end + end +end + + +##################### +# Ternary operators # +##################### + +@generated function select(cond::LVec{N, Bool}, x::LVec{N, T}, y::LVec{N, T}) where {N, T} + s = """ + %cond = trunc <$(N) x i8> %0 to <$(N) x i1> + %res = select <$N x i1> %cond, <$N x $(d[T])> %1, <$N x $(d[T])> %2 + ret <$N x $(d[T])> %res + """ + return :( + $(Expr(:meta, :inline)); + Base.llvmcall($s, LVec{N, T}, Tuple{LVec{N, Bool}, LVec{N, T}, LVec{N, T}}, cond, x, y) + ) +end + +const MULADD_INTRINSICS = [ + :fmuladd, + :fma, +] + +for f in 
MULADD_INTRINSICS
+    @eval @generated function $(f)(a::LVec{N, T}, b::LVec{N, T}, c::LVec{N, T}) where {N, T<:FloatingTypes}
+        ff = llvm_name($(QuoteNode(f)), N, T)
+        return :(
+            $(Expr(:meta, :inline));
+            ccall($ff, llvmcall, LVec{N, T}, (LVec{N, T}, LVec{N, T}, LVec{N, T}), a, b, c)
+        )
+    end
+end
+
+
+################
+# Load / store #
+################
+
+# These alignment numbers feel a bit dubious
+n_align(align, N, T) = align ? N * sizeof(T) : sizeof(T)
+temporal_str(temporal) = temporal ? ", !nontemporal !{i32 1}" : ""
+
+@generated function load(x::Type{LVec{N, T}}, ptr::Ptr{T},
+                         ::Val{Al}=Val(false), ::Val{Te}=Val(false)) where {N, T, Al, Te}
+    s = """
+    %ptr = inttoptr $(d[Int]) %0 to <$N x $(d[T])>*
+    %res = load <$N x $(d[T])>, <$N x $(d[T])>* %ptr, align $(n_align(Al, N, T)) $(temporal_str(Te))
+    ret <$N x $(d[T])> %res
+    """
+    return :(
+        $(Expr(:meta, :inline));
+        Base.llvmcall($s, LVec{N, T}, Tuple{Ptr{T}}, ptr)
+    )
+end
+
+@generated function maskedload(ptr::Ptr{T}, mask::LVec{N,Bool},
+                               ::Val{Al}=Val(false), ::Val{Te}=Val(false)) where {N, T, Al, Te}
+    # TODO: Allow setting the passthru
+    decl = "declare <$N x $(d[T])> @llvm.masked.load.$(suffix(N, T))(<$N x $(d[T])>*, i32, <$N x i1>, <$N x $(d[T])>)"
+    s = """
+    %mask = trunc <$(N) x i8> %1 to <$(N) x i1>
+    %ptr = inttoptr $(d[Int]) %0 to <$N x $(d[T])>*
+    %res = call <$N x $(d[T])> @llvm.masked.load.$(suffix(N, T))(<$N x $(d[T])>* %ptr, i32 $(n_align(Al, N, T)), <$N x i1> %mask, <$N x $(d[T])> zeroinitializer)
+    ret <$N x $(d[T])> %res
+    """
+    return :(
+        $(Expr(:meta, :inline));
+        Base.llvmcall(($decl, $s), LVec{N, T}, Tuple{Ptr{T}, LVec{N,Bool}}, ptr, mask)
+    )
+end
+
+@generated function store(x::LVec{N, T}, ptr::Ptr{T},
+                          ::Val{Al}=Val(false), ::Val{Te}=Val(false)) where {N, T, Al, Te}
+    s = """
+    %ptr = inttoptr $(d[Int]) %1 to <$N x $(d[T])>*
+    store <$N x $(d[T])> %0, <$N x $(d[T])>* %ptr, align $(n_align(Al, N, T)) $(temporal_str(Te))
+    ret void
+    """
+    return :(
+
+        $(Expr(:meta, 
:inline)); + Base.llvmcall($s, Cvoid, Tuple{LVec{N, T}, Ptr{T}}, x, ptr) + ) +end + +@generated function maskedstore(x::LVec{N, T}, ptr::Ptr{T}, mask::LVec{N,Bool}, + ::Val{Al}=Val(false), ::Val{Te}=Val(false)) where {N, T, Al, Te} + # TODO: Allow setting the passthru + decl = "declare <$N x $(d[T])> @llvm.masked.store.$(suffix(N, T))(<$N x $(d[T])>, <$N x $(d[T])>*, i32, <$N x i1>)" + s = """ + %mask = trunc <$(N) x i8> %2 to <$(N) x i1> + %ptr = inttoptr $(d[Int]) %1 to <$N x $(d[T])>* + %res = call <$N x $(d[T])> @llvm.masked.store.$(suffix(N, T))(<$N x $(d[T])> %0, <$N x $(d[T])>* %ptr, i32 $(n_align(Al, N, T)), <$N x i1> %mask) + ret void + """ + return :( + $(Expr(:meta, :inline)); + Base.llvmcall(($decl, $s), Cvoid, Tuple{LVec{N, T}, Ptr{T}, LVec{N,Bool}}, x, ptr, mask) + ) +end + + +#################### +# Gather / Scatter # +#################### + +@generated function maskedgather(ptrs::LVec{N,Ptr{T}}, + mask::LVec{N,Bool}, ::Val{Al}=Val(false)) where {N, T, Al} + # TODO: Allow setting the passthru + decl = "declare <$N x $(d[T])> @llvm.masked.gather.$(suffix(N, T))(<$N x $(d[T])*>, i32, <$N x i1>, <$N x $(d[T])>)" + s = """ + %mask = trunc <$(N) x i8> %1 to <$(N) x i1> + %ptrs = inttoptr <$N x $(d[Int])> %0 to <$N x $(d[T])*> + %res = call <$N x $(d[T])> @llvm.masked.gather.$(suffix(N, T))(<$N x $(d[T])*> %ptrs, i32 $(n_align(Al, N, T)), <$N x i1> %mask, <$N x $(d[T])> zeroinitializer) + ret <$N x $(d[T])> %res + """ + return :( + $(Expr(:meta, :inline)); + Base.llvmcall(($decl, $s), LVec{N, T}, Tuple{LVec{N, Ptr{T}}, LVec{N, Bool}}, ptrs, mask) + ) +end + +@generated function maskedscatter(x::LVec{N, T}, ptrs::LVec{N, Ptr{T}}, + mask::LVec{N,Bool}, ::Val{Al}=Val(false)) where {N, T, Al} + + decl = "declare <$N x $(d[T])> @llvm.masked.scatter.$(suffix(N, T))(<$N x $(d[T])>, <$N x $(d[T])*>, i32, <$N x i1>)" + s = """ + %mask = trunc <$(N) x i8> %2 to <$(N) x i1> + %ptrs = inttoptr <$N x $(d[Int])> %1 to <$N x $(d[T])*> + call <$N x $(d[T])> 
@llvm.masked.scatter.$(suffix(N, T))(<$N x $(d[T])> %0, <$N x $(d[T])*> %ptrs, i32 $(n_align(Al, N, T)), <$N x i1> %mask) + ret void + """ + return :( + $(Expr(:meta, :inline)); + Base.llvmcall(($decl, $s), Cvoid, Tuple{LVec{N, T}, LVec{N, Ptr{T}}, LVec{N, Bool}}, x, ptrs, mask) + ) +end + + +###################### +# LVector Operations # +###################### + +@generated function extractelement(x::LVec{N, T}, i::I) where {N, T, I <: IntegerTypes} + s = """ + %3 = extractelement <$N x $(d[T])> %0, $(d[I]) %1 + ret $(d[T]) %3 + """ + return :( + $(Expr(:meta, :inline)); + Base.llvmcall($s, T, Tuple{LVec{N, T}, $i}, x, i) + ) +end + +@generated function insertelement(x::LVec{N, T}, v::T, i::IntegerTypes) where {N, T} + s = """ + %4 = insertelement <$N x $(d[T])> %0, $(d[T]) %1, $(d[i]) %2 + ret <$N x $(d[T])> %4 + """ + return :( + $(Expr(:meta, :inline)); + Base.llvmcall($s, LVec{N, T}, Tuple{LVec{N, T}, T, typeof(i)}, x, v, i) + ) +end + +_shuffle_vec(I) = join((string("i32 ", i == :undef ? 
"undef" : Int32(i::Integer)) for i in I), ", ") +@generated function shufflevector(x::LVec{N, T}, y::LVec{N, T}, ::Val{I}) where {N, T, I} + shfl = _shuffle_vec(I) + M = length(I) + s = """ + %res = shufflevector <$N x $(d[T])> %0, <$N x $(d[T])> %1, <$M x i32> <$shfl> + ret <$M x $(d[T])> %res + """ + return :( + $(Expr(:meta, :inline)); + Base.llvmcall($s, LVec{$M, T}, Tuple{LVec{N, T}, LVec{N, T}}, x, y) + ) +end + +@generated function shufflevector(x::LVec{N, T}, ::Val{I}) where {N, T, I} + shfl = _shuffle_vec(I) + M = length(I) + s = """ + %res = shufflevector <$(N) x $(d[T])> %0, <$N x $(d[T])> undef, <$M x i32> <$shfl> + ret <$M x $(d[T])> %res + """ + return :( + $(Expr(:meta, :inline)); + Base.llvmcall($s, LVec{$M, T}, Tuple{LVec{N, T}}, x) + ) +end + +@generated function constantvector(v::T, y::Type{LVec{N, T}}) where {N, T} + s = """ + %2 = insertelement <$N x $(d[T])> undef, $(d[T]) %0, i32 0 + %res = shufflevector <$N x $(d[T])> %2, <$N x $(d[T])> undef, <$N x i32> zeroinitializer + ret <$N x $(d[T])> %res + """ + return :( + $(Expr(:meta, :inline)); + Base.llvmcall($s, LVec{N, T}, Tuple{T}, v) + ) +end + +######################### +# Conversion Operations # +######################### + +const CAST_SIZE_CHANGE_FLOAT = [ + (:fptrunc, >) + (:fpext, <) +] + +const CAST_SIZE_CHANGE_INT = [ + (:trunc, >) + (:zext, <) + (:sext, <) +] + +for (fs, c) in zip([CAST_SIZE_CHANGE_FLOAT, CAST_SIZE_CHANGE_INT], + [FloatingTypes, IntegerTypes]) + for (f, criteria) in fs + @eval @generated function $f(::Type{LVec{N, T2}}, x::LVec{N, T1}) where {N, T1 <: $c, T2 <: $c} + sT1, sT2 = sizeof(T1) * 8, sizeof(T2) * 8 + # Not changing size is not allowed + @assert $criteria(sT1, sT2) "size of conversion type ($T2: $sT2) must be $($criteria) than the element type ($T1: $sT1)" + ff = $(QuoteNode(f)) + s = """ + %2 = $ff <$(N) x $(d[T1])> %0 to <$(N) x $(d[T2])> + ret <$(N) x $(d[T2])> %2 + """ + return :( + $(Expr(:meta, :inline)); + Base.llvmcall($s, LVec{N, T2}, Tuple{LVec{N, 
T1}}, x) + ) + end + end +end + +const CONVERSION_FLOAT_TO_INT = [ + :fptoui, + :fptosi +] + +const CONVERSION_INT_TO_FLOAT = [ + :uitofp, + :sitofp +] + +for (fs, (from, to)) in zip([CONVERSION_FLOAT_TO_INT, CONVERSION_INT_TO_FLOAT], + [(FloatingTypes, IntegerTypes), (IntegerTypes, FloatingTypes)]) + for f in fs + @eval @generated function $f(::Type{LVec{N, T2}}, x::LVec{N, T1}) where {N, T1 <: $from, T2 <: $to} + ff = $(QuoteNode(f)) + s = """ + %2 = $ff <$(N) x $(d[T1])> %0 to <$(N) x $(d[T2])> + ret <$(N) x $(d[T2])> %2 + """ + return :( + $(Expr(:meta, :inline)); + Base.llvmcall($s, LVec{N, T2}, Tuple{LVec{N, T1}}, x) + ) + end + end +end + + +########### +# Bitcast # +########### + +@generated function bitcast(::Type{T1}, x::T2) where {T1<:LT, T2<:LT} + sT1, sT2 = sizeof(T1), sizeof(T2) + @assert sT1 == sT2 "size of conversion type ($T1: $sT1) must be equal to the vector type ($T2: $sT2)" + s = """ + %2 = bitcast $(llvm_type(T2)) %0 to $(llvm_type(T1)) + ret $(llvm_type(T1)) %2 + """ + return :( + $(Expr(:meta, :inline)); + Base.llvmcall($s, T1, Tuple{T2}, x) + ) +end + +################################## +# Horizontal reductions (LLVM 9) # +################################## + +const HORZ_REDUCTION_OPS_FLOAT = [ + :fmax + :fmin +] + +const HORZ_REDUCTION_OPS_INT = [ + :and + :or + :mul + :add + :smax + :umax + :smin + :umin +] + +for (fs, c) in zip([HORZ_REDUCTION_OPS_FLOAT, HORZ_REDUCTION_OPS_INT], + [FloatingTypes, IntegerTypes]) + for f in fs + f_red = Symbol("reduce_", f) + @eval @generated function $f_red(x::LVec{N, T}) where {N,T<:$c} + ff = llvm_name(string("experimental.vector.reduce.", $(QuoteNode(f))), N, T) + decl = "declare $(d[T]) @$ff(<$N x $(d[T])>)" + s2 = """ + %res = call $(d[T]) @$ff(<$N x $(d[T])> %0) + ret $(d[T]) %res + """ + return quote + Base.llvmcall($(decl, s2), T, Tuple{LVec{N, T},}, x) + end + end + end +end + +# The fadd and fmul reductions take an initializer +horz_reduction_version = Base.libllvm_version < v"9" ? "" : "v2." 
+for (f, neutral) in [(:fadd, "0.0"), (:fmul, "1.0")] + f_red = Symbol("reduce_", f) + @eval @generated function $f_red(x::LVec{N, T}) where {N,T<:FloatingTypes} + ff = llvm_name(string("experimental.vector.reduce.$horz_reduction_version", $(QuoteNode(f))), N, T) + decl = "declare $(d[T]) @$ff($(d[T]), <$N x $(d[T])>)" + s2 = """ + %res = call $(d[T]) @$ff($(d[T]) $($neutral), <$N x $(d[T])> %0) + ret $(d[T]) %res + """ + return quote + Base.llvmcall($(decl, s2), T, Tuple{LVec{N, T},}, x) + end + end +end + +end diff --git a/src/SIMD.jl b/src/SIMD.jl index 9093f1f..aeb60b1 100644 --- a/src/SIMD.jl +++ b/src/SIMD.jl @@ -1,1972 +1,24 @@ module SIMD -# A note on Val{} vs. Val(): -# -# For historic reasoons, SIMD's API accepted compile-time constants as -# Val{N} instead of Val(N). The difference is that Val{N} is a type -# (Type{Val{N}}), whereas Val(N) is a value (of type Val{N}). This is -# against the intent of how Val is designed, and is also much slower -# at run time unless functions are either @inline'd or @generated. -# -# The API has now been cleaned up. To preserve backward compatibility, -# passing Val{N} instead of Val(N) is still supported. It might go -# away at the next major release. 
- - - -#= - -# Various boolean types - -# Idea (from ): Use Mask{N,T} instead of booleans -# with different sizes - -abstract Boolean <: Integer - -for sz in (8, 16, 32, 64, 128) - Intsz = Symbol(:Int, sz) - UIntsz = Symbol(:UInt, sz) - Boolsz = Symbol(:Bool, sz) - @eval begin - immutable $Boolsz <: Boolean - int::$UIntsz - $Boolsz(b::Bool) = - new(ifelse(b, typemax($UIntsz), typemin($UIntsz))) - end - booltype(::Val($sz)) = $Boolsz - inttype(::Val($sz)) = $Intsz - uinttype(::Val($sz)) = $UIntsz - - Base.convert(::Type{Bool}, b::$Boolsz) = b.int != 0 - - Base.:~(b::$Boolsz) = $Boolsz(~b.int) - Base.:!(b::$Boolsz) = ~b - Base.:&(b1::$Boolsz, b2::$Boolsz) = $Boolsz(b1.int & b2.int) - Base.:|(b1::$Boolsz, b2::$Boolsz) = $Boolsz(b1.int | b2.int) - Base.$(:$)(b1::$Boolsz, b2::$Boolsz) = $Boolsz(b1.int $ b2.int) - - Base.:==(b1::$Boolsz, b2::$Boolsz) = $Boolsz(b1.int == b2.int) - Base.:!=(b1::$Boolsz, b2::$Boolsz) = $Boolsz(b1.int != b2.int) - Base.:<(b1::$Boolsz, b2::$Boolsz) = $Boolsz(b1.int < b2.int) - Base.:<=(b1::$Boolsz, b2::$Boolsz) = $Boolsz(b1.int <= b2.int) - Base.:>(b1::$Boolsz, b2::$Boolsz) = $Boolsz(b1.int > b2.int) - Base.:>=(b1::$Boolsz, b2::$Boolsz) = $Boolsz(b1.int >= b2.int) - end -end -Base.convert(::Type{Bool}, b::Boolean) = error("impossible") -Base.convert{I<:Integer}(::Type{I}, b::Boolean) = I(Bool(b)) -Base.convert{B<:Boolean}(::Type{B}, b::Boolean) = B(Bool(b)) -Base.convert{B<:Boolean}(::Type{B}, i::Integer) = B(i!=0) - -booltype{T}(::Type{T}) = booltype(Val(8*sizeof(T))) -inttype{T}(::Type{T}) = inttype(Val(8*sizeof(T))) -uinttype{T}(::Type{T}) = uinttype(Val(8*sizeof(T))) - -=# - -# Array types for SIMD - -using Base: Slice, ScalarIndex - -""" - ContiguousSubArray{T,N,P,I,L} - -Like `Base.FastContiguousSubArray` but without requirement for linear -indexing (i.e., type parameter `L` can be `false`). 
- -# Examples -``` -julia> A = view(ones(5, 5), :, [1,3]); - -julia> A isa Base.FastContiguousSubArray -false - -julia> A isa SIMD.ContiguousSubArray -true -``` -""" -ContiguousSubArray{T,N,P, - I<:Union{Tuple{Union{Slice, AbstractUnitRange}, Vararg{Any}}, - Tuple{Vararg{ScalarIndex}}}, - L} = SubArray{T,N,P,I,L} - -""" - ContiguousArray{T,N} - -Array types with contiguous first dimension. -""" -ContiguousArray{T,N} = Union{DenseArray{T,N}, ContiguousSubArray{T,N}} - -""" - FastContiguousArray{T,N} - -This is the type of arrays that `pointer(A, i)` works. -""" -FastContiguousArray{T,N} = Union{DenseArray{T,N}, Base.FastContiguousSubArray{T,N}} -# https://github.com/eschnett/SIMD.jl/pull/40#discussion_r254131184 -# https://github.com/JuliaArrays/MappedArrays.jl/pull/24#issuecomment-460568978 - -# The Julia SIMD vector type - -const BoolTypes = Union{Bool} -const IntTypes = Union{Int8, Int16, Int32, Int64, Int128} -const UIntTypes = Union{UInt8, UInt16, UInt32, UInt64, UInt128} -const IntegerTypes = Union{BoolTypes, IntTypes, UIntTypes} -const IndexTypes = Union{IntegerTypes, Ptr} -const FloatingTypes = Union{Float16, Float32, Float64} -const ScalarTypes = Union{IndexTypes, FloatingTypes} - -const VE = Base.VecElement - -export Vec -struct Vec{N,T<:ScalarTypes} # <: Number - elts::NTuple{N,VE{T}} - @inline Vec{N,T}(elts::NTuple{N, VE{T}}) where {N,T} = new{N,T}(elts) -end - -function Base.show(io::IO, v::Vec{N,T}) where {N,T} - print(io, "<$N x $T>[") - join(io, [x.value for x in v.elts], ", ") - print(io, "]") -end - -# Type properties -Base.eltype(::Type{Vec{N,T}}) where {N,T} = T -Base.ndims( ::Type{Vec{N,T}}) where {N,T} = 1 -Base.length(::Type{Vec{N,T}}) where {N,T} = N -Base.size( ::Type{Vec{N,T}}) where {N,T} = (N,) -# TODO: This doesn't follow Base, e.g. 
`size([], 3) == 1` -Base.size(::Type{Vec{N,T}}, n::Integer) where {N,T} = (N,)[n] - -Base.eltype(V::Vec) = eltype(typeof(V)) -Base.ndims( V::Vec) = ndims(typeof(V)) -Base.length(V::Vec) = length(typeof(V)) -Base.size( V::Vec) = size(typeof(V)) -Base.size( V::Vec, n::Integer) = size(typeof(V), n) - -# Type conversion - -# Create vectors from scalars or tuples -@generated function (::Type{Vec{N,T}})(x::S) where {N,T,S<:ScalarTypes} - quote - $(Expr(:meta, :inline)) - Vec{N,T}(tuple($([:(VE{T}(T(x))) for i in 1:N]...))) - end -end -Vec{N,T}(xs::Tuple{}) where {N,T<:ScalarTypes} = error("illegal argument") -@generated function (::Type{Vec{N,T}})(xs::NTuple{N,S}) where {N,T,S<:ScalarTypes} - quote - $(Expr(:meta, :inline)) - Vec{N,T}(tuple($([:(VE{T}(T(xs[$i]))) for i in 1:N]...))) - end -end -Vec(xs::NTuple{N,T}) where {N,T<:ScalarTypes} = Vec{N,T}(xs) - -# Convert between vectors -@inline Base.convert(::Type{Vec{N,T}}, v::Vec{N,T}) where {N,T} = v - -@inline Base.convert(::Type{Vec{N,R}}, v::Vec{N}) where {N,R} = - Vec{N,R}(NTuple{N, R}(v)) - -@inline Tuple(v::Vec{N}) where {N} = ntuple(i -> v.elts[i].value, Val(N)) -@inline NTuple{N, T}(v::Vec{N}) where{N, T} = ntuple(i -> convert(T, v.elts[i].value), Val(N)) - -@generated function Base.:%(v::Vec{N,T}, ::Type{Vec{N,R}}) where {N,R,T} - quote - $(Expr(:meta, :inline)) - Vec{N,R}(tuple($([:(v.elts[$i].value % R) for i in 1:N]...))) - end -end - -# Convert vectors to tuples -@generated function Base.convert(::Type{NTuple{N,R}}, v::Vec{N,T}) where {N,R,T} - quote - $(Expr(:meta, :inline)) - tuple($([:(R(v.elts[$i].value)) for i in 1:N]...)) - end -end -@inline Base.convert(::Type{Tuple}, v::Vec{N,T}) where {N,T} = - Base.convert(NTuple{N,T}, v) - -# Promotion rules - -# Note: Type promotion only works for subtypes of Number -# Base.promote_rule{N,T<:ScalarTypes}(::Type{Vec{N,T}}, ::Type{T}) = Vec{N,T} - -Base.zero(::Type{Vec{N,T}}) where {N,T} = Vec{N,T}(zero(T)) -Base.one(::Type{Vec{N,T}}) where {N,T} = Vec{N,T}(one(T)) 
-Base.zero(::Vec{N,T}) where {N,T} = zero(Vec{N,T}) -Base.one(::Vec{N,T}) where {N,T} = one(Vec{N,T}) - -# Floating point formats - -int_type(::Type{Float16}) = Int16 -int_type(::Type{Float32}) = Int32 -int_type(::Type{Float64}) = Int64 -# int_type(::Type{Float128}) = Int128 -# int_type(::Type{Float256}) = Int256 - -uint_type(::Type{Float16}) = UInt16 -uint_type(::Type{Float32}) = UInt32 -uint_type(::Type{Float64}) = UInt64 -# uint_type(::Type{Float128}) = UInt128 -# uint_type(::Type{Float256}) = UInt256 - -significand_bits(::Type{Float16}) = 10 -significand_bits(::Type{Float32}) = 23 -significand_bits(::Type{Float64}) = 52 -# significand_bits(::Type{Float128}) = 112 -# significand_bits(::Type{Float256}) = 136 - -exponent_bits(::Type{T}) where {T<:FloatingTypes} = - 8*sizeof(T) - 1 - significand_bits(T) -sign_bits(::Type{T}) where {T<:FloatingTypes} = 1 - -significand_mask(::Type{T}) where {T<:FloatingTypes} = - uint_type(T)(uint_type(T)(1) << significand_bits(T) - 1) -exponent_mask(::Type{T}) where {T<:FloatingTypes} = - uint_type(T)(uint_type(T)(1) << exponent_bits(T) - 1) << significand_bits(T) -sign_mask(::Type{T}) where {T<:FloatingTypes} = - uint_type(T)(1) << (significand_bits(T) + exponent_bits(T)) - -for T in (Float16, Float32, Float64) - @assert sizeof(int_type(T)) == sizeof(T) - @assert sizeof(uint_type(T)) == sizeof(T) - @assert significand_bits(T) + exponent_bits(T) + sign_bits(T) == 8*sizeof(T) - @assert significand_mask(T) | exponent_mask(T) | sign_mask(T) == - typemax(uint_type(T)) - @assert significand_mask(T) ⊻ exponent_mask(T) ⊻ sign_mask(T) == - typemax(uint_type(T)) -end - -# Convert Julia types to LLVM types - -llvmtype(::Type{Bool}) = "i8" # Julia represents Tuple{Bool} as [1 x i8] - -# llvmtype(::Type{Bool8}) = "i8" -# llvmtype(::Type{Bool16}) = "i16" -# llvmtype(::Type{Bool32}) = "i32" -# llvmtype(::Type{Bool64}) = "i64" -# llvmtype(::Type{Bool128}) = "i128" - -llvmtype(::Type{Int8}) = "i8" -llvmtype(::Type{Int16}) = "i16" 
-llvmtype(::Type{Int32}) = "i32" -llvmtype(::Type{Int64}) = "i64" -llvmtype(::Type{Int128}) = "i128" -llvmtype(::Type{<:Ptr}) = llvmtype(Int) - -llvmtype(::Type{UInt8}) = "i8" -llvmtype(::Type{UInt16}) = "i16" -llvmtype(::Type{UInt32}) = "i32" -llvmtype(::Type{UInt64}) = "i64" -llvmtype(::Type{UInt128}) = "i128" - -llvmtype(::Type{Float16}) = "half" -llvmtype(::Type{Float32}) = "float" -llvmtype(::Type{Float64}) = "double" - -# Type-dependent optimization flags -# fastflags{T<:IntTypes}(::Type{T}) = "nsw" -# fastflags{T<:UIntTypes}(::Type{T}) = "nuw" -# fastflags{T<:FloatingTypes}(::Type{T}) = "fast" - -suffix(N::Integer, ::Type{T}) where {T<:IntegerTypes} = "v$(N)i$(8*sizeof(T))" -suffix(N::Integer, ::Type{T}) where {T<:FloatingTypes} = "v$(N)f$(8*sizeof(T))" - -# Type-dependent LLVM constants -function llvmconst(::Type{T}, val) where T - T(val) === T(0) && return "zeroinitializer" - typ = llvmtype(T) - "$typ $val" -end -function llvmconst(::Type{Bool}, val) - Bool(val) === false && return "zeroinitializer" - typ = "i1" - "$typ $(Int(val))" -end -function llvmconst(N::Integer, ::Type{T}, val) where T - T(val) === T(0) && return "zeroinitializer" - typ = llvmtype(T) - "<" * join(["$typ $val" for i in 1:N], ", ") * ">" -end -function llvmconst(N::Integer, ::Type{Bool}, val) - Bool(val) === false && return "zeroinitializer" - typ = "i1" - "<" * join(["$typ $(Int(val))" for i in 1:N], ", ") * ">" -end -function llvmtypedconst(::Type{T}, val) where T - typ = llvmtype(T) - T(val) === T(0) && return "$typ zeroinitializer" - "$typ $val" -end -function llvmtypedconst(::Type{Bool}, val) - typ = "i1" - Bool(val) === false && return "$typ zeroinitializer" - "$typ $(Int(val))" -end - -# Type-dependent LLVM intrinsics -llvmins(::Val{:+}, N, ::Type{T}) where {T <: IndexTypes} = "add" -llvmins(::Val{:-}, N, ::Type{T}) where {T <: IndexTypes} = "sub" -llvmins(::Val{:*}, N, ::Type{T}) where {T <: IntegerTypes} = "mul" -llvmins(::Val{:div}, N, ::Type{T}) where {T <: IntTypes} = 
"sdiv" -llvmins(::Val{:rem}, N, ::Type{T}) where {T <: IntTypes} = "srem" -llvmins(::Val{:div}, N, ::Type{T}) where {T <: UIntTypes} = "udiv" -llvmins(::Val{:rem}, N, ::Type{T}) where {T <: UIntTypes} = "urem" - -llvmins(::Val{:~}, N, ::Type{T}) where {T <: IntegerTypes} = "xor" -llvmins(::Val{:&}, N, ::Type{T}) where {T <: IntegerTypes} = "and" -llvmins(::Val{:|}, N, ::Type{T}) where {T <: IntegerTypes} = "or" -llvmins(::Val{:⊻}, N, ::Type{T}) where {T <: IntegerTypes} = "xor" - -llvmins(::Val{:<<}, N, ::Type{T}) where {T <: IntegerTypes} = "shl" -llvmins(::Val{:>>>}, N, ::Type{T}) where {T <: IntegerTypes} = "lshr" -llvmins(::Val{:>>}, N, ::Type{T}) where {T <: UIntTypes} = "lshr" -llvmins(::Val{:>>}, N, ::Type{T}) where {T <: IntTypes} = "ashr" - -llvmins(::Val{:(==)}, N, ::Type{T}) where {T <: IntegerTypes} = "icmp eq" -llvmins(::Val{:(!=)}, N, ::Type{T}) where {T <: IntegerTypes} = "icmp ne" -llvmins(::Val{:(>)}, N, ::Type{T}) where {T <: IntTypes} = "icmp sgt" -llvmins(::Val{:(>=)}, N, ::Type{T}) where {T <: IntTypes} = "icmp sge" -llvmins(::Val{:(<)}, N, ::Type{T}) where {T <: IntTypes} = "icmp slt" -llvmins(::Val{:(<=)}, N, ::Type{T}) where {T <: IntTypes} = "icmp sle" -llvmins(::Val{:(>)}, N, ::Type{T}) where {T <: UIntTypes} = "icmp ugt" -llvmins(::Val{:(>=)}, N, ::Type{T}) where {T <: UIntTypes} = "icmp uge" -llvmins(::Val{:(<)}, N, ::Type{T}) where {T <: UIntTypes} = "icmp ult" -llvmins(::Val{:(<=)}, N, ::Type{T}) where {T <: UIntTypes} = "icmp ule" - -llvmins(::Val{:vifelse}, N, ::Type{T}) where {T} = "select" - -llvmins(::Val{:+}, N, ::Type{T}) where {T <: FloatingTypes} = "fadd" -llvmins(::Val{:-}, N, ::Type{T}) where {T <: FloatingTypes} = "fsub" -llvmins(::Val{:*}, N, ::Type{T}) where {T <: FloatingTypes} = "fmul" -llvmins(::Val{:/}, N, ::Type{T}) where {T <: FloatingTypes} = "fdiv" -llvmins(::Val{:inv}, N, ::Type{T}) where {T <: FloatingTypes} = "fdiv" -llvmins(::Val{:rem}, N, ::Type{T}) where {T <: FloatingTypes} = "frem" - -llvmins(::Val{:(==)}, 
N, ::Type{T}) where {T <: FloatingTypes} = "fcmp oeq" -llvmins(::Val{:(!=)}, N, ::Type{T}) where {T <: FloatingTypes} = "fcmp une" -llvmins(::Val{:(>)}, N, ::Type{T}) where {T <: FloatingTypes} = "fcmp ogt" -llvmins(::Val{:(>=)}, N, ::Type{T}) where {T <: FloatingTypes} = "fcmp oge" -llvmins(::Val{:(<)}, N, ::Type{T}) where {T <: FloatingTypes} = "fcmp olt" -llvmins(::Val{:(<=)}, N, ::Type{T}) where {T <: FloatingTypes} = "fcmp ole" - -llvmins(::Val{:^}, N, ::Type{T}) where {T <: FloatingTypes} = - "@llvm.pow.$(suffix(N,T))" -llvmins(::Val{:abs}, N, ::Type{T}) where {T<:FloatingTypes} = - "@llvm.fabs.$(suffix(N,T))" -llvmins(::Val{:ceil}, N, ::Type{T}) where {T<:FloatingTypes} = - "@llvm.ceil.$(suffix(N,T))" -llvmins(::Val{:copysign}, N, ::Type{T}) where {T<:FloatingTypes} = - "@llvm.copysign.$(suffix(N,T))" -llvmins(::Val{:cos}, N, ::Type{T}) where {T<:FloatingTypes} = - "@llvm.cos.$(suffix(N,T))" -llvmins(::Val{:exp}, N, ::Type{T}) where {T<:FloatingTypes} = - "@llvm.exp.$(suffix(N,T))" -llvmins(::Val{:exp2}, N, ::Type{T}) where {T<:FloatingTypes} = - "@llvm.exp2.$(suffix(N,T))" -llvmins(::Val{:floor}, N, ::Type{T}) where {T<:FloatingTypes} = - "@llvm.floor.$(suffix(N,T))" -llvmins(::Val{:fma}, N, ::Type{T}) where {T<:FloatingTypes} = - "@llvm.fma.$(suffix(N,T))" -llvmins(::Val{:log}, N, ::Type{T}) where {T<:FloatingTypes} = - "@llvm.log.$(suffix(N,T))" -llvmins(::Val{:log10}, N, ::Type{T}) where {T<:FloatingTypes} = - "@llvm.log10.$(suffix(N,T))" -llvmins(::Val{:log2}, N, ::Type{T}) where {T<:FloatingTypes} = - "@llvm.log2.$(suffix(N,T))" -llvmins(::Val{:max}, N, ::Type{T}) where {T<:FloatingTypes} = - "@llvm.maxnum.$(suffix(N,T))" -llvmins(::Val{:min}, N, ::Type{T}) where {T<:FloatingTypes} = - "@llvm.minnum.$(suffix(N,T))" -# llvmins(::Val{:max}, N, ::Type{T}) where {T<:FloatingTypes} = -# "@llvm.maximum.$(suffix(N,T))" -# llvmins(::Val{:min}, N, ::Type{T}) where {T<:FloatingTypes} = -# "@llvm.minimum.$(suffix(N,T))" -llvmins(::Val{:muladd}, N, ::Type{T}) 
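The `llvmins` table above maps Julia operators to bare LLVM instruction names that get spliced into generated IR. As a standalone sketch of the pattern (plain Base Julia, no SIMD.jl required; `Vec4f` and `vadd` are illustrative names, not package API):

```julia
# NTuple{N, VecElement{T}} lowers to an LLVM vector type, so an entry
# from the table above ("fadd" for :+ on floats) can be used directly
# in a tiny llvmcall body.
const Vec4f = NTuple{4, VecElement{Float32}}

@inline vadd(a::Vec4f, b::Vec4f) = Base.llvmcall(
    """
    %res = fadd <4 x float> %0, %1
    ret <4 x float> %res
    """,
    Vec4f, Tuple{Vec4f, Vec4f}, a, b)

a = map(VecElement, (1.0f0, 2.0f0, 3.0f0, 4.0f0))
b = map(VecElement, (10.0f0, 20.0f0, 30.0f0, 40.0f0))
c = vadd(a, b)  # lanes: 11.0f0, 22.0f0, 33.0f0, 44.0f0
```

This is the same shape the removed `llvmwrap` helpers below generate programmatically for any operator in the table.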
where {T<:FloatingTypes} = - "@llvm.fmuladd.$(suffix(N,T))" -llvmins(::Val{:powi}, N, ::Type{T}) where {T<:FloatingTypes} = - "@llvm.powi.$(suffix(N,T))" -llvmins(::Val{:round}, N, ::Type{T}) where {T<:FloatingTypes} = - "@llvm.rint.$(suffix(N,T))" -llvmins(::Val{:sin}, N, ::Type{T}) where {T<:FloatingTypes} = - "@llvm.sin.$(suffix(N,T))" -llvmins(::Val{:sqrt}, N, ::Type{T}) where {T<:FloatingTypes} = - "@llvm.sqrt.$(suffix(N,T))" -llvmins(::Val{:trunc}, N, ::Type{T}) where {T<:FloatingTypes} = - "@llvm.trunc.$(suffix(N,T))" - -# Convert between LLVM scalars, vectors, and arrays - -function scalar2vector(vec, siz, typ, sca) - instrs = [] - accum(nam, i) = i<0 ? "undef" : i==siz-1 ? nam : "$(nam)_iter$i" - for i in 0:siz-1 - push!(instrs, - "$(accum(vec,i)) = " * - "insertelement <$siz x $typ> $(accum(vec,i-1)), " * - "$typ $sca, i32 $i") - end - instrs -end - -function array2vector(vec, siz, typ, arr, tmp="$(arr)_av") - instrs = [] - accum(nam, i) = i<0 ? "undef" : i==siz-1 ? nam : "$(nam)_iter$i" - for i in 0:siz-1 - push!(instrs, "$(tmp)_elem$i = extractvalue [$siz x $typ] $arr, $i") - push!(instrs, - "$(accum(vec,i)) = " * - "insertelement <$siz x $typ> $(accum(vec,i-1)), " * - "$typ $(tmp)_elem$i, i32 $i") - end - instrs -end - -function vector2array(arr, siz, typ, vec, tmp="$(vec)_va") - instrs = [] - accum(nam, i) = i<0 ? "undef" : i==siz-1 ? nam : "$(nam)_iter$i" - for i in 0:siz-1 - push!(instrs, - "$(tmp)_elem$i = extractelement <$siz x $typ> $vec, i32 $i") - push!(instrs, - "$(accum(arr,i)) = "* - "insertvalue [$siz x $typ] $(accum(arr,i-1)), " * - "$typ $(tmp)_elem$i, $i") - end - instrs -end - -# TODO: change argument order -function subvector(vec, siz, typ, rvec, rsiz, roff, tmp="$(rvec)_sv") - instrs = [] - accum(nam, i) = i<0 ? "undef" : i==rsiz-1 ? 
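The `@llvm.*` entries differ from the bare instructions in that the generated IR also needs a `declare`; `Base.llvmcall` accepts a `(declarations, body)` tuple for exactly this case. A sketch assuming 4 lanes of `Float64` (`v4f64` being the name-mangling suffix that `suffix(N,T)` produces); `vabs` and `Vec4d` are illustrative names:

```julia
# Calling a declared LLVM intrinsic on a vector, mirroring the
# declare/call pair that llvmwrap emits for "@..."-prefixed entries.
const Vec4d = NTuple{4, VecElement{Float64}}

@inline vabs(a::Vec4d) = Base.llvmcall(
    ("declare <4 x double> @llvm.fabs.v4f64(<4 x double>)",
     """
     %res = call <4 x double> @llvm.fabs.v4f64(<4 x double> %0)
     ret <4 x double> %res
     """),
    Vec4d, Tuple{Vec4d}, a)

vabs(map(VecElement, (-1.5, 2.0, -3.0, 4.0)))  # lanes: 1.5, 2.0, 3.0, 4.0
```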
nam : "$(nam)_iter$i" - @assert 0 <= roff - @assert roff + rsiz <= siz - for i in 0:rsiz-1 - push!(instrs, - "$(tmp)_elem$i = extractelement <$siz x $typ> $vec, i32 $(roff+i)") - push!(instrs, - "$(accum(rvec,i)) = " * - "insertelement <$rsiz x $typ> $(accum(rvec,i-1)), " * - "$typ $(tmp)_elem$i, i32 $i") - end - instrs -end - -function extendvector(vec, siz, typ, voff, vsiz, val, rvec, tmp="$(rvec)_ev") - instrs = [] - accum(nam, i) = i<0 ? "undef" : i==siz+vsiz-1 ? nam : "$(nam)_iter$i" - rsiz = siz + vsiz - for i in 0:siz-1 - push!(instrs, - "$(tmp)_elem$i = extractelement <$siz x $typ> $vec, i32 $i") - push!(instrs, - "$(accum(rvec,i)) = " * - "insertelement <$rsiz x $typ> $(accum(rvec,i-1)), " * - "$typ $(tmp)_elem$i, i32 $i") - end - for i in siz:siz+vsiz-1 - push!(instrs, - "$(accum(rvec,i)) = " * - "insertelement <$rsiz x $typ> $(accum(rvec,i-1)), $val, i32 $i") - end - instrs -end - -# Element-wise access - -export setindex -@generated function setindex(v::Vec{N,T}, x::Number, ::Val{I}) where {N,T,I} - @assert isa(I, Integer) - 1 <= I <= N || throw(BoundsError()) - typ = llvmtype(T) - ityp = llvmtype(Int) - vtyp = "<$N x $typ>" - decls = [] - instrs = [] - push!(instrs, "%res = insertelement $vtyp %0, $typ %1, $ityp $(I-1)") - push!(instrs, "ret $vtyp %res") - quote - $(Expr(:meta, :inline)) - Vec{N,T}(Base.llvmcall($((join(decls, "\n"), join(instrs, "\n"))), - NTuple{N,VE{T}}, Tuple{NTuple{N,VE{T}}, T}, v.elts, T(x))) - end -end -@inline function setindex(v::Vec{N,T}, x::Number, ::Type{Val{I}}) where {N,T,I} - setindex(v, x, Val(I)) -end - -@generated function setindex(v::Vec{N,T}, x::Number, i::Int) where {N,T} - typ = llvmtype(T) - ityp = llvmtype(Int) - vtyp = "<$N x $typ>" - decls = [] - instrs = [] - push!(instrs, "%res = insertelement $vtyp %0, $typ %2, $ityp %1") - push!(instrs, "ret $vtyp %res") - quote - $(Expr(:meta, :inline)) - @boundscheck 1 <= i <= N || throw(BoundsError()) - Vec{N,T}(Base.llvmcall($((join(decls, "\n"), join(instrs, "\n"))), 
- NTuple{N,VE{T}}, Tuple{NTuple{N,VE{T}}, Int, T}, - v.elts, i-1, T(x))) - end -end -setindex(v::Vec{N,T}, x::Number, i) where {N,T} = setindex(v, Int(i), x) - -Base.@propagate_inbounds Base.getindex(v::Vec{N,T}, ::Val{I}) where {N,T,I} = v.elts[I].value -Base.@propagate_inbounds Base.getindex(v::Vec{N,T}, ::Type{Val{I}}) where {N,T,I} = Base.getindex(v, Val(I)) -Base.@propagate_inbounds Base.getindex(v::Vec{N,T}, i) where {N,T} = v.elts[i].value - -# Type conversion - -@generated function Base.reinterpret(::Type{Vec{N,R}}, - v1::Vec{N1,T1}) where {N,R,N1,T1} - @assert N*sizeof(R) == N1*sizeof(T1) - typ1 = llvmtype(T1) - vtyp1 = "<$N1 x $typ1>" - typr = llvmtype(R) - vtypr = "<$N x $typr>" - decls = [] - instrs = [] - push!(instrs, "%res = bitcast $vtyp1 %0 to $vtypr") - push!(instrs, "ret $vtypr %res") - quote - $(Expr(:meta, :inline)) - Vec{N,R}(Base.llvmcall($((join(decls, "\n"), join(instrs, "\n"))), - NTuple{N,VE{R}}, Tuple{NTuple{N1,VE{T1}}}, v1.elts)) - end -end - -# Generic function wrappers - -# Functions taking one argument -@generated function llvmwrap(::Val{Op}, v1::Vec{N,T1}, - ::Type{R} = T1) where {Op,N,T1,R} - @assert isa(Op, Symbol) - typ1 = llvmtype(T1) - vtyp1 = "<$N x $typ1>" - typr = llvmtype(R) - vtypr = "<$N x $typr>" - ins = llvmins(Val(Op), N, T1) - decls = [] - instrs = [] - if ins[1] == '@' - push!(decls, "declare $vtypr $ins($vtyp1)") - push!(instrs, "%res = call $vtypr $ins($vtyp1 %0)") - else - if Op === :~ - @assert T1 <: IntegerTypes - otherval = -1 - elseif Op === :inv - @assert T1 <: FloatingTypes - otherval = 1.0 - else - otherval = 0 - end - otherarg = llvmconst(N, T1, otherval) - push!(instrs, "%res = $ins $vtyp1 $otherarg, %0") - end - push!(instrs, "ret $vtypr %res") - quote - $(Expr(:meta, :inline)) - Vec{N,R}(Base.llvmcall($((join(decls, "\n"), join(instrs, "\n"))), - NTuple{N,VE{R}}, Tuple{NTuple{N,VE{T1}}}, v1.elts)) - end -end - -# Functions taking one Bool argument -@generated function llvmwrap(::Val{Op}, 
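The `setindex` methods above reduce to a single `insertelement` instruction; LLVM lane indices are zero-based, hence the `I-1`/`i-1` adjustments. A standalone sketch (assumes a 64-bit platform where `Int === Int64`; `setlane` and `Vec4i` are hypothetical names):

```julia
const Vec4i = NTuple{4, VecElement{Int64}}

# %0 = vector, %1 = zero-based lane index, %2 = new value;
# insertelement returns a copy of the vector with that lane replaced.
@inline setlane(v::Vec4i, x::Int64, i::Int64) = Base.llvmcall(
    """
    %res = insertelement <4 x i64> %0, i64 %2, i64 %1
    ret <4 x i64> %res
    """,
    Vec4i, Tuple{Vec4i, Int64, Int64}, v, i - 1, x)

v = map(VecElement, (1, 2, 3, 4))
w = setlane(v, Int64(42), Int64(2))  # lanes: 1, 42, 3, 4
```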
v1::Vec{N,Bool}, - ::Type{Bool} = Bool) where {Op,N} - @assert isa(Op, Symbol) - btyp = llvmtype(Bool) - vbtyp = "<$N x $btyp>" - ins = llvmins(Val(Op), N, Bool) - decls = [] - instrs = [] - push!(instrs, "%arg1 = trunc $vbtyp %0 to <$N x i1>") - otherarg = llvmconst(N, Bool, true) - push!(instrs, "%res = $ins <$N x i1> $otherarg, %arg1") - push!(instrs, "%resb = zext <$N x i1> %res to $vbtyp") - push!(instrs, "ret $vbtyp %resb") - quote - $(Expr(:meta, :inline)) - Vec{N,Bool}(Base.llvmcall($((join(decls, "\n"), join(instrs, "\n"))), - NTuple{N,VE{Bool}}, Tuple{NTuple{N,VE{Bool}}}, v1.elts)) - end -end - -# Functions taking two arguments -@generated function llvmwrap(::Val{Op}, v1::Vec{N,T1}, v2::Vec{N,T2}, - ::Type{R} = T1) where {Op,N,T1,T2,R} - @assert isa(Op, Symbol) - typ1 = llvmtype(T1) - vtyp1 = "<$N x $typ1>" - typ2 = llvmtype(T2) - vtyp2 = "<$N x $typ2>" - typr = llvmtype(R) - vtypr = "<$N x $typr>" - ins = llvmins(Val(Op), N, T1) - decls = [] - instrs = [] - if ins[1] == '@' - push!(decls, "declare $vtypr $ins($vtyp1, $vtyp2)") - push!(instrs, "%res = call $vtypr $ins($vtyp1 %0, $vtyp2 %1)") - else - push!(instrs, "%res = $ins $vtyp1 %0, %1") - end - push!(instrs, "ret $vtypr %res") - quote - $(Expr(:meta, :inline)) - Vec{N,R}(Base.llvmcall($((join(decls, "\n"), join(instrs, "\n"))), - NTuple{N,VE{R}}, Tuple{NTuple{N,VE{T1}}, NTuple{N,VE{T2}}}, - v1.elts, v2.elts)) - end -end - -# Functions taking two arguments, second argument is a scalar -@generated function llvmwrap(::Val{Op}, v1::Vec{N,T1}, s2::ScalarTypes, - ::Type{R} = T1) where {Op,N,T1,R} - @assert isa(Op, Symbol) - typ1 = llvmtype(T1) - vtyp1 = "<$N x $typ1>" - typ2 = llvmtype(s2) - typr = llvmtype(R) - vtypr = "<$N x $typr>" - ins = llvmins(Val(Op), N, T1) - decls = [] - instrs = [] - if ins[1] == '@' - push!(decls, "declare $vtypr $ins($vtyp1, $typ2)") - push!(instrs, "%res = call $vtypr $ins($vtyp1 %0, $typ2 %1)") - else - push!(instrs, "%res = $ins $vtyp1 %0, %1") - end - push!(instrs, "ret 
$vtypr %res") - quote - $(Expr(:meta, :inline)) - Vec{N,R}(Base.llvmcall($((join(decls, "\n"), join(instrs, "\n"))), - NTuple{N,VE{R}}, Tuple{NTuple{N,VE{T1}}, $s2}, - v1.elts, s2)) - end -end - -# Functions taking two arguments, returning Bool -@generated function llvmwrap(::Val{Op}, v1::Vec{N,T1}, v2::Vec{N,T2}, - ::Type{Bool}) where {Op,N,T1,T2} - @assert isa(Op, Symbol) - btyp = llvmtype(Bool) - vbtyp = "<$N x $btyp>" - abtyp = "[$N x $btyp]" - typ1 = llvmtype(T1) - vtyp1 = "<$N x $typ1>" - atyp1 = "[$N x $typ1]" - typ2 = llvmtype(T2) - vtyp2 = "<$N x $typ2>" - atyp2 = "[$N x $typ2]" - ins = llvmins(Val(Op), N, T1) - decls = [] - instrs = [] - if false && N == 1 - append!(instrs, array2vector("%arg1", N, typ1, "%0", "%arg1arr")) - append!(instrs, array2vector("%arg2", N, typ2, "%1", "%arg2arr")) - push!(instrs, "%cond = $ins $vtyp1 %arg1, %arg2") - push!(instrs, "%res = zext <$N x i1> %cond to $vbtyp") - append!(instrs, vector2array("%resarr", N, btyp, "%res")) - push!(instrs, "ret $abtyp %resarr") - else - push!(instrs, "%res = $ins $vtyp1 %0, %1") - push!(instrs, "%resb = zext <$N x i1> %res to $vbtyp") - push!(instrs, "ret $vbtyp %resb") - end - quote - $(Expr(:meta, :inline)) - Vec{N,Bool}(Base.llvmcall($((join(decls, "\n"), join(instrs, "\n"))), - NTuple{N,VE{Bool}}, Tuple{NTuple{N,VE{T1}}, NTuple{N,VE{T2}}}, - v1.elts, v2.elts)) - end -end - -# Functions taking a vector and a scalar argument -# @generated function llvmwrap{Op,N,T1,T2,R}(::Val{Op}, v1::Vec{N,T1}, -# x2::T2, ::Type{R} = T1) -# @assert isa(Op, Symbol) -# typ1 = llvmtype(T1) -# atyp1 = "[$N x $typ1]" -# vtyp1 = "<$N x $typ1>" -# typ2 = llvmtype(T2) -# typr = llvmtype(R) -# atypr = "[$N x $typr]" -# vtypr = "<$N x $typr>" -# ins = llvmins(Val(Op), N, T1) -# decls = [] -# instrs = [] -# append!(instrs, array2vector("%arg1", N, typ1, "%0", "%arg1arr")) -# if ins[1] == '@' -# push!(decls, "declare $vtypr $ins($vtyp1, $typ2)") -# push!(instrs, "%res = call $vtypr $ins($vtyp1 %arg1, $typ2 %1)") -# 
else -# push!(instrs, "%res = $ins $vtyp1 %arg1, %1") -# end -# append!(instrs, vector2array("%resarr", N, typr, "%res")) -# push!(instrs, "ret $atypr %resarr") -# quote -# $(Expr(:meta, :inline)) -# Vec{N,R}(Base.llvmcall($((join(decls, "\n"), join(instrs, "\n"))), -# NTuple{N,R}, Tuple{NTuple{N,T1}, T2}, v1.elts, x2)) -# end -# end - -# Functions taking two Bool arguments, returning Bool -@generated function llvmwrap(::Val{Op}, v1::Vec{N,Bool}, v2::Vec{N,Bool}, - ::Type{Bool} = Bool) where {Op,N} - @assert isa(Op, Symbol) - btyp = llvmtype(Bool) - vbtyp = "<$N x $btyp>" - ins = llvmins(Val(Op), N, Bool) - decls = [] - instrs = [] - push!(instrs, "%arg1 = trunc $vbtyp %0 to <$N x i1>") - push!(instrs, "%arg2 = trunc $vbtyp %1 to <$N x i1>") - push!(instrs, "%res = $ins <$N x i1> %arg1, %arg2") - push!(instrs, "%resb = zext <$N x i1> %res to $vbtyp") - push!(instrs, "ret $vbtyp %resb") - quote - $(Expr(:meta, :inline)) - Vec{N,Bool}(Base.llvmcall($((join(decls, "\n"), join(instrs, "\n"))), - NTuple{N,VE{Bool}}, Tuple{NTuple{N,VE{Bool}}, NTuple{N,VE{Bool}}}, - v1.elts, v2.elts)) - end -end - -# Functions taking three arguments -@generated function llvmwrap(::Val{Op}, v1::Vec{N,T1}, v2::Vec{N,T2}, - v3::Vec{N,T3}, ::Type{R} = T1) where {Op,N,T1,T2,T3,R} - @assert isa(Op, Symbol) - typ1 = llvmtype(T1) - vtyp1 = "<$N x $typ1>" - typ2 = llvmtype(T2) - vtyp2 = "<$N x $typ2>" - typ3 = llvmtype(T3) - vtyp3 = "<$N x $typ3>" - typr = llvmtype(R) - vtypr = "<$N x $typr>" - ins = llvmins(Val(Op), N, T1) - decls = [] - instrs = [] - if ins[1] == '@' - push!(decls, "declare $vtypr $ins($vtyp1, $vtyp2, $vtyp3)") - push!(instrs, - "%res = call $vtypr $ins($vtyp1 %0, $vtyp2 %1, $vtyp3 %2)") - else - push!(instrs, "%res = $ins $vtyp1 %0, %1, %2") - end - push!(instrs, "ret $vtypr %res") - quote - $(Expr(:meta, :inline)) - Vec{N,R}(Base.llvmcall($((join(decls, "\n"), join(instrs, "\n"))), - NTuple{N,VE{R}}, - Tuple{NTuple{N,VE{T1}}, NTuple{N,VE{T2}}, NTuple{N,VE{T3}}}, - v1.elts, 
v2.elts, v3.elts)) - end -end - -@generated function llvmwrapshift(::Val{Op}, v1::Vec{N,T}, - ::Val{I}) where {Op,N,T,I} - @assert isa(Op, Symbol) - if I >= 0 - op = Op - i = I - else - if Op === :>> || Op === :>>> - op = :<< - else - @assert Op === :<< - if T <: Unsigned - op = :>>> - else - op = :>> - end - end - i = -I - end - @assert op in (:<<, :>>, :>>>) - @assert i >= 0 - typ = llvmtype(T) - vtyp = "<$N x $typ>" - ins = llvmins(Val(op), N, T) - decls = [] - instrs = [] - nbits = 8*sizeof(T) - if (op === :>> && T <: IntTypes) || i < nbits - count = llvmconst(N, T, min(nbits-1, i)) - push!(instrs, "%res = $ins $vtyp %0, $count") - push!(instrs, "ret $vtyp %res") - else - zero = llvmconst(N, T, 0) - push!(instrs, "return $vtyp $zero") - end - quote - $(Expr(:meta, :inline)) - Vec{N,T}(Base.llvmcall($((join(decls, "\n"), join(instrs, "\n"))), - NTuple{N,VE{T}}, Tuple{NTuple{N,VE{T}}}, v1.elts)) - end -end - -@generated function llvmwrapshift(::Val{Op}, v1::Vec{N,T}, - x2::Unsigned) where {Op,N,T} - @assert isa(Op, Symbol) - typ = llvmtype(T) - vtyp = "<$N x $typ>" - ins = llvmins(Val(Op), N, T) - decls = [] - instrs = [] - append!(instrs, scalar2vector("%count", N, typ, "%1")) - nbits = 8*sizeof(T) - push!(instrs, "%tmp = $ins $vtyp %0, %count") - push!(instrs, "%inbounds = icmp ult $typ %1, $nbits") - if Op === :>> && T <: IntTypes - nbits1 = llvmconst(N, T, 8*sizeof(T)-1) - push!(instrs, "%limit = $ins $vtyp %0, $nbits1") - push!(instrs, "%res = select i1 %inbounds, $vtyp %tmp, $vtyp %limit") - else - zero = llvmconst(N, T, 0) - push!(instrs, "%res = select i1 %inbounds, $vtyp %tmp, $vtyp $zero") - end - push!(instrs, "ret $vtyp %res") - quote - $(Expr(:meta, :inline)) - # Note that this function might be called with out-of-bounds - # values for x2, assuming that the results are then ignored - Vec{N,T}(Base.llvmcall($((join(decls, "\n"), join(instrs, "\n"))), - NTuple{N,VE{T}}, Tuple{NTuple{N,VE{T}}, T}, v1.elts, x2 % T)) - end -end - -@generated function 
llvmwrapshift(::Val{Op}, v1::Vec{N,T}, - x2::Integer) where {Op,N,T} - if Op === :>> || Op === :>>> - NegOp = :<< - else - @assert Op === :<< - if T <: Unsigned - NegOp = :>>> - else - NegOp = :>> - end - end - ValOp = Val(Op) - ValNegOp = Val(NegOp) - quote - $(Expr(:meta, :inline)) - ifelse(x2 >= 0, - llvmwrapshift($ValOp, v1, unsigned(x2)), - llvmwrapshift($ValNegOp, v1, unsigned(-x2))) - end -end - -@generated function llvmwrapshift(::Val{Op}, v1::Vec{N,T}, - v2::Vec{N,U}) where {Op,N,T,U<:UIntTypes} - @assert isa(Op, Symbol) - typ = llvmtype(T) - vtyp = "<$N x $typ>" - ins = llvmins(Val(Op), N, T) - decls = [] - instrs = [] - push!(instrs, "%tmp = $ins $vtyp %0, %1") - nbits = llvmconst(N, T, 8*sizeof(T)) - push!(instrs, "%inbounds = icmp ult $vtyp %1, $nbits") - if Op === :>> && T <: IntTypes - nbits1 = llvmconst(N, T, 8*sizeof(T)-1) - push!(instrs, "%limit = $ins $vtyp %0, $nbits1") - push!(instrs, - "%res = select <$N x i1> %inbounds, $vtyp %tmp, $vtyp %limit") - else - zero = llvmconst(N, T, 0) - push!(instrs, - "%res = select <$N x i1> %inbounds, $vtyp %tmp, $vtyp $zero") - end - push!(instrs, "ret $vtyp %res") - quote - $(Expr(:meta, :inline)) - Vec{N,T}(Base.llvmcall($((join(decls, "\n"), join(instrs, "\n"))), - NTuple{N,VE{T}}, Tuple{NTuple{N,VE{T}}, NTuple{N,VE{T}}}, - v1.elts, (v2 % Vec{N,T}).elts)) - end -end - -@generated function llvmwrapshift(::Val{Op}, v1::Vec{N,T}, - v2::Vec{N,U}) where {Op,N,T,U<:IntegerTypes} - if Op === :>> || Op === :>>> - NegOp = :<< - else - @assert Op === :<< - if T <: Unsigned - NegOp = :>>> - else - NegOp = :>> - end - end - ValOp = Val(Op) - ValNegOp = Val(NegOp) - quote - $(Expr(:meta, :inline)) - vifelse(v2 >= 0, - llvmwrapshift($ValOp, v1, v2 % Vec{N,unsigned(U)}), - llvmwrapshift($ValNegOp, v1, -v2 % Vec{N,unsigned(U)})) - end -end - -# Conditionals - -for op in (:(==), :(!=), :(<), :(<=), :(>), :(>=)) - @eval begin - @inline Base.$op(v1::Vec{N,T}, v2::Vec{N,T}) where {N,T} = - llvmwrap(Val($(QuoteNode(op))), v1, 
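The `llvmwrapshift` variants above exist because LLVM's `shl`/`lshr`/`ashr` produce an undefined result once the count reaches the bit width, so the generated IR guards the shift with an `icmp ult` plus `select`, and a negative count is flipped into the opposite shift. A scalar sketch of the same semantics in plain Julia (hypothetical names; Julia's own scalar `<<` is defined with an equivalent guard):

```julia
# The `icmp ult` + `select` guard from the generated IR, as plain Julia.
function clamped_shl(x::T, count::Integer) where {T<:Unsigned}
    nbits = 8 * sizeof(T)
    ifelse(count < nbits, x << count, zero(T))
end

# Negative counts flip to the opposite shift, as in the `x2::Integer`
# and `U<:IntegerTypes` methods above.
function shl_any(x::T, count::Integer) where {T<:Unsigned}
    count >= 0 ? clamped_shl(x, count) :
        ifelse(-count < 8 * sizeof(T), x >> -count, zero(T))
end

clamped_shl(0x01, 3)  # 0x08
clamped_shl(0x01, 9)  # 0x00: a count >= 8 saturates to zero
shl_any(0x80, -3)     # 0x10: a negative count becomes a right shift
```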
v2, Bool) - end -end -@inline function Base.cmp(v1::Vec{N,T}, v2::Vec{N,T}) where {N,T} - I = int_type(T) - vifelse(isequal(v1, v2), Vec{N,I}(0), - vifelse(isless(v1, v2), Vec{N,I}(-1), Vec{N,I}(1))) -end -@inline function Base.isfinite(v1::Vec{N,T}) where {N,T<:FloatingTypes} - U = uint_type(T) - em = Vec{N,U}(exponent_mask(T)) - iv = reinterpret(Vec{N,U}, v1) - iv & em != em -end -@inline Base.isinf(v1::Vec{N,T}) where {N,T<:FloatingTypes} = abs(v1) == Vec{N,T}(Inf) -@inline Base.isnan(v1::Vec{N,T}) where {N,T<:FloatingTypes} = v1 != v1 -@inline function Base.issubnormal(v1::Vec{N,T}) where {N,T<:FloatingTypes} - U = uint_type(T) - em = Vec{N,U}(exponent_mask(T)) - sm = Vec{N,U}(significand_mask(T)) - iv = reinterpret(Vec{N,U}, v1) - (iv & em == Vec{N,U}(0)) & (iv & sm != Vec{N,U}(0)) -end -@inline function Base.signbit(v1::Vec{N,T}) where {N,T<:FloatingTypes} - U = uint_type(T) - sm = Vec{N,U}(sign_mask(T)) - iv = reinterpret(Vec{N,U}, v1) - iv & sm != Vec{N,U}(0) -end - -export vifelse -vifelse(c::Bool, x, y) = ifelse(c, x, y) -@generated function vifelse(v1::Vec{N,Bool}, v2::Vec{N,T}, - v3::Vec{N,T}) where {N,T} - btyp = llvmtype(Bool) - vbtyp = "<$N x $btyp>" - abtyp = "[$N x $btyp]" - typ = llvmtype(T) - vtyp = "<$N x $typ>" - atyp = "[$N x $typ]" - decls = [] - instrs = [] - if false && N == 1 - append!(instrs, array2vector("%arg1", N, btyp, "%0", "%arg1arr")) - append!(instrs, array2vector("%arg2", N, typ, "%1", "%arg2arr")) - append!(instrs, array2vector("%arg3", N, typ, "%2", "%arg3arr")) - push!(instrs, "%cond = trunc $vbtyp %arg1 to <$N x i1>") - push!(instrs, "%res = select <$N x i1> %cond, $vtyp %arg2, $vtyp %arg3") - append!(instrs, vector2array("%resarr", N, typ, "%res")) - push!(instrs, "ret $atyp %resarr") - else - push!(instrs, "%cond = trunc $vbtyp %0 to <$N x i1>") - push!(instrs, "%res = select <$N x i1> %cond, $vtyp %1, $vtyp %2") - push!(instrs, "ret $vtyp %res") - end - quote - $(Expr(:meta, :inline)) - 
Vec{N,T}(Base.llvmcall($((join(decls, "\n"), join(instrs, "\n"))), - NTuple{N,VE{T}}, - Tuple{NTuple{N,VE{Bool}}, NTuple{N,VE{T}}, NTuple{N,VE{T}}}, - v1.elts, v2.elts, v3.elts)) - end -end - -# Integer arithmetic functions - -for op in (:~, :+, :-) - @eval begin - @inline Base.$op(v1::Vec{N,T}) where {N,T<:IntegerTypes} = - llvmwrap(Val($(QuoteNode(op))), v1) - end -end -@inline Base.:!(v1::Vec{N,Bool}) where {N} = ~v1 -@inline function Base.abs(v1::Vec{N,T}) where {N,T<:IntTypes} - # s = -Vec{N,T}(signbit(v1)) - s = v1 >> Val(8*sizeof(T)) - # Note: -v1 == ~v1 + 1 - (s ⊻ v1) - s -end -@inline Base.abs(v1::Vec{N,T}) where {N,T<:UIntTypes} = v1 -# TODO: Try T(v1>0) - T(v1<0) -# use a shift for v1<0 -# evaluate v1>0 as -v1<0 ? -@inline Base.sign(v1::Vec{N,T}) where {N,T<:IntTypes} = - vifelse(v1 == Vec{N,T}(0), Vec{N,T}(0), - vifelse(v1 < Vec{N,T}(0), Vec{N,T}(-1), Vec{N,T}(1))) -@inline Base.sign(v1::Vec{N,T}) where {N,T<:UIntTypes} = - vifelse(v1 == Vec{N,T}(0), Vec{N,T}(0), Vec{N,T}(1)) -@inline Base.signbit(v1::Vec{N,T}) where {N,T<:IntTypes} = v1 < Vec{N,T}(0) -@inline Base.signbit(v1::Vec{N,T}) where {N,T<:UIntTypes} = Vec{N,Bool}(false) - -for op in (:&, :|, :⊻, :+, :-, :*, :div, :rem) - @eval begin - @inline Base.$op(v1::Vec{N,T}, v2::Vec{N,T}) where {N,T<:IntegerTypes} = - llvmwrap(Val($(QuoteNode(op))), v1, v2) - end -end -@inline Base.copysign(v1::Vec{N,T}, v2::Vec{N,T}) where {N,T<:IntTypes} = - vifelse(signbit(v2), -abs(v1), abs(v1)) -@inline Base.copysign(v1::Vec{N,T}, v2::Vec{N,T}) where {N,T<:UIntTypes} = v1 -@inline Base.flipsign(v1::Vec{N,T}, v2::Vec{N,T}) where {N,T<:IntTypes} = - vifelse(signbit(v2), -v1, v1) -@inline Base.flipsign(v1::Vec{N,T}, v2::Vec{N,T}) where {N,T<:UIntTypes} = v1 -@inline Base.max(v1::Vec{N,T}, v2::Vec{N,T}) where {N,T<:IntegerTypes} = - vifelse(v1>=v2, v1, v2) -@inline Base.min(v1::Vec{N,T}, v2::Vec{N,T}) where {N,T<:IntegerTypes} = - vifelse(v1>=v2, v2, v1) - -@inline function Base.muladd(v1::Vec{N,T}, v2::Vec{N,T}, - 
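`vifelse` above lowers to `select` (see its `llvmins` entry), with the `Vec{N,Bool}` condition stored as `<N x i8>` and truncated to `<N x i1>` first. A standalone sketch with illustrative names (`vifelse4`, `VB4`, `VF4`), assuming `Bool` lowers to `i8` as in the code above:

```julia
const VB4 = NTuple{4, VecElement{Bool}}
const VF4 = NTuple{4, VecElement{Float64}}

# trunc the i8-backed Bool lanes to i1, then select lanewise.
@inline vifelse4(c::VB4, a::VF4, b::VF4) = Base.llvmcall(
    """
    %cond = trunc <4 x i8> %0 to <4 x i1>
    %res = select <4 x i1> %cond, <4 x double> %1, <4 x double> %2
    ret <4 x double> %res
    """,
    VF4, Tuple{VB4, VF4, VF4}, c, a, b)

m = map(VecElement, (true, false, true, false))
p = map(VecElement, (1.0, 2.0, 3.0, 4.0))
q = map(VecElement, (-1.0, -2.0, -3.0, -4.0))
vifelse4(m, p, q)  # lanes: 1.0, -2.0, 3.0, -4.0
```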
v3::Vec{N,T}) where {N,T<:IntegerTypes} - v1*v2+v3 -end - -# TODO: Handle negative shift counts -# use vifelse -# ensure vifelse is efficient -for op in (:<<, :>>, :>>>) - @eval begin - @inline Base.$op(v1::Vec{N,T}, ::Val{I}) where {N,T<:IntegerTypes,I} = - llvmwrapshift(Val($(QuoteNode(op))), v1, Val(I)) - @inline Base.$op(v1::Vec{N,T}, ::Type{Val{I}}) where {N,T<:IntegerTypes,I} = - Base.$op(v1, Val(I)) - @inline Base.$op(v1::Vec{N,T}, x2::Unsigned) where {N,T<:IntegerTypes} = - llvmwrapshift(Val($(QuoteNode(op))), v1, x2) - @inline Base.$op(v1::Vec{N,T}, x2::Int) where {N,T<:IntegerTypes} = - llvmwrapshift(Val($(QuoteNode(op))), v1, x2) - @inline Base.$op(v1::Vec{N,T}, x2::Integer) where {N,T<:IntegerTypes} = - llvmwrapshift(Val($(QuoteNode(op))), v1, x2) - @inline Base.$op(v1::Vec{N,T}, - v2::Vec{N,U}) where {N,T<:IntegerTypes,U<:UIntTypes} = - llvmwrapshift(Val($(QuoteNode(op))), v1, v2) - @inline Base.$op(v1::Vec{N,T}, - v2::Vec{N,U}) where {N,T<:IntegerTypes,U<:IntegerTypes} = - llvmwrapshift(Val($(QuoteNode(op))), v1, v2) - @inline Base.$op(x1::T, v2::Vec{N,T}) where {N,T<:IntegerTypes} = - $op(Vec{N,T}(x1), v2) - end -end - -# Floating point arithmetic functions - -for op in ( - :+, :-, - :abs, :ceil, :cos, :exp, :exp2, :floor, :inv, :log, :log10, :log2, - :round, :sin, :sqrt, :trunc) - @eval begin - @inline Base.$op(v1::Vec{N,T}) where {N,T<:FloatingTypes} = - llvmwrap(Val($(QuoteNode(op))), v1) - end -end -@inline Base.exp10(v1::Vec{N,T}) where {N,T<:FloatingTypes} = Vec{N,T}(10)^v1 -@inline Base.sign(v1::Vec{N,T}) where {N,T<:FloatingTypes} = - vifelse(v1 == Vec{N,T}(0.0), Vec{N,T}(0.0), copysign(Vec{N,T}(1.0), v1)) - -for op in (:+, :-, :*, :/, :^, :copysign, :max, :min, :rem) - @eval begin - @inline Base.$op(v1::Vec{N,T}, v2::Vec{N,T}) where {N,T<:FloatingTypes} = - llvmwrap(Val($(QuoteNode(op))), v1, v2) - end -end -# Using `IntegerTypes` here so that this definition "wins" against -# `^(::ScalarTypes, v2::Vec)`. 
-@inline Base.:^(v1::Vec{N,T}, x2::IntegerTypes) where {N,T<:FloatingTypes} = - llvmwrap(Val(:powi), v1, Int(x2)) -@inline Base.:^(v1::Vec{N,T}, x2::Integer) where {N,T<:FloatingTypes} = - llvmwrap(Val(:powi), v1, Int(x2)) -@inline Base.flipsign(v1::Vec{N,T}, v2::Vec{N,T}) where {N,T<:FloatingTypes} = - vifelse(signbit(v2), -v1, v1) - -# Do what Base does for HWNumber: -@inline Base.literal_pow(::typeof(^), x::Vec, ::Val{0}) = one(typeof(x)) -@inline Base.literal_pow(::typeof(^), x::Vec, ::Val{1}) = x -@inline Base.literal_pow(::typeof(^), x::Vec, ::Val{2}) = x*x -@inline Base.literal_pow(::typeof(^), x::Vec, ::Val{3}) = x*x*x - -for op in (:fma, :muladd) - @eval begin - @inline function Base.$op(v1::Vec{N,T}, - v2::Vec{N,T}, v3::Vec{N,T}) where {N,T<:FloatingTypes} - llvmwrap(Val($(QuoteNode(op))), v1, v2, v3) - end - end -end - -# Type promotion - -# Promote scalars of all IntegerTypes to vectors of IntegerTypes, leaving the -# vector type unchanged - -for op in ( - :(==), :(!=), :(<), :(<=), :(>), :(>=), - :&, :|, :⊻, :+, :-, :*, :copysign, :div, :flipsign, :max, :min, :rem) - @eval begin - @inline Base.$op(s1::Bool, v2::Vec{N,Bool}) where {N} = - $op(Vec{N,Bool}(s1), v2) - @inline Base.$op(s1::IntegerTypes, v2::Vec{N,T}) where {N,T<:IntegerTypes} = - $op(Vec{N,T}(s1), v2) - @inline Base.$op(v1::Vec{N,T}, s2::IntegerTypes) where {N,T<:IntegerTypes} = - $op(v1, Vec{N,T}(s2)) - end -end -@inline vifelse(c::Vec{N,Bool}, s1::IntegerTypes, - v2::Vec{N,T}) where {N,T<:IntegerTypes} = - vifelse(c, Vec{N,T}(s1), v2) -@inline vifelse(c::Vec{N,Bool}, v1::Vec{N,T}, - s2::IntegerTypes) where {N,T<:IntegerTypes} = - vifelse(c, v1, Vec{N,T}(s2)) - -for op in (:muladd,) - @eval begin - @inline Base.$op(s1::IntegerTypes, v2::Vec{N,T}, - v3::Vec{N,T}) where {N,T<:IntegerTypes} = - $op(Vec{N,T}(s1), v2, v3) - @inline Base.$op(v1::Vec{N,T}, s2::IntegerTypes, - v3::Vec{N,T}) where {N,T<:IntegerTypes} = - $op(v1, Vec{N,T}(s2), v3) - @inline Base.$op(s1::IntegerTypes, 
s2::IntegerTypes, - v3::Vec{N,T}) where {N,T<:IntegerTypes} = - $op(Vec{N,T}(s1), Vec{N,T}(s2), v3) - @inline Base.$op(v1::Vec{N,T}, v2::Vec{N,T}, - s3::IntegerTypes) where {N,T<:IntegerTypes} = - $op(v1, v2, Vec{N,T}(s3)) - @inline Base.$op(s1::IntegerTypes, v2::Vec{N,T}, - s3::IntegerTypes) where {N,T<:IntegerTypes} = - $op(Vec{N,T}(s1), v2, Vec{N,T}(s3)) - @inline Base.$op(v1::Vec{N,T}, s2::IntegerTypes, - s3::IntegerTypes) where {N,T<:IntegerTypes} = - $op(v1, Vec{N,T}(s2), Vec{N,T}(s3)) - end -end - -# Promote scalars of all ScalarTypes to vectors of FloatingTypes, leaving the -# vector type unchanged - -for op in ( - :(==), :(!=), :(<), :(<=), :(>), :(>=), - :+, :-, :*, :/, :^, :copysign, :flipsign, :max, :min, :rem) - @eval begin - @inline Base.$op(s1::ScalarTypes, v2::Vec{N,T}) where {N,T<:FloatingTypes} = - $op(Vec{N,T}(s1), v2) - @inline Base.$op(v1::Vec{N,T}, s2::ScalarTypes) where {N,T<:FloatingTypes} = - $op(v1, Vec{N,T}(s2)) - end -end -@inline vifelse(c::Vec{N,Bool}, s1::ScalarTypes, - v2::Vec{N,T}) where {N,T<:FloatingTypes} = - vifelse(c, Vec{N,T}(s1), v2) -@inline vifelse(c::Vec{N,Bool}, v1::Vec{N,T}, - s2::ScalarTypes) where {N,T<:FloatingTypes} = - vifelse(c, v1, Vec{N,T}(s2)) - -for op in (:fma, :muladd) - @eval begin - @inline Base.$op(s1::ScalarTypes, v2::Vec{N,T}, - v3::Vec{N,T}) where {N,T<:FloatingTypes} = - $op(Vec{N,T}(s1), v2, v3) - @inline Base.$op(v1::Vec{N,T}, s2::ScalarTypes, - v3::Vec{N,T}) where {N,T<:FloatingTypes} = - $op(v1, Vec{N,T}(s2), v3) - @inline Base.$op(s1::ScalarTypes, s2::ScalarTypes, - v3::Vec{N,T}) where {N,T<:FloatingTypes} = - $op(Vec{N,T}(s1), Vec{N,T}(s2), v3) - @inline Base.$op(v1::Vec{N,T}, v2::Vec{N,T}, - s3::ScalarTypes) where {N,T<:FloatingTypes} = - $op(v1, v2, Vec{N,T}(s3)) - @inline Base.$op(s1::ScalarTypes, v2::Vec{N,T}, - s3::ScalarTypes) where {N,T<:FloatingTypes} = - $op(Vec{N,T}(s1), v2, Vec{N,T}(s3)) - @inline Base.$op(v1::Vec{N,T}, s2::ScalarTypes, - s3::ScalarTypes) where {N,T<:FloatingTypes} = - 
$op(v1, Vec{N,T}(s2), Vec{N,T}(s3)) - end -end - -# Poitner arithmetics between Ptr, IntegerTypes, and vectors of them. - -for op in (:+, :-) - @eval begin - @inline Base.$op(v1::Vec{N,<:Ptr}, v2::Vec{N,<:IntegerTypes}) where {N} = - llvmwrap(Val($(QuoteNode(op))), v1, v2) - @inline Base.$op(v1::Vec{N,<:IntegerTypes}, v2::Vec{N,<:Ptr}) where {N} = - llvmwrap(Val($(QuoteNode(op))), v1, v2) - @inline Base.$op(s1::P, v2::Vec{N,<:IntegerTypes}) where {N,P<:Ptr} = - $op(Vec{N,P}(s1), v2) - @inline Base.$op(v1::Vec{N,<:IntegerTypes}, s2::P) where {N,P<:Ptr} = - $op(v1, Vec{N,P}(s2)) - end -end - - -# Reduction operations - -# TODO: map, mapreduce - -function getneutral(op::Symbol, ::Type{T}) where T - zs = Dict{Symbol,T}() - if T <: IntegerTypes - zs[:&] = ~T(0) - zs[:|] = T(0) - end - zs[:max] = typemin(T) - zs[:min] = typemax(T) - zs[:+] = T(0) - zs[:*] = T(1) - zs[op] -end - -if VERSION >= v"0.7.0-beta2.195" - nextpow2(n) = nextpow(2, n) -end - -# We cannot pass in the neutral element via Val{}; if we try, Julia refuses to -# inline this function, which is then disastrous for performance -@generated function llvmwrapreduce(::Val{Op}, v::Vec{N,T}) where {Op,N,T} - @assert isa(Op, Symbol) - z = getneutral(Op, T) - typ = llvmtype(T) - decls = [] - instrs = [] - n = N - nam = "%0" - nold,n = n,nextpow2(n) - if n > nold - namold,nam = nam,"%vec_$n" - append!(instrs, - extendvector(namold, nold, typ, n, n-nold, - llvmtypedconst(T,z), nam)) - end - while n > 1 - nold,n = n, div(n, 2) - namold,nam = nam,"%vec_$n" - vtyp = "<$n x $typ>" - ins = llvmins(Val(Op), n, T) - append!(instrs, subvector(namold, nold, typ, "$(nam)_1", n, 0)) - append!(instrs, subvector(namold, nold, typ, "$(nam)_2", n, n)) - if ins[1] == '@' - push!(decls, "declare $vtyp $ins($vtyp, $vtyp)") - push!(instrs, - "$nam = call $vtyp $ins($vtyp $(nam)_1, $vtyp $(nam)_2)") - else - push!(instrs, "$nam = $ins $vtyp $(nam)_1, $(nam)_2") - end - end - push!(instrs, "%res = extractelement <$n x $typ> $nam, i32 0") 
- push!(instrs, "ret $typ %res") - quote - $(Expr(:meta, :inline)) - Base.llvmcall($((join(decls, "\n"), join(instrs, "\n"))), - T, Tuple{NTuple{N,VE{T}}}, v.elts) - end -end - -@inline Base.all(v::Vec{N,T}) where {N,T<:IntegerTypes} = llvmwrapreduce(Val(:&), v) -@inline Base.any(v::Vec{N,T}) where {N,T<:IntegerTypes} = llvmwrapreduce(Val(:|), v) -@inline Base.maximum(v::Vec{N,T}) where {N,T<:FloatingTypes} = - llvmwrapreduce(Val(:max), v) -@inline Base.minimum(v::Vec{N,T}) where {N,T<:FloatingTypes} = - llvmwrapreduce(Val(:min), v) -@inline Base.prod(v::Vec{N,T}) where {N,T} = llvmwrapreduce(Val(:*), v) -@inline Base.sum(v::Vec{N,T}) where {N,T} = llvmwrapreduce(Val(:+), v) - -@generated function Base.reduce(::Val{Op}, v::Vec{N,T}) where {Op,N,T} - @assert isa(Op, Symbol) - z = getneutral(Op, T) - stmts = [] - n = N - push!(stmts, :($(Symbol(:v,n)) = v)) - nold,n = n,nextpow2(n) - if n > nold - push!(stmts, - :($(Symbol(:v,n)) = Vec{$n,T}($(Expr(:tuple, - [:($(Symbol(:v,nold)).elts[$i]) for i in 1:nold]..., - [z for i in nold+1:n]...))))) - end - while n > 1 - nold,n = n, div(n, 2) - push!(stmts, - :($(Symbol(:v,n,"lo")) = Vec{$n,T}($(Expr(:tuple, - [:($(Symbol(:v,nold)).elts[$i]) for i in 1:n]...,))))) - push!(stmts, - :($(Symbol(:v,n,"hi")) = Vec{$n,T}($(Expr(:tuple, - [:($(Symbol(:v,nold)).elts[$i]) for i in n+1:nold]...))))) - push!(stmts, - :($(Symbol(:v,n)) = - $Op($(Symbol(:v,n,"lo")), $(Symbol(:v,n,"hi"))))) - end - push!(stmts, :(v1[1])) - Expr(:block, Expr(:meta, :inline), stmts...) 
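Before the switch to LLVM's vector-reduction intrinsics mentioned in the commit message, both reducers above repeatedly halve the vector and combine the halves, first padding a non-power-of-two lane count with the operation's neutral element from `getneutral`. A plain-Julia sketch of just the halving step (`tree_reduce` is a hypothetical name; the lane count is assumed to already be a power of two):

```julia
# Halve-and-combine, as in the removed llvmwrapreduce/reduce: split the
# lanes in two, combine elementwise, recurse until one lane remains.
function tree_reduce(op, v::NTuple{N}) where {N}
    N == 1 && return v[1]
    half = N ÷ 2
    lo = ntuple(i -> v[i], half)
    hi = ntuple(i -> v[half + i], half)
    tree_reduce(op, ntuple(i -> op(lo[i], hi[i]), half))
end

tree_reduce(+, (1, 2, 3, 4, 5, 6, 7, 8))  # 36
tree_reduce(max, (3, 1, 4, 1))            # 4
```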
-end -@inline function Base.reduce(::Type{Val{Op}}, v::Vec{N,T}) where {Op,N,T} - Base.reduce(Val(Op), v) -end - -@inline Base.maximum(v::Vec{N,T}) where {N,T<:IntegerTypes} = reduce(Val(:max), v) -@inline Base.minimum(v::Vec{N,T}) where {N,T<:IntegerTypes} = reduce(Val(:min), v) - -# Load and store functions - -export valloc -function valloc(::Type{T}, N::Int, sz::Int) where T - @assert N > 0 - @assert sz >= 0 - # We use padding to align the address of the first element, and - # also to ensure that we can access past the last element up to - # the next full vector width - padding = N-1 + mod(-sz, N) - mem = Vector{T}(undef, sz + padding) - addr = Int(pointer(mem)) - off = mod(-addr, N * sizeof(T)) - @assert mod(off, sizeof(T)) == 0 - off = fld(off, sizeof(T)) - @assert 0 <= off <= padding - res = view(mem, off+1 : off+sz) - addr2 = Int(pointer(res)) - @assert mod(addr2, N * sizeof(T)) == 0 - res -end -function valloc(f, ::Type{T}, N::Int, sz::Int) where T - mem = valloc(T, N, sz) - @inbounds for i in 1:sz - mem[i] = f(i) - end - mem -end - -export vload, vloada, vloadnt -@generated function vload(::Type{Vec{N,T}}, ptr::Ptr{T}, - ::Val{Aligned} = Val(false), - ::Val{Nontemporal} = Val(false)) where {N,T,Aligned,Nontemporal} - @assert isa(Aligned, Bool) - ptyp = llvmtype(Int) - typ = llvmtype(T) - vtyp = "<$N x $typ>" - decls = [] - instrs = [] - if Aligned - align = N * sizeof(T) - else - align = sizeof(T) # This is overly optimistic - end - flags = [""] - if align > 0 - push!(flags, "align $align") - end - if Nontemporal - push!(flags, "!nontemporal !{i32 1}") - end - if VERSION < v"v0.7.0-DEV" - push!(instrs, "%ptr = bitcast $typ* %0 to $vtyp*") - else - push!(instrs, "%ptr = inttoptr $ptyp %0 to $vtyp*") - end - push!(instrs, "%res = load $vtyp, $vtyp* %ptr" * join(flags, ", ")) - push!(instrs, "ret $vtyp %res") - quote - $(Expr(:meta, :inline)) - Vec{N,T}(Base.llvmcall($((join(decls, "\n"), join(instrs, "\n"))), - NTuple{N,VE{T}}, Tuple{Ptr{T}}, ptr)) - end 
-end
-@inline function vload(::Type{Vec{N,T}}, ptr::Ptr{T},
-                       ::Type{Val{Aligned}},
-                       ::Type{Val{Nontemporal}} = Val{false}) where {N,T,Aligned,Nontemporal}
-    vload(Vec{N, T}, ptr, Val(Aligned), Val(Nontemporal))
-end
-
-@inline vloada(::Type{Vec{N,T}}, ptr::Ptr{T}) where {N,T} =
-    vload(Vec{N,T}, ptr, Val(true))
-
-@inline vloadnt(::Type{Vec{N,T}}, ptr::Ptr{T}) where {N,T} =
-    vload(Vec{N,T}, ptr, Val(true), Val(true))
-
-@inline function vload(::Type{Vec{N,T}},
-                       arr::FastContiguousArray{T,1},
-                       i::Integer,
-                       ::Val{Aligned} = Val(false),
-                       ::Val{Nontemporal} = Val(false)) where {N,T,Aligned,Nontemporal}
-    #TODO @boundscheck 1 <= i <= length(arr) - (N-1) || throw(BoundsError())
-    vload(Vec{N,T}, pointer(arr, i), Val(Aligned), Val(Nontemporal))
-end
-@inline function vload(::Type{Vec{N,T}},
-                       arr::FastContiguousArray{T,1},
-                       i::Integer,
-                       ::Type{Val{Aligned}},
-                       ::Type{Val{Nontemporal}} = Val{false}) where {N,T,Aligned,Nontemporal}
-    vload(Vec{N,T}, arr, i, Val(Aligned), Val(Nontemporal))
-end
-@inline function vloada(::Type{Vec{N,T}},
-                        arr::FastContiguousArray{T,1},
-                        i::Integer) where {N,T}
-    vload(Vec{N,T}, arr, i, Val(true))
-end
-@inline function vloadnt(::Type{Vec{N,T}},
-                         arr::Union{Array{T,1},SubArray{T,1}},
-                         i::Integer) where {N,T}
-    vload(Vec{N,T}, arr, i, Val(true), Val(true))
-end
-
-@inline vload(::Type{Vec{N,T}}, ptr::Ptr{T}, mask::Nothing,
-              ::Val{Aligned} = Val(false)) where {N,T,Aligned} =
-    vload(Vec{N,T}, ptr, Val(Aligned))
-@inline vload(::Type{Vec{N,T}}, ptr::Ptr{T}, mask::Nothing,
-              ::Type{Val{Aligned}}) where {N,T,Aligned} =
-    vload(Vec{N,T}, ptr, mask, Val(Aligned))
-
-@generated function vload(::Type{Vec{N,T}}, ptr::Ptr{T},
-                          mask::Vec{N,Bool},
-                          ::Val{Aligned} = Val(false)) where {N,T,Aligned}
-    @assert isa(Aligned, Bool)
-    ptyp = llvmtype(Int)
-    typ = llvmtype(T)
-    vtyp = "<$N x $typ>"
-    btyp = llvmtype(Bool)
-    vbtyp = "<$N x $btyp>"
-    decls = []
-    instrs = []
-    if Aligned
-        align = N * sizeof(T)
-    else
-        align =
sizeof(T) # This is overly optimistic - end - - if VERSION < v"v0.7.0-DEV" - push!(instrs, "%ptr = bitcast $typ* %0 to $vtyp*") - else - push!(instrs, "%ptr = inttoptr $ptyp %0 to $vtyp*") - end - push!(instrs, "%mask = trunc $vbtyp %1 to <$N x i1>") - push!(decls, - "declare $vtyp @llvm.masked.load.$(suffix(N,T))($vtyp*, i32, " * - "<$N x i1>, $vtyp)") - push!(instrs, - "%res = call $vtyp @llvm.masked.load.$(suffix(N,T))($vtyp* %ptr, " * - "i32 $align, <$N x i1> %mask, $vtyp $(llvmconst(N, T, 0)))") - push!(instrs, "ret $vtyp %res") - quote - $(Expr(:meta, :inline)) - Vec{N,T}(Base.llvmcall($((join(decls, "\n"), join(instrs, "\n"))), - NTuple{N,VE{T}}, Tuple{Ptr{T}, NTuple{N,VE{Bool}}}, ptr, mask.elts)) - end -end -@inline function vload(::Type{Vec{N,T}}, ptr::Ptr{T}, - mask::Vec{N,Bool}, - ::Type{Val{Aligned}}) where {N,T,Aligned} - vload(Vec{N,T}, ptr, mask, Val(Aligned)) -end - -@inline vloada(::Type{Vec{N,T}}, ptr::Ptr{T}, - mask::Union{Vec{N,Bool}, Nothing}) where {N,T} = - vload(Vec{N,T}, ptr, mask, Val(true)) - -@inline function vload(::Type{Vec{N,T}}, - arr::FastContiguousArray{T,1}, - i::Integer, mask::Union{Vec{N,Bool}, Nothing}, - ::Val{Aligned} = Val(false)) where {N,T,Aligned} - #TODO @boundscheck 1 <= i <= length(arr) - (N-1) || throw(BoundsError()) - vload(Vec{N,T}, pointer(arr, i), mask, Val(Aligned)) -end -@inline function vload(::Type{Vec{N,T}}, - arr::FastContiguousArray{T,1}, - i::Integer, mask::Union{Vec{N,Bool}, Nothing}, - ::Type{Val{Aligned}}) where {N,T,Aligned} - vload(Vec{N,T}, arr, i, mask, Val(Aligned)) -end -@inline function vloada(::Type{Vec{N,T}}, - arr::FastContiguousArray{T,1}, i::Integer, - mask::Union{Vec{N,Bool}, Nothing}) where {N,T} - vload(Vec{N,T}, arr, i, mask, Val(true)) -end - -export vstore, vstorea, vstorent -@generated function vstore(v::Vec{N,T}, ptr::Ptr{T}, - ::Val{Aligned} = Val(false), - ::Val{Nontemporal} = Val(false)) where {N,T,Aligned,Nontemporal} - @assert isa(Aligned, Bool) - @assert isa(Nontemporal, Bool) 
- ptyp = llvmtype(Int) - typ = llvmtype(T) - vtyp = "<$N x $typ>" - decls = [] - instrs = [] - if Aligned - align = N * sizeof(T) - else - align = sizeof(T) # This is overly optimistic - end - flags = [""] - if align > 0 - push!(flags, "align $align") - end - if Nontemporal - push!(flags, "!nontemporal !{i32 1}") - end - if VERSION < v"v0.7.0-DEV" - push!(instrs, "%ptr = bitcast $typ* %1 to $vtyp*") - else - push!(instrs, "%ptr = inttoptr $ptyp %1 to $vtyp*") - end - push!(instrs, "store $vtyp %0, $vtyp* %ptr" * join(flags, ", ")) - push!(instrs, "ret void") - quote - $(Expr(:meta, :inline)) - Base.llvmcall($((join(decls, "\n"), join(instrs, "\n"))), - Cvoid, Tuple{NTuple{N,VE{T}}, Ptr{T}}, v.elts, ptr) - end -end -@inline function vstore(v::Vec{N,T}, ptr::Ptr{T}, - ::Type{Val{Aligned}}, - ::Type{Val{Nontemporal}} = Val{false}) where {N,T,Aligned,Nontemporal} - vstore(v, ptr, Val(Aligned), Val(Nontemporal)) -end - -@inline vstorea(v::Vec{N,T}, ptr::Ptr{T}) where {N,T} = vstore(v, ptr, Val{true}) - -@inline vstorent(v::Vec{N,T}, ptr::Ptr{T}) where {N,T} = vstore(v, ptr, Val{true}, Val{true}) - -@inline function vstore(v::Vec{N,T}, - arr::FastContiguousArray{T,1}, - i::Integer, - ::Val{Aligned} = Val(false), - ::Val{Nontemporal} = Val(false)) where {N,T,Aligned,Nontemporal} - @boundscheck 1 <= i <= length(arr) - (N-1) || throw(BoundsError()) - vstore(v, pointer(arr, i), Val{Aligned}, Val{Nontemporal}) -end -@inline function vstore(v::Vec{N,T}, - arr::FastContiguousArray{T,1}, - i::Integer, - ::Type{Val{Aligned}}, - ::Type{Val{Nontemporal}} = Val{false}) where {N,T,Aligned,Nontemporal} - vstore(v, arr, i, Val(Aligned), Val(Nontemporal)) -end -@inline function vstorea(v::Vec{N,T}, arr::FastContiguousArray{T,1}, - i::Integer) where {N,T} - vstore(v, arr, i, Val{true}) -end -@inline function vstorent(v::Vec{N,T}, arr::FastContiguousArray{T,1}, - i::Integer) where {N,T} - vstore(v, arr, i, Val{true}, Val{true}) -end - -@inline vstore(v::Vec{N,T}, ptr::Ptr{T}, 
mask::Nothing, - ::Val{Aligned} = Val(false)) where {N,T,Aligned} = - vstore(v, ptr, Val{Aligned}) -@inline vstore(v::Vec{N,T}, ptr::Ptr{T}, mask::Nothing, - ::Type{Val{Aligned}}) where {N,T,Aligned} = - vstore(v, ptr, mask, Val(Aligned)) - -@generated function vstore(v::Vec{N,T}, ptr::Ptr{T}, - mask::Vec{N,Bool}, - ::Val{Aligned} = Val(false)) where {N,T,Aligned} - @assert isa(Aligned, Bool) - ptyp = llvmtype(Int) - typ = llvmtype(T) - vtyp = "<$N x $typ>" - btyp = llvmtype(Bool) - vbtyp = "<$N x $btyp>" - decls = [] - instrs = [] - if Aligned - align = N * sizeof(T) - else - align = sizeof(T) # This is overly optimistic - end - if VERSION < v"v0.7.0-DEV" - push!(instrs, "%ptr = bitcast $typ* %1 to $vtyp*") - else - push!(instrs, "%ptr = inttoptr $ptyp %1 to $vtyp*") - end - push!(instrs, "%mask = trunc $vbtyp %2 to <$N x i1>") - push!(decls, - "declare void @llvm.masked.store.$(suffix(N,T))($vtyp, $vtyp*, i32, " * - "<$N x i1>)") - push!(instrs, - "call void @llvm.masked.store.$(suffix(N,T))($vtyp %0, $vtyp* %ptr, " * - "i32 $align, <$N x i1> %mask)") - push!(instrs, "ret void") - quote - $(Expr(:meta, :inline)) - Base.llvmcall($((join(decls, "\n"), join(instrs, "\n"))), - Cvoid, Tuple{NTuple{N,VE{T}}, Ptr{T}, NTuple{N,VE{Bool}}}, - v.elts, ptr, mask.elts) - end -end -@inline function vstore(v::Vec{N,T}, ptr::Ptr{T}, - mask::Vec{N,Bool}, - ::Type{Val{Aligned}}) where {N,T,Aligned} - vstore(v, ptr, mask, Val(Aligned)) -end - -@inline vstorea(v::Vec{N,T}, ptr::Ptr{T}, - mask::Union{Vec{N,Bool}, Nothing}) where {N,T} = - vstore(v, ptr, mask, Val{true}) - -@inline function vstore(v::Vec{N,T}, - arr::FastContiguousArray{T,1}, - i::Integer, - mask::Union{Vec{N,Bool}, Nothing}, - ::Val{Aligned} = Val(false), - ::Val{Nontemporal} = Val(false)) where {N,T,Aligned,Nontemporal} - #TODO @boundscheck 1 <= i <= length(arr) - (N-1) || throw(BoundsError()) - vstore(v, pointer(arr, i), mask, Val{Aligned}, Val{Nontemporal}) -end -@inline function vstore(v::Vec{N,T}, - 
arr::FastContiguousArray{T,1}, - i::Integer, - mask::Union{Vec{N,Bool}, Nothing}, - ::Type{Val{Aligned}}, - ::Type{Val{Nontemporal}} = Val{false}) where {N,T,Aligned,Nontemporal} - vstore(v, arr, i, mask, Val(Aligned), Val(Nontemporal)) -end -@inline function vstorea(v::Vec{N,T}, - arr::FastContiguousArray{T,1}, - i::Integer, - mask::Union{Vec{N,Bool}, Nothing}) where {N,T} - vstore(v, arr, i, mask, Val{true}) -end - -export vgather, vgathera - -@inline vgather( - ::Type{Vec{N,T}}, ptrs::Vec{N,Ptr{T}}, mask::Nothing, - ::Val{Aligned} = Val(false)) where {N,T,Aligned} = - vgather(Vec{N,T}, ptrs, Vec(ntuple(_ -> true, N)), Val(Aligned)) -@inline vgather( - ::Type{Vec{N,T}}, ptrs::Vec{N,Ptr{T}}, mask::Nothing, - ::Type{Val{Aligned}}) where {N,T,Aligned} = - vgather(Vec{N,T}, ptrs, mask, Val(Aligned)) - -@generated function vgather( - ::Type{Vec{N,T}}, ptrs::Vec{N,Ptr{T}}, mask::Vec{N,Bool}, - ::Val{Aligned} = Val(false)) where {N,T,Aligned} - @assert isa(Aligned, Bool) - ptyp = llvmtype(Int) - typ = llvmtype(T) - vptyp = "<$N x $typ*>" - vtyp = "<$N x $typ>" - btyp = llvmtype(Bool) - vbtyp = "<$N x $btyp>" - decls = [] - instrs = [] - if Aligned - align = N * sizeof(T) - else - align = sizeof(T) # This is overly optimistic - end - - if VERSION < v"v0.7.0-DEV" - push!(instrs, "%ptrs = bitcast <$N x $typ*> %0 to $vptyp") - else - push!(instrs, "%ptrs = inttoptr <$N x $ptyp> %0 to $vptyp") - end - push!(instrs, "%mask = trunc $vbtyp %1 to <$N x i1>") - push!(decls, - "declare $vtyp @llvm.masked.gather.$(suffix(N,T))($vptyp, i32, " * - "<$N x i1>, $vtyp)") - push!(instrs, - "%res = call $vtyp @llvm.masked.gather.$(suffix(N,T))($vptyp %ptrs, " * - "i32 $align, <$N x i1> %mask, $vtyp $(llvmconst(N, T, 0)))") - push!(instrs, "ret $vtyp %res") - quote - $(Expr(:meta, :inline)) - Vec{N,T}(Base.llvmcall($((join(decls, "\n"), join(instrs, "\n"))), - NTuple{N,VE{T}}, Tuple{NTuple{N,VE{Ptr{T}}}, NTuple{N,VE{Bool}}}, - ptrs.elts, mask.elts)) - end -end -@inline function vgather( - 
::Type{Vec{N,T}}, ptrs::Vec{N,Ptr{T}}, mask::Vec{N,Bool}, - ::Type{Val{Aligned}}) where {N,T,Aligned} - vgather(Vec{N,T}, ptrs, mask, Val(Aligned)) -end - -@inline vgathera(::Type{Vec{N,T}}, ptrs::Vec{N,Ptr{T}}, - mask::Union{Vec{N,Bool}, Nothing}) where {N,T} = - vgather(Vec{N,T}, ptrs, mask, Val{true}) - -@inline vgather(arr::FastContiguousArray{T,1}, - idx::Vec{N,<:Integer}, - mask::Union{Vec{N,Bool}, Nothing} = nothing, - ::Val{Aligned} = Val(false)) where {N,T,Aligned} = - vgather(Vec{N,T}, - pointer(arr) + sizeof(T) * (idx - 1), - mask, Val{Aligned}) -@inline vgather(arr::FastContiguousArray{T,1}, - idx::Vec{N,<:Integer}, - mask::Union{Vec{N,Bool}, Nothing}, - ::Type{Val{Aligned}}) where {N,T,Aligned} = - vgather(arr, idx, mask, Val(Aligned)) - -@inline vgathera(arr::FastContiguousArray{T,1}, - idx::Vec{N,<:Integer}, - mask::Union{Vec{N,Bool}, Nothing} = nothing) where {N,T} = - vgather(arr, idx, mask, Val{true}) - -export vscatter, vscattera - -@inline vscatter( - v::Vec{N,T}, ptrs::Vec{N,Ptr{T}}, mask::Nothing, - ::Val{Aligned} = Val(false)) where {N,T,Aligned} = - vscatter(v, ptrs, Vec(ntuple(_ -> true, N)), Val(Aligned)) -@inline vscatter( - v::Vec{N,T}, ptrs::Vec{N,Ptr{T}}, mask::Nothing, - ::Type{Val{Aligned}}) where {N,T,Aligned} = - vscatter(v, ptrs, mask, Val(Aligned)) - -@generated function vscatter( - v::Vec{N,T}, ptrs::Vec{N,Ptr{T}}, mask::Vec{N,Bool}, - ::Val{Aligned} = Val(false)) where {N,T,Aligned} - @assert isa(Aligned, Bool) - ptyp = llvmtype(Int) - typ = llvmtype(T) - vptyp = "<$N x $typ*>" - vtyp = "<$N x $typ>" - btyp = llvmtype(Bool) - vbtyp = "<$N x $btyp>" - decls = [] - instrs = [] - if Aligned - align = N * sizeof(T) - else - align = sizeof(T) # This is overly optimistic - end - if VERSION < v"v0.7.0-DEV" - push!(instrs, "%ptrs = bitcast <$N x $typ*> %1 to $vptyp") - else - push!(instrs, "%ptrs = inttoptr <$N x $ptyp> %1 to $vptyp") - end - push!(instrs, "%mask = trunc $vbtyp %2 to <$N x i1>") - push!(decls, - "declare void 
@llvm.masked.scatter.$(suffix(N,T))" * - "($vtyp, $vptyp, i32, <$N x i1>)") - push!(instrs, - "call void @llvm.masked.scatter.$(suffix(N,T))" * - "($vtyp %0, $vptyp %ptrs, i32 $align, <$N x i1> %mask)") - push!(instrs, "ret void") - quote - $(Expr(:meta, :inline)) - Base.llvmcall($((join(decls, "\n"), join(instrs, "\n"))), - Cvoid, - Tuple{NTuple{N,VE{T}}, NTuple{N,VE{Ptr{T}}}, NTuple{N,VE{Bool}}}, - v.elts, ptrs.elts, mask.elts) - end -end -@inline function vscatter( - v::Vec{N,T}, ptrs::Vec{N,Ptr{T}}, mask::Vec{N,Bool}, - ::Type{Val{Aligned}}) where {N,T,Aligned} - vscatter(v, ptrs, mask, Val(Aligned)) -end - -@inline vscattera(v::Vec{N,T}, ptrs::Vec{N,Ptr{T}}, - mask::Union{Vec{N,Bool}, Nothing}) where {N,T} = - vscatter(v, ptrs, mask, Val{true}) - -@inline vscatter(v::Vec{N,T}, arr::FastContiguousArray{T,1}, - idx::Vec{N,<:Integer}, - mask::Union{Vec{N,Bool}, Nothing} = nothing, - ::Val{Aligned} = Val(false)) where {N,T,Aligned} = - vscatter(v, pointer(arr) + sizeof(T) * (idx - 1), mask, Val(Aligned)) -@inline vscatter(v::Vec{N,T}, arr::FastContiguousArray{T,1}, - idx::Vec{N,<:Integer}, - mask::Union{Vec{N,Bool}, Nothing}, - ::Type{Val{Aligned}}) where {N,T,Aligned} = - vscatter(v, arr, idx, mask, Val(Aligned)) - -@inline vscattera(v::Vec{N,T}, arr::FastContiguousArray{T,1}, - idx::Vec{N,<:Integer}, - mask::Union{Vec{N,Bool}, Nothing} = nothing) where {N,T} = - vscatter(v, arr, idx, mask, Val{true}) - -# Vector shuffles - -function shufflevector_instrs(N, T, I, two_operands) - typ = llvmtype(T) - vtyp2 = vtyp1 = "<$N x $typ>" - M = length(I) - vtyp3 = "<$M x i32>" - vtypr = "<$M x $typ>" - mask = "<" * join(map(x->string("i32 ", x), I), ", ") * ">" - instrs = [] - v2 = two_operands ? 
"%1" : "undef" - push!(instrs, "%res = shufflevector $vtyp1 %0, $vtyp2 $v2, $vtyp3 $mask") - push!(instrs, "ret $vtypr %res") - return M, [], instrs -end - -export shufflevector -@generated function shufflevector(v1::Vec{N,T}, v2::Vec{N,T}, - ::Val{I}) where {N,T,I} - M, decls, instrs = shufflevector_instrs(N, T, I, true) - quote - $(Expr(:meta, :inline)) - Vec{$M,T}(Base.llvmcall($((join(decls, "\n"), join(instrs, "\n"))), - NTuple{$M,VE{T}}, - Tuple{NTuple{N,VE{T}}, NTuple{N,VE{T}}}, - v1.elts, v2.elts)) - end -end -@inline function shufflevector(v1::Vec{N,T}, v2::Vec{N,T}, - ::Type{Val{I}}) where {N,T,I} - shufflevector(v1, v2, Val(I)) -end - -@generated function shufflevector(v1::Vec{N,T}, ::Val{I}) where {N,T,I} - M, decls, instrs = shufflevector_instrs(N, T, I, false) - quote - $(Expr(:meta, :inline)) - Vec{$M,T}(Base.llvmcall($((join(decls, "\n"), join(instrs, "\n"))), - NTuple{$M,VE{T}}, - Tuple{NTuple{N,VE{T}}}, - v1.elts)) - end -end -@inline function shufflevector(v1::Vec{N,T}, ::Type{Val{I}}) where {N,T,I} - shufflevector(v1, Val(I)) -end - -export VecRange - -""" - VecRange{N}(i::Int) - -Analogous to `UnitRange` but for loading SIMD vector of width `N` at -index `i`. 
- -# Examples -```jldoctest -julia> xs = ones(4); - -julia> xs[VecRange{4}(1)] # calls `vload(Vec{4,Float64}, xs, 1)` -<4 x Float64>[1.0, 1.0, 1.0, 1.0] -``` -""" -struct VecRange{N} - i::Int -end - -@inline Base.length(idx::VecRange{N}) where {N} = N -@inline Base.first(idx::VecRange) = idx.i -@inline Base.last(idx::VecRange) = idx.i + length(idx) - 1 - -@inline Base.:+(idx::VecRange{N}, j::Integer) where N = VecRange{N}(idx.i + j) -@inline Base.:+(j::Integer, idx::VecRange{N}) where N = VecRange{N}(idx.i + j) -@inline Base.:-(idx::VecRange{N}, j::Integer) where N = VecRange{N}(idx.i - j) - -Base.checkindex(::Type{Bool}, inds::AbstractUnitRange, idx::VecRange) = - (first(inds) <= first(idx)) && (last(idx) <= last(inds)) - -Base.checkindex(::Type{Bool}, inds::AbstractUnitRange, idx::Vec) = - all(first(inds) <= idx) && all(idx <= last(inds)) - -@inline _checkarity(::AbstractArray{<:Any,N}, ::Vararg{<:Any,N}) where {N} = - nothing - -@inline _checkarity(::T, ::Any) where {T <: AbstractArray} = - if IndexStyle(T) isa IndexLinear - nothing - else - throw(ArgumentError(""" - Array type $T does not support indexing with a single index. - Exactly $(ndims(T)) (non-mask) indices have to be specified. - """)) - end - -_checkarity(::AbstractArray{<:Any,N}, ::Vararg{<:Any,M}) where {N,M} = - throw(ArgumentError(""" - $M indices are given to $N-dimensional array. - Exactly $N (non-mask) indices have to be specified when using SIMD. - """)) - -# Combined with `_preprocessindices`, helper function `_extractmask` -# extracts `mask` in the tail position. As slicing tuple is not -# type-stable, we use reverse-of-tail-of-reverse hack to extract -# `mask` at the end of `args`. -@inline _extractmask(mask::Vec{N,Bool}, R::Vararg{Integer}) where N = - (reverse(R), mask) -@inline _extractmask(R::Vararg{Integer}) = (reverse(R), nothing) -@inline _extractmask(mask::Vec{N,Bool}) where {N} = ((), mask) -@inline _extractmask() = ((), nothing) - -@noinline _extractmask(rargs...) 
=
-    throw(ArgumentError("""
-    Using SIMD indexing `array[idx, i2, ..., iN, mask]` for `N`-dimensional
-    array requires `i2` to `iN` to be all integers and `mask` to be optionally
-    a SIMD vector `Vec` of `Bool`s. Given `(i2, ..., iN, mask)` is
-    $(summary(reverse(rargs)))
-    """))
-
-_maskedidx(idx, ::Nothing, ::Any) = idx
-_maskedidx(idx::Vec, mask::Vec, fst) = vifelse(mask, idx, fst)
-_maskedidx(idx::VecRange, mask::Vec, fst) =
-    _maskedidx(Vec(ntuple(i -> i - 1 + idx.i, length(mask))), mask, fst)
-
-Base.@propagate_inbounds function _preprocessindices(arr, idx, args)
-    I, mask = _extractmask(reverse(args)...)
-    _checkarity(arr, idx, I...)
-    @boundscheck checkbounds(arr,
-                             _maskedidx(idx, mask, first(axes(arr, 1))),
-                             I...)
-    return I, mask
-end
-
-"""
-    _pointer(arr, i, I)
-
-Pointer to the element `arr[i, I...]`.
-"""
-Base.@propagate_inbounds _pointer(arr::Array, i, I) =
-    pointer(arr, LinearIndices(arr)[i, I...])
-Base.@propagate_inbounds _pointer(arr::Base.FastContiguousSubArray, i, I) =
-    pointer(arr, (i, I...))
-Base.@propagate_inbounds _pointer(arr::SubArray, i, I) =
-    pointer(Base.unsafe_view(arr, 1, I...), i)
-
-Base.@propagate_inbounds function Base.getindex(
-    arr::ContiguousArray{T}, idx::VecRange{N},
-    args::Vararg{Union{Integer,Vec{N,Bool}}}) where {N,T}
-    I, mask = _preprocessindices(arr, idx, args)
-    return vload(Vec{N,T}, _pointer(arr, idx.i, I), mask)
-end
-
-Base.@propagate_inbounds function Base.setindex!(
-    arr::ContiguousArray{T}, v::Vec{N,T}, idx::VecRange{N},
-    args::Vararg{Union{Integer,Vec{N,Bool}}}) where {N,T}
-    I, mask = _preprocessindices(arr, idx, args)
-    vstore(v, _pointer(arr, idx.i, I), mask)
-    return arr
-end
-
-Base.@propagate_inbounds function Base.getindex(
-    arr::ContiguousArray{T}, idx::Vec{N,<:Integer},
-    args::Vararg{Union{Integer,Vec{N,Bool}}}) where {N,T}
-    I, mask = _preprocessindices(arr, idx, args)
-    ptrs = _pointer(arr, 1, I) - sizeof(T) + sizeof(T) * idx
-    return vgather(Vec{N,T}, ptrs, mask)
-end
-
-Base.@propagate_inbounds function Base.setindex!( - arr::ContiguousArray{T}, v::Vec{N,T}, idx::Vec{N,<:Integer}, - args::Vararg{Union{Integer,Vec{N,Bool}}}) where {N,T} - I, mask = _preprocessindices(arr, idx, args) - ptrs = _pointer(arr, 1, I) - sizeof(T) + sizeof(T) * idx - vscatter(v, ptrs, mask) - return arr -end +using Base: @propagate_inbounds + +export Vec, vload, vloada, vloadnt, vstore, vstorea, vstorent, vgather, vgathera, + vscatter, vscattera, shufflevector, vifelse, valloc, VecRange + +const VE = Base.VecElement +const LVec{N, T} = NTuple{N, VE{T}} + +const IntTypes = Union{Int8, Int16, Int32, Int64, Int128} +const BIntTypes = Union{IntTypes, Bool} +const UIntTypes = Union{UInt8, UInt16, UInt32, UInt64, UInt128} +const IntegerTypes = Union{IntTypes, UIntTypes} +const BIntegerTypes = Union{IntegerTypes, Bool} +const FloatingTypes = Union{Float32, Float64} # Float16 support is non-native in Julia and gets passed as an i16 +const ScalarTypes = Union{IntegerTypes, FloatingTypes} +const VecTypes = Union{ScalarTypes, Ptr, Bool} + +include("LLVM_intrinsics.jl") +include("simdvec.jl") +include("arrayops.jl") end diff --git a/src/arrayops.jl b/src/arrayops.jl new file mode 100644 index 0000000..883185c --- /dev/null +++ b/src/arrayops.jl @@ -0,0 +1,285 @@ +using Base: Slice, ScalarIndex + +""" + ContiguousSubArray{T,N,P,I,L} + +Like `Base.FastContiguousSubArray` but without requirement for linear +indexing (i.e., type parameter `L` can be `false`). + +# Examples + +``` +julia> A = view(ones(5, 5), :, [1,3]); + +julia> A isa Base.FastContiguousSubArray +false + +julia> A isa SIMD.ContiguousSubArray +true +``` +""" +ContiguousSubArray{T,N,P, + I<:Union{Tuple{Union{Slice, AbstractUnitRange}, Vararg{Any}}, + Tuple{Vararg{ScalarIndex}}}, + L} = SubArray{T,N,P,I,L} + +""" + ContiguousArray{T,N} + +Array types with contiguous first dimension. 
+"""
+ContiguousArray{T,N} = Union{DenseArray{T,N}, ContiguousSubArray{T,N}}
+
+"""
+    FastContiguousArray{T,N}
+
+This is the type of arrays for which `pointer(A, i)` works.
+"""
+FastContiguousArray{T,N} = Union{DenseArray{T,N}, Base.FastContiguousSubArray{T,N}}
+# https://github.com/eschnett/SIMD.jl/pull/40#discussion_r254131184
+# https://github.com/JuliaArrays/MappedArrays.jl/pull/24#issuecomment-460568978
+
+# vload
+@inline function vload(::Type{Vec{N, T}}, ptr::Ptr{T}, mask::Union{Nothing, Vec{N, Bool}}=nothing,
+                       ::Val{Aligned}=Val(false), ::Val{Nontemporal}=Val(false)) where {N, T, Aligned, Nontemporal}
+    if mask === nothing
+        Vec(Intrinsics.load(Intrinsics.LVec{N, T}, ptr, Val(Aligned), Val(Nontemporal)))
+    else
+        Vec(Intrinsics.maskedload(ptr, mask.data, Val(Aligned), Val(Nontemporal)))
+    end
+end
+
+@inline function vload(::Type{Vec{N, T}}, a::FastContiguousArray{T,1}, i::Integer, mask=nothing,
+                       ::Val{Aligned}=Val(false), ::Val{Nontemporal}=Val(false)) where {N, T, Aligned, Nontemporal}
+    @boundscheck checkbounds(a, i + N - 1)
+    GC.@preserve a begin
+        ptr = pointer(a, i)
+        vload(Vec{N, T}, ptr, mask, Val(Aligned), Val(Nontemporal))
+    end
+end
+@propagate_inbounds vloada(::Type{T}, a, i, mask=nothing) where {T<:Vec} = vload(T, a, i, mask, Val(true))
+@propagate_inbounds vloadnt(::Type{T}, a, i, mask=nothing) where {T<:Vec} = vload(T, a, i, mask, Val(true), Val(true))
+
+# vstore
+@inline function vstore(x::Vec{N, T}, ptr::Ptr{T}, mask::Union{Nothing, Vec{N, Bool}}=nothing,
+                        ::Val{Aligned}=Val(false), ::Val{Nontemporal}=Val(false)) where {N, T, Aligned, Nontemporal}
+    if mask === nothing
+        Intrinsics.store(x.data, ptr, Val(Aligned), Val(Nontemporal))
+    else
+        Intrinsics.maskedstore(x.data, ptr, mask.data, Val(Aligned), Val(Nontemporal))
+    end
+end
+@inline function vstore(x::Vec{N, T}, a::FastContiguousArray{T,1}, i::Integer, mask=nothing,
+                        ::Val{Aligned}=Val(false), ::Val{Nontemporal}=Val(false)) where {N, T, Aligned, Nontemporal}
+    @boundscheck
checkbounds(a, i + N - 1)
+    GC.@preserve a begin
+        ptr = pointer(a, i)
+        vstore(x, ptr, mask, Val(Aligned), Val(Nontemporal))
+    end
+    return a
+end
+@propagate_inbounds vstorea(x::Vec, a, i, mask=nothing) = vstore(x, a, i, mask, Val(true))
+@propagate_inbounds vstorent(x::Vec, a, i, mask=nothing) = vstore(x, a, i, mask, Val(true), Val(true))
+
+function valloc(::Type{T}, N::Int, sz::Int) where T
+    @assert N > 0
+    @assert sz >= 0
+    # We use padding to align the address of the first element, and
+    # also to ensure that we can access past the last element up to
+    # the next full vector width
+    padding = N-1 + mod(-sz, N)
+    mem = Vector{T}(undef, sz + padding)
+    addr = Int(pointer(mem))
+    off = mod(-addr, N * sizeof(T))
+    @assert mod(off, sizeof(T)) == 0
+    off = fld(off, sizeof(T))
+    @assert 0 <= off <= padding
+    res = view(mem, off+1 : off+sz)
+    addr2 = Int(pointer(res))
+    @assert mod(addr2, N * sizeof(T)) == 0
+    res
+end
+
+function valloc(f, ::Type{T}, N::Int, sz::Int) where T
+    mem = valloc(T, N, sz)
+    @inbounds for i in 1:sz
+        mem[i] = f(i)
+    end
+    mem
+end
+
+@inline function _get_vec_pointers(a, idx::Vec{N, Int}) where {N}
+    ptrs = pointer(a) + (idx - 1) * sizeof(eltype(a))
+end
+
+# Have to be careful with optional arguments and @boundscheck,
+# see https://github.com/JuliaLang/julia/issues/30411,
+# therefore use @propagate_inbounds
+@inline vgather(ptrs::Vec{N,Ptr{T}},
+                mask::Vec{N,Bool}=one(Vec{N,Bool}),
+                ::Val{Aligned}=Val(false)) where {N, T<:ScalarTypes, Aligned} =
+    return Vec(Intrinsics.maskedgather(ptrs.data, mask.data))
+@propagate_inbounds function vgather(a::FastContiguousArray{T,1}, idx::Vec{N, Int},
+                                     mask::Vec{N,Bool}=one(Vec{N,Bool}),
+                                     ::Val{Aligned}=Val(false)) where {N, T<:ScalarTypes, Aligned}
+    @boundscheck for i in 1:N
+        checkbounds(a, @inbounds idx[i])
+    end
+    GC.@preserve a begin
+        ptrs = _get_vec_pointers(a, idx)
+        return vgather(ptrs, mask, Val(Aligned))
+    end
+end
+@propagate_inbounds vgathera(a, idx, mask) = vgather(a, idx,
mask, Val(true)) +@propagate_inbounds vgathera(a, idx::Vec{N}) where {N} = vgather(a, idx, one(Vec{N,Bool}), Val(true)) + +@propagate_inbounds Base.getindex(a::FastContiguousArray{T,1}, idx::Vec{N,Int}) where {N,T} = + vgather(a, idx) + + +@propagate_inbounds vscatter(x::Vec{N,T}, ptrs::Vec{N,Ptr{T}}, + mask::Vec{N,Bool}, ::Val{Aligned}=Val(false)) where {N, T<:ScalarTypes, Aligned} = + Intrinsics.maskedscatter(x.data, ptrs.data, mask.data) +@propagate_inbounds function vscatter(x::Vec{N,T}, a::FastContiguousArray{T,1}, idx::Vec{N, Int}, + mask::Vec{N,Bool}=one(Vec{N, Bool}), + ::Val{Aligned}=Val(false)) where {N, T<:ScalarTypes, Aligned} + @boundscheck for i in 1:N + checkbounds(a, @inbounds idx[i]) + end + GC.@preserve a begin + ptrs = _get_vec_pointers(a, idx) + vscatter(x, ptrs, mask, Val(Aligned)) + end + return +end +@propagate_inbounds vscattera(x, a, idx, mask) = vscatter(x, a, idx, mask, Val(true)) +@propagate_inbounds vscattera(x, a, idx::Vec{N}) where {N} = vscatter(x, a, idx, one(Vec{N,Bool}), Val(true)) + +@propagate_inbounds Base.setindex!(a::FastContiguousArray{T,1}, v::Vec{N,T}, idx::Vec{N,Int}) where {N, T} = + vscatter(v, a, idx) + + +export VecRange + +""" + VecRange{N}(i::Int) +Analogous to `UnitRange` but for loading SIMD vector of width `N` at +index `i`. 
+# Examples +```jldoctest +julia> xs = ones(4); +julia> xs[VecRange{4}(1)] # calls `vload(Vec{4,Float64}, xs, 1)` +<4 x Float64>[1.0, 1.0, 1.0, 1.0] +``` +""" +struct VecRange{N} + i::Int +end + +@inline Base.length(idx::VecRange{N}) where {N} = N +@inline Base.first(idx::VecRange) = idx.i +@inline Base.last(idx::VecRange) = idx.i + length(idx) - 1 + +@inline Base.:+(idx::VecRange{N}, j::Integer) where N = VecRange{N}(idx.i + j) +@inline Base.:+(j::Integer, idx::VecRange{N}) where N = VecRange{N}(idx.i + j) +@inline Base.:-(idx::VecRange{N}, j::Integer) where N = VecRange{N}(idx.i - j) + +Base.checkindex(::Type{Bool}, inds::AbstractUnitRange, idx::VecRange) = + (first(inds) <= first(idx)) && (last(idx) <= last(inds)) + +Base.checkindex(::Type{Bool}, inds::AbstractUnitRange, idx::Vec) = + all(first(inds) <= idx) && all(idx <= last(inds)) + +@inline _checkarity(::AbstractArray{<:Any,N}, ::Vararg{<:Any,N}) where {N} = + nothing + +@inline _checkarity(::T, ::Any) where {T <: AbstractArray} = + if IndexStyle(T) isa IndexLinear + nothing + else + throw(ArgumentError(""" + Array type $T does not support indexing with a single index. + Exactly $(ndims(T)) (non-mask) indices have to be specified. + """)) + end + +_checkarity(::AbstractArray{<:Any,N}, ::Vararg{<:Any,M}) where {N,M} = + throw(ArgumentError(""" + $M indices are given to $N-dimensional array. + Exactly $N (non-mask) indices have to be specified when using SIMD. + """)) + +# Combined with `_preprocessindices`, helper function `_extractmask` +# extracts `mask` in the tail position. As slicing tuple is not +# type-stable, we use reverse-of-tail-of-reverse hack to extract +# `mask` at the end of `args`. +@inline _extractmask(mask::Vec{N,Bool}, R::Vararg{Integer}) where N = + (reverse(R), mask) +@inline _extractmask(R::Vararg{Integer}) = (reverse(R), nothing) +@inline _extractmask(mask::Vec{N,Bool}) where {N} = ((), mask) +@inline _extractmask() = ((), nothing) + +@noinline _extractmask(rargs...) 
= + throw(ArgumentError(""" + Using SIMD indexing `array[idx, i2, ..., iN, mask]` for `N`-dimensional + array requires `i2` to `iN` to be all integers and `mask` to be optionally + a SIMD vector `Vec` of `Bool`s. Given `(i2, ..., iN, mask)` is + $(summary(reverse(rargs))) + """)) + +_maskedidx(idx, ::Nothing, ::Any) = idx +_maskedidx(idx::Vec, mask::Vec, fst) = vifelse(mask, idx, fst) +_maskedidx(idx::VecRange, mask::Vec, fst) = + _maskedidx(Vec(ntuple(i -> i - 1 + idx.i, length(mask))), mask, fst) + +Base.@propagate_inbounds function _preprocessindices(arr, idx, args) + I, mask = _extractmask(reverse(args)...) + _checkarity(arr, idx, I...) + @boundscheck checkbounds(arr, + _maskedidx(idx, mask, first(axes(arr, 1))), + I...) + return I, mask +end + +""" + _pointer(arr, i, I) +Pointer to the element `arr[i, I...]`. +""" +Base.@propagate_inbounds _pointer(arr::Array, i, I) = + pointer(arr, LinearIndices(arr)[i, I...]) +Base.@propagate_inbounds _pointer(arr::Base.FastContiguousSubArray, i, I) = + pointer(arr, (i, I...)) +Base.@propagate_inbounds _pointer(arr::SubArray, i, I) = + pointer(Base.unsafe_view(arr, 1, I...), i) + +Base.@propagate_inbounds function Base.getindex( + arr::ContiguousArray{T}, idx::VecRange{N}, + args::Vararg{Union{Integer,Vec{N,Bool}}}) where {N,T} + I, mask = _preprocessindices(arr, idx, args) + return vload(Vec{N,T}, _pointer(arr, idx.i, I), mask) +end + +Base.@propagate_inbounds function Base.setindex!( + arr::ContiguousArray{T}, v::Vec{N,T}, idx::VecRange{N}, + args::Vararg{Union{Integer,Vec{N,Bool}}}) where {N,T} + I, mask = _preprocessindices(arr, idx, args) + vstore(v, _pointer(arr, idx.i, I), mask) + return arr +end + +Base.@propagate_inbounds function Base.getindex( + arr::ContiguousArray{T}, idx::Vec{N,<:Integer}, + args::Vararg{Union{Integer,Vec{N,Bool}}}) where {N,T} + I, mask = _preprocessindices(arr, idx, args) + ptrs = _pointer(arr, 1, I) - sizeof(T) + sizeof(T) * idx + return vgather(ptrs, mask) +end + +Base.@propagate_inbounds 
function Base.setindex!( + arr::ContiguousArray{T}, v::Vec{N,T}, idx::Vec{N,<:Integer}, + args::Vararg{Union{Integer,Vec{N,Bool}}}) where {N,T} + I, mask = _preprocessindices(arr, idx, args) + ptrs = _pointer(arr, 1, I) - sizeof(T) + sizeof(T) * idx + vscatter(v, ptrs, mask) + return arr +end diff --git a/src/simdvec.jl b/src/simdvec.jl new file mode 100644 index 0000000..a09ca7f --- /dev/null +++ b/src/simdvec.jl @@ -0,0 +1,414 @@ +struct Vec{N, T <: VecTypes} + data::LVec{N, T} +end + +# Constructors +@inline Vec(v::NTuple{N, T}) where {N, T} = Vec(VE.(v)) +@inline Vec(v::Vararg{T, N}) where {N, T} = Vec(v) +@inline Vec(v::Vec) = v +# Numbers defines this and it is needed in power_by_squaring... +Base.copy(v::Vec) = v + +# No throwing versions of convert +@inline _unsafe_convert(::Type{T}, v) where {T <: IntegerTypes} = v % T +@inline _unsafe_convert(::Type{T}, v) where {T <: VecTypes} = convert(T, v) +@inline constantvector(v::T1, ::Type{Vec{N, T2}}) where {N, T1, T2} = + Vec(Intrinsics.constantvector(_unsafe_convert(T2, v), Intrinsics.LVec{N, T2})) + +@inline Vec{N, T}(v::Vec{N, T}) where {N, T<:VecTypes} = v +@inline Vec{N, T}(v::Vec{N, T}) where {N, T<:FloatingTypes} = v +@inline Vec{N, T1}(v::T2) where {N, T1<:VecTypes, T2<:VecTypes} = constantvector(v, Vec{N, T1}) +@inline Vec{N, T1}(v::Vec{N, T2}) where {N, T1<:Union{IntegerTypes, Ptr}, T2<:Union{IntegerTypes, Ptr}} = + convert(Vec{N, T1}, v) + +@inline Base.convert(::Type{Vec{N,T}}, v::Vec{N,T}) where {N,T} = v +@inline function Base.convert(::Type{Vec{N, T1}}, v::Vec{N, T2}) where {T1, T2, N} + if T1 <: Union{IntegerTypes, Ptr} + if T2 <: Union{IntegerTypes, Ptr} + if sizeof(T1) < sizeof(T2) + return Vec(Intrinsics.trunc(Intrinsics.LVec{N, T1}, v.data)) + elseif sizeof(T1) == sizeof(T2) + return Vec(Intrinsics.bitcast(Intrinsics.LVec{N, T1}, v.data)) + else + if T2 <: UIntTypes + return Vec(Intrinsics.zext(Intrinsics.LVec{N, T1}, v.data)) + else + return Vec(Intrinsics.sext(Intrinsics.LVec{N, T1}, 
v.data)) + end + end + elseif T2 <: FloatingTypes + if T1 <: UIntTypes + return Vec(Intrinsics.fptoui(Intrinsics.LVec{N, T1}, v.data)) + elseif T1 <: IntTypes + return Vec(Intrinsics.fptosi(Intrinsics.LVec{N, T1}, v.data)) + end + end + end + if T1 <: FloatingTypes + if T2 <: UIntTypes + return Vec(Intrinsics.uitofp(Intrinsics.LVec{N, T1}, v.data)) + elseif T2 <: IntTypes + return Vec(Intrinsics.sitofp(Intrinsics.LVec{N, T1}, v.data)) + elseif T2 <: FloatingTypes + if sizeof(T1) < sizeof(T2) + return Vec(Intrinsics.fptrunc(Intrinsics.LVec{N, T1}, v.data)) + else + return Vec(Intrinsics.fpext(Intrinsics.LVec{N, T1}, v.data)) + end + end + end + _unreachable() +end +@noinline _unreachable() = error("unreachable") + +Base.Tuple(v::Vec) = map(i -> i.value, v.data) +Base.NTuple{N, T}(v::Vec{N}) where {T, N} = map(i -> convert(T, i.value), v.data) + +Base.eltype(::Type{Vec{N,T}}) where {N,T} = T +Base.ndims( ::Type{Vec{N,T}}) where {N,T} = 1 +Base.length(::Type{Vec{N,T}}) where {N,T} = N +Base.size( ::Type{Vec{N,T}}) where {N,T} = (N,) +Base.size( ::Type{Vec{N,T}}, n::Integer) where {N,T} = n > N ? 
1 : (N,)[n] + +Base.eltype(V::Vec) = eltype(typeof(V)) +Base.ndims(V::Vec) = ndims(typeof(V)) +Base.length(V::Vec) = length(typeof(V)) +Base.size(V::Vec) = size(typeof(V)) +Base.size(V::Vec, n::Integer) = size(typeof(V), n) + +function Base.show(io::IO, v::Vec{N,T}) where {N,T} + print(io, "<$N x $T>[") + join(io, [x.value for x in v.data], ", ") + print(io, "]") +end + +@inline Base.checkbounds(v::Vec, i::IntegerTypes) = +(i < 1 || i > length(v.data)) && Base.throw_boundserror(v, i) + +function Base.getindex(v::Vec, i::IntegerTypes) + @boundscheck checkbounds(v, i) + return Intrinsics.extractelement(v.data, i-1) +end + +@inline function Base.setindex(v::Vec{N,T}, x, i::IntegerTypes) where {N,T} + @boundscheck checkbounds(v, i) + Vec(Intrinsics.insertelement(v.data, _unsafe_convert(T, x), i-1)) +end + +Base.zero(::Type{Vec{N,T}}) where {N, T} = Vec{N,T}(zero(T)) +Base.zero(::Vec{N,T}) where {N, T} = zero(Vec{N, T}) +Base.one(::Type{Vec{N,T}}) where {N, T} = Vec{N, T}(one(T)) +Base.one(::Vec{N,T}) where {N, T} = one(Vec{N, T}) + +Base.reinterpret(::Type{Vec{N, T}}, v::Vec) where {T, N} = Vec(Intrinsics.bitcast(Intrinsics.LVec{N, T}, v.data)) +Base.reinterpret(::Type{Vec{N, T}}, v::ScalarTypes) where {T, N} = Vec(Intrinsics.bitcast(Intrinsics.LVec{N, T}, v)) +Base.reinterpret(::Type{T}, v::Vec) where {T} = Intrinsics.bitcast(T, v.data) + + +################### +# Unary operators # +################### + +const UNARY_OPS = [ + (:sqrt , FloatingTypes , Intrinsics.sqrt) , + (:sin , FloatingTypes , Intrinsics.sin) , + (:trunc , FloatingTypes , Intrinsics.trunc) , + (:cos , FloatingTypes , Intrinsics.cos) , + (:exp , FloatingTypes , Intrinsics.exp) , + (:exp2 , FloatingTypes , Intrinsics.exp2) , + (:log , FloatingTypes , Intrinsics.log) , + (:log10 , FloatingTypes , Intrinsics.log10) , + (:log2 , FloatingTypes , Intrinsics.log2) , + (:abs , FloatingTypes , Intrinsics.fabs) , + (:floor , FloatingTypes , Intrinsics.floor) , + (:ceil , FloatingTypes , Intrinsics.ceil) , + # 
(:rint , FloatingTypes , Intrinsics) , + # (:nearbyint , FloatingTypes , Intrinsics) , + (:round , FloatingTypes , Intrinsics.round) , + + # (:bitreverse , IntegerTypes , Intrinsics.bitreverse) , + (:bswap , IntegerTypes , Intrinsics.bswap) , + (:count_ones , IntegerTypes , Intrinsics.ctpop) , + (:leading_zeros , IntegerTypes , Intrinsics.ctlz) , + (:trailing_zeros , IntegerTypes , Intrinsics.cttz) , +] + +for (op, constraint, llvmop) in UNARY_OPS + @eval @inline (Base.$op)(x::Vec{<:Any, <:$constraint}) = + Vec($(llvmop)(x.data)) +end + +Base.:+(v::Vec{<:Any, <:ScalarTypes}) = v +Base.:-(v::Vec{<:Any, <:IntegerTypes}) = zero(v) - v +Base.:-(v::Vec{<:Any, <:FloatingTypes}) = Vec(Intrinsics.fneg(v.data)) +Base.:~(v::Vec{N, T}) where {N, T<:IntegerTypes} = Vec(Intrinsics.xor(v.data, Vec{N, T}(-1).data)) +Base.:~(v::Vec{N, Bool}) where {N} = Vec(Intrinsics.xor(v.data, Vec{N, Bool}(true).data)) +Base.abs(v::Vec{N, T}) where {N, T} = Vec(vifelse(v < zero(T), -v, v)) +Base.:!(v1::Vec{N,Bool}) where {N} = ~v1 +Base.inv(v::Vec{N, T}) where {N, T<:FloatingTypes} = one(T) / v + +_unsigned(::Type{Float32}) = UInt32 +_unsigned(::Type{Float64}) = UInt64 +function Base.issubnormal(x::Vec{N, T}) where {N, T<:FloatingTypes} + y = reinterpret(Vec{N, _unsigned(T)}, x) + (y & Base.exponent_mask(T) == 0) & (y & Base.significand_mask(T) != 0) +end + +@inline Base.signbit(x::Vec{N, <:IntegerTypes}) where {N} = x < 0 + +@inline Base.leading_ones(x::Vec{<:Any, <:IntegerTypes}) = leading_zeros(~(x)) +@inline Base.trailing_ones(x::Vec{<:Any, <:IntegerTypes}) = trailing_zeros(~(x)) +@inline Base.count_zeros(x::Vec{<:Any, <:IntegerTypes}) = count_ones(~(x)) + +@inline Base.isnan(v::Vec{<:Any, <:FloatingTypes}) = v != v +@inline Base.isfinite(v::Vec{<:Any, <:FloatingTypes}) = v - v == zero(v) +@inline Base.isinf(v::Vec{<:Any, <:FloatingTypes}) = !isnan(v) & !isfinite(v) +@inline Base.sign(v1::Vec{N,T}) where {N,T} = + vifelse(v1 == zero(Vec{N,T}), zero(Vec{N,T}), + vifelse(v1 < zero(Vec{N,T}), 
-one(Vec{N,T}), one(Vec{N,T}))) + +@inline Base.isnan(v::Vec{N, <:IntegerTypes}) where {N} = zero(Vec{N,Bool}) +@inline Base.isfinite(v::Vec{N, <:IntegerTypes}) where {N} = one(Vec{N, Bool}) +@inline Base.isinf(v::Vec{N, <:IntegerTypes}) where {N} = zero(Vec{N, Bool}) + + +#################### +# Binary operators # +#################### + +const BINARY_OPS = [ + (:+ , IntegerTypes , Intrinsics.add) + (:- , IntegerTypes , Intrinsics.sub) + (:* , IntegerTypes , Intrinsics.mul) + (:div , UIntTypes , Intrinsics.udiv) + (:div , IntTypes , Intrinsics.sdiv) + (:rem , UIntTypes , Intrinsics.urem) + (:rem , IntTypes , Intrinsics.srem) + + (:+ , FloatingTypes , Intrinsics.fadd) + (:- , FloatingTypes , Intrinsics.fsub) + (:* , FloatingTypes , Intrinsics.fmul) + (:^ , FloatingTypes , Intrinsics.pow) + (:/ , FloatingTypes , Intrinsics.fdiv) + (:rem , FloatingTypes , Intrinsics.frem) + (:min , FloatingTypes , Intrinsics.minnum) + (:max , FloatingTypes , Intrinsics.maxnum) + (:copysign , FloatingTypes , Intrinsics.copysign) + + (:~ , BIntegerTypes , Intrinsics.xor) + (:& , BIntegerTypes , Intrinsics.and) + (:| , BIntegerTypes , Intrinsics.or) + (:⊻ , BIntegerTypes , Intrinsics.xor) + + (:(==) , BIntegerTypes , Intrinsics.icmp_eq) + (:(!=) , BIntegerTypes , Intrinsics.icmp_ne) + (:(>) , BIntTypes , Intrinsics.icmp_sgt) + (:(>=) , BIntTypes , Intrinsics.icmp_sge) + (:(<) , BIntTypes , Intrinsics.icmp_slt) + (:(<=) , BIntTypes , Intrinsics.icmp_sle) + (:(>) , UIntTypes , Intrinsics.icmp_ugt) + (:(>=) , UIntTypes , Intrinsics.icmp_uge) + (:(<) , UIntTypes , Intrinsics.icmp_ult) + (:(<=) , UIntTypes , Intrinsics.icmp_ule) + + (:(==) , FloatingTypes , Intrinsics.fcmp_oeq) + (:(!=) , FloatingTypes , Intrinsics.fcmp_une) + (:(>) , FloatingTypes , Intrinsics.fcmp_ogt) + (:(>=) , FloatingTypes , Intrinsics.fcmp_oge) + (:(<) , FloatingTypes , Intrinsics.fcmp_olt) + (:(<=) , FloatingTypes , Intrinsics.fcmp_ole) +] + +for (op, constraint, llvmop) in BINARY_OPS + @eval @inline function 
(Base.$op)(x::Vec{N, T}, y::Vec{N, T}) where {N, T <: $constraint} + Vec($(llvmop)(x.data, y.data)) + end +end + +# max min +@inline Base.max(v1::Vec{N,T}, v2::Vec{N,T}) where {N,T<:IntegerTypes} = + Vec(vifelse(v1 >= v2, v1, v2)) +@inline Base.min(v1::Vec{N,T}, v2::Vec{N,T}) where {N,T<:IntegerTypes} = + Vec(vifelse(v1 >= v2, v2, v1)) + +# Pow +@inline Base.:^(x::Vec{N,T}, y::IntegerTypes) where {N,T<:FloatingTypes} = + Vec(Intrinsics.powi(x.data, y)) +# Do what Base does for HWNumber: +@inline Base.literal_pow(::typeof(^), x::Vec, ::Val{0}) = one(typeof(x)) +@inline Base.literal_pow(::typeof(^), x::Vec, ::Val{1}) = x +@inline Base.literal_pow(::typeof(^), x::Vec, ::Val{2}) = x*x +@inline Base.literal_pow(::typeof(^), x::Vec, ::Val{3}) = x*x*x + +# Sign +@inline Base.flipsign(v1::Vec{N,T}, v2::Vec{N,T}) where {N,T} = + vifelse(signbit(v2), -v1, v1) +@inline Base.copysign(v1::Vec{N,T}, v2::Vec{N,T}) where {N,T<:IntTypes} = + vifelse(signbit(v2), -abs(v1), abs(v1)) +_signed(::Type{Float32}) = Int32 +_signed(::Type{Float64}) = Int64 +@inline Base.signbit(x::Vec{N, T}) where {N, T <:FloatingTypes} = + signbit(reinterpret(Vec{N, _signed(T)}, x)) + +# Pointer arithmetic +for op in (:+, :-) + @eval begin + # Cast pointer to Int and back + @inline Base.$op(x::Vec{N,Ptr{T}}, y::Vec{N,Ptr{T}}) where {N,T} = + convert(Vec{N, Ptr{T}}, ($(op)(convert(Vec{N, Int}, x), convert(Vec{N, Int}, y)))) + @inline Base.$op(x::Vec{N,Ptr{T}}, y::Union{IntegerTypes}) where {N,T} = $(op)(x, Vec{N,Ptr{T}}(y)) + @inline Base.$op(x::IntegerTypes, y::Union{Vec{N,Ptr{T}}}) where {N,T} = $(op)(y, x) + + @inline Base.$op(x::Vec{N,<:IntegerTypes}, y::Ptr{T}) where {N,T} = $(op)(Vec{N,Ptr{T}}(x), Vec{N,Ptr{T}}(y)) + @inline Base.$op(x::Ptr{T}, y::Vec{N,<:IntegerTypes}) where {N,T} = $(op)(y, x) + end +end + +# Bitshifts +# See https://github.com/JuliaLang/julia/blob/7426625b5c07b0d93110293246089a259a0a677d/src/intrinsics.cpp#L1179-L1196 +# Shifting with a value larger than the number of bits in the 
type is undefined behavior +# so set to zero in those cases. +@inline function shl_int(x::Vec{N, T1}, y::Vec{N, T2}) where {N, T1<:IntegerTypes, T2<:IntegerTypes} + vifelse(y > sizeof(T1) * 8, + zero(Vec{N, T1}), + Vec(Intrinsics.shl(x.data, convert(Vec{N,T1}, y).data))) +end + +@inline function lshr_int(x::Vec{N, T1}, y::Vec{N, T2}) where {N, T1<:IntegerTypes, T2<:IntegerTypes} + vifelse(y > sizeof(T1) * 8, + zero(Vec{N, T1}), + Vec(Intrinsics.lshr(x.data, convert(Vec{N,T1}, y).data))) +end + +@inline function ashr_int(x::Vec{N, T1}, y::Vec{N, T2}) where {N, T1<:IntegerTypes, T2<:IntegerTypes} + vifelse(y > sizeof(T1) * 8, + Vec(Intrinsics.ashr(x.data, Vec{N,T1}(sizeof(T1)*8-1).data)), + Vec(Intrinsics.ashr(x.data, Vec{N,T1}(y).data))) +end + +# See https://github.com/JuliaLang/julia/blob/a211abcdfacc05cb93c15774a59ce8961c16dac4/base/int.jl#L422-L435 +@inline Base.:>>(x::Vec{N, <:IntTypes}, y::Vec{N, <:UIntTypes}) where {N} = + ashr_int(x, y) +@inline Base.:>>(x::Vec{N, T1}, y::Vec{N, T2}) where {N, T1<:UIntTypes, T2<:UIntTypes} = + lshr_int(x, y) +@inline Base.:<<(x::Vec{N, T1}, y::Vec{N, T2}) where {N, T1<:IntegerTypes, T2<:UIntTypes} = + shl_int(x, y) +@inline Base.:>>>(x::Vec{N, T1}, y::Vec{N, T2}) where {N, T1<:IntegerTypes, T2<:UIntTypes} = + lshr_int(x, y) + +@inline unsigned(v::Vec{<:Any, <:UIntTypes}) = v +@inline unsigned(v::Vec{N, Int32}) where {N} = convert(Vec{N, UInt32}, v) +@inline unsigned(v::Vec{N, Int64}) where {N} = convert(Vec{N, UInt64}, v) + +@inline Base.:>>(x::Vec{N, T1}, y::Vec{N, Int}) where {N, T1<:IntegerTypes} = + vifelse(0 <= y, x >> unsigned(y), x << unsigned(-y)) +@inline Base.:<<(x::Vec{N, T1}, y::Vec{N, Int}) where {N, T1<:IntegerTypes} = + vifelse(0 <= y, x << unsigned(y), x >> unsigned(-y)) +@inline Base.:>>>(x::Vec{N, T1}, y::Vec{N, Int}) where {N, T1<:IntegerTypes} = + vifelse(0 <= y, x >>> unsigned(y), x << unsigned(-y)) + +for v in (:<<, :>>, :>>>) + @eval begin + @inline Base.$v(x::Vec{N,T}, y::ScalarTypes) where {N, T} = 
$v(x, Vec{N,T}(y)) + @inline Base.$v(x::Vec{N,T}, y::T2) where {N, T<:IntegerTypes, T2<:UIntTypes} = $v(x, Vec{N,T2}(y)) + @inline Base.$v(x::ScalarTypes, y::Vec{N,T}) where {N, T} = $v(Vec{N,T}(x), y) + @inline Base.$v(x::Vec{N,T1}, y::Vec{N,T2}) where {N, T1<:IntegerTypes, T2<:IntegerTypes} = + $v(x, convert(Vec{N, Int}, y)) + end +end + +# Vectorize binary functions +for (op, constraint) in [BINARY_OPS; + (:flipsign , ScalarTypes) + (:copysign , ScalarTypes) + (:signbit , ScalarTypes) + (:min , IntegerTypes) + (:max , IntegerTypes) + (:<< , IntegerTypes) + (:>> , IntegerTypes) + (:>>> , IntegerTypes) + ] + @eval @inline function (Base.$op)(x::T2, y::Vec{N, T}) where {N, T2<:ScalarTypes, T <: $constraint} + Base.$op(Vec{N, T}(x), y) + end + @eval @inline function (Base.$op)(x::Vec{N, T}, y::T2) where {N, T2 <:ScalarTypes, T <: $constraint} + Base.$op(x, Vec{N, T}(y)) + end +end + +##################### +# Ternary operators # +##################### + +@inline vifelse(v::Bool, v1::Vec{N, T}, v2::Vec{N, T}) where {N, T} = ifelse(v, v1, v2) +@inline vifelse(v::Bool, v1::Vec{N, T}, v2::ScalarTypes) where {N, T} = ifelse(v, v1, Vec{N,T}(v2)) +@inline vifelse(v::Bool, v1::ScalarTypes, v2::Vec{N, T}) where {N, T} = ifelse(v, Vec{N,T}(v1), v2) + +@inline vifelse(v::Bool, v1::T, v2::T) where {T} = ifelse(v, v1, v2) +@inline vifelse(v::Vec{N, Bool}, v1::Vec{N, T}, v2::Vec{N, T}) where {N, T} = + Vec(Intrinsics.select(v.data, v1.data, v2.data)) +@inline vifelse(v::Vec{N, Bool}, v1::T2, v2::Vec{N, T}) where {N, T, T2 <:ScalarTypes} = vifelse(v, Vec{N, T}(v1), v2) +@inline vifelse(v::Vec{N, Bool}, v1::Vec{N, T}, v2::T2) where {N, T, T2 <:ScalarTypes} = vifelse(v, v1, Vec{N, T}(v2)) + +# fma, muladd and vectorization of these +for (op, llvmop) in [(:fma, Intrinsics.fma), (:muladd, Intrinsics.fmuladd)] + @eval begin + @inline Base.$op(a::Vec{N, T}, b::Vec{N, T}, c::Vec{N, T}) where {N,T<:FloatingTypes} = + Vec($llvmop(a.data, b.data, c.data)) + @inline Base.$op(s1::ScalarTypes, 
v2::Vec{N,T}, v3::Vec{N,T}) where {N,T<:FloatingTypes} = + $op(Vec{N,T}(s1), v2, v3) + @inline Base.$op(v1::Vec{N,T}, s2::ScalarTypes, v3::Vec{N,T}) where {N,T<:FloatingTypes} = + $op(v1, Vec{N,T}(s2), v3) + @inline Base.$op(s1::ScalarTypes, s2::ScalarTypes, v3::Vec{N,T}) where {N,T<:FloatingTypes} = + $op(Vec{N,T}(s1), Vec{N,T}(s2), v3) + @inline Base.$op(v1::Vec{N,T}, v2::Vec{N,T}, s3::ScalarTypes) where {N,T<:FloatingTypes} = + $op(v1, v2, Vec{N,T}(s3)) + @inline Base.$op(s1::ScalarTypes, v2::Vec{N,T}, s3::ScalarTypes) where {N,T<:FloatingTypes} = + $op(Vec{N,T}(s1), v2, Vec{N,T}(s3)) + @inline Base.$op(v1::Vec{N,T}, s2::ScalarTypes, s3::ScalarTypes) where {N,T<:FloatingTypes} = + $op(v1, Vec{N,T}(s2), Vec{N,T}(s3)) + end +end + + +############## +# Reductions # +############## +const HORZ_REDUCTION_OPS = [ + (& , Union{IntegerTypes, Bool} , Intrinsics.reduce_and) + (| , Union{IntegerTypes, Bool} , Intrinsics.reduce_or) + (max , IntTypes , Intrinsics.reduce_smax) + (max , UIntTypes , Intrinsics.reduce_umax) + (max , FloatingTypes , Intrinsics.reduce_fmax) + (min , IntTypes , Intrinsics.reduce_smin) + (min , UIntTypes , Intrinsics.reduce_umin) + (min , FloatingTypes , Intrinsics.reduce_fmin) + (+ , IntegerTypes , Intrinsics.reduce_add) + (* , IntegerTypes , Intrinsics.reduce_mul) + (+ , FloatingTypes , Intrinsics.reduce_fadd) + (* , FloatingTypes , Intrinsics.reduce_fmul) +] + +for (op, constraint, llvmop) in HORZ_REDUCTION_OPS + @eval @inline Base.reduce(::typeof($op), x::Vec{<:Any, <:$constraint}) = + $(llvmop)(x.data) +end +Base.reduce(F::Any, v::Vec) = error("reduction not defined for SIMD.Vec on $F") + +@inline Base.all(v::Vec{<:Any,Bool}) = reduce(&, v) +@inline Base.any(v::Vec{<:Any,Bool}) = reduce(|, v) +@inline Base.maximum(v::Vec) = reduce(max, v) +@inline Base.minimum(v::Vec) = reduce(min, v) +@inline Base.prod(v::Vec) = reduce(*, v) +@inline Base.sum(v::Vec) = reduce(+, v) + +############ +# Shuffles # +############ + +@inline function 
shufflevector(x::Vec{N, T}, ::Val{I}) where {N, T, I} + Vec(Intrinsics.shufflevector(x.data, Val(I))) +end +@inline function shufflevector(x::Vec{N, T}, y::Vec{N, T}, ::Val{I}) where {N, T, I} + Vec(Intrinsics.shufflevector(x.data, y.data, Val(I))) +end diff --git a/test/runtests.jl b/test/runtests.jl index 47f9b2b..74bb8f3 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,6 +1,8 @@ using SIMD using Test, InteractiveUtils +using Base: setindex + """ llvm_ir(f, args) :: String @@ -8,20 +10,22 @@ Get LLVM IR of `f(args...)` as a string. """ llvm_ir(f, args) = sprint(code_llvm, f, Base.typesof(args...)) -@testset "SIMD" begin +#@testset "SIMD" begin + # The vector we are testing. + global const nbytes = 32 - # The vector we are testing. Ideally, we should be able to use any vector size - # anywhere, but LLVM codegen bugs prevent us from doing so -- thus we make this - # a parameter. - global const nbytes = 32 + global const L8 = nbytes÷4 + global const L4 = nbytes÷8 - global const L8 = nbytes÷4 - global const L4 = nbytes÷8 + global const V8I32 = Vec{L8,Int32} + global const V8I64 = Vec{L8,Int64} + global const V4F64 = Vec{L4,Float64} - global const V8I32 = Vec{L8,Int32} - global const V4F64 = Vec{L4,Float64} + global const v8i32 = ntuple(i->Int32(ifelse(isodd(i), i, -i)), L8) + global const v8i64 = ntuple(i->Int64(ifelse(isodd(i), i, -i)), L8) + global const v4f64 = ntuple(i->Float64(ifelse(isodd(i), i, -i)), L4) - is_checking_bounds = Core.Compiler.inbounds_option() == :on + is_checking_bounds = Core.Compiler.inbounds_option() == :on @testset "Type properties" begin @test eltype(V8I32) === Int32 @@ -37,10 +41,6 @@ llvm_ir(f, args) = sprint(code_llvm, f, Base.typesof(args...)) end @testset "Type conversion" begin - - global const v8i32 = ntuple(i->Int32(ifelse(isodd(i), i, -i)), L8) - global const v4f64 = ntuple(i->Float64(ifelse(isodd(i), i, -i)), L4) - @test string(V8I32(v8i32)) == "<8 x Int32>[" * string(v8i32)[2:end-1] * "]" @test string(V4F64(v4f64)) == 
"<4 x Float64>[" * string(v4f64)[2:end-1] * "]" @@ -54,43 +54,32 @@ llvm_ir(f, args) = sprint(code_llvm, f, Base.typesof(args...)) @test Tuple(V4F64(v4f64)) === Tuple(v4f64) end + @testset "Conversion and reinterpretation" begin + v = V8I32(v8i32) + V4I64 = reinterpret(Vec{4, Int64}, v) + @test sum(count_ones(v)) == sum(count_ones(V4I64)) + @test sum(count_zeros(v)) == sum(count_zeros(V4I64)) + x = Int64(123456789) + @test reinterpret(Int64, reinterpret(Vec{4, Int16}, x)) == x + + @test all(Tuple(convert(Vec{8, Float64}, v)) .== Tuple(v)) + end + @testset "Element-wise access" begin for i in 1:L8 - @test Tuple(setindex(V8I32(v8i32), 9.0, Val(i))) === - ntuple(j->Int32(ifelse(j==i, 9, v8i32[j])), L8) - @test Tuple(setindex(V8I32(v8i32), 9.0, Val{i})) === - ntuple(j->Int32(ifelse(j==i, 9, v8i32[j])), L8) - @test Tuple(setindex(V8I32(v8i32), 9.0, i)) === - ntuple(j->Int32(ifelse(j==i, 9, v8i32[j])), L8) - - @test V8I32(v8i32)[Val{i}] === v8i32[i] @test V8I32(v8i32)[i] === v8i32[i] end - @test_throws BoundsError setindex(V8I32(v8i32), 0, Val(0)) - @test_throws BoundsError setindex(V8I32(v8i32), 0, Val{0}) - @test_throws BoundsError setindex(V8I32(v8i32), 0, Val(L8+1)) - @test_throws BoundsError setindex(V8I32(v8i32), 0, Val{L8+1}) @test_throws BoundsError setindex(V8I32(v8i32), 0, 0) @test_throws BoundsError setindex(V8I32(v8i32), 0, L8+1) - @test_throws BoundsError V8I32(v8i32)[Val(0)] - @test_throws BoundsError V8I32(v8i32)[Val{0}] - @test_throws BoundsError V8I32(v8i32)[Val(L8+1)] - @test_throws BoundsError V8I32(v8i32)[Val{L8+1}] @test_throws BoundsError V8I32(v8i32)[0] @test_throws BoundsError V8I32(v8i32)[L8+1] for i in 1:L4 - @test Tuple(setindex(V4F64(v4f64), 9, Val(i))) === - ntuple(j->Float64(ifelse(j==i, 9.0, v4f64[j])), L4) - @test Tuple(setindex(V4F64(v4f64), 9, Val{i})) === - ntuple(j->Float64(ifelse(j==i, 9.0, v4f64[j])), L4) @test Tuple(setindex(V4F64(v4f64), 9, i)) === ntuple(j->Float64(ifelse(j==i, 9.0, v4f64[j])), L4) - @test V4F64(v4f64)[Val(i)] === 
v4f64[i] - @test V4F64(v4f64)[Val{i}] === v4f64[i] @test V4F64(v4f64)[i] === v4f64[i] end @@ -108,8 +97,9 @@ llvm_ir(f, args) = sprint(code_llvm, f, Base.typesof(args...)) global const v8i32c = map(x->Int32(x*2), v8i32) notbool(x) = !(x>=typeof(x)(0)) - for op in (~, +, -, abs, notbool, sign, signbit) - @test Tuple(op(V8I32(v8i32))) === map(op, v8i32) + for op in (~, +, -, abs, notbool, sign, signbit, count_ones, count_zeros, + leading_ones, leading_zeros, trailing_ones, trailing_zeros) + @test Tuple(op(V8I32(v8i32))) == map(op, v8i32) end for op in ( @@ -125,13 +115,13 @@ llvm_ir(f, args) = sprint(code_llvm, f, Base.typesof(args...)) end for op in (<<, >>, >>>) - @test Tuple(op(V8I32(v8i32), Val(3))) === map(x->op(x,3), v8i32) - @test Tuple(op(V8I32(v8i32), Val{3})) === map(x->op(x,3), v8i32) - @test Tuple(op(V8I32(v8i32), Val(-3))) === map(x->op(x,-3), v8i32) - @test Tuple(op(V8I32(v8i32), Val{-3})) === map(x->op(x,-3), v8i32) - @test Tuple(op(V8I32(v8i32), 3)) === map(x->op(x,3), v8i32) - @test Tuple(op(V8I32(v8i32), -3)) === map(x->op(x,-3), v8i32) - @test Tuple(op(V8I32(v8i32), V8I32(v8i32))) === map(op, v8i32, v8i32) + for v in (V8I32(v8i32), V8I64(v8i64)) + for z in (3, UInt(3), Int32(10000), UInt8(4)) + @test Tuple(op(v, z)) === map(x->op(x,z), Tuple(v)) + @test Tuple(op(v, -z)) === map(x->op(x,-z), Tuple(v)) + @test Tuple(op(v, v)) === map(op, Tuple(v), Tuple(v)) + end + end end @test Tuple(V8I32(v8i32)^0) === v8i32.^0 @@ -160,7 +150,7 @@ llvm_ir(f, args) = sprint(code_llvm, f, Base.typesof(args...)) length(t1)==length(t2) && all(Bool[isapprox(t1[i], t2[i]) for i in 1:length(t1)]) end - for op in (cos, exp, exp10, exp2, logabs, log10abs, log2abs, sin) + for op in (cos, exp, exp2, logabs, log10abs, log2abs, sin) rvec = Tuple(op(V4F64(v4f64))) rsca = map(op, v4f64) @test typeof(rvec) === typeof(rsca) @@ -300,8 +290,13 @@ llvm_ir(f, args) = sprint(code_llvm, f, Base.typesof(args...)) for op in (maximum, minimum, sum, prod) @test op(V8I32(v8i32)) === op(v8i32) 
end - @test all(V8I32(v8i32)) == reduce(&, v8i32) - @test any(V8I32(v8i32)) == reduce(|, v8i32) + t = Vec(true, true, true, true) + tf = Vec(true, false, true, false) + f = Vec(false, false, false, false) + @test all(t) == reduce(&, t) == true + @test all(tf) == reduce(&, tf) == false + @test any(f) == reduce(|, f) == false + @test any(tf) == reduce(|, tf) == true for op in (maximum, minimum, sum, prod) @test op(V4F64(v4f64)) === op(v4f64) @@ -583,7 +578,7 @@ llvm_ir(f, args) = sprint(code_llvm, f, Base.typesof(args...)) ir = llvm_ir(vsum, (xs, V4F64)) @test occursin(" load <4 x double>", ir) @test occursin(" fadd <4 x double>", ir) - @test occursin(r"( shufflevector <4 x double>.*){2}"s, ir) + # @test occursin(r"( shufflevector <4 x double>.*){2}"s, ir) end function vadd_masked!(xs::AbstractArray{T,1}, ys::AbstractArray{T,1}, @@ -635,7 +630,7 @@ llvm_ir(f, args) = sprint(code_llvm, f, Base.typesof(args...)) ir = llvm_ir(vsum_masked, (xs, V4F64)) @test occursin("masked.load.v4f64", ir) @test occursin(" fadd <4 x double>", ir) - @test occursin(r"( shufflevector <4 x double>.*){2}"s, ir) + # @test occursin(r"( shufflevector <4 x double>.*){2}"s, ir) end end @@ -667,24 +662,24 @@ llvm_ir(f, args) = sprint(code_llvm, f, Base.typesof(args...)) for T in (Int8,UInt8,Int16,UInt16,Int32,UInt32,Int64,UInt64,Float32,Float64) a = Vec{4,T}((1,2,3,4)) b = Vec{4,T}((5,6,7,8)) - @test shufflevector(a, b, Val{(2,3,4,5)}) === Vec{4,T}((3,4,5,6)) - @test shufflevector(a, b, Val{(1,7,5,5)}) === Vec{4,T}((2,8,6,6)) - @test shufflevector(a, b, Val{0:3}) === a - @test shufflevector(a, b, Val{4:7}) === b - @test shufflevector(a, Val{(1,0,2,3)}) === Vec{4,T}((2,1,3,4)) - @test shufflevector(a, b, Val{(0,1,4,5,2,3,6,7)}) === Vec{8,T}((1,2,5,6,3,4,7,8)) - @test shufflevector(shufflevector(a, b, Val{(6,:undef,0,:undef)}), Val{(0,2)}) === Vec{2,T}((7,1)) - @test isa(shufflevector(a, Val{(:undef,:undef,:undef,:undef)}), Vec{4,T}) + @test shufflevector(a, b, Val((2,3,4,5))) === 
Vec{4,T}((3,4,5,6)) + @test shufflevector(a, b, Val((1,7,5,5))) === Vec{4,T}((2,8,6,6)) + @test shufflevector(a, b, Val(0:3)) === a + @test shufflevector(a, b, Val(4:7)) === b + @test shufflevector(a, Val((1,0,2,3))) === Vec{4,T}((2,1,3,4)) + @test shufflevector(a, b, Val((0,1,4,5,2,3,6,7))) === Vec{8,T}((1,2,5,6,3,4,7,8)) + @test shufflevector(shufflevector(a, b, Val((6,:undef,0,:undef))), Val((0,2))) === Vec{2,T}((7,1)) + @test isa(shufflevector(a, Val((:undef,:undef,:undef,:undef))), Vec{4,T}) c = Vec{8,T}((1:8...,)) d = Vec{8,T}((9:16...,)) - @test shufflevector(c, d, Val{(0,1,8,15)}) === Vec{4,T}((1,2,9,16)) - @test shufflevector(c, d, Val{1:2:15}) === Vec{8,T}((2:2:16...,)) + @test shufflevector(c, d, Val((0,1,8,15))) === Vec{4,T}((1,2,9,16)) + @test shufflevector(c, d, Val(1:2:15)) === Vec{8,T}((2:2:16...,)) end let a = Vec{4,Bool}((true,false,true,false)) b = Vec{4,Bool}((false,false,true,true)) - @test shufflevector(a, b, Val{(2,3,4,5)}) === Vec{4,Bool}((true,false,false,false)) + @test shufflevector(a, b, Val((2,3,4,5))) === Vec{4,Bool}((true,false,false,false)) end end -end +# end From 68c0b2d9f1c425c7377d4c6c4ae75ff9a0094548 Mon Sep 17 00:00:00 2001 From: Kristoffer Carlsson Date: Thu, 13 Feb 2020 16:30:31 +0100 Subject: [PATCH 02/20] add a warning and two explicit inlines --- src/LLVM_intrinsics.jl | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/src/LLVM_intrinsics.jl b/src/LLVM_intrinsics.jl index 9172dfa..2807968 100644 --- a/src/LLVM_intrinsics.jl +++ b/src/LLVM_intrinsics.jl @@ -1,10 +1,19 @@ # LLVM operations and intrinsics module Intrinsics +# Note that in the functions below, some care needs to be taken when passing +# Julia Bools to LLVM. Julia passes Bools as LLVM i8 but expects them to only +# have the last bit as non-zero. Failure to comply with this can give weird errors +# like false !== false, where the first false is the result of some computation.
+ +# Note that no difference is made between Julia unsigned integers and signed integers +# when passed to LLVM. It is up to the caller to make sure that the correct +# intrinsic is called (e.g. uitofp vs sitofp). + # TODO: fastmath flags import ..SIMD: SIMD, VE, LVec, FloatingTypes -# Inlcude Bool in IntegerTypes +# Include Bool in IntegerTypes const IntegerTypes = Union{SIMD.IntegerTypes, Bool} const d = Dict{DataType, String}( @@ -552,6 +561,7 @@ for (fs, c) in zip([HORZ_REDUCTION_OPS_FLOAT, HORZ_REDUCTION_OPS_INT], ret $(d[T]) %res """ return quote + $(Expr(:meta, :inline)); Base.llvmcall($(decl, s2), T, Tuple{LVec{N, T},}, x) end end @@ -570,6 +580,7 @@ for (f, neutral) in [(:fadd, "0.0"), (:fmul, "1.0")] ret $(d[T]) %res """ return quote + $(Expr(:meta, :inline)); Base.llvmcall($(decl, s2), T, Tuple{LVec{N, T},}, x) end end From e3321e54a8ccbbba2164ffebea9cfd8b914394cb Mon Sep 17 00:00:00 2001 From: Kristoffer Carlsson Date: Thu, 13 Feb 2020 16:30:31 +0100 Subject: [PATCH 03/20] add functions for doing saturated adds and subs --- src/LLVM_intrinsics.jl | 31 ++++++++---- src/simdvec.jl | 111 +++++++++++++++++++++-------------------- test/runtests.jl | 10 ++++ 3 files changed, 89 insertions(+), 63 deletions(-) diff --git a/src/LLVM_intrinsics.jl b/src/LLVM_intrinsics.jl index 2807968..8c3f172 100644 --- a/src/LLVM_intrinsics.jl +++ b/src/LLVM_intrinsics.jl @@ -45,9 +45,10 @@ suffix(N::Integer, ::Type{Ptr{T}}) where {T} = "v$(N)p0$(T<:IntegerTypes ? "i" : suffix(N::Integer, ::Type{T}) where {T} = "v$(N)$(T<:IntegerTypes ? "i" : "f")$(8*sizeof(T))" suffix(::Type{T}) where {T} = "$(T<:IntegerTypes ?
"i" : "f")$(8*sizeof(T))" -llvm_name(llvmf, N, T) = string("llvm", ".", llvmf, ".", suffix(N, T)) -llvm_name(llvmf, ::Type{LVec{N, T}}) where {N,T} = string("llvm", ".", llvmf, ".", suffix(N, T)) -llvm_name(llvmf, ::Type{T}) where {T} = string("llvm", ".", llvmf, ".", suffix(T)) +dotit(f) = replace(string(f), "_" => ".") +llvm_name(llvmf, N, T) = string("llvm", ".", dotit(llvmf), ".", suffix(N, T)) +llvm_name(llvmf, ::Type{LVec{N, T}}) where {N,T} = string("llvm", ".", dotit(llvmf), ".", suffix(N, T)) +llvm_name(llvmf, ::Type{T}) where {T} = string("llvm", ".", dotit(llvmf), ".", suffix(T)) llvm_type(::Type{T}) where {T} = d[T] llvm_type(::Type{LVec{N, T}}) where {N,T} = "< $N x $(d[T])>" @@ -171,13 +172,23 @@ const BINARY_INTRINSICS_FLOAT = [ :round ] -for f in BINARY_INTRINSICS_FLOAT - @eval @generated function $(f)(x::T, y::T) where T<:LT{<:FloatingTypes} - ff = llvm_name($(QuoteNode(f)), T,) - return :( - $(Expr(:meta, :inline)); - ccall($ff, llvmcall, T, (T, T), x, y) - ) +const BINARY_INTRINSICS_INT = [ + :sadd_sat + :uadd_sat + :ssub_sat + :usub_sat +] + +for (fs, c) in zip([BINARY_INTRINSICS_FLOAT, BINARY_INTRINSICS_INT], + [FloatingTypes, IntegerTypes]) + for f in fs + @eval @generated function $(f)(x::T, y::T) where T<:LT{<:$c} + ff = llvm_name($(QuoteNode(f)), T,) + return :( + $(Expr(:meta, :inline)); + ccall($ff, llvmcall, T, (T, T), x, y) + ) + end end end diff --git a/src/simdvec.jl b/src/simdvec.jl index a09ca7f..6932b59 100644 --- a/src/simdvec.jl +++ b/src/simdvec.jl @@ -177,50 +177,54 @@ end #################### const BINARY_OPS = [ - (:+ , IntegerTypes , Intrinsics.add) - (:- , IntegerTypes , Intrinsics.sub) - (:* , IntegerTypes , Intrinsics.mul) - (:div , UIntTypes , Intrinsics.udiv) - (:div , IntTypes , Intrinsics.sdiv) - (:rem , UIntTypes , Intrinsics.urem) - (:rem , IntTypes , Intrinsics.srem) - - (:+ , FloatingTypes , Intrinsics.fadd) - (:- , FloatingTypes , Intrinsics.fsub) - (:* , FloatingTypes , Intrinsics.fmul) - (:^ , FloatingTypes , 
Intrinsics.pow) - (:/ , FloatingTypes , Intrinsics.fdiv) - (:rem , FloatingTypes , Intrinsics.frem) - (:min , FloatingTypes , Intrinsics.minnum) - (:max , FloatingTypes , Intrinsics.maxnum) - (:copysign , FloatingTypes , Intrinsics.copysign) - - (:~ , BIntegerTypes , Intrinsics.xor) - (:& , BIntegerTypes , Intrinsics.and) - (:| , BIntegerTypes , Intrinsics.or) - (:⊻ , BIntegerTypes , Intrinsics.xor) - - (:(==) , BIntegerTypes , Intrinsics.icmp_eq) - (:(!=) , BIntegerTypes , Intrinsics.icmp_ne) - (:(>) , BIntTypes , Intrinsics.icmp_sgt) - (:(>=) , BIntTypes , Intrinsics.icmp_sge) - (:(<) , BIntTypes , Intrinsics.icmp_slt) - (:(<=) , BIntTypes , Intrinsics.icmp_sle) - (:(>) , UIntTypes , Intrinsics.icmp_ugt) - (:(>=) , UIntTypes , Intrinsics.icmp_uge) - (:(<) , UIntTypes , Intrinsics.icmp_ult) - (:(<=) , UIntTypes , Intrinsics.icmp_ule) - - (:(==) , FloatingTypes , Intrinsics.fcmp_oeq) - (:(!=) , FloatingTypes , Intrinsics.fcmp_une) - (:(>) , FloatingTypes , Intrinsics.fcmp_ogt) - (:(>=) , FloatingTypes , Intrinsics.fcmp_oge) - (:(<) , FloatingTypes , Intrinsics.fcmp_olt) - (:(<=) , FloatingTypes , Intrinsics.fcmp_ole) + (:(Base.:+) , IntegerTypes , Intrinsics.add) + (:(Base.:-) , IntegerTypes , Intrinsics.sub) + (:(Base.:*) , IntegerTypes , Intrinsics.mul) + (:(Base.div) , UIntTypes , Intrinsics.udiv) + (:(Base.div) , IntTypes , Intrinsics.sdiv) + (:(Base.rem) , UIntTypes , Intrinsics.urem) + (:(Base.rem) , IntTypes , Intrinsics.srem) + + (:(add_saturate) , IntTypes , Intrinsics.sadd_sat) + (:(add_saturate) , UIntTypes , Intrinsics.uadd_sat) + (:(sub_saturate) , IntTypes , Intrinsics.ssub_sat) + (:(sub_saturate) , UIntTypes , Intrinsics.usub_sat) + + (:(Base.:+) , FloatingTypes , Intrinsics.fadd) + (:(Base.:-) , FloatingTypes , Intrinsics.fsub) + (:(Base.:*) , FloatingTypes , Intrinsics.fmul) + (:(Base.:^) , FloatingTypes , Intrinsics.pow) + (:(Base.:/) , FloatingTypes , Intrinsics.fdiv) + (:(Base.rem) , FloatingTypes , Intrinsics.frem) + (:(Base.min) , 
FloatingTypes , Intrinsics.minnum) + (:(Base.max) , FloatingTypes , Intrinsics.maxnum) + (:(Base.copysign) , FloatingTypes , Intrinsics.copysign) + (:(Base.:~) , BIntegerTypes , Intrinsics.xor) + (:(Base.:&) , BIntegerTypes , Intrinsics.and) + (:(Base.:|) , BIntegerTypes , Intrinsics.or) + (:(Base.:⊻) , BIntegerTypes , Intrinsics.xor) + + (:(Base.:(==)) , BIntegerTypes , Intrinsics.icmp_eq) + (:(Base.:!=) , BIntegerTypes , Intrinsics.icmp_ne) + (:(Base.:>) , BIntTypes , Intrinsics.icmp_sgt) + (:(Base.:>=) , BIntTypes , Intrinsics.icmp_sge) + (:(Base.:<) , BIntTypes , Intrinsics.icmp_slt) + (:(Base.:<=) , BIntTypes , Intrinsics.icmp_sle) + (:(Base.:>) , UIntTypes , Intrinsics.icmp_ugt) + (:(Base.:>=) , UIntTypes , Intrinsics.icmp_uge) + (:(Base.:<) , UIntTypes , Intrinsics.icmp_ult) + (:(Base.:<=) , UIntTypes , Intrinsics.icmp_ule) + + (:(Base.:(==)) , FloatingTypes , Intrinsics.fcmp_oeq) + (:(Base.:!=) , FloatingTypes , Intrinsics.fcmp_une) + (:(Base.:>) , FloatingTypes , Intrinsics.fcmp_ogt) + (:(Base.:>=) , FloatingTypes , Intrinsics.fcmp_oge) + (:(Base.:<) , FloatingTypes , Intrinsics.fcmp_olt) + (:(Base.:<=) , FloatingTypes , Intrinsics.fcmp_ole) ] for (op, constraint, llvmop) in BINARY_OPS - @eval @inline function (Base.$op)(x::Vec{N, T}, y::Vec{N, T}) where {N, T <: $constraint} + @eval @inline function $op(x::Vec{N, T}, y::Vec{N, T}) where {N, T <: $constraint} Vec($(llvmop)(x.data, y.data)) end end @@ -317,22 +321,23 @@ for v in (:<<, :>>, :>>>) end end + # Vectorize binary functions for (op, constraint) in [BINARY_OPS; - (:flipsign , ScalarTypes) - (:copysign , ScalarTypes) - (:signbit , ScalarTypes) - (:min , IntegerTypes) - (:max , IntegerTypes) - (:<< , IntegerTypes) - (:>> , IntegerTypes) - (:>>> , IntegerTypes) + (:(Base.flipsign) , ScalarTypes) + (:(Base.copysign) , ScalarTypes) + (:(Base.signbit) , ScalarTypes) + (:(Base.min) , IntegerTypes) + (:(Base.max) , IntegerTypes) + (:(Base.:<<) , IntegerTypes) + (:(Base.:>>) , IntegerTypes) + (:(Base.:>>>) 
, IntegerTypes) ] - @eval @inline function (Base.$op)(x::T2, y::Vec{N, T}) where {N, T2<:ScalarTypes, T <: $constraint} - Base.$op(Vec{N, T}(x), y) + @eval @inline function $op(x::T2, y::Vec{N, T}) where {N, T2<:ScalarTypes, T <: $constraint} + $op(Vec{N, T}(x), y) end - @eval @inline function (Base.$op)(x::Vec{N, T}, y::T2) where {N, T2 <:ScalarTypes, T <: $constraint} - Base.$op(x, Vec{N, T}(y)) + @eval @inline function $op(x::Vec{N, T}, y::T2) where {N, T2 <:ScalarTypes, T <: $constraint} + $op(x, Vec{N, T}(y)) end end diff --git a/test/runtests.jl b/test/runtests.jl index 74bb8f3..ae8665d 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -130,6 +130,16 @@ llvm_ir(f, args) = sprint(code_llvm, f, Base.typesof(args...)) @test Tuple(V8I32(v8i32)^3) === v8i32.^3 end + @testset "saturation" begin + v = Vec{4, UInt8}(UInt8.((150, 250, 125, 0))) + @test SIMD.add_saturate(v, UInt8(50)) === Vec{4, UInt8}(UInt8.((200, 255, 175, 50))) + @test SIMD.sub_saturate(v, UInt8(100)) === Vec{4, UInt8}(UInt8.((50, 150, 25, 0))) + v = Vec{4, Int8}(Int8.((100, -100, 20, -20))) + @test SIMD.add_saturate(v, Int8(50)) === Vec{4, Int8}(Int8.((127, -50, 70, 30))) + @test SIMD.sub_saturate(v, Int8(50)) === Vec{4, Int8}(Int8.((50, -128, -30, -70))) + + end + @testset "Floating point arithmetic functions" begin global const v4f64b = map(x->Float64(x+1), v4f64) From bf71b69fc66e2d2901b712df8e06867e4a121bd7 Mon Sep 17 00:00:00 2001 From: Kristoffer Date: Sat, 22 Feb 2020 20:49:22 +0100 Subject: [PATCH 04/20] fix supported element types fixup: fix supported element types --- src/SIMD.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/SIMD.jl b/src/SIMD.jl index aeb60b1..e8bae1f 100644 --- a/src/SIMD.jl +++ b/src/SIMD.jl @@ -8,9 +8,9 @@ export Vec, vload, vloada, vloadnt, vstore, vstorea, vstorent, vgather, vgathera const VE = Base.VecElement const LVec{N, T} = NTuple{N, VE{T}} -const IntTypes = Union{Int8, Int16, Int32, Int64, Int128} -const BIntTypes = 
Union{IntTypes, Bool} -const UIntTypes = Union{UInt8, UInt16, UInt32, UInt64, UInt128} +const IntTypes = Union{Int8, Int16, Int32, Int64} # Int128 and UInt128 do not get passed as LLVM vectors +const BIntTypes = Union{IntTypes, Bool} +const UIntTypes = Union{UInt8, UInt16, UInt32, UInt64} const IntegerTypes = Union{IntTypes, UIntTypes} const BIntegerTypes = Union{IntegerTypes, Bool} const FloatingTypes = Union{Float32, Float64} # Float16 support is non-native in Julia and gets passed as an i16 From f237a54abcd8fc7c90520c64be8a8af951eecf06 Mon Sep 17 00:00:00 2001 From: Kristoffer Date: Sat, 22 Feb 2020 21:01:15 +0100 Subject: [PATCH 05/20] improve typeinfo propagation --- src/simdvec.jl | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/src/simdvec.jl index 6932b59..010b8f8 100644 --- a/src/simdvec.jl +++ b/src/simdvec.jl @@ -76,10 +76,20 @@ Base.length(V::Vec) = length(typeof(V)) Base.size(V::Vec) = size(typeof(V)) Base.size(V::Vec, n::Integer) = size(typeof(V), n) -function Base.show(io::IO, v::Vec{N,T}) where {N,T} - print(io, "<$N x $T>[") - join(io, [x.value for x in v.data], ", ") - print(io, "]") +if VERSION <= v"1.4.0-rc1.0" + function Base.show(io::IO, v::Vec{N,T}) where {N,T} + print(io, "<$N x $T>[") + join(io, [x.value for x in v.data], ", ") + print(io, "]") + end +else + # This crashes on pre 1.4-rc2 + function Base.show(io::IO, v::Vec{N,T}) where {N,T} + io = IOContext(io, :typeinfo => eltype(v)) + print(io, "<$N x $T>[") + join(io, [sprint(show, x.value; context=io) for x in v.data], ", ") + print(io, "]") + end end @inline Base.checkbounds(v::Vec, i::IntegerTypes) = From 9efd5438304c29ca921f209cb9f74cc10199c917 Mon Sep 17 00:00:00 2001 From: Kristoffer Date: Sat, 22 Feb 2020 21:22:50 +0100 Subject: [PATCH 06/20] overload bitreverse when it exists in Base (1.5) --- src/simdvec.jl | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/simdvec.jl index 
010b8f8..556e2b7 100644 --- a/src/simdvec.jl +++ b/src/simdvec.jl @@ -136,13 +136,17 @@ const UNARY_OPS = [ # (:nearbyint , FloatingTypes , Intrinsics) , (:round , FloatingTypes , Intrinsics.round) , - # (:bitreverse , IntegerTypes , Intrinsics.bitreverse) , (:bswap , IntegerTypes , Intrinsics.bswap) , (:count_ones , IntegerTypes , Intrinsics.ctpop) , (:leading_zeros , IntegerTypes , Intrinsics.ctlz) , (:trailing_zeros , IntegerTypes , Intrinsics.cttz) , ] +if isdefined(Base, :bitreverse) + push!(UNARY_OPS, + (:bitreverse , IntegerTypes , Intrinsics.bitreverse) + ) +end for (op, constraint, llvmop) in UNARY_OPS @eval @inline (Base.$op)(x::Vec{<:Any, <:$constraint}) = Vec($(llvmop)(x.data)) From 86805c2acd085d5177560e03da751ceac3183e89 Mon Sep 17 00:00:00 2001 From: Kristoffer Date: Sat, 22 Feb 2020 21:23:20 +0100 Subject: [PATCH 07/20] add some more docs to reduce --- README.md | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 7548e87..f202f70 100644 --- a/README.md +++ b/README.md @@ -51,7 +51,8 @@ The SIMD package provides the usual arithmetic and logical operations for SIMD v These operators and functions are always applied element-wise, i.e. they are applied to each element in parallel, yielding again a SIMD vector as result. This means that e.g. multiplying two vectors yields a vector, and comparing two vectors yields a vector of booleans. This behaviour might seem strange and slightly unusual, but corresponds to the machine instructions provided by the hardware. It is also what is usually needed to vectorize loops. The SIMD package also provides conversion operators from scalars and tuples to SIMD vectors and from SIMD vectors to tuples. Additionally, there are `getindex` and `setindex` functions to access individual vector elements. SIMD vectors are immutable (like tuples), and `setindex` (note there is no exclamation mark at the end of the name) thus returns the modified vector. 
-```Julia + +```julia # Create a vector where all elements are Float64(1): xs = Vec{4,Float64}(1) @@ -71,12 +72,25 @@ Reduction operations reduce a SIMD vector to a scalar. The following reduction o `all any maximum minimum sum prod` Example: -```Julia + +```julia v = Vec{4,Float64}((1,2,3,4)) sum(v) 10.0 ``` +It is also possible to use reduce with bit operations: + +```julia +julia> v = Vec{4,UInt16}((1,2,3,4)) +<4 x UInt16>[0x0001, 0x0002, 0x0003, 0x0004] + +julia> reduce(|, v) +0x0007 + +julia> reduce(&, v) +0x0000 +``` ## Accessing arrays When using explicit SIMD vectorization, it is convenient to allocate arrays still as arrays of scalars, not as arrays of vectors. The `vload` and `vstore` functions allow reading vectors from and writing vectors into arrays, accessing several contiguous array elements. From bca864d6feec254a0349e9866413b7a01fe7c01b Mon Sep 17 00:00:00 2001 From: Kristoffer Date: Sat, 22 Feb 2020 21:23:50 +0100 Subject: [PATCH 08/20] move asserts in generated functions to return an error expression instead --- src/LLVM_intrinsics.jl | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/LLVM_intrinsics.jl b/src/LLVM_intrinsics.jl index 8c3f172..1f8969e 100644 --- a/src/LLVM_intrinsics.jl +++ b/src/LLVM_intrinsics.jl @@ -481,7 +481,9 @@ for (fs, c) in zip([CAST_SIZE_CHANGE_FLOAT, CAST_SIZE_CHANGE_INT], @eval @generated function $f(::Type{LVec{N, T2}}, x::LVec{N, T1}) where {N, T1 <: $c, T2 <: $c} sT1, sT2 = sizeof(T1) * 8, sizeof(T2) * 8 # Not changing size is not allowed - @assert $criteria(sT1, sT2) "size of conversion type ($T2: $sT2) must be $($criteria) than the element type ($T1: $sT1)" + if !$criteria(sT1, sT2) + return :(error("size of conversion type ($T2: $sT2) must be $($criteria) than the element type ($T1: $sT1)")) + end ff = $(QuoteNode(f)) s = """ %2 = $ff <$(N) x $(d[T1])> %0 to <$(N) x $(d[T2])> @@ -529,7 +531,9 @@ end @generated function bitcast(::Type{T1}, x::T2) where {T1<:LT, T2<:LT} sT1, sT2 = 
sizeof(T1), sizeof(T2) - @assert sT1 == sT2 "size of conversion type ($T1: $sT1) must be equal to the vector type ($T2: $sT2)" + if sT1 != sT2 + return :(error("size of conversion type ($T1: $sT1) must be equal to the vector type ($T2: $sT2)")) + end s = """ %2 = bitcast $(llvm_type(T2)) %0 to $(llvm_type(T1)) ret $(llvm_type(T1)) %2 From bbd89f8c732fbe38a12bd2373d683bf13b5a1fc3 Mon Sep 17 00:00:00 2001 From: Kristoffer Date: Sat, 22 Feb 2020 21:24:19 +0100 Subject: [PATCH 09/20] restrict eltypes in tuples and varargs for Vec constructor --- src/simdvec.jl | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/src/simdvec.jl index 556e2b7..702d18e 100644 --- a/src/simdvec.jl +++ b/src/simdvec.jl @@ -1,10 +1,10 @@ -struct Vec{N, T <: VecTypes} +struct Vec{N, T<:VecTypes} data::LVec{N, T} end # Constructors -@inline Vec(v::NTuple{N, T}) where {N, T} = Vec(VE.(v)) -@inline Vec(v::Vararg{T, N}) where {N, T} = Vec(v) +@inline Vec(v::NTuple{N, T}) where {N, T<:VecTypes} = Vec(VE.(v)) +@inline Vec(v::Vararg{T, N}) where {N, T<:VecTypes} = Vec(v) @inline Vec(v::Vec) = v # Numbers defines this and it is needed in power_by_squaring... 
Base.copy(v::Vec) = v @@ -147,6 +147,7 @@ if isdefined(Base, :bitreverse) (:bitreverse , IntegerTypes , Intrinsics.bitreverse) ) end + for (op, constraint, llvmop) in UNARY_OPS @eval @inline (Base.$op)(x::Vec{<:Any, <:$constraint}) = Vec($(llvmop)(x.data)) @@ -243,6 +244,24 @@ for (op, constraint, llvmop) in BINARY_OPS end end +# overflow + +const OVERFLOW_INTRINSICS = [ + (:(Base.Checked.add_with_overflow) , IntTypes , Intrinsics.sadd_with_overflow) + (:(Base.Checked.add_with_overflow) , UIntTypes , Intrinsics.uadd_with_overflow) + (:(Base.Checked.sub_with_overflow) , IntTypes , Intrinsics.ssub_with_overflow) + (:(Base.Checked.sub_with_overflow) , UIntTypes , Intrinsics.usub_with_overflow) + (:(Base.Checked.mul_with_overflow) , IntTypes , Intrinsics.smul_with_overflow) + (:(Base.Checked.mul_with_overflow) , UIntTypes , Intrinsics.umul_with_overflow) +] +for (op, constraint, llvmop) in OVERFLOW_INTRINSICS + @eval @inline function $op(x::Vec{N, T}, y::Vec{N, T}) where {N, T <: $constraint} + val, overflows = $(llvmop)(x.data, y.data) + return Vec(val), Vec(overflows) + end +end + + # max min @inline Base.max(v1::Vec{N,T}, v2::Vec{N,T}) where {N,T<:IntegerTypes} = Vec(vifelse(v1 >= v2, v1, v2)) @@ -346,6 +365,12 @@ for (op, constraint) in [BINARY_OPS; (:(Base.:<<) , IntegerTypes) (:(Base.:>>) , IntegerTypes) (:(Base.:>>>) , IntegerTypes) + (:(Base.Checked.add_with_overflow) , IntTypes) + (:(Base.Checked.add_with_overflow) , UIntTypes) + (:(Base.Checked.sub_with_overflow) , IntTypes) + (:(Base.Checked.sub_with_overflow) , UIntTypes) + (:(Base.Checked.mul_with_overflow) , IntTypes) + (:(Base.Checked.mul_with_overflow) , UIntTypes) ] @eval @inline function $op(x::T2, y::Vec{N, T}) where {N, T2<:ScalarTypes, T <: $constraint} $op(Vec{N, T}(x), y) From 0706d0f2cf8d2195bd274db467241a1baca5631f Mon Sep 17 00:00:00 2001 From: Kristoffer Date: Sat, 22 Feb 2020 21:28:12 +0100 Subject: [PATCH 10/20] add docs for saturation arithmetic --- README.md | 20 ++++++++++++++++++++ 1 
file changed, 20 insertions(+) diff --git a/README.md index f202f70..5905203 100644 --- a/README.md +++ b/README.md @@ -91,6 +91,26 @@ julia> reduce(|, v) julia> reduce(&, v) 0x0000 ``` + +## Saturation arithmetic + +Saturation arithmetic is a version of arithmetic in which operations are limited +to a fixed range between a minimum and maximum value. If the result of an +operation is greater than the maximum value, the result is set (or “clamped”) to +this maximum. If it is below the minimum, it is clamped to this minimum. + + +```julia +julia> v = Vec{4, Int8}((40, -80, 70, -10)) +<4 x Int8>[40, -80, 70, -10] + +julia> SIMD.add_saturate(v, v) +<4 x Int8>[80, -128, 127, -20] + +julia> SIMD.sub_saturate(v, 120) +<4 x Int8>[-80, -128, -50, -128] +``` + ## Accessing arrays When using explicit SIMD vectorization, it is convenient to allocate arrays still as arrays of scalars, not as arrays of vectors. The `vload` and `vstore` functions allow reading vectors from and writing vectors into arrays, accessing several contiguous array elements. From 516767332a0d1c6e54460d9973a51276e0440fbb Mon Sep 17 00:00:00 2001 From: Kristoffer Date: Sat, 22 Feb 2020 21:29:04 +0100 Subject: [PATCH 11/20] add overflow arithmetic --- README.md | 25 +++++++++++++++++++++++++ src/LLVM_intrinsics.jl | 39 +++++++++++++++++++++++++++++++++++++++ test/runtests.jl | 13 +++++++++++++ 3 files changed, 77 insertions(+) diff --git a/README.md index 5905203..69a2e5b 100644 --- a/README.md +++ b/README.md @@ -92,6 +92,31 @@ julia> reduce(&, v) 0x0000 ``` +## Overflow operations + +Overflow operations perform the operation but also return a flag that indicates +whether the result of the operation overflowed. +Note that these only work on Julia with LLVM 9 or higher (Julia 1.5 or higher). +The functions `Base.Checked.add_with_overflow`, `Base.Checked.sub_with_overflow`, +`Base.Checked.mul_with_overflow` are extended to work on `Vec`. 
+ +```julia +julia> v = Vec{4, Int8}((40, -80, 70, -10)) +<4 x Int8>[40, -80, 70, -10] + +julia> Base.Checked.add_with_overflow(v, v) +(<4 x Int8>[80, 96, -116, -20], <4 x Bool>[0, 1, 1, 0]) + +julia> Base.Checked.add_with_overflow(Int8(-80), Int8(-80)) +(96, true) + +julia> Base.Checked.sub_with_overflow(v, 120) +(<4 x Int8>[-80, 56, -50, 126], <4 x Bool>[0, 1, 0, 1]) + +julia> Base.Checked.mul_with_overflow(v, 2) +(<4 x Int8>[80, 96, -116, -20], <4 x Bool>[0, 1, 1, 0]) +``` + ## Saturation arithmetic Saturation arithmetic is a version of arithmetic in which operations are limited diff --git a/src/LLVM_intrinsics.jl index 1f8969e..2f16eb8 100644 --- a/src/LLVM_intrinsics.jl +++ b/src/LLVM_intrinsics.jl @@ -203,6 +203,45 @@ for (f, c) in [(:pow, FloatingTypes), (:powi, IntegerTypes)] end end +# Overflow +const OVERFLOW_INTRINSICS = [ + :sadd_with_overflow + :uadd_with_overflow + :ssub_with_overflow + :usub_with_overflow + :smul_with_overflow + :umul_with_overflow +] + +const SUPPORTS_VEC_OVERFLOW = Base.libllvm_version >= v"9" +for f in OVERFLOW_INTRINSICS + @eval @generated function $f(x::LVec{N, T}, y::LVec{N, T}) where {N, T <: IntegerTypes} + if !SUPPORTS_VEC_OVERFLOW + return :(error("LLVM version 9.0 or greater required (Julia 1.5 or greater)")) + end + ff = llvm_name($(QuoteNode(f)), N, T) + decl = "declare {<$N x $(d[T])>, <$N x i1>} @$ff(<$N x $(d[T])>, <$N x $(d[T])>)" + + # Julia passes Tuple{[U]Int8, Bool} as [2 x i8] so we need to special case that scenario + ret_type = sizeof(T) == 1 ? 
"[2 x <$N x i8>]" : "{<$N x $(d[T])>, <$N x i8>}" + + s = """ + %res = call {<$N x $(d[T])>, <$N x i1>} @$ff(<$N x $(d[T])> %0, <$N x $(d[T])> %1) + %plus = extractvalue {<$N x $(d[T])>, <$N x i1>} %res, 0 + %overflow = extractvalue {<$N x $(d[T])>, <$N x i1>} %res, 1 + %overflow_ext = zext <$(N) x i1> %overflow to <$(N) x i8> + %new_tuple = insertvalue $ret_type undef, <$N x $(d[T])> %plus, 0 + %new_tuple_2 = insertvalue $ret_type %new_tuple, <$N x i8> %overflow_ext, 1 + ret $ret_type %new_tuple_2 + """ + return :( + $(Expr(:meta, :inline)); + Base.llvmcall(($decl, $s), Tuple{LVec{N, T}, LVec{N, Bool}}, Tuple{LVec{N, T}, LVec{N, T}}, x, y) + ) + end +end + + # Comparisons const CMP_FLAGS_FLOAT = [ :false diff --git a/test/runtests.jl b/test/runtests.jl index ae8665d..75c7cc2 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -137,7 +137,20 @@ llvm_ir(f, args) = sprint(code_llvm, f, Base.typesof(args...)) v = Vec{4, Int8}(Int8.((100, -100, 20, -20))) @test SIMD.add_saturate(v, Int8(50)) === Vec{4, Int8}(Int8.((127, -50, 70, 30))) @test SIMD.sub_saturate(v, Int8(50)) === Vec{4, Int8}(Int8.((50, -128, -30, -70))) + end + using Base.Checked: add_with_overflow, sub_with_overflow, mul_with_overflow + if Base.libllvm_version >= v"9" + @testset "overflow arithmetic" begin + for f in (add_with_overflow, sub_with_overflow, mul_with_overflow) + for T in [Int8, Int16, Int32, Int64, UInt8, UInt16, UInt32, UInt64] + t2 = div(typemax(T), T(2)) + one(T) + t1 = div(typemin(T), T(2)) - (T <: Unsigned ? 
zero(T) : one(T)) + v = Vec(t2, t1, T(0), t2 - one(T)) + @test Tuple(zip(Tuple.(f(v,v))...)) === map(f, Tuple(v), Tuple(v)) + end + end + end end @testset "Floating point arithmetic functions" begin From 5fb86c2f17d57f4a1c58092d52aae6a01ca8157b Mon Sep 17 00:00:00 2001 From: Kristoffer Date: Sat, 22 Feb 2020 22:32:00 +0100 Subject: [PATCH 12/20] throw when trying to call mul with overflow on Int64 on i686 because the error we get is "LLVM ERROR: Symbols not found: { __mulodi4 }", which seems like it would require compiler-rt support --- src/LLVM_intrinsics.jl | 4 ++++ test/runtests.jl | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/src/LLVM_intrinsics.jl index 2f16eb8..3d8fe05 100644 --- a/src/LLVM_intrinsics.jl +++ b/src/LLVM_intrinsics.jl @@ -220,6 +220,10 @@ for f in OVERFLOW_INTRINSICS return :(error("LLVM version 9.0 or greater required (Julia 1.5 or greater)")) end ff = llvm_name($(QuoteNode(f)), N, T) + if $(QuoteNode(f)) == :smul_with_overflow && Sys.ARCH == :i686 && T == Int64 + str = "this intrinsic ($ff) is broken on i686" + return :(error($str)) + end decl = "declare {<$N x $(d[T])>, <$N x i1>} @$ff(<$N x $(d[T])>, <$N x $(d[T])>)" # Julia passes Tuple{[U]Int8, Bool} as [2 x i8] so we need to special case that scenario diff --git a/test/runtests.jl index 75c7cc2..89af98d 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -147,6 +147,10 @@ llvm_ir(f, args) = sprint(code_llvm, f, Base.typesof(args...)) t2 = div(typemax(T), T(2)) + one(T) t1 = div(typemin(T), T(2)) - (T <: Unsigned ? 
zero(T) : one(T)) v = Vec(t2, t1, T(0), t2 - one(T)) + if f == mul_with_overflow && Sys.ARCH == :i686 && T == Int64 + @test_throws ErrorException f(v,v) + continue + end @test Tuple(zip(Tuple.(f(v,v))...)) === map(f, Tuple(v), Tuple(v)) end end From 67372bfdf8f5d6e79e7a30b086778948e4ab7949 Mon Sep 17 00:00:00 2001 From: Kristoffer Date: Sat, 22 Feb 2020 22:45:43 +0100 Subject: [PATCH 13/20] add conversion from Bool --- src/simdvec.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/simdvec.jl b/src/simdvec.jl index 702d18e..f49c1fb 100644 --- a/src/simdvec.jl +++ b/src/simdvec.jl @@ -24,7 +24,7 @@ Base.copy(v::Vec) = v @inline Base.convert(::Type{Vec{N,T}}, v::Vec{N,T}) where {N,T} = v @inline function Base.convert(::Type{Vec{N, T1}}, v::Vec{N, T2}) where {T1, T2, N} if T1 <: Union{IntegerTypes, Ptr} - if T2 <: Union{IntegerTypes, Ptr} + if T2 <: Union{IntegerTypes, Ptr, Bool} if sizeof(T1) < sizeof(T2) return Vec(Intrinsics.trunc(Intrinsics.LVec{N, T1}, v.data)) elseif sizeof(T1) == sizeof(T2) From 5de3b157974ff70fdd0fb2bb102ceb34411e923d Mon Sep 17 00:00:00 2001 From: Kristoffer Date: Sun, 23 Feb 2020 17:21:02 +0100 Subject: [PATCH 14/20] fix some required uses of propagate_inbounds (https://github.com/JuliaLang/julia/issues/30411) --- src/arrayops.jl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/arrayops.jl b/src/arrayops.jl index 883185c..ef1210b 100644 --- a/src/arrayops.jl +++ b/src/arrayops.jl @@ -40,7 +40,7 @@ FastContiguousArray{T,N} = Union{DenseArray{T,N}, Base.FastContiguousSubArray{T, # https://github.com/JuliaArrays/MappedArrays.jl/pull/24#issuecomment-460568978 # vload -@inline function vload(::Type{Vec{N, T}}, ptr::Ptr{T}, mask::Union{Nothing, Vec{N, Bool}}=nothing, +@propagate_inbounds function vload(::Type{Vec{N, T}}, ptr::Ptr{T}, mask::Union{Nothing, Vec{N, Bool}}=nothing, ::Val{Aligned}=Val(false), ::Val{Nontemporal}=Val(false)) where {N, T, Aligned, Nontemporal} if mask === nothing 
Vec(Intrinsics.load(Intrinsics.LVec{N, T}, ptr, Val(Aligned), Val(Nontemporal))) @@ -49,7 +49,7 @@ FastContiguousArray{T,N} = Union{DenseArray{T,N}, Base.FastContiguousSubArray{T, end end -@inline function vload(::Type{Vec{N, T}}, a::FastContiguousArray{T,1}, i::Integer, mask=nothing, +@propagate_inbounds function vload(::Type{Vec{N, T}}, a::FastContiguousArray{T,1}, i::Integer, mask=nothing, ::Val{Aligned}=Val(false), ::Val{Nontemporal}=Val(false)) where {N, T, Aligned, Nontemporal} @boundscheck checkbounds(a, i + N - 1) GC.@preserve a begin @@ -61,7 +61,7 @@ end @propagate_inbounds vloadnt(::Type{T}, a, i, mask=nothing) where {T<:Vec} = vload(T, a, i, mask, Val(true), Val(true)) # vstore -@inline function vstore(x::Vec{N, T}, ptr::Ptr{T}, mask::Union{Nothing, Vec{N, Bool}}=nothing, +@propagate_inbounds function vstore(x::Vec{N, T}, ptr::Ptr{T}, mask::Union{Nothing, Vec{N, Bool}}=nothing, ::Val{Aligned}=Val(false), ::Val{Nontemporal}=Val(false)) where {N, T, Aligned, Nontemporal} if mask === nothing Intrinsics.store(x.data, ptr, Val(Aligned), Val(Nontemporal)) @@ -69,7 +69,7 @@ end Intrinsics.maskedstore(x.data, ptr, mask.data, Val(Aligned), Val(Nontemporal)) end end -@inline function vstore(x::Vec{N, T}, a::FastContiguousArray{T,1}, i::Integer, mask=nothing, +@propagate_inbounds function vstore(x::Vec{N, T}, a::FastContiguousArray{T,1}, i::Integer, mask=nothing, ::Val{Aligned}=Val(false), ::Val{Nontemporal}=Val(false)) where {N, T, Aligned, Nontemporal} @boundscheck checkbounds(a, i + N - 1) GC.@preserve a begin From 8204863834b50e8a38aca76793137d2acd2ff52c Mon Sep 17 00:00:00 2001 From: Kristoffer Date: Sun, 23 Feb 2020 17:33:09 +0100 Subject: [PATCH 15/20] add a note that the readme example is not meant to beat scalar version --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md b/README.md index 69a2e5b..21cb5ed 100644 --- a/README.md +++ b/README.md @@ -32,7 +32,12 @@ function vadd!(xs::Vector{T}, ys::Vector{T}, ::Type{Vec{N,T}}) 
where {N, T} end end ``` + To simplify this example code, the vector type that should be used (`Vec{N,T}`) is passed in explicitly as additional type argument. This routine is e.g. called as `vadd!(xs, ys, Vec{8,Float64})`. +Note that this code is not expected to outperform the standard scalar way of +doing this operation since the Julia optimizer will easily rewrite it to use +SIMD under the hood. It is merely shown as an illustration of how to load and +store data into `Vector`s using SIMD.jl. ## SIMD vector operations From bdfd5854ad1f32bed1549b50637af19dc5278252 Mon Sep 17 00:00:00 2001 From: Kristoffer Carlsson Date: Wed, 4 Mar 2020 17:40:19 +0100 Subject: [PATCH 16/20] add fast math options to intrinsics and hook into fastmath macro (#1) --- src/LLVM_intrinsics.jl | 128 +++++++++++++++++++++++++++++------------ src/simdvec.jl | 33 +++++++++-- test/runtests.jl | 8 +++ 3 files changed, 127 insertions(+), 42 deletions(-) diff --git a/src/LLVM_intrinsics.jl index 3d8fe05..405e0d5 100644 --- a/src/LLVM_intrinsics.jl +++ b/src/LLVM_intrinsics.jl @@ -10,8 +10,6 @@ # when passed to LLVM. It is up to the caller to make sure that the correct # intrinsic is called (e.g uitofp vs sitofp).
-# TODO: fastmath flags - import ..SIMD: SIMD, VE, LVec, FloatingTypes # Include Bool in IntegerTypes const IntegerTypes = Union{SIMD.IntegerTypes, Bool} @@ -53,6 +51,39 @@ llvm_name(llvmf, ::Type{T}) where {T} = string("llvm", ".", dotit(llv llvm_type(::Type{T}) where {T} = d[T] llvm_type(::Type{LVec{N, T}}) where {N,T} = "< $N x $(d[T])>" +############ +# FastMath # +############ + +module FastMath + const nnan = 1 << 0 + const ninf = 1 << 1 + const nsz = 1 << 2 + const arcp = 1 << 3 + const contract = 1 << 4 + const afn = 1 << 5 + const reassoc = 1 << 6 + const fast = 1 << 7 +end + +struct FastMathFlags{T} end +Base.@pure FastMathFlags(T::Int) = FastMathFlags{T}() + +function fp_str(::Type{FastMathFlags{T}}) where {T} + flags = String[] + (T & FastMath.nnan != 0) && push!(flags, "nnan") + (T & FastMath.ninf != 0) && push!(flags, "ninf") + (T & FastMath.nsz != 0) && push!(flags, "nsz") + (T & FastMath.arcp != 0) && push!(flags, "arcp") + (T & FastMath.contract != 0) && push!(flags, "contract") + (T & FastMath.afn != 0) && push!(flags, "afn") + (T & FastMath.reassoc != 0) && push!(flags, "reassoc") + (T & FastMath.fast != 0) && push!(flags, "fast") + return join(flags, " ") +end +fp_str(::Type{Nothing}) = "" + +const FPFlags{T} = Union{Nothing, FastMathFlags{T}} #################### # Unary operators # #################### @@ -101,9 +132,10 @@ for (fs, c) in zip([UNARY_INTRINSICS_FLOAT, UNARY_INTRINSICS_INT], end # fneg (not an intrinsic so cannot use `ccall`) -@generated function fneg(x::T) where T<:LT{<:FloatingTypes} +@generated function fneg(x::T, ::F=nothing) where {T<:LT{<:FloatingTypes}, F<:FPFlags} + fpflags = fp_str(F) s = """ - %2 = fneg $(llvm_type(T)) %0 + %2 = fneg $fpflags $(llvm_type(T)) %0 ret $(llvm_type(T)) %2 """ return :( @@ -140,20 +172,32 @@ const BINARY_OPS_INT = [ :xor ] -for (fs, c) in zip([BINARY_OPS_FLOAT, BINARY_OPS_INT], - [FloatingTypes, IntegerTypes]) - for f in fs - @eval @generated function $f(x::T, y::T) where T<:LT{<:$c} - ff = $(QuoteNode(f)) - s = 
""" - %3 = $ff $(llvm_type(T)) %0, %1 - ret $(llvm_type(T)) %3 - """ - return :( - $(Expr(:meta, :inline)); - Base.llvmcall($s, T, Tuple{T, T}, x, y) - ) - end +for f in BINARY_OPS_FLOAT + @eval @generated function $f(x::T, y::T, ::F=nothing) where {T<:LT{<:FloatingTypes}, F<:FPFlags} + fpflags = fp_str(F) + ff = $(QuoteNode(f)) + s = """ + %3 = $ff $fpflags $(llvm_type(T)) %0, %1 + ret $(llvm_type(T)) %3 + """ + return :( + $(Expr(:meta, :inline)); + Base.llvmcall($s, T, Tuple{T, T}, x, y) + ) + end +end + +for f in BINARY_OPS_INT + @eval @generated function $f(x::T, y::T) where T<:LT{<:IntegerTypes} + ff = $(QuoteNode(f)) + s = """ + %3 = $ff $(llvm_type(T)) %0, %1 + ret $(llvm_type(T)) %3 + """ + return :( + $(Expr(:meta, :inline)); + Base.llvmcall($s, T, Tuple{T, T}, x, y) + ) end end @@ -279,24 +323,36 @@ const CMP_FLAGS_INT = [ :ule ] -for (f, c, flags) in zip(["fcmp", "icmp"], - [FloatingTypes, IntegerTypes], - [CMP_FLAGS_FLOAT, CMP_FLAGS_INT]) - for flag in flags - ftot = Symbol(string(f, "_", flag)) - @eval @generated function $ftot(x::LVec{N, T}, y::LVec{N, T}) where {N, T <: $c} - fflag = $(QuoteNode(flag)) - ff = $(QuoteNode(f)) - s = """ - %res = $ff $(fflag) <$(N) x $(d[T])> %0, %1 - %resb = zext <$(N) x i1> %res to <$(N) x i8> - ret <$(N) x i8> %resb - """ - return :( - $(Expr(:meta, :inline)); - Base.llvmcall($s, LVec{N, Bool}, Tuple{LVec{N, T}, LVec{N, T}}, x, y) - ) - end +for flag in CMP_FLAGS_FLOAT + ftot = Symbol(string("fcmp_", flag)) + @eval @generated function $ftot(x::LVec{N, T}, y::LVec{N, T}, ::F=nothing) where {N, T <: FloatingTypes, F<:FPFlags} + fpflags = fp_str(F) + fflag = $(QuoteNode(flag)) + s = """ + %res = fcmp $(fpflags) $(fflag) <$(N) x $(d[T])> %0, %1 + %resb = zext <$(N) x i1> %res to <$(N) x i8> + ret <$(N) x i8> %resb + """ + return :( + $(Expr(:meta, :inline)); + Base.llvmcall($s, LVec{N, Bool}, Tuple{LVec{N, T}, LVec{N, T}}, x, y) + ) + end +end + +for flag in CMP_FLAGS_INT + ftot = Symbol(string("icmp_", flag)) + @eval 
@generated function $ftot(x::LVec{N, T}, y::LVec{N, T}) where {N, T <: IntegerTypes} + fflag = $(QuoteNode(flag)) + s = """ + %res = icmp $(fflag) <$(N) x $(d[T])> %0, %1 + %resb = zext <$(N) x i1> %res to <$(N) x i8> + ret <$(N) x i8> %resb + """ + return :( + $(Expr(:meta, :inline)); + Base.llvmcall($s, LVec{N, Bool}, Tuple{LVec{N, T}, LVec{N, T}}, x, y) + ) end end diff --git a/src/simdvec.jl b/src/simdvec.jl index f49c1fb..a47ef3c 100644 --- a/src/simdvec.jl +++ b/src/simdvec.jl @@ -114,6 +114,7 @@ Base.reinterpret(::Type{Vec{N, T}}, v::Vec) where {T, N} = Vec(Intrinsics.bitcas Base.reinterpret(::Type{Vec{N, T}}, v::ScalarTypes) where {T, N} = Vec(Intrinsics.bitcast(Intrinsics.LVec{N, T}, v)) Base.reinterpret(::Type{T}, v::Vec) where {T} = Intrinsics.bitcast(T, v.data) +const FASTMATH = Intrinsics.FastMathFlags(Intrinsics.FastMath.fast) ################### # Unary operators # @@ -156,6 +157,7 @@ end Base.:+(v::Vec{<:Any, <:ScalarTypes}) = v Base.:-(v::Vec{<:Any, <:IntegerTypes}) = zero(v) - v Base.:-(v::Vec{<:Any, <:FloatingTypes}) = Vec(Intrinsics.fneg(v.data)) +Base.FastMath.sub_fast(v::Vec{<:Any, <:FloatingTypes}) = Vec(Intrinsics.fneg(v.data, FASTMATH)) Base.:~(v::Vec{N, T}) where {N, T<:IntegerTypes} = Vec(Intrinsics.xor(v.data, Vec{N, T}(-1).data)) Base.:~(v::Vec{N, Bool}) where {N} = Vec(Intrinsics.xor(v.data, Vec{N, Bool}(true).data)) Base.abs(v::Vec{N, T}) where {N, T} = Vec(vifelse(v < zero(T), -v, v)) @@ -238,14 +240,28 @@ const BINARY_OPS = [ (:(Base.:<=) , FloatingTypes , Intrinsics.fcmp_ole) ] +function get_fastmath_function(op) + if op isa Expr && op.head == Symbol(".") && op.args[1] == :Base && + op.args[2].value in keys(Base.FastMath.fast_op) + return :(Base.FastMath.$(Base.FastMath.fast_op[op.args[2].value])) + end + return nothing +end + for (op, constraint, llvmop) in BINARY_OPS @eval @inline function $op(x::Vec{N, T}, y::Vec{N, T}) where {N, T <: $constraint} Vec($(llvmop)(x.data, y.data)) end + + # Add a fast math version if applicable + 
if (fast_op = get_fastmath_function(op)) !== nothing + @eval @inline function $(fast_op)(x::Vec{N, T}, y::Vec{N, T}) where {N, T <: $constraint} + Vec($(llvmop)(x.data, y.data, FASTMATH)) + end + end end # overflow - const OVERFLOW_INTRINSICS = [ (:(Base.Checked.add_with_overflow) , IntTypes , Intrinsics.sadd_with_overflow) (:(Base.Checked.add_with_overflow) , UIntTypes , Intrinsics.uadd_with_overflow) @@ -261,7 +277,6 @@ for (op, constraint, llvmop) in OVERFLOW_INTRINSICS end end - # max min @inline Base.max(v1::Vec{N,T}, v2::Vec{N,T}) where {N,T<:IntegerTypes} = Vec(vifelse(v1 >= v2, v1, v2)) @@ -372,11 +387,17 @@ for (op, constraint) in [BINARY_OPS; (:(Base.Checked.mul_with_overflow) , IntTypes) (:(Base.Checked.mul_with_overflow) , UIntTypes) ] - @eval @inline function $op(x::T2, y::Vec{N, T}) where {N, T2<:ScalarTypes, T <: $constraint} - $op(Vec{N, T}(x), y) + ops = [op] + if (fast_op = get_fastmath_function(op)) !== nothing + push!(ops, fast_op) end - @eval @inline function $op(x::Vec{N, T}, y::T2) where {N, T2 <:ScalarTypes, T <: $constraint} - $op(x, Vec{N, T}(y)) + for op in ops + @eval @inline function $op(x::T2, y::Vec{N, T}) where {N, T2<:ScalarTypes, T <: $constraint} + $op(Vec{N, T}(x), y) + end + @eval @inline function $op(x::Vec{N, T}, y::T2) where {N, T2 <:ScalarTypes, T <: $constraint} + $op(x, Vec{N, T}(y)) + end end end diff --git a/test/runtests.jl b/test/runtests.jl index 89af98d..fe1bb25 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -362,6 +362,14 @@ llvm_ir(f, args) = sprint(code_llvm, f, Base.typesof(args...)) end end + @testset "fastmath" begin + v = Vec(1.0,2.0,3.0,4.0) + @test all(Tuple(@fastmath v+v) .≈ Tuple(v+v)) + @test all(Tuple(@fastmath v+1.0) .≈ Tuple(v+1.0)) + @test all(Tuple(@fastmath 1.0+v) .≈ Tuple(1.0+v)) + @test all(Tuple(@fastmath -v) .≈ Tuple(-v)) + end + @testset "Gather and scatter function" begin for (arr, VT) in [(arri32, V8I32), (arrf64, V4F64)] arr .= 1:length(arr) From 
05949bc77ebee638f1f740e3c3d2daa4102e107a Mon Sep 17 00:00:00 2001 From: Kristoffer Date: Thu, 5 Mar 2020 10:21:12 +0100 Subject: [PATCH 17/20] add an extra fastmath test --- test/runtests.jl | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/runtests.jl b/test/runtests.jl index fe1bb25..6c43cd6 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -368,6 +368,9 @@ llvm_ir(f, args) = sprint(code_llvm, f, Base.typesof(args...)) @test all(Tuple(@fastmath v+1.0) .≈ Tuple(v+1.0)) @test all(Tuple(@fastmath 1.0+v) .≈ Tuple(1.0+v)) @test all(Tuple(@fastmath -v) .≈ Tuple(-v)) + f = v -> @fastmath v + v + # Test that v+v is rewritten as v * 2.0 (change test if optimization changes) + @test occursin(r"fmul fast <4 x double> %[0-9]*, Date: Sun, 22 Mar 2020 21:31:42 +0100 Subject: [PATCH 18/20] fix some boundschecks --- src/arrayops.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/arrayops.jl b/src/arrayops.jl index ef1210b..4e2ccfa 100644 --- a/src/arrayops.jl +++ b/src/arrayops.jl @@ -51,7 +51,7 @@ end @propagate_inbounds function vload(::Type{Vec{N, T}}, a::FastContiguousArray{T,1}, i::Integer, mask=nothing, ::Val{Aligned}=Val(false), ::Val{Nontemporal}=Val(false)) where {N, T, Aligned, Nontemporal} - @boundscheck checkbounds(a, i + N - 1) + @boundscheck checkbounds(a, i:(i+N-1)) GC.@preserve a begin ptr = pointer(a, i) vload(Vec{N, T}, ptr, mask, Val(Aligned), Val(Nontemporal)) @@ -71,7 +71,7 @@ end end @propagate_inbounds function vstore(x::Vec{N, T}, a::FastContiguousArray{T,1}, i::Integer, mask=nothing, ::Val{Aligned}=Val(false), ::Val{Nontemporal}=Val(false)) where {N, T, Aligned, Nontemporal} - @boundscheck checkbounds(a, i + N - 1) + @boundscheck checkbounds(a, i:(i+N-1)) GC.@preserve a begin ptr = pointer(a, i) vstore(x, ptr, mask, Val(Aligned), Val(Nontemporal)) From e8f5815b95b0a0eec6d266c7514c761a2bca1e90 Mon Sep 17 00:00:00 2001 From: KristofferC Date: Mon, 23 Mar 2020 10:37:00 +0100 Subject: [PATCH 19/20] add docs for 
fastmath --- README.md | 17 +++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/README.md index 21cb5ed..d12f5b3 100644 --- a/README.md +++ b/README.md @@ -141,6 +141,23 @@ julia> SIMD.sub_saturate(v, 120) <4 x Int8>[-80, -128, -50, -128] ``` +## Fastmath + +SIMD.jl hooks into the `@fastmath` macro so that operations in a +`@fastmath` block set the `fast` flag on the floating point intrinsics +that support it. Compare for example the generated code for the +following two functions: + +```julia +f1(a, b, c) = a * b - c * 2.0 +f2(a, b, c) = @fastmath a * b - c * 2.0 +V = Vec{4, Float64} +code_native(f1, Tuple{V, V, V}, debuginfo=:none) +code_native(f2, Tuple{V, V, V}, debuginfo=:none) +``` + +The normal caveats for using `@fastmath` naturally apply. + ## Accessing arrays When using explicit SIMD vectorization, it is convenient to allocate arrays still as arrays of scalars, not as arrays of vectors. The `vload` and `vstore` functions allow reading vectors from and writing vectors into arrays, accessing several contiguous array elements. From 90d54fdd9f1e182c3e30322d314ee0181cd7657d Mon Sep 17 00:00:00 2001 From: Kristoffer Carlsson Date: Tue, 31 Mar 2020 10:01:12 +0200 Subject: [PATCH 20/20] this release should be non-breaking --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml index 5788502..bdf8379 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "SIMD" uuid = "fdea26ae-647d-5447-a871-4b548cad5224" authors = ["Erik Schnetter ", "Kristoffer Carlsson "] -version = "3.0.0" +version = "2.9.0" [compat] julia = "1.4"
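Taken together, the series above adds saturating, overflowing, and fastmath arithmetic to `Vec`. The following is a minimal smoke test of the combined post-series API, a sketch assuming a Julia 1.5+ session with this branch of SIMD.jl installed; the helper `f` is illustrative, and the expected values mirror the README examples added in patches 10, 11, and 16:

```julia
using SIMD
using Base.Checked: add_with_overflow

v = Vec{4, Int8}((40, -80, 70, -10))

# Saturating add clamps at typemax/typemin instead of wrapping:
SIMD.add_saturate(v, v)        # <4 x Int8>[80, -128, 127, -20]

# Overflowing add returns the wrapped result plus per-lane overflow flags
# (requires LLVM 9+, i.e. Julia 1.5+):
add_with_overflow(v, v)        # (<4 x Int8>[80, 96, -116, -20], <4 x Bool>[0, 1, 1, 0])

# @fastmath sets the `fast` flag on the generated floating point intrinsics:
f(a, b) = @fastmath a * b + b
f(Vec{4, Float64}(2.0), Vec{4, Float64}(3.0))   # <4 x Float64>[9.0, 9.0, 9.0, 9.0]
```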