diff --git a/base/atomics.jl b/base/atomics.jl index 1a980eb6561ec..97405d88fd408 100644 --- a/base/atomics.jl +++ b/base/atomics.jl @@ -335,7 +335,7 @@ const llvmtypes = IdDict{Any,String}( Int32 => "i32", UInt32 => "i32", Int64 => "i64", UInt64 => "i64", Int128 => "i128", UInt128 => "i128", - Float16 => "i16", # half + Float16 => "half", Float32 => "float", Float64 => "double", ) diff --git a/base/boot.jl b/base/boot.jl index 7c8d05cc132c4..54d852ca96416 100644 --- a/base/boot.jl +++ b/base/boot.jl @@ -779,11 +779,11 @@ Unsigned(x::Int64) = UInt64(x) Signed(x::UInt128) = Int128(x) Unsigned(x::Int128) = UInt128(x) -Signed(x::Union{Float32, Float64, Bool}) = Int(x) -Unsigned(x::Union{Float32, Float64, Bool}) = UInt(x) +Signed(x::Union{Float16, Float32, Float64, Bool}) = Int(x) +Unsigned(x::Union{Float16, Float32, Float64, Bool}) = UInt(x) Integer(x::Integer) = x -Integer(x::Union{Float32, Float64}) = Int(x) +Integer(x::Union{Float16, Float32, Float64}) = Int(x) # Binding for the julia parser, called as # diff --git a/base/essentials.jl b/base/essentials.jl index fb360ea6482db..1158fa7483d73 100644 --- a/base/essentials.jl +++ b/base/essentials.jl @@ -420,8 +420,6 @@ julia> reinterpret(Float32, UInt32[1 2 3 4 5]) ``` """ reinterpret(::Type{T}, x) where {T} = bitcast(T, x) -reinterpret(::Type{Unsigned}, x::Float16) = reinterpret(UInt16,x) -reinterpret(::Type{Signed}, x::Float16) = reinterpret(Int16,x) """ sizeof(T::DataType) diff --git a/base/float.jl b/base/float.jl index 242fddfca8946..ec28e4f741f14 100644 --- a/base/float.jl +++ b/base/float.jl @@ -47,14 +47,48 @@ A not-a-number value of type [`Float64`](@ref). """ NaN, NaN64 +# bit patterns +reinterpret(::Type{Unsigned}, x::Float64) = reinterpret(UInt64, x) +reinterpret(::Type{Unsigned}, x::Float32) = reinterpret(UInt32, x) +reinterpret(::Type{Unsigned}, x::Float16) = reinterpret(UInt16, x) +reinterpret(::Type{Signed}, x::Float64) = reinterpret(Int64, x) +reinterpret(::Type{Signed}, x::Float32) = reinterpret(Int32, x) +reinterpret(::Type{Signed}, x::Float16) = reinterpret(Int16, x) + +sign_mask(::Type{Float64}) = 0x8000_0000_0000_0000 +exponent_mask(::Type{Float64}) = 0x7ff0_0000_0000_0000 +exponent_one(::Type{Float64}) = 0x3ff0_0000_0000_0000 +exponent_half(::Type{Float64}) = 0x3fe0_0000_0000_0000 +significand_mask(::Type{Float64}) = 0x000f_ffff_ffff_ffff + +sign_mask(::Type{Float32}) = 0x8000_0000 +exponent_mask(::Type{Float32}) = 0x7f80_0000 +exponent_one(::Type{Float32}) = 0x3f80_0000 +exponent_half(::Type{Float32}) = 0x3f00_0000 +significand_mask(::Type{Float32}) = 0x007f_ffff + +sign_mask(::Type{Float16}) = 0x8000 +exponent_mask(::Type{Float16}) = 0x7c00 +exponent_one(::Type{Float16}) = 0x3c00 +exponent_half(::Type{Float16}) = 0x3800 +significand_mask(::Type{Float16}) = 0x03ff + +for T in (Float16, Float32, Float64) + @eval significand_bits(::Type{$T}) = $(trailing_ones(significand_mask(T))) + @eval exponent_bits(::Type{$T}) = $(sizeof(T)*8 - significand_bits(T) - 1) + @eval exponent_bias(::Type{$T}) = $(Int(exponent_one(T) >> significand_bits(T))) + # maximum float exponent + @eval exponent_max(::Type{$T}) = $(Int(exponent_mask(T) >> significand_bits(T)) - exponent_bias(T)) + # maximum float exponent without bias + @eval exponent_raw_max(::Type{$T}) = $(Int(exponent_mask(T) >> significand_bits(T))) +end + ## conversions to floating-point ## + +# TODO: deprecate in 2.0 Float16(x::Integer) = convert(Float16, convert(Float32, x)::Float32) -for t in (Int8, Int16, Int32, Int64, Int128, UInt8, UInt16, UInt32, UInt64, UInt128) - @eval 
promote_rule(::Type{Float16}, ::Type{$t}) = Float16 -end -promote_rule(::Type{Float16}, ::Type{Bool}) = Float16 -for t1 in (Float32, Float64) +for t1 in (Float16, Float32, Float64) for st in (Int8, Int16, Int32, Int64) @eval begin (::Type{$t1})(x::($st)) = sitofp($t1, x) @@ -68,7 +102,6 @@ for t1 in (Float32, Float64) end end end -(::Type{T})(x::Float16) where {T<:Integer} = T(Float32(x)) Bool(x::Real) = x==0 ? false : x==1 ? true : throw(InexactError(:Bool, Bool, x)) @@ -76,6 +109,8 @@ promote_rule(::Type{Float64}, ::Type{UInt128}) = Float64 promote_rule(::Type{Float64}, ::Type{Int128}) = Float64 promote_rule(::Type{Float32}, ::Type{UInt128}) = Float32 promote_rule(::Type{Float32}, ::Type{Int128}) = Float32 +promote_rule(::Type{Float16}, ::Type{UInt128}) = Float16 +promote_rule(::Type{Float16}, ::Type{Int128}) = Float16 function Float64(x::UInt128) x == 0 && return 0.0 @@ -137,123 +172,17 @@ function Float32(x::Int128) reinterpret(Float32, s | d + y) end -# Float32 -> Float16 algorithm from: -# "Fast Half Float Conversion" by Jeroen van der Zijp -# ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf -# -# With adjustments for round-to-nearest, ties to even. -# -let _basetable = Vector{UInt16}(undef, 512), - _shifttable = Vector{UInt8}(undef, 512) - for i = 0:255 - e = i - 127 - if e < -25 # Very small numbers map to zero - _basetable[i|0x000+1] = 0x0000 - _basetable[i|0x100+1] = 0x8000 - _shifttable[i|0x000+1] = 25 - _shifttable[i|0x100+1] = 25 - elseif e < -14 # Small numbers map to denorms - _basetable[i|0x000+1] = 0x0000 - _basetable[i|0x100+1] = 0x8000 - _shifttable[i|0x000+1] = -e-1 - _shifttable[i|0x100+1] = -e-1 - elseif e <= 15 # Normal numbers just lose precision - _basetable[i|0x000+1] = ((e+15)<<10) - _basetable[i|0x100+1] = ((e+15)<<10) | 0x8000 - _shifttable[i|0x000+1] = 13 - _shifttable[i|0x100+1] = 13 - elseif e < 128 # Large numbers map to Infinity - _basetable[i|0x000+1] = 0x7C00 - _basetable[i|0x100+1] = 0xFC00 - _shifttable[i|0x000+1] = 24 - _shifttable[i|0x100+1] = 24 - else # Infinity and NaN's stay Infinity and NaN's - _basetable[i|0x000+1] = 0x7C00 - _basetable[i|0x100+1] = 0xFC00 - _shifttable[i|0x000+1] = 13 - _shifttable[i|0x100+1] = 13 - end - end - global const shifttable = (_shifttable...,) - global const basetable = (_basetable...,) -end - -function Float16(val::Float32) - f = reinterpret(UInt32, val) - if isnan(val) - t = 0x8000 ⊻ (0x8000 & ((f >> 0x10) % UInt16)) - return reinterpret(Float16, t ⊻ ((f >> 0xd) % UInt16)) - end - i = ((f & ~significand_mask(Float32)) >> significand_bits(Float32)) + 1 - @inbounds sh = shifttable[i] - f &= significand_mask(Float32) - # If `val` is subnormal, the tables are set up to force the - # result to 0, so the significand has an implicit `1` in the - # cases we care about. 
- f |= significand_mask(Float32) + 0x1 - @inbounds h = (basetable[i] + (f >> sh) & significand_mask(Float16)) % UInt16 - # round - # NOTE: we maybe should ignore NaNs here, but the payload is - # getting truncated anyway so "rounding" it might not matter - nextbit = (f >> (sh-1)) & 1 - if nextbit != 0 && (h & 0x7C00) != 0x7C00 - # Round halfway to even or check lower bits - if h&1 == 1 || (f & ((1<<(sh-1))-1)) != 0 - h += UInt16(1) - end - end - reinterpret(Float16, h) -end - -function Float32(val::Float16) - local ival::UInt32 = reinterpret(UInt16, val) - local sign::UInt32 = (ival & 0x8000) >> 15 - local exp::UInt32 = (ival & 0x7c00) >> 10 - local sig::UInt32 = (ival & 0x3ff) >> 0 - local ret::UInt32 - - if exp == 0 - if sig == 0 - sign = sign << 31 - ret = sign | exp | sig - else - n_bit = 1 - bit = 0x0200 - while (bit & sig) == 0 - n_bit = n_bit + 1 - bit = bit >> 1 - end - sign = sign << 31 - exp = ((-14 - n_bit + 127) << 23) % UInt32 - sig = ((sig & (~bit)) << n_bit) << (23 - 10) - ret = sign | exp | sig - end - elseif exp == 0x1f - if sig == 0 # Inf - if sign == 0 - ret = 0x7f800000 - else - ret = 0xff800000 - end - else # NaN - ret = 0x7fc00000 | (sign<<31) | (sig<<(23-10)) - end - else - sign = sign << 31 - exp = ((exp - 15 + 127) << 23) % UInt32 - sig = sig << (23 - 10) - ret = sign | exp | sig - end - return reinterpret(Float32, ret) -end +# TODO: optimize +Float16(x::UInt128) = convert(Float16, Float32(x)) +Float16(x::Int128) = convert(Float16, Float32(x)) -#convert(::Type{Float16}, x::Float32) = fptrunc(Float16, x) +Float16(x::Float32) = fptrunc(Float16, x) +Float16(x::Float64) = fptrunc(Float16, x) Float32(x::Float64) = fptrunc(Float32, x) -Float16(x::Float64) = Float16(Float32(x)) -#convert(::Type{Float32}, x::Float16) = fpext(Float32, x) +Float32(x::Float16) = fpext(Float32, x) Float64(x::Float32) = fpext(Float64, x) -Float64(x::Float16) = Float64(Float32(x)) +Float64(x::Float16) = fpext(Float64, x) AbstractFloat(x::Bool) = Float64(x) AbstractFloat(x::Int8) = Float64(x) @@ -305,14 +234,14 @@ function unsafe_trunc end for Ti in (Int8, Int16, Int32, Int64) @eval begin - unsafe_trunc(::Type{$Ti}, x::Float16) = unsafe_trunc($Ti, Float32(x)) + unsafe_trunc(::Type{$Ti}, x::Float16) = fptosi($Ti, x) unsafe_trunc(::Type{$Ti}, x::Float32) = fptosi($Ti, x) unsafe_trunc(::Type{$Ti}, x::Float64) = fptosi($Ti, x) end end for Ti in (UInt8, UInt16, UInt32, UInt64) @eval begin - unsafe_trunc(::Type{$Ti}, x::Float16) = unsafe_trunc($Ti, Float32(x)) + unsafe_trunc(::Type{$Ti}, x::Float16) = fptoui($Ti, x) unsafe_trunc(::Type{$Ti}, x::Float32) = fptoui($Ti, x) unsafe_trunc(::Type{$Ti}, x::Float64) = fptoui($Ti, x) end @@ -351,35 +280,33 @@ unsafe_trunc(::Type{Int128}, x::Float16) = unsafe_trunc(Int128, Float32(x)) # matches convert methods # also determines floor, ceil, round +trunc(::Type{Signed}, x::Float16) = trunc(Int,x) trunc(::Type{Signed}, x::Float32) = trunc(Int,x) trunc(::Type{Signed}, x::Float64) = trunc(Int,x) +trunc(::Type{Unsigned}, x::Float16) = trunc(UInt,x) trunc(::Type{Unsigned}, x::Float32) = trunc(UInt,x) trunc(::Type{Unsigned}, x::Float64) = trunc(UInt,x) +trunc(::Type{Integer}, x::Float16) = trunc(Int,x) trunc(::Type{Integer}, x::Float32) = trunc(Int,x) trunc(::Type{Integer}, x::Float64) = trunc(Int,x) -trunc(::Type{T}, x::Float16) where {T<:Integer} = trunc(T, Float32(x)) # fallbacks floor(::Type{T}, x::AbstractFloat) where {T<:Integer} = trunc(T,round(x, RoundDown)) -floor(::Type{T}, x::Float16) where {T<:Integer} = floor(T, Float32(x)) ceil(::Type{T}, x::AbstractFloat) 
where {T<:Integer} = trunc(T,round(x, RoundUp)) -ceil(::Type{T}, x::Float16) where {T<:Integer} = ceil(T, Float32(x)) round(::Type{T}, x::AbstractFloat) where {T<:Integer} = trunc(T,round(x, RoundNearest)) -round(::Type{T}, x::Float16) where {T<:Integer} = round(T, Float32(x)) round(x::Float64, r::RoundingMode{:ToZero}) = trunc_llvm(x) round(x::Float32, r::RoundingMode{:ToZero}) = trunc_llvm(x) +round(x::Float16, r::RoundingMode{:ToZero}) = trunc_llvm(x) round(x::Float64, r::RoundingMode{:Down}) = floor_llvm(x) round(x::Float32, r::RoundingMode{:Down}) = floor_llvm(x) +round(x::Float16, r::RoundingMode{:Down}) = floor_llvm(x) round(x::Float64, r::RoundingMode{:Up}) = ceil_llvm(x) round(x::Float32, r::RoundingMode{:Up}) = ceil_llvm(x) +round(x::Float16, r::RoundingMode{:Up}) = ceil_llvm(x) round(x::Float64, r::RoundingMode{:Nearest}) = rint_llvm(x) round(x::Float32, r::RoundingMode{:Nearest}) = rint_llvm(x) - -round(x::Float16, r::RoundingMode{:ToZero}) = Float16(round(Float32(x), r)) -round(x::Float16, r::RoundingMode{:Down}) = Float16(round(Float32(x), r)) -round(x::Float16, r::RoundingMode{:Up}) = Float16(round(Float32(x), r)) -round(x::Float16, r::RoundingMode{:Nearest}) = Float16(round(Float32(x), r)) +round(x::Float16, r::RoundingMode{:Nearest}) = rint_llvm(x) ## floating point promotions ## promote_rule(::Type{Float32}, ::Type{Float16}) = Float32 @@ -392,36 +319,30 @@ widen(::Type{Float32}) = Float64 ## floating point arithmetic ## -(x::Float64) = neg_float(x) -(x::Float32) = neg_float(x) --(x::Float16) = reinterpret(Float16, reinterpret(UInt16, x) ⊻ 0x8000) +-(x::Float16) = neg_float(x) -for op in (:+, :-, :*, :/, :\, :^) - @eval ($op)(a::Float16, b::Float16) = Float16(($op)(Float32(a), Float32(b))) -end ++(x::Float16, y::Float16) = add_float(x, y) +(x::Float32, y::Float32) = add_float(x, y) +(x::Float64, y::Float64) = add_float(x, y) +-(x::Float16, y::Float16) = sub_float(x, y) -(x::Float32, y::Float32) = sub_float(x, y) -(x::Float64, y::Float64) = sub_float(x, y) +*(x::Float16, y::Float16) = mul_float(x, y) *(x::Float32, y::Float32) = mul_float(x, y) *(x::Float64, y::Float64) = mul_float(x, y) +/(x::Float16, y::Float16) = div_float(x, y) /(x::Float32, y::Float32) = div_float(x, y) /(x::Float64, y::Float64) = div_float(x, y) +muladd(x::Float16, y::Float16, z::Float16) = muladd_float(x, y, z) muladd(x::Float32, y::Float32, z::Float32) = muladd_float(x, y, z) muladd(x::Float64, y::Float64, z::Float64) = muladd_float(x, y, z) -function muladd(a::Float16, b::Float16, c::Float16) - Float16(muladd(Float32(a), Float32(b), Float32(c))) -end # TODO: faster floating point div? # TODO: faster floating point fld? # TODO: faster floating point mod? 
-for func in (:div,:fld,:cld,:rem,:mod) - @eval begin - $func(a::Float16,b::Float16) = Float16($func(Float32(a),Float32(b))) - end -end - +rem(x::Float16, y::Float16) = rem_float(x, y) rem(x::Float32, y::Float32) = rem_float(x, y) rem(x::Float64, y::Float64) = rem_float(x, y) @@ -439,33 +360,25 @@ function mod(x::T, y::T) where T<:AbstractFloat end ## floating point comparisons ## -function ==(x::Float16, y::Float16) - ix = reinterpret(UInt16,x) - iy = reinterpret(UInt16,y) - if (ix|iy)&0x7fff > 0x7c00 #isnan(x) || isnan(y) - return false - end - if (ix|iy)&0x7fff == 0x0000 - return true - end - return ix == iy -end +==(x::Float16, y::Float16) = eq_float(x, y) ==(x::Float32, y::Float32) = eq_float(x, y) ==(x::Float64, y::Float64) = eq_float(x, y) +!=(x::Float16, y::Float16) = ne_float(x, y) !=(x::Float32, y::Float32) = ne_float(x, y) !=(x::Float64, y::Float64) = ne_float(x, y) +<( x::Float16, y::Float16) = lt_float(x, y) <( x::Float32, y::Float32) = lt_float(x, y) <( x::Float64, y::Float64) = lt_float(x, y) +<=(x::Float16, y::Float16) = le_float(x, y) <=(x::Float32, y::Float32) = le_float(x, y) <=(x::Float64, y::Float64) = le_float(x, y) +isequal(x::Float16, y::Float16) = fpiseq(x, y) isequal(x::Float32, y::Float32) = fpiseq(x, y) isequal(x::Float64, y::Float64) = fpiseq(x, y) +isless( x::Float16, y::Float16) = fpislt(x, y) isless( x::Float32, y::Float32) = fpislt(x, y) isless( x::Float64, y::Float64) = fpislt(x, y) -for op in (:<, :<=, :isless) - @eval ($op)(a::Float16, b::Float16) = ($op)(Float32(a), Float32(b)) -end # Exact Float (Tf) vs Integer (Ti) comparisons # Assumes: @@ -481,7 +394,7 @@ end # b. unsafe_convert undefined behaviour if fy == Tf(typemax(Ti)) # (but consequently x == fy > y) for Ti in (Int64,UInt64,Int128,UInt128) - for Tf in (Float32,Float64) + for Tf in (Float16,Float32,Float64) @eval begin function ==(x::$Tf, y::$Ti) fy = ($Tf)(y) @@ -523,7 +436,7 @@ for op in (:(==), :<, :<=) end -abs(x::Float16) = reinterpret(Float16, reinterpret(UInt16, x) & 0x7fff) +abs(x::Float16) = abs_float(x) abs(x::Float32) = abs_float(x) abs(x::Float64) = abs_float(x) @@ -534,11 +447,9 @@ Test whether a number value is a NaN, an indeterminate value which is neither an nor a finite number ("not a number"). """ isnan(x::AbstractFloat) = (x != x)::Bool -isnan(x::Float16) = reinterpret(UInt16,x)&0x7fff > 0x7c00 isnan(x::Real) = false isfinite(x::AbstractFloat) = x - x == 0 -isfinite(x::Float16) = reinterpret(UInt16,x)&0x7c00 != 0x7c00 isfinite(x::Real) = decompose(x)[3] != 0 isfinite(x::Integer) = true @@ -655,7 +566,7 @@ such `y` exists (e.g. if `x` is `-Inf` or `NaN`), then return `x`. prevfloat(x::AbstractFloat) = nextfloat(x,-1) for Ti in (Int8, Int16, Int32, Int64, Int128, UInt8, UInt16, UInt32, UInt64, UInt128) - for Tf in (Float32, Float64) + for Tf in (Float16, Float32, Float64) if Ti <: Unsigned || sizeof(Ti) < sizeof(Tf) # Here `Tf(typemin(Ti))-1` is exact, so we can compare the lower-bound # directly. 
`Tf(typemax(Ti))+1` is either always exactly representable, or @@ -853,47 +764,11 @@ eps(::AbstractFloat) ## byte order swaps for arbitrary-endianness serialization/deserialization ## bswap(x::IEEEFloat) = bswap_int(x) -# bit patterns -reinterpret(::Type{Unsigned}, x::Float64) = reinterpret(UInt64, x) -reinterpret(::Type{Unsigned}, x::Float32) = reinterpret(UInt32, x) -reinterpret(::Type{Signed}, x::Float64) = reinterpret(Int64, x) -reinterpret(::Type{Signed}, x::Float32) = reinterpret(Int32, x) - -sign_mask(::Type{Float64}) = 0x8000_0000_0000_0000 -exponent_mask(::Type{Float64}) = 0x7ff0_0000_0000_0000 -exponent_one(::Type{Float64}) = 0x3ff0_0000_0000_0000 -exponent_half(::Type{Float64}) = 0x3fe0_0000_0000_0000 -significand_mask(::Type{Float64}) = 0x000f_ffff_ffff_ffff - -sign_mask(::Type{Float32}) = 0x8000_0000 -exponent_mask(::Type{Float32}) = 0x7f80_0000 -exponent_one(::Type{Float32}) = 0x3f80_0000 -exponent_half(::Type{Float32}) = 0x3f00_0000 -significand_mask(::Type{Float32}) = 0x007f_ffff - -sign_mask(::Type{Float16}) = 0x8000 -exponent_mask(::Type{Float16}) = 0x7c00 -exponent_one(::Type{Float16}) = 0x3c00 -exponent_half(::Type{Float16}) = 0x3800 -significand_mask(::Type{Float16}) = 0x03ff - -for T in (Float16, Float32, Float64) - @eval significand_bits(::Type{$T}) = $(trailing_ones(significand_mask(T))) - @eval exponent_bits(::Type{$T}) = $(sizeof(T)*8 - significand_bits(T) - 1) - @eval exponent_bias(::Type{$T}) = $(Int(exponent_one(T) >> significand_bits(T))) - # maximum float exponent - @eval exponent_max(::Type{$T}) = $(Int(exponent_mask(T) >> significand_bits(T)) - exponent_bias(T)) - # maximum float exponent without bias - @eval exponent_raw_max(::Type{$T}) = $(Int(exponent_mask(T) >> significand_bits(T))) -end - # integer size of float uinttype(::Type{Float64}) = UInt64 uinttype(::Type{Float32}) = UInt32 uinttype(::Type{Float16}) = UInt16 -Base.iszero(x::Float16) = reinterpret(UInt16, x) & ~sign_mask(Float16) == 0x0000 - ## Array operations on floating point numbers ## float(A::AbstractArray{<:AbstractFloat}) = A diff --git a/base/math.jl b/base/math.jl index ab77d95b97df6..94d001c0f31dc 100644 --- a/base/math.jl +++ b/base/math.jl @@ -898,6 +898,8 @@ end end z end +@inline ^(x::Float16, y::Float16) = Float16(Float32(x)^Float32(y)) # TODO: optimize + @inline function ^(x::Float64, y::Integer) y == -1 && return inv(x) y == 0 && return one(x) diff --git a/src/APInt-C.cpp b/src/APInt-C.cpp index 0e0ffbfa73713..bc0a62e21dd3e 100644 --- a/src/APInt-C.cpp +++ b/src/APInt-C.cpp @@ -9,6 +9,7 @@ #include "APInt-C.h" #include "julia.h" #include "julia_assert.h" +#include "julia_internal.h" using namespace llvm; @@ -312,14 +313,16 @@ void LLVMByteSwap(unsigned numbits, integerPart *pa, integerPart *pr) { ASSIGN(r, a) } -void LLVMFPtoInt(unsigned numbits, integerPart *pa, unsigned onumbits, integerPart *pr, bool isSigned, bool *isExact) { +void LLVMFPtoInt(unsigned numbits, void *pa, unsigned onumbits, integerPart *pr, bool isSigned, bool *isExact) { double Val; - if (numbits == 32) + if (numbits == 16) + Val = __gnu_h2f_ieee(*(uint16_t*)pa); + else if (numbits == 32) Val = *(float*)pa; else if (numbits == 64) Val = *(double*)pa; else - jl_error("FPtoSI: runtime floating point intrinsics are not implemented for bit sizes other than 32 and 64"); + jl_error("FPtoSI: runtime floating point intrinsics are not implemented for bit sizes other than 16, 32 and 64"); unsigned onumbytes = RoundUpToAlignment(onumbits, host_char_bit) / host_char_bit; if (onumbits <= 64) { // fast-path, if possible 
if (isSigned) { @@ -387,12 +390,14 @@ void LLVMSItoFP(unsigned numbits, integerPart *pa, unsigned onumbits, integerPar CREATE(a) val = a.roundToDouble(true); } - if (onumbits == 32) + if (onumbits == 16) + *(uint16_t*)pr = __gnu_f2h_ieee(val); + else if (onumbits == 32) *(float*)pr = val; else if (onumbits == 64) *(double*)pr = val; else - jl_error("SItoFP: runtime floating point intrinsics are not implemented for bit sizes other than 32 and 64"); + jl_error("SItoFP: runtime floating point intrinsics are not implemented for bit sizes other than 16, 32 and 64"); } extern "C" JL_DLLEXPORT @@ -402,7 +407,9 @@ void LLVMUItoFP(unsigned numbits, integerPart *pa, unsigned onumbits, integerPar CREATE(a) val = a.roundToDouble(false); } - if (onumbits == 32) + if (onumbits == 16) + *(uint16_t*)pr = __gnu_f2h_ieee(val); + else if (onumbits == 32) *(float*)pr = val; else if (onumbits == 64) *(double*)pr = val; diff --git a/src/Makefile b/src/Makefile index 835c2bf60b55a..00d7d3fc0de8b 100644 --- a/src/Makefile +++ b/src/Makefile @@ -56,7 +56,7 @@ RUNTIME_SRCS += jitlayers aotcompile debuginfo disasm llvm-simdloop llvm-muladd llvm-final-gc-lowering llvm-pass-helpers llvm-late-gc-lowering \ llvm-lower-handlers llvm-gc-invariant-verifier llvm-propagate-addrspaces \ llvm-multiversioning llvm-alloc-opt cgmemmgr llvm-api llvm-remove-addrspaces \ - llvm-remove-ni llvm-julia-licm + llvm-remove-ni llvm-julia-licm llvm-demote-float16 FLAGS += -I$(shell $(LLVM_CONFIG_HOST) --includedir) LLVM_LIBS := all ifeq ($(USE_POLLY),1) diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp index b0539786315b5..17df8b6a8a67b 100644 --- a/src/aotcompile.cpp +++ b/src/aotcompile.cpp @@ -508,8 +508,10 @@ void jl_dump_native(void *native_code, if (unopt_bc_fname) PM.add(createBitcodeWriterPass(unopt_bc_OS)); - if (bc_fname || obj_fname || asm_fname) + if (bc_fname || obj_fname || asm_fname) { addOptimizationPasses(&PM, jl_options.opt_level, true, true); + addMachinePasses(&PM, TM.get()); + } if (bc_fname) PM.add(createBitcodeWriterPass(bc_OS)); if (obj_fname) @@ -604,6 +606,15 @@ void addTargetPasses(legacy::PassManagerBase *PM, TargetMachine *TM) PM->add(createTargetTransformInfoWrapperPass(TM->getTargetIRAnalysis())); } + +void addMachinePasses(legacy::PassManagerBase *PM, TargetMachine *TM) +{ + // TODO: don't do this on CPUs that natively support Float16 + PM->add(createDemoteFloat16Pass()); + PM->add(createGVNPass()); +} + + // this defines the set of optimization passes defined for Julia at various optimization levels. // it assumes that the TLI and TTI wrapper passes have already been added. 
void addOptimizationPasses(legacy::PassManagerBase *PM, int opt_level, @@ -809,6 +820,7 @@ class JuliaPipeline : public Pass { TPMAdapter Adapter(TPM); addTargetPasses(&Adapter, jl_TargetMachine); addOptimizationPasses(&Adapter, OptLevel); + addMachinePasses(&Adapter, jl_TargetMachine); } JuliaPipeline() : Pass(PT_PassManager, ID) {} Pass *createPrinterPass(raw_ostream &O, const std::string &Banner) const override { @@ -846,6 +858,7 @@ void *jl_get_llvmf_defn(jl_method_instance_t *mi, size_t world, char getwrapper, PM = new legacy::PassManager(); addTargetPasses(PM, jl_TargetMachine); addOptimizationPasses(PM, jl_options.opt_level); + addMachinePasses(PM, jl_TargetMachine); } // get the source code for this function diff --git a/src/cgutils.cpp b/src/cgutils.cpp index b516a75257dee..e6bd8a0744115 100644 --- a/src/cgutils.cpp +++ b/src/cgutils.cpp @@ -516,7 +516,7 @@ static Type *bitstype_to_llvm(jl_value_t *bt, bool llvmcall = false) return T_int32; if (bt == (jl_value_t*)jl_int64_type) return T_int64; - if (llvmcall && (bt == (jl_value_t*)jl_float16_type)) + if (bt == (jl_value_t*)jl_float16_type) return T_float16; if (bt == (jl_value_t*)jl_float32_type) return T_float32; diff --git a/src/codegen.cpp b/src/codegen.cpp index b8dac42970d7e..013c0697b38f4 100644 --- a/src/codegen.cpp +++ b/src/codegen.cpp @@ -145,8 +145,6 @@ extern void _chkstk(void); #define __alignof__ __alignof #endif -#define DISABLE_FLOAT16 - // llvm state extern JITEventListener *CreateJuliaJITEventListener(); diff --git a/src/intrinsics.cpp b/src/intrinsics.cpp index ede8031dda5cf..ada6166c1ceb8 100644 --- a/src/intrinsics.cpp +++ b/src/intrinsics.cpp @@ -106,10 +106,8 @@ static Type *FLOATT(Type *t) return T_float64; if (nb == 32) return T_float32; -#ifndef DISABLE_FLOAT16 if (nb == 16) return T_float16; -#endif if (nb == 128) return T_float128; return NULL; @@ -1040,8 +1038,7 @@ static Value *emit_untyped_intrinsic(jl_codectx_t &ctx, intrinsic f, Value **arg } -// Implements IEEE negate. 
See issue #7868 - case neg_float: return math_builder(ctx)().CreateFSub(ConstantFP::get(t, -0.0), x); + case neg_float: return math_builder(ctx)().CreateFNeg(x); case neg_float_fast: return math_builder(ctx, true)().CreateFNeg(x); case add_float: return math_builder(ctx)().CreateFAdd(x, y); case sub_float: return math_builder(ctx)().CreateFSub(x, y); @@ -1301,3 +1298,200 @@ static Value *emit_untyped_intrinsic(jl_codectx_t &ctx, intrinsic f, Value **arg } assert(0 && "unreachable"); } + + +// float16 intrinsics +// TODO: use LLVM's compiler-rt + +static inline float half_to_float(uint16_t ival) +{ + uint32_t sign = (ival & 0x8000) >> 15; + uint32_t exp = (ival & 0x7c00) >> 10; + uint32_t sig = (ival & 0x3ff) >> 0; + uint32_t ret; + + if (exp == 0) { + if (sig == 0) { + sign = sign << 31; + ret = sign | exp | sig; + } + else { + int n_bit = 1; + uint16_t bit = 0x0200; + while ((bit & sig) == 0) { + n_bit = n_bit + 1; + bit = bit >> 1; + } + sign = sign << 31; + exp = ((-14 - n_bit + 127) << 23); + sig = ((sig & (~bit)) << n_bit) << (23 - 10); + ret = sign | exp | sig; + } + } + else if (exp == 0x1f) { + if (sig == 0) { // Inf + if (sign == 0) + ret = 0x7f800000; + else + ret = 0xff800000; + } + else // NaN + ret = 0x7fc00000 | (sign << 31) | (sig << (23 - 10)); + } + else { + sign = sign << 31; + exp = ((exp - 15 + 127) << 23); + sig = sig << (23 - 10); + ret = sign | exp | sig; + } + + float fret; + memcpy(&fret, &ret, sizeof(float)); + return fret; +} + +// float to half algorithm from: +// "Fast Half Float Conversion" by Jeroen van der Zijp +// ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf +// +// With adjustments for round-to-nearest, ties to even. + +static uint16_t basetable[512] = { + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0400, 0x0800, 0x0c00, 0x1000, 0x1400, 0x1800, 0x1c00, 0x2000, + 0x2400, 0x2800, 0x2c00, 0x3000, 0x3400, 0x3800, 0x3c00, 0x4000, 0x4400, 0x4800, 0x4c00, + 0x5000, 0x5400, 0x5800, 0x5c00, 0x6000, 0x6400, 0x6800, 0x6c00, 0x7000, 0x7400, 0x7800, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 
0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8400, 0x8800, 0x8c00, 0x9000, 0x9400, + 0x9800, 0x9c00, 0xa000, 0xa400, 0xa800, 0xac00, 0xb000, 0xb400, 0xb800, 0xbc00, 0xc000, + 0xc400, 0xc800, 0xcc00, 0xd000, 0xd400, 0xd800, 0xdc00, 0xe000, 0xe400, 0xe800, 0xec00, + 0xf000, 0xf400, 0xf800, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00}; + +static uint8_t shifttable[512] = { + 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, + 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, + 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, + 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, + 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, + 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, + 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, + 0x19, 0x19, 0x19, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0f, + 0x0e, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, + 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, + 0x0d, 0x0d, 0x0d, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 
0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x0d, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, + 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, + 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, + 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, + 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, + 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, + 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, + 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, + 0x12, 0x11, 0x10, 0x0f, 0x0e, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, + 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, + 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x0d}; + +static inline uint16_t float_to_half(float param) +{ + uint32_t f; + memcpy(&f, ¶m, sizeof(float)); + if (isnan(param)) { + uint32_t t = 0x8000 ^ (0x8000 & ((uint16_t)(f >> 0x10))); + return t ^ ((uint16_t)(f >> 0xd)); + } + int i = ((f & ~0x007fffff) >> 23); + uint8_t sh = shifttable[i]; + f &= 0x007fffff; + // If `val` is subnormal, the tables are set up to force the + // result to 0, so the significand has an implicit `1` in the + // cases we care about. 
+    f |= 0x007fffff + 0x1;
+    uint16_t h = (uint16_t)(basetable[i] + ((f >> sh) & 0x03ff));
+    // round
+    // NOTE: we maybe should ignore NaNs here, but the payload is
+    // getting truncated anyway so "rounding" it might not matter
+    int nextbit = (f >> (sh - 1)) & 1;
+    if (nextbit != 0 && (h & 0x7C00) != 0x7C00) {
+        // Round halfway to even or check lower bits
+        if ((h & 1) == 1 || (f & ((1 << (sh - 1)) - 1)) != 0)
+            h += UINT16_C(1);
+    }
+    return h;
+}
+
+#if !defined(_OS_DARWIN_) // xcode already links compiler-rt
+
+extern "C" JL_DLLEXPORT float __gnu_h2f_ieee(uint16_t param)
+{
+    return half_to_float(param);
+}
+
+extern "C" JL_DLLEXPORT float __extendhfsf2(uint16_t param)
+{
+    return half_to_float(param);
+}
+
+extern "C" JL_DLLEXPORT uint16_t __gnu_f2h_ieee(float param)
+{
+    return float_to_half(param);
+}
+
+extern "C" JL_DLLEXPORT uint16_t __truncdfhf2(double param)
+{
+    return float_to_half((float)param);
+}
+
+#endif
diff --git a/src/jitlayers.cpp b/src/jitlayers.cpp
index 6658a26e92d52..3481db683a95c 100644
--- a/src/jitlayers.cpp
+++ b/src/jitlayers.cpp
@@ -523,6 +523,7 @@ static void addPassesForOptLevel(legacy::PassManager &PM, TargetMachine &TM, raw
 {
     addTargetPasses(&PM, &TM);
     addOptimizationPasses(&PM, optlevel);
+    addMachinePasses(&PM, &TM);
     if (TM.addPassesToEmitMC(PM, Ctx, ObjStream))
         llvm_unreachable("Target does not support MC emission.");
 }
diff --git a/src/jitlayers.h b/src/jitlayers.h
index 8dd45c1f939f5..10f371f610cb3 100644
--- a/src/jitlayers.h
+++ b/src/jitlayers.h
@@ -24,6 +24,7 @@ extern bool imaging_mode;
 void addTargetPasses(legacy::PassManagerBase *PM, TargetMachine *TM);
 void addOptimizationPasses(legacy::PassManagerBase *PM, int opt_level,
                            bool lower_intrinsics=true, bool dump_native=false);
+void addMachinePasses(legacy::PassManagerBase *PM, TargetMachine *TM);
 void jl_finalize_module(std::unique_ptr<Module> m);
 void jl_merge_module(Module *dest, std::unique_ptr<Module> src);
 Module *jl_create_llvm_module(StringRef name);
@@ -241,6 +242,7 @@ Pass *createRemoveNIPass();
 Pass *createJuliaLICMPass();
 Pass *createMultiVersioningPass();
 Pass *createAllocOptPass();
+Pass *createDemoteFloat16Pass();
 // Whether the Function is an llvm or julia intrinsic.
 static inline bool isIntrinsicFunction(Function *F)
 {
diff --git a/src/julia.expmap b/src/julia.expmap
index aa77f4c8cff31..baf3220f147ed 100644
--- a/src/julia.expmap
+++ b/src/julia.expmap
@@ -42,6 +42,12 @@
     environ;
     __progname;
 
+    /* compiler run-time intrinsics */
+    __gnu_h2f_ieee;
+    __extendhfsf2;
+    __gnu_f2h_ieee;
+    __truncdfhf2;
+
   local:
     *;
 };
diff --git a/src/julia_internal.h b/src/julia_internal.h
index 410a3e9a15988..369781b79bed3 100644
--- a/src/julia_internal.h
+++ b/src/julia_internal.h
@@ -1311,6 +1311,9 @@ jl_sym_t *_jl_symbol(const char *str, size_t len) JL_NOTSAFEPOINT;
 #define JL_GC_ASSERT_LIVE(x) (void)(x)
 #endif
 
+float __gnu_h2f_ieee(uint16_t param) JL_NOTSAFEPOINT;
+uint16_t __gnu_f2h_ieee(float param) JL_NOTSAFEPOINT;
+
 #ifdef __cplusplus
 }
 #endif
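The DemoteFloat16 pass added in the next hunk is what keeps Float16 arithmetic exactly rounded on back-ends without native half-precision support. As a minimal Julia sketch of the invariant at stake (Knuth/Dekker error-free addition, the kind of double-word trick twiceprecision.jl depends on; the function name is illustrative, not part of the patch):

    # With correctly rounded Float16 operations, s + e reconstructs a + b exactly.
    function two_sum(a::Float16, b::Float16)
        s = a + b                      # must round to Float16 right here
        t = s - a
        e = (a - (s - t)) + (b - t)    # the exact rounding error of a + b
        return s, e
    end

If a back-end instead keeps the intermediate values in Float32 and only truncates at the end, `e` no longer equals the true rounding error, which is precisely the breakage described in the header comment of the pass.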
diff --git a/src/llvm-demote-float16.cpp b/src/llvm-demote-float16.cpp
new file mode 100644
index 0000000000000..6ed06fb12508a
--- /dev/null
+++ b/src/llvm-demote-float16.cpp
@@ -0,0 +1,146 @@
+// This file is a part of Julia. License is MIT: https://julialang.org/license
+
+// This pass finds floating-point operations on 16-bit (half precision) values, and replaces
+// them with equivalent operations on 32-bit (single precision) values surrounded by a fpext
+// and fptrunc. This ensures that the exact semantics of IEEE floating-point are preserved.
+//
+// Without this pass, back-ends that do not natively support half-precision (e.g. x86_64)
+// similarly pattern-match half-precision operations with single-precision equivalents, but
+// without truncating after every operation. Doing so breaks floating-point operations that
+// assume precise semantics, such as Dekker arithmetic (as used in twiceprecision.jl).
+//
+// This pass is intended to run late in the pipeline, and should not be followed by
+// instcombine. A run of GVN is recommended to clean up identical conversions.
+
+#include "llvm-version.h"
+
+#define DEBUG_TYPE "demote_float16"
+
+#include "support/dtypes.h"
+
+#include <llvm/IR/IRBuilder.h>
+#include <llvm/IR/LegacyPassManager.h>
+#include <llvm/IR/Module.h>
+#include <llvm/Pass.h>
+
+using namespace llvm;
+
+namespace {
+
+struct DemoteFloat16Pass : public FunctionPass {
+    static char ID;
+    DemoteFloat16Pass() : FunctionPass(ID){};
+
+private:
+    bool runOnFunction(Function &F) override;
+};
+
+bool DemoteFloat16Pass::runOnFunction(Function &F)
+{
+    auto &ctx = F.getContext();
+    auto T_float16 = Type::getHalfTy(ctx);
+    auto T_float32 = Type::getFloatTy(ctx);
+
+    SmallVector<Instruction *, 0> erase;
+    for (auto &BB : F) {
+        for (auto &I : BB) {
+            switch (I.getOpcode()) {
+            case Instruction::FNeg:
+            case Instruction::FAdd:
+            case Instruction::FSub:
+            case Instruction::FMul:
+            case Instruction::FDiv:
+            case Instruction::FRem:
+            case Instruction::FCmp:
+                break;
+            default:
+                continue;
+            }
+
+            IRBuilder<> builder(&I);
+
+            // extend Float16 operands to Float32
+            bool OperandsChanged = false;
+            SmallVector<Value *, 2> Operands(I.getNumOperands());
+            for (size_t i = 0; i < I.getNumOperands(); i++) {
+                Value *Op = I.getOperand(i);
+                if (Op->getType() == T_float16) {
+                    Op = builder.CreateFPExt(Op, T_float32);
+                    OperandsChanged = true;
+                }
+                Operands[i] = Op;
+            }
+
+            // recreate the instruction if any operands changed,
+            // truncating the result back to Float16
+            if (OperandsChanged) {
+                Value *NewI;
+                switch (I.getOpcode()) {
+                case Instruction::FNeg:
+                    assert(Operands.size() == 1);
+                    NewI = builder.CreateFNeg(Operands[0]);
+                    break;
+                case Instruction::FAdd:
+                    assert(Operands.size() == 2);
+                    NewI = builder.CreateFAdd(Operands[0], Operands[1]);
+                    break;
+                case Instruction::FSub:
+                    assert(Operands.size() == 2);
+                    NewI = builder.CreateFSub(Operands[0], Operands[1]);
+                    break;
+                case Instruction::FMul:
+                    assert(Operands.size() == 2);
+                    NewI = builder.CreateFMul(Operands[0], Operands[1]);
+                    break;
+                case Instruction::FDiv:
+                    assert(Operands.size() == 2);
+                    NewI = builder.CreateFDiv(Operands[0], Operands[1]);
+                    break;
+                case Instruction::FRem:
+                    assert(Operands.size() == 2);
+                    NewI = builder.CreateFRem(Operands[0], Operands[1]);
+                    break;
+                case Instruction::FCmp:
+                    assert(Operands.size() == 2);
+                    NewI = builder.CreateFCmp(cast<FCmpInst>(&I)->getPredicate(),
+                                              Operands[0], Operands[1]);
+                    break;
+                default:
+                    abort();
+                }
+                cast<Instruction>(NewI)->copyMetadata(I);
+                cast<Instruction>(NewI)->copyFastMathFlags(&I);
+                if (NewI->getType() != I.getType())
+                    NewI = builder.CreateFPTrunc(NewI, I.getType());
+                I.replaceAllUsesWith(NewI);
+                erase.push_back(&I);
+            }
+        }
+    }
+
+    if (erase.size() > 0) {
+        for (auto V : erase)
+            V->eraseFromParent();
+        return true;
+    }
+    else
+        return false;
+}
+
+char DemoteFloat16Pass::ID = 0;
+static RegisterPass<DemoteFloat16Pass>
+    Y("DemoteFloat16",
+      "Demote Float16 operations to Float32 equivalents.",
+      false,
+      false);
+}
+
+Pass *createDemoteFloat16Pass()
+{
+    return new DemoteFloat16Pass();
+}
+
+extern "C" JL_DLLEXPORT void LLVMExtraAddDemoteFloat16Pass(LLVMPassManagerRef PM)
+{
+    unwrap(PM)->add(createDemoteFloat16Pass());
+}
diff --git a/src/runtime_intrinsics.c 
b/src/runtime_intrinsics.c index 17d9c60911022..2337abe7d5704 100644 --- a/src/runtime_intrinsics.c +++ b/src/runtime_intrinsics.c @@ -211,6 +211,20 @@ static inline void name(unsigned osize, void *pa, void *pr) JL_NOTSAFEPOINT \ OP((c_type*)pr, a); \ } +#define un_fintrinsic_half(OP, name) \ +static inline void name(unsigned osize, void *pa, void *pr) JL_NOTSAFEPOINT \ +{ \ + uint16_t a = *(uint16_t*)pa; \ + float A = __gnu_h2f_ieee(a); \ + if (osize == 16) { \ + float R; \ + OP(&R, A); \ + *(uint16_t*)pr = __gnu_f2h_ieee(R); \ + } else { \ + OP((uint16_t*)pr, A); \ + } \ + } + // float or integer inputs // OP::Function macro(inputa, inputb) // name::unique string @@ -224,6 +238,18 @@ static void jl_##name##nbits(unsigned runtime_nbits, void *pa, void *pb, void *p *(c_type*)pr = (c_type)OP(a, b); \ } +#define bi_intrinsic_half(OP, name) \ +static void jl_##name##16(unsigned runtime_nbits, void *pa, void *pb, void *pr) JL_NOTSAFEPOINT \ +{ \ + uint16_t a = *(uint16_t*)pa; \ + uint16_t b = *(uint16_t*)pb; \ + float A = __gnu_h2f_ieee(a); \ + float B = __gnu_h2f_ieee(b); \ + runtime_nbits = 16; \ + float R = OP(A, B); \ + *(uint16_t*)pr = __gnu_f2h_ieee(R); \ +} + // float or integer inputs, bool output // OP::Function macro(inputa, inputb) // name::unique string @@ -237,6 +263,18 @@ static int jl_##name##nbits(unsigned runtime_nbits, void *pa, void *pb) JL_NOTSA return OP(a, b); \ } +#define bool_intrinsic_half(OP, name) \ +static int jl_##name##16(unsigned runtime_nbits, void *pa, void *pb) JL_NOTSAFEPOINT \ +{ \ + uint16_t a = *(uint16_t*)pa; \ + uint16_t b = *(uint16_t*)pb; \ + float A = __gnu_h2f_ieee(a); \ + float B = __gnu_h2f_ieee(b); \ + runtime_nbits = 16; \ + return OP(A, B); \ +} + + // integer inputs, with precondition test // OP::Function macro(inputa, inputb) // name::unique string @@ -265,6 +303,20 @@ static void jl_##name##nbits(unsigned runtime_nbits, void *pa, void *pb, void *p *(c_type*)pr = (c_type)OP(a, b, c); \ } +#define ter_intrinsic_half(OP, name) \ +static void jl_##name##16(unsigned runtime_nbits, void *pa, void *pb, void *pc, void *pr) JL_NOTSAFEPOINT \ +{ \ + uint16_t a = *(uint16_t*)pa; \ + uint16_t b = *(uint16_t*)pb; \ + uint16_t c = *(uint16_t*)pc; \ + float A = __gnu_h2f_ieee(a); \ + float B = __gnu_h2f_ieee(b); \ + float C = __gnu_h2f_ieee(c); \ + runtime_nbits = 16; \ + float R = OP(A, B, C); \ + *(uint16_t*)pr = __gnu_f2h_ieee(R); \ +} + // unary operator generator // @@ -407,11 +459,12 @@ static inline jl_value_t *jl_intrinsic_cvt(jl_value_t *ty, jl_value_t *a, const // floating point #define un_fintrinsic_withtype(OP, name) \ +un_fintrinsic_half(OP, jl_##name##16) \ un_fintrinsic_ctype(OP, jl_##name##32, float) \ un_fintrinsic_ctype(OP, jl_##name##64, double) \ JL_DLLEXPORT jl_value_t *jl_##name(jl_value_t *ty, jl_value_t *a) \ { \ - return jl_fintrinsic_1(ty, a, #name, jl_##name##32, jl_##name##64); \ + return jl_fintrinsic_1(ty, a, #name, jl_##name##16, jl_##name##32, jl_##name##64); \ } #define un_fintrinsic(OP, name) \ @@ -423,7 +476,7 @@ JL_DLLEXPORT jl_value_t *jl_##name(jl_value_t *a) \ typedef void (fintrinsic_op1)(unsigned, void*, void*); -static inline jl_value_t *jl_fintrinsic_1(jl_value_t *ty, jl_value_t *a, const char *name, fintrinsic_op1 *floatop, fintrinsic_op1 *doubleop) +static inline jl_value_t *jl_fintrinsic_1(jl_value_t *ty, jl_value_t *a, const char *name, fintrinsic_op1 *halfop, fintrinsic_op1 *floatop, fintrinsic_op1 *doubleop) { jl_ptls_t ptls = jl_get_ptls_states(); if (!jl_is_primitivetype(jl_typeof(a))) @@ -436,6 +489,9 @@ 
static inline jl_value_t *jl_fintrinsic_1(jl_value_t *ty, jl_value_t *a, const c unsigned sz = jl_datatype_size(jl_typeof(a)); switch (sz) { /* choose the right size c-type operation based on the input */ + case 2: + halfop(sz2 * host_char_bit, pa, pr); + break; case 4: floatop(sz2 * host_char_bit, pa, pr); break; @@ -443,7 +499,7 @@ static inline jl_value_t *jl_fintrinsic_1(jl_value_t *ty, jl_value_t *a, const c doubleop(sz2 * host_char_bit, pa, pr); break; default: - jl_errorf("%s: runtime floating point intrinsics are not implemented for bit sizes other than 32 and 64", name); + jl_errorf("%s: runtime floating point intrinsics are not implemented for bit sizes other than 16, 32 and 64", name); } return newv; } @@ -612,6 +668,7 @@ static inline jl_value_t *jl_intrinsiclambda_checkeddiv(jl_value_t *ty, void *pa // floating point #define bi_fintrinsic(OP, name) \ + bi_intrinsic_half(OP, name) \ bi_intrinsic_ctype(OP, name, 32, float) \ bi_intrinsic_ctype(OP, name, 64, double) \ JL_DLLEXPORT jl_value_t *jl_##name(jl_value_t *a, jl_value_t *b) \ @@ -627,6 +684,9 @@ JL_DLLEXPORT jl_value_t *jl_##name(jl_value_t *a, jl_value_t *b) \ void *pa = jl_data_ptr(a), *pb = jl_data_ptr(b), *pr = jl_data_ptr(newv); \ switch (sz) { \ /* choose the right size c-type operation */ \ + case 2: \ + jl_##name##16(16, pa, pb, pr); \ + break; \ case 4: \ jl_##name##32(32, pa, pb, pr); \ break; \ @@ -634,12 +694,13 @@ JL_DLLEXPORT jl_value_t *jl_##name(jl_value_t *a, jl_value_t *b) \ jl_##name##64(64, pa, pb, pr); \ break; \ default: \ - jl_error(#name ": runtime floating point intrinsics are not implemented for bit sizes other than 32 and 64"); \ + jl_error(#name ": runtime floating point intrinsics are not implemented for bit sizes other than 16, 32 and 64"); \ } \ return newv; \ } #define bool_fintrinsic(OP, name) \ + bool_intrinsic_half(OP, name) \ bool_intrinsic_ctype(OP, name, 32, float) \ bool_intrinsic_ctype(OP, name, 64, double) \ JL_DLLEXPORT jl_value_t *jl_##name(jl_value_t *a, jl_value_t *b) \ @@ -654,6 +715,9 @@ JL_DLLEXPORT jl_value_t *jl_##name(jl_value_t *a, jl_value_t *b) \ int cmp; \ switch (sz) { \ /* choose the right size c-type operation */ \ + case 2: \ + cmp = jl_##name##16(16, pa, pb); \ + break; \ case 4: \ cmp = jl_##name##32(32, pa, pb); \ break; \ @@ -667,6 +731,7 @@ JL_DLLEXPORT jl_value_t *jl_##name(jl_value_t *a, jl_value_t *b) \ } #define ter_fintrinsic(OP, name) \ + ter_intrinsic_half(OP, name) \ ter_intrinsic_ctype(OP, name, 32, float) \ ter_intrinsic_ctype(OP, name, 64, double) \ JL_DLLEXPORT jl_value_t *jl_##name(jl_value_t *a, jl_value_t *b, jl_value_t *c) \ @@ -682,6 +747,9 @@ JL_DLLEXPORT jl_value_t *jl_##name(jl_value_t *a, jl_value_t *b, jl_value_t *c) void *pa = jl_data_ptr(a), *pb = jl_data_ptr(b), *pc = jl_data_ptr(c), *pr = jl_data_ptr(newv); \ switch (sz) { \ /* choose the right size c-type operation */ \ + case 2: \ + jl_##name##16(16, pa, pb, pc, pr); \ + break; \ case 4: \ jl_##name##32(32, pa, pb, pc, pr); \ break; \ @@ -689,7 +757,7 @@ JL_DLLEXPORT jl_value_t *jl_##name(jl_value_t *a, jl_value_t *b, jl_value_t *c) jl_##name##64(64, pa, pb, pc, pr); \ break; \ default: \ - jl_error(#name ": runtime floating point intrinsics are not implemented for bit sizes other than 32 and 64"); \ + jl_error(#name ": runtime floating point intrinsics are not implemented for bit sizes other than 16, 32 and 64"); \ } \ return newv; \ } @@ -834,15 +902,17 @@ cvt_iintrinsic(LLVMFPtoUI, fptoui) #define fptrunc(pr, a) \ if (!(osize < 8 * sizeof(a))) \ jl_error("fptrunc: output bitsize 
must be < input bitsize"); \
-    if (osize == 32) \
+    else if (osize == 16) \
+        *(uint16_t*)pr = __gnu_f2h_ieee(a); \
+    else if (osize == 32) \
         *(float*)pr = a; \
     else if (osize == 64) \
         *(double*)pr = a; \
     else \
-        jl_error("fptrunc: runtime floating point intrinsics are not implemented for bit sizes other than 32 and 64");
+        jl_error("fptrunc: runtime floating point intrinsics are not implemented for bit sizes other than 16, 32 and 64");
 
 #define fpext(pr, a) \
-    if (!(osize > 8 * sizeof(a))) \
-        jl_error("fpext: output bitsize must be > input bitsize"); \
+    if (!(osize >= 8 * sizeof(a))) \
+        jl_error("fpext: output bitsize must be >= input bitsize"); \
     if (osize == 32) \
         *(float*)pr = a; \
     else if (osize == 64) \
diff --git a/test/intrinsics.jl b/test/intrinsics.jl
index a5f3308c68639..47560d7dbd626 100644
--- a/test/intrinsics.jl
+++ b/test/intrinsics.jl
@@ -13,8 +13,7 @@ include("testenv.jl")
 @testset "runtime intrinsics" begin
     @test Core.Intrinsics.add_int(1, 1) == 2
     @test Core.Intrinsics.sub_int(1, 1) == 0
-    @test_throws ErrorException("fpext: output bitsize must be > input bitsize") Core.Intrinsics.fpext(Int32, 0x0000_0000)
-    @test_throws ErrorException("fpext: output bitsize must be > input bitsize") Core.Intrinsics.fpext(Int32, 0x0000_0000_0000_0000)
+    @test_throws ErrorException("fpext: output bitsize must be >= input bitsize") Core.Intrinsics.fpext(Int32, 0x0000_0000_0000_0000)
     @test_throws ErrorException("fptrunc: output bitsize must be < input bitsize") Core.Intrinsics.fptrunc(Int32, 0x0000_0000)
     @test_throws ErrorException("fptrunc: output bitsize must be < input bitsize") Core.Intrinsics.fptrunc(Int64, 0x0000_0000)
     @test_throws ErrorException("ZExt: output bitsize must be > input bitsize") Core.Intrinsics.zext_int(Int8, 0x00)
@@ -106,3 +105,50 @@ end
 @test unsafe_load(Ptr{Nothing}(0)) === nothing
 struct GhostStruct end
 @test unsafe_load(Ptr{GhostStruct}(rand(Int))) === GhostStruct()
+
+# macro to verify and compare the compiled output of an intrinsic with its runtime version
+macro test_intrinsic(intr, args...)
+    output = args[end]
+    inputs = args[1:end-1]
+    quote
+        function f()
+            $intr($(inputs...))
+        end
+        @test f() === Base.invokelatest($intr, $(inputs...))
+        @test f() == $output
+    end
+end
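For reference, a call such as `@test_intrinsic Core.Intrinsics.add_float Float16(3.3) Float16(2) Float16(5.3)` expands to roughly the following (modulo macro hygiene), so every test compares the statically compiled intrinsic both against the runtime fallback reached via `invokelatest` and against the expected value:

    function f()
        Core.Intrinsics.add_float(Float16(3.3), Float16(2))
    end
    @test f() === Base.invokelatest(Core.Intrinsics.add_float, Float16(3.3), Float16(2))
    @test f() == Float16(5.3)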
+
+@testset "Float16 intrinsics" begin
+    # unary
+    @test_intrinsic Core.Intrinsics.neg_float Float16(3.3) Float16(-3.3)
+    @test_intrinsic Core.Intrinsics.fpext Float32 Float16(3.3) 3.3007812f0
+    @test_intrinsic Core.Intrinsics.fpext Float64 Float16(3.3) 3.30078125
+    @test_intrinsic Core.Intrinsics.fptrunc Float16 Float32(3.3) Float16(3.3)
+    @test_intrinsic Core.Intrinsics.fptrunc Float16 Float64(3.3) Float16(3.3)
+
+    # binary
+    @test_intrinsic Core.Intrinsics.add_float Float16(3.3) Float16(2) Float16(5.3)
+    @test_intrinsic Core.Intrinsics.sub_float Float16(3.3) Float16(2) Float16(1.301)
+    @test_intrinsic Core.Intrinsics.mul_float Float16(3.3) Float16(2) Float16(6.6)
+    @test_intrinsic Core.Intrinsics.div_float Float16(3.3) Float16(2) Float16(1.65)
+    @test_intrinsic Core.Intrinsics.rem_float Float16(3.3) Float16(2) Float16(1.301)
+
+    # ternary
+    @test_intrinsic Core.Intrinsics.fma_float Float16(3.3) Float16(4.4) Float16(5.5) Float16(20.02)
+    @test_intrinsic Core.Intrinsics.muladd_float Float16(3.3) Float16(4.4) Float16(5.5) Float16(20.02)
+
+    # boolean
+    @test_intrinsic Core.Intrinsics.eq_float Float16(3.3) Float16(3.3) true
+    @test_intrinsic Core.Intrinsics.eq_float Float16(3.3) Float16(2) false
+    @test_intrinsic Core.Intrinsics.ne_float Float16(3.3) Float16(3.3) false
+    @test_intrinsic Core.Intrinsics.ne_float Float16(3.3) Float16(2) true
+    @test_intrinsic Core.Intrinsics.le_float Float16(3.3) Float16(3.3) true
+    @test_intrinsic Core.Intrinsics.le_float Float16(3.3) Float16(2) false
+
+    # conversions
+    @test_intrinsic Core.Intrinsics.sitofp Float16 3 Float16(3f0)
+    @test_intrinsic Core.Intrinsics.uitofp Float16 UInt(3) Float16(3f0)
+    @test_intrinsic Core.Intrinsics.fptosi Int Float16(3.3) 3
+    @test_intrinsic Core.Intrinsics.fptoui UInt Float16(3.3) UInt(3)
+end
diff --git a/test/llvmpasses/llvmcall.jl b/test/llvmpasses/llvmcall.jl
index c9cdf4db1fc38..7da2dbec36a8f 100644
--- a/test/llvmpasses/llvmcall.jl
+++ b/test/llvmpasses/llvmcall.jl
@@ -28,5 +28,5 @@ emit(foo, Core.LLVMPtr{Float32, 3})
 # CHECK: call { i32, i32 } @foo({ i32, i32 } %{{[0-9]+}})
 emit(foo, Foo)
 
-# CHECK: define <2 x i16> @julia_bar_{{[0-9]+}}([2 x i16]
+# CHECK: define <2 x half> @julia_bar_{{[0-9]+}}([2 x half]
 emit(bar, NTuple{2, Float16})
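As an end-to-end sanity check of the patch, a hypothetical REPL session (exact display may vary by Julia version and target CPU):

    julia> Float16(3.3) + Float16(2)    # native add_float on half, rounded after the operation
    Float16(5.3)

    julia> using InteractiveUtils

    julia> code_llvm(+, (Float16, Float16))
    # with native Float16 hardware this shows a plain `fadd half`; on other targets the
    # DemoteFloat16 pass (also wired into jl_get_llvmf_defn above) yields the
    # fpext / fadd float / fptrunc sequence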