From 377fbacc6f21564e2b2b6d9fb6557d0bcccfce8c Mon Sep 17 00:00:00 2001
From: Jameson Nash <vtjnash@gmail.com>
Date: Wed, 28 Jun 2017 19:19:09 -0400
Subject: [PATCH] revise boundscheck structure

It is much easier if this value gets treated as a normal parameter,
as that allows all of the normal control flow logic to apply
rather than requiring a complete reimplementation of it.
---
 base/array.jl                  |  11 +-
 base/essentials.jl             |  21 +-
 base/inference.jl              | 356 +++++++++++----------------------
 base/tuple.jl                  |   8 +-
 doc/src/devdocs/ast.md         |   4 +-
 doc/src/devdocs/boundscheck.md |  12 +-
 src/builtins.c                 |  44 ++--
 src/cgutils.cpp                |  48 +++--
 src/codegen.cpp                | 111 ++++------
 src/common_symbols1.inc        |   1 -
 src/common_symbols2.inc        |   2 +-
 src/dump.c                     |   2 +-
 src/interpreter.c              |   5 +-
 src/julia-syntax.scm           |   7 +-
 src/julia_internal.h           |   4 +-
 test/boundscheck_exec.jl       |  20 +-
 test/core.jl                   |  13 +-
 17 files changed, 271 insertions(+), 398 deletions(-)

diff --git a/base/array.jl b/base/array.jl
index bd5cda891903a..75a4a76a99690 100644
--- a/base/array.jl
+++ b/base/array.jl
@@ -583,8 +583,8 @@ done(a::Array,i) = (@_inline_meta; i == length(a)+1)
 ## Indexing: getindex ##
 
 # This is more complicated than it needs to be in order to get Win64 through bootstrap
-getindex(A::Array, i1::Int) = arrayref(A, i1)
-getindex(A::Array, i1::Int, i2::Int, I::Int...) = (@_inline_meta; arrayref(A, i1, i2, I...)) # TODO: REMOVE FOR #14770
+@eval getindex(A::Array, i1::Int) = arrayref($(Expr(:boundscheck)), A, i1)
+@eval getindex(A::Array, i1::Int, i2::Int, I::Int...) = (@_inline_meta; arrayref($(Expr(:boundscheck)), A, i1, i2, I...)) # TODO: REMOVE FOR #14770
 
 # Faster contiguous indexing using copy! for UnitRange and Colon
 function getindex(A::Array, I::UnitRange{Int})
@@ -612,8 +612,9 @@ function getindex(A::Array{S}, I::Range{Int}) where S
 end
 
 ## Indexing: setindex! ##
-setindex!(A::Array{T}, x, i1::Int) where {T} = arrayset(A, convert(T,x)::T, i1)
-setindex!(A::Array{T}, x, i1::Int, i2::Int, I::Int...) where {T} = (@_inline_meta; arrayset(A, convert(T,x)::T, i1, i2, I...)) # TODO: REMOVE FOR #14770
+@eval setindex!(A::Array{T}, x, i1::Int) where {T} = arrayset($(Expr(:boundscheck)), A, convert(T, x)::T, i1)
+@eval setindex!(A::Array{T}, x, i1::Int, i2::Int, I::Int...) where {T} =
+    (@_inline_meta; arrayset($(Expr(:boundscheck)), A, convert(T, x)::T, i1, i2, I...)) # TODO: REMOVE FOR #14770
 
 # These are redundant with the abstract fallbacks but needed for bootstrap
 function setindex!(A::Array, x, I::AbstractVector{Int})
@@ -696,7 +697,7 @@ end
 
 function push!(a::Array{Any,1}, @nospecialize item)
     _growend!(a, 1)
-    arrayset(a, item, length(a))
+    arrayset(true, a, item, length(a))
     return a
 end
 
diff --git a/base/essentials.jl b/base/essentials.jl
index 38dab2076b033..174fe7138f178 100644
--- a/base/essentials.jl
+++ b/base/essentials.jl
@@ -198,7 +198,7 @@ function append_any(xs...)
                 ccall(:jl_array_grow_end, Void, (Any, UInt), out, 16)
                 l += 16
             end
-            Core.arrayset(out, y, i)
+            Core.arrayset(true, out, y, i)
             i += 1
         end
     end
@@ -207,7 +207,7 @@ function append_any(xs...)
 end
 
 # simple Array{Any} operations needed for bootstrap
-setindex!(A::Array{Any}, @nospecialize(x), i::Int) = Core.arrayset(A, x, i)
+@eval setindex!(A::Array{Any}, @nospecialize(x), i::Int) = Core.arrayset($(Expr(:boundscheck)), A, x, i)
 
 function precompile(@nospecialize(f), args::Tuple)
     ccall(:jl_compile_hint, Int32, (Any,), Tuple{Core.Typeof(f), args...}) != 0
@@ -227,10 +227,7 @@ section of the Metaprogramming chapter of the manual for more details and exampl
 esc(@nospecialize(e)) = Expr(:escape, e)
 
 macro boundscheck(blk)
-    # hack: use this syntax since it avoids introducing line numbers
-    :($(Expr(:boundscheck,true));
-      $(esc(blk));
-      $(Expr(:boundscheck,:pop)))
+    return Expr(:if, Expr(:boundscheck), esc(blk))
 end
 
 """
@@ -238,7 +235,8 @@ end
 
 Eliminates array bounds checking within expressions.
 
-In the example below the bound check of array A is skipped to improve performance.
+In the example below the in-range check for referencing
+element `i` of array `A` is skipped to improve performance.
 
 ```julia
 function sum(A::AbstractArray)
@@ -256,9 +254,10 @@ end
     for out-of-bounds indices. The user is responsible for checking it manually.
 """
 macro inbounds(blk)
-    :($(Expr(:inbounds,true));
-      $(esc(blk));
-      $(Expr(:inbounds,:pop)))
+    return Expr(:block,
+        Expr(:inbounds, true),
+        esc(blk),
+        Expr(:inbounds, :pop))
 end
 
 macro label(name::Symbol)
@@ -379,7 +378,7 @@ function vector_any(@nospecialize xs...)
     n = length(xs)
     a = Vector{Any}(n)
     @inbounds for i = 1:n
-        Core.arrayset(a,xs[i],i)
+        Core.arrayset(false, a, xs[i], i)
     end
     a
 end
diff --git a/base/inference.jl b/base/inference.jl
index 67156c16cee90..154aa489fd17a 100644
--- a/base/inference.jl
+++ b/base/inference.jl
@@ -1090,7 +1090,9 @@ function const_datatype_getfield_tfunc(sv, fld)
 end
 
 # returns (type, isexact)
-function getfield_tfunc(@nospecialize(s00), name)
+getfield_tfunc(@nospecialize(s00), @nospecialize(name), @nospecialize(inbounds)) =
+    getfield_tfunc(s00, name)
+function getfield_tfunc(@nospecialize(s00), @nospecialize(name))
     if isa(s00, TypeVar)
         s00 = s00.ub
     end
@@ -1210,8 +1212,10 @@ function getfield_tfunc(@nospecialize(s00), name)
     # in the current type system
     return rewrap_unionall(limit_type_depth(R, MAX_TYPE_DEPTH), s00)
 end
-add_tfunc(getfield, 2, 2, (@nospecialize(s), @nospecialize(name)) -> getfield_tfunc(s, name), 1)
+add_tfunc(getfield, 2, 3, getfield_tfunc, 1)
 add_tfunc(setfield!, 3, 3, (@nospecialize(o), @nospecialize(f), @nospecialize(v)) -> v, 3)
+fieldtype_tfunc(@nospecialize(s0), @nospecialize(name), @nospecialize(inbounds)) =
+    fieldtype_tfunc(s0, name)
 function fieldtype_tfunc(@nospecialize(s0), @nospecialize(name))
     if s0 === Any || s0 === Type || DataType ⊑ s0 || UnionAll ⊑ s0
         return Type
@@ -1271,7 +1275,7 @@ function fieldtype_tfunc(@nospecialize(s0), @nospecialize(name))
     end
     return Type{<:ft}
 end
-add_tfunc(fieldtype, 2, 2, fieldtype_tfunc, 0)
+add_tfunc(fieldtype, 2, 3, fieldtype_tfunc, 0)
 
 function valid_tparam(@nospecialize(x))
     if isa(x,Tuple)
@@ -1455,27 +1459,25 @@ function builtin_tfunction(@nospecialize(f), argtypes::Array{Any,1},
     elseif f === svec
         return SimpleVector
     elseif f === arrayset
-        if length(argtypes) < 3 && !isva
+        if length(argtypes) < 4
+            isva && return Any
             return Bottom
         end
-        a1 = argtypes[1]
-        if isvarargtype(a1)
-            return unwrap_unionall(a1).parameters[1]
-        end
-        return a1
+        return argtypes[2]
     elseif f === arrayref
-        if length(argtypes) < 2 && !isva
+        if length(argtypes) < 3
+            isva && return Any
             return Bottom
         end
-        a = widenconst(argtypes[1])
+        a = widenconst(argtypes[2])
         if a <: Array
-            if isa(a,DataType) && (isa(a.parameters[1],Type) || isa(a.parameters[1],TypeVar))
+            if isa(a, DataType) && (isa(a.parameters[1], Type) || isa(a.parameters[1], TypeVar))
                 # TODO: the TypeVar case should not be needed here
                 a = a.parameters[1]
-                return isa(a,TypeVar) ? a.ub : a
-            elseif isa(a,UnionAll) && !has_free_typevars(a)
+                return isa(a, TypeVar) ? a.ub : a
+            elseif isa(a, UnionAll) && !has_free_typevars(a)
                 unw = unwrap_unionall(a)
-                if isa(unw,DataType)
+                if isa(unw, DataType)
                     return rewrap_unionall(unw.parameters[1], a)
                 end
             end
@@ -2391,6 +2393,8 @@ function abstract_eval(@nospecialize(e), vtypes::VarTable, sv::InferenceState)
         return abstract_eval_constant(e.args[1])
     elseif e.head === :invoke
         error("type inference data-flow error: tried to double infer a function")
+    elseif e.head === :boundscheck
+        return Bool
     elseif e.head === :isdefined
         sym = e.args[1]
         t = Bool
@@ -3299,16 +3303,19 @@ function optimize(me::InferenceState)
         # if we start to create `SSAValue` in type inference when not
         # optimizing and use unoptimized IR in codegen.
         gotoifnot_elim_pass!(me)
-        inlining_pass!(me)
+        inlining_pass!(me, me.src.propagate_inbounds)
+        gotoifnot_elim_pass!(me)
         void_use_elim_pass!(me)
         alloc_elim_pass!(me)
         getfield_elim_pass!(me)
         # Clean up for `alloc_elim_pass!` and `getfield_elim_pass!`
         void_use_elim_pass!(me)
         do_coverage = coverage_enabled()
-        meta_elim_pass!(me.src.code::Array{Any,1}, me.src.propagate_inbounds, do_coverage)
+        code = me.src.code::Array{Any,1}
+        meta_elim_pass!(code, do_coverage)
         # Pop metadata before label reindexing
-        force_noinline = popmeta!(me.src.code::Array{Any,1}, :noinline)[1]
+        filter!(x -> x !== nothing, code)
+        force_noinline = popmeta!(code, :noinline)[1]
         reindex_labels!(me)
     end
 
@@ -3652,7 +3659,10 @@ end
 
 # replace slots 1:na with argexprs, static params with spvals, and increment
 # other slots by offset.
-function substitute!(@nospecialize(e), na::Int, argexprs::Vector{Any}, @nospecialize(spsig), spvals::Vector{Any}, offset::Int)
+function substitute!(
+        @nospecialize(e), na::Int, argexprs::Vector{Any},
+        @nospecialize(spsig), spvals::Vector{Any},
+        offset::Int, boundscheck::Symbol)
     if isa(e, Slot)
         id = slot_id(e)
         if 1 <= id <= na
@@ -3669,7 +3679,7 @@ function substitute!(@nospecialize(e), na::Int, argexprs::Vector{Any}, @nospecia
         end
     end
     if isa(e, NewvarNode)
-        return NewvarNode(substitute!(e.slot, na, argexprs, spsig, spvals, offset))
+        return NewvarNode(substitute!(e.slot, na, argexprs, spsig, spvals, offset, boundscheck))
     end
     if isa(e, Expr)
         e = e::Expr
@@ -3677,7 +3687,7 @@ function substitute!(@nospecialize(e), na::Int, argexprs::Vector{Any}, @nospecia
         if head === :static_parameter
             return spvals[e.args[1]]
         elseif head === :foreigncall
-            @assert !isa(spsig,UnionAll) || !isempty(spvals)
+            @assert !isa(spsig, UnionAll) || !isempty(spvals)
             for i = 1:length(e.args)
                 if i == 2
                     e.args[2] = ccall(:jl_instantiate_type_in_env, Any, (Any, Any, Ptr{Any}), e.args[2], spsig, spvals)
@@ -3692,12 +3702,20 @@ function substitute!(@nospecialize(e), na::Int, argexprs::Vector{Any}, @nospecia
                 elseif i == 5
                     @assert isa(e.args[5], Int)
                 else
-                    e.args[i] = substitute!(e.args[i], na, argexprs, spsig, spvals, offset)
+                    e.args[i] = substitute!(e.args[i], na, argexprs, spsig, spvals, offset, boundscheck)
                 end
             end
+        elseif head === :boundscheck
+            if boundscheck === :propagate
+                return e
+            elseif boundscheck === :off
+                return false
+            else
+                return true
+            end
         elseif !is_meta_expr_head(head)
             for i = 1:length(e.args)
-                e.args[i] = substitute!(e.args[i], na, argexprs, spsig, spvals, offset)
+                e.args[i] = substitute!(e.args[i], na, argexprs, spsig, spvals, offset, boundscheck)
             end
         end
     end
@@ -3809,9 +3827,10 @@ function effect_free(@nospecialize(e), src::CodeInfo, mod::Module, allow_volatil
                     if is_known_call(e, arrayref, src, mod) || is_known_call(e, arraylen, src, mod)
                         return false
                     elseif is_known_call(e, getfield, src, mod)
-                        length(ea) == 3 || return false
+                        nargs = length(ea)
+                        (nargs == 3 || nargs == 4) || return false
                         et = exprtype(e, src, mod)
-                        if !isa(et,Const) && !(isType(et) && isleaftype(et))
+                        if !isa(et, Const) && !(isType(et) && isleaftype(et))
                             # first argument must be immutable to ensure e is affect_free
                             a = ea[2]
                             typ = widenconst(exprtype(a, src, mod))
@@ -4074,8 +4093,9 @@ end
 # `ft` is the type of the function. `f` is the exact function if known, or else `nothing`.
 # `pending_stmts` is an array of statements from functions inlined so far, so
 # we can estimate the total size of the enclosing function after inlining.
-function inlineable(@nospecialize(f), @nospecialize(ft), e::Expr, atypes::Vector{Any}, sv::InferenceState,
-                    pending_stmts)
+function inlineable(@nospecialize(f), @nospecialize(ft), e::Expr, atypes::Vector{Any},
+                    pending_stmt::Vector{Any}, boundscheck::Symbol,
+                    sv::InferenceState)
     argexprs = e.args
 
     if (f === typeassert || ft ⊑ typeof(typeassert)) && length(atypes)==3
@@ -4388,7 +4408,6 @@ function inlineable(@nospecialize(f), @nospecialize(ft), e::Expr, atypes::Vector
 
     body = Expr(:block)
     body.args = ast
-    propagate_inbounds = src.propagate_inbounds
 
     # see if each argument occurs only once in the body expression
     stmts = []
@@ -4449,7 +4468,7 @@ function inlineable(@nospecialize(f), @nospecialize(ft), e::Expr, atypes::Vector
     end
 
     # ok, substitute argument expressions for argument names in the body
-    body = substitute!(body, na, argexprs, method.sig, spvals, length(sv.src.slotnames) - na)
+    body = substitute!(body, na, argexprs, method.sig, spvals, length(sv.src.slotnames) - na, boundscheck)
     append!(sv.src.slotnames, src.slotnames[(na + 1):end])
     append!(sv.src.slottypes, src.slottypes[(na + 1):end])
     append!(sv.src.slotflags, src.slotflags[(na + 1):end])
@@ -4570,22 +4589,6 @@ function inlineable(@nospecialize(f), @nospecialize(ft), e::Expr, atypes::Vector
             push!(stmts, Expr(:meta, :pop_loc))
         end
     end
-    if !isempty(stmts) && !propagate_inbounds
-        # avoid redundant inbounds annotations
-        s_1, s_end = stmts[1], stmts[end]
-        i = 2
-        while length(stmts) > i && ((isa(s_1,Expr)&&s_1.head===:line) || isa(s_1,LineNumberNode))
-            s_1 = stmts[i]
-            i += 1
-        end
-        if isa(s_1, Expr) && s_1.head === :inbounds && s_1.args[1] === false &&
-            isa(s_end, Expr) && s_end.head === :inbounds && s_end.args[1] === :pop
-        else
-            # inlined statements are out-of-bounds by default
-            unshift!(stmts, Expr(:inbounds, false))
-            push!(stmts, Expr(:inbounds, :pop))
-        end
-    end
 
     if isa(expr, Expr)
         old_t = e.typ
@@ -4728,18 +4731,54 @@ function mk_tuplecall(args, sv::InferenceState)
     return e
 end
 
-function inlining_pass!(sv::InferenceState)
+function inlining_pass!(sv::InferenceState, propagate_inbounds::Bool)
+    # Also handles bounds check elision:
+    #
+    #    1. If check_bounds is always on, set `Expr(:boundscheck)` true
+    #    2. If check_bounds is always off, set `Expr(:boundscheck)` false
+    #    3. If check_bounds is default, figure out whether each boundscheck
+    #         is true, false, or propagate based on the enclosing inbounds directives
+    _opt_check_bounds = JLOptions().check_bounds
+    opt_check_bounds = (_opt_check_bounds == 0 ? :default :
+                        _opt_check_bounds == 1 ? :on :
+                        :off)
+    # Number of stacked inbounds
+    inbounds_depth = 0
+
     eargs = sv.src.code
     i = 1
     stmtbuf = []
     while i <= length(eargs)
         ei = eargs[i]
         if isa(ei, Expr)
-            eargs[i] = inlining_pass(ei, sv, stmtbuf, 1)
-            if !isempty(stmtbuf)
-                splice!(eargs, i:i-1, stmtbuf)
-                i += length(stmtbuf)
-                empty!(stmtbuf)
+            if ei.head === :inbounds
+                eargs[i] = nothing
+                arg1 = ei.args[1]
+                if arg1 === true # push
+                    inbounds_depth += 1
+                elseif arg1 === false # clear
+                    inbounds_depth = 0
+                elseif inbounds_depth > 0 # pop
+                    inbounds_depth -= 1
+                end
+            else
+                if opt_check_bounds === :off
+                     boundscheck = :off
+                elseif opt_check_bounds === :on
+                     boundscheck = :on
+                elseif inbounds_depth > 0
+                     boundscheck = :off
+                elseif propagate_inbounds
+                     boundscheck = :propagate
+                else
+                     boundscheck = :on
+                end
+                eargs[i] = inlining_pass(ei, sv, stmtbuf, 1, boundscheck)
+                if !isempty(stmtbuf)
+                    splice!(eargs, i:(i - 1), stmtbuf)
+                    i += length(stmtbuf)
+                    empty!(stmtbuf)
+                end
             end
         end
         i += 1
@@ -4750,15 +4789,24 @@ const corenumtype = Union{Int32, Int64, Float32, Float64}
 
 # return inlined replacement for `e`, inserting new needed statements
 # at index `ins` in `stmts`.
-function inlining_pass(e::Expr, sv::InferenceState, stmts, ins)
-    if e.head === :isdefined
-        isa(e.typ, Const) && return e.typ.val
-        return e
-    end
+function inlining_pass(e::Expr, sv::InferenceState, stmts::Vector{Any}, ins, boundscheck::Symbol)
     if e.head === :method
         # avoid running the inlining pass on function definitions
         return e
     end
+    if e.head === :meta
+        # ignore meta
+        return e
+    end
+    # inliners for special exprs
+    if e.head === :boundscheck
+        return e
+    end
+    if e.head === :isdefined
+        isa(e.typ, Const) && return e.typ.val
+        return e
+    end
+
     eargs = e.args
     if length(eargs) < 1
         return e
@@ -4801,7 +4849,7 @@ function inlining_pass(e::Expr, sv::InferenceState, stmts, ins)
                 argloc = eargs
             end
             sl0 = length(stmts)
-            res = inlining_pass(ei, sv, stmts, ins)
+            res = inlining_pass(ei, sv, stmts, ins, boundscheck)
             ns = length(stmts) - sl0  # number of new statements just added
             if isccallee
                 restype = exprtype(res, sv.src, sv.mod)
@@ -4893,11 +4941,11 @@ function inlining_pass(e::Expr, sv::InferenceState, stmts, ins)
                                            exprtype(a1, sv.src, sv.mod) ⊑ basenumtype)
                     if square
                         e.args = Any[GlobalRef(Main.Base,:*), a1, a1]
-                        res = inlining_pass(e, sv, stmts, ins)
+                        res = inlining_pass(e, sv, stmts, ins, boundscheck)
                     else
                         e.args = Any[GlobalRef(Main.Base,:*), Expr(:call, GlobalRef(Main.Base,:*), a1, a1), a1]
                         e.args[2].typ = e.typ
-                        res = inlining_pass(e, sv, stmts, ins)
+                        res = inlining_pass(e, sv, stmts, ins, boundscheck)
                     end
                     return res
                 end
@@ -4913,7 +4961,7 @@ function inlining_pass(e::Expr, sv::InferenceState, stmts, ins)
             (a === Bottom || isvarargtype(a)) && return e
             ata[i] = a
         end
-        res = inlineable(f, ft, e, ata, sv, stmts)
+        res = inlineable(f, ft, e, ata, stmts, boundscheck, sv)
         if isa(res,Tuple)
             if isa(res[2],Array) && !isempty(res[2])
                 splice!(stmts, ins:ins-1, res[2])
@@ -5301,7 +5349,7 @@ function void_use_elim_pass!(sv::InferenceState)
     nothing
 end
 
-function meta_elim_pass!(code::Array{Any,1}, propagate_inbounds::Bool, do_coverage::Bool)
+function meta_elim_pass!(code::Array{Any,1}, do_coverage::Bool)
     # 1. Remove place holders
     #
     # 2. If coverage is off, remove line number nodes that don't mark any
@@ -5309,54 +5357,6 @@ function meta_elim_pass!(code::Array{Any,1}, propagate_inbounds::Bool, do_covera
     #
     # 3. Remove top level SSAValue
     #
-    # 4. Handle bounds check elision
-    #
-    #    4.1. If check_bounds is always on, delete all `Expr(:boundscheck)`
-    #    4.2. If check_bounds is always off, delete all boundscheck blocks.
-    #    4.3. If check_bounds is default, figure out whether each checkbounds
-    #         blocks needs to be eliminated or could be eliminated when inlined
-    #         into another function. Delete the blocks that should be eliminated
-    #         and delete the `Expr(:boundscheck)` for blocks that will never be
-    #         deleted. (i.e. the ones that are not eliminated with
-    #         `length(inbounds_stack) >= 2`)
-    #
-    #    When deleting IR with boundscheck, keep the label node in order to not
-    #    confuse later passes or codegen. (we could also track if  any SSAValue
-    #    is deleted while still having uses that are not but that's a little
-    #    expensive).
-    #
-    # 5. Clean up `Expr(:inbounds)`
-    #
-    #    Delete all `Expr(:inbounds)` that is unnecessary, which is all of them
-    #    for non-default check_bounds. For default check_bounds this includes
-    #
-    #    * `Expr(:inbounds, true)` in `Expr(:inbounds, true)`
-    #    * `Expr(:inbounds, false)` when
-    #      `!is_inbounds && length(inbounds_stack) >= 2`
-    #
-    #    Functions without `propagate_inbounds` have an implicit `false` on the
-    #    `inbounds_stack`
-    #
-    #    There are other cases in which we can eliminate `Expr(:inbounds)` or
-    #    `Expr(:boundscheck)` (e.g. when they don't enclose any non-meta
-    #    expressions). Those are a little harder to detect and are hopefully
-    #    not too common.
-    check_bounds = JLOptions().check_bounds
-
-    inbounds_stack = propagate_inbounds ? Bool[] : Bool[false]
-    # Whether the push is deleted (therefore if the pop has to be too)
-    # Shared for `Expr(:boundscheck)` and `Expr(:inbounds)`
-    bounds_elim_stack = Bool[]
-    # The expression index of the push, set to `0` when encountering a
-    # non-meta expression that might be affect by the push.
-    # The clearing needs to be propagated up during pop
-    # This is not pushed to if the push is already eliminated
-    # Also shared for `Expr(:boundscheck)` and `Expr(:inbounds)`
-    bounds_push_pos_stack = Int[0] # always non-empty
-    # Number of boundscheck pushes in a eliminated boundscheck block
-    void_boundscheck_depth = 0
-    is_inbounds = check_bounds == 2
-    enabled = true
 
     # Position of the last line number node without any non-meta expressions
     # in between.
@@ -5384,140 +5384,16 @@ function meta_elim_pass!(code::Array{Any,1}, propagate_inbounds::Bool, do_covera
             prev_dbg_stack[end] = i
             continue
         elseif !isa(ex, Expr)
-            if enabled
-                prev_dbg_stack[end] = 0
-                push_loc_pos_stack[end] = 0
-                bounds_push_pos_stack[end] = 0
-            else
-                code[i] = nothing
-            end
+            prev_dbg_stack[end] = 0
+            push_loc_pos_stack[end] = 0
             continue
         end
         ex = ex::Expr
         args = ex.args
         head = ex.head
-        if head === :boundscheck
-            if !enabled
-                # we are in an eliminated boundscheck, simply record the number
-                # of push/pop
-                if !(args[1] === :pop)
-                    void_boundscheck_depth += 1
-                elseif void_boundscheck_depth == 0
-                    # There must have been a push
-                    pop!(bounds_elim_stack)
-                    enabled = true
-                else
-                    void_boundscheck_depth -= 1
-                end
-                code[i] = nothing
-            elseif args[1] === :pop
-                # This will also delete pops that don't match
-                if (isempty(bounds_elim_stack) ? true :
-                    pop!(bounds_elim_stack))
-                    code[i] = nothing
-                    continue
-                end
-                push_idx = bounds_push_pos_stack[end]
-                if length(bounds_push_pos_stack) > 1
-                    pop!(bounds_push_pos_stack)
-                end
-                if push_idx > 0
-                    code[push_idx] = nothing
-                    code[i] = nothing
-                else
-                    bounds_push_pos_stack[end] = 0
-                end
-            elseif is_inbounds
-                code[i] = nothing
-                push!(bounds_elim_stack, true)
-                enabled = false
-            elseif check_bounds == 1 || length(inbounds_stack) >= 2
-                # Not inbounds and at least two levels deep, this will never
-                # be eliminated when inlined to another function.
-                code[i] = nothing
-                push!(bounds_elim_stack, true)
-            else
-                push!(bounds_elim_stack, false)
-                push!(bounds_push_pos_stack, i)
-            end
-            continue
-        end
-        if !enabled && !(do_coverage && head === :meta)
-            code[i] = nothing
-            continue
-        end
-        if head === :inbounds
-            if check_bounds != 0
-                code[i] = nothing
-                continue
-            end
-            arg1 = args[1]
-            if arg1 === true
-                if !isempty(inbounds_stack) && inbounds_stack[end]
-                    code[i] = nothing
-                    push!(bounds_elim_stack, true)
-                else
-                    is_inbounds = true
-                    push!(bounds_elim_stack, false)
-                    push!(bounds_push_pos_stack, i)
-                end
-                push!(inbounds_stack, true)
-            elseif arg1 === false
-                if is_inbounds
-                    # There must have been a `true` on the stack so
-                    # `inbounds_stack` must not be empty
-                    if !inbounds_stack[end]
-                        is_inbounds = false
-                    end
-                    push!(bounds_elim_stack, false)
-                    push!(bounds_push_pos_stack, i)
-                elseif length(inbounds_stack) >= 2
-                    code[i] = nothing
-                    push!(bounds_elim_stack, true)
-                else
-                    push!(bounds_elim_stack, false)
-                    push!(bounds_push_pos_stack, i)
-                end
-                push!(inbounds_stack, false)
-            else
-                # pop
-                inbounds_len = length(inbounds_stack)
-                if inbounds_len != 0
-                    pop!(inbounds_stack)
-                    inbounds_len -= 1
-                end
-                # This will also delete pops that don't match
-                if (isempty(bounds_elim_stack) ? true :
-                    pop!(bounds_elim_stack))
-                    # No need to update `is_inbounds` since the push was a no-op
-                    code[i] = nothing
-                    continue
-                end
-                if inbounds_len >= 2
-                    is_inbounds = (inbounds_stack[inbounds_len] ||
-                                   inbounds_stack[inbounds_len - 1])
-                elseif inbounds_len == 1
-                    is_inbounds = inbounds_stack[inbounds_len]
-                else
-                    is_inbounds = false
-                end
-                push_idx = bounds_push_pos_stack[end]
-                if length(bounds_push_pos_stack) > 1
-                    pop!(bounds_push_pos_stack)
-                end
-                if push_idx > 0
-                    code[push_idx] = nothing
-                    code[i] = nothing
-                else
-                    bounds_push_pos_stack[end] = 0
-                end
-            end
-            continue
-        end
         if head !== :meta
             prev_dbg_stack[end] = 0
             push_loc_pos_stack[end] = 0
-            bounds_push_pos_stack[end] = 0
             continue
         end
         nargs = length(args)
@@ -5553,7 +5429,6 @@ function meta_elim_pass!(code::Array{Any,1}, propagate_inbounds::Bool, do_covera
             continue
         end
     end
-    return filter!(x -> x !== nothing, code)
 end
 
 # does the same job as alloc_elim_pass for allocations inline in getfields
@@ -5568,11 +5443,14 @@ function getfield_elim_pass!(sv::InferenceState)
 end
 
 function _getfield_elim_pass!(e::Expr, sv::InferenceState)
-    for i = 1:length(e.args)
+    nargs = length(e.args)
+    for i = 1:nargs
         e.args[i] = _getfield_elim_pass!(e.args[i], sv)
     end
-    if is_known_call(e, getfield, sv.src, sv.mod) && length(e.args)==3 &&
-        (isa(e.args[3],Int) || isa(e.args[3],QuoteNode))
+    if is_known_call(e, getfield, sv.src, sv.mod) &&
+            (nargs == 3 || nargs == 4) &&
+            (isa(e.args[3], Int) || isa(e.args[3], QuoteNode)) &&
+            (nargs == 3 || isa(e.args[4], Bool))
         e1 = e.args[2]
         j = e.args[3]
         single_use = true
diff --git a/base/tuple.jl b/base/tuple.jl
index cce81f0151830..8018ba27621bb 100644
--- a/base/tuple.jl
+++ b/base/tuple.jl
@@ -17,11 +17,11 @@ NTuple
 
 length(t::Tuple) = nfields(t)
 endof(t::Tuple) = length(t)
-size(t::Tuple, d) = d==1 ? length(t) : throw(ArgumentError("invalid tuple dimension $d"))
-getindex(t::Tuple, i::Int) = getfield(t, i)
-getindex(t::Tuple, i::Real) = getfield(t, convert(Int, i))
+size(t::Tuple, d) = (d == 1) ? length(t) : throw(ArgumentError("invalid tuple dimension $d"))
+@eval getindex(t::Tuple, i::Int) = getfield(t, i, $(Expr(:boundscheck)))
+@eval getindex(t::Tuple, i::Real) = getfield(t, convert(Int, i), $(Expr(:boundscheck)))
 getindex(t::Tuple, r::AbstractArray{<:Any,1}) = ([t[ri] for ri in r]...)
-getindex(t::Tuple, b::AbstractArray{Bool,1}) = length(b) == length(t) ? getindex(t,find(b)) : throw(BoundsError(t, b))
+getindex(t::Tuple, b::AbstractArray{Bool,1}) = length(b) == length(t) ? getindex(t, find(b)) : throw(BoundsError(t, b))
 
 # returns new tuple; N.B.: becomes no-op if i is out-of-bounds
 setindex(x::Tuple, v, i::Integer) = (@_inline_meta; _setindex(v, i, x...))
diff --git a/doc/src/devdocs/ast.md b/doc/src/devdocs/ast.md
index 16724c3154954..7d7904f98b3e8 100644
--- a/doc/src/devdocs/ast.md
+++ b/doc/src/devdocs/ast.md
@@ -170,8 +170,8 @@ These symbols appear in the `head` field of `Expr`s in lowered form.
 
   * `boundscheck`
 
-    Indicates the beginning or end of a section of code that performs a bounds check. Like `inbounds`,
-    a stack is maintained, and the second argument can be one of: `true`, `false`, or `:pop`.
+    Has the value `false` if inlined into a section of code marked with `@inbounds`,
+    otherwise has the value `true`.
 
   * `copyast`
 
diff --git a/doc/src/devdocs/boundscheck.md b/doc/src/devdocs/boundscheck.md
index 0647e01a722f1..9eed150ea1497 100644
--- a/doc/src/devdocs/boundscheck.md
+++ b/doc/src/devdocs/boundscheck.md
@@ -5,17 +5,15 @@ accessing arrays. In tight inner loops or other performance critical situations,
 to skip these bounds checks to improve runtime performance. For instance, in order to emit vectorized
 (SIMD) instructions, your loop body cannot contain branches, and thus cannot contain bounds checks.
 Consequently, Julia includes an `@inbounds(...)` macro to tell the compiler to skip such bounds
-checks within the given block. For the built-in `Array` type, the magic happens inside the `arrayref`
-and `arrayset` intrinsics. User-defined array types instead use the `@boundscheck(...)` macro
+checks within the given block. User-defined array types can use the `@boundscheck(...)` macro
 to achieve context-sensitive code selection.
 
 ## Eliding bounds checks
 
-The `@boundscheck(...)` macro marks blocks of code that perform bounds checking. When such blocks
-appear inside of an `@inbounds(...)` block, the compiler removes these blocks. When the `@boundscheck(...)`
-is nested inside of a calling function containing an `@inbounds(...)`, the compiler will remove
-the `@boundscheck` block *only if it is inlined* into the calling function. For example, you might
-write the method `sum` as:
+The `@boundscheck(...)` macro marks blocks of code that perform bounds checking.
+When such blocks are inlined into an `@inbounds(...)` block, the compiler may remove these blocks.
+The compiler may remove the `@boundscheck` block *only if it is inlined* into the calling function.
+For example, you might write the method `sum` as:
 
 ```julia
 function sum(A::AbstractArray)
diff --git a/src/builtins.c b/src/builtins.c
index 3a0014144a107..ad32f9386a1c1 100644
--- a/src/builtins.c
+++ b/src/builtins.c
@@ -591,6 +591,10 @@ JL_CALLABLE(jl_f_svec)
 
 JL_CALLABLE(jl_f_getfield)
 {
+    if (nargs == 3) {
+        JL_TYPECHK(getfield, bool, args[2]);
+        nargs -= 1;
+    }
     JL_NARGS(getfield, 2, 2);
     jl_value_t *v = args[0];
     jl_value_t *vt = (jl_value_t*)jl_typeof(v);
@@ -682,6 +686,10 @@ static jl_value_t *get_fieldtype(jl_value_t *t, jl_value_t *f)
 
 JL_CALLABLE(jl_f_fieldtype)
 {
+    if (nargs == 3) {
+        JL_TYPECHK(fieldtype, bool, args[2]);
+        nargs -= 1;
+    }
     JL_NARGS(fieldtype, 2, 2);
     jl_datatype_t *st = (jl_datatype_t*)args[0];
     if (st == jl_module_type)
@@ -907,20 +915,20 @@ JL_CALLABLE(jl_f_arraysize)
 static size_t array_nd_index(jl_array_t *a, jl_value_t **args, size_t nidxs,
                              const char *fname)
 {
-    size_t i=0;
-    size_t k, stride=1;
+    size_t i = 0;
+    size_t k, stride = 1;
     size_t nd = jl_array_ndims(a);
-    for(k=0; k < nidxs; k++) {
+    for (k = 0; k < nidxs; k++) {
         if (!jl_is_long(args[k]))
             jl_type_error(fname, (jl_value_t*)jl_long_type, args[k]);
-        size_t ii = jl_unbox_long(args[k])-1;
+        size_t ii = jl_unbox_long(args[k]) - 1;
         i += ii * stride;
-        size_t d = k>=nd ? 1 : jl_array_dim(a, k);
-        if (k < nidxs-1 && ii >= d)
+        size_t d = (k >= nd) ? 1 : jl_array_dim(a, k);
+        if (k < nidxs - 1 && ii >= d)
             jl_bounds_error_v((jl_value_t*)a, args, nidxs);
         stride *= d;
     }
-    for(; k < nd; k++)
+    for (; k < nd; k++)
         stride *= jl_array_dim(a, k);
     if (i >= stride)
         jl_bounds_error_v((jl_value_t*)a, args, nidxs);
@@ -929,21 +937,23 @@ static size_t array_nd_index(jl_array_t *a, jl_value_t **args, size_t nidxs,
 
 JL_CALLABLE(jl_f_arrayref)
 {
-    JL_NARGSV(arrayref, 2);
-    JL_TYPECHK(arrayref, array, args[0]);
-    jl_array_t *a = (jl_array_t*)args[0];
-    size_t i = array_nd_index(a, &args[1], nargs-1, "arrayref");
+    JL_NARGSV(arrayref, 3);
+    JL_TYPECHK(arrayref, bool, args[0]);
+    JL_TYPECHK(arrayref, array, args[1]);
+    jl_array_t *a = (jl_array_t*)args[1];
+    size_t i = array_nd_index(a, &args[2], nargs - 2, "arrayref");
     return jl_arrayref(a, i);
 }
 
 JL_CALLABLE(jl_f_arrayset)
 {
-    JL_NARGSV(arrayset, 3);
-    JL_TYPECHK(arrayset, array, args[0]);
-    jl_array_t *a = (jl_array_t*)args[0];
-    size_t i = array_nd_index(a, &args[2], nargs-2, "arrayset");
-    jl_arrayset(a, args[1], i);
-    return args[0];
+    JL_NARGSV(arrayset, 4);
+    JL_TYPECHK(arrayset, bool, args[0]);
+    JL_TYPECHK(arrayset, array, args[1]);
+    jl_array_t *a = (jl_array_t*)args[1];
+    size_t i = array_nd_index(a, &args[3], nargs - 3, "arrayset");
+    jl_arrayset(a, args[2], i);
+    return args[1];
 }
 
 // IntrinsicFunctions ---------------------------------------------------------
diff --git a/src/cgutils.cpp b/src/cgutils.cpp
index 268c82579b3b7..b9299caa595b0 100644
--- a/src/cgutils.cpp
+++ b/src/cgutils.cpp
@@ -1105,24 +1105,29 @@ static void emit_leafcheck(jl_codectx_t &ctx, Value *typ, const std::string &msg
 }
 
 #define CHECK_BOUNDS 1
-static bool bounds_check_enabled(jl_codectx_t &ctx) {
+static bool bounds_check_enabled(jl_codectx_t &ctx, jl_value_t *inbounds) {
 #if CHECK_BOUNDS==1
-    return (!ctx.is_inbounds &&
-         jl_options.check_bounds != JL_OPTIONS_CHECK_BOUNDS_OFF) ||
-         jl_options.check_bounds == JL_OPTIONS_CHECK_BOUNDS_ON;
+    if (jl_options.check_bounds == JL_OPTIONS_CHECK_BOUNDS_ON)
+        return 1;
+    if (jl_options.check_bounds == JL_OPTIONS_CHECK_BOUNDS_OFF)
+        return 0;
+    if (inbounds == jl_false)
+        return 0;
+    return 1;
 #else
     return 0;
 #endif
 }
 
-static Value *emit_bounds_check(jl_codectx_t &ctx, const jl_cgval_t &ainfo, jl_value_t *ty, Value *i, Value *len)
+static Value *emit_bounds_check(jl_codectx_t &ctx, const jl_cgval_t &ainfo, jl_value_t *ty, Value *i, Value *len, jl_value_t *boundscheck)
 {
     Value *im1 = ctx.builder.CreateSub(i, ConstantInt::get(T_size, 1));
+    jl_cgval_t ib = emit_expr(ctx, boundscheck);
 #if CHECK_BOUNDS==1
-    if (bounds_check_enabled(ctx)) {
+    if (bounds_check_enabled(ctx, ib.constant)) {
         Value *ok = ctx.builder.CreateICmpULT(im1, len);
-        BasicBlock *failBB = BasicBlock::Create(jl_LLVMContext,"fail",ctx.f);
-        BasicBlock *passBB = BasicBlock::Create(jl_LLVMContext,"pass");
+        BasicBlock *failBB = BasicBlock::Create(jl_LLVMContext, "fail", ctx.f);
+        BasicBlock *passBB = BasicBlock::Create(jl_LLVMContext, "pass");
         ctx.builder.CreateCondBr(ok, passBB, failBB);
         ctx.builder.SetInsertPoint(failBB);
         if (!ty) { // jl_value_t** tuple (e.g. the vararg)
@@ -1351,12 +1356,12 @@ static Value *data_pointer(jl_codectx_t &ctx, const jl_cgval_t &x, Type *astype
 
 static bool emit_getfield_unknownidx(jl_codectx_t &ctx,
         jl_cgval_t *ret, const jl_cgval_t &strct,
-        Value *idx, jl_datatype_t *stt)
+        Value *idx, jl_datatype_t *stt, jl_value_t *inbounds)
 {
     size_t nfields = jl_datatype_nfields(stt);
     if (strct.ispointer()) { // boxed or stack
         if (is_datatype_all_pointers(stt)) {
-            idx = emit_bounds_check(ctx, strct, (jl_value_t*)stt, idx, ConstantInt::get(T_size, nfields));
+            idx = emit_bounds_check(ctx, strct, (jl_value_t*)stt, idx, ConstantInt::get(T_size, nfields), inbounds);
             bool maybe_null = (unsigned)stt->ninitialized != nfields;
             size_t minimum_field_size = (size_t)-1;
             for (size_t i = 0; i < nfields; ++i) {
@@ -1379,7 +1384,7 @@ static bool emit_getfield_unknownidx(jl_codectx_t &ctx,
         else if (is_tupletype_homogeneous(stt->types)) {
             assert(nfields > 0); // nf == 0 trapped by all_pointers case
             jl_value_t *jt = jl_field_type(stt, 0);
-            idx = emit_bounds_check(ctx, strct, (jl_value_t*)stt, idx, ConstantInt::get(T_size, nfields));
+            idx = emit_bounds_check(ctx, strct, (jl_value_t*)stt, idx, ConstantInt::get(T_size, nfields), inbounds);
             Value *ptr = data_pointer(ctx, strct);
             if (!stt->mutabl) {
                 // just compute the pointer and let user load it when necessary
@@ -1403,14 +1408,15 @@ static bool emit_getfield_unknownidx(jl_codectx_t &ctx,
     else if (is_tupletype_homogeneous(stt->types)) {
         assert(jl_isbits(stt));
         if (nfields == 0) {
-            idx = emit_bounds_check(ctx, ghostValue(stt),
-                                    (jl_value_t*)stt, idx, ConstantInt::get(T_size, nfields));
+            idx = emit_bounds_check(
+                    ctx, ghostValue(stt), (jl_value_t*)stt,
+                    idx, ConstantInt::get(T_size, nfields), inbounds);
             *ret = jl_cgval_t();
             return true;
         }
         assert(!jl_field_isptr(stt, 0));
         jl_value_t *jt = jl_field_type(stt, 0);
-        Value *idx0 = emit_bounds_check(ctx, strct, (jl_value_t*)stt, idx, ConstantInt::get(T_size, nfields));
+        Value *idx0 = emit_bounds_check(ctx, strct, (jl_value_t*)stt, idx, ConstantInt::get(T_size, nfields), inbounds);
         if (strct.isghost) {
             *ret = ghostValue(jt);
             return true;
@@ -1696,23 +1702,23 @@ static Value *emit_arraysize_for_unsafe_dim(jl_codectx_t &ctx,
 }
 
 // `nd == -1` means the dimension is unknown.
-static Value *emit_array_nd_index(jl_codectx_t &ctx,
-        const jl_cgval_t &ainfo, jl_value_t *ex, ssize_t nd, jl_value_t **args, size_t nidxs)
+static Value *emit_array_nd_index(
+        jl_codectx_t &ctx, const jl_cgval_t &ainfo, jl_value_t *ex, ssize_t nd,
+        jl_value_t **args, size_t nidxs, jl_value_t *inbounds)
 {
     Value *a = boxed(ctx, ainfo);
     Value *i = ConstantInt::get(T_size, 0);
     Value *stride = ConstantInt::get(T_size, 1);
+    jl_cgval_t ib = emit_expr(ctx, inbounds);
 #if CHECK_BOUNDS==1
-    bool bc = (!ctx.is_inbounds &&
-               jl_options.check_bounds != JL_OPTIONS_CHECK_BOUNDS_OFF) ||
-        jl_options.check_bounds == JL_OPTIONS_CHECK_BOUNDS_ON;
-    BasicBlock *failBB=NULL, *endBB=NULL;
+    bool bc = bounds_check_enabled(ctx, ib.constant);
+    BasicBlock *failBB = NULL, *endBB = NULL;
     if (bc) {
         failBB = BasicBlock::Create(jl_LLVMContext, "oob");
         endBB = BasicBlock::Create(jl_LLVMContext, "idxend");
     }
 #endif
-    Value **idxs = (Value**)alloca(sizeof(Value*)*nidxs);
+    Value **idxs = (Value**)alloca(sizeof(Value*) * nidxs);
     for (size_t k = 0; k < nidxs; k++) {
         idxs[k] = emit_unbox(ctx, T_size, emit_expr(ctx, args[k]), NULL);
     }
diff --git a/src/codegen.cpp b/src/codegen.cpp
index a0c701acbfe6f..8c8187b101442 100644
--- a/src/codegen.cpp
+++ b/src/codegen.cpp
@@ -562,8 +562,6 @@ class jl_codectx_t {
     Value *world_age_field = NULL;
 
     bool debug_enabled = false;
-    bool is_inbounds = false;
-
     const jl_cgparams_t *params = NULL;
 
     jl_codectx_t(LLVMContext &llvmctx)
@@ -1950,7 +1948,7 @@ static jl_value_t *static_eval(jl_codectx_t &ctx, jl_value_t *ex, int sparams=tr
         if (e->head == call_sym) {
             jl_value_t *f = static_eval(ctx, jl_exprarg(e, 0), sparams, allow_alloc);
             if (f) {
-                if (jl_array_dim0(e->args) == 3 && f==jl_builtin_getfield) {
+                if (jl_array_dim0(e->args) == 3 && f == jl_builtin_getfield) {
                     m = (jl_module_t*)static_eval(ctx, jl_exprarg(e, 1), sparams, allow_alloc);
                     // Check the tag before evaluating `s` so that a value of random
                     // type won't be corrupted.
@@ -2113,7 +2111,7 @@ static void simple_escape_analysis(jl_codectx_t &ctx, jl_value_t *expr, bool esc
                         esc = false;
                     }
                     else {
-                        if ((fv == jl_builtin_getfield && alen == 3 &&
+                        if ((fv == jl_builtin_getfield && (alen == 3 || alen == 4) &&
                              expr_type(ctx, jl_exprarg(e, 2)) == (jl_value_t*)jl_long_type) ||
                             fv == jl_builtin_nfields ||
                             (fv == jl_builtin__apply && alen == 3)) {
@@ -2595,10 +2593,10 @@ static bool emit_builtin_call(jl_codectx_t &ctx, jl_cgval_t *ret, jl_value_t *f,
         }
     }
 
-    else if (f==jl_builtin_arrayref && nargs>=2) {
-        jl_value_t *aty = expr_type(ctx, args[1]); rt1 = aty;
+    else if (f == jl_builtin_arrayref && nargs >= 3) {
+        jl_value_t *aty = expr_type(ctx, args[2]); rt1 = aty;
         bool indexes_ok = true;
-        for (size_t i=2; i <= nargs; i++) {
+        for (size_t i = 3; i <= nargs; i++) {
             if (expr_type(ctx, args[i]) != (jl_value_t*)jl_long_type) {
                 indexes_ok = false;
                 break;
@@ -2611,19 +2609,20 @@ static bool emit_builtin_call(jl_codectx_t &ctx, jl_cgval_t *ret, jl_value_t *f,
                 if (!jl_array_store_unboxed(ety))
                     ety = (jl_value_t*)jl_any_type;
                 jl_value_t *ndp = jl_tparam1(aty_dt);
-                if (jl_is_long(ndp) || nargs==2) {
-                    jl_cgval_t ary = emit_expr(ctx, args[1]);
+                if (jl_is_long(ndp) || nargs == 3) {
+                    jl_cgval_t ary = emit_expr(ctx, args[2]);
                     ssize_t nd = jl_is_long(ndp) ? jl_unbox_long(ndp) : -1;
-                    Value *idx = emit_array_nd_index(ctx, ary, args[1], nd, &args[2], nargs - 1);
-                    if (jl_array_store_unboxed(ety) &&
-                        jl_datatype_size(ety) == 0) {
+                    Value *idx = emit_array_nd_index(ctx, ary, args[2], nd, &args[3], nargs - 2, args[1]);
+                    if (jl_array_store_unboxed(ety) && jl_datatype_size(ety) == 0) {
                         assert(jl_is_datatype(ety));
                         assert(((jl_datatype_t*)ety)->instance != NULL);
                         *ret = ghostValue(ety);
                     }
                     else {
-                        *ret = typed_load(ctx, emit_arrayptr(ctx, ary, args[1]), idx, ety,
-                            jl_array_store_unboxed(ety) ? tbaa_arraybuf : tbaa_ptrarraybuf);
+                        *ret = typed_load(ctx,
+                                emit_arrayptr(ctx, ary, args[2]),
+                                idx, ety,
+                                jl_array_store_unboxed(ety) ? tbaa_arraybuf : tbaa_ptrarraybuf);
                     }
                     JL_GC_POP();
                     return true;
@@ -2632,11 +2631,11 @@ static bool emit_builtin_call(jl_codectx_t &ctx, jl_cgval_t *ret, jl_value_t *f,
         }
     }
 
-    else if (f==jl_builtin_arrayset && nargs>=3) {
-        jl_value_t *aty = expr_type(ctx, args[1]); rt1 = aty;
-        jl_value_t *vty = expr_type(ctx, args[2]); rt2 = vty;
+    else if (f == jl_builtin_arrayset && nargs >= 4) {
+        jl_value_t *aty = expr_type(ctx, args[2]); rt1 = aty;
+        jl_value_t *vty = expr_type(ctx, args[3]); rt2 = vty;
         bool indexes_ok = true;
-        for (size_t i=3; i <= nargs; i++) {
+        for (size_t i = 4; i <= nargs; i++) {
             if (expr_type(ctx, args[i]) != (jl_value_t*)jl_long_type) {
                 indexes_ok = false;
                 break;
@@ -2650,17 +2649,17 @@ static bool emit_builtin_call(jl_codectx_t &ctx, jl_cgval_t *ret, jl_value_t *f,
                 if (isboxed)
                     ety = (jl_value_t*)jl_any_type;
                 jl_value_t *ndp = jl_tparam1(aty_dt);
-                if (jl_is_long(ndp) || nargs==3) {
-                    jl_cgval_t ary = emit_expr(ctx, args[1]);
+                if (jl_is_long(ndp) || nargs == 4) {
+                    jl_cgval_t ary = emit_expr(ctx, args[2]);
                     ssize_t nd = jl_is_long(ndp) ? jl_unbox_long(ndp) : -1;
-                    Value *idx = emit_array_nd_index(ctx, ary, args[1], nd, &args[3], nargs - 2);
+                    Value *idx = emit_array_nd_index(ctx, ary, args[2], nd, &args[4], nargs - 3, args[1]);
                     if (!isboxed && jl_datatype_size(ety) == 0) {
                         // no-op, but emit expr for possible effects
                         assert(jl_is_datatype(ety));
-                        emit_expr(ctx, args[2]);
+                        emit_expr(ctx, args[3]);
                     }
                     else {
-                        jl_cgval_t v = emit_expr(ctx, args[2]);
+                        jl_cgval_t v = emit_expr(ctx, args[3]);
                         PHINode *data_owner = NULL; // owner object against which the write barrier must check
                         if (isboxed) { // if not boxed we don't need a write barrier
                             assert(ary.isboxed);
@@ -2696,7 +2695,7 @@ static bool emit_builtin_call(jl_codectx_t &ctx, jl_cgval_t *ret, jl_value_t *f,
                             data_owner->addIncoming(own_ptr, ownedBB);
                         }
                         typed_store(ctx,
-                                    emit_arrayptr(ctx, ary, args[1], isboxed),
+                                    emit_arrayptr(ctx, ary, args[2], isboxed),
                                     idx, v, ety,
                                     !isboxed ? tbaa_arraybuf : tbaa_ptrarraybuf,
                                     data_owner, 0,
@@ -2710,7 +2709,7 @@ static bool emit_builtin_call(jl_codectx_t &ctx, jl_cgval_t *ret, jl_value_t *f,
         }
     }
 
-    else if (f==jl_builtin_getfield && nargs==2) {
+    else if (f == jl_builtin_getfield && (nargs == 2 || nargs == 3)) {
         if (jl_is_quotenode(args[2]) && jl_is_symbol(jl_fieldref(args[2], 0))) {
             *ret = emit_getfield(ctx, args[1],
                                  (jl_sym_t*)jl_fieldref(args[2], 0));
@@ -2728,7 +2727,8 @@ static bool emit_builtin_call(jl_codectx_t &ctx, jl_cgval_t *ret, jl_value_t *f,
             idx = emit_bounds_check(
                     ctx,
                     jl_cgval_t(ctx.builder.CreateGEP(ctx.argArray, ConstantInt::get(T_size, ctx.nReqArgs)), NULL, false, NULL, NULL),
-                    NULL, idx, valen);
+                    NULL, idx, valen,
+                    nargs == 3 ? args[3] : jl_true);
             idx = ctx.builder.CreateAdd(idx, ConstantInt::get(T_size, ctx.nReqArgs));
             *ret = mark_julia_type(
                     ctx,
@@ -2756,7 +2756,8 @@ static bool emit_builtin_call(jl_codectx_t &ctx, jl_cgval_t *ret, jl_value_t *f,
                 else {
                     // unknown index
                     Value *vidx = emit_unbox(ctx, T_size, emit_expr(ctx, args[2]), (jl_value_t*)jl_long_type);
-                    if (emit_getfield_unknownidx(ctx, ret, strct, vidx, (jl_datatype_t*)utt)) {
+                    jl_value_t *boundscheck = (nargs == 3 ? args[3] : jl_true);
+                    if (emit_getfield_unknownidx(ctx, ret, strct, vidx, (jl_datatype_t*)utt, boundscheck)) {
                         if (ret->typ == (jl_value_t*)jl_any_type) // improve the type, if known from the expr
                             ret->typ = expr_type(ctx, expr);
                         JL_GC_POP();
@@ -2779,11 +2780,14 @@ static bool emit_builtin_call(jl_codectx_t &ctx, jl_cgval_t *ret, jl_value_t *f,
                     Value *vidx = emit_unbox(ctx, T_size, emit_expr(ctx, args[2]), (jl_value_t*)jl_long_type);
                     // This is not necessary for correctness, but allows to omit
                     // the extra code for getting the length of the tuple
-                    if (!bounds_check_enabled(ctx)) {
+                    jl_cgval_t boundscheck = emit_expr(ctx, nargs == 3 ? args[3] : jl_true);
+                    if (!bounds_check_enabled(ctx, boundscheck.constant)) {
                         vidx = ctx.builder.CreateSub(vidx, ConstantInt::get(T_size, 1));
                     } else {
-                        vidx = emit_bounds_check(ctx, strct, (jl_value_t*)stt, vidx,
-                            emit_datatype_nfields(ctx, emit_typeof_boxed(ctx, strct)));
+                        vidx = emit_bounds_check(ctx,
+                                strct, (jl_value_t*)stt, vidx,
+                                emit_datatype_nfields(ctx, emit_typeof_boxed(ctx, strct)),
+                                jl_true);
                     }
                     Value *ptr = data_pointer(ctx, strct);
                     *ret = typed_load(ctx, ptr, vidx, jt, strct.tbaa, false);
@@ -2865,7 +2869,7 @@ static bool emit_builtin_call(jl_codectx_t &ctx, jl_cgval_t *ret, jl_value_t *f,
         }
     }
 
-    else if (f==jl_builtin_fieldtype && nargs==2) {
+    else if (f == jl_builtin_fieldtype && (nargs == 2 || nargs == 3)) {
         jl_datatype_t *sty = (jl_datatype_t*)expr_type(ctx, args[1]);
         rt1 = (jl_value_t*)sty;
         if (jl_is_type_type((jl_value_t*)sty) || sty == jl_datatype_type) {
@@ -2877,8 +2881,11 @@ static bool emit_builtin_call(jl_codectx_t &ctx, jl_cgval_t *ret, jl_value_t *f,
                 Value *types_svec = emit_datatype_types(ctx, tyv);
                 Value *types_len = emit_datatype_nfields(ctx, tyv);
                 Value *idx = emit_unbox(ctx, T_size, emit_expr(ctx, args[2]), (jl_value_t*)jl_long_type);
-                emit_bounds_check(ctx, ty, (jl_value_t*)jl_datatype_type, idx, types_len);
-                Value *fieldtyp = tbaa_decorate(tbaa_const, ctx.builder.CreateLoad(ctx.builder.CreateGEP(decay_derived(emit_bitcast(ctx, types_svec, T_pprjlvalue)), idx)));
+                jl_value_t *boundscheck = (nargs == 3 ? args[3] : jl_true);
+                emit_bounds_check(ctx, ty, (jl_value_t*)jl_datatype_type, idx, types_len, boundscheck);
+                Value *fieldtyp = tbaa_decorate(tbaa_const,
+                        ctx.builder.CreateLoad(ctx.builder.CreateGEP(decay_derived(
+                                    emit_bitcast(ctx, types_svec, T_pprjlvalue)), idx)));
                 *ret = mark_julia_type(ctx, fieldtyp, true, expr_type(ctx, expr));
                 JL_GC_POP();
                 return true;
@@ -3870,8 +3877,7 @@ static void emit_stmtpos(jl_codectx_t &ctx, jl_value_t *expr)
     jl_expr_t *ex = (jl_expr_t*)expr;
     jl_value_t **args = (jl_value_t**)jl_array_data(ex->args);
     jl_sym_t *head = ex->head;
-    if (head == line_sym || head == meta_sym || head == boundscheck_sym ||
-        head == inbounds_sym) {
+    if (head == line_sym || head == meta_sym || head == inbounds_sym) {
         // some expression types are metadata and can be ignored
         // in statement position
         return;
@@ -4100,7 +4106,7 @@ static jl_cgval_t emit_expr(jl_codectx_t &ctx, jl_value_t *expr)
         jl_error("Expr(:inbounds) in value position");
     }
     else if (head == boundscheck_sym) {
-        jl_error("Expr(:boundscheck) in value position");
+        return mark_julia_const(bounds_check_enabled(ctx, jl_true) ? jl_true : jl_false);
     }
     else {
         if (!strcmp(jl_symbol_name(head), "$"))
@@ -5522,24 +5528,14 @@ static std::unique_ptr<Module> emit_function(
         DebugLoc loc;
         StringRef file;
         ssize_t line;
-        bool is_inbounds;
         bool loc_changed;
         bool is_poploc;
         bool in_user_code;
     };
     std::vector<StmtProp> stmtprops(stmtslen);
     std::vector<DbgState> DI_stack;
-    std::vector<bool> inbounds_stack{false};
-    auto is_inbounds = [&] () {
-        // inbounds rule is either of top two values on inbounds stack are true
-        size_t sz = inbounds_stack.size();
-        bool inbounds = sz && inbounds_stack.back();
-        if (sz > 1)
-            inbounds |= inbounds_stack[sz - 2];
-        return inbounds;
-    };
     StmtProp cur_prop{topdebugloc, filename, toplineno,
-            false, true, false, false};
+            true, false, false};
     ctx.line = &cur_prop.line;
     if (coverage_mode != JL_LOG_NONE || malloc_log_mode) {
         cur_prop.in_user_code = (!jl_is_submodule(ctx.module, jl_base_module) &&
@@ -5646,29 +5642,9 @@ static std::unique_ptr<Module> emit_function(
                 cur_prop.loc_changed = true;
             }
         }
-        if (expr) {
-            jl_value_t **args = (jl_value_t**)jl_array_data(expr->args);
-            if (expr->head == inbounds_sym) {
-                // manipulate inbounds stack
-                if (jl_array_len(expr->args) > 0) {
-                    jl_value_t *arg = args[0];
-                    if (arg == jl_true) {
-                        inbounds_stack.push_back(true);
-                    }
-                    else if (arg == jl_false) {
-                        inbounds_stack.push_back(false);
-                    }
-                    else if (!inbounds_stack.empty()) {
-                        inbounds_stack.pop_back();
-                    }
-                }
-            }
-        }
-        cur_prop.is_inbounds = is_inbounds();
         stmtprops[i] = cur_prop;
     }
     DI_stack.clear();
-    inbounds_stack.clear();
 
     // step 12. Do codegen in control flow order
     std::vector<std::pair<int,BasicBlock*>> workstack;
@@ -5771,7 +5747,6 @@ static std::unique_ptr<Module> emit_function(
             !props.is_poploc) {
             coverageVisitLine(ctx, props.file, props.line);
         }
-        ctx.is_inbounds = props.is_inbounds;
         jl_value_t *stmt = jl_array_ptr_ref(stmts, cursor);
         jl_expr_t *expr = jl_is_expr(stmt) ? (jl_expr_t*)stmt : nullptr;
         if (jl_is_labelnode(stmt)) {
diff --git a/src/common_symbols1.inc b/src/common_symbols1.inc
index 013dfdd15dc06..09b1cf5c6adb5 100644
--- a/src/common_symbols1.inc
+++ b/src/common_symbols1.inc
@@ -14,7 +14,6 @@ jl_symbol("getindex"),
 jl_symbol("new"),
 jl_symbol("arrayref"),
 jl_symbol("static_parameter"),
-jl_symbol("abstractarray.jl"),
 jl_symbol("slt_int"),
 jl_symbol("convert"),
 jl_symbol("start"),
diff --git a/src/common_symbols2.inc b/src/common_symbols2.inc
index 4554dfef79b5b..0a1d0eb73cbea 100644
--- a/src/common_symbols2.inc
+++ b/src/common_symbols2.inc
@@ -249,4 +249,4 @@ jl_symbol("NF"),
 jl_symbol("isvarargtype"),
 jl_symbol("n"),
 jl_symbol("inferred"),
-jl_symbol("eachindex"),
+jl_symbol("abstractarray.jl"),
diff --git a/src/dump.c b/src/dump.c
index e25b7f5b110f7..627346a9a0dc7 100644
--- a/src/dump.c
+++ b/src/dump.c
@@ -2672,7 +2672,7 @@ void jl_init_serializer(void)
 
                      jl_emptysvec, jl_emptytuple, jl_false, jl_true, jl_nothing, jl_any_type,
                      call_sym, invoke_sym, goto_ifnot_sym, return_sym, body_sym, line_sym,
-                     lambda_sym, jl_symbol("tuple"), assign_sym, isdefined_sym,
+                     lambda_sym, jl_symbol("tuple"), assign_sym, isdefined_sym, boundscheck_sym,
 
                      // empirical list of very common symbols
                      #include "common_symbols1.inc"
diff --git a/src/interpreter.c b/src/interpreter.c
index 19f4c576e00f1..e3cc61c22f5ee 100644
--- a/src/interpreter.c
+++ b/src/interpreter.c
@@ -497,12 +497,15 @@ static jl_value_t *eval(jl_value_t *e, interpreter_state *s)
             jl_errorf("syntax: %s", jl_string_data(args[0]));
         jl_throw(args[0]);
     }
+    else if (ex->head == boundscheck_sym) {
+        return jl_true;
+    }
     else if (ex->head == boundscheck_sym || ex->head == inbounds_sym || ex->head == fastmath_sym ||
              ex->head == simdloop_sym || ex->head == meta_sym) {
         return jl_nothing;
     }
     jl_errorf("unsupported or misplaced expression %s", jl_symbol_name(ex->head));
-    return (jl_value_t*)jl_nothing;
+    abort();
 }
 
 jl_value_t *jl_toplevel_eval_body(jl_module_t *m, jl_array_t *stmts)
diff --git a/src/julia-syntax.scm b/src/julia-syntax.scm
index b77fdb6ed096d..11cce9153005b 100644
--- a/src/julia-syntax.scm
+++ b/src/julia-syntax.scm
@@ -498,10 +498,10 @@
                  (block
                   ;; ii = i*2 - 1
                   (= ,ii (call (top -) (call (top *) ,i 2) 1))
-                  (= ,elt (call (core arrayref) ,kw ,ii))
+                  (= ,elt (call (core arrayref) true ,kw ,ii))
                   ,(foldl (lambda (kvf else)
                             (let* ((k     (car kvf))
-                                   (rval0 `(call (core arrayref) ,kw
+                                   (rval0 `(call (core arrayref) true ,kw
                                                  (call (top +) ,ii 1)))
                                    ;; note: if the "declared" type of a KW arg
                                    ;; includes something from keyword-sparams
@@ -545,7 +545,7 @@
                               `(foreigncall 'jl_array_ptr_1d_push (core Void) (call (core svec) Any Any)
                                             'ccall 2
                                             ,rkw (tuple ,elt
-                                                        (call (core arrayref) ,kw
+                                                        (call (core arrayref) true ,kw
                                                               (call (top +) ,ii 1)))))
                           (map list vars vals flags))))
             ;; set keywords that weren't present to their default values
@@ -3619,6 +3619,7 @@ f(x) = yt(x)
             ((implicit-global) #f)
             ((const) (emit e))
             ((isdefined) (if tail (emit-return e) e))
+            ((boundscheck) (if tail (emit-return e) e))
 
             ;; top level expressions returning values
             ((abstract_type bits_type composite_type thunk toplevel module)
diff --git a/src/julia_internal.h b/src/julia_internal.h
index 5066922872d2e..127f7451f159f 100644
--- a/src/julia_internal.h
+++ b/src/julia_internal.h
@@ -997,7 +997,9 @@ extern jl_sym_t *meta_sym; extern jl_sym_t *list_sym;
 extern jl_sym_t *inert_sym; extern jl_sym_t *static_parameter_sym;
 extern jl_sym_t *polly_sym; extern jl_sym_t *inline_sym;
 extern jl_sym_t *propagate_inbounds_sym;
-extern jl_sym_t *isdefined_sym; extern jl_sym_t *nospecialize_sym;
+extern jl_sym_t *isdefined_sym;
+extern jl_sym_t *nospecialize_sym;
+extern jl_sym_t *boundscheck_sym;
 
 void jl_register_fptrs(uint64_t sysimage_base, const char *base, const int32_t *offsets,
                        jl_method_instance_t **linfos, size_t n);
diff --git a/test/boundscheck_exec.jl b/test/boundscheck_exec.jl
index df2deea73cb51..d94cd5e95e915 100644
--- a/test/boundscheck_exec.jl
+++ b/test/boundscheck_exec.jl
@@ -27,16 +27,20 @@ function A1_inbounds()
     end
     return r
 end
+A1_wrap() = @inbounds return A1_inbounds()
 
 if bc_opt == bc_default
     @test A1() == 1
-    @test A1_inbounds() == 0
+    @test A1_inbounds() == 1
+    @test A1_wrap() == 0
 elseif bc_opt == bc_on
     @test A1() == 1
     @test A1_inbounds() == 1
+    @test A1_wrap() == 1
 else
     @test A1() == 0
     @test A1_inbounds() == 0
+    @test A1_wrap() == 0
 end
 
 # test for boundscheck block eliminated one layer deep, if the called method is inlined
@@ -124,25 +128,31 @@ end
 # elide a throw
 cb(x) = x > 0 || throw(BoundsError())
 
-function B1()
-    y = [1,2,3]
+@inline function B1()
+    y = [1, 2, 3]
     @inbounds begin
         @boundscheck cb(0)
     end
     return 0
 end
+B1_wrap() = @inbounds return B1()
 
-if bc_opt == bc_default || bc_opt == bc_off
+if bc_opt == bc_default
+    @test_throws BoundsError B1()
+    @test B1_wrap() == 0
+elseif bc_opt == bc_off
     @test B1() == 0
+    @test B1_wrap() == 0
 else
     @test_throws BoundsError B1()
+    @test_throws BoundsError B1_wrap()
 end
 
 # elide a simple branch
 cond(x) = x > 0 ? x : -x
 
 function B2()
-    y = [1,2,3]
+    y = [1, 2, 3]
     @inbounds begin
         @boundscheck cond(0)
     end
diff --git a/test/core.jl b/test/core.jl
index 6185f83acafef..f30104d215f54 100644
--- a/test/core.jl
+++ b/test/core.jl
@@ -3776,7 +3776,7 @@ end
 # `TypeVar`) without crashing
 let
     function arrayset_unknown_dim{T}(::Type{T}, n)
-        Base.arrayset(reshape(Vector{T}(1), ones(Int, n)...), 2, 1)
+        Core.arrayset(true, reshape(Vector{T}(1), ones(Int, n)...), 2, 1)
     end
     arrayset_unknown_dim(Any, 1)
     arrayset_unknown_dim(Any, 2)
@@ -3968,16 +3968,13 @@ end
 
 function metadata_matches(ast::CodeInfo)
     inbounds_cnt = Ref(0)
-    boundscheck_cnt = Ref(0)
     for ex in ast.code::Array{Any,1}
         if isa(ex, Expr)
             ex = ex::Expr
             count_expr_push(ex, :inbounds, inbounds_cnt)
-            count_expr_push(ex, :boundscheck, boundscheck_cnt)
         end
     end
     @test inbounds_cnt[] == 0
-    @test boundscheck_cnt[] == 0
 end
 
 function test_metadata_matches(@nospecialize(f), @nospecialize(tt))
@@ -3993,14 +3990,9 @@ function f2()
     end
 end
 # No, don't write code this way...
-@eval function f3()
-    a = $(Expr(:boundscheck, true))
-    return 1
-    b = $(Expr(:boundscheck, :pop))
-end
 @noinline function g(a)
 end
-@eval function f4()
+@eval function f3()
     g($(Expr(:inbounds, true)))
     @goto out
     g($(Expr(:inbounds, :pop)))
@@ -4010,7 +4002,6 @@ end
 test_metadata_matches(f1, Tuple{})
 test_metadata_matches(f2, Tuple{})
 test_metadata_matches(f3, Tuple{})
-test_metadata_matches(f4, Tuple{})
 
 end