diff --git a/accelerate-llvm-native/src/Data/Array/Accelerate/LLVM/Native/CodeGen.hs b/accelerate-llvm-native/src/Data/Array/Accelerate/LLVM/Native/CodeGen.hs
index 8bea0a8a7..80f38af77 100644
--- a/accelerate-llvm-native/src/Data/Array/Accelerate/LLVM/Native/CodeGen.hs
+++ b/accelerate-llvm-native/src/Data/Array/Accelerate/LLVM/Native/CodeGen.hs
@@ -106,7 +106,10 @@ codegen name env (Clustered c b) args =
     -- body acc loopsize'
     acc' <- operandsMapToPairs acc $ \(accTypeR, toOp, fromOp) -> fmap fromOp $ flip execStateT (toOp acc) $ case loopsize of
       LS loopshr loopsh -> 
-        workstealChunked loopshr workstealIndex workstealActiveThreads (flipShape loopshr loopsh) accTypeR (body loopshr toOp fromOp)
+        workstealChunked loopshr workstealIndex workstealActiveThreads (flipShape loopshr loopsh) accTypeR 
+          (body loopshr toOp fromOp, -- the LoopWork
+          StateT $ \op -> second toOp <$> runStateT (foo (liftInt 0) []) (fromOp op)) -- the action to run after the outer loop
+    -- acc'' <- flip execStateT acc' $ foo (liftInt 0) []
     pure ()
     where
       ba = makeBackendArg @NativeOp args gamma c b
@@ -114,7 +117,6 @@ codegen name env (Clustered c b) args =
       body :: ShapeR sh -> (Accumulated -> a) -> (a -> Accumulated) -> LoopWork sh (StateT a (CodeGen Native))
       body ShapeRz _ _ = LoopWorkZ
       body (ShapeRsnoc shr) toOp fromOp = LoopWorkSnoc (body shr toOp fromOp) (\i is -> StateT $ \op -> second toOp <$> runStateT (foo i is) (fromOp op))
-        where
       foo :: Operands Int -> [Operands Int] -> StateT Accumulated (CodeGen Native) ()
       foo linix ixs = do
         let d = length ixs -- TODO check: this or its inverse (i.e. totalDepth - length ixs)?
diff --git a/accelerate-llvm-native/src/Data/Array/Accelerate/LLVM/Native/CodeGen/Loop.hs b/accelerate-llvm-native/src/Data/Array/Accelerate/LLVM/Native/CodeGen/Loop.hs
index 3e1ff44fd..76b143d51 100644
--- a/accelerate-llvm-native/src/Data/Array/Accelerate/LLVM/Native/CodeGen/Loop.hs
+++ b/accelerate-llvm-native/src/Data/Array/Accelerate/LLVM/Native/CodeGen/Loop.hs
@@ -80,10 +80,12 @@ imapNestFromTo shr start end extent body =
       $ \sz      -> imapFromTo ssz esz
       $ \i       -> k (OP_Pair sz i)
 
-loopWorkFromTo :: ShapeR sh -> Operands sh -> Operands sh -> Operands sh -> TypeR s -> LoopWork sh (StateT (Operands s) (CodeGen Native)) -> StateT (Operands s) (CodeGen Native) ()
-loopWorkFromTo shr start end extent tys loopwork = do
+loopWorkFromTo :: ShapeR sh -> Operands sh -> Operands sh -> Operands sh -> TypeR s -> (LoopWork sh (StateT (Operands s) (CodeGen Native)),StateT (Operands s) (CodeGen Native) ()) -> StateT (Operands s) (CodeGen Native) ()
+loopWorkFromTo shr start end extent tys (loopwork,finish) = do
   linix <- lift (intOfIndex shr extent start)
   loopWorkFromTo' shr start end extent linix [] tys loopwork
+  finish
+
 
 loopWorkFromTo' :: ShapeR sh -> Operands sh -> Operands sh -> Operands sh -> Operands Int -> [Operands Int] -> TypeR s -> LoopWork sh (StateT (Operands s) (CodeGen Native)) -> StateT (Operands s) (CodeGen Native) ()
 loopWorkFromTo' ShapeRz OP_Unit OP_Unit OP_Unit _ _ _ LoopWorkZ = pure ()
@@ -272,7 +274,7 @@ workstealLoop counter activeThreads size doWork = do
   -- lift $ setBlock dummy -- without this, the previous block always returns True for some reason
   
 
-workstealChunked :: ShapeR sh -> Operand (Ptr Int32) -> Operand (Ptr Int32) -> Operands sh -> TypeR s -> LoopWork sh (StateT (Operands s) (CodeGen Native)) -> StateT (Operands s) (CodeGen Native) ()
+workstealChunked :: ShapeR sh -> Operand (Ptr Int32) -> Operand (Ptr Int32) -> Operands sh -> TypeR s -> (LoopWork sh (StateT (Operands s) (CodeGen Native)), StateT (Operands s) (CodeGen Native) ()) -> StateT (Operands s) (CodeGen Native) ()
 workstealChunked shr counter activeThreads sh tys loopwork = do
   let chunkSz = chunkSize' shr sh
   chunkCounts <- lift $ chunkCount shr sh chunkSz
diff --git a/accelerate-llvm-native/src/Data/Array/Accelerate/LLVM/Native/Compile.hs b/accelerate-llvm-native/src/Data/Array/Accelerate/LLVM/Native/Compile.hs
index 5def25766..8706f0cef 100644
--- a/accelerate-llvm-native/src/Data/Array/Accelerate/LLVM/Native/Compile.hs
+++ b/accelerate-llvm-native/src/Data/Array/Accelerate/LLVM/Native/Compile.hs
@@ -93,7 +93,7 @@ compile uid name module' = do
         withNativeTargetMachine      $ \machine ->
         withTargetLibraryInfo triple $ \libinfo -> do
           -- dump llvm
-          -- hPutStrLn stderr . T.unpack . decodeUtf8 =<< moduleLLVMAssembly mdl
+          hPutStrLn stderr . T.unpack . decodeUtf8 =<< moduleLLVMAssembly mdl
           
           optimiseModule datalayout (Just machine) (Just libinfo) mdl
 
diff --git a/accelerate-llvm-native/src/Data/Array/Accelerate/LLVM/Native/Operation.hs b/accelerate-llvm-native/src/Data/Array/Accelerate/LLVM/Native/Operation.hs
index 65b51e191..2ef62bb3d 100644
--- a/accelerate-llvm-native/src/Data/Array/Accelerate/LLVM/Native/Operation.hs
+++ b/accelerate-llvm-native/src/Data/Array/Accelerate/LLVM/Native/Operation.hs
@@ -288,11 +288,9 @@ instance MakesILP NativeOp where
 
   encodeBackendClusterArg (BCAN i) = intHost $(hashQ ("BCAN" :: String)) <> intHost i
 
-inputConstraints :: HasCallStack => Label -> Labels -> Constraint NativeOp
+inputConstraints :: Label -> Labels -> Constraint NativeOp
 inputConstraints l = foldMap $ \lIn -> 
-                timesN (fused lIn l) .>=. ILP.c (InDir  l) .-. ILP.c (OutDir  lIn)
-    <> (-1) .*. timesN (fused lIn l) .<=. ILP.c (InDir  l) .-. ILP.c (OutDir  lIn)
-    <>          timesN (fused lIn l) .>=. ILP.c (InDims l) .-. ILP.c (OutDims lIn)
+                timesN (fused lIn l) .>=. ILP.c (InDims l) .-. ILP.c (OutDims lIn)
     <> (-1) .*. timesN (fused lIn l) .<=. ILP.c (InDims l) .-. ILP.c (OutDims lIn)
 
 inrankifmanifest :: ShapeR sh -> Label -> Constraint NativeOp
diff --git a/accelerate-llvm-native/test/nofib/Main.hs b/accelerate-llvm-native/test/nofib/Main.hs
index d61bab6ac..685c98270 100644
--- a/accelerate-llvm-native/test/nofib/Main.hs
+++ b/accelerate-llvm-native/test/nofib/Main.hs
@@ -32,20 +32,22 @@ import Control.Concurrent
 -- import Quickhull
 main :: IO ()
 main = do
-  let xs = fromList (Z :. 5 :. 7) [1 :: Int ..]
-  -- let ys = map (+1) $ 
-  --           use xs
-  -- let f = map (*2)
-  -- let program = awhile (map (A.>0) . asnd) (\(T2 a b) -> T2 (f a) (map (\x -> x - 1) b)) (T2 ys $ unit $ constant (100000 :: Int))
-
-  putStrLn "scan:"
-  let f = 
-        --map (*2) $ 
-        scanl1 (+) $
-        --map (+4) $ 
-        use xs
-  putStrLn $ test @UniformScheduleFun @NativeKernel f
-  print $ run @Native f
+  let xs = fromList (Z :. 5) [1 :: Int ..]
+  let ys = map (+1) $ 
+            use xs
+  let f = map (*2)
+  let program = awhile (map (A.>0) . asnd) (\(T2 a b) -> T2 (f a) (map (\x -> x - 1) b)) (T2 ys $ unit $ constant (100000 :: Int))
+
+  -- putStrLn "scan:"
+  -- let f = 
+  --       --map (*2) $ 
+  --       scanl1 (+) $
+  --       --map (+4) $ 
+  --       use xs
+  -- putStrLn $ test @UniformScheduleFun @NativeKernel f
+  -- print $ run @Native f
+
+  -- putStrLn $ test @UniformScheduleFun @NativeKernel $ map (\(I2 a b)->b) (generate (I2 10 5) (\(I2 i j) -> fromIndex (I2 (5 :: Exp Int) (10 :: Exp Int)) (toIndex (I2 10 5) (I2 i j))))
 
   -- threadDelay 5000000
   -- putStrLn "done"
@@ -83,10 +85,10 @@ main = do
   -- print $ runN @Native f xs
   -- print $ runN @Native (f ys)
 
-  -- putStrLn "fold:"
-  -- let f = fold1 (+) ys
-  -- -- putStrLn $ test @UniformScheduleFun @NativeKernel f
-  -- print $ run @Native f
+  putStrLn "fold:"
+  let f = fold1 (+) ys
+  putStrLn $ test @UniformScheduleFun @NativeKernel f
+  print $ run @Native f
 
   -- putStrLn "scan:"
   -- let f = scanl1 (+) ys