diff --git a/accelerate-llvm-native/src/Data/Array/Accelerate/LLVM/Native/CodeGen.hs b/accelerate-llvm-native/src/Data/Array/Accelerate/LLVM/Native/CodeGen.hs index 8bea0a8a7..80f38af77 100644 --- a/accelerate-llvm-native/src/Data/Array/Accelerate/LLVM/Native/CodeGen.hs +++ b/accelerate-llvm-native/src/Data/Array/Accelerate/LLVM/Native/CodeGen.hs @@ -106,7 +106,10 @@ codegen name env (Clustered c b) args = -- body acc loopsize' acc' <- operandsMapToPairs acc $ \(accTypeR, toOp, fromOp) -> fmap fromOp $ flip execStateT (toOp acc) $ case loopsize of LS loopshr loopsh -> - workstealChunked loopshr workstealIndex workstealActiveThreads (flipShape loopshr loopsh) accTypeR (body loopshr toOp fromOp) + workstealChunked loopshr workstealIndex workstealActiveThreads (flipShape loopshr loopsh) accTypeR + (body loopshr toOp fromOp, -- the LoopWork + StateT $ \op -> second toOp <$> runStateT (foo (liftInt 0) []) (fromOp op)) -- the action to run after the outer loop + -- acc'' <- flip execStateT acc' $ foo (liftInt 0) [] pure () where ba = makeBackendArg @NativeOp args gamma c b @@ -114,7 +117,6 @@ codegen name env (Clustered c b) args = body :: ShapeR sh -> (Accumulated -> a) -> (a -> Accumulated) -> LoopWork sh (StateT a (CodeGen Native)) body ShapeRz _ _ = LoopWorkZ body (ShapeRsnoc shr) toOp fromOp = LoopWorkSnoc (body shr toOp fromOp) (\i is -> StateT $ \op -> second toOp <$> runStateT (foo i is) (fromOp op)) - where foo :: Operands Int -> [Operands Int] -> StateT Accumulated (CodeGen Native) () foo linix ixs = do let d = length ixs -- TODO check: this or its inverse (i.e. totalDepth - length ixs)? diff --git a/accelerate-llvm-native/src/Data/Array/Accelerate/LLVM/Native/CodeGen/Loop.hs b/accelerate-llvm-native/src/Data/Array/Accelerate/LLVM/Native/CodeGen/Loop.hs index 3e1ff44fd..76b143d51 100644 --- a/accelerate-llvm-native/src/Data/Array/Accelerate/LLVM/Native/CodeGen/Loop.hs +++ b/accelerate-llvm-native/src/Data/Array/Accelerate/LLVM/Native/CodeGen/Loop.hs @@ -80,10 +80,12 @@ imapNestFromTo shr start end extent body = $ \sz -> imapFromTo ssz esz $ \i -> k (OP_Pair sz i) -loopWorkFromTo :: ShapeR sh -> Operands sh -> Operands sh -> Operands sh -> TypeR s -> LoopWork sh (StateT (Operands s) (CodeGen Native)) -> StateT (Operands s) (CodeGen Native) () -loopWorkFromTo shr start end extent tys loopwork = do +loopWorkFromTo :: ShapeR sh -> Operands sh -> Operands sh -> Operands sh -> TypeR s -> (LoopWork sh (StateT (Operands s) (CodeGen Native)),StateT (Operands s) (CodeGen Native) ()) -> StateT (Operands s) (CodeGen Native) () +loopWorkFromTo shr start end extent tys (loopwork,finish) = do linix <- lift (intOfIndex shr extent start) loopWorkFromTo' shr start end extent linix [] tys loopwork + finish + loopWorkFromTo' :: ShapeR sh -> Operands sh -> Operands sh -> Operands sh -> Operands Int -> [Operands Int] -> TypeR s -> LoopWork sh (StateT (Operands s) (CodeGen Native)) -> StateT (Operands s) (CodeGen Native) () loopWorkFromTo' ShapeRz OP_Unit OP_Unit OP_Unit _ _ _ LoopWorkZ = pure () @@ -272,7 +274,7 @@ workstealLoop counter activeThreads size doWork = do -- lift $ setBlock dummy -- without this, the previous block always returns True for some reason -workstealChunked :: ShapeR sh -> Operand (Ptr Int32) -> Operand (Ptr Int32) -> Operands sh -> TypeR s -> LoopWork sh (StateT (Operands s) (CodeGen Native)) -> StateT (Operands s) (CodeGen Native) () +workstealChunked :: ShapeR sh -> Operand (Ptr Int32) -> Operand (Ptr Int32) -> Operands sh -> TypeR s -> (LoopWork sh (StateT (Operands s) (CodeGen Native)), StateT (Operands s) (CodeGen Native) ()) -> StateT (Operands s) (CodeGen Native) () workstealChunked shr counter activeThreads sh tys loopwork = do let chunkSz = chunkSize' shr sh chunkCounts <- lift $ chunkCount shr sh chunkSz diff --git a/accelerate-llvm-native/src/Data/Array/Accelerate/LLVM/Native/Compile.hs b/accelerate-llvm-native/src/Data/Array/Accelerate/LLVM/Native/Compile.hs index 5def25766..8706f0cef 100644 --- a/accelerate-llvm-native/src/Data/Array/Accelerate/LLVM/Native/Compile.hs +++ b/accelerate-llvm-native/src/Data/Array/Accelerate/LLVM/Native/Compile.hs @@ -93,7 +93,7 @@ compile uid name module' = do withNativeTargetMachine $ \machine -> withTargetLibraryInfo triple $ \libinfo -> do -- dump llvm - -- hPutStrLn stderr . T.unpack . decodeUtf8 =<< moduleLLVMAssembly mdl + hPutStrLn stderr . T.unpack . decodeUtf8 =<< moduleLLVMAssembly mdl optimiseModule datalayout (Just machine) (Just libinfo) mdl diff --git a/accelerate-llvm-native/src/Data/Array/Accelerate/LLVM/Native/Operation.hs b/accelerate-llvm-native/src/Data/Array/Accelerate/LLVM/Native/Operation.hs index 65b51e191..2ef62bb3d 100644 --- a/accelerate-llvm-native/src/Data/Array/Accelerate/LLVM/Native/Operation.hs +++ b/accelerate-llvm-native/src/Data/Array/Accelerate/LLVM/Native/Operation.hs @@ -288,11 +288,9 @@ instance MakesILP NativeOp where encodeBackendClusterArg (BCAN i) = intHost $(hashQ ("BCAN" :: String)) <> intHost i -inputConstraints :: HasCallStack => Label -> Labels -> Constraint NativeOp +inputConstraints :: Label -> Labels -> Constraint NativeOp inputConstraints l = foldMap $ \lIn -> - timesN (fused lIn l) .>=. ILP.c (InDir l) .-. ILP.c (OutDir lIn) - <> (-1) .*. timesN (fused lIn l) .<=. ILP.c (InDir l) .-. ILP.c (OutDir lIn) - <> timesN (fused lIn l) .>=. ILP.c (InDims l) .-. ILP.c (OutDims lIn) + timesN (fused lIn l) .>=. ILP.c (InDims l) .-. ILP.c (OutDims lIn) <> (-1) .*. timesN (fused lIn l) .<=. ILP.c (InDims l) .-. ILP.c (OutDims lIn) inrankifmanifest :: ShapeR sh -> Label -> Constraint NativeOp diff --git a/accelerate-llvm-native/test/nofib/Main.hs b/accelerate-llvm-native/test/nofib/Main.hs index d61bab6ac..685c98270 100644 --- a/accelerate-llvm-native/test/nofib/Main.hs +++ b/accelerate-llvm-native/test/nofib/Main.hs @@ -32,20 +32,22 @@ import Control.Concurrent -- import Quickhull main :: IO () main = do - let xs = fromList (Z :. 5 :. 7) [1 :: Int ..] - -- let ys = map (+1) $ - -- use xs - -- let f = map (*2) - -- let program = awhile (map (A.>0) . asnd) (\(T2 a b) -> T2 (f a) (map (\x -> x - 1) b)) (T2 ys $ unit $ constant (100000 :: Int)) - - putStrLn "scan:" - let f = - --map (*2) $ - scanl1 (+) $ - --map (+4) $ - use xs - putStrLn $ test @UniformScheduleFun @NativeKernel f - print $ run @Native f + let xs = fromList (Z :. 5) [1 :: Int ..] + let ys = map (+1) $ + use xs + let f = map (*2) + let program = awhile (map (A.>0) . asnd) (\(T2 a b) -> T2 (f a) (map (\x -> x - 1) b)) (T2 ys $ unit $ constant (100000 :: Int)) + + -- putStrLn "scan:" + -- let f = + -- --map (*2) $ + -- scanl1 (+) $ + -- --map (+4) $ + -- use xs + -- putStrLn $ test @UniformScheduleFun @NativeKernel f + -- print $ run @Native f + + -- putStrLn $ test @UniformScheduleFun @NativeKernel $ map (\(I2 a b)->b) (generate (I2 10 5) (\(I2 i j) -> fromIndex (I2 (5 :: Exp Int) (10 :: Exp Int)) (toIndex (I2 10 5) (I2 i j)))) -- threadDelay 5000000 -- putStrLn "done" @@ -83,10 +85,10 @@ main = do -- print $ runN @Native f xs -- print $ runN @Native (f ys) - -- putStrLn "fold:" - -- let f = fold1 (+) ys - -- -- putStrLn $ test @UniformScheduleFun @NativeKernel f - -- print $ run @Native f + putStrLn "fold:" + let f = fold1 (+) ys + putStrLn $ test @UniformScheduleFun @NativeKernel f + print $ run @Native f -- putStrLn "scan:" -- let f = scanl1 (+) ys