From c3ae76c2bf06e737ebcec478867cfb96c343c26c Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Mon, 23 Jan 2017 19:33:24 -0500 Subject: [PATCH 01/34] Current status of reduction generalization and small-destination support. --- src/gpuarray/array.h | 115 ++- src/gpuarray_reduction.c | 1586 ++++++++++++++++++++++++++++++-------- tests/check_reduction.c | 242 +++++- 3 files changed, 1570 insertions(+), 373 deletions(-) diff --git a/src/gpuarray/array.h b/src/gpuarray/array.h index a99366a7c4..5ea9377b9a 100644 --- a/src/gpuarray/array.h +++ b/src/gpuarray/array.h @@ -118,6 +118,27 @@ typedef enum _ga_order { GA_F_ORDER=1 } ga_order; +/** + * Supported array reduction operations. + */ + +typedef enum _ga_reduce_op { + GA_REDUCE_SUM, /* + */ + GA_REDUCE_PROD, /* * */ + GA_REDUCE_PRODNZ, /* * (!=0) */ + GA_REDUCE_MIN, /* min() */ + GA_REDUCE_MAX, /* max() */ + GA_REDUCE_ARGMIN, /* argmin() */ + GA_REDUCE_ARGMAX, /* argmax() */ + GA_REDUCE_MINANDARGMIN, /* min(), argmin() */ + GA_REDUCE_MAXANDARGMAX, /* max(), argmax() */ + GA_REDUCE_AND, /* & */ + GA_REDUCE_OR, /* | */ + GA_REDUCE_XOR, /* ^ */ + GA_REDUCE_ALL, /* &&/all() */ + GA_REDUCE_ANY, /* ||/any() */ +} ga_reduce_op; + /** * Checks if all the specified flags are set. * @@ -604,26 +625,31 @@ GPUARRAY_PUBLIC void GpuArray_fprintf(FILE *fd, const GpuArray *a); GPUARRAY_PUBLIC int GpuArray_fdump(FILE *fd, const GpuArray *a); + /** - * @brief Computes simultaneously the maxima and the arguments of maxima over - * specified axes of the tensor. + * @brief Compute a reduction sum (+), product (*), non-zero product (* != 0), + * min, max, argmin, argmax, min-and-argmin, max-and-argmax, and (&), + * or (|), xor (^), all (&&) or any (||) over a list of axes to reduce. * - * Returns two tensors of identical shape. Both tensors' axes are a subset of - * the axes of the original tensor. The axes to be reduced are specified by - * the caller, and the maxima and arguments of maxima are computed over them. + * Returns one (in the case of min-and-argmin/max-and-argmax, two) destination + * tensors. The destination tensor(s)' axes are a strict subset of the axes of the + * source tensor. The axes to be reduced are specified by the caller, and the + * reduction is performed over these axes, which are then removed in the + * destination. * - * @param [out] dstMax The resulting tensor of maxima - * @param [out] dstArgmax the resulting tensor of arguments at maxima + * @param [out] dst The destination tensor. Has the same type as the source. + * @param [out] dstArg For argument of minima/maxima operations. Has type int64. * @param [in] src The source tensor. * @param [in] reduxLen The number of axes reduced. Must be >= 1 and * <= src->nd. * @param [in] reduxList A list of integers of length reduxLen, indicating * the axes to be reduced. The order of the axes - * matters for dstArgmax index calculations. All - * entries in the list must be unique, >= 0 and - * < src->nd. + * matters for dstArg index calculations (GpuArray_argmin, + * GpuArray_argmax, GpuArray_minandargmin, + * GpuArray_maxandargmax). All entries in the list must be + * unique, >= 0 and < src->nd. * - * For example, if a 5D-tensor is reduced with an axis + * For example, if a 5D-tensor is max-reduced with an axis * list of [3,4,1], then reduxLen shall be 3, and the * index calculation in every point shall take the form * @@ -637,11 +663,74 @@ GPUARRAY_PUBLIC int GpuArray_fdump(FILE *fd, const GpuArray *a); * code otherwise. 
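 *
 *                        As a usage sketch (buffer names are hypothetical; dst
 *                        is assumed to have been created by the caller with the
 *                        reduced shape), a max-reduction of a 3D source tensor
 *                        over its last two axes could look like:
 *
 *                            unsigned axes[] = {1, 2};
 *                            int      err    = GpuArray_max(&dstMax, &src, 2, axes);
 *
 *                        which is shorthand for
 *
 *                            int err = GpuArray_reduction(GA_REDUCE_MAX,
 *                                                         &dstMax, NULL, &src,
 *                                                         2, axes);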
*/ -GPUARRAY_PUBLIC int GpuArray_maxandargmax(GpuArray* dstMax, - GpuArray* dstArgmax, +GPUARRAY_PUBLIC int GpuArray_sum (GpuArray* dst, const GpuArray* src, unsigned reduxLen, const unsigned* reduxList); +GPUARRAY_PUBLIC int GpuArray_prod (GpuArray* dst, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList); +GPUARRAY_PUBLIC int GpuArray_prodnz (GpuArray* dst, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList); +GPUARRAY_PUBLIC int GpuArray_min (GpuArray* dst, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList); +GPUARRAY_PUBLIC int GpuArray_max (GpuArray* dst, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList); +GPUARRAY_PUBLIC int GpuArray_argmin (GpuArray* dstArg, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList); +GPUARRAY_PUBLIC int GpuArray_argmax (GpuArray* dstArg, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList); +GPUARRAY_PUBLIC int GpuArray_minandargmin(GpuArray* dst, + GpuArray* dstArg, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList); +GPUARRAY_PUBLIC int GpuArray_maxandargmax(GpuArray* dst, + GpuArray* dstArg, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList); +GPUARRAY_PUBLIC int GpuArray_and (GpuArray* dst, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList); +GPUARRAY_PUBLIC int GpuArray_or (GpuArray* dst, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList); +GPUARRAY_PUBLIC int GpuArray_xor (GpuArray* dst, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList); +GPUARRAY_PUBLIC int GpuArray_all (GpuArray* dst, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList); +GPUARRAY_PUBLIC int GpuArray_any (GpuArray* dst, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList); +GPUARRAY_PUBLIC int GpuArray_reduction (ga_reduce_op op, + GpuArray* dst, + GpuArray* dstArg, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList); + + + + #ifdef __cplusplus } diff --git a/src/gpuarray_reduction.c b/src/gpuarray_reduction.c index fc4fc56975..8a6a2dc98b 100644 --- a/src/gpuarray_reduction.c +++ b/src/gpuarray_reduction.c @@ -21,106 +21,613 @@ #include "util/integerfactoring.h" +/* Defines */ +#define MAX_HW_DIMS 3 + + + /* Datatypes */ -struct maxandargmax_ctx{ + +/** + * Reduction Kernel Generator. + * + * The generator produces a kernel from one of two "code models": + * - Large + * - Small + * Which one is used depends on the size of the destination tensor and the + * number of reductions for each destination element. A destination tensor + * with more than SMALL_REDUX_THRESHOLD elements or more elements than + * reductions for each element will result in use of the large code model; + * Otherwise the small code model is used. + * + * + * LARGE CODE MODEL: + * + * In the large code model, each destination element is processed by a + * single thread. + * + * Each thread begins with an initial value in a register, reads from all + * source elements contributing to the reduction, computes the result and + * writes it to the destination element. + * + * A single kernel is generated that performs prescalar transformations, the + * reduction itself, postscalar transformations and the write to global memory. + * + * + * SMALL CODE MODEL: + * + * In the small code model, each destination element is processed by + * multiple threads. 
+ * + * The destination tensor is first initialized with the initial value. Then, + * one several threads cooperate to perform the reduction atomically on each + * destination element. Lastly, postscalar transformations are applied + * in-place. + * + * Two or three kernels are generated: The initialization kernel, the main + * kernel that performs prescalar transformations and the reduction itself, and + * possibly also a postscalar transformation kernel when it is required. + * + * + * Kernel Template: + * + * The following kernel code template displays the code generated for the + * small code model. For the large code model, no pre/postRedux() kernels + * are generated (since their functionality is incorporated within the main + * redux() kernel), no atomicRedux() function needs to be generated because + * writes to global memory are unconditional and not contended. + * + * + * //Includes + * #include + * #include + * #include + * + * + * //Typedefs: + * typedef float T + * typedef int64_t X + * + * + * //Initializer (in case initial T cannot be expressed as a literal) + * static T getInitVal(void){ + * return ... + * } + * + * + * //Reduce into global memory destination a value. + * static void atomicRedux(GLOBAL_MEM T* dst, T val){ + * ... + * } + * + * + * //Load data from source and apply pre-operations. + * static T loadVal(X i0, X i1, ..., X iN, + * const GLOBAL_MEM T* src, + * const GLOBAL_MEM X* srcSteps, + * ...?){ + * return ... + * } + * + * + * //Initialization kernel, + * KERNEL void preRedux(const GLOBAL_MEM X* srcSize, + * const GLOBAL_MEM X* chunkSize, + * GLOBAL_MEM T* dst, + * const X dstOff, + * const GLOBAL_MEM X* dstSteps){ + * //OFFSETS + * dst += dstOff; + * + * //Initialize + * dst[...] = getInitVal(); + * } + * + * + * //Reduction Kernel. + * KERNEL void redux(const GLOBAL_MEM T* src, + * const X srcOff, + * const GLOBAL_MEM X* srcSteps, + * const GLOBAL_MEM X* srcSize, + * const GLOBAL_MEM X* chunkSize, + * GLOBAL_MEM T* dst, + * const X dstOff, + * const GLOBAL_MEM X* dstSteps, + * GLOBAL_MEM X* dstArg, + * const X dstArgOff, + * const GLOBAL_MEM X* dstArgSteps){ + * //OFFSETS + * src += srcOff + * dst += dstOff + * dstArg += dstArgOff + * + * //Declare Indices + * //Compute Ranges + * + * //Define macros + * //Outer Loops + * //Inner Loops + * //Undefine macros + * } + * + * + * //Post-scalar kernel, + * KERNEL void postRedux(const GLOBAL_MEM X* srcSize, + * const GLOBAL_MEM X* chunkSize, + * GLOBAL_MEM T* dst, + * const X dstOff, + * const GLOBAL_MEM X* dstSteps){ + * //OFFSETS + * dst += dstOff; + * + * //Initialize + * dst[...] = getInitVal(); + * } + * + * + * Initial Reduction Values + * +--------------+-----+-----+---------+---------+-----+-----+-----+-----+-----+ + * | Type\Op | + | * | max | min | & | | | ^ | && | || | + * +--------------+-----+-----+---------+---------+-----+-----+-----+-----+-----+ + * | signed int | 0 | 1 | INT_MIN | INT_MAX | ~0 | 0 | 0 | ~0 | 0 | + * | unsigned int | 0 | 1 | 0 | ~0 | ~0 | 0 | 0 | ~0 | 0 | + * | floating | 0.0 | 1.0 | NAN | NAN | | | | | | + * +--------------+-----+-----+---------+---------+-----+-----+-----+-----+-----+ + */ + +struct redux_ctx{ /* Function Arguments. */ - GpuArray* dstMax; - GpuArray* dstArgmax; + ga_reduce_op op; + GpuArray* dst; + GpuArray* dstArg; const GpuArray* src; int reduxLen; const int* reduxList; /* General. */ - int ret; int* axisList; gpucontext* gpuCtx; /* Source code Generator. 
*/ - const char* dstMaxType; - const char* dstArgmaxType; + int srcTypeCode; + int dstTypeCode; + int dstArgTypeCode; + int idxTypeCode; + int accTypeCode; + const char* srcTypeStr; + const char* dstTypeStr; + const char* dstArgTypeStr; + const char* idxTypeStr; + const char* accTypeStr; + const char* initVal; int ndd; int ndr; int nds; int ndh; + int ndhd; + int ndhr; + int largeCodeModel; strb s; char* sourceCode; + GpuKernel preKernel; GpuKernel kernel; + GpuKernel postKernel; /* Scheduler */ - int hwAxisList[3]; - size_t blockSize [3]; - size_t gridSize [3]; - size_t chunkSize [3]; + int hwAxisList[MAX_HW_DIMS]; + size_t blockSize [MAX_HW_DIMS]; + size_t gridSize [MAX_HW_DIMS]; + size_t chunkSize [MAX_HW_DIMS]; /* Invoker */ gpudata* srcStepsGD; gpudata* srcSizeGD; gpudata* chunkSizeGD; - gpudata* dstMaxStepsGD; - gpudata* dstArgmaxStepsGD; + gpudata* dstStepsGD; + gpudata* dstArgStepsGD; }; -typedef struct maxandargmax_ctx maxandargmax_ctx; +typedef struct redux_ctx redux_ctx; /* Function prototypes */ -static int axisInSet (int v, - const int* set, - size_t setLen, - size_t* where); -static void appendIdxes (strb* s, - const char* prologue, - const char* prefix, - int startIdx, - int endIdx, - const char* suffix, - const char* epilogue); -static int maxandargmaxCheckargs (maxandargmax_ctx* ctx); -static int maxandargmaxSelectHwAxes (maxandargmax_ctx* ctx); -static int maxandargmaxGenSource (maxandargmax_ctx* ctx); -static void maxandargmaxAppendKernel (maxandargmax_ctx* ctx); -static void maxandargmaxAppendTypedefs (maxandargmax_ctx* ctx); -static void maxandargmaxAppendPrototype (maxandargmax_ctx* ctx); -static void maxandargmaxAppendOffsets (maxandargmax_ctx* ctx); -static void maxandargmaxAppendIndexDeclarations(maxandargmax_ctx* ctx); -static void maxandargmaxAppendRangeCalculations(maxandargmax_ctx* ctx); -static void maxandargmaxAppendLoops (maxandargmax_ctx* ctx); -static void maxandargmaxAppendLoopMacroDefs (maxandargmax_ctx* ctx); -static void maxandargmaxAppendLoopOuter (maxandargmax_ctx* ctx); -static void maxandargmaxAppendLoopInner (maxandargmax_ctx* ctx); -static void maxandargmaxAppendLoopMacroUndefs (maxandargmax_ctx* ctx); -static void maxandargmaxComputeAxisList (maxandargmax_ctx* ctx); -static int maxandargmaxCompile (maxandargmax_ctx* ctx); -static int maxandargmaxSchedule (maxandargmax_ctx* ctx); -static int maxandargmaxInvoke (maxandargmax_ctx* ctx); -static int maxandargmaxCleanup (maxandargmax_ctx* ctx); +static int reduxGetSumInit (int typecode, const char** property); +static int reduxGetProdInit (int typecode, const char** property); +static int reduxGetMinInit (int typecode, const char** property); +static int reduxGetMaxInit (int typecode, const char** property); +static int reduxGetAndInit (int typecode, const char** property); +static int reduxGetOrInit (int typecode, const char** property); +static int axisInSet (int v, + const int* set, + size_t setLen, + size_t* where); +static void appendIdxes (strb* s, + const char* prologue, + const char* prefix, + int startIdx, + int endIdx, + const char* suffix, + const char* epilogue); +static int reduxCheckargs (redux_ctx* ctx); +static void reduxSelectTypes (redux_ctx* ctx); +static int reduxSelectModel (redux_ctx* ctx); +static int reduxIsSmallCodeModel (redux_ctx* ctx); +static int reduxIsLargeCodeModel (redux_ctx* ctx); +static int reduxHasDst (redux_ctx* ctx); +static int reduxHasDstArg (redux_ctx* ctx); +static int reduxKernelRequiresDst (redux_ctx* ctx); +static int reduxKernelRequiresDstArg (redux_ctx* 
ctx); +static int reduxCanAppendHwAxis (redux_ctx* ctx, int wantReductionAxis); +static void reduxAppendLargestAxisToHwList(redux_ctx* ctx, int wantReductionAxis); +static int reduxSelectHwAxes (redux_ctx* ctx); +static int reduxComputeAxisList (redux_ctx* ctx); +static int reduxGenSource (redux_ctx* ctx); +static void reduxAppendSource (redux_ctx* ctx); +static void reduxAppendIncludes (redux_ctx* ctx); +static void reduxAppendTypedefs (redux_ctx* ctx); +static void reduxAppendFuncGetInitVal (redux_ctx* ctx); +static void reduxAppendFuncLoadVal (redux_ctx* ctx); +static void reduxAppendFuncReduxVal (redux_ctx* ctx); +static void reduxAppendFuncPreKernel (redux_ctx* ctx); +static void reduxAppendFuncKernel (redux_ctx* ctx); +static void reduxAppendFuncPostKernel (redux_ctx* ctx); +static void reduxAppendPrototype (redux_ctx* ctx); +static void reduxAppendOffsets (redux_ctx* ctx); +static void reduxAppendIndexDeclarations (redux_ctx* ctx); +static void reduxAppendRangeCalculations (redux_ctx* ctx); +static void reduxAppendLoops (redux_ctx* ctx); +static void reduxAppendLoopMacroDefs (redux_ctx* ctx); +static void reduxAppendLoopOuter (redux_ctx* ctx); +static void reduxAppendLoopInner (redux_ctx* ctx); +static void reduxAppendLoopMacroUndefs (redux_ctx* ctx); +static int reduxCompileLarge (redux_ctx* ctx); +static int reduxCompileSmall (redux_ctx* ctx); +static int reduxScheduleLarge (redux_ctx* ctx); +static int reduxInvokeLarge (redux_ctx* ctx); +static int reduxCleanup (redux_ctx* ctx, int ret); /* Function implementation */ -GPUARRAY_PUBLIC int GpuArray_maxandargmax (GpuArray* dstMax, - GpuArray* dstArgmax, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList){ - maxandargmax_ctx ctxSTACK = {0}; - maxandargmax_ctx *ctx = &ctxSTACK; - - ctxSTACK.dstMax = dstMax; - ctxSTACK.dstArgmax = dstArgmax; - ctxSTACK.src = src; - ctxSTACK.reduxLen = (int)reduxLen; - ctxSTACK.reduxList = (const int*)reduxList; - - if(maxandargmaxCheckargs (ctx) == GA_NO_ERROR && - maxandargmaxSelectHwAxes(ctx) == GA_NO_ERROR && - maxandargmaxGenSource (ctx) == GA_NO_ERROR && - maxandargmaxCompile (ctx) == GA_NO_ERROR && - maxandargmaxSchedule (ctx) == GA_NO_ERROR && - maxandargmaxInvoke (ctx) == GA_NO_ERROR){ - return maxandargmaxCleanup(ctx); - }else{ - return maxandargmaxCleanup(ctx); +GPUARRAY_PUBLIC int GpuArray_sum (GpuArray* dst, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList){ + return GpuArray_reduction(GA_REDUCE_SUM, + dst, NULL, src, reduxLen, reduxList); +} +GPUARRAY_PUBLIC int GpuArray_prod (GpuArray* dst, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList){ + return GpuArray_reduction(GA_REDUCE_PROD, + dst, NULL, src, reduxLen, reduxList); +} +GPUARRAY_PUBLIC int GpuArray_prodnz (GpuArray* dst, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList){ + return GpuArray_reduction(GA_REDUCE_PRODNZ, + dst, NULL, src, reduxLen, reduxList); +} +GPUARRAY_PUBLIC int GpuArray_min (GpuArray* dst, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList){ + return GpuArray_reduction(GA_REDUCE_MIN, + dst, NULL, src, reduxLen, reduxList); +} +GPUARRAY_PUBLIC int GpuArray_max (GpuArray* dst, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList){ + return GpuArray_reduction(GA_REDUCE_MAX, + dst, NULL, src, reduxLen, reduxList); +} +GPUARRAY_PUBLIC int GpuArray_argmin (GpuArray* dstArg, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList){ + return GpuArray_reduction(GA_REDUCE_ARGMIN, 
+ NULL, dstArg, src, reduxLen, reduxList); +} +GPUARRAY_PUBLIC int GpuArray_argmax (GpuArray* dstArg, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList){ + return GpuArray_reduction(GA_REDUCE_ARGMAX, + NULL, dstArg, src, reduxLen, reduxList); +} +GPUARRAY_PUBLIC int GpuArray_minandargmin(GpuArray* dst, + GpuArray* dstArg, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList){ + return GpuArray_reduction(GA_REDUCE_MINANDARGMIN, + dst, dstArg, src, reduxLen, reduxList); +} +GPUARRAY_PUBLIC int GpuArray_maxandargmax(GpuArray* dst, + GpuArray* dstArg, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList){ + return GpuArray_reduction(GA_REDUCE_MAXANDARGMAX, + dst, dstArg, src, reduxLen, reduxList); +} +GPUARRAY_PUBLIC int GpuArray_and (GpuArray* dst, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList){ + return GpuArray_reduction(GA_REDUCE_AND, + dst, NULL, src, reduxLen, reduxList); +} +GPUARRAY_PUBLIC int GpuArray_or (GpuArray* dst, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList){ + return GpuArray_reduction(GA_REDUCE_OR, + dst, NULL, src, reduxLen, reduxList); +} +GPUARRAY_PUBLIC int GpuArray_xor (GpuArray* dst, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList){ + return GpuArray_reduction(GA_REDUCE_XOR, + dst, NULL, src, reduxLen, reduxList); +} +GPUARRAY_PUBLIC int GpuArray_all (GpuArray* dst, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList){ + return GpuArray_reduction(GA_REDUCE_ALL, + dst, NULL, src, reduxLen, reduxList); +} +GPUARRAY_PUBLIC int GpuArray_any (GpuArray* dst, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList){ + return GpuArray_reduction(GA_REDUCE_ANY, + dst, NULL, src, reduxLen, reduxList); +} +GPUARRAY_PUBLIC int GpuArray_reduction (ga_reduce_op op, + GpuArray* dst, + GpuArray* dstArg, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList){ + redux_ctx ctxSTACK = {op, dst, dstArg, src, + (int)reduxLen, (const int*)reduxList}, + *ctx = &ctxSTACK; + + return reduxCheckargs(ctx); +} + +/** + * @brief Get an expression representing a suitable initialization value for + * the given datatype and a sum-reduction operation. + * + * @param [in] typecode Typecode of the type whose initializer is to be + * requested. + * @param [out] property A pointer to a string. On return it will be set to + * the initializer expression. + * @return Zero if successful; Non-zero if the datatype is not supported. + */ + +static int reduxGetSumInit (int typecode, const char** property){ + if(typecode == GA_POINTER || + typecode == GA_BUFFER){ + return GA_UNSUPPORTED_ERROR; + } + *property = "0"; + return GA_NO_ERROR; +} + +/** + * @brief Get an expression representing a suitable initialization value for + * the given datatype and a prod-reduction operation. + * + * @param [in] typecode Typecode of the type whose initializer is to be + * requested. + * @param [out] property A pointer to a string. On return it will be set to + * the initializer expression. + * @return Zero if successful; Non-zero if the datatype is not supported. + */ + +static int reduxGetProdInit (int typecode, const char** property){ + if(typecode == GA_POINTER || + typecode == GA_BUFFER){ + return GA_UNSUPPORTED_ERROR; + } + *property = "1"; + return GA_NO_ERROR; +} + +/** + * @brief Get an expression representing a suitable initialization value for + * the given datatype and a max-reduction operation. 
+ * + * @param [in] typecode Typecode of the type whose initializer is to be + * requested. + * @param [out] property A pointer to a string. On return it will be set to + * the initializer expression. + * @return Zero if successful; Non-zero if the datatype is not supported. + */ + +static int reduxGetMinInit (int typecode, const char** property){ + switch(typecode){ + case GA_BYTE2: + case GA_BYTE3: + case GA_BYTE4: + case GA_BYTE8: + case GA_BYTE16: + case GA_BYTE: *property = "SCHAR_MIN"; break; + case GA_SHORT2: + case GA_SHORT3: + case GA_SHORT4: + case GA_SHORT8: + case GA_SHORT16: + case GA_SHORT: *property = "SHRT_MIN"; break; + case GA_INT2: + case GA_INT3: + case GA_INT4: + case GA_INT8: + case GA_INT16: + case GA_INT: *property = "INT_MIN"; break; + case GA_LONG2: + case GA_LONG3: + case GA_LONG4: + case GA_LONG8: + case GA_LONG16: + case GA_LONG: *property = "LONG_MIN"; break; + case GA_LONGLONG: *property = "LLONG_MIN"; break; + case GA_BOOL: + case GA_UBYTE2: + case GA_UBYTE3: + case GA_UBYTE4: + case GA_UBYTE8: + case GA_UBYTE16: + case GA_UBYTE: + case GA_USHORT2: + case GA_USHORT3: + case GA_USHORT4: + case GA_USHORT8: + case GA_USHORT16: + case GA_USHORT: + case GA_UINT2: + case GA_UINT3: + case GA_UINT4: + case GA_UINT8: + case GA_UINT16: + case GA_UINT: + case GA_ULONG2: + case GA_ULONG3: + case GA_ULONG4: + case GA_ULONG8: + case GA_ULONG16: + case GA_ULONG: + case GA_ULONGLONG: + case GA_SIZE: *property = "0"; break; + case GA_HALF: + case GA_FLOAT: + case GA_DOUBLE: + case GA_QUAD: *property = "NAN"; break; + default: return GA_UNSUPPORTED_ERROR; + } + + return GA_NO_ERROR; +} + +/** + * @brief Get an expression representing a suitable initialization value for + * the given datatype and a min-reduction operation. + * + * @param [in] typecode Typecode of the type whose initializer is to be + * requested. + * @param [out] property A pointer to a string. On return it will be set to + * the initializer expression. + * @return Zero if successful; Non-zero if the datatype is not supported. 
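 *
 * For instance, for GA_INT the expression produced is "INT_MAX", and for the
 * floating-point types it is "NAN" (cf. the initial-values table above).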
+ */ + +static int reduxGetMaxInit (int typecode, const char** property){ + switch(typecode){ + case GA_BOOL: *property = "1"; break; + case GA_BYTE2: + case GA_BYTE3: + case GA_BYTE4: + case GA_BYTE8: + case GA_BYTE16: + case GA_BYTE: *property = "SCHAR_MAX"; break; + case GA_UBYTE2: + case GA_UBYTE3: + case GA_UBYTE4: + case GA_UBYTE8: + case GA_UBYTE16: + case GA_UBYTE: *property = "UCHAR_MAX"; break; + case GA_SHORT2: + case GA_SHORT3: + case GA_SHORT4: + case GA_SHORT8: + case GA_SHORT16: + case GA_SHORT: *property = "SHRT_MAX"; break; + case GA_USHORT2: + case GA_USHORT3: + case GA_USHORT4: + case GA_USHORT8: + case GA_USHORT16: + case GA_USHORT: *property = "USHRT_MAX"; break; + case GA_INT2: + case GA_INT3: + case GA_INT4: + case GA_INT8: + case GA_INT16: + case GA_INT: *property = "INT_MAX"; break; + case GA_UINT2: + case GA_UINT3: + case GA_UINT4: + case GA_UINT8: + case GA_UINT16: + case GA_UINT: *property = "UINT_MAX"; break; + case GA_LONG2: + case GA_LONG3: + case GA_LONG4: + case GA_LONG8: + case GA_LONG16: + case GA_LONG: *property = "LONG_MAX"; break; + case GA_ULONG2: + case GA_ULONG3: + case GA_ULONG4: + case GA_ULONG8: + case GA_ULONG16: + case GA_ULONG: *property = "ULONG_MAX"; break; + case GA_LONGLONG: *property = "LLONG_MAX"; break; + case GA_ULONGLONG: *property = "ULLONG_MAX"; break; + case GA_HALF: + case GA_FLOAT: + case GA_DOUBLE: + case GA_QUAD: *property = "NAN"; break; + default: return GA_UNSUPPORTED_ERROR; + } + + return GA_NO_ERROR; +} + +/** + * @brief Get an expression representing a suitable initialization value for + * the given datatype and a and-reduction operation. + * + * @param [in] typecode Typecode of the type whose initializer is to be + * requested. + * @param [out] property A pointer to a string. On return it will be set to + * the initializer expression. + * @return Zero if successful; Non-zero if the datatype is not supported. + */ + +static int reduxGetAndInit (int typecode, const char** property){ + if(typecode == GA_POINTER || + typecode == GA_BUFFER){ + return GA_UNSUPPORTED_ERROR; + } + *property = "~0"; + return GA_NO_ERROR; +} + +/** + * @brief Get an expression representing a suitable initialization value for + * the given datatype and a or-reduction operation. + * + * @param [in] typecode Typecode of the type whose initializer is to be + * requested. + * @param [out] property A pointer to a string. On return it will be set to + * the initializer expression. + * @return Zero if successful; Non-zero if the datatype is not supported. + */ + +static int reduxGetOrInit (int typecode, const char** property){ + if(typecode == GA_POINTER || + typecode == GA_BUFFER){ + return GA_UNSUPPORTED_ERROR; } + *property = "0"; + return GA_NO_ERROR; } /** @@ -133,10 +640,10 @@ GPUARRAY_PUBLIC int GpuArray_maxandargmax (GpuArray* dstMax, * @return Non-zero if the set is non-empty and v is in it; Zero otherwise. 
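 *
 * For example, with int axes[] = {0, 2, 3}: axisInSet(2, axes, 3, 0) returns
 * non-zero, while axisInSet(5, axes, 3, 0) returns zero.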
*/ -static int axisInSet (int v, - const int* set, - size_t setLen, - size_t* where){ +static int axisInSet (int v, + const int* set, + size_t setLen, + size_t* where){ size_t i; for(i=0;iret = GA_NO_ERROR; ctx->axisList = NULL; ctx->gpuCtx = NULL; - ctx->dstMaxType = ctx->dstArgmaxType = NULL; + ctx->srcTypeStr = ctx->dstTypeStr = ctx->dstArgTypeStr = + ctx->accTypeStr = ctx->idxTypeStr = NULL; + ctx->initVal = NULL; ctx->ndh = 0; + ctx->ndhd = 0; + ctx->ndhr = 0; ctx->sourceCode = NULL; + ctx->s = INIT_STRB; - ctx->hwAxisList[0] = ctx->hwAxisList[1] = ctx->hwAxisList[2] = 0; - ctx->blockSize [0] = ctx->blockSize [1] = ctx->blockSize [2] = 1; - ctx->gridSize [0] = ctx->gridSize [1] = ctx->gridSize [2] = 1; - ctx->chunkSize [0] = ctx->chunkSize [1] = ctx->chunkSize [2] = 1; + for(i=0;ihwAxisList[i] = 0; + ctx->blockSize [i] = 1; + ctx->gridSize [i] = 1; + ctx->chunkSize [i] = 1; + } - ctx->srcStepsGD = ctx->srcSizeGD = ctx->chunkSizeGD = - ctx->dstMaxStepsGD = ctx->dstArgmaxStepsGD = NULL; + ctx->srcStepsGD = ctx->srcSizeGD = ctx->chunkSizeGD = + ctx->dstStepsGD = ctx->dstArgStepsGD = NULL; + /* *** IT IS NOW SAFE TO CALL reduxCleanup() *** */ - /* Insane src or reduxLen? */ - if(!ctx->dstMax || !ctx->dstArgmax || !ctx->src || ctx->src->nd == 0 || - ctx->reduxLen == 0 || ctx->reduxLen > (int)ctx->src->nd){ - return ctx->ret=GA_INVALID_ERROR; + /* Insane src, reduxLen, dst or dstArg? */ + if(!ctx->src || ctx->src->nd <= 0 || ctx->reduxLen == 0 || + ctx->reduxLen > (int)ctx->src->nd){ + return reduxCleanup(ctx, GA_INVALID_ERROR); + } + if((reduxHasDst (ctx) && !ctx->dst) || + (reduxHasDstArg(ctx) && !ctx->dstArg)){ + return reduxCleanup(ctx, GA_INVALID_ERROR); } + /* Insane or duplicate list entry? */ for(i=0;ireduxLen;i++){ if(ctx->reduxList[i] < 0 || ctx->reduxList[i] >= (int)ctx->src->nd || axisInSet(ctx->reduxList[i], ctx->reduxList, i, 0)){ - return ctx->ret=GA_INVALID_ERROR; + return reduxCleanup(ctx, GA_INVALID_ERROR); } } - /* Unknown type? */ - ctx->dstMaxType = gpuarray_get_type(ctx->src->typecode)->cluda_name; - ctx->dstArgmaxType = gpuarray_get_type(GA_SSIZE) ->cluda_name; - if(!ctx->dstMaxType || !ctx->dstArgmaxType){ - return ctx->ret=GA_INVALID_ERROR; - } /* GPU context non-existent? */ - ctx->gpuCtx = GpuArray_context(ctx->src); + ctx->gpuCtx = GpuArray_context(ctx->src); if(!ctx->gpuCtx){ - return ctx->ret=GA_INVALID_ERROR; + return reduxCleanup(ctx, GA_INVALID_ERROR); + } + + + /* Unknown type? */ + reduxSelectTypes(ctx); + if(!ctx->srcTypeStr || !ctx->dstTypeStr || !ctx->dstArgTypeStr || + !ctx->accTypeStr){ + return reduxCleanup(ctx, GA_INVALID_ERROR); + } + + + /* Determine initializer, and error out if reduction unsupported. 
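	 * For example, GA_REDUCE_SUM initializes its accumulator to 0,
	 * GA_REDUCE_PROD/PRODNZ to 1 and GA_REDUCE_AND/ALL to ~0, per the
	 * initial-values table at the top of this file.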
*/ + switch(ctx->op){ + case GA_REDUCE_SUM: ret = reduxGetSumInit (ctx->accTypeCode, &ctx->initVal); break; + case GA_REDUCE_PRODNZ: + case GA_REDUCE_PROD: ret = reduxGetProdInit(ctx->accTypeCode, &ctx->initVal); break; + case GA_REDUCE_MINANDARGMIN: + case GA_REDUCE_ARGMIN: + case GA_REDUCE_MIN: ret = reduxGetMinInit (ctx->accTypeCode, &ctx->initVal); break; + case GA_REDUCE_MAXANDARGMAX: + case GA_REDUCE_ARGMAX: + case GA_REDUCE_MAX: ret = reduxGetMaxInit (ctx->accTypeCode, &ctx->initVal); break; + case GA_REDUCE_ALL: + case GA_REDUCE_AND: ret = reduxGetAndInit (ctx->accTypeCode, &ctx->initVal); break; + case GA_REDUCE_ANY: + case GA_REDUCE_XOR: + case GA_REDUCE_OR: ret = reduxGetOrInit (ctx->accTypeCode, &ctx->initVal); break; + default: ret = GA_UNSUPPORTED_ERROR; break; + } + if(ret != GA_NO_ERROR){ + return reduxCleanup(ctx, ret); } @@ -256,114 +801,493 @@ static int maxandargmaxCheckargs (maxandargmax_ctx* ctx){ ctx->nds = ctx->src->nd; ctx->ndr = ctx->reduxLen; ctx->ndd = ctx->nds - ctx->ndr; + strb_ensure(&ctx->s, 5*1024); + - return ctx->ret; + + return reduxSelectModel(ctx); } /** - * @brief Select which axes (up to 3) will be assigned to hardware - * dimensions. + * @brief Select types for the reduction kernel's implementation. + * + * There are 5 types of relevance: + * - Source (S=Source) + * - Destination (T=Target) + * - Destination Argument (A=Arg) + * - Index (X=indeX) + * - Accumulator (K=aKKumulator/reduction) */ -static int maxandargmaxSelectHwAxes (maxandargmax_ctx* ctx){ - int i, j, maxI = 0; - size_t maxV; +static void reduxSelectTypes (redux_ctx* ctx){ + /* Deal with the various typecodes. */ + ctx->srcTypeCode = ctx->src->typecode; + ctx->dstTypeCode = ctx->srcTypeCode; + ctx->dstArgTypeCode = GA_SSIZE; + ctx->idxTypeCode = GA_SSIZE; + switch(ctx->srcTypeCode){ + case GA_HALF: ctx->accTypeCode = GA_FLOAT; + case GA_HALF2: ctx->accTypeCode = GA_FLOAT2; + case GA_HALF4: ctx->accTypeCode = GA_FLOAT4; + case GA_HALF8: ctx->accTypeCode = GA_FLOAT8; + case GA_HALF16: ctx->accTypeCode = GA_FLOAT16; + default: ctx->accTypeCode = ctx->srcTypeCode; + } + + /* Get the string version as well. */ + ctx->srcTypeStr = gpuarray_get_type(ctx->srcTypeCode) ->cluda_name; + ctx->dstTypeStr = gpuarray_get_type(ctx->dstTypeCode) ->cluda_name; + ctx->dstArgTypeStr = gpuarray_get_type(ctx->dstArgTypeCode)->cluda_name; + ctx->idxTypeStr = gpuarray_get_type(ctx->idxTypeCode) ->cluda_name; + ctx->accTypeStr = gpuarray_get_type(ctx->accTypeCode) ->cluda_name; +} + +/** + * @brief Select which code model will be used: + * + * - Large (Destination tensor >= SMALL_REDUX_THRESHOLD elements, or + * destination tensor size >= # of reductions per destination + * tensor element): + * All destination elements have their own thread. + * - Small (otherwise): + * Multiple threads cooperate on a single destination element. + */ + +static int reduxSelectModel (redux_ctx* ctx){ + int i, ret; + unsigned numProcs; + size_t localSize; + size_t dstNumElem = 1, reduxPerElem = 1; - ctx->ndh = ctx->ndd<3 ? ctx->ndd : 3; /** - * The ctx->hwAxisLen largest axes are selected and assigned in - * descending order to X, Y, Z. + * Query device for approximate total level of parallelism. If destination + * tensor is so big it can keep all threads busy on individual elements, + * use large code model; Otherwise use small code model, where threads will + * have to cooperate. 
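	 *
	 * As a worked example on a hypothetical device with numProcs = 16 and
	 * localSize = 1024 (roughly 16384 concurrent threads): a (100000, 64)
	 * source reduced over axis 1 gives dstNumElem = 100000 >= 16384, so the
	 * large model is chosen; a (64, 100000) source reduced over axis 1 gives
	 * dstNumElem = 64, which is smaller than both 16384 and
	 * reduxPerElem = 100000, so the small model would be chosen (at present
	 * the selection is forced to the large model; see the BUG note below).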
*/ + + ret = gpucontext_property(ctx->gpuCtx, GA_CTX_PROP_NUMPROCS, &numProcs); + if(ret != GA_NO_ERROR){ + return reduxCleanup(ctx, ret); + } + ret = gpucontext_property(ctx->gpuCtx, GA_CTX_PROP_MAXLSIZE, &localSize); + if(ret != GA_NO_ERROR){ + return reduxCleanup(ctx, ret); + } - for(i=0;indh;i++){ - maxV = 0; - - for(j=0;jnds;j++){ - if(!axisInSet(j, ctx->hwAxisList, i, 0) && - !axisInSet(j, ctx->reduxList, ctx->ndr, 0) && - ctx->src->dimensions[j] >= maxV){ - maxV = ctx->src->dimensions[j]; - maxI = j; - } + + /** + * Compute #elems in dst and # reductions per dst element. + */ + + for(i=0;inds;i++){ + if(axisInSet(i, ctx->reduxList, ctx->nds, NULL)){ + reduxPerElem *= ctx->src->dimensions[i]; + }else{ + dstNumElem *= ctx->src->dimensions[i]; } + } + ctx->largeCodeModel = dstNumElem >= numProcs*localSize || + dstNumElem >= reduxPerElem + || 1;/* BUG: Erase when small code model implemented. */ + /** + * *** IT IS NOW SAFE TO CALL: *** + * - reduxIsLargeModel() + * - reduxIsSmallModel() + * - reduxKernelRequiresDst() + * - reduxKernelRequiresDstArg() + */ + + + return reduxSelectHwAxes(ctx); +} + +/** + * @brief Returns whether we are using the small code model or not. + */ - ctx->hwAxisList[i] = maxI; +static int reduxIsSmallCodeModel (redux_ctx* ctx){ + return !reduxIsLargeCodeModel(ctx); +} + +/** + * @brief Returns whether we are using the large code model or not. + */ + +static int reduxIsLargeCodeModel (redux_ctx* ctx){ + return ctx->largeCodeModel; +} + +/** + * @brief Returns whether the reduction interface requires a dst argument. + */ + +static int reduxHasDst (redux_ctx* ctx){ + switch(ctx->op){ + case GA_REDUCE_ARGMIN: + case GA_REDUCE_ARGMAX: return 0; + default: return 1; } +} + +/** + * @brief Returns whether the reduction interface requires a dstArg argument. + */ - return ctx->ret=GA_NO_ERROR; +static int reduxHasDstArg (redux_ctx* ctx){ + switch(ctx->op){ + case GA_REDUCE_MINANDARGMIN: + case GA_REDUCE_MAXANDARGMAX: + case GA_REDUCE_ARGMIN: + case GA_REDUCE_ARGMAX: return 1; + default: return 0; + } } /** - * @brief Generate the kernel code for MaxAndArgmax. - * - * @return GA_MEMORY_ERROR if not enough memory left; GA_NO_ERROR otherwise. + * @brief Returns whether the generated kernel internally requires a dst + * argument. + * + * This is semantically subtly different from reduxHasDst(). The main + * difference is in the implementation of the GA_REDUCE_ARGMIN/ARGMAX + * reductions; Either *might* require a dst buffer, which will have to be + * allocated, even though it will be discared. */ -static int maxandargmaxGenSource (maxandargmax_ctx* ctx){ - /* Compute internal axis remapping. */ +static int reduxKernelRequiresDst (redux_ctx* ctx){ + switch(ctx->op){ + case GA_REDUCE_ARGMIN: + case GA_REDUCE_ARGMAX: return reduxIsSmallCodeModel(ctx); + default: return 1; + } +} + +/** + * @brief Returns whether the generated kernel internally requires a dstArg + * argument. + * + * This is semantically subtly different from reduxHasDstArg(), since it asks + * whether the reduction, even though it does not accept a dstArg argument, + * still requires a dstArg internally. + */ + +static int reduxKernelRequiresDstArg (redux_ctx* ctx){ + /** + * At present there exists no reduction whose implementation requires + * a dstArg but whose interface does not. + * + * E.g. the max() and min() reductions do NOT currently require a temporary + * buffer for indexes, and will not in the foreseeable future. 
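	 *
	 * Contrast this with reduxKernelRequiresDst(): GA_REDUCE_ARGMIN/ARGMAX
	 * accept no dst at the interface, yet under the small code model the
	 * kernel still needs a scratch dst to hold the running extremum (which is
	 * discarded afterwards), so reduxKernelRequiresDst() returns true for
	 * them only in that case.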
+ */ + + return reduxHasDstArg(ctx); +} + +/** + * @brief Check whether we can add another reduction axis + * (wantReductionAxis=1) or destination axis (wantReductionAxis=0) to + * the hardware axis list. + */ + +static int reduxCanAppendHwAxis (redux_ctx* ctx, int wantReductionAxis){ + if(ctx->ndh >= MAX_HW_DIMS){ + return 0; + }else{ + return wantReductionAxis ? ctx->ndhr < ctx->ndr: + ctx->ndhd < ctx->ndd; + } +} + +/** + * @brief Append the largest reduction axis (wantReductionAxis=1) or + * destination axis (wantReductionAxis=0) that isn't yet in the hardware + * axis list into said hardware axis list. + */ + +static void reduxAppendLargestAxisToHwList(redux_ctx* ctx, int wantReductionAxis){ + int maxI = 0, i, isInHwList, isInReduxList, isInDesiredList, isLargestSoFar; + size_t maxV = 0; + + /* Find */ + for(i=0;inds;i++){ + isInHwList = axisInSet(i, ctx->hwAxisList, ctx->ndh, 0); + isInReduxList = axisInSet(i, ctx->reduxList, ctx->ndr, 0); + isInDesiredList = wantReductionAxis ? isInReduxList : !isInReduxList; + isLargestSoFar = ctx->src->dimensions[i] >= maxV; + + if(!isInHwList && isInDesiredList && isLargestSoFar){ + maxV = ctx->src->dimensions[i]; + maxI = i; + } + } + + /* Append */ + ctx->hwAxisList[ctx->ndh++] = maxI; + if(wantReductionAxis){ + ctx->ndhr++; + }else{ + ctx->ndhd++; + } +} + +/** + * @brief Select which axes (up to MAX_HW_DIMS) will be assigned to hardware + * dimensions. + * + * For the "large" code model: The up-to-MAX_HW_DIMS largest destination tensor + * dimensions are selected. + * For the "small" code model: Up to MAX_HW_DIMS reduction dimensions (largest- + * to-smallest) are selected. If less than + * MAX_HW_DIMS dimensions were selected, + * destination tensor dimensions are selected until + * MAX_HW_DIMS total dimensions are selected, or no + * destination tensors are left. + */ + +static int reduxSelectHwAxes (redux_ctx* ctx){ + if(reduxIsSmallCodeModel(ctx)){ + while(reduxCanAppendHwAxis(ctx, 1)){ + reduxAppendLargestAxisToHwList(ctx, 1); + } + } + + while(reduxCanAppendHwAxis(ctx, 0)){ + reduxAppendLargestAxisToHwList(ctx, 0); + } + + return reduxComputeAxisList(ctx); +} + +/** + * @brief Compute the axis list. + * + * The axis list describes the mapping between the nested loops of the kernel + * as well as their accompanying indices (i0*, i1*, ..., in*) on one hand, and + * the axes of the source tensor. The first axis in the list corresponds to the + * outermost loop and the last axis in the list to the innermost. + * + * The first ctx->ndd axes correspond to the outer loops that iterate over + * each destination element. The last ctx->ndr axes correspond to the inner + * loops that iterate over the dimensions of elements that are to be reduced. + * + * @return GA_MEMORY_ERROR if allocating the list failed; Otherwise, returns + * GA_NO_ERROR. + */ + +static int reduxComputeAxisList (redux_ctx* ctx){ + int i, f=0; + ctx->axisList = malloc(ctx->nds * sizeof(unsigned)); if(!ctx->axisList){ - return ctx->ret=GA_MEMORY_ERROR; + return reduxCleanup(ctx, GA_MEMORY_ERROR); } - maxandargmaxComputeAxisList(ctx); - /* Generate kernel proper. */ - strb_ensure(&ctx->s, 5*1024); - maxandargmaxAppendKernel(ctx); - free(ctx->axisList); - ctx->axisList = NULL; + for(i=0;inds;i++){ + if(!axisInSet(i, ctx->reduxList, ctx->ndr, 0)){ + ctx->axisList[f++] = i; + } + } + memcpy(&ctx->axisList[f], ctx->reduxList, ctx->ndr * sizeof(*ctx->reduxList)); + + + return reduxGenSource(ctx); +} + +/** + * @brief Generate the kernel code for the reduction. 
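 *
 * The generated loop nest follows ctx->axisList as computed above: for
 * example, a 5D source reduced over axes [3,4,1] yields an axis list of
 * [0,2,3,4,1], so indices i0 and i1 drive the destination (outer) loops over
 * source axes 0 and 2, while i2..i4 drive the reduction (inner) loops.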
+ * + * @return GA_MEMORY_ERROR if not enough memory left; GA_NO_ERROR otherwise. + */ + +static int reduxGenSource (redux_ctx* ctx){ + reduxAppendSource(ctx); ctx->sourceCode = strb_cstr(&ctx->s); if(!ctx->sourceCode){ - return ctx->ret=GA_MEMORY_ERROR; + return reduxCleanup(ctx, GA_MEMORY_ERROR); } - - /* Return it. */ - return ctx->ret=GA_NO_ERROR; + + return reduxIsLargeCodeModel(ctx) ? reduxCompileLarge(ctx): + reduxCompileSmall(ctx); } -static void maxandargmaxAppendKernel (maxandargmax_ctx* ctx){ - strb_appends (&ctx->s, "#include \"cluda.h\"\n"); - maxandargmaxAppendTypedefs (ctx); - maxandargmaxAppendPrototype (ctx); - strb_appends (&ctx->s, "{\n"); - maxandargmaxAppendOffsets (ctx); - maxandargmaxAppendIndexDeclarations(ctx); - maxandargmaxAppendRangeCalculations(ctx); - maxandargmaxAppendLoops (ctx); - strb_appends (&ctx->s, "}\n"); +static void reduxAppendSource (redux_ctx* ctx){ + reduxAppendIncludes (ctx); + reduxAppendTypedefs (ctx); + reduxAppendFuncGetInitVal (ctx); + reduxAppendFuncLoadVal (ctx); + reduxAppendFuncReduxVal (ctx); + reduxAppendFuncPreKernel (ctx); + reduxAppendFuncKernel (ctx); + reduxAppendFuncPostKernel (ctx); } -static void maxandargmaxAppendTypedefs (maxandargmax_ctx* ctx){ +static void reduxAppendIncludes (redux_ctx* ctx){ + strb_appends(&ctx->s, "/* Includes */\n"); + strb_appends(&ctx->s, "#include \"cluda.h\"\n"); + strb_appends(&ctx->s, "\n"); + strb_appends(&ctx->s, "\n"); + strb_appends(&ctx->s, "\n"); +} +static void reduxAppendTypedefs (redux_ctx* ctx){ strb_appends(&ctx->s, "/* Typedefs */\n"); - strb_appendf(&ctx->s, "typedef %s T;/* The type of the array being processed. */\n", ctx->dstMaxType); - strb_appendf(&ctx->s, "typedef %s X;/* Index type: signed 32/64-bit. */\n", ctx->dstArgmaxType); + strb_appendf(&ctx->s, "typedef %s S;/* The type of the source array. */\n", ctx->srcTypeStr); + strb_appendf(&ctx->s, "typedef %s T;/* The type of the destination array. */\n", ctx->dstTypeStr); + strb_appendf(&ctx->s, "typedef %s A;/* The type of the destination argument array. */\n", ctx->dstArgTypeStr); + strb_appendf(&ctx->s, "typedef %s X;/* The type of the indices: signed 32/64-bit. */\n", ctx->idxTypeStr); + strb_appendf(&ctx->s, "typedef %s K;/* The type of the accumulator variable. 
*/\n", ctx->accTypeStr); + strb_appends(&ctx->s, "\n"); + strb_appends(&ctx->s, "\n"); + strb_appends(&ctx->s, "\n"); +} +static void reduxAppendFuncGetInitVal (redux_ctx* ctx){ + strb_appends(&ctx->s, "/**\n"); + strb_appends(&ctx->s, " * Initial value function.\n"); + strb_appends(&ctx->s, " */\n"); + strb_appends(&ctx->s, "\n"); + strb_appends(&ctx->s, "WITHIN_KERNEL K getInitVal(void){\n"); + strb_appendf(&ctx->s, "\treturn (%s);\n", ctx->initVal); + strb_appends(&ctx->s, "}\n"); + strb_appends(&ctx->s, "\n"); + strb_appends(&ctx->s, "\n"); + strb_appends(&ctx->s, "\n"); +} +static void reduxAppendFuncLoadVal (redux_ctx* ctx){ + int i; + + strb_appends(&ctx->s, "/**\n"); + strb_appends(&ctx->s, " * Multidimensional source element loader.\n"); + strb_appends(&ctx->s, " *\n"); + strb_appends(&ctx->s, " * Also implements prescalar transformations if any.\n"); + strb_appends(&ctx->s, " */\n"); + strb_appends(&ctx->s, "\n"); + appendIdxes (&ctx->s, "WITHIN_KERNEL K loadVal(", "X i", 0, ctx->nds, "", ""); + if(ctx->nds > 0){ + strb_appends(&ctx->s, ", "); + } + strb_appends(&ctx->s, "const GLOBAL_MEM S* src, const GLOBAL_MEM X* srcSteps){\n"); + strb_appends(&ctx->s, "\tS v = (*(const GLOBAL_MEM S*)((const GLOBAL_MEM char*)src + "); + for(i=0;inds;i++){ + strb_appendf(&ctx->s, "i%d*srcSteps[%d] + \\\n\t ", i, ctx->axisList[i]); + } + strb_appends(&ctx->s, "0));\n"); + strb_appends(&ctx->s, "\treturn v;\n"); + strb_appends(&ctx->s, "}\n"); + strb_appends(&ctx->s, "\n"); strb_appends(&ctx->s, "\n"); strb_appends(&ctx->s, "\n"); +} +static void reduxAppendFuncReduxVal (redux_ctx* ctx){ + int i, anyArgsEmitted = 0; + + /* Function Signature. */ + strb_appends(&ctx->s, "/**\n"); + strb_appends(&ctx->s, " * Global memory value reduction function.\n"); + strb_appends(&ctx->s, " *\n"); + strb_appends(&ctx->s, " * Responsible for either:\n"); + strb_appends(&ctx->s, " * 1) Safe writeback of final value to memory, or\n"); + strb_appends(&ctx->s, " * 2) Safe atomic reduction of partial value into memory.\n"); + strb_appends(&ctx->s, " */\n"); + strb_appends(&ctx->s, "\n"); + appendIdxes (&ctx->s, "WITHIN_KERNEL void reduxVal(", "X i", 0, ctx->ndd, "", ""); + anyArgsEmitted = ctx->ndd>0; + if(anyArgsEmitted){ + strb_appends(&ctx->s, ", "); + } + if(reduxKernelRequiresDst (ctx)){ + anyArgsEmitted = 1; + strb_appends(&ctx->s, "GLOBAL_MEM T* dst, const GLOBAL_MEM X* dstSteps, K v"); + } + if(anyArgsEmitted){ + strb_appends(&ctx->s, ", "); + } + if(reduxKernelRequiresDstArg(ctx)){ + anyArgsEmitted = 1; + strb_appends(&ctx->s, "GLOBAL_MEM A* dstArg, const GLOBAL_MEM X* dstArgSteps, X i"); + } + strb_appends(&ctx->s, "){\n"); + + + /* Post-scalar transformations go here. */ + + + /* Write to memory. */ + if(reduxIsLargeCodeModel(ctx)){ + /* Large code model. Easy: just write out the data, since it's safe. */ + if(reduxKernelRequiresDst (ctx)){ + strb_appends(&ctx->s, "\t(*(GLOBAL_MEM T*)((GLOBAL_MEM char*)dst + "); + for(i=0;indd;i++){ + strb_appendf(&ctx->s, "i%d*dstSteps[%d] +\n\t ", i, i); + } + strb_appends(&ctx->s, "0)) = v;\n"); + } + if(reduxKernelRequiresDstArg(ctx)){ + strb_appends(&ctx->s, "\t(*(GLOBAL_MEM A*)((GLOBAL_MEM char*)dstArg + "); + for(i=0;indd;i++){ + strb_appendf(&ctx->s, "i%d*dstArgSteps[%d] +\n\t ", i, i); + } + strb_appends(&ctx->s, "0)) = i;\n"); + } + }else{ + /* BUG: Implement the atomic reduction, one or two CAS loops. 
*/ + if ( reduxKernelRequiresDst (ctx) && !reduxKernelRequiresDstArg(ctx)){ + + }else if(!reduxKernelRequiresDst (ctx) && reduxKernelRequiresDstArg(ctx)){ + + }else if( reduxKernelRequiresDst (ctx) && reduxKernelRequiresDstArg(ctx)){ + + } + } + + /* Close off function. */ + strb_appends(&ctx->s, "}\n"); + strb_appends(&ctx->s, "\n"); + strb_appends(&ctx->s, "\n"); + strb_appends(&ctx->s, "\n"); +} +static void reduxAppendFuncPreKernel (redux_ctx* ctx){ + +} +static void reduxAppendFuncKernel (redux_ctx* ctx){ + reduxAppendPrototype (ctx); + strb_appends (&ctx->s, "{\n"); + reduxAppendOffsets (ctx); + reduxAppendIndexDeclarations(ctx); + reduxAppendRangeCalculations(ctx); + reduxAppendLoops (ctx); + strb_appends (&ctx->s, "}\n"); +} +static void reduxAppendFuncPostKernel (redux_ctx* ctx){ + +} +static void reduxAppendPrototype (redux_ctx* ctx){ + strb_appends(&ctx->s, "/**\n"); + strb_appends(&ctx->s, " * Reduction Kernel.\n"); + strb_appends(&ctx->s, " *\n"); + strb_appends(&ctx->s, " * Implements actual reduction operation.\n"); + strb_appends(&ctx->s, " */\n"); strb_appends(&ctx->s, "\n"); + strb_appends(&ctx->s, "KERNEL void redux(const GLOBAL_MEM S* src,\n"); + strb_appends(&ctx->s, " const X srcOff,\n"); + strb_appends(&ctx->s, " const GLOBAL_MEM X* srcSteps,\n"); + strb_appends(&ctx->s, " const GLOBAL_MEM X* srcSize,\n"); + strb_appends(&ctx->s, " const GLOBAL_MEM X* chunkSize,\n"); + strb_appends(&ctx->s, " GLOBAL_MEM T* dst,\n"); + strb_appends(&ctx->s, " const X dstOff,\n"); + strb_appends(&ctx->s, " const GLOBAL_MEM X* dstSteps,\n"); + strb_appends(&ctx->s, " GLOBAL_MEM A* dstArg,\n"); + strb_appends(&ctx->s, " const X dstArgOff,\n"); + strb_appends(&ctx->s, " const GLOBAL_MEM X* dstArgSteps)"); } -static void maxandargmaxAppendPrototype (maxandargmax_ctx* ctx){ - strb_appends(&ctx->s, "KERNEL void maxandargmax(const GLOBAL_MEM T* src,\n"); - strb_appends(&ctx->s, " const X srcOff,\n"); - strb_appends(&ctx->s, " const GLOBAL_MEM X* srcSteps,\n"); - strb_appends(&ctx->s, " const GLOBAL_MEM X* srcSize,\n"); - strb_appends(&ctx->s, " const GLOBAL_MEM X* chunkSize,\n"); - strb_appends(&ctx->s, " GLOBAL_MEM T* dstMax,\n"); - strb_appends(&ctx->s, " const X dstMaxOff,\n"); - strb_appends(&ctx->s, " const GLOBAL_MEM X* dstMaxSteps,\n"); - strb_appends(&ctx->s, " GLOBAL_MEM X* dstArgmax,\n"); - strb_appends(&ctx->s, " const X dstArgmaxOff,\n"); - strb_appends(&ctx->s, " const GLOBAL_MEM X* dstArgmaxSteps)"); -} -static void maxandargmaxAppendOffsets (maxandargmax_ctx* ctx){ +static void reduxAppendOffsets (redux_ctx* ctx){ strb_appends(&ctx->s, "\t/* Add offsets */\n"); - strb_appends(&ctx->s, "\tsrc = (const GLOBAL_MEM T*)((const GLOBAL_MEM char*)src + srcOff);\n"); - strb_appends(&ctx->s, "\tdstMax = (GLOBAL_MEM T*) ((GLOBAL_MEM char*) dstMax + dstMaxOff);\n"); - strb_appends(&ctx->s, "\tdstArgmax = (GLOBAL_MEM X*) ((GLOBAL_MEM char*) dstArgmax + dstArgmaxOff);\n"); + strb_appends(&ctx->s, "\tsrc = (const GLOBAL_MEM T*)((const GLOBAL_MEM char*)src + srcOff);\n"); + strb_appends(&ctx->s, "\tdst = (GLOBAL_MEM T*) ((GLOBAL_MEM char*) dst + dstOff);\n"); + strb_appends(&ctx->s, "\tdstArg = (GLOBAL_MEM X*) ((GLOBAL_MEM char*) dstArg + dstArgOff);\n"); strb_appends(&ctx->s, "\t\n"); strb_appends(&ctx->s, "\t\n"); } -static void maxandargmaxAppendIndexDeclarations(maxandargmax_ctx* ctx){ +static void reduxAppendIndexDeclarations (redux_ctx* ctx){ int i; - strb_appends(&ctx->s, "\t/* GPU kernel coordinates. Always 3D. */\n"); + strb_appends(&ctx->s, "\t/* GPU kernel coordinates. 
Always 3D in OpenCL/CUDA. */\n"); strb_appends(&ctx->s, "\tX bi0 = GID_0, bi1 = GID_1, bi2 = GID_2;\n"); strb_appends(&ctx->s, "\tX bd0 = LDIM_0, bd1 = LDIM_1, bd2 = LDIM_2;\n"); @@ -393,7 +1317,7 @@ static void maxandargmaxAppendIndexDeclarations(maxandargmax_ctx* ctx){ strb_appends(&ctx->s, "\t\n"); strb_appends(&ctx->s, "\t\n"); } -static void maxandargmaxAppendRangeCalculations(maxandargmax_ctx* ctx){ +static void reduxAppendRangeCalculations (redux_ctx* ctx){ size_t hwDim; int i; @@ -407,10 +1331,10 @@ static void maxandargmaxAppendRangeCalculations(maxandargmax_ctx* ctx){ strb_appendf(&ctx->s, "\ti%dSStep = srcSteps[%d];\n", i, ctx->axisList[i]); } for(i=0;indd;i++){ - strb_appendf(&ctx->s, "\ti%dMStep = dstMaxSteps[%d];\n", i, i); + strb_appendf(&ctx->s, "\ti%dMStep = dstSteps[%d];\n", i, i); } for(i=0;indd;i++){ - strb_appendf(&ctx->s, "\ti%dAStep = dstArgmaxSteps[%d];\n", i, i); + strb_appendf(&ctx->s, "\ti%dAStep = dstArgSteps[%d];\n", i, i); } for(i=ctx->nds-1;i>=ctx->ndd;i--){ /** @@ -426,7 +1350,7 @@ static void maxandargmaxAppendRangeCalculations(maxandargmax_ctx* ctx){ } for(i=0;inds;i++){ /** - * Up to 3 dimensions get to rely on hardware loops. + * Up to MAX_HW_DIMS dimensions get to rely on hardware loops. * The others, if any, have to use software looping beginning at 0. */ @@ -438,7 +1362,7 @@ static void maxandargmaxAppendRangeCalculations(maxandargmax_ctx* ctx){ } for(i=0;inds;i++){ /** - * Up to 3 dimensions get to rely on hardware loops. + * Up to MAX_HW_DIMS dimensions get to rely on hardware loops. * The others, if any, have to use software looping beginning at 0. */ @@ -452,17 +1376,17 @@ static void maxandargmaxAppendRangeCalculations(maxandargmax_ctx* ctx){ strb_appends(&ctx->s, "\t\n"); strb_appends(&ctx->s, "\t\n"); } -static void maxandargmaxAppendLoops (maxandargmax_ctx* ctx){ +static void reduxAppendLoops (redux_ctx* ctx){ strb_appends(&ctx->s, "\t/**\n"); strb_appends(&ctx->s, "\t * FREE LOOPS.\n"); strb_appends(&ctx->s, "\t */\n"); strb_appends(&ctx->s, "\t\n"); - maxandargmaxAppendLoopMacroDefs (ctx); - maxandargmaxAppendLoopOuter (ctx); - maxandargmaxAppendLoopMacroUndefs(ctx); + reduxAppendLoopMacroDefs (ctx); + reduxAppendLoopOuter (ctx); + reduxAppendLoopMacroUndefs(ctx); } -static void maxandargmaxAppendLoopMacroDefs (maxandargmax_ctx* ctx){ +static void reduxAppendLoopMacroDefs (redux_ctx* ctx){ int i; /** @@ -477,16 +1401,6 @@ static void maxandargmaxAppendLoopMacroDefs (maxandargmax_ctx* ctx){ strb_appends(&ctx->s, "#define ESCAPE(idx) if(i##idx >= i##idx##Dim){continue;}\n"); - /** - * SRCINDEXER Macro - */ - - appendIdxes (&ctx->s, "#define SRCINDEXER(", "i", 0, ctx->nds, "", ") (*(GLOBAL_MEM T*)((GLOBAL_MEM char*)src + "); - for(i=0;inds;i++){ - strb_appendf(&ctx->s, "i%d*i%dSStep + \\\n ", i, i); - } - strb_appends(&ctx->s, "0))\n"); - /** * RDXINDEXER Macro */ @@ -496,28 +1410,8 @@ static void maxandargmaxAppendLoopMacroDefs (maxandargmax_ctx* ctx){ strb_appendf(&ctx->s, "i%d*i%dPDim + \\\n ", i, i); } strb_appends(&ctx->s, "0)\n"); - - /** - * DSTMINDEXER Macro - */ - - appendIdxes (&ctx->s, "#define DSTMINDEXER(", "i", 0, ctx->ndd, "", ") (*(GLOBAL_MEM T*)((GLOBAL_MEM char*)dstMax + "); - for(i=0;indd;i++){ - strb_appendf(&ctx->s, "i%d*i%dMStep + \\\n ", i, i); - } - strb_appends(&ctx->s, "0))\n"); - - /** - * DSTAINDEXER Macro - */ - - appendIdxes (&ctx->s, "#define DSTAINDEXER(", "i", 0, ctx->ndd, "", ") (*(GLOBAL_MEM X*)((GLOBAL_MEM char*)dstArgmax + "); - for(i=0;indd;i++){ - strb_appendf(&ctx->s, "i%d*i%dAStep + \\\n ", i, i); - } - 
strb_appends(&ctx->s, "0))\n"); } -static void maxandargmaxAppendLoopOuter (maxandargmax_ctx* ctx){ +static void reduxAppendLoopOuter (redux_ctx* ctx){ int i; /** @@ -532,7 +1426,7 @@ static void maxandargmaxAppendLoopOuter (maxandargmax_ctx* ctx){ * Inner Loop Generation */ - maxandargmaxAppendLoopInner(ctx); + reduxAppendLoopInner(ctx); /** * Outer Loop Trailer Generation @@ -542,87 +1436,111 @@ static void maxandargmaxAppendLoopOuter (maxandargmax_ctx* ctx){ strb_appends(&ctx->s, "\t}\n"); } } -static void maxandargmaxAppendLoopInner (maxandargmax_ctx* ctx){ +static void reduxAppendLoopInner (redux_ctx* ctx){ int i; /** * Inner Loop Prologue */ - strb_appends(&ctx->s, "\t/**\n"); - strb_appends(&ctx->s, "\t * Reduction initialization.\n"); - strb_appends(&ctx->s, "\t */\n"); - strb_appends(&ctx->s, "\t\n"); - - appendIdxes (&ctx->s, "\tT maxV = SRCINDEXER(", "i", 0, ctx->ndd, "", ""); - if(ctx->ndd && ctx->ndr){strb_appends(&ctx->s, ",");} - appendIdxes (&ctx->s, "", "i", ctx->ndd, ctx->nds, "Start", ");\n"); - - appendIdxes (&ctx->s, "\tX maxI = RDXINDEXER(", "i", ctx->ndd, ctx->nds, "Start", ");\n"); - - strb_appends(&ctx->s, "\t\n"); - strb_appends(&ctx->s, "\t/**\n"); - strb_appends(&ctx->s, "\t * REDUCTION LOOPS.\n"); - strb_appends(&ctx->s, "\t */\n"); - strb_appends(&ctx->s, "\t\n"); + strb_appends(&ctx->s, "\t\t/**\n"); + strb_appends(&ctx->s, "\t\t * Reduction initialization.\n"); + strb_appends(&ctx->s, "\t\t */\n"); + strb_appends(&ctx->s, "\t\t\n"); + strb_appends(&ctx->s, "\t\tK rdxV = getInitVal();\n"); + strb_appends(&ctx->s, "\t\tX argI = 0;\n"); + strb_appends(&ctx->s, "\t\t\n"); + strb_appends(&ctx->s, "\t\t/**\n"); + strb_appends(&ctx->s, "\t\t * REDUCTION LOOPS.\n"); + strb_appends(&ctx->s, "\t\t */\n"); + strb_appends(&ctx->s, "\t\t\n"); /** * Inner Loop Header Generation */ for(i=ctx->ndd;inds;i++){ - strb_appendf(&ctx->s, "\tFOROVER(%d){ESCAPE(%d)\n", i, i); + strb_appendf(&ctx->s, "\t\tFOROVER(%d){ESCAPE(%d)\n", i, i); } /** * Inner Loop Body Generation */ - appendIdxes (&ctx->s, "\tT V = SRCINDEXER(", "i", 0, ctx->nds, "", ");\n"); - strb_appends(&ctx->s, "\t\n"); - strb_appends(&ctx->s, "\tif(V > maxV){\n"); - strb_appends(&ctx->s, "\t\tmaxV = V;\n"); - appendIdxes (&ctx->s, "\t\tmaxI = RDXINDEXER(", "i", ctx->ndd, ctx->nds, "", ");\n"); - strb_appends(&ctx->s, "\t}\n"); + appendIdxes (&ctx->s, "\t\t\tK v = loadVal(", "i", 0, ctx->nds, "", ""); + if(ctx->nds > 0){ + strb_appends(&ctx->s, ", "); + } + strb_appends(&ctx->s, "src, srcSteps);\n"); + strb_appends(&ctx->s, "\t\t\t\n"); + switch(ctx->op){ + case GA_REDUCE_SUM: strb_appends(&ctx->s, "\t\t\trdxV += v;\n"); break; + case GA_REDUCE_PROD: strb_appends(&ctx->s, "\t\t\trdxV *= v;\n"); break; + case GA_REDUCE_PRODNZ: strb_appends(&ctx->s, "\t\t\trdxV *= v==0 ? 
getInitVal() : v;\n"); break; + case GA_REDUCE_MIN: strb_appends(&ctx->s, "\t\t\trdxV = min(rdxV, v);\n"); break; + case GA_REDUCE_MAX: strb_appends(&ctx->s, "\t\t\trdxV = max(rdxV, v);\n"); break; + case GA_REDUCE_ARGMIN: + case GA_REDUCE_MINANDARGMIN: + strb_appends(&ctx->s, "\t\t\trdxV = min(rdxV, v);\n"); + strb_appends(&ctx->s, "\t\t\tif(v == rdxV){\n"); + appendIdxes (&ctx->s, "\t\t\t\targI = RDXINDEXER(", "i", ctx->ndd, ctx->nds, "", ");\n"); + strb_appends(&ctx->s, "\t\t\t}\n"); + break; + case GA_REDUCE_ARGMAX: + case GA_REDUCE_MAXANDARGMAX: + strb_appends(&ctx->s, "\t\t\trdxV = max(rdxV, v);\n"); + strb_appends(&ctx->s, "\t\t\tif(v == rdxV){\n"); + appendIdxes (&ctx->s, "\t\t\t\targI = RDXINDEXER(", "i", ctx->ndd, ctx->nds, "", ");\n"); + strb_appends(&ctx->s, "\t\t\t}\n"); + break; + case GA_REDUCE_AND: strb_appends(&ctx->s, "\t\t\trdxV &= v;\n"); break; + case GA_REDUCE_OR: strb_appends(&ctx->s, "\t\t\trdxV |= v;\n"); break; + case GA_REDUCE_XOR: strb_appends(&ctx->s, "\t\t\trdxV ^= v;\n"); break; + case GA_REDUCE_ALL: strb_appends(&ctx->s, "\t\t\trdxV = rdxV && v;\n"); break; + case GA_REDUCE_ANY: strb_appends(&ctx->s, "\t\t\trdxV = rdxV || v;\n"); break; + } /** * Inner Loop Trailer Generation */ for(i=ctx->ndd;inds;i++){ - strb_appends(&ctx->s, "\t}\n"); + strb_appends(&ctx->s, "\t\t}\n"); } - strb_appends(&ctx->s, "\t\n"); + strb_appends(&ctx->s, "\t\t\n"); /** * Inner Loop Epilogue Generation */ - strb_appends(&ctx->s, "\t/**\n"); - strb_appends(&ctx->s, "\t * Destination writeback.\n"); - strb_appends(&ctx->s, "\t */\n"); - strb_appends(&ctx->s, "\t\n"); - appendIdxes (&ctx->s, "\tDSTMINDEXER(", "i", 0, ctx->ndd, "", ") = maxV;\n"); - appendIdxes (&ctx->s, "\tDSTAINDEXER(", "i", 0, ctx->ndd, "", ") = maxI;\n"); + strb_appends(&ctx->s, "\t\t/**\n"); + strb_appends(&ctx->s, "\t\t * Destination writeback.\n"); + strb_appends(&ctx->s, "\t\t */\n"); + strb_appends(&ctx->s, "\t\t\n"); + if ( reduxKernelRequiresDst (ctx) && !reduxKernelRequiresDstArg(ctx)){ + appendIdxes (&ctx->s, "\t\treduxVal(", "i", 0, ctx->ndd, "", ""); + if(ctx->ndd > 0){ + strb_appends(&ctx->s, ", "); + } + strb_appends(&ctx->s, "dst, dstSteps, rdxV);\n"); + }else if(!reduxKernelRequiresDst (ctx) && reduxKernelRequiresDstArg(ctx)){ + appendIdxes (&ctx->s, "\t\treduxVal(", "i", 0, ctx->ndd, "", ""); + if(ctx->ndd > 0){ + strb_appends(&ctx->s, ", "); + } + strb_appends(&ctx->s, "dstArg, dstArgSteps, argI);\n"); + }else if( reduxKernelRequiresDst (ctx) && reduxKernelRequiresDstArg(ctx)){ + appendIdxes (&ctx->s, "\t\treduxVal(", "i", 0, ctx->ndd, "", ""); + if(ctx->ndd > 0){ + strb_appends(&ctx->s, ", "); + } + strb_appends(&ctx->s, "dst, dstSteps, rdxV, dstArg, dstArgSteps, argI);\n"); + } } -static void maxandargmaxAppendLoopMacroUndefs (maxandargmax_ctx* ctx){ +static void reduxAppendLoopMacroUndefs (redux_ctx* ctx){ strb_appends(&ctx->s, "#undef FOROVER\n"); strb_appends(&ctx->s, "#undef ESCAPE\n"); - strb_appends(&ctx->s, "#undef SRCINDEXER\n"); strb_appends(&ctx->s, "#undef RDXINDEXER\n"); - strb_appends(&ctx->s, "#undef DSTMINDEXER\n"); - strb_appends(&ctx->s, "#undef DSTAINDEXER\n"); -} -static void maxandargmaxComputeAxisList (maxandargmax_ctx* ctx){ - int i, f=0; - - for(i=0;inds;i++){ - if(axisInSet(i, ctx->reduxList, ctx->ndr, 0)){ - continue; - } - ctx->axisList[f++] = i; - } - memcpy(&ctx->axisList[f], ctx->reduxList, ctx->ndr * sizeof(*ctx->reduxList)); } /** @@ -631,59 +1549,65 @@ static void maxandargmaxComputeAxisList (maxandargmax_ctx* ctx){ * @return */ -static int maxandargmaxCompile 
(maxandargmax_ctx* ctx){ +static int reduxCompileLarge (redux_ctx* ctx){ const int ARG_TYPECODES[] = { GA_BUFFER, /* src */ GA_SIZE, /* srcOff */ GA_BUFFER, /* srcSteps */ GA_BUFFER, /* srcSize */ GA_BUFFER, /* chnkSize */ - GA_BUFFER, /* dstMax */ - GA_SIZE, /* dstMaxOff */ - GA_BUFFER, /* dstMaxSteps */ - GA_BUFFER, /* dstArgmax */ - GA_SIZE, /* dstArgmaxOff */ - GA_BUFFER /* dstArgmaxSteps */ + GA_BUFFER, /* dst */ + GA_SIZE, /* dstOff */ + GA_BUFFER, /* dstSteps */ + GA_BUFFER, /* dstArg */ + GA_SIZE, /* dstArgOff */ + GA_BUFFER /* dstArgSteps */ }; - const unsigned int ARG_TYPECODES_LEN = sizeof(ARG_TYPECODES)/sizeof(*ARG_TYPECODES); - const char* SRCS[1]; - - SRCS[0] = ctx->sourceCode; - - ctx->ret = GpuKernel_init(&ctx->kernel, + const size_t ARG_TYPECODES_LEN = sizeof(ARG_TYPECODES)/sizeof(*ARG_TYPECODES); + const char* SRCS[1] = {ctx->sourceCode}; + const size_t SRC_LENS[1] = {strlen(ctx->sourceCode)}; + const size_t SRCS_LEN = sizeof(SRCS)/sizeof(*SRCS); + + int ret = GpuKernel_init(&ctx->kernel, ctx->gpuCtx, - 1, + SRCS_LEN, SRCS, - NULL, - "maxandargmax", + SRC_LENS, + "redux", ARG_TYPECODES_LEN, ARG_TYPECODES, 0, (char**)0); - free(ctx->sourceCode); - ctx->sourceCode = NULL; - return ctx->ret; + if(ret != GA_NO_ERROR){ + return reduxCleanup(ctx, ret); + }else{ + return reduxScheduleLarge(ctx); + } +} +static int reduxCompileSmall (redux_ctx* ctx){ + /* BUG: Implement small code model. */ + return reduxCompileLarge(ctx); } /** * Compute a good thread block size / grid size / software chunk size for Nvidia. */ -static int maxandargmaxSchedule (maxandargmax_ctx* ctx){ +static int reduxScheduleLarge (redux_ctx* ctx){ int i; size_t warpMod; size_t bestWarpMod = 1; unsigned bestWarpAxis = 0; uint64_t maxLg; - uint64_t maxLs[3]; + uint64_t maxLs[MAX_HW_DIMS]; uint64_t maxGg; - uint64_t maxGs[3]; - uint64_t dims [3]; - double slack[3]; - ga_factor_list factBS[3]; - ga_factor_list factGS[3]; - ga_factor_list factCS[3]; + uint64_t maxGs [MAX_HW_DIMS]; + uint64_t dims [MAX_HW_DIMS]; + double slack [MAX_HW_DIMS]; + ga_factor_list factBS[MAX_HW_DIMS]; + ga_factor_list factGS[MAX_HW_DIMS]; + ga_factor_list factCS[MAX_HW_DIMS]; /** @@ -772,76 +1696,78 @@ static int maxandargmaxSchedule (maxandargmax_ctx* ctx){ } /* Return. */ - return ctx->ret=GA_NO_ERROR; + return reduxInvokeLarge(ctx); } /** * Invoke the kernel. */ -static int maxandargmaxInvoke (maxandargmax_ctx* ctx){ +static int reduxInvokeLarge (redux_ctx* ctx){ void* args[11]; + int ret; /** * Argument Marshalling. This the grossest gross thing in here. 
*/ - const int flags = GA_BUFFER_READ_ONLY|GA_BUFFER_INIT; - ctx->srcStepsGD = gpudata_alloc(ctx->gpuCtx, ctx->nds * sizeof(size_t), - ctx->src->strides, flags, 0); - ctx->srcSizeGD = gpudata_alloc(ctx->gpuCtx, ctx->nds * sizeof(size_t), - ctx->src->dimensions, flags, 0); - ctx->chunkSizeGD = gpudata_alloc(ctx->gpuCtx, ctx->ndh * sizeof(size_t), - ctx->chunkSize, flags, 0); - ctx->dstMaxStepsGD = gpudata_alloc(ctx->gpuCtx, ctx->ndd * sizeof(size_t), - ctx->dstMax->strides, flags, 0); - ctx->dstArgmaxStepsGD = gpudata_alloc(ctx->gpuCtx, ctx->ndd * sizeof(size_t), - ctx->dstArgmax->strides, flags, 0); + const int flags = GA_BUFFER_READ_ONLY|GA_BUFFER_INIT; + ctx->srcStepsGD = gpudata_alloc(ctx->gpuCtx, ctx->nds * sizeof(size_t), + ctx->src->strides, flags, 0); + ctx->srcSizeGD = gpudata_alloc(ctx->gpuCtx, ctx->nds * sizeof(size_t), + ctx->src->dimensions, flags, 0); + ctx->chunkSizeGD = gpudata_alloc(ctx->gpuCtx, ctx->ndh * sizeof(size_t), + ctx->chunkSize, flags, 0); + ctx->dstStepsGD = gpudata_alloc(ctx->gpuCtx, ctx->ndd * sizeof(size_t), + ctx->dst->strides, flags, 0); + ctx->dstArgStepsGD = gpudata_alloc(ctx->gpuCtx, ctx->ndd * sizeof(size_t), + ctx->dstArg->strides, flags, 0); args[ 0] = (void*) ctx->src->data; args[ 1] = (void*)&ctx->src->offset; args[ 2] = (void*) ctx->srcStepsGD; args[ 3] = (void*) ctx->srcSizeGD; args[ 4] = (void*) ctx->chunkSizeGD; - args[ 5] = (void*) ctx->dstMax->data; - args[ 6] = (void*)&ctx->dstMax->offset; - args[ 7] = (void*) ctx->dstMaxStepsGD; - args[ 8] = (void*) ctx->dstArgmax->data; - args[ 9] = (void*)&ctx->dstArgmax->offset; - args[10] = (void*) ctx->dstArgmaxStepsGD; - - if(ctx->srcStepsGD && - ctx->srcSizeGD && - ctx->chunkSizeGD && - ctx->dstMaxStepsGD && - ctx->dstArgmaxStepsGD){ - ctx->ret = GpuKernel_call(&ctx->kernel, - ctx->ndh>0 ? ctx->ndh : 1, - ctx->gridSize, - ctx->blockSize, - 0, - args); + args[ 5] = (void*) ctx->dst->data; + args[ 6] = (void*)&ctx->dst->offset; + args[ 7] = (void*) ctx->dstStepsGD; + args[ 8] = (void*) ctx->dstArg->data; + args[ 9] = (void*)&ctx->dstArg->offset; + args[10] = (void*) ctx->dstArgStepsGD; + + if(ctx->srcStepsGD && + ctx->srcSizeGD && + ctx->chunkSizeGD && + ctx->dstStepsGD && + ctx->dstArgStepsGD){ + ret = GpuKernel_call(&ctx->kernel, + ctx->ndh>0 ? ctx->ndh : 1, + ctx->gridSize, + ctx->blockSize, + 0, + args); + return reduxCleanup(ctx, ret); }else{ - ctx->ret = GA_MEMORY_ERROR; + return reduxCleanup(ctx, GA_MEMORY_ERROR); } - - gpudata_release(ctx->srcStepsGD); - gpudata_release(ctx->srcSizeGD); - gpudata_release(ctx->chunkSizeGD); - gpudata_release(ctx->dstMaxStepsGD); - gpudata_release(ctx->dstArgmaxStepsGD); - - return ctx->ret; } /** * Cleanup */ -static int maxandargmaxCleanup (maxandargmax_ctx* ctx){ +static int reduxCleanup (redux_ctx* ctx, int ret){ free(ctx->axisList); free(ctx->sourceCode); - ctx->axisList = NULL; - ctx->sourceCode = NULL; + ctx->axisList = NULL; + ctx->sourceCode = NULL; + + gpudata_release(ctx->srcStepsGD); + gpudata_release(ctx->srcSizeGD); + gpudata_release(ctx->chunkSizeGD); + gpudata_release(ctx->dstStepsGD); + gpudata_release(ctx->dstArgStepsGD); + ctx->srcStepsGD = ctx->srcSizeGD = ctx->chunkSizeGD = + ctx->dstStepsGD = ctx->dstArgStepsGD = NULL; - return ctx->ret; + return ret; } diff --git a/tests/check_reduction.c b/tests/check_reduction.c index ca3f231bf4..d8c14aa572 100644 --- a/tests/check_reduction.c +++ b/tests/check_reduction.c @@ -67,15 +67,14 @@ static double pcgRand01(void){ * Test cases. 
*/ -START_TEST(test_reduction){ +START_TEST(test_maxandargmax_reduction){ + pcgSeed(1); + /** * We test here a reduction of some random 3D tensor on the first and * third dimensions. */ - GpuArray gaSrc; - GpuArray gaMax; - GpuArray gaArgmax; size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; @@ -94,7 +93,6 @@ START_TEST(test_reduction){ * Initialize source data. */ - pcgSeed(1); for(i=0;i Date: Wed, 25 Jan 2017 19:23:44 -0500 Subject: [PATCH 02/34] Add strb_init() function. It allows initializing at runtime an strb. This can't always be done at compile-time, for instance if it is dynamically allocated. --- src/util/strb.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/util/strb.h b/src/util/strb.h index 3289de5796..223145908e 100644 --- a/src/util/strb.h +++ b/src/util/strb.h @@ -77,6 +77,15 @@ static inline int strb_error(strb *sb) { return sb->l == (size_t)-1; } +/* + * Initialize at runtime an strb. + */ + +static inline void strb_init(strb* sb){ + const strb s = STRB_STATIC_INIT; + *sb = s; +} + /* * Clear any allocation the strb may have done and reset all of its From a21bcb575f1deb5b6ccfc6a66682c3b6edb2b0e3 Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Wed, 25 Jan 2017 19:50:58 -0500 Subject: [PATCH 03/34] Moved the reduction API to reduction.h. --- src/CMakeLists.txt | 1 + src/gpuarray/array.h | 125 ------------------------------- src/gpuarray/reduction.h | 157 +++++++++++++++++++++++++++++++++++++++ src/gpuarray_reduction.c | 71 ++++++++++++------ tests/check_reduction.c | 81 +++++++++++++++++++- 5 files changed, 284 insertions(+), 151 deletions(-) create mode 100644 src/gpuarray/reduction.h diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index b687e5da1a..1505014e5e 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -143,6 +143,7 @@ set(headers gpuarray/extension.h gpuarray/ext_cuda.h gpuarray/kernel.h + gpuarray/reduction.h gpuarray/types.h gpuarray/util.h ) diff --git a/src/gpuarray/array.h b/src/gpuarray/array.h index 5ea9377b9a..639d176489 100644 --- a/src/gpuarray/array.h +++ b/src/gpuarray/array.h @@ -118,27 +118,6 @@ typedef enum _ga_order { GA_F_ORDER=1 } ga_order; -/** - * Supported array reduction operations. - */ - -typedef enum _ga_reduce_op { - GA_REDUCE_SUM, /* + */ - GA_REDUCE_PROD, /* * */ - GA_REDUCE_PRODNZ, /* * (!=0) */ - GA_REDUCE_MIN, /* min() */ - GA_REDUCE_MAX, /* max() */ - GA_REDUCE_ARGMIN, /* argmin() */ - GA_REDUCE_ARGMAX, /* argmax() */ - GA_REDUCE_MINANDARGMIN, /* min(), argmin() */ - GA_REDUCE_MAXANDARGMAX, /* max(), argmax() */ - GA_REDUCE_AND, /* & */ - GA_REDUCE_OR, /* | */ - GA_REDUCE_XOR, /* ^ */ - GA_REDUCE_ALL, /* &&/all() */ - GA_REDUCE_ANY, /* ||/any() */ -} ga_reduce_op; - /** * Checks if all the specified flags are set. * @@ -626,110 +605,6 @@ GPUARRAY_PUBLIC void GpuArray_fprintf(FILE *fd, const GpuArray *a); GPUARRAY_PUBLIC int GpuArray_fdump(FILE *fd, const GpuArray *a); -/** - * @brief Compute a reduction sum (+), product (*), non-zero product (* != 0), - * min, max, argmin, argmax, min-and-argmin, max-and-argmax, and (&), - * or (|), xor (^), all (&&) or any (||) over a list of axes to reduce. - * - * Returns one (in the case of min-and-argmin/max-and-argmax, two) destination - * tensors. The destination tensor(s)' axes are a strict subset of the axes of the - * source tensor. The axes to be reduced are specified by the caller, and the - * reduction is performed over these axes, which are then removed in the - * destination. 
- * - * @param [out] dst The destination tensor. Has the same type as the source. - * @param [out] dstArg For argument of minima/maxima operations. Has type int64. - * @param [in] src The source tensor. - * @param [in] reduxLen The number of axes reduced. Must be >= 1 and - * <= src->nd. - * @param [in] reduxList A list of integers of length reduxLen, indicating - * the axes to be reduced. The order of the axes - * matters for dstArg index calculations (GpuArray_argmin, - * GpuArray_argmax, GpuArray_minandargmin, - * GpuArray_maxandargmax). All entries in the list must be - * unique, >= 0 and < src->nd. - * - * For example, if a 5D-tensor is max-reduced with an axis - * list of [3,4,1], then reduxLen shall be 3, and the - * index calculation in every point shall take the form - * - * dstArgmax[i0,i2] = i3 * src.shape[4] * src.shape[1] + - * i4 * src.shape[1] + - * i1 - * - * where (i3,i4,i1) are the coordinates of the maximum- - * valued element within subtensor [i0,:,i2,:,:] of src. - * @return GA_NO_ERROR if the operation was successful, or a non-zero error - * code otherwise. - */ - -GPUARRAY_PUBLIC int GpuArray_sum (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); -GPUARRAY_PUBLIC int GpuArray_prod (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); -GPUARRAY_PUBLIC int GpuArray_prodnz (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); -GPUARRAY_PUBLIC int GpuArray_min (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); -GPUARRAY_PUBLIC int GpuArray_max (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); -GPUARRAY_PUBLIC int GpuArray_argmin (GpuArray* dstArg, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); -GPUARRAY_PUBLIC int GpuArray_argmax (GpuArray* dstArg, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); -GPUARRAY_PUBLIC int GpuArray_minandargmin(GpuArray* dst, - GpuArray* dstArg, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); -GPUARRAY_PUBLIC int GpuArray_maxandargmax(GpuArray* dst, - GpuArray* dstArg, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); -GPUARRAY_PUBLIC int GpuArray_and (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); -GPUARRAY_PUBLIC int GpuArray_or (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); -GPUARRAY_PUBLIC int GpuArray_xor (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); -GPUARRAY_PUBLIC int GpuArray_all (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); -GPUARRAY_PUBLIC int GpuArray_any (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); -GPUARRAY_PUBLIC int GpuArray_reduction (ga_reduce_op op, - GpuArray* dst, - GpuArray* dstArg, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); - - - #ifdef __cplusplus diff --git a/src/gpuarray/reduction.h b/src/gpuarray/reduction.h new file mode 100644 index 0000000000..1db5664535 --- /dev/null +++ b/src/gpuarray/reduction.h @@ -0,0 +1,157 @@ +#ifndef GPUARRAY_REDUCTION_H +#define GPUARRAY_REDUCTION_H +/** + * \file reduction.h + * \brief Reduction functions. 
+ */ + +#include + +#ifdef _MSC_VER +#ifndef inline +#define inline __inline +#endif +#endif + +#ifdef __cplusplus +extern "C" { +#endif +#ifdef CONFUSE_EMACS +} +#endif + + +/** + * Supported array reduction operations. + */ + +typedef enum _ga_reduce_op { + GA_REDUCE_SUM, /* + */ + GA_REDUCE_PROD, /* * */ + GA_REDUCE_PRODNZ, /* * (!=0) */ + GA_REDUCE_MIN, /* min() */ + GA_REDUCE_MAX, /* max() */ + GA_REDUCE_ARGMIN, /* argmin() */ + GA_REDUCE_ARGMAX, /* argmax() */ + GA_REDUCE_MINANDARGMIN, /* min(), argmin() */ + GA_REDUCE_MAXANDARGMAX, /* max(), argmax() */ + GA_REDUCE_AND, /* & */ + GA_REDUCE_OR, /* | */ + GA_REDUCE_XOR, /* ^ */ + GA_REDUCE_ALL, /* &&/all() */ + GA_REDUCE_ANY, /* ||/any() */ +} ga_reduce_op; + + + +/** + * @brief Compute a reduction sum (+), product (*), non-zero product (* != 0), + * min, max, argmin, argmax, min-and-argmin, max-and-argmax, and (&), + * or (|), xor (^), all (&&) or any (||) over a list of axes to reduce. + * + * Returns one (in the case of min-and-argmin/max-and-argmax, two) destination + * tensors. The destination tensor(s)' axes are a strict subset of the axes of the + * source tensor. The axes to be reduced are specified by the caller, and the + * reduction is performed over these axes, which are then removed in the + * destination. + * + * @param [out] dst The destination tensor. Has the same type as the source. + * @param [out] dstArg For argument of minima/maxima operations. Has type int64. + * @param [in] src The source tensor. + * @param [in] reduxLen The number of axes reduced. Must be >= 1 and + * <= src->nd. + * @param [in] reduxList A list of integers of length reduxLen, indicating + * the axes to be reduced. The order of the axes + * matters for dstArg index calculations (GpuArray_argmin, + * GpuArray_argmax, GpuArray_minandargmin, + * GpuArray_maxandargmax). All entries in the list must be + * unique, >= 0 and < src->nd. + * + * For example, if a 5D-tensor is max-reduced with an axis + * list of [3,4,1], then reduxLen shall be 3, and the + * index calculation in every point shall take the form + * + * dstArgmax[i0,i2] = i3 * src.shape[4] * src.shape[1] + + * i4 * src.shape[1] + + * i1 + * + * where (i3,i4,i1) are the coordinates of the maximum- + * valued element within subtensor [i0,:,i2,:,:] of src. + * @return GA_NO_ERROR if the operation was successful, or a non-zero error + * code otherwise. 
+ */ + +GPUARRAY_PUBLIC int GpuArray_sum (GpuArray* dst, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList); +GPUARRAY_PUBLIC int GpuArray_prod (GpuArray* dst, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList); +GPUARRAY_PUBLIC int GpuArray_prodnz (GpuArray* dst, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList); +GPUARRAY_PUBLIC int GpuArray_min (GpuArray* dst, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList); +GPUARRAY_PUBLIC int GpuArray_max (GpuArray* dst, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList); +GPUARRAY_PUBLIC int GpuArray_argmin (GpuArray* dstArg, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList); +GPUARRAY_PUBLIC int GpuArray_argmax (GpuArray* dstArg, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList); +GPUARRAY_PUBLIC int GpuArray_minandargmin(GpuArray* dst, + GpuArray* dstArg, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList); +GPUARRAY_PUBLIC int GpuArray_maxandargmax(GpuArray* dst, + GpuArray* dstArg, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList); +GPUARRAY_PUBLIC int GpuArray_and (GpuArray* dst, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList); +GPUARRAY_PUBLIC int GpuArray_or (GpuArray* dst, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList); +GPUARRAY_PUBLIC int GpuArray_xor (GpuArray* dst, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList); +GPUARRAY_PUBLIC int GpuArray_all (GpuArray* dst, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList); +GPUARRAY_PUBLIC int GpuArray_any (GpuArray* dst, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList); +GPUARRAY_PUBLIC int GpuArray_reduction (ga_reduce_op op, + GpuArray* dst, + GpuArray* dstArg, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList); + + + + + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/gpuarray_reduction.c b/src/gpuarray_reduction.c index 8a6a2dc98b..859da8e272 100644 --- a/src/gpuarray_reduction.c +++ b/src/gpuarray_reduction.c @@ -12,9 +12,9 @@ #include #include "private.h" -#include "gpuarray/array.h" #include "gpuarray/error.h" #include "gpuarray/kernel.h" +#include "gpuarray/reduction.h" #include "gpuarray/util.h" #include "util/strb.h" @@ -704,7 +704,6 @@ static void appendIdxes (strb* s, static int reduxCheckargs (redux_ctx* ctx){ int i, ret; - const strb INIT_STRB = STRB_STATIC_INIT; /** * We initialize certain parts of the context. @@ -720,7 +719,7 @@ static int reduxCheckargs (redux_ctx* ctx){ ctx->ndhd = 0; ctx->ndhr = 0; ctx->sourceCode = NULL; - ctx->s = INIT_STRB; + strb_init(&ctx->s); for(i=0;ihwAxisList[i] = 0; @@ -1169,6 +1168,10 @@ static void reduxAppendFuncLoadVal (redux_ctx* ctx){ strb_appendf(&ctx->s, "i%d*srcSteps[%d] + \\\n\t ", i, ctx->axisList[i]); } strb_appends(&ctx->s, "0));\n"); + + /* Prescalar transformations go here... */ + + /* Return the value. 
*/ strb_appends(&ctx->s, "\treturn v;\n"); strb_appends(&ctx->s, "}\n"); strb_appends(&ctx->s, "\n"); @@ -1189,17 +1192,17 @@ static void reduxAppendFuncReduxVal (redux_ctx* ctx){ strb_appends(&ctx->s, "\n"); appendIdxes (&ctx->s, "WITHIN_KERNEL void reduxVal(", "X i", 0, ctx->ndd, "", ""); anyArgsEmitted = ctx->ndd>0; - if(anyArgsEmitted){ - strb_appends(&ctx->s, ", "); - } if(reduxKernelRequiresDst (ctx)){ + if(anyArgsEmitted){ + strb_appends(&ctx->s, ", "); + } anyArgsEmitted = 1; strb_appends(&ctx->s, "GLOBAL_MEM T* dst, const GLOBAL_MEM X* dstSteps, K v"); } - if(anyArgsEmitted){ - strb_appends(&ctx->s, ", "); - } if(reduxKernelRequiresDstArg(ctx)){ + if(anyArgsEmitted){ + strb_appends(&ctx->s, ", "); + } anyArgsEmitted = 1; strb_appends(&ctx->s, "GLOBAL_MEM A* dstArg, const GLOBAL_MEM X* dstArgSteps, X i"); } @@ -1248,12 +1251,12 @@ static void reduxAppendFuncPreKernel (redux_ctx* ctx){ } static void reduxAppendFuncKernel (redux_ctx* ctx){ reduxAppendPrototype (ctx); - strb_appends (&ctx->s, "{\n"); + strb_appends (&ctx->s, "{\n"); reduxAppendOffsets (ctx); reduxAppendIndexDeclarations(ctx); reduxAppendRangeCalculations(ctx); reduxAppendLoops (ctx); - strb_appends (&ctx->s, "}\n"); + strb_appends (&ctx->s, "}\n"); } static void reduxAppendFuncPostKernel (redux_ctx* ctx){ @@ -1280,8 +1283,12 @@ static void reduxAppendPrototype (redux_ctx* ctx){ static void reduxAppendOffsets (redux_ctx* ctx){ strb_appends(&ctx->s, "\t/* Add offsets */\n"); strb_appends(&ctx->s, "\tsrc = (const GLOBAL_MEM T*)((const GLOBAL_MEM char*)src + srcOff);\n"); - strb_appends(&ctx->s, "\tdst = (GLOBAL_MEM T*) ((GLOBAL_MEM char*) dst + dstOff);\n"); - strb_appends(&ctx->s, "\tdstArg = (GLOBAL_MEM X*) ((GLOBAL_MEM char*) dstArg + dstArgOff);\n"); + if(reduxKernelRequiresDst(ctx)){ + strb_appends(&ctx->s, "\tdst = (GLOBAL_MEM T*) ((GLOBAL_MEM char*) dst + dstOff);\n"); + } + if(reduxKernelRequiresDstArg(ctx)){ + strb_appends(&ctx->s, "\tdstArg = (GLOBAL_MEM X*) ((GLOBAL_MEM char*) dstArg + dstArgOff);\n"); + } strb_appends(&ctx->s, "\t\n"); strb_appends(&ctx->s, "\t\n"); } @@ -1448,7 +1455,9 @@ static void reduxAppendLoopInner (redux_ctx* ctx){ strb_appends(&ctx->s, "\t\t */\n"); strb_appends(&ctx->s, "\t\t\n"); strb_appends(&ctx->s, "\t\tK rdxV = getInitVal();\n"); - strb_appends(&ctx->s, "\t\tX argI = 0;\n"); + if(reduxKernelRequiresDstArg(ctx)){ + strb_appends(&ctx->s, "\t\tX argI = 0;\n"); + } strb_appends(&ctx->s, "\t\t\n"); strb_appends(&ctx->s, "\t\t/**\n"); strb_appends(&ctx->s, "\t\t * REDUCTION LOOPS.\n"); @@ -1718,21 +1727,35 @@ static int reduxInvokeLarge (redux_ctx* ctx){ ctx->src->dimensions, flags, 0); ctx->chunkSizeGD = gpudata_alloc(ctx->gpuCtx, ctx->ndh * sizeof(size_t), ctx->chunkSize, flags, 0); - ctx->dstStepsGD = gpudata_alloc(ctx->gpuCtx, ctx->ndd * sizeof(size_t), - ctx->dst->strides, flags, 0); - ctx->dstArgStepsGD = gpudata_alloc(ctx->gpuCtx, ctx->ndd * sizeof(size_t), - ctx->dstArg->strides, flags, 0); + if(reduxKernelRequiresDst(ctx)){ + ctx->dstStepsGD = gpudata_alloc(ctx->gpuCtx, ctx->ndd * sizeof(size_t), + ctx->dst->strides, flags, 0); + } + if(reduxKernelRequiresDstArg(ctx)){ + ctx->dstArgStepsGD = gpudata_alloc(ctx->gpuCtx, ctx->ndd * sizeof(size_t), + ctx->dstArg->strides, flags, 0); + } args[ 0] = (void*) ctx->src->data; args[ 1] = (void*)&ctx->src->offset; args[ 2] = (void*) ctx->srcStepsGD; args[ 3] = (void*) ctx->srcSizeGD; args[ 4] = (void*) ctx->chunkSizeGD; - args[ 5] = (void*) ctx->dst->data; - args[ 6] = (void*)&ctx->dst->offset; - args[ 7] = (void*) ctx->dstStepsGD; - 
args[ 8] = (void*) ctx->dstArg->data; - args[ 9] = (void*)&ctx->dstArg->offset; - args[10] = (void*) ctx->dstArgStepsGD; + if ( reduxKernelRequiresDst (ctx) && reduxKernelRequiresDstArg(ctx)){ + args[ 5] = (void*) ctx->dst->data; + args[ 6] = (void*)&ctx->dst->offset; + args[ 7] = (void*) ctx->dstStepsGD; + args[ 8] = (void*) ctx->dstArg->data; + args[ 9] = (void*)&ctx->dstArg->offset; + args[10] = (void*) ctx->dstArgStepsGD; + }else if( reduxKernelRequiresDst (ctx) && !reduxKernelRequiresDstArg(ctx)){ + args[ 5] = (void*) ctx->dst->data; + args[ 6] = (void*)&ctx->dst->offset; + args[ 7] = (void*) ctx->dstStepsGD; + }else if(!reduxKernelRequiresDst (ctx) && reduxKernelRequiresDstArg(ctx)){ + args[ 5] = (void*) ctx->dstArg->data; + args[ 6] = (void*)&ctx->dstArg->offset; + args[ 7] = (void*) ctx->dstArgStepsGD; + } if(ctx->srcStepsGD && ctx->srcSizeGD && diff --git a/tests/check_reduction.c b/tests/check_reduction.c index d8c14aa572..2d47d6541d 100644 --- a/tests/check_reduction.c +++ b/tests/check_reduction.c @@ -1,8 +1,7 @@ #include -#include -#include #include +#include #include #include @@ -610,6 +609,83 @@ START_TEST(test_minandargmin_alldimsreduced){ GpuArray_clear(&gaArgmin); }END_TEST +START_TEST(test_min_alldimsreduced){ + pcgSeed(1); + + /** + * We test here a reduction of some random 3D tensor on all dimensions. + */ + + size_t i,j,k; + size_t dims[3] = {32,50,79}; + size_t prodDims = dims[0]*dims[1]*dims[2]; + const unsigned reduxList[] = {0,1,2}; + + float* pSrc = calloc(1, sizeof(*pSrc) * dims[0]*dims[1]*dims[2]); + float* pMin = calloc(1, sizeof(*pMin) ); + + ck_assert_ptr_ne(pSrc, NULL); + ck_assert_ptr_ne(pMin, NULL); + + + /** + * Initialize source data. + */ + + for(i=0;i Date: Wed, 25 Jan 2017 20:09:18 -0500 Subject: [PATCH 04/34] Feedback Applied. Spaces after a bunch of keywords, do certain string appends more efficiently, no appending includes. 
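A minimal sketch of the two mechanical rewrites applied here, using only the
existing strb_appends() calls from the generator (illustrative sketch, not a
hunk from this patch):

    /* Before: keyword glued to the parenthesis, one append per newline. */
    if(ret != GA_NO_ERROR){
        strb_appends(&ctx->s, "\n");
        strb_appends(&ctx->s, "\n");
        strb_appends(&ctx->s, "\n");
    }

    /* After: space after the keyword, the newlines emitted in one append. */
    if (ret != GA_NO_ERROR){
        strb_appends(&ctx->s, "\n\n\n");
    }

Merging consecutive appends of literal strings produces the same generated
text while doing the strb bookkeeping once.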
--- src/gpuarray_reduction.c | 289 ++++++++++++++++++--------------------- 1 file changed, 136 insertions(+), 153 deletions(-) diff --git a/src/gpuarray_reduction.c b/src/gpuarray_reduction.c index 859da8e272..5534d84c36 100644 --- a/src/gpuarray_reduction.c +++ b/src/gpuarray_reduction.c @@ -395,8 +395,8 @@ GPUARRAY_PUBLIC int GpuArray_reduction (ga_reduce_op op, unsigned reduxLen, const unsigned* reduxList){ redux_ctx ctxSTACK = {op, dst, dstArg, src, - (int)reduxLen, (const int*)reduxList}, - *ctx = &ctxSTACK; + (int)reduxLen, (const int*)reduxList}; + redux_ctx *ctx = &ctxSTACK; return reduxCheckargs(ctx); } @@ -413,8 +413,8 @@ GPUARRAY_PUBLIC int GpuArray_reduction (ga_reduce_op op, */ static int reduxGetSumInit (int typecode, const char** property){ - if(typecode == GA_POINTER || - typecode == GA_BUFFER){ + if (typecode == GA_POINTER || + typecode == GA_BUFFER){ return GA_UNSUPPORTED_ERROR; } *property = "0"; @@ -433,8 +433,8 @@ static int reduxGetSumInit (int typecode, const char** property) */ static int reduxGetProdInit (int typecode, const char** property){ - if(typecode == GA_POINTER || - typecode == GA_BUFFER){ + if (typecode == GA_POINTER || + typecode == GA_BUFFER){ return GA_UNSUPPORTED_ERROR; } *property = "1"; @@ -453,7 +453,7 @@ static int reduxGetProdInit (int typecode, const char** property) */ static int reduxGetMinInit (int typecode, const char** property){ - switch(typecode){ + switch (typecode){ case GA_BYTE2: case GA_BYTE3: case GA_BYTE4: @@ -528,7 +528,7 @@ static int reduxGetMinInit (int typecode, const char** property) */ static int reduxGetMaxInit (int typecode, const char** property){ - switch(typecode){ + switch (typecode){ case GA_BOOL: *property = "1"; break; case GA_BYTE2: case GA_BYTE3: @@ -602,8 +602,8 @@ static int reduxGetMaxInit (int typecode, const char** property) */ static int reduxGetAndInit (int typecode, const char** property){ - if(typecode == GA_POINTER || - typecode == GA_BUFFER){ + if (typecode == GA_POINTER || + typecode == GA_BUFFER){ return GA_UNSUPPORTED_ERROR; } *property = "~0"; @@ -622,8 +622,8 @@ static int reduxGetAndInit (int typecode, const char** property) */ static int reduxGetOrInit (int typecode, const char** property){ - if(typecode == GA_POINTER || - typecode == GA_BUFFER){ + if (typecode == GA_POINTER || + typecode == GA_BUFFER){ return GA_UNSUPPORTED_ERROR; } *property = "0"; @@ -646,9 +646,9 @@ static int axisInSet (int v, size_t* where){ size_t i; - for(i=0;isourceCode = NULL; strb_init(&ctx->s); - for(i=0;ihwAxisList[i] = 0; ctx->blockSize [i] = 1; ctx->gridSize [i] = 1; @@ -734,21 +734,21 @@ static int reduxCheckargs (redux_ctx* ctx){ /* Insane src, reduxLen, dst or dstArg? */ - if(!ctx->src || ctx->src->nd <= 0 || ctx->reduxLen == 0 || - ctx->reduxLen > (int)ctx->src->nd){ + if (!ctx->src || ctx->src->nd <= 0 || ctx->reduxLen == 0 || + ctx->reduxLen > (int)ctx->src->nd){ return reduxCleanup(ctx, GA_INVALID_ERROR); } - if((reduxHasDst (ctx) && !ctx->dst) || - (reduxHasDstArg(ctx) && !ctx->dstArg)){ + if ((reduxHasDst (ctx) && !ctx->dst) || + (reduxHasDstArg(ctx) && !ctx->dstArg)){ return reduxCleanup(ctx, GA_INVALID_ERROR); } /* Insane or duplicate list entry? 
*/ - for(i=0;ireduxLen;i++){ - if(ctx->reduxList[i] < 0 || - ctx->reduxList[i] >= (int)ctx->src->nd || - axisInSet(ctx->reduxList[i], ctx->reduxList, i, 0)){ + for (i=0;ireduxLen;i++){ + if (ctx->reduxList[i] < 0 || + ctx->reduxList[i] >= (int)ctx->src->nd || + axisInSet(ctx->reduxList[i], ctx->reduxList, i, 0)){ return reduxCleanup(ctx, GA_INVALID_ERROR); } } @@ -756,21 +756,21 @@ static int reduxCheckargs (redux_ctx* ctx){ /* GPU context non-existent? */ ctx->gpuCtx = GpuArray_context(ctx->src); - if(!ctx->gpuCtx){ + if (!ctx->gpuCtx){ return reduxCleanup(ctx, GA_INVALID_ERROR); } /* Unknown type? */ reduxSelectTypes(ctx); - if(!ctx->srcTypeStr || !ctx->dstTypeStr || !ctx->dstArgTypeStr || - !ctx->accTypeStr){ + if (!ctx->srcTypeStr || !ctx->dstTypeStr || !ctx->dstArgTypeStr || + !ctx->accTypeStr){ return reduxCleanup(ctx, GA_INVALID_ERROR); } /* Determine initializer, and error out if reduction unsupported. */ - switch(ctx->op){ + switch (ctx->op){ case GA_REDUCE_SUM: ret = reduxGetSumInit (ctx->accTypeCode, &ctx->initVal); break; case GA_REDUCE_PRODNZ: case GA_REDUCE_PROD: ret = reduxGetProdInit(ctx->accTypeCode, &ctx->initVal); break; @@ -787,7 +787,7 @@ static int reduxCheckargs (redux_ctx* ctx){ case GA_REDUCE_OR: ret = reduxGetOrInit (ctx->accTypeCode, &ctx->initVal); break; default: ret = GA_UNSUPPORTED_ERROR; break; } - if(ret != GA_NO_ERROR){ + if (ret != GA_NO_ERROR){ return reduxCleanup(ctx, ret); } @@ -824,7 +824,7 @@ static void reduxSelectTypes (redux_ctx* ctx){ ctx->dstTypeCode = ctx->srcTypeCode; ctx->dstArgTypeCode = GA_SSIZE; ctx->idxTypeCode = GA_SSIZE; - switch(ctx->srcTypeCode){ + switch (ctx->srcTypeCode){ case GA_HALF: ctx->accTypeCode = GA_FLOAT; case GA_HALF2: ctx->accTypeCode = GA_FLOAT2; case GA_HALF4: ctx->accTypeCode = GA_FLOAT4; @@ -867,11 +867,11 @@ static int reduxSelectModel (redux_ctx* ctx){ */ ret = gpucontext_property(ctx->gpuCtx, GA_CTX_PROP_NUMPROCS, &numProcs); - if(ret != GA_NO_ERROR){ + if (ret != GA_NO_ERROR){ return reduxCleanup(ctx, ret); } ret = gpucontext_property(ctx->gpuCtx, GA_CTX_PROP_MAXLSIZE, &localSize); - if(ret != GA_NO_ERROR){ + if (ret != GA_NO_ERROR){ return reduxCleanup(ctx, ret); } @@ -880,8 +880,8 @@ static int reduxSelectModel (redux_ctx* ctx){ * Compute #elems in dst and # reductions per dst element. */ - for(i=0;inds;i++){ - if(axisInSet(i, ctx->reduxList, ctx->nds, NULL)){ + for (i=0;inds;i++){ + if (axisInSet(i, ctx->reduxList, ctx->nds, NULL)){ reduxPerElem *= ctx->src->dimensions[i]; }else{ dstNumElem *= ctx->src->dimensions[i]; @@ -923,7 +923,7 @@ static int reduxIsLargeCodeModel (redux_ctx* ctx){ */ static int reduxHasDst (redux_ctx* ctx){ - switch(ctx->op){ + switch (ctx->op){ case GA_REDUCE_ARGMIN: case GA_REDUCE_ARGMAX: return 0; default: return 1; @@ -935,7 +935,7 @@ static int reduxHasDst (redux_ctx* ctx){ */ static int reduxHasDstArg (redux_ctx* ctx){ - switch(ctx->op){ + switch (ctx->op){ case GA_REDUCE_MINANDARGMIN: case GA_REDUCE_MAXANDARGMAX: case GA_REDUCE_ARGMIN: @@ -955,7 +955,7 @@ static int reduxHasDstArg (redux_ctx* ctx){ */ static int reduxKernelRequiresDst (redux_ctx* ctx){ - switch(ctx->op){ + switch (ctx->op){ case GA_REDUCE_ARGMIN: case GA_REDUCE_ARGMAX: return reduxIsSmallCodeModel(ctx); default: return 1; @@ -990,7 +990,7 @@ static int reduxKernelRequiresDstArg (redux_ctx* ctx){ */ static int reduxCanAppendHwAxis (redux_ctx* ctx, int wantReductionAxis){ - if(ctx->ndh >= MAX_HW_DIMS){ + if (ctx->ndh >= MAX_HW_DIMS){ return 0; }else{ return wantReductionAxis ? 
ctx->ndhr < ctx->ndr: @@ -1009,13 +1009,13 @@ static void reduxAppendLargestAxisToHwList(redux_ctx* ctx, int wantReductionAxi size_t maxV = 0; /* Find */ - for(i=0;inds;i++){ + for (i=0;inds;i++){ isInHwList = axisInSet(i, ctx->hwAxisList, ctx->ndh, 0); isInReduxList = axisInSet(i, ctx->reduxList, ctx->ndr, 0); isInDesiredList = wantReductionAxis ? isInReduxList : !isInReduxList; isLargestSoFar = ctx->src->dimensions[i] >= maxV; - if(!isInHwList && isInDesiredList && isLargestSoFar){ + if (!isInHwList && isInDesiredList && isLargestSoFar){ maxV = ctx->src->dimensions[i]; maxI = i; } @@ -1023,7 +1023,7 @@ static void reduxAppendLargestAxisToHwList(redux_ctx* ctx, int wantReductionAxi /* Append */ ctx->hwAxisList[ctx->ndh++] = maxI; - if(wantReductionAxis){ + if (wantReductionAxis){ ctx->ndhr++; }else{ ctx->ndhd++; @@ -1045,7 +1045,7 @@ static void reduxAppendLargestAxisToHwList(redux_ctx* ctx, int wantReductionAxi */ static int reduxSelectHwAxes (redux_ctx* ctx){ - if(reduxIsSmallCodeModel(ctx)){ + if (reduxIsSmallCodeModel(ctx)){ while(reduxCanAppendHwAxis(ctx, 1)){ reduxAppendLargestAxisToHwList(ctx, 1); } @@ -1078,12 +1078,12 @@ static int reduxComputeAxisList (redux_ctx* ctx){ int i, f=0; ctx->axisList = malloc(ctx->nds * sizeof(unsigned)); - if(!ctx->axisList){ + if (!ctx->axisList){ return reduxCleanup(ctx, GA_MEMORY_ERROR); } - for(i=0;inds;i++){ - if(!axisInSet(i, ctx->reduxList, ctx->ndr, 0)){ + for (i=0;inds;i++){ + if (!axisInSet(i, ctx->reduxList, ctx->ndr, 0)){ ctx->axisList[f++] = i; } } @@ -1102,7 +1102,7 @@ static int reduxComputeAxisList (redux_ctx* ctx){ static int reduxGenSource (redux_ctx* ctx){ reduxAppendSource(ctx); ctx->sourceCode = strb_cstr(&ctx->s); - if(!ctx->sourceCode){ + if (!ctx->sourceCode){ return reduxCleanup(ctx, GA_MEMORY_ERROR); } @@ -1133,21 +1133,15 @@ static void reduxAppendTypedefs (redux_ctx* ctx){ strb_appendf(&ctx->s, "typedef %s A;/* The type of the destination argument array. */\n", ctx->dstArgTypeStr); strb_appendf(&ctx->s, "typedef %s X;/* The type of the indices: signed 32/64-bit. */\n", ctx->idxTypeStr); strb_appendf(&ctx->s, "typedef %s K;/* The type of the accumulator variable. 
*/\n", ctx->accTypeStr); - strb_appends(&ctx->s, "\n"); - strb_appends(&ctx->s, "\n"); - strb_appends(&ctx->s, "\n"); + strb_appends(&ctx->s, "\n\n\n"); } static void reduxAppendFuncGetInitVal (redux_ctx* ctx){ strb_appends(&ctx->s, "/**\n"); strb_appends(&ctx->s, " * Initial value function.\n"); - strb_appends(&ctx->s, " */\n"); - strb_appends(&ctx->s, "\n"); + strb_appends(&ctx->s, " */\n\n"); strb_appends(&ctx->s, "WITHIN_KERNEL K getInitVal(void){\n"); strb_appendf(&ctx->s, "\treturn (%s);\n", ctx->initVal); - strb_appends(&ctx->s, "}\n"); - strb_appends(&ctx->s, "\n"); - strb_appends(&ctx->s, "\n"); - strb_appends(&ctx->s, "\n"); + strb_appends(&ctx->s, "}\n\n\n\n"); } static void reduxAppendFuncLoadVal (redux_ctx* ctx){ int i; @@ -1159,12 +1153,12 @@ static void reduxAppendFuncLoadVal (redux_ctx* ctx){ strb_appends(&ctx->s, " */\n"); strb_appends(&ctx->s, "\n"); appendIdxes (&ctx->s, "WITHIN_KERNEL K loadVal(", "X i", 0, ctx->nds, "", ""); - if(ctx->nds > 0){ + if (ctx->nds > 0){ strb_appends(&ctx->s, ", "); } strb_appends(&ctx->s, "const GLOBAL_MEM S* src, const GLOBAL_MEM X* srcSteps){\n"); strb_appends(&ctx->s, "\tS v = (*(const GLOBAL_MEM S*)((const GLOBAL_MEM char*)src + "); - for(i=0;inds;i++){ + for (i=0;inds;i++){ strb_appendf(&ctx->s, "i%d*srcSteps[%d] + \\\n\t ", i, ctx->axisList[i]); } strb_appends(&ctx->s, "0));\n"); @@ -1173,10 +1167,7 @@ static void reduxAppendFuncLoadVal (redux_ctx* ctx){ /* Return the value. */ strb_appends(&ctx->s, "\treturn v;\n"); - strb_appends(&ctx->s, "}\n"); - strb_appends(&ctx->s, "\n"); - strb_appends(&ctx->s, "\n"); - strb_appends(&ctx->s, "\n"); + strb_appends(&ctx->s, "}\n\n\n\n"); } static void reduxAppendFuncReduxVal (redux_ctx* ctx){ int i, anyArgsEmitted = 0; @@ -1192,15 +1183,15 @@ static void reduxAppendFuncReduxVal (redux_ctx* ctx){ strb_appends(&ctx->s, "\n"); appendIdxes (&ctx->s, "WITHIN_KERNEL void reduxVal(", "X i", 0, ctx->ndd, "", ""); anyArgsEmitted = ctx->ndd>0; - if(reduxKernelRequiresDst (ctx)){ - if(anyArgsEmitted){ + if (reduxKernelRequiresDst (ctx)){ + if (anyArgsEmitted){ strb_appends(&ctx->s, ", "); } anyArgsEmitted = 1; strb_appends(&ctx->s, "GLOBAL_MEM T* dst, const GLOBAL_MEM X* dstSteps, K v"); } - if(reduxKernelRequiresDstArg(ctx)){ - if(anyArgsEmitted){ + if (reduxKernelRequiresDstArg(ctx)){ + if (anyArgsEmitted){ strb_appends(&ctx->s, ", "); } anyArgsEmitted = 1; @@ -1213,38 +1204,35 @@ static void reduxAppendFuncReduxVal (redux_ctx* ctx){ /* Write to memory. */ - if(reduxIsLargeCodeModel(ctx)){ + if (reduxIsLargeCodeModel(ctx)){ /* Large code model. Easy: just write out the data, since it's safe. */ - if(reduxKernelRequiresDst (ctx)){ + if (reduxKernelRequiresDst (ctx)){ strb_appends(&ctx->s, "\t(*(GLOBAL_MEM T*)((GLOBAL_MEM char*)dst + "); - for(i=0;indd;i++){ + for (i=0;indd;i++){ strb_appendf(&ctx->s, "i%d*dstSteps[%d] +\n\t ", i, i); } strb_appends(&ctx->s, "0)) = v;\n"); } - if(reduxKernelRequiresDstArg(ctx)){ + if (reduxKernelRequiresDstArg(ctx)){ strb_appends(&ctx->s, "\t(*(GLOBAL_MEM A*)((GLOBAL_MEM char*)dstArg + "); - for(i=0;indd;i++){ + for (i=0;indd;i++){ strb_appendf(&ctx->s, "i%d*dstArgSteps[%d] +\n\t ", i, i); } strb_appends(&ctx->s, "0)) = i;\n"); } }else{ /* BUG: Implement the atomic reduction, one or two CAS loops. 
*/ - if ( reduxKernelRequiresDst (ctx) && !reduxKernelRequiresDstArg(ctx)){ + if ( reduxKernelRequiresDst (ctx) && !reduxKernelRequiresDstArg(ctx)){ - }else if(!reduxKernelRequiresDst (ctx) && reduxKernelRequiresDstArg(ctx)){ + }else if (!reduxKernelRequiresDst (ctx) && reduxKernelRequiresDstArg(ctx)){ - }else if( reduxKernelRequiresDst (ctx) && reduxKernelRequiresDstArg(ctx)){ + }else if ( reduxKernelRequiresDst (ctx) && reduxKernelRequiresDstArg(ctx)){ } } /* Close off function. */ - strb_appends(&ctx->s, "}\n"); - strb_appends(&ctx->s, "\n"); - strb_appends(&ctx->s, "\n"); - strb_appends(&ctx->s, "\n"); + strb_appends(&ctx->s, "}\n\n\n\n"); } static void reduxAppendFuncPreKernel (redux_ctx* ctx){ @@ -1266,8 +1254,7 @@ static void reduxAppendPrototype (redux_ctx* ctx){ strb_appends(&ctx->s, " * Reduction Kernel.\n"); strb_appends(&ctx->s, " *\n"); strb_appends(&ctx->s, " * Implements actual reduction operation.\n"); - strb_appends(&ctx->s, " */\n"); - strb_appends(&ctx->s, "\n"); + strb_appends(&ctx->s, " */\n\n"); strb_appends(&ctx->s, "KERNEL void redux(const GLOBAL_MEM S* src,\n"); strb_appends(&ctx->s, " const X srcOff,\n"); strb_appends(&ctx->s, " const GLOBAL_MEM X* srcSteps,\n"); @@ -1283,14 +1270,13 @@ static void reduxAppendPrototype (redux_ctx* ctx){ static void reduxAppendOffsets (redux_ctx* ctx){ strb_appends(&ctx->s, "\t/* Add offsets */\n"); strb_appends(&ctx->s, "\tsrc = (const GLOBAL_MEM T*)((const GLOBAL_MEM char*)src + srcOff);\n"); - if(reduxKernelRequiresDst(ctx)){ + if (reduxKernelRequiresDst(ctx)){ strb_appends(&ctx->s, "\tdst = (GLOBAL_MEM T*) ((GLOBAL_MEM char*) dst + dstOff);\n"); } - if(reduxKernelRequiresDstArg(ctx)){ + if (reduxKernelRequiresDstArg(ctx)){ strb_appends(&ctx->s, "\tdstArg = (GLOBAL_MEM X*) ((GLOBAL_MEM char*) dstArg + dstArgOff);\n"); } - strb_appends(&ctx->s, "\t\n"); - strb_appends(&ctx->s, "\t\n"); + strb_appends(&ctx->s, "\t\n\t\n"); } static void reduxAppendIndexDeclarations (redux_ctx* ctx){ int i; @@ -1300,29 +1286,27 @@ static void reduxAppendIndexDeclarations (redux_ctx* ctx){ strb_appends(&ctx->s, "\tX bd0 = LDIM_0, bd1 = LDIM_1, bd2 = LDIM_2;\n"); strb_appends(&ctx->s, "\tX ti0 = LID_0, ti1 = LID_1, ti2 = LID_2;\n"); strb_appends(&ctx->s, "\tX gi0 = bi0*bd0+ti0, gi1 = bi1*bd1+ti1, gi2 = bi2*bd2+ti2;\n"); - if(ctx->ndh>0){ + if (ctx->ndh>0){ strb_appends(&ctx->s, "\tX "); - for(i=0;indh;i++){ + for (i=0;indh;i++){ strb_appendf(&ctx->s, "ci%u = chunkSize[%u]%s", i, i, (i==ctx->ndh-1) ? 
";\n" : ", "); } } - strb_appends(&ctx->s, "\t\n"); - strb_appends(&ctx->s, "\t\n"); + strb_appends(&ctx->s, "\t\n\t\n"); strb_appends(&ctx->s, "\t/* Free indices & Reduction indices */\n"); - if(ctx->nds > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "", ";\n");} - if(ctx->nds > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "Dim", ";\n");} - if(ctx->nds > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "Start", ";\n");} - if(ctx->nds > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "End", ";\n");} - if(ctx->nds > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "SStep", ";\n");} - if(ctx->ndd > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->ndd, "MStep", ";\n");} - if(ctx->ndd > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->ndd, "AStep", ";\n");} - if(ctx->nds > ctx->ndd){appendIdxes (&ctx->s, "\tX ", "i", ctx->ndd, ctx->nds, "PDim", ";\n");} + if (ctx->nds > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "", ";\n");} + if (ctx->nds > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "Dim", ";\n");} + if (ctx->nds > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "Start", ";\n");} + if (ctx->nds > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "End", ";\n");} + if (ctx->nds > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "SStep", ";\n");} + if (ctx->ndd > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->ndd, "MStep", ";\n");} + if (ctx->ndd > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->ndd, "AStep", ";\n");} + if (ctx->nds > ctx->ndd){appendIdxes (&ctx->s, "\tX ", "i", ctx->ndd, ctx->nds, "PDim", ";\n");} - strb_appends(&ctx->s, "\t\n"); - strb_appends(&ctx->s, "\t\n"); + strb_appends(&ctx->s, "\t\n\t\n"); } static void reduxAppendRangeCalculations (redux_ctx* ctx){ size_t hwDim; @@ -1331,57 +1315,56 @@ static void reduxAppendRangeCalculations (redux_ctx* ctx){ /* Use internal remapping when computing the ranges for this thread. */ strb_appends(&ctx->s, "\t/* Compute ranges for this thread. */\n"); - for(i=0;inds;i++){ + for (i=0;inds;i++){ strb_appendf(&ctx->s, "\ti%dDim = srcSize[%d];\n", i, ctx->axisList[i]); } - for(i=0;inds;i++){ + for (i=0;inds;i++){ strb_appendf(&ctx->s, "\ti%dSStep = srcSteps[%d];\n", i, ctx->axisList[i]); } - for(i=0;indd;i++){ + for (i=0;indd;i++){ strb_appendf(&ctx->s, "\ti%dMStep = dstSteps[%d];\n", i, i); } - for(i=0;indd;i++){ + for (i=0;indd;i++){ strb_appendf(&ctx->s, "\ti%dAStep = dstArgSteps[%d];\n", i, i); } - for(i=ctx->nds-1;i>=ctx->ndd;i--){ + for (i=ctx->nds-1;i>=ctx->ndd;i--){ /** * If this is the last index, it's the first cumulative dimension * product we generate, and thus we initialize to 1. */ - if(i == ctx->nds-1){ + if (i == ctx->nds-1){ strb_appendf(&ctx->s, "\ti%dPDim = 1;\n", i); }else{ strb_appendf(&ctx->s, "\ti%dPDim = i%dPDim * i%dDim;\n", i, i+1, i+1); } } - for(i=0;inds;i++){ + for (i=0;inds;i++){ /** * Up to MAX_HW_DIMS dimensions get to rely on hardware loops. * The others, if any, have to use software looping beginning at 0. */ - if(axisInSet(ctx->axisList[i], ctx->hwAxisList, ctx->ndh, &hwDim)){ + if (axisInSet(ctx->axisList[i], ctx->hwAxisList, ctx->ndh, &hwDim)){ strb_appendf(&ctx->s, "\ti%dStart = gi%d * ci%d;\n", i, hwDim, hwDim); }else{ strb_appendf(&ctx->s, "\ti%dStart = 0;\n", i); } } - for(i=0;inds;i++){ + for (i=0;inds;i++){ /** * Up to MAX_HW_DIMS dimensions get to rely on hardware loops. * The others, if any, have to use software looping beginning at 0. 
*/ - if(axisInSet(ctx->axisList[i], ctx->hwAxisList, ctx->ndh, &hwDim)){ + if (axisInSet(ctx->axisList[i], ctx->hwAxisList, ctx->ndh, &hwDim)){ strb_appendf(&ctx->s, "\ti%dEnd = i%dStart + ci%d;\n", i, i, hwDim); }else{ strb_appendf(&ctx->s, "\ti%dEnd = i%dStart + i%dDim;\n", i, i, i); } } - strb_appends(&ctx->s, "\t\n"); - strb_appends(&ctx->s, "\t\n"); + strb_appends(&ctx->s, "\t\n\t\n"); } static void reduxAppendLoops (redux_ctx* ctx){ strb_appends(&ctx->s, "\t/**\n"); @@ -1413,7 +1396,7 @@ static void reduxAppendLoopMacroDefs (redux_ctx* ctx){ */ appendIdxes (&ctx->s, "#define RDXINDEXER(", "i", ctx->ndd, ctx->nds, "", ") ("); - for(i=ctx->ndd;inds;i++){ + for (i=ctx->ndd;inds;i++){ strb_appendf(&ctx->s, "i%d*i%dPDim + \\\n ", i, i); } strb_appends(&ctx->s, "0)\n"); @@ -1425,7 +1408,7 @@ static void reduxAppendLoopOuter (redux_ctx* ctx){ * Outer Loop Header Generation */ - for(i=0;indd;i++){ + for (i=0;indd;i++){ strb_appendf(&ctx->s, "\tFOROVER(%d){ESCAPE(%d)\n", i, i); } @@ -1439,7 +1422,7 @@ static void reduxAppendLoopOuter (redux_ctx* ctx){ * Outer Loop Trailer Generation */ - for(i=0;indd;i++){ + for (i=0;indd;i++){ strb_appends(&ctx->s, "\t}\n"); } } @@ -1455,7 +1438,7 @@ static void reduxAppendLoopInner (redux_ctx* ctx){ strb_appends(&ctx->s, "\t\t */\n"); strb_appends(&ctx->s, "\t\t\n"); strb_appends(&ctx->s, "\t\tK rdxV = getInitVal();\n"); - if(reduxKernelRequiresDstArg(ctx)){ + if (reduxKernelRequiresDstArg(ctx)){ strb_appends(&ctx->s, "\t\tX argI = 0;\n"); } strb_appends(&ctx->s, "\t\t\n"); @@ -1468,7 +1451,7 @@ static void reduxAppendLoopInner (redux_ctx* ctx){ * Inner Loop Header Generation */ - for(i=ctx->ndd;inds;i++){ + for (i=ctx->ndd;inds;i++){ strb_appendf(&ctx->s, "\t\tFOROVER(%d){ESCAPE(%d)\n", i, i); } @@ -1477,12 +1460,12 @@ static void reduxAppendLoopInner (redux_ctx* ctx){ */ appendIdxes (&ctx->s, "\t\t\tK v = loadVal(", "i", 0, ctx->nds, "", ""); - if(ctx->nds > 0){ + if (ctx->nds > 0){ strb_appends(&ctx->s, ", "); } strb_appends(&ctx->s, "src, srcSteps);\n"); strb_appends(&ctx->s, "\t\t\t\n"); - switch(ctx->op){ + switch (ctx->op){ case GA_REDUCE_SUM: strb_appends(&ctx->s, "\t\t\trdxV += v;\n"); break; case GA_REDUCE_PROD: strb_appends(&ctx->s, "\t\t\trdxV *= v;\n"); break; case GA_REDUCE_PRODNZ: strb_appends(&ctx->s, "\t\t\trdxV *= v==0 ? 
getInitVal() : v;\n"); break; @@ -1513,7 +1496,7 @@ static void reduxAppendLoopInner (redux_ctx* ctx){ * Inner Loop Trailer Generation */ - for(i=ctx->ndd;inds;i++){ + for (i=ctx->ndd;inds;i++){ strb_appends(&ctx->s, "\t\t}\n"); } strb_appends(&ctx->s, "\t\t\n"); @@ -1526,21 +1509,21 @@ static void reduxAppendLoopInner (redux_ctx* ctx){ strb_appends(&ctx->s, "\t\t * Destination writeback.\n"); strb_appends(&ctx->s, "\t\t */\n"); strb_appends(&ctx->s, "\t\t\n"); - if ( reduxKernelRequiresDst (ctx) && !reduxKernelRequiresDstArg(ctx)){ + if ( reduxKernelRequiresDst (ctx) && !reduxKernelRequiresDstArg(ctx)){ appendIdxes (&ctx->s, "\t\treduxVal(", "i", 0, ctx->ndd, "", ""); - if(ctx->ndd > 0){ + if (ctx->ndd > 0){ strb_appends(&ctx->s, ", "); } strb_appends(&ctx->s, "dst, dstSteps, rdxV);\n"); - }else if(!reduxKernelRequiresDst (ctx) && reduxKernelRequiresDstArg(ctx)){ + }else if (!reduxKernelRequiresDst (ctx) && reduxKernelRequiresDstArg(ctx)){ appendIdxes (&ctx->s, "\t\treduxVal(", "i", 0, ctx->ndd, "", ""); - if(ctx->ndd > 0){ + if (ctx->ndd > 0){ strb_appends(&ctx->s, ", "); } strb_appends(&ctx->s, "dstArg, dstArgSteps, argI);\n"); - }else if( reduxKernelRequiresDst (ctx) && reduxKernelRequiresDstArg(ctx)){ + }else if ( reduxKernelRequiresDst (ctx) && reduxKernelRequiresDstArg(ctx)){ appendIdxes (&ctx->s, "\t\treduxVal(", "i", 0, ctx->ndd, "", ""); - if(ctx->ndd > 0){ + if (ctx->ndd > 0){ strb_appends(&ctx->s, ", "); } strb_appends(&ctx->s, "dst, dstSteps, rdxV, dstArg, dstArgSteps, argI);\n"); @@ -1560,23 +1543,23 @@ static void reduxAppendLoopMacroUndefs (redux_ctx* ctx){ static int reduxCompileLarge (redux_ctx* ctx){ const int ARG_TYPECODES[] = { - GA_BUFFER, /* src */ - GA_SIZE, /* srcOff */ - GA_BUFFER, /* srcSteps */ - GA_BUFFER, /* srcSize */ - GA_BUFFER, /* chnkSize */ - GA_BUFFER, /* dst */ - GA_SIZE, /* dstOff */ - GA_BUFFER, /* dstSteps */ - GA_BUFFER, /* dstArg */ - GA_SIZE, /* dstArgOff */ - GA_BUFFER /* dstArgSteps */ + GA_BUFFER, /* src */ + GA_SIZE, /* srcOff */ + GA_BUFFER, /* srcSteps */ + GA_BUFFER, /* srcSize */ + GA_BUFFER, /* chnkSize */ + GA_BUFFER, /* dst */ + GA_SIZE, /* dstOff */ + GA_BUFFER, /* dstSteps */ + GA_BUFFER, /* dstArg */ + GA_SIZE, /* dstArgOff */ + GA_BUFFER /* dstArgSteps */ }; const size_t ARG_TYPECODES_LEN = sizeof(ARG_TYPECODES)/sizeof(*ARG_TYPECODES); const char* SRCS[1] = {ctx->sourceCode}; const size_t SRC_LENS[1] = {strlen(ctx->sourceCode)}; const size_t SRCS_LEN = sizeof(SRCS)/sizeof(*SRCS); - + int ret = GpuKernel_init(&ctx->kernel, ctx->gpuCtx, SRCS_LEN, @@ -1588,7 +1571,7 @@ static int reduxCompileLarge (redux_ctx* ctx){ 0, (char**)0); - if(ret != GA_NO_ERROR){ + if (ret != GA_NO_ERROR){ return reduxCleanup(ctx, ret); }else{ return reduxScheduleLarge(ctx); @@ -1652,20 +1635,20 @@ static int reduxScheduleLarge (redux_ctx* ctx){ dims[0] = dims[1] = dims[2] = 1; slack[0] = slack[1] = slack[2] = 1.1; - for(i=0;indh;i++){ + for (i=0;indh;i++){ dims[i] = ctx->src->dimensions[ctx->hwAxisList[i]]; gaIFLInit(&factBS[i]); gaIFLInit(&factGS[i]); gaIFLInit(&factCS[i]); warpMod = dims[i]%warpSize; - if(bestWarpMod>0 && (warpMod==0 || warpMod>=bestWarpMod)){ + if (bestWarpMod>0 && (warpMod==0 || warpMod>=bestWarpMod)){ bestWarpAxis = i; bestWarpMod = warpMod; } } - if(ctx->ndh > 0){ + if (ctx->ndh > 0){ dims[bestWarpAxis] = (dims[bestWarpAxis] + warpSize - 1)/warpSize; gaIFactorize(warpSize, 0, 0, &factBS[bestWarpAxis]); } @@ -1676,8 +1659,8 @@ static int reduxScheduleLarge (redux_ctx* ctx){ * chunkSize. 
*/ - for(i=0;indh;i++){ - while(!gaIFactorize(dims[i], (uint64_t)(dims[i]*slack[i]), maxLs[i], &factCS[i])){ + for (i=0;indh;i++){ + while (!gaIFactorize(dims[i], (uint64_t)(dims[i]*slack[i]), maxLs[i], &factCS[i])){ /** * Error! Failed to factorize dimension i with given slack and * k-smoothness constraints! Increase slack. Once slack reaches @@ -1698,7 +1681,7 @@ static int reduxScheduleLarge (redux_ctx* ctx){ gaIFLSchedule(ctx->ndh, maxLg, maxLs, maxGg, maxGs, factBS, factGS, factCS); /* Output. */ - for(i=0;indh;i++){ + for (i=0;indh;i++){ ctx->blockSize[i] = gaIFLGetProduct(&factBS[i]); ctx->gridSize [i] = gaIFLGetProduct(&factGS[i]); ctx->chunkSize[i] = gaIFLGetProduct(&factCS[i]); @@ -1727,11 +1710,11 @@ static int reduxInvokeLarge (redux_ctx* ctx){ ctx->src->dimensions, flags, 0); ctx->chunkSizeGD = gpudata_alloc(ctx->gpuCtx, ctx->ndh * sizeof(size_t), ctx->chunkSize, flags, 0); - if(reduxKernelRequiresDst(ctx)){ + if (reduxKernelRequiresDst(ctx)){ ctx->dstStepsGD = gpudata_alloc(ctx->gpuCtx, ctx->ndd * sizeof(size_t), ctx->dst->strides, flags, 0); } - if(reduxKernelRequiresDstArg(ctx)){ + if (reduxKernelRequiresDstArg(ctx)){ ctx->dstArgStepsGD = gpudata_alloc(ctx->gpuCtx, ctx->ndd * sizeof(size_t), ctx->dstArg->strides, flags, 0); } @@ -1740,28 +1723,28 @@ static int reduxInvokeLarge (redux_ctx* ctx){ args[ 2] = (void*) ctx->srcStepsGD; args[ 3] = (void*) ctx->srcSizeGD; args[ 4] = (void*) ctx->chunkSizeGD; - if ( reduxKernelRequiresDst (ctx) && reduxKernelRequiresDstArg(ctx)){ + if ( reduxKernelRequiresDst (ctx) && reduxKernelRequiresDstArg(ctx)){ args[ 5] = (void*) ctx->dst->data; args[ 6] = (void*)&ctx->dst->offset; args[ 7] = (void*) ctx->dstStepsGD; args[ 8] = (void*) ctx->dstArg->data; args[ 9] = (void*)&ctx->dstArg->offset; args[10] = (void*) ctx->dstArgStepsGD; - }else if( reduxKernelRequiresDst (ctx) && !reduxKernelRequiresDstArg(ctx)){ + }else if ( reduxKernelRequiresDst (ctx) && !reduxKernelRequiresDstArg(ctx)){ args[ 5] = (void*) ctx->dst->data; args[ 6] = (void*)&ctx->dst->offset; args[ 7] = (void*) ctx->dstStepsGD; - }else if(!reduxKernelRequiresDst (ctx) && reduxKernelRequiresDstArg(ctx)){ + }else if (!reduxKernelRequiresDst (ctx) && reduxKernelRequiresDstArg(ctx)){ args[ 5] = (void*) ctx->dstArg->data; args[ 6] = (void*)&ctx->dstArg->offset; args[ 7] = (void*) ctx->dstArgStepsGD; } - if(ctx->srcStepsGD && - ctx->srcSizeGD && - ctx->chunkSizeGD && - ctx->dstStepsGD && - ctx->dstArgStepsGD){ + if (ctx->srcStepsGD && + ctx->srcSizeGD && + ctx->chunkSizeGD && + ctx->dstStepsGD && + ctx->dstArgStepsGD){ ret = GpuKernel_call(&ctx->kernel, ctx->ndh>0 ? ctx->ndh : 1, ctx->gridSize, From a0654c204ff70a54a7a56a25ba22de6463802daf Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Wed, 25 Jan 2017 20:30:35 -0500 Subject: [PATCH 05/34] More style fixes on switches. 
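A representative before/after for the switch layout, mirroring the hunks
below (sketch only):

    /* Before: label, statement and break packed onto one line. */
    case GA_BYTE:     *property = "SCHAR_MIN"; break;
    case GA_LONGLONG: *property = "LLONG_MIN"; break;

    /* After: one statement per line under each label. */
    case GA_BYTE:
        *property = "SCHAR_MIN";
        break;
    case GA_LONGLONG:
        *property = "LLONG_MIN";
        break;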
--- src/gpuarray_reduction.c | 290 +++++++++++++++++++++++++-------------- 1 file changed, 190 insertions(+), 100 deletions(-) diff --git a/src/gpuarray_reduction.c b/src/gpuarray_reduction.c index 5534d84c36..62502fb1fd 100644 --- a/src/gpuarray_reduction.c +++ b/src/gpuarray_reduction.c @@ -459,26 +459,36 @@ static int reduxGetMinInit (int typecode, const char** property) case GA_BYTE4: case GA_BYTE8: case GA_BYTE16: - case GA_BYTE: *property = "SCHAR_MIN"; break; + case GA_BYTE: + *property = "SCHAR_MIN"; + break; case GA_SHORT2: case GA_SHORT3: case GA_SHORT4: case GA_SHORT8: case GA_SHORT16: - case GA_SHORT: *property = "SHRT_MIN"; break; + case GA_SHORT: + *property = "SHRT_MIN"; + break; case GA_INT2: case GA_INT3: case GA_INT4: case GA_INT8: case GA_INT16: - case GA_INT: *property = "INT_MIN"; break; + case GA_INT: + *property = "INT_MIN"; + break; case GA_LONG2: case GA_LONG3: case GA_LONG4: case GA_LONG8: case GA_LONG16: - case GA_LONG: *property = "LONG_MIN"; break; - case GA_LONGLONG: *property = "LLONG_MIN"; break; + case GA_LONG: + *property = "LONG_MIN"; + break; + case GA_LONGLONG: + *property = "LLONG_MIN"; + break; case GA_BOOL: case GA_UBYTE2: case GA_UBYTE3: @@ -505,14 +515,19 @@ static int reduxGetMinInit (int typecode, const char** property) case GA_ULONG16: case GA_ULONG: case GA_ULONGLONG: - case GA_SIZE: *property = "0"; break; + case GA_SIZE: + *property = "0"; + break; case GA_HALF: case GA_FLOAT: case GA_DOUBLE: - case GA_QUAD: *property = "NAN"; break; - default: return GA_UNSUPPORTED_ERROR; + case GA_QUAD: + *property = "NAN"; + break; + default: + return GA_UNSUPPORTED_ERROR; } - + return GA_NO_ERROR; } @@ -529,64 +544,89 @@ static int reduxGetMinInit (int typecode, const char** property) static int reduxGetMaxInit (int typecode, const char** property){ switch (typecode){ - case GA_BOOL: *property = "1"; break; + case GA_BOOL: + *property = "1"; + break; case GA_BYTE2: case GA_BYTE3: case GA_BYTE4: case GA_BYTE8: case GA_BYTE16: - case GA_BYTE: *property = "SCHAR_MAX"; break; + case GA_BYTE: + *property = "SCHAR_MAX"; + break; case GA_UBYTE2: case GA_UBYTE3: case GA_UBYTE4: case GA_UBYTE8: case GA_UBYTE16: - case GA_UBYTE: *property = "UCHAR_MAX"; break; + case GA_UBYTE: + *property = "UCHAR_MAX"; + break; case GA_SHORT2: case GA_SHORT3: case GA_SHORT4: case GA_SHORT8: case GA_SHORT16: - case GA_SHORT: *property = "SHRT_MAX"; break; + case GA_SHORT: + *property = "SHRT_MAX"; + break; case GA_USHORT2: case GA_USHORT3: case GA_USHORT4: case GA_USHORT8: case GA_USHORT16: - case GA_USHORT: *property = "USHRT_MAX"; break; + case GA_USHORT: + *property = "USHRT_MAX"; + break; case GA_INT2: case GA_INT3: case GA_INT4: case GA_INT8: case GA_INT16: - case GA_INT: *property = "INT_MAX"; break; + case GA_INT: + *property = "INT_MAX"; + break; case GA_UINT2: case GA_UINT3: case GA_UINT4: case GA_UINT8: case GA_UINT16: - case GA_UINT: *property = "UINT_MAX"; break; + case GA_UINT: + *property = "UINT_MAX"; + break; case GA_LONG2: case GA_LONG3: case GA_LONG4: case GA_LONG8: case GA_LONG16: - case GA_LONG: *property = "LONG_MAX"; break; + case GA_LONG: + *property = "LONG_MAX"; + break; case GA_ULONG2: case GA_ULONG3: case GA_ULONG4: case GA_ULONG8: case GA_ULONG16: - case GA_ULONG: *property = "ULONG_MAX"; break; - case GA_LONGLONG: *property = "LLONG_MAX"; break; - case GA_ULONGLONG: *property = "ULLONG_MAX"; break; + case GA_ULONG: + *property = "ULONG_MAX"; + break; + case GA_LONGLONG: + *property = "LLONG_MAX"; + break; + case GA_ULONGLONG: + *property = "ULLONG_MAX"; 
+ break; case GA_HALF: case GA_FLOAT: case GA_DOUBLE: - case GA_QUAD: *property = "NAN"; break; - default: return GA_UNSUPPORTED_ERROR; + case GA_QUAD: + *property = "NAN"; + break; + default: + return GA_UNSUPPORTED_ERROR; } - + return GA_NO_ERROR; } @@ -771,21 +811,34 @@ static int reduxCheckargs (redux_ctx* ctx){ /* Determine initializer, and error out if reduction unsupported. */ switch (ctx->op){ - case GA_REDUCE_SUM: ret = reduxGetSumInit (ctx->accTypeCode, &ctx->initVal); break; + case GA_REDUCE_SUM: + ret = reduxGetSumInit (ctx->accTypeCode, &ctx->initVal); + break; case GA_REDUCE_PRODNZ: - case GA_REDUCE_PROD: ret = reduxGetProdInit(ctx->accTypeCode, &ctx->initVal); break; + case GA_REDUCE_PROD: + ret = reduxGetProdInit(ctx->accTypeCode, &ctx->initVal); + break; case GA_REDUCE_MINANDARGMIN: case GA_REDUCE_ARGMIN: - case GA_REDUCE_MIN: ret = reduxGetMinInit (ctx->accTypeCode, &ctx->initVal); break; + case GA_REDUCE_MIN: + ret = reduxGetMinInit (ctx->accTypeCode, &ctx->initVal); + break; case GA_REDUCE_MAXANDARGMAX: case GA_REDUCE_ARGMAX: - case GA_REDUCE_MAX: ret = reduxGetMaxInit (ctx->accTypeCode, &ctx->initVal); break; + case GA_REDUCE_MAX: + ret = reduxGetMaxInit (ctx->accTypeCode, &ctx->initVal); + break; case GA_REDUCE_ALL: - case GA_REDUCE_AND: ret = reduxGetAndInit (ctx->accTypeCode, &ctx->initVal); break; + case GA_REDUCE_AND: + ret = reduxGetAndInit (ctx->accTypeCode, &ctx->initVal); + break; case GA_REDUCE_ANY: case GA_REDUCE_XOR: - case GA_REDUCE_OR: ret = reduxGetOrInit (ctx->accTypeCode, &ctx->initVal); break; - default: ret = GA_UNSUPPORTED_ERROR; break; + case GA_REDUCE_OR: + ret = reduxGetOrInit (ctx->accTypeCode, &ctx->initVal); + break; + default: + ret = GA_UNSUPPORTED_ERROR; } if (ret != GA_NO_ERROR){ return reduxCleanup(ctx, ret); @@ -809,7 +862,7 @@ static int reduxCheckargs (redux_ctx* ctx){ /** * @brief Select types for the reduction kernel's implementation. - * + * * There are 5 types of relevance: * - Source (S=Source) * - Destination (T=Target) @@ -825,14 +878,25 @@ static void reduxSelectTypes (redux_ctx* ctx){ ctx->dstArgTypeCode = GA_SSIZE; ctx->idxTypeCode = GA_SSIZE; switch (ctx->srcTypeCode){ - case GA_HALF: ctx->accTypeCode = GA_FLOAT; - case GA_HALF2: ctx->accTypeCode = GA_FLOAT2; - case GA_HALF4: ctx->accTypeCode = GA_FLOAT4; - case GA_HALF8: ctx->accTypeCode = GA_FLOAT8; - case GA_HALF16: ctx->accTypeCode = GA_FLOAT16; - default: ctx->accTypeCode = ctx->srcTypeCode; - } - + case GA_HALF: + ctx->accTypeCode = GA_FLOAT; + break; + case GA_HALF2: + ctx->accTypeCode = GA_FLOAT2; + break; + case GA_HALF4: + ctx->accTypeCode = GA_FLOAT4; + break; + case GA_HALF8: + ctx->accTypeCode = GA_FLOAT8; + break; + case GA_HALF16: + ctx->accTypeCode = GA_FLOAT16; + break; + default: + ctx->accTypeCode = ctx->srcTypeCode; + } + /* Get the string version as well. */ ctx->srcTypeStr = gpuarray_get_type(ctx->srcTypeCode) ->cluda_name; ctx->dstTypeStr = gpuarray_get_type(ctx->dstTypeCode) ->cluda_name; @@ -843,7 +907,7 @@ static void reduxSelectTypes (redux_ctx* ctx){ /** * @brief Select which code model will be used: - * + * * - Large (Destination tensor >= SMALL_REDUX_THRESHOLD elements, or * destination tensor size >= # of reductions per destination * tensor element): @@ -865,7 +929,7 @@ static int reduxSelectModel (redux_ctx* ctx){ * use large code model; Otherwise use small code model, where threads will * have to cooperate. 
*/ - + ret = gpucontext_property(ctx->gpuCtx, GA_CTX_PROP_NUMPROCS, &numProcs); if (ret != GA_NO_ERROR){ return reduxCleanup(ctx, ret); @@ -897,8 +961,8 @@ static int reduxSelectModel (redux_ctx* ctx){ * - reduxKernelRequiresDst() * - reduxKernelRequiresDstArg() */ - - + + return reduxSelectHwAxes(ctx); } @@ -925,8 +989,10 @@ static int reduxIsLargeCodeModel (redux_ctx* ctx){ static int reduxHasDst (redux_ctx* ctx){ switch (ctx->op){ case GA_REDUCE_ARGMIN: - case GA_REDUCE_ARGMAX: return 0; - default: return 1; + case GA_REDUCE_ARGMAX: + return 0; + default: + return 1; } } @@ -939,15 +1005,17 @@ static int reduxHasDstArg (redux_ctx* ctx){ case GA_REDUCE_MINANDARGMIN: case GA_REDUCE_MAXANDARGMAX: case GA_REDUCE_ARGMIN: - case GA_REDUCE_ARGMAX: return 1; - default: return 0; + case GA_REDUCE_ARGMAX: + return 1; + default: + return 0; } } /** * @brief Returns whether the generated kernel internally requires a dst * argument. - * + * * This is semantically subtly different from reduxHasDst(). The main * difference is in the implementation of the GA_REDUCE_ARGMIN/ARGMAX * reductions; Either *might* require a dst buffer, which will have to be @@ -957,15 +1025,17 @@ static int reduxHasDstArg (redux_ctx* ctx){ static int reduxKernelRequiresDst (redux_ctx* ctx){ switch (ctx->op){ case GA_REDUCE_ARGMIN: - case GA_REDUCE_ARGMAX: return reduxIsSmallCodeModel(ctx); - default: return 1; + case GA_REDUCE_ARGMAX: + return reduxIsSmallCodeModel(ctx); + default: + return 1; } } /** * @brief Returns whether the generated kernel internally requires a dstArg * argument. - * + * * This is semantically subtly different from reduxHasDstArg(), since it asks * whether the reduction, even though it does not accept a dstArg argument, * still requires a dstArg internally. @@ -975,11 +1045,11 @@ static int reduxKernelRequiresDstArg (redux_ctx* ctx){ /** * At present there exists no reduction whose implementation requires * a dstArg but whose interface does not. - * + * * E.g. the max() and min() reductions do NOT currently require a temporary * buffer for indexes, and will not in the foreseeable future. */ - + return reduxHasDstArg(ctx); } @@ -1007,20 +1077,20 @@ static int reduxCanAppendHwAxis (redux_ctx* ctx, int wantReductionAxi static void reduxAppendLargestAxisToHwList(redux_ctx* ctx, int wantReductionAxis){ int maxI = 0, i, isInHwList, isInReduxList, isInDesiredList, isLargestSoFar; size_t maxV = 0; - + /* Find */ for (i=0;inds;i++){ isInHwList = axisInSet(i, ctx->hwAxisList, ctx->ndh, 0); isInReduxList = axisInSet(i, ctx->reduxList, ctx->ndr, 0); isInDesiredList = wantReductionAxis ? isInReduxList : !isInReduxList; isLargestSoFar = ctx->src->dimensions[i] >= maxV; - + if (!isInHwList && isInDesiredList && isLargestSoFar){ maxV = ctx->src->dimensions[i]; maxI = i; } } - + /* Append */ ctx->hwAxisList[ctx->ndh++] = maxI; if (wantReductionAxis){ @@ -1033,7 +1103,7 @@ static void reduxAppendLargestAxisToHwList(redux_ctx* ctx, int wantReductionAxi /** * @brief Select which axes (up to MAX_HW_DIMS) will be assigned to hardware * dimensions. - * + * * For the "large" code model: The up-to-MAX_HW_DIMS largest destination tensor * dimensions are selected. 
* For the "small" code model: Up to MAX_HW_DIMS reduction dimensions (largest- @@ -1046,37 +1116,37 @@ static void reduxAppendLargestAxisToHwList(redux_ctx* ctx, int wantReductionAxi static int reduxSelectHwAxes (redux_ctx* ctx){ if (reduxIsSmallCodeModel(ctx)){ - while(reduxCanAppendHwAxis(ctx, 1)){ + while (reduxCanAppendHwAxis(ctx, 1)){ reduxAppendLargestAxisToHwList(ctx, 1); } } - - while(reduxCanAppendHwAxis(ctx, 0)){ + + while (reduxCanAppendHwAxis(ctx, 0)){ reduxAppendLargestAxisToHwList(ctx, 0); } - + return reduxComputeAxisList(ctx); } /** * @brief Compute the axis list. - * + * * The axis list describes the mapping between the nested loops of the kernel * as well as their accompanying indices (i0*, i1*, ..., in*) on one hand, and * the axes of the source tensor. The first axis in the list corresponds to the * outermost loop and the last axis in the list to the innermost. - * + * * The first ctx->ndd axes correspond to the outer loops that iterate over * each destination element. The last ctx->ndr axes correspond to the inner * loops that iterate over the dimensions of elements that are to be reduced. - * + * * @return GA_MEMORY_ERROR if allocating the list failed; Otherwise, returns * GA_NO_ERROR. */ static int reduxComputeAxisList (redux_ctx* ctx){ int i, f=0; - + ctx->axisList = malloc(ctx->nds * sizeof(unsigned)); if (!ctx->axisList){ return reduxCleanup(ctx, GA_MEMORY_ERROR); @@ -1088,8 +1158,8 @@ static int reduxComputeAxisList (redux_ctx* ctx){ } } memcpy(&ctx->axisList[f], ctx->reduxList, ctx->ndr * sizeof(*ctx->reduxList)); - - + + return reduxGenSource(ctx); } @@ -1105,7 +1175,7 @@ static int reduxGenSource (redux_ctx* ctx){ if (!ctx->sourceCode){ return reduxCleanup(ctx, GA_MEMORY_ERROR); } - + return reduxIsLargeCodeModel(ctx) ? reduxCompileLarge(ctx): reduxCompileSmall(ctx); } @@ -1145,7 +1215,7 @@ static void reduxAppendFuncGetInitVal (redux_ctx* ctx){ } static void reduxAppendFuncLoadVal (redux_ctx* ctx){ int i; - + strb_appends(&ctx->s, "/**\n"); strb_appends(&ctx->s, " * Multidimensional source element loader.\n"); strb_appends(&ctx->s, " *\n"); @@ -1162,16 +1232,16 @@ static void reduxAppendFuncLoadVal (redux_ctx* ctx){ strb_appendf(&ctx->s, "i%d*srcSteps[%d] + \\\n\t ", i, ctx->axisList[i]); } strb_appends(&ctx->s, "0));\n"); - + /* Prescalar transformations go here... */ - + /* Return the value. */ strb_appends(&ctx->s, "\treturn v;\n"); strb_appends(&ctx->s, "}\n\n\n\n"); } static void reduxAppendFuncReduxVal (redux_ctx* ctx){ int i, anyArgsEmitted = 0; - + /* Function Signature. */ strb_appends(&ctx->s, "/**\n"); strb_appends(&ctx->s, " * Global memory value reduction function.\n"); @@ -1198,11 +1268,11 @@ static void reduxAppendFuncReduxVal (redux_ctx* ctx){ strb_appends(&ctx->s, "GLOBAL_MEM A* dstArg, const GLOBAL_MEM X* dstArgSteps, X i"); } strb_appends(&ctx->s, "){\n"); - - + + /* Post-scalar transformations go here. */ - - + + /* Write to memory. */ if (reduxIsLargeCodeModel(ctx)){ /* Large code model. Easy: just write out the data, since it's safe. */ @@ -1223,19 +1293,19 @@ static void reduxAppendFuncReduxVal (redux_ctx* ctx){ }else{ /* BUG: Implement the atomic reduction, one or two CAS loops. */ if ( reduxKernelRequiresDst (ctx) && !reduxKernelRequiresDstArg(ctx)){ - + }else if (!reduxKernelRequiresDst (ctx) && reduxKernelRequiresDstArg(ctx)){ - + }else if ( reduxKernelRequiresDst (ctx) && reduxKernelRequiresDstArg(ctx)){ - + } } - + /* Close off function. 
*/ strb_appends(&ctx->s, "}\n\n\n\n"); } static void reduxAppendFuncPreKernel (redux_ctx* ctx){ - + } static void reduxAppendFuncKernel (redux_ctx* ctx){ reduxAppendPrototype (ctx); @@ -1247,7 +1317,7 @@ static void reduxAppendFuncKernel (redux_ctx* ctx){ strb_appends (&ctx->s, "}\n"); } static void reduxAppendFuncPostKernel (redux_ctx* ctx){ - + } static void reduxAppendPrototype (redux_ctx* ctx){ strb_appends(&ctx->s, "/**\n"); @@ -1466,30 +1536,50 @@ static void reduxAppendLoopInner (redux_ctx* ctx){ strb_appends(&ctx->s, "src, srcSteps);\n"); strb_appends(&ctx->s, "\t\t\t\n"); switch (ctx->op){ - case GA_REDUCE_SUM: strb_appends(&ctx->s, "\t\t\trdxV += v;\n"); break; - case GA_REDUCE_PROD: strb_appends(&ctx->s, "\t\t\trdxV *= v;\n"); break; - case GA_REDUCE_PRODNZ: strb_appends(&ctx->s, "\t\t\trdxV *= v==0 ? getInitVal() : v;\n"); break; - case GA_REDUCE_MIN: strb_appends(&ctx->s, "\t\t\trdxV = min(rdxV, v);\n"); break; - case GA_REDUCE_MAX: strb_appends(&ctx->s, "\t\t\trdxV = max(rdxV, v);\n"); break; + case GA_REDUCE_SUM: + strb_appends(&ctx->s, "\t\t\trdxV += v;\n"); + break; + case GA_REDUCE_PROD: + strb_appends(&ctx->s, "\t\t\trdxV *= v;\n"); + break; + case GA_REDUCE_PRODNZ: + strb_appends(&ctx->s, "\t\t\trdxV *= v==0 ? getInitVal() : v;\n"); + break; + case GA_REDUCE_MIN: + strb_appends(&ctx->s, "\t\t\trdxV = min(rdxV, v);\n"); + break; + case GA_REDUCE_MAX: + strb_appends(&ctx->s, "\t\t\trdxV = max(rdxV, v);\n"); + break; case GA_REDUCE_ARGMIN: case GA_REDUCE_MINANDARGMIN: - strb_appends(&ctx->s, "\t\t\trdxV = min(rdxV, v);\n"); - strb_appends(&ctx->s, "\t\t\tif(v == rdxV){\n"); - appendIdxes (&ctx->s, "\t\t\t\targI = RDXINDEXER(", "i", ctx->ndd, ctx->nds, "", ");\n"); - strb_appends(&ctx->s, "\t\t\t}\n"); + strb_appends(&ctx->s, "\t\t\trdxV = min(rdxV, v);\n"); + strb_appends(&ctx->s, "\t\t\tif(v == rdxV){\n"); + appendIdxes (&ctx->s, "\t\t\t\targI = RDXINDEXER(", "i", ctx->ndd, ctx->nds, "", ");\n"); + strb_appends(&ctx->s, "\t\t\t}\n"); break; case GA_REDUCE_ARGMAX: case GA_REDUCE_MAXANDARGMAX: - strb_appends(&ctx->s, "\t\t\trdxV = max(rdxV, v);\n"); - strb_appends(&ctx->s, "\t\t\tif(v == rdxV){\n"); - appendIdxes (&ctx->s, "\t\t\t\targI = RDXINDEXER(", "i", ctx->ndd, ctx->nds, "", ");\n"); - strb_appends(&ctx->s, "\t\t\t}\n"); + strb_appends(&ctx->s, "\t\t\trdxV = max(rdxV, v);\n"); + strb_appends(&ctx->s, "\t\t\tif(v == rdxV){\n"); + appendIdxes (&ctx->s, "\t\t\t\targI = RDXINDEXER(", "i", ctx->ndd, ctx->nds, "", ");\n"); + strb_appends(&ctx->s, "\t\t\t}\n"); + break; + case GA_REDUCE_AND: + strb_appends(&ctx->s, "\t\t\trdxV &= v;\n"); + break; + case GA_REDUCE_OR: + strb_appends(&ctx->s, "\t\t\trdxV |= v;\n"); + break; + case GA_REDUCE_XOR: + strb_appends(&ctx->s, "\t\t\trdxV ^= v;\n"); + break; + case GA_REDUCE_ALL: + strb_appends(&ctx->s, "\t\t\trdxV = rdxV && v;\n"); + break; + case GA_REDUCE_ANY: + strb_appends(&ctx->s, "\t\t\trdxV = rdxV || v;\n"); break; - case GA_REDUCE_AND: strb_appends(&ctx->s, "\t\t\trdxV &= v;\n"); break; - case GA_REDUCE_OR: strb_appends(&ctx->s, "\t\t\trdxV |= v;\n"); break; - case GA_REDUCE_XOR: strb_appends(&ctx->s, "\t\t\trdxV ^= v;\n"); break; - case GA_REDUCE_ALL: strb_appends(&ctx->s, "\t\t\trdxV = rdxV && v;\n"); break; - case GA_REDUCE_ANY: strb_appends(&ctx->s, "\t\t\trdxV = rdxV || v;\n"); break; } /** From 67e163e99cb0f786c845be4cdc41aac5e76fb960 Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Fri, 3 Mar 2017 07:45:34 -0500 Subject: [PATCH 06/34] Refactoring of all non-code-gen-related functions. 
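For context, a caller-level sketch of how the refactored reduction machinery is driven through one of the public entry points; the shape, axis choice and error handling below are illustrative only:

        GpuArray src, dst, dstArg;  /* src is (64, 32, 16); dst and dstArg are     */
                                    /* pre-allocated, e.g. with GpuArray_empty(),  */
                                    /* with shape (64,); dstArg has type GA_SSIZE. */
        unsigned reduxList[2] = {1, 2};
        int      err;

        /* Max-and-argmax of src over axes 1 and 2, keeping axis 0. */
        err = GpuArray_maxandargmax(&dst, &dstArg, &src, 2, reduxList);
        if (err != GA_NO_ERROR){
                /* Handle the error. */
        }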
--- src/gpuarray_reduction.c | 782 ++++++++++++++++++++++++++------------- 1 file changed, 526 insertions(+), 256 deletions(-) diff --git a/src/gpuarray_reduction.c b/src/gpuarray_reduction.c index 62502fb1fd..a5940f504d 100644 --- a/src/gpuarray_reduction.c +++ b/src/gpuarray_reduction.c @@ -23,6 +23,10 @@ /* Defines */ #define MAX_HW_DIMS 3 +#define KERNEL_PRIMARY 0 +#define KERNEL_AUXILIARY 1 +#define AXIS_FREE 0 +#define AXIS_REDUX 1 @@ -185,7 +189,8 @@ struct redux_ctx{ const int* reduxList; /* General. */ - int* axisList; + int* srcAxisList; + int* dstAxisList; gpucontext* gpuCtx; /* Source code Generator. */ @@ -203,21 +208,40 @@ struct redux_ctx{ int ndd; int ndr; int nds; - int ndh; - int ndhd; - int ndhr; int largeCodeModel; strb s; char* sourceCode; + size_t sourceCodeLen; + char* errorString0; + char* errorString1; + char* errorString2; GpuKernel preKernel; GpuKernel kernel; GpuKernel postKernel; - /* Scheduler */ - int hwAxisList[MAX_HW_DIMS]; - size_t blockSize [MAX_HW_DIMS]; - size_t gridSize [MAX_HW_DIMS]; - size_t chunkSize [MAX_HW_DIMS]; + /** + * Scheduler + * + * There are two sets of kernels that may be scheduled: + * 1) The reduction kernel. This is the only kernel scheduled in the + * large code model. + * 2) The initialization and post-scalar kernels. These are scheduled + * only in the small code model. + * + * The reduction kernel is the "primary" kernel. The other two, if needed, + * are referred to as "auxiliary" kernels. + */ + + struct{ + int ndh; + int ndhd; + int ndhr; + int axisList [MAX_HW_DIMS]; + size_t bs [MAX_HW_DIMS]; + size_t gs [MAX_HW_DIMS]; + size_t cs [MAX_HW_DIMS]; + gpudata* chunkSizeGD; + } pri, aux; /* Invoker */ gpudata* srcStepsGD; @@ -257,8 +281,12 @@ static int reduxHasDst (redux_ctx* ctx); static int reduxHasDstArg (redux_ctx* ctx); static int reduxKernelRequiresDst (redux_ctx* ctx); static int reduxKernelRequiresDstArg (redux_ctx* ctx); -static int reduxCanAppendHwAxis (redux_ctx* ctx, int wantReductionAxis); -static void reduxAppendLargestAxisToHwList(redux_ctx* ctx, int wantReductionAxis); +static int reduxCanAppendHwAxis (redux_ctx* ctx, + int kernelType, + int axisType); +static void reduxAppendLargestAxisToHwList(redux_ctx* ctx, + int kernelType, + int axisType); static int reduxSelectHwAxes (redux_ctx* ctx); static int reduxComputeAxisList (redux_ctx* ctx); static int reduxGenSource (redux_ctx* ctx); @@ -280,10 +308,19 @@ static void reduxAppendLoopMacroDefs (redux_ctx* ctx); static void reduxAppendLoopOuter (redux_ctx* ctx); static void reduxAppendLoopInner (redux_ctx* ctx); static void reduxAppendLoopMacroUndefs (redux_ctx* ctx); -static int reduxCompileLarge (redux_ctx* ctx); -static int reduxCompileSmall (redux_ctx* ctx); -static int reduxScheduleLarge (redux_ctx* ctx); -static int reduxInvokeLarge (redux_ctx* ctx); +static int reduxCompile (redux_ctx* ctx); +static int reduxSchedule (redux_ctx* ctx); +static void reduxScheduleKernel (int ndims, + uint64_t* dims, + uint64_t warpSize, + uint64_t maxLg, + uint64_t* maxLs, + uint64_t maxGg, + uint64_t* maxGs, + uint64_t* bs, + uint64_t* gs, + uint64_t* cs); +static int reduxInvoke (redux_ctx* ctx); static int reduxCleanup (redux_ctx* ctx, int ret); @@ -749,27 +786,33 @@ static int reduxCheckargs (redux_ctx* ctx){ * We initialize certain parts of the context. 
*/ - ctx->axisList = NULL; + ctx->srcAxisList = NULL; + ctx->dstAxisList = NULL; ctx->gpuCtx = NULL; ctx->srcTypeStr = ctx->dstTypeStr = ctx->dstArgTypeStr = ctx->accTypeStr = ctx->idxTypeStr = NULL; ctx->initVal = NULL; - ctx->ndh = 0; - ctx->ndhd = 0; - ctx->ndhr = 0; + ctx->pri.ndh = ctx->aux.ndh = 0; + ctx->pri.ndhd = ctx->aux.ndhd = 0; + ctx->pri.ndhr = ctx->aux.ndhr = 0; ctx->sourceCode = NULL; + ctx->sourceCodeLen = 0; + ctx->errorString0 = NULL; + ctx->errorString1 = NULL; + ctx->errorString2 = NULL; strb_init(&ctx->s); for (i=0;ihwAxisList[i] = 0; - ctx->blockSize [i] = 1; - ctx->gridSize [i] = 1; - ctx->chunkSize [i] = 1; + ctx->aux.axisList[i] = ctx->pri.axisList[i] = 0; + ctx->aux.bs [i] = ctx->pri.bs [i] = 1; + ctx->aux.gs [i] = ctx->pri.gs [i] = 1; + ctx->aux.cs [i] = ctx->pri.cs [i] = 1; } - ctx->srcStepsGD = ctx->srcSizeGD = ctx->chunkSizeGD = - ctx->dstStepsGD = ctx->dstArgStepsGD = NULL; + ctx->srcStepsGD = ctx->srcSizeGD = + ctx->dstStepsGD = ctx->dstArgStepsGD = + ctx->pri.chunkSizeGD = ctx->aux.chunkSizeGD = NULL; /* *** IT IS NOW SAFE TO CALL reduxCleanup() *** */ @@ -1054,75 +1097,108 @@ static int reduxKernelRequiresDstArg (redux_ctx* ctx){ } /** - * @brief Check whether we can add another reduction axis - * (wantReductionAxis=1) or destination axis (wantReductionAxis=0) to - * the hardware axis list. + * @brief Check whether we can add another reduction axis or free axis + * to the hardware axis list for either the primary or secondary kernel. */ -static int reduxCanAppendHwAxis (redux_ctx* ctx, int wantReductionAxis){ - if (ctx->ndh >= MAX_HW_DIMS){ +static int reduxCanAppendHwAxis (redux_ctx* ctx, + int kernelType, + int axisType){ + int kernelNdh = kernelType == KERNEL_PRIMARY ? ctx->pri.ndh : ctx->aux.ndh; + int kernelNdhr = kernelType == KERNEL_PRIMARY ? ctx->pri.ndhr : ctx->aux.ndhr; + int kernelNdhd = kernelType == KERNEL_PRIMARY ? ctx->pri.ndhd : ctx->aux.ndhd; + + if(kernelNdh >= MAX_HW_DIMS){ return 0; }else{ - return wantReductionAxis ? ctx->ndhr < ctx->ndr: - ctx->ndhd < ctx->ndd; + return axisType == AXIS_REDUX ? kernelNdhr < ctx->ndr: + kernelNdhd < ctx->ndd; } } /** - * @brief Append the largest reduction axis (wantReductionAxis=1) or - * destination axis (wantReductionAxis=0) that isn't yet in the hardware - * axis list into said hardware axis list. + * @brief Append the largest reduction axis or free axis that isn't yet + * in the hardware axis list for either the primary or secondary kernel + * into said hardware axis list. */ -static void reduxAppendLargestAxisToHwList(redux_ctx* ctx, int wantReductionAxis){ +static void reduxAppendLargestAxisToHwList(redux_ctx* ctx, + int kernelType, + int axisType){ int maxI = 0, i, isInHwList, isInReduxList, isInDesiredList, isLargestSoFar; - size_t maxV = 0; + int* hwAxisList, * ndh, * ndhr, * ndhd; + size_t v, maxV = 0; + + /* Get pointers to the correct kernel's variables */ + hwAxisList = kernelType == KERNEL_PRIMARY ? ctx->pri.axisList: + ctx->aux.axisList; + ndh = kernelType == KERNEL_PRIMARY ? &ctx->pri.ndh: + &ctx->aux.ndh; + ndhr = kernelType == KERNEL_PRIMARY ? &ctx->pri.ndhr: + &ctx->aux.ndhr; + ndhd = kernelType == KERNEL_PRIMARY ? &ctx->pri.ndhd: + &ctx->aux.ndhd; /* Find */ for (i=0;inds;i++){ - isInHwList = axisInSet(i, ctx->hwAxisList, ctx->ndh, 0); - isInReduxList = axisInSet(i, ctx->reduxList, ctx->ndr, 0); - isInDesiredList = wantReductionAxis ? 
isInReduxList : !isInReduxList; - isLargestSoFar = ctx->src->dimensions[i] >= maxV; + isInHwList = axisInSet(i, hwAxisList, *ndh, 0); + isInReduxList = axisInSet(i, ctx->reduxList, ctx->ndr, 0); + isInDesiredList = axisType == AXIS_REDUX ? isInReduxList: + !isInReduxList; + v = ctx->src->dimensions[i]; + isLargestSoFar = v >= maxV; if (!isInHwList && isInDesiredList && isLargestSoFar){ - maxV = ctx->src->dimensions[i]; + maxV = v; maxI = i; } } /* Append */ - ctx->hwAxisList[ctx->ndh++] = maxI; - if (wantReductionAxis){ - ctx->ndhr++; + hwAxisList[(*ndh)++] = maxI; + if (axisType == AXIS_REDUX){ + (*ndhr)++; }else{ - ctx->ndhd++; + (*ndhd)++; } } /** * @brief Select which axes (up to MAX_HW_DIMS) will be assigned to hardware - * dimensions. + * dimensions for both the primary and auxiliary kernels. + * + * LARGE code model: Up to the MAX_HW_DIMS largest free axes are selected. + * Because the primary reduction kernel does everything, it's + * not necessary to compute an auxiliary kernel axis + * selection (or at least, one distinct from the primary + * kernel's). + * + * SMALL code model: For the primary reduction kernel, up to MAX_HW_DIMS + * reduction axes (largest-to-smallest) are selected. If less + * than MAX_HW_DIMS axes were selected, free axes are + * selected until MAX_HW_DIMS total axes are selected, or no + * free axes are left. * - * For the "large" code model: The up-to-MAX_HW_DIMS largest destination tensor - * dimensions are selected. - * For the "small" code model: Up to MAX_HW_DIMS reduction dimensions (largest- - * to-smallest) are selected. If less than - * MAX_HW_DIMS dimensions were selected, - * destination tensor dimensions are selected until - * MAX_HW_DIMS total dimensions are selected, or no - * destination tensors are left. + * For the auxiliary reduction kernel, up to the MAX_HW_DIMS + * largest free axes are selected. 
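 *
 * For instance (an illustrative shape), for a (64, 200, 8, 100) source
 * reduced over axes {1,3} with MAX_HW_DIMS == 3: the large code model maps
 * the free axes 0 and 2 onto hardware dimensions (largest first: 64, then 8);
 * the small code model's primary kernel takes the reduction axes 1 and 3
 * (200, then 100) and fills its last hardware dimension with the largest
 * free axis, 0, while its auxiliary kernels take the free axes 0 and 2.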
*/ static int reduxSelectHwAxes (redux_ctx* ctx){ - if (reduxIsSmallCodeModel(ctx)){ - while (reduxCanAppendHwAxis(ctx, 1)){ - reduxAppendLargestAxisToHwList(ctx, 1); + if (reduxIsLargeCodeModel(ctx)){ + while (reduxCanAppendHwAxis (ctx, KERNEL_PRIMARY, AXIS_FREE)){ + reduxAppendLargestAxisToHwList(ctx, KERNEL_PRIMARY, AXIS_FREE); + } + }else{ + while (reduxCanAppendHwAxis (ctx, KERNEL_PRIMARY, AXIS_REDUX)){ + reduxAppendLargestAxisToHwList(ctx, KERNEL_PRIMARY, AXIS_REDUX); + } + while (reduxCanAppendHwAxis (ctx, KERNEL_PRIMARY, AXIS_FREE)){ + reduxAppendLargestAxisToHwList(ctx, KERNEL_PRIMARY, AXIS_FREE); } - } - while (reduxCanAppendHwAxis(ctx, 0)){ - reduxAppendLargestAxisToHwList(ctx, 0); + while (reduxCanAppendHwAxis (ctx, KERNEL_AUXILIARY, AXIS_FREE)){ + reduxAppendLargestAxisToHwList(ctx, KERNEL_AUXILIARY, AXIS_FREE); + } } return reduxComputeAxisList(ctx); @@ -1147,17 +1223,17 @@ static int reduxSelectHwAxes (redux_ctx* ctx){ static int reduxComputeAxisList (redux_ctx* ctx){ int i, f=0; - ctx->axisList = malloc(ctx->nds * sizeof(unsigned)); - if (!ctx->axisList){ + ctx->srcAxisList = malloc(ctx->nds * sizeof(unsigned)); + if (!ctx->srcAxisList){ return reduxCleanup(ctx, GA_MEMORY_ERROR); } for (i=0;inds;i++){ if (!axisInSet(i, ctx->reduxList, ctx->ndr, 0)){ - ctx->axisList[f++] = i; + ctx->srcAxisList[f++] = i; } } - memcpy(&ctx->axisList[f], ctx->reduxList, ctx->ndr * sizeof(*ctx->reduxList)); + memcpy(&ctx->srcAxisList[f], ctx->reduxList, ctx->ndr * sizeof(*ctx->reduxList)); return reduxGenSource(ctx); @@ -1171,13 +1247,13 @@ static int reduxComputeAxisList (redux_ctx* ctx){ static int reduxGenSource (redux_ctx* ctx){ reduxAppendSource(ctx); - ctx->sourceCode = strb_cstr(&ctx->s); + ctx->sourceCodeLen = ctx->s.l; + ctx->sourceCode = strb_cstr(&ctx->s); if (!ctx->sourceCode){ return reduxCleanup(ctx, GA_MEMORY_ERROR); } - return reduxIsLargeCodeModel(ctx) ? reduxCompileLarge(ctx): - reduxCompileSmall(ctx); + return reduxCompile(ctx); } static void reduxAppendSource (redux_ctx* ctx){ reduxAppendIncludes (ctx); @@ -1185,9 +1261,11 @@ static void reduxAppendSource (redux_ctx* ctx){ reduxAppendFuncGetInitVal (ctx); reduxAppendFuncLoadVal (ctx); reduxAppendFuncReduxVal (ctx); - reduxAppendFuncPreKernel (ctx); + if(reduxIsSmallCodeModel(ctx)){ + reduxAppendFuncPreKernel (ctx); + reduxAppendFuncPostKernel (ctx); + } reduxAppendFuncKernel (ctx); - reduxAppendFuncPostKernel (ctx); } static void reduxAppendIncludes (redux_ctx* ctx){ strb_appends(&ctx->s, "/* Includes */\n"); @@ -1197,31 +1275,30 @@ static void reduxAppendIncludes (redux_ctx* ctx){ strb_appends(&ctx->s, "\n"); } static void reduxAppendTypedefs (redux_ctx* ctx){ - strb_appends(&ctx->s, "/* Typedefs */\n"); - strb_appendf(&ctx->s, "typedef %s S;/* The type of the source array. */\n", ctx->srcTypeStr); - strb_appendf(&ctx->s, "typedef %s T;/* The type of the destination array. */\n", ctx->dstTypeStr); - strb_appendf(&ctx->s, "typedef %s A;/* The type of the destination argument array. */\n", ctx->dstArgTypeStr); - strb_appendf(&ctx->s, "typedef %s X;/* The type of the indices: signed 32/64-bit. */\n", ctx->idxTypeStr); - strb_appendf(&ctx->s, "typedef %s K;/* The type of the accumulator variable. */\n", ctx->accTypeStr); - strb_appends(&ctx->s, "\n\n\n"); + strb_appendf(&ctx->s, "typedef %s S;\n", ctx->srcTypeStr); /* The type of the source array. */ + strb_appendf(&ctx->s, "typedef %s T;\n", ctx->dstTypeStr); /* The type of the destination array. 
*/ + strb_appendf(&ctx->s, "typedef %s A;\n", ctx->dstArgTypeStr);/* The type of the destination argument array. */ + strb_appendf(&ctx->s, "typedef %s X;\n", ctx->idxTypeStr); /* The type of the indices: signed 32/64-bit. */ + strb_appendf(&ctx->s, "typedef %s K;\n", ctx->accTypeStr); /* The type of the accumulator variable. */ } static void reduxAppendFuncGetInitVal (redux_ctx* ctx){ - strb_appends(&ctx->s, "/**\n"); - strb_appends(&ctx->s, " * Initial value function.\n"); - strb_appends(&ctx->s, " */\n\n"); - strb_appends(&ctx->s, "WITHIN_KERNEL K getInitVal(void){\n"); - strb_appendf(&ctx->s, "\treturn (%s);\n", ctx->initVal); - strb_appends(&ctx->s, "}\n\n\n\n"); + /** + * Initial value function. + */ + + strb_appendf(&ctx->s, "WITHIN_KERNEL K getInitVal(void){\n" + "\treturn (%s);\n" + "}\n\n\n\n", ctx->initVal); } static void reduxAppendFuncLoadVal (redux_ctx* ctx){ int i; - strb_appends(&ctx->s, "/**\n"); - strb_appends(&ctx->s, " * Multidimensional source element loader.\n"); - strb_appends(&ctx->s, " *\n"); - strb_appends(&ctx->s, " * Also implements prescalar transformations if any.\n"); - strb_appends(&ctx->s, " */\n"); - strb_appends(&ctx->s, "\n"); + /** + * Multidimensional source element loader. + * + * Also implements prescalar transformations if any. + */ + appendIdxes (&ctx->s, "WITHIN_KERNEL K loadVal(", "X i", 0, ctx->nds, "", ""); if (ctx->nds > 0){ strb_appends(&ctx->s, ", "); @@ -1229,7 +1306,7 @@ static void reduxAppendFuncLoadVal (redux_ctx* ctx){ strb_appends(&ctx->s, "const GLOBAL_MEM S* src, const GLOBAL_MEM X* srcSteps){\n"); strb_appends(&ctx->s, "\tS v = (*(const GLOBAL_MEM S*)((const GLOBAL_MEM char*)src + "); for (i=0;inds;i++){ - strb_appendf(&ctx->s, "i%d*srcSteps[%d] + \\\n\t ", i, ctx->axisList[i]); + strb_appendf(&ctx->s, "i%d*srcSteps[%d] + \\\n\t ", i, ctx->srcAxisList[i]); } strb_appends(&ctx->s, "0));\n"); @@ -1242,15 +1319,16 @@ static void reduxAppendFuncLoadVal (redux_ctx* ctx){ static void reduxAppendFuncReduxVal (redux_ctx* ctx){ int i, anyArgsEmitted = 0; - /* Function Signature. */ - strb_appends(&ctx->s, "/**\n"); - strb_appends(&ctx->s, " * Global memory value reduction function.\n"); - strb_appends(&ctx->s, " *\n"); - strb_appends(&ctx->s, " * Responsible for either:\n"); - strb_appends(&ctx->s, " * 1) Safe writeback of final value to memory, or\n"); - strb_appends(&ctx->s, " * 2) Safe atomic reduction of partial value into memory.\n"); - strb_appends(&ctx->s, " */\n"); - strb_appends(&ctx->s, "\n"); + /** + * Function Signature. + * + * Global memory value reduction function. + * + * Responsible for either: + * 1) Safe writeback of final value to memory, or + * 2) Safe atomic reduction of partial value into memory. + */ + appendIdxes (&ctx->s, "WITHIN_KERNEL void reduxVal(", "X i", 0, ctx->ndd, "", ""); anyArgsEmitted = ctx->ndd>0; if (reduxKernelRequiresDst (ctx)){ @@ -1356,11 +1434,11 @@ static void reduxAppendIndexDeclarations (redux_ctx* ctx){ strb_appends(&ctx->s, "\tX bd0 = LDIM_0, bd1 = LDIM_1, bd2 = LDIM_2;\n"); strb_appends(&ctx->s, "\tX ti0 = LID_0, ti1 = LID_1, ti2 = LID_2;\n"); strb_appends(&ctx->s, "\tX gi0 = bi0*bd0+ti0, gi1 = bi1*bd1+ti1, gi2 = bi2*bd2+ti2;\n"); - if (ctx->ndh>0){ + if (ctx->pri.ndh>0){ strb_appends(&ctx->s, "\tX "); - for (i=0;indh;i++){ + for (i=0;ipri.ndh;i++){ strb_appendf(&ctx->s, "ci%u = chunkSize[%u]%s", - i, i, (i==ctx->ndh-1) ? ";\n" : ", "); + i, i, (i==ctx->pri.ndh-1) ? 
";\n" : ", "); } } @@ -1386,10 +1464,10 @@ static void reduxAppendRangeCalculations (redux_ctx* ctx){ strb_appends(&ctx->s, "\t/* Compute ranges for this thread. */\n"); for (i=0;inds;i++){ - strb_appendf(&ctx->s, "\ti%dDim = srcSize[%d];\n", i, ctx->axisList[i]); + strb_appendf(&ctx->s, "\ti%dDim = srcSize[%d];\n", i, ctx->srcAxisList[i]); } for (i=0;inds;i++){ - strb_appendf(&ctx->s, "\ti%dSStep = srcSteps[%d];\n", i, ctx->axisList[i]); + strb_appendf(&ctx->s, "\ti%dSStep = srcSteps[%d];\n", i, ctx->srcAxisList[i]); } for (i=0;indd;i++){ strb_appendf(&ctx->s, "\ti%dMStep = dstSteps[%d];\n", i, i); @@ -1415,7 +1493,7 @@ static void reduxAppendRangeCalculations (redux_ctx* ctx){ * The others, if any, have to use software looping beginning at 0. */ - if (axisInSet(ctx->axisList[i], ctx->hwAxisList, ctx->ndh, &hwDim)){ + if (axisInSet(ctx->srcAxisList[i], ctx->pri.axisList, ctx->pri.ndh, &hwDim)){ strb_appendf(&ctx->s, "\ti%dStart = gi%d * ci%d;\n", i, hwDim, hwDim); }else{ strb_appendf(&ctx->s, "\ti%dStart = 0;\n", i); @@ -1427,7 +1505,7 @@ static void reduxAppendRangeCalculations (redux_ctx* ctx){ * The others, if any, have to use software looping beginning at 0. */ - if (axisInSet(ctx->axisList[i], ctx->hwAxisList, ctx->ndh, &hwDim)){ + if (axisInSet(ctx->srcAxisList[i], ctx->pri.axisList, ctx->pri.ndh, &hwDim)){ strb_appendf(&ctx->s, "\ti%dEnd = i%dStart + ci%d;\n", i, i, hwDim); }else{ strb_appendf(&ctx->s, "\ti%dEnd = i%dStart + i%dDim;\n", i, i, i); @@ -1627,119 +1705,255 @@ static void reduxAppendLoopMacroUndefs (redux_ctx* ctx){ /** * @brief Compile the kernel from source code. - * - * @return */ -static int reduxCompileLarge (redux_ctx* ctx){ - const int ARG_TYPECODES[] = { - GA_BUFFER, /* src */ - GA_SIZE, /* srcOff */ - GA_BUFFER, /* srcSteps */ - GA_BUFFER, /* srcSize */ - GA_BUFFER, /* chnkSize */ - GA_BUFFER, /* dst */ - GA_SIZE, /* dstOff */ - GA_BUFFER, /* dstSteps */ - GA_BUFFER, /* dstArg */ - GA_SIZE, /* dstArgOff */ - GA_BUFFER /* dstArgSteps */ - }; - const size_t ARG_TYPECODES_LEN = sizeof(ARG_TYPECODES)/sizeof(*ARG_TYPECODES); - const char* SRCS[1] = {ctx->sourceCode}; - const size_t SRC_LENS[1] = {strlen(ctx->sourceCode)}; - const size_t SRCS_LEN = sizeof(SRCS)/sizeof(*SRCS); - - int ret = GpuKernel_init(&ctx->kernel, - ctx->gpuCtx, - SRCS_LEN, - SRCS, - SRC_LENS, - "redux", - ARG_TYPECODES_LEN, - ARG_TYPECODES, - 0, - (char**)0); - - if (ret != GA_NO_ERROR){ - return reduxCleanup(ctx, ret); - }else{ - return reduxScheduleLarge(ctx); +static int reduxCompile (redux_ctx* ctx){ + int ret, i = 0; + int PRI_TYPECODES[11]; + size_t PRI_TYPECODES_LEN; + int* AUX_TYPECODES; + size_t AUX_TYPECODES_LEN; + + + /** + * Construct Argument Typecode Lists. + */ + + PRI_TYPECODES[i++] = GA_BUFFER; /* src */ + PRI_TYPECODES[i++] = GA_SIZE; /* srcOff */ + PRI_TYPECODES[i++] = GA_BUFFER; /* srcSteps */ + PRI_TYPECODES[i++] = GA_BUFFER; /* srcSize */ + PRI_TYPECODES[i++] = GA_BUFFER; /* chnkSize */ + if(reduxKernelRequiresDst(ctx)){ + PRI_TYPECODES[i++] = GA_BUFFER; /* dst */ + PRI_TYPECODES[i++] = GA_SIZE; /* dstOff */ + PRI_TYPECODES[i++] = GA_BUFFER; /* dstSteps */ + } + if(reduxKernelRequiresDstArg(ctx)){ + PRI_TYPECODES[i++] = GA_BUFFER; /* dstArg */ + PRI_TYPECODES[i++] = GA_SIZE; /* dstArgOff */ + PRI_TYPECODES[i++] = GA_BUFFER; /* dstArgSteps */ + } + PRI_TYPECODES_LEN = i; + AUX_TYPECODES = &PRI_TYPECODES[3]; + AUX_TYPECODES_LEN = PRI_TYPECODES_LEN-3; + + + /** + * Compile the kernels. 
+ */ + + { + ret = GpuKernel_init(&ctx->kernel, + ctx->gpuCtx, + 1, + (const char**)&ctx->sourceCode, + &ctx->sourceCodeLen, + "redux", + PRI_TYPECODES_LEN, + PRI_TYPECODES, + GA_USE_CLUDA, + &ctx->errorString0); + if (ret != GA_NO_ERROR){ + return reduxCleanup(ctx, ret); + } } -} -static int reduxCompileSmall (redux_ctx* ctx){ - /* BUG: Implement small code model. */ - return reduxCompileLarge(ctx); + if(reduxIsSmallCodeModel(ctx)){ + ret = GpuKernel_init(&ctx->kernel, + ctx->gpuCtx, + 1, + (const char**)&ctx->sourceCode, + &ctx->sourceCodeLen, + "preRedux", + AUX_TYPECODES_LEN, + AUX_TYPECODES, + GA_USE_CLUDA, + &ctx->errorString1); + if (ret != GA_NO_ERROR){ + return reduxCleanup(ctx, ret); + } + ret = GpuKernel_init(&ctx->kernel, + ctx->gpuCtx, + 1, + (const char**)&ctx->sourceCode, + &ctx->sourceCodeLen, + "postRedux", + AUX_TYPECODES_LEN, + AUX_TYPECODES, + GA_USE_CLUDA, + &ctx->errorString2); + if (ret != GA_NO_ERROR){ + return reduxCleanup(ctx, ret); + } + } + + return reduxSchedule(ctx); } /** - * Compute a good thread block size / grid size / software chunk size for Nvidia. + * @brief Compute a good thread block size / grid size / software chunk size + * for the primary/auxilliary kernels. */ -static int reduxScheduleLarge (redux_ctx* ctx){ - int i; - size_t warpMod; - size_t bestWarpMod = 1; - unsigned bestWarpAxis = 0; - uint64_t maxLg; - uint64_t maxLs[MAX_HW_DIMS]; - uint64_t maxGg; - uint64_t maxGs [MAX_HW_DIMS]; - uint64_t dims [MAX_HW_DIMS]; - double slack [MAX_HW_DIMS]; - ga_factor_list factBS[MAX_HW_DIMS]; - ga_factor_list factGS[MAX_HW_DIMS]; - ga_factor_list factCS[MAX_HW_DIMS]; - - +static int reduxSchedule (redux_ctx* ctx){ + int i, priNdims, auxNdims; + uint64_t maxLgRdx, maxLgPre, maxLgPost; + uint64_t maxLgPri, maxLgAux; + uint64_t maxLs [MAX_HW_DIMS]; + uint64_t maxGg; + uint64_t maxGs [MAX_HW_DIMS]; + uint64_t priDims[MAX_HW_DIMS]; + uint64_t auxDims[MAX_HW_DIMS]; + uint64_t bs [MAX_HW_DIMS]; + uint64_t gs [MAX_HW_DIMS]; + uint64_t cs [MAX_HW_DIMS]; + size_t warpSize, + maxL, maxL0, maxL1, maxL2, + maxG, maxG0, maxG1, maxG2; + + /** * Obtain the constraints of our problem. 
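One detail worth flagging in the compilation step above: in the small code model the "preRedux" and "postRedux" entry points presumably belong in their own kernel objects, since the scheduler and invoker below query and call ctx->preKernel and ctx->postKernel. A hedged sketch of the intended second call (and likewise &ctx->postKernel with "postRedux" and errorString2):

        ret = GpuKernel_init(&ctx->preKernel,      /* rather than &ctx->kernel */
                             ctx->gpuCtx,
                             1,
                             (const char**)&ctx->sourceCode,
                             &ctx->sourceCodeLen,
                             "preRedux",
                             AUX_TYPECODES_LEN,
                             AUX_TYPECODES,
                             GA_USE_CLUDA,
                             &ctx->errorString1);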
*/ - size_t warpSize, - maxL, maxL0, maxL1, maxL2, /* Maximum total and per-dimension thread/block sizes */ - maxG, maxG0, maxG1, maxG2; /* Maximum total and per-dimension block /grid sizes */ - gpukernel_property(ctx->kernel.k, GA_KERNEL_PROP_PREFLSIZE, &warpSize); - gpukernel_property(ctx->kernel.k, GA_KERNEL_PROP_MAXLSIZE, &maxL); - gpudata_property (ctx->src->data, GA_CTX_PROP_MAXLSIZE0, &maxL0); - gpudata_property (ctx->src->data, GA_CTX_PROP_MAXLSIZE1, &maxL1); - gpudata_property (ctx->src->data, GA_CTX_PROP_MAXLSIZE2, &maxL2); - gpudata_property (ctx->src->data, GA_CTX_PROP_MAXGSIZE0, &maxG0); - maxG = maxG0; - gpudata_property (ctx->src->data, GA_CTX_PROP_MAXGSIZE1, &maxG1); - gpudata_property (ctx->src->data, GA_CTX_PROP_MAXGSIZE2, &maxG2); - + gpudata_property (ctx->src->data, GA_CTX_PROP_MAXLSIZE0, &maxL0); + gpudata_property (ctx->src->data, GA_CTX_PROP_MAXLSIZE1, &maxL1); + gpudata_property (ctx->src->data, GA_CTX_PROP_MAXLSIZE2, &maxL2); + gpudata_property (ctx->src->data, GA_CTX_PROP_MAXGSIZE, &maxG); + gpudata_property (ctx->src->data, GA_CTX_PROP_MAXGSIZE0, &maxG0); + gpudata_property (ctx->src->data, GA_CTX_PROP_MAXGSIZE1, &maxG1); + gpudata_property (ctx->src->data, GA_CTX_PROP_MAXGSIZE2, &maxG2); + gpukernel_property(ctx->kernel.k, GA_KERNEL_PROP_PREFLSIZE, &warpSize); + gpukernel_property(ctx->kernel.k, GA_KERNEL_PROP_MAXLSIZE, &maxL); + maxLgRdx = maxL; + maxLgPri = maxLgRdx; + if(reduxIsSmallCodeModel(ctx)){ + gpukernel_property(ctx->preKernel.k, GA_KERNEL_PROP_MAXLSIZE, &maxL); + maxLgPre = maxL; + gpukernel_property(ctx->postKernel.k, GA_KERNEL_PROP_MAXLSIZE, &maxL); + maxLgPost = maxL; + maxLgAux = maxLgPrepri.ndh; + maxGs[0] = maxG0; + maxGs[1] = maxG1; + maxGs[2] = maxG2; + maxGg = maxG; + maxLs[0] = maxL0; + maxLs[1] = maxL1; + maxLs[2] = maxL2; + for (i=0;isrc->dimensions[ctx->pri.axisList[i]]; + } + if(reduxIsSmallCodeModel(ctx)){ + auxNdims = ctx->aux.ndh; + for (i=0;isrc->dimensions[ctx->aux.axisList[i]]; + } + } + + /** - * Prepare inputs to the solver. - * - * This involves, amongst others, - * - Initializing the blockSize, gridSize and chunkSize factor lists for all - * hardware dimensions. - * - Finding on which hardware axis is it optimal to place the warpSize factor. + * Apply the solver. */ + + { + reduxScheduleKernel(priNdims, + priDims, + warpSize, + maxLgPri, maxLs, + maxGg, maxGs, + bs, gs, cs); + for (i=0;ipri.bs[i] = bs[i]; + ctx->pri.gs[i] = gs[i]; + ctx->pri.cs[i] = cs[i]; + } + if (priNdims <= 0){ + ctx->pri.bs[i] = ctx->pri.gs[i] = ctx->pri.cs[i] = 1; + } + } + if (reduxIsSmallCodeModel(ctx)){ + reduxScheduleKernel(auxNdims, + auxDims, + warpSize, + maxLgAux, maxLs, + maxGg, maxGs, + bs, gs, cs); + for (i=0;iaux.bs[i] = bs[i]; + ctx->aux.gs[i] = gs[i]; + ctx->aux.cs[i] = cs[i]; + } + if (auxNdims <= 0){ + ctx->aux.bs[i] = ctx->aux.gs[i] = ctx->aux.cs[i] = 1; + } + } + + return reduxInvoke(ctx); +} + +/** + * @brief Given the parameters of a kernel scheduling problem, solve it as + * optimally as possible. + * + * NB: This is the only function in this entire file that should have + * anything to do with the integer factorization APIs. 
+ */ - maxLg = maxL; - maxLs[0] = maxL0, maxLs[1]=maxL1, maxLs[2]=maxL2; - maxGg = maxG; - maxGs[0] = maxG0, maxGs[1]=maxG1, maxGs[2]=maxG2; - dims[0] = dims[1] = dims[2] = 1; - slack[0] = slack[1] = slack[2] = 1.1; +static void reduxScheduleKernel (int ndims, + uint64_t* dims, + uint64_t warpSize, + uint64_t maxLg, + uint64_t* maxLs, + uint64_t maxGg, + uint64_t* maxGs, + uint64_t* bs, + uint64_t* gs, + uint64_t* cs){ + uint64_t warpMod, bestWarpMod = 1; + int i, bestWarpAxis = 0; + uint64_t roundedDims[MAX_HW_DIMS]; + double slack [MAX_HW_DIMS]; + ga_factor_list factBS [MAX_HW_DIMS]; + ga_factor_list factGS [MAX_HW_DIMS]; + ga_factor_list factCS [MAX_HW_DIMS]; + + + /** + * Quick check for scalar case. + */ + + if (ndims <= 0){ + return; + } + + + /** + * Identify the dimension to which the warp factor will be given. + * + * The current heuristic is to find the dimension that is either + * 1) Evenly divided by the warp size, or + * 2) As close to filling the last warp as possible. + */ - for (i=0;indh;i++){ - dims[i] = ctx->src->dimensions[ctx->hwAxisList[i]]; + for (i=0;i0 && (warpMod==0 || warpMod>=bestWarpMod)){ bestWarpAxis = i; bestWarpMod = warpMod; } } - if (ctx->ndh > 0){ - dims[bestWarpAxis] = (dims[bestWarpAxis] + warpSize - 1)/warpSize; + if (ndims > 0){ + roundedDims[bestWarpAxis] = (roundedDims[bestWarpAxis] + warpSize - 1)/warpSize; gaIFactorize(warpSize, 0, 0, &factBS[bestWarpAxis]); } @@ -1749,8 +1963,11 @@ static int reduxScheduleLarge (redux_ctx* ctx){ * chunkSize. */ - for (i=0;indh;i++){ - while (!gaIFactorize(dims[i], (uint64_t)(dims[i]*slack[i]), maxLs[i], &factCS[i])){ + for (i=0;indh, maxLg, maxLs, maxGg, maxGs, factBS, factGS, factCS); - - /* Output. */ - for (i=0;indh;i++){ - ctx->blockSize[i] = gaIFLGetProduct(&factBS[i]); - ctx->gridSize [i] = gaIFLGetProduct(&factGS[i]); - ctx->chunkSize[i] = gaIFLGetProduct(&factCS[i]); + gaIFLSchedule(ndims, maxLg, maxLs, maxGg, maxGs, factBS, factGS, factCS); + for (i=0;isrcStepsGD = gpudata_alloc(ctx->gpuCtx, ctx->nds * sizeof(size_t), - ctx->src->strides, flags, 0); - ctx->srcSizeGD = gpudata_alloc(ctx->gpuCtx, ctx->nds * sizeof(size_t), - ctx->src->dimensions, flags, 0); - ctx->chunkSizeGD = gpudata_alloc(ctx->gpuCtx, ctx->ndh * sizeof(size_t), - ctx->chunkSize, flags, 0); - if (reduxKernelRequiresDst(ctx)){ - ctx->dstStepsGD = gpudata_alloc(ctx->gpuCtx, ctx->ndd * sizeof(size_t), - ctx->dst->strides, flags, 0); + const int flags = GA_BUFFER_READ_ONLY|GA_BUFFER_INIT; + ctx->srcStepsGD = gpudata_alloc(ctx->gpuCtx, ctx->nds * sizeof(size_t), + ctx->src->strides, flags, 0); + ctx->srcSizeGD = gpudata_alloc(ctx->gpuCtx, ctx->nds * sizeof(size_t), + ctx->src->dimensions, flags, 0); + ctx->pri.chunkSizeGD = gpudata_alloc(ctx->gpuCtx, ctx->pri.ndh * sizeof(size_t), + ctx->pri.cs, flags, 0); + + priArgs[i++] = (void*) ctx->src->data; + priArgs[i++] = (void*)&ctx->src->offset; + priArgs[i++] = (void*) ctx->srcStepsGD; + priArgs[i++] = (void*) ctx->srcSizeGD; + priArgs[i++] = (void*) ctx->pri.chunkSizeGD; + if (reduxKernelRequiresDst (ctx)){ + ctx->dstStepsGD = gpudata_alloc(ctx->gpuCtx, ctx->ndd * sizeof(size_t), + ctx->dst->strides, flags, 0); + priArgs[i++] = (void*) ctx->dst->data; + priArgs[i++] = (void*)&ctx->dst->offset; + priArgs[i++] = (void*) ctx->dstStepsGD; + failedDstSteps = !ctx->dstStepsGD; } if (reduxKernelRequiresDstArg(ctx)){ - ctx->dstArgStepsGD = gpudata_alloc(ctx->gpuCtx, ctx->ndd * sizeof(size_t), - ctx->dstArg->strides, flags, 0); - } - args[ 0] = (void*) ctx->src->data; - args[ 1] = 
(void*)&ctx->src->offset; - args[ 2] = (void*) ctx->srcStepsGD; - args[ 3] = (void*) ctx->srcSizeGD; - args[ 4] = (void*) ctx->chunkSizeGD; - if ( reduxKernelRequiresDst (ctx) && reduxKernelRequiresDstArg(ctx)){ - args[ 5] = (void*) ctx->dst->data; - args[ 6] = (void*)&ctx->dst->offset; - args[ 7] = (void*) ctx->dstStepsGD; - args[ 8] = (void*) ctx->dstArg->data; - args[ 9] = (void*)&ctx->dstArg->offset; - args[10] = (void*) ctx->dstArgStepsGD; - }else if ( reduxKernelRequiresDst (ctx) && !reduxKernelRequiresDstArg(ctx)){ - args[ 5] = (void*) ctx->dst->data; - args[ 6] = (void*)&ctx->dst->offset; - args[ 7] = (void*) ctx->dstStepsGD; - }else if (!reduxKernelRequiresDst (ctx) && reduxKernelRequiresDstArg(ctx)){ - args[ 5] = (void*) ctx->dstArg->data; - args[ 6] = (void*)&ctx->dstArg->offset; - args[ 7] = (void*) ctx->dstArgStepsGD; + ctx->dstArgStepsGD = gpudata_alloc(ctx->gpuCtx, ctx->ndd * sizeof(size_t), + ctx->dstArg->strides, flags, 0); + priArgs[i++] = (void*) ctx->dstArg->data; + priArgs[i++] = (void*)&ctx->dstArg->offset; + priArgs[i++] = (void*) ctx->dstArgStepsGD; + failedDstArgSteps = !ctx->dstArgStepsGD; + } + if (reduxIsSmallCodeModel(ctx)){ + /** + * The auxiliary kernel's args are identical to the primary kernel's, + * except that the first three arguments are deleted and the fifth + * argument (now second), called chunkSize, is different. + */ + + memcpy(auxArgs, &priArgs[3], sizeof(auxArgs)); + ctx->aux.chunkSizeGD = gpudata_alloc(ctx->gpuCtx, ctx->aux.ndh * sizeof(size_t), + ctx->aux.cs, flags, 0); + auxArgs[ 1 ] = (void*) ctx->aux.chunkSizeGD; + failedAuxChunkSize = !ctx->aux.chunkSizeGD; } - if (ctx->srcStepsGD && - ctx->srcSizeGD && - ctx->chunkSizeGD && - ctx->dstStepsGD && - ctx->dstArgStepsGD){ + + /** + * One or three kernels is now invoked, depending on the code model. + */ + + if (ctx->srcStepsGD && + ctx->srcSizeGD && + ctx->pri.chunkSizeGD && + !failedDstSteps && + !failedDstArgSteps && + !failedAuxChunkSize){ + /* Pre-kernel invocation, if necessary */ + if(reduxIsSmallCodeModel(ctx)){ + ret = GpuKernel_call(&ctx->preKernel, + ctx->aux.ndh>0 ? ctx->aux.ndh : 1, + ctx->aux.gs, + ctx->aux.bs, + 0, + auxArgs); + if (ret != GA_NO_ERROR){ + return reduxCleanup(ctx, ret); + } + } + + /* Reduction kernel invocation */ ret = GpuKernel_call(&ctx->kernel, - ctx->ndh>0 ? ctx->ndh : 1, - ctx->gridSize, - ctx->blockSize, + ctx->pri.ndh>0 ? ctx->pri.ndh : 1, + ctx->pri.gs, + ctx->pri.bs, 0, - args); + priArgs); + if (ret != GA_NO_ERROR){ + return reduxCleanup(ctx, ret); + } + + /* Post-kernel invocation, if necessary */ + if(reduxIsSmallCodeModel(ctx)){ + ret = GpuKernel_call(&ctx->postKernel, + ctx->aux.ndh>0 ? 
ctx->aux.ndh : 1, + ctx->aux.gs, + ctx->aux.bs, + 0, + auxArgs); + if (ret != GA_NO_ERROR){ + return reduxCleanup(ctx, ret); + } + } + return reduxCleanup(ctx, ret); }else{ return reduxCleanup(ctx, GA_MEMORY_ERROR); @@ -1852,18 +2112,28 @@ static int reduxInvokeLarge (redux_ctx* ctx){ */ static int reduxCleanup (redux_ctx* ctx, int ret){ - free(ctx->axisList); + free(ctx->srcAxisList); + free(ctx->dstAxisList); free(ctx->sourceCode); - ctx->axisList = NULL; - ctx->sourceCode = NULL; + free(ctx->errorString0); + free(ctx->errorString1); + free(ctx->errorString2); + ctx->srcAxisList = NULL; + ctx->dstAxisList = NULL; + ctx->sourceCode = NULL; + ctx->errorString0 = NULL; + ctx->errorString1 = NULL; + ctx->errorString2 = NULL; gpudata_release(ctx->srcStepsGD); gpudata_release(ctx->srcSizeGD); - gpudata_release(ctx->chunkSizeGD); gpudata_release(ctx->dstStepsGD); gpudata_release(ctx->dstArgStepsGD); - ctx->srcStepsGD = ctx->srcSizeGD = ctx->chunkSizeGD = - ctx->dstStepsGD = ctx->dstArgStepsGD = NULL; + gpudata_release(ctx->pri.chunkSizeGD); + gpudata_release(ctx->aux.chunkSizeGD); + ctx->srcStepsGD = ctx->srcSizeGD = + ctx->dstStepsGD = ctx->dstArgStepsGD = + ctx->pri.chunkSizeGD = ctx->aux.chunkSizeGD = NULL; return ret; } From b88ae5716d661c6b05a13d2675018d698daba18d Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Fri, 3 Mar 2017 17:10:50 -0500 Subject: [PATCH 07/34] Added variadic string append function strb_appendv(). --- src/util/strb.c | 32 ++++++++++++++++++++++---------- src/util/strb.h | 12 ++++++++++++ 2 files changed, 34 insertions(+), 10 deletions(-) diff --git a/src/util/strb.c b/src/util/strb.c index dda9dcdfc2..ddf50924ca 100644 --- a/src/util/strb.c +++ b/src/util/strb.c @@ -43,25 +43,29 @@ int strb_grow(strb *sb, size_t n) { return 0; } -void strb_appendf(strb *sb, const char *f, ...) { - va_list ap; - int s; +void strb_appendv(strb *sb, const char *f, va_list ap) { + va_list apSave; + int s; - va_start(ap, f); #ifdef _MSC_VER - s = _vscprintf(f, ap); + /** + * va_copy() is a C99 novelty that a particular company should have started + * supporting a long time ago, to their undying shame. + */ + + apSave = ap; + s = _vscprintf(f, apSave); #else - s = vsnprintf(NULL, 0, f, ap); + va_copy(apSave, ap); + s = vsnprintf(NULL, 0, f, apSave); #endif - va_end(ap); - + va_end(apSave); + if (s < 0) { strb_seterror(sb); return; } s += 1; if (strb_ensure(sb, s)) return; - va_start(ap, f); s = vsnprintf(sb->s+sb->l, s, f, ap); - va_end(ap); sb->l += s; } @@ -100,3 +104,11 @@ int strb_write(int fd, strb *sb) { } return 0; } + +void strb_appendf(strb *sb, const char *f, ...) { + va_list ap; + va_start(ap, f); + strb_appendv(sb, f, ap); + va_end(ap); +} + diff --git a/src/util/strb.h b/src/util/strb.h index 223145908e..88a2c08794 100644 --- a/src/util/strb.h +++ b/src/util/strb.h @@ -2,6 +2,7 @@ #define STRB_H #include "private_config.h" +#include #ifdef __cplusplus extern "C" { @@ -187,6 +188,17 @@ void strb_read(strb *sb, int fd, size_t sz); */ int strb_write(int fd, strb *sb); +/* + * Appends the result of a sprintf using the format string `f` and + * following variadic arguments list, excluding terminating nul. + * + * Unlike sprintf, this function makes sure not to run off the end of + * memory and behaves like asprintf in that respect. + * + * A format error will place the strb in error mode. + */ +void strb_appendv(strb *, const char *f, va_list ap); + /* * Returns a C string from the content of the strb. 
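As a usage note for the new strb_appendv(): it makes it straightforward to layer further printf-like helpers on top of the string builder. A minimal sketch follows; the helper name srcAppendLinef is hypothetical and not part of this patch:

#include <stdarg.h>
#include "util/strb.h"

/* Append a formatted line, followed by a newline, to a string builder. */
static void srcAppendLinef(strb* sb, const char* fmt, ...){
        va_list ap;

        va_start(ap, fmt);
        strb_appendv(sb, fmt, ap);
        va_end(ap);
        strb_appends(sb, "\n");
}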
* From 0949626935955fb8a2427b4e3bb97c753f0bc6b7 Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Sat, 4 Mar 2017 19:29:50 -0500 Subject: [PATCH 08/34] Massive refactor of kernel codegen. --- src/gpuarray_reduction.c | 984 +++++++++++++++++++-------------------- src/util/srcgen.h | 106 +++++ 2 files changed, 596 insertions(+), 494 deletions(-) create mode 100644 src/util/srcgen.h diff --git a/src/gpuarray_reduction.c b/src/gpuarray_reduction.c index a5940f504d..61f1688a4f 100644 --- a/src/gpuarray_reduction.c +++ b/src/gpuarray_reduction.c @@ -18,6 +18,7 @@ #include "gpuarray/util.h" #include "util/strb.h" +#include "util/srcgen.h" #include "util/integerfactoring.h" @@ -34,7 +35,7 @@ /** * Reduction Kernel Generator. - * + * * The generator produces a kernel from one of two "code models": * - Large * - Small @@ -43,132 +44,142 @@ * with more than SMALL_REDUX_THRESHOLD elements or more elements than * reductions for each element will result in use of the large code model; * Otherwise the small code model is used. - * - * + * + * * LARGE CODE MODEL: - * + * * In the large code model, each destination element is processed by a * single thread. - * + * * Each thread begins with an initial value in a register, reads from all * source elements contributing to the reduction, computes the result and * writes it to the destination element. - * + * * A single kernel is generated that performs prescalar transformations, the * reduction itself, postscalar transformations and the write to global memory. - * - * + * + * * SMALL CODE MODEL: - * + * * In the small code model, each destination element is processed by * multiple threads. - * + * * The destination tensor is first initialized with the initial value. Then, * one several threads cooperate to perform the reduction atomically on each * destination element. Lastly, postscalar transformations are applied * in-place. - * + * * Two or three kernels are generated: The initialization kernel, the main * kernel that performs prescalar transformations and the reduction itself, and * possibly also a postscalar transformation kernel when it is required. - * - * + * + * * Kernel Template: - * + * * The following kernel code template displays the code generated for the * small code model. For the large code model, no pre/postRedux() kernels * are generated (since their functionality is incorporated within the main * redux() kernel), no atomicRedux() function needs to be generated because * writes to global memory are unconditional and not contended. - * - * - * //Includes - * #include - * #include - * #include - * - * + * + * + * //Macros + * #define FOROVER + * #define ESCAPE + * #define srcVal //Indexer + * #define dstVal //Indexer + * #define dstArgVal //Indexer + * #define rdxIdx //Special reduction index computer + * + * * //Typedefs: - * typedef float T - * typedef int64_t X - * - * - * //Initializer (in case initial T cannot be expressed as a literal) - * static T getInitVal(void){ + * typedef float S //The type of the source array. + * typedef float T //The type of the destination array. + * typedef ssize_t A //The type of the destination argument array. + * typedef ssize_t X //The type of the indices: signed 32/64-bit. + * typedef float K //The type of the accumulator variable. + * + * + * //Initializer (in case initial value of accumulator cannot be expressed + * //as a literal) + * static K getInitValTFn(void){ * return ... * } - * - * + * static K getInitValKFn(void){ + * return ... 
+ * } + * + * * //Reduce into global memory destination a value. - * static void atomicRedux(GLOBAL_MEM T* dst, T val){ - * ... + * static void writeBackFn(GLOBAL_MEM T* d_, T d, + * GLOBAL_MEM A* a_, A a){ + * //Large code model: + * *dPtr = d; + * *aPtr = a; + * + * //Small code model: + * // Something complex possibly involving CAS loops * } - * - * - * //Load data from source and apply pre-operations. - * static T loadVal(X i0, X i1, ..., X iN, - * const GLOBAL_MEM T* src, - * const GLOBAL_MEM X* srcSteps, - * ...?){ + * + * + * //Load data from source and apply pre-operations, coercing the type to + * //the accumulator type K. + * static K loadValFn(X i0, X i1, ..., X iN, + * const GLOBAL_MEM S* srcPtr, + * const X srcOff, + * const GLOBAL_MEM X* srcSteps, + * ...?){ * return ... * } - * - * - * //Initialization kernel, - * KERNEL void preRedux(const GLOBAL_MEM X* srcSize, - * const GLOBAL_MEM X* chunkSize, - * GLOBAL_MEM T* dst, - * const X dstOff, - * const GLOBAL_MEM X* dstSteps){ - * //OFFSETS - * dst += dstOff; - * - * //Initialize - * dst[...] = getInitVal(); + * + * + * //Initialization kernel + * KERNEL void initKer(const GLOBAL_MEM X* srcSize, + * const GLOBAL_MEM X* chunkSize, + * GLOBAL_MEM T* dstPtr, + * const X dstOff, + * const GLOBAL_MEM X* dstSteps){ + * dstVal = getInitValTFn(); * } - * - * + * + * * //Reduction Kernel. - * KERNEL void redux(const GLOBAL_MEM T* src, - * const X srcOff, - * const GLOBAL_MEM X* srcSteps, - * const GLOBAL_MEM X* srcSize, - * const GLOBAL_MEM X* chunkSize, - * GLOBAL_MEM T* dst, - * const X dstOff, - * const GLOBAL_MEM X* dstSteps, - * GLOBAL_MEM X* dstArg, - * const X dstArgOff, - * const GLOBAL_MEM X* dstArgSteps){ - * //OFFSETS - * src += srcOff - * dst += dstOff - * dstArg += dstArgOff - * + * KERNEL void reduxKer(GLOBAL_MEM S* srcPtr, + * const X srcOff, + * const GLOBAL_MEM X* srcSteps, + * const GLOBAL_MEM X* srcSize, + * const GLOBAL_MEM X* chunkSize, + * GLOBAL_MEM T* dstPtr, + * const X dstOff, + * const GLOBAL_MEM X* dstSteps, + * GLOBAL_MEM A* dstArgPtr, + * const X dstArgOff, + * const GLOBAL_MEM X* dstArgSteps){ * //Declare Indices * //Compute Ranges - * - * //Define macros + * * //Outer Loops + * K rdxK = getInitValKFn(); + * A rdxA = 0; * //Inner Loops - * //Undefine macros + * K k = loadValFn(indices..., srcPtr, srcOff, srcSteps) + * rdxK = k + * rdxA = rdxIdx + * writeBackFn(&dstVal, d, &dstArgVal, a); * } - * - * + * + * * //Post-scalar kernel, - * KERNEL void postRedux(const GLOBAL_MEM X* srcSize, - * const GLOBAL_MEM X* chunkSize, - * GLOBAL_MEM T* dst, - * const X dstOff, - * const GLOBAL_MEM X* dstSteps){ - * //OFFSETS - * dst += dstOff; - * - * //Initialize - * dst[...] = getInitVal(); + * KERNEL void postKer(const GLOBAL_MEM X* srcSize, + * const GLOBAL_MEM X* chunkSize, + * GLOBAL_MEM T* dst, + * const X dstOff, + * const GLOBAL_MEM X* dstSteps){ + * //Default: Nothing. + * dstVal = dstVal * } - * - * + * + * * Initial Reduction Values * +--------------+-----+-----+---------+---------+-----+-----+-----+-----+-----+ * | Type\Op | + | * | max | min | & | | | ^ | && | || | @@ -189,8 +200,10 @@ struct redux_ctx{ const int* reduxList; /* General. */ + GpuArray* wsDst; + GpuArray* wsDstArg; int* srcAxisList; - int* dstAxisList; + size_t* dstDims; gpucontext* gpuCtx; /* Source code Generator. 
*/ @@ -204,12 +217,14 @@ struct redux_ctx{ const char* dstArgTypeStr; const char* idxTypeStr; const char* accTypeStr; - const char* initVal; + const char* initValT; + const char* initValK; int ndd; int ndr; int nds; int largeCodeModel; strb s; + srcb srcGen; char* sourceCode; size_t sourceCodeLen; char* errorString0; @@ -274,11 +289,10 @@ static void appendIdxes (strb* s, const char* epilogue); static int reduxCheckargs (redux_ctx* ctx); static void reduxSelectTypes (redux_ctx* ctx); -static int reduxSelectModel (redux_ctx* ctx); static int reduxIsSmallCodeModel (redux_ctx* ctx); static int reduxIsLargeCodeModel (redux_ctx* ctx); -static int reduxHasDst (redux_ctx* ctx); -static int reduxHasDstArg (redux_ctx* ctx); +static int reduxRequiresDst (redux_ctx* ctx); +static int reduxRequiresDstArg (redux_ctx* ctx); static int reduxKernelRequiresDst (redux_ctx* ctx); static int reduxKernelRequiresDstArg (redux_ctx* ctx); static int reduxCanAppendHwAxis (redux_ctx* ctx, @@ -292,22 +306,22 @@ static int reduxComputeAxisList (redux_ctx* ctx); static int reduxGenSource (redux_ctx* ctx); static void reduxAppendSource (redux_ctx* ctx); static void reduxAppendIncludes (redux_ctx* ctx); +static void reduxAppendTensorDeclArgs (redux_ctx* ctx, + const char* type, + const char* baseName); +static void reduxAppendTensorCallArgs (redux_ctx* ctx, + const char* baseName); +static void reduxAppendMacroDefs (redux_ctx* ctx); static void reduxAppendTypedefs (redux_ctx* ctx); -static void reduxAppendFuncGetInitVal (redux_ctx* ctx); -static void reduxAppendFuncLoadVal (redux_ctx* ctx); -static void reduxAppendFuncReduxVal (redux_ctx* ctx); -static void reduxAppendFuncPreKernel (redux_ctx* ctx); -static void reduxAppendFuncKernel (redux_ctx* ctx); -static void reduxAppendFuncPostKernel (redux_ctx* ctx); +static void reduxAppendGetInitValFns (redux_ctx* ctx); +static void reduxAppendWriteBackFn (redux_ctx* ctx); +static void reduxAppendReduxKernel (redux_ctx* ctx); static void reduxAppendPrototype (redux_ctx* ctx); -static void reduxAppendOffsets (redux_ctx* ctx); static void reduxAppendIndexDeclarations (redux_ctx* ctx); static void reduxAppendRangeCalculations (redux_ctx* ctx); static void reduxAppendLoops (redux_ctx* ctx); -static void reduxAppendLoopMacroDefs (redux_ctx* ctx); -static void reduxAppendLoopOuter (redux_ctx* ctx); -static void reduxAppendLoopInner (redux_ctx* ctx); -static void reduxAppendLoopMacroUndefs (redux_ctx* ctx); +static void reduxAppendInitKernel (redux_ctx* ctx); +static void reduxAppendPostKernel (redux_ctx* ctx); static int reduxCompile (redux_ctx* ctx); static int reduxSchedule (redux_ctx* ctx); static void reduxScheduleKernel (int ndims, @@ -771,28 +785,36 @@ static void appendIdxes (strb* s, } /** - * @brief Check the sanity of the arguments, in agreement with the + * @brief Check the sanity of the arguments in agreement with the * documentation for GpuArray_reduction(). * - * Also initialize certain parts of the context. + * Also initialize certain parts of the context, allocate memory + * buffers and fail out if at any point the environment gives us + * a problem. * - * @return GA_INVALID_ERROR if arguments invalid; GA_NO_ERROR otherwise. + * @return GA_INVALID_ERROR if arguments invalid; GA_NO_MEMORY if out of + * memory, GA_NO_ERROR otherwise. */ static int reduxCheckargs (redux_ctx* ctx){ - int i, ret; + int i, j, ret, retT, retK; + unsigned numProcs; + size_t localSize; + size_t dstNumElem = 1, reduxPerElem = 1; /** * We initialize certain parts of the context. 
*/ + ctx->wsDst = NULL; + ctx->wsDstArg = NULL; ctx->srcAxisList = NULL; - ctx->dstAxisList = NULL; + ctx->dstDims = NULL; ctx->gpuCtx = NULL; ctx->srcTypeStr = ctx->dstTypeStr = ctx->dstArgTypeStr = ctx->accTypeStr = ctx->idxTypeStr = NULL; - ctx->initVal = NULL; + ctx->initValK = NULL; ctx->pri.ndh = ctx->aux.ndh = 0; ctx->pri.ndhd = ctx->aux.ndhd = 0; ctx->pri.ndhr = ctx->aux.ndhr = 0; @@ -802,6 +824,7 @@ static int reduxCheckargs (redux_ctx* ctx){ ctx->errorString1 = NULL; ctx->errorString2 = NULL; strb_init(&ctx->s); + srcbInit (&ctx->srcGen, &ctx->s); for (i=0;iaux.axisList[i] = ctx->pri.axisList[i] = 0; @@ -817,12 +840,14 @@ static int reduxCheckargs (redux_ctx* ctx){ /* Insane src, reduxLen, dst or dstArg? */ - if (!ctx->src || ctx->src->nd <= 0 || ctx->reduxLen == 0 || - ctx->reduxLen > (int)ctx->src->nd){ - return reduxCleanup(ctx, GA_INVALID_ERROR); - } - if ((reduxHasDst (ctx) && !ctx->dst) || - (reduxHasDstArg(ctx) && !ctx->dstArg)){ + if (!ctx->src || + (reduxRequiresDst (ctx) && !ctx->dst) || + (reduxRequiresDstArg(ctx) && !ctx->dstArg) || + (ctx->src->nd <= 0) || + (ctx->reduxLen <= 0) || + (ctx->src->nd < (unsigned)ctx->reduxLen) || + (ctx->dst && ctx->dst->nd +ctx->reduxLen != ctx->src->nd) || + (ctx->dstArg && ctx->dstArg->nd+ctx->reduxLen != ctx->src->nd) ){ return reduxCleanup(ctx, GA_INVALID_ERROR); } @@ -855,36 +880,46 @@ static int reduxCheckargs (redux_ctx* ctx){ /* Determine initializer, and error out if reduction unsupported. */ switch (ctx->op){ case GA_REDUCE_SUM: - ret = reduxGetSumInit (ctx->accTypeCode, &ctx->initVal); + retT = reduxGetSumInit (ctx->dstTypeCode, &ctx->initValT); + retK = reduxGetSumInit (ctx->accTypeCode, &ctx->initValK); break; case GA_REDUCE_PRODNZ: case GA_REDUCE_PROD: - ret = reduxGetProdInit(ctx->accTypeCode, &ctx->initVal); + retT = reduxGetProdInit(ctx->dstTypeCode, &ctx->initValT); + retK = reduxGetProdInit(ctx->accTypeCode, &ctx->initValK); break; case GA_REDUCE_MINANDARGMIN: case GA_REDUCE_ARGMIN: case GA_REDUCE_MIN: - ret = reduxGetMinInit (ctx->accTypeCode, &ctx->initVal); + retT = reduxGetMinInit (ctx->dstTypeCode, &ctx->initValT); + retK = reduxGetMinInit (ctx->accTypeCode, &ctx->initValK); break; case GA_REDUCE_MAXANDARGMAX: case GA_REDUCE_ARGMAX: case GA_REDUCE_MAX: - ret = reduxGetMaxInit (ctx->accTypeCode, &ctx->initVal); + retT = reduxGetMaxInit (ctx->dstTypeCode, &ctx->initValT); + retK = reduxGetMaxInit (ctx->accTypeCode, &ctx->initValK); break; case GA_REDUCE_ALL: case GA_REDUCE_AND: - ret = reduxGetAndInit (ctx->accTypeCode, &ctx->initVal); + retT = reduxGetAndInit (ctx->dstTypeCode, &ctx->initValT); + retK = reduxGetAndInit (ctx->accTypeCode, &ctx->initValK); break; case GA_REDUCE_ANY: case GA_REDUCE_XOR: case GA_REDUCE_OR: - ret = reduxGetOrInit (ctx->accTypeCode, &ctx->initVal); + retT = reduxGetOrInit (ctx->dstTypeCode, &ctx->initValT); + retK = reduxGetOrInit (ctx->accTypeCode, &ctx->initValK); break; default: - ret = GA_UNSUPPORTED_ERROR; + retT = GA_UNSUPPORTED_ERROR; + retK = GA_UNSUPPORTED_ERROR; } - if (ret != GA_NO_ERROR){ - return reduxCleanup(ctx, ret); + if (retT != GA_NO_ERROR){ + return reduxCleanup(ctx, retT); + } + if (retK != GA_NO_ERROR){ + return reduxCleanup(ctx, retK); } @@ -896,11 +931,109 @@ static int reduxCheckargs (redux_ctx* ctx){ ctx->nds = ctx->src->nd; ctx->ndr = ctx->reduxLen; ctx->ndd = ctx->nds - ctx->ndr; - strb_ensure(&ctx->s, 5*1024); + strb_ensure(&ctx->s, 3*1024); + + + /** + * And make a few small dynamic memory allocations for the benefit of the + * rest of the code, allowing 
error checking to happen early and fail fast. + */ + + ctx->srcAxisList = malloc(ctx->nds * sizeof(unsigned)); + ctx->dstDims = malloc(ctx->ndd * sizeof(size_t)); + if (!ctx->srcAxisList || + !ctx->dstDims ){ + return reduxCleanup(ctx, GA_MEMORY_ERROR); + } + + + /** + * Query device for approximate total level of parallelism. If destination + * tensor is so big it can keep all threads busy on individual elements, + * use large code model; Otherwise use small code model, where threads will + * have to cooperate. + * + * - Large (Destination tensor >= SMALL_REDUX_THRESHOLD elements, or + * destination tensor size >= # of reductions per destination + * tensor element): + * All destination elements have their own thread. + * - Small (otherwise): + * Multiple threads cooperate on a single destination element. + */ + ret = gpucontext_property(ctx->gpuCtx, GA_CTX_PROP_NUMPROCS, &numProcs); + if (ret != GA_NO_ERROR){ + return reduxCleanup(ctx, ret); + } + ret = gpucontext_property(ctx->gpuCtx, GA_CTX_PROP_MAXLSIZE, &localSize); + if (ret != GA_NO_ERROR){ + return reduxCleanup(ctx, ret); + } + for (i=j=0;inds;i++){ + if (axisInSet(i, ctx->reduxList, ctx->nds, NULL)){ + reduxPerElem *= ctx->src->dimensions[i]; + }else{ + dstNumElem *= ctx->src->dimensions[i]; + ctx->dstDims[j++] = ctx->src->dimensions[i];; + } + } - return reduxSelectModel(ctx); + ctx->largeCodeModel = dstNumElem >= numProcs*localSize || + dstNumElem >= reduxPerElem + || 1;/* BUG: Erase when small code model implemented. */ + /** + * *** IT IS NOW SAFE TO CALL: *** + * - reduxIsLargeModel() + * - reduxIsSmallModel() + * - reduxKernelRequiresDst() + * - reduxKernelRequiresDstArg() + */ + + + /** + * Allocate workspaces. + * + * Certain reductions may require a workspace that isn't provided by the user. + * For instance, **when using the small code model**, argmin/argmax require + * a dst buffer, but the user didn't supply one (as he would have for + * maxandargmax/minandargmin). We must allocate and deallocate it ourselves. + * + * Otherwise we use the user-supplied buffers. + */ + + if (!reduxRequiresDst (ctx) && reduxKernelRequiresDst(ctx)){ + ctx->wsDst = malloc(sizeof(*ctx->wsDst)); + if (!ctx->wsDst){ + return reduxCleanup(ctx, GA_MEMORY_ERROR); + } + + ret = GpuArray_empty(ctx->wsDst, ctx->gpuCtx, ctx->dstTypeCode, + ctx->ndd, ctx->dstDims, GA_C_ORDER); + if(ret != GA_NO_ERROR){ + return reduxCleanup(ctx, ret); + } + }else{ + ctx->wsDst = ctx->dst; + } + if (!reduxRequiresDstArg(ctx) && reduxKernelRequiresDstArg(ctx)){ + ctx->wsDstArg = malloc(sizeof(*ctx->wsDstArg)); + if (!ctx->wsDstArg){ + return reduxCleanup(ctx, GA_MEMORY_ERROR); + } + + ret = GpuArray_empty(ctx->wsDstArg, ctx->gpuCtx, ctx->dstArgTypeCode, + ctx->ndd, ctx->dstDims, GA_C_ORDER); + if(ret != GA_NO_ERROR){ + return reduxCleanup(ctx, ret); + } + }else{ + ctx->wsDstArg = ctx->dstArg; + } + + + + return reduxSelectHwAxes(ctx); } /** @@ -948,67 +1081,6 @@ static void reduxSelectTypes (redux_ctx* ctx){ ctx->accTypeStr = gpuarray_get_type(ctx->accTypeCode) ->cluda_name; } -/** - * @brief Select which code model will be used: - * - * - Large (Destination tensor >= SMALL_REDUX_THRESHOLD elements, or - * destination tensor size >= # of reductions per destination - * tensor element): - * All destination elements have their own thread. - * - Small (otherwise): - * Multiple threads cooperate on a single destination element. 
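 *
 * (A purely illustrative example of the selection rule computed above, with
 *  made-up device numbers: if numProcs = 16 and localSize = 1024, then
 *  numProcs*localSize = 16384. Reducing a (1000000, 64) tensor over axis 1
 *  gives dstNumElem = 1000000 >= 16384, so the large code model is selected.
 *  Reducing a (4, 1000000) tensor over axis 1 gives dstNumElem = 4, smaller
 *  than both 16384 and the 1000000 reductions per destination element, so
 *  the small code model would be selected once it is implemented.)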
- */ - -static int reduxSelectModel (redux_ctx* ctx){ - int i, ret; - unsigned numProcs; - size_t localSize; - size_t dstNumElem = 1, reduxPerElem = 1; - - - /** - * Query device for approximate total level of parallelism. If destination - * tensor is so big it can keep all threads busy on individual elements, - * use large code model; Otherwise use small code model, where threads will - * have to cooperate. - */ - - ret = gpucontext_property(ctx->gpuCtx, GA_CTX_PROP_NUMPROCS, &numProcs); - if (ret != GA_NO_ERROR){ - return reduxCleanup(ctx, ret); - } - ret = gpucontext_property(ctx->gpuCtx, GA_CTX_PROP_MAXLSIZE, &localSize); - if (ret != GA_NO_ERROR){ - return reduxCleanup(ctx, ret); - } - - - /** - * Compute #elems in dst and # reductions per dst element. - */ - - for (i=0;inds;i++){ - if (axisInSet(i, ctx->reduxList, ctx->nds, NULL)){ - reduxPerElem *= ctx->src->dimensions[i]; - }else{ - dstNumElem *= ctx->src->dimensions[i]; - } - } - ctx->largeCodeModel = dstNumElem >= numProcs*localSize || - dstNumElem >= reduxPerElem - || 1;/* BUG: Erase when small code model implemented. */ - /** - * *** IT IS NOW SAFE TO CALL: *** - * - reduxIsLargeModel() - * - reduxIsSmallModel() - * - reduxKernelRequiresDst() - * - reduxKernelRequiresDstArg() - */ - - - return reduxSelectHwAxes(ctx); -} - /** * @brief Returns whether we are using the small code model or not. */ @@ -1029,7 +1101,7 @@ static int reduxIsLargeCodeModel (redux_ctx* ctx){ * @brief Returns whether the reduction interface requires a dst argument. */ -static int reduxHasDst (redux_ctx* ctx){ +static int reduxRequiresDst (redux_ctx* ctx){ switch (ctx->op){ case GA_REDUCE_ARGMIN: case GA_REDUCE_ARGMAX: @@ -1043,7 +1115,7 @@ static int reduxHasDst (redux_ctx* ctx){ * @brief Returns whether the reduction interface requires a dstArg argument. */ -static int reduxHasDstArg (redux_ctx* ctx){ +static int reduxRequiresDstArg (redux_ctx* ctx){ switch (ctx->op){ case GA_REDUCE_MINANDARGMIN: case GA_REDUCE_MAXANDARGMAX: @@ -1093,7 +1165,7 @@ static int reduxKernelRequiresDstArg (redux_ctx* ctx){ * buffer for indexes, and will not in the foreseeable future. */ - return reduxHasDstArg(ctx); + return reduxRequiresDstArg(ctx); } /** @@ -1107,8 +1179,8 @@ static int reduxCanAppendHwAxis (redux_ctx* ctx, int kernelNdh = kernelType == KERNEL_PRIMARY ? ctx->pri.ndh : ctx->aux.ndh; int kernelNdhr = kernelType == KERNEL_PRIMARY ? ctx->pri.ndhr : ctx->aux.ndhr; int kernelNdhd = kernelType == KERNEL_PRIMARY ? ctx->pri.ndhd : ctx->aux.ndhd; - - if(kernelNdh >= MAX_HW_DIMS){ + + if (kernelNdh >= MAX_HW_DIMS){ return 0; }else{ return axisType == AXIS_REDUX ? kernelNdhr < ctx->ndr: @@ -1215,19 +1287,11 @@ static int reduxSelectHwAxes (redux_ctx* ctx){ * The first ctx->ndd axes correspond to the outer loops that iterate over * each destination element. The last ctx->ndr axes correspond to the inner * loops that iterate over the dimensions of elements that are to be reduced. - * - * @return GA_MEMORY_ERROR if allocating the list failed; Otherwise, returns - * GA_NO_ERROR. 
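 *
 * (For illustration: with a 5-dimensional source and reduxList = {3,4,1},
 *  ndd = 2 and ndr = 3, so the axis list computed below would be expected to
 *  come out as {0, 2, 3, 4, 1} -- free axes first, reduced axes last, the
 *  reduced axes keeping the caller-supplied order since that order matters
 *  for dstArg index calculations.)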
*/ static int reduxComputeAxisList (redux_ctx* ctx){ int i, f=0; - ctx->srcAxisList = malloc(ctx->nds * sizeof(unsigned)); - if (!ctx->srcAxisList){ - return reduxCleanup(ctx, GA_MEMORY_ERROR); - } - for (i=0;inds;i++){ if (!axisInSet(i, ctx->reduxList, ctx->ndr, 0)){ ctx->srcAxisList[f++] = i; @@ -1257,15 +1321,81 @@ static int reduxGenSource (redux_ctx* ctx){ } static void reduxAppendSource (redux_ctx* ctx){ reduxAppendIncludes (ctx); + reduxAppendMacroDefs (ctx); reduxAppendTypedefs (ctx); - reduxAppendFuncGetInitVal (ctx); - reduxAppendFuncLoadVal (ctx); - reduxAppendFuncReduxVal (ctx); - if(reduxIsSmallCodeModel(ctx)){ - reduxAppendFuncPreKernel (ctx); - reduxAppendFuncPostKernel (ctx); - } - reduxAppendFuncKernel (ctx); + reduxAppendGetInitValFns (ctx); + reduxAppendWriteBackFn (ctx); + reduxAppendReduxKernel (ctx); + if (reduxIsSmallCodeModel(ctx)){ + reduxAppendInitKernel (ctx); + reduxAppendPostKernel (ctx); + } +} +static void reduxAppendTensorDeclArgs (redux_ctx* ctx, + const char* type, + const char* baseName){ + srcbAppendElemf(&ctx->srcGen, "%s* %sPtr", type, baseName); + srcbAppendElemf(&ctx->srcGen, "const X %sOff", baseName); + srcbAppendElemf(&ctx->srcGen, "const GLOBAL_MEM X* %sSteps", baseName); + (void)reduxAppendTensorCallArgs;/* Silence unused warning */ +} +static void reduxAppendTensorCallArgs (redux_ctx* ctx, + const char* baseName){ + srcbAppendElemf(&ctx->srcGen, "%sPtr", baseName); + srcbAppendElemf(&ctx->srcGen, "%sOff", baseName); + srcbAppendElemf(&ctx->srcGen, "%sSteps", baseName); +} +static void reduxAppendMacroDefs (redux_ctx* ctx){ + int i; + + srcbAppends (&ctx->srcGen, "#define FOROVER(idx) for(i##idx = i##idx##Start; i##idx < i##idx##End; i##idx++)\n"); + srcbAppends (&ctx->srcGen, "#define ESCAPE(idx) if(i##idx >= i##idx##Dim){continue;}\n"); + + /* srcVal indexer */ + srcbAppends (&ctx->srcGen, "#define srcVal (*(const GLOBAL_MEM S*)("); + srcbBeginList (&ctx->srcGen, "+", "0"); + srcbAppendElemf(&ctx->srcGen, "(const GLOBAL_MEM char*)srcPtr"); + srcbAppendElemf(&ctx->srcGen, "srcOff"); + for (i=0;inds;i++){ + srcbAppendElemf(&ctx->srcGen, "i%d*i%dSStep", i, i); + } + srcbEndList (&ctx->srcGen); + srcbAppends (&ctx->srcGen, "))\n"); + + /* dstVal indexer */ + if (reduxKernelRequiresDst(ctx)){ + srcbAppends (&ctx->srcGen, "#define dstVal (*(GLOBAL_MEM T*)("); + srcbBeginList (&ctx->srcGen, "+", "0"); + srcbAppendElemf(&ctx->srcGen, "(GLOBAL_MEM char*)dstPtr"); + srcbAppendElemf(&ctx->srcGen, "dstOff"); + for (i=0;indd;i++){ + srcbAppendElemf(&ctx->srcGen, "i%d*i%dDStep", i, i); + } + srcbEndList (&ctx->srcGen); + srcbAppends (&ctx->srcGen, "))\n"); + } + + /* dstArgVal indexer */ + if (reduxKernelRequiresDstArg(ctx)){ + srcbAppends (&ctx->srcGen, "#define dstArgVal (*(GLOBAL_MEM A*)("); + srcbBeginList (&ctx->srcGen, "+", "0"); + srcbAppendElemf(&ctx->srcGen, "(GLOBAL_MEM char*)dstArgPtr"); + srcbAppendElemf(&ctx->srcGen, "dstArgOff"); + for (i=0;indd;i++){ + srcbAppendElemf(&ctx->srcGen, "i%d*i%dAStep", i, i); + } + srcbEndList (&ctx->srcGen); + srcbAppends (&ctx->srcGen, "))\n"); + } + + /* rdxIdx indexer */ + srcbAppends (&ctx->srcGen, "#define rdxIdx ("); + srcbBeginList (&ctx->srcGen, "+", "0"); + for (i=ctx->ndd;inds;i++){ + srcbAppendElemf(&ctx->srcGen, "i%d*i%dPDim", i, i); + } + srcbEndList (&ctx->srcGen); + srcbAppends (&ctx->srcGen, ")\n"); } static void reduxAppendIncludes (redux_ctx* ctx){ strb_appends(&ctx->s, "/* Includes */\n"); @@ -1281,47 +1411,20 @@ static void reduxAppendTypedefs (redux_ctx* ctx){ strb_appendf(&ctx->s, "typedef 
%s X;\n", ctx->idxTypeStr); /* The type of the indices: signed 32/64-bit. */ strb_appendf(&ctx->s, "typedef %s K;\n", ctx->accTypeStr); /* The type of the accumulator variable. */ } -static void reduxAppendFuncGetInitVal (redux_ctx* ctx){ +static void reduxAppendGetInitValFns (redux_ctx* ctx){ /** - * Initial value function. + * Initial value functions. */ - strb_appendf(&ctx->s, "WITHIN_KERNEL K getInitVal(void){\n" + strb_appendf(&ctx->s, "WITHIN_KERNEL T getInitValTFn(void){\n" "\treturn (%s);\n" - "}\n\n\n\n", ctx->initVal); -} -static void reduxAppendFuncLoadVal (redux_ctx* ctx){ - int i; - - /** - * Multidimensional source element loader. - * - * Also implements prescalar transformations if any. - */ - - appendIdxes (&ctx->s, "WITHIN_KERNEL K loadVal(", "X i", 0, ctx->nds, "", ""); - if (ctx->nds > 0){ - strb_appends(&ctx->s, ", "); - } - strb_appends(&ctx->s, "const GLOBAL_MEM S* src, const GLOBAL_MEM X* srcSteps){\n"); - strb_appends(&ctx->s, "\tS v = (*(const GLOBAL_MEM S*)((const GLOBAL_MEM char*)src + "); - for (i=0;inds;i++){ - strb_appendf(&ctx->s, "i%d*srcSteps[%d] + \\\n\t ", i, ctx->srcAxisList[i]); - } - strb_appends(&ctx->s, "0));\n"); - - /* Prescalar transformations go here... */ - - /* Return the value. */ - strb_appends(&ctx->s, "\treturn v;\n"); - strb_appends(&ctx->s, "}\n\n\n\n"); + "}\n\n\n\n" + "WITHIN_KERNEL K getInitValKFn(void){\n" + "\treturn (%s);\n" + "}\n\n\n\n", ctx->initValT, ctx->initValK); } -static void reduxAppendFuncReduxVal (redux_ctx* ctx){ - int i, anyArgsEmitted = 0; - +static void reduxAppendWriteBackFn (redux_ctx* ctx){ /** - * Function Signature. - * * Global memory value reduction function. * * Responsible for either: @@ -1329,44 +1432,25 @@ static void reduxAppendFuncReduxVal (redux_ctx* ctx){ * 2) Safe atomic reduction of partial value into memory. */ - appendIdxes (&ctx->s, "WITHIN_KERNEL void reduxVal(", "X i", 0, ctx->ndd, "", ""); - anyArgsEmitted = ctx->ndd>0; - if (reduxKernelRequiresDst (ctx)){ - if (anyArgsEmitted){ - strb_appends(&ctx->s, ", "); - } - anyArgsEmitted = 1; - strb_appends(&ctx->s, "GLOBAL_MEM T* dst, const GLOBAL_MEM X* dstSteps, K v"); + srcbAppends (&ctx->srcGen, "WITHIN_KERNEL void writeBackFn("); + srcbBeginList (&ctx->srcGen, ", ", "void"); + if (reduxKernelRequiresDst(ctx)){ + srcbAppendElemf(&ctx->srcGen, "GLOBAL_MEM T* d_"); + srcbAppendElemf(&ctx->srcGen, "T d"); } if (reduxKernelRequiresDstArg(ctx)){ - if (anyArgsEmitted){ - strb_appends(&ctx->s, ", "); - } - anyArgsEmitted = 1; - strb_appends(&ctx->s, "GLOBAL_MEM A* dstArg, const GLOBAL_MEM X* dstArgSteps, X i"); + srcbAppendElemf(&ctx->srcGen, "GLOBAL_MEM A* a_"); + srcbAppendElemf(&ctx->srcGen, "A a"); } - strb_appends(&ctx->s, "){\n"); - + srcbEndList (&ctx->srcGen); + srcbAppends (&ctx->srcGen, "){\n"); - /* Post-scalar transformations go here. */ - - - /* Write to memory. */ if (reduxIsLargeCodeModel(ctx)){ - /* Large code model. Easy: just write out the data, since it's safe. 
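 * (For instance, for a max-and-argmax reduction, which needs both dst and
 *  dstArg, the emitted function boils down to
 *
 *      WITHIN_KERNEL void writeBackFn(GLOBAL_MEM T* d_, T d,
 *                                     GLOBAL_MEM A* a_, A a){
 *          *d_ = d;
 *          *a_ = a;
 *      }
 *
 *  an unconditional store, which is safe here because in the large code
 *  model each destination element is owned by exactly one thread.)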
*/ if (reduxKernelRequiresDst (ctx)){ - strb_appends(&ctx->s, "\t(*(GLOBAL_MEM T*)((GLOBAL_MEM char*)dst + "); - for (i=0;indd;i++){ - strb_appendf(&ctx->s, "i%d*dstSteps[%d] +\n\t ", i, i); - } - strb_appends(&ctx->s, "0)) = v;\n"); + srcbAppends (&ctx->srcGen, "\t*d_ = d;\n"); } if (reduxKernelRequiresDstArg(ctx)){ - strb_appends(&ctx->s, "\t(*(GLOBAL_MEM A*)((GLOBAL_MEM char*)dstArg + "); - for (i=0;indd;i++){ - strb_appendf(&ctx->s, "i%d*dstArgSteps[%d] +\n\t ", i, i); - } - strb_appends(&ctx->s, "0)) = i;\n"); + srcbAppends (&ctx->srcGen, "\t*a_ = a;\n"); } }else{ /* BUG: Implement the atomic reduction, one or two CAS loops. */ @@ -1382,49 +1466,28 @@ static void reduxAppendFuncReduxVal (redux_ctx* ctx){ /* Close off function. */ strb_appends(&ctx->s, "}\n\n\n\n"); } -static void reduxAppendFuncPreKernel (redux_ctx* ctx){ - -} -static void reduxAppendFuncKernel (redux_ctx* ctx){ +static void reduxAppendReduxKernel (redux_ctx* ctx){ reduxAppendPrototype (ctx); strb_appends (&ctx->s, "{\n"); - reduxAppendOffsets (ctx); reduxAppendIndexDeclarations(ctx); reduxAppendRangeCalculations(ctx); reduxAppendLoops (ctx); strb_appends (&ctx->s, "}\n"); -} -static void reduxAppendFuncPostKernel (redux_ctx* ctx){ - } static void reduxAppendPrototype (redux_ctx* ctx){ - strb_appends(&ctx->s, "/**\n"); - strb_appends(&ctx->s, " * Reduction Kernel.\n"); - strb_appends(&ctx->s, " *\n"); - strb_appends(&ctx->s, " * Implements actual reduction operation.\n"); - strb_appends(&ctx->s, " */\n\n"); - strb_appends(&ctx->s, "KERNEL void redux(const GLOBAL_MEM S* src,\n"); - strb_appends(&ctx->s, " const X srcOff,\n"); - strb_appends(&ctx->s, " const GLOBAL_MEM X* srcSteps,\n"); - strb_appends(&ctx->s, " const GLOBAL_MEM X* srcSize,\n"); - strb_appends(&ctx->s, " const GLOBAL_MEM X* chunkSize,\n"); - strb_appends(&ctx->s, " GLOBAL_MEM T* dst,\n"); - strb_appends(&ctx->s, " const X dstOff,\n"); - strb_appends(&ctx->s, " const GLOBAL_MEM X* dstSteps,\n"); - strb_appends(&ctx->s, " GLOBAL_MEM A* dstArg,\n"); - strb_appends(&ctx->s, " const X dstArgOff,\n"); - strb_appends(&ctx->s, " const GLOBAL_MEM X* dstArgSteps)"); -} -static void reduxAppendOffsets (redux_ctx* ctx){ - strb_appends(&ctx->s, "\t/* Add offsets */\n"); - strb_appends(&ctx->s, "\tsrc = (const GLOBAL_MEM T*)((const GLOBAL_MEM char*)src + srcOff);\n"); - if (reduxKernelRequiresDst(ctx)){ - strb_appends(&ctx->s, "\tdst = (GLOBAL_MEM T*) ((GLOBAL_MEM char*) dst + dstOff);\n"); + srcbAppends (&ctx->srcGen, "KERNEL void reduxKer("); + srcbBeginList (&ctx->srcGen, ", ", "void"); + reduxAppendTensorDeclArgs(ctx, "S", "src"); + srcbAppendElemf(&ctx->srcGen, "const GLOBAL_MEM X* srcSize"); + srcbAppendElemf(&ctx->srcGen, "const GLOBAL_MEM X* chunkSize"); + if(reduxKernelRequiresDst(ctx)){ + reduxAppendTensorDeclArgs(ctx, "T", "dst"); } - if (reduxKernelRequiresDstArg(ctx)){ - strb_appends(&ctx->s, "\tdstArg = (GLOBAL_MEM X*) ((GLOBAL_MEM char*) dstArg + dstArgOff);\n"); + if(reduxKernelRequiresDstArg(ctx)){ + reduxAppendTensorDeclArgs(ctx, "A", "dstArg"); } - strb_appends(&ctx->s, "\t\n\t\n"); + srcbEndList (&ctx->srcGen); + srcbAppends (&ctx->srcGen, ")"); } static void reduxAppendIndexDeclarations (redux_ctx* ctx){ int i; @@ -1441,39 +1504,39 @@ static void reduxAppendIndexDeclarations (redux_ctx* ctx){ i, i, (i==ctx->pri.ndh-1) ? 
";\n" : ", "); } } - strb_appends(&ctx->s, "\t\n\t\n"); strb_appends(&ctx->s, "\t/* Free indices & Reduction indices */\n"); - if (ctx->nds > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "", ";\n");} if (ctx->nds > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "Dim", ";\n");} if (ctx->nds > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "Start", ";\n");} if (ctx->nds > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "End", ";\n");} if (ctx->nds > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "SStep", ";\n");} - if (ctx->ndd > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->ndd, "MStep", ";\n");} + if (ctx->ndd > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->ndd, "DStep", ";\n");} if (ctx->ndd > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->ndd, "AStep", ";\n");} if (ctx->nds > ctx->ndd){appendIdxes (&ctx->s, "\tX ", "i", ctx->ndd, ctx->nds, "PDim", ";\n");} - strb_appends(&ctx->s, "\t\n\t\n"); } static void reduxAppendRangeCalculations (redux_ctx* ctx){ size_t hwDim; int i; - /* Use internal remapping when computing the ranges for this thread. */ strb_appends(&ctx->s, "\t/* Compute ranges for this thread. */\n"); for (i=0;inds;i++){ - strb_appendf(&ctx->s, "\ti%dDim = srcSize[%d];\n", i, ctx->srcAxisList[i]); + strb_appendf(&ctx->s, "\ti%dDim = srcSize[%d];\n", i, ctx->srcAxisList[i]); } for (i=0;inds;i++){ strb_appendf(&ctx->s, "\ti%dSStep = srcSteps[%d];\n", i, ctx->srcAxisList[i]); } - for (i=0;indd;i++){ - strb_appendf(&ctx->s, "\ti%dMStep = dstSteps[%d];\n", i, i); + if(reduxKernelRequiresDst(ctx)){ + for (i=0;indd;i++){ + strb_appendf(&ctx->s, "\ti%dDStep = dstSteps[%d];\n", i, i); + } } - for (i=0;indd;i++){ - strb_appendf(&ctx->s, "\ti%dAStep = dstArgSteps[%d];\n", i, i); + if(reduxKernelRequiresDstArg(ctx)){ + for (i=0;indd;i++){ + strb_appendf(&ctx->s, "\ti%dAStep = dstArgSteps[%d];\n", i, i); + } } for (i=ctx->nds-1;i>=ctx->ndd;i--){ /** @@ -1515,192 +1578,114 @@ static void reduxAppendRangeCalculations (redux_ctx* ctx){ strb_appends(&ctx->s, "\t\n\t\n"); } static void reduxAppendLoops (redux_ctx* ctx){ - strb_appends(&ctx->s, "\t/**\n"); - strb_appends(&ctx->s, "\t * FREE LOOPS.\n"); - strb_appends(&ctx->s, "\t */\n"); - strb_appends(&ctx->s, "\t\n"); - - reduxAppendLoopMacroDefs (ctx); - reduxAppendLoopOuter (ctx); - reduxAppendLoopMacroUndefs(ctx); -} -static void reduxAppendLoopMacroDefs (redux_ctx* ctx){ - int i; - - /** - * FOROVER Macro - */ - - strb_appends(&ctx->s, "#define FOROVER(idx) for(i##idx = i##idx##Start; i##idx < i##idx##End; i##idx++)\n"); - - /** - * ESCAPE Macro - */ - - strb_appends(&ctx->s, "#define ESCAPE(idx) if(i##idx >= i##idx##Dim){continue;}\n"); - - /** - * RDXINDEXER Macro - */ - - appendIdxes (&ctx->s, "#define RDXINDEXER(", "i", ctx->ndd, ctx->nds, "", ") ("); - for (i=ctx->ndd;inds;i++){ - strb_appendf(&ctx->s, "i%d*i%dPDim + \\\n ", i, i); - } - strb_appends(&ctx->s, "0)\n"); -} -static void reduxAppendLoopOuter (redux_ctx* ctx){ int i; - /** - * Outer Loop Header Generation - */ - - for (i=0;indd;i++){ - strb_appendf(&ctx->s, "\tFOROVER(%d){ESCAPE(%d)\n", i, i); - } - - /** - * Inner Loop Generation - */ - - reduxAppendLoopInner(ctx); - - /** - * Outer Loop Trailer Generation - */ - for (i=0;indd;i++){ - strb_appends(&ctx->s, "\t}\n"); + srcbAppendf(&ctx->srcGen, "\tFOROVER(%d){ESCAPE(%d)\n", i, i); } -} -static void reduxAppendLoopInner (redux_ctx* ctx){ - int i; - /** - * Inner Loop Prologue - */ - - strb_appends(&ctx->s, "\t\t/**\n"); - strb_appends(&ctx->s, "\t\t * Reduction initialization.\n"); - 
strb_appends(&ctx->s, "\t\t */\n"); - strb_appends(&ctx->s, "\t\t\n"); - strb_appends(&ctx->s, "\t\tK rdxV = getInitVal();\n"); + srcbAppends (&ctx->srcGen, "\t\tT rdxT;\n"); + srcbAppends (&ctx->srcGen, "\t\tK rdxK = getInitValKFn();\n"); if (reduxKernelRequiresDstArg(ctx)){ - strb_appends(&ctx->s, "\t\tX argI = 0;\n"); + srcbAppends(&ctx->srcGen, "\t\tX rdxA = 0;\n"); } - strb_appends(&ctx->s, "\t\t\n"); - strb_appends(&ctx->s, "\t\t/**\n"); - strb_appends(&ctx->s, "\t\t * REDUCTION LOOPS.\n"); - strb_appends(&ctx->s, "\t\t */\n"); - strb_appends(&ctx->s, "\t\t\n"); - - /** - * Inner Loop Header Generation - */ + srcbAppends (&ctx->srcGen, "\t\t\n"); for (i=ctx->ndd;inds;i++){ - strb_appendf(&ctx->s, "\t\tFOROVER(%d){ESCAPE(%d)\n", i, i); + srcbAppendf (&ctx->srcGen, "\t\tFOROVER(%d){ESCAPE(%d)\n", i, i); } + srcbAppends (&ctx->srcGen, "\t\t\tS s = srcVal;\n"); + /** - * Inner Loop Body Generation + * Prescalar transformations go here. They transform and coerce the S-typed + * value s into the K-typed value k. */ - appendIdxes (&ctx->s, "\t\t\tK v = loadVal(", "i", 0, ctx->nds, "", ""); - if (ctx->nds > 0){ - strb_appends(&ctx->s, ", "); - } - strb_appends(&ctx->s, "src, srcSteps);\n"); - strb_appends(&ctx->s, "\t\t\t\n"); + srcbAppends (&ctx->srcGen, "\t\t\tK k = s;\n"); + switch (ctx->op){ case GA_REDUCE_SUM: - strb_appends(&ctx->s, "\t\t\trdxV += v;\n"); + srcbAppends(&ctx->srcGen, "\t\t\trdxK += k;\n"); break; case GA_REDUCE_PROD: - strb_appends(&ctx->s, "\t\t\trdxV *= v;\n"); + srcbAppends(&ctx->srcGen, "\t\t\trdxK *= k;\n"); break; case GA_REDUCE_PRODNZ: - strb_appends(&ctx->s, "\t\t\trdxV *= v==0 ? getInitVal() : v;\n"); + srcbAppends(&ctx->srcGen, "\t\t\trdxK *= k==0 ? getInitValKFn() : k;\n"); break; case GA_REDUCE_MIN: - strb_appends(&ctx->s, "\t\t\trdxV = min(rdxV, v);\n"); + srcbAppends(&ctx->srcGen, "\t\t\trdxK = min(rdxK, k);\n"); break; case GA_REDUCE_MAX: - strb_appends(&ctx->s, "\t\t\trdxV = max(rdxV, v);\n"); + srcbAppends(&ctx->srcGen, "\t\t\trdxK = max(rdxK, k);\n"); break; case GA_REDUCE_ARGMIN: case GA_REDUCE_MINANDARGMIN: - strb_appends(&ctx->s, "\t\t\trdxV = min(rdxV, v);\n"); - strb_appends(&ctx->s, "\t\t\tif(v == rdxV){\n"); - appendIdxes (&ctx->s, "\t\t\t\targI = RDXINDEXER(", "i", ctx->ndd, ctx->nds, "", ");\n"); - strb_appends(&ctx->s, "\t\t\t}\n"); + srcbAppends(&ctx->srcGen, "\t\t\trdxK = min(rdxK, k);\n" + "\t\t\tif(rdxK == k){\n" + "\t\t\t\trdxA = rdxIdx;\n" + "\t\t\t}\n"); break; case GA_REDUCE_ARGMAX: case GA_REDUCE_MAXANDARGMAX: - strb_appends(&ctx->s, "\t\t\trdxV = max(rdxV, v);\n"); - strb_appends(&ctx->s, "\t\t\tif(v == rdxV){\n"); - appendIdxes (&ctx->s, "\t\t\t\targI = RDXINDEXER(", "i", ctx->ndd, ctx->nds, "", ");\n"); - strb_appends(&ctx->s, "\t\t\t}\n"); + srcbAppends(&ctx->srcGen, "\t\t\trdxK = max(rdxK, k);\n" + "\t\t\tif(rdxK == k){\n" + "\t\t\t\trdxA = rdxIdx;\n" + "\t\t\t}\n"); break; case GA_REDUCE_AND: - strb_appends(&ctx->s, "\t\t\trdxV &= v;\n"); + srcbAppends(&ctx->srcGen, "\t\t\trdxK &= k;\n"); break; case GA_REDUCE_OR: - strb_appends(&ctx->s, "\t\t\trdxV |= v;\n"); + srcbAppends(&ctx->srcGen, "\t\t\trdxK |= k;\n"); break; case GA_REDUCE_XOR: - strb_appends(&ctx->s, "\t\t\trdxV ^= v;\n"); + srcbAppends(&ctx->srcGen, "\t\t\trdxK ^= k;\n"); break; case GA_REDUCE_ALL: - strb_appends(&ctx->s, "\t\t\trdxV = rdxV && v;\n"); + srcbAppends(&ctx->srcGen, "\t\t\trdxK = rdxK && k;\n"); break; case GA_REDUCE_ANY: - strb_appends(&ctx->s, "\t\t\trdxV = rdxV || v;\n"); + srcbAppends(&ctx->srcGen, "\t\t\trdxK = rdxK || k;\n"); break; } - /** - * 
Inner Loop Trailer Generation - */ - for (i=ctx->ndd;inds;i++){ - strb_appends(&ctx->s, "\t\t}\n"); + srcbAppends(&ctx->srcGen, "\t\t}\n"); } - strb_appends(&ctx->s, "\t\t\n"); + srcbAppends(&ctx->srcGen, "\t\t\n"); /** - * Inner Loop Epilogue Generation + * Large code model: Postscalar transformations go here, coercing the + * K-typed value rdxK to the T-typed value rdxT */ - strb_appends(&ctx->s, "\t\t/**\n"); - strb_appends(&ctx->s, "\t\t * Destination writeback.\n"); - strb_appends(&ctx->s, "\t\t */\n"); - strb_appends(&ctx->s, "\t\t\n"); - if ( reduxKernelRequiresDst (ctx) && !reduxKernelRequiresDstArg(ctx)){ - appendIdxes (&ctx->s, "\t\treduxVal(", "i", 0, ctx->ndd, "", ""); - if (ctx->ndd > 0){ - strb_appends(&ctx->s, ", "); - } - strb_appends(&ctx->s, "dst, dstSteps, rdxV);\n"); - }else if (!reduxKernelRequiresDst (ctx) && reduxKernelRequiresDstArg(ctx)){ - appendIdxes (&ctx->s, "\t\treduxVal(", "i", 0, ctx->ndd, "", ""); - if (ctx->ndd > 0){ - strb_appends(&ctx->s, ", "); - } - strb_appends(&ctx->s, "dstArg, dstArgSteps, argI);\n"); - }else if ( reduxKernelRequiresDst (ctx) && reduxKernelRequiresDstArg(ctx)){ - appendIdxes (&ctx->s, "\t\treduxVal(", "i", 0, ctx->ndd, "", ""); - if (ctx->ndd > 0){ - strb_appends(&ctx->s, ", "); - } - strb_appends(&ctx->s, "dst, dstSteps, rdxV, dstArg, dstArgSteps, argI);\n"); + srcbAppends (&ctx->srcGen, "\t\trdxT = rdxK;\n"); + + /* Final writeback. */ + srcbAppends (&ctx->srcGen, "\t\twriteBackFn("); + srcbBeginList (&ctx->srcGen, ", ", ""); + if (reduxKernelRequiresDst(ctx)){ + srcbAppendElemf(&ctx->srcGen, "&dstVal"); + srcbAppendElemf(&ctx->srcGen, "rdxT"); + } + if (reduxKernelRequiresDstArg(ctx)){ + srcbAppendElemf(&ctx->srcGen, "&dstArgVal"); + srcbAppendElemf(&ctx->srcGen, "rdxA"); + } + srcbEndList (&ctx->srcGen); + srcbAppends (&ctx->srcGen, ");\n"); + + for (i=0;indd;i++){ + srcbAppends(&ctx->srcGen, "\t}\n"); } } -static void reduxAppendLoopMacroUndefs (redux_ctx* ctx){ - strb_appends(&ctx->s, "#undef FOROVER\n"); - strb_appends(&ctx->s, "#undef ESCAPE\n"); - strb_appends(&ctx->s, "#undef RDXINDEXER\n"); +static void reduxAppendInitKernel (redux_ctx* ctx){ + /* BUG: Implement this for small code model. */ +} +static void reduxAppendPostKernel (redux_ctx* ctx){ + /* BUG: Implement this for small code model. */ } /** @@ -1713,43 +1698,43 @@ static int reduxCompile (redux_ctx* ctx){ size_t PRI_TYPECODES_LEN; int* AUX_TYPECODES; size_t AUX_TYPECODES_LEN; - - + + /** * Construct Argument Typecode Lists. */ - - PRI_TYPECODES[i++] = GA_BUFFER; /* src */ + + PRI_TYPECODES[i++] = GA_BUFFER; /* srcPtr */ PRI_TYPECODES[i++] = GA_SIZE; /* srcOff */ PRI_TYPECODES[i++] = GA_BUFFER; /* srcSteps */ PRI_TYPECODES[i++] = GA_BUFFER; /* srcSize */ PRI_TYPECODES[i++] = GA_BUFFER; /* chnkSize */ - if(reduxKernelRequiresDst(ctx)){ - PRI_TYPECODES[i++] = GA_BUFFER; /* dst */ + if (reduxKernelRequiresDst(ctx)){ + PRI_TYPECODES[i++] = GA_BUFFER; /* dstPtr */ PRI_TYPECODES[i++] = GA_SIZE; /* dstOff */ PRI_TYPECODES[i++] = GA_BUFFER; /* dstSteps */ } - if(reduxKernelRequiresDstArg(ctx)){ - PRI_TYPECODES[i++] = GA_BUFFER; /* dstArg */ + if (reduxKernelRequiresDstArg(ctx)){ + PRI_TYPECODES[i++] = GA_BUFFER; /* dstArgPtr */ PRI_TYPECODES[i++] = GA_SIZE; /* dstArgOff */ PRI_TYPECODES[i++] = GA_BUFFER; /* dstArgSteps */ } PRI_TYPECODES_LEN = i; AUX_TYPECODES = &PRI_TYPECODES[3]; AUX_TYPECODES_LEN = PRI_TYPECODES_LEN-3; - - + + /** * Compile the kernels. 
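 *
 * (For illustration, a max-and-argmax reduction needs both dst and dstArg,
 *  so the primary typecode list assembled above is, in order:
 *      GA_BUFFER, GA_SIZE, GA_BUFFER,    srcPtr/srcOff/srcSteps
 *      GA_BUFFER, GA_BUFFER,             srcSize/chunkSize
 *      GA_BUFFER, GA_SIZE, GA_BUFFER,    dstPtr/dstOff/dstSteps
 *      GA_BUFFER, GA_SIZE, GA_BUFFER     dstArgPtr/dstArgOff/dstArgSteps
 *  for a total of 11 arguments, while AUX_TYPECODES aliases the same list
 *  starting at srcSize.)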
*/ - + { ret = GpuKernel_init(&ctx->kernel, ctx->gpuCtx, 1, (const char**)&ctx->sourceCode, &ctx->sourceCodeLen, - "redux", + "reduxKer", PRI_TYPECODES_LEN, PRI_TYPECODES, GA_USE_CLUDA, @@ -1758,13 +1743,13 @@ static int reduxCompile (redux_ctx* ctx){ return reduxCleanup(ctx, ret); } } - if(reduxIsSmallCodeModel(ctx)){ + if (reduxIsSmallCodeModel(ctx)){ ret = GpuKernel_init(&ctx->kernel, ctx->gpuCtx, 1, (const char**)&ctx->sourceCode, &ctx->sourceCodeLen, - "preRedux", + "initKer", AUX_TYPECODES_LEN, AUX_TYPECODES, GA_USE_CLUDA, @@ -1777,7 +1762,7 @@ static int reduxCompile (redux_ctx* ctx){ 1, (const char**)&ctx->sourceCode, &ctx->sourceCodeLen, - "postRedux", + "postKer", AUX_TYPECODES_LEN, AUX_TYPECODES, GA_USE_CLUDA, @@ -1810,8 +1795,8 @@ static int reduxSchedule (redux_ctx* ctx){ size_t warpSize, maxL, maxL0, maxL1, maxL2, maxG, maxG0, maxG1, maxG2; - - + + /** * Obtain the constraints of our problem. */ @@ -1827,14 +1812,14 @@ static int reduxSchedule (redux_ctx* ctx){ gpukernel_property(ctx->kernel.k, GA_KERNEL_PROP_MAXLSIZE, &maxL); maxLgRdx = maxL; maxLgPri = maxLgRdx; - if(reduxIsSmallCodeModel(ctx)){ + if (reduxIsSmallCodeModel(ctx)){ gpukernel_property(ctx->preKernel.k, GA_KERNEL_PROP_MAXLSIZE, &maxL); maxLgPre = maxL; gpukernel_property(ctx->postKernel.k, GA_KERNEL_PROP_MAXLSIZE, &maxL); maxLgPost = maxL; maxLgAux = maxLgPrepri.ndh; maxGs[0] = maxG0; maxGs[1] = maxG1; @@ -1846,18 +1831,18 @@ static int reduxSchedule (redux_ctx* ctx){ for (i=0;isrc->dimensions[ctx->pri.axisList[i]]; } - if(reduxIsSmallCodeModel(ctx)){ + if (reduxIsSmallCodeModel(ctx)){ auxNdims = ctx->aux.ndh; for (i=0;isrc->dimensions[ctx->aux.axisList[i]]; } } - - + + /** * Apply the solver. */ - + { reduxScheduleKernel(priNdims, priDims, @@ -1890,14 +1875,14 @@ static int reduxSchedule (redux_ctx* ctx){ ctx->aux.bs[i] = ctx->aux.gs[i] = ctx->aux.cs[i] = 1; } } - + return reduxInvoke(ctx); } /** * @brief Given the parameters of a kernel scheduling problem, solve it as * optimally as possible. - * + * * NB: This is the only function in this entire file that should have * anything to do with the integer factorization APIs. */ @@ -1919,20 +1904,20 @@ static void reduxScheduleKernel (int ndims, ga_factor_list factBS [MAX_HW_DIMS]; ga_factor_list factGS [MAX_HW_DIMS]; ga_factor_list factCS [MAX_HW_DIMS]; - - + + /** * Quick check for scalar case. */ - + if (ndims <= 0){ return; } - - + + /** * Identify the dimension to which the warp factor will be given. - * + * * The current heuristic is to find the dimension that is either * 1) Evenly divided by the warp size, or * 2) As close to filling the last warp as possible. 
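A minimal sketch of that warp-axis heuristic, assuming only that dims[] holds
the candidate hardware-axis lengths (ndims >= 1, as guaranteed by the scalar
check above) and that warpSize is the device warp width; the tie-breaking
details here are illustrative guesses rather than lifted from this patch:

    #include <stdint.h>

    /* Return the index of the axis that should receive the warp factor:
     * prefer an axis evenly divided by warpSize; failing that, pick the
     * axis whose final, partial warp would be the fullest. */
    static int pickWarpAxis(int ndims, const uint64_t* dims, uint64_t warpSize){
        int      i, best = 0;
        uint64_t bestRem = dims[0] % warpSize;   /* 0 == evenly divided */

        for (i=1;i<ndims;i++){
            uint64_t rem = dims[i] % warpSize;
            if ((rem == 0 && bestRem != 0) ||                 /* Criterion 1 */
                (rem != 0 && bestRem != 0 && rem > bestRem)){ /* Criterion 2 */
                best    = i;
                bestRem = rem;
            }
        }
        return best;
    }

Presumably the chosen axis is then the one whose block-size factor list is
given the warpSize factor before the remaining capacity is distributed; the
exact bookkeeping lives in reduxScheduleKernel and the integer-factoring
helpers.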
@@ -2017,7 +2002,7 @@ static int reduxInvoke (redux_ctx* ctx){ ctx->src->dimensions, flags, 0); ctx->pri.chunkSizeGD = gpudata_alloc(ctx->gpuCtx, ctx->pri.ndh * sizeof(size_t), ctx->pri.cs, flags, 0); - + priArgs[i++] = (void*) ctx->src->data; priArgs[i++] = (void*)&ctx->src->offset; priArgs[i++] = (void*) ctx->srcStepsGD; @@ -2025,21 +2010,21 @@ static int reduxInvoke (redux_ctx* ctx){ priArgs[i++] = (void*) ctx->pri.chunkSizeGD; if (reduxKernelRequiresDst (ctx)){ ctx->dstStepsGD = gpudata_alloc(ctx->gpuCtx, ctx->ndd * sizeof(size_t), - ctx->dst->strides, flags, 0); - priArgs[i++] = (void*) ctx->dst->data; - priArgs[i++] = (void*)&ctx->dst->offset; + ctx->wsDst->strides, flags, 0); + priArgs[i++] = (void*) ctx->wsDst->data; + priArgs[i++] = (void*)&ctx->wsDst->offset; priArgs[i++] = (void*) ctx->dstStepsGD; failedDstSteps = !ctx->dstStepsGD; } if (reduxKernelRequiresDstArg(ctx)){ ctx->dstArgStepsGD = gpudata_alloc(ctx->gpuCtx, ctx->ndd * sizeof(size_t), - ctx->dstArg->strides, flags, 0); - priArgs[i++] = (void*) ctx->dstArg->data; - priArgs[i++] = (void*)&ctx->dstArg->offset; + ctx->wsDstArg->strides, flags, 0); + priArgs[i++] = (void*) ctx->wsDstArg->data; + priArgs[i++] = (void*)&ctx->wsDstArg->offset; priArgs[i++] = (void*) ctx->dstArgStepsGD; failedDstArgSteps = !ctx->dstArgStepsGD; } - if (reduxIsSmallCodeModel(ctx)){ + if (reduxIsSmallCodeModel (ctx)){ /** * The auxiliary kernel's args are identical to the primary kernel's, * except that the first three arguments are deleted and the fifth @@ -2065,7 +2050,7 @@ static int reduxInvoke (redux_ctx* ctx){ !failedDstArgSteps && !failedAuxChunkSize){ /* Pre-kernel invocation, if necessary */ - if(reduxIsSmallCodeModel(ctx)){ + if (reduxIsSmallCodeModel(ctx)){ ret = GpuKernel_call(&ctx->preKernel, ctx->aux.ndh>0 ? ctx->aux.ndh : 1, ctx->aux.gs, @@ -2089,7 +2074,7 @@ static int reduxInvoke (redux_ctx* ctx){ } /* Post-kernel invocation, if necessary */ - if(reduxIsSmallCodeModel(ctx)){ + if (reduxIsSmallCodeModel(ctx)){ ret = GpuKernel_call(&ctx->postKernel, ctx->aux.ndh>0 ? ctx->aux.ndh : 1, ctx->aux.gs, @@ -2112,14 +2097,25 @@ static int reduxInvoke (redux_ctx* ctx){ */ static int reduxCleanup (redux_ctx* ctx, int ret){ + if (ctx->dst != ctx->wsDst){ + GpuArray_clear(ctx->wsDst); + free(ctx->wsDst); + ctx->wsDst = NULL; + } + if (ctx->dstArg != ctx->wsDstArg){ + GpuArray_clear(ctx->wsDstArg); + free(ctx->wsDstArg); + ctx->wsDstArg = NULL; + } + free(ctx->srcAxisList); - free(ctx->dstAxisList); + free(ctx->dstDims); free(ctx->sourceCode); free(ctx->errorString0); free(ctx->errorString1); free(ctx->errorString2); ctx->srcAxisList = NULL; - ctx->dstAxisList = NULL; + ctx->dstDims = NULL; ctx->sourceCode = NULL; ctx->errorString0 = NULL; ctx->errorString1 = NULL; diff --git a/src/util/srcgen.h b/src/util/srcgen.h new file mode 100644 index 0000000000..c577b47c72 --- /dev/null +++ b/src/util/srcgen.h @@ -0,0 +1,106 @@ +/* Include Guards */ +#ifndef SRCGEN_H +#define SRCGEN_H + + +/* Includes */ +#include "util/strb.h" + + +/* Extern "C" Guard */ +#ifdef __cplusplus +extern "C" { +#endif +#ifdef CONFUSE_EMACS +} +#endif + + + +/* Data Structure Prototypes & Typedefs */ +struct srcb; +typedef struct srcb srcb; + + + +/* Enumerations */ +enum srcb_state{ + SRCB_STATE_NONE, + SRCB_STATE_INLIST, +}; +typedef enum srcb_state srcb_state; + + + +/* Data Structures */ + +/** + * @brief The srcb struct + * + * The Source Code Buffer. Augments strb with C-like language generation tools. 
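 *
 * Illustrative usage (not taken from this file), emitting an argument list:
 *
 *     strb sb; strb_init(&sb);
 *     srcb sg; srcbInit(&sg, &sb);
 *     srcbAppends    (&sg, "KERNEL void foo(");
 *     srcbBeginList  (&sg, ", ", "void");
 *     srcbAppendElemf(&sg, "GLOBAL_MEM %s* x", "float");
 *     srcbAppendElemf(&sg, "const X xOff");
 *     srcbEndList    (&sg);
 *     srcbAppends    (&sg, ")");
 *
 * leaves "KERNEL void foo(GLOBAL_MEM float* x, const X xOff)" in sb; had no
 * element been appended between srcbBeginList() and srcbEndList(), the
 * "empty" string ("void") would have been emitted in place of the list.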
+ */ + +struct srcb{ + strb* s; + srcb_state state; + int numElems; + const char* sep; + const char* empty; +}; + + + +/* Functions */ +static inline void srcbInit (srcb* s, strb* sb){ + s->s = sb; + s->state = SRCB_STATE_NONE; + s->numElems = 0; +} +static inline void srcbBeginList(srcb* s, const char* sep, const char* empty){ + s->state = SRCB_STATE_INLIST; + s->numElems = 0; + s->sep = sep; + s->empty = empty; +} +static inline void srcbEndList(srcb* s){ + if(s->numElems == 0){ + strb_appends(s->s, s->empty); + } + + s->state = SRCB_STATE_NONE; + s->numElems = 0; + s->sep = ""; + s->empty = ""; +} +static inline void srcbAppendElemv(srcb* s, const char *f, va_list ap){ + if(s->numElems > 0){ + strb_appends(s->s, s->sep); + } + + strb_appendv(s->s, f, ap); + + s->numElems++; +} +static inline void srcbAppendElemf(srcb* s, const char *f, ...){ + va_list ap; + va_start(ap, f); + srcbAppendElemv(s, f, ap); + va_end(ap); +} +static inline void srcbAppends(srcb* s, const char *f){ + strb_appends(s->s, f); +} +static inline void srcbAppendf(srcb* s, const char *f, ...){ + va_list ap; + va_start(ap, f); + strb_appendv(s->s, f, ap); + va_end(ap); +} + + +/* End Extern "C" Guard */ +#ifdef __cplusplus +} +#endif + +#endif From 8fe9083490e2f65e8e7f25cd1ba72d290879f50c Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Sun, 5 Mar 2017 01:24:21 -0500 Subject: [PATCH 09/34] Added testcases for all reductions. All tests pass, but currently the codegen is locked to the large code model (the small code model has most of the groundwork laid down but has several extra complexities which haven't yet been implemented, like atomic reduction operators. --- tests/check_reduction.c | 3120 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 3086 insertions(+), 34 deletions(-) diff --git a/tests/check_reduction.c b/tests/check_reduction.c index 2d47d6541d..370f074167 100644 --- a/tests/check_reduction.c +++ b/tests/check_reduction.c @@ -23,7 +23,7 @@ void teardown(void); /** * PRNG based on PCG XSH RR 64/32 (LCG) - * + * * Used to generate random data for the kernel tests. */ @@ -44,15 +44,15 @@ static void pcgSeed (uint64_t seed){ } static uint32_t pcgRand (void){ pcgS = pcgS*pcgM + pcgA; - + /** * PCG does something akin to an unbalanced Feistel round to blind the LCG * state: - * + * * The rightmost 59 bits are involved in an xorshift by 18. * The leftmost 5 bits select a rotation of the 32 bits 58:27. */ - + return pcgRor32((pcgS^(pcgS>>18))>>27, pcgS>>59); } static double pcgRand01(void){ @@ -522,6 +522,107 @@ START_TEST(test_minandargmin_reduction){ GpuArray_clear(&gaArgmin); }END_TEST +START_TEST(test_minandargmin_veryhighrank){ + pcgSeed(1); + + /** + * Here we test a reduction of a random 8D tensor on four dimensions. + */ + + size_t i,j,k,l,m,n,o,p; + size_t dims [8] = {1171,373,2,1,2,1,2,1}; + size_t prodDims = dims[0]*dims[1]*dims[2]*dims[3]*dims[4]*dims[5]*dims[6]*dims[7]; + size_t rdxDims[4] = {1171,373,1,2}; + size_t rdxProdDims = rdxDims[0]*rdxDims[1]*rdxDims[2]*rdxDims[3]; + const unsigned reduxList[] = {2,4,7,5}; + + float* pSrc = calloc(1, sizeof(*pSrc) * prodDims); + float* pMin = calloc(1, sizeof(*pMin) * rdxProdDims); + size_t* pArgmin = calloc(1, sizeof(*pArgmin) * rdxProdDims); + + ck_assert_ptr_ne(pSrc, NULL); + ck_assert_ptr_ne(pMin, NULL); + ck_assert_ptr_ne(pArgmin, NULL); + + + /** + * Initialize source data. 
+ */ + + for(i=0;i gtMax){ + gtMax = v; + gtArgmax = i*dims[2] + k; + } + } + } + + ck_assert_msg(gtArgmax == pArgmax[j], "Argmax value mismatch!"); + } + + /** + * Deallocate. + */ + + free(pSrc); + free(pMax); + free(pArgmax); + GpuArray_clear(&gaSrc); + GpuArray_clear(&gaArgmax); +}END_TEST + +START_TEST(test_argmax_veryhighrank){ + pcgSeed(1); + + /** + * Here we test a reduction of a random 8D tensor on four dimensions. + */ + + size_t i,j,k,l,m,n,o,p; + size_t dims [8] = {1171,373,2,1,2,1,2,1}; + size_t prodDims = dims[0]*dims[1]*dims[2]*dims[3]*dims[4]*dims[5]*dims[6]*dims[7]; + size_t rdxDims[4] = {1171,373,1,2}; + size_t rdxProdDims = rdxDims[0]*rdxDims[1]*rdxDims[2]*rdxDims[3]; + const unsigned reduxList[] = {2,4,7,5}; + + float* pSrc = calloc(1, sizeof(*pSrc) * prodDims); + float* pMax = calloc(1, sizeof(*pMax) * rdxProdDims); + size_t* pArgmax = calloc(1, sizeof(*pArgmax) * rdxProdDims); + + ck_assert_ptr_ne(pSrc, NULL); + ck_assert_ptr_ne(pArgmax, NULL); + + + /** + * Initialize source data. + */ + + for(i=0;i gtMax){ + gtMax = v; + gtArgmax = (((k)*dims[4] + m)*dims[7] + p)*dims[5] + n; + } + } + } + } + } + + size_t dstIdx = (((i)*dims[1] + j)*dims[3] + l)*dims[6] + o; + ck_assert_msg(gtArgmax == pArgmax[dstIdx], "Argmax value mismatch!"); } } } } - ck_assert_msg(gtMin == pMin[0], "Min value mismatch!"); /** * Deallocate. */ free(pSrc); - free(pMin); + free(pMax); + free(pArgmax); GpuArray_clear(&gaSrc); - GpuArray_clear(&gaMin); + GpuArray_clear(&gaArgmax); }END_TEST +START_TEST(test_argmax_alldimsreduced){ + pcgSeed(1); -Suite *get_suite(void) { - Suite *s = suite_create("reduction"); - TCase *tc = tcase_create("basic"); - tcase_add_checked_fixture(tc, setup, teardown); - tcase_set_timeout(tc, 15.0); + /** + * We test here a reduction of some random 3D tensor on all dimensions. + */ - tcase_add_test(tc, test_maxandargmax_reduction); - tcase_add_test(tc, test_maxandargmax_idxtranspose); - tcase_add_test(tc, test_maxandargmax_veryhighrank); - tcase_add_test(tc, test_maxandargmax_alldimsreduced); - tcase_add_test(tc, test_minandargmin_reduction); - tcase_add_test(tc, test_minandargmin_alldimsreduced); - tcase_add_test(tc, test_min_alldimsreduced); + size_t i,j,k; + size_t dims[3] = {32,50,79}; + size_t prodDims = dims[0]*dims[1]*dims[2]; + const unsigned reduxList[] = {0,1,2}; + + float* pSrc = calloc(1, sizeof(*pSrc) * dims[0]*dims[1]*dims[2]); + float* pMax = calloc(1, sizeof(*pMax) ); + size_t* pArgmax = calloc(1, sizeof(*pArgmax) ); + + ck_assert_ptr_ne(pSrc, NULL); + ck_assert_ptr_ne(pMax, NULL); + ck_assert_ptr_ne(pArgmax, NULL); + + + /** + * Initialize source data. + */ + + for(i=0;i gtMax){ + gtMax = v; + gtArgmax = (i*dims[1] + j)*dims[2] + k; + } + } + } + } + + ck_assert_msg(gtArgmax == pArgmax[0], "Argmax value mismatch!"); + + /** + * Deallocate. + */ + + free(pSrc); + free(pMax); + free(pArgmax); + GpuArray_clear(&gaSrc); + GpuArray_clear(&gaArgmax); +}END_TEST + +START_TEST(test_argmin_reduction){ + pcgSeed(1); + + /** + * We test here a reduction of some random 3D tensor on the first and + * third dimensions. + */ + + size_t i,j,k; + size_t dims[3] = {32,50,79}; + size_t prodDims = dims[0]*dims[1]*dims[2]; + const unsigned reduxList[] = {0,2}; + + float* pSrc = calloc(1, sizeof(*pSrc) * dims[0]*dims[1]*dims[2]); + float* pMin = calloc(1, sizeof(*pMin) * dims[1] ); + size_t* pArgmin = calloc(1, sizeof(*pArgmin) * dims[1] ); + + ck_assert_ptr_ne(pSrc, NULL); + ck_assert_ptr_ne(pMin, NULL); + ck_assert_ptr_ne(pArgmin, NULL); + + + /** + * Initialize source data. 
+ */ + + for(i=0;i gtMax){ + gtMax = v; + } + } + } + + ck_assert_msg(gtMax == pMax[j], "Max value mismatch!"); + } + + /** + * Deallocate. + */ + + free(pSrc); + free(pMax); + GpuArray_clear(&gaSrc); + GpuArray_clear(&gaMax); +}END_TEST + +START_TEST(test_max_veryhighrank){ + pcgSeed(1); + + /** + * Here we test a reduction of a random 8D tensor on four dimensions. + */ + + size_t i,j,k,l,m,n,o,p; + size_t dims [8] = {1171,373,2,1,2,1,2,1}; + size_t prodDims = dims[0]*dims[1]*dims[2]*dims[3]*dims[4]*dims[5]*dims[6]*dims[7]; + size_t rdxDims[4] = {1171,373,1,2}; + size_t rdxProdDims = rdxDims[0]*rdxDims[1]*rdxDims[2]*rdxDims[3]; + const unsigned reduxList[] = {2,4,7,5}; + + float* pSrc = calloc(1, sizeof(*pSrc) * prodDims); + float* pMax = calloc(1, sizeof(*pMax) * rdxProdDims); + + ck_assert_ptr_ne(pSrc, NULL); + ck_assert_ptr_ne(pMax, NULL); + + + /** + * Initialize source data. + */ + + for(i=0;i gtMax){ + gtMax = v; + } + } + } + } + } + + size_t dstIdx = (((i)*dims[1] + j)*dims[3] + l)*dims[6] + o; + ck_assert_msg(gtMax == pMax[dstIdx], "Max value mismatch!"); + } + } + } + } + + + /** + * Deallocate. + */ + + free(pSrc); + free(pMax); + GpuArray_clear(&gaSrc); + GpuArray_clear(&gaMax); +}END_TEST + +START_TEST(test_max_alldimsreduced){ + pcgSeed(1); + + /** + * We test here a reduction of some random 3D tensor on all dimensions. + */ + + size_t i,j,k; + size_t dims[3] = {32,50,79}; + size_t prodDims = dims[0]*dims[1]*dims[2]; + const unsigned reduxList[] = {0,1,2}; + + float* pSrc = calloc(1, sizeof(*pSrc) * dims[0]*dims[1]*dims[2]); + float* pMax = calloc(1, sizeof(*pMax) ); + + ck_assert_ptr_ne(pSrc, NULL); + ck_assert_ptr_ne(pMax, NULL); + + + /** + * axitialize source data. + */ + + for(i=0;i gtMax){ + gtMax = v; + } + } + } + } + + ck_assert_msg(gtMax == pMax[0], "Max value mismatch!"); + + /** + * Deallocate. + */ + + free(pSrc); + free(pMax); + GpuArray_clear(&gaSrc); + GpuArray_clear(&gaMax); +}END_TEST + +START_TEST(test_min_reduction){ + pcgSeed(1); + + /** + * We test here a reduction of some random 3D tensor on all dimensions. + */ + + size_t i,j,k; + size_t dims[3] = {32,50,79}; + size_t prodDims = dims[0]*dims[1]*dims[2]; + const unsigned reduxList[] = {0,2}; + + float* pSrc = calloc(1, sizeof(*pSrc) * dims[0]*dims[1]*dims[2]); + float* pMin = calloc(1, sizeof(*pMin) * dims[1] ); + + ck_assert_ptr_ne(pSrc, NULL); + ck_assert_ptr_ne(pMin, NULL); + + + /** + * Initialize source data. + */ + + for(i=0;i 0.05; + } + + + /** + * Run the kernel. + */ + + GpuArray gaS; + GpuArray gaD; + + ga_assert_ok(GpuArray_empty (&gaS, ctx, GA_UINT, 3, &dims[0], GA_C_ORDER)); + ga_assert_ok(GpuArray_empty (&gaD, ctx, GA_UINT, 1, &dims[1], GA_C_ORDER)); + + ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); + ga_assert_ok(GpuArray_memset(&gaD, -1)); + + ga_assert_ok(GpuArray_all (&gaD, &gaS, 2, reduxList)); + + ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*dims[1], &gaD)); + + + /** + * Check that the destination tensors are correct. + */ + + for(j=0;j 0.05; + } + + + /** + * Run the kernel. + */ + + GpuArray gaS; + GpuArray gaD; + + ga_assert_ok(GpuArray_empty (&gaS, ctx, GA_UINT, 8, dims, GA_C_ORDER)); + ga_assert_ok(GpuArray_empty (&gaD, ctx, GA_UINT, 4, rdxDims, GA_C_ORDER)); + + ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); + ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. 
*/ + + ga_assert_ok(GpuArray_all (&gaD, &gaS, 4, reduxList)); + + ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*rdxProdDims, &gaD)); + + + /** + * Check that the destination tensors are correct. + */ + + for(i=0;i 0.05; + } + + + /** + * Run the kernel. + */ + + GpuArray gaS; + GpuArray gaD; + + ga_assert_ok(GpuArray_empty (&gaS, ctx, GA_UINT, 3, &dims[0], GA_C_ORDER)); + ga_assert_ok(GpuArray_empty (&gaD, ctx, GA_UINT, 0, NULL, GA_C_ORDER)); + + ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); + ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ + + ga_assert_ok(GpuArray_all (&gaD, &gaS, 3, reduxList)); + + ga_assert_ok(GpuArray_read (pD, sizeof(*pD), &gaD)); + + + /** + * Check that the destination tensors are correct. + */ + + uint32_t gtD = 1; + + for(i=0;i Date: Sun, 5 Mar 2017 01:43:37 -0500 Subject: [PATCH 10/34] Muzzle incorrect GCC maybe-uninitialized diagnostic. Clang and MSVC correctly recognize that all paths to the allegedly- uninitialized variables are, in fact, dominated by their initialization. --- src/gpuarray_reduction.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/gpuarray_reduction.c b/src/gpuarray_reduction.c index 61f1688a4f..5bab3a65f7 100644 --- a/src/gpuarray_reduction.c +++ b/src/gpuarray_reduction.c @@ -1781,9 +1781,9 @@ static int reduxCompile (redux_ctx* ctx){ */ static int reduxSchedule (redux_ctx* ctx){ - int i, priNdims, auxNdims; - uint64_t maxLgRdx, maxLgPre, maxLgPost; - uint64_t maxLgPri, maxLgAux; + int i, priNdims = 0, auxNdims = 0; + uint64_t maxLgRdx = 0, maxLgPre = 0, maxLgPost = 0; + uint64_t maxLgPri = 0, maxLgAux = 0; uint64_t maxLs [MAX_HW_DIMS]; uint64_t maxGg; uint64_t maxGs [MAX_HW_DIMS]; From 19bd9390937e394b2fa264ea3a37e05aa981bd1c Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Mon, 15 May 2017 14:43:26 -0400 Subject: [PATCH 11/34] Current State --- src/gpuarray/reduction.h | 91 +- src/gpuarray_reduction.c | 1829 ++++++++++++++++++++++++++------------ tests/check_reduction.c | 86 +- 3 files changed, 1310 insertions(+), 696 deletions(-) diff --git a/src/gpuarray/reduction.h b/src/gpuarray/reduction.h index 1db5664535..f6638c9a83 100644 --- a/src/gpuarray/reduction.h +++ b/src/gpuarray/reduction.h @@ -26,28 +26,26 @@ extern "C" { */ typedef enum _ga_reduce_op { - GA_REDUCE_SUM, /* + */ - GA_REDUCE_PROD, /* * */ - GA_REDUCE_PRODNZ, /* * (!=0) */ - GA_REDUCE_MIN, /* min() */ - GA_REDUCE_MAX, /* max() */ - GA_REDUCE_ARGMIN, /* argmin() */ - GA_REDUCE_ARGMAX, /* argmax() */ - GA_REDUCE_MINANDARGMIN, /* min(), argmin() */ - GA_REDUCE_MAXANDARGMAX, /* max(), argmax() */ - GA_REDUCE_AND, /* & */ - GA_REDUCE_OR, /* | */ - GA_REDUCE_XOR, /* ^ */ - GA_REDUCE_ALL, /* &&/all() */ - GA_REDUCE_ANY, /* ||/any() */ + GA_REDUCE_SUM, /* + */ + GA_REDUCE_PROD, /* * */ + GA_REDUCE_PRODNZ, /* * (!=0) */ + GA_REDUCE_MIN, /* min() */ + GA_REDUCE_MAX, /* max() */ + GA_REDUCE_ARGMIN, /* argmin() */ + GA_REDUCE_ARGMAX, /* argmax() */ + GA_REDUCE_MINANDARGMIN, /* min(), argmin() */ + GA_REDUCE_MAXANDARGMAX, /* max(), argmax() */ + GA_REDUCE_AND, /* & */ + GA_REDUCE_OR, /* | */ + GA_REDUCE_XOR, /* ^ */ + GA_REDUCE_ALL, /* &&/all() */ + GA_REDUCE_ANY, /* ||/any() */ } ga_reduce_op; /** - * @brief Compute a reduction sum (+), product (*), non-zero product (* != 0), - * min, max, argmin, argmax, min-and-argmin, max-and-argmax, and (&), - * or (|), xor (^), all (&&) or any (||) over a list of axes to reduce. + * @brief Compute a reduction over a list of axes to reduce. 
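 *
 * For illustration, a max-and-argmax over axes 0 and 2 of a 3-dimensional
 * source would be requested roughly as follows (error handling omitted):
 *
 *     const unsigned reduxList[] = {0, 2};
 *     GpuArray_reduction(GA_REDUCE_MAXANDARGMAX, &gaMax, &gaArgmax,
 *                        &gaSrc, 2, reduxList);
 *
 * where gaMax and gaArgmax have been allocated with the source's shape minus
 * the reduced axes (here, a 1-D tensor of length src.dimensions[1]).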
* * Returns one (in the case of min-and-argmin/max-and-argmax, two) destination * tensors. The destination tensor(s)' axes are a strict subset of the axes of the @@ -55,6 +53,7 @@ typedef enum _ga_reduce_op { * reduction is performed over these axes, which are then removed in the * destination. * + * @param [in] op The reduction operation to perform. * @param [out] dst The destination tensor. Has the same type as the source. * @param [out] dstArg For argument of minima/maxima operations. Has type int64. * @param [in] src The source tensor. @@ -81,64 +80,6 @@ typedef enum _ga_reduce_op { * code otherwise. */ -GPUARRAY_PUBLIC int GpuArray_sum (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); -GPUARRAY_PUBLIC int GpuArray_prod (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); -GPUARRAY_PUBLIC int GpuArray_prodnz (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); -GPUARRAY_PUBLIC int GpuArray_min (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); -GPUARRAY_PUBLIC int GpuArray_max (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); -GPUARRAY_PUBLIC int GpuArray_argmin (GpuArray* dstArg, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); -GPUARRAY_PUBLIC int GpuArray_argmax (GpuArray* dstArg, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); -GPUARRAY_PUBLIC int GpuArray_minandargmin(GpuArray* dst, - GpuArray* dstArg, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); -GPUARRAY_PUBLIC int GpuArray_maxandargmax(GpuArray* dst, - GpuArray* dstArg, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); -GPUARRAY_PUBLIC int GpuArray_and (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); -GPUARRAY_PUBLIC int GpuArray_or (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); -GPUARRAY_PUBLIC int GpuArray_xor (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); -GPUARRAY_PUBLIC int GpuArray_all (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); -GPUARRAY_PUBLIC int GpuArray_any (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); GPUARRAY_PUBLIC int GpuArray_reduction (ga_reduce_op op, GpuArray* dst, GpuArray* dstArg, diff --git a/src/gpuarray_reduction.c b/src/gpuarray_reduction.c index 5bab3a65f7..072f1e2685 100644 --- a/src/gpuarray_reduction.c +++ b/src/gpuarray_reduction.c @@ -6,6 +6,7 @@ #include #include #include +#include #include "gpuarray/config.h" #include #include @@ -33,47 +34,96 @@ /* Datatypes */ +/** + * @brief Axis Description. + */ + +struct axis_desc{ + int reduxNum; + unsigned isReduced : 1; + unsigned isHW : 1; + unsigned isSW : 1; + size_t warpLen; + size_t len; + ssize_t srcStride, srcOffset; + ssize_t dstStride, dstOffset; + ssize_t dstArgStride, dstArgOffset; + ssize_t tmpDstStride, tmpDstOffset; + ssize_t tmpDstArgStride, tmpDstArgOffset; +}; +typedef struct axis_desc axis_desc; + /** * Reduction Kernel Generator. - * - * The generator produces a kernel from one of two "code models": - * - Large - * - Small - * Which one is used depends on the size of the destination tensor and the - * number of reductions for each destination element. 
A destination tensor - * with more than SMALL_REDUX_THRESHOLD elements or more elements than - * reductions for each element will result in use of the large code model; - * Otherwise the small code model is used. - * - * - * LARGE CODE MODEL: - * - * In the large code model, each destination element is processed by a - * single thread. - * - * Each thread begins with an initial value in a register, reads from all - * source elements contributing to the reduction, computes the result and - * writes it to the destination element. - * - * A single kernel is generated that performs prescalar transformations, the - * reduction itself, postscalar transformations and the write to global memory. - * - * - * SMALL CODE MODEL: - * - * In the small code model, each destination element is processed by - * multiple threads. - * - * The destination tensor is first initialized with the initial value. Then, - * one several threads cooperate to perform the reduction atomically on each - * destination element. Lastly, postscalar transformations are applied - * in-place. - * - * Two or three kernels are generated: The initialization kernel, the main - * kernel that performs prescalar transformations and the reduction itself, and - * possibly also a postscalar transformation kernel when it is required. - * - * + * + * INTRO + * + * Generates the source code for a reduction kernel over arbitrarily-dimensioned, + * -shaped and -typed tensors. + * + * + * GOALS + * + * The generator has the following goals: + * + * 1. Maximizing the use of coalesced memory loads within a warp. + * 2. Maximizing the # of useful threads within a warp. + * 3. Maximizing the number of warps within a block. + * + * NOTE: It is possible to guarantee for any tensor problem of at least + * 2*WARP_SIZE in scale that either + * 1. All warp blocks in the X dimension have more than 50% threads + * active 100% of the time, or + * 2. The warp blocks in the X dimension have 100% threads active more + * than 50% of the time. + * + * 4. Ensuring there are no more blocks than are permitted by the warp + * configuration and 2nd-stage workspace size (if required). + * 5. Ensuring there are no more than 5 blocks per multiprocessor. + * 6. Minimizing the 2nd-stage workspace (if it is required). + * 7. Striding the 2nd-stage workspace for maximum convenience (if it is + * required). Make it contiguous. + * + * + * NOTES + * + * Information elements required to perform reduction. + * + * 1. Ndim, shape and dtype of src tensor + * 2. Ndim, shape and dtype of dst/dstArg tensors + * 3. GPU context + * 4. Number of processors + * 5. Warp size + * 6. Maximum size of block + * 7. Maximum size of block dimension X, Y, Z + * 8. Maximum size of grid + * 9. Maximum size of grid dimension X, Y, Z + * 10. Dtype and initializer of accumulator + * 11. Sorted src axes for contiguous memory accesses + * 12. Ndim, shape and dtype of flattened src tensor + * 13. Number of stages (1 or 2) + * 14. Ndim, shape and dtype of workspace tensor + * 15. Warp axes + * 16. Hardware axes + * 17. Software axes + * 18. Source code + * + * Rationale for dependencies: + * + * 1) Get the GPU context and its properties immediately, since an invalid + * context is a likely error and we want to fail fast. + * 2) The type and initializer of the accumulator should be determined after + * the context's properties have been retrieved since they provide + * information about the device's natively-supported types and operations. 
+ * + * REFERENCES + * + * http://lpgpu.org/wp/wp-content/uploads/2013/05/poster_andresch_acaces2014.pdf + * + * + * + * + * * Kernel Template: * * The following kernel code template displays the code generated for the @@ -200,11 +250,41 @@ struct redux_ctx{ const int* reduxList; /* General. */ + int nds; /* # Source dimensions */ + int ndr; /* # Reduced dimensions */ + int ndd; /* # Destination dimensions */ + int ndw; /* # Warp dimensions */ + int ndp; /* # Partial warp dimensions */ + int ndf; /* # Flattened source dimensions */ + int ndt; /* # Temporary workspace dimensions */ + int zeroAllAxes; /* # of zero-length axes in source tensor */ + int zeroRdxAxes; /* # of zero-length reduction axes in source tensor */ + size_t prodAllAxes; /* Product of length of all axes in source tensor */ + size_t prodRdxAxes; /* Product of length of all reduction axes in source tensor */ + size_t prodFreeAxes; /* Product of length of all free axes in source tensor */ + size_t prodWarpAxes; /* Number of active threads per warp. Strictly <= warpSize. */ + int splitWarpAxis;/* Index of the split warp axis within the source tensor's shape; -1 otherwise. */ + + gpucontext* gpuCtx; + unsigned numProcs; + size_t warpSize; + size_t maxLg; + size_t maxLs[MAX_HW_DIMS]; + size_t maxGg; + size_t maxGs[MAX_HW_DIMS]; + + axis_desc* xdSrc; + axis_desc* xdSrcFlat; + axis_desc* xdTmp; + + axis_desc** xdSrcPtrs; + + int numStages; + GpuArray* wsDst; GpuArray* wsDstArg; int* srcAxisList; size_t* dstDims; - gpucontext* gpuCtx; /* Source code Generator. */ int srcTypeCode; @@ -219,9 +299,6 @@ struct redux_ctx{ const char* accTypeStr; const char* initValT; const char* initValK; - int ndd; - int ndr; - int nds; int largeCodeModel; strb s; srcb srcGen; @@ -269,187 +346,134 @@ typedef struct redux_ctx redux_ctx; -/* Function prototypes */ -static int reduxGetSumInit (int typecode, const char** property); -static int reduxGetProdInit (int typecode, const char** property); -static int reduxGetMinInit (int typecode, const char** property); -static int reduxGetMaxInit (int typecode, const char** property); -static int reduxGetAndInit (int typecode, const char** property); -static int reduxGetOrInit (int typecode, const char** property); -static int axisInSet (int v, - const int* set, - size_t setLen, - size_t* where); -static void appendIdxes (strb* s, - const char* prologue, - const char* prefix, - int startIdx, - int endIdx, - const char* suffix, - const char* epilogue); -static int reduxCheckargs (redux_ctx* ctx); -static void reduxSelectTypes (redux_ctx* ctx); -static int reduxIsSmallCodeModel (redux_ctx* ctx); -static int reduxIsLargeCodeModel (redux_ctx* ctx); -static int reduxRequiresDst (redux_ctx* ctx); -static int reduxRequiresDstArg (redux_ctx* ctx); -static int reduxKernelRequiresDst (redux_ctx* ctx); -static int reduxKernelRequiresDstArg (redux_ctx* ctx); -static int reduxCanAppendHwAxis (redux_ctx* ctx, - int kernelType, - int axisType); -static void reduxAppendLargestAxisToHwList(redux_ctx* ctx, - int kernelType, - int axisType); -static int reduxSelectHwAxes (redux_ctx* ctx); -static int reduxComputeAxisList (redux_ctx* ctx); -static int reduxGenSource (redux_ctx* ctx); -static void reduxAppendSource (redux_ctx* ctx); -static void reduxAppendIncludes (redux_ctx* ctx); -static void reduxAppendTensorDeclArgs (redux_ctx* ctx, - const char* type, - const char* baseName); -static void reduxAppendTensorCallArgs (redux_ctx* ctx, - const char* baseName); -static void reduxAppendMacroDefs (redux_ctx* ctx); -static void 
reduxAppendTypedefs (redux_ctx* ctx); -static void reduxAppendGetInitValFns (redux_ctx* ctx); -static void reduxAppendWriteBackFn (redux_ctx* ctx); -static void reduxAppendReduxKernel (redux_ctx* ctx); -static void reduxAppendPrototype (redux_ctx* ctx); -static void reduxAppendIndexDeclarations (redux_ctx* ctx); -static void reduxAppendRangeCalculations (redux_ctx* ctx); -static void reduxAppendLoops (redux_ctx* ctx); -static void reduxAppendInitKernel (redux_ctx* ctx); -static void reduxAppendPostKernel (redux_ctx* ctx); -static int reduxCompile (redux_ctx* ctx); -static int reduxSchedule (redux_ctx* ctx); -static void reduxScheduleKernel (int ndims, - uint64_t* dims, - uint64_t warpSize, - uint64_t maxLg, - uint64_t* maxLs, - uint64_t maxGg, - uint64_t* maxGs, - uint64_t* bs, - uint64_t* gs, - uint64_t* cs); -static int reduxInvoke (redux_ctx* ctx); -static int reduxCleanup (redux_ctx* ctx, int ret); +/* Static Function prototypes */ +/* Utilities */ +static int reduxGetSumInit (int typecode, const char** property); +static int reduxGetProdInit (int typecode, const char** property); +static int reduxGetMinInit (int typecode, const char** property); +static int reduxGetMaxInit (int typecode, const char** property); +static int reduxGetAndInit (int typecode, const char** property); +static int reduxGetOrInit (int typecode, const char** property); +static int reduxSortFlatSensitive (const void* a, const void* b); +static int reduxSortFlatInsensitive (const void* a, const void* b); +static int reduxSortWarp (const void* a, const void* b); +static int axisInSet (int v, + const int* set, + size_t setLen, + size_t* where); +static void appendIdxes (strb* s, + const char* prologue, + const char* prefix, + int startIdx, + int endIdx, + const char* suffix, + const char* epilogue); + +/* Axis Description API */ +static void axisInit (axis_desc* axis, + ssize_t len, + ssize_t srcStride); +static void axisMarkReduced (axis_desc* axis, int reduxNum); +static void axisMarkWarp (axis_desc* axis, size_t partialSlice); +static int axisGetReduxNum (const axis_desc* axis); +static size_t axisGetLen (const axis_desc* axis); +static ssize_t axisGetSrcStride (const axis_desc* axis); +static size_t axisGetSrcAbsStride (const axis_desc* axis); +static ssize_t axisGetSrcOffset (const axis_desc* axis); +static ssize_t axisGetDstStride (const axis_desc* axis); +static size_t axisGetDstAbsStride (const axis_desc* axis); +static ssize_t axisGetDstOffset (const axis_desc* axis); +static ssize_t axisGetDstArgStride (const axis_desc* axis); +static size_t axisGetDstArgAbsStride (const axis_desc* axis); +static ssize_t axisGetDstArgOffset (const axis_desc* axis); +static int axisIsReduced (const axis_desc* axis); +static int axisIsWarp (const axis_desc* axis); +static int axisIsPartialWarp (const axis_desc* axis); + +/* Reduction Context API */ +/* Utilities */ +static int reduxRequiresDst (const redux_ctx* ctx); +static int reduxRequiresDstArg (const redux_ctx* ctx); +static int reduxKernelRequiresDst (const redux_ctx* ctx); +static int reduxKernelRequiresDstArg (const redux_ctx* ctx); +static int reduxIsSensitive (const redux_ctx* ctx); +static int reduxIsSmallCodeModel (const redux_ctx* ctx); +static int reduxIsLargeCodeModel (const redux_ctx* ctx); +static axis_desc* reduxGetSrcAxis (const redux_ctx* ctx, int i); +static axis_desc* reduxGetSrcSortAxis (const redux_ctx* ctx, int i); +static axis_desc* reduxGetSrcFlatAxis (const redux_ctx* ctx, int i); +static int reduxTryFlattenInto (const redux_ctx* ctx, + axis_desc* 
into, + const axis_desc* from); +static int reduxCanAppendHwAxis (redux_ctx* ctx, + int kernelType, + int axisType); +static void reduxAppendLargestAxisToHwList(redux_ctx* ctx, + int kernelType, + int axisType); +/* Control Flow */ +static int reduxInit (redux_ctx* ctx); +static int reduxInferProperties (redux_ctx* ctx); +static int reduxFlattenSource (redux_ctx* ctx); +static int reduxSelectWarpAxes (redux_ctx* ctx); +static int reduxSelectNumStages (redux_ctx* ctx); +static int reduxSelectHwAxes (redux_ctx* ctx); +static int reduxComputeAxisList (redux_ctx* ctx); +static int reduxGenSource (redux_ctx* ctx); +static void reduxAppendSource (redux_ctx* ctx); +static void reduxAppendIncludes (redux_ctx* ctx); +static void reduxAppendTensorDeclArgs (redux_ctx* ctx, + const char* type, + const char* baseName); +static void reduxAppendTensorCallArgs (redux_ctx* ctx, + const char* baseName); +static void reduxAppendMacroDefs (redux_ctx* ctx); +static void reduxAppendTypedefs (redux_ctx* ctx); +static void reduxAppendGetInitValFns (redux_ctx* ctx); +static void reduxAppendWriteBackFn (redux_ctx* ctx); +static void reduxAppendReduxKernel (redux_ctx* ctx); +static void reduxAppendPrototype (redux_ctx* ctx); +static void reduxAppendIndexDeclarations (redux_ctx* ctx); +static void reduxAppendRangeCalculations (redux_ctx* ctx); +static void reduxAppendLoops (redux_ctx* ctx); +static void reduxAppendInitKernel (redux_ctx* ctx); +static void reduxAppendPostKernel (redux_ctx* ctx); +static int reduxCompile (redux_ctx* ctx); +static int reduxSchedule (redux_ctx* ctx); +static void reduxScheduleKernel (int ndims, + uint64_t* dims, + uint64_t warpSize, + uint64_t maxLg, + uint64_t* maxLs, + uint64_t maxGg, + uint64_t* maxGs, + uint64_t* bs, + uint64_t* gs, + uint64_t* cs); +static int reduxInvoke (redux_ctx* ctx); +static int reduxCleanup (redux_ctx* ctx, int ret); +static int reduxCleanupMsg (redux_ctx* ctx, int ret, + const char* fmt, ...); /* Function implementation */ -GPUARRAY_PUBLIC int GpuArray_sum (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList){ - return GpuArray_reduction(GA_REDUCE_SUM, - dst, NULL, src, reduxLen, reduxList); -} -GPUARRAY_PUBLIC int GpuArray_prod (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList){ - return GpuArray_reduction(GA_REDUCE_PROD, - dst, NULL, src, reduxLen, reduxList); -} -GPUARRAY_PUBLIC int GpuArray_prodnz (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList){ - return GpuArray_reduction(GA_REDUCE_PRODNZ, - dst, NULL, src, reduxLen, reduxList); -} -GPUARRAY_PUBLIC int GpuArray_min (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList){ - return GpuArray_reduction(GA_REDUCE_MIN, - dst, NULL, src, reduxLen, reduxList); -} -GPUARRAY_PUBLIC int GpuArray_max (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList){ - return GpuArray_reduction(GA_REDUCE_MAX, - dst, NULL, src, reduxLen, reduxList); -} -GPUARRAY_PUBLIC int GpuArray_argmin (GpuArray* dstArg, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList){ - return GpuArray_reduction(GA_REDUCE_ARGMIN, - NULL, dstArg, src, reduxLen, reduxList); -} -GPUARRAY_PUBLIC int GpuArray_argmax (GpuArray* dstArg, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList){ - return GpuArray_reduction(GA_REDUCE_ARGMAX, - NULL, dstArg, src, reduxLen, reduxList); -} -GPUARRAY_PUBLIC int GpuArray_minandargmin(GpuArray* dst, - 
GpuArray* dstArg, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList){ - return GpuArray_reduction(GA_REDUCE_MINANDARGMIN, - dst, dstArg, src, reduxLen, reduxList); -} -GPUARRAY_PUBLIC int GpuArray_maxandargmax(GpuArray* dst, - GpuArray* dstArg, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList){ - return GpuArray_reduction(GA_REDUCE_MAXANDARGMAX, - dst, dstArg, src, reduxLen, reduxList); -} -GPUARRAY_PUBLIC int GpuArray_and (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList){ - return GpuArray_reduction(GA_REDUCE_AND, - dst, NULL, src, reduxLen, reduxList); -} -GPUARRAY_PUBLIC int GpuArray_or (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList){ - return GpuArray_reduction(GA_REDUCE_OR, - dst, NULL, src, reduxLen, reduxList); -} -GPUARRAY_PUBLIC int GpuArray_xor (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList){ - return GpuArray_reduction(GA_REDUCE_XOR, - dst, NULL, src, reduxLen, reduxList); -} -GPUARRAY_PUBLIC int GpuArray_all (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList){ - return GpuArray_reduction(GA_REDUCE_ALL, - dst, NULL, src, reduxLen, reduxList); -} -GPUARRAY_PUBLIC int GpuArray_any (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList){ - return GpuArray_reduction(GA_REDUCE_ANY, - dst, NULL, src, reduxLen, reduxList); -} GPUARRAY_PUBLIC int GpuArray_reduction (ga_reduce_op op, GpuArray* dst, GpuArray* dstArg, const GpuArray* src, unsigned reduxLen, const unsigned* reduxList){ - redux_ctx ctxSTACK = {op, dst, dstArg, src, - (int)reduxLen, (const int*)reduxList}; - redux_ctx *ctx = &ctxSTACK; + redux_ctx ctxSTACK, *ctx = &ctxSTACK; + memset(ctx, 0, sizeof(*ctx)); - return reduxCheckargs(ctx); + ctx->op = op; + ctx->dst = dst; + ctx->dstArg = dstArg; + ctx->src = src; + ctx->reduxLen = reduxLen; + ctx->reduxList = (const int*)reduxList; + + return reduxInit(ctx); } /** @@ -463,7 +487,7 @@ GPUARRAY_PUBLIC int GpuArray_reduction (ga_reduce_op op, * @return Zero if successful; Non-zero if the datatype is not supported. */ -static int reduxGetSumInit (int typecode, const char** property){ +static int reduxGetSumInit (int typecode, const char** property){ if (typecode == GA_POINTER || typecode == GA_BUFFER){ return GA_UNSUPPORTED_ERROR; @@ -483,7 +507,7 @@ static int reduxGetSumInit (int typecode, const char** property) * @return Zero if successful; Non-zero if the datatype is not supported. */ -static int reduxGetProdInit (int typecode, const char** property){ +static int reduxGetProdInit (int typecode, const char** property){ if (typecode == GA_POINTER || typecode == GA_BUFFER){ return GA_UNSUPPORTED_ERROR; @@ -503,7 +527,7 @@ static int reduxGetProdInit (int typecode, const char** property) * @return Zero if successful; Non-zero if the datatype is not supported. */ -static int reduxGetMinInit (int typecode, const char** property){ +static int reduxGetMinInit (int typecode, const char** property){ switch (typecode){ case GA_BYTE2: case GA_BYTE3: @@ -593,7 +617,7 @@ static int reduxGetMinInit (int typecode, const char** property) * @return Zero if successful; Non-zero if the datatype is not supported. 
*/ -static int reduxGetMaxInit (int typecode, const char** property){ +static int reduxGetMaxInit (int typecode, const char** property){ switch (typecode){ case GA_BOOL: *property = "1"; @@ -692,7 +716,7 @@ static int reduxGetMaxInit (int typecode, const char** property) * @return Zero if successful; Non-zero if the datatype is not supported. */ -static int reduxGetAndInit (int typecode, const char** property){ +static int reduxGetAndInit (int typecode, const char** property){ if (typecode == GA_POINTER || typecode == GA_BUFFER){ return GA_UNSUPPORTED_ERROR; @@ -712,7 +736,7 @@ static int reduxGetAndInit (int typecode, const char** property) * @return Zero if successful; Non-zero if the datatype is not supported. */ -static int reduxGetOrInit (int typecode, const char** property){ +static int reduxGetOrInit (int typecode, const char** property){ if (typecode == GA_POINTER || typecode == GA_BUFFER){ return GA_UNSUPPORTED_ERROR; @@ -721,6 +745,110 @@ static int reduxGetOrInit (int typecode, const char** property) return GA_NO_ERROR; } +/** + * @brief Sort the axes into optimal order for flattening. + * + * Two orderings exist: "Sensitive" and "Insensitive", for reductions that are + * sensitive (or not) to indexing. + * + * In all cases: + * + * 1. Free axes are sorted before reduction axes. + * 2. Free axes are sorted by decreasing absolute stride. + * 3. then by increasing source axis number. + * + * In the sensitive case: + * + * 4. Reduction axes are sorted by their position in reduxList. + * + * In the insensitive case: + * + * 4. Reduction axes are sorted by decreasing absolute stride. + * 5. then by increasing source axis number. + */ + +static int reduxSortFlatInsensitive (const void* a, const void* b){ + const axis_desc* xda = (const axis_desc*)a; + const axis_desc* xdb = (const axis_desc*)b; + + if ( axisIsReduced(xda) && !axisIsReduced(xdb)){ + return +1; + }else if (!axisIsReduced(xda) && axisIsReduced(xdb)){ + return -1; + } + + if (axisGetSrcAbsStride(xda) < axisGetSrcAbsStride(xdb)){ + return +1; + }else if (axisGetSrcAbsStride(xda) > axisGetSrcAbsStride(xdb)){ + return -1; + } + + return 0; +} +static int reduxSortFlatSensitive (const void* a, const void* b){ + const axis_desc* xda = (const axis_desc*)a; + const axis_desc* xdb = (const axis_desc*)b; + + if ( axisIsReduced(xda) && !axisIsReduced(xdb)){ + return +1; + }else if (!axisIsReduced(xda) && axisIsReduced(xdb)){ + return -1; + } + + if (axisIsReduced(xda)){ + return axisGetReduxNum(xda) < axisGetReduxNum(xdb) ? -1 : +1; + }else{ + if (axisGetSrcAbsStride(xda) < axisGetSrcAbsStride(xdb)){ + return +1; + }else if (axisGetSrcAbsStride(xda) > axisGetSrcAbsStride(xdb)){ + return -1; + } + + return 0; + } +} + +/** + * @brief Sort axes in preferred order for integration into warp. + * + * The axes with stride != 0 are sorted by lowest absolute + * stride. Picking the few axes with the lowest absolute stride (while + * keeping the product of their dimensions <= warpSize) should maximize + * memory bandwidth of the warp. + * + * The restriction stride != 0 is intended to avoid waste of memory + * bandwidth. Once a memory transaction is necessary, it typically operates at + * far greater granularity than just 32 bits (4 bytes). + * + * Sorting by absolute stride should result, in the case of a packed tensor, in + * the memory accesses being close to perfectly contiguous. 
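+ *
+ * Illustrative example (contrived, not from any test): for a packed
+ * C-contiguous float32 tensor of shape (7,5,32), the strides in bytes are
+ * (640,128,4). The length-32 axis (absolute stride 4) therefore sorts first
+ * and on its own already fills a 32-thread warp, while a broadcast axis of
+ * stride 0 would sort last.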
+ */ + +static int reduxSortWarp (const void* a, const void* b){ + const axis_desc* xda = *(const axis_desc* const *)a; + const axis_desc* xdb = *(const axis_desc* const *)b; + + if ( axisGetSrcStride(xda) && !axisGetSrcStride(xdb)){ + return -1; + }else if (!axisGetSrcStride(xda) && axisGetSrcStride(xdb)){ + return +1; + } + + if (axisGetSrcAbsStride(xda) < axisGetSrcAbsStride(xdb)){ + return -1; + }else if (axisGetSrcAbsStride(xda) > axisGetSrcAbsStride(xdb)){ + return +1; + } + + if ( axisIsReduced(xda) && !axisIsReduced(xdb)){ + return -1; + }else if (!axisIsReduced(xda) && axisIsReduced(xdb)){ + return +1; + } + + return 0; +} + /** * @brief Check whether axis numbered v is already in the given set of axes. * @@ -731,10 +859,10 @@ static int reduxGetOrInit (int typecode, const char** property) * @return Non-zero if the set is non-empty and v is in it; Zero otherwise. */ -static int axisInSet (int v, - const int* set, - size_t setLen, - size_t* where){ +static int axisInSet (int v, + const int* set, + size_t setLen, + size_t* where){ size_t i; for (i=0;ireduxNum = -1; + axis->warpLen = 0; + axis->len = len; + + axis->srcStride = srcStride; + axis->srcOffset = 0; + + axis->dstStride = 0; + axis->dstOffset = 0; + + axis->dstArgStride = 0; + axis->dstArgOffset = 0; + + axis->tmpDstStride = 0; + axis->tmpDstOffset = 0; + + axis->tmpDstArgStride = 0; + axis->tmpDstArgOffset = 0; +} + /** - * @brief Check the sanity of the arguments in agreement with the - * documentation for GpuArray_reduction(). + * @brief Mark axis as reduction axis, with position reduxNum in the axis list. + */ + +static void axisMarkReduced (axis_desc* axis, int reduxNum){ + axis->isReduced = 1; + axis->reduxNum = reduxNum; +} + +/** + * @brief Mark axis as warp axis. + */ + +static void axisMarkWarp (axis_desc* axis, size_t warpLen){ + axis->warpLen = warpLen; +} + +/** + * @brief Get properties of an axis. + */ + +static int axisGetReduxNum (const axis_desc* axis){ + return axis->reduxNum; +} +static size_t axisGetLen (const axis_desc* axis){ + return axis->len; +} +static ssize_t axisGetSrcStride (const axis_desc* axis){ + return axisGetLen(axis) > 1 ? axis->srcStride : 0; +} +static size_t axisGetSrcAbsStride (const axis_desc* axis){ + return axisGetSrcStride(axis)<0 ? -(size_t)axisGetSrcStride(axis): + +(size_t)axisGetSrcStride(axis); +} +static ssize_t axisGetSrcOffset (const axis_desc* axis){ + return axis->srcOffset; +} +static ssize_t axisGetDstStride (const axis_desc* axis){ + return axisGetLen(axis) > 1 ? axis->dstStride : 0; +} +static size_t axisGetDstAbsStride (const axis_desc* axis){ + return axisGetDstStride(axis)<0 ? -(size_t)axisGetDstStride(axis): + +(size_t)axisGetDstStride(axis); +} +static ssize_t axisGetDstOffset (const axis_desc* axis){ + return axis->dstOffset; +} +static ssize_t axisGetDstArgStride (const axis_desc* axis){ + return axisGetLen(axis) > 1 ? axis->dstArgStride : 0; +} +static size_t axisGetDstArgAbsStride (const axis_desc* axis){ + return axisGetDstArgStride(axis)<0 ? 
-(size_t)axisGetDstArgStride(axis): + +(size_t)axisGetDstArgStride(axis); +} +static ssize_t axisGetDstArgOffset (const axis_desc* axis){ + return axis->dstArgOffset; +} +static int axisIsReduced (const axis_desc* axis){ + return axis->isReduced; +} +static int axisIsWarp (const axis_desc* axis){ + return !!axis->warpLen; +} +static int axisIsPartialWarp (const axis_desc* axis){ + return axis->warpLen > 0 && axis->warpLen != axis->len; +} + +/** + * @brief Returns whether the reduction interface requires a dst argument. + */ + +static int reduxRequiresDst (const redux_ctx* ctx){ + switch (ctx->op){ + case GA_REDUCE_ARGMIN: + case GA_REDUCE_ARGMAX: + return 0; + default: + return 1; + } +} + +/** + * @brief Returns whether the reduction interface requires a dstArg argument. + */ + +static int reduxRequiresDstArg (const redux_ctx* ctx){ + switch (ctx->op){ + case GA_REDUCE_MINANDARGMIN: + case GA_REDUCE_MAXANDARGMAX: + case GA_REDUCE_ARGMIN: + case GA_REDUCE_ARGMAX: + return 1; + default: + return 0; + } +} + +/** + * @brief Returns whether the generated kernel internally requires a dst + * argument. * - * Also initialize certain parts of the context, allocate memory - * buffers and fail out if at any point the environment gives us - * a problem. + * This is semantically subtly different from reduxHasDst(). The main + * difference is in the implementation of the GA_REDUCE_ARGMIN/ARGMAX + * reductions; Either *might* require a dst buffer, which will have to be + * allocated, even though it will be discared. + */ + +static int reduxKernelRequiresDst (const redux_ctx* ctx){ + switch (ctx->op){ + case GA_REDUCE_ARGMIN: + case GA_REDUCE_ARGMAX: + return reduxIsSmallCodeModel(ctx); + default: + return 1; + } +} + +/** + * @brief Returns whether the generated kernel internally requires a dstArg + * argument. * - * @return GA_INVALID_ERROR if arguments invalid; GA_NO_MEMORY if out of - * memory, GA_NO_ERROR otherwise. + * This is semantically subtly different from reduxHasDstArg(), since it asks + * whether the reduction, even though it does not accept a dstArg argument, + * still requires a dstArg internally. + */ + +static int reduxKernelRequiresDstArg (const redux_ctx* ctx){ + /** + * At present there exists no reduction whose implementation requires + * a dstArg but whose interface does not. + * + * E.g. the max() and min() reductions do NOT currently require a temporary + * buffer for indexes, and will not in the foreseeable future. + */ + + return reduxRequiresDstArg(ctx); +} + +/** + * @brief Returns whether the reduction is sensitive. + * + * A reduction is sensitive when its output satisfies at least one of the + * following conditions: + * + * - It depends on the exact order of axes in the reduxList + * - It depends on exact signs of the strides of axes in the reduxList + * + * Such sensitivity may prevent a flattening of contiguous axes even when it + * would have been otherwise permitted. + * + * For instance, ARGMIN/ARGMAX have this sensitivity, because the dstArg + * tensor's contents are flattened coordinates into the source tensor, and + * the flattening order is precisely reduxList. Permuting it would thus produce + * incorrect output. Moreover, if the strides of a reduction axis were to be + * reversed for the purpose of flattening the axis into another, the computed + * coordinate would again be incorrect. + * + * + * TL;DR: Reduction is sensitive if + * reduce(x, axis=axisList) != reduce(x, axis=axisList[::-1]) + * or + * reduce(x) != reduce(x[::-1]) + * . 
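+ *
+ *       Concrete (made-up) example: argmax([3,1,7]) == 2, but over the
+ *       reversed data argmax([7,1,3]) == 0, even though max() returns 7
+ *       either way. Flipping the sign of a reduction axis' stride, or
+ *       permuting reduxList, can therefore change dstArg, which is what
+ *       makes these reductions sensitive.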
+ */ + +static int reduxIsSensitive (const redux_ctx* ctx){ + switch (ctx->op){ + case GA_REDUCE_MINANDARGMIN: + case GA_REDUCE_MAXANDARGMAX: + case GA_REDUCE_ARGMIN: + case GA_REDUCE_ARGMAX: + return 1; + default: + return 0; + } +} + +/** + * @brief Returns whether we are using the small code model or not. + */ + +static int reduxIsSmallCodeModel (const redux_ctx* ctx){ + return !reduxIsLargeCodeModel(ctx); +} + +/** + * @brief Returns whether we are using the large code model or not. + */ + +static int reduxIsLargeCodeModel (const redux_ctx* ctx){ + return ctx->largeCodeModel; +} + +/** + * @brief Get description of source axis with given number. + */ + +static axis_desc* reduxGetSrcAxis (const redux_ctx* ctx, int i){ + return &ctx->xdSrc[i]; +} + +/** + * @brief Get description of source axis with given number in sort-order. */ -static int reduxCheckargs (redux_ctx* ctx){ - int i, j, ret, retT, retK; - unsigned numProcs; - size_t localSize; - size_t dstNumElem = 1, reduxPerElem = 1; +static axis_desc* reduxGetSrcSortAxis (const redux_ctx* ctx, int i){ + return ctx->xdSrcPtrs[i]; +} + +/** + * @brief Get description of flattened source axis with given number. + */ + +static axis_desc* reduxGetSrcFlatAxis (const redux_ctx* ctx, int i){ + return &ctx->xdSrcFlat[i]; +} + +/** + * @brief Attempt to flatten an axis `from` into an axis `into`. + * + * An axis can be considered for flattening into the previous one if ALL of + * the following conditions hold: + * + * 1. The product of the previous axis' length by its stride exactly + * matches the current axis' stride. + * 2. Both axes are reduced. + * + * For reductions where axis order matters (e.g. those that compute + * indices, like argmax/argmin), ALL of the following additional conditions + * must hold: + * + * 3. The sign of the strides must match. + * 4. The axis numbers must follow consecutively in the reduction list + * (this is ensured by the "sensitive" sort order) + * + * @return Non-zero if flattening attempt successful; Zero otherwise. + */ + +static int reduxTryFlattenInto (const redux_ctx* ctx, + axis_desc* into, + const axis_desc* from){ + int signSrc = 0, signDst = 0, signDstArg = 0, + reverseSrc = 0, reverseDst = 0, reverseDstArg = 0; + + if (axisIsReduced (into) != axisIsReduced (from) || + axisGetSrcAbsStride (into) != axisGetSrcAbsStride (from)*axisGetLen(from)){ + return 0; + } + + if (reduxRequiresDst(ctx) && + axisGetDstAbsStride (into) != axisGetDstAbsStride (from)*axisGetLen(from)){ + return 0; + } + + if (reduxRequiresDstArg(ctx) && + axisGetDstArgAbsStride(into) != axisGetDstArgAbsStride(from)*axisGetLen(from)){ + return 0; + } + + signSrc = (axisGetSrcStride (into)^axisGetSrcStride (from)) < 0; + signDst = (axisGetDstStride (into)^axisGetDstStride (from)) < 0; + signDstArg = (axisGetDstArgStride(into)^axisGetDstArgStride(from)) < 0; + reverseSrc = signSrc; + reverseDst = signDst && reduxRequiresDst (ctx); + reverseDstArg = signDstArg && reduxRequiresDstArg(ctx); + + if (reduxIsSensitive(ctx)){ + if(reverseSrc || reverseDst || reverseDstArg){ + return 0; + } + } + + if (reduxRequiresDst (ctx) && + reduxRequiresDstArg(ctx) && + reverseDst != reverseDstArg){ + /* Either both, or neither, of dst and dstArg must require reversal. 
*/ + return 0; + } + + if (reverseSrc){ + into->srcOffset += (ssize_t)(axisGetLen(from)-1)*axisGetSrcStride(from); + into->srcStride = -axisGetSrcStride (from); + }else{ + into->srcStride = axisGetSrcStride (from); + } + + if (reverseDst){ + into->dstOffset += (ssize_t)(axisGetLen(from)-1)*axisGetDstStride(from); + into->dstStride = -axisGetDstStride (from); + }else{ + into->dstStride = axisGetDstStride (from); + } + + if (reverseDstArg){ + into->dstArgOffset += (ssize_t)(axisGetLen(from)-1)*axisGetDstArgStride(from); + into->dstArgStride = -axisGetDstArgStride(from); + }else{ + into->dstArgStride = axisGetDstArgStride(from); + } + + into->srcOffset += axisGetSrcOffset (from); + into->dstOffset += axisGetDstOffset (from); + into->dstArgOffset += axisGetDstArgOffset(from); + into->len *= axisGetLen (from); + + return 1; +} + +/** + * @brief Check whether we can add another reduction axis or free axis + * to the hardware axis list for either the primary or secondary kernel. + */ + +static int reduxCanAppendHwAxis (redux_ctx* ctx, + int kernelType, + int axisType){ + int kernelNdh = kernelType == KERNEL_PRIMARY ? ctx->pri.ndh : ctx->aux.ndh; + int kernelNdhr = kernelType == KERNEL_PRIMARY ? ctx->pri.ndhr : ctx->aux.ndhr; + int kernelNdhd = kernelType == KERNEL_PRIMARY ? ctx->pri.ndhd : ctx->aux.ndhd; + + if (kernelNdh >= MAX_HW_DIMS){ + return 0; + }else{ + return axisType == AXIS_REDUX ? kernelNdhr < ctx->ndr: + kernelNdhd < ctx->ndd; + } +} + +/** + * @brief Append the largest reduction axis or free axis that isn't yet + * in the hardware axis list for either the primary or secondary kernel + * into said hardware axis list. + */ + +static void reduxAppendLargestAxisToHwList(redux_ctx* ctx, + int kernelType, + int axisType){ + int maxI = 0, i, isInHwList, isInReduxList, isInDesiredList, isLargestSoFar; + int* hwAxisList, * ndh, * ndhr, * ndhd; + size_t v, maxV = 0; + + /* Get pointers to the correct kernel's variables */ + hwAxisList = kernelType == KERNEL_PRIMARY ? ctx->pri.axisList: + ctx->aux.axisList; + ndh = kernelType == KERNEL_PRIMARY ? &ctx->pri.ndh: + &ctx->aux.ndh; + ndhr = kernelType == KERNEL_PRIMARY ? &ctx->pri.ndhr: + &ctx->aux.ndhr; + ndhd = kernelType == KERNEL_PRIMARY ? &ctx->pri.ndhd: + &ctx->aux.ndhd; + + /* Find */ + for (i=0;inds;i++){ + isInHwList = axisInSet(i, hwAxisList, *ndh, 0); + isInReduxList = axisInSet(i, ctx->reduxList, ctx->ndr, 0); + isInDesiredList = axisType == AXIS_REDUX ? isInReduxList: + !isInReduxList; + v = ctx->src->dimensions[i]; + isLargestSoFar = v >= maxV; + + if (!isInHwList && isInDesiredList && isLargestSoFar){ + maxV = v; + maxI = i; + } + } + + /* Append */ + hwAxisList[(*ndh)++] = maxI; + if (axisType == AXIS_REDUX){ + (*ndhr)++; + }else{ + (*ndhd)++; + } +} + +/** + * @brief Initialize the context. + * + * After this function, calling reduxCleanup() becomes safe. + */ + +static int reduxInit (redux_ctx* ctx){ + int i; /** * We initialize certain parts of the context. 
@@ -814,15 +1339,16 @@ static int reduxCheckargs (redux_ctx* ctx){ ctx->srcTypeStr = ctx->dstTypeStr = ctx->dstArgTypeStr = ctx->accTypeStr = ctx->idxTypeStr = NULL; - ctx->initValK = NULL; - ctx->pri.ndh = ctx->aux.ndh = 0; - ctx->pri.ndhd = ctx->aux.ndhd = 0; - ctx->pri.ndhr = ctx->aux.ndhr = 0; + ctx->initValK = NULL; ctx->sourceCode = NULL; - ctx->sourceCodeLen = 0; ctx->errorString0 = NULL; ctx->errorString1 = NULL; ctx->errorString2 = NULL; + + ctx->splitWarpAxis = -1; + ctx->numStages = 1; + ctx->prodWarpAxes = 1; + ctx->prodAllAxes = ctx->prodRdxAxes = ctx->prodFreeAxes = 1; strb_init(&ctx->s); srcbInit (&ctx->srcGen, &ctx->s); @@ -836,219 +1362,134 @@ static int reduxCheckargs (redux_ctx* ctx){ ctx->srcStepsGD = ctx->srcSizeGD = ctx->dstStepsGD = ctx->dstArgStepsGD = ctx->pri.chunkSizeGD = ctx->aux.chunkSizeGD = NULL; - /* *** IT IS NOW SAFE TO CALL reduxCleanup() *** */ - - /* Insane src, reduxLen, dst or dstArg? */ - if (!ctx->src || - (reduxRequiresDst (ctx) && !ctx->dst) || - (reduxRequiresDstArg(ctx) && !ctx->dstArg) || - (ctx->src->nd <= 0) || - (ctx->reduxLen <= 0) || - (ctx->src->nd < (unsigned)ctx->reduxLen) || - (ctx->dst && ctx->dst->nd +ctx->reduxLen != ctx->src->nd) || - (ctx->dstArg && ctx->dstArg->nd+ctx->reduxLen != ctx->src->nd) ){ - return reduxCleanup(ctx, GA_INVALID_ERROR); - } - - - /* Insane or duplicate list entry? */ - for (i=0;ireduxLen;i++){ - if (ctx->reduxList[i] < 0 || - ctx->reduxList[i] >= (int)ctx->src->nd || - axisInSet(ctx->reduxList[i], ctx->reduxList, i, 0)){ - return reduxCleanup(ctx, GA_INVALID_ERROR); - } - } + return reduxInferProperties(ctx); +} +/** + * @brief Begin inferring the properties of the reduction. + */ - /* GPU context non-existent? */ - ctx->gpuCtx = GpuArray_context(ctx->src); - if (!ctx->gpuCtx){ - return reduxCleanup(ctx, GA_INVALID_ERROR); - } +static int reduxInferProperties (redux_ctx* ctx){ + axis_desc* a; + int i, j, retT, retK; + size_t d; - /* Unknown type? */ - reduxSelectTypes(ctx); - if (!ctx->srcTypeStr || !ctx->dstTypeStr || !ctx->dstArgTypeStr || - !ctx->accTypeStr){ - return reduxCleanup(ctx, GA_INVALID_ERROR); + /* Source code buffer preallocation failed? */ + if (strb_ensure(&ctx->s, 4*1024) != 0){ + return reduxCleanupMsg(ctx, GA_MEMORY_ERROR, + "Could not preallocate source code buffer!\n"); } - /* Determine initializer, and error out if reduction unsupported. 
*/ - switch (ctx->op){ - case GA_REDUCE_SUM: - retT = reduxGetSumInit (ctx->dstTypeCode, &ctx->initValT); - retK = reduxGetSumInit (ctx->accTypeCode, &ctx->initValK); - break; - case GA_REDUCE_PRODNZ: - case GA_REDUCE_PROD: - retT = reduxGetProdInit(ctx->dstTypeCode, &ctx->initValT); - retK = reduxGetProdInit(ctx->accTypeCode, &ctx->initValK); - break; - case GA_REDUCE_MINANDARGMIN: - case GA_REDUCE_ARGMIN: - case GA_REDUCE_MIN: - retT = reduxGetMinInit (ctx->dstTypeCode, &ctx->initValT); - retK = reduxGetMinInit (ctx->accTypeCode, &ctx->initValK); - break; - case GA_REDUCE_MAXANDARGMAX: - case GA_REDUCE_ARGMAX: - case GA_REDUCE_MAX: - retT = reduxGetMaxInit (ctx->dstTypeCode, &ctx->initValT); - retK = reduxGetMaxInit (ctx->accTypeCode, &ctx->initValK); - break; - case GA_REDUCE_ALL: - case GA_REDUCE_AND: - retT = reduxGetAndInit (ctx->dstTypeCode, &ctx->initValT); - retK = reduxGetAndInit (ctx->accTypeCode, &ctx->initValK); - break; - case GA_REDUCE_ANY: - case GA_REDUCE_XOR: - case GA_REDUCE_OR: - retT = reduxGetOrInit (ctx->dstTypeCode, &ctx->initValT); - retK = reduxGetOrInit (ctx->accTypeCode, &ctx->initValK); - break; - default: - retT = GA_UNSUPPORTED_ERROR; - retK = GA_UNSUPPORTED_ERROR; - } - if (retT != GA_NO_ERROR){ - return reduxCleanup(ctx, retT); - } - if (retK != GA_NO_ERROR){ - return reduxCleanup(ctx, retK); + /* Insane src, reduxLen, dst or dstArg? */ + if (!ctx->src){ + return reduxCleanupMsg(ctx, GA_INVALID_ERROR, + "src is NULL!\n"); + }else if (ctx->src->nd <= 0){ + return reduxCleanupMsg(ctx, GA_INVALID_ERROR, + "src has less than 1 dimensions!\n"); + }else if (ctx->reduxLen <= 0){ + return reduxCleanupMsg(ctx, GA_INVALID_ERROR, + "List of dimensions to be reduced is empty!\n"); + }else if (ctx->src->nd < (unsigned)ctx->reduxLen){ + return reduxCleanupMsg(ctx, GA_INVALID_ERROR, + "src has fewer dimensions than there are dimensions to reduce!\n"); + }else if (reduxRequiresDst (ctx) && !ctx->dst){ + return reduxCleanupMsg(ctx, GA_INVALID_ERROR, + "dst is NULL, but reduction requires it!\n"); + }else if (reduxRequiresDstArg(ctx) && !ctx->dstArg){ + return reduxCleanupMsg(ctx, GA_INVALID_ERROR, + "dstArg is NULL, but reduction requires it!\n"); + }else if (ctx->dst && ctx->dst->nd +ctx->reduxLen != ctx->src->nd){ + return reduxCleanupMsg(ctx, GA_INVALID_ERROR, + "dst is of incorrect dimensionality for this reduction!\n"); + }else if (ctx->dstArg && ctx->dstArg->nd+ctx->reduxLen != ctx->src->nd){ + return reduxCleanupMsg(ctx, GA_INVALID_ERROR, + "dstArg is of incorrect dimensionality for this reduction!\n"); } - - - /** - * We initialize some more parts of the context, using the guarantees - * we now have about the sanity of the arguments. - */ - ctx->nds = ctx->src->nd; ctx->ndr = ctx->reduxLen; ctx->ndd = ctx->nds - ctx->ndr; - strb_ensure(&ctx->s, 3*1024); - - - /** - * And make a few small dynamic memory allocations for the benefit of the - * rest of the code, allowing error checking to happen early and fail fast. - */ - - ctx->srcAxisList = malloc(ctx->nds * sizeof(unsigned)); - ctx->dstDims = malloc(ctx->ndd * sizeof(size_t)); - if (!ctx->srcAxisList || - !ctx->dstDims ){ - return reduxCleanup(ctx, GA_MEMORY_ERROR); + ctx->ndw = 0; + ctx->ndp = 0; + ctx->ndf = 0; + ctx->ndt = ctx->ndd + 1; + + /* Insane reduxList? */ + for (i=0;indr;i++){ + j = ctx->reduxList[i]; + if (j < -ctx->nds || j >= ctx->nds){ + return reduxCleanupMsg(ctx, GA_INVALID_ERROR, + "Insane axis number %d! Should be [%d, %d)!\n", + j, -ctx->nds, ctx->nds); + } + j = j<0 ? 
ctx->nds+j : j; + d = ctx->src->dimensions[j]; + ctx->zeroRdxAxes += !d; + ctx->prodRdxAxes *= d?d:1; } /** - * Query device for approximate total level of parallelism. If destination - * tensor is so big it can keep all threads busy on individual elements, - * use large code model; Otherwise use small code model, where threads will - * have to cooperate. - * - * - Large (Destination tensor >= SMALL_REDUX_THRESHOLD elements, or - * destination tensor size >= # of reductions per destination - * tensor element): - * All destination elements have their own thread. - * - Small (otherwise): - * Multiple threads cooperate on a single destination element. + * Insane shape? + * + * The source tensor is allowed to be empty (its shape may contain 0s). + * However, all axes that are of length 0 must be reduction axes. + * + * The reason for this is that a reduction cannot store any output into an + * empty destination tensor (whose dimensions are the free axes), because + * it has 0 space. The operation cannot then fulfill its contract. + * + * On the other hand, when some or all reduction axes of a tensor are of + * length 0, the reduction can be interpreted as initializing the + * destination tensor to the identity value of the operation. For lack of a + * better idea, the destination argument tensor can then be zeroed. */ - ret = gpucontext_property(ctx->gpuCtx, GA_CTX_PROP_NUMPROCS, &numProcs); - if (ret != GA_NO_ERROR){ - return reduxCleanup(ctx, ret); + for (i=0;inds;i++){ + d = ctx->src->dimensions[i]; + ctx->zeroAllAxes += !d; + ctx->prodAllAxes *= d?d:1; } - ret = gpucontext_property(ctx->gpuCtx, GA_CTX_PROP_MAXLSIZE, &localSize); - if (ret != GA_NO_ERROR){ - return reduxCleanup(ctx, ret); + if (ctx->zeroAllAxes != ctx->zeroRdxAxes){ + return reduxCleanupMsg(ctx, GA_INVALID_ERROR, + "Source tensor has length-0 dimensions that are not reduced!"); } + ctx->prodFreeAxes = ctx->prodAllAxes/ctx->prodRdxAxes; - for (i=j=0;inds;i++){ - if (axisInSet(i, ctx->reduxList, ctx->nds, NULL)){ - reduxPerElem *= ctx->src->dimensions[i]; - }else{ - dstNumElem *= ctx->src->dimensions[i]; - ctx->dstDims[j++] = ctx->src->dimensions[i];; - } - } - ctx->largeCodeModel = dstNumElem >= numProcs*localSize || - dstNumElem >= reduxPerElem - || 1;/* BUG: Erase when small code model implemented. */ /** - * *** IT IS NOW SAFE TO CALL: *** - * - reduxIsLargeModel() - * - reduxIsSmallModel() - * - reduxKernelRequiresDst() - * - reduxKernelRequiresDstArg() + * GPU context non-existent, or cannot read its properties? */ - - /** - * Allocate workspaces. - * - * Certain reductions may require a workspace that isn't provided by the user. - * For instance, **when using the small code model**, argmin/argmax require - * a dst buffer, but the user didn't supply one (as he would have for - * maxandargmax/minandargmin). We must allocate and deallocate it ourselves. - * - * Otherwise we use the user-supplied buffers. 
- */ - - if (!reduxRequiresDst (ctx) && reduxKernelRequiresDst(ctx)){ - ctx->wsDst = malloc(sizeof(*ctx->wsDst)); - if (!ctx->wsDst){ - return reduxCleanup(ctx, GA_MEMORY_ERROR); - } - - ret = GpuArray_empty(ctx->wsDst, ctx->gpuCtx, ctx->dstTypeCode, - ctx->ndd, ctx->dstDims, GA_C_ORDER); - if(ret != GA_NO_ERROR){ - return reduxCleanup(ctx, ret); - } - }else{ - ctx->wsDst = ctx->dst; - } - if (!reduxRequiresDstArg(ctx) && reduxKernelRequiresDstArg(ctx)){ - ctx->wsDstArg = malloc(sizeof(*ctx->wsDstArg)); - if (!ctx->wsDstArg){ - return reduxCleanup(ctx, GA_MEMORY_ERROR); - } - - ret = GpuArray_empty(ctx->wsDstArg, ctx->gpuCtx, ctx->dstArgTypeCode, - ctx->ndd, ctx->dstDims, GA_C_ORDER); - if(ret != GA_NO_ERROR){ - return reduxCleanup(ctx, ret); - } - }else{ - ctx->wsDstArg = ctx->dstArg; + ctx->gpuCtx = GpuArray_context(ctx->src); + if (!ctx->gpuCtx || + gpucontext_property(ctx->gpuCtx, GA_CTX_PROP_NUMPROCS, &ctx->numProcs) != GA_NO_ERROR || + gpucontext_property(ctx->gpuCtx, GA_CTX_PROP_MAXLSIZE, &ctx->maxLg) != GA_NO_ERROR || + gpudata_property(ctx->src->data, GA_CTX_PROP_MAXLSIZE0, &ctx->maxLs[0]) != GA_NO_ERROR || + gpudata_property(ctx->src->data, GA_CTX_PROP_MAXLSIZE1, &ctx->maxLs[1]) != GA_NO_ERROR || + gpudata_property(ctx->src->data, GA_CTX_PROP_MAXLSIZE2, &ctx->maxLs[2]) != GA_NO_ERROR || + gpudata_property(ctx->src->data, GA_CTX_PROP_MAXGSIZE, &ctx->maxGg) != GA_NO_ERROR || + gpudata_property(ctx->src->data, GA_CTX_PROP_MAXGSIZE0, &ctx->maxGs[0]) != GA_NO_ERROR || + gpudata_property(ctx->src->data, GA_CTX_PROP_MAXGSIZE1, &ctx->maxGs[1]) != GA_NO_ERROR || + gpudata_property(ctx->src->data, GA_CTX_PROP_MAXGSIZE2, &ctx->maxGs[2]) != GA_NO_ERROR ){ + /* gpukernel_property(ctx->kernel.k, GA_KERNEL_PROP_PREFLSIZE, &warpSize); */ + return reduxCleanupMsg(ctx, GA_INVALID_ERROR, + "Error obtaining one or more properties from GPU context!\n"); } + ctx->warpSize = 32; + /** + * Type management. + * + * - Deal with the various typecodes. + * - Determine initializer and error out if reduction unsupported on that + * datatype. + */ - return reduxSelectHwAxes(ctx); -} - -/** - * @brief Select types for the reduction kernel's implementation. - * - * There are 5 types of relevance: - * - Source (S=Source) - * - Destination (T=Target) - * - Destination Argument (A=Arg) - * - Index (X=indeX) - * - Accumulator (K=aKKumulator/reduction) - */ - -static void reduxSelectTypes (redux_ctx* ctx){ - /* Deal with the various typecodes. 
*/ ctx->srcTypeCode = ctx->src->typecode; ctx->dstTypeCode = ctx->srcTypeCode; ctx->dstArgTypeCode = GA_SSIZE; @@ -1060,179 +1501,330 @@ static void reduxSelectTypes (redux_ctx* ctx){ case GA_HALF2: ctx->accTypeCode = GA_FLOAT2; break; - case GA_HALF4: - ctx->accTypeCode = GA_FLOAT4; + case GA_HALF4: + ctx->accTypeCode = GA_FLOAT4; + break; + case GA_HALF8: + ctx->accTypeCode = GA_FLOAT8; + break; + case GA_HALF16: + ctx->accTypeCode = GA_FLOAT16; + break; + default: + ctx->accTypeCode = ctx->srcTypeCode; + } + ctx->srcTypeStr = gpuarray_get_type(ctx->srcTypeCode) ->cluda_name; + ctx->dstTypeStr = gpuarray_get_type(ctx->dstTypeCode) ->cluda_name; + ctx->dstArgTypeStr = gpuarray_get_type(ctx->dstArgTypeCode)->cluda_name; + ctx->idxTypeStr = gpuarray_get_type(ctx->idxTypeCode) ->cluda_name; + ctx->accTypeStr = gpuarray_get_type(ctx->accTypeCode) ->cluda_name; + if (!ctx->srcTypeStr || + !ctx->dstTypeStr || + !ctx->dstArgTypeStr || + !ctx->idxTypeStr || + !ctx->accTypeStr ){ + return reduxCleanup(ctx, GA_INVALID_ERROR); + } + switch (ctx->op){ + case GA_REDUCE_SUM: + retT = reduxGetSumInit (ctx->dstTypeCode, &ctx->initValT); + retK = reduxGetSumInit (ctx->accTypeCode, &ctx->initValK); + break; + case GA_REDUCE_PRODNZ: + case GA_REDUCE_PROD: + retT = reduxGetProdInit(ctx->dstTypeCode, &ctx->initValT); + retK = reduxGetProdInit(ctx->accTypeCode, &ctx->initValK); + break; + case GA_REDUCE_MINANDARGMIN: + case GA_REDUCE_ARGMIN: + case GA_REDUCE_MIN: + retT = reduxGetMinInit (ctx->dstTypeCode, &ctx->initValT); + retK = reduxGetMinInit (ctx->accTypeCode, &ctx->initValK); + break; + case GA_REDUCE_MAXANDARGMAX: + case GA_REDUCE_ARGMAX: + case GA_REDUCE_MAX: + retT = reduxGetMaxInit (ctx->dstTypeCode, &ctx->initValT); + retK = reduxGetMaxInit (ctx->accTypeCode, &ctx->initValK); break; - case GA_HALF8: - ctx->accTypeCode = GA_FLOAT8; + case GA_REDUCE_ALL: + case GA_REDUCE_AND: + retT = reduxGetAndInit (ctx->dstTypeCode, &ctx->initValT); + retK = reduxGetAndInit (ctx->accTypeCode, &ctx->initValK); break; - case GA_HALF16: - ctx->accTypeCode = GA_FLOAT16; + case GA_REDUCE_ANY: + case GA_REDUCE_XOR: + case GA_REDUCE_OR: + retT = reduxGetOrInit (ctx->dstTypeCode, &ctx->initValT); + retK = reduxGetOrInit (ctx->accTypeCode, &ctx->initValK); break; default: - ctx->accTypeCode = ctx->srcTypeCode; + retT = GA_UNSUPPORTED_ERROR; + retK = GA_UNSUPPORTED_ERROR; + } + if (retT != GA_NO_ERROR){ + return reduxCleanupMsg(ctx, retT, + "Problem selecting types to be used in reduction!\n"); + } + if (retK != GA_NO_ERROR){ + return reduxCleanupMsg(ctx, retK, + "Problem selecting types to be used in reduction!\n"); } - /* Get the string version as well. */ - ctx->srcTypeStr = gpuarray_get_type(ctx->srcTypeCode) ->cluda_name; - ctx->dstTypeStr = gpuarray_get_type(ctx->dstTypeCode) ->cluda_name; - ctx->dstArgTypeStr = gpuarray_get_type(ctx->dstArgTypeCode)->cluda_name; - ctx->idxTypeStr = gpuarray_get_type(ctx->idxTypeCode) ->cluda_name; - ctx->accTypeStr = gpuarray_get_type(ctx->accTypeCode) ->cluda_name; -} - -/** - * @brief Returns whether we are using the small code model or not. - */ -static int reduxIsSmallCodeModel (redux_ctx* ctx){ - return !reduxIsLargeCodeModel(ctx); -} + /** + * Allocate and construct source-tensor axis-description lists. + * + * While constructing the descriptions of each axis, verify that: + * + * 1. reduxLen has no duplicates. + * 2. dst and/or dstArg's dimensions match src's dimensions, stripped of + * the reduction axes. 
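+	 *
+	 *      For example (hypothetical shapes): a src of shape (5,3,7) reduced
+	 *      over axes {0,2} must be paired with dst and/or dstArg of shape (3);
+	 *      supplying a dst of shape (5,3) fails the length checks below.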
+ */ -/** - * @brief Returns whether we are using the large code model or not. - */ + ctx->xdSrc = calloc(ctx->nds, sizeof(*ctx->xdSrc)); + ctx->xdSrcPtrs = calloc(ctx->nds, sizeof(*ctx->xdSrcPtrs)); + ctx->xdSrcFlat = calloc(ctx->nds, sizeof(*ctx->xdSrcFlat)); + ctx->xdTmp = calloc(ctx->ndt, sizeof(*ctx->xdTmp)); + if (!ctx->xdSrc || !ctx->xdSrcPtrs || !ctx->xdSrcFlat || !ctx->xdTmp){ + return reduxCleanup(ctx, GA_MEMORY_ERROR); + } + for (i=0;inds;i++){ + axisInit(&ctx->xdSrc[i], + ctx->src->dimensions[i], + ctx->src->strides[i]); + } + for (i=0;indr;i++){ + j = ctx->reduxList[i]; + j = j<0 ? ctx->nds+j : j; + a = reduxGetSrcAxis(ctx, j); + if (axisIsReduced(a)){ + return reduxCleanupMsg(ctx, GA_INVALID_ERROR, + "Axis %d appears multiple times in the " + "reduction axis list!\n", + j); + } + axisMarkReduced(a, i); + } + for (i=j=0;inds;i++){ + axis_desc* a = reduxGetSrcAxis(ctx, i); + size_t srcLen = axisGetLen(a), dstLen, dstArgLen; + + if (axisIsReduced(a)){continue;} + if (reduxRequiresDst(ctx)){ + dstLen = ctx->dst->dimensions[j]; + + if(srcLen != dstLen){ + return reduxCleanupMsg(ctx, GA_INVALID_ERROR, + "Source axis %d has length %zu, but " + "corresponding destination axis %d has length %zu!\n", + i, srcLen, j, dstLen); + } + + a->dstStride = ctx->dst->strides[j]; + } + if (reduxRequiresDstArg(ctx)){ + dstArgLen = ctx->dstArg->dimensions[j]; + + if(srcLen != dstArgLen){ + return reduxCleanupMsg(ctx, GA_INVALID_ERROR, + "Source axis %d has length %zu, but " + "corresponding destination-argument axis %d has length %zu!\n", + i, srcLen, j, dstArgLen); + } + + a->dstArgStride = ctx->dstArg->strides[j]; + } + + j++; + } -static int reduxIsLargeCodeModel (redux_ctx* ctx){ - return ctx->largeCodeModel; -} -/** - * @brief Returns whether the reduction interface requires a dst argument. - */ + /** + * Begin flattening the source tensor. + */ -static int reduxRequiresDst (redux_ctx* ctx){ - switch (ctx->op){ - case GA_REDUCE_ARGMIN: - case GA_REDUCE_ARGMAX: - return 0; - default: - return 1; - } + return reduxFlattenSource(ctx); } /** - * @brief Returns whether the reduction interface requires a dstArg argument. + * @brief Flatten the source tensor as much as is practical. + * + * This makes the axis lengths as long as possible and the tensor itself as + * contiguous as possible. */ -static int reduxRequiresDstArg (redux_ctx* ctx){ - switch (ctx->op){ - case GA_REDUCE_MINANDARGMIN: - case GA_REDUCE_MAXANDARGMAX: - case GA_REDUCE_ARGMIN: - case GA_REDUCE_ARGMAX: - return 1; - default: - return 0; +static int reduxFlattenSource (redux_ctx* ctx){ + axis_desc* axis, *flatAxis, *sortAxis; + int i, j, isSensitive; + + /** + * Copy source axis descriptions list to flattened source axis description + * list, in preparation for attempts at flattening. + */ + + memcpy(ctx->xdSrcFlat, ctx->xdSrc, ctx->nds*sizeof(*ctx->xdSrcFlat)); + ctx->ndf = ctx->nds; + + /** + * Pass 1: Flatten out 0-length dimensions. We already know that + * + * a) There are no 0-length free dimensions, because that + * constitutes an invalid input, and + * b) How many 0-length reduction dimensions there are, because + * we counted them in the error-checking code. + * + * So if there are any 0-length axes, we can delete all reduction axes and + * replace them with a single one. 
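+	 *
+	 *      e.g. (hypothetical) a src of shape (4,0,3,5) with axes 1 and 3
+	 *      reduced keeps its free axes (4,3) and collapses all reduction axes
+	 *      into a single length-0 reduced axis, since every destination
+	 *      element is then simply the initial (identity) value.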
+ */ + + if (ctx->zeroRdxAxes > 0){ + for (i=j=0;indf;i++){ + axis = reduxGetSrcFlatAxis(ctx, i); + + if (!axisIsReduced(axis)){ + *reduxGetSrcFlatAxis(ctx, j++) = *axis; + } + } + + axisInit (reduxGetSrcFlatAxis(ctx, j), 0, 0); + axisMarkReduced(reduxGetSrcFlatAxis(ctx, j), 0); + j++; + ctx->ndf = j; + } + + /** + * Pass 2: Flatten out 1-length dimensions, since they can always be + * ignored; They are always indexed at [0]. + */ + + for (i=j=0;indf;i++){ + axis = reduxGetSrcFlatAxis(ctx, i); + + if (axisGetLen(axis) != 1){ + *reduxGetSrcFlatAxis(ctx, j++) = *axis; + } + } + ctx->ndf = j; + + /** + * Pass 3: Flatten out continuous dimensions, where strides and sensitivity + * allows it. + */ + + isSensitive = reduxIsSensitive(ctx); + + qsort(ctx->xdSrcFlat, ctx->ndf, sizeof(*ctx->xdSrcFlat), + isSensitive ? reduxSortFlatSensitive : reduxSortFlatInsensitive); + + for (i=j=1;indf;i++){ + flatAxis = reduxGetSrcFlatAxis(ctx, j-1); + sortAxis = reduxGetSrcFlatAxis(ctx, i); + + if (!reduxTryFlattenInto(ctx, flatAxis, sortAxis)){ + *reduxGetSrcFlatAxis(ctx, j++) = *sortAxis; + } } + ctx->ndf = j; + + return reduxSelectWarpAxes(ctx); } /** - * @brief Returns whether the generated kernel internally requires a dst - * argument. - * - * This is semantically subtly different from reduxHasDst(). The main - * difference is in the implementation of the GA_REDUCE_ARGMIN/ARGMAX - * reductions; Either *might* require a dst buffer, which will have to be - * allocated, even though it will be discared. + * @brief Select the warp axes in such a way as to maximize memory bandwidth. */ -static int reduxKernelRequiresDst (redux_ctx* ctx){ - switch (ctx->op){ - case GA_REDUCE_ARGMIN: - case GA_REDUCE_ARGMAX: - return reduxIsSmallCodeModel(ctx); - default: - return 1; - } -} +static int reduxSelectWarpAxes (redux_ctx* ctx){ + axis_desc* a; + int i; + size_t aL; -/** - * @brief Returns whether the generated kernel internally requires a dstArg - * argument. - * - * This is semantically subtly different from reduxHasDstArg(), since it asks - * whether the reduction, even though it does not accept a dstArg argument, - * still requires a dstArg internally. - */ -static int reduxKernelRequiresDstArg (redux_ctx* ctx){ /** - * At present there exists no reduction whose implementation requires - * a dstArg but whose interface does not. - * - * E.g. the max() and min() reductions do NOT currently require a temporary - * buffer for indexes, and will not in the foreseeable future. + * NOTE: At this point it is possible for there to be no axes + * (ctx->ndf == 0), but this will only occur if all axes of the original + * tensor were length-1 (i.e., if this was a scalar masquerading as a + * multidimensional tensor). + * + * We check for this case and simulate a 1-dimensional, 1-length tensor. */ - return reduxRequiresDstArg(ctx); -} + if(ctx->ndf == 0){ + axisInit (reduxGetSrcFlatAxis(ctx, ctx->ndf), 1, 0); + axisMarkReduced(reduxGetSrcFlatAxis(ctx, ctx->ndf), 0); + ctx->ndf = 1; + } -/** - * @brief Check whether we can add another reduction axis or free axis - * to the hardware axis list for either the primary or secondary kernel. - */ -static int reduxCanAppendHwAxis (redux_ctx* ctx, - int kernelType, - int axisType){ - int kernelNdh = kernelType == KERNEL_PRIMARY ? ctx->pri.ndh : ctx->aux.ndh; - int kernelNdhr = kernelType == KERNEL_PRIMARY ? ctx->pri.ndhr : ctx->aux.ndhr; - int kernelNdhd = kernelType == KERNEL_PRIMARY ? ctx->pri.ndhd : ctx->aux.ndhd; + /** + * Select Warp Axes. 
+ * + * Using a particular heuristic order (*), sort the axis list by + * suitability for belonging to the warp. Then, pick the first few axes, + * until the product of their lengths exceeds the warp size. + * + * (*) See documentation of value-comparison function. + */ - if (kernelNdh >= MAX_HW_DIMS){ - return 0; - }else{ - return axisType == AXIS_REDUX ? kernelNdhr < ctx->ndr: - kernelNdhd < ctx->ndd; + for(i=0;indf;i++){ + ctx->xdSrcPtrs[i] = reduxGetSrcFlatAxis(ctx, i); } -} -/** - * @brief Append the largest reduction axis or free axis that isn't yet - * in the hardware axis list for either the primary or secondary kernel - * into said hardware axis list. - */ + qsort(ctx->xdSrcPtrs, ctx->ndf, sizeof(*ctx->xdSrcPtrs), reduxSortWarp); -static void reduxAppendLargestAxisToHwList(redux_ctx* ctx, - int kernelType, - int axisType){ - int maxI = 0, i, isInHwList, isInReduxList, isInDesiredList, isLargestSoFar; - int* hwAxisList, * ndh, * ndhr, * ndhd; - size_t v, maxV = 0; + for (i=0;indf;i++){ + a = reduxGetSrcSortAxis(ctx, i); + aL = axisGetLen(a); + if (aL <= 1){break;} + + ctx->prodWarpAxes *= aL; + if (ctx->prodWarpAxes <= ctx->warpSize){ + axisMarkWarp(a, aL); + ctx->ndw++; + }else{ + /** + * The product of warp lengths just exceeded warpSize. We backtrack + * by undoing the multiplication by aL. We then check whether we + * can "split" this axis by extracting at least a factor of 2 into + * warpLen. If yes, we mark is as the (only) warp axis that is + * split by setting its warpLen to something neither 0 nor len. + */ + + ctx->prodWarpAxes /= aL; + aL = ctx->warpSize/ctx->prodWarpAxes; + if (aL >= 2){ + axisMarkWarp(a, aL); + ctx->prodWarpAxes *= aL; + ctx->splitWarpAxis = i; + ctx->ndw++; + ctx->ndp++; + } + break; + } + } - /* Get pointers to the correct kernel's variables */ - hwAxisList = kernelType == KERNEL_PRIMARY ? ctx->pri.axisList: - ctx->aux.axisList; - ndh = kernelType == KERNEL_PRIMARY ? &ctx->pri.ndh: - &ctx->aux.ndh; - ndhr = kernelType == KERNEL_PRIMARY ? &ctx->pri.ndhr: - &ctx->aux.ndhr; - ndhd = kernelType == KERNEL_PRIMARY ? &ctx->pri.ndhd: - &ctx->aux.ndhd; - /* Find */ - for (i=0;inds;i++){ - isInHwList = axisInSet(i, hwAxisList, *ndh, 0); - isInReduxList = axisInSet(i, ctx->reduxList, ctx->ndr, 0); - isInDesiredList = axisType == AXIS_REDUX ? isInReduxList: - !isInReduxList; - v = ctx->src->dimensions[i]; - isLargestSoFar = v >= maxV; + return reduxSelectNumStages(ctx); +} - if (!isInHwList && isInDesiredList && isLargestSoFar){ - maxV = v; - maxI = i; - } - } +/** + * @brief Select the number of stages of the reduction. + * + * This depends a lot on the GPU and the specific size of the reduction. + */ - /* Append */ - hwAxisList[(*ndh)++] = maxI; - if (axisType == AXIS_REDUX){ - (*ndhr)++; +static int reduxSelectNumStages (redux_ctx* ctx){ + size_t parallelism = 2 * ctx->numProcs * ctx->maxLg; + + if(ctx->zeroRdxAxes || /* Reduction is empty? */ + ctx->prodFreeAxes > ctx->prodRdxAxes || /* Large # of destination elements? */ + ctx->prodFreeAxes > parallelism ){ /* # of destination elements large enough to fill available parallelism? */ + ctx->numStages = 1; }else{ - (*ndhd)++; + ctx->numStages = 2; } + + return reduxSelectHwAxes(ctx); } /** @@ -1255,7 +1847,67 @@ static void reduxAppendLargestAxisToHwList(redux_ctx* ctx, * largest free axes are selected. 
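 *
 *        e.g. (hypothetical lengths) in the large code model, with flattened
 *        free axes of lengths (64, 5, 1000, 2), the largest free axes (1000,
 *        then 64, then 5, ...) are appended one at a time until MAX_HW_DIMS
 *        hardware axes have been chosen.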
*/ -static int reduxSelectHwAxes (redux_ctx* ctx){ +static int reduxSelectHwAxes (redux_ctx* ctx){ + int ret; + + ctx->srcAxisList = malloc(ctx->nds * sizeof(unsigned)); + ctx->dstDims = malloc(ctx->ndd * sizeof(size_t)); + if (!ctx->srcAxisList || + !ctx->dstDims ){ + return reduxCleanup(ctx, GA_MEMORY_ERROR); + } + + ctx->largeCodeModel = 1;/* BUG: Erase when small code model fixed. */ + /** + * *** IT IS NOW SAFE TO CALL: *** + * - reduxIsLargeModel() + * - reduxIsSmallModel() + * - reduxKernelRequiresDst() + * - reduxKernelRequiresDstArg() + */ + + + /** + * Allocate workspaces. + * + * Certain reductions may require a workspace that isn't provided by the user. + * For instance, **when using the small code model**, argmin/argmax require + * a dst buffer, but the user didn't supply one (as he would have for + * maxandargmax/minandargmin). We must allocate and deallocate it ourselves. + * + * Otherwise we use the user-supplied buffers. + */ + + if (!reduxRequiresDst (ctx) && reduxKernelRequiresDst(ctx)){ + ctx->wsDst = malloc(sizeof(*ctx->wsDst)); + if (!ctx->wsDst){ + return reduxCleanup(ctx, GA_MEMORY_ERROR); + } + + ret = GpuArray_empty(ctx->wsDst, ctx->gpuCtx, ctx->dstTypeCode, + ctx->ndd, ctx->dstDims, GA_C_ORDER); + if (ret != GA_NO_ERROR){ + return reduxCleanup(ctx, ret); + } + }else{ + ctx->wsDst = ctx->dst; + } + if (!reduxRequiresDstArg(ctx) && reduxKernelRequiresDstArg(ctx)){ + ctx->wsDstArg = malloc(sizeof(*ctx->wsDstArg)); + if (!ctx->wsDstArg){ + return reduxCleanup(ctx, GA_MEMORY_ERROR); + } + + ret = GpuArray_empty(ctx->wsDstArg, ctx->gpuCtx, ctx->dstArgTypeCode, + ctx->ndd, ctx->dstDims, GA_C_ORDER); + if (ret != GA_NO_ERROR){ + return reduxCleanup(ctx, ret); + } + }else{ + ctx->wsDstArg = ctx->dstArg; + } + + if (reduxIsLargeCodeModel(ctx)){ while (reduxCanAppendHwAxis (ctx, KERNEL_PRIMARY, AXIS_FREE)){ reduxAppendLargestAxisToHwList(ctx, KERNEL_PRIMARY, AXIS_FREE); @@ -1348,8 +2000,8 @@ static void reduxAppendTensorCallArgs (redux_ctx* ctx, static void reduxAppendMacroDefs (redux_ctx* ctx){ int i; - srcbAppends (&ctx->srcGen, "#define FOROVER(idx) for(i##idx = i##idx##Start; i##idx < i##idx##End; i##idx++)\n"); - srcbAppends (&ctx->srcGen, "#define ESCAPE(idx) if(i##idx >= i##idx##Dim){continue;}\n"); + srcbAppends (&ctx->srcGen, "#define FOROVER(idx) for (i##idx = i##idx##Start; i##idx < i##idx##End; i##idx++)\n"); + srcbAppends (&ctx->srcGen, "#define ESCAPE(idx) if (i##idx >= i##idx##Dim){continue;}\n"); /* srcVal indexer */ srcbAppends (&ctx->srcGen, "#define srcVal (*(const GLOBAL_MEM S*)("); @@ -1480,10 +2132,10 @@ static void reduxAppendPrototype (redux_ctx* ctx){ reduxAppendTensorDeclArgs(ctx, "S", "src"); srcbAppendElemf(&ctx->srcGen, "const GLOBAL_MEM X* srcSize"); srcbAppendElemf(&ctx->srcGen, "const GLOBAL_MEM X* chunkSize"); - if(reduxKernelRequiresDst(ctx)){ + if (reduxKernelRequiresDst(ctx)){ reduxAppendTensorDeclArgs(ctx, "T", "dst"); } - if(reduxKernelRequiresDstArg(ctx)){ + if (reduxKernelRequiresDstArg(ctx)){ reduxAppendTensorDeclArgs(ctx, "A", "dstArg"); } srcbEndList (&ctx->srcGen); @@ -1528,12 +2180,12 @@ static void reduxAppendRangeCalculations (redux_ctx* ctx){ for (i=0;inds;i++){ strb_appendf(&ctx->s, "\ti%dSStep = srcSteps[%d];\n", i, ctx->srcAxisList[i]); } - if(reduxKernelRequiresDst(ctx)){ + if (reduxKernelRequiresDst(ctx)){ for (i=0;indd;i++){ strb_appendf(&ctx->s, "\ti%dDStep = dstSteps[%d];\n", i, i); } } - if(reduxKernelRequiresDstArg(ctx)){ + if (reduxKernelRequiresDstArg(ctx)){ for (i=0;indd;i++){ strb_appendf(&ctx->s, "\ti%dAStep = 
dstArgSteps[%d];\n", i, i); } @@ -1623,14 +2275,14 @@ static void reduxAppendLoops (redux_ctx* ctx){ case GA_REDUCE_ARGMIN: case GA_REDUCE_MINANDARGMIN: srcbAppends(&ctx->srcGen, "\t\t\trdxK = min(rdxK, k);\n" - "\t\t\tif(rdxK == k){\n" + "\t\t\tif (rdxK == k){\n" "\t\t\t\trdxA = rdxIdx;\n" "\t\t\t}\n"); break; case GA_REDUCE_ARGMAX: case GA_REDUCE_MAXANDARGMAX: srcbAppends(&ctx->srcGen, "\t\t\trdxK = max(rdxK, k);\n" - "\t\t\tif(rdxK == k){\n" + "\t\t\tif (rdxK == k){\n" "\t\t\t\trdxA = rdxIdx;\n" "\t\t\t}\n"); break; @@ -2096,14 +2748,18 @@ static int reduxInvoke (redux_ctx* ctx){ * Cleanup */ -static int reduxCleanup (redux_ctx* ctx, int ret){ +static int reduxCleanup (redux_ctx* ctx, int ret){ if (ctx->dst != ctx->wsDst){ - GpuArray_clear(ctx->wsDst); + if(ctx->wsDst){ + GpuArray_clear(ctx->wsDst); + } free(ctx->wsDst); ctx->wsDst = NULL; } if (ctx->dstArg != ctx->wsDstArg){ - GpuArray_clear(ctx->wsDstArg); + if(ctx->wsDstArg){ + GpuArray_clear(ctx->wsDstArg); + } free(ctx->wsDstArg); ctx->wsDstArg = NULL; } @@ -2133,3 +2789,20 @@ static int reduxCleanup (redux_ctx* ctx, int ret){ return ret; } + +static int reduxCleanupMsg (redux_ctx* ctx, int ret, + const char* fmt, ...){ +#if DEBUG + FILE* fp = stderr; + + va_list ap; + va_start(ap, fmt); + vfprintf(fp, fmt, ap); + va_end(ap); + fflush(fp); +#else + (void)fmt; +#endif + + return reduxCleanup(ctx, ret); +} diff --git a/tests/check_reduction.c b/tests/check_reduction.c index 370f074167..18cf9e7615 100644 --- a/tests/check_reduction.c +++ b/tests/check_reduction.c @@ -113,7 +113,7 @@ START_TEST(test_maxandargmax_reduction){ ga_assert_ok(GpuArray_memset(&gaMax, -1)); /* 0xFFFFFFFF is a qNaN. */ ga_assert_ok(GpuArray_memset(&gaArgmax, -1)); - ga_assert_ok(GpuArray_maxandargmax(&gaMax, &gaArgmax, &gaSrc, 2, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_MAXANDARGMAX, &gaMax, &gaArgmax, &gaSrc, 2, reduxList)); ga_assert_ok(GpuArray_read(pMax, sizeof(*pMax) *dims[1], &gaMax)); ga_assert_ok(GpuArray_read(pArgmax, sizeof(*pArgmax)*dims[1], &gaArgmax)); @@ -205,7 +205,7 @@ START_TEST(test_maxandargmax_idxtranspose){ ga_assert_ok(GpuArray_memset(&gaMax, -1)); /* 0xFFFFFFFF is a qNaN. */ ga_assert_ok(GpuArray_memset(&gaArgmax, -1)); - ga_assert_ok(GpuArray_maxandargmax(&gaMax, &gaArgmax, &gaSrc, 2, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_MAXANDARGMAX, &gaMax, &gaArgmax, &gaSrc, 2, reduxList)); ga_assert_ok(GpuArray_read(pMax, sizeof(*pMax) *rdxProdDims, &gaMax)); ga_assert_ok(GpuArray_read(pArgmax, sizeof(*pArgmax)*rdxProdDims, &gaArgmax)); @@ -294,7 +294,7 @@ START_TEST(test_maxandargmax_veryhighrank){ ga_assert_ok(GpuArray_memset(&gaMax, -1)); /* 0xFFFFFFFF is a qNaN. */ ga_assert_ok(GpuArray_memset(&gaArgmax, -1)); - ga_assert_ok(GpuArray_maxandargmax(&gaMax, &gaArgmax, &gaSrc, 4, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_MAXANDARGMAX, &gaMax, &gaArgmax, &gaSrc, 4, reduxList)); ga_assert_ok(GpuArray_read(pMax, sizeof(*pMax) *rdxProdDims, &gaMax)); ga_assert_ok(GpuArray_read(pArgmax, sizeof(*pArgmax)*rdxProdDims, &gaArgmax)); @@ -393,7 +393,7 @@ START_TEST(test_maxandargmax_alldimsreduced){ ga_assert_ok(GpuArray_memset(&gaMax, -1)); /* 0xFFFFFFFF is a qNaN. 
*/ ga_assert_ok(GpuArray_memset(&gaArgmax, -1)); - ga_assert_ok(GpuArray_maxandargmax(&gaMax, &gaArgmax, &gaSrc, 3, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_MAXANDARGMAX, &gaMax, &gaArgmax, &gaSrc, 3, reduxList)); ga_assert_ok(GpuArray_read(pMax, sizeof(*pMax), &gaMax)); ga_assert_ok(GpuArray_read(pArgmax, sizeof(*pArgmax), &gaArgmax)); @@ -481,7 +481,7 @@ START_TEST(test_minandargmin_reduction){ ga_assert_ok(GpuArray_memset(&gaMin, -1)); /* 0xFFFFFFFF is a qNaN. */ ga_assert_ok(GpuArray_memset(&gaArgmin, -1)); - ga_assert_ok(GpuArray_minandargmin(&gaMin, &gaArgmin, &gaSrc, 2, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_MINANDARGMIN, &gaMin, &gaArgmin, &gaSrc, 2, reduxList)); ga_assert_ok(GpuArray_read(pMin, sizeof(*pMin) *dims[1], &gaMin)); ga_assert_ok(GpuArray_read(pArgmin, sizeof(*pArgmin)*dims[1], &gaArgmin)); @@ -570,7 +570,7 @@ START_TEST(test_minandargmin_veryhighrank){ ga_assert_ok(GpuArray_memset(&gaMin, -1)); /* 0xFFFFFFFF is a qNaN. */ ga_assert_ok(GpuArray_memset(&gaArgmin, -1)); - ga_assert_ok(GpuArray_minandargmin(&gaMin, &gaArgmin, &gaSrc, 4, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_MINANDARGMIN, &gaMin, &gaArgmin, &gaSrc, 4, reduxList)); ga_assert_ok(GpuArray_read(pMin, sizeof(*pMin) *rdxProdDims, &gaMin)); ga_assert_ok(GpuArray_read(pArgmin, sizeof(*pArgmin)*rdxProdDims, &gaArgmin)); @@ -669,7 +669,7 @@ START_TEST(test_minandargmin_alldimsreduced){ ga_assert_ok(GpuArray_memset(&gaMin, -1)); /* 0xFFFFFFFF is a qNaN. */ ga_assert_ok(GpuArray_memset(&gaArgmin, -1)); - ga_assert_ok(GpuArray_minandargmin(&gaMin, &gaArgmin, &gaSrc, 3, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_MINANDARGMIN, &gaMin, &gaArgmin, &gaSrc, 3, reduxList)); ga_assert_ok(GpuArray_read(pMin, sizeof(*pMin), &gaMin)); ga_assert_ok(GpuArray_read(pArgmin, sizeof(*pArgmin), &gaArgmin)); @@ -754,7 +754,7 @@ START_TEST(test_argmax_reduction){ ga_assert_ok(GpuArray_write(&gaSrc, pSrc, sizeof(*pSrc)*prodDims)); ga_assert_ok(GpuArray_memset(&gaArgmax, -1)); - ga_assert_ok(GpuArray_argmax(&gaArgmax, &gaSrc, 2, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_ARGMAX, NULL, &gaArgmax, &gaSrc, 2, reduxList)); ga_assert_ok(GpuArray_read(pArgmax, sizeof(*pArgmax)*dims[1], &gaArgmax)); @@ -836,7 +836,7 @@ START_TEST(test_argmax_veryhighrank){ ga_assert_ok(GpuArray_write(&gaSrc, pSrc, sizeof(*pSrc)*prodDims)); ga_assert_ok(GpuArray_memset(&gaArgmax, -1)); - ga_assert_ok(GpuArray_argmax(&gaArgmax, &gaSrc, 4, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_ARGMAX, NULL, &gaArgmax, &gaSrc, 4, reduxList)); ga_assert_ok(GpuArray_read(pArgmax, sizeof(*pArgmax)*rdxProdDims, &gaArgmax)); @@ -929,7 +929,7 @@ START_TEST(test_argmax_alldimsreduced){ ga_assert_ok(GpuArray_write(&gaSrc, pSrc, sizeof(*pSrc)*prodDims)); ga_assert_ok(GpuArray_memset(&gaArgmax, -1)); - ga_assert_ok(GpuArray_argmax(&gaArgmax, &gaSrc, 3, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_ARGMAX, NULL, &gaArgmax, &gaSrc, 3, reduxList)); ga_assert_ok(GpuArray_read(pArgmax, sizeof(*pArgmax), &gaArgmax)); @@ -1011,7 +1011,7 @@ START_TEST(test_argmin_reduction){ ga_assert_ok(GpuArray_write(&gaSrc, pSrc, sizeof(*pSrc)*prodDims)); ga_assert_ok(GpuArray_memset(&gaArgmin, -1)); - ga_assert_ok(GpuArray_argmin(&gaArgmin, &gaSrc, 2, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_ARGMIN, NULL, &gaArgmin, &gaSrc, 2, reduxList)); ga_assert_ok(GpuArray_read(pArgmin, sizeof(*pArgmin)*dims[1], &gaArgmin)); @@ -1093,7 +1093,7 @@ START_TEST(test_argmin_veryhighrank){ 
ga_assert_ok(GpuArray_write(&gaSrc, pSrc, sizeof(*pSrc)*prodDims)); ga_assert_ok(GpuArray_memset(&gaArgmin, -1)); - ga_assert_ok(GpuArray_argmin(&gaArgmin, &gaSrc, 4, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_ARGMIN, NULL, &gaArgmin, &gaSrc, 4, reduxList)); ga_assert_ok(GpuArray_read(pArgmin, sizeof(*pArgmin)*rdxProdDims, &gaArgmin)); @@ -1186,7 +1186,7 @@ START_TEST(test_argmin_alldimsreduced){ ga_assert_ok(GpuArray_write(&gaSrc, pSrc, sizeof(*pSrc)*prodDims)); ga_assert_ok(GpuArray_memset(&gaArgmin, -1)); - ga_assert_ok(GpuArray_argmin(&gaArgmin, &gaSrc, 3, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_ARGMIN, NULL, &gaArgmin, &gaSrc, 3, reduxList)); ga_assert_ok(GpuArray_read(pArgmin, sizeof(*pArgmin), &gaArgmin)); @@ -1265,7 +1265,7 @@ START_TEST(test_max_reduction){ ga_assert_ok(GpuArray_write(&gaSrc, pSrc, sizeof(*pSrc)*prodDims)); ga_assert_ok(GpuArray_memset(&gaMax, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_max(&gaMax, &gaSrc, 2, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_MAX, &gaMax, NULL, &gaSrc, 2, reduxList)); ga_assert_ok(GpuArray_read(pMax, sizeof(*pMax) *dims[1], &gaMax)); @@ -1343,7 +1343,7 @@ START_TEST(test_max_veryhighrank){ ga_assert_ok(GpuArray_write(&gaSrc, pSrc, sizeof(*pSrc)*prodDims)); ga_assert_ok(GpuArray_memset(&gaMax, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_max(&gaMax, &gaSrc, 4, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_MAX, &gaMax, NULL, &gaSrc, 4, reduxList)); ga_assert_ok(GpuArray_read(pMax, sizeof(*pMax) *rdxProdDims, &gaMax)); @@ -1431,7 +1431,7 @@ START_TEST(test_max_alldimsreduced){ ga_assert_ok(GpuArray_write(&gaSrc, pSrc, sizeof(*pSrc)*prodDims)); ga_assert_ok(GpuArray_memset(&gaMax, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_max(&gaMax, &gaSrc, 3, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_MAX, &gaMax, NULL, &gaSrc, 3, reduxList)); ga_assert_ok(GpuArray_read(pMax, sizeof(*pMax), &gaMax)); @@ -1507,7 +1507,7 @@ START_TEST(test_min_reduction){ ga_assert_ok(GpuArray_write(&gaSrc, pSrc, sizeof(*pSrc)*prodDims)); ga_assert_ok(GpuArray_memset(&gaMin, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_min(&gaMin, &gaSrc, 2, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_MIN, &gaMin, NULL, &gaSrc, 2, reduxList)); ga_assert_ok(GpuArray_read(pMin, sizeof(*pMin) *dims[1], &gaMin)); @@ -1585,7 +1585,7 @@ START_TEST(test_min_veryhighrank){ ga_assert_ok(GpuArray_write(&gaSrc, pSrc, sizeof(*pSrc)*prodDims)); ga_assert_ok(GpuArray_memset(&gaMin, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_min(&gaMin, &gaSrc, 4, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_MIN, &gaMin, NULL, &gaSrc, 4, reduxList)); ga_assert_ok(GpuArray_read(pMin, sizeof(*pMin) *rdxProdDims, &gaMin)); @@ -1673,7 +1673,7 @@ START_TEST(test_min_alldimsreduced){ ga_assert_ok(GpuArray_write(&gaSrc, pSrc, sizeof(*pSrc)*prodDims)); ga_assert_ok(GpuArray_memset(&gaMin, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_min(&gaMin, &gaSrc, 3, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_MIN, &gaMin, NULL, &gaSrc, 3, reduxList)); ga_assert_ok(GpuArray_read(pMin, sizeof(*pMin), &gaMin)); @@ -1750,7 +1750,7 @@ START_TEST(test_sum_reduction){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. 
*/ - ga_assert_ok(GpuArray_sum (&gaD, &gaS, 2, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_SUM, &gaD, NULL, &gaS, 2, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*dims[1], &gaD)); @@ -1826,7 +1826,7 @@ START_TEST(test_sum_veryhighrank){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_sum (&gaD, &gaS, 4, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_SUM, &gaD, NULL, &gaS, 4, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*rdxProdDims, &gaD)); @@ -1912,7 +1912,7 @@ START_TEST(test_sum_alldimsreduced){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_sum (&gaD, &gaS, 3, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_SUM, &gaD, NULL, &gaS, 3, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD), &gaD)); @@ -1986,7 +1986,7 @@ START_TEST(test_prod_reduction){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_prod (&gaD, &gaS, 2, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_PROD, &gaD, NULL, &gaS, 2, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*dims[1], &gaD)); @@ -2062,7 +2062,7 @@ START_TEST(test_prod_veryhighrank){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_prod (&gaD, &gaS, 4, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_PROD, &gaD, NULL, &gaS, 4, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*rdxProdDims, &gaD)); @@ -2148,7 +2148,7 @@ START_TEST(test_prod_alldimsreduced){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_prod (&gaD, &gaS, 3, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_PROD, &gaD, NULL, &gaS, 3, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD), &gaD)); @@ -2225,7 +2225,7 @@ START_TEST(test_prodnz_reduction){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_prodnz(&gaD, &gaS, 2, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_PRODNZ, &gaD, NULL, &gaS, 2, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*dims[1], &gaD)); @@ -2304,7 +2304,7 @@ START_TEST(test_prodnz_veryhighrank){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_prodnz(&gaD, &gaS, 4, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_PRODNZ, &gaD, NULL, &gaS, 4, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*rdxProdDims, &gaD)); @@ -2393,7 +2393,7 @@ START_TEST(test_prodnz_alldimsreduced){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. 
*/ - ga_assert_ok(GpuArray_prodnz(&gaD, &gaS, 3, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_PRODNZ, &gaD, NULL, &gaS, 3, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD), &gaD)); @@ -2475,7 +2475,7 @@ START_TEST(test_and_reduction){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); - ga_assert_ok(GpuArray_and (&gaD, &gaS, 2, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_AND, &gaD, NULL, &gaS, 2, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*dims[1], &gaD)); @@ -2559,7 +2559,7 @@ START_TEST(test_and_veryhighrank){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_and (&gaD, &gaS, 4, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_AND, &gaD, NULL, &gaS, 4, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*rdxProdDims, &gaD)); @@ -2653,7 +2653,7 @@ START_TEST(test_and_alldimsreduced){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_and (&gaD, &gaS, 3, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_AND, &gaD, NULL, &gaS, 3, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD), &gaD)); @@ -2735,7 +2735,7 @@ START_TEST(test_or_reduction){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); - ga_assert_ok(GpuArray_or (&gaD, &gaS, 2, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_OR, &gaD, NULL, &gaS, 2, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*dims[1], &gaD)); @@ -2819,7 +2819,7 @@ START_TEST(test_or_veryhighrank){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_or (&gaD, &gaS, 4, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_OR, &gaD, NULL, &gaS, 4, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*rdxProdDims, &gaD)); @@ -2913,7 +2913,7 @@ START_TEST(test_or_alldimsreduced){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_or (&gaD, &gaS, 3, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_OR, &gaD, NULL, &gaS, 3, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD), &gaD)); @@ -2991,7 +2991,7 @@ START_TEST(test_xor_reduction){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); - ga_assert_ok(GpuArray_xor (&gaD, &gaS, 2, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_XOR, &gaD, NULL, &gaS, 2, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*dims[1], &gaD)); @@ -3071,7 +3071,7 @@ START_TEST(test_xor_veryhighrank){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_xor (&gaD, &gaS, 4, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_XOR, &gaD, NULL, &gaS, 4, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*rdxProdDims, &gaD)); @@ -3161,7 +3161,7 @@ START_TEST(test_xor_alldimsreduced){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. 
*/ - ga_assert_ok(GpuArray_xor (&gaD, &gaS, 3, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_XOR, &gaD, NULL, &gaS, 3, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD), &gaD)); @@ -3239,7 +3239,7 @@ START_TEST(test_any_reduction){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); - ga_assert_ok(GpuArray_any (&gaD, &gaS, 2, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_ANY, &gaD, NULL, &gaS, 2, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*dims[1], &gaD)); @@ -3319,7 +3319,7 @@ START_TEST(test_any_veryhighrank){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_any (&gaD, &gaS, 4, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_ANY, &gaD, NULL, &gaS, 4, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*rdxProdDims, &gaD)); @@ -3409,7 +3409,7 @@ START_TEST(test_any_alldimsreduced){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_any (&gaD, &gaS, 3, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_ANY, &gaD, NULL, &gaS, 3, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD), &gaD)); @@ -3487,7 +3487,7 @@ START_TEST(test_all_reduction){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); - ga_assert_ok(GpuArray_all (&gaD, &gaS, 2, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_ALL, &gaD, NULL, &gaS, 2, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*dims[1], &gaD)); @@ -3567,7 +3567,7 @@ START_TEST(test_all_veryhighrank){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_all (&gaD, &gaS, 4, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_ALL, &gaD, NULL, &gaS, 4, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*rdxProdDims, &gaD)); @@ -3657,7 +3657,7 @@ START_TEST(test_all_alldimsreduced){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. 
*/ - ga_assert_ok(GpuArray_all (&gaD, &gaS, 3, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_ALL, &gaD, NULL, &gaS, 3, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD), &gaD)); From 1a2df8dc54926d161d13b9813fbc0349302b3f5d Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Tue, 13 Jun 2017 17:06:02 -0400 Subject: [PATCH 12/34] Current State --- src/gpuarray_reduction.c | 167 +++++++++++++++++++++++++++------------ 1 file changed, 118 insertions(+), 49 deletions(-) diff --git a/src/gpuarray_reduction.c b/src/gpuarray_reduction.c index 072f1e2685..1c1721ee4f 100644 --- a/src/gpuarray_reduction.c +++ b/src/gpuarray_reduction.c @@ -41,10 +41,7 @@ struct axis_desc{ int reduxNum; unsigned isReduced : 1; - unsigned isHW : 1; - unsigned isSW : 1; - size_t warpLen; - size_t len; + size_t len, warpLen, sliceLen; ssize_t srcStride, srcOffset; ssize_t dstStride, dstOffset; ssize_t dstArgStride, dstArgOffset; @@ -392,6 +389,7 @@ static int axisIsPartialWarp (const axis_desc* axis); /* Reduction Context API */ /* Utilities */ +static size_t reduxEstimateParallelism (const redux_ctx* ctx); static int reduxRequiresDst (const redux_ctx* ctx); static int reduxRequiresDstArg (const redux_ctx* ctx); static int reduxKernelRequiresDst (const redux_ctx* ctx); @@ -417,6 +415,8 @@ static int reduxInferProperties (redux_ctx* ctx); static int reduxFlattenSource (redux_ctx* ctx); static int reduxSelectWarpAxes (redux_ctx* ctx); static int reduxSelectNumStages (redux_ctx* ctx); +static int reduxPlan1Stage (redux_ctx* ctx); +static int reduxPlan2Stage (redux_ctx* ctx); static int reduxSelectHwAxes (redux_ctx* ctx); static int reduxComputeAxisList (redux_ctx* ctx); static int reduxGenSource (redux_ctx* ctx); @@ -1010,6 +1010,28 @@ static int axisIsPartialWarp (const axis_desc* axis){ return axis->warpLen > 0 && axis->warpLen != axis->len; } +/** + * @brief Estimate the level of parallelism in the device. + * + * This is a rough target number of threads. It would definitely fill the + * device, plus some substantial margin. + */ + +static size_t reduxEstimateParallelism (const redux_ctx* ctx){ + /** + * An arbitrary margin factor ensuring there will be a few thread blocks + * per SMX. + * + * E.g. on Kepler, each SMX can handle up to two 1024-thread blocks + * simultaneously, so a margin of 6/SMX should ensure with very high + * likelyhood that all SMXes will be fed and kept busy. + */ + + size_t marginFactor = 6; + + return marginFactor*ctx->numProcs*ctx->maxLg; +} + /** * @brief Returns whether the reduction interface requires a dst argument. */ @@ -1582,10 +1604,10 @@ static int reduxInferProperties (redux_ctx* ctx){ * the reduction axes. */ - ctx->xdSrc = calloc(ctx->nds, sizeof(*ctx->xdSrc)); - ctx->xdSrcPtrs = calloc(ctx->nds, sizeof(*ctx->xdSrcPtrs)); - ctx->xdSrcFlat = calloc(ctx->nds, sizeof(*ctx->xdSrcFlat)); - ctx->xdTmp = calloc(ctx->ndt, sizeof(*ctx->xdTmp)); + ctx->xdSrc = calloc(ctx->nds, sizeof(*ctx->xdSrc)); + ctx->xdSrcPtrs = calloc(ctx->nds+1, sizeof(*ctx->xdSrcPtrs)); + ctx->xdSrcFlat = calloc(ctx->nds+1, sizeof(*ctx->xdSrcFlat)); + ctx->xdTmp = calloc(ctx->ndt, sizeof(*ctx->xdTmp)); if (!ctx->xdSrc || !ctx->xdSrcPtrs || !ctx->xdSrcFlat || !ctx->xdTmp){ return reduxCleanup(ctx, GA_MEMORY_ERROR); } @@ -1814,15 +1836,62 @@ static int reduxSelectWarpAxes (redux_ctx* ctx){ */ static int reduxSelectNumStages (redux_ctx* ctx){ - size_t parallelism = 2 * ctx->numProcs * ctx->maxLg; + size_t parallelism = reduxEstimateParallelism(ctx); - if(ctx->zeroRdxAxes || /* Reduction is empty? 
*/ - ctx->prodFreeAxes > ctx->prodRdxAxes || /* Large # of destination elements? */ - ctx->prodFreeAxes > parallelism ){ /* # of destination elements large enough to fill available parallelism? */ - ctx->numStages = 1; + if (ctx->zeroRdxAxes || /* Reduction over 0 elements? */ + ctx->prodAllAxes <= ctx->maxLg || /* Reduction over few elements? */ + ctx->prodFreeAxes >= ctx->prodRdxAxes || /* More destinations than reductions? */ + ctx->prodFreeAxes >= parallelism ){ /* Destination very large? */ + return reduxPlan1Stage(ctx); }else{ - ctx->numStages = 2; + return reduxPlan2Stage(ctx); } +} + +/** + * @brief Plan a 1-stage reduction. + * + * Inputs: ctx->xdSrcFlat[0...ctx->ndf-1] + * + * This plan involves a direct write to the destinations, and does not require + * working space. + * + * Because the reduction is deterministic, all reductions required for any + * destination element must be performed within a single thread block. + * + * In this implementation we choose to perform only intra-warp reductions, + * insulating ourselves from having to worry about the interplay between block + * size and kernel source code (A kernel's max block size is limited by + * numerous factors including its own source code, but the specific kernel we + * pick and generate requires foreknowledge of its block size. Chicken or egg). + */ + +static int reduxPlan1Stage (redux_ctx* ctx){ + ctx->numStages = 1; + + + + return reduxSelectHwAxes(ctx); +} + +/** + * @brief Plan a 2-stage reduction. + * + * Inputs: ctx->xdSrcFlat[0...ctx->ndf-1] + * + * This plan involves splitting the reduction into two stages: + * + * Stage 1: A reduction by approximately R = sqrt(prodRdxAxes) elements per + * destination elements into allocated temporary workspace(s) + * of approximate size dst.shape + (prodRdxAxes/R,) + * Stage 2: A reduction by approximately prodRdxAxes/R elements into the + * final destination. + */ + +static int reduxPlan2Stage (redux_ctx* ctx){ + ctx->numStages = 2; + + /* NOTE: Use gpuarray_get_elsize(typecode) */ return reduxSelectHwAxes(ctx); } @@ -1941,7 +2010,7 @@ static int reduxSelectHwAxes (redux_ctx* ctx){ * loops that iterate over the dimensions of elements that are to be reduced. */ -static int reduxComputeAxisList (redux_ctx* ctx){ +static int reduxComputeAxisList (redux_ctx* ctx){ int i, f=0; for (i=0;inds;i++){ @@ -1961,7 +2030,7 @@ static int reduxComputeAxisList (redux_ctx* ctx){ * @return GA_MEMORY_ERROR if not enough memory left; GA_NO_ERROR otherwise. 
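 *
 * As a hedged illustration only (not verbatim generator output; the concrete
 * types, initial values and loop body depend on ctx->op and on the tensor
 * types, and float/long/ssize_t below are placeholders), the source
 * assembled by the reduxAppend*() helpers below is shaped roughly like this
 * for a single-stage max-and-argmax reduction:
 *
 *     #include "cluda.h"
 *     typedef float   S;   // source type
 *     typedef float   T;   // destination type
 *     typedef long    A;   // destination argument (index) type
 *     typedef ssize_t X;   // index/stride type
 *     typedef float   K;   // accumulator type
 *     #define FOROVER(idx) for (i##idx = i##idx##Start; i##idx < i##idx##End; i##idx++)
 *     #define ESCAPE(idx)  if (i##idx >= i##idx##Dim){continue;}
 *     // ... srcVal, dstVal, dstArgVal and rdxIdx indexer macros ...
 *     KERNEL void reduxKer(S* srcPtr,    const X srcOff,    const GLOBAL_MEM X* srcSteps,
 *                          const GLOBAL_MEM X* srcSize,
 *                          const GLOBAL_MEM X* chunkSize,
 *                          T* dstPtr,    const X dstOff,    const GLOBAL_MEM X* dstSteps,
 *                          A* dstArgPtr, const X dstArgOff, const GLOBAL_MEM X* dstArgSteps){
 *         // FOROVER() loops over the free axes; the inner loops over the
 *         // reduced axes fold each srcVal into register accumulators
 *         // (rdxK, plus rdxA for the argument index), then a writeback
 *         // helper stores the result through dstVal/dstArgVal.
 *     }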
*/ -static int reduxGenSource (redux_ctx* ctx){ +static int reduxGenSource (redux_ctx* ctx){ reduxAppendSource(ctx); ctx->sourceCodeLen = ctx->s.l; ctx->sourceCode = strb_cstr(&ctx->s); @@ -1971,7 +2040,7 @@ static int reduxGenSource (redux_ctx* ctx){ return reduxCompile(ctx); } -static void reduxAppendSource (redux_ctx* ctx){ +static void reduxAppendSource (redux_ctx* ctx){ reduxAppendIncludes (ctx); reduxAppendMacroDefs (ctx); reduxAppendTypedefs (ctx); @@ -1983,21 +2052,21 @@ static void reduxAppendSource (redux_ctx* ctx){ reduxAppendPostKernel (ctx); } } -static void reduxAppendTensorDeclArgs (redux_ctx* ctx, - const char* type, - const char* baseName){ +static void reduxAppendTensorDeclArgs (redux_ctx* ctx, + const char* type, + const char* baseName){ srcbAppendElemf(&ctx->srcGen, "%s* %sPtr", type, baseName); srcbAppendElemf(&ctx->srcGen, "const X %sOff", baseName); srcbAppendElemf(&ctx->srcGen, "const GLOBAL_MEM X* %sSteps", baseName); (void)reduxAppendTensorCallArgs;/* Silence unused warning */ } -static void reduxAppendTensorCallArgs (redux_ctx* ctx, - const char* baseName){ +static void reduxAppendTensorCallArgs (redux_ctx* ctx, + const char* baseName){ srcbAppendElemf(&ctx->srcGen, "%sPtr", baseName); srcbAppendElemf(&ctx->srcGen, "%sOff", baseName); srcbAppendElemf(&ctx->srcGen, "%sSteps", baseName); } -static void reduxAppendMacroDefs (redux_ctx* ctx){ +static void reduxAppendMacroDefs (redux_ctx* ctx){ int i; srcbAppends (&ctx->srcGen, "#define FOROVER(idx) for (i##idx = i##idx##Start; i##idx < i##idx##End; i##idx++)\n"); @@ -2049,21 +2118,21 @@ static void reduxAppendMacroDefs (redux_ctx* ctx){ srcbEndList (&ctx->srcGen); srcbAppends (&ctx->srcGen, ")\n"); } -static void reduxAppendIncludes (redux_ctx* ctx){ +static void reduxAppendIncludes (redux_ctx* ctx){ strb_appends(&ctx->s, "/* Includes */\n"); strb_appends(&ctx->s, "#include \"cluda.h\"\n"); strb_appends(&ctx->s, "\n"); strb_appends(&ctx->s, "\n"); strb_appends(&ctx->s, "\n"); } -static void reduxAppendTypedefs (redux_ctx* ctx){ +static void reduxAppendTypedefs (redux_ctx* ctx){ strb_appendf(&ctx->s, "typedef %s S;\n", ctx->srcTypeStr); /* The type of the source array. */ strb_appendf(&ctx->s, "typedef %s T;\n", ctx->dstTypeStr); /* The type of the destination array. */ strb_appendf(&ctx->s, "typedef %s A;\n", ctx->dstArgTypeStr);/* The type of the destination argument array. */ strb_appendf(&ctx->s, "typedef %s X;\n", ctx->idxTypeStr); /* The type of the indices: signed 32/64-bit. */ strb_appendf(&ctx->s, "typedef %s K;\n", ctx->accTypeStr); /* The type of the accumulator variable. */ } -static void reduxAppendGetInitValFns (redux_ctx* ctx){ +static void reduxAppendGetInitValFns (redux_ctx* ctx){ /** * Initial value functions. */ @@ -2075,7 +2144,7 @@ static void reduxAppendGetInitValFns (redux_ctx* ctx){ "\treturn (%s);\n" "}\n\n\n\n", ctx->initValT, ctx->initValK); } -static void reduxAppendWriteBackFn (redux_ctx* ctx){ +static void reduxAppendWriteBackFn (redux_ctx* ctx){ /** * Global memory value reduction function. * @@ -2118,7 +2187,7 @@ static void reduxAppendWriteBackFn (redux_ctx* ctx){ /* Close off function. 
*/ strb_appends(&ctx->s, "}\n\n\n\n"); } -static void reduxAppendReduxKernel (redux_ctx* ctx){ +static void reduxAppendReduxKernel (redux_ctx* ctx){ reduxAppendPrototype (ctx); strb_appends (&ctx->s, "{\n"); reduxAppendIndexDeclarations(ctx); @@ -2126,7 +2195,7 @@ static void reduxAppendReduxKernel (redux_ctx* ctx){ reduxAppendLoops (ctx); strb_appends (&ctx->s, "}\n"); } -static void reduxAppendPrototype (redux_ctx* ctx){ +static void reduxAppendPrototype (redux_ctx* ctx){ srcbAppends (&ctx->srcGen, "KERNEL void reduxKer("); srcbBeginList (&ctx->srcGen, ", ", "void"); reduxAppendTensorDeclArgs(ctx, "S", "src"); @@ -2141,7 +2210,7 @@ static void reduxAppendPrototype (redux_ctx* ctx){ srcbEndList (&ctx->srcGen); srcbAppends (&ctx->srcGen, ")"); } -static void reduxAppendIndexDeclarations (redux_ctx* ctx){ +static void reduxAppendIndexDeclarations (redux_ctx* ctx){ int i; strb_appends(&ctx->s, "\t/* GPU kernel coordinates. Always 3D in OpenCL/CUDA. */\n"); @@ -2168,7 +2237,7 @@ static void reduxAppendIndexDeclarations (redux_ctx* ctx){ if (ctx->nds > ctx->ndd){appendIdxes (&ctx->s, "\tX ", "i", ctx->ndd, ctx->nds, "PDim", ";\n");} strb_appends(&ctx->s, "\t\n\t\n"); } -static void reduxAppendRangeCalculations (redux_ctx* ctx){ +static void reduxAppendRangeCalculations (redux_ctx* ctx){ size_t hwDim; int i; @@ -2229,7 +2298,7 @@ static void reduxAppendRangeCalculations (redux_ctx* ctx){ strb_appends(&ctx->s, "\t\n\t\n"); } -static void reduxAppendLoops (redux_ctx* ctx){ +static void reduxAppendLoops (redux_ctx* ctx){ int i; for (i=0;indd;i++){ @@ -2333,10 +2402,10 @@ static void reduxAppendLoops (redux_ctx* ctx){ srcbAppends(&ctx->srcGen, "\t}\n"); } } -static void reduxAppendInitKernel (redux_ctx* ctx){ +static void reduxAppendInitKernel (redux_ctx* ctx){ /* BUG: Implement this for small code model. */ } -static void reduxAppendPostKernel (redux_ctx* ctx){ +static void reduxAppendPostKernel (redux_ctx* ctx){ /* BUG: Implement this for small code model. */ } @@ -2344,7 +2413,7 @@ static void reduxAppendPostKernel (redux_ctx* ctx){ * @brief Compile the kernel from source code. */ -static int reduxCompile (redux_ctx* ctx){ +static int reduxCompile (redux_ctx* ctx){ int ret, i = 0; int PRI_TYPECODES[11]; size_t PRI_TYPECODES_LEN; @@ -2432,7 +2501,7 @@ static int reduxCompile (redux_ctx* ctx){ * for the primary/auxilliary kernels. */ -static int reduxSchedule (redux_ctx* ctx){ +static int reduxSchedule (redux_ctx* ctx){ int i, priNdims = 0, auxNdims = 0; uint64_t maxLgRdx = 0, maxLgPre = 0, maxLgPost = 0; uint64_t maxLgPri = 0, maxLgAux = 0; @@ -2539,16 +2608,16 @@ static int reduxSchedule (redux_ctx* ctx){ * anything to do with the integer factorization APIs. */ -static void reduxScheduleKernel (int ndims, - uint64_t* dims, - uint64_t warpSize, - uint64_t maxLg, - uint64_t* maxLs, - uint64_t maxGg, - uint64_t* maxGs, - uint64_t* bs, - uint64_t* gs, - uint64_t* cs){ +static void reduxScheduleKernel (int ndims, + uint64_t* dims, + uint64_t warpSize, + uint64_t maxLg, + uint64_t* maxLs, + uint64_t maxGg, + uint64_t* maxGs, + uint64_t* bs, + uint64_t* gs, + uint64_t* cs){ uint64_t warpMod, bestWarpMod = 1; int i, bestWarpAxis = 0; uint64_t roundedDims[MAX_HW_DIMS]; @@ -2634,7 +2703,7 @@ static void reduxScheduleKernel (int ndims, * Invoke the kernel. 
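 *
 * An illustrative note on the argument marshalling (an inference from the
 * prototype built by reduxAppendPrototype() above, not a quote of the code
 * below): when the kernel needs both dst and dstArg, reduxKer takes 11
 * arguments, which is why priArgs is sized [11]. The expected order is:
 *
 *     priArgs[0..2]  = srcPtr, srcOff, srcSteps
 *     priArgs[3]     = srcSize
 *     priArgs[4]     = chunkSize
 *     priArgs[5..7]  = dstPtr, dstOff, dstSteps          (only if dst is needed)
 *     priArgs[8..10] = dstArgPtr, dstArgOff, dstArgSteps (only if dstArg is needed)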
*/ -static int reduxInvoke (redux_ctx* ctx){ +static int reduxInvoke (redux_ctx* ctx){ void* priArgs[11]; void* auxArgs[ 8]; int ret, i = 0; @@ -2790,8 +2859,8 @@ static int reduxCleanup (redux_ctx* ctx, int ret){ return ret; } -static int reduxCleanupMsg (redux_ctx* ctx, int ret, - const char* fmt, ...){ +static int reduxCleanupMsg (redux_ctx* ctx, int ret, + const char* fmt, ...){ #if DEBUG FILE* fp = stderr; From fffd323e8efa735a69fb36ddc2037d77efae9c9b Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Wed, 14 Jun 2017 11:07:12 -0400 Subject: [PATCH 13/34] Remove warp axis select. --- src/gpuarray_reduction.c | 129 --------------------------------------- 1 file changed, 129 deletions(-) diff --git a/src/gpuarray_reduction.c b/src/gpuarray_reduction.c index 1c1721ee4f..123c059964 100644 --- a/src/gpuarray_reduction.c +++ b/src/gpuarray_reduction.c @@ -250,8 +250,6 @@ struct redux_ctx{ int nds; /* # Source dimensions */ int ndr; /* # Reduced dimensions */ int ndd; /* # Destination dimensions */ - int ndw; /* # Warp dimensions */ - int ndp; /* # Partial warp dimensions */ int ndf; /* # Flattened source dimensions */ int ndt; /* # Temporary workspace dimensions */ int zeroAllAxes; /* # of zero-length axes in source tensor */ @@ -259,8 +257,6 @@ struct redux_ctx{ size_t prodAllAxes; /* Product of length of all axes in source tensor */ size_t prodRdxAxes; /* Product of length of all reduction axes in source tensor */ size_t prodFreeAxes; /* Product of length of all free axes in source tensor */ - size_t prodWarpAxes; /* Number of active threads per warp. Strictly <= warpSize. */ - int splitWarpAxis;/* Index of the split warp axis within the source tensor's shape; -1 otherwise. */ gpucontext* gpuCtx; unsigned numProcs; @@ -353,7 +349,6 @@ static int reduxGetAndInit (int typecode, const char** prop static int reduxGetOrInit (int typecode, const char** property); static int reduxSortFlatSensitive (const void* a, const void* b); static int reduxSortFlatInsensitive (const void* a, const void* b); -static int reduxSortWarp (const void* a, const void* b); static int axisInSet (int v, const int* set, size_t setLen, @@ -371,7 +366,6 @@ static void axisInit (axis_desc* axis, ssize_t len, ssize_t srcStride); static void axisMarkReduced (axis_desc* axis, int reduxNum); -static void axisMarkWarp (axis_desc* axis, size_t partialSlice); static int axisGetReduxNum (const axis_desc* axis); static size_t axisGetLen (const axis_desc* axis); static ssize_t axisGetSrcStride (const axis_desc* axis); @@ -384,8 +378,6 @@ static ssize_t axisGetDstArgStride (const axis_desc* axis); static size_t axisGetDstArgAbsStride (const axis_desc* axis); static ssize_t axisGetDstArgOffset (const axis_desc* axis); static int axisIsReduced (const axis_desc* axis); -static int axisIsWarp (const axis_desc* axis); -static int axisIsPartialWarp (const axis_desc* axis); /* Reduction Context API */ /* Utilities */ @@ -413,7 +405,6 @@ static void reduxAppendLargestAxisToHwList(redux_ctx* ctx, static int reduxInit (redux_ctx* ctx); static int reduxInferProperties (redux_ctx* ctx); static int reduxFlattenSource (redux_ctx* ctx); -static int reduxSelectWarpAxes (redux_ctx* ctx); static int reduxSelectNumStages (redux_ctx* ctx); static int reduxPlan1Stage (redux_ctx* ctx); static int reduxPlan2Stage (redux_ctx* ctx); @@ -808,47 +799,6 @@ static int reduxSortFlatSensitive (const void* a, const void* b){ } } -/** - * @brief Sort axes in preferred order for integration into warp. 
- * - * The axes with stride != 0 are sorted by lowest absolute - * stride. Picking the few axes with the lowest absolute stride (while - * keeping the product of their dimensions <= warpSize) should maximize - * memory bandwidth of the warp. - * - * The restriction stride != 0 is intended to avoid waste of memory - * bandwidth. Once a memory transaction is necessary, it typically operates at - * far greater granularity than just 32 bits (4 bytes). - * - * Sorting by absolute stride should result, in the case of a packed tensor, in - * the memory accesses being close to perfectly contiguous. - */ - -static int reduxSortWarp (const void* a, const void* b){ - const axis_desc* xda = *(const axis_desc* const *)a; - const axis_desc* xdb = *(const axis_desc* const *)b; - - if ( axisGetSrcStride(xda) && !axisGetSrcStride(xdb)){ - return -1; - }else if (!axisGetSrcStride(xda) && axisGetSrcStride(xdb)){ - return +1; - } - - if (axisGetSrcAbsStride(xda) < axisGetSrcAbsStride(xdb)){ - return -1; - }else if (axisGetSrcAbsStride(xda) > axisGetSrcAbsStride(xdb)){ - return +1; - } - - if ( axisIsReduced(xda) && !axisIsReduced(xdb)){ - return -1; - }else if (!axisIsReduced(xda) && axisIsReduced(xdb)){ - return +1; - } - - return 0; -} - /** * @brief Check whether axis numbered v is already in the given set of axes. * @@ -952,14 +902,6 @@ static void axisMarkReduced (axis_desc* axis, int r axis->reduxNum = reduxNum; } -/** - * @brief Mark axis as warp axis. - */ - -static void axisMarkWarp (axis_desc* axis, size_t warpLen){ - axis->warpLen = warpLen; -} - /** * @brief Get properties of an axis. */ @@ -1003,12 +945,6 @@ static ssize_t axisGetDstArgOffset (const axis_desc* axis){ static int axisIsReduced (const axis_desc* axis){ return axis->isReduced; } -static int axisIsWarp (const axis_desc* axis){ - return !!axis->warpLen; -} -static int axisIsPartialWarp (const axis_desc* axis){ - return axis->warpLen > 0 && axis->warpLen != axis->len; -} /** * @brief Estimate the level of parallelism in the device. @@ -1367,9 +1303,7 @@ static int reduxInit (redux_ctx* ctx){ ctx->errorString1 = NULL; ctx->errorString2 = NULL; - ctx->splitWarpAxis = -1; ctx->numStages = 1; - ctx->prodWarpAxes = 1; ctx->prodAllAxes = ctx->prodRdxAxes = ctx->prodFreeAxes = 1; strb_init(&ctx->s); srcbInit (&ctx->srcGen, &ctx->s); @@ -1434,8 +1368,6 @@ static int reduxInferProperties (redux_ctx* ctx){ ctx->nds = ctx->src->nd; ctx->ndr = ctx->reduxLen; ctx->ndd = ctx->nds - ctx->ndr; - ctx->ndw = 0; - ctx->ndp = 0; ctx->ndf = 0; ctx->ndt = ctx->ndd + 1; @@ -1749,18 +1681,6 @@ static int reduxFlattenSource (redux_ctx* ctx){ } ctx->ndf = j; - return reduxSelectWarpAxes(ctx); -} - -/** - * @brief Select the warp axes in such a way as to maximize memory bandwidth. - */ - -static int reduxSelectWarpAxes (redux_ctx* ctx){ - axis_desc* a; - int i; - size_t aL; - /** * NOTE: At this point it is possible for there to be no axes @@ -1777,55 +1697,6 @@ static int reduxSelectWarpAxes (redux_ctx* ctx){ ctx->ndf = 1; } - - /** - * Select Warp Axes. - * - * Using a particular heuristic order (*), sort the axis list by - * suitability for belonging to the warp. Then, pick the first few axes, - * until the product of their lengths exceeds the warp size. - * - * (*) See documentation of value-comparison function. 
- */ - - for(i=0;indf;i++){ - ctx->xdSrcPtrs[i] = reduxGetSrcFlatAxis(ctx, i); - } - - qsort(ctx->xdSrcPtrs, ctx->ndf, sizeof(*ctx->xdSrcPtrs), reduxSortWarp); - - for (i=0;indf;i++){ - a = reduxGetSrcSortAxis(ctx, i); - aL = axisGetLen(a); - if (aL <= 1){break;} - - ctx->prodWarpAxes *= aL; - if (ctx->prodWarpAxes <= ctx->warpSize){ - axisMarkWarp(a, aL); - ctx->ndw++; - }else{ - /** - * The product of warp lengths just exceeded warpSize. We backtrack - * by undoing the multiplication by aL. We then check whether we - * can "split" this axis by extracting at least a factor of 2 into - * warpLen. If yes, we mark is as the (only) warp axis that is - * split by setting its warpLen to something neither 0 nor len. - */ - - ctx->prodWarpAxes /= aL; - aL = ctx->warpSize/ctx->prodWarpAxes; - if (aL >= 2){ - axisMarkWarp(a, aL); - ctx->prodWarpAxes *= aL; - ctx->splitWarpAxis = i; - ctx->ndw++; - ctx->ndp++; - } - break; - } - } - - return reduxSelectNumStages(ctx); } From 1cfe552d8e71c447529de0603d865e11dd118a37 Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Wed, 14 Jun 2017 16:11:19 -0400 Subject: [PATCH 14/34] Massive cleanup. --- src/gpuarray_reduction.c | 975 +++++++++++++++++---------------------- 1 file changed, 418 insertions(+), 557 deletions(-) diff --git a/src/gpuarray_reduction.c b/src/gpuarray_reduction.c index 123c059964..24243f78ca 100644 --- a/src/gpuarray_reduction.c +++ b/src/gpuarray_reduction.c @@ -41,7 +41,8 @@ struct axis_desc{ int reduxNum; unsigned isReduced : 1; - size_t len, warpLen, sliceLen; + int hwAxisStage0, hwAxisStage1; + size_t len, tmpLen, sliceLen; ssize_t srcStride, srcOffset; ssize_t dstStride, dstOffset; ssize_t dstArgStride, dstArgOffset; @@ -250,7 +251,9 @@ struct redux_ctx{ int nds; /* # Source dimensions */ int ndr; /* # Reduced dimensions */ int ndd; /* # Destination dimensions */ - int ndf; /* # Flattened source dimensions */ + int ndfs; /* # Flattened source dimensions */ + int ndfr; /* # Flattened source dimensions */ + int ndfd; /* # Flattened source dimensions */ int ndt; /* # Temporary workspace dimensions */ int zeroAllAxes; /* # of zero-length axes in source tensor */ int zeroRdxAxes; /* # of zero-length reduction axes in source tensor */ @@ -258,6 +261,7 @@ struct redux_ctx{ size_t prodRdxAxes; /* Product of length of all reduction axes in source tensor */ size_t prodFreeAxes; /* Product of length of all free axes in source tensor */ + /* GPU Context & Device */ gpucontext* gpuCtx; unsigned numProcs; size_t warpSize; @@ -266,18 +270,33 @@ struct redux_ctx{ size_t maxGg; size_t maxGs[MAX_HW_DIMS]; + /* Flattening */ axis_desc* xdSrc; axis_desc* xdSrcFlat; - axis_desc* xdTmp; - axis_desc** xdSrcPtrs; + size_t* flatSrcDimensions; + ssize_t* flatSrcStrides; + gpudata* flatSrcData; + ssize_t flatSrcOffset; + ssize_t* flatDstStrides; + gpudata* flatDstData; + ssize_t flatDstOffset; + ssize_t* flatDstArgStrides; + gpudata* flatDstArgData; + ssize_t flatDstArgOffset; + + /* Select number of stages */ int numStages; - GpuArray* wsDst; - GpuArray* wsDstArg; - int* srcAxisList; - size_t* dstDims; + /* Workspaces, in the case of 2-stage reduction */ + size_t* tmpSrcDimensions; + ssize_t* tmpDstStrides; + gpudata* tmpDstData; + ssize_t tmpDstOffset; + ssize_t* tmpDstArgStrides; + gpudata* tmpDstArgData; + ssize_t tmpDstArgOffset; /* Source code Generator. 
*/ int srcTypeCode; @@ -292,14 +311,12 @@ struct redux_ctx{ const char* accTypeStr; const char* initValT; const char* initValK; - int largeCodeModel; strb s; srcb srcGen; char* sourceCode; size_t sourceCodeLen; char* errorString0; char* errorString1; - char* errorString2; GpuKernel preKernel; GpuKernel kernel; GpuKernel postKernel; @@ -319,14 +336,14 @@ struct redux_ctx{ struct{ int ndh; + int ndhp; int ndhd; int ndhr; - int axisList [MAX_HW_DIMS]; size_t bs [MAX_HW_DIMS]; size_t gs [MAX_HW_DIMS]; size_t cs [MAX_HW_DIMS]; gpudata* chunkSizeGD; - } pri, aux; + } st1, st2; /* Invoker */ gpudata* srcStepsGD; @@ -349,10 +366,8 @@ static int reduxGetAndInit (int typecode, const char** prop static int reduxGetOrInit (int typecode, const char** property); static int reduxSortFlatSensitive (const void* a, const void* b); static int reduxSortFlatInsensitive (const void* a, const void* b); -static int axisInSet (int v, - const int* set, - size_t setLen, - size_t* where); +static int reduxSortPlan1Stage (const void* a, const void* b); +static int reduxSortPlan2Stage0 (const void* a, const void* b); static void appendIdxes (strb* s, const char* prologue, const char* prefix, @@ -362,22 +377,24 @@ static void appendIdxes (strb* s, const char* epilogue); /* Axis Description API */ -static void axisInit (axis_desc* axis, - ssize_t len, - ssize_t srcStride); -static void axisMarkReduced (axis_desc* axis, int reduxNum); -static int axisGetReduxNum (const axis_desc* axis); -static size_t axisGetLen (const axis_desc* axis); -static ssize_t axisGetSrcStride (const axis_desc* axis); -static size_t axisGetSrcAbsStride (const axis_desc* axis); -static ssize_t axisGetSrcOffset (const axis_desc* axis); -static ssize_t axisGetDstStride (const axis_desc* axis); -static size_t axisGetDstAbsStride (const axis_desc* axis); -static ssize_t axisGetDstOffset (const axis_desc* axis); -static ssize_t axisGetDstArgStride (const axis_desc* axis); -static size_t axisGetDstArgAbsStride (const axis_desc* axis); -static ssize_t axisGetDstArgOffset (const axis_desc* axis); -static int axisIsReduced (const axis_desc* axis); +static void axisInit (axis_desc* axis, + ssize_t len, + ssize_t srcStride); +static void axisMarkReduced (axis_desc* axis, int reduxNum); +static int axisGetReduxNum (const axis_desc* axis); +static size_t axisGetLen (const axis_desc* axis); +static ssize_t axisGetSrcStride (const axis_desc* axis); +static size_t axisGetSrcAbsStride (const axis_desc* axis); +static ssize_t axisGetSrcOffset (const axis_desc* axis); +static ssize_t axisGetDstStride (const axis_desc* axis); +static size_t axisGetDstAbsStride (const axis_desc* axis); +static ssize_t axisGetDstOffset (const axis_desc* axis); +static ssize_t axisGetDstArgStride (const axis_desc* axis); +static size_t axisGetDstArgAbsStride (const axis_desc* axis); +static ssize_t axisGetDstArgOffset (const axis_desc* axis); +static int axisIsReduced (const axis_desc* axis); +static int axisIsHW (const axis_desc* axis, int stage); +static int axisGetHWAxisNum (const axis_desc* axis, int stage); /* Reduction Context API */ /* Utilities */ @@ -387,64 +404,58 @@ static int reduxRequiresDstArg (const redux_ctx* ctx); static int reduxKernelRequiresDst (const redux_ctx* ctx); static int reduxKernelRequiresDstArg (const redux_ctx* ctx); static int reduxIsSensitive (const redux_ctx* ctx); -static int reduxIsSmallCodeModel (const redux_ctx* ctx); -static int reduxIsLargeCodeModel (const redux_ctx* ctx); +static int reduxIs1Stage (const redux_ctx* ctx); +static int reduxIs2Stage 
(const redux_ctx* ctx); static axis_desc* reduxGetSrcAxis (const redux_ctx* ctx, int i); static axis_desc* reduxGetSrcSortAxis (const redux_ctx* ctx, int i); static axis_desc* reduxGetSrcFlatAxis (const redux_ctx* ctx, int i); -static int reduxTryFlattenInto (const redux_ctx* ctx, - axis_desc* into, - const axis_desc* from); -static int reduxCanAppendHwAxis (redux_ctx* ctx, - int kernelType, - int axisType); -static void reduxAppendLargestAxisToHwList(redux_ctx* ctx, - int kernelType, - int axisType); +static int reduxTryFlattenInto (const redux_ctx* ctx, + axis_desc* into, + const axis_desc* from); +static void reduxSortAxisPtrsBy (axis_desc** ptrs, + axis_desc* axes, + size_t numAxes, + int(*fn)(const void*, const void*)); /* Control Flow */ -static int reduxInit (redux_ctx* ctx); -static int reduxInferProperties (redux_ctx* ctx); -static int reduxFlattenSource (redux_ctx* ctx); -static int reduxSelectNumStages (redux_ctx* ctx); -static int reduxPlan1Stage (redux_ctx* ctx); -static int reduxPlan2Stage (redux_ctx* ctx); -static int reduxSelectHwAxes (redux_ctx* ctx); -static int reduxComputeAxisList (redux_ctx* ctx); -static int reduxGenSource (redux_ctx* ctx); -static void reduxAppendSource (redux_ctx* ctx); -static void reduxAppendIncludes (redux_ctx* ctx); -static void reduxAppendTensorDeclArgs (redux_ctx* ctx, - const char* type, - const char* baseName); -static void reduxAppendTensorCallArgs (redux_ctx* ctx, - const char* baseName); -static void reduxAppendMacroDefs (redux_ctx* ctx); -static void reduxAppendTypedefs (redux_ctx* ctx); -static void reduxAppendGetInitValFns (redux_ctx* ctx); -static void reduxAppendWriteBackFn (redux_ctx* ctx); -static void reduxAppendReduxKernel (redux_ctx* ctx); -static void reduxAppendPrototype (redux_ctx* ctx); -static void reduxAppendIndexDeclarations (redux_ctx* ctx); -static void reduxAppendRangeCalculations (redux_ctx* ctx); -static void reduxAppendLoops (redux_ctx* ctx); -static void reduxAppendInitKernel (redux_ctx* ctx); -static void reduxAppendPostKernel (redux_ctx* ctx); -static int reduxCompile (redux_ctx* ctx); -static int reduxSchedule (redux_ctx* ctx); -static void reduxScheduleKernel (int ndims, - uint64_t* dims, - uint64_t warpSize, - uint64_t maxLg, - uint64_t* maxLs, - uint64_t maxGg, - uint64_t* maxGs, - uint64_t* bs, - uint64_t* gs, - uint64_t* cs); -static int reduxInvoke (redux_ctx* ctx); -static int reduxCleanup (redux_ctx* ctx, int ret); -static int reduxCleanupMsg (redux_ctx* ctx, int ret, - const char* fmt, ...); +static int reduxInit (redux_ctx* ctx); +static int reduxInferProperties (redux_ctx* ctx); +static int reduxFlattenSource (redux_ctx* ctx); +static int reduxSelectNumStages (redux_ctx* ctx); +static int reduxPlan1Stage (redux_ctx* ctx); +static int reduxPlan2Stage (redux_ctx* ctx); +static int reduxGenSource (redux_ctx* ctx); +static void reduxAppendSource (redux_ctx* ctx); +static void reduxAppendIncludes (redux_ctx* ctx); +static void reduxAppendTensorDeclArgs (redux_ctx* ctx, + const char* type, + const char* baseName); +static void reduxAppendTensorCallArgs (redux_ctx* ctx, + const char* baseName); +static void reduxAppendMacroDefs (redux_ctx* ctx); +static void reduxAppendTypedefs (redux_ctx* ctx); +static void reduxAppendGetInitValFns (redux_ctx* ctx); +static void reduxAppendWriteBackFn (redux_ctx* ctx); +static void reduxAppendReduxKernel (redux_ctx* ctx); +static void reduxAppendPrototype (redux_ctx* ctx); +static void reduxAppendIndexDeclarations (redux_ctx* ctx); +static void reduxAppendRangeCalculations 
(redux_ctx* ctx); +static void reduxAppendLoops (redux_ctx* ctx); +static int reduxCompile (redux_ctx* ctx); +static int reduxSchedule (redux_ctx* ctx); +static void reduxScheduleKernel (int ndims, + uint64_t* dims, + uint64_t warpSize, + uint64_t maxLg, + uint64_t* maxLs, + uint64_t maxGg, + uint64_t* maxGs, + uint64_t* bs, + uint64_t* gs, + uint64_t* cs); +static int reduxInvoke (redux_ctx* ctx); +static int reduxCleanup (redux_ctx* ctx, int ret); +static int reduxCleanupMsg (redux_ctx* ctx, int ret, + const char* fmt, ...); /* Function implementation */ @@ -800,29 +811,39 @@ static int reduxSortFlatSensitive (const void* a, const void* b){ } /** - * @brief Check whether axis numbered v is already in the given set of axes. - * - * @param [in] v - * @param [in] set - * @param [in] setLen - * @param [out] where - * @return Non-zero if the set is non-empty and v is in it; Zero otherwise. + * For the plan of a 1-stage reduction, we need to sort the free axes by + * decreasing length. */ -static int axisInSet (int v, - const int* set, - size_t setLen, - size_t* where){ - size_t i; +static int reduxSortPlan1Stage (const void* a, const void* b){ + const axis_desc* xda = *(const axis_desc* const*)a; + const axis_desc* xdb = *(const axis_desc* const*)b; - for (i=0;ireduxNum = -1; - axis->warpLen = 0; + axis->hwAxisStage0 = axis->hwAxisStage1 = -1; axis->len = len; + axis->tmpLen = 0; + axis->sliceLen = 0; axis->srcStride = srcStride; axis->srcOffset = 0; @@ -945,6 +968,12 @@ static ssize_t axisGetDstArgOffset (const axis_desc* axis){ static int axisIsReduced (const axis_desc* axis){ return axis->isReduced; } +static int axisIsHW (const axis_desc* axis, int stage){ + return (stage == 0 ? axis->hwAxisStage0 : axis->hwAxisStage1) >= 0; +} +static int axisGetHWAxisNum (const axis_desc* axis, int stage){ + return stage == 0 ? axis->hwAxisStage0 : axis->hwAxisStage1; +} /** * @brief Estimate the level of parallelism in the device. @@ -1012,7 +1041,7 @@ static int reduxKernelRequiresDst (const redux_ctx* ctx){ switch (ctx->op){ case GA_REDUCE_ARGMIN: case GA_REDUCE_ARGMAX: - return reduxIsSmallCodeModel(ctx); + return reduxIs2Stage(ctx); default: return 1; } @@ -1079,19 +1108,19 @@ static int reduxIsSensitive (const redux_ctx* ctx){ } /** - * @brief Returns whether we are using the small code model or not. + * @brief Is the reduction 1-stage? */ -static int reduxIsSmallCodeModel (const redux_ctx* ctx){ - return !reduxIsLargeCodeModel(ctx); +static int reduxIs1Stage (const redux_ctx* ctx){ + return ctx->numStages == 1; } /** - * @brief Returns whether we are using the large code model or not. + * @brief Is the reduction 2-stage? */ -static int reduxIsLargeCodeModel (const redux_ctx* ctx){ - return ctx->largeCodeModel; +static int reduxIs2Stage (const redux_ctx* ctx){ + return !reduxIs1Stage(ctx); } /** @@ -1139,9 +1168,9 @@ static axis_desc* reduxGetSrcFlatAxis (const redux_ctx* ctx, int i){ * @return Non-zero if flattening attempt successful; Zero otherwise. */ -static int reduxTryFlattenInto (const redux_ctx* ctx, - axis_desc* into, - const axis_desc* from){ +static int reduxTryFlattenInto (const redux_ctx* ctx, + axis_desc* into, + const axis_desc* from){ int signSrc = 0, signDst = 0, signDstArg = 0, reverseSrc = 0, reverseDst = 0, reverseDstArg = 0; @@ -1210,70 +1239,21 @@ static int reduxTryFlattenInto (const redux_ctx* ctx, } /** - * @brief Check whether we can add another reduction axis or free axis - * to the hardware axis list for either the primary or secondary kernel. 
+ * Sort an array of *pointers* to axes by the given comparison function, while + * not touching the axes themselves. */ -static int reduxCanAppendHwAxis (redux_ctx* ctx, - int kernelType, - int axisType){ - int kernelNdh = kernelType == KERNEL_PRIMARY ? ctx->pri.ndh : ctx->aux.ndh; - int kernelNdhr = kernelType == KERNEL_PRIMARY ? ctx->pri.ndhr : ctx->aux.ndhr; - int kernelNdhd = kernelType == KERNEL_PRIMARY ? ctx->pri.ndhd : ctx->aux.ndhd; - - if (kernelNdh >= MAX_HW_DIMS){ - return 0; - }else{ - return axisType == AXIS_REDUX ? kernelNdhr < ctx->ndr: - kernelNdhd < ctx->ndd; - } -} - -/** - * @brief Append the largest reduction axis or free axis that isn't yet - * in the hardware axis list for either the primary or secondary kernel - * into said hardware axis list. - */ - -static void reduxAppendLargestAxisToHwList(redux_ctx* ctx, - int kernelType, - int axisType){ - int maxI = 0, i, isInHwList, isInReduxList, isInDesiredList, isLargestSoFar; - int* hwAxisList, * ndh, * ndhr, * ndhd; - size_t v, maxV = 0; - - /* Get pointers to the correct kernel's variables */ - hwAxisList = kernelType == KERNEL_PRIMARY ? ctx->pri.axisList: - ctx->aux.axisList; - ndh = kernelType == KERNEL_PRIMARY ? &ctx->pri.ndh: - &ctx->aux.ndh; - ndhr = kernelType == KERNEL_PRIMARY ? &ctx->pri.ndhr: - &ctx->aux.ndhr; - ndhd = kernelType == KERNEL_PRIMARY ? &ctx->pri.ndhd: - &ctx->aux.ndhd; - - /* Find */ - for (i=0;inds;i++){ - isInHwList = axisInSet(i, hwAxisList, *ndh, 0); - isInReduxList = axisInSet(i, ctx->reduxList, ctx->ndr, 0); - isInDesiredList = axisType == AXIS_REDUX ? isInReduxList: - !isInReduxList; - v = ctx->src->dimensions[i]; - isLargestSoFar = v >= maxV; - - if (!isInHwList && isInDesiredList && isLargestSoFar){ - maxV = v; - maxI = i; - } - } - - /* Append */ - hwAxisList[(*ndh)++] = maxI; - if (axisType == AXIS_REDUX){ - (*ndhr)++; - }else{ - (*ndhd)++; +static void reduxSortAxisPtrsBy (axis_desc** ptrs, + axis_desc* axes, + size_t numAxes, + int(*fn)(const void*, const void*)){ + size_t i; + + for(i=0;iwsDst = NULL; - ctx->wsDstArg = NULL; - ctx->srcAxisList = NULL; - ctx->dstDims = NULL; ctx->gpuCtx = NULL; ctx->srcTypeStr = ctx->dstTypeStr = ctx->dstArgTypeStr = @@ -1301,7 +1277,6 @@ static int reduxInit (redux_ctx* ctx){ ctx->sourceCode = NULL; ctx->errorString0 = NULL; ctx->errorString1 = NULL; - ctx->errorString2 = NULL; ctx->numStages = 1; ctx->prodAllAxes = ctx->prodRdxAxes = ctx->prodFreeAxes = 1; @@ -1309,15 +1284,14 @@ static int reduxInit (redux_ctx* ctx){ srcbInit (&ctx->srcGen, &ctx->s); for (i=0;iaux.axisList[i] = ctx->pri.axisList[i] = 0; - ctx->aux.bs [i] = ctx->pri.bs [i] = 1; - ctx->aux.gs [i] = ctx->pri.gs [i] = 1; - ctx->aux.cs [i] = ctx->pri.cs [i] = 1; + ctx->st2.bs [i] = ctx->st1.bs [i] = 1; + ctx->st2.gs [i] = ctx->st1.gs [i] = 1; + ctx->st2.cs [i] = ctx->st1.cs [i] = 1; } ctx->srcStepsGD = ctx->srcSizeGD = ctx->dstStepsGD = ctx->dstArgStepsGD = - ctx->pri.chunkSizeGD = ctx->aux.chunkSizeGD = NULL; + ctx->st1.chunkSizeGD = ctx->st2.chunkSizeGD = NULL; return reduxInferProperties(ctx); } @@ -1365,11 +1339,11 @@ static int reduxInferProperties (redux_ctx* ctx){ return reduxCleanupMsg(ctx, GA_INVALID_ERROR, "dstArg is of incorrect dimensionality for this reduction!\n"); } - ctx->nds = ctx->src->nd; - ctx->ndr = ctx->reduxLen; - ctx->ndd = ctx->nds - ctx->ndr; - ctx->ndf = 0; - ctx->ndt = ctx->ndd + 1; + ctx->nds = ctx->src->nd; + ctx->ndr = ctx->reduxLen; + ctx->ndd = ctx->nds - ctx->ndr; + ctx->ndfs = ctx->ndfr = ctx->ndfd = 0; + ctx->ndt = ctx->ndd + 1; /* Insane reduxList? 
*/ for (i=0;indr;i++){ @@ -1539,8 +1513,7 @@ static int reduxInferProperties (redux_ctx* ctx){ ctx->xdSrc = calloc(ctx->nds, sizeof(*ctx->xdSrc)); ctx->xdSrcPtrs = calloc(ctx->nds+1, sizeof(*ctx->xdSrcPtrs)); ctx->xdSrcFlat = calloc(ctx->nds+1, sizeof(*ctx->xdSrcFlat)); - ctx->xdTmp = calloc(ctx->ndt, sizeof(*ctx->xdTmp)); - if (!ctx->xdSrc || !ctx->xdSrcPtrs || !ctx->xdSrcFlat || !ctx->xdTmp){ + if (!ctx->xdSrc || !ctx->xdSrcPtrs || !ctx->xdSrcFlat){ return reduxCleanup(ctx, GA_MEMORY_ERROR); } for (i=0;inds;i++){ @@ -1618,7 +1591,7 @@ static int reduxFlattenSource (redux_ctx* ctx){ */ memcpy(ctx->xdSrcFlat, ctx->xdSrc, ctx->nds*sizeof(*ctx->xdSrcFlat)); - ctx->ndf = ctx->nds; + ctx->ndfs = ctx->nds; /** * Pass 1: Flatten out 0-length dimensions. We already know that @@ -1633,7 +1606,7 @@ static int reduxFlattenSource (redux_ctx* ctx){ */ if (ctx->zeroRdxAxes > 0){ - for (i=j=0;indf;i++){ + for (i=j=0;indfs;i++){ axis = reduxGetSrcFlatAxis(ctx, i); if (!axisIsReduced(axis)){ @@ -1644,7 +1617,7 @@ static int reduxFlattenSource (redux_ctx* ctx){ axisInit (reduxGetSrcFlatAxis(ctx, j), 0, 0); axisMarkReduced(reduxGetSrcFlatAxis(ctx, j), 0); j++; - ctx->ndf = j; + ctx->ndfs = j; } /** @@ -1652,14 +1625,14 @@ static int reduxFlattenSource (redux_ctx* ctx){ * ignored; They are always indexed at [0]. */ - for (i=j=0;indf;i++){ + for (i=j=0;indfs;i++){ axis = reduxGetSrcFlatAxis(ctx, i); if (axisGetLen(axis) != 1){ *reduxGetSrcFlatAxis(ctx, j++) = *axis; } } - ctx->ndf = j; + ctx->ndfs = j; /** * Pass 3: Flatten out continuous dimensions, where strides and sensitivity @@ -1668,10 +1641,10 @@ static int reduxFlattenSource (redux_ctx* ctx){ isSensitive = reduxIsSensitive(ctx); - qsort(ctx->xdSrcFlat, ctx->ndf, sizeof(*ctx->xdSrcFlat), + qsort(ctx->xdSrcFlat, ctx->ndfs, sizeof(*ctx->xdSrcFlat), isSensitive ? reduxSortFlatSensitive : reduxSortFlatInsensitive); - for (i=j=1;indf;i++){ + for (i=j=1;indfs;i++){ flatAxis = reduxGetSrcFlatAxis(ctx, j-1); sortAxis = reduxGetSrcFlatAxis(ctx, i); @@ -1679,7 +1652,7 @@ static int reduxFlattenSource (redux_ctx* ctx){ *reduxGetSrcFlatAxis(ctx, j++) = *sortAxis; } } - ctx->ndf = j; + ctx->ndfs = j; /** @@ -1691,10 +1664,72 @@ static int reduxFlattenSource (redux_ctx* ctx){ * We check for this case and simulate a 1-dimensional, 1-length tensor. */ - if(ctx->ndf == 0){ - axisInit (reduxGetSrcFlatAxis(ctx, ctx->ndf), 1, 0); - axisMarkReduced(reduxGetSrcFlatAxis(ctx, ctx->ndf), 0); - ctx->ndf = 1; + if(ctx->ndfs == 0){ + axisInit (reduxGetSrcFlatAxis(ctx, ctx->ndfs), 1, 0); + axisMarkReduced(reduxGetSrcFlatAxis(ctx, ctx->ndfs), 0); + ctx->ndfs = 1; + } + + + /** + * Having flattened the tensor to the very best of our ability, allocate + * and/or compute + * + * ctx->ndfr + * ctx->ndfd + * ctx->flatSrcDimensions + * ctx->flatSrcStrides + * ctx->flatSrcData + * ctx->flatSrcOffset + axis offsets + * ctx->flatDstStrides + * ctx->flatDstData + * ctx->flatDstOffset + axis offsets + * ctx->flatDstArgStrides + * ctx->flatDstArgData + * ctx->flatDstArgOffset + axis offsets + * + * and suchlike data that will be used post-flatten. 
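+ *
+ * A worked example with hypothetical shapes (not drawn from the caller): a
+ * C-contiguous float32 source of shape (7, 1, 5, 6), reduced over axes
+ * {2, 3}, flattens to ndfs == 2 axes: a free axis of length 7 (stride 120
+ * bytes) and a reduced axis of length 30 (stride 4 bytes). Pass 2 drops the
+ * length-1 axis and pass 3 merges the two contiguous reduced axes, while a
+ * free axis is never merged with a reduced one. Hence ndfd == 1, ndfr == 1,
+ * and flatSrcDimensions holds the lengths {7, 30} (in whatever order the
+ * flattening sort left them).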
+ */ + + ctx->flatSrcDimensions = malloc(ctx->ndfs * sizeof(*ctx->flatSrcDimensions)); + ctx->flatSrcStrides = malloc(ctx->ndfs * sizeof(*ctx->flatSrcStrides)); + ctx->flatDstStrides = malloc(ctx->ndfs * sizeof(*ctx->flatDstStrides)); + ctx->flatDstArgStrides = malloc(ctx->ndfs * sizeof(*ctx->flatDstArgStrides)); + if(!ctx->flatSrcDimensions || !ctx->flatSrcStrides || + !ctx->flatDstStrides || !ctx->flatDstArgStrides){ + return reduxCleanup(ctx, GA_MEMORY_ERROR); + } + + ctx->flatSrcData = ctx->src->data; + ctx->flatSrcOffset = ctx->src->offset; + if(reduxRequiresDst(ctx)){ + ctx->flatDstData = ctx->dst->data; + ctx->flatDstOffset = ctx->dst->offset; + } + if(reduxRequiresDstArg(ctx)){ + ctx->flatDstArgData = ctx->dstArg->data; + ctx->flatDstArgOffset = ctx->dstArg->offset; + } + for(ctx->ndfd=ctx->ndfr=i=0;indfs;i++){ + axis = reduxGetSrcFlatAxis(ctx, i); + if(axisIsReduced(axis)){ + ctx->ndfr++; + }else{ + if(reduxRequiresDst(ctx)){ + ctx->flatDstStrides[ctx->ndfd] = axisGetDstStride(axis); + ctx->flatDstOffset += axisGetDstOffset(axis); + } + if(reduxRequiresDstArg(ctx)){ + ctx->flatDstArgStrides[ctx->ndfd] = axisGetDstArgStride(axis); + ctx->flatDstArgOffset += axisGetDstArgOffset(axis); + } + + ctx->ndfd++; + } + + ctx->flatSrcDimensions[i] = axisGetLen (axis); + ctx->flatSrcStrides[i] = axisGetSrcStride(axis); + ctx->flatSrcOffset += axisGetSrcOffset(axis); } return reduxSelectNumStages(ctx); @@ -1713,9 +1748,13 @@ static int reduxSelectNumStages (redux_ctx* ctx){ ctx->prodAllAxes <= ctx->maxLg || /* Reduction over few elements? */ ctx->prodFreeAxes >= ctx->prodRdxAxes || /* More destinations than reductions? */ ctx->prodFreeAxes >= parallelism ){ /* Destination very large? */ + ctx->numStages = 1; return reduxPlan1Stage(ctx); }else{ - return reduxPlan2Stage(ctx); + /* BUG: Switch to 2Stage when small code model fixed. */ + (void)reduxPlan2Stage; + ctx->numStages = 1; + return reduxPlan1Stage(ctx); } } @@ -1738,11 +1777,25 @@ static int reduxSelectNumStages (redux_ctx* ctx){ */ static int reduxPlan1Stage (redux_ctx* ctx){ - ctx->numStages = 1; + int i; + axis_desc* axis; + + reduxSortAxisPtrsBy(ctx->xdSrcPtrs, ctx->xdSrcFlat, ctx->ndfs, + reduxSortPlan1Stage); + ctx->st1.ndh = 0; + ctx->st1.ndhp = 0; + ctx->st1.ndhr = 0; + for (i=0;indfd && ihwAxisStage0 = i; + + ctx->st1.ndh++; + } + ctx->st1.ndhd = ctx->st1.ndh; - return reduxSelectHwAxes(ctx); + return reduxGenSource(ctx); } /** @@ -1752,146 +1805,77 @@ static int reduxPlan1Stage (redux_ctx* ctx){ * * This plan involves splitting the reduction into two stages: * - * Stage 1: A reduction by approximately R = sqrt(prodRdxAxes) elements per - * destination elements into allocated temporary workspace(s) - * of approximate size dst.shape + (prodRdxAxes/R,) - * Stage 2: A reduction by approximately prodRdxAxes/R elements into the - * final destination. + * Stage 0: A huge reduction only along reduction axes into a workspace. + * Stage 1: A small reduction into the destination. + * + * We select only reduction axes in the first stage. */ static int reduxPlan2Stage (redux_ctx* ctx){ - ctx->numStages = 2; - - /* NOTE: Use gpuarray_get_elsize(typecode) */ - - return reduxSelectHwAxes(ctx); -} - -/** - * @brief Select which axes (up to MAX_HW_DIMS) will be assigned to hardware - * dimensions for both the primary and auxiliary kernels. - * - * LARGE code model: Up to the MAX_HW_DIMS largest free axes are selected. 
- * Because the primary reduction kernel does everything, it's - * not necessary to compute an auxiliary kernel axis - * selection (or at least, one distinct from the primary - * kernel's). - * - * SMALL code model: For the primary reduction kernel, up to MAX_HW_DIMS - * reduction axes (largest-to-smallest) are selected. If less - * than MAX_HW_DIMS axes were selected, free axes are - * selected until MAX_HW_DIMS total axes are selected, or no - * free axes are left. - * - * For the auxiliary reduction kernel, up to the MAX_HW_DIMS - * largest free axes are selected. - */ - -static int reduxSelectHwAxes (redux_ctx* ctx){ - int ret; + int i; + axis_desc* axis; + size_t a = 1, aL, aPartial, target = ctx->maxLg; - ctx->srcAxisList = malloc(ctx->nds * sizeof(unsigned)); - ctx->dstDims = malloc(ctx->ndd * sizeof(size_t)); - if (!ctx->srcAxisList || - !ctx->dstDims ){ - return reduxCleanup(ctx, GA_MEMORY_ERROR); - } - - ctx->largeCodeModel = 1;/* BUG: Erase when small code model fixed. */ /** - * *** IT IS NOW SAFE TO CALL: *** - * - reduxIsLargeModel() - * - reduxIsSmallModel() - * - reduxKernelRequiresDst() - * - reduxKernelRequiresDstArg() + * Sort axis descriptions reduction-axes-first then longest-first, and + * select up to 3 reduction axes, splitting them s.t. their product does + * not exceed the max block size. */ - - - /** - * Allocate workspaces. - * - * Certain reductions may require a workspace that isn't provided by the user. - * For instance, **when using the small code model**, argmin/argmax require - * a dst buffer, but the user didn't supply one (as he would have for - * maxandargmax/minandargmin). We must allocate and deallocate it ourselves. - * - * Otherwise we use the user-supplied buffers. - */ - - if (!reduxRequiresDst (ctx) && reduxKernelRequiresDst(ctx)){ - ctx->wsDst = malloc(sizeof(*ctx->wsDst)); - if (!ctx->wsDst){ - return reduxCleanup(ctx, GA_MEMORY_ERROR); - } - - ret = GpuArray_empty(ctx->wsDst, ctx->gpuCtx, ctx->dstTypeCode, - ctx->ndd, ctx->dstDims, GA_C_ORDER); - if (ret != GA_NO_ERROR){ - return reduxCleanup(ctx, ret); - } - }else{ - ctx->wsDst = ctx->dst; - } - if (!reduxRequiresDstArg(ctx) && reduxKernelRequiresDstArg(ctx)){ - ctx->wsDstArg = malloc(sizeof(*ctx->wsDstArg)); - if (!ctx->wsDstArg){ - return reduxCleanup(ctx, GA_MEMORY_ERROR); + + reduxSortAxisPtrsBy(ctx->xdSrcPtrs, ctx->xdSrcFlat, ctx->ndfs, + reduxSortPlan2Stage0); + + ctx->st1.ndh = 0; + ctx->st1.ndhp = 0; + ctx->st1.ndhr = 0; + ctx->st1.ndhd = 0; + + for(i=0;indfs && iwsDstArg, ctx->gpuCtx, ctx->dstArgTypeCode, - ctx->ndd, ctx->dstDims, GA_C_ORDER); - if (ret != GA_NO_ERROR){ - return reduxCleanup(ctx, ret); + + aL = axisGetLen(axis); + a *= aL; + if(a <= target){ + axis->hwAxisStage0 = i; + axis->sliceLen = aL; + axis->tmpLen = (axis->len+axis->sliceLen-1)/axis->sliceLen; + + ctx->st1.ndh++; + }else{ + a /= aL; + aPartial = target/a; + if(aPartial >= 2){ + a *= aPartial; + + axis->hwAxisStage0 = i++; + axis->sliceLen = aPartial; + axis->tmpLen = (axis->len+axis->sliceLen-1)/axis->sliceLen; + + ctx->st1.ndh++; + ctx->st1.ndhp++; + } + break; } - }else{ - ctx->wsDstArg = ctx->dstArg; } - - - if (reduxIsLargeCodeModel(ctx)){ - while (reduxCanAppendHwAxis (ctx, KERNEL_PRIMARY, AXIS_FREE)){ - reduxAppendLargestAxisToHwList(ctx, KERNEL_PRIMARY, AXIS_FREE); - } - }else{ - while (reduxCanAppendHwAxis (ctx, KERNEL_PRIMARY, AXIS_REDUX)){ - reduxAppendLargestAxisToHwList(ctx, KERNEL_PRIMARY, AXIS_REDUX); - } - while (reduxCanAppendHwAxis (ctx, KERNEL_PRIMARY, AXIS_FREE)){ - 
reduxAppendLargestAxisToHwList(ctx, KERNEL_PRIMARY, AXIS_FREE); - } - - while (reduxCanAppendHwAxis (ctx, KERNEL_AUXILIARY, AXIS_FREE)){ - reduxAppendLargestAxisToHwList(ctx, KERNEL_AUXILIARY, AXIS_FREE); - } + ctx->st1.ndhr = ctx->st1.ndh; + + /** + * We now have enough information to allocate the workspaces. + */ + + if(!reduxRequiresDst (ctx) && reduxKernelRequiresDst(ctx)){ + } - - return reduxComputeAxisList(ctx); -} - -/** - * @brief Compute the axis list. - * - * The axis list describes the mapping between the nested loops of the kernel - * as well as their accompanying indices (i0*, i1*, ..., in*) on one hand, and - * the axes of the source tensor. The first axis in the list corresponds to the - * outermost loop and the last axis in the list to the innermost. - * - * The first ctx->ndd axes correspond to the outer loops that iterate over - * each destination element. The last ctx->ndr axes correspond to the inner - * loops that iterate over the dimensions of elements that are to be reduced. - */ - -static int reduxComputeAxisList (redux_ctx* ctx){ - int i, f=0; - - for (i=0;inds;i++){ - if (!axisInSet(i, ctx->reduxList, ctx->ndr, 0)){ - ctx->srcAxisList[f++] = i; - } + if(!reduxRequiresDstArg(ctx) && reduxKernelRequiresDstArg(ctx)){ + } - memcpy(&ctx->srcAxisList[f], ctx->reduxList, ctx->ndr * sizeof(*ctx->reduxList)); - - + + + /* NOTE: Use gpuarray_get_elsize(typecode) */ + return reduxGenSource(ctx); } @@ -1918,10 +1902,6 @@ static void reduxAppendSource (redux_ctx* ctx){ reduxAppendGetInitValFns (ctx); reduxAppendWriteBackFn (ctx); reduxAppendReduxKernel (ctx); - if (reduxIsSmallCodeModel(ctx)){ - reduxAppendInitKernel (ctx); - reduxAppendPostKernel (ctx); - } } static void reduxAppendTensorDeclArgs (redux_ctx* ctx, const char* type, @@ -1948,7 +1928,7 @@ static void reduxAppendMacroDefs (redux_ctx* ctx){ srcbBeginList (&ctx->srcGen, "+", "0"); srcbAppendElemf(&ctx->srcGen, "(const GLOBAL_MEM char*)srcPtr"); srcbAppendElemf(&ctx->srcGen, "srcOff"); - for (i=0;inds;i++){ + for (i=0;indfs;i++){ srcbAppendElemf(&ctx->srcGen, "i%d*i%dSStep", i, i); } srcbEndList (&ctx->srcGen); @@ -1960,7 +1940,7 @@ static void reduxAppendMacroDefs (redux_ctx* ctx){ srcbBeginList (&ctx->srcGen, "+", "0"); srcbAppendElemf(&ctx->srcGen, "(GLOBAL_MEM char*)dstPtr"); srcbAppendElemf(&ctx->srcGen, "dstOff"); - for (i=0;indd;i++){ + for (i=0;indfd;i++){ srcbAppendElemf(&ctx->srcGen, "i%d*i%dDStep", i, i); } srcbEndList (&ctx->srcGen); @@ -1973,7 +1953,7 @@ static void reduxAppendMacroDefs (redux_ctx* ctx){ srcbBeginList (&ctx->srcGen, "+", "0"); srcbAppendElemf(&ctx->srcGen, "(GLOBAL_MEM char*)dstArgPtr"); srcbAppendElemf(&ctx->srcGen, "dstArgOff"); - for (i=0;indd;i++){ + for (i=0;indfd;i++){ srcbAppendElemf(&ctx->srcGen, "i%d*i%dAStep", i, i); } srcbEndList (&ctx->srcGen); @@ -1983,7 +1963,7 @@ static void reduxAppendMacroDefs (redux_ctx* ctx){ /* rdxIdx indexer */ srcbAppends (&ctx->srcGen, "#define rdxIdx ("); srcbBeginList (&ctx->srcGen, "+", "0"); - for (i=ctx->ndd;inds;i++){ + for (i=ctx->ndfd;indfs;i++){ srcbAppendElemf(&ctx->srcGen, "i%d*i%dPDim", i, i); } srcbEndList (&ctx->srcGen); @@ -2037,7 +2017,7 @@ static void reduxAppendWriteBackFn (redux_ctx* ctx){ srcbEndList (&ctx->srcGen); srcbAppends (&ctx->srcGen, "){\n"); - if (reduxIsLargeCodeModel(ctx)){ + if (reduxIs1Stage(ctx)){ if (reduxKernelRequiresDst (ctx)){ srcbAppends (&ctx->srcGen, "\t*d_ = d;\n"); } @@ -2089,78 +2069,85 @@ static void reduxAppendIndexDeclarations (redux_ctx* ctx){ strb_appends(&ctx->s, "\tX bd0 = LDIM_0, bd1 = LDIM_1, 
bd2 = LDIM_2;\n"); strb_appends(&ctx->s, "\tX ti0 = LID_0, ti1 = LID_1, ti2 = LID_2;\n"); strb_appends(&ctx->s, "\tX gi0 = bi0*bd0+ti0, gi1 = bi1*bd1+ti1, gi2 = bi2*bd2+ti2;\n"); - if (ctx->pri.ndh>0){ + if (ctx->st1.ndh>0){ strb_appends(&ctx->s, "\tX "); - for (i=0;ipri.ndh;i++){ + for (i=0;ist1.ndh;i++){ strb_appendf(&ctx->s, "ci%u = chunkSize[%u]%s", - i, i, (i==ctx->pri.ndh-1) ? ";\n" : ", "); + i, i, (i==ctx->st1.ndh-1) ? ";\n" : ", "); } } strb_appends(&ctx->s, "\t\n\t\n"); strb_appends(&ctx->s, "\t/* Free indices & Reduction indices */\n"); - if (ctx->nds > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "", ";\n");} - if (ctx->nds > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "Dim", ";\n");} - if (ctx->nds > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "Start", ";\n");} - if (ctx->nds > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "End", ";\n");} - if (ctx->nds > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "SStep", ";\n");} - if (ctx->ndd > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->ndd, "DStep", ";\n");} - if (ctx->ndd > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->ndd, "AStep", ";\n");} - if (ctx->nds > ctx->ndd){appendIdxes (&ctx->s, "\tX ", "i", ctx->ndd, ctx->nds, "PDim", ";\n");} + if (ctx->ndfs > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->ndfs, "", ";\n");} + if (ctx->ndfs > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->ndfs, "Dim", ";\n");} + if (ctx->ndfs > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->ndfs, "Start", ";\n");} + if (ctx->ndfs > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->ndfs, "End", ";\n");} + if (ctx->ndfs > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->ndfs, "SStep", ";\n");} + if (ctx->ndfd > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->ndfd, "DStep", ";\n");} + if (ctx->ndfd > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->ndfd, "AStep", ";\n");} + if (ctx->ndfs > ctx->ndfd){appendIdxes (&ctx->s, "\tX ", "i", ctx->ndfd, ctx->ndfs, "PDim", ";\n");} strb_appends(&ctx->s, "\t\n\t\n"); } static void reduxAppendRangeCalculations (redux_ctx* ctx){ - size_t hwDim; - int i; + axis_desc* axis; + size_t hwDim; + int i; strb_appends(&ctx->s, "\t/* Compute ranges for this thread. */\n"); - for (i=0;inds;i++){ - strb_appendf(&ctx->s, "\ti%dDim = srcSize[%d];\n", i, ctx->srcAxisList[i]); + for (i=0;indfs;i++){ + strb_appendf(&ctx->s, "\ti%dDim = srcSize[%d];\n", i, i); } - for (i=0;inds;i++){ - strb_appendf(&ctx->s, "\ti%dSStep = srcSteps[%d];\n", i, ctx->srcAxisList[i]); + for (i=0;indfs;i++){ + strb_appendf(&ctx->s, "\ti%dSStep = srcSteps[%d];\n", i, i); } if (reduxKernelRequiresDst(ctx)){ - for (i=0;indd;i++){ + for (i=0;indfd;i++){ strb_appendf(&ctx->s, "\ti%dDStep = dstSteps[%d];\n", i, i); } } if (reduxKernelRequiresDstArg(ctx)){ - for (i=0;indd;i++){ + for (i=0;indfd;i++){ strb_appendf(&ctx->s, "\ti%dAStep = dstArgSteps[%d];\n", i, i); } } - for (i=ctx->nds-1;i>=ctx->ndd;i--){ + for (i=ctx->ndfs-1;i>=ctx->ndfd;i--){ /** * If this is the last index, it's the first cumulative dimension * product we generate, and thus we initialize to 1. */ - if (i == ctx->nds-1){ + if (i == ctx->ndfs-1){ strb_appendf(&ctx->s, "\ti%dPDim = 1;\n", i); }else{ strb_appendf(&ctx->s, "\ti%dPDim = i%dPDim * i%dDim;\n", i, i+1, i+1); } } - for (i=0;inds;i++){ + for (i=0;indfs;i++){ /** * Up to MAX_HW_DIMS dimensions get to rely on hardware loops. * The others, if any, have to use software looping beginning at 0. 
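 *
 * For instance, for a flattened axis i mapped to hardware dimension h, the
 * emitted source looks like (a sketch, not a verbatim quote):
 *
 *     iStart = gi_h * ci_h;      <- this block's chunk along axis i
 *     iEnd   = iStart + ci_h;
 *
 * whereas a software-looped axis spans its full length:
 *
 *     iStart = 0;
 *     iEnd   = iStart + iDim;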
*/ - if (axisInSet(ctx->srcAxisList[i], ctx->pri.axisList, ctx->pri.ndh, &hwDim)){ + axis = reduxGetSrcFlatAxis(ctx, i); + if (axisIsHW(axis, 0)){ + hwDim = axisGetHWAxisNum(axis, 0); + //axisInSet(i, ctx->st1.axisList, ctx->st1.ndh, &hwDim); strb_appendf(&ctx->s, "\ti%dStart = gi%d * ci%d;\n", i, hwDim, hwDim); }else{ strb_appendf(&ctx->s, "\ti%dStart = 0;\n", i); } } - for (i=0;inds;i++){ + for (i=0;indfs;i++){ /** * Up to MAX_HW_DIMS dimensions get to rely on hardware loops. * The others, if any, have to use software looping beginning at 0. */ - if (axisInSet(ctx->srcAxisList[i], ctx->pri.axisList, ctx->pri.ndh, &hwDim)){ + axis = reduxGetSrcFlatAxis(ctx, i); + if (axisIsHW(axis, 0)){ + hwDim = axisGetHWAxisNum(axis, 0); + //axisInSet(i, ctx->st1.axisList, ctx->st1.ndh, &hwDim); strb_appendf(&ctx->s, "\ti%dEnd = i%dStart + ci%d;\n", i, i, hwDim); }else{ strb_appendf(&ctx->s, "\ti%dEnd = i%dStart + i%dDim;\n", i, i, i); @@ -2172,7 +2159,7 @@ static void reduxAppendRangeCalculations (redux_ctx* ctx){ static void reduxAppendLoops (redux_ctx* ctx){ int i; - for (i=0;indd;i++){ + for (i=0;indfd;i++){ srcbAppendf(&ctx->srcGen, "\tFOROVER(%d){ESCAPE(%d)\n", i, i); } @@ -2183,7 +2170,7 @@ static void reduxAppendLoops (redux_ctx* ctx){ } srcbAppends (&ctx->srcGen, "\t\t\n"); - for (i=ctx->ndd;inds;i++){ + for (i=ctx->ndfd;indfs;i++){ srcbAppendf (&ctx->srcGen, "\t\tFOROVER(%d){ESCAPE(%d)\n", i, i); } @@ -2243,7 +2230,7 @@ static void reduxAppendLoops (redux_ctx* ctx){ break; } - for (i=ctx->ndd;inds;i++){ + for (i=ctx->ndfd;indfs;i++){ srcbAppends(&ctx->srcGen, "\t\t}\n"); } srcbAppends(&ctx->srcGen, "\t\t\n"); @@ -2269,16 +2256,10 @@ static void reduxAppendLoops (redux_ctx* ctx){ srcbEndList (&ctx->srcGen); srcbAppends (&ctx->srcGen, ");\n"); - for (i=0;indd;i++){ + for (i=0;indfd;i++){ srcbAppends(&ctx->srcGen, "\t}\n"); } } -static void reduxAppendInitKernel (redux_ctx* ctx){ - /* BUG: Implement this for small code model. */ -} -static void reduxAppendPostKernel (redux_ctx* ctx){ - /* BUG: Implement this for small code model. */ -} /** * @brief Compile the kernel from source code. 
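 *
 * (The typecode list assembled here mirrors the argument order used later
 *  in reduxInvoke(): srcPtr, srcOff, srcSteps, srcSize, chunkSize, then,
 *  only when the kernel requires them, dstPtr/dstOff/dstSteps and
 *  dstArgPtr/dstArgOff/dstArgSteps.)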
@@ -2288,8 +2269,6 @@ static int reduxCompile (redux_ctx* ctx){ int ret, i = 0; int PRI_TYPECODES[11]; size_t PRI_TYPECODES_LEN; - int* AUX_TYPECODES; - size_t AUX_TYPECODES_LEN; /** @@ -2312,8 +2291,6 @@ static int reduxCompile (redux_ctx* ctx){ PRI_TYPECODES[i++] = GA_BUFFER; /* dstArgSteps */ } PRI_TYPECODES_LEN = i; - AUX_TYPECODES = &PRI_TYPECODES[3]; - AUX_TYPECODES_LEN = PRI_TYPECODES_LEN-3; /** @@ -2335,34 +2312,6 @@ static int reduxCompile (redux_ctx* ctx){ return reduxCleanup(ctx, ret); } } - if (reduxIsSmallCodeModel(ctx)){ - ret = GpuKernel_init(&ctx->kernel, - ctx->gpuCtx, - 1, - (const char**)&ctx->sourceCode, - &ctx->sourceCodeLen, - "initKer", - AUX_TYPECODES_LEN, - AUX_TYPECODES, - GA_USE_CLUDA, - &ctx->errorString1); - if (ret != GA_NO_ERROR){ - return reduxCleanup(ctx, ret); - } - ret = GpuKernel_init(&ctx->kernel, - ctx->gpuCtx, - 1, - (const char**)&ctx->sourceCode, - &ctx->sourceCodeLen, - "postKer", - AUX_TYPECODES_LEN, - AUX_TYPECODES, - GA_USE_CLUDA, - &ctx->errorString2); - if (ret != GA_NO_ERROR){ - return reduxCleanup(ctx, ret); - } - } return reduxSchedule(ctx); } @@ -2373,60 +2322,41 @@ static int reduxCompile (redux_ctx* ctx){ */ static int reduxSchedule (redux_ctx* ctx){ - int i, priNdims = 0, auxNdims = 0; - uint64_t maxLgRdx = 0, maxLgPre = 0, maxLgPost = 0; - uint64_t maxLgPri = 0, maxLgAux = 0; + int i, priNdims = 0; + uint64_t maxLgRdx = 0; + uint64_t maxLgPri = 0; uint64_t maxLs [MAX_HW_DIMS]; uint64_t maxGg; uint64_t maxGs [MAX_HW_DIMS]; uint64_t priDims[MAX_HW_DIMS]; - uint64_t auxDims[MAX_HW_DIMS]; uint64_t bs [MAX_HW_DIMS]; uint64_t gs [MAX_HW_DIMS]; uint64_t cs [MAX_HW_DIMS]; - size_t warpSize, - maxL, maxL0, maxL1, maxL2, - maxG, maxG0, maxG1, maxG2; + size_t warpSize, maxL; + axis_desc* axis; /** * Obtain the constraints of our problem. 
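 *
 * (Presumably the per-dimension grid/block limits were already captured in
 *  ctx->maxGs/maxLs/maxGg when the context properties were queried; only
 *  the kernel-specific preferred warp size and maximum block size are read
 *  here, through GA_KERNEL_PROP_PREFLSIZE and GA_KERNEL_PROP_MAXLSIZE.)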
*/ - - gpudata_property (ctx->src->data, GA_CTX_PROP_MAXLSIZE0, &maxL0); - gpudata_property (ctx->src->data, GA_CTX_PROP_MAXLSIZE1, &maxL1); - gpudata_property (ctx->src->data, GA_CTX_PROP_MAXLSIZE2, &maxL2); - gpudata_property (ctx->src->data, GA_CTX_PROP_MAXGSIZE, &maxG); - gpudata_property (ctx->src->data, GA_CTX_PROP_MAXGSIZE0, &maxG0); - gpudata_property (ctx->src->data, GA_CTX_PROP_MAXGSIZE1, &maxG1); - gpudata_property (ctx->src->data, GA_CTX_PROP_MAXGSIZE2, &maxG2); + gpukernel_property(ctx->kernel.k, GA_KERNEL_PROP_PREFLSIZE, &warpSize); gpukernel_property(ctx->kernel.k, GA_KERNEL_PROP_MAXLSIZE, &maxL); maxLgRdx = maxL; maxLgPri = maxLgRdx; - if (reduxIsSmallCodeModel(ctx)){ - gpukernel_property(ctx->preKernel.k, GA_KERNEL_PROP_MAXLSIZE, &maxL); - maxLgPre = maxL; - gpukernel_property(ctx->postKernel.k, GA_KERNEL_PROP_MAXLSIZE, &maxL); - maxLgPost = maxL; - maxLgAux = maxLgPrepri.ndh; - maxGs[0] = maxG0; - maxGs[1] = maxG1; - maxGs[2] = maxG2; - maxGg = maxG; - maxLs[0] = maxL0; - maxLs[1] = maxL1; - maxLs[2] = maxL2; - for (i=0;isrc->dimensions[ctx->pri.axisList[i]]; - } - if (reduxIsSmallCodeModel(ctx)){ - auxNdims = ctx->aux.ndh; - for (i=0;isrc->dimensions[ctx->aux.axisList[i]]; + + priNdims = ctx->st1.ndh; + maxGs[0] = ctx->maxGs[0]; + maxGs[1] = ctx->maxGs[1]; + maxGs[2] = ctx->maxGs[2]; + maxGg = ctx->maxGg; + maxLs[0] = ctx->maxLs[0]; + maxLs[1] = ctx->maxLs[1]; + maxLs[2] = ctx->maxLs[2]; + for (i=0;indfs;i++){ + axis = reduxGetSrcFlatAxis(ctx, i); + if(axisIsHW(axis, 0)){ + priDims[axisGetHWAxisNum(axis, 0)] = axisGetLen(axis); } } @@ -2443,28 +2373,12 @@ static int reduxSchedule (redux_ctx* ctx){ maxGg, maxGs, bs, gs, cs); for (i=0;ipri.bs[i] = bs[i]; - ctx->pri.gs[i] = gs[i]; - ctx->pri.cs[i] = cs[i]; + ctx->st1.bs[i] = bs[i]; + ctx->st1.gs[i] = gs[i]; + ctx->st1.cs[i] = cs[i]; } if (priNdims <= 0){ - ctx->pri.bs[i] = ctx->pri.gs[i] = ctx->pri.cs[i] = 1; - } - } - if (reduxIsSmallCodeModel(ctx)){ - reduxScheduleKernel(auxNdims, - auxDims, - warpSize, - maxLgAux, maxLs, - maxGg, maxGs, - bs, gs, cs); - for (i=0;iaux.bs[i] = bs[i]; - ctx->aux.gs[i] = gs[i]; - ctx->aux.cs[i] = cs[i]; - } - if (auxNdims <= 0){ - ctx->aux.bs[i] = ctx->aux.gs[i] = ctx->aux.cs[i] = 1; + ctx->st1.bs[i] = ctx->st1.gs[i] = ctx->st1.cs[i] = 1; } } @@ -2576,7 +2490,6 @@ static void reduxScheduleKernel (int ndims, static int reduxInvoke (redux_ctx* ctx){ void* priArgs[11]; - void* auxArgs[ 8]; int ret, i = 0; int failedDstSteps = 0; int failedDstArgSteps = 0; @@ -2588,47 +2501,34 @@ static int reduxInvoke (redux_ctx* ctx){ */ const int flags = GA_BUFFER_READ_ONLY|GA_BUFFER_INIT; - ctx->srcStepsGD = gpudata_alloc(ctx->gpuCtx, ctx->nds * sizeof(size_t), - ctx->src->strides, flags, 0); - ctx->srcSizeGD = gpudata_alloc(ctx->gpuCtx, ctx->nds * sizeof(size_t), - ctx->src->dimensions, flags, 0); - ctx->pri.chunkSizeGD = gpudata_alloc(ctx->gpuCtx, ctx->pri.ndh * sizeof(size_t), - ctx->pri.cs, flags, 0); - - priArgs[i++] = (void*) ctx->src->data; - priArgs[i++] = (void*)&ctx->src->offset; + ctx->srcStepsGD = gpudata_alloc(ctx->gpuCtx, ctx->ndfs * sizeof(size_t), + ctx->flatSrcStrides, flags, 0); + ctx->srcSizeGD = gpudata_alloc(ctx->gpuCtx, ctx->ndfs * sizeof(size_t), + ctx->flatSrcDimensions, flags, 0); + ctx->st1.chunkSizeGD = gpudata_alloc(ctx->gpuCtx, ctx->st1.ndh * sizeof(size_t), + ctx->st1.cs, flags, 0); + + priArgs[i++] = (void*) ctx->flatSrcData; + priArgs[i++] = (void*)&ctx->flatSrcOffset; priArgs[i++] = (void*) ctx->srcStepsGD; priArgs[i++] = (void*) ctx->srcSizeGD; - priArgs[i++] = (void*) 
ctx->pri.chunkSizeGD; + priArgs[i++] = (void*) ctx->st1.chunkSizeGD; if (reduxKernelRequiresDst (ctx)){ - ctx->dstStepsGD = gpudata_alloc(ctx->gpuCtx, ctx->ndd * sizeof(size_t), - ctx->wsDst->strides, flags, 0); - priArgs[i++] = (void*) ctx->wsDst->data; - priArgs[i++] = (void*)&ctx->wsDst->offset; + ctx->dstStepsGD = gpudata_alloc(ctx->gpuCtx, ctx->ndfd * sizeof(size_t), + ctx->flatDstStrides, flags, 0); + priArgs[i++] = (void*) ctx->flatDstData; + priArgs[i++] = (void*)&ctx->flatDstOffset; priArgs[i++] = (void*) ctx->dstStepsGD; failedDstSteps = !ctx->dstStepsGD; } if (reduxKernelRequiresDstArg(ctx)){ - ctx->dstArgStepsGD = gpudata_alloc(ctx->gpuCtx, ctx->ndd * sizeof(size_t), - ctx->wsDstArg->strides, flags, 0); - priArgs[i++] = (void*) ctx->wsDstArg->data; - priArgs[i++] = (void*)&ctx->wsDstArg->offset; + ctx->dstArgStepsGD = gpudata_alloc(ctx->gpuCtx, ctx->ndfd * sizeof(size_t), + ctx->flatDstArgStrides, flags, 0); + priArgs[i++] = (void*) ctx->flatDstArgData; + priArgs[i++] = (void*)&ctx->flatDstArgOffset; priArgs[i++] = (void*) ctx->dstArgStepsGD; failedDstArgSteps = !ctx->dstArgStepsGD; } - if (reduxIsSmallCodeModel (ctx)){ - /** - * The auxiliary kernel's args are identical to the primary kernel's, - * except that the first three arguments are deleted and the fifth - * argument (now second), called chunkSize, is different. - */ - - memcpy(auxArgs, &priArgs[3], sizeof(auxArgs)); - ctx->aux.chunkSizeGD = gpudata_alloc(ctx->gpuCtx, ctx->aux.ndh * sizeof(size_t), - ctx->aux.cs, flags, 0); - auxArgs[ 1 ] = (void*) ctx->aux.chunkSizeGD; - failedAuxChunkSize = !ctx->aux.chunkSizeGD; - } /** @@ -2637,47 +2537,21 @@ static int reduxInvoke (redux_ctx* ctx){ if (ctx->srcStepsGD && ctx->srcSizeGD && - ctx->pri.chunkSizeGD && + ctx->st1.chunkSizeGD && !failedDstSteps && !failedDstArgSteps && !failedAuxChunkSize){ - /* Pre-kernel invocation, if necessary */ - if (reduxIsSmallCodeModel(ctx)){ - ret = GpuKernel_call(&ctx->preKernel, - ctx->aux.ndh>0 ? ctx->aux.ndh : 1, - ctx->aux.gs, - ctx->aux.bs, - 0, - auxArgs); - if (ret != GA_NO_ERROR){ - return reduxCleanup(ctx, ret); - } - } - /* Reduction kernel invocation */ ret = GpuKernel_call(&ctx->kernel, - ctx->pri.ndh>0 ? ctx->pri.ndh : 1, - ctx->pri.gs, - ctx->pri.bs, + ctx->st1.ndh>0 ? ctx->st1.ndh : 1, + ctx->st1.gs, + ctx->st1.bs, 0, priArgs); if (ret != GA_NO_ERROR){ return reduxCleanup(ctx, ret); } - /* Post-kernel invocation, if necessary */ - if (reduxIsSmallCodeModel(ctx)){ - ret = GpuKernel_call(&ctx->postKernel, - ctx->aux.ndh>0 ? 
ctx->aux.ndh : 1, - ctx->aux.gs, - ctx->aux.bs, - 0, - auxArgs); - if (ret != GA_NO_ERROR){ - return reduxCleanup(ctx, ret); - } - } - return reduxCleanup(ctx, ret); }else{ return reduxCleanup(ctx, GA_MEMORY_ERROR); @@ -2689,43 +2563,30 @@ static int reduxInvoke (redux_ctx* ctx){ */ static int reduxCleanup (redux_ctx* ctx, int ret){ - if (ctx->dst != ctx->wsDst){ - if(ctx->wsDst){ - GpuArray_clear(ctx->wsDst); - } - free(ctx->wsDst); - ctx->wsDst = NULL; - } - if (ctx->dstArg != ctx->wsDstArg){ - if(ctx->wsDstArg){ - GpuArray_clear(ctx->wsDstArg); - } - free(ctx->wsDstArg); - ctx->wsDstArg = NULL; - } - - free(ctx->srcAxisList); - free(ctx->dstDims); + free(ctx->flatSrcDimensions); + free(ctx->flatSrcStrides); + free(ctx->flatDstStrides); + free(ctx->flatDstArgStrides); free(ctx->sourceCode); free(ctx->errorString0); free(ctx->errorString1); - free(ctx->errorString2); - ctx->srcAxisList = NULL; - ctx->dstDims = NULL; + ctx->flatSrcDimensions = NULL; + ctx->flatSrcStrides = NULL; + ctx->flatDstStrides = NULL; + ctx->flatDstArgStrides = NULL; ctx->sourceCode = NULL; ctx->errorString0 = NULL; ctx->errorString1 = NULL; - ctx->errorString2 = NULL; gpudata_release(ctx->srcStepsGD); gpudata_release(ctx->srcSizeGD); gpudata_release(ctx->dstStepsGD); gpudata_release(ctx->dstArgStepsGD); - gpudata_release(ctx->pri.chunkSizeGD); - gpudata_release(ctx->aux.chunkSizeGD); + gpudata_release(ctx->st1.chunkSizeGD); + gpudata_release(ctx->st2.chunkSizeGD); ctx->srcStepsGD = ctx->srcSizeGD = ctx->dstStepsGD = ctx->dstArgStepsGD = - ctx->pri.chunkSizeGD = ctx->aux.chunkSizeGD = NULL; + ctx->st1.chunkSizeGD = ctx->st2.chunkSizeGD = NULL; return ret; } From 2317ca1e864045856e7d21ba6ed393b585c651f5 Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Wed, 14 Jun 2017 18:34:44 -0400 Subject: [PATCH 15/34] More planning for 2-stage reduction. --- src/gpuarray_reduction.c | 116 +++++++++++++++++++++++++++++++-------- 1 file changed, 93 insertions(+), 23 deletions(-) diff --git a/src/gpuarray_reduction.c b/src/gpuarray_reduction.c index 24243f78ca..8f0f22f2fc 100644 --- a/src/gpuarray_reduction.c +++ b/src/gpuarray_reduction.c @@ -46,8 +46,6 @@ struct axis_desc{ ssize_t srcStride, srcOffset; ssize_t dstStride, dstOffset; ssize_t dstArgStride, dstArgOffset; - ssize_t tmpDstStride, tmpDstOffset; - ssize_t tmpDstArgStride, tmpDstArgOffset; }; typedef struct axis_desc axis_desc; @@ -274,6 +272,7 @@ struct redux_ctx{ axis_desc* xdSrc; axis_desc* xdSrcFlat; axis_desc** xdSrcPtrs; + axis_desc** xdTmpPtrs; size_t* flatSrcDimensions; ssize_t* flatSrcStrides; @@ -290,13 +289,11 @@ struct redux_ctx{ int numStages; /* Workspaces, in the case of 2-stage reduction */ - size_t* tmpSrcDimensions; + size_t* tmpDstDimensions; ssize_t* tmpDstStrides; gpudata* tmpDstData; - ssize_t tmpDstOffset; ssize_t* tmpDstArgStrides; gpudata* tmpDstArgData; - ssize_t tmpDstArgOffset; /* Source code Generator. 
*/ int srcTypeCode; @@ -383,6 +380,8 @@ static void axisInit (axis_desc* axis, static void axisMarkReduced (axis_desc* axis, int reduxNum); static int axisGetReduxNum (const axis_desc* axis); static size_t axisGetLen (const axis_desc* axis); +static size_t axisGetTmpLen (const axis_desc* axis); +static size_t axisGetSliceLen (const axis_desc* axis); static ssize_t axisGetSrcStride (const axis_desc* axis); static size_t axisGetSrcAbsStride (const axis_desc* axis); static ssize_t axisGetSrcOffset (const axis_desc* axis); @@ -409,6 +408,7 @@ static int reduxIs2Stage (const redux_ctx* ctx); static axis_desc* reduxGetSrcAxis (const redux_ctx* ctx, int i); static axis_desc* reduxGetSrcSortAxis (const redux_ctx* ctx, int i); static axis_desc* reduxGetSrcFlatAxis (const redux_ctx* ctx, int i); +static axis_desc* reduxGetTmpAxis (const redux_ctx* ctx, int i); static int reduxTryFlattenInto (const redux_ctx* ctx, axis_desc* into, const axis_desc* from); @@ -908,12 +908,6 @@ static void axisInit (axis_desc* axis, axis->dstArgStride = 0; axis->dstArgOffset = 0; - - axis->tmpDstStride = 0; - axis->tmpDstOffset = 0; - - axis->tmpDstArgStride = 0; - axis->tmpDstArgOffset = 0; } /** @@ -935,6 +929,12 @@ static int axisGetReduxNum (const axis_desc* axis){ static size_t axisGetLen (const axis_desc* axis){ return axis->len; } +static size_t axisGetTmpLen (const axis_desc* axis){ + return axis->tmpLen; +} +static size_t axisGetSliceLen (const axis_desc* axis){ + return axis->sliceLen; +} static ssize_t axisGetSrcStride (const axis_desc* axis){ return axisGetLen(axis) > 1 ? axis->srcStride : 0; } @@ -971,6 +971,9 @@ static int axisIsReduced (const axis_desc* axis){ static int axisIsHW (const axis_desc* axis, int stage){ return (stage == 0 ? axis->hwAxisStage0 : axis->hwAxisStage1) >= 0; } +static int axisIsPartialHW (const axis_desc* axis, int stage){ + return axisIsHW(axis, stage) && axis->sliceLen != axis->len; +} static int axisGetHWAxisNum (const axis_desc* axis, int stage){ return stage == 0 ? axis->hwAxisStage0 : axis->hwAxisStage1; } @@ -1034,7 +1037,7 @@ static int reduxRequiresDstArg (const redux_ctx* ctx){ * This is semantically subtly different from reduxHasDst(). The main * difference is in the implementation of the GA_REDUCE_ARGMIN/ARGMAX * reductions; Either *might* require a dst buffer, which will have to be - * allocated, even though it will be discared. + * allocated, even though it will be discarded. */ static int reduxKernelRequiresDst (const redux_ctx* ctx){ @@ -1147,6 +1150,14 @@ static axis_desc* reduxGetSrcFlatAxis (const redux_ctx* ctx, int i){ return &ctx->xdSrcFlat[i]; } +/** + * @brief Get description of temporary axis with given number. + */ + +static axis_desc* reduxGetTmpAxis (const redux_ctx* ctx, int i){ + return ctx->xdTmpPtrs[i]; +} + /** * @brief Attempt to flatten an axis `from` into an axis `into`. * @@ -1343,7 +1354,6 @@ static int reduxInferProperties (redux_ctx* ctx){ ctx->ndr = ctx->reduxLen; ctx->ndd = ctx->nds - ctx->ndr; ctx->ndfs = ctx->ndfr = ctx->ndfd = 0; - ctx->ndt = ctx->ndd + 1; /* Insane reduxList? */ for (i=0;indr;i++){ @@ -1749,13 +1759,11 @@ static int reduxSelectNumStages (redux_ctx* ctx){ ctx->prodFreeAxes >= ctx->prodRdxAxes || /* More destinations than reductions? */ ctx->prodFreeAxes >= parallelism ){ /* Destination very large? */ ctx->numStages = 1; - return reduxPlan1Stage(ctx); }else{ /* BUG: Switch to 2Stage when small code model fixed. */ - (void)reduxPlan2Stage; ctx->numStages = 1; - return reduxPlan1Stage(ctx); } + return ctx->numStages == 1 ? 
reduxPlan1Stage(ctx) : reduxPlan2Stage(ctx); } /** @@ -1812,11 +1820,13 @@ static int reduxPlan1Stage (redux_ctx* ctx){ */ static int reduxPlan2Stage (redux_ctx* ctx){ - int i; + int i, j, ret = 0; axis_desc* axis; - size_t a = 1, aL, aPartial, target = ctx->maxLg; + size_t a = 1, aL, aPartial, target = reduxEstimateParallelism(ctx), sz; /** + * Plan Stage 0. + * * Sort axis descriptions reduction-axes-first then longest-first, and * select up to 3 reduction axes, splitting them s.t. their product does * not exceed the max block size. @@ -1841,7 +1851,7 @@ static int reduxPlan2Stage (redux_ctx* ctx){ if(a <= target){ axis->hwAxisStage0 = i; axis->sliceLen = aL; - axis->tmpLen = (axis->len+axis->sliceLen-1)/axis->sliceLen; + axis->tmpLen = 1; ctx->st1.ndh++; }else{ @@ -1866,15 +1876,67 @@ static int reduxPlan2Stage (redux_ctx* ctx){ * We now have enough information to allocate the workspaces. */ - if(!reduxRequiresDst (ctx) && reduxKernelRequiresDst(ctx)){ + ctx->ndt = ctx->ndfs - ctx->st1.ndh + ctx->st1.ndhp; + ctx->xdTmpPtrs = malloc(ctx->ndt*sizeof(*ctx->xdTmpPtrs)); + ctx->tmpDstDimensions = malloc(ctx->ndt*sizeof(*ctx->tmpDstDimensions)); + ctx->tmpDstStrides = malloc(ctx->ndt*sizeof(*ctx->tmpDstStrides)); + ctx->tmpDstArgStrides = malloc(ctx->ndt*sizeof(*ctx->tmpDstArgStrides)); + if(!ctx->xdTmpPtrs || !ctx->tmpDstDimensions || !ctx->tmpDstStrides || + !ctx->tmpDstArgStrides){ + return reduxCleanup(ctx, GA_MEMORY_ERROR); + } + for(i=j=0;indfs;i++){ + axis = reduxGetSrcFlatAxis(ctx, i); + if(!axisIsHW(axis, 0) || axisIsPartialHW(axis, 0)){ + ctx->xdTmpPtrs [j] = axis; + ctx->tmpDstDimensions[j] = axisGetTmpLen(axis); + } + } + + if (reduxKernelRequiresDst(ctx)){ + sz = gpuarray_get_elsize(ctx->dstTypeCode); + for(i=ctx->ndt-1;i>=0;i--){ + ctx->tmpDstStrides[i] = sz; + sz *= ctx->tmpDstDimensions[i]; + } + + ctx->tmpDstData = gpudata_alloc(ctx->gpuCtx, sz, 0, 0, &ret); + if(ret != GA_NO_ERROR){ + return reduxCleanup(ctx, ret); + } } - if(!reduxRequiresDstArg(ctx) && reduxKernelRequiresDstArg(ctx)){ + if (reduxKernelRequiresDstArg(ctx)){ + sz = gpuarray_get_elsize(ctx->dstArgTypeCode); + for(i=ctx->ndt-1;i>=0;i--){ + ctx->tmpDstArgStrides[i] = sz; + sz *= ctx->tmpDstDimensions[i]; + } + + ctx->tmpDstArgData = gpudata_alloc(ctx->gpuCtx, sz, 0, 0, &ret); + if(ret != GA_NO_ERROR){ + return reduxCleanup(ctx, ret); + } } + /** + * Plan Stage 1. 
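+ *
+ * Stage 1 reduces the temporary workspace produced by stage 0 into the
+ * final destination. As in the 1-stage plan, the workspace axes are
+ * sorted free-axes-first by decreasing length, and up to MAX_HW_DIMS
+ * free axes become hardware axes; the remaining (reduced) workspace
+ * axes are left to the kernel's software loops.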
+ */ + + qsort(ctx->xdTmpPtrs, ctx->ndt, sizeof(*ctx->xdTmpPtrs), reduxSortPlan1Stage); - /* NOTE: Use gpuarray_get_elsize(typecode) */ + ctx->st2.ndh = 0; + ctx->st2.ndhp = 0; + ctx->st2.ndhr = 0; + + for (i=0;indfd && ihwAxisStage1 = i; + + ctx->st2.ndh++; + } + ctx->st2.ndhd = ctx->st2.ndh; return reduxGenSource(ctx); } @@ -2567,6 +2629,9 @@ static int reduxCleanup (redux_ctx* ctx, int ret){ free(ctx->flatSrcStrides); free(ctx->flatDstStrides); free(ctx->flatDstArgStrides); + free(ctx->tmpDstDimensions); + free(ctx->tmpDstStrides); + free(ctx->tmpDstArgStrides); free(ctx->sourceCode); free(ctx->errorString0); free(ctx->errorString1); @@ -2574,10 +2639,15 @@ static int reduxCleanup (redux_ctx* ctx, int ret){ ctx->flatSrcStrides = NULL; ctx->flatDstStrides = NULL; ctx->flatDstArgStrides = NULL; + ctx->tmpDstDimensions = NULL; + ctx->tmpDstStrides = NULL; + ctx->tmpDstArgStrides = NULL; ctx->sourceCode = NULL; ctx->errorString0 = NULL; ctx->errorString1 = NULL; - + + gpudata_release(ctx->tmpDstData); + gpudata_release(ctx->tmpDstArgData); gpudata_release(ctx->srcStepsGD); gpudata_release(ctx->srcSizeGD); gpudata_release(ctx->dstStepsGD); From c3977d8c1f28dd2156ae010d6306a41fd140b905 Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Tue, 27 Jun 2017 19:03:32 -0400 Subject: [PATCH 16/34] Near-complete rewrite based on 1/2-phase code model with workspace. --- src/gpuarray/reduction.h | 94 +- src/gpuarray_reduction.c | 4167 +++++++++++++++++++++++--------------- tests/check_reduction.c | 393 +++- 3 files changed, 2912 insertions(+), 1742 deletions(-) diff --git a/src/gpuarray/reduction.h b/src/gpuarray/reduction.h index f6638c9a83..c8508b841d 100644 --- a/src/gpuarray/reduction.h +++ b/src/gpuarray/reduction.h @@ -21,39 +21,76 @@ extern "C" { #endif +/* Data Structures */ +struct GpuReduction; +typedef struct GpuReduction GpuReduction; + + /** * Supported array reduction operations. */ typedef enum _ga_reduce_op { - GA_REDUCE_SUM, /* + */ - GA_REDUCE_PROD, /* * */ - GA_REDUCE_PRODNZ, /* * (!=0) */ - GA_REDUCE_MIN, /* min() */ - GA_REDUCE_MAX, /* max() */ - GA_REDUCE_ARGMIN, /* argmin() */ - GA_REDUCE_ARGMAX, /* argmax() */ - GA_REDUCE_MINANDARGMIN, /* min(), argmin() */ - GA_REDUCE_MAXANDARGMAX, /* max(), argmax() */ - GA_REDUCE_AND, /* & */ - GA_REDUCE_OR, /* | */ - GA_REDUCE_XOR, /* ^ */ - GA_REDUCE_ALL, /* &&/all() */ - GA_REDUCE_ANY, /* ||/any() */ + /* dst , dstArg */ + GA_REDUCE_SUM, /* + */ + GA_REDUCE_PROD, /* * */ + GA_REDUCE_PRODNZ, /* * (!=0) */ + GA_REDUCE_MIN, /* min() */ + GA_REDUCE_MAX, /* max() */ + GA_REDUCE_ARGMIN, /* argmin() */ + GA_REDUCE_ARGMAX, /* argmax() */ + GA_REDUCE_MINANDARGMIN, /* min() , argmin() */ + GA_REDUCE_MAXANDARGMAX, /* max() , argmax() */ + GA_REDUCE_AND, /* & */ + GA_REDUCE_OR, /* | */ + GA_REDUCE_XOR, /* ^ */ + GA_REDUCE_ALL, /* &&/all() */ + GA_REDUCE_ANY, /* ||/any() */ } ga_reduce_op; +/* External Functions */ /** - * @brief Compute a reduction over a list of axes to reduce. + * @brief Create a new GPU reduction operator over a list of axes to reduce. + * + * @param [out] gr The reduction operator. + * @param [in] gpuCtx The GPU context. + * @param [in] op The reduction operation to perform. + * @param [in] ndf The minimum number of destination dimensions to support. + * @param [in] ndr The minimum number of reduction dimensions to support. + * @param [in] srcTypeCode The data type of the source operand. + * @param [in] flags Reduction operator creation flags. Currently must be + * set to 0. 
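+ *
+ * A minimal usage sketch (hypothetical names; it assumes a valid
+ * gpucontext* gpuCtx and already-allocated GpuArrays dst and src, with
+ * src summed over its axis 1):
+ *
+ *     GpuReduction* gr;
+ *     int axes[1] = {1};
+ *     int err = GpuReduction_new(&gr, gpuCtx, GA_REDUCE_SUM,
+ *                                src.nd-1, 1, src.typecode, 0);
+ *     if (err == GA_NO_ERROR){
+ *         err = GpuReduction_call(gr, &dst, NULL, &src, 1, axes, 0);
+ *         GpuReduction_free(gr);
+ *     }
+ *
+ * (NULL is passed for dstArg since a sum produces no argument tensor.)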
+ * + * @return GA_NO_ERROR if the operator was created successfully, or a non-zero + * error code otherwise. + */ + +GPUARRAY_PUBLIC int GpuReduction_new (GpuReduction** grOut, + gpucontext* gpuCtx, + ga_reduce_op op, + unsigned ndf, + unsigned ndr, + int srcTypeCode, + int flags); + +/** + * @brief Deallocate an operator allocated by GpuReduction_new(). + */ + +GPUARRAY_PUBLIC void GpuReduction_free (GpuReduction* gr); + +/** + * @brief Invoke an operator allocated by GpuReduction_new() on a source tensor. * * Returns one (in the case of min-and-argmin/max-and-argmax, two) destination * tensors. The destination tensor(s)' axes are a strict subset of the axes of the * source tensor. The axes to be reduced are specified by the caller, and the * reduction is performed over these axes, which are then removed in the * destination. - * - * @param [in] op The reduction operation to perform. + * + * @param [in] gr The reduction operator. * @param [out] dst The destination tensor. Has the same type as the source. * @param [out] dstArg For argument of minima/maxima operations. Has type int64. * @param [in] src The source tensor. @@ -76,19 +113,20 @@ typedef enum _ga_reduce_op { * * where (i3,i4,i1) are the coordinates of the maximum- * valued element within subtensor [i0,:,i2,:,:] of src. - * @return GA_NO_ERROR if the operation was successful, or a non-zero error - * code otherwise. + * @param [in] flags Reduction operator invocation flags. Currently must be + * set to 0. + * + * @return GA_NO_ERROR if the operator was invoked successfully, or a non-zero + * error code otherwise. */ -GPUARRAY_PUBLIC int GpuArray_reduction (ga_reduce_op op, - GpuArray* dst, - GpuArray* dstArg, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); - - - +GPUARRAY_PUBLIC int GpuReduction_call (GpuReduction* gr, + GpuArray* dst, + GpuArray* dstArg, + const GpuArray* src, + unsigned reduxLen, + const int* reduxList, + int flags); #ifdef __cplusplus diff --git a/src/gpuarray_reduction.c b/src/gpuarray_reduction.c index 8f0f22f2fc..81376b93b8 100644 --- a/src/gpuarray_reduction.c +++ b/src/gpuarray_reduction.c @@ -24,11 +24,8 @@ /* Defines */ +#define DIVIDECEIL(a,b) (((a)+(b)-1)/(b)) #define MAX_HW_DIMS 3 -#define KERNEL_PRIMARY 0 -#define KERNEL_AUXILIARY 1 -#define AXIS_FREE 0 -#define AXIS_REDUX 1 @@ -40,17 +37,21 @@ struct axis_desc{ int reduxNum; - unsigned isReduced : 1; - int hwAxisStage0, hwAxisStage1; - size_t len, tmpLen, sliceLen; - ssize_t srcStride, srcOffset; - ssize_t dstStride, dstOffset; - ssize_t dstArgStride, dstArgOffset; + int ibNum; + unsigned ibp; + unsigned isReduced : 1; + unsigned isIntra : 1; + size_t len; + size_t splitLen; + size_t pdim; + ssize_t srcStride; + ssize_t dstStride; + ssize_t dstArgStride; }; typedef struct axis_desc axis_desc; /** - * Reduction Kernel Generator. + * Reduction Kernel Invoker. * * INTRO * @@ -65,20 +66,8 @@ typedef struct axis_desc axis_desc; * 1. Maximizing the use of coalesced memory loads within a warp. * 2. Maximizing the # of useful threads within a warp. * 3. Maximizing the number of warps within a block. - * - * NOTE: It is possible to guarantee for any tensor problem of at least - * 2*WARP_SIZE in scale that either - * 1. All warp blocks in the X dimension have more than 50% threads - * active 100% of the time, or - * 2. The warp blocks in the X dimension have 100% threads active more - * than 50% of the time. - * - * 4. 
Ensuring there are no more blocks than are permitted by the warp - * configuration and 2nd-stage workspace size (if required). - * 5. Ensuring there are no more than 5 blocks per multiprocessor. - * 6. Minimizing the 2nd-stage workspace (if it is required). - * 7. Striding the 2nd-stage workspace for maximum convenience (if it is - * required). Make it contiguous. + * 4. Ensuring there are no more than 5 blocks per multiprocessor. + * 5. Minimizing the workspace size (if it is required). * * * NOTES @@ -98,11 +87,9 @@ typedef struct axis_desc axis_desc; * 11. Sorted src axes for contiguous memory accesses * 12. Ndim, shape and dtype of flattened src tensor * 13. Number of stages (1 or 2) - * 14. Ndim, shape and dtype of workspace tensor - * 15. Warp axes - * 16. Hardware axes - * 17. Software axes - * 18. Source code + * 14. Size of workspace tensor + * 15. Intrablock/split/free/reduced axes + * 16. Source code * * Rationale for dependencies: * @@ -110,140 +97,20 @@ typedef struct axis_desc axis_desc; * context is a likely error and we want to fail fast. * 2) The type and initializer of the accumulator should be determined after * the context's properties have been retrieved since they provide - * information about the device's natively-supported types and operations. - * - * REFERENCES - * - * http://lpgpu.org/wp/wp-content/uploads/2013/05/poster_andresch_acaces2014.pdf - * - * - * - * - * - * Kernel Template: - * - * The following kernel code template displays the code generated for the - * small code model. For the large code model, no pre/postRedux() kernels - * are generated (since their functionality is incorporated within the main - * redux() kernel), no atomicRedux() function needs to be generated because - * writes to global memory are unconditional and not contended. - * - * - * //Macros - * #define FOROVER - * #define ESCAPE - * #define srcVal //Indexer - * #define dstVal //Indexer - * #define dstArgVal //Indexer - * #define rdxIdx //Special reduction index computer - * - * - * //Typedefs: - * typedef float S //The type of the source array. - * typedef float T //The type of the destination array. - * typedef ssize_t A //The type of the destination argument array. - * typedef ssize_t X //The type of the indices: signed 32/64-bit. - * typedef float K //The type of the accumulator variable. - * - * - * //Initializer (in case initial value of accumulator cannot be expressed - * //as a literal) - * static K getInitValTFn(void){ - * return ... - * } - * static K getInitValKFn(void){ - * return ... - * } - * - * - * //Reduce into global memory destination a value. - * static void writeBackFn(GLOBAL_MEM T* d_, T d, - * GLOBAL_MEM A* a_, A a){ - * //Large code model: - * *dPtr = d; - * *aPtr = a; - * - * //Small code model: - * // Something complex possibly involving CAS loops - * } - * - * - * //Load data from source and apply pre-operations, coercing the type to - * //the accumulator type K. - * static K loadValFn(X i0, X i1, ..., X iN, - * const GLOBAL_MEM S* srcPtr, - * const X srcOff, - * const GLOBAL_MEM X* srcSteps, - * ...?){ - * return ... - * } - * - * - * //Initialization kernel - * KERNEL void initKer(const GLOBAL_MEM X* srcSize, - * const GLOBAL_MEM X* chunkSize, - * GLOBAL_MEM T* dstPtr, - * const X dstOff, - * const GLOBAL_MEM X* dstSteps){ - * dstVal = getInitValTFn(); - * } - * - * - * //Reduction Kernel. 
- * KERNEL void reduxKer(GLOBAL_MEM S* srcPtr, - * const X srcOff, - * const GLOBAL_MEM X* srcSteps, - * const GLOBAL_MEM X* srcSize, - * const GLOBAL_MEM X* chunkSize, - * GLOBAL_MEM T* dstPtr, - * const X dstOff, - * const GLOBAL_MEM X* dstSteps, - * GLOBAL_MEM A* dstArgPtr, - * const X dstArgOff, - * const GLOBAL_MEM X* dstArgSteps){ - * //Declare Indices - * //Compute Ranges - * - * //Outer Loops - * K rdxK = getInitValKFn(); - * A rdxA = 0; - * //Inner Loops - * K k = loadValFn(indices..., srcPtr, srcOff, srcSteps) - * rdxK = k - * rdxA = rdxIdx - * writeBackFn(&dstVal, d, &dstArgVal, a); - * } - * - * - * //Post-scalar kernel, - * KERNEL void postKer(const GLOBAL_MEM X* srcSize, - * const GLOBAL_MEM X* chunkSize, - * GLOBAL_MEM T* dst, - * const X dstOff, - * const GLOBAL_MEM X* dstSteps){ - * //Default: Nothing. - * dstVal = dstVal - * } - * - * - * Initial Reduction Values - * +--------------+-----+-----+---------+---------+-----+-----+-----+-----+-----+ - * | Type\Op | + | * | max | min | & | | | ^ | && | || | - * +--------------+-----+-----+---------+---------+-----+-----+-----+-----+-----+ - * | signed int | 0 | 1 | INT_MIN | INT_MAX | ~0 | 0 | 0 | ~0 | 0 | - * | unsigned int | 0 | 1 | 0 | ~0 | ~0 | 0 | 0 | ~0 | 0 | - * | floating | 0.0 | 1.0 | NAN | NAN | | | | | | - * +--------------+-----+-----+---------+---------+-----+-----+-----+-----+-----+ + * information about the device's natively-supported types and operations + * (e.g. half-precision float) */ struct redux_ctx{ /* Function Arguments. */ + GpuReduction* gr; ga_reduce_op op; GpuArray* dst; GpuArray* dstArg; const GpuArray* src; int reduxLen; const int* reduxList; + int flags; /* General. */ int nds; /* # Source dimensions */ @@ -252,105 +119,146 @@ struct redux_ctx{ int ndfs; /* # Flattened source dimensions */ int ndfr; /* # Flattened source dimensions */ int ndfd; /* # Flattened source dimensions */ - int ndt; /* # Temporary workspace dimensions */ + int ndib; /* # Intra-block dimensions */ int zeroAllAxes; /* # of zero-length axes in source tensor */ int zeroRdxAxes; /* # of zero-length reduction axes in source tensor */ size_t prodAllAxes; /* Product of length of all axes in source tensor */ size_t prodRdxAxes; /* Product of length of all reduction axes in source tensor */ size_t prodFreeAxes; /* Product of length of all free axes in source tensor */ - /* GPU Context & Device */ - gpucontext* gpuCtx; - unsigned numProcs; - size_t warpSize; - size_t maxLg; - size_t maxLs[MAX_HW_DIMS]; - size_t maxGg; - size_t maxGs[MAX_HW_DIMS]; - /* Flattening */ axis_desc* xdSrc; - axis_desc* xdSrcFlat; axis_desc** xdSrcPtrs; axis_desc** xdTmpPtrs; + + /* Invoker */ + int phase; + size_t U; + size_t V; + size_t B; + unsigned D; + unsigned H; + unsigned splitReduce; + unsigned splitFree; + + axis_desc* xdSplit; + + size_t* l; + size_t* lPDim; + ssize_t* sJ; + ssize_t* dJ; + ssize_t* aJ; - size_t* flatSrcDimensions; - ssize_t* flatSrcStrides; gpudata* flatSrcData; ssize_t flatSrcOffset; - ssize_t* flatDstStrides; gpudata* flatDstData; ssize_t flatDstOffset; - ssize_t* flatDstArgStrides; gpudata* flatDstArgData; ssize_t flatDstArgOffset; - /* Select number of stages */ - int numStages; + gpudata* w; + size_t SHMEM; + ssize_t wdOff; + ssize_t pdOff; + ssize_t waOff; + ssize_t paOff; - /* Workspaces, in the case of 2-stage reduction */ - size_t* tmpDstDimensions; - ssize_t* tmpDstStrides; - gpudata* tmpDstData; - ssize_t* tmpDstArgStrides; - gpudata* tmpDstArgData; - - /* Source code Generator. 
*/ - int srcTypeCode; - int dstTypeCode; - int dstArgTypeCode; - int idxTypeCode; - int accTypeCode; - const char* srcTypeStr; - const char* dstTypeStr; - const char* dstArgTypeStr; - const char* idxTypeStr; - const char* accTypeStr; - const char* initValT; - const char* initValK; - strb s; - srcb srcGen; - char* sourceCode; - size_t sourceCodeLen; - char* errorString0; - char* errorString1; - GpuKernel preKernel; - GpuKernel kernel; - GpuKernel postKernel; + unsigned* ibs; + unsigned* ibp; + size_t* iblPDim; + ssize_t* ibsOff; + ssize_t* ibdOff; + ssize_t* ibaOff; + + void** kArgs; + + + /* Scheduler */ + size_t bs; + size_t gs; +}; +typedef struct redux_ctx redux_ctx; - /** - * Scheduler - * - * There are two sets of kernels that may be scheduled: - * 1) The reduction kernel. This is the only kernel scheduled in the - * large code model. - * 2) The initialization and post-scalar kernels. These are scheduled - * only in the small code model. - * - * The reduction kernel is the "primary" kernel. The other two, if needed, - * are referred to as "auxiliary" kernels. - */ - struct{ - int ndh; - int ndhp; - int ndhd; - int ndhr; - size_t bs [MAX_HW_DIMS]; - size_t gs [MAX_HW_DIMS]; - size_t cs [MAX_HW_DIMS]; - gpudata* chunkSizeGD; - } st1, st2; +/** + * Reduction Operator. + * + * INTRO + * + * Generates the source code for a reduction kernel over arbitrarily-dimensioned, + * -shaped and -typed tensors. + * + * + * GOALS + * + * The generator has the following goals: + * + * 1. Maximizing the use of coalesced memory loads within a warp. + * 2. Maximizing the # of useful threads within a warp. + * 3. Maximizing the number of warps within a block. + * 4. Ensuring there are no more than 5 blocks per multiprocessor. + * 5. Minimizing the workspace size (if it is required). + * + * + * REFERENCES + * + * http://lpgpu.org/wp/wp-content/uploads/2013/05/poster_andresch_acaces2014.pdf + * + * + * Initial Reduction Values + * +--------------+-----+-----+---------+---------+-----+-----+-----+-----+-----+ + * | Type\Op | + | * | max | min | & | | | ^ | && | || | + * +--------------+-----+-----+---------+---------+-----+-----+-----+-----+-----+ + * | signed int | 0 | 1 | INT_MIN | INT_MAX | ~0 | 0 | 0 | ~0 | 0 | + * | unsigned int | 0 | 1 | 0 | ~0 | ~0 | 0 | 0 | ~0 | 0 | + * | floating | 0.0 | 1.0 | NAN | NAN | | | | | | + * +--------------+-----+-----+---------+---------+-----+-----+-----+-----+-----+ + */ - /* Invoker */ - gpudata* srcStepsGD; - gpudata* srcSizeGD; - gpudata* chunkSizeGD; - gpudata* dstStepsGD; - gpudata* dstArgStepsGD; +struct GpuReduction{ + /* Function Arguments. */ + gpucontext* gpuCtx; + ga_reduce_op op; + int ndd; + int ndr; + int srcTypeCode; + int flags; + + /* Misc */ + int nds; + + /* Source code Generator. 
*/ + strb s; + srcb srcGen; + char* kSourceCode; + size_t kSourceCodeLen; + int dstTypeCode; + int dstArgTypeCode; + int idxTypeCode; + int accTypeCode; + const char* srcTypeStr; + const char* dstTypeStr; + const char* dstArgTypeStr; + const char* idxTypeStr; + const char* accTypeStr; + const char* initVal; + + /* Compile */ + int log2MaxL; + int kNumArgs; + int* kArgTypeCodes; + char* kErrorString; + GpuKernel k; + + /* Scheduling */ + unsigned numProcs; + size_t maxLg; + size_t maxL0; + size_t maxGg; + size_t maxG0; + size_t maxLM; + size_t maxLK; }; -typedef struct redux_ctx redux_ctx; - /* Static Function prototypes */ @@ -361,123 +269,184 @@ static int reduxGetMinInit (int typecode, const char** prop static int reduxGetMaxInit (int typecode, const char** property); static int reduxGetAndInit (int typecode, const char** property); static int reduxGetOrInit (int typecode, const char** property); +static int reduxIsSensitive (int typecode); static int reduxSortFlatSensitive (const void* a, const void* b); static int reduxSortFlatInsensitive (const void* a, const void* b); -static int reduxSortPlan1Stage (const void* a, const void* b); -static int reduxSortPlan2Stage0 (const void* a, const void* b); -static void appendIdxes (strb* s, - const char* prologue, - const char* prefix, - int startIdx, - int endIdx, - const char* suffix, - const char* epilogue); +static int reduxSortPtrIBSrcRdSelect (const void* a, const void* b); +static int reduxSortPtrByReduxNum (const void* a, const void* b); +static int reduxSortPtrIBDstWrSelect (const void* a, const void* b); +static int reduxSortPtrIBDstArgWrSelect (const void* a, const void* b); +static int reduxSortPtrFinalOrder (const void* a, const void* b); /* Axis Description API */ -static void axisInit (axis_desc* axis, - ssize_t len, - ssize_t srcStride); -static void axisMarkReduced (axis_desc* axis, int reduxNum); -static int axisGetReduxNum (const axis_desc* axis); -static size_t axisGetLen (const axis_desc* axis); -static size_t axisGetTmpLen (const axis_desc* axis); -static size_t axisGetSliceLen (const axis_desc* axis); -static ssize_t axisGetSrcStride (const axis_desc* axis); -static size_t axisGetSrcAbsStride (const axis_desc* axis); -static ssize_t axisGetSrcOffset (const axis_desc* axis); -static ssize_t axisGetDstStride (const axis_desc* axis); -static size_t axisGetDstAbsStride (const axis_desc* axis); -static ssize_t axisGetDstOffset (const axis_desc* axis); -static ssize_t axisGetDstArgStride (const axis_desc* axis); -static size_t axisGetDstArgAbsStride (const axis_desc* axis); -static ssize_t axisGetDstArgOffset (const axis_desc* axis); -static int axisIsReduced (const axis_desc* axis); -static int axisIsHW (const axis_desc* axis, int stage); -static int axisGetHWAxisNum (const axis_desc* axis, int stage); +static void axisInit (axis_desc* axis, + ssize_t len, + ssize_t srcStride); +static void axisMarkReduced (axis_desc* axis, int reduxNum); +static void axisMarkIntraBlock (axis_desc* axis, + int ibNum, + size_t ibLen); +static int axisGetReduxNum (const axis_desc* axis); +static size_t axisGetLen (const axis_desc* axis); +static size_t axisGetIntraLen (const axis_desc* axis); +static size_t axisGetInterLen (const axis_desc* axis); +static ssize_t axisGetSrcStride (const axis_desc* axis); +static size_t axisGetSrcAbsStride (const axis_desc* axis); +static ssize_t axisGetDstStride (const axis_desc* axis); +static size_t axisGetDstAbsStride (const axis_desc* axis); +static ssize_t axisGetDstArgStride (const axis_desc* axis); +static 
size_t axisGetDstArgAbsStride (const axis_desc* axis); +static unsigned axisGetIBP (const axis_desc* axis); +static int axisGetIBNum (const axis_desc* axis); +static void axisSetIBP (axis_desc* axis, + unsigned ibp); +static size_t axisGetPDim (const axis_desc* axis); +static void axisSetPDim (axis_desc* axis, + size_t pdim); +static int axisIsReduced (const axis_desc* axis); +static int axisIsIntra (const axis_desc* axis); +static int axisIsInter (const axis_desc* axis); +static int axisIsSplit (const axis_desc* axis); /* Reduction Context API */ -/* Utilities */ -static size_t reduxEstimateParallelism (const redux_ctx* ctx); -static int reduxRequiresDst (const redux_ctx* ctx); -static int reduxRequiresDstArg (const redux_ctx* ctx); -static int reduxKernelRequiresDst (const redux_ctx* ctx); -static int reduxKernelRequiresDstArg (const redux_ctx* ctx); -static int reduxIsSensitive (const redux_ctx* ctx); -static int reduxIs1Stage (const redux_ctx* ctx); -static int reduxIs2Stage (const redux_ctx* ctx); -static axis_desc* reduxGetSrcAxis (const redux_ctx* ctx, int i); -static axis_desc* reduxGetSrcSortAxis (const redux_ctx* ctx, int i); -static axis_desc* reduxGetSrcFlatAxis (const redux_ctx* ctx, int i); -static axis_desc* reduxGetTmpAxis (const redux_ctx* ctx, int i); -static int reduxTryFlattenInto (const redux_ctx* ctx, +/* Generator Control Flow */ +static int reduxGenInit (GpuReduction* gr); +static int reduxGenInferProperties (GpuReduction* gr); +static int reduxGenSrc (GpuReduction* gr); +static void reduxGenSrcAppend (GpuReduction* gr); +static void reduxGenSrcAppendIncludes (GpuReduction* gr); +static void reduxGenSrcAppendMacroDefs (GpuReduction* gr); +static void reduxGenSrcAppendTypedefs (GpuReduction* gr); +static void reduxGenSrcAppendReduxKernel (GpuReduction* gr); +static void reduxGenSrcAppendPrototype (GpuReduction* gr); +static void reduxGenSrcAppendBlockDecode (GpuReduction* gr); +static void reduxGenSrcAppendThreadDecode (GpuReduction* gr); +static void reduxGenSrcAppendPhase0 (GpuReduction* gr); +static void reduxGenSrcAppendLoops (GpuReduction* gr, + int freeMaybeSplit, + int reduceMaybeSplit); +static void reduxGenSrcAppendLoop (GpuReduction* gr, + int initial, + int freeMaybeSplit, + int reduceMaybeSplit); +static void reduxGenSrcAppendDecrement (GpuReduction* gr); +static void reduxGenSrcAppendVertical (GpuReduction* gr, + int freeMaybeSplit, + int reduceMaybeSplit); +static void reduxGenSrcAppendIncrement (GpuReduction* gr, + int axis, + int initial, + int freeMaybeSplit, + int reduceMaybeSplit); +static void reduxGenSrcAppendDstWrite (GpuReduction* gr, + int initial, + int freeMaybeSplit, + int reduceMaybeSplit); +static void reduxGenSrcAppendPhase1 (GpuReduction* gr); +static int reduxGenCompile (GpuReduction* gr); +static int reduxGenComputeLaunchBounds (GpuReduction* gr); +static int reduxGenCleanup (GpuReduction* gr, int ret); +static int reduxGenCleanupMsg (GpuReduction* gr, int ret, + const char* fmt, ...); + +/* Generator Utilities */ +static size_t reduxGenEstimateParallelism (const GpuReduction* gr); +static int reduxGenRequiresDst (const GpuReduction* gr); +static int reduxGenRequiresDstArg (const GpuReduction* gr); +static int reduxGenKernelRequiresDst (const GpuReduction* gr); +static int reduxGenKernelRequiresDstArg (const GpuReduction* gr); +static int reduxGenAxisMaybeSplit (const GpuReduction* gr, int axis); +static size_t reduxGenGetReduxStateSize (const GpuReduction* gr); +static size_t reduxGenGetMaxLocalSize (const GpuReduction* gr); +static 
size_t reduxGenGetSHMEMSize (const GpuReduction* gr, size_t bs); +static size_t reduxGenGetSHMEMDstOff (const GpuReduction* gr, size_t bs); +static size_t reduxGenGetSHMEMDstArgOff (const GpuReduction* gr, size_t bs); + +/* Invoker Control Flow */ +static int reduxInvInit (redux_ctx* ctx); +static int reduxInvInferProperties (redux_ctx* ctx); +static int reduxInvFlattenSource (redux_ctx* ctx); +static int reduxInvComputeKArgs (redux_ctx* ctx); +static int reduxInvSchedule (redux_ctx* ctx); +static int reduxInvoke (redux_ctx* ctx); +static int reduxInvCleanup (redux_ctx* ctx, int ret); +static int reduxInvCleanupMsg (redux_ctx* ctx, int ret, + const char* fmt, ...); + +/* Invoker Utilities */ +static size_t reduxInvEstimateParallelism (const redux_ctx* ctx); +static int reduxInvRequiresDst (const redux_ctx* ctx); +static int reduxInvRequiresDstArg (const redux_ctx* ctx); +static int reduxInvKernelRequiresDst (const redux_ctx* ctx); +static unsigned reduxInvGetSplitFree (const redux_ctx* ctx); +static unsigned reduxInvGetSplitReduce (const redux_ctx* ctx); +static axis_desc* reduxInvGetSrcAxis (const redux_ctx* ctx, int i); +static axis_desc* reduxInvGetSrcSortAxis (const redux_ctx* ctx, int i); +static int reduxTryFlattenOut (const redux_ctx* ctx, + const axis_desc* out); +static int reduxTryFlattenInto (redux_ctx* ctx, axis_desc* into, const axis_desc* from); static void reduxSortAxisPtrsBy (axis_desc** ptrs, axis_desc* axes, size_t numAxes, int(*fn)(const void*, const void*)); -/* Control Flow */ -static int reduxInit (redux_ctx* ctx); -static int reduxInferProperties (redux_ctx* ctx); -static int reduxFlattenSource (redux_ctx* ctx); -static int reduxSelectNumStages (redux_ctx* ctx); -static int reduxPlan1Stage (redux_ctx* ctx); -static int reduxPlan2Stage (redux_ctx* ctx); -static int reduxGenSource (redux_ctx* ctx); -static void reduxAppendSource (redux_ctx* ctx); -static void reduxAppendIncludes (redux_ctx* ctx); -static void reduxAppendTensorDeclArgs (redux_ctx* ctx, - const char* type, - const char* baseName); -static void reduxAppendTensorCallArgs (redux_ctx* ctx, - const char* baseName); -static void reduxAppendMacroDefs (redux_ctx* ctx); -static void reduxAppendTypedefs (redux_ctx* ctx); -static void reduxAppendGetInitValFns (redux_ctx* ctx); -static void reduxAppendWriteBackFn (redux_ctx* ctx); -static void reduxAppendReduxKernel (redux_ctx* ctx); -static void reduxAppendPrototype (redux_ctx* ctx); -static void reduxAppendIndexDeclarations (redux_ctx* ctx); -static void reduxAppendRangeCalculations (redux_ctx* ctx); -static void reduxAppendLoops (redux_ctx* ctx); -static int reduxCompile (redux_ctx* ctx); -static int reduxSchedule (redux_ctx* ctx); -static void reduxScheduleKernel (int ndims, - uint64_t* dims, - uint64_t warpSize, - uint64_t maxLg, - uint64_t* maxLs, - uint64_t maxGg, - uint64_t* maxGs, - uint64_t* bs, - uint64_t* gs, - uint64_t* cs); -static int reduxInvoke (redux_ctx* ctx); -static int reduxCleanup (redux_ctx* ctx, int ret); -static int reduxCleanupMsg (redux_ctx* ctx, int ret, - const char* fmt, ...); - - -/* Function implementation */ -GPUARRAY_PUBLIC int GpuArray_reduction (ga_reduce_op op, - GpuArray* dst, - GpuArray* dstArg, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList){ + + +/* Function Implementations */ +/* Extern Functions */ +GPUARRAY_PUBLIC int GpuReduction_new (GpuReduction** grOut, + gpucontext* gpuCtx, + ga_reduce_op op, + unsigned ndf, + unsigned ndr, + int srcTypeCode, + int flags){ + if(!grOut){ + return 
GA_INVALID_ERROR; + } + + *grOut = calloc(1, sizeof(**grOut)); + if(*grOut){ + (*grOut)->gpuCtx = gpuCtx; + (*grOut)->op = op; + (*grOut)->ndd = (int)ndf; + (*grOut)->ndr = (int)ndr; + (*grOut)->srcTypeCode = srcTypeCode; + (*grOut)->flags = flags; + + return reduxGenInit(*grOut); + }else{ + return GA_MEMORY_ERROR; + } +} +GPUARRAY_PUBLIC void GpuReduction_free (GpuReduction* gr){ + reduxGenCleanup(gr, !GA_NO_ERROR); +} +GPUARRAY_PUBLIC int GpuReduction_call (GpuReduction* gr, + GpuArray* dst, + GpuArray* dstArg, + const GpuArray* src, + unsigned reduxLen, + const int* reduxList, + int flags){ redux_ctx ctxSTACK, *ctx = &ctxSTACK; memset(ctx, 0, sizeof(*ctx)); - ctx->op = op; + ctx->gr = gr; ctx->dst = dst; ctx->dstArg = dstArg; ctx->src = src; ctx->reduxLen = reduxLen; - ctx->reduxList = (const int*)reduxList; + ctx->reduxList = reduxList; + ctx->flags = flags; - return reduxInit(ctx); + return reduxInvInit(ctx); } + +/* Static Functions */ + /** * @brief Get an expression representing a suitable initialization value for * the given datatype and a sum-reduction operation. @@ -747,6 +716,45 @@ static int reduxGetOrInit (int typecode, const char** prop return GA_NO_ERROR; } +/** + * @brief Returns whether the reduction is sensitive. + * + * A reduction is sensitive when its output satisfies at least one of the + * following conditions: + * + * - It depends on the exact order of axes in the reduxList + * - It depends on exact signs of the strides of axes in the reduxList + * + * Such sensitivity may prevent a flattening of contiguous axes even when it + * would have been otherwise permitted. + * + * For instance, ARGMIN/ARGMAX have this sensitivity, because the dstArg + * tensor's contents are flattened coordinates into the source tensor, and + * the flattening order is precisely reduxList. Permuting it would thus produce + * incorrect output. Moreover, if the strides of a reduction axis were to be + * reversed for the purpose of flattening the axis into another, the computed + * coordinate would again be incorrect. + * + * + * TL;DR: Reduction is sensitive if + * reduce(x, axis=axisList) != reduce(x, axis=axisList[::-1]) + * or + * reduce(x) != reduce(x[::-1]) + * . + */ + +static int reduxIsSensitive (int typecode){ + switch (typecode){ + case GA_REDUCE_MINANDARGMIN: + case GA_REDUCE_MAXANDARGMAX: + case GA_REDUCE_ARGMIN: + case GA_REDUCE_ARGMAX: + return 1; + default: + return 0; + } +} + /** * @brief Sort the axes into optimal order for flattening. * @@ -811,78 +819,155 @@ static int reduxSortFlatSensitive (const void* a, const void* b){ } /** - * For the plan of a 1-stage reduction, we need to sort the free axes by - * decreasing length. + * @brief Sort the axes into optimal order for contiguous memory access. + * + * This means ascending order of absolute stride. 
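+ *
+ * For example (hypothetical byte strides), three axes with absolute
+ * source strides {512, 4, 64} sort to {4, 64, 512}, so the axis walked
+ * fastest is the one whose elements lie closest together in memory.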
*/ -static int reduxSortPlan1Stage (const void* a, const void* b){ +static int reduxSortPtrIBSrcRdSelect (const void* a, const void* b){ const axis_desc* xda = *(const axis_desc* const*)a; const axis_desc* xdb = *(const axis_desc* const*)b; + + if (axisGetSrcAbsStride(xda) < axisGetSrcAbsStride(xdb)){ + return -1; + }else if (axisGetSrcAbsStride(xda) > axisGetSrcAbsStride(xdb)){ + return +1; + } - if ( axisIsReduced(xda) && !axisIsReduced(xdb)){ + return 0; +} +static int reduxSortPtrByReduxNum (const void* a, const void* b){ + const axis_desc* xda = *(const axis_desc* const*)a; + const axis_desc* xdb = *(const axis_desc* const*)b; + + if ( axisIsReduced(xda) && !axisIsReduced(xdb)){ + return -1; + }else if (!axisIsReduced(xda) && axisIsReduced(xdb)){ return +1; - }else if (!axisIsReduced(xda) && axisIsReduced(xdb)){ + } + + if (axisGetReduxNum(xda) < axisGetReduxNum(xdb)){ + return +1; + }else if (axisGetReduxNum(xda) > axisGetReduxNum(xdb)){ return -1; } - return axisGetLen(xda) axisGetDstAbsStride(xdb)){ return +1; } - return axisGetLen(xda) axisGetDstArgAbsStride(xdb)){ + return +1; + } -/** - * @brief Append a comma-separated list of indices, whose name contains an - * incrementing integer, to a string buffer. - * - * - * @param [in] s The string buffer to which to append. - * @param [in] prologue Text that is prepended in front and NOT repeated. - * @param [in] prefix Text that is prepended in front of the integer and - * repeated. - * @param [in] startIdx First value of the integer (inclusive) - * @param [in] endIdx Last value of the integer (exclusive) - * @param [in] suffix Text that is appended after the integer, followed by - * a comma if it isn't the last index, and repeated. - * @param [in] epilogue Text that is appended and NOT repeated. - */ - -static void appendIdxes (strb* s, - const char* prologue, - const char* prefix, - int startIdx, - int endIdx, - const char* suffix, - const char* epilogue){ - int i; - - prologue = prologue ? prologue : ""; - prefix = prefix ? prefix : ""; - suffix = suffix ? suffix : ""; - epilogue = epilogue ? epilogue : ""; - - strb_appends(s, prologue); - for (i=startIdx;i axisGetIBNum(xdb)){ + return -1; + } + + return 0; + }else{ + /* All free inter axes go first (i{0..3}) */ + if ( axisIsReduced(xda) && !axisIsReduced(xdb)){ + return +1; + }else if (!axisIsReduced(xda) && axisIsReduced(xdb)){ + return -1; + } + + /* Otherwise it's sort by descending source argument absolute stride. */ + if (axisGetSrcAbsStride(xda) < axisGetSrcAbsStride(xdb)){ + return +1; + }else if (axisGetSrcAbsStride(xda) > axisGetSrcAbsStride(xdb)){ + return -1; + } } - strb_appends(s, epilogue); + + return 0; } + /* Axis Description API */ /** @@ -895,19 +980,15 @@ static void axisInit (axis_desc* axis, memset(axis, 0, sizeof(*axis)); axis->reduxNum = -1; - axis->hwAxisStage0 = axis->hwAxisStage1 = -1; + axis->ibNum = -1; + axis->ibp = 0; axis->len = len; - axis->tmpLen = 0; - axis->sliceLen = 0; + axis->splitLen = 1; + axis->pdim = 0; axis->srcStride = srcStride; - axis->srcOffset = 0; - axis->dstStride = 0; - axis->dstOffset = 0; - axis->dstArgStride = 0; - axis->dstArgOffset = 0; } /** @@ -919,6 +1000,18 @@ static void axisMarkReduced (axis_desc* axis, int r axis->reduxNum = reduxNum; } +/** + * @brief Mark axis as (split) intrablock axis. + */ + +static void axisMarkIntraBlock (axis_desc* axis, + int ibNum, + size_t ibLen){ + axis->isIntra = 1; + axis->ibNum = ibNum; + axis->splitLen = ibLen; +} + /** * @brief Get properties of an axis. 
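 *
 * Illustrative example (hypothetical values): an axis of length 100 that was
 * split by axisMarkIntraBlock(axis, ibNum, 32) reports
 *
 *     axisGetIntraLen(axis) == 32                        (per-block share)
 *     axisGetInterLen(axis) == DIVIDECEIL(100, 32) == 4  (inter-block steps)
 *
 * whereas an unsplit intra axis reports intraLen == len and interLen == 1,
 * and an inter (non-intra) axis reports intraLen == 1 and interLen == len.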
*/ @@ -929,11 +1022,23 @@ static int axisGetReduxNum (const axis_desc* axis){ static size_t axisGetLen (const axis_desc* axis){ return axis->len; } -static size_t axisGetTmpLen (const axis_desc* axis){ - return axis->tmpLen; +static size_t axisGetIntraLen (const axis_desc* axis){ + if (axisIsSplit(axis)){ + return axis->splitLen; + }else if (axisIsIntra(axis)){ + return axis->len; + }else{ + return 1; + } } -static size_t axisGetSliceLen (const axis_desc* axis){ - return axis->sliceLen; +static size_t axisGetInterLen (const axis_desc* axis){ + if (axisIsSplit(axis)){ + return DIVIDECEIL(axis->len, axis->splitLen); + }else if (axisIsIntra(axis)){ + return 1; + }else{ + return axis->len; + } } static ssize_t axisGetSrcStride (const axis_desc* axis){ return axisGetLen(axis) > 1 ? axis->srcStride : 0; @@ -942,9 +1047,6 @@ static size_t axisGetSrcAbsStride (const axis_desc* axis){ return axisGetSrcStride(axis)<0 ? -(size_t)axisGetSrcStride(axis): +(size_t)axisGetSrcStride(axis); } -static ssize_t axisGetSrcOffset (const axis_desc* axis){ - return axis->srcOffset; -} static ssize_t axisGetDstStride (const axis_desc* axis){ return axisGetLen(axis) > 1 ? axis->dstStride : 0; } @@ -952,9 +1054,6 @@ static size_t axisGetDstAbsStride (const axis_desc* axis){ return axisGetDstStride(axis)<0 ? -(size_t)axisGetDstStride(axis): +(size_t)axisGetDstStride(axis); } -static ssize_t axisGetDstOffset (const axis_desc* axis){ - return axis->dstOffset; -} static ssize_t axisGetDstArgStride (const axis_desc* axis){ return axisGetLen(axis) > 1 ? axis->dstArgStride : 0; } @@ -962,175 +1061,70 @@ static size_t axisGetDstArgAbsStride (const axis_desc* axis){ return axisGetDstArgStride(axis)<0 ? -(size_t)axisGetDstArgStride(axis): +(size_t)axisGetDstArgStride(axis); } -static ssize_t axisGetDstArgOffset (const axis_desc* axis){ - return axis->dstArgOffset; +static unsigned axisGetIBP (const axis_desc* axis){ + return axis->ibp; } -static int axisIsReduced (const axis_desc* axis){ +static int axisGetIBNum (const axis_desc* axis){ + return axis->ibNum; +} +static void axisSetIBP (axis_desc* axis, + unsigned ibp){ + axis->ibp = ibp; +} +static size_t axisGetPDim (const axis_desc* axis){ + return axis->pdim; +} +static void axisSetPDim (axis_desc* axis, + size_t pdim){ + axis->pdim = pdim; +} +static int axisIsReduced (const axis_desc* axis){ return axis->isReduced; } -static int axisIsHW (const axis_desc* axis, int stage){ - return (stage == 0 ? axis->hwAxisStage0 : axis->hwAxisStage1) >= 0; +static int axisIsIntra (const axis_desc* axis){ + return axis->isIntra; } -static int axisIsPartialHW (const axis_desc* axis, int stage){ - return axisIsHW(axis, stage) && axis->sliceLen != axis->len; +static int axisIsInter (const axis_desc* axis){ + return !axisIsIntra(axis); } -static int axisGetHWAxisNum (const axis_desc* axis, int stage){ - return stage == 0 ? axis->hwAxisStage0 : axis->hwAxisStage1; +static int axisIsSplit (const axis_desc* axis){ + return axisIsIntra(axis) && axis->splitLen != axis->len; } - -/** - * @brief Estimate the level of parallelism in the device. - * - * This is a rough target number of threads. It would definitely fill the - * device, plus some substantial margin. - */ - -static size_t reduxEstimateParallelism (const redux_ctx* ctx){ - /** - * An arbitrary margin factor ensuring there will be a few thread blocks - * per SMX. - * - * E.g. 
on Kepler, each SMX can handle up to two 1024-thread blocks - * simultaneously, so a margin of 6/SMX should ensure with very high - * likelyhood that all SMXes will be fed and kept busy. - */ - - size_t marginFactor = 6; - - return marginFactor*ctx->numProcs*ctx->maxLg; +static size_t reduxInvEstimateParallelism (const redux_ctx* ctx){ + return reduxGenEstimateParallelism(ctx->gr); } - -/** - * @brief Returns whether the reduction interface requires a dst argument. - */ - -static int reduxRequiresDst (const redux_ctx* ctx){ - switch (ctx->op){ - case GA_REDUCE_ARGMIN: - case GA_REDUCE_ARGMAX: - return 0; - default: - return 1; - } +static int reduxInvRequiresDst (const redux_ctx* ctx){ + return reduxGenRequiresDst(ctx->gr); } - -/** - * @brief Returns whether the reduction interface requires a dstArg argument. - */ - -static int reduxRequiresDstArg (const redux_ctx* ctx){ - switch (ctx->op){ - case GA_REDUCE_MINANDARGMIN: - case GA_REDUCE_MAXANDARGMAX: - case GA_REDUCE_ARGMIN: - case GA_REDUCE_ARGMAX: - return 1; - default: - return 0; - } +static int reduxInvRequiresDstArg (const redux_ctx* ctx){ + return reduxGenRequiresDstArg(ctx->gr); } - -/** - * @brief Returns whether the generated kernel internally requires a dst - * argument. - * - * This is semantically subtly different from reduxHasDst(). The main - * difference is in the implementation of the GA_REDUCE_ARGMIN/ARGMAX - * reductions; Either *might* require a dst buffer, which will have to be - * allocated, even though it will be discarded. - */ - -static int reduxKernelRequiresDst (const redux_ctx* ctx){ - switch (ctx->op){ - case GA_REDUCE_ARGMIN: - case GA_REDUCE_ARGMAX: - return reduxIs2Stage(ctx); - default: - return 1; - } +static int reduxInvKernelRequiresDst (const redux_ctx* ctx){ + return reduxGenKernelRequiresDst(ctx->gr); } - -/** - * @brief Returns whether the generated kernel internally requires a dstArg - * argument. - * - * This is semantically subtly different from reduxHasDstArg(), since it asks - * whether the reduction, even though it does not accept a dstArg argument, - * still requires a dstArg internally. - */ - -static int reduxKernelRequiresDstArg (const redux_ctx* ctx){ - /** - * At present there exists no reduction whose implementation requires - * a dstArg but whose interface does not. - * - * E.g. the max() and min() reductions do NOT currently require a temporary - * buffer for indexes, and will not in the foreseeable future. - */ - - return reduxRequiresDstArg(ctx); +static int reduxInvKernelRequiresDstArg (const redux_ctx* ctx){ + return reduxGenKernelRequiresDstArg(ctx->gr); } - -/** - * @brief Returns whether the reduction is sensitive. - * - * A reduction is sensitive when its output satisfies at least one of the - * following conditions: - * - * - It depends on the exact order of axes in the reduxList - * - It depends on exact signs of the strides of axes in the reduxList - * - * Such sensitivity may prevent a flattening of contiguous axes even when it - * would have been otherwise permitted. - * - * For instance, ARGMIN/ARGMAX have this sensitivity, because the dstArg - * tensor's contents are flattened coordinates into the source tensor, and - * the flattening order is precisely reduxList. Permuting it would thus produce - * incorrect output. Moreover, if the strides of a reduction axis were to be - * reversed for the purpose of flattening the axis into another, the computed - * coordinate would again be incorrect. 
- * - * - * TL;DR: Reduction is sensitive if - * reduce(x, axis=axisList) != reduce(x, axis=axisList[::-1]) - * or - * reduce(x) != reduce(x[::-1]) - * . - */ - -static int reduxIsSensitive (const redux_ctx* ctx){ - switch (ctx->op){ - case GA_REDUCE_MINANDARGMIN: - case GA_REDUCE_MAXANDARGMAX: - case GA_REDUCE_ARGMIN: - case GA_REDUCE_ARGMAX: - return 1; - default: - return 0; +static unsigned reduxInvGetSplitFree (const redux_ctx* ctx){ + if(ctx->xdSplit && !axisIsReduced(ctx->xdSplit)){ + return axisGetIntraLen(ctx->xdSplit); + }else{ + return 1; } } - -/** - * @brief Is the reduction 1-stage? - */ - -static int reduxIs1Stage (const redux_ctx* ctx){ - return ctx->numStages == 1; -} - -/** - * @brief Is the reduction 2-stage? - */ - -static int reduxIs2Stage (const redux_ctx* ctx){ - return !reduxIs1Stage(ctx); +static unsigned reduxInvGetSplitReduce (const redux_ctx* ctx){ + if(ctx->xdSplit && axisIsReduced(ctx->xdSplit)){ + return axisGetIntraLen(ctx->xdSplit); + }else{ + return 1; + } } /** * @brief Get description of source axis with given number. */ -static axis_desc* reduxGetSrcAxis (const redux_ctx* ctx, int i){ +static axis_desc* reduxInvGetSrcAxis (const redux_ctx* ctx, int i){ return &ctx->xdSrc[i]; } @@ -1138,24 +1132,30 @@ static axis_desc* reduxGetSrcAxis (const redux_ctx* ctx, int i){ * @brief Get description of source axis with given number in sort-order. */ -static axis_desc* reduxGetSrcSortAxis (const redux_ctx* ctx, int i){ +static axis_desc* reduxInvGetSrcSortAxis (const redux_ctx* ctx, int i){ return ctx->xdSrcPtrs[i]; } /** - * @brief Get description of flattened source axis with given number. - */ - -static axis_desc* reduxGetSrcFlatAxis (const redux_ctx* ctx, int i){ - return &ctx->xdSrcFlat[i]; -} - -/** - * @brief Get description of temporary axis with given number. + * @brief Attempt to flatten out an axis from the context. + * + * An axis can be flattened out if: + * + * 1. The axis is of length 1. + * 2. The axis is a reduction axis, and there exists at least one reduction + * axis of length 0 in the source tensor. + * + * @return Non-zero if flattening attempt successful; Zero otherwise. */ -static axis_desc* reduxGetTmpAxis (const redux_ctx* ctx, int i){ - return ctx->xdTmpPtrs[i]; +static int reduxTryFlattenOut (const redux_ctx* ctx, + const axis_desc* out){ + if ((axisGetLen (out) == 1 )|| + (axisIsReduced(out) && ctx->zeroRdxAxes > 0)){ + return 1; + }else{ + return 0; + } } /** @@ -1179,7 +1179,7 @@ static axis_desc* reduxGetTmpAxis (const redux_ctx* ctx, int i){ * @return Non-zero if flattening attempt successful; Zero otherwise. 
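 *
 * Worked example (hypothetical axes): flattening a "from" axis of length 10
 * and source stride 4 into an "into" axis of length 3 and source stride 40
 * is permitted because |40| == |4|*10; the result is a single axis of
 * length 30 with stride 4. If the two strides had had opposite signs (and
 * the reduction were not order-sensitive), the "from" axis would be
 * logically reversed and the displacement folded into ctx->flatSrcOffset
 * (and likewise flatDstOffset/flatDstArgOffset for the destination tensors).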
*/ -static int reduxTryFlattenInto (const redux_ctx* ctx, +static int reduxTryFlattenInto (redux_ctx* ctx, axis_desc* into, const axis_desc* from){ int signSrc = 0, signDst = 0, signDstArg = 0, @@ -1190,12 +1190,12 @@ static int reduxTryFlattenInto (const redux_ctx* ctx, return 0; } - if (reduxRequiresDst(ctx) && + if (reduxInvRequiresDst (ctx) && axisGetDstAbsStride (into) != axisGetDstAbsStride (from)*axisGetLen(from)){ return 0; } - if (reduxRequiresDstArg(ctx) && + if (reduxInvRequiresDstArg(ctx) && axisGetDstArgAbsStride(into) != axisGetDstArgAbsStride(from)*axisGetLen(from)){ return 0; } @@ -1204,47 +1204,44 @@ static int reduxTryFlattenInto (const redux_ctx* ctx, signDst = (axisGetDstStride (into)^axisGetDstStride (from)) < 0; signDstArg = (axisGetDstArgStride(into)^axisGetDstArgStride(from)) < 0; reverseSrc = signSrc; - reverseDst = signDst && reduxRequiresDst (ctx); - reverseDstArg = signDstArg && reduxRequiresDstArg(ctx); + reverseDst = signDst && reduxInvRequiresDst (ctx); + reverseDstArg = signDstArg && reduxInvRequiresDstArg(ctx); - if (reduxIsSensitive(ctx)){ + if (reduxIsSensitive(ctx->op)){ if(reverseSrc || reverseDst || reverseDstArg){ return 0; } } - if (reduxRequiresDst (ctx) && - reduxRequiresDstArg(ctx) && + if (reduxInvRequiresDst (ctx) && + reduxInvRequiresDstArg(ctx) && reverseDst != reverseDstArg){ /* Either both, or neither, of dst and dstArg must require reversal. */ return 0; } if (reverseSrc){ - into->srcOffset += (ssize_t)(axisGetLen(from)-1)*axisGetSrcStride(from); - into->srcStride = -axisGetSrcStride (from); + ctx->flatSrcOffset += (ssize_t)(axisGetLen(from)-1)*axisGetSrcStride(from); + into->srcStride = -axisGetSrcStride (from); }else{ - into->srcStride = axisGetSrcStride (from); + into->srcStride = axisGetSrcStride (from); } if (reverseDst){ - into->dstOffset += (ssize_t)(axisGetLen(from)-1)*axisGetDstStride(from); - into->dstStride = -axisGetDstStride (from); + ctx->flatDstOffset += (ssize_t)(axisGetLen(from)-1)*axisGetDstStride(from); + into->dstStride = -axisGetDstStride (from); }else{ - into->dstStride = axisGetDstStride (from); + into->dstStride = axisGetDstStride (from); } if (reverseDstArg){ - into->dstArgOffset += (ssize_t)(axisGetLen(from)-1)*axisGetDstArgStride(from); - into->dstArgStride = -axisGetDstArgStride(from); + ctx->flatDstArgOffset += (ssize_t)(axisGetLen(from)-1)*axisGetDstArgStride(from); + into->dstArgStride = -axisGetDstArgStride(from); }else{ - into->dstArgStride = axisGetDstArgStride(from); + into->dstArgStride = axisGetDstArgStride(from); } - into->srcOffset += axisGetSrcOffset (from); - into->dstOffset += axisGetDstOffset (from); - into->dstArgOffset += axisGetDstArgOffset(from); - into->len *= axisGetLen (from); + into->len *= axisGetLen(from); return 1; } @@ -1267,159 +1264,76 @@ static void reduxSortAxisPtrsBy (axis_desc** ptrs, qsort(ptrs, numAxes, sizeof(*ptrs), fn); } + /** - * @brief Initialize the context. + * @brief Initialize generator context. * - * After this function, calling reduxCleanup() becomes safe. + * After this function, calling reduxGenCleanup*() becomes safe. */ -static int reduxInit (redux_ctx* ctx){ - int i; - - /** - * We initialize certain parts of the context. 
- */ - - ctx->gpuCtx = NULL; - - ctx->srcTypeStr = ctx->dstTypeStr = ctx->dstArgTypeStr = - ctx->accTypeStr = ctx->idxTypeStr = NULL; - ctx->initValK = NULL; - ctx->sourceCode = NULL; - ctx->errorString0 = NULL; - ctx->errorString1 = NULL; - - ctx->numStages = 1; - ctx->prodAllAxes = ctx->prodRdxAxes = ctx->prodFreeAxes = 1; - strb_init(&ctx->s); - srcbInit (&ctx->srcGen, &ctx->s); - - for (i=0;ist2.bs [i] = ctx->st1.bs [i] = 1; - ctx->st2.gs [i] = ctx->st1.gs [i] = 1; - ctx->st2.cs [i] = ctx->st1.cs [i] = 1; - } - - ctx->srcStepsGD = ctx->srcSizeGD = - ctx->dstStepsGD = ctx->dstArgStepsGD = - ctx->st1.chunkSizeGD = ctx->st2.chunkSizeGD = NULL; - - return reduxInferProperties(ctx); +static int reduxGenInit (GpuReduction* gr){ + gr->kArgTypeCodes = NULL; + gr->kSourceCode = NULL; + gr->kErrorString = NULL; + + return reduxGenInferProperties(gr); } /** - * @brief Begin inferring the properties of the reduction. + * @brief Begin inferring the properties of the reduction operator. */ -static int reduxInferProperties (redux_ctx* ctx){ - axis_desc* a; - int i, j, retT, retK; - size_t d; - - - /* Source code buffer preallocation failed? */ - if (strb_ensure(&ctx->s, 4*1024) != 0){ - return reduxCleanupMsg(ctx, GA_MEMORY_ERROR, - "Could not preallocate source code buffer!\n"); +static int reduxGenInferProperties (GpuReduction* gr){ + int i, ret; + int k; + + + /** + * Insane arguments? + */ + + if(gr->ndr <= 0){ + return reduxGenCleanupMsg(gr, GA_INVALID_ERROR, + "No reduction axes!\n"); } - - - /* Insane src, reduxLen, dst or dstArg? */ - if (!ctx->src){ - return reduxCleanupMsg(ctx, GA_INVALID_ERROR, - "src is NULL!\n"); - }else if (ctx->src->nd <= 0){ - return reduxCleanupMsg(ctx, GA_INVALID_ERROR, - "src has less than 1 dimensions!\n"); - }else if (ctx->reduxLen <= 0){ - return reduxCleanupMsg(ctx, GA_INVALID_ERROR, - "List of dimensions to be reduced is empty!\n"); - }else if (ctx->src->nd < (unsigned)ctx->reduxLen){ - return reduxCleanupMsg(ctx, GA_INVALID_ERROR, - "src has fewer dimensions than there are dimensions to reduce!\n"); - }else if (reduxRequiresDst (ctx) && !ctx->dst){ - return reduxCleanupMsg(ctx, GA_INVALID_ERROR, - "dst is NULL, but reduction requires it!\n"); - }else if (reduxRequiresDstArg(ctx) && !ctx->dstArg){ - return reduxCleanupMsg(ctx, GA_INVALID_ERROR, - "dstArg is NULL, but reduction requires it!\n"); - }else if (ctx->dst && ctx->dst->nd +ctx->reduxLen != ctx->src->nd){ - return reduxCleanupMsg(ctx, GA_INVALID_ERROR, - "dst is of incorrect dimensionality for this reduction!\n"); - }else if (ctx->dstArg && ctx->dstArg->nd+ctx->reduxLen != ctx->src->nd){ - return reduxCleanupMsg(ctx, GA_INVALID_ERROR, - "dstArg is of incorrect dimensionality for this reduction!\n"); + if(gr->ndd < 0){ + return reduxGenCleanupMsg(gr, GA_INVALID_ERROR, + "Destination has less than 0 dimensions!\n"); } - ctx->nds = ctx->src->nd; - ctx->ndr = ctx->reduxLen; - ctx->ndd = ctx->nds - ctx->ndr; - ctx->ndfs = ctx->ndfr = ctx->ndfd = 0; - - /* Insane reduxList? */ - for (i=0;indr;i++){ - j = ctx->reduxList[i]; - if (j < -ctx->nds || j >= ctx->nds){ - return reduxCleanupMsg(ctx, GA_INVALID_ERROR, - "Insane axis number %d! Should be [%d, %d)!\n", - j, -ctx->nds, ctx->nds); - } - j = j<0 ? ctx->nds+j : j; - d = ctx->src->dimensions[j]; - ctx->zeroRdxAxes += !d; - ctx->prodRdxAxes *= d?d:1; + if(gr->flags != 0){ + return reduxGenCleanupMsg(gr, GA_INVALID_ERROR, + "\"flags\" must be set to 0!\n"); } - - + gr->nds = gr->ndr+gr->ndd; + + /** - * Insane shape? 
- * - * The source tensor is allowed to be empty (its shape may contain 0s). - * However, all axes that are of length 0 must be reduction axes. - * - * The reason for this is that a reduction cannot store any output into an - * empty destination tensor (whose dimensions are the free axes), because - * it has 0 space. The operation cannot then fulfill its contract. - * - * On the other hand, when some or all reduction axes of a tensor are of - * length 0, the reduction can be interpreted as initializing the - * destination tensor to the identity value of the operation. For lack of a - * better idea, the destination argument tensor can then be zeroed. + * Source code buffer preallocation failed? */ - - for (i=0;inds;i++){ - d = ctx->src->dimensions[i]; - ctx->zeroAllAxes += !d; - ctx->prodAllAxes *= d?d:1; - } - if (ctx->zeroAllAxes != ctx->zeroRdxAxes){ - return reduxCleanupMsg(ctx, GA_INVALID_ERROR, - "Source tensor has length-0 dimensions that are not reduced!"); + + if (strb_ensure(&gr->s, 32*1024) != 0){ + return reduxGenCleanupMsg(gr, GA_MEMORY_ERROR, + "Could not preallocate source code buffer!\n"); } - ctx->prodFreeAxes = ctx->prodAllAxes/ctx->prodRdxAxes; - - + srcbInit(&gr->srcGen, &gr->s); + + /** * GPU context non-existent, or cannot read its properties? */ - - ctx->gpuCtx = GpuArray_context(ctx->src); - if (!ctx->gpuCtx || - gpucontext_property(ctx->gpuCtx, GA_CTX_PROP_NUMPROCS, &ctx->numProcs) != GA_NO_ERROR || - gpucontext_property(ctx->gpuCtx, GA_CTX_PROP_MAXLSIZE, &ctx->maxLg) != GA_NO_ERROR || - gpudata_property(ctx->src->data, GA_CTX_PROP_MAXLSIZE0, &ctx->maxLs[0]) != GA_NO_ERROR || - gpudata_property(ctx->src->data, GA_CTX_PROP_MAXLSIZE1, &ctx->maxLs[1]) != GA_NO_ERROR || - gpudata_property(ctx->src->data, GA_CTX_PROP_MAXLSIZE2, &ctx->maxLs[2]) != GA_NO_ERROR || - gpudata_property(ctx->src->data, GA_CTX_PROP_MAXGSIZE, &ctx->maxGg) != GA_NO_ERROR || - gpudata_property(ctx->src->data, GA_CTX_PROP_MAXGSIZE0, &ctx->maxGs[0]) != GA_NO_ERROR || - gpudata_property(ctx->src->data, GA_CTX_PROP_MAXGSIZE1, &ctx->maxGs[1]) != GA_NO_ERROR || - gpudata_property(ctx->src->data, GA_CTX_PROP_MAXGSIZE2, &ctx->maxGs[2]) != GA_NO_ERROR ){ - /* gpukernel_property(ctx->kernel.k, GA_KERNEL_PROP_PREFLSIZE, &warpSize); */ - return reduxCleanupMsg(ctx, GA_INVALID_ERROR, - "Error obtaining one or more properties from GPU context!\n"); + + if (!gr->gpuCtx || + gpucontext_property(gr->gpuCtx, GA_CTX_PROP_NUMPROCS, &gr->numProcs) != GA_NO_ERROR || + gpucontext_property(gr->gpuCtx, GA_CTX_PROP_MAXLSIZE, &gr->maxLg) != GA_NO_ERROR || + gpucontext_property(gr->gpuCtx, GA_CTX_PROP_MAXLSIZE0, &gr->maxL0) != GA_NO_ERROR || + gpucontext_property(gr->gpuCtx, GA_CTX_PROP_MAXGSIZE, &gr->maxGg) != GA_NO_ERROR || + gpucontext_property(gr->gpuCtx, GA_CTX_PROP_MAXGSIZE0, &gr->maxG0) != GA_NO_ERROR || + gpucontext_property(gr->gpuCtx, GA_CTX_PROP_LMEMSIZE, &gr->maxLM) != GA_NO_ERROR ){ + return reduxGenCleanupMsg(gr, GA_INVALID_ERROR, + "Error obtaining one or more properties from GPU context!\n"); } - ctx->warpSize = 32; - - + + /** * Type management. * @@ -1428,1025 +1342,1948 @@ static int reduxInferProperties (redux_ctx* ctx){ * datatype. 
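 *
 * For example, as implemented below:
 *
 *     srcTypeCode == GA_HALF   ->  accTypeCode == GA_FLOAT
 *     srcTypeCode == GA_HALF8  ->  accTypeCode == GA_FLOAT8
 *     srcTypeCode == GA_FLOAT  ->  accTypeCode == GA_FLOAT
 *
 * while dstTypeCode mirrors the source type and both dstArgTypeCode and
 * idxTypeCode are GA_SSIZE.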
*/ - ctx->srcTypeCode = ctx->src->typecode; - ctx->dstTypeCode = ctx->srcTypeCode; - ctx->dstArgTypeCode = GA_SSIZE; - ctx->idxTypeCode = GA_SSIZE; - switch (ctx->srcTypeCode){ + gr->dstTypeCode = gr->srcTypeCode; + gr->dstArgTypeCode = GA_SSIZE; + gr->idxTypeCode = GA_SSIZE; + switch (gr->srcTypeCode){ case GA_HALF: - ctx->accTypeCode = GA_FLOAT; + gr->accTypeCode = GA_FLOAT; break; case GA_HALF2: - ctx->accTypeCode = GA_FLOAT2; + gr->accTypeCode = GA_FLOAT2; break; case GA_HALF4: - ctx->accTypeCode = GA_FLOAT4; + gr->accTypeCode = GA_FLOAT4; break; case GA_HALF8: - ctx->accTypeCode = GA_FLOAT8; + gr->accTypeCode = GA_FLOAT8; break; case GA_HALF16: - ctx->accTypeCode = GA_FLOAT16; + gr->accTypeCode = GA_FLOAT16; break; default: - ctx->accTypeCode = ctx->srcTypeCode; - } - ctx->srcTypeStr = gpuarray_get_type(ctx->srcTypeCode) ->cluda_name; - ctx->dstTypeStr = gpuarray_get_type(ctx->dstTypeCode) ->cluda_name; - ctx->dstArgTypeStr = gpuarray_get_type(ctx->dstArgTypeCode)->cluda_name; - ctx->idxTypeStr = gpuarray_get_type(ctx->idxTypeCode) ->cluda_name; - ctx->accTypeStr = gpuarray_get_type(ctx->accTypeCode) ->cluda_name; - if (!ctx->srcTypeStr || - !ctx->dstTypeStr || - !ctx->dstArgTypeStr || - !ctx->idxTypeStr || - !ctx->accTypeStr ){ - return reduxCleanup(ctx, GA_INVALID_ERROR); - } - switch (ctx->op){ + gr->accTypeCode = gr->srcTypeCode; + } + gr->srcTypeStr = gpuarray_get_type(gr->srcTypeCode) ->cluda_name; + gr->dstTypeStr = gpuarray_get_type(gr->dstTypeCode) ->cluda_name; + gr->dstArgTypeStr = gpuarray_get_type(gr->dstArgTypeCode)->cluda_name; + gr->idxTypeStr = gpuarray_get_type(gr->idxTypeCode) ->cluda_name; + gr->accTypeStr = gpuarray_get_type(gr->accTypeCode) ->cluda_name; + if (!gr->srcTypeStr || + !gr->dstTypeStr || + !gr->dstArgTypeStr || + !gr->idxTypeStr || + !gr->accTypeStr ){ + return reduxGenCleanupMsg(gr, GA_INVALID_ERROR, + "Have typecode with no CLUDA name!\n"); + } + switch (gr->op){ case GA_REDUCE_SUM: - retT = reduxGetSumInit (ctx->dstTypeCode, &ctx->initValT); - retK = reduxGetSumInit (ctx->accTypeCode, &ctx->initValK); + ret = reduxGetSumInit (gr->accTypeCode, &gr->initVal); break; case GA_REDUCE_PRODNZ: case GA_REDUCE_PROD: - retT = reduxGetProdInit(ctx->dstTypeCode, &ctx->initValT); - retK = reduxGetProdInit(ctx->accTypeCode, &ctx->initValK); + ret = reduxGetProdInit(gr->accTypeCode, &gr->initVal); break; case GA_REDUCE_MINANDARGMIN: case GA_REDUCE_ARGMIN: case GA_REDUCE_MIN: - retT = reduxGetMinInit (ctx->dstTypeCode, &ctx->initValT); - retK = reduxGetMinInit (ctx->accTypeCode, &ctx->initValK); + ret = reduxGetMinInit (gr->accTypeCode, &gr->initVal); break; case GA_REDUCE_MAXANDARGMAX: case GA_REDUCE_ARGMAX: case GA_REDUCE_MAX: - retT = reduxGetMaxInit (ctx->dstTypeCode, &ctx->initValT); - retK = reduxGetMaxInit (ctx->accTypeCode, &ctx->initValK); + ret = reduxGetMaxInit (gr->accTypeCode, &gr->initVal); break; case GA_REDUCE_ALL: case GA_REDUCE_AND: - retT = reduxGetAndInit (ctx->dstTypeCode, &ctx->initValT); - retK = reduxGetAndInit (ctx->accTypeCode, &ctx->initValK); + ret = reduxGetAndInit (gr->accTypeCode, &gr->initVal); break; case GA_REDUCE_ANY: case GA_REDUCE_XOR: case GA_REDUCE_OR: - retT = reduxGetOrInit (ctx->dstTypeCode, &ctx->initValT); - retK = reduxGetOrInit (ctx->accTypeCode, &ctx->initValK); + ret = reduxGetOrInit (gr->accTypeCode, &gr->initVal); break; default: - retT = GA_UNSUPPORTED_ERROR; - retK = GA_UNSUPPORTED_ERROR; + ret = GA_UNSUPPORTED_ERROR; } - if (retT != GA_NO_ERROR){ - return reduxCleanupMsg(ctx, retT, - "Problem selecting types to 
be used in reduction!\n"); + if (ret != GA_NO_ERROR){ + return reduxGenCleanupMsg(gr, ret, + "Problem selecting types to be used in reduction!\n"); } - if (retK != GA_NO_ERROR){ - return reduxCleanupMsg(ctx, retK, - "Problem selecting types to be used in reduction!\n"); + + /* Compute floor(log2(gr->log2MaxL)). */ + gr->log2MaxL = gr->maxLg-1; + for(i=1;gr->log2MaxL & (gr->log2MaxL+1);i*=2){ + gr->log2MaxL |= gr->log2MaxL>>i; } - - - /** - * Allocate and construct source-tensor axis-description lists. - * - * While constructing the descriptions of each axis, verify that: - * - * 1. reduxLen has no duplicates. - * 2. dst and/or dstArg's dimensions match src's dimensions, stripped of - * the reduction axes. - */ - - ctx->xdSrc = calloc(ctx->nds, sizeof(*ctx->xdSrc)); - ctx->xdSrcPtrs = calloc(ctx->nds+1, sizeof(*ctx->xdSrcPtrs)); - ctx->xdSrcFlat = calloc(ctx->nds+1, sizeof(*ctx->xdSrcFlat)); - if (!ctx->xdSrc || !ctx->xdSrcPtrs || !ctx->xdSrcFlat){ - return reduxCleanup(ctx, GA_MEMORY_ERROR); + for(i=0;gr->log2MaxL;i++){ + gr->log2MaxL >>= 1; } - for (i=0;inds;i++){ - axisInit(&ctx->xdSrc[i], - ctx->src->dimensions[i], - ctx->src->strides[i]); + gr->log2MaxL = i; + + /* Compute number of kernel arguments. */ + gr->kNumArgs = 6 /* phase, U, V, B, D, H */ + + 2 /* splitFree, splitReduce */ + + gr->nds /* l{0..n} */ + + reduxGenRequiresDstArg(gr)*gr->ndr /* l{m..n}PDim */ + + 1 /* s */ + + 1 /* sOff */ + + gr->nds /* sJ{0..n} */ + + reduxGenRequiresDst (gr) /* d */ + + reduxGenRequiresDst (gr) /* dOff */ + + reduxGenRequiresDst (gr)*gr->ndd /* dJ{0..m} */ + + reduxGenRequiresDstArg(gr) /* a */ + + reduxGenRequiresDstArg(gr) /* aOff */ + + reduxGenRequiresDstArg(gr)*gr->ndd /* aJ{0..m} */ + + 1 /* w */ + + reduxGenKernelRequiresDst (gr)*2 /* wdOff, pdOff */ + + reduxGenKernelRequiresDstArg(gr)*2 /* waOff, paOff */ + + gr->log2MaxL /* bs{0..p} */ + + gr->log2MaxL /* bp{0..p} */ + + reduxGenRequiresDstArg(gr)*gr->log2MaxL /* bi{0..p} */ + + gr->log2MaxL /* bsOff{0..p} */ + + reduxGenRequiresDst (gr)*gr->log2MaxL /* bdOff{0..p} */ + + reduxGenRequiresDstArg(gr)*gr->log2MaxL;/* baOff{0..p} */ + + + /* Construct kernel argument typecode list */ + gr->kArgTypeCodes = calloc(gr->kNumArgs, sizeof(*gr->kArgTypeCodes)); + if(!gr->kArgTypeCodes){ + return reduxGenCleanupMsg(gr, GA_MEMORY_ERROR, + "Failed to allocate memory for kernel arguments " + "typecode list!\n"); } - for (i=0;indr;i++){ - j = ctx->reduxList[i]; - j = j<0 ? 
ctx->nds+j : j; - a = reduxGetSrcAxis(ctx, j); - if (axisIsReduced(a)){ - return reduxCleanupMsg(ctx, GA_INVALID_ERROR, - "Axis %d appears multiple times in the " - "reduction axis list!\n", - j); + + i = 0; + gr->kArgTypeCodes[i++] = GA_INT; /* phase */ + gr->kArgTypeCodes[i++] = GA_SIZE; /* U */ + gr->kArgTypeCodes[i++] = GA_SIZE; /* V */ + gr->kArgTypeCodes[i++] = GA_SIZE; /* B */ + gr->kArgTypeCodes[i++] = GA_UINT; /* D */ + gr->kArgTypeCodes[i++] = GA_UINT; /* H */ + gr->kArgTypeCodes[i++] = GA_UINT; /* splitFree */ + gr->kArgTypeCodes[i++] = GA_UINT; /* splitReduce */ + for(k=0;k < gr->nds;k++){ + gr->kArgTypeCodes[i++] = GA_SIZE; /* lN */ + } + for(k=0;k < gr->ndr && reduxGenRequiresDstArg(gr);k++){ + gr->kArgTypeCodes[i++] = GA_SIZE; /* lNPDim */ + } + gr->kArgTypeCodes[i++] = GA_BUFFER;/* s */ + gr->kArgTypeCodes[i++] = GA_SSIZE; /* sOff */ + for(k=0;k < gr->nds;k++){ + gr->kArgTypeCodes[i++] = GA_SSIZE; /* sJN */ + } + if(reduxGenRequiresDst (gr)){ + gr->kArgTypeCodes[i++] = GA_BUFFER;/* d */ + gr->kArgTypeCodes[i++] = GA_SSIZE; /* dOff */ + for(k=0;k < gr->ndd;k++){ + gr->kArgTypeCodes[i++] = GA_SSIZE; /* dJN */ } - axisMarkReduced(a, i); } - for (i=j=0;inds;i++){ - axis_desc* a = reduxGetSrcAxis(ctx, i); - size_t srcLen = axisGetLen(a), dstLen, dstArgLen; - - if (axisIsReduced(a)){continue;} - if (reduxRequiresDst(ctx)){ - dstLen = ctx->dst->dimensions[j]; - - if(srcLen != dstLen){ - return reduxCleanupMsg(ctx, GA_INVALID_ERROR, - "Source axis %d has length %zu, but " - "corresponding destination axis %d has length %zu!\n", - i, srcLen, j, dstLen); - } - - a->dstStride = ctx->dst->strides[j]; - } - if (reduxRequiresDstArg(ctx)){ - dstArgLen = ctx->dstArg->dimensions[j]; - - if(srcLen != dstArgLen){ - return reduxCleanupMsg(ctx, GA_INVALID_ERROR, - "Source axis %d has length %zu, but " - "corresponding destination-argument axis %d has length %zu!\n", - i, srcLen, j, dstArgLen); - } - - a->dstArgStride = ctx->dstArg->strides[j]; + if(reduxGenRequiresDstArg(gr)){ + gr->kArgTypeCodes[i++] = GA_BUFFER;/* a */ + gr->kArgTypeCodes[i++] = GA_SSIZE; /* aOff */ + for(k=0;k < gr->ndd;k++){ + gr->kArgTypeCodes[i++] = GA_SSIZE; /* aJN */ } - - j++; } + gr->kArgTypeCodes[i++] = GA_BUFFER;/* w */ + if(reduxGenKernelRequiresDst (gr)){ + gr->kArgTypeCodes[i++] = GA_SSIZE; /* wdOff */ + gr->kArgTypeCodes[i++] = GA_SSIZE; /* pdOff */ + } + if(reduxGenKernelRequiresDstArg(gr)){ + gr->kArgTypeCodes[i++] = GA_SSIZE; /* waOff */ + gr->kArgTypeCodes[i++] = GA_SSIZE; /* paOff */ + } + for(k=0;k < gr->log2MaxL;k++){ + gr->kArgTypeCodes[i++] = GA_UINT; /* ibsN */ + } + for(k=0;k < gr->log2MaxL;k++){ + gr->kArgTypeCodes[i++] = GA_UINT; /* ibpN */ + } + for(k=0;k < gr->log2MaxL && reduxGenRequiresDstArg(gr);k++){ + gr->kArgTypeCodes[i++] = GA_SIZE; /* iblNPDim */ + } + for(k=0;k < gr->log2MaxL;k++){ + gr->kArgTypeCodes[i++] = GA_SSIZE; /* ibsOffN */ + } + for(k=0;k < gr->log2MaxL && reduxGenRequiresDst (gr);k++){ + gr->kArgTypeCodes[i++] = GA_SSIZE; /* ibdOffN */ + } + for(k=0;k < gr->log2MaxL && reduxGenRequiresDstArg(gr);k++){ + gr->kArgTypeCodes[i++] = GA_SSIZE; /* ibaOffN */ + } + + return reduxGenSrc(gr); +} +/** + * @brief Generate the kernel source code for the reduction. + * + * @return GA_MEMORY_ERROR if not enough memory left; GA_NO_ERROR otherwise. + */ + +static int reduxGenSrc (GpuReduction* gr){ + reduxGenSrcAppend(gr); - /** - * Begin flattening the source tensor. 
- */ - - return reduxFlattenSource(ctx); + gr->kSourceCodeLen = gr->s.l; + gr->kSourceCode = strb_cstr(&gr->s); + + if (gr->kSourceCode){ + return reduxGenCompile(gr); + }else{ + return reduxGenCleanupMsg(gr, GA_MEMORY_ERROR, + "Failure in source code string buffer allocation " + "during codegen!\n"); + } } /** - * @brief Flatten the source tensor as much as is practical. - * - * This makes the axis lengths as long as possible and the tensor itself as - * contiguous as possible. + * @brief Append source code to the string buffer. */ -static int reduxFlattenSource (redux_ctx* ctx){ - axis_desc* axis, *flatAxis, *sortAxis; - int i, j, isSensitive; +static void reduxGenSrcAppend (GpuReduction* gr){ + reduxGenSrcAppendIncludes (gr); + reduxGenSrcAppendMacroDefs (gr); + reduxGenSrcAppendTypedefs (gr); + reduxGenSrcAppendReduxKernel (gr); +} +static void reduxGenSrcAppendIncludes (GpuReduction* gr){ + srcbAppends(&gr->srcGen, "/* Includes */\n"); + srcbAppends(&gr->srcGen, "#include \"cluda.h\"\n"); + srcbAppends(&gr->srcGen, "\n"); + srcbAppends(&gr->srcGen, "\n"); + srcbAppends(&gr->srcGen, "\n"); +} +static void reduxGenSrcAppendMacroDefs (GpuReduction* gr){ + int i; /** - * Copy source axis descriptions list to flattened source axis description - * list, in preparation for attempts at flattening. + * DECLREDUXSTATE, INITREDUXSTATE and SETREDUXSTATE macros. */ - memcpy(ctx->xdSrcFlat, ctx->xdSrc, ctx->nds*sizeof(*ctx->xdSrcFlat)); - ctx->ndfs = ctx->nds; - + if ( reduxGenKernelRequiresDst(gr) && reduxGenKernelRequiresDstArg(gr)){ + srcbAppendf(&gr->srcGen, + "#define DECLREDUXSTATE(V, I) TK V;TX I;\n" + "#define INITREDUXSTATE(V, I) do{(V) = %s;(I) = 0;}while(0)\n" + "#define SETREDUXSTATE(V, I, v, i) do{(V) = (v);(I) = (i);}while(0)\n", + gr->initVal); + }else if ( reduxGenKernelRequiresDst(gr) && !reduxGenKernelRequiresDstArg(gr)){ + srcbAppendf(&gr->srcGen, + "#define DECLREDUXSTATE(V, I) TK V;\n" + "#define INITREDUXSTATE(V, I) do{(V) = %s;}while(0)\n" + "#define SETREDUXSTATE(V, I, v, i) do{(V) = (v);}while(0)\n", + gr->initVal); + }else if (!reduxGenKernelRequiresDst(gr) && reduxGenKernelRequiresDstArg(gr)){ + srcbAppendf(&gr->srcGen, + "#define DECLREDUXSTATE(V, I) TX I;\n" + "#define INITREDUXSTATE(V, I) do{(I) = 0;}while(0)\n" + "#define SETREDUXSTATE(V, I, v, i) do{(I) = (i);}while(0)\n"); + } + + /** - * Pass 1: Flatten out 0-length dimensions. We already know that + * LOADS(v, p) macro. * - * a) There are no 0-length free dimensions, because that - * constitutes an invalid input, and - * b) How many 0-length reduction dimensions there are, because - * we counted them in the error-checking code. - * - * So if there are any 0-length axes, we can delete all reduction axes and - * replace them with a single one. + * Loads a TK-typed value v from a TS-typed source pointer p. */ - if (ctx->zeroRdxAxes > 0){ - for (i=j=0;indfs;i++){ - axis = reduxGetSrcFlatAxis(ctx, i); - - if (!axisIsReduced(axis)){ - *reduxGetSrcFlatAxis(ctx, j++) = *axis; - } - } - - axisInit (reduxGetSrcFlatAxis(ctx, j), 0, 0); - axisMarkReduced(reduxGetSrcFlatAxis(ctx, j), 0); - j++; - ctx->ndfs = j; + if (gr->srcTypeCode == GA_HALF && gr->accTypeCode == GA_FLOAT){ + srcbAppends(&gr->srcGen, "#define LOADS(v, p) do{(v) = (TK)load_half((TS*)(p));}while(0)\n"); + }else{ + srcbAppends(&gr->srcGen, "#define LOADS(v, p) do{(v) = (TK)*(TS*)(p);}while(0)\n"); } + /** - * Pass 2: Flatten out 1-length dimensions, since they can always be - * ignored; They are always indexed at [0]. + * GETIDX macro. 
+ * + * Expands to the current flattened index. */ - for (i=j=0;indfs;i++){ - axis = reduxGetSrcFlatAxis(ctx, i); - - if (axisGetLen(axis) != 1){ - *reduxGetSrcFlatAxis(ctx, j++) = *axis; - } + srcbAppends (&gr->srcGen, "#define GETIDX ("); + srcbBeginList (&gr->srcGen, " + ", "0"); + srcbAppendElemf(&gr->srcGen, "ti"); + for(i=gr->ndd;inds;i++){ + srcbAppendElemf(&gr->srcGen, "i%d*l%dPDim", i, i); + } + srcbEndList (&gr->srcGen); + srcbAppends (&gr->srcGen, ")\n"); + + /** + * REDUX macro. + * + * Performs a reduction operation, jointly reducing a datum v and its + * flattened index i into reduction states V and I respectively. + */ + + srcbAppends(&gr->srcGen, "#define REDUX(V, I, v, i) do{ \\\n"); + switch (gr->op){ + case GA_REDUCE_SUM: + srcbAppendf(&gr->srcGen, " (V) += (v); \\\n"); + break; + case GA_REDUCE_PROD: + srcbAppendf(&gr->srcGen, " (V) *= (v); \\\n"); + break; + case GA_REDUCE_PRODNZ: + srcbAppendf(&gr->srcGen, " (V) *= ((v) == 0 ? (%s) : (v)); \\\n", gr->initVal); + break; + case GA_REDUCE_MIN: + srcbAppendf(&gr->srcGen, " (V) = min((V), (v)); \\\n"); + break; + case GA_REDUCE_MAX: + srcbAppendf(&gr->srcGen, " (V) = max((V), (v)); \\\n"); + break; + case GA_REDUCE_ARGMIN: + case GA_REDUCE_MINANDARGMIN: + srcbAppendf(&gr->srcGen, " (V) = min((V), (v)); \\\n" + " if((V) == (v)){ \\\n" + " (I) = (i); \\\n" + " } \\\n"); + break; + case GA_REDUCE_ARGMAX: + case GA_REDUCE_MAXANDARGMAX: + srcbAppendf(&gr->srcGen, " (V) = max((V), (v)); \\\n" + " if((V) == (v)){ \\\n" + " (I) = (i); \\\n" + " } \\\n"); + break; + case GA_REDUCE_AND: + srcbAppendf(&gr->srcGen, " (V) &= (v); \\\n"); + break; + case GA_REDUCE_OR: + srcbAppendf(&gr->srcGen, " (V) |= (v); \\\n"); + break; + case GA_REDUCE_XOR: + srcbAppendf(&gr->srcGen, " (V) ^= (v); \\\n"); + break; + case GA_REDUCE_ALL: + srcbAppendf(&gr->srcGen, " (V) = (V) && (v); \\\n"); + break; + case GA_REDUCE_ANY: + srcbAppendf(&gr->srcGen, " (V) = (V) || (v); \\\n"); + break; } - ctx->ndfs = j; + srcbAppends(&gr->srcGen, " }while(0)\n"); + /** - * Pass 3: Flatten out continuous dimensions, where strides and sensitivity - * allows it. + * HREDUX macro. + * + * Performs a horizontal reduction operation, first intra-block permuting + * the data and its index and then reducing it till done. */ - isSensitive = reduxIsSensitive(ctx); + srcbAppends(&gr->srcGen, + "#define HREDUX(pd, pa, tp, V, I) \\\n" + " do{ \\\n" + " /* Horizontal Reduction */ \\\n" + " SETREDUXSTATE(pd[tp], pa[tp], accV, accI); \\\n" + " local_barrier(); \\\n" + " \\\n" + " h = H; \\\n" + " while(h>1){ \\\n" + " if((h&1) && (LID_0 < D)){ \\\n" + " REDUX(pd[LID_0], pa[LID_0], pd[LID_0 + D*h-D], pa[LID_0 + D*h-D]); \\\n" + " } \\\n" + " h >>= 1; \\\n" + " if(LID_0 < D*h){ \\\n" + " REDUX(pd[LID_0], pa[LID_0], pd[LID_0 + D*h ], pa[LID_0 + D*h ]); \\\n" + " } \\\n" + " local_barrier(); \\\n" + " } \\\n" + " }while(0)\n"); - qsort(ctx->xdSrcFlat, ctx->ndfs, sizeof(*ctx->xdSrcFlat), - isSensitive ? reduxSortFlatSensitive : reduxSortFlatInsensitive); + /** + * STORED macro. + * + * Stores a TK-typed value v into a TS-typed destination pointer p. 
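+ *
+ * As a sketch of the emitted definition (assuming a half-precision
+ * destination with a float accumulator, per the branch below):
+ *
+ *     #define STORED(p, v) do{store_half((TD*)(p), (v));}while(0)
+ *
+ * so a final write such as STORED(td, pd[LID_0]) converts the accumulator
+ * value back to the destination type as it is stored.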
+ */ - for (i=j=1;indfs;i++){ - flatAxis = reduxGetSrcFlatAxis(ctx, j-1); - sortAxis = reduxGetSrcFlatAxis(ctx, i); - - if (!reduxTryFlattenInto(ctx, flatAxis, sortAxis)){ - *reduxGetSrcFlatAxis(ctx, j++) = *sortAxis; + if (reduxGenRequiresDst(gr)){ + if (gr->dstTypeCode == GA_HALF && gr->accTypeCode == GA_FLOAT){ + srcbAppends(&gr->srcGen, "#define STORED(p, v) do{store_half((TD*)(p), (v));}while(0)\n"); + }else{ + srcbAppends(&gr->srcGen, "#define STORED(p, v) do{*(TD*)(p) = (v);}while(0)\n"); } + }else{ + srcbAppends(&gr->srcGen, "#define STORED(p, v) do{}while(0)\n"); } - ctx->ndfs = j; - - + + /** - * NOTE: At this point it is possible for there to be no axes - * (ctx->ndf == 0), but this will only occur if all axes of the original - * tensor were length-1 (i.e., if this was a scalar masquerading as a - * multidimensional tensor). + * STOREA macro. * - * We check for this case and simulate a 1-dimensional, 1-length tensor. + * Stores a TX-typed value v into a TA-typed destination pointer p. */ - - if(ctx->ndfs == 0){ - axisInit (reduxGetSrcFlatAxis(ctx, ctx->ndfs), 1, 0); - axisMarkReduced(reduxGetSrcFlatAxis(ctx, ctx->ndfs), 0); - ctx->ndfs = 1; + + if (reduxGenRequiresDstArg(gr)){ + srcbAppends(&gr->srcGen, "#define STOREA(p, v) do{*(TA*)(p) = (v);}while(0)\n"); + }else{ + srcbAppends(&gr->srcGen, "#define STOREA(p, v) do{}while(0)\n"); } - - + + /** - * Having flattened the tensor to the very best of our ability, allocate - * and/or compute - * - * ctx->ndfr - * ctx->ndfd - * ctx->flatSrcDimensions - * ctx->flatSrcStrides - * ctx->flatSrcData - * ctx->flatSrcOffset + axis offsets - * ctx->flatDstStrides - * ctx->flatDstData - * ctx->flatDstOffset + axis offsets - * ctx->flatDstArgStrides - * ctx->flatDstArgData - * ctx->flatDstArgOffset + axis offsets - * - * and suchlike data that will be used post-flatten. + * DIVIDECEIL macro. 
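+ *
+ * Integer division, rounding up: DIVIDECEIL(10, 4) == 3. The generated
+ * kernel uses it, for example, to size the inter-block extent of a
+ * possibly-split axis, as in DIVIDECEIL(lN, splitFree) or
+ * DIVIDECEIL(lN, splitReduce) in the block-decode code.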
*/ - ctx->flatSrcDimensions = malloc(ctx->ndfs * sizeof(*ctx->flatSrcDimensions)); - ctx->flatSrcStrides = malloc(ctx->ndfs * sizeof(*ctx->flatSrcStrides)); - ctx->flatDstStrides = malloc(ctx->ndfs * sizeof(*ctx->flatDstStrides)); - ctx->flatDstArgStrides = malloc(ctx->ndfs * sizeof(*ctx->flatDstArgStrides)); - if(!ctx->flatSrcDimensions || !ctx->flatSrcStrides || - !ctx->flatDstStrides || !ctx->flatDstArgStrides){ - return reduxCleanup(ctx, GA_MEMORY_ERROR); + srcbAppends(&gr->srcGen, "#define DIVIDECEIL(a,b) (((a)+(b)-1)/(b))\n"); + + srcbAppends(&gr->srcGen, "\n\n\n\n"); +} +static void reduxGenSrcAppendTypedefs (GpuReduction* gr){ + srcbAppendf(&gr->srcGen, "typedef %-20s TS;\n", gr->srcTypeStr); + srcbAppendf(&gr->srcGen, "typedef %-20s TD;\n", gr->dstTypeStr); + srcbAppendf(&gr->srcGen, "typedef %-20s TA;\n", gr->dstArgTypeStr); + srcbAppendf(&gr->srcGen, "typedef %-20s TX;\n", gr->idxTypeStr); + srcbAppendf(&gr->srcGen, "typedef %-20s TK;\n", gr->accTypeStr); + srcbAppendf(&gr->srcGen, "\n\n\n\n"); +} +static void reduxGenSrcAppendReduxKernel (GpuReduction* gr){ + reduxGenSrcAppendPrototype (gr); + srcbAppends (&gr->srcGen, "{\n"); + reduxGenSrcAppendBlockDecode (gr); + reduxGenSrcAppendThreadDecode(gr); + srcbAppends (&gr->srcGen, " /**\n" + " * PERFORM REDUCTION.\n" + " * \n" + " * We either perform Phase 0 or Phase 1 according to our argument.\n" + " * \n" + " * Phase 0 is the primary worker and, in special cases, is the only necessary phase.\n" + " * However, it may occasionally do only part of a reduction, in which case it leaves\n" + " * the partial reduction results in a workspace that is then read by Phase 1.\n" + " * \n" + " * Phase 1 is a fixup phase that collects any partial reduction results from Phase 0\n" + " * and completes the reduction before writing to the final destination.\n" + " */\n" + " \n" + " if(phase==0){\n"); + reduxGenSrcAppendPhase0 (gr); + srcbAppends (&gr->srcGen, " }else{\n"); + reduxGenSrcAppendPhase1 (gr); + srcbAppends (&gr->srcGen, " }\n"); + srcbAppends (&gr->srcGen, "}\n"); +} +static void reduxGenSrcAppendPrototype (GpuReduction* gr){ + int i; + + srcbAppends (&gr->srcGen, "KERNEL void redux(int phase,\n" + " TX U,\n" + " TX V,\n" + " TX B,\n" + " unsigned D,\n" + " unsigned H,\n" + " unsigned splitFree,\n" + " unsigned splitReduce,\n"); + srcbBeginList (&gr->srcGen, ",\n", "void"); + for(i=0;i<(int)(gr->ndd+gr->ndr);i++){ + srcbAppendElemf (&gr->srcGen, " TX l%d", i); + } + for(i=gr->ndd;i<(int)(gr->ndd+gr->ndr);i++){ + srcbAppendElemf (&gr->srcGen, " TX l%dPDim", i); + } + srcbAppendElemf (&gr->srcGen, " const GLOBAL_MEM char* s"); + srcbAppendElemf (&gr->srcGen, " TX sOff"); + for(i=0;i<(int)(gr->ndd+gr->ndr);i++){ + srcbAppendElemf (&gr->srcGen, " TX sJ%d", i); + } + if (reduxGenRequiresDst(gr)){ + srcbAppendElemf (&gr->srcGen, " GLOBAL_MEM char* d"); + srcbAppendElemf (&gr->srcGen, " TX dOff"); + for(i=0;i<(int)(gr->ndd);i++){ + srcbAppendElemf(&gr->srcGen, " TX dJ%d", i); + } + } + if (reduxGenRequiresDstArg(gr)){ + srcbAppendElemf (&gr->srcGen, " GLOBAL_MEM char* a"); + srcbAppendElemf (&gr->srcGen, " TX aOff"); + for(i=0;i<(int)(gr->ndd);i++){ + srcbAppendElemf(&gr->srcGen, " TX aJ%d", i); + } + } + srcbAppendElemf (&gr->srcGen, " GLOBAL_MEM char* w"); + if (reduxGenKernelRequiresDst(gr)){ + srcbAppendElemf (&gr->srcGen, " TX wdOff"); + srcbAppendElemf (&gr->srcGen, " TX pdOff"); + } + if (reduxGenKernelRequiresDstArg(gr)){ + srcbAppendElemf (&gr->srcGen, " TX waOff"); + srcbAppendElemf (&gr->srcGen, " TX paOff"); + } + 
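+	/**
+	 * Per-thread intra-block decode arguments, log2MaxL of each: ibsN
+	 * (intra-block coordinate sizes), ibpN (permute-target coefficients),
+	 * iblNPDim (flattened-index coefficients, present only for argument
+	 * reductions), and the per-thread byte offsets ibsOffN/ibdOffN/ibaOffN
+	 * into the source, destination and destination-argument tensors, as
+	 * consumed by the THREAD-decode code emitted further below.
+	 */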
for(i=0;i<(int)(gr->log2MaxL);i++){ + srcbAppendElemf (&gr->srcGen, " unsigned ibs%d", i); } + for(i=0;i<(int)(gr->log2MaxL);i++){ + srcbAppendElemf (&gr->srcGen, " unsigned ibp%d", i); + } + for(i=0;i<(int)(gr->log2MaxL) && reduxGenRequiresDstArg(gr);i++){ + srcbAppendElemf (&gr->srcGen, " TX ibl%dPDim", i); + } + for(i=0;i<(int)(gr->log2MaxL);i++){ + srcbAppendElemf (&gr->srcGen, " TX ibsOff%d", i); + } + for(i=0;i<(int)(gr->log2MaxL) && reduxGenRequiresDst (gr);i++){ + srcbAppendElemf (&gr->srcGen, " TX ibdOff%d", i); + } + for(i=0;i<(int)(gr->log2MaxL) && reduxGenRequiresDstArg(gr);i++){ + srcbAppendElemf (&gr->srcGen, " TX ibaOff%d", i); + } + srcbEndList (&gr->srcGen); + srcbAppends (&gr->srcGen, ")"); +} +static void reduxGenSrcAppendBlockDecode (GpuReduction* gr){ + int i; - ctx->flatSrcData = ctx->src->data; - ctx->flatSrcOffset = ctx->src->offset; - if(reduxRequiresDst(ctx)){ - ctx->flatDstData = ctx->dst->data; - ctx->flatDstOffset = ctx->dst->offset; + srcbAppends(&gr->srcGen, + " GA_DECL_SHARED_BODY(char, SHMEM)\n" + " DECLREDUXSTATE(accV, accI)\n" + " DECLREDUXSTATE(tmpV, tmpI)\n" + " INITREDUXSTATE(accV, accI);\n" + " \n" + " /**\n" + " * +-------------+-------------+------------+---------------------------------+\n" + " * | misalignL | misalignR | doFinish | DESCRIPTION |\n" + " * +-------------+-------------+------------+---------------------------------+\n" + " * | 0 | 0 | 0 | Impossible unless v == 0, |\n" + " * | | | | which is forbidden. |\n" + " * | | | | |\n" + " * | 0 | 0 | 1 | V % B == 0. Each block |\n" + " * | | | | handles integer number of |\n" + " * | | | | destination elements, no |\n" + " * | | | | partial results are required, |\n" + " * | | | | workspace is unused. |\n" + " * | | | | |\n" + " * | 0 | 1 | 0 | V < B. Block begins aligned |\n" + " * | | | | but ends misaligned, before |\n" + " * | | | | the end of its first element. |\n" + " * | | | | Partial result written to |\n" + " * | | | | right-half of array. |\n" + " * | | | | |\n" + " * | 0 | 1 | 1 | V > B, V % B != 0. Block |\n" + " * | | | | begins aligned but ends |\n" + " * | | | | misaligned, after the end of |\n" + " * | | | | its first element. |\n" + " * | | | | First 1 or more complete |\n" + " * | | | | elements written out directly |\n" + " * | | | | to destination. |\n" + " * | | | | Partial result of last element |\n" + " * | | | | written to right-half of array.|\n" + " * | | | | |\n" + " * | 1 | 0 | 0 | Impossible unless v == 0, |\n" + " * | | | | which is forbidden. |\n" + " * | | | | |\n" + " * | 1 | 0 | 1 | V % B != 0. Partial result of |\n" + " * | | | | first element written to left- |\n" + " * | | | | half of array. Zero or more |\n" + " * | | | | complete reductions performed |\n" + " * | | | | and written directly to |\n" + " * | | | | destination. Block ends |\n" + " * | | | | aligned. |\n" + " * | | | | |\n" + " * | 1 | 1 | 0 | V < B. Block begins misaligned |\n" + " * | | | | and ends misaligned, before |\n" + " * | | | | the end of its first element. |\n" + " * | | | | Partial result written to at |\n" + " * | | | | least right-half of array. |\n" + " * | | | | |\n" + " * | 1 | 1 | 1 | V % B != 0. Block begins |\n" + " * | | | | misaligned and ends misaligned,|\n" + " * | | | | after the end of its first |\n" + " * | | | | element. |\n" + " * | | | | Partial result of first element|\n" + " * | | | | written to left-half of array. 
|\n" + " * | | | | Partial result of last element |\n" + " * | | | | written to right-half of array.|\n" + " * | | | | 0 or more complete elements |\n" + " * | | | | written out directly to |\n" + " * | | | | destination. |\n" + " * +-------------+-------------+------------+---------------------------------+\n" + " * \n" + " * Possible configurations of blocks:\n" + " * If V % B == 0: 001\n" + " * If V < B: 010, 110, 111, 101\n" + " * If V > B: 011, 111, 101\n" + " * \n" + " * Possible configurations for collector blocks (responsible for gathering of\n" + " * results to the left):\n" + " * 101, 111 (misalignL && doFinish)\n" + " * \n" + " * Possible configurations for left-neighbours of collector blocks\n" + " * 110 (any number 0+), then exactly one of:\n" + " * 010, 011, 111\n" + " * \n" + " * Conclusion:\n" + " * - In Phase 0:\n" + " * - Always make a right-write if misalignR (010, 011, 110, 111).\n" + " * - Make a left -write at least if collector block (101, 111).\n" + " * - In Phase 1:\n" + " * - Exit if not collector block (101, 111)\n" + " * - If collector block,\n" + " * - Left -read from self\n" + " * - Right-read from all left-neighbours with same write-target.\n" + " * \n" + " * Code Structure perfectly satisfying conclusion:\n" + " * \n" + " * if(misalignL){\n" + " * while(v > 0){\n" + " * v--;\n" + " * REDUX();\n" + " * ReduxLoopIncs_CONTINUE;\n" + " * HREDUX();\n" + " * WSLeftWrite();\n" + " * REINIT();\n" + " * FreeLoopIncs_BREAK;\n" + " * BREAK;\n" + " * }\n" + " * }\n" + " * while(v > 0){\n" + " * v--;\n" + " * REDUX();\n" + " * ReduxLoopIncs_CONTINUE;\n" + " * HREDUX();\n" + " * DstWrite();\n" + " * REINIT();\n" + " * FreeLoopIncs_CONTINUE;\n" + " * BREAK;\n" + " * }\n" + " * if(misalignR){\n" + " * HREDUX();\n" + " * WSRightWrite();\n" + " * }\n" + " * \n" + " * Code Walkthrough:\n" + " * \n" + " * 000, 100: Impossible, can be ignored.\n" + " * 001: Only master loop entered, handles exact integer number of destinations.\n" + " * 010: Master loop entered but broken on vcount before HREDUX() reached.\n" + " * No reinit executed on breakout. HREDUX(), followed by WSRightWrite() of\n" + " * partial result.\n" + " * 011: Master loop entered for at least 1 full destination, then broken on\n" + " * vcount before HREDUX() reached. No reinit executed on breakout. HREDUX()\n" + " * followed by WSRightWrite() of partial result.\n" + " * 101: Left-misalign loop entered and completes a reduction. HREDUX()\n" + " * performed, WSLeftWrite() performed, reinitialization, bump of outer\n" + " * loop counters, then breakout. Master loop entered for 0 or more complete\n" + " * destination elements involving full writeouts to destination and reinit.\n" + " * Aligned on both misalignL and master loop breakouts. No entry into\n" + " * misalignR fixup.\n" + " * 110: Left-misalign loop entered, breaks on vcount before HREDUX(). No reinit\n" + " * executed on breakout. Master loop not entered. HREDUX(), followed by\n" + " * WSRightWrite() of partial result.\n" + " * 111: Left-misalign loop entered and completes a reduction. HREDUX() performed,\n" + " * WSLeftWrite() performed, reinit, bump of outer loop counters, breakout.\n" + " * Master loop entered for 0 or more complete destination elements\n" + " * involving full writeout to destination and reinit.\n" + " * Master loop broken on vcount before HREDUX(). misalignR fixup entered,\n" + " * HREDUX(), WSRightWrite().\n" + " */\n" + " \n" + " TX start = GID_0 * V;\n" + " if(start >= U){return;}\n" + " TX v = U-start < V ? 
U-start : V;\n" + " \n" + " int misalignL = (start+0)%B != 0;\n" + " int misalignR = (start+v)%B != 0;\n" + " int doFinish = (start+0)/B != (start+v)/B;\n" + " \n" + " /**\n" + " * Decode BLOCK start point.\n" + " * \n" + " * For the purpose of decoding the start point, the split axis's \"length\"\n" + " * is divided by either splitReduce or splitFree and rounded up. Therefore,\n" + " * for those axes the true computed initial starting point must be\n" + " * multiplied by either splitReduce or splitFree.\n" + " * \n" + " * Since we provide not strides but \"jumps\" to the kernel (to move as many\n" + " * things as possible into constant memory and out of the fast path), we\n" + " * must also convert jumps to strides in preparation for offsetting the\n" + " * base pointers to their starting point.\n" + " */\n" + " \n" + " unsigned Dunit = D/splitFree;\n"); + if(gr->ndd > 0){ + srcbAppendf(&gr->srcGen, + " TX l%dMul = DIVIDECEIL(l%d, splitFree);\n", + gr->ndd-1, gr->ndd-1); + } + if(gr->ndr > 0){ + srcbAppendf(&gr->srcGen, + " TX l%dMul = DIVIDECEIL(l%d, splitReduce);\n", + gr->nds-1, gr->nds-1); + } + srcbAppends(&gr->srcGen, " \n"); + for(i=gr->nds-1;i>=0;i--){ + if(i == gr->nds-1){ + srcbAppendf(&gr->srcGen, + " TX i%d = start %% l%dMul;\n", + i, i); + + }else{ + srcbAppendf(&gr->srcGen, + " TX i%d = i%d / l%d%s %% l%d%s;\n", + i, i+1, + i+1, + reduxGenAxisMaybeSplit(gr, i+1) ? "Mul" : "", + i, + reduxGenAxisMaybeSplit(gr, i) ? "Mul" : ""); + } } - if(reduxRequiresDstArg(ctx)){ - ctx->flatDstArgData = ctx->dstArg->data; - ctx->flatDstArgOffset = ctx->dstArg->offset; + srcbAppends(&gr->srcGen, " \n"); + if(gr->ndd > 0){ + srcbAppendf(&gr->srcGen, + " i%d *= splitFree;\n", + gr->ndd-1); + } + if(gr->ndr > 0){ + srcbAppendf(&gr->srcGen, + " i%d *= splitReduce;\n", + gr->nds-1); + } + srcbAppends(&gr->srcGen, " \n"); + for(i=gr->nds-1;i>=0;i--){ + if(i == gr->nds-1){ + srcbAppendf(&gr->srcGen, + " TX sS%d = (sJ%d ) / splitReduce;\n", + i, i); + }else{ + srcbAppendf(&gr->srcGen, + " TX sS%d = (sJ%d + (TX)l%d*sS%d)%s;\n", + i, i, i+1, i+1, + i == gr->ndd-1 ? 
" / splitFree" : ""); + } } - for(ctx->ndfd=ctx->ndfr=i=0;indfs;i++){ - axis = reduxGetSrcFlatAxis(ctx, i); - if(axisIsReduced(axis)){ - ctx->ndfr++; + if (reduxGenRequiresDst(gr)){ + srcbAppends(&gr->srcGen, " \n"); + for(i=gr->ndd-1;i>=0;i--){ + if(i == gr->ndd-1){ + srcbAppendf(&gr->srcGen, + " TX dS%d = (dJ%d ) / splitFree;\n", + i, i); + }else{ + srcbAppendf(&gr->srcGen, + " TX dS%d = (dJ%d + (TX)l%d*dS%d);\n", + i, i, i+1, i+1); + } + } + } + if (reduxGenRequiresDstArg(gr)){ + srcbAppends(&gr->srcGen, " \n"); + for(i=gr->ndd-1;i>=0;i--){ + if(i == gr->ndd-1){ + srcbAppendf(&gr->srcGen, + " TX aS%d = (aJ%d ) / splitFree;\n", + i, i); + }else{ + srcbAppendf(&gr->srcGen, + " TX aS%d = (aJ%d + (TX)l%d*aS%d);\n", + i, i, i+1, i+1); + } + } + } + srcbAppends(&gr->srcGen, " \n"); + srcbAppends(&gr->srcGen, " sOff += "); + srcbBeginList(&gr->srcGen, " + ", "0"); + for(i=0;inds;i++){ + srcbAppendElemf(&gr->srcGen, "(TX)i%d*sS%d", i, i); + } + srcbEndList(&gr->srcGen); + srcbAppends(&gr->srcGen, ";\n"); + if (reduxGenRequiresDst(gr)){ + srcbAppends(&gr->srcGen, " dOff += "); + srcbBeginList(&gr->srcGen, " + ", "0"); + for(i=0;indd;i++){ + srcbAppendElemf(&gr->srcGen, "(TX)i%d*dS%d", i, i); + } + srcbEndList(&gr->srcGen); + srcbAppends(&gr->srcGen, ";\n"); + } + if (reduxGenRequiresDstArg(gr)){ + srcbAppends(&gr->srcGen, " aOff += "); + srcbBeginList(&gr->srcGen, " + ", "0"); + for(i=0;indd;i++){ + srcbAppendElemf(&gr->srcGen, "(TX)i%d*aS%d", i, i); + } + srcbEndList(&gr->srcGen); + srcbAppends(&gr->srcGen, ";\n"); + } + srcbAppends(&gr->srcGen, " \n"); + if(reduxGenKernelRequiresDst(gr)){ + srcbAppends(&gr->srcGen, + " TK* wd = (TK*)(w + wdOff);\n" + " TK* wdL = &wd[0];\n" + " TK* wdR = &wd[GDIM_0*D];\n" + " TK* pd = (TK*)(SHMEM + pdOff);\n"); + } + if(reduxGenKernelRequiresDstArg(gr)){ + srcbAppends(&gr->srcGen, + " TA* wa = (TA*)(w + waOff);\n" + " TA* waL = &wa[0];\n" + " TA* waR = &wa[GDIM_0*D];\n" + " TA* pa = (TA*)(SHMEM + paOff);\n"); + } + srcbAppends(&gr->srcGen, + " \n" + " TX h, k;\n" + " \n"); +} +static void reduxGenSrcAppendThreadDecode (GpuReduction* gr){ + int i; + + srcbAppends(&gr->srcGen, + " /**\n" + " * Decode THREAD start point.\n" + " * \n" + " * This involves computing the intra-block coordinate of a thread in a\n" + " * up-to-log2(MAX_BLOCK_THREADS)-dimensional coordinate system, then using\n" + " * those coordinates to compute private source/destination/destination\n" + " * argument pointers, argument indices and permute targets.\n" + " */\n" + " \n" + " unsigned iSplit = LID_0/(LDIM_0/(splitFree*splitReduce));\n"); + + for(i=gr->log2MaxL-1;i>=0;i--){ + if(i == gr->log2MaxL-1){ + srcbAppendf(&gr->srcGen, + " int t%d = (unsigned)LID_0 %% ibs%d;\n", + i, i); }else{ - if(reduxRequiresDst(ctx)){ - ctx->flatDstStrides[ctx->ndfd] = axisGetDstStride(axis); - ctx->flatDstOffset += axisGetDstOffset(axis); + srcbAppendf(&gr->srcGen, + " int t%d = (unsigned)t%d / ibs%d %% ibs%d;\n", + i, i+1, i+1, i); + } + } + if(reduxGenRequiresDstArg(gr)){ + srcbAppends(&gr->srcGen, " TX ti = "); + srcbBeginList(&gr->srcGen, " + ", "0"); + for(i=0;ilog2MaxL;i++){ + srcbAppendElemf(&gr->srcGen, "t%d*ibl%dPDim", i, i); + } + srcbEndList(&gr->srcGen); + srcbAppends(&gr->srcGen, ";\n"); + } + srcbAppends(&gr->srcGen, " unsigned tp = "); + srcbBeginList(&gr->srcGen, " + ", "0"); + for(i=0;ilog2MaxL;i++){ + srcbAppendElemf(&gr->srcGen, "t%d* ibp%d", i, i); + } + srcbEndList(&gr->srcGen); + srcbAppends(&gr->srcGen, ";\n"); + + + + + srcbAppends(&gr->srcGen, " \n" + " sOff += "); + 
srcbBeginList(&gr->srcGen, " + ", "0"); + for(i=0;ilog2MaxL;i++){ + srcbAppendElemf(&gr->srcGen, "t%d*ibsOff%d ", i, i); + } + srcbEndList(&gr->srcGen); + srcbAppends(&gr->srcGen, ";\n"); + if(reduxGenRequiresDst(gr)){ + srcbAppends(&gr->srcGen, " \n" + " dOff += "); + srcbBeginList(&gr->srcGen, " + ", "0"); + for(i=0;ilog2MaxL;i++){ + srcbAppendElemf(&gr->srcGen, "t%d*ibdOff%d ", i, i); + } + srcbEndList(&gr->srcGen); + srcbAppends(&gr->srcGen, ";\n"); + srcbAppends(&gr->srcGen, " ((TX*)SHMEM)[tp] = dOff;\n" + " local_barrier();\n" + " dOff = ((TX*)SHMEM)[LID_0];\n" + " local_barrier();\n"); + } + if(reduxGenRequiresDstArg(gr)){ + srcbAppends(&gr->srcGen, " \n" + " aOff += "); + srcbBeginList(&gr->srcGen, " + ", "0"); + for(i=0;ilog2MaxL;i++){ + srcbAppendElemf(&gr->srcGen, "t%d*ibaOff%d ", i, i); + } + srcbEndList(&gr->srcGen); + srcbAppends(&gr->srcGen, ";\n"); + srcbAppends(&gr->srcGen, " ((TX*)SHMEM)[tp] = aOff;\n" + " local_barrier();\n" + " aOff = ((TX*)SHMEM)[LID_0];\n" + " local_barrier();\n"); + } + srcbAppends(&gr->srcGen, " \n" + " const char* ts = s + sOff;\n"); + if(reduxGenRequiresDst(gr)){ + srcbAppends(&gr->srcGen, " char* td = d + dOff;\n"); + } + if(reduxGenRequiresDstArg(gr)){ + srcbAppends(&gr->srcGen, " char* ta = a + aOff;\n"); + } + srcbAppends(&gr->srcGen, " \n" + " \n"); +} +static void reduxGenSrcAppendPhase0 (GpuReduction* gr){ + srcbAppends(&gr->srcGen, + " /* PHASE 0 */\n" + " \n" + " /* Loop Cores. */\n"); + if (gr->ndd == 0){ + /** + * Special case: If ndd == 0, we know this is an all-reduce or nearly, so + * we know that the only split axis, if any, is going to be a reduction axis. + * Therefore, splitFree will always be 1, and we only need to generate one + * set of loops. + */ + + reduxGenSrcAppendLoops(gr, 0, 1); + }else{ + srcbAppends(&gr->srcGen, " if(splitReduce == 1){\n" + " /* Free axis possibly split. */\n"); + reduxGenSrcAppendLoops(gr, 1, 0); + srcbAppends(&gr->srcGen, " }else{\n" + " /* Reduce axis possibly split. */\n"); + reduxGenSrcAppendLoops(gr, 0, 1); + srcbAppends(&gr->srcGen, " }\n"); + } +} +static void reduxGenSrcAppendLoops (GpuReduction* gr, + int freeMaybeSplit, + int reduceMaybeSplit){ + srcbAppends(&gr->srcGen, " if(misalignL){\n"); + reduxGenSrcAppendLoop(gr, 1, freeMaybeSplit, reduceMaybeSplit); + srcbAppends(&gr->srcGen, " }\n"); + reduxGenSrcAppendLoop(gr, 0, freeMaybeSplit, reduceMaybeSplit); + srcbAppends(&gr->srcGen, + " \n" + " /**\n" + " * Are we misaligned on the right? If so, we have a partial reduction\n" + " * to save.\n" + " */\n" + " \n" + " if(misalignR){\n" + " HREDUX(pd, pa, tp, accV, accI);\n" + " \n" + " /* Right-write partial reduction to workspace. 
*/\n" + " if(LID_0 < D){\n" + " SETREDUXSTATE(wdR[GID_0*D+LID_0], waR[GID_0*D+LID_0], pd[LID_0], pa[LID_0]);\n" + " }\n" + " }\n"); +} +static void reduxGenSrcAppendLoop (GpuReduction* gr, + int initial, + int freeMaybeSplit, + int reduceMaybeSplit){ + int i; + + srcbAppends(&gr->srcGen, " while(v > 0){\n"); + reduxGenSrcAppendDecrement(gr); + reduxGenSrcAppendVertical (gr, freeMaybeSplit, reduceMaybeSplit); + srcbAppends(&gr->srcGen, " /* Reduction Increments */\n"); + for(i=gr->nds-1;i >= gr->ndd;i--){ + reduxGenSrcAppendIncrement(gr, i, initial, freeMaybeSplit, reduceMaybeSplit); + } + srcbAppends(&gr->srcGen, " /* Horizontal Reduction */\n" + " HREDUX(pd, pa, tp, accV, accI);\n" + " \n"); + reduxGenSrcAppendDstWrite(gr, initial, freeMaybeSplit, reduceMaybeSplit); + srcbAppends(&gr->srcGen, " /* Reinitialize accumulators */\n" + " INITREDUXSTATE(accV, accI);\n" + " \n"); + srcbAppends(&gr->srcGen, " /* Free Increments */\n"); + for(i=gr->ndd-1;i >= 0;i--){ + reduxGenSrcAppendIncrement(gr, i, initial, freeMaybeSplit, reduceMaybeSplit); + } + srcbAppends(&gr->srcGen, " /* Exit loop */\n" + " break;\n" + " }\n"); +} +static void reduxGenSrcAppendDecrement (GpuReduction* gr){ + srcbAppends(&gr->srcGen, " /* Decrement. */\n" + " v--;\n" + " \n"); +} +static void reduxGenSrcAppendVertical (GpuReduction* gr, + int freeMaybeSplit, + int reduceMaybeSplit){ + int i; + + if(!freeMaybeSplit && !reduceMaybeSplit){ + srcbAppends(&gr->srcGen, " /* Vertical Reductions */\n" + " LOADS(tmpV, ts);\n" + " REDUX(accV, accI, tmpV, GETIDX);\n" + " \n"); + }else{ + i = freeMaybeSplit ? gr->ndd-1 : gr->nds-1; + srcbAppendf(&gr->srcGen, " /* Vertical Reductions */\n" + " if(i%d+iSplit < l%d){\n" + " LOADS(tmpV, ts);\n" + " REDUX(accV, accI, tmpV, GETIDX);\n" + " }\n" + " \n", i, i); + } +} +static void reduxGenSrcAppendIncrement (GpuReduction* gr, + int axis, + int initial, + int freeMaybeSplit, + int reduceMaybeSplit){ + const char* breakOrCont = (initial) && (axis < gr->ndd) ? 
"break" : "continue"; + + if (freeMaybeSplit && axis == gr->ndd-1){ + srcbAppendf(&gr->srcGen, + " i%d += splitFree;\n" + " ts += sJ%d;", + axis, axis); + if(reduxGenRequiresDst(gr)){ + srcbAppendf(&gr->srcGen, "td += dJ%d;", axis); + } + if(reduxGenRequiresDstArg(gr)){ + srcbAppendf(&gr->srcGen, "ta += aJ%d;", axis); + } + srcbAppends(&gr->srcGen, "\n"); + srcbAppendf(&gr->srcGen, + " if (i%d < l%d){%s;}\n" + " else {i%d = 0;}\n" + " \n", + axis, axis, breakOrCont, axis); + }else if (reduceMaybeSplit && axis == gr->nds-1){ + srcbAppendf(&gr->srcGen, + " i%d += splitReduce;\n" + " ts += sJ%d;\n" + " if (i%d < l%d){%s;}\n" + " else {i%d = 0;}\n" + " \n", + axis, axis, axis, axis, breakOrCont, axis); + }else{ + srcbAppendf(&gr->srcGen, + " i%d++;\n" + " ts += sJ%d;", + axis, axis); + if(axis < gr->ndd){ + if(reduxGenRequiresDst(gr)){ + srcbAppendf(&gr->srcGen, "td += dJ%d;", axis); } - if(reduxRequiresDstArg(ctx)){ - ctx->flatDstArgStrides[ctx->ndfd] = axisGetDstArgStride(axis); - ctx->flatDstArgOffset += axisGetDstArgOffset(axis); + if(reduxGenRequiresDstArg(gr)){ + srcbAppendf(&gr->srcGen, "ta += aJ%d;", axis); } - - ctx->ndfd++; } + srcbAppends(&gr->srcGen, "\n"); + srcbAppendf(&gr->srcGen, + " if (i%d < l%d){%s;}\n" + " else {i%d = 0;}\n" + " \n", + axis, axis, breakOrCont, axis); + } +} +static void reduxGenSrcAppendDstWrite (GpuReduction* gr, + int initial, + int freeMaybeSplit, + int reduceMaybeSplit){ + if(initial){ + srcbAppends(&gr->srcGen, " /* Workspace Left-Write */\n" + " if(LID_0 < D){\n" + " SETREDUXSTATE(wdL[GID_0*D + LID_0], waL[GID_0*D + LID_0], pd[LID_0], pa[LID_0]);\n" + " }\n" + " \n"); + }else{ + if(!freeMaybeSplit){ + srcbAppends(&gr->srcGen, " /* Destination Write */\n" + " if(LID_0 < D){\n" + " STORED(td, pd[LID_0]);\n" + " STOREA(ta, pa[LID_0]);\n" + " }\n" + " \n"); + }else{ + if(gr->ndd > 0){ + srcbAppendf(&gr->srcGen, " /* Destination Write */\n" + " if(LID_0 < (l%d-i%dndd-1, gr->ndd-1, gr->ndd-1, gr->ndd-1); + }else{ + srcbAppendf(&gr->srcGen, " STORED(td, pd[LID_0]);\n" + " STOREA(ta, pa[LID_0]);\n"); + } + } + } +} +static void reduxGenSrcAppendPhase1 (GpuReduction* gr){ + srcbAppends(&gr->srcGen, + " /* PHASE 1 */\n" + " \n" + " /**\n" + " * If we are a collector block, gather all partial results for the\n" + " * same point to the left of the current position in our workspace\n" + " * and accumulate them into our partial result, then write out to\n" + " * destination/destination argument.\n" + " * We perform a left-read of our workspace and a right-read of the\n" + " * other blocks' workspace.\n" + " */\n" + " \n" + " if(misalignL && doFinish && LID_0 < D){\n" + " SETREDUXSTATE(accV, accI, wdL[(GID_0+0)*D+LID_0], waL[(GID_0+0)*D+LID_0]);\n" + " \n" + " for(k=-1; /* Starting with the first block to our left... */\n" + " (start +0)/B == /* Is our write target the same as that of */\n" + " (start+k*V+V-1)/B; /* the target k blocks to our left? */\n" + " k--){ /* Try moving one more to the left. */\n" + " REDUX(accV, accI, wdR[(GID_0+k)*D+LID_0], waR[(GID_0+k)*D+LID_0]);\n" + " }\n" + " \n"); + if(gr->ndd > 0){ + srcbAppendf(&gr->srcGen, + " if(LID_0 < (l%d-i%dndd-1, gr->ndd-1, gr->ndd-1, gr->ndd-1); + }else{ + srcbAppends(&gr->srcGen, + " STORED(td, accV);\n" + " STOREA(ta, accI);\n"); + } + srcbAppends(&gr->srcGen, + " }\n"); +} - ctx->flatSrcDimensions[i] = axisGetLen (axis); - ctx->flatSrcStrides[i] = axisGetSrcStride(axis); - ctx->flatSrcOffset += axisGetSrcOffset(axis); +/** + * @brief Compile the generated kernel. 
+ */
+
+static int        reduxGenCompile             (GpuReduction*  gr){
+        int ret;
+
+        ret  = GpuKernel_init(&gr->k,
+                              gr->gpuCtx,
+                              1,
+                              (const char**)&gr->kSourceCode,
+                              &gr->kSourceCodeLen,
+                              "redux",
+                              gr->kNumArgs,
+                              gr->kArgTypeCodes,
+                              GA_USE_CLUDA,
+                              &gr->kErrorString);
+
+        if (ret != GA_NO_ERROR){
+                return reduxGenCleanupMsg(gr, ret,
+                       "Failed to compile reduction kernel!\n"
+                       "Error code is: %d\n"
+                       "Error string is:\n"
+                       "%s\n"
+                       "Source code is:\n"
+                       "%s\n",
+                       ret, gr->kErrorString, gr->kSourceCode);
 	}
+
+        return reduxGenComputeLaunchBounds(gr);
+}
+
+/**
+ * @brief Compute the maximum number of threads this reduction operator will
+ *        support launching.
+ */
+
+static int        reduxGenComputeLaunchBounds (GpuReduction*  gr){
+        int    ret;
+        size_t a,b,c;
+
+        /**
+         * Compute the maximum number of threads this kernel will support,
+         * since this is critical to the scheduling and will not change now
+         * that the kernel is compiled.
+         *
+         * This depends on several exhaustible resources and isn't necessarily
+         * trivial to compute due to the complicated rules we must follow to
+         * align shared memory, possibly slightly increasing consumption.
+         */
+
+        ret = gpukernel_property(gr->k.k, GA_KERNEL_PROP_MAXLSIZE, &gr->maxLK);
+        if(ret != GA_NO_ERROR){
+                return reduxGenCleanupMsg(gr, ret,
+                       "Failed to read max local size for compiled kernel!\n");
+        }
+        a = gr->maxL0;
+        b = gr->maxLg;
+        c = gr->maxLM/reduxGenGetReduxStateSize(gr);
+        /* Kernel register use */
+        gr->maxLK = gr->maxLK<a ? gr->maxLK : a;/* Maximum block size on axis 0 */
+        gr->maxLK = gr->maxLK<b ? gr->maxLK : b;/* Maximum total block size     */
+        gr->maxLK = gr->maxLK<c ? gr->maxLK : c;/* Shared memory per thread.    */
+
+        /**
+         * We now have a tight bound on the maximum block size, but due to memory
+         * alignment rules the memory consumption may be slightly higher than we
+         * initially computed, and thus the shared memory use can still be
+         * excessive. The following loop will almost certainly decrement at most
+         * once, unless type alignments are very weird.
+         */
+
+        while(reduxGenGetSHMEMSize(gr, gr->maxLK) > gr->maxLM){
+                gr->maxLK--;
+        }
+
+        return reduxGenCleanup(gr, GA_NO_ERROR);
 }
 
 /**
- * @brief Select the number of stages of the reduction.
+ * @brief Cleanup generator context.
+ */
+
+static int        reduxGenCleanup             (GpuReduction*  gr, int ret){
+        if(ret != GA_NO_ERROR){
+                free(gr->kArgTypeCodes);
+                free(gr->kSourceCode);
+                free(gr->kErrorString);
+
+                memset(gr, 0, sizeof(*gr));
+                free(gr);
+        }
+
+        return ret;
+}
+static int        reduxGenCleanupMsg          (GpuReduction*  gr, int ret,
+                                               const char*    fmt, ...){
+#if DEBUG
+        FILE* fp = stderr;
+
+        va_list ap;
+        va_start(ap, fmt);
+        vfprintf(fp, fmt, ap);
+        va_end(ap);
+        fflush(fp);
+#else
+        (void)fmt;
+#endif
+
+        return reduxGenCleanup(gr, ret);
+}
+
+/**
+ * @brief Estimate the level of parallelism available in the GPU context of
+ *        this reduction operator.
  *
- * This depends a lot on the GPU and the specific size of the reduction.
+ * This is a rough target number of threads. It would definitely fill the
+ * device, plus some substantial margin.
  */
 
-static int reduxSelectNumStages          (redux_ctx*  ctx){
-	size_t parallelism = reduxEstimateParallelism(ctx);
+static size_t     reduxGenEstimateParallelism (const GpuReduction*  gr){
+        /**
+         * An arbitrary margin factor ensuring there will be a few thread blocks
+         * per SMX.
+         *
+         * E.g. on Kepler, each SMX can handle up to two 1024-thread blocks
+         * simultaneously, so a margin of 6/SMX should ensure with very high
+         * likelihood that all SMXes will be fed and kept busy.
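+         *
+         * For example, on a hypothetical 13-SMX device with maxLg = 1024, the
+         * estimate works out to 6 * 13 * 1024 = 79872 target threads.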
+ */ - if (ctx->zeroRdxAxes || /* Reduction over 0 elements? */ - ctx->prodAllAxes <= ctx->maxLg || /* Reduction over few elements? */ - ctx->prodFreeAxes >= ctx->prodRdxAxes || /* More destinations than reductions? */ - ctx->prodFreeAxes >= parallelism ){ /* Destination very large? */ - ctx->numStages = 1; - }else{ - /* BUG: Switch to 2Stage when small code model fixed. */ - ctx->numStages = 1; + size_t marginFactor = 6; + return marginFactor * gr->numProcs * gr->maxLg; +} + +/** + * @brief Returns whether the reduction interface requires a dst argument. + */ + +static int reduxGenRequiresDst (const GpuReduction* gr){ + switch (gr->op){ + case GA_REDUCE_ARGMIN: + case GA_REDUCE_ARGMAX: + return 0; + default: + return 1; + } +} + +/** + * @brief Returns whether the reduction interface requires a dstArg argument. + */ + +static int reduxGenRequiresDstArg (const GpuReduction* gr){ + switch (gr->op){ + case GA_REDUCE_MINANDARGMIN: + case GA_REDUCE_MAXANDARGMAX: + case GA_REDUCE_ARGMIN: + case GA_REDUCE_ARGMAX: + return 1; + default: + return 0; } - return ctx->numStages == 1 ? reduxPlan1Stage(ctx) : reduxPlan2Stage(ctx); } /** - * @brief Plan a 1-stage reduction. + * @brief Returns whether the generated kernel internally requires a dst + * workspace. + * + * This is semantically subtly different from reduxGenRequiresDst(). The main + * difference is in the implementation of the GA_REDUCE_ARGMIN/ARGMAX + * reductions; both require a dst workspace buffer for the min/max values + * associated with the indices that they return, even though they will be + * discarded. * - * Inputs: ctx->xdSrcFlat[0...ctx->ndf-1] + * As of now, all reductions use a dst workspace internally. + */ + +static int reduxGenKernelRequiresDst (const GpuReduction* gr){ + return 1; +} + +/** + * @brief Returns whether the generated kernel internally requires a dstArg + * workspace. + * + * This is semantically subtly different from reduxHasDstArg(), since it asks + * whether the reduction, even though it might not accept a dstArg argument, + * still requires a dstArg workspace internally. * - * This plan involves a direct write to the destinations, and does not require - * working space. + * Currently, there exist no operations that require a dstArg workspace + * internally but which is not also part of the external interface. + */ + +static int reduxGenKernelRequiresDstArg (const GpuReduction* gr){ + return reduxGenRequiresDstArg(gr); +} + +/** + * @brief Whether or not an axis is maybe split. * - * Because the reduction is deterministic, all reductions required for any - * destination element must be performed within a single thread block. + * An axis is possibly split if it is the last free or last reduction axis. + */ + +static int reduxGenAxisMaybeSplit (const GpuReduction* gr, int axis){ + return axis == gr->ndd-1 || axis == gr->nds-1; +} + +/** + * @brief Get the number of bytes of workspace per (partial) reduction per thread. + */ + +static size_t reduxGenGetReduxStateSize (const GpuReduction* gr){ + size_t total = 0, idxSize = gpuarray_get_elsize(gr->idxTypeCode); + + /* The accumulator and index types can be wider than dst/dstArg's types. */ + total += reduxGenKernelRequiresDst(gr) ? + gpuarray_get_elsize(gr->accTypeCode) : + 0; + total += reduxGenKernelRequiresDstArg(gr) ? + gpuarray_get_elsize(gr->idxTypeCode) : + 0; + + /* At minimum, there must be space for the offset permute. */ + total = total < idxSize ? idxSize : total; + + + /* Return the calculated amount of space. 
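+           For example, assuming a float32 accumulator and a 64-bit index
+           type, a max-and-argmax reduction keeps 4 + 8 = 12 bytes of state
+           per thread, while a plain sum keeps max(4, 8) = 8 bytes, since the
+           state may never be smaller than the index type used for the offset
+           permute.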
*/ + return total; +} + +/** + * @brief Get the maximum number of threads this operator's kernel can handle. + */ + +static size_t reduxGenGetMaxLocalSize (const GpuReduction* gr){ + return gr->maxLK; +} + +/** + * @brief Get the shared memory consumption for a given block size. * - * In this implementation we choose to perform only intra-warp reductions, - * insulating ourselves from having to worry about the interplay between block - * size and kernel source code (A kernel's max block size is limited by - * numerous factors including its own source code, but the specific kernel we - * pick and generate requires foreknowledge of its block size. Chicken or egg). + * This is non-trivial since it requires ensuring alignment of datatypes. */ -static int reduxPlan1Stage (redux_ctx* ctx){ - int i; - axis_desc* axis; +static size_t reduxGenGetSHMEMSize (const GpuReduction* gr, size_t bs){ + const gpuarray_type* type; + size_t total = 0; - reduxSortAxisPtrsBy(ctx->xdSrcPtrs, ctx->xdSrcFlat, ctx->ndfs, - reduxSortPlan1Stage); + if(reduxGenKernelRequiresDst(gr)){ + type = gpuarray_get_type(gr->accTypeCode); + total = DIVIDECEIL(total, type->align)*type->align; + total += bs*type->size; + } + if(reduxGenKernelRequiresDstArg(gr)){ + type = gpuarray_get_type(gr->idxTypeCode); + total = DIVIDECEIL(total, type->align)*type->align; + total += bs*type->size; + } - ctx->st1.ndh = 0; - ctx->st1.ndhp = 0; - ctx->st1.ndhr = 0; + return total; +} + +/** + * @brief Get the shared memory byte offset for dst. + */ + +static size_t reduxGenGetSHMEMDstOff (const GpuReduction* gr, size_t bs){ + return 0; +} + +/** + * @brief Get the shared memory byte offset for dstArg. + */ + +static size_t reduxGenGetSHMEMDstArgOff (const GpuReduction* gr, size_t bs){ + const gpuarray_type* type; + size_t total = 0; - for (i=0;indfd && ihwAxisStage0 = i; + if(reduxGenKernelRequiresDst(gr) && reduxGenKernelRequiresDstArg(gr)){ + type = gpuarray_get_type(gr->accTypeCode); + total = DIVIDECEIL(total, type->align)*type->align; + total += bs*type->size; + type = gpuarray_get_type(gr->idxTypeCode); + total = DIVIDECEIL(total, type->align)*type->align; - ctx->st1.ndh++; + return total; + }else{ + return 0; } - ctx->st1.ndhd = ctx->st1.ndh; - - return reduxGenSource(ctx); } /** - * @brief Plan a 2-stage reduction. - * - * Inputs: ctx->xdSrcFlat[0...ctx->ndf-1] - * - * This plan involves splitting the reduction into two stages: - * - * Stage 0: A huge reduction only along reduction axes into a workspace. - * Stage 1: A small reduction into the destination. + * @brief Initialize the context. * - * We select only reduction axes in the first stage. + * After this function, calling reduxInvCleanup*() becomes safe. */ -static int reduxPlan2Stage (redux_ctx* ctx){ - int i, j, ret = 0; - axis_desc* axis; - size_t a = 1, aL, aPartial, target = reduxEstimateParallelism(ctx), sz; - +static int reduxInvInit (redux_ctx* ctx){ /** - * Plan Stage 0. - * - * Sort axis descriptions reduction-axes-first then longest-first, and - * select up to 3 reduction axes, splitting them s.t. their product does - * not exceed the max block size. + * We initialize certain parts of the context. 
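+         * In particular, every pointer member is nulled and every product or
+         * size is reset to a neutral value, so that reduxInvCleanup() can
+         * unconditionally free()/release whatever was allocated, even after a
+         * failure partway through the invocation.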
*/ - reduxSortAxisPtrsBy(ctx->xdSrcPtrs, ctx->xdSrcFlat, ctx->ndfs, - reduxSortPlan2Stage0); + ctx->l = NULL; + ctx->lPDim = NULL; + ctx->sJ = NULL; + ctx->dJ = NULL; + ctx->aJ = NULL; + ctx->ibs = NULL; + ctx->ibp = NULL; + ctx->iblPDim = NULL; + ctx->ibsOff = NULL; + ctx->ibdOff = NULL; + ctx->ibaOff = NULL; + ctx->kArgs = NULL; + ctx->xdSrc = NULL; + ctx->xdSrcPtrs = NULL; + ctx->xdTmpPtrs = NULL; + ctx->xdSplit = NULL; - ctx->st1.ndh = 0; - ctx->st1.ndhp = 0; - ctx->st1.ndhr = 0; - ctx->st1.ndhd = 0; + ctx->w = NULL; - for(i=0;indfs && ihwAxisStage0 = i; - axis->sliceLen = aL; - axis->tmpLen = 1; - - ctx->st1.ndh++; - }else{ - a /= aL; - aPartial = target/a; - if(aPartial >= 2){ - a *= aPartial; - - axis->hwAxisStage0 = i++; - axis->sliceLen = aPartial; - axis->tmpLen = (axis->len+axis->sliceLen-1)/axis->sliceLen; - - ctx->st1.ndh++; - ctx->st1.ndhp++; - } - break; - } + ctx->prodAllAxes = ctx->prodRdxAxes = ctx->prodFreeAxes = 1; + ctx->bs = ctx->gs = 1; + + return reduxInvInferProperties(ctx); +} + +/** + * @brief Begin inferring the properties of the reduction invocation. + */ + +static int reduxInvInferProperties (redux_ctx* ctx){ + axis_desc* a; + int i, j; + size_t d; + + + /* Insane src, reduxLen, dst or dstArg? */ + if(!ctx->reduxList){ + ctx->reduxLen = ctx->src->nd; } - ctx->st1.ndhr = ctx->st1.ndh; + if (!ctx->src){ + return reduxInvCleanupMsg(ctx, GA_INVALID_ERROR, + "src is NULL!\n"); + }else if (ctx->src->nd <= 0){ + return reduxInvCleanupMsg(ctx, GA_INVALID_ERROR, + "src is a scalar, cannot reduce it!\n"); + }else if (ctx->reduxLen < 0){ + return reduxInvCleanupMsg(ctx, GA_INVALID_ERROR, + "Length of list of dimensions to be reduced is less than 0!\n"); + }else if (ctx->src->nd < (unsigned)ctx->reduxLen){ + return reduxInvCleanupMsg(ctx, GA_INVALID_ERROR, + "src has fewer dimensions than there are dimensions to reduce!\n"); + }else if (reduxInvRequiresDst (ctx) && !ctx->dst){ + return reduxInvCleanupMsg(ctx, GA_INVALID_ERROR, + "dst is NULL, but reduction requires it!\n"); + }else if (reduxInvRequiresDstArg(ctx) && !ctx->dstArg){ + return reduxInvCleanupMsg(ctx, GA_INVALID_ERROR, + "dstArg is NULL, but reduction requires it!\n"); + }else if (ctx->dst && ctx->dst->nd +ctx->reduxLen != ctx->src->nd){ + return reduxInvCleanupMsg(ctx, GA_INVALID_ERROR, + "dst is of incorrect dimensionality for this reduction!\n"); + }else if (ctx->dstArg && ctx->dstArg->nd+ctx->reduxLen != ctx->src->nd){ + return reduxInvCleanupMsg(ctx, GA_INVALID_ERROR, + "dstArg is of incorrect dimensionality for this reduction!\n"); + } + ctx->nds = ctx->src->nd; + ctx->ndr = ctx->reduxLen; + ctx->ndd = ctx->nds - ctx->ndr; + ctx->ndfs = ctx->ndfr = ctx->ndfd = 0; + /* Insane reduxList? */ + for (i=0;indr;i++){ + j = ctx->reduxList ? ctx->reduxList[i] : i; + if (j < -ctx->nds || j >= ctx->nds){ + return reduxInvCleanupMsg(ctx, GA_INVALID_ERROR, + "Insane axis number %d! Should be [%d, %d)!\n", + j, -ctx->nds, ctx->nds); + } + j = j<0 ? ctx->nds+j : j; + d = ctx->src->dimensions[j]; + ctx->zeroRdxAxes += !d; + ctx->prodRdxAxes *= d?d:1; + } + + /** - * We now have enough information to allocate the workspaces. + * Insane shape? + * + * The source tensor is allowed to be empty (its shape may contain 0s). + * However, all axes that are of length 0 must be reduction axes. + * + * The reason for this is that a reduction cannot store any output into an + * empty destination tensor (whose dimensions are the free axes), because + * it has 0 space. The operation cannot then fulfill its contract. 
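+         * For example, summing a (4, 0)-shaped tensor over axis 0 alone is
+         * rejected: the surviving length-0 axis would make the destination
+         * empty, leaving nowhere to store the 0-element column sums.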
+ * + * On the other hand, when some or all reduction axes of a tensor are of + * length 0, the reduction can be interpreted as initializing the + * destination tensor to the identity value of the operation. For lack of a + * better idea, the destination argument tensor can then be zeroed. */ - - ctx->ndt = ctx->ndfs - ctx->st1.ndh + ctx->st1.ndhp; - ctx->xdTmpPtrs = malloc(ctx->ndt*sizeof(*ctx->xdTmpPtrs)); - ctx->tmpDstDimensions = malloc(ctx->ndt*sizeof(*ctx->tmpDstDimensions)); - ctx->tmpDstStrides = malloc(ctx->ndt*sizeof(*ctx->tmpDstStrides)); - ctx->tmpDstArgStrides = malloc(ctx->ndt*sizeof(*ctx->tmpDstArgStrides)); - if(!ctx->xdTmpPtrs || !ctx->tmpDstDimensions || !ctx->tmpDstStrides || - !ctx->tmpDstArgStrides){ - return reduxCleanup(ctx, GA_MEMORY_ERROR); - } - for(i=j=0;indfs;i++){ - axis = reduxGetSrcFlatAxis(ctx, i); - if(!axisIsHW(axis, 0) || axisIsPartialHW(axis, 0)){ - ctx->xdTmpPtrs [j] = axis; - ctx->tmpDstDimensions[j] = axisGetTmpLen(axis); - } + + for (i=0;inds;i++){ + d = ctx->src->dimensions[i]; + ctx->zeroAllAxes += !d; + ctx->prodAllAxes *= d?d:1; } - - if (reduxKernelRequiresDst(ctx)){ - sz = gpuarray_get_elsize(ctx->dstTypeCode); - - for(i=ctx->ndt-1;i>=0;i--){ - ctx->tmpDstStrides[i] = sz; - sz *= ctx->tmpDstDimensions[i]; - } - - ctx->tmpDstData = gpudata_alloc(ctx->gpuCtx, sz, 0, 0, &ret); - if(ret != GA_NO_ERROR){ - return reduxCleanup(ctx, ret); + if (ctx->zeroAllAxes != ctx->zeroRdxAxes){ + return reduxInvCleanupMsg(ctx, GA_INVALID_ERROR, + "Source tensor has length-0 dimensions that are not reduced!\n"); + } + ctx->prodFreeAxes = ctx->prodAllAxes/ctx->prodRdxAxes; + + + /** + * Allocate and construct source-tensor axis-description lists. + * + * While constructing the descriptions of each axis, verify that: + * + * 1. reduxLen has no duplicates. + * 2. dst and/or dstArg's dimensions match src's dimensions, stripped of + * the reduction axes. + */ + + ctx->xdSrc = calloc(ctx->nds, sizeof(*ctx->xdSrc)); + ctx->xdSrcPtrs = calloc(ctx->nds+1, sizeof(*ctx->xdSrcPtrs)); + if (!ctx->xdSrc || !ctx->xdSrcPtrs){ + return reduxInvCleanup(ctx, GA_MEMORY_ERROR); + } + for (i=0;inds;i++){ + axisInit(&ctx->xdSrc[i], + ctx->src->dimensions[i], + ctx->src->strides[i]); + } + for (i=0;indr;i++){ + j = ctx->reduxList ? ctx->reduxList[i] : i; + j = j<0 ? 
ctx->nds+j : j; + a = reduxInvGetSrcAxis(ctx, j); + if (axisIsReduced(a)){ + return reduxInvCleanupMsg(ctx, GA_INVALID_ERROR, + "Axis %d appears multiple times in the " + "reduction axis list!\n", + j); } + axisMarkReduced(a, i); } - if (reduxKernelRequiresDstArg(ctx)){ - sz = gpuarray_get_elsize(ctx->dstArgTypeCode); + for (i=j=0;inds;i++){ + axis_desc* a = reduxInvGetSrcAxis(ctx, i); + size_t srcLen = axisGetLen(a), dstLen, dstArgLen; - for(i=ctx->ndt-1;i>=0;i--){ - ctx->tmpDstArgStrides[i] = sz; - sz *= ctx->tmpDstDimensions[i]; + if (axisIsReduced(a)){continue;} + if (reduxInvRequiresDst(ctx)){ + dstLen = ctx->dst->dimensions[j]; + + if(srcLen != dstLen){ + return reduxInvCleanupMsg(ctx, GA_INVALID_ERROR, + "Source axis %d has length %zu, but " + "corresponding destination axis %d has length %zu!\n", + i, srcLen, j, dstLen); + } + + a->dstStride = ctx->dst->strides[j]; } - - ctx->tmpDstArgData = gpudata_alloc(ctx->gpuCtx, sz, 0, 0, &ret); - if(ret != GA_NO_ERROR){ - return reduxCleanup(ctx, ret); + if (reduxInvRequiresDstArg(ctx)){ + dstArgLen = ctx->dstArg->dimensions[j]; + + if(srcLen != dstArgLen){ + return reduxInvCleanupMsg(ctx, GA_INVALID_ERROR, + "Source axis %d has length %zu, but " + "corresponding destination-argument axis %d has length %zu!\n", + i, srcLen, j, dstArgLen); + } + + a->dstArgStride = ctx->dstArg->strides[j]; } + + j++; } + /** - * Plan Stage 1. + * Grab gpudata buffers and byte offsets before we begin flattening the + * tensors. As we flatten the tensor, we may reverse some axes, leading to + * a bump of the byte offset. */ - qsort(ctx->xdTmpPtrs, ctx->ndt, sizeof(*ctx->xdTmpPtrs), reduxSortPlan1Stage); - - ctx->st2.ndh = 0; - ctx->st2.ndhp = 0; - ctx->st2.ndhr = 0; - - for (i=0;indfd && ihwAxisStage1 = i; - - ctx->st2.ndh++; + ctx->flatSrcData = ctx->src->data; + ctx->flatSrcOffset = ctx->src->offset; + if(reduxInvRequiresDst(ctx)){ + ctx->flatDstData = ctx->dst->data; + ctx->flatDstOffset = ctx->dst->offset; } - ctx->st2.ndhd = ctx->st2.ndh; - - return reduxGenSource(ctx); + if(reduxInvRequiresDstArg(ctx)){ + ctx->flatDstArgData = ctx->dstArg->data; + ctx->flatDstArgOffset = ctx->dstArg->offset; + } + + return reduxInvFlattenSource(ctx); } /** - * @brief Generate the kernel code for the reduction. - * - * @return GA_MEMORY_ERROR if not enough memory left; GA_NO_ERROR otherwise. + * @brief Flatten the source tensor as much as is practical. + * + * This makes the axis lengths as long as possible and the tensor itself as + * contiguous as possible. 
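+ *
+ *        For example, if a C-contiguous (3, 4, 5) tensor is reduced over
+ *        axes {1, 2}, the two reduction axes are adjacent in memory and can
+ *        be merged, so the flattened view behaves like a (3, 20) tensor with
+ *        a single reduction axis.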
*/ -static int reduxGenSource (redux_ctx* ctx){ - reduxAppendSource(ctx); - ctx->sourceCodeLen = ctx->s.l; - ctx->sourceCode = strb_cstr(&ctx->s); - if (!ctx->sourceCode){ - return reduxCleanup(ctx, GA_MEMORY_ERROR); - } - - return reduxCompile(ctx); -} -static void reduxAppendSource (redux_ctx* ctx){ - reduxAppendIncludes (ctx); - reduxAppendMacroDefs (ctx); - reduxAppendTypedefs (ctx); - reduxAppendGetInitValFns (ctx); - reduxAppendWriteBackFn (ctx); - reduxAppendReduxKernel (ctx); -} -static void reduxAppendTensorDeclArgs (redux_ctx* ctx, - const char* type, - const char* baseName){ - srcbAppendElemf(&ctx->srcGen, "%s* %sPtr", type, baseName); - srcbAppendElemf(&ctx->srcGen, "const X %sOff", baseName); - srcbAppendElemf(&ctx->srcGen, "const GLOBAL_MEM X* %sSteps", baseName); - (void)reduxAppendTensorCallArgs;/* Silence unused warning */ -} -static void reduxAppendTensorCallArgs (redux_ctx* ctx, - const char* baseName){ - srcbAppendElemf(&ctx->srcGen, "%sPtr", baseName); - srcbAppendElemf(&ctx->srcGen, "%sOff", baseName); - srcbAppendElemf(&ctx->srcGen, "%sSteps", baseName); -} -static void reduxAppendMacroDefs (redux_ctx* ctx){ - int i; - - srcbAppends (&ctx->srcGen, "#define FOROVER(idx) for (i##idx = i##idx##Start; i##idx < i##idx##End; i##idx++)\n"); - srcbAppends (&ctx->srcGen, "#define ESCAPE(idx) if (i##idx >= i##idx##Dim){continue;}\n"); +static int reduxInvFlattenSource (redux_ctx* ctx){ + axis_desc* axis, *flatAxis, *sortAxis; + int i, j, k, isSensitive; - /* srcVal indexer */ - srcbAppends (&ctx->srcGen, "#define srcVal (*(const GLOBAL_MEM S*)("); - srcbBeginList (&ctx->srcGen, "+", "0"); - srcbAppendElemf(&ctx->srcGen, "(const GLOBAL_MEM char*)srcPtr"); - srcbAppendElemf(&ctx->srcGen, "srcOff"); - for (i=0;indfs;i++){ - srcbAppendElemf(&ctx->srcGen, "i%d*i%dSStep", i, i); - } - srcbEndList (&ctx->srcGen); - srcbAppends (&ctx->srcGen, "))\n"); - - /* dstVal indexer */ - if (reduxKernelRequiresDst(ctx)){ - srcbAppends (&ctx->srcGen, "#define dstVal (*(GLOBAL_MEM T*)("); - srcbBeginList (&ctx->srcGen, "+", "0"); - srcbAppendElemf(&ctx->srcGen, "(GLOBAL_MEM char*)dstPtr"); - srcbAppendElemf(&ctx->srcGen, "dstOff"); - for (i=0;indfd;i++){ - srcbAppendElemf(&ctx->srcGen, "i%d*i%dDStep", i, i); - } - srcbEndList (&ctx->srcGen); - srcbAppends (&ctx->srcGen, "))\n"); - } - - /* dstArgVal indexer */ - if (reduxKernelRequiresDstArg(ctx)){ - srcbAppends (&ctx->srcGen, "#define dstArgVal (*(GLOBAL_MEM A*)("); - srcbBeginList (&ctx->srcGen, "+", "0"); - srcbAppendElemf(&ctx->srcGen, "(GLOBAL_MEM char*)dstArgPtr"); - srcbAppendElemf(&ctx->srcGen, "dstArgOff"); - for (i=0;indfd;i++){ - srcbAppendElemf(&ctx->srcGen, "i%d*i%dAStep", i, i); - } - srcbEndList (&ctx->srcGen); - srcbAppends (&ctx->srcGen, "))\n"); - } - - /* rdxIdx indexer */ - srcbAppends (&ctx->srcGen, "#define rdxIdx ("); - srcbBeginList (&ctx->srcGen, "+", "0"); - for (i=ctx->ndfd;indfs;i++){ - srcbAppendElemf(&ctx->srcGen, "i%d*i%dPDim", i, i); - } - srcbEndList (&ctx->srcGen); - srcbAppends (&ctx->srcGen, ")\n"); -} -static void reduxAppendIncludes (redux_ctx* ctx){ - strb_appends(&ctx->s, "/* Includes */\n"); - strb_appends(&ctx->s, "#include \"cluda.h\"\n"); - strb_appends(&ctx->s, "\n"); - strb_appends(&ctx->s, "\n"); - strb_appends(&ctx->s, "\n"); -} -static void reduxAppendTypedefs (redux_ctx* ctx){ - strb_appendf(&ctx->s, "typedef %s S;\n", ctx->srcTypeStr); /* The type of the source array. */ - strb_appendf(&ctx->s, "typedef %s T;\n", ctx->dstTypeStr); /* The type of the destination array. 
*/ - strb_appendf(&ctx->s, "typedef %s A;\n", ctx->dstArgTypeStr);/* The type of the destination argument array. */ - strb_appendf(&ctx->s, "typedef %s X;\n", ctx->idxTypeStr); /* The type of the indices: signed 32/64-bit. */ - strb_appendf(&ctx->s, "typedef %s K;\n", ctx->accTypeStr); /* The type of the accumulator variable. */ -} -static void reduxAppendGetInitValFns (redux_ctx* ctx){ - /** - * Initial value functions. - */ + ctx->ndfs = ctx->nds; - strb_appendf(&ctx->s, "WITHIN_KERNEL T getInitValTFn(void){\n" - "\treturn (%s);\n" - "}\n\n\n\n" - "WITHIN_KERNEL K getInitValKFn(void){\n" - "\treturn (%s);\n" - "}\n\n\n\n", ctx->initValT, ctx->initValK); -} -static void reduxAppendWriteBackFn (redux_ctx* ctx){ /** - * Global memory value reduction function. - * - * Responsible for either: - * 1) Safe writeback of final value to memory, or - * 2) Safe atomic reduction of partial value into memory. + * Pass 1: Flatten out 0- and 1-length dimensions. We already know that + * + * a) There are no 0-length free dimensions, because that + * constitutes an invalid input, and + * b) How many 0-length reduction dimensions there are, because + * we counted them in the error-checking code. + * + * So if there are any 0-length axes, we can delete all reduction axes and + * replace them with a single one. + * + * We can also delete 1-length axes outright, since they can always be + * ignored; They are always indexed at [0]. */ - srcbAppends (&ctx->srcGen, "WITHIN_KERNEL void writeBackFn("); - srcbBeginList (&ctx->srcGen, ", ", "void"); - if (reduxKernelRequiresDst(ctx)){ - srcbAppendElemf(&ctx->srcGen, "GLOBAL_MEM T* d_"); - srcbAppendElemf(&ctx->srcGen, "T d"); - } - if (reduxKernelRequiresDstArg(ctx)){ - srcbAppendElemf(&ctx->srcGen, "GLOBAL_MEM A* a_"); - srcbAppendElemf(&ctx->srcGen, "A a"); - } - srcbEndList (&ctx->srcGen); - srcbAppends (&ctx->srcGen, "){\n"); - - if (reduxIs1Stage(ctx)){ - if (reduxKernelRequiresDst (ctx)){ - srcbAppends (&ctx->srcGen, "\t*d_ = d;\n"); - } - if (reduxKernelRequiresDstArg(ctx)){ - srcbAppends (&ctx->srcGen, "\t*a_ = a;\n"); - } - }else{ - /* BUG: Implement the atomic reduction, one or two CAS loops. */ - if ( reduxKernelRequiresDst (ctx) && !reduxKernelRequiresDstArg(ctx)){ - - }else if (!reduxKernelRequiresDst (ctx) && reduxKernelRequiresDstArg(ctx)){ - - }else if ( reduxKernelRequiresDst (ctx) && reduxKernelRequiresDstArg(ctx)){ - - } - } - - /* Close off function. */ - strb_appends(&ctx->s, "}\n\n\n\n"); -} -static void reduxAppendReduxKernel (redux_ctx* ctx){ - reduxAppendPrototype (ctx); - strb_appends (&ctx->s, "{\n"); - reduxAppendIndexDeclarations(ctx); - reduxAppendRangeCalculations(ctx); - reduxAppendLoops (ctx); - strb_appends (&ctx->s, "}\n"); -} -static void reduxAppendPrototype (redux_ctx* ctx){ - srcbAppends (&ctx->srcGen, "KERNEL void reduxKer("); - srcbBeginList (&ctx->srcGen, ", ", "void"); - reduxAppendTensorDeclArgs(ctx, "S", "src"); - srcbAppendElemf(&ctx->srcGen, "const GLOBAL_MEM X* srcSize"); - srcbAppendElemf(&ctx->srcGen, "const GLOBAL_MEM X* chunkSize"); - if (reduxKernelRequiresDst(ctx)){ - reduxAppendTensorDeclArgs(ctx, "T", "dst"); - } - if (reduxKernelRequiresDstArg(ctx)){ - reduxAppendTensorDeclArgs(ctx, "A", "dstArg"); - } - srcbEndList (&ctx->srcGen); - srcbAppends (&ctx->srcGen, ")"); -} -static void reduxAppendIndexDeclarations (redux_ctx* ctx){ - int i; - strb_appends(&ctx->s, "\t/* GPU kernel coordinates. Always 3D in OpenCL/CUDA. 
*/\n"); - - strb_appends(&ctx->s, "\tX bi0 = GID_0, bi1 = GID_1, bi2 = GID_2;\n"); - strb_appends(&ctx->s, "\tX bd0 = LDIM_0, bd1 = LDIM_1, bd2 = LDIM_2;\n"); - strb_appends(&ctx->s, "\tX ti0 = LID_0, ti1 = LID_1, ti2 = LID_2;\n"); - strb_appends(&ctx->s, "\tX gi0 = bi0*bd0+ti0, gi1 = bi1*bd1+ti1, gi2 = bi2*bd2+ti2;\n"); - if (ctx->st1.ndh>0){ - strb_appends(&ctx->s, "\tX "); - for (i=0;ist1.ndh;i++){ - strb_appendf(&ctx->s, "ci%u = chunkSize[%u]%s", - i, i, (i==ctx->st1.ndh-1) ? ";\n" : ", "); - } - } - strb_appends(&ctx->s, "\t\n\t\n"); - strb_appends(&ctx->s, "\t/* Free indices & Reduction indices */\n"); - if (ctx->ndfs > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->ndfs, "", ";\n");} - if (ctx->ndfs > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->ndfs, "Dim", ";\n");} - if (ctx->ndfs > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->ndfs, "Start", ";\n");} - if (ctx->ndfs > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->ndfs, "End", ";\n");} - if (ctx->ndfs > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->ndfs, "SStep", ";\n");} - if (ctx->ndfd > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->ndfd, "DStep", ";\n");} - if (ctx->ndfd > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->ndfd, "AStep", ";\n");} - if (ctx->ndfs > ctx->ndfd){appendIdxes (&ctx->s, "\tX ", "i", ctx->ndfd, ctx->ndfs, "PDim", ";\n");} - strb_appends(&ctx->s, "\t\n\t\n"); -} -static void reduxAppendRangeCalculations (redux_ctx* ctx){ - axis_desc* axis; - size_t hwDim; - int i; - - strb_appends(&ctx->s, "\t/* Compute ranges for this thread. */\n"); - - for (i=0;indfs;i++){ - strb_appendf(&ctx->s, "\ti%dDim = srcSize[%d];\n", i, i); - } - for (i=0;indfs;i++){ - strb_appendf(&ctx->s, "\ti%dSStep = srcSteps[%d];\n", i, i); - } - if (reduxKernelRequiresDst(ctx)){ - for (i=0;indfd;i++){ - strb_appendf(&ctx->s, "\ti%dDStep = dstSteps[%d];\n", i, i); - } - } - if (reduxKernelRequiresDstArg(ctx)){ - for (i=0;indfd;i++){ - strb_appendf(&ctx->s, "\ti%dAStep = dstArgSteps[%d];\n", i, i); - } - } - for (i=ctx->ndfs-1;i>=ctx->ndfd;i--){ - /** - * If this is the last index, it's the first cumulative dimension - * product we generate, and thus we initialize to 1. - */ - - if (i == ctx->ndfs-1){ - strb_appendf(&ctx->s, "\ti%dPDim = 1;\n", i); - }else{ - strb_appendf(&ctx->s, "\ti%dPDim = i%dPDim * i%dDim;\n", i, i+1, i+1); - } - } - for (i=0;indfs;i++){ - /** - * Up to MAX_HW_DIMS dimensions get to rely on hardware loops. - * The others, if any, have to use software looping beginning at 0. - */ - - axis = reduxGetSrcFlatAxis(ctx, i); - if (axisIsHW(axis, 0)){ - hwDim = axisGetHWAxisNum(axis, 0); - //axisInSet(i, ctx->st1.axisList, ctx->st1.ndh, &hwDim); - strb_appendf(&ctx->s, "\ti%dStart = gi%d * ci%d;\n", i, hwDim, hwDim); - }else{ - strb_appendf(&ctx->s, "\ti%dStart = 0;\n", i); - } - } - for (i=0;indfs;i++){ - /** - * Up to MAX_HW_DIMS dimensions get to rely on hardware loops. - * The others, if any, have to use software looping beginning at 0. 
- */ - - axis = reduxGetSrcFlatAxis(ctx, i); - if (axisIsHW(axis, 0)){ - hwDim = axisGetHWAxisNum(axis, 0); - //axisInSet(i, ctx->st1.axisList, ctx->st1.ndh, &hwDim); - strb_appendf(&ctx->s, "\ti%dEnd = i%dStart + ci%d;\n", i, i, hwDim); - }else{ - strb_appendf(&ctx->s, "\ti%dEnd = i%dStart + i%dDim;\n", i, i, i); + for (i=j=0;indfs;i++){ + axis = reduxInvGetSrcAxis(ctx, i); + if (!reduxTryFlattenOut(ctx, axis)){ + *reduxInvGetSrcAxis(ctx, j++) = *axis; } } - - strb_appends(&ctx->s, "\t\n\t\n"); -} -static void reduxAppendLoops (redux_ctx* ctx){ - int i; - - for (i=0;indfd;i++){ - srcbAppendf(&ctx->srcGen, "\tFOROVER(%d){ESCAPE(%d)\n", i, i); - } - - srcbAppends (&ctx->srcGen, "\t\tT rdxT;\n"); - srcbAppends (&ctx->srcGen, "\t\tK rdxK = getInitValKFn();\n"); - if (reduxKernelRequiresDstArg(ctx)){ - srcbAppends(&ctx->srcGen, "\t\tX rdxA = 0;\n"); - } - srcbAppends (&ctx->srcGen, "\t\t\n"); - - for (i=ctx->ndfd;indfs;i++){ - srcbAppendf (&ctx->srcGen, "\t\tFOROVER(%d){ESCAPE(%d)\n", i, i); + if(ctx->zeroRdxAxes > 0){ + /* New reduction axis of 0 length. */ + axisInit (reduxInvGetSrcAxis(ctx, j), 0, 0); + axisMarkReduced(reduxInvGetSrcAxis(ctx, j), 0); + j++; } + ctx->ndfs = j; - srcbAppends (&ctx->srcGen, "\t\t\tS s = srcVal;\n"); /** - * Prescalar transformations go here. They transform and coerce the S-typed - * value s into the K-typed value k. + * Pass 2: Flatten out continuous dimensions, where strides and sensitivity + * allows it. */ - - srcbAppends (&ctx->srcGen, "\t\t\tK k = s;\n"); - - switch (ctx->op){ - case GA_REDUCE_SUM: - srcbAppends(&ctx->srcGen, "\t\t\trdxK += k;\n"); - break; - case GA_REDUCE_PROD: - srcbAppends(&ctx->srcGen, "\t\t\trdxK *= k;\n"); - break; - case GA_REDUCE_PRODNZ: - srcbAppends(&ctx->srcGen, "\t\t\trdxK *= k==0 ? getInitValKFn() : k;\n"); - break; - case GA_REDUCE_MIN: - srcbAppends(&ctx->srcGen, "\t\t\trdxK = min(rdxK, k);\n"); - break; - case GA_REDUCE_MAX: - srcbAppends(&ctx->srcGen, "\t\t\trdxK = max(rdxK, k);\n"); - break; - case GA_REDUCE_ARGMIN: - case GA_REDUCE_MINANDARGMIN: - srcbAppends(&ctx->srcGen, "\t\t\trdxK = min(rdxK, k);\n" - "\t\t\tif (rdxK == k){\n" - "\t\t\t\trdxA = rdxIdx;\n" - "\t\t\t}\n"); - break; - case GA_REDUCE_ARGMAX: - case GA_REDUCE_MAXANDARGMAX: - srcbAppends(&ctx->srcGen, "\t\t\trdxK = max(rdxK, k);\n" - "\t\t\tif (rdxK == k){\n" - "\t\t\t\trdxA = rdxIdx;\n" - "\t\t\t}\n"); - break; - case GA_REDUCE_AND: - srcbAppends(&ctx->srcGen, "\t\t\trdxK &= k;\n"); - break; - case GA_REDUCE_OR: - srcbAppends(&ctx->srcGen, "\t\t\trdxK |= k;\n"); - break; - case GA_REDUCE_XOR: - srcbAppends(&ctx->srcGen, "\t\t\trdxK ^= k;\n"); - break; - case GA_REDUCE_ALL: - srcbAppends(&ctx->srcGen, "\t\t\trdxK = rdxK && k;\n"); - break; - case GA_REDUCE_ANY: - srcbAppends(&ctx->srcGen, "\t\t\trdxK = rdxK || k;\n"); - break; + + k = ctx->ndfs; + isSensitive = reduxIsSensitive(ctx->op); + qsort(ctx->xdSrc, ctx->ndfs, sizeof(*ctx->xdSrc), + isSensitive ? reduxSortFlatSensitive : reduxSortFlatInsensitive); + for (i=j=1;indfs;i++){ + flatAxis = reduxInvGetSrcAxis(ctx, j-1); + sortAxis = reduxInvGetSrcAxis(ctx, i); + + if (reduxTryFlattenInto(ctx, flatAxis, sortAxis)){ + k--; + }else{ + *reduxInvGetSrcAxis(ctx, j++) = *sortAxis; + } } + ctx->ndfs = k; - for (i=ctx->ndfd;indfs;i++){ - srcbAppends(&ctx->srcGen, "\t\t}\n"); - } - srcbAppends(&ctx->srcGen, "\t\t\n"); /** - * Large code model: Postscalar transformations go here, coercing the - * K-typed value rdxK to the T-typed value rdxT + * Compute number of free and reduced dimensions. 
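+         * After the two flattening passes above, ndfr + ndfd == ndfs, and it
+         * is these flattened counts, not the original nds/ndr/ndd, that the
+         * argument-computation step below iterates over.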
*/ - srcbAppends (&ctx->srcGen, "\t\trdxT = rdxK;\n"); - - /* Final writeback. */ - srcbAppends (&ctx->srcGen, "\t\twriteBackFn("); - srcbBeginList (&ctx->srcGen, ", ", ""); - if (reduxKernelRequiresDst(ctx)){ - srcbAppendElemf(&ctx->srcGen, "&dstVal"); - srcbAppendElemf(&ctx->srcGen, "rdxT"); - } - if (reduxKernelRequiresDstArg(ctx)){ - srcbAppendElemf(&ctx->srcGen, "&dstArgVal"); - srcbAppendElemf(&ctx->srcGen, "rdxA"); + for(ctx->ndfr=ctx->ndfd=i=0;indfs;i++){ + if(axisIsReduced(reduxInvGetSrcAxis(ctx, i))){ + ctx->ndfr++; + }else{ + ctx->ndfd++; + } } - srcbEndList (&ctx->srcGen); - srcbAppends (&ctx->srcGen, ");\n"); - for (i=0;indfd;i++){ - srcbAppends(&ctx->srcGen, "\t}\n"); - } + return reduxInvComputeKArgs(ctx); } /** - * @brief Compile the kernel from source code. + * @brief Compute the arguments to the kernel. + * + * This is a multistep process and involves a lot of axis sorting on various + * criteria. */ -static int reduxCompile (redux_ctx* ctx){ - int ret, i = 0; - int PRI_TYPECODES[11]; - size_t PRI_TYPECODES_LEN; +static int reduxInvComputeKArgs (redux_ctx* ctx){ + axis_desc* axis, *prevAxis; + size_t target, aL, aLS; + int i, j; /** - * Construct Argument Typecode Lists. + * STEP 0: Default Kernel Argument Values. + * + * They should be valid for a "scalar" job. In particular, for any + * non-existent axis, assume length 1. */ - - PRI_TYPECODES[i++] = GA_BUFFER; /* srcPtr */ - PRI_TYPECODES[i++] = GA_SIZE; /* srcOff */ - PRI_TYPECODES[i++] = GA_BUFFER; /* srcSteps */ - PRI_TYPECODES[i++] = GA_BUFFER; /* srcSize */ - PRI_TYPECODES[i++] = GA_BUFFER; /* chnkSize */ - if (reduxKernelRequiresDst(ctx)){ - PRI_TYPECODES[i++] = GA_BUFFER; /* dstPtr */ - PRI_TYPECODES[i++] = GA_SIZE; /* dstOff */ - PRI_TYPECODES[i++] = GA_BUFFER; /* dstSteps */ + + ctx->phase = 0; + ctx->U = 1; + ctx->V = 1; + ctx->B = 1; + ctx->D = 1; + ctx->H = 1; + ctx->splitFree = 1; + ctx->splitReduce = 1; + ctx->xdSplit = NULL; + ctx->l = calloc(ctx->gr->nds, sizeof(*ctx->l)); + ctx->lPDim = calloc(ctx->gr->ndr, sizeof(*ctx->lPDim)); + ctx->sJ = calloc(ctx->gr->nds, sizeof(*ctx->sJ)); + ctx->dJ = calloc(ctx->gr->ndd, sizeof(*ctx->dJ)); + ctx->aJ = calloc(ctx->gr->ndd, sizeof(*ctx->aJ)); + ctx->wdOff = 0; + ctx->pdOff = 0; + ctx->waOff = 0; + ctx->paOff = 0; + ctx->ibs = calloc(ctx->gr->log2MaxL, sizeof(*ctx->ibs)); + ctx->ibp = calloc(ctx->gr->log2MaxL, sizeof(*ctx->ibp)); + ctx->iblPDim = calloc(ctx->gr->log2MaxL, sizeof(*ctx->iblPDim)); + ctx->ibsOff = calloc(ctx->gr->log2MaxL, sizeof(*ctx->ibsOff)); + ctx->ibdOff = calloc(ctx->gr->log2MaxL, sizeof(*ctx->ibdOff)); + ctx->ibaOff = calloc(ctx->gr->log2MaxL, sizeof(*ctx->ibaOff)); + ctx->bs = 1; + ctx->gs = 1; + ctx->kArgs = calloc(ctx->gr->kNumArgs, sizeof(*ctx->kArgs)); + + if(!ctx->l || !ctx->lPDim || !ctx->sJ || !ctx->dJ || + !ctx->aJ || !ctx->ibs || !ctx->ibp || !ctx->iblPDim || + !ctx->ibsOff || !ctx->ibdOff || !ctx->ibaOff || !ctx->kArgs){ + return reduxInvCleanupMsg(ctx, GA_MEMORY_ERROR, + "Failed to allocate memory for kernel invocation arguments!\n"); + } + for(i=0;igr->nds;i++){ + ctx->l[i] = 1; + } + for(i=0;igr->ndr;i++){ + ctx->lPDim[i] = 1; } - if (reduxKernelRequiresDstArg(ctx)){ - PRI_TYPECODES[i++] = GA_BUFFER; /* dstArgPtr */ - PRI_TYPECODES[i++] = GA_SIZE; /* dstArgOff */ - PRI_TYPECODES[i++] = GA_BUFFER; /* dstArgSteps */ + for(i=0;igr->log2MaxL;i++){ + ctx->ibs[i] = 1; } - PRI_TYPECODES_LEN = i; /** - * Compile the kernels. + * STEP 1: Select Intra-Block Axes. 
+ * + * Sort the axes in the order likely to maximize contiguity of source + * memory accesses, then tag them to the kernel block size limit, possibly + * splitting an axis in the process. */ - - { - ret = GpuKernel_init(&ctx->kernel, - ctx->gpuCtx, - 1, - (const char**)&ctx->sourceCode, - &ctx->sourceCodeLen, - "reduxKer", - PRI_TYPECODES_LEN, - PRI_TYPECODES, - GA_USE_CLUDA, - &ctx->errorString0); - if (ret != GA_NO_ERROR){ - return reduxCleanup(ctx, ret); + + reduxSortAxisPtrsBy(ctx->xdSrcPtrs, ctx->xdSrc, ctx->ndfs, + reduxSortPtrIBSrcRdSelect); + target = reduxGenGetMaxLocalSize(ctx->gr); + + for(i=0;indfs && igr->log2MaxL;i++){ + axis = reduxInvGetSrcSortAxis(ctx, i); + aL = axisGetLen(axis); + + if(ctx->bs*aL <= target){ + ctx->bs *= aL; + axisMarkIntraBlock(axis, i, aL); + }else{ + if(target/ctx->bs >= 2){ + aLS = target/ctx->bs; + ctx->bs *= aLS; + axisMarkIntraBlock(axis, i++, aLS); + ctx->xdSplit = axis; + } + break; } } + ctx->ndib = i; - return reduxSchedule(ctx); -} -/** - * @brief Compute a good thread block size / grid size / software chunk size - * for the primary/auxilliary kernels. - */ + /** + * STEP 2: Compute values dependent only on the intrablock axis selection. + * + * For instance, the splitFree/splitReduce factors depend only on the split + * axis, if any. + * + * The shared memory consumption and shared memory offsets depend only + * on block size. + */ -static int reduxSchedule (redux_ctx* ctx){ - int i, priNdims = 0; - uint64_t maxLgRdx = 0; - uint64_t maxLgPri = 0; - uint64_t maxLs [MAX_HW_DIMS]; - uint64_t maxGg; - uint64_t maxGs [MAX_HW_DIMS]; - uint64_t priDims[MAX_HW_DIMS]; - uint64_t bs [MAX_HW_DIMS]; - uint64_t gs [MAX_HW_DIMS]; - uint64_t cs [MAX_HW_DIMS]; - size_t warpSize, maxL; - axis_desc* axis; + ctx->splitFree = reduxInvGetSplitFree (ctx); + ctx->splitReduce = reduxInvGetSplitReduce (ctx); + ctx->SHMEM = reduxGenGetSHMEMSize (ctx->gr, ctx->bs); + ctx->pdOff = reduxGenGetSHMEMDstOff (ctx->gr, ctx->bs); + ctx->paOff = reduxGenGetSHMEMDstArgOff(ctx->gr, ctx->bs); /** - * Obtain the constraints of our problem. + * STEP 3: Compute U, B, D, H + */ + + for (i=0;indfs;i++){ + axis = reduxInvGetSrcAxis(ctx, i); + ctx->U *= axisGetInterLen(axis); + ctx->B *= axisIsReduced(axis) ? axisGetInterLen(axis) : 1; + ctx->H *= axisIsReduced(axis) ? axisGetIntraLen(axis) : 1; + } + ctx->D = ctx->bs/ctx->H; + + + /** + * STEP 4: Compute PDim values. + * + * This will be used for index calculation. */ - gpukernel_property(ctx->kernel.k, GA_KERNEL_PROP_PREFLSIZE, &warpSize); - gpukernel_property(ctx->kernel.k, GA_KERNEL_PROP_MAXLSIZE, &maxL); - maxLgRdx = maxL; - maxLgPri = maxLgRdx; - - priNdims = ctx->st1.ndh; - maxGs[0] = ctx->maxGs[0]; - maxGs[1] = ctx->maxGs[1]; - maxGs[2] = ctx->maxGs[2]; - maxGg = ctx->maxGg; - maxLs[0] = ctx->maxLs[0]; - maxLs[1] = ctx->maxLs[1]; - maxLs[2] = ctx->maxLs[2]; + reduxSortAxisPtrsBy(ctx->xdSrcPtrs, ctx->xdSrc, ctx->ndfs, + reduxSortPtrByReduxNum); for (i=0;indfs;i++){ - axis = reduxGetSrcFlatAxis(ctx, i); - if(axisIsHW(axis, 0)){ - priDims[axisGetHWAxisNum(axis, 0)] = axisGetLen(axis); + axis = reduxInvGetSrcSortAxis(ctx, i); + + if(axisIsReduced(axis)){ + if(i==0){ + axisSetPDim(axis, 1); + }else{ + prevAxis = reduxInvGetSrcSortAxis(ctx, i-1); + axisSetPDim(prevAxis, axisGetPDim(axis)*axisGetLen(prevAxis)); + } } } - - + + /** - * Apply the solver. + * STEP 5: Compute Intra-Block Permute Core. 
+ * + * Sort the axes in the order most likely to maximize contiguity of + * destination/destination argument memory accesses, then compute the + * permutation that achieves the highest-bandwidth, + * post-horizontal-reduction destination writes. */ - - { - reduxScheduleKernel(priNdims, - priDims, - warpSize, - maxLgPri, maxLs, - maxGg, maxGs, - bs, gs, cs); - for (i=0;ist1.bs[i] = bs[i]; - ctx->st1.gs[i] = gs[i]; - ctx->st1.cs[i] = cs[i]; + + reduxSortAxisPtrsBy(ctx->xdSrcPtrs, ctx->xdSrc, ctx->ndfs, + reduxInvRequiresDst(ctx) ? + reduxSortPtrIBDstWrSelect : + reduxSortPtrIBDstArgWrSelect); + for(i=0;indfs;i++){ + axis = reduxInvGetSrcSortAxis(ctx, i); + + if(axisIsIntra(axis)){ + if(i==0){ + axisSetIBP(axis, 1); + }else{ + prevAxis = reduxInvGetSrcSortAxis(ctx, i-1); + axisSetIBP(axis, axisGetIBP(prevAxis)*axisGetLen(prevAxis)); + } } - if (priNdims <= 0){ - ctx->st1.bs[i] = ctx->st1.gs[i] = ctx->st1.cs[i] = 1; + } + + /** + * STEP 6. Place the axes in final loop order and perform final placement + * of: + * lN, lPDim, sJN, dJN, aJN, + * ibs, ibp, iblPDim, ibsOff, ibdOff, ibaOff + */ + + reduxSortAxisPtrsBy(ctx->xdSrcPtrs, ctx->xdSrc, ctx->ndfs, + reduxSortPtrFinalOrder); + for(i=0,j=0;indfs;i++){ + axis = reduxInvGetSrcSortAxis(ctx, i); + + if (axisIsSplit(axis) && !axisIsReduced(axis)){ + /* Split Free Axis? */ + ctx->ibs [ 0] = axisGetIntraLen(axis); + ctx->ibp [ 0] = axisGetIntraLen(axis); + ctx->iblPDim[ 0] = axisGetIntraLen(axis); + ctx->ibsOff [ 0] = axisGetSrcStride(axis); + ctx->ibdOff [ 0] = axisGetDstStride(axis); + ctx->ibaOff [ 0] = axisGetDstArgStride(axis); + + ctx->l [ctx->gr->ndd-1] = axisGetInterLen(axis); + ctx->lPDim [ctx->gr->ndd-1] = axisGetPDim (axis); + ctx->sJ [ctx->gr->ndd-1] = 0; + ctx->dJ [ctx->gr->ndd-1] = 0; + ctx->aJ [ctx->gr->ndd-1] = 0; + }else if (axisIsSplit(axis) && axisIsReduced(axis)){ + /* Split Reduced Axis? */ + ctx->ibs [ 0] = axisGetIntraLen(axis); + ctx->ibp [ 0] = axisGetIntraLen(axis); + ctx->iblPDim[ 0] = axisGetIntraLen(axis); + ctx->ibsOff [ 0] = axisGetSrcStride(axis); + ctx->ibdOff [ 0] = axisGetDstStride(axis); + ctx->ibaOff [ 0] = axisGetDstArgStride(axis); + + ctx->l [ctx->gr->nds-1] = axisGetInterLen(axis); + ctx->lPDim [ctx->gr->nds-1] = axisGetPDim (axis); + ctx->sJ [ctx->gr->nds-1] = 0; + ctx->dJ [ctx->gr->nds-1] = 0; + ctx->aJ [ctx->gr->nds-1] = 0; + }else if (axisIsInter(axis) && !axisIsReduced(axis)){ + /* Inter Free Axis? */ + ctx->l [ j] = axisGetInterLen(axis); + ctx->lPDim [ j] = axisGetPDim (axis); + ctx->sJ [ j] = 0; + ctx->dJ [ j] = 0; + ctx->aJ [ j] = 0; + }else if (axisIsInter(axis) && axisIsReduced(axis)){ + /* Inter Reduced Axis? */ + ctx->l [ j] = axisGetInterLen(axis); + ctx->lPDim [ j] = axisGetPDim (axis); + ctx->sJ [ j] = 0; + ctx->dJ [ j] = 0; + ctx->aJ [ j] = 0; + }else{ + /* Intra Axis? */ + ctx->ibs [ 0] = axisGetIntraLen(axis); + ctx->ibp [ 0] = axisGetIntraLen(axis); + ctx->iblPDim[ 0] = axisGetIntraLen(axis); + ctx->ibsOff [ 0] = axisGetSrcStride(axis); + ctx->ibdOff [ 0] = axisGetDstStride(axis); + ctx->ibaOff [ 0] = axisGetDstArgStride(axis); } } - return reduxInvoke(ctx); + return reduxInvSchedule(ctx); } +#if 0 +static void reduxScheduleKernel (int ndims, + uint64_t* dims, + uint64_t warpSize, + uint64_t maxLg, + uint64_t* maxLs, + uint64_t maxGg, + uint64_t* maxGs, + uint64_t* bs, + uint64_t* gs, + uint64_t* cs); + /** * @brief Given the parameters of a kernel scheduling problem, solve it as * optimally as possible. 
@@ -2545,124 +3382,204 @@ static void reduxScheduleKernel (int ndims, cs[i] = gaIFLGetProduct(&factCS[i]); } } +#endif /** - * Invoke the kernel. + * @brief With nearly all parameters of the kernel computed, schedule the + * kernel for maximum performance. + * + * The thread block size has already been chosen; We only have to choose + * + * 1. ctx->gs: The grid size, which is the number of thread blocks. + * 2. ctx->V: The number of vertical reductions per thread block. + * + * Two factors drive the scheduling: + * + * 1. We want to keep all multiprocessors of the device busy; For this we use + * an estimate of the level of parallelism of the device. + * 2. If V can be chosen such that V % B == 0, then only a single kernel + * phase is necessary. + * + * Once the scheduling is performed, the workspace can be allocated and + * workspace offsets can be computed. */ -static int reduxInvoke (redux_ctx* ctx){ - void* priArgs[11]; - int ret, i = 0; - int failedDstSteps = 0; - int failedDstArgSteps = 0; - int failedAuxChunkSize = 0; +static int reduxInvSchedule (redux_ctx* ctx){ + const int flags = GA_BUFFER_READ_WRITE; + size_t WSPACESIZE; + + /** + * Get enough blocks to fill available device parallelism to capacity. + * Then, compute corresponding V. + */ + + ctx->gs = DIVIDECEIL(reduxInvEstimateParallelism(ctx), + reduxGenGetMaxLocalSize(ctx->gr)); + ctx->V = DIVIDECEIL(ctx->U, ctx->gs); + + /** + * Allocate required workspace. + */ + + ctx->wdOff = reduxGenGetSHMEMDstOff (ctx->gr, 2*ctx->gs*ctx->D); + ctx->waOff = reduxGenGetSHMEMDstArgOff(ctx->gr, 2*ctx->gs*ctx->D); + WSPACESIZE = reduxGenGetSHMEMSize (ctx->gr, 2*ctx->gs*ctx->D); + ctx->w = gpudata_alloc(ctx->gr->gpuCtx, WSPACESIZE, 0, flags, 0); + if(!ctx->w){ + return reduxInvCleanupMsg(ctx, GA_MEMORY_ERROR, + "Could not allocate %zu-byte workspace for reduction!\n", + WSPACESIZE); + } + + return reduxInvoke(ctx); +} +/** + * @brief Invoke the kernel. + */ +static int reduxInvoke (redux_ctx* ctx){ + int ret, i, k; + /** - * Argument Marshalling. This the grossest gross thing in here. + * Argument Marshalling. 
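+         *
+         * The order in which arguments are packed below must match, one for
+         * one, the formal parameter list of the generated kernel; a mismatch
+         * would silently corrupt the launch rather than fail loudly.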
*/ - - const int flags = GA_BUFFER_READ_ONLY|GA_BUFFER_INIT; - ctx->srcStepsGD = gpudata_alloc(ctx->gpuCtx, ctx->ndfs * sizeof(size_t), - ctx->flatSrcStrides, flags, 0); - ctx->srcSizeGD = gpudata_alloc(ctx->gpuCtx, ctx->ndfs * sizeof(size_t), - ctx->flatSrcDimensions, flags, 0); - ctx->st1.chunkSizeGD = gpudata_alloc(ctx->gpuCtx, ctx->st1.ndh * sizeof(size_t), - ctx->st1.cs, flags, 0); - - priArgs[i++] = (void*) ctx->flatSrcData; - priArgs[i++] = (void*)&ctx->flatSrcOffset; - priArgs[i++] = (void*) ctx->srcStepsGD; - priArgs[i++] = (void*) ctx->srcSizeGD; - priArgs[i++] = (void*) ctx->st1.chunkSizeGD; - if (reduxKernelRequiresDst (ctx)){ - ctx->dstStepsGD = gpudata_alloc(ctx->gpuCtx, ctx->ndfd * sizeof(size_t), - ctx->flatDstStrides, flags, 0); - priArgs[i++] = (void*) ctx->flatDstData; - priArgs[i++] = (void*)&ctx->flatDstOffset; - priArgs[i++] = (void*) ctx->dstStepsGD; - failedDstSteps = !ctx->dstStepsGD; - } - if (reduxKernelRequiresDstArg(ctx)){ - ctx->dstArgStepsGD = gpudata_alloc(ctx->gpuCtx, ctx->ndfd * sizeof(size_t), - ctx->flatDstArgStrides, flags, 0); - priArgs[i++] = (void*) ctx->flatDstArgData; - priArgs[i++] = (void*)&ctx->flatDstArgOffset; - priArgs[i++] = (void*) ctx->dstArgStepsGD; - failedDstArgSteps = !ctx->dstArgStepsGD; + + i = 0; + ctx->kArgs[i++] = (void*)&ctx->phase; + ctx->kArgs[i++] = (void*)&ctx->U; + ctx->kArgs[i++] = (void*)&ctx->V; + ctx->kArgs[i++] = (void*)&ctx->B; + ctx->kArgs[i++] = (void*)&ctx->D; + ctx->kArgs[i++] = (void*)&ctx->H; + ctx->kArgs[i++] = (void*)&ctx->splitFree; + ctx->kArgs[i++] = (void*)&ctx->splitReduce; + for(k=0;k < ctx->gr->nds;k++){ + ctx->kArgs[i++] = (void*)&ctx->l[k]; + } + for(k=0;k < ctx->gr->ndr && reduxInvRequiresDstArg(ctx);k++){ + ctx->kArgs[i++] = (void*)&ctx->lPDim[k]; + } + ctx->kArgs[i++] = (void*) ctx->flatSrcData; + ctx->kArgs[i++] = (void*)&ctx->flatSrcOffset; + for(k=0;k < ctx->gr->nds;k++){ + ctx->kArgs[i++] = (void*)&ctx->sJ[k]; + } + if(reduxInvRequiresDst (ctx)){ + ctx->kArgs[i++] = (void*) ctx->flatDstData; + ctx->kArgs[i++] = (void*)&ctx->flatDstOffset; + for(k=0;k < ctx->gr->ndd;k++){ + ctx->kArgs[i++] = (void*)&ctx->dJ[k]; + } + } + if(reduxInvRequiresDstArg(ctx)){ + ctx->kArgs[i++] = (void*) ctx->flatDstArgData; + ctx->kArgs[i++] = (void*)&ctx->flatDstArgOffset; + for(k=0;k < ctx->gr->ndd;k++){ + ctx->kArgs[i++] = (void*)&ctx->aJ[k]; + } + } + ctx->kArgs[i++] = (void*) ctx->w; + if(reduxInvKernelRequiresDst (ctx)){ + ctx->kArgs[i++] = (void*)&ctx->wdOff; + ctx->kArgs[i++] = (void*)&ctx->pdOff; + } + if(reduxInvKernelRequiresDstArg(ctx)){ + ctx->kArgs[i++] = (void*)&ctx->waOff; + ctx->kArgs[i++] = (void*)&ctx->paOff; + } + for(k=0;k < ctx->gr->log2MaxL;k++){ + ctx->kArgs[i++] = (void*)&ctx->ibs[k]; } + for(k=0;k < ctx->gr->log2MaxL;k++){ + ctx->kArgs[i++] = (void*)&ctx->ibp[k]; + } + for(k=0;k < ctx->gr->log2MaxL && reduxInvRequiresDstArg(ctx);k++){ + ctx->kArgs[i++] = (void*)&ctx->iblPDim[k]; + } + for(k=0;k < ctx->gr->log2MaxL;k++){ + ctx->kArgs[i++] = (void*)&ctx->ibsOff[k]; + } + for(k=0;k < ctx->gr->log2MaxL && reduxInvRequiresDst (ctx);k++){ + ctx->kArgs[i++] = (void*)&ctx->ibdOff[k]; + } + for(k=0;k < ctx->gr->log2MaxL && reduxInvRequiresDstArg(ctx);k++){ + ctx->kArgs[i++] = (void*)&ctx->ibaOff[k]; + } + /** - * One or three kernels is now invoked, depending on the code model. + * The kernel is now invoked once or twice, for phase 0 or 1. + * + * Phase 1 is sometimes optional. 
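+         * Specifically, when V is a multiple of B, every block starts and
+         * ends exactly on a destination-element boundary, no partial
+         * reductions are left behind in the workspace, and the phase-1
+         * gather is skipped.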
*/ - if (ctx->srcStepsGD && - ctx->srcSizeGD && - ctx->st1.chunkSizeGD && - !failedDstSteps && - !failedDstArgSteps && - !failedAuxChunkSize){ - /* Reduction kernel invocation */ - ret = GpuKernel_call(&ctx->kernel, - ctx->st1.ndh>0 ? ctx->st1.ndh : 1, - ctx->st1.gs, - ctx->st1.bs, - 0, - priArgs); + ctx->phase = 0; + ret = GpuKernel_call(&ctx->gr->k, 1, &ctx->gs, &ctx->bs, ctx->SHMEM, ctx->kArgs); + if (ret != GA_NO_ERROR){ + return reduxInvCleanupMsg(ctx, ret, + "Failure in kernel call, Phase 0!\n"); + } + + if(ctx->V%ctx->B != 0){ + ctx->phase = 1; + ret = GpuKernel_call(&ctx->gr->k, 1, &ctx->gs, &ctx->bs, ctx->SHMEM, ctx->kArgs); if (ret != GA_NO_ERROR){ - return reduxCleanup(ctx, ret); + return reduxInvCleanupMsg(ctx, ret, + "Failure in kernel call, Phase 1!\n"); } - - return reduxCleanup(ctx, ret); - }else{ - return reduxCleanup(ctx, GA_MEMORY_ERROR); } + + /* Success! */ + return reduxInvCleanup(ctx, GA_NO_ERROR); } /** * Cleanup */ -static int reduxCleanup (redux_ctx* ctx, int ret){ - free(ctx->flatSrcDimensions); - free(ctx->flatSrcStrides); - free(ctx->flatDstStrides); - free(ctx->flatDstArgStrides); - free(ctx->tmpDstDimensions); - free(ctx->tmpDstStrides); - free(ctx->tmpDstArgStrides); - free(ctx->sourceCode); - free(ctx->errorString0); - free(ctx->errorString1); - ctx->flatSrcDimensions = NULL; - ctx->flatSrcStrides = NULL; - ctx->flatDstStrides = NULL; - ctx->flatDstArgStrides = NULL; - ctx->tmpDstDimensions = NULL; - ctx->tmpDstStrides = NULL; - ctx->tmpDstArgStrides = NULL; - ctx->sourceCode = NULL; - ctx->errorString0 = NULL; - ctx->errorString1 = NULL; - - gpudata_release(ctx->tmpDstData); - gpudata_release(ctx->tmpDstArgData); - gpudata_release(ctx->srcStepsGD); - gpudata_release(ctx->srcSizeGD); - gpudata_release(ctx->dstStepsGD); - gpudata_release(ctx->dstArgStepsGD); - gpudata_release(ctx->st1.chunkSizeGD); - gpudata_release(ctx->st2.chunkSizeGD); - ctx->srcStepsGD = ctx->srcSizeGD = - ctx->dstStepsGD = ctx->dstArgStepsGD = - ctx->st1.chunkSizeGD = ctx->st2.chunkSizeGD = NULL; +static int reduxInvCleanup (redux_ctx* ctx, int ret){ + free(ctx->l); + free(ctx->lPDim); + free(ctx->sJ); + free(ctx->dJ); + free(ctx->aJ); + free(ctx->ibs); + free(ctx->ibp); + free(ctx->iblPDim); + free(ctx->ibsOff); + free(ctx->ibdOff); + free(ctx->ibaOff); + free(ctx->kArgs); + free(ctx->xdSrc); + free(ctx->xdSrcPtrs); + free(ctx->xdTmpPtrs); + + gpudata_release(ctx->w); + + ctx->l = NULL; + ctx->lPDim = NULL; + ctx->sJ = NULL; + ctx->dJ = NULL; + ctx->aJ = NULL; + ctx->ibs = NULL; + ctx->ibp = NULL; + ctx->iblPDim = NULL; + ctx->ibsOff = NULL; + ctx->ibdOff = NULL; + ctx->ibaOff = NULL; + ctx->kArgs = NULL; + ctx->xdSrc = NULL; + ctx->xdSrcPtrs = NULL; + ctx->xdTmpPtrs = NULL; + + ctx->w = NULL; return ret; } - -static int reduxCleanupMsg (redux_ctx* ctx, int ret, - const char* fmt, ...){ +static int reduxInvCleanupMsg (redux_ctx* ctx, int ret, + const char* fmt, ...){ #if DEBUG FILE* fp = stderr; @@ -2675,5 +3592,5 @@ static int reduxCleanupMsg (redux_ctx* ctx, int ret, (void)fmt; #endif - return reduxCleanup(ctx, ret); + return reduxInvCleanup(ctx, ret); } diff --git a/tests/check_reduction.c b/tests/check_reduction.c index 18cf9e7615..567f384aaf 100644 --- a/tests/check_reduction.c +++ b/tests/check_reduction.c @@ -77,7 +77,7 @@ START_TEST(test_maxandargmax_reduction){ size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; - const unsigned reduxList[] = {0,2}; + const int reduxList[] = {0,2}; float *pSrc = calloc(sizeof(*pSrc), prodDims); float *pMax = 
calloc(sizeof(*pMax), dims[1]); @@ -113,7 +113,12 @@ START_TEST(test_maxandargmax_reduction){ ga_assert_ok(GpuArray_memset(&gaMax, -1)); /* 0xFFFFFFFF is a qNaN. */ ga_assert_ok(GpuArray_memset(&gaArgmax, -1)); - ga_assert_ok(GpuArray_reduction(GA_REDUCE_MAXANDARGMAX, &gaMax, &gaArgmax, &gaSrc, 2, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaSrc), + GA_REDUCE_MAXANDARGMAX, 1, 2, gaSrc.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, &gaMax, &gaArgmax, &gaSrc, 2, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read(pMax, sizeof(*pMax) *dims[1], &gaMax)); ga_assert_ok(GpuArray_read(pArgmax, sizeof(*pArgmax)*dims[1], &gaArgmax)); @@ -169,7 +174,7 @@ START_TEST(test_maxandargmax_idxtranspose){ size_t prodDims = dims[0]*dims[1]*dims[2]; size_t rdxDims[1] = {50}; size_t rdxProdDims = rdxDims[0]; - const unsigned reduxList[] = {2,0}; + const int reduxList[] = {2,0}; float *pSrc = calloc(sizeof(*pSrc), prodDims); float *pMax = calloc(sizeof(*pMax), rdxProdDims); @@ -205,7 +210,12 @@ START_TEST(test_maxandargmax_idxtranspose){ ga_assert_ok(GpuArray_memset(&gaMax, -1)); /* 0xFFFFFFFF is a qNaN. */ ga_assert_ok(GpuArray_memset(&gaArgmax, -1)); - ga_assert_ok(GpuArray_reduction(GA_REDUCE_MAXANDARGMAX, &gaMax, &gaArgmax, &gaSrc, 2, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaSrc), + GA_REDUCE_MAXANDARGMAX, 1, 2, gaSrc.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, &gaMax, &gaArgmax, &gaSrc, 2, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read(pMax, sizeof(*pMax) *rdxProdDims, &gaMax)); ga_assert_ok(GpuArray_read(pArgmax, sizeof(*pArgmax)*rdxProdDims, &gaArgmax)); @@ -258,7 +268,7 @@ START_TEST(test_maxandargmax_veryhighrank){ size_t prodDims = dims[0]*dims[1]*dims[2]*dims[3]*dims[4]*dims[5]*dims[6]*dims[7]; size_t rdxDims[4] = {1171,373,1,2}; size_t rdxProdDims = rdxDims[0]*rdxDims[1]*rdxDims[2]*rdxDims[3]; - const unsigned reduxList[] = {2,4,7,5}; + const int reduxList[] = {2,4,7,5}; float *pSrc = calloc(sizeof(*pSrc), prodDims); float *pMax = calloc(sizeof(*pMax), rdxProdDims); @@ -294,7 +304,12 @@ START_TEST(test_maxandargmax_veryhighrank){ ga_assert_ok(GpuArray_memset(&gaMax, -1)); /* 0xFFFFFFFF is a qNaN. */ ga_assert_ok(GpuArray_memset(&gaArgmax, -1)); - ga_assert_ok(GpuArray_reduction(GA_REDUCE_MAXANDARGMAX, &gaMax, &gaArgmax, &gaSrc, 4, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaSrc), + GA_REDUCE_MAXANDARGMAX, 4, 4, gaSrc.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, &gaMax, &gaArgmax, &gaSrc, 4, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read(pMax, sizeof(*pMax) *rdxProdDims, &gaMax)); ga_assert_ok(GpuArray_read(pArgmax, sizeof(*pArgmax)*rdxProdDims, &gaArgmax)); @@ -357,7 +372,7 @@ START_TEST(test_maxandargmax_alldimsreduced){ size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; - const unsigned reduxList[] = {0,1,2}; + const int reduxList[] = {0,1,2}; float *pSrc = calloc(sizeof(*pSrc), prodDims); float *pMax = calloc(1, sizeof(*pMax)); @@ -393,7 +408,12 @@ START_TEST(test_maxandargmax_alldimsreduced){ ga_assert_ok(GpuArray_memset(&gaMax, -1)); /* 0xFFFFFFFF is a qNaN. 
*/ ga_assert_ok(GpuArray_memset(&gaArgmax, -1)); - ga_assert_ok(GpuArray_reduction(GA_REDUCE_MAXANDARGMAX, &gaMax, &gaArgmax, &gaSrc, 3, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaSrc), + GA_REDUCE_MAXANDARGMAX, 0, 3, gaSrc.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, &gaMax, &gaArgmax, &gaSrc, 3, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read(pMax, sizeof(*pMax), &gaMax)); ga_assert_ok(GpuArray_read(pArgmax, sizeof(*pArgmax), &gaArgmax)); @@ -445,7 +465,7 @@ START_TEST(test_minandargmin_reduction){ size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; - const unsigned reduxList[] = {0,2}; + const int reduxList[] = {0,2}; float* pSrc = calloc(1, sizeof(*pSrc) * dims[0]*dims[1]*dims[2]); float* pMin = calloc(1, sizeof(*pMin) * dims[1] ); @@ -481,7 +501,12 @@ START_TEST(test_minandargmin_reduction){ ga_assert_ok(GpuArray_memset(&gaMin, -1)); /* 0xFFFFFFFF is a qNaN. */ ga_assert_ok(GpuArray_memset(&gaArgmin, -1)); - ga_assert_ok(GpuArray_reduction(GA_REDUCE_MINANDARGMIN, &gaMin, &gaArgmin, &gaSrc, 2, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaSrc), + GA_REDUCE_MINANDARGMIN, 1, 2, gaSrc.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, &gaMin, &gaArgmin, &gaSrc, 2, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read(pMin, sizeof(*pMin) *dims[1], &gaMin)); ga_assert_ok(GpuArray_read(pArgmin, sizeof(*pArgmin)*dims[1], &gaArgmin)); @@ -534,7 +559,7 @@ START_TEST(test_minandargmin_veryhighrank){ size_t prodDims = dims[0]*dims[1]*dims[2]*dims[3]*dims[4]*dims[5]*dims[6]*dims[7]; size_t rdxDims[4] = {1171,373,1,2}; size_t rdxProdDims = rdxDims[0]*rdxDims[1]*rdxDims[2]*rdxDims[3]; - const unsigned reduxList[] = {2,4,7,5}; + const int reduxList[] = {2,4,7,5}; float* pSrc = calloc(1, sizeof(*pSrc) * prodDims); float* pMin = calloc(1, sizeof(*pMin) * rdxProdDims); @@ -570,7 +595,12 @@ START_TEST(test_minandargmin_veryhighrank){ ga_assert_ok(GpuArray_memset(&gaMin, -1)); /* 0xFFFFFFFF is a qNaN. */ ga_assert_ok(GpuArray_memset(&gaArgmin, -1)); - ga_assert_ok(GpuArray_reduction(GA_REDUCE_MINANDARGMIN, &gaMin, &gaArgmin, &gaSrc, 4, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaSrc), + GA_REDUCE_MINANDARGMIN, 4, 4, gaSrc.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, &gaMin, &gaArgmin, &gaSrc, 4, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read(pMin, sizeof(*pMin) *rdxProdDims, &gaMin)); ga_assert_ok(GpuArray_read(pArgmin, sizeof(*pArgmin)*rdxProdDims, &gaArgmin)); @@ -633,7 +663,7 @@ START_TEST(test_minandargmin_alldimsreduced){ size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; - const unsigned reduxList[] = {0,1,2}; + const int reduxList[] = {0,1,2}; float* pSrc = calloc(1, sizeof(*pSrc) * dims[0]*dims[1]*dims[2]); float* pMin = calloc(1, sizeof(*pMin) ); @@ -669,7 +699,12 @@ START_TEST(test_minandargmin_alldimsreduced){ ga_assert_ok(GpuArray_memset(&gaMin, -1)); /* 0xFFFFFFFF is a qNaN. 
*/ ga_assert_ok(GpuArray_memset(&gaArgmin, -1)); - ga_assert_ok(GpuArray_reduction(GA_REDUCE_MINANDARGMIN, &gaMin, &gaArgmin, &gaSrc, 3, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaSrc), + GA_REDUCE_MINANDARGMIN, 0, 3, gaSrc.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, &gaMin, &gaArgmin, &gaSrc, 3, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read(pMin, sizeof(*pMin), &gaMin)); ga_assert_ok(GpuArray_read(pArgmin, sizeof(*pArgmin), &gaArgmin)); @@ -721,7 +756,7 @@ START_TEST(test_argmax_reduction){ size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; - const unsigned reduxList[] = {0,2}; + const int reduxList[] = {0,2}; float* pSrc = calloc(1, sizeof(*pSrc) * dims[0]*dims[1]*dims[2]); float* pMax = calloc(1, sizeof(*pMax) * dims[1] ); @@ -754,7 +789,12 @@ START_TEST(test_argmax_reduction){ ga_assert_ok(GpuArray_write(&gaSrc, pSrc, sizeof(*pSrc)*prodDims)); ga_assert_ok(GpuArray_memset(&gaArgmax, -1)); - ga_assert_ok(GpuArray_reduction(GA_REDUCE_ARGMAX, NULL, &gaArgmax, &gaSrc, 2, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaSrc), + GA_REDUCE_ARGMAX, 1, 2, gaSrc.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, NULL, &gaArgmax, &gaSrc, 2, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read(pArgmax, sizeof(*pArgmax)*dims[1], &gaArgmax)); @@ -804,7 +844,7 @@ START_TEST(test_argmax_veryhighrank){ size_t prodDims = dims[0]*dims[1]*dims[2]*dims[3]*dims[4]*dims[5]*dims[6]*dims[7]; size_t rdxDims[4] = {1171,373,1,2}; size_t rdxProdDims = rdxDims[0]*rdxDims[1]*rdxDims[2]*rdxDims[3]; - const unsigned reduxList[] = {2,4,7,5}; + const int reduxList[] = {2,4,7,5}; float* pSrc = calloc(1, sizeof(*pSrc) * prodDims); float* pMax = calloc(1, sizeof(*pMax) * rdxProdDims); @@ -836,7 +876,12 @@ START_TEST(test_argmax_veryhighrank){ ga_assert_ok(GpuArray_write(&gaSrc, pSrc, sizeof(*pSrc)*prodDims)); ga_assert_ok(GpuArray_memset(&gaArgmax, -1)); - ga_assert_ok(GpuArray_reduction(GA_REDUCE_ARGMAX, NULL, &gaArgmax, &gaSrc, 4, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaSrc), + GA_REDUCE_ARGMAX, 4, 4, gaSrc.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, NULL, &gaArgmax, &gaSrc, 4, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read(pArgmax, sizeof(*pArgmax)*rdxProdDims, &gaArgmax)); @@ -896,7 +941,7 @@ START_TEST(test_argmax_alldimsreduced){ size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; - const unsigned reduxList[] = {0,1,2}; + const int reduxList[] = {0,1,2}; float* pSrc = calloc(1, sizeof(*pSrc) * dims[0]*dims[1]*dims[2]); float* pMax = calloc(1, sizeof(*pMax) ); @@ -929,7 +974,12 @@ START_TEST(test_argmax_alldimsreduced){ ga_assert_ok(GpuArray_write(&gaSrc, pSrc, sizeof(*pSrc)*prodDims)); ga_assert_ok(GpuArray_memset(&gaArgmax, -1)); - ga_assert_ok(GpuArray_reduction(GA_REDUCE_ARGMAX, NULL, &gaArgmax, &gaSrc, 3, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaSrc), + GA_REDUCE_ARGMAX, 0, 3, gaSrc.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, NULL, &gaArgmax, &gaSrc, 3, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read(pArgmax, sizeof(*pArgmax), &gaArgmax)); @@ -978,7 +1028,7 @@ START_TEST(test_argmin_reduction){ size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; - const unsigned 
reduxList[] = {0,2}; + const int reduxList[] = {0,2}; float* pSrc = calloc(1, sizeof(*pSrc) * dims[0]*dims[1]*dims[2]); float* pMin = calloc(1, sizeof(*pMin) * dims[1] ); @@ -1011,7 +1061,12 @@ START_TEST(test_argmin_reduction){ ga_assert_ok(GpuArray_write(&gaSrc, pSrc, sizeof(*pSrc)*prodDims)); ga_assert_ok(GpuArray_memset(&gaArgmin, -1)); - ga_assert_ok(GpuArray_reduction(GA_REDUCE_ARGMIN, NULL, &gaArgmin, &gaSrc, 2, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaSrc), + GA_REDUCE_ARGMIN, 1, 2, gaSrc.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, NULL, &gaArgmin, &gaSrc, 2, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read(pArgmin, sizeof(*pArgmin)*dims[1], &gaArgmin)); @@ -1061,7 +1116,7 @@ START_TEST(test_argmin_veryhighrank){ size_t prodDims = dims[0]*dims[1]*dims[2]*dims[3]*dims[4]*dims[5]*dims[6]*dims[7]; size_t rdxDims[4] = {1171,373,1,2}; size_t rdxProdDims = rdxDims[0]*rdxDims[1]*rdxDims[2]*rdxDims[3]; - const unsigned reduxList[] = {2,4,7,5}; + const int reduxList[] = {2,4,7,5}; float* pSrc = calloc(1, sizeof(*pSrc) * prodDims); float* pMin = calloc(1, sizeof(*pMin) * rdxProdDims); @@ -1093,7 +1148,12 @@ START_TEST(test_argmin_veryhighrank){ ga_assert_ok(GpuArray_write(&gaSrc, pSrc, sizeof(*pSrc)*prodDims)); ga_assert_ok(GpuArray_memset(&gaArgmin, -1)); - ga_assert_ok(GpuArray_reduction(GA_REDUCE_ARGMIN, NULL, &gaArgmin, &gaSrc, 4, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaSrc), + GA_REDUCE_ARGMIN, 4, 4, gaSrc.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, NULL, &gaArgmin, &gaSrc, 4, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read(pArgmin, sizeof(*pArgmin)*rdxProdDims, &gaArgmin)); @@ -1153,7 +1213,7 @@ START_TEST(test_argmin_alldimsreduced){ size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; - const unsigned reduxList[] = {0,1,2}; + const int reduxList[] = {0,1,2}; float* pSrc = calloc(1, sizeof(*pSrc) * dims[0]*dims[1]*dims[2]); float* pMin = calloc(1, sizeof(*pMin) ); @@ -1186,7 +1246,12 @@ START_TEST(test_argmin_alldimsreduced){ ga_assert_ok(GpuArray_write(&gaSrc, pSrc, sizeof(*pSrc)*prodDims)); ga_assert_ok(GpuArray_memset(&gaArgmin, -1)); - ga_assert_ok(GpuArray_reduction(GA_REDUCE_ARGMIN, NULL, &gaArgmin, &gaSrc, 3, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaSrc), + GA_REDUCE_ARGMIN, 0, 3, gaSrc.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, NULL, &gaArgmin, &gaSrc, 3, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read(pArgmin, sizeof(*pArgmin), &gaArgmin)); @@ -1234,7 +1299,7 @@ START_TEST(test_max_reduction){ size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; - const unsigned reduxList[] = {0,2}; + const int reduxList[] = {0,2}; float* pSrc = calloc(1, sizeof(*pSrc) * dims[0]*dims[1]*dims[2]); float* pMax = calloc(1, sizeof(*pMax) * dims[1] ); @@ -1265,7 +1330,12 @@ START_TEST(test_max_reduction){ ga_assert_ok(GpuArray_write(&gaSrc, pSrc, sizeof(*pSrc)*prodDims)); ga_assert_ok(GpuArray_memset(&gaMax, -1)); /* 0xFFFFFFFF is a qNaN. 
*/ - ga_assert_ok(GpuArray_reduction(GA_REDUCE_MAX, &gaMax, NULL, &gaSrc, 2, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaSrc), + GA_REDUCE_MAX, 1, 2, gaSrc.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, &gaMax, NULL, &gaSrc, 2, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read(pMax, sizeof(*pMax) *dims[1], &gaMax)); @@ -1312,7 +1382,7 @@ START_TEST(test_max_veryhighrank){ size_t prodDims = dims[0]*dims[1]*dims[2]*dims[3]*dims[4]*dims[5]*dims[6]*dims[7]; size_t rdxDims[4] = {1171,373,1,2}; size_t rdxProdDims = rdxDims[0]*rdxDims[1]*rdxDims[2]*rdxDims[3]; - const unsigned reduxList[] = {2,4,7,5}; + const int reduxList[] = {2,4,7,5}; float* pSrc = calloc(1, sizeof(*pSrc) * prodDims); float* pMax = calloc(1, sizeof(*pMax) * rdxProdDims); @@ -1343,7 +1413,12 @@ START_TEST(test_max_veryhighrank){ ga_assert_ok(GpuArray_write(&gaSrc, pSrc, sizeof(*pSrc)*prodDims)); ga_assert_ok(GpuArray_memset(&gaMax, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_reduction(GA_REDUCE_MAX, &gaMax, NULL, &gaSrc, 4, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaSrc), + GA_REDUCE_MAX, 4, 4, gaSrc.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, &gaMax, NULL, &gaSrc, 4, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read(pMax, sizeof(*pMax) *rdxProdDims, &gaMax)); @@ -1400,7 +1475,7 @@ START_TEST(test_max_alldimsreduced){ size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; - const unsigned reduxList[] = {0,1,2}; + const int reduxList[] = {0,1,2}; float* pSrc = calloc(1, sizeof(*pSrc) * dims[0]*dims[1]*dims[2]); float* pMax = calloc(1, sizeof(*pMax) ); @@ -1431,7 +1506,12 @@ START_TEST(test_max_alldimsreduced){ ga_assert_ok(GpuArray_write(&gaSrc, pSrc, sizeof(*pSrc)*prodDims)); ga_assert_ok(GpuArray_memset(&gaMax, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_reduction(GA_REDUCE_MAX, &gaMax, NULL, &gaSrc, 3, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaSrc), + GA_REDUCE_MAX, 0, 3, gaSrc.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, &gaMax, NULL, &gaSrc, 3, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read(pMax, sizeof(*pMax), &gaMax)); @@ -1476,7 +1556,7 @@ START_TEST(test_min_reduction){ size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; - const unsigned reduxList[] = {0,2}; + const int reduxList[] = {0,2}; float* pSrc = calloc(1, sizeof(*pSrc) * dims[0]*dims[1]*dims[2]); float* pMin = calloc(1, sizeof(*pMin) * dims[1] ); @@ -1507,7 +1587,12 @@ START_TEST(test_min_reduction){ ga_assert_ok(GpuArray_write(&gaSrc, pSrc, sizeof(*pSrc)*prodDims)); ga_assert_ok(GpuArray_memset(&gaMin, -1)); /* 0xFFFFFFFF is a qNaN. 
*/ - ga_assert_ok(GpuArray_reduction(GA_REDUCE_MIN, &gaMin, NULL, &gaSrc, 2, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaSrc), + GA_REDUCE_MIN, 1, 2, gaSrc.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, &gaMin, NULL, &gaSrc, 2, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read(pMin, sizeof(*pMin) *dims[1], &gaMin)); @@ -1554,7 +1639,7 @@ START_TEST(test_min_veryhighrank){ size_t prodDims = dims[0]*dims[1]*dims[2]*dims[3]*dims[4]*dims[5]*dims[6]*dims[7]; size_t rdxDims[4] = {1171,373,1,2}; size_t rdxProdDims = rdxDims[0]*rdxDims[1]*rdxDims[2]*rdxDims[3]; - const unsigned reduxList[] = {2,4,7,5}; + const int reduxList[] = {2,4,7,5}; float* pSrc = calloc(1, sizeof(*pSrc) * prodDims); float* pMin = calloc(1, sizeof(*pMin) * rdxProdDims); @@ -1585,7 +1670,12 @@ START_TEST(test_min_veryhighrank){ ga_assert_ok(GpuArray_write(&gaSrc, pSrc, sizeof(*pSrc)*prodDims)); ga_assert_ok(GpuArray_memset(&gaMin, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_reduction(GA_REDUCE_MIN, &gaMin, NULL, &gaSrc, 4, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaSrc), + GA_REDUCE_MIN, 4, 4, gaSrc.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, &gaMin, NULL, &gaSrc, 4, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read(pMin, sizeof(*pMin) *rdxProdDims, &gaMin)); @@ -1642,7 +1732,7 @@ START_TEST(test_min_alldimsreduced){ size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; - const unsigned reduxList[] = {0,1,2}; + const int reduxList[] = {0,1,2}; float* pSrc = calloc(1, sizeof(*pSrc) * dims[0]*dims[1]*dims[2]); float* pMin = calloc(1, sizeof(*pMin) ); @@ -1673,7 +1763,12 @@ START_TEST(test_min_alldimsreduced){ ga_assert_ok(GpuArray_write(&gaSrc, pSrc, sizeof(*pSrc)*prodDims)); ga_assert_ok(GpuArray_memset(&gaMin, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_reduction(GA_REDUCE_MIN, &gaMin, NULL, &gaSrc, 3, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaSrc), + GA_REDUCE_MIN, 0, 3, gaSrc.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, &gaMin, NULL, &gaSrc, 3, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read(pMin, sizeof(*pMin), &gaMin)); @@ -1718,7 +1813,7 @@ START_TEST(test_sum_reduction){ size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; - const unsigned reduxList[] = {0,2}; + const int reduxList[] = {0,2}; const float TOL = 1e-5; float* pS = calloc(1, sizeof(*pS) * dims[0]*dims[1]*dims[2]); @@ -1750,7 +1845,12 @@ START_TEST(test_sum_reduction){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. 
*/ - ga_assert_ok(GpuArray_reduction(GA_REDUCE_SUM, &gaD, NULL, &gaS, 2, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaS), + GA_REDUCE_SUM, 1, 2, gaS.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, &gaD, NULL, &gaS, 2, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*dims[1], &gaD)); @@ -1794,7 +1894,7 @@ START_TEST(test_sum_veryhighrank){ size_t prodDims = dims[0]*dims[1]*dims[2]*dims[3]*dims[4]*dims[5]*dims[6]*dims[7]; size_t rdxDims[4] = {1171,373,1,2}; size_t rdxProdDims = rdxDims[0]*rdxDims[1]*rdxDims[2]*rdxDims[3]; - const unsigned reduxList[] = {2,4,7,5}; + const int reduxList[] = {2,4,7,5}; const float TOL = 1e-5; float* pS = calloc(1, sizeof(*pS) * prodDims); @@ -1826,7 +1926,12 @@ START_TEST(test_sum_veryhighrank){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_reduction(GA_REDUCE_SUM, &gaD, NULL, &gaS, 4, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaS), + GA_REDUCE_SUM, 4, 4, gaS.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, &gaD, NULL, &gaS, 4, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*rdxProdDims, &gaD)); @@ -1880,7 +1985,7 @@ START_TEST(test_sum_alldimsreduced){ size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; - const unsigned reduxList[] = {0,1,2}; + const int reduxList[] = {0,1,2}; const float TOL = 1e-5; float* pS = calloc(1, sizeof(*pS) * dims[0]*dims[1]*dims[2]); @@ -1912,7 +2017,12 @@ START_TEST(test_sum_alldimsreduced){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_reduction(GA_REDUCE_SUM, &gaD, NULL, &gaS, 3, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaS), + GA_REDUCE_SUM, 0, 3, gaS.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, &gaD, NULL, &gaS, 3, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read (pD, sizeof(*pD), &gaD)); @@ -1954,7 +2064,7 @@ START_TEST(test_prod_reduction){ size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; - const unsigned reduxList[] = {0,2}; + const int reduxList[] = {0,2}; const float TOL = 1e-5; float* pS = calloc(1, sizeof(*pS) * dims[0]*dims[1]*dims[2]); @@ -1986,7 +2096,12 @@ START_TEST(test_prod_reduction){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. 
*/ - ga_assert_ok(GpuArray_reduction(GA_REDUCE_PROD, &gaD, NULL, &gaS, 2, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaS), + GA_REDUCE_PROD, 1, 2, gaS.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, &gaD, NULL, &gaS, 2, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*dims[1], &gaD)); @@ -2030,7 +2145,7 @@ START_TEST(test_prod_veryhighrank){ size_t prodDims = dims[0]*dims[1]*dims[2]*dims[3]*dims[4]*dims[5]*dims[6]*dims[7]; size_t rdxDims[4] = {1171,373,1,2}; size_t rdxProdDims = rdxDims[0]*rdxDims[1]*rdxDims[2]*rdxDims[3]; - const unsigned reduxList[] = {2,4,7,5}; + const int reduxList[] = {2,4,7,5}; const float TOL = 1e-5; float* pS = calloc(1, sizeof(*pS) * prodDims); @@ -2062,7 +2177,12 @@ START_TEST(test_prod_veryhighrank){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_reduction(GA_REDUCE_PROD, &gaD, NULL, &gaS, 4, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaS), + GA_REDUCE_PROD, 4, 4, gaS.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, &gaD, NULL, &gaS, 4, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*rdxProdDims, &gaD)); @@ -2116,7 +2236,7 @@ START_TEST(test_prod_alldimsreduced){ size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; - const unsigned reduxList[] = {0,1,2}; + const int reduxList[] = {0,1,2}; const float TOL = 1e-5; float* pS = calloc(1, sizeof(*pS) * dims[0]*dims[1]*dims[2]); @@ -2148,7 +2268,12 @@ START_TEST(test_prod_alldimsreduced){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_reduction(GA_REDUCE_PROD, &gaD, NULL, &gaS, 3, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaS), + GA_REDUCE_PROD, 0, 3, gaS.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, &gaD, NULL, &gaS, 3, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read (pD, sizeof(*pD), &gaD)); @@ -2190,7 +2315,7 @@ START_TEST(test_prodnz_reduction){ size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; - const unsigned reduxList[] = {0,2}; + const int reduxList[] = {0,2}; const float TOL = 1e-5; float* pS = calloc(1, sizeof(*pS) * dims[0]*dims[1]*dims[2]); @@ -2225,7 +2350,12 @@ START_TEST(test_prodnz_reduction){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. 
*/ - ga_assert_ok(GpuArray_reduction(GA_REDUCE_PRODNZ, &gaD, NULL, &gaS, 2, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaS), + GA_REDUCE_PRODNZ, 1, 2, gaS.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, &gaD, NULL, &gaS, 2, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*dims[1], &gaD)); @@ -2269,7 +2399,7 @@ START_TEST(test_prodnz_veryhighrank){ size_t prodDims = dims[0]*dims[1]*dims[2]*dims[3]*dims[4]*dims[5]*dims[6]*dims[7]; size_t rdxDims[4] = {1171,373,1,2}; size_t rdxProdDims = rdxDims[0]*rdxDims[1]*rdxDims[2]*rdxDims[3]; - const unsigned reduxList[] = {2,4,7,5}; + const int reduxList[] = {2,4,7,5}; const float TOL = 1e-5; float* pS = calloc(1, sizeof(*pS) * prodDims); @@ -2304,7 +2434,12 @@ START_TEST(test_prodnz_veryhighrank){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_reduction(GA_REDUCE_PRODNZ, &gaD, NULL, &gaS, 4, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaS), + GA_REDUCE_PRODNZ, 4, 4, gaS.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, &gaD, NULL, &gaS, 4, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*rdxProdDims, &gaD)); @@ -2358,7 +2493,7 @@ START_TEST(test_prodnz_alldimsreduced){ size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; - const unsigned reduxList[] = {0,1,2}; + const int reduxList[] = {0,1,2}; const float TOL = 1e-5; float* pS = calloc(1, sizeof(*pS) * dims[0]*dims[1]*dims[2]); @@ -2393,7 +2528,12 @@ START_TEST(test_prodnz_alldimsreduced){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. 
*/ - ga_assert_ok(GpuArray_reduction(GA_REDUCE_PRODNZ, &gaD, NULL, &gaS, 3, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaS), + GA_REDUCE_PRODNZ, 0, 3, gaS.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, &gaD, NULL, &gaS, 3, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read (pD, sizeof(*pD), &gaD)); @@ -2435,7 +2575,7 @@ START_TEST(test_and_reduction){ size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; - const unsigned reduxList[] = {0,2}; + const int reduxList[] = {0,2}; uint32_t* pS = calloc(1, sizeof(*pS) * dims[0]*dims[1]*dims[2]); uint32_t* pD = calloc(1, sizeof(*pD) * dims[1] ); @@ -2475,7 +2615,12 @@ START_TEST(test_and_reduction){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); - ga_assert_ok(GpuArray_reduction(GA_REDUCE_AND, &gaD, NULL, &gaS, 2, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaS), + GA_REDUCE_AND, 1, 2, gaS.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, &gaD, NULL, &gaS, 2, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*dims[1], &gaD)); @@ -2519,7 +2664,7 @@ START_TEST(test_and_veryhighrank){ size_t prodDims = dims[0]*dims[1]*dims[2]*dims[3]*dims[4]*dims[5]*dims[6]*dims[7]; size_t rdxDims[4] = {1171,373,1,2}; size_t rdxProdDims = rdxDims[0]*rdxDims[1]*rdxDims[2]*rdxDims[3]; - const unsigned reduxList[] = {2,4,7,5}; + const int reduxList[] = {2,4,7,5}; uint32_t* pS = calloc(1, sizeof(*pS) * prodDims); uint32_t* pD = calloc(1, sizeof(*pD) * rdxProdDims); @@ -2559,7 +2704,12 @@ START_TEST(test_and_veryhighrank){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_reduction(GA_REDUCE_AND, &gaD, NULL, &gaS, 4, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaS), + GA_REDUCE_AND, 4, 4, gaS.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, &gaD, NULL, &gaS, 4, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*rdxProdDims, &gaD)); @@ -2613,7 +2763,7 @@ START_TEST(test_and_alldimsreduced){ size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; - const unsigned reduxList[] = {0,1,2}; + const int reduxList[] = {0,1,2}; uint32_t* pS = calloc(1, sizeof(*pS) * dims[0]*dims[1]*dims[2]); uint32_t* pD = calloc(1, sizeof(*pD) ); @@ -2653,7 +2803,12 @@ START_TEST(test_and_alldimsreduced){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. 
*/ - ga_assert_ok(GpuArray_reduction(GA_REDUCE_AND, &gaD, NULL, &gaS, 3, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaS), + GA_REDUCE_AND, 0, 3, gaS.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, &gaD, NULL, &gaS, 3, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read (pD, sizeof(*pD), &gaD)); @@ -2695,7 +2850,7 @@ START_TEST(test_or_reduction){ size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; - const unsigned reduxList[] = {0,2}; + const int reduxList[] = {0,2}; uint32_t* pS = calloc(1, sizeof(*pS) * dims[0]*dims[1]*dims[2]); uint32_t* pD = calloc(1, sizeof(*pD) * dims[1] ); @@ -2735,7 +2890,12 @@ START_TEST(test_or_reduction){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); - ga_assert_ok(GpuArray_reduction(GA_REDUCE_OR, &gaD, NULL, &gaS, 2, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaS), + GA_REDUCE_OR, 1, 2, gaS.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, &gaD, NULL, &gaS, 2, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*dims[1], &gaD)); @@ -2779,7 +2939,7 @@ START_TEST(test_or_veryhighrank){ size_t prodDims = dims[0]*dims[1]*dims[2]*dims[3]*dims[4]*dims[5]*dims[6]*dims[7]; size_t rdxDims[4] = {1171,373,1,2}; size_t rdxProdDims = rdxDims[0]*rdxDims[1]*rdxDims[2]*rdxDims[3]; - const unsigned reduxList[] = {2,4,7,5}; + const int reduxList[] = {2,4,7,5}; uint32_t* pS = calloc(1, sizeof(*pS) * prodDims); uint32_t* pD = calloc(1, sizeof(*pD) * rdxProdDims); @@ -2819,7 +2979,12 @@ START_TEST(test_or_veryhighrank){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_reduction(GA_REDUCE_OR, &gaD, NULL, &gaS, 4, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaS), + GA_REDUCE_OR, 4, 4, gaS.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, &gaD, NULL, &gaS, 4, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*rdxProdDims, &gaD)); @@ -2873,7 +3038,7 @@ START_TEST(test_or_alldimsreduced){ size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; - const unsigned reduxList[] = {0,1,2}; + const int reduxList[] = {0,1,2}; uint32_t* pS = calloc(1, sizeof(*pS) * dims[0]*dims[1]*dims[2]); uint32_t* pD = calloc(1, sizeof(*pD) ); @@ -2913,7 +3078,12 @@ START_TEST(test_or_alldimsreduced){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. 
*/ - ga_assert_ok(GpuArray_reduction(GA_REDUCE_OR, &gaD, NULL, &gaS, 3, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaS), + GA_REDUCE_OR, 0, 3, gaS.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, &gaD, NULL, &gaS, 3, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read (pD, sizeof(*pD), &gaD)); @@ -2955,7 +3125,7 @@ START_TEST(test_xor_reduction){ size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; - const unsigned reduxList[] = {0,2}; + const int reduxList[] = {0,2}; uint32_t* pS = calloc(1, sizeof(*pS) * dims[0]*dims[1]*dims[2]); uint32_t* pD = calloc(1, sizeof(*pD) * dims[1] ); @@ -2991,7 +3161,12 @@ START_TEST(test_xor_reduction){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); - ga_assert_ok(GpuArray_reduction(GA_REDUCE_XOR, &gaD, NULL, &gaS, 2, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaS), + GA_REDUCE_XOR, 1, 2, gaS.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, &gaD, NULL, &gaS, 2, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*dims[1], &gaD)); @@ -3035,7 +3210,7 @@ START_TEST(test_xor_veryhighrank){ size_t prodDims = dims[0]*dims[1]*dims[2]*dims[3]*dims[4]*dims[5]*dims[6]*dims[7]; size_t rdxDims[4] = {1171,373,1,2}; size_t rdxProdDims = rdxDims[0]*rdxDims[1]*rdxDims[2]*rdxDims[3]; - const unsigned reduxList[] = {2,4,7,5}; + const int reduxList[] = {2,4,7,5}; uint32_t* pS = calloc(1, sizeof(*pS) * prodDims); uint32_t* pD = calloc(1, sizeof(*pD) * rdxProdDims); @@ -3071,7 +3246,12 @@ START_TEST(test_xor_veryhighrank){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_reduction(GA_REDUCE_XOR, &gaD, NULL, &gaS, 4, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaS), + GA_REDUCE_XOR, 4, 4, gaS.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, &gaD, NULL, &gaS, 4, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*rdxProdDims, &gaD)); @@ -3125,7 +3305,7 @@ START_TEST(test_xor_alldimsreduced){ size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; - const unsigned reduxList[] = {0,1,2}; + const int reduxList[] = {0,1,2}; uint32_t* pS = calloc(1, sizeof(*pS) * dims[0]*dims[1]*dims[2]); uint32_t* pD = calloc(1, sizeof(*pD) ); @@ -3161,7 +3341,12 @@ START_TEST(test_xor_alldimsreduced){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. 
*/ - ga_assert_ok(GpuArray_reduction(GA_REDUCE_XOR, &gaD, NULL, &gaS, 3, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaS), + GA_REDUCE_XOR, 0, 3, gaS.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, &gaD, NULL, &gaS, 3, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read (pD, sizeof(*pD), &gaD)); @@ -3203,7 +3388,7 @@ START_TEST(test_any_reduction){ size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; - const unsigned reduxList[] = {0,2}; + const int reduxList[] = {0,2}; uint32_t* pS = calloc(1, sizeof(*pS) * dims[0]*dims[1]*dims[2]); uint32_t* pD = calloc(1, sizeof(*pD) * dims[1] ); @@ -3239,7 +3424,12 @@ START_TEST(test_any_reduction){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); - ga_assert_ok(GpuArray_reduction(GA_REDUCE_ANY, &gaD, NULL, &gaS, 2, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaS), + GA_REDUCE_ANY, 1, 2, gaS.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, &gaD, NULL, &gaS, 2, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*dims[1], &gaD)); @@ -3283,7 +3473,7 @@ START_TEST(test_any_veryhighrank){ size_t prodDims = dims[0]*dims[1]*dims[2]*dims[3]*dims[4]*dims[5]*dims[6]*dims[7]; size_t rdxDims[4] = {1171,373,1,2}; size_t rdxProdDims = rdxDims[0]*rdxDims[1]*rdxDims[2]*rdxDims[3]; - const unsigned reduxList[] = {2,4,7,5}; + const int reduxList[] = {2,4,7,5}; uint32_t* pS = calloc(1, sizeof(*pS) * prodDims); uint32_t* pD = calloc(1, sizeof(*pD) * rdxProdDims); @@ -3319,7 +3509,12 @@ START_TEST(test_any_veryhighrank){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_reduction(GA_REDUCE_ANY, &gaD, NULL, &gaS, 4, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaS), + GA_REDUCE_ANY, 4, 4, gaS.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, &gaD, NULL, &gaS, 4, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*rdxProdDims, &gaD)); @@ -3373,7 +3568,7 @@ START_TEST(test_any_alldimsreduced){ size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; - const unsigned reduxList[] = {0,1,2}; + const int reduxList[] = {0,1,2}; uint32_t* pS = calloc(1, sizeof(*pS) * dims[0]*dims[1]*dims[2]); uint32_t* pD = calloc(1, sizeof(*pD) ); @@ -3409,7 +3604,12 @@ START_TEST(test_any_alldimsreduced){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. 
*/ - ga_assert_ok(GpuArray_reduction(GA_REDUCE_ANY, &gaD, NULL, &gaS, 3, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaS), + GA_REDUCE_ANY, 0, 3, gaS.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, &gaD, NULL, &gaS, 3, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read (pD, sizeof(*pD), &gaD)); @@ -3451,7 +3651,7 @@ START_TEST(test_all_reduction){ size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; - const unsigned reduxList[] = {0,2}; + const int reduxList[] = {0,2}; uint32_t* pS = calloc(1, sizeof(*pS) * dims[0]*dims[1]*dims[2]); uint32_t* pD = calloc(1, sizeof(*pD) * dims[1] ); @@ -3487,7 +3687,12 @@ START_TEST(test_all_reduction){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); - ga_assert_ok(GpuArray_reduction(GA_REDUCE_ALL, &gaD, NULL, &gaS, 2, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaS), + GA_REDUCE_ALL, 1, 2, gaS.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, &gaD, NULL, &gaS, 2, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*dims[1], &gaD)); @@ -3531,7 +3736,7 @@ START_TEST(test_all_veryhighrank){ size_t prodDims = dims[0]*dims[1]*dims[2]*dims[3]*dims[4]*dims[5]*dims[6]*dims[7]; size_t rdxDims[4] = {1171,373,1,2}; size_t rdxProdDims = rdxDims[0]*rdxDims[1]*rdxDims[2]*rdxDims[3]; - const unsigned reduxList[] = {2,4,7,5}; + const int reduxList[] = {2,4,7,5}; uint32_t* pS = calloc(1, sizeof(*pS) * prodDims); uint32_t* pD = calloc(1, sizeof(*pD) * rdxProdDims); @@ -3567,7 +3772,12 @@ START_TEST(test_all_veryhighrank){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_reduction(GA_REDUCE_ALL, &gaD, NULL, &gaS, 4, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaS), + GA_REDUCE_ALL, 4, 4, gaS.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, &gaD, NULL, &gaS, 4, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*rdxProdDims, &gaD)); @@ -3621,7 +3831,7 @@ START_TEST(test_all_alldimsreduced){ size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; - const unsigned reduxList[] = {0,1,2}; + const int reduxList[] = {0,1,2}; uint32_t* pS = calloc(1, sizeof(*pS) * dims[0]*dims[1]*dims[2]); uint32_t* pD = calloc(1, sizeof(*pD) ); @@ -3657,7 +3867,12 @@ START_TEST(test_all_alldimsreduced){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. 
*/ - ga_assert_ok(GpuArray_reduction(GA_REDUCE_ALL, &gaD, NULL, &gaS, 3, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaS), + GA_REDUCE_ALL, 0, 3, gaS.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, &gaD, NULL, &gaS, 3, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read (pD, sizeof(*pD), &gaD)); @@ -3694,11 +3909,11 @@ Suite *get_suite(void) { TCase *tc = tcase_create("basic"); tcase_add_checked_fixture(tc, setup, teardown); tcase_set_timeout(tc, 120.0); - - tcase_add_test(tc, test_maxandargmax_reduction); - tcase_add_test(tc, test_maxandargmax_idxtranspose); + tcase_add_test(tc, test_maxandargmax_veryhighrank); tcase_add_test(tc, test_maxandargmax_alldimsreduced); + tcase_add_test(tc, test_maxandargmax_reduction); + tcase_add_test(tc, test_maxandargmax_idxtranspose); tcase_add_test(tc, test_minandargmin_reduction); tcase_add_test(tc, test_minandargmin_veryhighrank); From 8fc792bca3d08387f7deb8315814843156804db3 Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Mon, 3 Jul 2017 23:00:39 -0400 Subject: [PATCH 17/34] More fixes. 40% of tests still failing, and the code has a wierd smell to it that I really don't appreciate. --- src/gpuarray_reduction.c | 796 +++++++++++++++++++++++---------------- tests/check_reduction.c | 6 +- 2 files changed, 479 insertions(+), 323 deletions(-) diff --git a/src/gpuarray_reduction.c b/src/gpuarray_reduction.c index 81376b93b8..f8cd15abfb 100644 --- a/src/gpuarray_reduction.c +++ b/src/gpuarray_reduction.c @@ -261,6 +261,15 @@ struct GpuReduction{ }; +/* Typedefs */ +typedef void (*GpuReductionIterFn)(GpuReduction* gr, + int typecode, + const char* typeName, + const char* baseName, + int num, + void* user); + + /* Static Function prototypes */ /* Utilities */ static int reduxGetSumInit (int typecode, const char** property); @@ -276,7 +285,7 @@ static int reduxSortPtrIBSrcRdSelect (const void* a, const void* b); static int reduxSortPtrByReduxNum (const void* a, const void* b); static int reduxSortPtrIBDstWrSelect (const void* a, const void* b); static int reduxSortPtrIBDstArgWrSelect (const void* a, const void* b); -static int reduxSortPtrFinalOrder (const void* a, const void* b); +static int reduxSortPtrInsertFinalOrder (const void* a, const void* b); /* Axis Description API */ static void axisInit (axis_desc* axis, @@ -290,6 +299,7 @@ static int axisGetReduxNum (const axis_desc* axis); static size_t axisGetLen (const axis_desc* axis); static size_t axisGetIntraLen (const axis_desc* axis); static size_t axisGetInterLen (const axis_desc* axis); +static size_t axisGetIntraInterLen (const axis_desc* axis); static ssize_t axisGetSrcStride (const axis_desc* axis); static size_t axisGetSrcAbsStride (const axis_desc* axis); static ssize_t axisGetDstStride (const axis_desc* axis); @@ -312,6 +322,9 @@ static int axisIsSplit (const axis_desc* axis); /* Generator Control Flow */ static int reduxGenInit (GpuReduction* gr); static int reduxGenInferProperties (GpuReduction* gr); +static void reduxGenIterArgs (GpuReduction* gr, + GpuReductionIterFn fn, + void* user); static int reduxGenSrc (GpuReduction* gr); static void reduxGenSrcAppend (GpuReduction* gr); static void reduxGenSrcAppendIncludes (GpuReduction* gr); @@ -350,6 +363,30 @@ static int reduxGenCleanupMsg (GpuReduction* gr, int r const char* fmt, ...); /* Generator Utilities */ +static void reduxGenCountArgs (GpuReduction* gr, + int typecode, + const char* typeName, + const char* baseName, + int num, + void* user); +static void 
reduxGenSaveArgTypecodes (GpuReduction* gr, + int typecode, + const char* typeName, + const char* baseName, + int num, + void* user); +static void reduxGenAppendArg (GpuReduction* gr, + int typecode, + const char* typeName, + const char* baseName, + int num, + void* user); +static void reduxInvMarshalArg (GpuReduction* gr, + int typecode, + const char* typeName, + const char* baseName, + int num, + void* user); static size_t reduxGenEstimateParallelism (const GpuReduction* gr); static int reduxGenRequiresDst (const GpuReduction* gr); static int reduxGenRequiresDstArg (const GpuReduction* gr); @@ -361,6 +398,9 @@ static size_t reduxGenGetMaxLocalSize (const GpuReduction* gr); static size_t reduxGenGetSHMEMSize (const GpuReduction* gr, size_t bs); static size_t reduxGenGetSHMEMDstOff (const GpuReduction* gr, size_t bs); static size_t reduxGenGetSHMEMDstArgOff (const GpuReduction* gr, size_t bs); +static size_t reduxGenGetWMEMSize (const GpuReduction* gr, size_t bs); +static size_t reduxGenGetWMEMDstOff (const GpuReduction* gr, size_t bs); +static size_t reduxGenGetWMEMDstArgOff (const GpuReduction* gr, size_t bs); /* Invoker Control Flow */ static int reduxInvInit (redux_ctx* ctx); @@ -922,23 +962,21 @@ static int reduxSortPtrIBDstArgWrSelect (const void* a, const void* b){ return 0; } -static int reduxSortPtrFinalOrder (const void* a, const void* b){ +static int reduxSortPtrInsertFinalOrder (const void* a, const void* b){ const axis_desc* xda = *(const axis_desc* const*)a; const axis_desc* xdb = *(const axis_desc* const*)b; - /* All intra axes go last. */ + + /* All intra axes go first. */ if (axisIsIntra(xda) && axisIsInter(xdb)){ - return +1; - }else if (axisIsInter(xda) && axisIsIntra(xdb)){ return -1; + }else if (axisIsInter(xda) && axisIsIntra(xdb)){ + return +1; } - if(axisIsIntra(xda)){ /** * Intra axes sort between themselves by descending intra axis number. - * The split axis is always intra, and since it has the highest intra - * axis number it will always sort first. */ if (axisGetIBNum(xda) < axisGetIBNum(xdb)){ @@ -949,18 +987,23 @@ static int reduxSortPtrFinalOrder (const void* a, const void* b){ return 0; }else{ - /* All free inter axes go first (i{0..3}) */ + /** + * Inter axes sort between themselves + * + * - Reduced axes first + * - Then by ascending source tensor stride + */ + if ( axisIsReduced(xda) && !axisIsReduced(xdb)){ - return +1; - }else if (!axisIsReduced(xda) && axisIsReduced(xdb)){ return -1; + }else if (!axisIsReduced(xda) && axisIsReduced(xdb)){ + return +1; } - /* Otherwise it's sort by descending source argument absolute stride. */ if (axisGetSrcAbsStride(xda) < axisGetSrcAbsStride(xdb)){ - return +1; - }else if (axisGetSrcAbsStride(xda) > axisGetSrcAbsStride(xdb)){ return -1; + }else if (axisGetSrcAbsStride(xda) > axisGetSrcAbsStride(xdb)){ + return +1; } } @@ -1040,6 +1083,9 @@ static size_t axisGetInterLen (const axis_desc* axis){ return axis->len; } } +static size_t axisGetIntraInterLen (const axis_desc* axis){ + return axisGetIntraLen(axis)*axisGetInterLen(axis); +} static ssize_t axisGetSrcStride (const axis_desc* axis){ return axisGetLen(axis) > 1 ? 
axis->srcStride : 0; } @@ -1275,6 +1321,7 @@ static int reduxGenInit (GpuReduction* gr){ gr->kArgTypeCodes = NULL; gr->kSourceCode = NULL; gr->kErrorString = NULL; + gr->kNumArgs = 0; return reduxGenInferProperties(gr); } @@ -1285,7 +1332,6 @@ static int reduxGenInit (GpuReduction* gr){ static int reduxGenInferProperties (GpuReduction* gr){ int i, ret; - int k; /** @@ -1412,6 +1458,7 @@ static int reduxGenInferProperties (GpuReduction* gr){ "Problem selecting types to be used in reduction!\n"); } + /* Compute floor(log2(gr->log2MaxL)). */ gr->log2MaxL = gr->maxLg-1; for(i=1;gr->log2MaxL & (gr->log2MaxL+1);i*=2){ @@ -1420,104 +1467,98 @@ static int reduxGenInferProperties (GpuReduction* gr){ for(i=0;gr->log2MaxL;i++){ gr->log2MaxL >>= 1; } - gr->log2MaxL = i; - - /* Compute number of kernel arguments. */ - gr->kNumArgs = 6 /* phase, U, V, B, D, H */ - + 2 /* splitFree, splitReduce */ - + gr->nds /* l{0..n} */ - + reduxGenRequiresDstArg(gr)*gr->ndr /* l{m..n}PDim */ - + 1 /* s */ - + 1 /* sOff */ - + gr->nds /* sJ{0..n} */ - + reduxGenRequiresDst (gr) /* d */ - + reduxGenRequiresDst (gr) /* dOff */ - + reduxGenRequiresDst (gr)*gr->ndd /* dJ{0..m} */ - + reduxGenRequiresDstArg(gr) /* a */ - + reduxGenRequiresDstArg(gr) /* aOff */ - + reduxGenRequiresDstArg(gr)*gr->ndd /* aJ{0..m} */ - + 1 /* w */ - + reduxGenKernelRequiresDst (gr)*2 /* wdOff, pdOff */ - + reduxGenKernelRequiresDstArg(gr)*2 /* waOff, paOff */ - + gr->log2MaxL /* bs{0..p} */ - + gr->log2MaxL /* bp{0..p} */ - + reduxGenRequiresDstArg(gr)*gr->log2MaxL /* bi{0..p} */ - + gr->log2MaxL /* bsOff{0..p} */ - + reduxGenRequiresDst (gr)*gr->log2MaxL /* bdOff{0..p} */ - + reduxGenRequiresDstArg(gr)*gr->log2MaxL;/* baOff{0..p} */ - - - /* Construct kernel argument typecode list */ + gr->log2MaxL = i?i:1; + + + /** + * Compute number of kernel arguments and construct kernel argument + * typecode list. + */ + + reduxGenIterArgs(gr, reduxGenCountArgs, 0); gr->kArgTypeCodes = calloc(gr->kNumArgs, sizeof(*gr->kArgTypeCodes)); if(!gr->kArgTypeCodes){ return reduxGenCleanupMsg(gr, GA_MEMORY_ERROR, "Failed to allocate memory for kernel arguments " "typecode list!\n"); } - i = 0; - gr->kArgTypeCodes[i++] = GA_INT; /* phase */ - gr->kArgTypeCodes[i++] = GA_SIZE; /* U */ - gr->kArgTypeCodes[i++] = GA_SIZE; /* V */ - gr->kArgTypeCodes[i++] = GA_SIZE; /* B */ - gr->kArgTypeCodes[i++] = GA_UINT; /* D */ - gr->kArgTypeCodes[i++] = GA_UINT; /* H */ - gr->kArgTypeCodes[i++] = GA_UINT; /* splitFree */ - gr->kArgTypeCodes[i++] = GA_UINT; /* splitReduce */ + reduxGenIterArgs(gr, reduxGenSaveArgTypecodes, &i); + + + /* Generate source code. */ + return reduxGenSrc(gr); +} + +/** + * Iterate over the arguments of the reduction operator. 
+ */ + +static void reduxGenIterArgs (GpuReduction* gr, + GpuReductionIterFn fn, + void* user){ + int k; + + fn(gr, GA_INT, "int", "phase", 0, user); + fn(gr, GA_SIZE, "TX", "U", 0, user); + fn(gr, GA_SIZE, "TX", "V", 0, user); + fn(gr, GA_SIZE, "TX", "B", 0, user); + fn(gr, GA_UINT, "unsigned", "D", 0, user); + fn(gr, GA_UINT, "unsigned", "H", 0, user); + fn(gr, GA_UINT, "unsigned", "splitFree", 0, user); + fn(gr, GA_UINT, "unsigned", "splitReduce", 0, user); for(k=0;k < gr->nds;k++){ - gr->kArgTypeCodes[i++] = GA_SIZE; /* lN */ + fn(gr, GA_SIZE, "TX", "l%d", k, user); } - for(k=0;k < gr->ndr && reduxGenRequiresDstArg(gr);k++){ - gr->kArgTypeCodes[i++] = GA_SIZE; /* lNPDim */ + for(k=gr->ndd;k < gr->nds && reduxGenRequiresDstArg(gr);k++){ + fn(gr, GA_SIZE, "TX", "l%dPDim", k, user); } - gr->kArgTypeCodes[i++] = GA_BUFFER;/* s */ - gr->kArgTypeCodes[i++] = GA_SSIZE; /* sOff */ + fn(gr, GA_BUFFER, "const GLOBAL_MEM char*", "s", 0, user); + fn(gr, GA_SSIZE, "TX", "sOff", 0, user); for(k=0;k < gr->nds;k++){ - gr->kArgTypeCodes[i++] = GA_SSIZE; /* sJN */ + fn(gr, GA_SIZE, "TX", "sJ%d", k, user); } if(reduxGenRequiresDst (gr)){ - gr->kArgTypeCodes[i++] = GA_BUFFER;/* d */ - gr->kArgTypeCodes[i++] = GA_SSIZE; /* dOff */ + fn(gr, GA_BUFFER, "GLOBAL_MEM char*", "d", 0, user); + fn(gr, GA_SSIZE, "TX", "dOff", 0, user); for(k=0;k < gr->ndd;k++){ - gr->kArgTypeCodes[i++] = GA_SSIZE; /* dJN */ + fn(gr, GA_SIZE, "TX", "dJ%d", k, user); } } if(reduxGenRequiresDstArg(gr)){ - gr->kArgTypeCodes[i++] = GA_BUFFER;/* a */ - gr->kArgTypeCodes[i++] = GA_SSIZE; /* aOff */ + fn(gr, GA_BUFFER, "GLOBAL_MEM char*", "a", 0, user); + fn(gr, GA_SSIZE, "TX", "aOff", 0, user); for(k=0;k < gr->ndd;k++){ - gr->kArgTypeCodes[i++] = GA_SSIZE; /* aJN */ + fn(gr, GA_SIZE, "TX", "aJ%d", k, user); } } - gr->kArgTypeCodes[i++] = GA_BUFFER;/* w */ + fn(gr, GA_BUFFER, "GLOBAL_MEM char*", "w", 0, user); if(reduxGenKernelRequiresDst (gr)){ - gr->kArgTypeCodes[i++] = GA_SSIZE; /* wdOff */ - gr->kArgTypeCodes[i++] = GA_SSIZE; /* pdOff */ + fn(gr, GA_SSIZE, "TX", "wdOff", 0, user); + fn(gr, GA_SSIZE, "TX", "pdOff", 0, user); } if(reduxGenKernelRequiresDstArg(gr)){ - gr->kArgTypeCodes[i++] = GA_SSIZE; /* waOff */ - gr->kArgTypeCodes[i++] = GA_SSIZE; /* paOff */ + fn(gr, GA_SSIZE, "TX", "waOff", 0, user); + fn(gr, GA_SSIZE, "TX", "paOff", 0, user); } for(k=0;k < gr->log2MaxL;k++){ - gr->kArgTypeCodes[i++] = GA_UINT; /* ibsN */ + fn(gr, GA_UINT, "unsigned", "ibs%d", k, user); } for(k=0;k < gr->log2MaxL;k++){ - gr->kArgTypeCodes[i++] = GA_UINT; /* ibpN */ + fn(gr, GA_UINT, "unsigned", "ibp%d", k, user); } for(k=0;k < gr->log2MaxL && reduxGenRequiresDstArg(gr);k++){ - gr->kArgTypeCodes[i++] = GA_SIZE; /* iblNPDim */ + fn(gr, GA_SIZE, "TX", "ibl%dPDim", k, user); } for(k=0;k < gr->log2MaxL;k++){ - gr->kArgTypeCodes[i++] = GA_SSIZE; /* ibsOffN */ + fn(gr, GA_SSIZE, "TX", "ibsOff%d", k, user); } for(k=0;k < gr->log2MaxL && reduxGenRequiresDst (gr);k++){ - gr->kArgTypeCodes[i++] = GA_SSIZE; /* ibdOffN */ + fn(gr, GA_SSIZE, "TX", "ibdOff%d", k, user); } for(k=0;k < gr->log2MaxL && reduxGenRequiresDstArg(gr);k++){ - gr->kArgTypeCodes[i++] = GA_SSIZE; /* ibaOffN */ + fn(gr, GA_SSIZE, "TX", "ibaOff%d", k, user); } - - return reduxGenSrc(gr); } /** @@ -1769,70 +1810,10 @@ static void reduxGenSrcAppendReduxKernel (GpuReduction* gr){ srcbAppends (&gr->srcGen, "}\n"); } static void reduxGenSrcAppendPrototype (GpuReduction* gr){ - int i; + int i=0; - srcbAppends (&gr->srcGen, "KERNEL void redux(int phase,\n" - " TX U,\n" - " TX V,\n" - " TX B,\n" - " 
unsigned D,\n" - " unsigned H,\n" - " unsigned splitFree,\n" - " unsigned splitReduce,\n"); - srcbBeginList (&gr->srcGen, ",\n", "void"); - for(i=0;i<(int)(gr->ndd+gr->ndr);i++){ - srcbAppendElemf (&gr->srcGen, " TX l%d", i); - } - for(i=gr->ndd;i<(int)(gr->ndd+gr->ndr);i++){ - srcbAppendElemf (&gr->srcGen, " TX l%dPDim", i); - } - srcbAppendElemf (&gr->srcGen, " const GLOBAL_MEM char* s"); - srcbAppendElemf (&gr->srcGen, " TX sOff"); - for(i=0;i<(int)(gr->ndd+gr->ndr);i++){ - srcbAppendElemf (&gr->srcGen, " TX sJ%d", i); - } - if (reduxGenRequiresDst(gr)){ - srcbAppendElemf (&gr->srcGen, " GLOBAL_MEM char* d"); - srcbAppendElemf (&gr->srcGen, " TX dOff"); - for(i=0;i<(int)(gr->ndd);i++){ - srcbAppendElemf(&gr->srcGen, " TX dJ%d", i); - } - } - if (reduxGenRequiresDstArg(gr)){ - srcbAppendElemf (&gr->srcGen, " GLOBAL_MEM char* a"); - srcbAppendElemf (&gr->srcGen, " TX aOff"); - for(i=0;i<(int)(gr->ndd);i++){ - srcbAppendElemf(&gr->srcGen, " TX aJ%d", i); - } - } - srcbAppendElemf (&gr->srcGen, " GLOBAL_MEM char* w"); - if (reduxGenKernelRequiresDst(gr)){ - srcbAppendElemf (&gr->srcGen, " TX wdOff"); - srcbAppendElemf (&gr->srcGen, " TX pdOff"); - } - if (reduxGenKernelRequiresDstArg(gr)){ - srcbAppendElemf (&gr->srcGen, " TX waOff"); - srcbAppendElemf (&gr->srcGen, " TX paOff"); - } - for(i=0;i<(int)(gr->log2MaxL);i++){ - srcbAppendElemf (&gr->srcGen, " unsigned ibs%d", i); - } - for(i=0;i<(int)(gr->log2MaxL);i++){ - srcbAppendElemf (&gr->srcGen, " unsigned ibp%d", i); - } - for(i=0;i<(int)(gr->log2MaxL) && reduxGenRequiresDstArg(gr);i++){ - srcbAppendElemf (&gr->srcGen, " TX ibl%dPDim", i); - } - for(i=0;i<(int)(gr->log2MaxL);i++){ - srcbAppendElemf (&gr->srcGen, " TX ibsOff%d", i); - } - for(i=0;i<(int)(gr->log2MaxL) && reduxGenRequiresDst (gr);i++){ - srcbAppendElemf (&gr->srcGen, " TX ibdOff%d", i); - } - for(i=0;i<(int)(gr->log2MaxL) && reduxGenRequiresDstArg(gr);i++){ - srcbAppendElemf (&gr->srcGen, " TX ibaOff%d", i); - } - srcbEndList (&gr->srcGen); + srcbAppends (&gr->srcGen, "KERNEL void redux("); + reduxGenIterArgs(gr, reduxGenAppendArg, &i); srcbAppends (&gr->srcGen, ")"); } static void reduxGenSrcAppendBlockDecode (GpuReduction* gr){ @@ -2004,56 +1985,43 @@ static void reduxGenSrcAppendBlockDecode (GpuReduction* gr){ " * base pointers to their starting point.\n" " */\n" " \n" + " TX z, h, k;\n" " unsigned Dunit = D/splitFree;\n"); if(gr->ndd > 0){ srcbAppendf(&gr->srcGen, - " TX l%dMul = DIVIDECEIL(l%d, splitFree);\n", + " TX l%dDiv = DIVIDECEIL(l%d, splitFree);\n", gr->ndd-1, gr->ndd-1); } if(gr->ndr > 0){ srcbAppendf(&gr->srcGen, - " TX l%dMul = DIVIDECEIL(l%d, splitReduce);\n", + " TX l%dDiv = DIVIDECEIL(l%d, splitReduce);\n", gr->nds-1, gr->nds-1); } - srcbAppends(&gr->srcGen, " \n"); + srcbAppends(&gr->srcGen, + " \n" + " z = start;\n"); for(i=gr->nds-1;i>=0;i--){ - if(i == gr->nds-1){ + if(i == gr->nds-1 || i == gr->ndd-1){ srcbAppendf(&gr->srcGen, - " TX i%d = start %% l%dMul;\n", - i, i); - + " TX i%d = z %% l%dDiv;z /= l%dDiv;\n", + i, i, i); }else{ srcbAppendf(&gr->srcGen, - " TX i%d = i%d / l%d%s %% l%d%s;\n", - i, i+1, - i+1, - reduxGenAxisMaybeSplit(gr, i+1) ? "Mul" : "", - i, - reduxGenAxisMaybeSplit(gr, i) ? 
"Mul" : ""); + " TX i%d = z %% l%d; z /= l%d;\n", + i, i, i); } } srcbAppends(&gr->srcGen, " \n"); - if(gr->ndd > 0){ - srcbAppendf(&gr->srcGen, - " i%d *= splitFree;\n", - gr->ndd-1); - } - if(gr->ndr > 0){ - srcbAppendf(&gr->srcGen, - " i%d *= splitReduce;\n", - gr->nds-1); - } - srcbAppends(&gr->srcGen, " \n"); for(i=gr->nds-1;i>=0;i--){ if(i == gr->nds-1){ srcbAppendf(&gr->srcGen, - " TX sS%d = (sJ%d ) / splitReduce;\n", + " TX sS%d = sJ%d;\n", i, i); }else{ srcbAppendf(&gr->srcGen, - " TX sS%d = (sJ%d + (TX)l%d*sS%d)%s;\n", - i, i, i+1, i+1, - i == gr->ndd-1 ? " / splitFree" : ""); + " TX sS%d = sJ%d + l%d%s*sS%d;\n", + i, i, i+1, + reduxGenAxisMaybeSplit(gr, i+1) ? "Div" : " ", i+1); } } if (reduxGenRequiresDst(gr)){ @@ -2061,12 +2029,13 @@ static void reduxGenSrcAppendBlockDecode (GpuReduction* gr){ for(i=gr->ndd-1;i>=0;i--){ if(i == gr->ndd-1){ srcbAppendf(&gr->srcGen, - " TX dS%d = (dJ%d ) / splitFree;\n", + " TX dS%d = dJ%d;\n", i, i); }else{ srcbAppendf(&gr->srcGen, - " TX dS%d = (dJ%d + (TX)l%d*dS%d);\n", - i, i, i+1, i+1); + " TX dS%d = dJ%d + l%d%s*dS%d;\n", + i, i, i+1, + reduxGenAxisMaybeSplit(gr, i+1) ? "Div" : " ", i+1); } } } @@ -2075,12 +2044,13 @@ static void reduxGenSrcAppendBlockDecode (GpuReduction* gr){ for(i=gr->ndd-1;i>=0;i--){ if(i == gr->ndd-1){ srcbAppendf(&gr->srcGen, - " TX aS%d = (aJ%d ) / splitFree;\n", + " TX aS%d = aJ%d;\n", i, i); }else{ srcbAppendf(&gr->srcGen, - " TX aS%d = (aJ%d + (TX)l%d*aS%d);\n", - i, i, i+1, i+1); + " TX aS%d = aJ%d + l%d%s*aS%d;\n", + i, i, i+1, + reduxGenAxisMaybeSplit(gr, i+1) ? "Div" : " ", i+1); } } } @@ -2111,6 +2081,17 @@ static void reduxGenSrcAppendBlockDecode (GpuReduction* gr){ srcbAppends(&gr->srcGen, ";\n"); } srcbAppends(&gr->srcGen, " \n"); + if(gr->ndd > 0){ + srcbAppendf(&gr->srcGen, + " i%d *= splitFree;\n", + gr->ndd-1); + } + if(gr->ndr > 0){ + srcbAppendf(&gr->srcGen, + " i%d *= splitReduce;\n", + gr->nds-1); + } + srcbAppends(&gr->srcGen, " \n"); if(reduxGenKernelRequiresDst(gr)){ srcbAppends(&gr->srcGen, " TK* wd = (TK*)(w + wdOff);\n" @@ -2125,10 +2106,7 @@ static void reduxGenSrcAppendBlockDecode (GpuReduction* gr){ " TA* waR = &wa[GDIM_0*D];\n" " TA* pa = (TA*)(SHMEM + paOff);\n"); } - srcbAppends(&gr->srcGen, - " \n" - " TX h, k;\n" - " \n"); + srcbAppends(&gr->srcGen, " \n"); } static void reduxGenSrcAppendThreadDecode (GpuReduction* gr){ int i; @@ -2143,18 +2121,13 @@ static void reduxGenSrcAppendThreadDecode (GpuReduction* gr){ " * argument pointers, argument indices and permute targets.\n" " */\n" " \n" - " unsigned iSplit = LID_0/(LDIM_0/(splitFree*splitReduce));\n"); + " unsigned iSplit = LID_0/(LDIM_0/(splitFree*splitReduce));\n" + " z = LID_0;\n"); for(i=gr->log2MaxL-1;i>=0;i--){ - if(i == gr->log2MaxL-1){ - srcbAppendf(&gr->srcGen, - " int t%d = (unsigned)LID_0 %% ibs%d;\n", - i, i); - }else{ - srcbAppendf(&gr->srcGen, - " int t%d = (unsigned)t%d / ibs%d %% ibs%d;\n", - i, i+1, i+1, i); - } + srcbAppendf(&gr->srcGen, + " int t%d = z %% ibs%d;z /= ibs%d;\n", + i, i, i); } if(reduxGenRequiresDstArg(gr)){ srcbAppends(&gr->srcGen, " TX ti = "); @@ -2172,10 +2145,6 @@ static void reduxGenSrcAppendThreadDecode (GpuReduction* gr){ } srcbEndList(&gr->srcGen); srcbAppends(&gr->srcGen, ";\n"); - - - - srcbAppends(&gr->srcGen, " \n" " sOff += "); srcbBeginList(&gr->srcGen, " + ", "0"); @@ -2561,6 +2530,146 @@ static int reduxGenCleanupMsg (GpuReduction* gr, int ret, return reduxGenCleanup(gr, ret); } +/** + * Count # of arguments as determined by iterator. 
+ */ + +static void reduxGenCountArgs (GpuReduction* gr, + int typecode, + const char* typeName, + const char* baseName, + int num, + void* user){ + (void)typecode; + (void)typeName; + (void)baseName; + (void)num; + (void)user; + + gr->kNumArgs++; +} + +/** + * Record the typecodes in the arguments typecode array. + */ + +static void reduxGenSaveArgTypecodes (GpuReduction* gr, + int typecode, + const char* typeName, + const char* baseName, + int num, + void* user){ + (void)typeName; + (void)baseName; + (void)num; + (void)user; + + gr->kArgTypeCodes[(*(int*)user)++] = typecode; +} + +/** + * Append an argument declaration to prototype. + */ + +static void reduxGenAppendArg (GpuReduction* gr, + int typecode, + const char* typeName, + const char* baseName, + int num, + void* user){ + (void)user; + (void)typecode; + + if((*(int*)user)++ > 0){ + srcbAppends(&gr->srcGen, ",\n "); + } + srcbAppendf(&gr->srcGen, "%-25s ", typeName); + srcbAppendf(&gr->srcGen, baseName, num); +} + +/** + * Marshall argument declaration during invocation. + */ + +static void reduxInvMarshalArg (GpuReduction* gr, + int typecode, + const char* typeName, + const char* baseName, + int k, + void* user){ + redux_ctx* ctx; + int* i; + + (void)typecode; + (void)typeName; + + ctx = (redux_ctx*)(((void**)user)[0]); + i = (int *)(((void**)user)[1]); + + if (strcmp(baseName, "phase") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->phase; + }else if (strcmp(baseName, "U") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->U; + }else if (strcmp(baseName, "V") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->V; + }else if (strcmp(baseName, "B") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->B; + }else if (strcmp(baseName, "D") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->D; + }else if (strcmp(baseName, "H") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->H; + }else if (strcmp(baseName, "splitFree") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->splitFree; + }else if (strcmp(baseName, "splitReduce") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->splitReduce; + }else if (strcmp(baseName, "l%d") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->l[k]; + }else if (strcmp(baseName, "l%dPDim") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->lPDim[k-gr->ndd]; + }else if (strcmp(baseName, "s") == 0){ + ctx->kArgs[(*i)++] = (void*) ctx->flatSrcData; + }else if (strcmp(baseName, "sOff") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->flatSrcOffset; + }else if (strcmp(baseName, "sJ%d") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->sJ[k]; + }else if (strcmp(baseName, "d") == 0){ + ctx->kArgs[(*i)++] = (void*) ctx->flatDstData; + }else if (strcmp(baseName, "dOff") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->flatDstOffset; + }else if (strcmp(baseName, "dJ%d") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->dJ[k]; + }else if (strcmp(baseName, "a") == 0){ + ctx->kArgs[(*i)++] = (void*) ctx->flatDstArgData; + }else if (strcmp(baseName, "aOff") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->flatDstArgOffset; + }else if (strcmp(baseName, "aJ%d") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->aJ[k]; + }else if (strcmp(baseName, "w") == 0){ + ctx->kArgs[(*i)++] = (void*) ctx->w; + }else if (strcmp(baseName, "wdOff") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->wdOff; + }else if (strcmp(baseName, "pdOff") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->pdOff; + }else if (strcmp(baseName, "waOff") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->waOff; + }else if (strcmp(baseName, "paOff") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->paOff; + }else if (strcmp(baseName, "ibs%d") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->ibs[k]; + }else if 
(strcmp(baseName, "ibp%d") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->ibp[k]; + }else if (strcmp(baseName, "ibl%dPDim") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->iblPDim[k]; + }else if (strcmp(baseName, "ibsOff%d") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->ibsOff[k]; + }else if (strcmp(baseName, "ibdOff%d") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->ibdOff[k]; + }else if (strcmp(baseName, "ibaOff%d") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->ibaOff[k]; + } +} + + /** * @brief Estimate the level of parallelism available in the GPU context of * this reduction operator. @@ -2695,7 +2804,7 @@ static size_t reduxGenGetMaxLocalSize (const GpuReduction* gr){ static size_t reduxGenGetSHMEMSize (const GpuReduction* gr, size_t bs){ const gpuarray_type* type; - size_t total = 0; + size_t total = 0, permuteSpace; if(reduxGenKernelRequiresDst(gr)){ type = gpuarray_get_type(gr->accTypeCode); @@ -2708,6 +2817,12 @@ static size_t reduxGenGetSHMEMSize (const GpuReduction* gr, size_t total += bs*type->size; } + /* Ensure space for pointer permute. */ + permuteSpace = gpuarray_get_type(gr->idxTypeCode)->size * bs; + if(total < permuteSpace){ + total = permuteSpace; + } + return total; } @@ -2740,6 +2855,47 @@ static size_t reduxGenGetSHMEMDstArgOff (const GpuReduction* gr, size_t } } +/** + * Get the amount of Workspace memory required. + * + * NOT necessarily the same as amount of SHMEM! The workspace is NOT used for + * intrablock offset permutes, for instance. + */ + +static size_t reduxGenGetWMEMSize (const GpuReduction* gr, size_t bs){ + const gpuarray_type* type; + size_t total = 0; + + if(reduxGenKernelRequiresDst(gr)){ + type = gpuarray_get_type(gr->accTypeCode); + total = DIVIDECEIL(total, type->align)*type->align; + total += bs*type->size; + } + if(reduxGenKernelRequiresDstArg(gr)){ + type = gpuarray_get_type(gr->idxTypeCode); + total = DIVIDECEIL(total, type->align)*type->align; + total += bs*type->size; + } + + return total; +} + +/** + * @brief Get the workspace memory byte offset for dst. + */ + +static size_t reduxGenGetWMEMDstOff (const GpuReduction* gr, size_t bs){ + return reduxGenGetSHMEMDstOff(gr, bs); +} + +/** + * @brief Get the workspace memory byte offset for dstArg. + */ + +static size_t reduxGenGetWMEMDstArgOff (const GpuReduction* gr, size_t bs){ + return reduxGenGetSHMEMDstArgOff(gr, bs); +} + /** * @brief Initialize the context. 
* @@ -3039,7 +3195,7 @@ static int reduxInvFlattenSource (redux_ctx* ctx){ static int reduxInvComputeKArgs (redux_ctx* ctx){ axis_desc* axis, *prevAxis; size_t target, aL, aLS; - int i, j; + int i, j, k, haveSplitFreeAxis, haveSplitReducedAxis; /** @@ -3086,9 +3242,6 @@ static int reduxInvComputeKArgs (redux_ctx* ctx){ for(i=0;igr->nds;i++){ ctx->l[i] = 1; } - for(i=0;igr->ndr;i++){ - ctx->lPDim[i] = 1; - } for(i=0;igr->log2MaxL;i++){ ctx->ibs[i] = 1; } @@ -3117,8 +3270,9 @@ static int reduxInvComputeKArgs (redux_ctx* ctx){ if(target/ctx->bs >= 2){ aLS = target/ctx->bs; ctx->bs *= aLS; - axisMarkIntraBlock(axis, i++, aLS); + axisMarkIntraBlock(axis, i, aLS); ctx->xdSplit = axis; + i++; } break; } @@ -3172,7 +3326,7 @@ static int reduxInvComputeKArgs (redux_ctx* ctx){ axisSetPDim(axis, 1); }else{ prevAxis = reduxInvGetSrcSortAxis(ctx, i-1); - axisSetPDim(prevAxis, axisGetPDim(axis)*axisGetLen(prevAxis)); + axisSetPDim(axis, axisGetPDim(prevAxis)*axisGetLen(prevAxis)); } } } @@ -3199,73 +3353,134 @@ static int reduxInvComputeKArgs (redux_ctx* ctx){ axisSetIBP(axis, 1); }else{ prevAxis = reduxInvGetSrcSortAxis(ctx, i-1); - axisSetIBP(axis, axisGetIBP(prevAxis)*axisGetLen(prevAxis)); + axisSetIBP(axis, axisGetIBP(prevAxis)*axisGetIntraLen(prevAxis)); } } } /** - * STEP 6. Place the axes in final loop order and perform final placement - * of: - * lN, lPDim, sJN, dJN, aJN, + * STEP 6. Place the intra axis arguments + * * ibs, ibp, iblPDim, ibsOff, ibdOff, ibaOff + * + * For this we need the axes in final order of insertion. */ reduxSortAxisPtrsBy(ctx->xdSrcPtrs, ctx->xdSrc, ctx->ndfs, - reduxSortPtrFinalOrder); - for(i=0,j=0;indfs;i++){ + reduxSortPtrInsertFinalOrder); + for(i=0;indib;i++){ + axis = reduxInvGetSrcSortAxis(ctx, i); + + ctx->ibs [i] = axisGetIntraLen (axis); + ctx->ibp [i] = axisGetIBP (axis); + ctx->iblPDim[i] = axisGetPDim (axis); + ctx->ibsOff [i] = axisGetSrcStride (axis); + ctx->ibdOff [i] = axisGetDstStride (axis); + ctx->ibaOff [i] = axisGetDstArgStride(axis); + } + + /** + * STEP 7. Place the inter axis arguments + * + * lN, lNPDim, sJN, dJN, aJN + * + * , where N in [0, ctx->gr->ndd) are free axes, + * N in [ctx->gr->ndd, ctx->gr->nds) are reduced axes, + * and ctx->xdSrcPtr[...] are sorted in the reverse of that order for + * insertion, and excludes any split axis. + * + * How precisely the insertion is done depends closely on whether there is + * a split axis and if so whether it is free or reduced. + * + * - If there is a split axis and it is free, then it should be inserted as + * the first free axis. Its jumps should be + * sJN = -sSM*intrainterLenM + sSN*splitFree + * dJN = -dSM*intrainterLenM + dSN*splitFree + * aJN = -aSM*intrainterLenM + aSN*splitFree + * - If there is a split axis and it is reduced, then it should be inserted + * as the first reduced axis. Its jump should be + * sJN = -sSM*intrainterLenM + sSN*splitReduced + * - If there is no split axis, proceed normally in filling the axes. + */ + + haveSplitFreeAxis = ctx->xdSplit && !axisIsReduced(ctx->xdSplit); + haveSplitReducedAxis = ctx->xdSplit && axisIsReduced(ctx->xdSplit); + + /* If we have a reduced split axis, insert it before any other reduced axis. 
*/ + j = ctx->gr->nds-1; + k = ctx->gr->ndr-1; + if(haveSplitReducedAxis && k>=0){ + ctx->l [j] = axisGetLen (ctx->xdSplit); + ctx->lPDim [k] = axisGetPDim (ctx->xdSplit); + ctx->sJ [j] += (ssize_t)axisGetSrcStride (ctx->xdSplit)* + (ssize_t)axisGetIntraLen (ctx->xdSplit); + if(j>0){ + ctx->sJ [j-1] -= (ssize_t)axisGetSrcStride (ctx->xdSplit)* + (ssize_t)axisGetIntraInterLen(ctx->xdSplit); + } + j--; + k--; + } + + /* Insert rest of reduced axes. */ + for(;indfs && k>=0;i++,j--,k--){ axis = reduxInvGetSrcSortAxis(ctx, i); + if(!axisIsReduced(axis)){ + break; + } - if (axisIsSplit(axis) && !axisIsReduced(axis)){ - /* Split Free Axis? */ - ctx->ibs [ 0] = axisGetIntraLen(axis); - ctx->ibp [ 0] = axisGetIntraLen(axis); - ctx->iblPDim[ 0] = axisGetIntraLen(axis); - ctx->ibsOff [ 0] = axisGetSrcStride(axis); - ctx->ibdOff [ 0] = axisGetDstStride(axis); - ctx->ibaOff [ 0] = axisGetDstArgStride(axis); - - ctx->l [ctx->gr->ndd-1] = axisGetInterLen(axis); - ctx->lPDim [ctx->gr->ndd-1] = axisGetPDim (axis); - ctx->sJ [ctx->gr->ndd-1] = 0; - ctx->dJ [ctx->gr->ndd-1] = 0; - ctx->aJ [ctx->gr->ndd-1] = 0; - }else if (axisIsSplit(axis) && axisIsReduced(axis)){ - /* Split Reduced Axis? */ - ctx->ibs [ 0] = axisGetIntraLen(axis); - ctx->ibp [ 0] = axisGetIntraLen(axis); - ctx->iblPDim[ 0] = axisGetIntraLen(axis); - ctx->ibsOff [ 0] = axisGetSrcStride(axis); - ctx->ibdOff [ 0] = axisGetDstStride(axis); - ctx->ibaOff [ 0] = axisGetDstArgStride(axis); - - ctx->l [ctx->gr->nds-1] = axisGetInterLen(axis); - ctx->lPDim [ctx->gr->nds-1] = axisGetPDim (axis); - ctx->sJ [ctx->gr->nds-1] = 0; - ctx->dJ [ctx->gr->nds-1] = 0; - ctx->aJ [ctx->gr->nds-1] = 0; - }else if (axisIsInter(axis) && !axisIsReduced(axis)){ - /* Inter Free Axis? */ - ctx->l [ j] = axisGetInterLen(axis); - ctx->lPDim [ j] = axisGetPDim (axis); - ctx->sJ [ j] = 0; - ctx->dJ [ j] = 0; - ctx->aJ [ j] = 0; - }else if (axisIsInter(axis) && axisIsReduced(axis)){ - /* Inter Reduced Axis? */ - ctx->l [ j] = axisGetInterLen(axis); - ctx->lPDim [ j] = axisGetPDim (axis); - ctx->sJ [ j] = 0; - ctx->dJ [ j] = 0; - ctx->aJ [ j] = 0; - }else{ - /* Intra Axis? */ - ctx->ibs [ 0] = axisGetIntraLen(axis); - ctx->ibp [ 0] = axisGetIntraLen(axis); - ctx->iblPDim[ 0] = axisGetIntraLen(axis); - ctx->ibsOff [ 0] = axisGetSrcStride(axis); - ctx->ibdOff [ 0] = axisGetDstStride(axis); - ctx->ibaOff [ 0] = axisGetDstArgStride(axis); + ctx->l [j] = axisGetLen (axis); + ctx->lPDim [k] = axisGetPDim (axis); + ctx->sJ [j] += (ssize_t)axisGetSrcStride (axis)* + (ssize_t)axisGetIntraLen (axis); + if(j>0){ + ctx->sJ [j-1] -= (ssize_t)axisGetSrcStride (axis)* + (ssize_t)axisGetIntraInterLen(axis); + } + } + + /* If we have a free split axis, insert it before any other free axis. */ + k = ctx->gr->ndd-1; + if(haveSplitFreeAxis && k>=0){ + ctx->l [k] = axisGetLen (ctx->xdSplit); + ctx->sJ [k] += (ssize_t)axisGetSrcStride (ctx->xdSplit)* + (ssize_t)axisGetIntraLen (ctx->xdSplit); + ctx->dJ [k] += (ssize_t)axisGetDstStride (ctx->xdSplit)* + (ssize_t)axisGetIntraLen (ctx->xdSplit); + ctx->aJ [k] += (ssize_t)axisGetDstArgStride (ctx->xdSplit)* + (ssize_t)axisGetIntraLen (ctx->xdSplit); + if(k>0){ + ctx->sJ [k-1] -= (ssize_t)axisGetSrcStride (ctx->xdSplit)* + (ssize_t)axisGetIntraInterLen(ctx->xdSplit); + ctx->dJ [k-1] -= (ssize_t)axisGetDstStride (ctx->xdSplit)* + (ssize_t)axisGetIntraInterLen(ctx->xdSplit); + ctx->aJ [k-1] -= (ssize_t)axisGetDstArgStride (ctx->xdSplit)* + (ssize_t)axisGetIntraInterLen(ctx->xdSplit); + } + k--; + } + + /* Insert rest of free axes. 
*/ + for(;indfs && k>=0;i++,k--){ + axis = reduxInvGetSrcSortAxis(ctx, i); + if(axisIsReduced(axis)){ + break; + } + + ctx->l [k] = axisGetLen (axis); + ctx->sJ [k] += (ssize_t)axisGetSrcStride (axis)* + (ssize_t)axisGetIntraLen (axis); + ctx->dJ [k] += (ssize_t)axisGetDstStride (axis)* + (ssize_t)axisGetIntraLen (axis); + ctx->aJ [k] += (ssize_t)axisGetDstArgStride (axis)* + (ssize_t)axisGetIntraLen (axis); + if(k>0){ + ctx->sJ [k-1] -= (ssize_t)axisGetSrcStride (axis)* + (ssize_t)axisGetIntraInterLen(axis); + ctx->dJ [k-1] -= (ssize_t)axisGetDstStride (axis)* + (ssize_t)axisGetIntraInterLen(axis); + ctx->aJ [k-1] -= (ssize_t)axisGetDstArgStride (axis)* + (ssize_t)axisGetIntraInterLen(axis); } } @@ -3421,9 +3636,9 @@ static int reduxInvSchedule (redux_ctx* ctx){ * Allocate required workspace. */ - ctx->wdOff = reduxGenGetSHMEMDstOff (ctx->gr, 2*ctx->gs*ctx->D); - ctx->waOff = reduxGenGetSHMEMDstArgOff(ctx->gr, 2*ctx->gs*ctx->D); - WSPACESIZE = reduxGenGetSHMEMSize (ctx->gr, 2*ctx->gs*ctx->D); + ctx->wdOff = reduxGenGetWMEMDstOff (ctx->gr, 2*ctx->gs*ctx->D); + ctx->waOff = reduxGenGetWMEMDstArgOff(ctx->gr, 2*ctx->gs*ctx->D); + WSPACESIZE = reduxGenGetWMEMSize (ctx->gr, 2*ctx->gs*ctx->D); ctx->w = gpudata_alloc(ctx->gr->gpuCtx, WSPACESIZE, 0, flags, 0); if(!ctx->w){ return reduxInvCleanupMsg(ctx, GA_MEMORY_ERROR, @@ -3439,73 +3654,14 @@ static int reduxInvSchedule (redux_ctx* ctx){ */ static int reduxInvoke (redux_ctx* ctx){ - int ret, i, k; + int ret, i=0; + void* ptrs[2] = {ctx, &i}; /** * Argument Marshalling. */ - i = 0; - ctx->kArgs[i++] = (void*)&ctx->phase; - ctx->kArgs[i++] = (void*)&ctx->U; - ctx->kArgs[i++] = (void*)&ctx->V; - ctx->kArgs[i++] = (void*)&ctx->B; - ctx->kArgs[i++] = (void*)&ctx->D; - ctx->kArgs[i++] = (void*)&ctx->H; - ctx->kArgs[i++] = (void*)&ctx->splitFree; - ctx->kArgs[i++] = (void*)&ctx->splitReduce; - for(k=0;k < ctx->gr->nds;k++){ - ctx->kArgs[i++] = (void*)&ctx->l[k]; - } - for(k=0;k < ctx->gr->ndr && reduxInvRequiresDstArg(ctx);k++){ - ctx->kArgs[i++] = (void*)&ctx->lPDim[k]; - } - ctx->kArgs[i++] = (void*) ctx->flatSrcData; - ctx->kArgs[i++] = (void*)&ctx->flatSrcOffset; - for(k=0;k < ctx->gr->nds;k++){ - ctx->kArgs[i++] = (void*)&ctx->sJ[k]; - } - if(reduxInvRequiresDst (ctx)){ - ctx->kArgs[i++] = (void*) ctx->flatDstData; - ctx->kArgs[i++] = (void*)&ctx->flatDstOffset; - for(k=0;k < ctx->gr->ndd;k++){ - ctx->kArgs[i++] = (void*)&ctx->dJ[k]; - } - } - if(reduxInvRequiresDstArg(ctx)){ - ctx->kArgs[i++] = (void*) ctx->flatDstArgData; - ctx->kArgs[i++] = (void*)&ctx->flatDstArgOffset; - for(k=0;k < ctx->gr->ndd;k++){ - ctx->kArgs[i++] = (void*)&ctx->aJ[k]; - } - } - ctx->kArgs[i++] = (void*) ctx->w; - if(reduxInvKernelRequiresDst (ctx)){ - ctx->kArgs[i++] = (void*)&ctx->wdOff; - ctx->kArgs[i++] = (void*)&ctx->pdOff; - } - if(reduxInvKernelRequiresDstArg(ctx)){ - ctx->kArgs[i++] = (void*)&ctx->waOff; - ctx->kArgs[i++] = (void*)&ctx->paOff; - } - for(k=0;k < ctx->gr->log2MaxL;k++){ - ctx->kArgs[i++] = (void*)&ctx->ibs[k]; - } - for(k=0;k < ctx->gr->log2MaxL;k++){ - ctx->kArgs[i++] = (void*)&ctx->ibp[k]; - } - for(k=0;k < ctx->gr->log2MaxL && reduxInvRequiresDstArg(ctx);k++){ - ctx->kArgs[i++] = (void*)&ctx->iblPDim[k]; - } - for(k=0;k < ctx->gr->log2MaxL;k++){ - ctx->kArgs[i++] = (void*)&ctx->ibsOff[k]; - } - for(k=0;k < ctx->gr->log2MaxL && reduxInvRequiresDst (ctx);k++){ - ctx->kArgs[i++] = (void*)&ctx->ibdOff[k]; - } - for(k=0;k < ctx->gr->log2MaxL && reduxInvRequiresDstArg(ctx);k++){ - ctx->kArgs[i++] = (void*)&ctx->ibaOff[k]; - } + 
reduxGenIterArgs(ctx->gr, reduxInvMarshalArg, ptrs); diff --git a/tests/check_reduction.c b/tests/check_reduction.c index 567f384aaf..a961bd8f40 100644 --- a/tests/check_reduction.c +++ b/tests/check_reduction.c @@ -3909,11 +3909,11 @@ Suite *get_suite(void) { TCase *tc = tcase_create("basic"); tcase_add_checked_fixture(tc, setup, teardown); tcase_set_timeout(tc, 120.0); - - tcase_add_test(tc, test_maxandargmax_veryhighrank); - tcase_add_test(tc, test_maxandargmax_alldimsreduced); + tcase_add_test(tc, test_maxandargmax_reduction); tcase_add_test(tc, test_maxandargmax_idxtranspose); + tcase_add_test(tc, test_maxandargmax_veryhighrank); + tcase_add_test(tc, test_maxandargmax_alldimsreduced); tcase_add_test(tc, test_minandargmin_reduction); tcase_add_test(tc, test_minandargmin_veryhighrank); From 8debf2d2c279f5552fd05a320032dbdfa6a6b12f Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Tue, 4 Jul 2017 01:05:45 -0400 Subject: [PATCH 18/34] Really dumb division bug fixed. All tests now pass except summation, which fails to meet tolerance. --- src/gpuarray_reduction.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/gpuarray_reduction.c b/src/gpuarray_reduction.c index f8cd15abfb..af8051cddb 100644 --- a/src/gpuarray_reduction.c +++ b/src/gpuarray_reduction.c @@ -2395,10 +2395,11 @@ static void reduxGenSrcAppendPhase1 (GpuReduction* gr){ " if(misalignL && doFinish && LID_0 < D){\n" " SETREDUXSTATE(accV, accI, wdL[(GID_0+0)*D+LID_0], waL[(GID_0+0)*D+LID_0]);\n" " \n" - " for(k=-1; /* Starting with the first block to our left... */\n" - " (start +0)/B == /* Is our write target the same as that of */\n" - " (start+k*V+V-1)/B; /* the target k blocks to our left? */\n" - " k--){ /* Try moving one more to the left. */\n" + " /* vvv-- NOTA BENE: The +B hack is REALLY NECESSARY, since C division is rounding to zero: (-1)/B == (B-1)/B for B>1. */\n" + " for(k=-1; /* Starting with the first block to our left... */\n" + " (start +B)/B == /* Is our write target the same as that of */\n" + " (start+k*V+V-1+B)/B; /* the target k blocks to our left? */\n" + " k--){ /* Try moving one more to the left. */\n" " REDUX(accV, accI, wdR[(GID_0+k)*D+LID_0], waR[(GID_0+k)*D+LID_0]);\n" " }\n" " \n"); From 5f4ec4ed0f66616cb2072edf915078411e9305f2 Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Tue, 4 Jul 2017 01:24:14 -0400 Subject: [PATCH 19/34] Fix summation tests: - Subtract 0.5 from random numbers, so they sum to 0 in expectation. - Increase tolerance from 1e-5 to 1e-4 just for summation. --- tests/check_reduction.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/check_reduction.c b/tests/check_reduction.c index a961bd8f40..b5e9ba604d 100644 --- a/tests/check_reduction.c +++ b/tests/check_reduction.c @@ -1814,7 +1814,7 @@ START_TEST(test_sum_reduction){ size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; const int reduxList[] = {0,2}; - const float TOL = 1e-5; + const float TOL = 1e-4; float* pS = calloc(1, sizeof(*pS) * dims[0]*dims[1]*dims[2]); float* pD = calloc(1, sizeof(*pD) * dims[1] ); @@ -1828,7 +1828,7 @@ START_TEST(test_sum_reduction){ */ for(i=0;i Date: Tue, 4 Jul 2017 04:26:10 -0400 Subject: [PATCH 20/34] Add huge sum-reduction and pepper kernel with `restrict` keyword, it doubles the speed. 
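
`restrict` matters here because every tensor pointer reaching the kernel is a
plain `GLOBAL_MEM char*`: without the qualifier the compiler must assume that
the destination and workspace pointers may alias the source, and it re-reads
memory around every store. A minimal sketch of the effect, assuming a
C99/CUDA-style compiler (the function and variable names below are
illustrative only; they are not code from this patch):

    /* With unqualified pointers, the store through *d may alias s[i], so the
     * compiler must re-load s[i] and *d on every iteration.                  */
    void sum_maybe_aliased (float* d, const float* s, int n){
        int i;
        for (i = 0; i < n; i++){
            *d += s[i];
        }
    }

    /* With restrict-qualified pointers the compiler may keep the running sum
     * in a register and write it back to *d only once.                       */
    void sum_never_aliased (float* restrict d, const float* restrict s, int n){
        int i;
        for (i = 0; i < n; i++){
            *d += s[i];
        }
    }

On CUDA the qualifier is spelled __restrict__, hence the new
`#define restrict __restrict__` in cluda_cuda.h.
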
--- src/cluda_cuda.h | 1 + src/gpuarray_reduction.c | 47 +++++++++++-------------- tests/check_reduction.c | 74 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 95 insertions(+), 27 deletions(-) diff --git a/src/cluda_cuda.h b/src/cluda_cuda.h index ed20a8eb1c..e0ad08b90d 100644 --- a/src/cluda_cuda.h +++ b/src/cluda_cuda.h @@ -60,6 +60,7 @@ #define GA_DECL_SHARED_PARAM(type, name) #define GA_DECL_SHARED_BODY(type, name) extern __shared__ type name[]; #define GA_WARP_SIZE warpSize +#define restrict __restrict__ struct ga_half { ga_ushort data; diff --git a/src/gpuarray_reduction.c b/src/gpuarray_reduction.c index af8051cddb..9b0d74c9dc 100644 --- a/src/gpuarray_reduction.c +++ b/src/gpuarray_reduction.c @@ -417,7 +417,6 @@ static int reduxInvCleanupMsg (redux_ctx* ctx, int r static size_t reduxInvEstimateParallelism (const redux_ctx* ctx); static int reduxInvRequiresDst (const redux_ctx* ctx); static int reduxInvRequiresDstArg (const redux_ctx* ctx); -static int reduxInvKernelRequiresDst (const redux_ctx* ctx); static unsigned reduxInvGetSplitFree (const redux_ctx* ctx); static unsigned reduxInvGetSplitReduce (const redux_ctx* ctx); static axis_desc* reduxInvGetSrcAxis (const redux_ctx* ctx, int i); @@ -1145,12 +1144,6 @@ static int reduxInvRequiresDst (const redux_ctx* ctx){ static int reduxInvRequiresDstArg (const redux_ctx* ctx){ return reduxGenRequiresDstArg(ctx->gr); } -static int reduxInvKernelRequiresDst (const redux_ctx* ctx){ - return reduxGenKernelRequiresDst(ctx->gr); -} -static int reduxInvKernelRequiresDstArg (const redux_ctx* ctx){ - return reduxGenKernelRequiresDstArg(ctx->gr); -} static unsigned reduxInvGetSplitFree (const redux_ctx* ctx){ if(ctx->xdSplit && !axisIsReduced(ctx->xdSplit)){ return axisGetIntraLen(ctx->xdSplit); @@ -1513,26 +1506,26 @@ static void reduxGenIterArgs (GpuReduction* gr, for(k=gr->ndd;k < gr->nds && reduxGenRequiresDstArg(gr);k++){ fn(gr, GA_SIZE, "TX", "l%dPDim", k, user); } - fn(gr, GA_BUFFER, "const GLOBAL_MEM char*", "s", 0, user); + fn(gr, GA_BUFFER, "const GLOBAL_MEM char* restrict", "s", 0, user); fn(gr, GA_SSIZE, "TX", "sOff", 0, user); for(k=0;k < gr->nds;k++){ fn(gr, GA_SIZE, "TX", "sJ%d", k, user); } if(reduxGenRequiresDst (gr)){ - fn(gr, GA_BUFFER, "GLOBAL_MEM char*", "d", 0, user); + fn(gr, GA_BUFFER, "GLOBAL_MEM char* restrict", "d", 0, user); fn(gr, GA_SSIZE, "TX", "dOff", 0, user); for(k=0;k < gr->ndd;k++){ fn(gr, GA_SIZE, "TX", "dJ%d", k, user); } } if(reduxGenRequiresDstArg(gr)){ - fn(gr, GA_BUFFER, "GLOBAL_MEM char*", "a", 0, user); + fn(gr, GA_BUFFER, "GLOBAL_MEM char* restrict", "a", 0, user); fn(gr, GA_SSIZE, "TX", "aOff", 0, user); for(k=0;k < gr->ndd;k++){ fn(gr, GA_SIZE, "TX", "aJ%d", k, user); } } - fn(gr, GA_BUFFER, "GLOBAL_MEM char*", "w", 0, user); + fn(gr, GA_BUFFER, "GLOBAL_MEM char* restrict", "w", 0, user); if(reduxGenKernelRequiresDst (gr)){ fn(gr, GA_SSIZE, "TX", "wdOff", 0, user); fn(gr, GA_SSIZE, "TX", "pdOff", 0, user); @@ -1633,9 +1626,9 @@ static void reduxGenSrcAppendMacroDefs (GpuReduction* gr){ */ if (gr->srcTypeCode == GA_HALF && gr->accTypeCode == GA_FLOAT){ - srcbAppends(&gr->srcGen, "#define LOADS(v, p) do{(v) = (TK)load_half((TS*)(p));}while(0)\n"); + srcbAppends(&gr->srcGen, "#define LOADS(v, p) do{(v) = (TK)load_half((const TS* restrict)(p));}while(0)\n"); }else{ - srcbAppends(&gr->srcGen, "#define LOADS(v, p) do{(v) = (TK)*(TS*)(p);}while(0)\n"); + srcbAppends(&gr->srcGen, "#define LOADS(v, p) do{(v) = (TK)*(const TS* restrict)(p);}while(0)\n"); } @@ -1746,9 +1739,9 @@ static void 
reduxGenSrcAppendMacroDefs (GpuReduction* gr){ if (reduxGenRequiresDst(gr)){ if (gr->dstTypeCode == GA_HALF && gr->accTypeCode == GA_FLOAT){ - srcbAppends(&gr->srcGen, "#define STORED(p, v) do{store_half((TD*)(p), (v));}while(0)\n"); + srcbAppends(&gr->srcGen, "#define STORED(p, v) do{store_half((TD* restrict)(p), (v));}while(0)\n"); }else{ - srcbAppends(&gr->srcGen, "#define STORED(p, v) do{*(TD*)(p) = (v);}while(0)\n"); + srcbAppends(&gr->srcGen, "#define STORED(p, v) do{*(TD* restrict)(p) = (v);}while(0)\n"); } }else{ srcbAppends(&gr->srcGen, "#define STORED(p, v) do{}while(0)\n"); @@ -1762,7 +1755,7 @@ static void reduxGenSrcAppendMacroDefs (GpuReduction* gr){ */ if (reduxGenRequiresDstArg(gr)){ - srcbAppends(&gr->srcGen, "#define STOREA(p, v) do{*(TA*)(p) = (v);}while(0)\n"); + srcbAppends(&gr->srcGen, "#define STOREA(p, v) do{*(TA* restrict)(p) = (v);}while(0)\n"); }else{ srcbAppends(&gr->srcGen, "#define STOREA(p, v) do{}while(0)\n"); } @@ -2094,17 +2087,17 @@ static void reduxGenSrcAppendBlockDecode (GpuReduction* gr){ srcbAppends(&gr->srcGen, " \n"); if(reduxGenKernelRequiresDst(gr)){ srcbAppends(&gr->srcGen, - " TK* wd = (TK*)(w + wdOff);\n" - " TK* wdL = &wd[0];\n" - " TK* wdR = &wd[GDIM_0*D];\n" - " TK* pd = (TK*)(SHMEM + pdOff);\n"); + " TK* restrict wd = (TK* restrict)(w + wdOff);\n" + " TK* restrict wdL = &wd[0];\n" + " TK* restrict wdR = &wd[GDIM_0*D];\n" + " TK* restrict pd = (TK* restrict)(SHMEM + pdOff);\n"); } if(reduxGenKernelRequiresDstArg(gr)){ srcbAppends(&gr->srcGen, - " TA* wa = (TA*)(w + waOff);\n" - " TA* waL = &wa[0];\n" - " TA* waR = &wa[GDIM_0*D];\n" - " TA* pa = (TA*)(SHMEM + paOff);\n"); + " TA* restrict wa = (TA* restrict)(w + waOff);\n" + " TA* restrict waL = &wa[0];\n" + " TA* restrict waR = &wa[GDIM_0*D];\n" + " TA* restrict pa = (TA* restrict)(SHMEM + paOff);\n"); } srcbAppends(&gr->srcGen, " \n"); } @@ -2182,12 +2175,12 @@ static void reduxGenSrcAppendThreadDecode (GpuReduction* gr){ " local_barrier();\n"); } srcbAppends(&gr->srcGen, " \n" - " const char* ts = s + sOff;\n"); + " const char* restrict ts = s + sOff;\n"); if(reduxGenRequiresDst(gr)){ - srcbAppends(&gr->srcGen, " char* td = d + dOff;\n"); + srcbAppends(&gr->srcGen, " char* restrict td = d + dOff;\n"); } if(reduxGenRequiresDstArg(gr)){ - srcbAppends(&gr->srcGen, " char* ta = a + aOff;\n"); + srcbAppends(&gr->srcGen, " char* restrict ta = a + aOff;\n"); } srcbAppends(&gr->srcGen, " \n" " \n"); diff --git a/tests/check_reduction.c b/tests/check_reduction.c index b5e9ba604d..12a99ded30 100644 --- a/tests/check_reduction.c +++ b/tests/check_reduction.c @@ -2054,6 +2054,79 @@ START_TEST(test_sum_alldimsreduced){ GpuArray_clear(&gaD); }END_TEST +START_TEST(test_sum_huge){ + pcgSeed(1); + + /** + * We test here a reduction of a huge 1D tensor on all dimensions. + */ + + size_t i; + size_t dims[1] = {100000000}; + size_t prodDims = dims[0]; + const int reduxList[] = {0}; + const float TOL = 1e-2; + + float* pS = calloc(1, sizeof(*pS) * dims[0]); + float* pD = calloc(1, sizeof(*pD)); + + ck_assert_ptr_ne(pS, NULL); + ck_assert_ptr_ne(pD, NULL); + + + /** + * Initialize source data. + */ + + for(i=0;i Date: Wed, 12 Jul 2017 12:50:32 -0400 Subject: [PATCH 21/34] Massive Refactor into effectively a lattice engine. 
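
The public API after this refactor compiles the operator once and reuses it
across calls. A rough usage sketch under the new names (d0/d1/s0 and the
surrounding tensor setup are assumed to be provided by the caller; the helper
name and the include paths are illustrative, not taken from this patch):

    #include <gpuarray/array.h>
    #include <gpuarray/reduction.h>

    /* Max-and-argmax over axes 0 and 2 of a 3D float tensor s0, writing the
     * maxima to d0 and the flattened argument indices to d1.                 */
    static int maxandargmax_02 (gpucontext* gpuCtx,
                                GpuArray* d0, GpuArray* d1, const GpuArray* s0){
        GpuReduction* gr = NULL;
        const int     reduxList[] = {0, 2};
        int ret;

        ret = GpuReduction_new (&gr, gpuCtx, GA_REDUCE_MAXANDARGMAX,
                                1,        /* ndf: free (destination) axes */
                                2,        /* ndr: reduction axes          */
                                GA_FLOAT, /* s0TypeCode                   */
                                0);       /* flags                        */
        if (ret != GA_NO_ERROR){
            return ret;
        }
        ret = GpuReduction_call(gr, d0, d1, s0, 2, reduxList, 0);
        GpuReduction_free(gr);
        return ret;
    }
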
--- src/gpuarray/reduction.h | 69 +- src/gpuarray_reduction.c | 4198 ++++++++++++++++++++------------------ tests/check_reduction.c | 128 +- 3 files changed, 2409 insertions(+), 1986 deletions(-) diff --git a/src/gpuarray/reduction.h b/src/gpuarray/reduction.h index c8508b841d..77043daa22 100644 --- a/src/gpuarray/reduction.h +++ b/src/gpuarray/reduction.h @@ -46,6 +46,8 @@ typedef enum _ga_reduce_op { GA_REDUCE_XOR, /* ^ */ GA_REDUCE_ALL, /* &&/all() */ GA_REDUCE_ANY, /* ||/any() */ + + GA_REDUCE_ENDSUPPORTED /* Must be last element in enum */ } ga_reduce_op; @@ -57,29 +59,31 @@ typedef enum _ga_reduce_op { * @param [out] gr The reduction operator. * @param [in] gpuCtx The GPU context. * @param [in] op The reduction operation to perform. - * @param [in] ndf The minimum number of destination dimensions to support. - * @param [in] ndr The minimum number of reduction dimensions to support. - * @param [in] srcTypeCode The data type of the source operand. + * @param [in] ndf The minimum number of free (destination) dimensions to support. + * @param [in] ndr The minimum number of reduction (source) dimensions to support. + * @param [in] s0TypeCode The data type of the source operand. * @param [in] flags Reduction operator creation flags. Currently must be * set to 0. * - * @return GA_NO_ERROR if the operator was created successfully, or a non-zero - * error code otherwise. + * @return GA_NO_ERROR if the operator was created successfully + * GA_INVALID_ERROR if grOut is NULL, or some other argument was invalid + * GA_NO_MEMORY if memory allocation failed anytime during creation + * or other non-zero error codes otherwise. */ -GPUARRAY_PUBLIC int GpuReduction_new (GpuReduction** grOut, - gpucontext* gpuCtx, - ga_reduce_op op, - unsigned ndf, - unsigned ndr, - int srcTypeCode, - int flags); +GPUARRAY_PUBLIC int GpuReduction_new (GpuReduction** grOut, + gpucontext* gpuCtx, + ga_reduce_op op, + unsigned ndf, + unsigned ndr, + int s0TypeCode, + int flags); /** * @brief Deallocate an operator allocated by GpuReduction_new(). */ -GPUARRAY_PUBLIC void GpuReduction_free (GpuReduction* gr); +GPUARRAY_PUBLIC void GpuReduction_free (GpuReduction* gr); /** * @brief Invoke an operator allocated by GpuReduction_new() on a source tensor. @@ -91,28 +95,27 @@ GPUARRAY_PUBLIC void GpuReduction_free (GpuReduction* gr); * destination. * * @param [in] gr The reduction operator. - * @param [out] dst The destination tensor. Has the same type as the source. - * @param [out] dstArg For argument of minima/maxima operations. Has type int64. - * @param [in] src The source tensor. + * @param [out] d0 The destination tensor. + * @param [out] d1 The second destination tensor, for argmin/argmax operations. + * @param [in] s0 The source tensor. * @param [in] reduxLen The number of axes reduced. Must be >= 1 and - * <= src->nd. + * <= s0->nd. * @param [in] reduxList A list of integers of length reduxLen, indicating * the axes to be reduced. The order of the axes - * matters for dstArg index calculations (GpuArray_argmin, - * GpuArray_argmax, GpuArray_minandargmin, - * GpuArray_maxandargmax). All entries in the list must be + * matters for dstArg index calculations (argmin, argmax, + * minandargmin, maxandargmax). All entries in the list must be * unique, >= 0 and < src->nd. 
* - * For example, if a 5D-tensor is max-reduced with an axis - * list of [3,4,1], then reduxLen shall be 3, and the + * For example, if a 5D-tensor is maxandargmax-reduced with an + * axis list of [3,4,1], then reduxLen shall be 3, and the * index calculation in every point shall take the form * - * dstArgmax[i0,i2] = i3 * src.shape[4] * src.shape[1] + - * i4 * src.shape[1] + - * i1 + * d1[i0,i2] = i3 * s0.shape[4] * s0.shape[1] + + * i4 * s0.shape[1] + + * i1 * * where (i3,i4,i1) are the coordinates of the maximum- - * valued element within subtensor [i0,:,i2,:,:] of src. + * valued element within subtensor [i0,:,i2,:,:] of s0. * @param [in] flags Reduction operator invocation flags. Currently must be * set to 0. * @@ -120,13 +123,13 @@ GPUARRAY_PUBLIC void GpuReduction_free (GpuReduction* gr); * error code otherwise. */ -GPUARRAY_PUBLIC int GpuReduction_call (GpuReduction* gr, - GpuArray* dst, - GpuArray* dstArg, - const GpuArray* src, - unsigned reduxLen, - const int* reduxList, - int flags); +GPUARRAY_PUBLIC int GpuReduction_call (const GpuReduction* gr, + GpuArray* d0, + GpuArray* d1, + const GpuArray* s0, + unsigned reduxLen, + const int* reduxList, + int flags); #ifdef __cplusplus diff --git a/src/gpuarray_reduction.c b/src/gpuarray_reduction.c index 9b0d74c9dc..35518c0fbc 100644 --- a/src/gpuarray_reduction.c +++ b/src/gpuarray_reduction.c @@ -25,8 +25,28 @@ /* Defines */ #define DIVIDECEIL(a,b) (((a)+(b)-1)/(b)) -#define MAX_HW_DIMS 3 +/** + * Template Selector + * + * This is a bitfield interpreted as follows: + * + * 0b000x: Phase 1 processing (Phase 0) + * 0b00x0: Split axis is free (Reduced) + * 0bxx00: Huge axis is: + * 00: Nonexistent + * 01: Same as split axis + * 10: Same type (free/reduced) as split axis + * 11: Opposite type (free/reduced) to split axis + */ + +#define SELECTOR_PHASE1 0x01 +#define SELECTOR_SPLIT_FREE 0x02 +#define SELECTOR_HUGE_AXIS 0x0C +#define SELECTOR_HUGE_NONE 0x00 +#define SELECTOR_HUGE_IS_SPLIT 0x04 +#define SELECTOR_HUGE_SAME_TYPE 0x08 +#define SELECTOR_HUGE_OPPOSITE_TYPE 0x0C /* Datatypes */ @@ -38,60 +58,154 @@ struct axis_desc{ int reduxNum; int ibNum; - unsigned ibp; + unsigned perm; unsigned isReduced : 1; unsigned isIntra : 1; size_t len; size_t splitLen; - size_t pdim; - ssize_t srcStride; - ssize_t dstStride; - ssize_t dstArgStride; + ssize_t s0S; + ssize_t d0S; + ssize_t d1S; + size_t i0S; }; typedef struct axis_desc axis_desc; /** * Reduction Kernel Invoker. + */ + +struct redux_ctx{ + /* Function Arguments. */ + const GpuReduction* gr; + ga_reduce_op op; + GpuArray* d0; + GpuArray* d1; + const GpuArray* s0; + int reduxLen; + const int* reduxList; + int flags; + + /* General. 
*/ + int nds0; /* # Source axes */ + int nds0r; /* # Reduced axes */ + int ndd0; /* # Destination axes */ + int ndfs0; /* # Flattened source axes */ + int ndfs0r; /* # Flattened source axes */ + int ndfd0; /* # Flattened source axes */ + int ndib; /* # Intra-block axes */ + int zeroAllAxes; /* # of zero-length axes in source tensor */ + int zeroRdxAxes; /* # of zero-length reduction axes in source tensor */ + size_t prodAllAxes; /* Product of length of all axes in source tensor */ + size_t prodRdxAxes; /* Product of length of all reduction axes in source tensor */ + size_t prodFreeAxes; /* Product of length of all free axes in source tensor */ + + /* Flattening */ + axis_desc* xdSrc; + axis_desc** xdSrcPtrs; + axis_desc* xdSplit; + + /* Invoker */ + uint32_t selector; + uint64_t U; + uint64_t V; + uint64_t B; + uint32_t D; + uint32_t Dunit; + uint32_t H; + + uint32_t LSlice; + uint64_t LPadded; + uint64_t* L, *Li; + gpudata* S0Data; + int64_t S0Off; + int64_t* S0J, *S0Si; + gpudata* D0Data; + int64_t D0Off; + int64_t* D0J, *D0Si; + gpudata* D1Data; + int64_t D1Off; + int64_t* D1J, *D1Si; + int64_t* I0J, *I0Si; + + gpudata* W; + int64_t W0Off; + ssize_t W1Off; + size_t shmemBytes; + ssize_t SHMEMK0Off; + ssize_t SHMEMK1Off; + + unsigned* perm; + + void** kArgs; + + /* Scheduler */ + size_t bs; + size_t gs; +}; +typedef struct redux_ctx redux_ctx; + + +/** + * Reduction Operator. * * INTRO * - * Generates the source code for a reduction kernel over arbitrarily-dimensioned, + * Generates the source code for a reduction kernel over arbitrarily-ranked, * -shaped and -typed tensors. * + * It is assumed that at most one axis will ever be of length > 2**31-1. The + * assumption is believed safe because no GPU or similar accelerator presently + * on Earth has the capacity to store or process 2**62-element tensors. + * * - * GOALS + * TYPE NAMES * - * The generator has the following goals: + * TS0: Type of s0 tensor + * TPS0: Promoted type of s0 tensor + * TD0: Type of d0 tensor + * TD1: Type of d1 tensor + * TS32: Type of 32-bit narrow, signed, 2's complement integer + * TU32: Type of 32-bit narrow, unsigned, 2's complement integer + * TS64: Type of 64-bit wide, signed, 2's complement integer + * TU64: Type of 64-bit wide, unsigned, 2's complement integer + * TK0: Type of reduction accumulator + * TK1: Type of flattened index * - * 1. Maximizing the use of coalesced memory loads within a warp. - * 2. Maximizing the # of useful threads within a warp. - * 3. Maximizing the number of warps within a block. - * 4. Ensuring there are no more than 5 blocks per multiprocessor. - * 5. Minimizing the workspace size (if it is required). + * But note however that: + * - TS0 is not necessarily the same as TPS0/TD0/TD1 + * - TD1 is not necessarily TS32/TU32/TS64/TU64/TK1 + * - TK1 is not necessarily TU64 + * - TK0 is not necessarily the same as TS0 or TPS0. Moreover, since it may + * be a "custom" type that exists only within the kernel, it might not + * necessarily have a gpuarray_type typecode associated with it. + * + * Example 1: TK0 might eventually become a double-TS0 struct for Kahan + * compensated summation. No typecode exists for a struct of two TS0 + * values. + * + * Example 2: If doing a Kahan summation of a GA_HALF array, the + * following might be the case: + * TS0 == GA_HALF + * TPS0 == GA_FLOAT + * TK0 == struct{GA_FLOAT,GA_FLOAT} * * * NOTES * - * Information elements required to perform reduction. + * Information elements required to generate source code: * - * 1. 
Ndim, shape and dtype of src tensor - * 2. Ndim, shape and dtype of dst/dstArg tensors + * 1. Maximum rank and dtype of s0 tensor + * 2. Maximum rank and dtype of d0/d1 tensors * 3. GPU context * 4. Number of processors * 5. Warp size * 6. Maximum size of block - * 7. Maximum size of block dimension X, Y, Z + * 7. Maximum size of block axis X * 8. Maximum size of grid - * 9. Maximum size of grid dimension X, Y, Z + * 9. Maximum size of grid axis X * 10. Dtype and initializer of accumulator - * 11. Sorted src axes for contiguous memory accesses - * 12. Ndim, shape and dtype of flattened src tensor - * 13. Number of stages (1 or 2) - * 14. Size of workspace tensor - * 15. Intrablock/split/free/reduced axes - * 16. Source code * - * Rationale for dependencies: + * Rationale for some dependencies: * * 1) Get the GPU context and its properties immediately, since an invalid * context is a likely error and we want to fail fast. @@ -99,105 +213,6 @@ typedef struct axis_desc axis_desc; * the context's properties have been retrieved since they provide * information about the device's natively-supported types and operations * (e.g. half-precision float) - */ - -struct redux_ctx{ - /* Function Arguments. */ - GpuReduction* gr; - ga_reduce_op op; - GpuArray* dst; - GpuArray* dstArg; - const GpuArray* src; - int reduxLen; - const int* reduxList; - int flags; - - /* General. */ - int nds; /* # Source dimensions */ - int ndr; /* # Reduced dimensions */ - int ndd; /* # Destination dimensions */ - int ndfs; /* # Flattened source dimensions */ - int ndfr; /* # Flattened source dimensions */ - int ndfd; /* # Flattened source dimensions */ - int ndib; /* # Intra-block dimensions */ - int zeroAllAxes; /* # of zero-length axes in source tensor */ - int zeroRdxAxes; /* # of zero-length reduction axes in source tensor */ - size_t prodAllAxes; /* Product of length of all axes in source tensor */ - size_t prodRdxAxes; /* Product of length of all reduction axes in source tensor */ - size_t prodFreeAxes; /* Product of length of all free axes in source tensor */ - - /* Flattening */ - axis_desc* xdSrc; - axis_desc** xdSrcPtrs; - axis_desc** xdTmpPtrs; - - /* Invoker */ - int phase; - size_t U; - size_t V; - size_t B; - unsigned D; - unsigned H; - unsigned splitReduce; - unsigned splitFree; - - axis_desc* xdSplit; - - size_t* l; - size_t* lPDim; - ssize_t* sJ; - ssize_t* dJ; - ssize_t* aJ; - - gpudata* flatSrcData; - ssize_t flatSrcOffset; - gpudata* flatDstData; - ssize_t flatDstOffset; - gpudata* flatDstArgData; - ssize_t flatDstArgOffset; - - gpudata* w; - size_t SHMEM; - ssize_t wdOff; - ssize_t pdOff; - ssize_t waOff; - ssize_t paOff; - - unsigned* ibs; - unsigned* ibp; - size_t* iblPDim; - ssize_t* ibsOff; - ssize_t* ibdOff; - ssize_t* ibaOff; - - void** kArgs; - - - /* Scheduler */ - size_t bs; - size_t gs; -}; -typedef struct redux_ctx redux_ctx; - - -/** - * Reduction Operator. - * - * INTRO - * - * Generates the source code for a reduction kernel over arbitrarily-dimensioned, - * -shaped and -typed tensors. - * - * - * GOALS - * - * The generator has the following goals: - * - * 1. Maximizing the use of coalesced memory loads within a warp. - * 2. Maximizing the # of useful threads within a warp. - * 3. Maximizing the number of warps within a block. - * 4. Ensuring there are no more than 5 blocks per multiprocessor. - * 5. Minimizing the workspace size (if it is required). 
* * * REFERENCES @@ -221,7 +236,7 @@ struct GpuReduction{ ga_reduce_op op; int ndd; int ndr; - int srcTypeCode; + int TS0tc; int flags; /* Misc */ @@ -230,21 +245,30 @@ struct GpuReduction{ /* Source code Generator. */ strb s; srcb srcGen; + char kName[256]; char* kSourceCode; size_t kSourceCodeLen; - int dstTypeCode; - int dstArgTypeCode; + int TPS0tc; + int TD0tc; + int TD1tc; + int TS32tc; + int TU32tc; + int TS64tc; + int TU64tc; + struct{ + size_t size; + size_t align; + char defn[256]; + char init[256]; + } TK0, TK1; int idxTypeCode; int accTypeCode; const char* srcTypeStr; const char* dstTypeStr; const char* dstArgTypeStr; const char* idxTypeStr; - const char* accTypeStr; - const char* initVal; /* Compile */ - int log2MaxL; int kNumArgs; int* kArgTypeCodes; char* kErrorString; @@ -258,224 +282,237 @@ struct GpuReduction{ size_t maxG0; size_t maxLM; size_t maxLK; + size_t maxBS; + int log2MaxBS; }; /* Typedefs */ -typedef void (*GpuReductionIterFn)(GpuReduction* gr, - int typecode, - const char* typeName, - const char* baseName, - int num, - void* user); +typedef void (*GpuReductionIterFn)(const GpuReduction* gr, + int typecode, + const char* typeName, + const char* baseName, + int num, + void* user); /* Static Function prototypes */ /* Utilities */ -static int reduxGetSumInit (int typecode, const char** property); -static int reduxGetProdInit (int typecode, const char** property); -static int reduxGetMinInit (int typecode, const char** property); -static int reduxGetMaxInit (int typecode, const char** property); -static int reduxGetAndInit (int typecode, const char** property); -static int reduxGetOrInit (int typecode, const char** property); -static int reduxIsSensitive (int typecode); -static int reduxSortFlatSensitive (const void* a, const void* b); -static int reduxSortFlatInsensitive (const void* a, const void* b); -static int reduxSortPtrIBSrcRdSelect (const void* a, const void* b); -static int reduxSortPtrByReduxNum (const void* a, const void* b); -static int reduxSortPtrIBDstWrSelect (const void* a, const void* b); -static int reduxSortPtrIBDstArgWrSelect (const void* a, const void* b); -static int reduxSortPtrInsertFinalOrder (const void* a, const void* b); +static int reduxGetSumInit (int typecode, const char** property); +static int reduxGetProdInit (int typecode, const char** property); +static int reduxGetMinInit (int typecode, const char** property); +static int reduxGetMaxInit (int typecode, const char** property); +static int reduxGetAndInit (int typecode, const char** property); +static int reduxGetOrInit (int typecode, const char** property); +static int reduxIsSensitive (int op); +static const char* reduxGetOpName (int op); +static int reduxIsFloatingPoint (int typecode); +static unsigned reduxCeilLog2 (uint64_t x); +static uint64_t reduxNextPow2 (uint64_t x); +static int reduxSortFlatSensitive (const void* a, const void* b); +static int reduxSortFlatInsensitive (const void* a, const void* b); +static int reduxSortPtrS0AbsStride (const void* a, const void* b); +static int reduxSortPtrByReduxNum (const void* a, const void* b); +static int reduxSortPtrD0WrSelect (const void* a, const void* b); +static int reduxSortPtrD1WrSelect (const void* a, const void* b); +static int reduxSortPtrInsertFinalOrder (const void* a, const void* b); /* Axis Description API */ -static void axisInit (axis_desc* axis, - ssize_t len, - ssize_t srcStride); -static void axisMarkReduced (axis_desc* axis, int reduxNum); -static void axisMarkIntraBlock (axis_desc* axis, - int ibNum, - size_t 
ibLen); -static int axisGetReduxNum (const axis_desc* axis); -static size_t axisGetLen (const axis_desc* axis); -static size_t axisGetIntraLen (const axis_desc* axis); -static size_t axisGetInterLen (const axis_desc* axis); -static size_t axisGetIntraInterLen (const axis_desc* axis); -static ssize_t axisGetSrcStride (const axis_desc* axis); -static size_t axisGetSrcAbsStride (const axis_desc* axis); -static ssize_t axisGetDstStride (const axis_desc* axis); -static size_t axisGetDstAbsStride (const axis_desc* axis); -static ssize_t axisGetDstArgStride (const axis_desc* axis); -static size_t axisGetDstArgAbsStride (const axis_desc* axis); -static unsigned axisGetIBP (const axis_desc* axis); -static int axisGetIBNum (const axis_desc* axis); -static void axisSetIBP (axis_desc* axis, - unsigned ibp); -static size_t axisGetPDim (const axis_desc* axis); -static void axisSetPDim (axis_desc* axis, - size_t pdim); -static int axisIsReduced (const axis_desc* axis); -static int axisIsIntra (const axis_desc* axis); -static int axisIsInter (const axis_desc* axis); -static int axisIsSplit (const axis_desc* axis); +static void axisInit (axis_desc* axis, + ssize_t len, + ssize_t s0S); +static void axisMarkReduced (axis_desc* axis, int reduxNum); +static void axisMarkIntraBlock (axis_desc* axis, + int ibNum, + size_t ibLen); +static int axisGetReduxNum (const axis_desc* axis); +static size_t axisGetLen (const axis_desc* axis); +static size_t axisGetIntraLen (const axis_desc* axis); +static size_t axisGetInterLen (const axis_desc* axis); +static size_t axisGetIntraInterLen (const axis_desc* axis); +static ssize_t axisGetS0Stride (const axis_desc* axis); +static size_t axisGetS0AbsStride (const axis_desc* axis); +static ssize_t axisGetD0Stride (const axis_desc* axis); +static size_t axisGetD0AbsStride (const axis_desc* axis); +static ssize_t axisGetD1Stride (const axis_desc* axis); +static size_t axisGetD1AbsStride (const axis_desc* axis); +static size_t axisGetI0Stride (const axis_desc* axis); +static void axisSetI0Stride (axis_desc* axis, + size_t pdim); +static unsigned axisGetPerm (const axis_desc* axis); +static int axisGetIBNum (const axis_desc* axis); +static void axisSetPerm (axis_desc* axis, + unsigned ibp); +static int axisIsReduced (const axis_desc* axis); +static int axisIsIntra (const axis_desc* axis); +static int axisIsInter (const axis_desc* axis); +static int axisIsSplit (const axis_desc* axis); /* Reduction Context API */ /* Generator Control Flow */ -static int reduxGenInit (GpuReduction* gr); -static int reduxGenInferProperties (GpuReduction* gr); -static void reduxGenIterArgs (GpuReduction* gr, - GpuReductionIterFn fn, - void* user); -static int reduxGenSrc (GpuReduction* gr); -static void reduxGenSrcAppend (GpuReduction* gr); -static void reduxGenSrcAppendIncludes (GpuReduction* gr); -static void reduxGenSrcAppendMacroDefs (GpuReduction* gr); -static void reduxGenSrcAppendTypedefs (GpuReduction* gr); -static void reduxGenSrcAppendReduxKernel (GpuReduction* gr); -static void reduxGenSrcAppendPrototype (GpuReduction* gr); -static void reduxGenSrcAppendBlockDecode (GpuReduction* gr); -static void reduxGenSrcAppendThreadDecode (GpuReduction* gr); -static void reduxGenSrcAppendPhase0 (GpuReduction* gr); -static void reduxGenSrcAppendLoops (GpuReduction* gr, - int freeMaybeSplit, - int reduceMaybeSplit); -static void reduxGenSrcAppendLoop (GpuReduction* gr, - int initial, - int freeMaybeSplit, - int reduceMaybeSplit); -static void reduxGenSrcAppendDecrement (GpuReduction* gr); -static void 
reduxGenSrcAppendVertical (GpuReduction* gr, - int freeMaybeSplit, - int reduceMaybeSplit); -static void reduxGenSrcAppendIncrement (GpuReduction* gr, - int axis, - int initial, - int freeMaybeSplit, - int reduceMaybeSplit); -static void reduxGenSrcAppendDstWrite (GpuReduction* gr, - int initial, - int freeMaybeSplit, - int reduceMaybeSplit); -static void reduxGenSrcAppendPhase1 (GpuReduction* gr); -static int reduxGenCompile (GpuReduction* gr); -static int reduxGenComputeLaunchBounds (GpuReduction* gr); -static int reduxGenCleanup (GpuReduction* gr, int ret); -static int reduxGenCleanupMsg (GpuReduction* gr, int ret, - const char* fmt, ...); +static int reduxGenInit (GpuReduction* gr); +static int reduxGenInferProperties (GpuReduction* gr); +static void reduxGenSetMaxBS (GpuReduction* gr); +static void reduxGenSetKTypes (GpuReduction* gr); +static void reduxGenIterArgs (const GpuReduction* gr, + GpuReductionIterFn fn, + void* user); +static int reduxGenSrc (GpuReduction* gr); +static void reduxGenSrcAppend (GpuReduction* gr); +static void reduxGenSrcAppendIncludes (GpuReduction* gr); +static void reduxGenSrcAppendMacroTypedefs (GpuReduction* gr); +static void reduxGenSrcAppendReduxKernel (GpuReduction* gr); +static void reduxGenSrcAppendPrototype (GpuReduction* gr); +static void reduxGenSrcAppendDecode (GpuReduction* gr); +static void reduxGenSrcAppendPhase0 (GpuReduction* gr, + uint32_t selector); +static void reduxGenSrcAppendLoop (GpuReduction* gr, + uint32_t selector, + int initial); +static void reduxGenSrcAppendVertical (GpuReduction* gr, + uint32_t selector); +static void reduxGenSrcAppendIncrement (GpuReduction* gr, + uint32_t selector, + int initial, + int axis); +static void reduxGenSrcAppendDstWrite (GpuReduction* gr, + uint32_t selector, + int initial); +static void reduxGenSrcAppendPhase1 (GpuReduction* gr); +static int reduxGenSrcAxisIsHuge (GpuReduction* gr, + uint32_t selector, + int axis); +static int reduxGenSrcAxisIsSplit (GpuReduction* gr, + uint32_t selector, + int axis); +static int reduxGenCompile (GpuReduction* gr); +static int reduxGenComputeLaunchBounds (GpuReduction* gr); +static int reduxGenCleanup (GpuReduction* gr, int ret); +static int reduxGenCleanupMsg (GpuReduction* gr, int ret, + const char* fmt, ...); /* Generator Utilities */ -static void reduxGenCountArgs (GpuReduction* gr, - int typecode, - const char* typeName, - const char* baseName, - int num, - void* user); -static void reduxGenSaveArgTypecodes (GpuReduction* gr, - int typecode, - const char* typeName, - const char* baseName, - int num, - void* user); -static void reduxGenAppendArg (GpuReduction* gr, - int typecode, - const char* typeName, - const char* baseName, - int num, - void* user); -static void reduxInvMarshalArg (GpuReduction* gr, - int typecode, - const char* typeName, - const char* baseName, - int num, - void* user); -static size_t reduxGenEstimateParallelism (const GpuReduction* gr); -static int reduxGenRequiresDst (const GpuReduction* gr); -static int reduxGenRequiresDstArg (const GpuReduction* gr); -static int reduxGenKernelRequiresDst (const GpuReduction* gr); -static int reduxGenKernelRequiresDstArg (const GpuReduction* gr); -static int reduxGenAxisMaybeSplit (const GpuReduction* gr, int axis); -static size_t reduxGenGetReduxStateSize (const GpuReduction* gr); -static size_t reduxGenGetMaxLocalSize (const GpuReduction* gr); -static size_t reduxGenGetSHMEMSize (const GpuReduction* gr, size_t bs); -static size_t reduxGenGetSHMEMDstOff (const GpuReduction* gr, size_t bs); -static 
size_t reduxGenGetSHMEMDstArgOff (const GpuReduction* gr, size_t bs); -static size_t reduxGenGetWMEMSize (const GpuReduction* gr, size_t bs); -static size_t reduxGenGetWMEMDstOff (const GpuReduction* gr, size_t bs); -static size_t reduxGenGetWMEMDstArgOff (const GpuReduction* gr, size_t bs); +static void reduxGenCountArgs (const GpuReduction* gr, + int typecode, + const char* typeName, + const char* baseName, + int num, + void* user); +static void reduxGenSaveArgTypecodes (const GpuReduction* gr, + int typecode, + const char* typeName, + const char* baseName, + int num, + void* user); +static void reduxGenAppendArg (const GpuReduction* gr, + int typecode, + const char* typeName, + const char* baseName, + int num, + void* user); +static void reduxInvMarshalArg (const GpuReduction* gr, + int typecode, + const char* typeName, + const char* baseName, + int num, + void* user); +static size_t reduxGenEstimateParallelism (const GpuReduction* gr); +static int reduxGenRequiresS0 (const GpuReduction* gr); +static int reduxGenRequiresD0 (const GpuReduction* gr); +static int reduxGenRequiresD1 (const GpuReduction* gr); +static int reduxGenKernelRequiresLatticeS0(const GpuReduction* gr); +static int reduxGenKernelRequiresLatticeD0(const GpuReduction* gr); +static int reduxGenKernelRequiresLatticeD1(const GpuReduction* gr); +static int reduxGenKernelRequiresLatticeI0(const GpuReduction* gr); +static int reduxGenKernelRequiresStateK0 (const GpuReduction* gr); +static int reduxGenKernelRequiresStateK1 (const GpuReduction* gr); +static int reduxGenKernelRequiresWspace (const GpuReduction* gr); +static size_t reduxGenGetK0Size (const GpuReduction* gr); +static size_t reduxGenGetK0Align (const GpuReduction* gr); +static size_t reduxGenGetK1Size (const GpuReduction* gr); +static size_t reduxGenGetK1Align (const GpuReduction* gr); +static size_t reduxGenGetReduxStateSize (const GpuReduction* gr); +static size_t reduxGenGetMaxLocalSize (const GpuReduction* gr); +static size_t reduxGenGetSHMEMSize (const GpuReduction* gr, size_t cells); +static size_t reduxGenGetSHMEMK0Off (const GpuReduction* gr, size_t cells); +static size_t reduxGenGetSHMEMK1Off (const GpuReduction* gr, size_t cells); +static size_t reduxGenGetWMEMSize (const GpuReduction* gr, size_t cells); +static size_t reduxGenGetWMEMK0Off (const GpuReduction* gr, size_t cells); +static size_t reduxGenGetWMEMK1Off (const GpuReduction* gr, size_t cells); /* Invoker Control Flow */ -static int reduxInvInit (redux_ctx* ctx); -static int reduxInvInferProperties (redux_ctx* ctx); -static int reduxInvFlattenSource (redux_ctx* ctx); -static int reduxInvComputeKArgs (redux_ctx* ctx); -static int reduxInvSchedule (redux_ctx* ctx); -static int reduxInvoke (redux_ctx* ctx); -static int reduxInvCleanup (redux_ctx* ctx, int ret); -static int reduxInvCleanupMsg (redux_ctx* ctx, int ret, - const char* fmt, ...); +static int reduxInvInit (redux_ctx* ctx); +static int reduxInvInferProperties (redux_ctx* ctx); +static int reduxInvFlattenSource (redux_ctx* ctx); +static int reduxInvComputeKernelArgs (redux_ctx* ctx); +static int reduxInvSchedule (redux_ctx* ctx); +static int reduxInvoke (redux_ctx* ctx); +static int reduxInvCleanup (redux_ctx* ctx, int ret); +static int reduxInvCleanupMsg (redux_ctx* ctx, int ret, + const char* fmt, ...); /* Invoker Utilities */ -static size_t reduxInvEstimateParallelism (const redux_ctx* ctx); -static int reduxInvRequiresDst (const redux_ctx* ctx); -static int reduxInvRequiresDstArg (const redux_ctx* ctx); -static unsigned 
reduxInvGetSplitFree (const redux_ctx* ctx); -static unsigned reduxInvGetSplitReduce (const redux_ctx* ctx); -static axis_desc* reduxInvGetSrcAxis (const redux_ctx* ctx, int i); -static axis_desc* reduxInvGetSrcSortAxis (const redux_ctx* ctx, int i); -static int reduxTryFlattenOut (const redux_ctx* ctx, - const axis_desc* out); -static int reduxTryFlattenInto (redux_ctx* ctx, - axis_desc* into, - const axis_desc* from); -static void reduxSortAxisPtrsBy (axis_desc** ptrs, - axis_desc* axes, - size_t numAxes, - int(*fn)(const void*, const void*)); +static size_t reduxInvEstimateParallelism (const redux_ctx* ctx); +static int reduxInvRequiresS0 (const redux_ctx* ctx); +static int reduxInvRequiresD0 (const redux_ctx* ctx); +static int reduxInvRequiresD1 (const redux_ctx* ctx); +static axis_desc* reduxInvGetSrcAxis (const redux_ctx* ctx, int i); +static axis_desc* reduxInvGetSrcSortAxis (const redux_ctx* ctx, int i); +static int reduxTryFlattenOut (const redux_ctx* ctx, + const axis_desc* axis); +static int reduxTryFlattenInto (redux_ctx* ctx, + axis_desc* into, + const axis_desc* from); +static void reduxSortAxisPtrsBy (axis_desc** ptrs, + axis_desc* axes, + size_t numAxes, + int(*fn)(const void*, const void*)); /* Function Implementations */ /* Extern Functions */ -GPUARRAY_PUBLIC int GpuReduction_new (GpuReduction** grOut, - gpucontext* gpuCtx, - ga_reduce_op op, - unsigned ndf, - unsigned ndr, - int srcTypeCode, - int flags){ - if(!grOut){ +GPUARRAY_PUBLIC int GpuReduction_new (GpuReduction** grOut, + gpucontext* gpuCtx, + ga_reduce_op op, + unsigned ndf, + unsigned ndr, + int s0TypeCode, + int flags){ + if (!grOut){ return GA_INVALID_ERROR; } *grOut = calloc(1, sizeof(**grOut)); - if(*grOut){ - (*grOut)->gpuCtx = gpuCtx; - (*grOut)->op = op; - (*grOut)->ndd = (int)ndf; - (*grOut)->ndr = (int)ndr; - (*grOut)->srcTypeCode = srcTypeCode; - (*grOut)->flags = flags; + if (*grOut){ + (*grOut)->gpuCtx = gpuCtx; + (*grOut)->op = op; + (*grOut)->ndd = (int)ndf; + (*grOut)->ndr = (int)ndr; + (*grOut)->TS0tc = s0TypeCode; + (*grOut)->flags = flags; return reduxGenInit(*grOut); }else{ return GA_MEMORY_ERROR; } } -GPUARRAY_PUBLIC void GpuReduction_free (GpuReduction* gr){ +GPUARRAY_PUBLIC void GpuReduction_free (GpuReduction* gr){ reduxGenCleanup(gr, !GA_NO_ERROR); } -GPUARRAY_PUBLIC int GpuReduction_call (GpuReduction* gr, - GpuArray* dst, - GpuArray* dstArg, - const GpuArray* src, - unsigned reduxLen, - const int* reduxList, - int flags){ +GPUARRAY_PUBLIC int GpuReduction_call (const GpuReduction* gr, + GpuArray* d0, + GpuArray* d1, + const GpuArray* s0, + unsigned reduxLen, + const int* reduxList, + int flags){ redux_ctx ctxSTACK, *ctx = &ctxSTACK; memset(ctx, 0, sizeof(*ctx)); ctx->gr = gr; - ctx->dst = dst; - ctx->dstArg = dstArg; - ctx->src = src; + ctx->d0 = d0; + ctx->d1 = d1; + ctx->s0 = s0; ctx->reduxLen = reduxLen; ctx->reduxList = reduxList; ctx->flags = flags; @@ -497,7 +534,7 @@ GPUARRAY_PUBLIC int GpuReduction_call (GpuReduction* gr, * @return Zero if successful; Non-zero if the datatype is not supported. */ -static int reduxGetSumInit (int typecode, const char** property){ +static int reduxGetSumInit (int typecode, const char** property){ if (typecode == GA_POINTER || typecode == GA_BUFFER){ return GA_UNSUPPORTED_ERROR; @@ -517,7 +554,7 @@ static int reduxGetSumInit (int typecode, const char** prop * @return Zero if successful; Non-zero if the datatype is not supported. 
*/ -static int reduxGetProdInit (int typecode, const char** property){ +static int reduxGetProdInit (int typecode, const char** property){ if (typecode == GA_POINTER || typecode == GA_BUFFER){ return GA_UNSUPPORTED_ERROR; @@ -537,7 +574,7 @@ static int reduxGetProdInit (int typecode, const char** prop * @return Zero if successful; Non-zero if the datatype is not supported. */ -static int reduxGetMinInit (int typecode, const char** property){ +static int reduxGetMinInit (int typecode, const char** property){ switch (typecode){ case GA_BYTE2: case GA_BYTE3: @@ -627,7 +664,7 @@ static int reduxGetMinInit (int typecode, const char** prop * @return Zero if successful; Non-zero if the datatype is not supported. */ -static int reduxGetMaxInit (int typecode, const char** property){ +static int reduxGetMaxInit (int typecode, const char** property){ switch (typecode){ case GA_BOOL: *property = "1"; @@ -726,7 +763,7 @@ static int reduxGetMaxInit (int typecode, const char** prop * @return Zero if successful; Non-zero if the datatype is not supported. */ -static int reduxGetAndInit (int typecode, const char** property){ +static int reduxGetAndInit (int typecode, const char** property){ if (typecode == GA_POINTER || typecode == GA_BUFFER){ return GA_UNSUPPORTED_ERROR; @@ -746,7 +783,7 @@ static int reduxGetAndInit (int typecode, const char** prop * @return Zero if successful; Non-zero if the datatype is not supported. */ -static int reduxGetOrInit (int typecode, const char** property){ +static int reduxGetOrInit (int typecode, const char** property){ if (typecode == GA_POINTER || typecode == GA_BUFFER){ return GA_UNSUPPORTED_ERROR; @@ -756,7 +793,7 @@ static int reduxGetOrInit (int typecode, const char** prop } /** - * @brief Returns whether the reduction is sensitive. + * @brief Returns whether the reduction is "sensitive". * * A reduction is sensitive when its output satisfies at least one of the * following conditions: @@ -782,8 +819,8 @@ static int reduxGetOrInit (int typecode, const char** prop * . */ -static int reduxIsSensitive (int typecode){ - switch (typecode){ +static int reduxIsSensitive (int op){ + switch (op){ case GA_REDUCE_MINANDARGMIN: case GA_REDUCE_MAXANDARGMAX: case GA_REDUCE_ARGMIN: @@ -794,6 +831,95 @@ static int reduxIsSensitive (int typecode){ } } +/** + * Get a name for the op, usable within a C identifier. + */ + +static const char* reduxGetOpName (int op){ + switch (op){ + case GA_REDUCE_SUM: return "Sum"; + case GA_REDUCE_PROD: return "Prod"; + case GA_REDUCE_PRODNZ: return "ProdNonZero"; + case GA_REDUCE_MIN: return "Min"; + case GA_REDUCE_MAX: return "Max"; + case GA_REDUCE_ARGMIN: return "Argmin"; + case GA_REDUCE_ARGMAX: return "Argmax"; + case GA_REDUCE_MINANDARGMIN: return "MinAndArgmin"; + case GA_REDUCE_MAXANDARGMAX: return "MaxAndArgmax"; + case GA_REDUCE_AND: return "And"; + case GA_REDUCE_OR: return "Or"; + case GA_REDUCE_XOR: return "Xor"; + case GA_REDUCE_ALL: return "All"; + case GA_REDUCE_ANY: return "Any"; + default: return NULL; + } +} + +/** + * Whether or not the typecode is a floating-point type. 
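 * For example (illustrative, following the switch below):
 *
 *     reduxIsFloatingPoint(GA_HALF)   -> 1
 *     reduxIsFloatingPoint(GA_CFLOAT) -> 1
 *     reduxIsFloatingPoint(GA_INT)    -> 0
 *
 * reduxGenInferProperties() uses this predicate to reject the bitwise
 * reductions (GA_REDUCE_AND/OR/XOR) on floating-point source types.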
+ */ + +static int reduxIsFloatingPoint (int typecode){ + switch(typecode){ + case GA_HALF: + case GA_HALF2: + case GA_HALF4: + case GA_HALF8: + case GA_HALF16: + case GA_FLOAT: + case GA_FLOAT2: + case GA_FLOAT4: + case GA_FLOAT8: + case GA_FLOAT16: + case GA_DOUBLE: + case GA_DOUBLE2: + case GA_DOUBLE4: + case GA_DOUBLE8: + case GA_DOUBLE16: + case GA_QUAD: + case GA_CFLOAT: + case GA_CDOUBLE: + case GA_CQUAD: + return 1; + default: + return 0; + } +} + +/** + * Compute ceil(log2(x)). + */ + +static unsigned reduxCeilLog2 (uint64_t x){ + int i; + + if (x <= 1){ + return 1; + } + for (i=0,x--;x;i++,x>>=1){} + return i; +} + +/** + * Compute next power of 2. + * + * If x is a power of two already, return x. + */ + +static uint64_t reduxNextPow2 (uint64_t x){ + if (x & (x-1)){ + x |= x >> 1; + x |= x >> 2; + x |= x >> 4; + x |= x >> 8; + x |= x >> 16; + x |= x >> 32; + return x+1; + }else{ + return x; + } +} + /** * @brief Sort the axes into optimal order for flattening. * @@ -816,7 +942,7 @@ static int reduxIsSensitive (int typecode){ * 5. then by increasing source axis number. */ -static int reduxSortFlatInsensitive (const void* a, const void* b){ +static int reduxSortFlatInsensitive (const void* a, const void* b){ const axis_desc* xda = (const axis_desc*)a; const axis_desc* xdb = (const axis_desc*)b; @@ -826,15 +952,15 @@ static int reduxSortFlatInsensitive (const void* a, const void* b){ return -1; } - if (axisGetSrcAbsStride(xda) < axisGetSrcAbsStride(xdb)){ + if (axisGetS0AbsStride(xda) < axisGetS0AbsStride(xdb)){ return +1; - }else if (axisGetSrcAbsStride(xda) > axisGetSrcAbsStride(xdb)){ + }else if (axisGetS0AbsStride(xda) > axisGetS0AbsStride(xdb)){ return -1; } return 0; } -static int reduxSortFlatSensitive (const void* a, const void* b){ +static int reduxSortFlatSensitive (const void* a, const void* b){ const axis_desc* xda = (const axis_desc*)a; const axis_desc* xdb = (const axis_desc*)b; @@ -847,9 +973,9 @@ static int reduxSortFlatSensitive (const void* a, const void* b){ if (axisIsReduced(xda)){ return axisGetReduxNum(xda) axisGetSrcAbsStride(xdb)){ + }else if (axisGetS0AbsStride(xda) > axisGetS0AbsStride(xdb)){ return -1; } @@ -863,19 +989,19 @@ static int reduxSortFlatSensitive (const void* a, const void* b){ * This means ascending order of absolute stride. 
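 * A small worked example (editorial; strides in bytes): for a C-contiguous
 * GA_FLOAT source of shape (2, 3, 5),
 *
 *     strides = (60, 20, 4)
 *     order   = axis 2 (4), then axis 1 (20), then axis 0 (60)
 *
 * i.e. the fastest-varying (smallest-stride) axis is considered first.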
*/ -static int reduxSortPtrIBSrcRdSelect (const void* a, const void* b){ +static int reduxSortPtrS0AbsStride (const void* a, const void* b){ const axis_desc* xda = *(const axis_desc* const*)a; const axis_desc* xdb = *(const axis_desc* const*)b; - if (axisGetSrcAbsStride(xda) < axisGetSrcAbsStride(xdb)){ + if (axisGetS0AbsStride(xda) < axisGetS0AbsStride(xdb)){ return -1; - }else if (axisGetSrcAbsStride(xda) > axisGetSrcAbsStride(xdb)){ + }else if (axisGetS0AbsStride(xda) > axisGetS0AbsStride(xdb)){ return +1; } return 0; } -static int reduxSortPtrByReduxNum (const void* a, const void* b){ +static int reduxSortPtrByReduxNum (const void* a, const void* b){ const axis_desc* xda = *(const axis_desc* const*)a; const axis_desc* xdb = *(const axis_desc* const*)b; @@ -893,7 +1019,7 @@ static int reduxSortPtrByReduxNum (const void* a, const void* b){ return 0; } -static int reduxSortPtrIBDstWrSelect (const void* a, const void* b){ +static int reduxSortPtrD0WrSelect (const void* a, const void* b){ const axis_desc* xda = *(const axis_desc* const*)a; const axis_desc* xdb = *(const axis_desc* const*)b; @@ -919,15 +1045,15 @@ static int reduxSortPtrIBDstWrSelect (const void* a, const void* b){ } /* Otherwise it's sort by destination absolute stride. */ - if (axisGetDstAbsStride(xda) < axisGetDstAbsStride(xdb)){ + if (axisGetD0AbsStride(xda) < axisGetD0AbsStride(xdb)){ return -1; - }else if (axisGetDstAbsStride(xda) > axisGetDstAbsStride(xdb)){ + }else if (axisGetD0AbsStride(xda) > axisGetD0AbsStride(xdb)){ return +1; } return 0; } -static int reduxSortPtrIBDstArgWrSelect (const void* a, const void* b){ +static int reduxSortPtrD1WrSelect (const void* a, const void* b){ const axis_desc* xda = *(const axis_desc* const*)a; const axis_desc* xdb = *(const axis_desc* const*)b; @@ -953,15 +1079,15 @@ static int reduxSortPtrIBDstArgWrSelect (const void* a, const void* b){ } /* Otherwise it's sort by destination argument absolute stride. */ - if (axisGetDstArgAbsStride(xda) < axisGetDstArgAbsStride(xdb)){ + if (axisGetD1AbsStride(xda) < axisGetD1AbsStride(xdb)){ return -1; - }else if (axisGetDstArgAbsStride(xda) > axisGetDstArgAbsStride(xdb)){ + }else if (axisGetD1AbsStride(xda) > axisGetD1AbsStride(xdb)){ return +1; } return 0; } -static int reduxSortPtrInsertFinalOrder (const void* a, const void* b){ +static int reduxSortPtrInsertFinalOrder (const void* a, const void* b){ const axis_desc* xda = *(const axis_desc* const*)a; const axis_desc* xdb = *(const axis_desc* const*)b; @@ -973,7 +1099,7 @@ static int reduxSortPtrInsertFinalOrder (const void* a, const void* b){ return +1; } - if(axisIsIntra(xda)){ + if (axisIsIntra(xda)){ /** * Intra axes sort between themselves by descending intra axis number. */ @@ -999,9 +1125,9 @@ static int reduxSortPtrInsertFinalOrder (const void* a, const void* b){ return +1; } - if (axisGetSrcAbsStride(xda) < axisGetSrcAbsStride(xdb)){ + if (axisGetS0AbsStride(xda) < axisGetS0AbsStride(xdb)){ return -1; - }else if (axisGetSrcAbsStride(xda) > axisGetSrcAbsStride(xdb)){ + }else if (axisGetS0AbsStride(xda) > axisGetS0AbsStride(xdb)){ return +1; } } @@ -1016,28 +1142,28 @@ static int reduxSortPtrInsertFinalOrder (const void* a, const void* b){ * @brief Initialize Axis Description. 
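 * For instance (editorial sketch using the accessors defined below), after
 *
 *     axis_desc ax;
 *     axisInit(&ax, 128, 512);   // length 128, source stride 512 bytes
 *
 * one has axisGetLen(&ax) == 128, axisGetS0Stride(&ax) == 512, and
 * axisIsReduced(&ax) == 0 until axisMarkReduced() is called on the axis.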
*/ -static void axisInit (axis_desc* axis, - ssize_t len, - ssize_t srcStride){ +static void axisInit (axis_desc* axis, + ssize_t len, + ssize_t s0S){ memset(axis, 0, sizeof(*axis)); - axis->reduxNum = -1; - axis->ibNum = -1; - axis->ibp = 0; - axis->len = len; - axis->splitLen = 1; - axis->pdim = 0; + axis->reduxNum = -1; + axis->ibNum = -1; + axis->perm = 0; + axis->len = len; + axis->splitLen = 1; + axis->i0S = 0; - axis->srcStride = srcStride; - axis->dstStride = 0; - axis->dstArgStride = 0; + axis->s0S = s0S; + axis->d0S = 0; + axis->d1S = 0; } /** * @brief Mark axis as reduction axis, with position reduxNum in the axis list. */ -static void axisMarkReduced (axis_desc* axis, int reduxNum){ +static void axisMarkReduced (axis_desc* axis, int reduxNum){ axis->isReduced = 1; axis->reduxNum = reduxNum; } @@ -1046,9 +1172,9 @@ static void axisMarkReduced (axis_desc* axis, int r * @brief Mark axis as (split) intrablock axis. */ -static void axisMarkIntraBlock (axis_desc* axis, - int ibNum, - size_t ibLen){ +static void axisMarkIntraBlock (axis_desc* axis, + int ibNum, + size_t ibLen){ axis->isIntra = 1; axis->ibNum = ibNum; axis->splitLen = ibLen; @@ -1058,13 +1184,13 @@ static void axisMarkIntraBlock (axis_desc* axis, * @brief Get properties of an axis. */ -static int axisGetReduxNum (const axis_desc* axis){ +static int axisGetReduxNum (const axis_desc* axis){ return axis->reduxNum; } -static size_t axisGetLen (const axis_desc* axis){ +static size_t axisGetLen (const axis_desc* axis){ return axis->len; } -static size_t axisGetIntraLen (const axis_desc* axis){ +static size_t axisGetIntraLen (const axis_desc* axis){ if (axisIsSplit(axis)){ return axis->splitLen; }else if (axisIsIntra(axis)){ @@ -1073,7 +1199,7 @@ static size_t axisGetIntraLen (const axis_desc* axis){ return 1; } } -static size_t axisGetInterLen (const axis_desc* axis){ +static size_t axisGetInterLen (const axis_desc* axis){ if (axisIsSplit(axis)){ return DIVIDECEIL(axis->len, axis->splitLen); }else if (axisIsIntra(axis)){ @@ -1082,88 +1208,77 @@ static size_t axisGetInterLen (const axis_desc* axis){ return axis->len; } } -static size_t axisGetIntraInterLen (const axis_desc* axis){ +static size_t axisGetIntraInterLen (const axis_desc* axis){ return axisGetIntraLen(axis)*axisGetInterLen(axis); } -static ssize_t axisGetSrcStride (const axis_desc* axis){ - return axisGetLen(axis) > 1 ? axis->srcStride : 0; +static ssize_t axisGetS0Stride (const axis_desc* axis){ + return axisGetLen(axis) > 1 ? axis->s0S : 0; } -static size_t axisGetSrcAbsStride (const axis_desc* axis){ - return axisGetSrcStride(axis)<0 ? -(size_t)axisGetSrcStride(axis): - +(size_t)axisGetSrcStride(axis); +static size_t axisGetS0AbsStride (const axis_desc* axis){ + return axisGetS0Stride(axis)<0 ? -(size_t)axisGetS0Stride(axis): + +(size_t)axisGetS0Stride(axis); } -static ssize_t axisGetDstStride (const axis_desc* axis){ - return axisGetLen(axis) > 1 ? axis->dstStride : 0; +static ssize_t axisGetD0Stride (const axis_desc* axis){ + return axisGetLen(axis) > 1 ? axis->d0S : 0; } -static size_t axisGetDstAbsStride (const axis_desc* axis){ - return axisGetDstStride(axis)<0 ? -(size_t)axisGetDstStride(axis): - +(size_t)axisGetDstStride(axis); +static size_t axisGetD0AbsStride (const axis_desc* axis){ + return axisGetD0Stride(axis)<0 ? -(size_t)axisGetD0Stride(axis): + +(size_t)axisGetD0Stride(axis); } -static ssize_t axisGetDstArgStride (const axis_desc* axis){ - return axisGetLen(axis) > 1 ? 
axis->dstArgStride : 0; +static ssize_t axisGetD1Stride (const axis_desc* axis){ + return axisGetLen(axis) > 1 ? axis->d1S : 0; } -static size_t axisGetDstArgAbsStride (const axis_desc* axis){ - return axisGetDstArgStride(axis)<0 ? -(size_t)axisGetDstArgStride(axis): - +(size_t)axisGetDstArgStride(axis); +static size_t axisGetD1AbsStride (const axis_desc* axis){ + return axisGetD1Stride(axis)<0 ? -(size_t)axisGetD1Stride(axis): + +(size_t)axisGetD1Stride(axis); } -static unsigned axisGetIBP (const axis_desc* axis){ - return axis->ibp; +static size_t axisGetI0Stride (const axis_desc* axis){ + return axis->i0S; } -static int axisGetIBNum (const axis_desc* axis){ - return axis->ibNum; +static void axisSetI0Stride (axis_desc* axis, + size_t i0S){ + axis->i0S = i0S; } -static void axisSetIBP (axis_desc* axis, - unsigned ibp){ - axis->ibp = ibp; +static unsigned axisGetPerm (const axis_desc* axis){ + return axis->perm; } -static size_t axisGetPDim (const axis_desc* axis){ - return axis->pdim; +static int axisGetIBNum (const axis_desc* axis){ + return axis->ibNum; } -static void axisSetPDim (axis_desc* axis, - size_t pdim){ - axis->pdim = pdim; +static void axisSetPerm (axis_desc* axis, + unsigned perm){ + axis->perm = perm; } -static int axisIsReduced (const axis_desc* axis){ +static int axisIsReduced (const axis_desc* axis){ return axis->isReduced; } -static int axisIsIntra (const axis_desc* axis){ +static int axisIsIntra (const axis_desc* axis){ return axis->isIntra; } -static int axisIsInter (const axis_desc* axis){ +static int axisIsInter (const axis_desc* axis){ return !axisIsIntra(axis); } -static int axisIsSplit (const axis_desc* axis){ +static int axisIsSplit (const axis_desc* axis){ return axisIsIntra(axis) && axis->splitLen != axis->len; } -static size_t reduxInvEstimateParallelism (const redux_ctx* ctx){ +static size_t reduxInvEstimateParallelism (const redux_ctx* ctx){ return reduxGenEstimateParallelism(ctx->gr); } -static int reduxInvRequiresDst (const redux_ctx* ctx){ - return reduxGenRequiresDst(ctx->gr); +static int reduxInvRequiresS0 (const redux_ctx* ctx){ + return reduxGenRequiresS0(ctx->gr); } -static int reduxInvRequiresDstArg (const redux_ctx* ctx){ - return reduxGenRequiresDstArg(ctx->gr); +static int reduxInvRequiresD0 (const redux_ctx* ctx){ + return reduxGenRequiresD0(ctx->gr); } -static unsigned reduxInvGetSplitFree (const redux_ctx* ctx){ - if(ctx->xdSplit && !axisIsReduced(ctx->xdSplit)){ - return axisGetIntraLen(ctx->xdSplit); - }else{ - return 1; - } -} -static unsigned reduxInvGetSplitReduce (const redux_ctx* ctx){ - if(ctx->xdSplit && axisIsReduced(ctx->xdSplit)){ - return axisGetIntraLen(ctx->xdSplit); - }else{ - return 1; - } +static int reduxInvRequiresD1 (const redux_ctx* ctx){ + return reduxGenRequiresD1(ctx->gr); } /** * @brief Get description of source axis with given number. */ -static axis_desc* reduxInvGetSrcAxis (const redux_ctx* ctx, int i){ +static axis_desc* reduxInvGetSrcAxis (const redux_ctx* ctx, int i){ return &ctx->xdSrc[i]; } @@ -1171,7 +1286,7 @@ static axis_desc* reduxInvGetSrcAxis (const redux_ctx* ctx, int i){ * @brief Get description of source axis with given number in sort-order. */ -static axis_desc* reduxInvGetSrcSortAxis (const redux_ctx* ctx, int i){ +static axis_desc* reduxInvGetSrcSortAxis (const redux_ctx* ctx, int i){ return ctx->xdSrcPtrs[i]; } @@ -1187,10 +1302,10 @@ static axis_desc* reduxInvGetSrcSortAxis (const redux_ctx* ctx, int i){ * @return Non-zero if flattening attempt successful; Zero otherwise. 
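 * For example (editorial): an axis of length 1 contributes nothing to any
 * index computation, so it can always be dropped; a source of shape
 * (4, 1, 5) is flattened as if it were (4, 5). Likewise, when any reduction
 * axis has length 0 (ctx->zeroRdxAxes > 0), every reduction axis can be
 * dropped, since the reduction then runs over an empty set and the
 * destination presumably just receives the reduction's initial value.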
*/ -static int reduxTryFlattenOut (const redux_ctx* ctx, - const axis_desc* out){ - if ((axisGetLen (out) == 1 )|| - (axisIsReduced(out) && ctx->zeroRdxAxes > 0)){ +static int reduxTryFlattenOut (const redux_ctx* ctx, + const axis_desc* axis){ + if ((axisGetLen (axis) == 1 )|| + (axisIsReduced(axis) && ctx->zeroRdxAxes > 0)){ return 1; }else{ return 0; @@ -1218,66 +1333,66 @@ static int reduxTryFlattenOut (const redux_ctx* ctx, * @return Non-zero if flattening attempt successful; Zero otherwise. */ -static int reduxTryFlattenInto (redux_ctx* ctx, - axis_desc* into, - const axis_desc* from){ - int signSrc = 0, signDst = 0, signDstArg = 0, - reverseSrc = 0, reverseDst = 0, reverseDstArg = 0; +static int reduxTryFlattenInto (redux_ctx* ctx, + axis_desc* into, + const axis_desc* from){ + int signS0 = 0, signD0 = 0, signD1 = 0, + reverseS0 = 0, reverseD0 = 0, reverseD1 = 0; - if (axisIsReduced (into) != axisIsReduced (from) || - axisGetSrcAbsStride (into) != axisGetSrcAbsStride (from)*axisGetLen(from)){ + if (axisIsReduced (into) != axisIsReduced (from) || + axisGetS0AbsStride(into) != axisGetS0AbsStride(from)*axisGetLen(from)){ return 0; } - if (reduxInvRequiresDst (ctx) && - axisGetDstAbsStride (into) != axisGetDstAbsStride (from)*axisGetLen(from)){ + if (reduxInvRequiresD0(ctx) && + axisGetD0AbsStride(into) != axisGetD0AbsStride(from)*axisGetLen(from)){ return 0; } - if (reduxInvRequiresDstArg(ctx) && - axisGetDstArgAbsStride(into) != axisGetDstArgAbsStride(from)*axisGetLen(from)){ + if (reduxInvRequiresD1(ctx) && + axisGetD1AbsStride(into) != axisGetD1AbsStride(from)*axisGetLen(from)){ return 0; } - signSrc = (axisGetSrcStride (into)^axisGetSrcStride (from)) < 0; - signDst = (axisGetDstStride (into)^axisGetDstStride (from)) < 0; - signDstArg = (axisGetDstArgStride(into)^axisGetDstArgStride(from)) < 0; - reverseSrc = signSrc; - reverseDst = signDst && reduxInvRequiresDst (ctx); - reverseDstArg = signDstArg && reduxInvRequiresDstArg(ctx); + signS0 = (axisGetS0Stride(into)^axisGetS0Stride(from)) < 0; + signD0 = (axisGetD0Stride(into)^axisGetD0Stride(from)) < 0; + signD1 = (axisGetD1Stride(into)^axisGetD1Stride(from)) < 0; + reverseS0 = signS0; + reverseD0 = signD0 && reduxInvRequiresD0(ctx); + reverseD1 = signD1 && reduxInvRequiresD1(ctx); if (reduxIsSensitive(ctx->op)){ - if(reverseSrc || reverseDst || reverseDstArg){ + if (reverseS0 || reverseD0 || reverseD1){ return 0; } } - if (reduxInvRequiresDst (ctx) && - reduxInvRequiresDstArg(ctx) && - reverseDst != reverseDstArg){ + if (reduxInvRequiresD0(ctx) && + reduxInvRequiresD1(ctx) && + reverseD0 != reverseD1){ /* Either both, or neither, of dst and dstArg must require reversal. 
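 * (Editorial worked example of the checks above; strides in bytes. For a
 *  C-contiguous GA_FLOAT source of shape (4, 5), axis 0 has stride 20 and
 *  axis 1 has stride 4. With into = axis 0 and from = axis 1 the test
 *  20 == 4*5 passes, so the two merge into one axis of length 20 with
 *  stride 4. Had the strides carried opposite signs, one side would first
 *  be "reversed" by adding (len-1)*stride to the flattened offset and
 *  negating the stride, exactly as the code below does.)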
*/ return 0; } - if (reverseSrc){ - ctx->flatSrcOffset += (ssize_t)(axisGetLen(from)-1)*axisGetSrcStride(from); - into->srcStride = -axisGetSrcStride (from); + if (reverseS0){ + ctx->S0Off += (ssize_t)(axisGetLen(from)-1)*axisGetS0Stride(from); + into->s0S = -axisGetS0Stride(from); }else{ - into->srcStride = axisGetSrcStride (from); + into->s0S = axisGetS0Stride(from); } - if (reverseDst){ - ctx->flatDstOffset += (ssize_t)(axisGetLen(from)-1)*axisGetDstStride(from); - into->dstStride = -axisGetDstStride (from); + if (reverseD0){ + ctx->D0Off += (ssize_t)(axisGetLen(from)-1)*axisGetD0Stride(from); + into->d0S = -axisGetD0Stride(from); }else{ - into->dstStride = axisGetDstStride (from); + into->d0S = axisGetD0Stride(from); } - if (reverseDstArg){ - ctx->flatDstArgOffset += (ssize_t)(axisGetLen(from)-1)*axisGetDstArgStride(from); - into->dstArgStride = -axisGetDstArgStride(from); + if (reverseD1){ + ctx->D1Off += (ssize_t)(axisGetLen(from)-1)*axisGetD1Stride(from); + into->d1S = -axisGetD1Stride(from); }else{ - into->dstArgStride = axisGetDstArgStride(from); + into->d1S = axisGetD1Stride(from); } into->len *= axisGetLen(from); @@ -1290,13 +1405,13 @@ static int reduxTryFlattenInto (redux_ctx* ctx, * not touching the axes themselves. */ -static void reduxSortAxisPtrsBy (axis_desc** ptrs, - axis_desc* axes, - size_t numAxes, - int(*fn)(const void*, const void*)){ +static void reduxSortAxisPtrsBy (axis_desc** ptrs, + axis_desc* axes, + size_t numAxes, + int(*fn)(const void*, const void*)){ size_t i; - for(i=0;ikArgTypeCodes = NULL; gr->kSourceCode = NULL; gr->kErrorString = NULL; @@ -1323,23 +1438,27 @@ static int reduxGenInit (GpuReduction* gr){ * @brief Begin inferring the properties of the reduction operator. */ -static int reduxGenInferProperties (GpuReduction* gr){ - int i, ret; +static int reduxGenInferProperties (GpuReduction* gr){ + int i; /** * Insane arguments? */ - if(gr->ndr <= 0){ + if (gr->op < 0 || gr->op >= GA_REDUCE_ENDSUPPORTED){ + return reduxGenCleanupMsg(gr, GA_INVALID_ERROR, + "Unknown reduction operation!\n"); + } + if (gr->ndr <= 0){ return reduxGenCleanupMsg(gr, GA_INVALID_ERROR, "No reduction axes!\n"); } - if(gr->ndd < 0){ + if (gr->ndd < 0){ return reduxGenCleanupMsg(gr, GA_INVALID_ERROR, - "Destination has less than 0 dimensions!\n"); + "Destination tensor has less than 0 rank!\n"); } - if(gr->flags != 0){ + if (gr->flags != 0){ return reduxGenCleanupMsg(gr, GA_INVALID_ERROR, "\"flags\" must be set to 0!\n"); } @@ -1377,180 +1496,307 @@ static int reduxGenInferProperties (GpuReduction* gr){ * Type management. * * - Deal with the various typecodes. - * - Determine initializer and error out if reduction unsupported on that - * datatype. */ - gr->dstTypeCode = gr->srcTypeCode; - gr->dstArgTypeCode = GA_SSIZE; - gr->idxTypeCode = GA_SSIZE; - switch (gr->srcTypeCode){ + gr->TD0tc = gr->TS0tc; + gr->TD1tc = GA_SSIZE; + gr->TS32tc = GA_INT; + gr->TU32tc = GA_UINT; + gr->TS64tc = GA_LONG; + gr->TU64tc = GA_ULONG; + switch(gr->op){ + case GA_REDUCE_AND: + case GA_REDUCE_OR: + case GA_REDUCE_XOR: + if (reduxIsFloatingPoint(gr->TS0tc)){ + return reduxGenCleanupMsg(gr, GA_INVALID_ERROR, + "Bitwise operations not applicable to floating-point datatypes!\n"); + } + break; + default: + break; + } + reduxGenSetKTypes(gr); + + + /** + * Compute number of kernel arguments and construct kernel argument + * typecode list. 
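 * (Editorial sketch: reduxGenCountArgs and reduxGenSaveArgTypecodes are
 *  declared above and defined later in this file; given the iterator
 *  callback signature they are presumably along these lines, with `user'
 *  pointing at an int counter.)
 *
 *     static void reduxGenCountArgs       (const GpuReduction* gr,
 *                                          int typecode, const char* typeName,
 *                                          const char* baseName, int num,
 *                                          void* user){
 *         (*(int*)user)++;                       // one more kernel argument
 *     }
 *     static void reduxGenSaveArgTypecodes(const GpuReduction* gr,
 *                                          int typecode, const char* typeName,
 *                                          const char* baseName, int num,
 *                                          void* user){
 *         gr->kArgTypeCodes[(*(int*)user)++] = typecode;
 *     }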
+ */
+
+        reduxGenSetMaxBS(gr);
+        reduxGenIterArgs(gr, reduxGenCountArgs, &gr->kNumArgs);
+        gr->kArgTypeCodes = calloc(gr->kNumArgs, sizeof(*gr->kArgTypeCodes));
+        if (!gr->kArgTypeCodes){
+                return reduxGenCleanupMsg(gr, GA_MEMORY_ERROR,
+                                          "Failed to allocate memory for kernel arguments "
+                                          "typecode list!\n");
+        }
+        i = 0;
+        reduxGenIterArgs(gr, reduxGenSaveArgTypecodes, &i);
+
+
+        /* Generate source code. */
+        return reduxGenSrc(gr);
+}
+
+/**
+ * Compute maximum block size we shall support in generated kernels.
+ */
+
+static void reduxGenSetMaxBS (GpuReduction* gr){
+        gr->maxBS = gr->maxLM/reduxGenGetReduxStateSize(gr);
+        gr->maxBS = gr->maxBS < gr->maxLg ? gr->maxBS : gr->maxLg;
+        gr->maxBS = gr->maxBS < gr->maxL0 ? gr->maxBS : gr->maxL0;
+
+        /**
+         * In practice we want a moderate number of blocks, not just one monolith
+         * that occupies a processor for its entire lifetime. E.g., an NVIDIA GPU
+         * supports 1024 threads / block, but we shall gun for less than that.
+         *
+         * Our heuristic shall be to divide the maximum number of threads per
+         * block by 4, so that there are 4 times more blocks than there normally
+         * would be. This helps on many fronts:
+         *
+         *   - A smaller "tail effect" when the last huge block must wait its turn
+         *     and then delays the completion of the entire grid.
+         *   - The horizontal reductions take less time per block, and sometimes
+         *     horizontal reduction time can dominate performance.
+         *   - Less time taken for across-thread synchronization; and whenever a
+         *     block's threads are stalled waiting for synchronization, another
+         *     block's threads can fill in with their global memory requests.
+         */
+
+        if (gr->maxBS >= 16){
+                gr->maxBS /= 4;
+        }
+
+        /* Since ceil(log2(maxBS)) is also heavily used, compute it here. */
+        gr->log2MaxBS = reduxCeilLog2(gr->maxBS);
+}
+
+/**
+ * Decide on the TK* accumulator types and initializers we will use.
+ *
+ * Currently, the only special thing we do is to promote the accumulator type
+ * to GA_FLOATx if the source type is GA_HALFx:
+ *
+ *     TPS0 = promotion(TS0)
+ *
+ * Therefore, it is currently always the case that TK0 == TPS0.
+ *
+ * In the future this might become weirder when the accumulator is a Kahan
+ * summation, for instance, and then TK0 != promoted(TS0).
+ *
+ * If the user guarantees to us (perhaps through a flag) that TK1 can be made
+ * narrower than 64-bit unsigned, this is also where we would set it.
+ */
+
+static void reduxGenSetKTypes (GpuReduction* gr){
+        const gpuarray_type *TK0 = NULL, *TK1 = NULL, *TPS0 = NULL;
+        const char* TK0init = NULL;
+
+        /**
+         * Handle TPS0 type promotion....
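 * (Editorial summary of the promotion switch below and of the per-operator
 *  cases that follow it:)
 *
 *     TS0 = GA_HALF,  op = GA_REDUCE_SUM          ->  TPS0 = TK0 = GA_FLOAT, no TK1
 *     TS0 = GA_FLOAT, op = GA_REDUCE_MAXANDARGMAX ->  TK0 = GA_FLOAT, TK1 = GA_SIZE
 *     TS0 = GA_INT,   op = GA_REDUCE_PROD         ->  TPS0 = TK0 = GA_INT,  no TK1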
+ */ + + switch (gr->TS0tc){ case GA_HALF: - gr->accTypeCode = GA_FLOAT; + TPS0 = gpuarray_get_type(GA_FLOAT); break; case GA_HALF2: - gr->accTypeCode = GA_FLOAT2; + TPS0 = gpuarray_get_type(GA_FLOAT2); break; case GA_HALF4: - gr->accTypeCode = GA_FLOAT4; + TPS0 = gpuarray_get_type(GA_FLOAT4); break; case GA_HALF8: - gr->accTypeCode = GA_FLOAT8; + TPS0 = gpuarray_get_type(GA_FLOAT8); break; case GA_HALF16: - gr->accTypeCode = GA_FLOAT16; + TPS0 = gpuarray_get_type(GA_FLOAT16); break; default: - gr->accTypeCode = gr->srcTypeCode; - } - gr->srcTypeStr = gpuarray_get_type(gr->srcTypeCode) ->cluda_name; - gr->dstTypeStr = gpuarray_get_type(gr->dstTypeCode) ->cluda_name; - gr->dstArgTypeStr = gpuarray_get_type(gr->dstArgTypeCode)->cluda_name; - gr->idxTypeStr = gpuarray_get_type(gr->idxTypeCode) ->cluda_name; - gr->accTypeStr = gpuarray_get_type(gr->accTypeCode) ->cluda_name; - if (!gr->srcTypeStr || - !gr->dstTypeStr || - !gr->dstArgTypeStr || - !gr->idxTypeStr || - !gr->accTypeStr ){ - return reduxGenCleanupMsg(gr, GA_INVALID_ERROR, - "Have typecode with no CLUDA name!\n"); + TPS0 = gpuarray_get_type(gr->TS0tc); } + gr->TPS0tc = TPS0->typecode; + + + /** + * Each operator may define and initialize TK0 and/or TK1 any way + * they want. + */ + switch (gr->op){ case GA_REDUCE_SUM: - ret = reduxGetSumInit (gr->accTypeCode, &gr->initVal); + TK0 = TPS0; + reduxGetSumInit (TK0->typecode, &TK0init); + gr->TK0.align = TK0->align; + gr->TK0.size = TK0->size; + sprintf(gr->TK0.defn, "%s", TK0->cluda_name); + sprintf(gr->TK0.init, "%s", TK0init); break; case GA_REDUCE_PRODNZ: case GA_REDUCE_PROD: - ret = reduxGetProdInit(gr->accTypeCode, &gr->initVal); + TK0 = TPS0; + reduxGetProdInit(TK0->typecode, &TK0init); + gr->TK0.align = TK0->align; + gr->TK0.size = TK0->size; + sprintf(gr->TK0.defn, "%s", TK0->cluda_name); + sprintf(gr->TK0.init, "%s", TK0init); break; case GA_REDUCE_MINANDARGMIN: case GA_REDUCE_ARGMIN: case GA_REDUCE_MIN: - ret = reduxGetMinInit (gr->accTypeCode, &gr->initVal); + TK0 = TPS0; + TK1 = gpuarray_get_type(GA_SIZE); + reduxGetMinInit (TK0->typecode, &TK0init); + gr->TK0.align = TK0->align; + gr->TK0.size = TK0->size; + sprintf(gr->TK0.defn, "%s", TK0->cluda_name); + sprintf(gr->TK0.init, "%s", TK0init); + gr->TK1.align = TK1->align; + gr->TK1.size = TK1->size; + sprintf(gr->TK1.defn, "%s", TK1->cluda_name); + sprintf(gr->TK1.init, "0"); break; case GA_REDUCE_MAXANDARGMAX: case GA_REDUCE_ARGMAX: case GA_REDUCE_MAX: - ret = reduxGetMaxInit (gr->accTypeCode, &gr->initVal); + TK0 = TPS0; + TK1 = gpuarray_get_type(GA_SIZE); + reduxGetMaxInit (TK0->typecode, &TK0init); + gr->TK0.align = TK0->align; + gr->TK0.size = TK0->size; + sprintf(gr->TK0.defn, "%s", TK0->cluda_name); + sprintf(gr->TK0.init, "%s", TK0init); + gr->TK1.align = TK1->align; + gr->TK1.size = TK1->size; + sprintf(gr->TK1.defn, "%s", TK1->cluda_name); + sprintf(gr->TK1.init, "0"); break; case GA_REDUCE_ALL: case GA_REDUCE_AND: - ret = reduxGetAndInit (gr->accTypeCode, &gr->initVal); + TK0 = TPS0; + reduxGetAndInit (TK0->typecode, &TK0init); + gr->TK0.align = TK0->align; + gr->TK0.size = TK0->size; + sprintf(gr->TK0.defn, "%s", TK0->cluda_name); + sprintf(gr->TK0.init, "%s", TK0init); break; case GA_REDUCE_ANY: case GA_REDUCE_XOR: case GA_REDUCE_OR: - ret = reduxGetOrInit (gr->accTypeCode, &gr->initVal); + TK0 = TPS0; + reduxGetOrInit (TK0->typecode, &TK0init); + gr->TK0.align = TK0->align; + gr->TK0.size = TK0->size; + sprintf(gr->TK0.defn, "%s", TK0->cluda_name); + sprintf(gr->TK0.init, "%s", TK0init); break; default: - ret 
= GA_UNSUPPORTED_ERROR; - } - if (ret != GA_NO_ERROR){ - return reduxGenCleanupMsg(gr, ret, - "Problem selecting types to be used in reduction!\n"); - } - - - /* Compute floor(log2(gr->log2MaxL)). */ - gr->log2MaxL = gr->maxLg-1; - for(i=1;gr->log2MaxL & (gr->log2MaxL+1);i*=2){ - gr->log2MaxL |= gr->log2MaxL>>i; - } - for(i=0;gr->log2MaxL;i++){ - gr->log2MaxL >>= 1; - } - gr->log2MaxL = i?i:1; - - - /** - * Compute number of kernel arguments and construct kernel argument - * typecode list. - */ - - reduxGenIterArgs(gr, reduxGenCountArgs, 0); - gr->kArgTypeCodes = calloc(gr->kNumArgs, sizeof(*gr->kArgTypeCodes)); - if(!gr->kArgTypeCodes){ - return reduxGenCleanupMsg(gr, GA_MEMORY_ERROR, - "Failed to allocate memory for kernel arguments " - "typecode list!\n"); + ;/* Unreachable */ } - i = 0; - reduxGenIterArgs(gr, reduxGenSaveArgTypecodes, &i); - - - /* Generate source code. */ - return reduxGenSrc(gr); } /** * Iterate over the arguments of the reduction operator. */ -static void reduxGenIterArgs (GpuReduction* gr, - GpuReductionIterFn fn, - void* user){ +static void reduxGenIterArgs (const GpuReduction* gr, + GpuReductionIterFn fn, + void* user){ int k; - fn(gr, GA_INT, "int", "phase", 0, user); - fn(gr, GA_SIZE, "TX", "U", 0, user); - fn(gr, GA_SIZE, "TX", "V", 0, user); - fn(gr, GA_SIZE, "TX", "B", 0, user); - fn(gr, GA_UINT, "unsigned", "D", 0, user); - fn(gr, GA_UINT, "unsigned", "H", 0, user); - fn(gr, GA_UINT, "unsigned", "splitFree", 0, user); - fn(gr, GA_UINT, "unsigned", "splitReduce", 0, user); - for(k=0;k < gr->nds;k++){ - fn(gr, GA_SIZE, "TX", "l%d", k, user); - } - for(k=gr->ndd;k < gr->nds && reduxGenRequiresDstArg(gr);k++){ - fn(gr, GA_SIZE, "TX", "l%dPDim", k, user); - } - fn(gr, GA_BUFFER, "const GLOBAL_MEM char* restrict", "s", 0, user); - fn(gr, GA_SSIZE, "TX", "sOff", 0, user); - for(k=0;k < gr->nds;k++){ - fn(gr, GA_SIZE, "TX", "sJ%d", k, user); - } - if(reduxGenRequiresDst (gr)){ - fn(gr, GA_BUFFER, "GLOBAL_MEM char* restrict", "d", 0, user); - fn(gr, GA_SSIZE, "TX", "dOff", 0, user); - for(k=0;k < gr->ndd;k++){ - fn(gr, GA_SIZE, "TX", "dJ%d", k, user); + /** + * Template selector + */ + + fn(gr, gr->TU32tc, "TU32", "selector", 0, user); + + /** + * "Universal" parameters describing the partitioning of the problem. 
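 * (Editorial note, inferred from the block-decode logic later in this file:
 *  U is the total flattened work volume, V the volume assigned to a single
 *  block, and B the reduction volume per destination element, so multiples
 *  of B are destination-element boundaries. For instance, with
 *
 *     U = 1000000, V = 4096, B = 500, GID_0 = 3
 *     left = 3*4096 = 12288, v = 4096, range = [12288, 16384)
 *
 *  the block is left-misaligned (12288 % 500 != 0), right-misaligned
 *  (16384 % 500 != 0) and still finishes complete destination elements
 *  (12288/500 = 24 != 16384/500 = 32), i.e. the "111" case of the table
 *  further down.)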
+ */ + + fn(gr, gr->TU64tc, "TU64", "U", 0, user); + fn(gr, gr->TU64tc, "TU64", "V", 0, user); + fn(gr, gr->TU64tc, "TU64", "B", 0, user); + fn(gr, gr->TU32tc, "TU32", "D", 0, user); + fn(gr, gr->TU32tc, "TU32", "Dunit", 0, user); + fn(gr, gr->TU32tc, "TU32", "H", 0, user); + + /* Global Lattice Coordinates */ + fn(gr, gr->TU32tc, "TU32", "LSlice", 0, user); + fn(gr, gr->TU32tc, "TU64", "LPadded", 0, user); + for (k=0;k < gr->nds;k++){ + fn(gr, gr->TU64tc, "TU64", "L%d", k, user); + } + for (k=0;k < gr->log2MaxBS;k++){ + fn(gr, gr->TU32tc, "TU32", "L%di", k, user); + } + + /* S0 Lattice */ + if (reduxGenKernelRequiresLatticeS0(gr)){ + fn(gr, GA_BUFFER, "const GLOBAL_MEM char* restrict", "S0", 0, user); + fn(gr, gr->TS64tc, "TS64", "S0Off", 0, user); + for (k=0;k < gr->nds;k++){ + fn(gr, gr->TS64tc, "TS64", "S0J%d", k, user); } - } - if(reduxGenRequiresDstArg(gr)){ - fn(gr, GA_BUFFER, "GLOBAL_MEM char* restrict", "a", 0, user); - fn(gr, GA_SSIZE, "TX", "aOff", 0, user); - for(k=0;k < gr->ndd;k++){ - fn(gr, GA_SIZE, "TX", "aJ%d", k, user); + for (k=0;k < gr->log2MaxBS;k++){ + fn(gr, gr->TS64tc, "TS64", "S0S%di", k, user); } } - fn(gr, GA_BUFFER, "GLOBAL_MEM char* restrict", "w", 0, user); - if(reduxGenKernelRequiresDst (gr)){ - fn(gr, GA_SSIZE, "TX", "wdOff", 0, user); - fn(gr, GA_SSIZE, "TX", "pdOff", 0, user); - } - if(reduxGenKernelRequiresDstArg(gr)){ - fn(gr, GA_SSIZE, "TX", "waOff", 0, user); - fn(gr, GA_SSIZE, "TX", "paOff", 0, user); - } - for(k=0;k < gr->log2MaxL;k++){ - fn(gr, GA_UINT, "unsigned", "ibs%d", k, user); - } - for(k=0;k < gr->log2MaxL;k++){ - fn(gr, GA_UINT, "unsigned", "ibp%d", k, user); + + /* d0 Lattice */ + if (reduxGenKernelRequiresLatticeD0(gr)){ + fn(gr, GA_BUFFER, "GLOBAL_MEM char* restrict", "D0", 0, user); + fn(gr, gr->TS64tc, "TS64", "D0Off", 0, user); + for (k=0;k < gr->ndd;k++){ + fn(gr, gr->TS64tc, "TS64", "D0J%d", k, user); + } + for (k=0;k < gr->log2MaxBS;k++){ + fn(gr, gr->TS64tc, "TS64", "D0S%di", k, user); + } } - for(k=0;k < gr->log2MaxL && reduxGenRequiresDstArg(gr);k++){ - fn(gr, GA_SIZE, "TX", "ibl%dPDim", k, user); + + /* D1 Lattice */ + if (reduxGenKernelRequiresLatticeD1(gr)){ + fn(gr, GA_BUFFER, "GLOBAL_MEM char* restrict", "D1", 0, user); + fn(gr, gr->TS64tc, "TS64", "D1Off", 0, user); + for (k=0;k < gr->ndd;k++){ + fn(gr, gr->TS64tc, "TS64", "D1J%d", k, user); + } + for (k=0;k < gr->log2MaxBS;k++){ + fn(gr, gr->TS64tc, "TS64", "D1S%di", k, user); + } } - for(k=0;k < gr->log2MaxL;k++){ - fn(gr, GA_SSIZE, "TX", "ibsOff%d", k, user); + + /* I0 Lattice */ + if (reduxGenKernelRequiresLatticeI0(gr)){ + for (k=0;k < gr->nds;k++){ + fn(gr, gr->TS64tc, "TS64", "I0J%d", k, user); + } + for (k=0;k < gr->log2MaxBS;k++){ + fn(gr, gr->TS64tc, "TS64", "I0S%di", k, user); + } } - for(k=0;k < gr->log2MaxL && reduxGenRequiresDst (gr);k++){ - fn(gr, GA_SSIZE, "TX", "ibdOff%d", k, user); + + /* Workspace */ + if (reduxGenKernelRequiresWspace(gr)){ + fn(gr, GA_BUFFER, "GLOBAL_MEM char* restrict", "W", 0, user); + if (reduxGenKernelRequiresStateK0(gr)){ + fn(gr, gr->TS64tc, "TS64", "W0Off", 0, user); + fn(gr, gr->TS64tc, "TS64", "SHMEMK0Off", 0, user); + } + if (reduxGenKernelRequiresStateK1(gr)){ + fn(gr, gr->TS64tc, "TS64", "W1Off", 0, user); + fn(gr, gr->TS64tc, "TS64", "SHMEMK1Off", 0, user); + } } - for(k=0;k < gr->log2MaxL && reduxGenRequiresDstArg(gr);k++){ - fn(gr, GA_SSIZE, "TX", "ibaOff%d", k, user); + + /* Intra-Block Permute Core */ + for (k=0;k < gr->log2MaxBS;k++){ + fn(gr, gr->TU32tc, "TU32", "perm%di", k, user); } } @@ -1561,6 +1807,9 @@ static 
void reduxGenIterArgs (GpuReduction* gr, */ static int reduxGenSrc (GpuReduction* gr){ + sprintf(gr->kName, "reduxKernel%s_f%d_r%d", + reduxGetOpName(gr->op), gr->ndd, gr->ndr); + reduxGenSrcAppend(gr); gr->kSourceCodeLen = gr->s.l; @@ -1580,10 +1829,9 @@ static int reduxGenSrc (GpuReduction* gr){ */ static void reduxGenSrcAppend (GpuReduction* gr){ - reduxGenSrcAppendIncludes (gr); - reduxGenSrcAppendMacroDefs (gr); - reduxGenSrcAppendTypedefs (gr); - reduxGenSrcAppendReduxKernel (gr); + reduxGenSrcAppendIncludes (gr); + reduxGenSrcAppendMacroTypedefs(gr); + reduxGenSrcAppendReduxKernel (gr); } static void reduxGenSrcAppendIncludes (GpuReduction* gr){ srcbAppends(&gr->srcGen, "/* Includes */\n"); @@ -1592,61 +1840,80 @@ static void reduxGenSrcAppendIncludes (GpuReduction* gr){ srcbAppends(&gr->srcGen, "\n"); srcbAppends(&gr->srcGen, "\n"); } -static void reduxGenSrcAppendMacroDefs (GpuReduction* gr){ - int i; +static void reduxGenSrcAppendMacroTypedefs(GpuReduction* gr){ + /** + * Typedefs of various types. + */ + + if (reduxGenRequiresS0(gr)){ + srcbAppendf(&gr->srcGen, "typedef %-20s TS0;\n", gpuarray_get_type(gr->TS0tc )->cluda_name); + srcbAppendf(&gr->srcGen, "typedef %-20s TPS0;\n", gpuarray_get_type(gr->TPS0tc)->cluda_name); + } + if (reduxGenRequiresD0(gr)){ + srcbAppendf(&gr->srcGen, "typedef %-20s TD0;\n", gpuarray_get_type(gr->TD0tc )->cluda_name); + } + if (reduxGenRequiresD1(gr)){ + srcbAppendf(&gr->srcGen, "typedef %-20s TD1;\n", gpuarray_get_type(gr->TD1tc )->cluda_name); + } + srcbAppendf(&gr->srcGen, "typedef %-20s TS32;\n", gpuarray_get_type(gr->TS32tc)->cluda_name); + srcbAppendf(&gr->srcGen, "typedef %-20s TU32;\n", gpuarray_get_type(gr->TU32tc)->cluda_name); + srcbAppendf(&gr->srcGen, "typedef %-20s TS64;\n", gpuarray_get_type(gr->TS64tc)->cluda_name); + srcbAppendf(&gr->srcGen, "typedef %-20s TU64;\n", gpuarray_get_type(gr->TU64tc)->cluda_name); + if (reduxGenKernelRequiresStateK0(gr)){ + srcbAppendf(&gr->srcGen, "typedef %-20s TK0;\n", gr->TK0.defn); + } + if (reduxGenKernelRequiresStateK1(gr)){ + srcbAppendf(&gr->srcGen, "typedef %-20s TK1;\n", gr->TK1.defn); + } + srcbAppendf(&gr->srcGen, "\n\n\n\n"); + /** * DECLREDUXSTATE, INITREDUXSTATE and SETREDUXSTATE macros. 
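 * (Editorial illustration: for a GA_FLOAT sum reductor with ndd = 2 and
 *  ndr = 1, the typedefs above plus the macros below would emit roughly the
 *  following prelude into the kernel named reduxKernelSum_f2_r1; exact CLUDA
 *  type spellings may differ.)
 *
 *     typedef float TS0;   typedef float TPS0;  typedef float TD0;
 *     typedef int   TS32;  typedef uint  TU32;
 *     typedef long  TS64;  typedef ulong TU64;
 *     typedef float TK0;
 *     #define DECLREDUXSTATE(V, I)      TK0 V;
 *     #define INITREDUXSTATE(V, I)      do{(V) = (0);}while(0)
 *     #define SETREDUXSTATE(V, I, v, i) do{(V) = (v);}while(0)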
*/ - if ( reduxGenKernelRequiresDst(gr) && reduxGenKernelRequiresDstArg(gr)){ + if ( reduxGenKernelRequiresStateK0(gr) && reduxGenKernelRequiresStateK1(gr)){ srcbAppendf(&gr->srcGen, - "#define DECLREDUXSTATE(V, I) TK V;TX I;\n" - "#define INITREDUXSTATE(V, I) do{(V) = %s;(I) = 0;}while(0)\n" + "#define DECLREDUXSTATE(V, I) TK0 V;TK1 I;\n" + "#define INITREDUXSTATE(V, I) do{(V) = (%s);(I) = (%s);}while(0)\n" "#define SETREDUXSTATE(V, I, v, i) do{(V) = (v);(I) = (i);}while(0)\n", - gr->initVal); - }else if ( reduxGenKernelRequiresDst(gr) && !reduxGenKernelRequiresDstArg(gr)){ + gr->TK0.init, gr->TK1.init); + }else if ( reduxGenKernelRequiresStateK0(gr) && !reduxGenKernelRequiresStateK1(gr)){ srcbAppendf(&gr->srcGen, - "#define DECLREDUXSTATE(V, I) TK V;\n" - "#define INITREDUXSTATE(V, I) do{(V) = %s;}while(0)\n" + "#define DECLREDUXSTATE(V, I) TK0 V;\n" + "#define INITREDUXSTATE(V, I) do{(V) = (%s);}while(0)\n" "#define SETREDUXSTATE(V, I, v, i) do{(V) = (v);}while(0)\n", - gr->initVal); - }else if (!reduxGenKernelRequiresDst(gr) && reduxGenKernelRequiresDstArg(gr)){ + gr->TK0.init); + }else if (!reduxGenKernelRequiresStateK0(gr) && reduxGenKernelRequiresStateK1(gr)){ srcbAppendf(&gr->srcGen, - "#define DECLREDUXSTATE(V, I) TX I;\n" - "#define INITREDUXSTATE(V, I) do{(I) = 0;}while(0)\n" - "#define SETREDUXSTATE(V, I, v, i) do{(I) = (i);}while(0)\n"); + "#define DECLREDUXSTATE(V, I) TK1 I;\n" + "#define INITREDUXSTATE(V, I) do{(I) = (%s);}while(0)\n" + "#define SETREDUXSTATE(V, I, v, i) do{(I) = (i);}while(0)\n", + gr->TK1.init); } /** - * LOADS(v, p) macro. + * LOADS0(v, p) macro. + * + * Loads a TK0-typed value v from a TS-typed source pointer p, promoting + * through type TPS0. * - * Loads a TK-typed value v from a TS-typed source pointer p. + * In some future, TK0 will not equal TPS0, and so a cast as done below will not + * necessarily be valid. Instead it may require an assignment to a struct member. */ - if (gr->srcTypeCode == GA_HALF && gr->accTypeCode == GA_FLOAT){ - srcbAppends(&gr->srcGen, "#define LOADS(v, p) do{(v) = (TK)load_half((const TS* restrict)(p));}while(0)\n"); + if (reduxGenKernelRequiresLatticeS0(gr)){ + if (gr->TS0tc == GA_HALF && gr->TPS0tc == GA_FLOAT){ + srcbAppends(&gr->srcGen, "#define LOADS0(v, p) do{(v) = (TK0)(TPS0)load_half((const TS0* restrict)(p));}while(0)\n"); + }else{ + srcbAppends(&gr->srcGen, "#define LOADS0(v, p) do{(v) = (TK0)(TPS0)*(const TS0* restrict)(p);}while(0)\n"); + } }else{ - srcbAppends(&gr->srcGen, "#define LOADS(v, p) do{(v) = (TK)*(const TS* restrict)(p);}while(0)\n"); + srcbAppends(&gr->srcGen, "#define LOADS0(p, v) do{}while(0)\n"); } - /** - * GETIDX macro. - * - * Expands to the current flattened index. - */ - - srcbAppends (&gr->srcGen, "#define GETIDX ("); - srcbBeginList (&gr->srcGen, " + ", "0"); - srcbAppendElemf(&gr->srcGen, "ti"); - for(i=gr->ndd;inds;i++){ - srcbAppendElemf(&gr->srcGen, "i%d*l%dPDim", i, i); - } - srcbEndList (&gr->srcGen); - srcbAppends (&gr->srcGen, ")\n"); - /** * REDUX macro. * @@ -1654,54 +1921,79 @@ static void reduxGenSrcAppendMacroDefs (GpuReduction* gr){ * flattened index i into reduction states V and I respectively. 
*/ - srcbAppends(&gr->srcGen, "#define REDUX(V, I, v, i) do{ \\\n"); switch (gr->op){ case GA_REDUCE_SUM: - srcbAppendf(&gr->srcGen, " (V) += (v); \\\n"); + srcbAppendf(&gr->srcGen, "#define REDUX(V, I, v, i) do{ \\\n" + " (V) += (v); \\\n" + " }while(0)\n"); break; case GA_REDUCE_PROD: - srcbAppendf(&gr->srcGen, " (V) *= (v); \\\n"); + srcbAppendf(&gr->srcGen, "#define REDUX(V, I, v, i) do{ \\\n" + " (V) *= (v); \\\n" + " }while(0)\n"); break; case GA_REDUCE_PRODNZ: - srcbAppendf(&gr->srcGen, " (V) *= ((v) == 0 ? (%s) : (v)); \\\n", gr->initVal); + srcbAppendf(&gr->srcGen, "#define REDUX(V, I, v, i) do{ \\\n" + " if((v) != 0){(V) *= (v);} \\\n" + " }while(0)\n"); break; case GA_REDUCE_MIN: - srcbAppendf(&gr->srcGen, " (V) = min((V), (v)); \\\n"); + srcbAppendf(&gr->srcGen, "#define REDUX(V, I, v, i) do{ \\\n" + " (V) = min((V), (v)); \\\n" + " }while(0)\n"); break; case GA_REDUCE_MAX: - srcbAppendf(&gr->srcGen, " (V) = max((V), (v)); \\\n"); + srcbAppendf(&gr->srcGen, "#define REDUX(V, I, v, i) do{ \\\n" + " (V) = max((V), (v)); \\\n" + " }while(0)\n"); break; case GA_REDUCE_ARGMIN: case GA_REDUCE_MINANDARGMIN: - srcbAppendf(&gr->srcGen, " (V) = min((V), (v)); \\\n" + srcbAppendf(&gr->srcGen, "#define REDUX(V, I, v, i) do{ \\\n" + " (V) = min((V), (v)); \\\n" " if((V) == (v)){ \\\n" " (I) = (i); \\\n" - " } \\\n"); + " } \\\n" + " }while(0)\n"); break; case GA_REDUCE_ARGMAX: case GA_REDUCE_MAXANDARGMAX: - srcbAppendf(&gr->srcGen, " (V) = max((V), (v)); \\\n" + srcbAppendf(&gr->srcGen, "#define REDUX(V, I, v, i) do{ \\\n" + " (V) = max((V), (v)); \\\n" " if((V) == (v)){ \\\n" " (I) = (i); \\\n" - " } \\\n"); + " } \\\n" + " }while(0)\n"); break; case GA_REDUCE_AND: - srcbAppendf(&gr->srcGen, " (V) &= (v); \\\n"); + srcbAppendf(&gr->srcGen, "#define REDUX(V, I, v, i) do{ \\\n" + " (V) &= (v); \\\n" + " }while(0)\n"); break; case GA_REDUCE_OR: - srcbAppendf(&gr->srcGen, " (V) |= (v); \\\n"); + srcbAppendf(&gr->srcGen, "#define REDUX(V, I, v, i) do{ \\\n" + " (V) |= (v); \\\n" + " }while(0)\n"); break; case GA_REDUCE_XOR: - srcbAppendf(&gr->srcGen, " (V) ^= (v); \\\n"); + srcbAppendf(&gr->srcGen, "#define REDUX(V, I, v, i) do{ \\\n" + " (V) ^= (v); \\\n" + " }while(0)\n"); break; case GA_REDUCE_ALL: - srcbAppendf(&gr->srcGen, " (V) = (V) && (v); \\\n"); + srcbAppendf(&gr->srcGen, "#define REDUX(V, I, v, i) do{ \\\n" + " (V) = (V) && (v); \\\n" + " }while(0)\n"); break; case GA_REDUCE_ANY: - srcbAppendf(&gr->srcGen, " (V) = (V) || (v); \\\n"); + srcbAppendf(&gr->srcGen, "#define REDUX(V, I, v, i) do{ \\\n" + " (V) = (V) || (v); \\\n" + " }while(0)\n"); + break; + default: + /* Unreachable */ break; } - srcbAppends(&gr->srcGen, " }while(0)\n"); /** @@ -1709,55 +2001,112 @@ static void reduxGenSrcAppendMacroDefs (GpuReduction* gr){ * * Performs a horizontal reduction operation, first intra-block permuting * the data and its index and then reducing it till done. - */ - + * + * - If D==LDIM_0, then no horizontal (across-block) reductions are + * really needed. In this case, the permutation tp: + * - Is fully in-bounds (tp < LDIM_0 for all threads) + * - Exists firstly to make it easy to mask writes (hard). + * - Exists secondly to optimize memory write bandwidth (soft). + * and the value H should be equal to D and to LDIM_0 + * - If D= LDIM_0 for some threads) + * - Exists firstly to make it easy to mask writes (hard). + * - Exists secondly to enable a tree reduction (hard). + * - Exists thirdly to optimize memory write bandwidth (soft). 
+ * and the value H must be a power of 2 and shall be set to nextPow2(bs). + * + * E.g. Suppose that a block configuration was D=999, H=1 (bs=999). A + * permutation we might want is + * [0,...,332,333,...,665,666,...,998] + * and we want H = 999. + * E.g. Suppose that a block configuration was D=257, H=3 (bs=771). A + * permutation we might want is + * [0,...,256,512,...,768,1024,...,1280] + * and we want H = 1024. + * E.g. Suppose that a block configuration was D=33, H=17 (bs=561). A + * permutation we might want is + * [0,...,32,64,...,96,128,...,160,...,960,...,992,1024,...,1056] + * and we want H = 1024. + * E.g. Suppose that a block configuration was D=16, H=16 (bs=256). A + * permutation we might want is + * [0,...255] + * and we want H = 256. + * + */ + srcbAppends(&gr->srcGen, - "#define HREDUX(pd, pa, tp, V, I) \\\n" - " do{ \\\n" - " /* Horizontal Reduction */ \\\n" - " SETREDUXSTATE(pd[tp], pa[tp], accV, accI); \\\n" - " local_barrier(); \\\n" - " \\\n" - " h = H; \\\n" - " while(h>1){ \\\n" - " if((h&1) && (LID_0 < D)){ \\\n" - " REDUX(pd[LID_0], pa[LID_0], pd[LID_0 + D*h-D], pa[LID_0 + D*h-D]); \\\n" - " } \\\n" - " h >>= 1; \\\n" - " if(LID_0 < D*h){ \\\n" - " REDUX(pd[LID_0], pa[LID_0], pd[LID_0 + D*h ], pa[LID_0 + D*h ]); \\\n" - " } \\\n" - " local_barrier(); \\\n" - " } \\\n" + "#define HREDUX(SHMEMK0, SHMEMK1, perm, k0, k1) \\\n" + " do{ \\\n" + " if(D < LDIM_0){ \\\n" + " /* SPECIAL FIRST REDUCTION: */ \\\n" + " h = H; \\\n" + " \\\n" + " /* LO Half */ \\\n" + " if(perm < h){ \\\n" + " SETREDUXSTATE(SHMEMK0[perm], \\\n" + " SHMEMK1[perm], \\\n" + " k0, \\\n" + " k1); \\\n" + " } \\\n" + " local_barrier(); \\\n" + " \\\n" + " /* HI Half */ \\\n" + " if(perm >= h){ \\\n" + " REDUX (SHMEMK0[perm-h], \\\n" + " SHMEMK1[perm-h], \\\n" + " k0, \\\n" + " k1); \\\n" + " } \\\n" + " local_barrier(); \\\n" + " \\\n" + " /* Follow-up reductions */ \\\n" + " while((h >>= 1) >= D){ \\\n" + " if(LID_0 < h){ \\\n" + " REDUX(SHMEMK0[LID_0], \\\n" + " SHMEMK1[LID_0], \\\n" + " SHMEMK0[LID_0+h], \\\n" + " SHMEMK1[LID_0+h]); \\\n" + " } \\\n" + " local_barrier(); \\\n" + " } \\\n" + " }else{ \\\n" + " /* All-permute */ \\\n" + " SETREDUXSTATE(SHMEMK0[perm], \\\n" + " SHMEMK1[perm], \\\n" + " k0, \\\n" + " k1); \\\n" + " local_barrier(); \\\n" + " } \\\n" " }while(0)\n"); /** - * STORED macro. + * STORED0 macro. * - * Stores a TK-typed value v into a TS-typed destination pointer p. + * Stores a TK0-typed value v into a TD0-typed destination pointer p. */ - if (reduxGenRequiresDst(gr)){ - if (gr->dstTypeCode == GA_HALF && gr->accTypeCode == GA_FLOAT){ - srcbAppends(&gr->srcGen, "#define STORED(p, v) do{store_half((TD* restrict)(p), (v));}while(0)\n"); + if (reduxGenKernelRequiresLatticeD0(gr)){ + if (gr->TD0tc == GA_HALF && gr->TPS0tc == GA_FLOAT){ + srcbAppends(&gr->srcGen, "#define STORED0(p, v) do{store_half((TD0* restrict)(p), (v));}while(0)\n"); }else{ - srcbAppends(&gr->srcGen, "#define STORED(p, v) do{*(TD* restrict)(p) = (v);}while(0)\n"); + srcbAppends(&gr->srcGen, "#define STORED0(p, v) do{*(TD0* restrict)(p) = (v);}while(0)\n"); } }else{ - srcbAppends(&gr->srcGen, "#define STORED(p, v) do{}while(0)\n"); + srcbAppends(&gr->srcGen, "#define STORED0(p, v) do{}while(0)\n"); } /** - * STOREA macro. + * STORED1 macro. * - * Stores a TX-typed value v into a TA-typed destination pointer p. + * Stores a TK1-typed value v into a TD1-typed destination pointer p. 
*/ - if (reduxGenRequiresDstArg(gr)){ - srcbAppends(&gr->srcGen, "#define STOREA(p, v) do{*(TA* restrict)(p) = (v);}while(0)\n"); + if (reduxGenKernelRequiresLatticeD1(gr)){ + srcbAppends(&gr->srcGen, "#define STORED1(p, v) do{*(TD1* restrict)(p) = (v);}while(0)\n"); }else{ - srcbAppends(&gr->srcGen, "#define STOREA(p, v) do{}while(0)\n"); + srcbAppends(&gr->srcGen, "#define STORED1(p, v) do{}while(0)\n"); } @@ -1765,682 +2114,694 @@ static void reduxGenSrcAppendMacroDefs (GpuReduction* gr){ * DIVIDECEIL macro. */ - srcbAppends(&gr->srcGen, "#define DIVIDECEIL(a,b) (((a)+(b)-1)/(b))\n"); - - srcbAppends(&gr->srcGen, "\n\n\n\n"); -} -static void reduxGenSrcAppendTypedefs (GpuReduction* gr){ - srcbAppendf(&gr->srcGen, "typedef %-20s TS;\n", gr->srcTypeStr); - srcbAppendf(&gr->srcGen, "typedef %-20s TD;\n", gr->dstTypeStr); - srcbAppendf(&gr->srcGen, "typedef %-20s TA;\n", gr->dstArgTypeStr); - srcbAppendf(&gr->srcGen, "typedef %-20s TX;\n", gr->idxTypeStr); - srcbAppendf(&gr->srcGen, "typedef %-20s TK;\n", gr->accTypeStr); - srcbAppendf(&gr->srcGen, "\n\n\n\n"); + srcbAppends(&gr->srcGen, "#define DIVIDECEIL(a,b) (((a)+(b)-1)/(b))\n\n\n\n\n"); } static void reduxGenSrcAppendReduxKernel (GpuReduction* gr){ reduxGenSrcAppendPrototype (gr); srcbAppends (&gr->srcGen, "{\n"); - reduxGenSrcAppendBlockDecode (gr); - reduxGenSrcAppendThreadDecode(gr); - srcbAppends (&gr->srcGen, " /**\n" - " * PERFORM REDUCTION.\n" - " * \n" - " * We either perform Phase 0 or Phase 1 according to our argument.\n" - " * \n" - " * Phase 0 is the primary worker and, in special cases, is the only necessary phase.\n" - " * However, it may occasionally do only part of a reduction, in which case it leaves\n" - " * the partial reduction results in a workspace that is then read by Phase 1.\n" - " * \n" - " * Phase 1 is a fixup phase that collects any partial reduction results from Phase 0\n" - " * and completes the reduction before writing to the final destination.\n" - " */\n" - " \n" - " if(phase==0){\n"); - reduxGenSrcAppendPhase0 (gr); - srcbAppends (&gr->srcGen, " }else{\n"); + reduxGenSrcAppendDecode (gr); + + /** + * PERFORM REDUCTION. + * + * We either perform Phase 0 or Phase 1 according to the selector argument. + * + * Phase 0 is the primary worker and, in special cases, is the only + * necessary phase. However, it may occasionally do only part of a + * reduction, in which case it leaves the partial reduction results in a + * workspace that is then read by Phase 1. + * + * Phase 1 is a fixup phase that collects any partial reduction results + * from Phase 0 and completes the reduction before writing to the final + * destination. + * + * The template selector indicates one of several specialized versions of + * the kernel to be executed. It indicates phase, which is the split axis, + * and which axis if any is "huge". 
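 * (Editorial note: judging from the dispatch just below and from the decode
 *  code, bit 0 of the selector selects Phase 1, bit 1 selects whether the
 *  innermost free axis or the innermost reduction axis carries the
 *  intra-block split (see the DIVIDECEIL(L..., LSlice) terms in the decode),
 *  and the remaining even values 4..14 select the further specializations
 *  mentioned above, such as the "huge axis" variants.)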
+ */ + + srcbAppends (&gr->srcGen, " if(selector&1){\n"); reduxGenSrcAppendPhase1 (gr); + srcbAppends (&gr->srcGen, " }else if(selector == 0){\n"); + reduxGenSrcAppendPhase0 (gr, 0); + srcbAppends (&gr->srcGen, " }else if(selector == 2){\n"); + reduxGenSrcAppendPhase0 (gr, 2); + srcbAppends (&gr->srcGen, " }else if(selector == 4){\n"); + reduxGenSrcAppendPhase0 (gr, 4); + srcbAppends (&gr->srcGen, " }else if(selector == 6){\n"); + reduxGenSrcAppendPhase0 (gr, 6); + srcbAppends (&gr->srcGen, " }else if(selector == 8){\n"); + reduxGenSrcAppendPhase0 (gr, 8); + srcbAppends (&gr->srcGen, " }else if(selector == 10){\n"); + reduxGenSrcAppendPhase0 (gr, 10); + srcbAppends (&gr->srcGen, " }else if(selector == 12){\n"); + reduxGenSrcAppendPhase0 (gr, 12); + srcbAppends (&gr->srcGen, " }else if(selector == 14){\n"); + reduxGenSrcAppendPhase0 (gr, 14); srcbAppends (&gr->srcGen, " }\n"); srcbAppends (&gr->srcGen, "}\n"); } static void reduxGenSrcAppendPrototype (GpuReduction* gr){ int i=0; - - srcbAppends (&gr->srcGen, "KERNEL void redux("); + + srcbAppendf(&gr->srcGen, + "KERNEL void\n" + "#if defined(__CUDACC__)\n" + "__launch_bounds__(%d, 8)\n" + "#endif\n", + gr->maxBS); + srcbAppendf(&gr->srcGen, + "%s(\n ", + gr->kName); reduxGenIterArgs(gr, reduxGenAppendArg, &i); - srcbAppends (&gr->srcGen, ")"); + srcbAppends(&gr->srcGen, ")"); } -static void reduxGenSrcAppendBlockDecode (GpuReduction* gr){ +static void reduxGenSrcAppendDecode (GpuReduction* gr){ int i; - + srcbAppends(&gr->srcGen, " GA_DECL_SHARED_BODY(char, SHMEM)\n" - " DECLREDUXSTATE(accV, accI)\n" - " DECLREDUXSTATE(tmpV, tmpI)\n" - " INITREDUXSTATE(accV, accI);\n" - " \n" - " /**\n" - " * +-------------+-------------+------------+---------------------------------+\n" - " * | misalignL | misalignR | doFinish | DESCRIPTION |\n" - " * +-------------+-------------+------------+---------------------------------+\n" - " * | 0 | 0 | 0 | Impossible unless v == 0, |\n" - " * | | | | which is forbidden. |\n" - " * | | | | |\n" - " * | 0 | 0 | 1 | V % B == 0. Each block |\n" - " * | | | | handles integer number of |\n" - " * | | | | destination elements, no |\n" - " * | | | | partial results are required, |\n" - " * | | | | workspace is unused. |\n" - " * | | | | |\n" - " * | 0 | 1 | 0 | V < B. Block begins aligned |\n" - " * | | | | but ends misaligned, before |\n" - " * | | | | the end of its first element. |\n" - " * | | | | Partial result written to |\n" - " * | | | | right-half of array. |\n" - " * | | | | |\n" - " * | 0 | 1 | 1 | V > B, V % B != 0. Block |\n" - " * | | | | begins aligned but ends |\n" - " * | | | | misaligned, after the end of |\n" - " * | | | | its first element. |\n" - " * | | | | First 1 or more complete |\n" - " * | | | | elements written out directly |\n" - " * | | | | to destination. |\n" - " * | | | | Partial result of last element |\n" - " * | | | | written to right-half of array.|\n" - " * | | | | |\n" - " * | 1 | 0 | 0 | Impossible unless v == 0, |\n" - " * | | | | which is forbidden. |\n" - " * | | | | |\n" - " * | 1 | 0 | 1 | V % B != 0. Partial result of |\n" - " * | | | | first element written to left- |\n" - " * | | | | half of array. Zero or more |\n" - " * | | | | complete reductions performed |\n" - " * | | | | and written directly to |\n" - " * | | | | destination. Block ends |\n" - " * | | | | aligned. |\n" - " * | | | | |\n" - " * | 1 | 1 | 0 | V < B. Block begins misaligned |\n" - " * | | | | and ends misaligned, before |\n" - " * | | | | the end of its first element. 
|\n" - " * | | | | Partial result written to at |\n" - " * | | | | least right-half of array. |\n" - " * | | | | |\n" - " * | 1 | 1 | 1 | V % B != 0. Block begins |\n" - " * | | | | misaligned and ends misaligned,|\n" - " * | | | | after the end of its first |\n" - " * | | | | element. |\n" - " * | | | | Partial result of first element|\n" - " * | | | | written to left-half of array. |\n" - " * | | | | Partial result of last element |\n" - " * | | | | written to right-half of array.|\n" - " * | | | | 0 or more complete elements |\n" - " * | | | | written out directly to |\n" - " * | | | | destination. |\n" - " * +-------------+-------------+------------+---------------------------------+\n" - " * \n" - " * Possible configurations of blocks:\n" - " * If V % B == 0: 001\n" - " * If V < B: 010, 110, 111, 101\n" - " * If V > B: 011, 111, 101\n" - " * \n" - " * Possible configurations for collector blocks (responsible for gathering of\n" - " * results to the left):\n" - " * 101, 111 (misalignL && doFinish)\n" - " * \n" - " * Possible configurations for left-neighbours of collector blocks\n" - " * 110 (any number 0+), then exactly one of:\n" - " * 010, 011, 111\n" - " * \n" - " * Conclusion:\n" - " * - In Phase 0:\n" - " * - Always make a right-write if misalignR (010, 011, 110, 111).\n" - " * - Make a left -write at least if collector block (101, 111).\n" - " * - In Phase 1:\n" - " * - Exit if not collector block (101, 111)\n" - " * - If collector block,\n" - " * - Left -read from self\n" - " * - Right-read from all left-neighbours with same write-target.\n" - " * \n" - " * Code Structure perfectly satisfying conclusion:\n" - " * \n" - " * if(misalignL){\n" - " * while(v > 0){\n" - " * v--;\n" - " * REDUX();\n" - " * ReduxLoopIncs_CONTINUE;\n" - " * HREDUX();\n" - " * WSLeftWrite();\n" - " * REINIT();\n" - " * FreeLoopIncs_BREAK;\n" - " * BREAK;\n" - " * }\n" - " * }\n" - " * while(v > 0){\n" - " * v--;\n" - " * REDUX();\n" - " * ReduxLoopIncs_CONTINUE;\n" - " * HREDUX();\n" - " * DstWrite();\n" - " * REINIT();\n" - " * FreeLoopIncs_CONTINUE;\n" - " * BREAK;\n" - " * }\n" - " * if(misalignR){\n" - " * HREDUX();\n" - " * WSRightWrite();\n" - " * }\n" - " * \n" - " * Code Walkthrough:\n" - " * \n" - " * 000, 100: Impossible, can be ignored.\n" - " * 001: Only master loop entered, handles exact integer number of destinations.\n" - " * 010: Master loop entered but broken on vcount before HREDUX() reached.\n" - " * No reinit executed on breakout. HREDUX(), followed by WSRightWrite() of\n" - " * partial result.\n" - " * 011: Master loop entered for at least 1 full destination, then broken on\n" - " * vcount before HREDUX() reached. No reinit executed on breakout. HREDUX()\n" - " * followed by WSRightWrite() of partial result.\n" - " * 101: Left-misalign loop entered and completes a reduction. HREDUX()\n" - " * performed, WSLeftWrite() performed, reinitialization, bump of outer\n" - " * loop counters, then breakout. Master loop entered for 0 or more complete\n" - " * destination elements involving full writeouts to destination and reinit.\n" - " * Aligned on both misalignL and master loop breakouts. No entry into\n" - " * misalignR fixup.\n" - " * 110: Left-misalign loop entered, breaks on vcount before HREDUX(). No reinit\n" - " * executed on breakout. Master loop not entered. HREDUX(), followed by\n" - " * WSRightWrite() of partial result.\n" - " * 111: Left-misalign loop entered and completes a reduction. 
HREDUX() performed,\n" - " * WSLeftWrite() performed, reinit, bump of outer loop counters, breakout.\n" - " * Master loop entered for 0 or more complete destination elements\n" - " * involving full writeout to destination and reinit.\n" - " * Master loop broken on vcount before HREDUX(). misalignR fixup entered,\n" - " * HREDUX(), WSRightWrite().\n" - " */\n" + " DECLREDUXSTATE(tmpK0, I0)\n" + " DECLREDUXSTATE(K0, K1)\n" + " INITREDUXSTATE(K0, K1);\n" " \n" - " TX start = GID_0 * V;\n" - " if(start >= U){return;}\n" - " TX v = U-start < V ? U-start : V;\n" + " TU64 z, h, k;\n" " \n" - " int misalignL = (start+0)%B != 0;\n" - " int misalignR = (start+v)%B != 0;\n" - " int doFinish = (start+0)/B != (start+v)/B;\n" + /** + * +-------------+-------------+------------+---------------------------------+ + * | misalignL | misalignR | doFinish | DESCRIPTION | + * +-------------+-------------+------------+---------------------------------+ + * | 0 | 0 | 0 | Impossible unless v == 0, | + * | | | | which is forbidden. | + * | | | | | + * | 0 | 0 | 1 | V % B == 0. Each block | + * | | | | handles integer number of | + * | | | | destination elements, no | + * | | | | partial results are required, | + * | | | | workspace is unused. | + * | | | | | + * | 0 | 1 | 0 | V < B. Block begins aligned | + * | | | | but ends misaligned, before | + * | | | | the end of its first element. | + * | | | | Partial result written to | + * | | | | right-half of array. | + * | | | | | + * | 0 | 1 | 1 | V > B, V % B != 0. Block | + * | | | | begins aligned but ends | + * | | | | misaligned, after the end of | + * | | | | its first element. | + * | | | | First 1 or more complete | + * | | | | elements written out directly | + * | | | | to destination. | + * | | | | Partial result of last element | + * | | | | written to right-half of array.| + * | | | | | + * | 1 | 0 | 0 | Impossible unless v == 0, | + * | | | | which is forbidden. | + * | | | | | + * | 1 | 0 | 1 | V % B != 0. Partial result of | + * | | | | first element written to left- | + * | | | | half of array. Zero or more | + * | | | | complete reductions performed | + * | | | | and written directly to | + * | | | | destination. Block ends | + * | | | | aligned. | + * | | | | | + * | 1 | 1 | 0 | V < B. Block begins misaligned | + * | | | | and ends misaligned, before | + * | | | | the end of its first element. | + * | | | | Partial result written to at | + * | | | | least right-half of array. | + * | | | | | + * | 1 | 1 | 1 | V % B != 0. Block begins | + * | | | | misaligned and ends misaligned,| + * | | | | after the end of its first | + * | | | | element. | + * | | | | Partial result of first element| + * | | | | written to left-half of array. | + * | | | | Partial result of last element | + * | | | | written to right-half of array.| + * | | | | 0 or more complete elements | + * | | | | written out directly to | + * | | | | destination. | + * +-------------+-------------+------------+---------------------------------+ + * + * Possible configurations of blocks: + * If V % B == 0: 001 + * If V < B: 010, 110, 111, 101 + * If V > B: 011, 111, 101 + * + * Possible configurations for collector blocks (responsible for gathering of + * results to their right): + * 010, 011, 111 (misalignR && (!misalignL || doFinish)) + * + * Possible configurations for right-neighbours of collector blocks + * 110 (any number 0+), then exactly one of: + * 101, 111 + * + * Conclusion: + * - In Phase 0: + * - Always make a right-write if collector block (010, 011, 111). 
+ * - Always make a left -write if misalignL (101, 110, 111). + * - In Phase 1: + * - Exit if not collector block (010, 011, 111) + * - If collector block, + * - Right-read from self + * - Left -read from all right-neighbours with same write-target. + * + * Code Structure perfectly satisfying conclusion: + * + * if(misalignR){ + * while(v > 0){ + * v--; + * REDUX(); + * ReduxLoopDecs_CONTINUE; + * HREDUX(); + * WSRightWrite(); + * REINIT(); + * FreeLoopDecs_BREAK; + * BREAK; + * } + * } + * while(v > 0){ + * v--; + * REDUX(); + * ReduxLoopDecs_CONTINUE; + * HREDUX(); + * DstWrite(); + * REINIT(); + * FreeLoopDecs_CONTINUE; + * BREAK; + * } + * if(misalignL){ + * HREDUX(); + * WSLeftWrite(); + * } + * + * Code Walkthrough: + * + * 000, 100: -- Impossible, can be ignored. + * 001: -- Only master loop entered, handles exact integer number of destinations. + * 010: -R Right-misalign loop entered, completes a reduction. HREDUX, partial + * result right-written to workspace, reinit, bump of free loop counters, + * break simultaneously on vcount and free loop breaks. + * Master loop not entered. Left-misalign fixup not entered. + * 011: -R Right-misalign loop entered, completes a reduction. HREDUX, partial + * result right-written to workspace, reinit, bump of free loop counters, + * break on free loop breaks. Master loop entered for 1+ complete + * destination elements written direct to destination. Break on vcount. + * Left-misalign fixup not entered. + * 101: L- Master loop entered for 0+ complete destination elements written + * directly to destination. Master loop broken on vcount. Left-misalign + * fixup entered, HREDUX, partial result left-written to workspace. + * 110: L- Right-misalign loop entered, broken on vcount before HREDUX. No + * reinit. Master loop not entered. Left-misalign fixup entered, HREDUX, + * partial result left-written to workspace. + * 111: LR Right-misalign loop entered and completes a reduction. HREDUX, partial + * result right-written to workspace, reinit, bump of free loop counters, + * breakout. Master loop entered for 0 or more complete destination + * elements written directly to destination. Master loop broken on vcount + * before HREDUX. Right-misalign fixup entered, HREDUX, partial result + * left-written to workspace. + */ " \n" - " /**\n" - " * Decode BLOCK start point.\n" - " * \n" - " * For the purpose of decoding the start point, the split axis's \"length\"\n" - " * is divided by either splitReduce or splitFree and rounded up. Therefore,\n" - " * for those axes the true computed initial starting point must be\n" - " * multiplied by either splitReduce or splitFree.\n" - " * \n" - " * Since we provide not strides but \"jumps\" to the kernel (to move as many\n" - " * things as possible into constant memory and out of the fast path), we\n" - " * must also convert jumps to strides in preparation for offsetting the\n" - " * base pointers to their starting point.\n" - " */\n" + " TU64 left = GID_0 * V;\n" + " if(left >= U){return;}\n" + " TU64 v = U-left < V ? 
U-left : V;\n" " \n" - " TX z, h, k;\n" - " unsigned Dunit = D/splitFree;\n"); - if(gr->ndd > 0){ - srcbAppendf(&gr->srcGen, - " TX l%dDiv = DIVIDECEIL(l%d, splitFree);\n", - gr->ndd-1, gr->ndd-1); - } - if(gr->ndr > 0){ - srcbAppendf(&gr->srcGen, - " TX l%dDiv = DIVIDECEIL(l%d, splitReduce);\n", - gr->nds-1, gr->nds-1); - } - srcbAppends(&gr->srcGen, + " TS32 misalignL = (left+0)%B != 0;\n" + " TS32 misalignR = (left+v)%B != 0;\n" + " TS32 doFinish = (left+0)/B != (left+v)/B;\n" + " TS32 collector = misalignR && (!misalignL || doFinish);\n" " \n" - " z = start;\n"); - for(i=gr->nds-1;i>=0;i--){ - if(i == gr->nds-1 || i == gr->ndd-1){ + " TU32 iSplit = LID_0/(LDIM_0/LSlice);\n" + " \n"); + /** + * Decode Intra-/Inter-Block start point. + * + * For the purpose of decoding the start point, the split axis's \"length\" + * is divided by either splitReduce or splitFree and rounded up. Therefore, + * for those axes the true computed initial starting point must be + * multiplied by either splitReduce or splitFree. + * + * Since we provide not strides but \"jumps\" to the kernel (to move as many + * things as possible into constant memory and out of the fast path), we + * must also convert jumps to strides in preparation for offsetting the + * base pointers to their starting point. + * + * This also involves computing the intra-block coordinate of a thread in a + * up-to-log2(MAX_BLOCK_THREADS)-rank coordinate system, then using + * those coordinates to compute intrablock S0/D0/D1/I0/permute targets. + */ + + for (i=gr->nds-1;i>=0;i--){ + if (i == gr->nds-1 && i == gr->ndd-1){ srcbAppendf(&gr->srcGen, - " TX i%d = z %% l%dDiv;z /= l%dDiv;\n", - i, i, i); - }else{ + " TU64 _L%d = DIVIDECEIL(L%d, LSlice);\n", i, i); + }else if (i == gr->nds-1){ srcbAppendf(&gr->srcGen, - " TX i%d = z %% l%d; z /= l%d;\n", - i, i, i); - } - } - srcbAppends(&gr->srcGen, " \n"); - for(i=gr->nds-1;i>=0;i--){ - if(i == gr->nds-1){ + " TU64 _L%d = DIVIDECEIL(L%d, (selector&2) ? 1 : LSlice);\n", i, i); + }else if (i == gr->ndd-1){ srcbAppendf(&gr->srcGen, - " TX sS%d = sJ%d;\n", - i, i); + " TU64 _L%d = DIVIDECEIL(L%d, (selector&2) ? LSlice : 1);\n", i, i); }else{ srcbAppendf(&gr->srcGen, - " TX sS%d = sJ%d + l%d%s*sS%d;\n", - i, i, i+1, - reduxGenAxisMaybeSplit(gr, i+1) ? "Div" : " ", i+1); + " TU64 _L%d = L%d;\n", i, i); } } - if (reduxGenRequiresDst(gr)){ + srcbAppends(&gr->srcGen, + " \n" + " z = left+v-1;\n"); + for (i=gr->nds-1;i>=0;i--){ + srcbAppendf(&gr->srcGen, + " TS64 _i%d = z %% _L%d; z /= _L%d;\n", i, i, i); + } + srcbAppends(&gr->srcGen, + " z = LID_0;\n"); + for (i=gr->log2MaxBS-1;i>=0;i--){ + srcbAppendf(&gr->srcGen, + " TS32 _i%di = z %% L%di; z /= L%di;\n", i, i, i); + } + + + /* Compute Intrablock Permute Core, since it will be used soon */ + srcbAppends(&gr->srcGen, " \n"); + srcbAppends(&gr->srcGen, " const TU32 perm = "); + srcbBeginList(&gr->srcGen, " + ", "0"); + for (i=0;ilog2MaxBS;i++){ + srcbAppendElemf(&gr->srcGen, "_i%di*perm%di", i, i); + } + srcbEndList(&gr->srcGen); + srcbAppends(&gr->srcGen, ";\n"); + + + /* S0 Lattice */ + if (reduxGenKernelRequiresLatticeS0(gr)){ srcbAppends(&gr->srcGen, " \n"); - for(i=gr->ndd-1;i>=0;i--){ - if(i == gr->ndd-1){ + for (i=gr->nds-1;i>=0;i--){ + if (i == gr->nds-1){ srcbAppendf(&gr->srcGen, - " TX dS%d = dJ%d;\n", - i, i); + " TS64 _S0S%d = S0J%d;\n", i, i); }else{ srcbAppendf(&gr->srcGen, - " TX dS%d = dJ%d + l%d%s*dS%d;\n", - i, i, i+1, - reduxGenAxisMaybeSplit(gr, i+1) ? 
"Div" : " ", i+1); + " TS64 _S0S%d = S0J%d + _L%d*_S0S%d;\n", i, i, i+1, i+1); } } + srcbAppends(&gr->srcGen, " S0Off += "); + srcbBeginList(&gr->srcGen, " + ", "0"); + for (i=0;inds;i++){ + srcbAppendElemf(&gr->srcGen, "_i%d*_S0S%d", i, i); + } + for (i=0;ilog2MaxBS;i++){ + srcbAppendElemf(&gr->srcGen, "_i%di*S0S%di", i, i); + } + srcbEndList(&gr->srcGen); + srcbAppends(&gr->srcGen, ";\n" + " S0 += S0Off;\n"); } - if (reduxGenRequiresDstArg(gr)){ + + + /* D0 Lattice */ + if (reduxGenKernelRequiresLatticeD0(gr)){ srcbAppends(&gr->srcGen, " \n"); - for(i=gr->ndd-1;i>=0;i--){ - if(i == gr->ndd-1){ + for (i=gr->ndd-1;i>=0;i--){ + if (i == gr->ndd-1){ srcbAppendf(&gr->srcGen, - " TX aS%d = aJ%d;\n", - i, i); + " TS64 _D0S%d = D0J%d;\n", i, i); }else{ srcbAppendf(&gr->srcGen, - " TX aS%d = aJ%d + l%d%s*aS%d;\n", - i, i, i+1, - reduxGenAxisMaybeSplit(gr, i+1) ? "Div" : " ", i+1); + " TS64 _D0S%d = D0J%d + _L%d*_D0S%d;\n", i, i, i+1, i+1); } } - } - srcbAppends(&gr->srcGen, " \n"); - srcbAppends(&gr->srcGen, " sOff += "); - srcbBeginList(&gr->srcGen, " + ", "0"); - for(i=0;inds;i++){ - srcbAppendElemf(&gr->srcGen, "(TX)i%d*sS%d", i, i); - } - srcbEndList(&gr->srcGen); - srcbAppends(&gr->srcGen, ";\n"); - if (reduxGenRequiresDst(gr)){ - srcbAppends(&gr->srcGen, " dOff += "); + srcbAppends(&gr->srcGen, " D0Off += "); srcbBeginList(&gr->srcGen, " + ", "0"); - for(i=0;indd;i++){ - srcbAppendElemf(&gr->srcGen, "(TX)i%d*dS%d", i, i); + for (i=0;indd;i++){ + srcbAppendElemf(&gr->srcGen, "_i%d*_D0S%d", i, i); } - srcbEndList(&gr->srcGen); - srcbAppends(&gr->srcGen, ";\n"); - } - if (reduxGenRequiresDstArg(gr)){ - srcbAppends(&gr->srcGen, " aOff += "); - srcbBeginList(&gr->srcGen, " + ", "0"); - for(i=0;indd;i++){ - srcbAppendElemf(&gr->srcGen, "(TX)i%d*aS%d", i, i); + for (i=0;ilog2MaxBS;i++){ + srcbAppendElemf(&gr->srcGen, "_i%di*D0S%di", i, i); } srcbEndList(&gr->srcGen); - srcbAppends(&gr->srcGen, ";\n"); - } - srcbAppends(&gr->srcGen, " \n"); - if(gr->ndd > 0){ - srcbAppendf(&gr->srcGen, - " i%d *= splitFree;\n", - gr->ndd-1); - } - if(gr->ndr > 0){ - srcbAppendf(&gr->srcGen, - " i%d *= splitReduce;\n", - gr->nds-1); - } - srcbAppends(&gr->srcGen, " \n"); - if(reduxGenKernelRequiresDst(gr)){ - srcbAppends(&gr->srcGen, - " TK* restrict wd = (TK* restrict)(w + wdOff);\n" - " TK* restrict wdL = &wd[0];\n" - " TK* restrict wdR = &wd[GDIM_0*D];\n" - " TK* restrict pd = (TK* restrict)(SHMEM + pdOff);\n"); - } - if(reduxGenKernelRequiresDstArg(gr)){ - srcbAppends(&gr->srcGen, - " TA* restrict wa = (TA* restrict)(w + waOff);\n" - " TA* restrict waL = &wa[0];\n" - " TA* restrict waR = &wa[GDIM_0*D];\n" - " TA* restrict pa = (TA* restrict)(SHMEM + paOff);\n"); + srcbAppends(&gr->srcGen, ";\n" + " if(perm < D){\n" + " ((TS64*)SHMEM)[perm] = D0Off;\n" + " }\n" + " local_barrier();\n" + " if(LID_0 < D){\n" + " D0Off = ((TS64*)SHMEM)[LID_0];\n" + " }\n" + " local_barrier();\n" + " D0 += D0Off;\n"); } - srcbAppends(&gr->srcGen, " \n"); -} -static void reduxGenSrcAppendThreadDecode (GpuReduction* gr){ - int i; - srcbAppends(&gr->srcGen, - " /**\n" - " * Decode THREAD start point.\n" - " * \n" - " * This involves computing the intra-block coordinate of a thread in a\n" - " * up-to-log2(MAX_BLOCK_THREADS)-dimensional coordinate system, then using\n" - " * those coordinates to compute private source/destination/destination\n" - " * argument pointers, argument indices and permute targets.\n" - " */\n" - " \n" - " unsigned iSplit = LID_0/(LDIM_0/(splitFree*splitReduce));\n" - " z = LID_0;\n"); - - 
for(i=gr->log2MaxL-1;i>=0;i--){ - srcbAppendf(&gr->srcGen, - " int t%d = z %% ibs%d;z /= ibs%d;\n", - i, i, i); - } - if(reduxGenRequiresDstArg(gr)){ - srcbAppends(&gr->srcGen, " TX ti = "); - srcbBeginList(&gr->srcGen, " + ", "0"); - for(i=0;ilog2MaxL;i++){ - srcbAppendElemf(&gr->srcGen, "t%d*ibl%dPDim", i, i); + + /* D1 Lattice */ + if (reduxGenKernelRequiresLatticeD1(gr)){ + srcbAppends(&gr->srcGen, " \n"); + for (i=gr->ndd-1;i>=0;i--){ + if (i == gr->ndd-1){ + srcbAppendf(&gr->srcGen, + " TS64 _D1S%d = D1J%d;\n", i, i); + }else{ + srcbAppendf(&gr->srcGen, + " TS64 _D1S%d = D1J%d + _L%d*_D1S%d;\n", i, i, i+1, i+1); + } } - srcbEndList(&gr->srcGen); - srcbAppends(&gr->srcGen, ";\n"); - } - srcbAppends(&gr->srcGen, " unsigned tp = "); - srcbBeginList(&gr->srcGen, " + ", "0"); - for(i=0;ilog2MaxL;i++){ - srcbAppendElemf(&gr->srcGen, "t%d* ibp%d", i, i); - } - srcbEndList(&gr->srcGen); - srcbAppends(&gr->srcGen, ";\n"); - srcbAppends(&gr->srcGen, " \n" - " sOff += "); - srcbBeginList(&gr->srcGen, " + ", "0"); - for(i=0;ilog2MaxL;i++){ - srcbAppendElemf(&gr->srcGen, "t%d*ibsOff%d ", i, i); - } - srcbEndList(&gr->srcGen); - srcbAppends(&gr->srcGen, ";\n"); - if(reduxGenRequiresDst(gr)){ - srcbAppends(&gr->srcGen, " \n" - " dOff += "); + srcbAppends(&gr->srcGen, " D1Off += "); srcbBeginList(&gr->srcGen, " + ", "0"); - for(i=0;ilog2MaxL;i++){ - srcbAppendElemf(&gr->srcGen, "t%d*ibdOff%d ", i, i); + for (i=0;indd;i++){ + srcbAppendElemf(&gr->srcGen, "_i%d*_D1S%d", i, i); + } + for (i=0;ilog2MaxBS;i++){ + srcbAppendElemf(&gr->srcGen, "_i%di*D1S%di", i, i); } srcbEndList(&gr->srcGen); - srcbAppends(&gr->srcGen, ";\n"); - srcbAppends(&gr->srcGen, " ((TX*)SHMEM)[tp] = dOff;\n" + srcbAppends(&gr->srcGen, ";\n" + " if(perm < D){\n" + " ((TS64*)SHMEM)[perm] = D1Off;\n" + " }\n" + " local_barrier();\n" + " if(LID_0 < D){\n" + " D1Off = ((TS64*)SHMEM)[LID_0];\n" + " }\n" " local_barrier();\n" - " dOff = ((TX*)SHMEM)[LID_0];\n" - " local_barrier();\n"); + " D1 += D1Off;\n"); } - if(reduxGenRequiresDstArg(gr)){ - srcbAppends(&gr->srcGen, " \n" - " aOff += "); + + + /* I0 Lattice */ + if (reduxGenKernelRequiresLatticeI0(gr)){ + srcbAppends(&gr->srcGen, " \n"); + for (i=gr->nds-1;i>=0;i--){ + if (i == gr->nds-1){ + srcbAppendf(&gr->srcGen, + " TS64 _I0S%d = I0J%d;\n", i, i); + }else{ + srcbAppendf(&gr->srcGen, + " TS64 _I0S%d = I0J%d + _L%d*_I0S%d;\n", i, i, i+1, i+1); + } + } + srcbAppends(&gr->srcGen, " I0 = "); srcbBeginList(&gr->srcGen, " + ", "0"); - for(i=0;ilog2MaxL;i++){ - srcbAppendElemf(&gr->srcGen, "t%d*ibaOff%d ", i, i); + for (i=0;inds;i++){ + srcbAppendElemf(&gr->srcGen, "_i%d*_I0S%d", i, i); + } + for (i=0;ilog2MaxBS;i++){ + srcbAppendElemf(&gr->srcGen, "_i%di*I0S%di", i, i); } srcbEndList(&gr->srcGen); srcbAppends(&gr->srcGen, ";\n"); - srcbAppends(&gr->srcGen, " ((TX*)SHMEM)[tp] = aOff;\n" - " local_barrier();\n" - " aOff = ((TX*)SHMEM)[LID_0];\n" - " local_barrier();\n"); } - srcbAppends(&gr->srcGen, " \n" - " const char* restrict ts = s + sOff;\n"); - if(reduxGenRequiresDst(gr)){ - srcbAppends(&gr->srcGen, " char* restrict td = d + dOff;\n"); + + + /* Workspace */ + if (reduxGenKernelRequiresWspace(gr)){ + srcbAppends(&gr->srcGen, " \n"); + if (reduxGenKernelRequiresStateK0(gr)){ + srcbAppends(&gr->srcGen, + " TK0* restrict const W0 = (TK0*)(W + W0Off);\n" + " TK0* restrict const W0L = &W0[0];\n" + " TK0* restrict const W0R = &W0[GDIM_0*D];\n" + " TK0* restrict const SHMEMK0 = (TK0*)(SHMEM + SHMEMK0Off);\n"); + } + if (reduxGenKernelRequiresStateK1(gr)){ + srcbAppends(&gr->srcGen, + " TK1* 
restrict const W1 = (TK1*)(W + W1Off);\n" + " TK1* restrict const W1L = &W1[0];\n" + " TK1* restrict const W1R = &W1[GDIM_0*D];\n" + " TK1* restrict const SHMEMK1 = (TK1*)(SHMEM + SHMEMK1Off);\n"); + } + } + + + /* Fixup the division we did to one of the dimensions. */ + srcbAppendf(&gr->srcGen, " \n"); + if (gr->nds>0){ + srcbAppendf(&gr->srcGen, + " _i%d *= (selector&2) ? 1 : LSlice;\n", gr->nds-1); } - if(reduxGenRequiresDstArg(gr)){ - srcbAppends(&gr->srcGen, " char* restrict ta = a + aOff;\n"); + if (gr->ndd>0){ + srcbAppendf(&gr->srcGen, + " _i%d *= (selector&2) ? LSlice : 1;\n", gr->ndd-1); } - srcbAppends(&gr->srcGen, " \n" - " \n"); -} -static void reduxGenSrcAppendPhase0 (GpuReduction* gr){ + + + /* Add a couple newlines before next section */ srcbAppends(&gr->srcGen, - " /* PHASE 0 */\n" - " \n" - " /* Loop Cores. */\n"); - if (gr->ndd == 0){ - /** - * Special case: If ndd == 0, we know this is an all-reduce or nearly, so - * we know that the only split axis, if any, is going to be a reduction axis. - * Therefore, splitFree will always be 1, and we only need to generate one - * set of loops. - */ - - reduxGenSrcAppendLoops(gr, 0, 1); - }else{ - srcbAppends(&gr->srcGen, " if(splitReduce == 1){\n" - " /* Free axis possibly split. */\n"); - reduxGenSrcAppendLoops(gr, 1, 0); - srcbAppends(&gr->srcGen, " }else{\n" - " /* Reduce axis possibly split. */\n"); - reduxGenSrcAppendLoops(gr, 0, 1); - srcbAppends(&gr->srcGen, " }\n"); - } -} -static void reduxGenSrcAppendLoops (GpuReduction* gr, - int freeMaybeSplit, - int reduceMaybeSplit){ - srcbAppends(&gr->srcGen, " if(misalignL){\n"); - reduxGenSrcAppendLoop(gr, 1, freeMaybeSplit, reduceMaybeSplit); + " \n" + " \n"); +} +static void reduxGenSrcAppendPhase0 (GpuReduction* gr, + uint32_t selector){ + int i; + const char* type; + + /** + * Convert index types depending on the template selected by the selector. + * + * If misaligned on the right, write partial reduction to right-half. + * If misaligned on the left, write partial reduction to left-half. + * + * The Phase 1 collector blocks will take care of reading the partial + * reduction results and combining them. + */ + + srcbAppends(&gr->srcGen, " "); + for (i=0;inds;i++){ + type = reduxGenSrcAxisIsHuge(gr, selector, i) ? "TU64" : "TU32"; + srcbAppendf(&gr->srcGen, "%s i%d = _i%d;", type, i, i); + } + srcbAppends(&gr->srcGen, "\n" + " \n" + " if(misalignR){\n"); + reduxGenSrcAppendLoop(gr, selector, 1); srcbAppends(&gr->srcGen, " }\n"); - reduxGenSrcAppendLoop(gr, 0, freeMaybeSplit, reduceMaybeSplit); - srcbAppends(&gr->srcGen, - " \n" - " /**\n" - " * Are we misaligned on the right? If so, we have a partial reduction\n" - " * to save.\n" - " */\n" - " \n" - " if(misalignR){\n" - " HREDUX(pd, pa, tp, accV, accI);\n" - " \n" - " /* Right-write partial reduction to workspace. 
*/\n" - " if(LID_0 < D){\n" - " SETREDUXSTATE(wdR[GID_0*D+LID_0], waR[GID_0*D+LID_0], pd[LID_0], pa[LID_0]);\n" - " }\n" - " }\n"); -} -static void reduxGenSrcAppendLoop (GpuReduction* gr, - int initial, - int freeMaybeSplit, - int reduceMaybeSplit){ - int i; - - srcbAppends(&gr->srcGen, " while(v > 0){\n"); - reduxGenSrcAppendDecrement(gr); - reduxGenSrcAppendVertical (gr, freeMaybeSplit, reduceMaybeSplit); - srcbAppends(&gr->srcGen, " /* Reduction Increments */\n"); - for(i=gr->nds-1;i >= gr->ndd;i--){ - reduxGenSrcAppendIncrement(gr, i, initial, freeMaybeSplit, reduceMaybeSplit); - } - srcbAppends(&gr->srcGen, " /* Horizontal Reduction */\n" - " HREDUX(pd, pa, tp, accV, accI);\n" - " \n"); - reduxGenSrcAppendDstWrite(gr, initial, freeMaybeSplit, reduceMaybeSplit); - srcbAppends(&gr->srcGen, " /* Reinitialize accumulators */\n" - " INITREDUXSTATE(accV, accI);\n" - " \n"); - srcbAppends(&gr->srcGen, " /* Free Increments */\n"); - for(i=gr->ndd-1;i >= 0;i--){ - reduxGenSrcAppendIncrement(gr, i, initial, freeMaybeSplit, reduceMaybeSplit); - } - srcbAppends(&gr->srcGen, " /* Exit loop */\n" - " break;\n" + reduxGenSrcAppendLoop(gr, selector, 0); + srcbAppends(&gr->srcGen, " if(misalignL){\n" + " HREDUX(SHMEMK0, SHMEMK1, perm, K0, K1);\n" + " if(LID_0 < D){\n" + " SETREDUXSTATE(W0L[GID_0*D+LID_0],\n" + " W1L[GID_0*D+LID_0],\n" + " SHMEMK0[LID_0],\n" + " SHMEMK1[LID_0]);\n" + " }\n" " }\n"); } -static void reduxGenSrcAppendDecrement (GpuReduction* gr){ - srcbAppends(&gr->srcGen, " /* Decrement. */\n" - " v--;\n" - " \n"); -} -static void reduxGenSrcAppendVertical (GpuReduction* gr, - int freeMaybeSplit, - int reduceMaybeSplit){ +static void reduxGenSrcAppendLoop (GpuReduction* gr, + uint32_t selector, + int initial){ int i; - - if(!freeMaybeSplit && !reduceMaybeSplit){ - srcbAppends(&gr->srcGen, " /* Vertical Reductions */\n" - " LOADS(tmpV, ts);\n" - " REDUX(accV, accI, tmpV, GETIDX);\n" - " \n"); + + srcbAppends(&gr->srcGen, " while(v > 0){v--;\n"); + reduxGenSrcAppendVertical (gr, selector); + for (i=gr->nds-1;i >= gr->ndd;i--){ + reduxGenSrcAppendIncrement(gr, selector, initial, i); + } + srcbAppends(&gr->srcGen, " HREDUX(SHMEMK0, SHMEMK1, perm, K0, K1);\n"); + reduxGenSrcAppendDstWrite(gr, selector, initial); + srcbAppends(&gr->srcGen, " INITREDUXSTATE(K0, K1);\n"); + for (i=gr->ndd-1;i >= 0;i--){ + reduxGenSrcAppendIncrement(gr, selector, initial, i); + } + srcbAppends(&gr->srcGen, " break;\n" + " }\n"); +} +static void reduxGenSrcAppendVertical (GpuReduction* gr, + uint32_t selector){ + int i = (selector&SELECTOR_SPLIT_FREE) ? gr->ndd-1 : gr->nds-1; + + if (i >= 0){ + srcbAppendf(&gr->srcGen, " if(i%d+iSplit < L%d){\n" + " LOADS0(tmpK0, S0);\n" + " REDUX(K0, K1, tmpK0, I0);\n" + " }\n", i, i); }else{ - i = freeMaybeSplit ? gr->ndd-1 : gr->nds-1; - srcbAppendf(&gr->srcGen, " /* Vertical Reductions */\n" - " if(i%d+iSplit < l%d){\n" - " LOADS(tmpV, ts);\n" - " REDUX(accV, accI, tmpV, GETIDX);\n" - " }\n" - " \n", i, i); - } -} -static void reduxGenSrcAppendIncrement (GpuReduction* gr, - int axis, - int initial, - int freeMaybeSplit, - int reduceMaybeSplit){ + srcbAppends(&gr->srcGen, " LOADS0(tmpK0, S0);\n" + " REDUX(K0, K1, tmpK0, I0);\n"); + } +} +static void reduxGenSrcAppendIncrement (GpuReduction* gr, + uint32_t selector, + int initial, + int axis){ + const char* cast = reduxGenSrcAxisIsHuge(gr, selector, axis) ? "TS64" : "TS32"; const char* breakOrCont = (initial) && (axis < gr->ndd) ? 
"break" : "continue"; - - if (freeMaybeSplit && axis == gr->ndd-1){ - srcbAppendf(&gr->srcGen, - " i%d += splitFree;\n" - " ts += sJ%d;", - axis, axis); - if(reduxGenRequiresDst(gr)){ - srcbAppendf(&gr->srcGen, "td += dJ%d;", axis); - } - if(reduxGenRequiresDstArg(gr)){ - srcbAppendf(&gr->srcGen, "ta += aJ%d;", axis); - } - srcbAppends(&gr->srcGen, "\n"); - srcbAppendf(&gr->srcGen, - " if (i%d < l%d){%s;}\n" - " else {i%d = 0;}\n" - " \n", - axis, axis, breakOrCont, axis); - }else if (reduceMaybeSplit && axis == gr->nds-1){ - srcbAppendf(&gr->srcGen, - " i%d += splitReduce;\n" - " ts += sJ%d;\n" - " if (i%d < l%d){%s;}\n" - " else {i%d = 0;}\n" - " \n", - axis, axis, axis, axis, breakOrCont, axis); + + /* Pointer bumps */ + srcbAppends(&gr->srcGen, " "); + if (reduxGenKernelRequiresLatticeS0(gr)){ + srcbAppendf(&gr->srcGen, "S0 -= S0J%d;", axis); }else{ - srcbAppendf(&gr->srcGen, - " i%d++;\n" - " ts += sJ%d;", - axis, axis); - if(axis < gr->ndd){ - if(reduxGenRequiresDst(gr)){ - srcbAppendf(&gr->srcGen, "td += dJ%d;", axis); - } - if(reduxGenRequiresDstArg(gr)){ - srcbAppendf(&gr->srcGen, "ta += aJ%d;", axis); - } - } - srcbAppends(&gr->srcGen, "\n"); - srcbAppendf(&gr->srcGen, - " if (i%d < l%d){%s;}\n" - " else {i%d = 0;}\n" - " \n", - axis, axis, breakOrCont, axis); - } -} -static void reduxGenSrcAppendDstWrite (GpuReduction* gr, - int initial, - int freeMaybeSplit, - int reduceMaybeSplit){ - if(initial){ - srcbAppends(&gr->srcGen, " /* Workspace Left-Write */\n" - " if(LID_0 < D){\n" - " SETREDUXSTATE(wdL[GID_0*D + LID_0], waL[GID_0*D + LID_0], pd[LID_0], pa[LID_0]);\n" - " }\n" - " \n"); + srcbAppends(&gr->srcGen, " "); + } + if (reduxGenKernelRequiresLatticeD0(gr) && axis < gr->ndd){ + srcbAppendf(&gr->srcGen, "D0 -= D0J%d;", axis); }else{ - if(!freeMaybeSplit){ - srcbAppends(&gr->srcGen, " /* Destination Write */\n" - " if(LID_0 < D){\n" - " STORED(td, pd[LID_0]);\n" - " STOREA(ta, pa[LID_0]);\n" - " }\n" - " \n"); - }else{ - if(gr->ndd > 0){ - srcbAppendf(&gr->srcGen, " /* Destination Write */\n" - " if(LID_0 < (l%d-i%dsrcGen, " "); + } + if (reduxGenKernelRequiresLatticeD1(gr) && axis < gr->ndd){ + srcbAppendf(&gr->srcGen, "D1 -= D1J%d;", axis); + }else{ + srcbAppends(&gr->srcGen, " "); + } + if (reduxGenKernelRequiresLatticeI0(gr)){ + srcbAppendf(&gr->srcGen, "I0 -= I0J%d;", axis); + }else{ + srcbAppends(&gr->srcGen, " "); + } + + /* Index Check */ + if (reduxGenSrcAxisIsSplit(gr, selector, axis)){ + srcbAppendf(&gr->srcGen, "i%d-=LSlice;if((%s)i%d >= 0){%s;}else{i%d+=LPadded;}\n", + axis, cast, axis, breakOrCont, axis); + }else{ + srcbAppendf(&gr->srcGen, "i%d--; if((%s)i%d >= 0){%s;}else{i%d+=L%d;}\n", + axis, cast, axis, breakOrCont, axis, axis); + } +} +static void reduxGenSrcAppendDstWrite (GpuReduction* gr, + uint32_t selector, + int initial){ + if (initial){ + srcbAppends(&gr->srcGen, " if(LID_0 < D){\n" + " SETREDUXSTATE(W0R[GID_0*D + LID_0],\n" + " W1R[GID_0*D + LID_0],\n" + " SHMEMK0[LID_0],\n" + " SHMEMK1[LID_0]);\n" + " }\n"); + }else{ + if (selector & SELECTOR_SPLIT_FREE){ + if (gr->ndd > 0){ + srcbAppendf(&gr->srcGen, " if(LID_0 < ((L%d-i%d)ndd-1, gr->ndd-1, gr->ndd-1, gr->ndd-1); }else{ - srcbAppendf(&gr->srcGen, " STORED(td, pd[LID_0]);\n" - " STOREA(ta, pa[LID_0]);\n"); + srcbAppendf(&gr->srcGen, " STORED0(D0, SHMEMK0[LID_0]);\n" + " STORED1(D1, SHMEMK1[LID_0]);\n"); } + }else{ + srcbAppends(&gr->srcGen, " if(LID_0 < D){\n" + " STORED0(D0, SHMEMK0[LID_0]);\n" + " STORED1(D1, SHMEMK1[LID_0]);\n" + " }\n"); } } } -static void reduxGenSrcAppendPhase1 (GpuReduction* 
gr){ +static void reduxGenSrcAppendPhase1 (GpuReduction* gr){ + /** + * PHASE 1 + * + * If we are a collector block, gather all partial results for the + * same points to the right of the current position in our workspace + * and accumulate them into our partial result, then write out to + * destination/destination argument. + * + * We perform a right-read of our workspace and a left-read of the + * other blocks' workspace. + */ + srcbAppends(&gr->srcGen, - " /* PHASE 1 */\n" - " \n" - " /**\n" - " * If we are a collector block, gather all partial results for the\n" - " * same point to the left of the current position in our workspace\n" - " * and accumulate them into our partial result, then write out to\n" - " * destination/destination argument.\n" - " * We perform a left-read of our workspace and a right-read of the\n" - " * other blocks' workspace.\n" - " */\n" - " \n" - " if(misalignL && doFinish && LID_0 < D){\n" - " SETREDUXSTATE(accV, accI, wdL[(GID_0+0)*D+LID_0], waL[(GID_0+0)*D+LID_0]);\n" + " if(collector && LID_0 < D){\n" + " SETREDUXSTATE(K0, K1, W0R[(GID_0+0)*D+LID_0], W1R[(GID_0+0)*D+LID_0]);\n" " \n" - " /* vvv-- NOTA BENE: The +B hack is REALLY NECESSARY, since C division is rounding to zero: (-1)/B == (B-1)/B for B>1. */\n" - " for(k=-1; /* Starting with the first block to our left... */\n" - " (start +B)/B == /* Is our write target the same as that of */\n" - " (start+k*V+V-1+B)/B; /* the target k blocks to our left? */\n" - " k--){ /* Try moving one more to the left. */\n" - " REDUX(accV, accI, wdR[(GID_0+k)*D+LID_0], waR[(GID_0+k)*D+LID_0]);\n" + " for(k=1,v=left+v-1,z=v+1; /* Starting with the first block to our right... */\n" + " v/B == z/B; /* Is our write target the same as that of */\n" + " /* the target k blocks to our right? */\n" + " k++,z+=V){ /* Try moving one more to the right. */\n" + " REDUX(K0, K1, W0L[(GID_0+k)*D+LID_0], W1L[(GID_0+k)*D+LID_0]);\n" " }\n" " \n"); - if(gr->ndd > 0){ + if (gr->ndd > 0){ srcbAppendf(&gr->srcGen, - " if(LID_0 < (l%d-i%dndd-1, gr->ndd-1, gr->ndd-1, gr->ndd-1); }else{ srcbAppends(&gr->srcGen, - " STORED(td, accV);\n" - " STOREA(ta, accI);\n"); + " STORED0(D0, K0);\n" + " STORED1(D1, K1);\n" + " }\n"); + } +} +static int reduxGenSrcAxisIsHuge (GpuReduction* gr, + uint32_t selector, + int axis){ + int hugeType = selector & SELECTOR_HUGE_AXIS; + int isSplitFree = !!(selector & SELECTOR_SPLIT_FREE); + int isAxisFree = axis < gr->ndd; + + if (hugeType == SELECTOR_HUGE_IS_SPLIT){ + return reduxGenSrcAxisIsSplit(gr, selector, axis); + }else if (hugeType == SELECTOR_HUGE_SAME_TYPE){ + if (isSplitFree == isAxisFree){ + if (isAxisFree){ + return axis == gr->ndd-2; + }else{ + return axis == gr->nds-2; + } + }else{ + return 0; + } + }else if (hugeType == SELECTOR_HUGE_OPPOSITE_TYPE){ + if (isSplitFree != isAxisFree){ + if (isAxisFree){ + return axis == gr->ndd-1; + }else{ + return axis == gr->nds-1; + } + }else{ + return 0; + } + }else{ + return 0; } - srcbAppends(&gr->srcGen, - " }\n"); +} +static int reduxGenSrcAxisIsSplit (GpuReduction* gr, + uint32_t selector, + int axis){ + return ( (selector & SELECTOR_SPLIT_FREE) && axis == gr->ndd-1) || + (!(selector & SELECTOR_SPLIT_FREE) && axis == gr->nds-1); } /** * @brief Compile the generated kernel. 
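+ *
+ *        The kernel is always built with the GA_USE_CLUDA flag; when either the
+ *        source type (TS0) or the destination type (TD0) is half-precision, the
+ *        GA_USE_HALF and GA_USE_SMALL capability flags are requested in addition
+ *        (see the flags computation at the top of the function).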
 */
-static int        reduxGenCompile             (GpuReduction*        gr){
-	int ret;
-	
+static int        reduxGenCompile             (GpuReduction*  gr){
+	int    ret, flags = 0;
+	
+	flags |= GA_USE_CLUDA;
+	if (gr->TS0tc == GA_HALF || gr->TD0tc == GA_HALF){
+		flags |= GA_USE_HALF|GA_USE_SMALL;
+	}
+	
 	ret = GpuKernel_init(&gr->k,
 	                     gr->gpuCtx,
 	                     1,
 	                     (const char**)&gr->kSourceCode,
 	                     &gr->kSourceCodeLen,
-	                     "redux",
+	                     gr->kName,
 	                     gr->kNumArgs,
 	                     gr->kArgTypeCodes,
-	                     GA_USE_CLUDA,
+	                     flags,
 	                     &gr->kErrorString);
 	if (ret != GA_NO_ERROR){
 		return reduxGenCleanupMsg(gr, ret,
-		                          "Failed to compile reduction kernel!\n"
+		                          "Failed to compile reduction kernel \"%s\"!\n"
 		                          "Error code is: %d\n"
 		                          "Error string is:\n"
 		                          "%s\n"
 		                          "Source code is:\n"
 		                          "%s\n",
-		                          ret, gr->kErrorString, gr->kSourceCode);
+		                          gr->kName, ret, gr->kErrorString, gr->kSourceCode);
 	}
-	
+	
 	return reduxGenComputeLaunchBounds(gr);
 }
@@ -2451,43 +2812,20 @@ static int        reduxGenCompile             (GpuReduction*        gr){
 static int        reduxGenComputeLaunchBounds (GpuReduction*        gr){
 	int    ret;
-	size_t a,b,c;
-	
+	
 	/**
 	 * Compute the maximum number of threads this kernel will support,
 	 * since this is critical to the scheduling and will not change now
 	 * that the kernel is compiled.
-	 *
-	 * This depends on several exhaustible resources and isn't necessarily
-	 * trivial to compute due to the complicated rules we must follow to
-	 * align shared memory, possibly slightly increasing consumption.
 	 */
-	
-	ret = gpukernel_property(gr->k.k, GA_KERNEL_PROP_MAXLSIZE, &gr->maxLK);
-	if(ret != GA_NO_ERROR){
+	
+	ret = gpukernel_property(gr->k.k, GA_KERNEL_PROP_MAXLSIZE, &gr->maxLK);
+	if (ret != GA_NO_ERROR){
 		return reduxGenCleanupMsg(gr, ret,
 		       "Failed to read max local size for compiled kernel!\n");
 	}
-	a         = gr->maxL0;
-	b         = gr->maxLg;
-	c         = gr->maxLM/reduxGenGetReduxStateSize(gr);   /* Kernel register use */
-	gr->maxLK = gr->maxLK<a ? gr->maxLK : a;/* Maximum block size on axis 0 */
-	gr->maxLK = gr->maxLK<b ? gr->maxLK : b;/* Maximum total block size */
-	gr->maxLK = gr->maxLK<c ? gr->maxLK : c;/* Shared memory per thread. */
-	
-	/**
-	 * We now have a tight bound on the maximum block size, but due to memory
-	 * alignment rules the memory consumption may be slightly higher than we
-	 * initially computed, and thus the shared memory use can still be
-	 * excessive. The following loop will almost certainly decrement at most
-	 * once, unless type alignments are very wierd.
-	 */
-	
-	while(reduxGenGetSHMEMSize(gr, gr->maxLK) > gr->maxLM){
-		gr->maxLK--;
-	}
-	
+	gr->maxLK = gr->maxLK<gr->maxBS ? gr->maxLK : gr->maxBS;
+	
 	return reduxGenCleanup(gr, GA_NO_ERROR);
 }
@@ -2496,11 +2834,11 @@ static int        reduxGenComputeLaunchBounds (GpuReduction*        gr){
  */
 static int        reduxGenCleanup             (GpuReduction*  gr, int ret){
-	if(ret != GA_NO_ERROR){
+	if (ret != GA_NO_ERROR){
 		free(gr->kArgTypeCodes);
 		free(gr->kSourceCode);
 		free(gr->kErrorString);
-		
+		
 		memset(gr, 0, sizeof(*gr));
 		free(gr);
 	}
@@ -2511,7 +2849,7 @@ static int        reduxGenCleanupMsg          (GpuReduction*  gr, int ret,
                                                const char*    fmt, ...){
 #if DEBUG
 	FILE* fp = stderr;
-	
+	
 	va_list ap;
 	va_start(ap, fmt);
 	vfprintf(fp, fmt, ap);
@@ -2520,7 +2858,7 @@ static int        reduxGenCleanupMsg          (GpuReduction*  gr, int ret,
 #else
 	(void)fmt;
 #endif
-	
+	
 	return reduxGenCleanup(gr, ret);
 }
@@ -2528,26 +2866,26 @@ static int        reduxGenCleanupMsg          (GpuReduction*  gr, int ret,
  * Count # of arguments as determined by iterator.
*/ -static void reduxGenCountArgs (GpuReduction* gr, +static void reduxGenCountArgs (const GpuReduction* gr, int typecode, const char* typeName, const char* baseName, int num, void* user){ + (void)gr; (void)typecode; (void)typeName; (void)baseName; (void)num; - (void)user; - - gr->kNumArgs++; + + (*(int*)user)++; } /** * Record the typecodes in the arguments typecode array. */ -static void reduxGenSaveArgTypecodes (GpuReduction* gr, +static void reduxGenSaveArgTypecodes (const GpuReduction* gr, int typecode, const char* typeName, const char* baseName, @@ -2557,7 +2895,7 @@ static void reduxGenSaveArgTypecodes (GpuReduction* gr, (void)baseName; (void)num; (void)user; - + gr->kArgTypeCodes[(*(int*)user)++] = typecode; } @@ -2565,7 +2903,7 @@ static void reduxGenSaveArgTypecodes (GpuReduction* gr, * Append an argument declaration to prototype. */ -static void reduxGenAppendArg (GpuReduction* gr, +static void reduxGenAppendArg (const GpuReduction* gr, int typecode, const char* typeName, const char* baseName, @@ -2573,35 +2911,35 @@ static void reduxGenAppendArg (GpuReduction* gr, void* user){ (void)user; (void)typecode; - - if((*(int*)user)++ > 0){ - srcbAppends(&gr->srcGen, ",\n "); + + if ((*(int*)user)++ > 0){ + srcbAppends(&((GpuReduction*)gr)->srcGen, ",\n "); } - srcbAppendf(&gr->srcGen, "%-25s ", typeName); - srcbAppendf(&gr->srcGen, baseName, num); + srcbAppendf(&((GpuReduction*)gr)->srcGen, "%-35s ", typeName); + srcbAppendf(&((GpuReduction*)gr)->srcGen, baseName, num); } /** * Marshall argument declaration during invocation. */ -static void reduxInvMarshalArg (GpuReduction* gr, +static void reduxInvMarshalArg (const GpuReduction* gr, int typecode, const char* typeName, const char* baseName, - int k, + int num, void* user){ redux_ctx* ctx; - int* i; - + int* i, k = num; + (void)typecode; (void)typeName; - + ctx = (redux_ctx*)(((void**)user)[0]); i = (int *)(((void**)user)[1]); - - if (strcmp(baseName, "phase") == 0){ - ctx->kArgs[(*i)++] = (void*)&ctx->phase; + + if (strcmp(baseName, "selector") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->selector; }else if (strcmp(baseName, "U") == 0){ ctx->kArgs[(*i)++] = (void*)&ctx->U; }else if (strcmp(baseName, "V") == 0){ @@ -2610,56 +2948,58 @@ static void reduxInvMarshalArg (GpuReduction* gr, ctx->kArgs[(*i)++] = (void*)&ctx->B; }else if (strcmp(baseName, "D") == 0){ ctx->kArgs[(*i)++] = (void*)&ctx->D; + }else if (strcmp(baseName, "Dunit") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->Dunit; }else if (strcmp(baseName, "H") == 0){ ctx->kArgs[(*i)++] = (void*)&ctx->H; - }else if (strcmp(baseName, "splitFree") == 0){ - ctx->kArgs[(*i)++] = (void*)&ctx->splitFree; - }else if (strcmp(baseName, "splitReduce") == 0){ - ctx->kArgs[(*i)++] = (void*)&ctx->splitReduce; - }else if (strcmp(baseName, "l%d") == 0){ - ctx->kArgs[(*i)++] = (void*)&ctx->l[k]; - }else if (strcmp(baseName, "l%dPDim") == 0){ - ctx->kArgs[(*i)++] = (void*)&ctx->lPDim[k-gr->ndd]; - }else if (strcmp(baseName, "s") == 0){ - ctx->kArgs[(*i)++] = (void*) ctx->flatSrcData; - }else if (strcmp(baseName, "sOff") == 0){ - ctx->kArgs[(*i)++] = (void*)&ctx->flatSrcOffset; - }else if (strcmp(baseName, "sJ%d") == 0){ - ctx->kArgs[(*i)++] = (void*)&ctx->sJ[k]; - }else if (strcmp(baseName, "d") == 0){ - ctx->kArgs[(*i)++] = (void*) ctx->flatDstData; - }else if (strcmp(baseName, "dOff") == 0){ - ctx->kArgs[(*i)++] = (void*)&ctx->flatDstOffset; - }else if (strcmp(baseName, "dJ%d") == 0){ - ctx->kArgs[(*i)++] = (void*)&ctx->dJ[k]; - }else if (strcmp(baseName, "a") == 0){ - ctx->kArgs[(*i)++] = (void*) 
ctx->flatDstArgData; - }else if (strcmp(baseName, "aOff") == 0){ - ctx->kArgs[(*i)++] = (void*)&ctx->flatDstArgOffset; - }else if (strcmp(baseName, "aJ%d") == 0){ - ctx->kArgs[(*i)++] = (void*)&ctx->aJ[k]; - }else if (strcmp(baseName, "w") == 0){ - ctx->kArgs[(*i)++] = (void*) ctx->w; - }else if (strcmp(baseName, "wdOff") == 0){ - ctx->kArgs[(*i)++] = (void*)&ctx->wdOff; - }else if (strcmp(baseName, "pdOff") == 0){ - ctx->kArgs[(*i)++] = (void*)&ctx->pdOff; - }else if (strcmp(baseName, "waOff") == 0){ - ctx->kArgs[(*i)++] = (void*)&ctx->waOff; - }else if (strcmp(baseName, "paOff") == 0){ - ctx->kArgs[(*i)++] = (void*)&ctx->paOff; - }else if (strcmp(baseName, "ibs%d") == 0){ - ctx->kArgs[(*i)++] = (void*)&ctx->ibs[k]; - }else if (strcmp(baseName, "ibp%d") == 0){ - ctx->kArgs[(*i)++] = (void*)&ctx->ibp[k]; - }else if (strcmp(baseName, "ibl%dPDim") == 0){ - ctx->kArgs[(*i)++] = (void*)&ctx->iblPDim[k]; - }else if (strcmp(baseName, "ibsOff%d") == 0){ - ctx->kArgs[(*i)++] = (void*)&ctx->ibsOff[k]; - }else if (strcmp(baseName, "ibdOff%d") == 0){ - ctx->kArgs[(*i)++] = (void*)&ctx->ibdOff[k]; - }else if (strcmp(baseName, "ibaOff%d") == 0){ - ctx->kArgs[(*i)++] = (void*)&ctx->ibaOff[k]; + }else if (strcmp(baseName, "LSlice") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->LSlice; + }else if (strcmp(baseName, "LPadded") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->LPadded; + }else if (strcmp(baseName, "L%d") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->L[k]; + }else if (strcmp(baseName, "L%di") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->Li[k]; + }else if (strcmp(baseName, "S0") == 0){ + ctx->kArgs[(*i)++] = (void*) ctx->S0Data; + }else if (strcmp(baseName, "S0Off") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->S0Off; + }else if (strcmp(baseName, "S0J%d") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->S0J[k]; + }else if (strcmp(baseName, "S0S%di") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->S0Si[k]; + }else if (strcmp(baseName, "D0") == 0){ + ctx->kArgs[(*i)++] = (void*) ctx->D0Data; + }else if (strcmp(baseName, "D0Off") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->D0Off; + }else if (strcmp(baseName, "D0J%d") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->D0J[k]; + }else if (strcmp(baseName, "D0S%di") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->D0Si[k]; + }else if (strcmp(baseName, "D1") == 0){ + ctx->kArgs[(*i)++] = (void*) ctx->D1Data; + }else if (strcmp(baseName, "D1Off") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->D1Off; + }else if (strcmp(baseName, "D1J%d") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->D1J[k]; + }else if (strcmp(baseName, "D1S%di") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->D1Si[k]; + }else if (strcmp(baseName, "I0J%d") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->I0J[k]; + }else if (strcmp(baseName, "I0S%di") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->I0Si[k]; + }else if (strcmp(baseName, "W") == 0){ + ctx->kArgs[(*i)++] = (void*) ctx->W; + }else if (strcmp(baseName, "W0Off") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->W0Off; + }else if (strcmp(baseName, "SHMEMK0Off") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->SHMEMK0Off; + }else if (strcmp(baseName, "W1Off") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->W1Off; + }else if (strcmp(baseName, "SHMEMK1Off") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->SHMEMK1Off; + }else if (strcmp(baseName, "perm%di") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->perm[k]; } } @@ -2667,7 +3007,7 @@ static void reduxInvMarshalArg (GpuReduction* gr, /** * @brief Estimate the level of parallelism available in the GPU context of * this reduction operator. 
- *
+ *
 * This is a rough target number of threads. It would definitely fill the
 * device, plus some substantial margin.
 */
@@ -2676,21 +3016,75 @@ static size_t     reduxGenEstimateParallelism (const GpuReduction*  gr){
 	/**
 	 * An arbitrary margin factor ensuring there will be a few thread blocks
 	 * per SMX.
-	 *
+	 *
 	 * E.g. on Kepler, each SMX can handle up to two 1024-thread blocks
-	 * simultaneously, so a margin of 6/SMX should ensure with very high
+	 * simultaneously, so a margin of 16/SMX should ensure with very high
 	 * likelihood that all SMXes will be fed and kept busy.
 	 */
-	
-	size_t marginFactor = 6;
+	
+	size_t marginFactor = 16;
 	return marginFactor * gr->numProcs * gr->maxLg;
 }
 /**
- * @brief Returns whether the reduction interface requires a dst argument.
+ * @brief Return whether or not the reduction operator's interface or kernel
+ *        requires a specific argument, lattice or storage.
+ *
+ * Specifically, check whether the reduction operator's:
+ *   - Interface (reduxGenRequires*())              requires the passing of an s0/d0/d1 argument
+ *   - Kernel    (reduxGenKernelRequiresLattice*()) requires the walking of an s0/d0/d1/i0 lattice
+ *   - Kernel    (reduxGenKernelRequiresState*())   contains a k0/k1 state
+ *   - Kernel    (reduxGenKernelRequiresWspace())   requires workspaces named w* for states k*.
+ *
+ * The reduction operator's interface, kernel and state are semantically
+ * subtly different. The interface asks whether the GpuReduction_call(), and
+ * therefore the generated kernel, must receive a specific argument:
+ *
+ *   - Argument s0 (Typically the source tensor)
+ *   - Argument d0 (Typically the destination tensor)
+ *   - Argument d1 (Typically the destination argument tensor)
+ *
+ * The kernel asks whether it must internally walk over a specific lattice, where:
+ *
+ *   - Lattice s0 is the lattice of pointers into the s0 tensor.
+ *   - Lattice d0 is the lattice of pointers into the d0 tensor.
+ *   - Lattice d1 is the lattice of pointers into the d1 tensor.
+ *   - Lattice i0 is the lattice of flattened indices into the s0 tensor.
+ *
+ * The state asks whether it should contain:
+ *
+ *   - State k0 (Typically for accumulator states typed `TK` over the s0 lattice
+ *     and written to the d0 lattice)
+ *   - State k1 (Typically for indexes typed `TI` from the i0 lattice and written
+ *     to the d1 lattice)
+ *
+ * The workspace asks whether it is required in order to save partial reduction
+ * states k* computed during Phase 0.
+ *
+ * Currently:
+ *
+ *   - All GpuReductions require an s0 argument.
+ *   - All GpuReductions except argmin/argmax require a d0 argument.
+ *   - Only the argmin/argmax/minandargmin/maxandargmax GpuReductions require a d1 argument.
+ *   - All and only the GpuReductions requiring an s0 argument require walking over the s0 lattice.
+ *   - All and only the GpuReductions requiring a  d0 argument require walking over the d0 lattice.
+ *   - All and only the GpuReductions requiring a  d1 argument require walking over the d1 lattice.
+ *   - All and only the GpuReductions requiring a  d1 argument require walking over the i0 lattice.
+ *   - All and only the GpuReductions requiring an s0 lattice walk require a k0 state.
+ *   - All and only the GpuReductions requiring an i0 lattice walk require a k1 state.
+ *   - All GpuReductions potentially require a workspace for their states.
+ *
+ * However, if this reduction engine were generalized to multi-reduction, elemwise or
+ * initialization operations, the above might not necessarily hold anymore.
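+ *
+ * For example, under the rules above, a GA_REDUCE_SUM reduction takes s0 and
+ * d0 arguments, walks only the s0/d0 lattices and carries only a k0 state; a
+ * GA_REDUCE_ARGMAX reduction takes s0 and d1 arguments, walks the s0/d1/i0
+ * lattices and carries both k0 and k1 states; and a GA_REDUCE_MAXANDARGMAX
+ * reduction takes s0, d0 and d1 arguments, walks all four lattices and
+ * carries both states.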
*/ -static int reduxGenRequiresDst (const GpuReduction* gr){ +static int reduxGenRequiresS0 (const GpuReduction* gr){ + (void)gr; + return 1; +} +static int reduxGenRequiresD0 (const GpuReduction* gr){ switch (gr->op){ case GA_REDUCE_ARGMIN: case GA_REDUCE_ARGMAX: @@ -2699,12 +3093,7 @@ static int reduxGenRequiresDst (const GpuReduction* gr){ return 1; } } - -/** - * @brief Returns whether the reduction interface requires a dstArg argument. - */ - -static int reduxGenRequiresDstArg (const GpuReduction* gr){ +static int reduxGenRequiresD1 (const GpuReduction* gr){ switch (gr->op){ case GA_REDUCE_MINANDARGMIN: case GA_REDUCE_MAXANDARGMAX: @@ -2715,48 +3104,45 @@ static int reduxGenRequiresDstArg (const GpuReduction* gr){ return 0; } } - -/** - * @brief Returns whether the generated kernel internally requires a dst - * workspace. - * - * This is semantically subtly different from reduxGenRequiresDst(). The main - * difference is in the implementation of the GA_REDUCE_ARGMIN/ARGMAX - * reductions; both require a dst workspace buffer for the min/max values - * associated with the indices that they return, even though they will be - * discarded. - * - * As of now, all reductions use a dst workspace internally. - */ - -static int reduxGenKernelRequiresDst (const GpuReduction* gr){ +static int reduxGenKernelRequiresLatticeS0(const GpuReduction* gr){ + return reduxGenRequiresS0(gr); +} +static int reduxGenKernelRequiresLatticeD0(const GpuReduction* gr){ + return reduxGenRequiresD0(gr); +} +static int reduxGenKernelRequiresLatticeD1(const GpuReduction* gr){ + return reduxGenRequiresD1(gr); +} +static int reduxGenKernelRequiresLatticeI0(const GpuReduction* gr){ + return reduxGenRequiresD1(gr); +} +static int reduxGenKernelRequiresStateK0 (const GpuReduction* gr){ + return reduxGenKernelRequiresLatticeS0(gr); +} +static int reduxGenKernelRequiresStateK1 (const GpuReduction* gr){ + return reduxGenKernelRequiresLatticeI0(gr); +} +static int reduxGenKernelRequiresWspace (const GpuReduction* gr){ + (void)gr; return 1; } -/** - * @brief Returns whether the generated kernel internally requires a dstArg - * workspace. - * - * This is semantically subtly different from reduxHasDstArg(), since it asks - * whether the reduction, even though it might not accept a dstArg argument, - * still requires a dstArg workspace internally. - * - * Currently, there exist no operations that require a dstArg workspace - * internally but which is not also part of the external interface. - */ - -static int reduxGenKernelRequiresDstArg (const GpuReduction* gr){ - return reduxGenRequiresDstArg(gr); -} /** - * @brief Whether or not an axis is maybe split. - * - * An axis is possibly split if it is the last free or last reduction axis. + * Get size and alignment requirements of K0 and K1 states. 
*/ -static int reduxGenAxisMaybeSplit (const GpuReduction* gr, int axis){ - return axis == gr->ndd-1 || axis == gr->nds-1; +static size_t reduxGenGetK0Size (const GpuReduction* gr){ + return gr->TK0.size; +} +static size_t reduxGenGetK0Align (const GpuReduction* gr){ + return gr->TK0.align; +} +static size_t reduxGenGetK1Size (const GpuReduction* gr){ + return gr->TK1.size; +} +static size_t reduxGenGetK1Align (const GpuReduction* gr){ + return gr->TK1.align; } /** @@ -2764,20 +3150,15 @@ static int reduxGenAxisMaybeSplit (const GpuReduction* gr, int ax */ static size_t reduxGenGetReduxStateSize (const GpuReduction* gr){ - size_t total = 0, idxSize = gpuarray_get_elsize(gr->idxTypeCode); - + size_t total = 0, idxSize = gpuarray_get_elsize(gr->TS64tc); + /* The accumulator and index types can be wider than dst/dstArg's types. */ - total += reduxGenKernelRequiresDst(gr) ? - gpuarray_get_elsize(gr->accTypeCode) : - 0; - total += reduxGenKernelRequiresDstArg(gr) ? - gpuarray_get_elsize(gr->idxTypeCode) : - 0; - + total += reduxGenKernelRequiresStateK0(gr) ? reduxGenGetK0Size(gr) : 0; + total += reduxGenKernelRequiresStateK1(gr) ? reduxGenGetK1Size(gr) : 0; + /* At minimum, there must be space for the offset permute. */ total = total < idxSize ? idxSize : total; - - + /* Return the calculated amount of space. */ return total; } @@ -2785,143 +3166,113 @@ static size_t reduxGenGetReduxStateSize (const GpuReduction* gr){ /** * @brief Get the maximum number of threads this operator's kernel can handle. */ - -static size_t reduxGenGetMaxLocalSize (const GpuReduction* gr){ - return gr->maxLK; -} - -/** - * @brief Get the shared memory consumption for a given block size. - * - * This is non-trivial since it requires ensuring alignment of datatypes. - */ - -static size_t reduxGenGetSHMEMSize (const GpuReduction* gr, size_t bs){ - const gpuarray_type* type; - size_t total = 0, permuteSpace; - - if(reduxGenKernelRequiresDst(gr)){ - type = gpuarray_get_type(gr->accTypeCode); - total = DIVIDECEIL(total, type->align)*type->align; - total += bs*type->size; - } - if(reduxGenKernelRequiresDstArg(gr)){ - type = gpuarray_get_type(gr->idxTypeCode); - total = DIVIDECEIL(total, type->align)*type->align; - total += bs*type->size; - } - - /* Ensure space for pointer permute. */ - permuteSpace = gpuarray_get_type(gr->idxTypeCode)->size * bs; - if(total < permuteSpace){ - total = permuteSpace; - } - - return total; + +static size_t reduxGenGetMaxLocalSize (const GpuReduction* gr){ + return gr->maxLK; } /** - * @brief Get the shared memory byte offset for dst. + * @brief Get the shared memory consumption for a given block size. */ -static size_t reduxGenGetSHMEMDstOff (const GpuReduction* gr, size_t bs){ - return 0; +static size_t reduxGenGetSHMEMSize (const GpuReduction* gr, size_t cells){ + size_t total = 0, totalPermute; + + /* Compute size of SHMEM working space */ + total += reduxGenKernelRequiresStateK0(gr) ? cells*reduxGenGetK0Size(gr) : 0; + total += reduxGenKernelRequiresStateK1(gr) ? cells*reduxGenGetK1Size(gr) : 0; + + /* But ensure space for pointer offset permute at beginning of kernel. */ + totalPermute = cells*gpuarray_get_type(gr->TS64tc)->size; + total = total < totalPermute ? totalPermute : total; + + return total; } /** - * @brief Get the shared memory byte offset for dstArg. + * @brief Get the shared memory byte offset for the k0 and k1 states. 
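+ *
+ *        The state with the stricter alignment requirement is laid out first,
+ *        so that (for the usual power-of-two alignments) the second region
+ *        starts on a suitably aligned boundary with no padding in between.
+ *        As an illustrative sketch, assuming a float32 k0 state (4-byte size
+ *        and alignment) and an int64 k1 state (8-byte size and alignment),
+ *        for `cells` cells the k1 region occupies bytes [0, cells*8) and the
+ *        k0 region begins at byte offset cells*8.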
*/ -static size_t reduxGenGetSHMEMDstArgOff (const GpuReduction* gr, size_t bs){ - const gpuarray_type* type; - size_t total = 0; - - if(reduxGenKernelRequiresDst(gr) && reduxGenKernelRequiresDstArg(gr)){ - type = gpuarray_get_type(gr->accTypeCode); - total = DIVIDECEIL(total, type->align)*type->align; - total += bs*type->size; - type = gpuarray_get_type(gr->idxTypeCode); - total = DIVIDECEIL(total, type->align)*type->align; - - return total; +static size_t reduxGenGetSHMEMK0Off (const GpuReduction* gr, size_t cells){ + if (!reduxGenKernelRequiresWspace (gr)|| + !reduxGenKernelRequiresStateK0(gr)|| + !reduxGenKernelRequiresStateK1(gr)){ + return 0; + } + + if (reduxGenGetK0Align(gr) > reduxGenGetK1Align(gr)){ + return 0; + }else{ + return cells*reduxGenGetK1Size(gr); + } +} +static size_t reduxGenGetSHMEMK1Off (const GpuReduction* gr, size_t cells){ + if (!reduxGenKernelRequiresWspace (gr)|| + !reduxGenKernelRequiresStateK0(gr)|| + !reduxGenKernelRequiresStateK1(gr)){ + return 0; + } + + if (reduxGenGetK0Align(gr) > reduxGenGetK1Align(gr)){ + return cells*reduxGenGetK0Size(gr); }else{ return 0; } } /** - * Get the amount of Workspace memory required. - * + * Get the amount of workspace memory required. + * * NOT necessarily the same as amount of SHMEM! The workspace is NOT used for * intrablock offset permutes, for instance. */ -static size_t reduxGenGetWMEMSize (const GpuReduction* gr, size_t bs){ - const gpuarray_type* type; +static size_t reduxGenGetWMEMSize (const GpuReduction* gr, size_t cells){ size_t total = 0; - - if(reduxGenKernelRequiresDst(gr)){ - type = gpuarray_get_type(gr->accTypeCode); - total = DIVIDECEIL(total, type->align)*type->align; - total += bs*type->size; - } - if(reduxGenKernelRequiresDstArg(gr)){ - type = gpuarray_get_type(gr->idxTypeCode); - total = DIVIDECEIL(total, type->align)*type->align; - total += bs*type->size; - } - - return total; -} -/** - * @brief Get the workspace memory byte offset for dst. - */ + total += reduxGenKernelRequiresStateK0(gr) ? cells*reduxGenGetK0Size(gr) : 0; + total += reduxGenKernelRequiresStateK1(gr) ? cells*reduxGenGetK1Size(gr) : 0; -static size_t reduxGenGetWMEMDstOff (const GpuReduction* gr, size_t bs){ - return reduxGenGetSHMEMDstOff(gr, bs); + return total; } /** - * @brief Get the workspace memory byte offset for dstArg. + * @brief Get the workspace memory byte offset for the k0 and k1 states. */ -static size_t reduxGenGetWMEMDstArgOff (const GpuReduction* gr, size_t bs){ - return reduxGenGetSHMEMDstArgOff(gr, bs); +static size_t reduxGenGetWMEMK0Off (const GpuReduction* gr, size_t cells){ + return reduxGenGetSHMEMK0Off(gr, cells); +} +static size_t reduxGenGetWMEMK1Off (const GpuReduction* gr, size_t cells){ + return reduxGenGetSHMEMK1Off(gr, cells); } /** * @brief Initialize the context. - * + * * After this function, calling reduxInvCleanup*() becomes safe. */ -static int reduxInvInit (redux_ctx* ctx){ +static int reduxInvInit (redux_ctx* ctx){ /** * We initialize certain parts of the context. 
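+	 *
+	 * In particular, every pointer member is set to NULL below, so that any
+	 * later error path can hand the context to reduxInvCleanup*() and have
+	 * it released safely even if some allocations were never performed.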
*/ - - ctx->l = NULL; - ctx->lPDim = NULL; - ctx->sJ = NULL; - ctx->dJ = NULL; - ctx->aJ = NULL; - ctx->ibs = NULL; - ctx->ibp = NULL; - ctx->iblPDim = NULL; - ctx->ibsOff = NULL; - ctx->ibdOff = NULL; - ctx->ibaOff = NULL; - ctx->kArgs = NULL; - ctx->xdSrc = NULL; - ctx->xdSrcPtrs = NULL; - ctx->xdTmpPtrs = NULL; - ctx->xdSplit = NULL; - - ctx->w = NULL; - - ctx->prodAllAxes = ctx->prodRdxAxes = ctx->prodFreeAxes = 1; - ctx->bs = ctx->gs = 1; + + ctx->L = ctx->Li = NULL; + ctx->S0J = ctx->S0Si = NULL; + ctx->D0J = ctx->D0Si = NULL; + ctx->D1J = ctx->D1Si = NULL; + ctx->I0J = ctx->I0Si = NULL; + ctx->perm = NULL; + ctx->kArgs = NULL; + ctx->xdSrc = NULL; + ctx->xdSrcPtrs = NULL; + ctx->xdSplit = NULL; + + ctx->W = NULL; + + ctx->prodAllAxes = ctx->prodRdxAxes = ctx->prodFreeAxes = 1; + ctx->bs = ctx->gs = 1; return reduxInvInferProperties(ctx); } @@ -2936,50 +3287,51 @@ static int reduxInvInferProperties (redux_ctx* ctx){ size_t d; - /* Insane src, reduxLen, dst or dstArg? */ - if(!ctx->reduxList){ - ctx->reduxLen = ctx->src->nd; - } - if (!ctx->src){ + /* Insane s0, reduxLen, d0 or d1? */ + if (reduxInvRequiresS0(ctx) && !ctx->s0){ return reduxInvCleanupMsg(ctx, GA_INVALID_ERROR, - "src is NULL!\n"); - }else if (ctx->src->nd <= 0){ + "s0 is NULL, but reduction requires it!\n"); + } + if (!ctx->reduxList){ + ctx->reduxLen = reduxInvRequiresS0(ctx) ? ctx->s0->nd : 0; + } + if (reduxInvRequiresS0(ctx) && ctx->s0->nd <= 0){ return reduxInvCleanupMsg(ctx, GA_INVALID_ERROR, - "src is a scalar, cannot reduce it!\n"); - }else if (ctx->reduxLen < 0){ + "s0 is a scalar, cannot reduce it further!\n"); + }else if (reduxInvRequiresS0(ctx) && ctx->reduxLen < 0){ return reduxInvCleanupMsg(ctx, GA_INVALID_ERROR, - "Length of list of dimensions to be reduced is less than 0!\n"); - }else if (ctx->src->nd < (unsigned)ctx->reduxLen){ + "Length of list of axes to be reduced is less than 0!\n"); + }else if (reduxInvRequiresS0(ctx) && ctx->s0->nd < (unsigned)ctx->reduxLen){ return reduxInvCleanupMsg(ctx, GA_INVALID_ERROR, - "src has fewer dimensions than there are dimensions to reduce!\n"); - }else if (reduxInvRequiresDst (ctx) && !ctx->dst){ + "s0 has fewer axes than there are axes to reduce!\n"); + }else if (reduxInvRequiresD0(ctx) && !ctx->d0){ return reduxInvCleanupMsg(ctx, GA_INVALID_ERROR, - "dst is NULL, but reduction requires it!\n"); - }else if (reduxInvRequiresDstArg(ctx) && !ctx->dstArg){ + "d0 is NULL, but reduction requires it!\n"); + }else if (reduxInvRequiresD1(ctx) && !ctx->d1){ return reduxInvCleanupMsg(ctx, GA_INVALID_ERROR, - "dstArg is NULL, but reduction requires it!\n"); - }else if (ctx->dst && ctx->dst->nd +ctx->reduxLen != ctx->src->nd){ + "d1 is NULL, but reduction requires it!\n"); + }else if (reduxInvRequiresD0(ctx) && reduxInvRequiresS0(ctx) && ctx->d0->nd+ctx->reduxLen != ctx->s0->nd){ return reduxInvCleanupMsg(ctx, GA_INVALID_ERROR, - "dst is of incorrect dimensionality for this reduction!\n"); - }else if (ctx->dstArg && ctx->dstArg->nd+ctx->reduxLen != ctx->src->nd){ + "d0 is of incorrect rank for this reduction!\n"); + }else if (reduxInvRequiresD1(ctx) && reduxInvRequiresS0(ctx) && ctx->d1->nd+ctx->reduxLen != ctx->s0->nd){ return reduxInvCleanupMsg(ctx, GA_INVALID_ERROR, - "dstArg is of incorrect dimensionality for this reduction!\n"); + "d1 is of incorrect rank for this reduction!\n"); } - ctx->nds = ctx->src->nd; - ctx->ndr = ctx->reduxLen; - ctx->ndd = ctx->nds - ctx->ndr; - ctx->ndfs = ctx->ndfr = ctx->ndfd = 0; - + ctx->nds0 = reduxInvRequiresS0(ctx) ? 
ctx->s0->nd : 0; + ctx->nds0r = ctx->reduxLen; + ctx->ndd0 = ctx->nds0 - ctx->nds0r; + ctx->ndfs0 = ctx->ndfs0r = ctx->ndfd0 = 0; + /* Insane reduxList? */ - for (i=0;indr;i++){ + for (i=0;inds0r;i++){ j = ctx->reduxList ? ctx->reduxList[i] : i; - if (j < -ctx->nds || j >= ctx->nds){ + if (j < -ctx->nds0 || j >= ctx->nds0){ return reduxInvCleanupMsg(ctx, GA_INVALID_ERROR, "Insane axis number %d! Should be [%d, %d)!\n", - j, -ctx->nds, ctx->nds); + j, -ctx->nds0, ctx->nds0); } - j = j<0 ? ctx->nds+j : j; - d = ctx->src->dimensions[j]; + j = j<0 ? ctx->nds0+j : j; + d = ctx->s0->dimensions[j]; ctx->zeroRdxAxes += !d; ctx->prodRdxAxes *= d?d:1; } @@ -2987,55 +3339,55 @@ static int reduxInvInferProperties (redux_ctx* ctx){ /** * Insane shape? - * + * * The source tensor is allowed to be empty (its shape may contain 0s). * However, all axes that are of length 0 must be reduction axes. - * + * * The reason for this is that a reduction cannot store any output into an - * empty destination tensor (whose dimensions are the free axes), because + * empty destination tensor (whose axes are the free axes), because * it has 0 space. The operation cannot then fulfill its contract. - * + * * On the other hand, when some or all reduction axes of a tensor are of * length 0, the reduction can be interpreted as initializing the * destination tensor to the identity value of the operation. For lack of a * better idea, the destination argument tensor can then be zeroed. */ - for (i=0;inds;i++){ - d = ctx->src->dimensions[i]; + for (i=0;inds0;i++){ + d = ctx->s0->dimensions[i]; ctx->zeroAllAxes += !d; ctx->prodAllAxes *= d?d:1; } if (ctx->zeroAllAxes != ctx->zeroRdxAxes){ return reduxInvCleanupMsg(ctx, GA_INVALID_ERROR, - "Source tensor has length-0 dimensions that are not reduced!\n"); + "Source tensor has length-0 axes that are not reduced!\n"); } ctx->prodFreeAxes = ctx->prodAllAxes/ctx->prodRdxAxes; /** * Allocate and construct source-tensor axis-description lists. - * + * * While constructing the descriptions of each axis, verify that: - * + * * 1. reduxLen has no duplicates. - * 2. dst and/or dstArg's dimensions match src's dimensions, stripped of + * 2. d0 and/or d1's axes match s0's axes when stripped of * the reduction axes. */ - ctx->xdSrc = calloc(ctx->nds, sizeof(*ctx->xdSrc)); - ctx->xdSrcPtrs = calloc(ctx->nds+1, sizeof(*ctx->xdSrcPtrs)); + ctx->xdSrc = calloc(ctx->nds0, sizeof(*ctx->xdSrc)); + ctx->xdSrcPtrs = calloc(ctx->nds0+1, sizeof(*ctx->xdSrcPtrs)); if (!ctx->xdSrc || !ctx->xdSrcPtrs){ return reduxInvCleanup(ctx, GA_MEMORY_ERROR); } - for (i=0;inds;i++){ + for (i=0;inds0;i++){ axisInit(&ctx->xdSrc[i], - ctx->src->dimensions[i], - ctx->src->strides[i]); + ctx->s0->dimensions[i], + ctx->s0->strides[i]); } - for (i=0;indr;i++){ + for (i=0;inds0r;i++){ j = ctx->reduxList ? ctx->reduxList[i] : i; - j = j<0 ? ctx->nds+j : j; + j = j<0 ? 
ctx->nds0+j : j; a = reduxInvGetSrcAxis(ctx, j); if (axisIsReduced(a)){ return reduxInvCleanupMsg(ctx, GA_INVALID_ERROR, @@ -3045,55 +3397,57 @@ static int reduxInvInferProperties (redux_ctx* ctx){ } axisMarkReduced(a, i); } - for (i=j=0;inds;i++){ - axis_desc* a = reduxInvGetSrcAxis(ctx, i); - size_t srcLen = axisGetLen(a), dstLen, dstArgLen; - + for (i=j=0;inds0;i++){ + axis_desc* a = reduxInvGetSrcAxis(ctx, i); + size_t s0Len = axisGetLen(a), d0Len, d1Len; + if (axisIsReduced(a)){continue;} - if (reduxInvRequiresDst(ctx)){ - dstLen = ctx->dst->dimensions[j]; - - if(srcLen != dstLen){ + if (reduxInvRequiresD0(ctx)){ + d0Len = ctx->d0->dimensions[j]; + + if (s0Len != d0Len){ return reduxInvCleanupMsg(ctx, GA_INVALID_ERROR, - "Source axis %d has length %zu, but " - "corresponding destination axis %d has length %zu!\n", - i, srcLen, j, dstLen); + "s0 axis %d has length %zu, but " + "corresponding d0 axis %d has length %zu!\n", + i, s0Len, j, d0Len); } - - a->dstStride = ctx->dst->strides[j]; + + a->d0S = ctx->d0->strides[j]; } - if (reduxInvRequiresDstArg(ctx)){ - dstArgLen = ctx->dstArg->dimensions[j]; - - if(srcLen != dstArgLen){ + if (reduxInvRequiresD1(ctx)){ + d1Len = ctx->d1->dimensions[j]; + + if (s0Len != d1Len){ return reduxInvCleanupMsg(ctx, GA_INVALID_ERROR, - "Source axis %d has length %zu, but " - "corresponding destination-argument axis %d has length %zu!\n", - i, srcLen, j, dstArgLen); + "s0 axis %d has length %zu, but " + "corresponding d1 axis %d has length %zu!\n", + i, s0Len, j, d1Len); } - - a->dstArgStride = ctx->dstArg->strides[j]; + + a->d1S = ctx->d1->strides[j]; } - + j++; } - - + + /** * Grab gpudata buffers and byte offsets before we begin flattening the * tensors. As we flatten the tensor, we may reverse some axes, leading to * a bump of the byte offset. */ - - ctx->flatSrcData = ctx->src->data; - ctx->flatSrcOffset = ctx->src->offset; - if(reduxInvRequiresDst(ctx)){ - ctx->flatDstData = ctx->dst->data; - ctx->flatDstOffset = ctx->dst->offset; + + if (reduxInvRequiresS0(ctx)){ + ctx->S0Data = ctx->s0->data; + ctx->S0Off = ctx->s0->offset; + } + if (reduxInvRequiresD0(ctx)){ + ctx->D0Data = ctx->d0->data; + ctx->D0Off = ctx->d0->offset; } - if(reduxInvRequiresDstArg(ctx)){ - ctx->flatDstArgData = ctx->dstArg->data; - ctx->flatDstArgOffset = ctx->dstArg->offset; + if (reduxInvRequiresD1(ctx)){ + ctx->D1Data = ctx->d1->data; + ctx->D1Off = ctx->d1->offset; } return reduxInvFlattenSource(ctx); @@ -3101,7 +3455,7 @@ static int reduxInvInferProperties (redux_ctx* ctx){ /** * @brief Flatten the source tensor as much as is practical. - * + * * This makes the axis lengths as long as possible and the tensor itself as * contiguous as possible. */ @@ -3110,158 +3464,158 @@ static int reduxInvFlattenSource (redux_ctx* ctx){ axis_desc* axis, *flatAxis, *sortAxis; int i, j, k, isSensitive; - ctx->ndfs = ctx->nds; + ctx->ndfs0 = ctx->nds0; /** - * Pass 1: Flatten out 0- and 1-length dimensions. We already know that - * - * a) There are no 0-length free dimensions, because that + * Pass 1: Flatten out 0- and 1-length axes. We already know that + * + * a) There are no 0-length free axes, because that * constitutes an invalid input, and - * b) How many 0-length reduction dimensions there are, because + * b) How many 0-length reduction axes there are, because * we counted them in the error-checking code. - * + * * So if there are any 0-length axes, we can delete all reduction axes and * replace them with a single one. 
- * + * * We can also delete 1-length axes outright, since they can always be * ignored; They are always indexed at [0]. */ - for (i=j=0;indfs;i++){ + for (i=j=0;indfs0;i++){ axis = reduxInvGetSrcAxis(ctx, i); if (!reduxTryFlattenOut(ctx, axis)){ *reduxInvGetSrcAxis(ctx, j++) = *axis; } } - if(ctx->zeroRdxAxes > 0){ + if (ctx->zeroRdxAxes > 0){ /* New reduction axis of 0 length. */ axisInit (reduxInvGetSrcAxis(ctx, j), 0, 0); axisMarkReduced(reduxInvGetSrcAxis(ctx, j), 0); j++; } - ctx->ndfs = j; + ctx->ndfs0 = j; /** - * Pass 2: Flatten out continuous dimensions, where strides and sensitivity + * Pass 2: Flatten out continuous axes, where strides and sensitivity * allows it. */ - - k = ctx->ndfs; + + k = ctx->ndfs0; isSensitive = reduxIsSensitive(ctx->op); - qsort(ctx->xdSrc, ctx->ndfs, sizeof(*ctx->xdSrc), + qsort(ctx->xdSrc, ctx->ndfs0, sizeof(*ctx->xdSrc), isSensitive ? reduxSortFlatSensitive : reduxSortFlatInsensitive); - for (i=j=1;indfs;i++){ + for (i=j=1;indfs0;i++){ flatAxis = reduxInvGetSrcAxis(ctx, j-1); sortAxis = reduxInvGetSrcAxis(ctx, i); - + if (reduxTryFlattenInto(ctx, flatAxis, sortAxis)){ k--; }else{ *reduxInvGetSrcAxis(ctx, j++) = *sortAxis; } } - ctx->ndfs = k; + ctx->ndfs0 = k; /** - * Compute number of free and reduced dimensions. + * Compute number of flattened free and reduced axes. */ - for(ctx->ndfr=ctx->ndfd=i=0;indfs;i++){ - if(axisIsReduced(reduxInvGetSrcAxis(ctx, i))){ - ctx->ndfr++; + for (ctx->ndfs0r=ctx->ndfd0=i=0;indfs0;i++){ + if (axisIsReduced(reduxInvGetSrcAxis(ctx, i))){ + ctx->ndfs0r++; }else{ - ctx->ndfd++; + ctx->ndfd0++; } } - return reduxInvComputeKArgs(ctx); + return reduxInvComputeKernelArgs(ctx); } /** * @brief Compute the arguments to the kernel. - * + * * This is a multistep process and involves a lot of axis sorting on various * criteria. */ -static int reduxInvComputeKArgs (redux_ctx* ctx){ +static int reduxInvComputeKernelArgs (redux_ctx* ctx){ axis_desc* axis, *prevAxis; - size_t target, aL, aLS; - int i, j, k, haveSplitFreeAxis, haveSplitReducedAxis; + size_t target, aL, aLS, perm, i0S; + int i, j, haveSplitFreeAxis, haveSplitReducedAxis; /** * STEP 0: Default Kernel Argument Values. - * + * * They should be valid for a "scalar" job. In particular, for any * non-existent axis, assume length 1. 
*/ - - ctx->phase = 0; + + ctx->selector = 0; ctx->U = 1; ctx->V = 1; ctx->B = 1; ctx->D = 1; ctx->H = 1; - ctx->splitFree = 1; - ctx->splitReduce = 1; - ctx->xdSplit = NULL; - ctx->l = calloc(ctx->gr->nds, sizeof(*ctx->l)); - ctx->lPDim = calloc(ctx->gr->ndr, sizeof(*ctx->lPDim)); - ctx->sJ = calloc(ctx->gr->nds, sizeof(*ctx->sJ)); - ctx->dJ = calloc(ctx->gr->ndd, sizeof(*ctx->dJ)); - ctx->aJ = calloc(ctx->gr->ndd, sizeof(*ctx->aJ)); - ctx->wdOff = 0; - ctx->pdOff = 0; - ctx->waOff = 0; - ctx->paOff = 0; - ctx->ibs = calloc(ctx->gr->log2MaxL, sizeof(*ctx->ibs)); - ctx->ibp = calloc(ctx->gr->log2MaxL, sizeof(*ctx->ibp)); - ctx->iblPDim = calloc(ctx->gr->log2MaxL, sizeof(*ctx->iblPDim)); - ctx->ibsOff = calloc(ctx->gr->log2MaxL, sizeof(*ctx->ibsOff)); - ctx->ibdOff = calloc(ctx->gr->log2MaxL, sizeof(*ctx->ibdOff)); - ctx->ibaOff = calloc(ctx->gr->log2MaxL, sizeof(*ctx->ibaOff)); + ctx->LSlice = 1; + ctx->LPadded = 1; + ctx->L = calloc(ctx->gr->nds, sizeof(*ctx->L)); + ctx->Li = calloc(ctx->gr->log2MaxBS, sizeof(*ctx->Li)); + ctx->S0J = calloc(ctx->gr->nds, sizeof(*ctx->S0J)); + ctx->S0Si = calloc(ctx->gr->log2MaxBS, sizeof(*ctx->S0Si)); + ctx->D0J = calloc(ctx->gr->ndd, sizeof(*ctx->D0J)); + ctx->D0Si = calloc(ctx->gr->log2MaxBS, sizeof(*ctx->D0Si)); + ctx->D1J = calloc(ctx->gr->ndd, sizeof(*ctx->D1J)); + ctx->D1Si = calloc(ctx->gr->log2MaxBS, sizeof(*ctx->D1Si)); + ctx->I0J = calloc(ctx->gr->nds, sizeof(*ctx->I0J)); + ctx->I0Si = calloc(ctx->gr->log2MaxBS, sizeof(*ctx->I0Si)); + ctx->W0Off = 0; + ctx->SHMEMK0Off = 0; + ctx->W1Off = 0; + ctx->SHMEMK1Off = 0; + ctx->perm = calloc(ctx->gr->log2MaxBS, sizeof(*ctx->perm)); ctx->bs = 1; ctx->gs = 1; ctx->kArgs = calloc(ctx->gr->kNumArgs, sizeof(*ctx->kArgs)); - - if(!ctx->l || !ctx->lPDim || !ctx->sJ || !ctx->dJ || - !ctx->aJ || !ctx->ibs || !ctx->ibp || !ctx->iblPDim || - !ctx->ibsOff || !ctx->ibdOff || !ctx->ibaOff || !ctx->kArgs){ + + if (!ctx->L || !ctx->Li || !ctx->S0J || !ctx->S0Si || + !ctx->D0J || !ctx->D0Si || !ctx->D1J || !ctx->D1Si || + !ctx->I0J || !ctx->I0Si || !ctx->perm || !ctx->kArgs){ return reduxInvCleanupMsg(ctx, GA_MEMORY_ERROR, "Failed to allocate memory for kernel invocation arguments!\n"); } - for(i=0;igr->nds;i++){ - ctx->l[i] = 1; + + for (i=0;igr->nds;i++){ + ctx->L[i] = 1; } - for(i=0;igr->log2MaxL;i++){ - ctx->ibs[i] = 1; + for (i=0;igr->log2MaxBS;i++){ + ctx->Li[i] = 1; } /** * STEP 1: Select Intra-Block Axes. - * + * * Sort the axes in the order likely to maximize contiguity of source * memory accesses, then tag them to the kernel block size limit, possibly * splitting an axis in the process. */ - - reduxSortAxisPtrsBy(ctx->xdSrcPtrs, ctx->xdSrc, ctx->ndfs, - reduxSortPtrIBSrcRdSelect); + + reduxSortAxisPtrsBy(ctx->xdSrcPtrs, ctx->xdSrc, ctx->ndfs0, + reduxSortPtrS0AbsStride); target = reduxGenGetMaxLocalSize(ctx->gr); - - for(i=0;indfs && igr->log2MaxL;i++){ + + for (i=0;indfs0 && igr->log2MaxBS;i++){ axis = reduxInvGetSrcSortAxis(ctx, i); aL = axisGetLen(axis); - - if(ctx->bs*aL <= target){ + + if (ctx->bs*aL <= target){ ctx->bs *= aL; axisMarkIntraBlock(axis, i, aL); }else{ - if(target/ctx->bs >= 2){ + if (target/ctx->bs >= 2){ aLS = target/ctx->bs; ctx->bs *= aLS; axisMarkIntraBlock(axis, i, aLS); @@ -3271,344 +3625,292 @@ static int reduxInvComputeKArgs (redux_ctx* ctx){ break; } } - ctx->ndib = i; + ctx->ndib = i; + ctx->LSlice = ctx->xdSplit ? axisGetIntraLen(ctx->xdSplit) : 1; /** - * STEP 2: Compute values dependent only on the intrablock axis selection. 
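	 *
	 * A hypothetical illustration of the greedy selection above (numbers
	 * are illustrative only, not from any test): with target = 256 and
	 * flattened axes visited in the order produced by the stride sort with
	 * lengths {32, 16, 4}, the 32-length axis is taken whole (bs = 32);
	 * the 16-length axis does not fit (32*16 > 256), but target/bs = 8 >= 2,
	 * so only an intra-block slice of length 8 of it is taken (bs = 256,
	 * LSlice = 8) and that axis becomes the split axis; the 4-length axis
	 * no longer fits at all and stays purely inter-block.
	 *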
- * - * For instance, the splitFree/splitReduce factors depend only on the split - * axis, if any. - * - * The shared memory consumption and shared memory offsets depend only - * on block size. + * STEP 2: Compute U, B, D, Dunit, H */ - ctx->splitFree = reduxInvGetSplitFree (ctx); - ctx->splitReduce = reduxInvGetSplitReduce (ctx); - ctx->SHMEM = reduxGenGetSHMEMSize (ctx->gr, ctx->bs); - ctx->pdOff = reduxGenGetSHMEMDstOff (ctx->gr, ctx->bs); - ctx->paOff = reduxGenGetSHMEMDstArgOff(ctx->gr, ctx->bs); - - - /** - * STEP 3: Compute U, B, D, H - */ - - for (i=0;indfs;i++){ + for (i=0;indfs0;i++){ axis = reduxInvGetSrcAxis(ctx, i); ctx->U *= axisGetInterLen(axis); ctx->B *= axisIsReduced(axis) ? axisGetInterLen(axis) : 1; - ctx->H *= axisIsReduced(axis) ? axisGetIntraLen(axis) : 1; + ctx->D *=!axisIsReduced(axis) ? axisGetIntraLen(axis) : 1; } - ctx->D = ctx->bs/ctx->H; - - + ctx->H = ctx->Dbs ? reduxNextPow2(ctx->bs) : ctx->bs; + ctx->Dunit = ctx->D/ctx->LSlice; + + /** - * STEP 4: Compute PDim values. - * + * STEP 3: Compute shared memory parameters. + */ + + ctx->shmemBytes = reduxGenGetSHMEMSize (ctx->gr, ctx->H); + ctx->SHMEMK0Off = reduxGenGetSHMEMK0Off(ctx->gr, ctx->H); + ctx->SHMEMK1Off = reduxGenGetSHMEMK1Off(ctx->gr, ctx->H); + + + /** + * STEP 4: Compute I0 stride values. + * * This will be used for index calculation. */ - - reduxSortAxisPtrsBy(ctx->xdSrcPtrs, ctx->xdSrc, ctx->ndfs, + + reduxSortAxisPtrsBy(ctx->xdSrcPtrs, ctx->xdSrc, ctx->ndfs0, reduxSortPtrByReduxNum); - for (i=0;indfs;i++){ + for (i=0,i0S=1;indfs0;i++){ axis = reduxInvGetSrcSortAxis(ctx, i); - - if(axisIsReduced(axis)){ - if(i==0){ - axisSetPDim(axis, 1); - }else{ - prevAxis = reduxInvGetSrcSortAxis(ctx, i-1); - axisSetPDim(axis, axisGetPDim(prevAxis)*axisGetLen(prevAxis)); - } + + if (axisIsReduced(axis)){ + axisSetI0Stride(axis, i0S); + i0S *= axisGetLen(axis); } } - - + + /** * STEP 5: Compute Intra-Block Permute Core. - * + * * Sort the axes in the order most likely to maximize contiguity of * destination/destination argument memory accesses, then compute the * permutation that achieves the highest-bandwidth, * post-horizontal-reduction destination writes. */ - - reduxSortAxisPtrsBy(ctx->xdSrcPtrs, ctx->xdSrc, ctx->ndfs, - reduxInvRequiresDst(ctx) ? - reduxSortPtrIBDstWrSelect : - reduxSortPtrIBDstArgWrSelect); - for(i=0;indfs;i++){ + + reduxSortAxisPtrsBy(ctx->xdSrcPtrs, ctx->xdSrc, ctx->ndfs0, + reduxInvRequiresD0(ctx)? + reduxSortPtrD0WrSelect : + reduxSortPtrD1WrSelect); + for (i=0,perm=1;indfs0;i++){ axis = reduxInvGetSrcSortAxis(ctx, i); - - if(axisIsIntra(axis)){ - if(i==0){ - axisSetIBP(axis, 1); - }else{ + + if (axisIsIntra(axis)){ + if (i>0 && axisIsReduced(axis)){ prevAxis = reduxInvGetSrcSortAxis(ctx, i-1); - axisSetIBP(axis, axisGetIBP(prevAxis)*axisGetIntraLen(prevAxis)); + if (!axisIsReduced(prevAxis)){ + /** + * The permute stride of the lowest-absolute-stride + * reduced axis must be a power of two to make horizontal + * reduction easier. + */ + + perm = reduxNextPow2(perm); + } } + axisSetPerm(axis, perm); + perm *= axisGetIntraLen(axis); } } - + + /** * STEP 6. Place the intra axis arguments - * - * ibs, ibp, iblPDim, ibsOff, ibdOff, ibaOff - * + * + * LN, perm, S0SNi, D0SNi, D1SNi, I0SNi + * * For this we need the axes in final order of insertion. 
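	 *
	 * The power-of-two rounding applied to H and to the permute stride of
	 * the lowest-stride reduced axis above exists so that the horizontal
	 * (intra-block) reduction can proceed as a simple halving tree.  A
	 * hedged serial model of that tree follows (hypothetical helper name,
	 * not the code the generator emits; max is used as the example op):
	 *
	 *     // Serial model of a power-of-two halving tree over H
	 *     // shared-memory slots; slots past the real data are assumed
	 *     // to hold the operation's identity value.
	 *     static void treeReduceModel(float* k0, size_t H){
	 *         size_t h, t;
	 *         for (h = H/2; h > 0; h >>= 1){   // one pass == one barrier
	 *             for (t = 0; t < h; t++){     // t stands in for a thread
	 *                 k0[t] = k0[t] > k0[t+h] ? k0[t] : k0[t+h];
	 *             }
	 *         }
	 *         // k0[0] now holds the reduction of all H inputs.
	 *     }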
*/ - - reduxSortAxisPtrsBy(ctx->xdSrcPtrs, ctx->xdSrc, ctx->ndfs, + + reduxSortAxisPtrsBy(ctx->xdSrcPtrs, ctx->xdSrc, ctx->ndfs0, reduxSortPtrInsertFinalOrder); - for(i=0;indib;i++){ + for (i=0;indib;i++){ axis = reduxInvGetSrcSortAxis(ctx, i); - - ctx->ibs [i] = axisGetIntraLen (axis); - ctx->ibp [i] = axisGetIBP (axis); - ctx->iblPDim[i] = axisGetPDim (axis); - ctx->ibsOff [i] = axisGetSrcStride (axis); - ctx->ibdOff [i] = axisGetDstStride (axis); - ctx->ibaOff [i] = axisGetDstArgStride(axis); + + ctx->Li [i] = axisGetIntraLen(axis); + ctx->perm[i] = axisGetPerm (axis); + ctx->S0Si[i] = axisGetS0Stride(axis); + ctx->D0Si[i] = axisGetD0Stride(axis); + ctx->D1Si[i] = axisGetD1Stride(axis); + ctx->I0Si[i] = axisGetI0Stride(axis); } - + + /** * STEP 7. Place the inter axis arguments - * - * lN, lNPDim, sJN, dJN, aJN - * + * + * LN, S0JN, D0JN, D1JN, I0JN + * * , where N in [0, ctx->gr->ndd) are free axes, * N in [ctx->gr->ndd, ctx->gr->nds) are reduced axes, * and ctx->xdSrcPtr[...] are sorted in the reverse of that order for - * insertion, and excludes any split axis. - * + * insertion, and excludes any intra axis (including the split one). + * * How precisely the insertion is done depends closely on whether there is * a split axis and if so whether it is free or reduced. - * + * * - If there is a split axis and it is free, then it should be inserted as * the first free axis. Its jumps should be - * sJN = -sSM*intrainterLenM + sSN*splitFree - * dJN = -dSM*intrainterLenM + dSN*splitFree - * aJN = -aSM*intrainterLenM + aSN*splitFree + * S0JN = -S0SM*intrainterLenM + S0SN*splitFree + * D0JN = -D0SM*intrainterLenM + D0SN*splitFree + * D1JN = -D1SM*intrainterLenM + D1SN*splitFree + * I0JN = -I0SM*intrainterLenM + I0SN*splitFree * - If there is a split axis and it is reduced, then it should be inserted * as the first reduced axis. Its jump should be - * sJN = -sSM*intrainterLenM + sSN*splitReduced + * S0JN = -S0SM*intrainterLenM + S0SN*splitReduced + * I0JN = -I0SM*intrainterLenM + I0SN*splitReduced * - If there is no split axis, proceed normally in filling the axes. */ - + haveSplitFreeAxis = ctx->xdSplit && !axisIsReduced(ctx->xdSplit); haveSplitReducedAxis = ctx->xdSplit && axisIsReduced(ctx->xdSplit); - + j = ctx->gr->nds-1; + /* If we have a reduced split axis, insert it before any other reduced axis. */ - j = ctx->gr->nds-1; - k = ctx->gr->ndr-1; - if(haveSplitReducedAxis && k>=0){ - ctx->l [j] = axisGetLen (ctx->xdSplit); - ctx->lPDim [k] = axisGetPDim (ctx->xdSplit); - ctx->sJ [j] += (ssize_t)axisGetSrcStride (ctx->xdSplit)* - (ssize_t)axisGetIntraLen (ctx->xdSplit); - if(j>0){ - ctx->sJ [j-1] -= (ssize_t)axisGetSrcStride (ctx->xdSplit)* - (ssize_t)axisGetIntraInterLen(ctx->xdSplit); + if (haveSplitReducedAxis && j>=ctx->gr->ndd){ + ctx->L [j] = axisGetLen (ctx->xdSplit); + ctx->S0J[j] += (ssize_t)axisGetS0Stride(ctx->xdSplit)* + (ssize_t)axisGetIntraLen(ctx->xdSplit); + ctx->I0J[j] += (ssize_t)axisGetI0Stride(ctx->xdSplit)* + (ssize_t)axisGetIntraLen(ctx->xdSplit); + if (j>0){ + ctx->S0J[j-1] -= (ssize_t)axisGetS0Stride (ctx->xdSplit)* + (ssize_t)axisGetIntraInterLen(ctx->xdSplit); + ctx->I0J[j-1] -= (ssize_t)axisGetI0Stride (ctx->xdSplit)* + (ssize_t)axisGetIntraInterLen(ctx->xdSplit); } j--; - k--; } - + /* Insert rest of reduced axes. 
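	 *
	 * A hedged serial model of how jump tables built this way are consumed
	 * (hypothetical helper, not the generated kernel): because each
	 * S0J[n-1] already folds in the rewind of axis n, stepping any index
	 * costs exactly one pointer bump.  trip[n] stands for the inter-block
	 * step count of axis n; the real kernel also bounds the walk by V and
	 * by the phase, which this model ignores.
	 *
	 *     // p starts at the first element; i[] are the odometer digits.
	 *     static size_t walkModel(const char* p, const size_t* trip,
	 *                             const ssize_t* S0J, int nds){
	 *         size_t i[8] = {0}, visited = 0;   // assumes nds <= 8 here
	 *         int    n;
	 *         for (;;){
	 *             visited++;                    // "visit" the element at p
	 *             for (n = nds-1; n >= 0; n--){
	 *                 p += S0J[n];              // advance axis n one step
	 *                 if (++i[n] < trip[n]){break;}
	 *                 i[n] = 0;                 // wrapped; the S0J[n-1] added
	 *             }                             // next contains axis n's rewind
	 *             if (n < 0){break;}            // every axis wrapped: done
	 *         }
	 *         return visited;
	 *     }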
*/ - for(;indfs && k>=0;i++,j--,k--){ + for (;indfs0 && j>=ctx->gr->ndd;i++,j--){ axis = reduxInvGetSrcSortAxis(ctx, i); - if(!axisIsReduced(axis)){ + if (!axisIsReduced(axis)){ break; } - - ctx->l [j] = axisGetLen (axis); - ctx->lPDim [k] = axisGetPDim (axis); - ctx->sJ [j] += (ssize_t)axisGetSrcStride (axis)* - (ssize_t)axisGetIntraLen (axis); - if(j>0){ - ctx->sJ [j-1] -= (ssize_t)axisGetSrcStride (axis)* - (ssize_t)axisGetIntraInterLen(axis); + + ctx->L [j] = axisGetLen (axis); + ctx->S0J[j] += (ssize_t)axisGetS0Stride(axis)* + (ssize_t)axisGetIntraLen(axis); + ctx->I0J[j] += (ssize_t)axisGetI0Stride(axis)* + (ssize_t)axisGetIntraLen(axis); + if (j>0){ + ctx->S0J[j-1] -= (ssize_t)axisGetS0Stride (axis)* + (ssize_t)axisGetIntraInterLen(axis); + ctx->I0J[j-1] -= (ssize_t)axisGetI0Stride (axis)* + (ssize_t)axisGetIntraInterLen(axis); } } - + /* If we have a free split axis, insert it before any other free axis. */ - k = ctx->gr->ndd-1; - if(haveSplitFreeAxis && k>=0){ - ctx->l [k] = axisGetLen (ctx->xdSplit); - ctx->sJ [k] += (ssize_t)axisGetSrcStride (ctx->xdSplit)* - (ssize_t)axisGetIntraLen (ctx->xdSplit); - ctx->dJ [k] += (ssize_t)axisGetDstStride (ctx->xdSplit)* - (ssize_t)axisGetIntraLen (ctx->xdSplit); - ctx->aJ [k] += (ssize_t)axisGetDstArgStride (ctx->xdSplit)* - (ssize_t)axisGetIntraLen (ctx->xdSplit); - if(k>0){ - ctx->sJ [k-1] -= (ssize_t)axisGetSrcStride (ctx->xdSplit)* - (ssize_t)axisGetIntraInterLen(ctx->xdSplit); - ctx->dJ [k-1] -= (ssize_t)axisGetDstStride (ctx->xdSplit)* - (ssize_t)axisGetIntraInterLen(ctx->xdSplit); - ctx->aJ [k-1] -= (ssize_t)axisGetDstArgStride (ctx->xdSplit)* - (ssize_t)axisGetIntraInterLen(ctx->xdSplit); + j = ctx->gr->ndd-1; + if (haveSplitFreeAxis && j>=0){ + ctx->L [j] = axisGetLen (ctx->xdSplit); + ctx->S0J[j] += (ssize_t)axisGetS0Stride(ctx->xdSplit)* + (ssize_t)axisGetIntraLen(ctx->xdSplit); + ctx->D0J[j] += (ssize_t)axisGetD0Stride(ctx->xdSplit)* + (ssize_t)axisGetIntraLen(ctx->xdSplit); + ctx->D1J[j] += (ssize_t)axisGetD1Stride(ctx->xdSplit)* + (ssize_t)axisGetIntraLen(ctx->xdSplit); + ctx->I0J[j] += (ssize_t)axisGetI0Stride(ctx->xdSplit)* + (ssize_t)axisGetIntraLen(ctx->xdSplit); + if (j>0){ + ctx->S0J[j-1] -= (ssize_t)axisGetS0Stride (ctx->xdSplit)* + (ssize_t)axisGetIntraInterLen(ctx->xdSplit); + ctx->D0J[j-1] -= (ssize_t)axisGetD0Stride (ctx->xdSplit)* + (ssize_t)axisGetIntraInterLen(ctx->xdSplit); + ctx->D1J[j-1] -= (ssize_t)axisGetD1Stride (ctx->xdSplit)* + (ssize_t)axisGetIntraInterLen(ctx->xdSplit); + ctx->I0J[j-1] -= (ssize_t)axisGetI0Stride (ctx->xdSplit)* + (ssize_t)axisGetIntraInterLen(ctx->xdSplit); } - k--; + j--; } - + /* Insert rest of free axes. 
*/ - for(;indfs && k>=0;i++,k--){ + for (;indfs0 && j>=0;i++,j--){ axis = reduxInvGetSrcSortAxis(ctx, i); - if(axisIsReduced(axis)){ + if (axisIsReduced(axis)){ break; } - - ctx->l [k] = axisGetLen (axis); - ctx->sJ [k] += (ssize_t)axisGetSrcStride (axis)* - (ssize_t)axisGetIntraLen (axis); - ctx->dJ [k] += (ssize_t)axisGetDstStride (axis)* - (ssize_t)axisGetIntraLen (axis); - ctx->aJ [k] += (ssize_t)axisGetDstArgStride (axis)* - (ssize_t)axisGetIntraLen (axis); - if(k>0){ - ctx->sJ [k-1] -= (ssize_t)axisGetSrcStride (axis)* - (ssize_t)axisGetIntraInterLen(axis); - ctx->dJ [k-1] -= (ssize_t)axisGetDstStride (axis)* - (ssize_t)axisGetIntraInterLen(axis); - ctx->aJ [k-1] -= (ssize_t)axisGetDstArgStride (axis)* - (ssize_t)axisGetIntraInterLen(axis); + + ctx->L [j] = axisGetLen (axis); + ctx->S0J[j] += (ssize_t)axisGetS0Stride(axis)* + (ssize_t)axisGetIntraLen(axis); + ctx->D0J[j] += (ssize_t)axisGetD0Stride(axis)* + (ssize_t)axisGetIntraLen(axis); + ctx->D1J[j] += (ssize_t)axisGetD1Stride(axis)* + (ssize_t)axisGetIntraLen(axis); + ctx->I0J[j] += (ssize_t)axisGetI0Stride(axis)* + (ssize_t)axisGetIntraLen(axis); + if (j>0){ + ctx->S0J[j-1] -= (ssize_t)axisGetS0Stride (axis)* + (ssize_t)axisGetIntraInterLen(axis); + ctx->D0J[j-1] -= (ssize_t)axisGetD0Stride (axis)* + (ssize_t)axisGetIntraInterLen(axis); + ctx->D1J[j-1] -= (ssize_t)axisGetD1Stride (axis)* + (ssize_t)axisGetIntraInterLen(axis); + ctx->I0J[j-1] -= (ssize_t)axisGetI0Stride (axis)* + (ssize_t)axisGetIntraInterLen(axis); } } - return reduxInvSchedule(ctx); -} - -#if 0 -static void reduxScheduleKernel (int ndims, - uint64_t* dims, - uint64_t warpSize, - uint64_t maxLg, - uint64_t* maxLs, - uint64_t maxGg, - uint64_t* maxGs, - uint64_t* bs, - uint64_t* gs, - uint64_t* cs); - -/** - * @brief Given the parameters of a kernel scheduling problem, solve it as - * optimally as possible. - * - * NB: This is the only function in this entire file that should have - * anything to do with the integer factorization APIs. - */ - -static void reduxScheduleKernel (int ndims, - uint64_t* dims, - uint64_t warpSize, - uint64_t maxLg, - uint64_t* maxLs, - uint64_t maxGg, - uint64_t* maxGs, - uint64_t* bs, - uint64_t* gs, - uint64_t* cs){ - uint64_t warpMod, bestWarpMod = 1; - int i, bestWarpAxis = 0; - uint64_t roundedDims[MAX_HW_DIMS]; - double slack [MAX_HW_DIMS]; - ga_factor_list factBS [MAX_HW_DIMS]; - ga_factor_list factGS [MAX_HW_DIMS]; - ga_factor_list factCS [MAX_HW_DIMS]; - /** - * Quick check for scalar case. + * STEP 8. Compute the template selector. Requires finding the huge axis, + * if any. Then, compute LPadded, which depends on the selector + * value we choose. */ - if (ndims <= 0){ - return; + if (ctx->xdSplit && !axisIsReduced(ctx->xdSplit)){ + ctx->selector |= SELECTOR_SPLIT_FREE; } + for (i=0;indfs0;i++){ + axis = reduxInvGetSrcAxis(ctx, i); - - /** - * Identify the dimension to which the warp factor will be given. - * - * The current heuristic is to find the dimension that is either - * 1) Evenly divided by the warp size, or - * 2) As close to filling the last warp as possible. 
- */ - - for (i=0;i0 && (warpMod==0 || warpMod>=bestWarpMod)){ - bestWarpAxis = i; - bestWarpMod = warpMod; + if (axisGetLen(axis) >= ((uint64_t)1<<31)){ + if (axis == ctx->xdSplit){ + ctx->selector |= SELECTOR_HUGE_IS_SPLIT; + }else if (axisIsReduced(axis) == axisIsReduced(ctx->xdSplit)){ + ctx->selector |= SELECTOR_HUGE_SAME_TYPE; + }else{ + ctx->selector |= SELECTOR_HUGE_OPPOSITE_TYPE; + } } } - - if (ndims > 0){ - roundedDims[bestWarpAxis] = (roundedDims[bestWarpAxis] + warpSize - 1)/warpSize; - gaIFactorize(warpSize, 0, 0, &factBS[bestWarpAxis]); - } - - /** - * Factorization job. We'll steadily increase the slack in case of failure - * in order to ensure we do get a factorization, which we place into - * chunkSize. - */ - - for (i=0;iselector & SELECTOR_SPLIT_FREE){ + if (ctx->gr->ndd>0){ + ctx->LPadded = ctx->L[ctx->gr->ndd-1]; + } + }else{ + if (ctx->gr->nds>0){ + ctx->LPadded = ctx->L[ctx->gr->nds-1]; } } + ctx->LPadded = DIVIDECEIL(ctx->LPadded, ctx->LSlice)*ctx->LSlice; - /** - * Invoke the scheduler. - * - * The scheduler will move some factors from chunkSize into blockSize and - * gridSize, improving performance. - */ - gaIFLSchedule(ndims, maxLg, maxLs, maxGg, maxGs, factBS, factGS, factCS); - for (i=0;igs: The grid size, which is the number of thread blocks. * 2. ctx->V: The number of vertical reductions per thread block. - * + * * Two factors drive the scheduling: - * + * * 1. We want to keep all multiprocessors of the device busy; For this we use * an estimate of the level of parallelism of the device. * 2. If V can be chosen such that V % B == 0, then only a single kernel * phase is necessary. - * + * + * To do this, we first choose gs to be the number of blocks that roughly fills + * the available parallelism given the block size, but reduce it to at most U + * (The universal amount of vertical reductions to be done). + * + * We then select V as the minimum number of vertical reductions per block + * that will cover the universe U. + * + * Lastly, iff there exists a value V <= V' <= 2*V such that V' % B == 0, then + * increase V to the smallest such V' and recompute ctx->gs. + * * Once the scheduling is performed, the workspace can be allocated and * workspace offsets can be computed. */ @@ -3616,30 +3918,34 @@ static void reduxScheduleKernel (int ndims, static int reduxInvSchedule (redux_ctx* ctx){ const int flags = GA_BUFFER_READ_WRITE; size_t WSPACESIZE; - + /** - * Get enough blocks to fill available device parallelism to capacity. - * Then, compute corresponding V. + * Scheduling */ - - ctx->gs = DIVIDECEIL(reduxInvEstimateParallelism(ctx), - reduxGenGetMaxLocalSize(ctx->gr)); - ctx->V = DIVIDECEIL(ctx->U, ctx->gs); - + + ctx->gs = DIVIDECEIL(reduxInvEstimateParallelism(ctx), + reduxGenGetMaxLocalSize(ctx->gr)); + ctx->gs = ctx->gs > ctx->U ? ctx->U : ctx->gs; + ctx->V = DIVIDECEIL(ctx->U, ctx->gs); + if (ctx->V%ctx->B != 0 && ctx->V*2 >= ctx->B){ + ctx->V = DIVIDECEIL(ctx->V, ctx->B)*ctx->B; + } + ctx->gs = DIVIDECEIL(ctx->U, ctx->V); + /** * Allocate required workspace. 
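	 *
	 * A hypothetical worked example of the scheduling above (illustrative
	 * numbers, not from any device): suppose the parallelism estimate is
	 * 20480 and the maximum local size is 1024, so gs = 20; with U = 1000
	 * this stays at 20 and V = DIVIDECEIL(1000, 20) = 50.  If B = 8, then
	 * 50 % 8 != 0 but 2*50 >= 8, so V is rounded up to 56 and gs becomes
	 * DIVIDECEIL(1000, 56) = 18; since V % B == 0 afterwards,
	 * reduxInvoke() below launches phase 0 only.  Had B exceeded 2*V, V
	 * would have been left as-is and a phase-1 launch would later fold the
	 * partial results left in the workspace, which in either case is sized
	 * below using 2*ctx->gs*ctx->D.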
*/ - - ctx->wdOff = reduxGenGetWMEMDstOff (ctx->gr, 2*ctx->gs*ctx->D); - ctx->waOff = reduxGenGetWMEMDstArgOff(ctx->gr, 2*ctx->gs*ctx->D); - WSPACESIZE = reduxGenGetWMEMSize (ctx->gr, 2*ctx->gs*ctx->D); - ctx->w = gpudata_alloc(ctx->gr->gpuCtx, WSPACESIZE, 0, flags, 0); - if(!ctx->w){ + + ctx->W0Off = reduxGenGetWMEMK0Off(ctx->gr, 2*ctx->gs*ctx->D); + ctx->W1Off = reduxGenGetWMEMK1Off(ctx->gr, 2*ctx->gs*ctx->D); + WSPACESIZE = reduxGenGetWMEMSize (ctx->gr, 2*ctx->gs*ctx->D); + ctx->W = gpudata_alloc(ctx->gr->gpuCtx, WSPACESIZE, 0, flags, 0); + if (!ctx->W){ return reduxInvCleanupMsg(ctx, GA_MEMORY_ERROR, "Could not allocate %zu-byte workspace for reduction!\n", WSPACESIZE); } - + return reduxInvoke(ctx); } @@ -3650,37 +3956,36 @@ static int reduxInvSchedule (redux_ctx* ctx){ static int reduxInvoke (redux_ctx* ctx){ int ret, i=0; void* ptrs[2] = {ctx, &i}; - + /** * Argument Marshalling. */ - + reduxGenIterArgs(ctx->gr, reduxInvMarshalArg, ptrs); /** * The kernel is now invoked once or twice, for phase 0 or 1. - * - * Phase 1 is sometimes optional. + * + * Phase 1 is optional iff V%B == 0. */ - ctx->phase = 0; - ret = GpuKernel_call(&ctx->gr->k, 1, &ctx->gs, &ctx->bs, ctx->SHMEM, ctx->kArgs); + ret = GpuKernel_call((GpuKernel*)&ctx->gr->k, 1, &ctx->gs, &ctx->bs, ctx->shmemBytes, ctx->kArgs); if (ret != GA_NO_ERROR){ return reduxInvCleanupMsg(ctx, ret, "Failure in kernel call, Phase 0!\n"); } - - if(ctx->V%ctx->B != 0){ - ctx->phase = 1; - ret = GpuKernel_call(&ctx->gr->k, 1, &ctx->gs, &ctx->bs, ctx->SHMEM, ctx->kArgs); + + if (ctx->V % ctx->B != 0){ + ctx->selector |= SELECTOR_PHASE1; + ret = GpuKernel_call((GpuKernel*)&ctx->gr->k, 1, &ctx->gs, &ctx->bs, ctx->shmemBytes, ctx->kArgs); if (ret != GA_NO_ERROR){ return reduxInvCleanupMsg(ctx, ret, "Failure in kernel call, Phase 1!\n"); } } - + /* Success! 
*/ return reduxInvCleanup(ctx, GA_NO_ERROR); } @@ -3690,41 +3995,43 @@ static int reduxInvoke (redux_ctx* ctx){ */ static int reduxInvCleanup (redux_ctx* ctx, int ret){ - free(ctx->l); - free(ctx->lPDim); - free(ctx->sJ); - free(ctx->dJ); - free(ctx->aJ); - free(ctx->ibs); - free(ctx->ibp); - free(ctx->iblPDim); - free(ctx->ibsOff); - free(ctx->ibdOff); - free(ctx->ibaOff); - free(ctx->kArgs); + ctx->gr = NULL; + ctx->s0 = NULL; + ctx->d0 = NULL; + ctx->d1 = NULL; + ctx->reduxList = NULL; + free(ctx->xdSrc); free(ctx->xdSrcPtrs); - free(ctx->xdTmpPtrs); - - gpudata_release(ctx->w); - - ctx->l = NULL; - ctx->lPDim = NULL; - ctx->sJ = NULL; - ctx->dJ = NULL; - ctx->aJ = NULL; - ctx->ibs = NULL; - ctx->ibp = NULL; - ctx->iblPDim = NULL; - ctx->ibsOff = NULL; - ctx->ibdOff = NULL; - ctx->ibaOff = NULL; - ctx->kArgs = NULL; + free(ctx->L); + free(ctx->Li); + free(ctx->S0J); + free(ctx->S0Si); + free(ctx->D0J); + free(ctx->D0Si); + free(ctx->D1J); + free(ctx->D1Si); + free(ctx->I0J); + free(ctx->I0Si); + free(ctx->perm); + free(ctx->kArgs); + gpudata_release(ctx->W); + ctx->xdSrc = NULL; ctx->xdSrcPtrs = NULL; - ctx->xdTmpPtrs = NULL; - - ctx->w = NULL; + ctx->L = NULL; + ctx->Li = NULL; + ctx->S0J = NULL; + ctx->S0Si = NULL; + ctx->D0J = NULL; + ctx->D0Si = NULL; + ctx->D1J = NULL; + ctx->D1Si = NULL; + ctx->I0J = NULL; + ctx->I0Si = NULL; + ctx->perm = NULL; + ctx->kArgs = NULL; + ctx->W = NULL; return ret; } @@ -3732,7 +4039,7 @@ static int reduxInvCleanupMsg (redux_ctx* ctx, int ret, const char* fmt, ...){ #if DEBUG FILE* fp = stderr; - + va_list ap; va_start(ap, fmt); vfprintf(fp, fmt, ap); @@ -3741,6 +4048,7 @@ static int reduxInvCleanupMsg (redux_ctx* ctx, int ret, #else (void)fmt; #endif - + return reduxInvCleanup(ctx, ret); } + diff --git a/tests/check_reduction.c b/tests/check_reduction.c index 12a99ded30..7a2141cfae 100644 --- a/tests/check_reduction.c +++ b/tests/check_reduction.c @@ -142,7 +142,17 @@ START_TEST(test_maxandargmax_reduction){ } } } - + + if(gtMax != pMax[j]){ + fprintf(stderr, "Mismatch GT %f != %f UUT @ %zu!\n", + gtMax, pMax[j], j); + fflush(stderr); + } + if(gtArgmax != pArgmax[j]){ + fprintf(stderr, "Mismatch GT %zu != %zu UUT @ %zu!\n", + gtArgmax, pArgmax[j], j); + fflush(stderr); + } ck_assert_msg(gtMax == pMax[j], "Max value mismatch!"); ck_assert_msg(gtArgmax == pArgmax[j], "Argmax value mismatch!"); } @@ -256,6 +266,107 @@ START_TEST(test_maxandargmax_idxtranspose){ GpuArray_clear(&gaArgmax); }END_TEST +START_TEST(test_maxandargmax_bigdestination){ + pcgSeed(1); + + /** + * We test here a reduction of some random 3D tensor on the first and + * third dimensions. + */ + + size_t i,j; + size_t dims[2] = {2,131072}; + size_t prodDims = dims[0]*dims[1]; + const int reduxList[] = {0}; + + float* pSrc = calloc(1, sizeof(*pSrc) * dims[0]*dims[1]); + float* pMax = calloc(1, sizeof(*pMax) * dims[1]); + size_t* pArgmax = calloc(1, sizeof(*pArgmax) * dims[1]); + + ck_assert_ptr_ne(pSrc, NULL); + ck_assert_ptr_ne(pMax, NULL); + ck_assert_ptr_ne(pArgmax, NULL); + + + /** + * Initialize source data. + */ + + for(i=0;i gtMax){ + gtMax = v; + gtArgmax = i; + } + } + + if(gtMax != pMax[j]){ + fprintf(stderr, "Mismatch GT %f != %f UUT @ %zu!\n", + gtMax, pMax[j], j); + fflush(stderr); + } + if(gtArgmax != pArgmax[j]){ + fprintf(stderr, "Mismatch GT %zu != %zu UUT @ %zu!\n", + gtArgmax, pArgmax[j], j); + fflush(stderr); + } + ck_assert_msg(gtMax == pMax[j], "Max value mismatch!"); + ck_assert_msg(gtArgmax == pArgmax[j], "Argmax value mismatch!"); + } + + /** + * Deallocate. 
+ */ + + free(pSrc); + free(pMax); + free(pArgmax); + GpuArray_clear(&gaSrc); + GpuArray_clear(&gaMax); + GpuArray_clear(&gaArgmax); +}END_TEST + START_TEST(test_maxandargmax_veryhighrank){ pcgSeed(1); @@ -2138,7 +2249,7 @@ START_TEST(test_prod_reduction){ size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; const int reduxList[] = {0,2}; - const float TOL = 1e-5; + const float TOL = 1e-4; float* pS = calloc(1, sizeof(*pS) * dims[0]*dims[1]*dims[2]); float* pD = calloc(1, sizeof(*pD) * dims[1] ); @@ -2219,7 +2330,7 @@ START_TEST(test_prod_veryhighrank){ size_t rdxDims[4] = {1171,373,1,2}; size_t rdxProdDims = rdxDims[0]*rdxDims[1]*rdxDims[2]*rdxDims[3]; const int reduxList[] = {2,4,7,5}; - const float TOL = 1e-5; + const float TOL = 1e-4; float* pS = calloc(1, sizeof(*pS) * prodDims); float* pD = calloc(1, sizeof(*pD) * rdxProdDims); @@ -2310,7 +2421,7 @@ START_TEST(test_prod_alldimsreduced){ size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; const int reduxList[] = {0,1,2}; - const float TOL = 1e-5; + const float TOL = 1e-4; float* pS = calloc(1, sizeof(*pS) * dims[0]*dims[1]*dims[2]); float* pD = calloc(1, sizeof(*pD) ); @@ -2389,7 +2500,7 @@ START_TEST(test_prodnz_reduction){ size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; const int reduxList[] = {0,2}; - const float TOL = 1e-5; + const float TOL = 1e-4; float* pS = calloc(1, sizeof(*pS) * dims[0]*dims[1]*dims[2]); float* pD = calloc(1, sizeof(*pD) * dims[1] ); @@ -2473,7 +2584,7 @@ START_TEST(test_prodnz_veryhighrank){ size_t rdxDims[4] = {1171,373,1,2}; size_t rdxProdDims = rdxDims[0]*rdxDims[1]*rdxDims[2]*rdxDims[3]; const int reduxList[] = {2,4,7,5}; - const float TOL = 1e-5; + const float TOL = 1e-4; float* pS = calloc(1, sizeof(*pS) * prodDims); float* pD = calloc(1, sizeof(*pD) * rdxProdDims); @@ -2567,7 +2678,7 @@ START_TEST(test_prodnz_alldimsreduced){ size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; const int reduxList[] = {0,1,2}; - const float TOL = 1e-5; + const float TOL = 1e-4; float* pS = calloc(1, sizeof(*pS) * dims[0]*dims[1]*dims[2]); float* pD = calloc(1, sizeof(*pD) ); @@ -3982,9 +4093,10 @@ Suite *get_suite(void) { TCase *tc = tcase_create("basic"); tcase_add_checked_fixture(tc, setup, teardown); tcase_set_timeout(tc, 120.0); - + tcase_add_test(tc, test_maxandargmax_reduction); tcase_add_test(tc, test_maxandargmax_idxtranspose); + tcase_add_test(tc, test_maxandargmax_bigdestination); tcase_add_test(tc, test_maxandargmax_veryhighrank); tcase_add_test(tc, test_maxandargmax_alldimsreduced); From c9a0389495b3eecb638792bb560102413dea1569 Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Thu, 13 Jul 2017 23:18:21 -0400 Subject: [PATCH 22/34] More refactoring. Now, all the veryhighrank tests pass and the others fail for an unknown reason. 
--- src/cluda_cuda.h.c | 551 +++---- src/gpuarray/reduction.h | 112 +- src/gpuarray_reduction.c | 507 +++--- tests/check_reduction.c | 3369 +++++++++++++++++++++++--------------- 4 files changed, 2656 insertions(+), 1883 deletions(-) diff --git a/src/cluda_cuda.h.c b/src/cluda_cuda.h.c index ba3f88cadc..319074542e 100644 --- a/src/cluda_cuda.h.c +++ b/src/cluda_cuda.h.c @@ -174,77 +174,171 @@ static const char cluda_cuda_h[] = { 0x70, 0x65, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x5b, 0x5d, 0x3b, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x41, 0x5f, 0x57, 0x41, 0x52, 0x50, 0x5f, 0x53, 0x49, 0x5a, 0x45, 0x20, 0x77, -0x61, 0x72, 0x70, 0x53, 0x69, 0x7a, 0x65, 0x0a, 0x0a, 0x73, 0x74, -0x72, 0x75, 0x63, 0x74, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, -0x66, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x73, -0x68, 0x6f, 0x72, 0x74, 0x20, 0x64, 0x61, 0x74, 0x61, 0x3b, 0x0a, -0x7d, 0x3b, 0x0a, 0x0a, 0x73, 0x74, 0x61, 0x74, 0x69, 0x63, 0x20, -0x5f, 0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, -0x69, 0x6e, 0x6c, 0x69, 0x6e, 0x65, 0x20, 0x66, 0x6c, 0x6f, 0x61, -0x74, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x32, 0x66, -0x6c, 0x6f, 0x61, 0x74, 0x28, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, -0x66, 0x20, 0x68, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x66, 0x6c, -0x6f, 0x61, 0x74, 0x20, 0x72, 0x3b, 0x0a, 0x20, 0x20, 0x61, 0x73, -0x6d, 0x28, 0x22, 0x7b, 0x20, 0x63, 0x76, 0x74, 0x2e, 0x66, 0x33, -0x32, 0x2e, 0x66, 0x31, 0x36, 0x20, 0x25, 0x30, 0x2c, 0x20, 0x25, -0x31, 0x3b, 0x20, 0x7d, 0x5c, 0x6e, 0x22, 0x20, 0x3a, 0x20, 0x22, -0x3d, 0x66, 0x22, 0x28, 0x72, 0x29, 0x20, 0x3a, 0x20, 0x22, 0x68, -0x22, 0x28, 0x68, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x29, 0x29, 0x3b, -0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x72, -0x3b, 0x0a, 0x7d, 0x0a, 0x73, 0x74, 0x61, 0x74, 0x69, 0x63, 0x20, -0x5f, 0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, -0x69, 0x6e, 0x6c, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x68, -0x61, 0x6c, 0x66, 0x20, 0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, -0x74, 0x32, 0x68, 0x61, 0x6c, 0x66, 0x28, 0x66, 0x6c, 0x6f, 0x61, -0x74, 0x20, 0x66, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x67, 0x61, -0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x72, 0x3b, 0x0a, 0x20, 0x20, -0x61, 0x73, 0x6d, 0x28, 0x22, 0x7b, 0x20, 0x63, 0x76, 0x74, 0x2e, -0x72, 0x6e, 0x2e, 0x66, 0x31, 0x36, 0x2e, 0x66, 0x33, 0x32, 0x20, -0x25, 0x30, 0x2c, 0x20, 0x25, 0x31, 0x3b, 0x20, 0x7d, 0x5c, 0x6e, -0x22, 0x20, 0x3a, 0x20, 0x22, 0x3d, 0x68, 0x22, 0x28, 0x72, 0x2e, -0x64, 0x61, 0x74, 0x61, 0x29, 0x20, 0x3a, 0x20, 0x22, 0x66, 0x22, -0x28, 0x66, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, -0x75, 0x72, 0x6e, 0x20, 0x72, 0x3b, 0x0a, 0x7d, 0x0a, 0x0a, 0x2f, -0x2a, 0x20, 0x67, 0x61, 0x5f, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x2f, +0x61, 0x72, 0x70, 0x53, 0x69, 0x7a, 0x65, 0x0a, 0x23, 0x64, 0x65, +0x66, 0x69, 0x6e, 0x65, 0x20, 0x72, 0x65, 0x73, 0x74, 0x72, 0x69, +0x63, 0x74, 0x20, 0x5f, 0x5f, 0x72, 0x65, 0x73, 0x74, 0x72, 0x69, +0x63, 0x74, 0x5f, 0x5f, 0x0a, 0x0a, 0x73, 0x74, 0x72, 0x75, 0x63, +0x74, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x7b, +0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x73, 0x68, 0x6f, 0x72, +0x74, 0x20, 0x64, 0x61, 0x74, 0x61, 0x3b, 0x0a, 0x7d, 0x3b, 0x0a, +0x0a, 0x73, 0x74, 0x61, 0x74, 0x69, 0x63, 0x20, 0x5f, 0x5f, 0x64, +0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, 0x69, 0x6e, 0x6c, +0x69, 0x6e, 0x65, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x67, +0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x32, 0x66, 0x6c, 0x6f, 0x61, +0x74, 0x28, 0x67, 0x61, 0x5f, 0x68, 0x61, 
0x6c, 0x66, 0x20, 0x68, +0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, +0x20, 0x72, 0x3b, 0x0a, 0x20, 0x20, 0x61, 0x73, 0x6d, 0x28, 0x22, +0x7b, 0x20, 0x63, 0x76, 0x74, 0x2e, 0x66, 0x33, 0x32, 0x2e, 0x66, +0x31, 0x36, 0x20, 0x25, 0x30, 0x2c, 0x20, 0x25, 0x31, 0x3b, 0x20, +0x7d, 0x5c, 0x6e, 0x22, 0x20, 0x3a, 0x20, 0x22, 0x3d, 0x66, 0x22, +0x28, 0x72, 0x29, 0x20, 0x3a, 0x20, 0x22, 0x68, 0x22, 0x28, 0x68, +0x2e, 0x64, 0x61, 0x74, 0x61, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, +0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x72, 0x3b, 0x0a, 0x7d, +0x0a, 0x73, 0x74, 0x61, 0x74, 0x69, 0x63, 0x20, 0x5f, 0x5f, 0x64, +0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, 0x69, 0x6e, 0x6c, +0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, +0x20, 0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x32, 0x68, +0x61, 0x6c, 0x66, 0x28, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x66, +0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, +0x6c, 0x66, 0x20, 0x72, 0x3b, 0x0a, 0x20, 0x20, 0x61, 0x73, 0x6d, +0x28, 0x22, 0x7b, 0x20, 0x63, 0x76, 0x74, 0x2e, 0x72, 0x6e, 0x2e, +0x66, 0x31, 0x36, 0x2e, 0x66, 0x33, 0x32, 0x20, 0x25, 0x30, 0x2c, +0x20, 0x25, 0x31, 0x3b, 0x20, 0x7d, 0x5c, 0x6e, 0x22, 0x20, 0x3a, +0x20, 0x22, 0x3d, 0x68, 0x22, 0x28, 0x72, 0x2e, 0x64, 0x61, 0x74, +0x61, 0x29, 0x20, 0x3a, 0x20, 0x22, 0x66, 0x22, 0x28, 0x66, 0x29, +0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, +0x20, 0x72, 0x3b, 0x0a, 0x7d, 0x0a, 0x0a, 0x2f, 0x2a, 0x20, 0x67, +0x61, 0x5f, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, +0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, +0x61, 0x64, 0x64, 0x5f, 0x69, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, +0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, +0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, +0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, +0x64, 0x5f, 0x69, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, +0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, +0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, +0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, +0x5f, 0x69, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, +0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x61, +0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, +0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, +0x5f, 0x69, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, +0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x61, +0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, +0x75, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, +0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, +0x64, 0x64, 0x5f, 0x49, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, +0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, +0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, +0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, +0x5f, 0x49, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, +0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, +0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, +0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, +0x49, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, +0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, +0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, +0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 
0x63, 0x68, 0x67, 0x5f, +0x49, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, +0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, +0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x6c, +0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x2f, 0x0a, 0x5f, 0x5f, 0x64, 0x65, +0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, 0x67, 0x61, 0x5f, 0x6c, +0x6f, 0x6e, 0x67, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, +0x64, 0x5f, 0x6c, 0x67, 0x28, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, +0x67, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, +0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, +0x7b, 0x0a, 0x20, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, +0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, +0x20, 0x2a, 0x77, 0x61, 0x64, 0x64, 0x72, 0x20, 0x3d, 0x20, 0x28, +0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, +0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x29, 0x61, +0x64, 0x64, 0x72, 0x3b, 0x0a, 0x20, 0x20, 0x75, 0x6e, 0x73, 0x69, +0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, +0x6f, 0x6e, 0x67, 0x20, 0x6f, 0x6c, 0x64, 0x20, 0x3d, 0x20, 0x2a, +0x77, 0x61, 0x64, 0x64, 0x72, 0x3b, 0x0a, 0x20, 0x20, 0x75, 0x6e, +0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, +0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, +0x65, 0x64, 0x3b, 0x0a, 0x20, 0x20, 0x64, 0x6f, 0x20, 0x7b, 0x0a, +0x20, 0x20, 0x20, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, +0x20, 0x3d, 0x20, 0x6f, 0x6c, 0x64, 0x3b, 0x0a, 0x20, 0x20, 0x20, +0x20, 0x6f, 0x6c, 0x64, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, +0x69, 0x63, 0x43, 0x41, 0x53, 0x28, 0x77, 0x61, 0x64, 0x64, 0x72, +0x2c, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x2c, 0x20, +0x28, 0x76, 0x61, 0x6c, 0x20, 0x2b, 0x20, 0x28, 0x67, 0x61, 0x5f, +0x6c, 0x6f, 0x6e, 0x67, 0x29, 0x28, 0x61, 0x73, 0x73, 0x75, 0x6d, +0x65, 0x64, 0x29, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x20, +0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, 0x61, 0x73, 0x73, 0x75, +0x6d, 0x65, 0x64, 0x20, 0x21, 0x3d, 0x20, 0x6f, 0x6c, 0x64, 0x29, +0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, +0x28, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x29, 0x6f, 0x6c, +0x64, 0x3b, 0x0a, 0x7d, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, +0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, +0x6c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, +0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x6c, 0x67, 0x28, 0x61, +0x2c, 0x20, 0x62, 0x29, 0x0a, 0x5f, 0x5f, 0x64, 0x65, 0x76, 0x69, +0x63, 0x65, 0x5f, 0x5f, 0x20, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, +0x67, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, +0x5f, 0x6c, 0x67, 0x28, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, +0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, +0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, +0x0a, 0x20, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, +0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, +0x72, 0x65, 0x73, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x73, 0x20, +0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, +0x68, 0x28, 0x28, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, +0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, +0x2a, 0x29, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x76, 0x61, 0x6c, +0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, +0x20, 0x28, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x29, 0x72, +0x65, 0x73, 0x3b, 0x0a, 0x7d, 0x0a, 0x23, 
0x64, 0x65, 0x66, 0x69, +0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, +0x67, 0x5f, 0x6c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, +0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x6c, +0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, +0x67, 0x61, 0x5f, 0x75, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, -0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x69, 0x67, 0x28, 0x61, +0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x4c, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, -0x5f, 0x61, 0x64, 0x64, 0x5f, 0x69, 0x6c, 0x28, 0x61, 0x2c, 0x20, +0x5f, 0x61, 0x64, 0x64, 0x5f, 0x4c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, -0x63, 0x68, 0x67, 0x5f, 0x69, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, +0x63, 0x68, 0x67, 0x5f, 0x4c, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, -0x63, 0x68, 0x67, 0x5f, 0x69, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, +0x63, 0x68, 0x67, 0x5f, 0x4c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, -0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x2f, 0x0a, -0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, -0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x49, 0x67, 0x28, 0x61, 0x2c, -0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, -0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, -0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, -0x61, 0x64, 0x64, 0x5f, 0x49, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, -0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, -0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, -0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, -0x68, 0x67, 0x5f, 0x49, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, -0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, -0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, -0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, -0x68, 0x67, 0x5f, 0x49, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, -0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, -0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, -0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x2f, 0x0a, 0x5f, -0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, 0x67, -0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x61, 0x74, 0x6f, 0x6d, -0x5f, 0x61, 0x64, 0x64, 0x5f, 0x6c, 0x67, 0x28, 0x67, 0x61, 0x5f, -0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, -0x20, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x76, 0x61, +0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x2a, 0x2f, +0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, +0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x66, 0x67, 0x28, 0x61, +0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, +0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 
0x0a, 0x23, +0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, +0x5f, 0x61, 0x64, 0x64, 0x5f, 0x66, 0x6c, 0x28, 0x61, 0x2c, 0x20, +0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, +0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, +0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, +0x63, 0x68, 0x67, 0x5f, 0x66, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, +0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, +0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, +0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, +0x63, 0x68, 0x67, 0x5f, 0x66, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, +0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, +0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, +0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x20, 0x2a, +0x2f, 0x0a, 0x23, 0x69, 0x66, 0x20, 0x5f, 0x5f, 0x43, 0x55, 0x44, +0x41, 0x5f, 0x41, 0x52, 0x43, 0x48, 0x5f, 0x5f, 0x20, 0x3c, 0x20, +0x36, 0x30, 0x30, 0x0a, 0x5f, 0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, +0x65, 0x5f, 0x5f, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, +0x6c, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, +0x5f, 0x64, 0x67, 0x28, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, +0x6c, 0x65, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, +0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x77, 0x61, 0x64, 0x64, 0x72, 0x20, @@ -262,225 +356,90 @@ static const char cluda_cuda_h[] = { 0x20, 0x20, 0x20, 0x20, 0x6f, 0x6c, 0x64, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x43, 0x41, 0x53, 0x28, 0x77, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, -0x64, 0x2c, 0x20, 0x28, 0x76, 0x61, 0x6c, 0x20, 0x2b, 0x20, 0x28, -0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x29, 0x28, 0x61, 0x73, -0x73, 0x75, 0x6d, 0x65, 0x64, 0x29, 0x29, 0x29, 0x3b, 0x0a, 0x20, -0x20, 0x7d, 0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, 0x61, -0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x20, 0x21, 0x3d, 0x20, 0x6f, -0x6c, 0x64, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, -0x72, 0x6e, 0x20, 0x28, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, -0x29, 0x6f, 0x6c, 0x64, 0x3b, 0x0a, 0x7d, 0x0a, 0x23, 0x64, 0x65, +0x64, 0x2c, 0x20, 0x5f, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, +0x5f, 0x61, 0x73, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x6c, 0x6f, 0x6e, +0x67, 0x28, 0x76, 0x61, 0x6c, 0x20, 0x2b, 0x20, 0x5f, 0x5f, 0x6c, +0x6f, 0x6e, 0x67, 0x6c, 0x6f, 0x6e, 0x67, 0x5f, 0x61, 0x73, 0x5f, +0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x28, 0x61, 0x73, 0x73, 0x75, +0x6d, 0x65, 0x64, 0x29, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, +0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, 0x61, 0x73, 0x73, +0x75, 0x6d, 0x65, 0x64, 0x20, 0x21, 0x3d, 0x20, 0x6f, 0x6c, 0x64, +0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, +0x20, 0x5f, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x6c, 0x6f, 0x6e, 0x67, +0x5f, 0x61, 0x73, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x28, +0x6f, 0x6c, 0x64, 0x29, 0x3b, 0x0a, 0x7d, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, -0x64, 0x64, 0x5f, 0x6c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, -0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x6c, -0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x5f, 0x5f, 0x64, -0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, 0x67, 0x61, 0x5f, 
-0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, -0x63, 0x68, 0x67, 0x5f, 0x6c, 0x67, 0x28, 0x67, 0x61, 0x5f, 0x6c, -0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, -0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x76, 0x61, 0x6c, -0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, -0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, -0x6e, 0x67, 0x20, 0x72, 0x65, 0x73, 0x3b, 0x0a, 0x20, 0x20, 0x72, -0x65, 0x73, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, -0x45, 0x78, 0x63, 0x68, 0x28, 0x28, 0x75, 0x6e, 0x73, 0x69, 0x67, -0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, -0x6e, 0x67, 0x20, 0x2a, 0x29, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, -0x76, 0x61, 0x6c, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, -0x75, 0x72, 0x6e, 0x20, 0x28, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, -0x67, 0x29, 0x72, 0x65, 0x73, 0x3b, 0x0a, 0x7d, 0x0a, 0x23, 0x64, -0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, -0x78, 0x63, 0x68, 0x67, 0x5f, 0x6c, 0x6c, 0x28, 0x61, 0x2c, 0x20, -0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, -0x67, 0x5f, 0x6c, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, -0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x6c, 0x6f, 0x6e, 0x67, -0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, -0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x4c, -0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, -0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, -0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, -0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x4c, 0x6c, 0x28, -0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, -0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, -0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, -0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x4c, 0x67, 0x28, 0x61, -0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, -0x45, 0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, -0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, -0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x4c, 0x6c, 0x28, 0x61, -0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, -0x45, 0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, -0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, -0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, -0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x66, -0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, -0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, -0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, -0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x66, 0x6c, 0x28, -0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, -0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, -0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, -0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x66, 0x67, 0x28, 0x61, -0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, -0x45, 0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, -0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, -0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x66, 0x6c, 0x28, 0x61, +0x64, 0x64, 0x5f, 0x64, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, +0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x64, +0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x65, 0x6c, 
+0x73, 0x65, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, +0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x64, 0x67, +0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, +0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, +0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, +0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x64, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, -0x45, 0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, -0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, -0x65, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x69, 0x66, 0x20, 0x5f, 0x5f, -0x43, 0x55, 0x44, 0x41, 0x5f, 0x41, 0x52, 0x43, 0x48, 0x5f, 0x5f, -0x20, 0x3c, 0x20, 0x36, 0x30, 0x30, 0x0a, 0x5f, 0x5f, 0x64, 0x65, -0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, 0x67, 0x61, 0x5f, 0x64, -0x6f, 0x75, 0x62, 0x6c, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, -0x61, 0x64, 0x64, 0x5f, 0x64, 0x67, 0x28, 0x67, 0x61, 0x5f, 0x64, +0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, +0x65, 0x6e, 0x64, 0x69, 0x66, 0x0a, 0x5f, 0x5f, 0x64, 0x65, 0x76, +0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, +0x75, 0x62, 0x6c, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, +0x63, 0x68, 0x67, 0x5f, 0x64, 0x67, 0x28, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, 0x6e, -0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x77, 0x61, 0x64, -0x64, 0x72, 0x20, 0x3d, 0x20, 0x28, 0x75, 0x6e, 0x73, 0x69, 0x67, -0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, -0x6e, 0x67, 0x20, 0x2a, 0x29, 0x61, 0x64, 0x64, 0x72, 0x3b, 0x0a, -0x20, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, -0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6f, -0x6c, 0x64, 0x20, 0x3d, 0x20, 0x2a, 0x77, 0x61, 0x64, 0x64, 0x72, -0x3b, 0x0a, 0x20, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, -0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, -0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x3b, 0x0a, 0x20, +0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x72, 0x65, 0x73, 0x3b, +0x0a, 0x20, 0x20, 0x72, 0x65, 0x73, 0x20, 0x3d, 0x20, 0x61, 0x74, +0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x28, 0x75, +0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, 0x6e, +0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x29, 0x61, 0x64, +0x64, 0x72, 0x2c, 0x20, 0x5f, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, +0x65, 0x5f, 0x61, 0x73, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x6c, 0x6f, +0x6e, 0x67, 0x28, 0x76, 0x61, 0x6c, 0x29, 0x29, 0x3b, 0x0a, 0x20, +0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x5f, 0x5f, 0x6c, +0x6f, 0x6e, 0x67, 0x6c, 0x6f, 0x6e, 0x67, 0x5f, 0x61, 0x73, 0x5f, +0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x28, 0x72, 0x65, 0x73, 0x29, +0x3b, 0x0a, 0x7d, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, +0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, +0x64, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, +0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x64, 0x67, 0x28, +0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, +0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x2f, 0x0a, 0x5f, 0x5f, +0x64, 0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, 0x67, 0x61, +0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, +0x61, 
0x64, 0x64, 0x5f, 0x65, 0x67, 0x28, 0x67, 0x61, 0x5f, 0x68, +0x61, 0x6c, 0x66, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, +0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x76, 0x61, 0x6c, +0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, +0x6e, 0x74, 0x20, 0x2a, 0x62, 0x61, 0x73, 0x65, 0x20, 0x3d, 0x20, +0x28, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x29, +0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, +0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x7e, 0x32, 0x29, 0x3b, 0x0a, +0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x6f, +0x6c, 0x64, 0x2c, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, +0x2c, 0x20, 0x73, 0x75, 0x6d, 0x2c, 0x20, 0x6e, 0x65, 0x77, 0x5f, +0x3b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, +0x20, 0x74, 0x6d, 0x70, 0x3b, 0x0a, 0x20, 0x20, 0x6f, 0x6c, 0x64, +0x20, 0x3d, 0x20, 0x2a, 0x62, 0x61, 0x73, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x64, 0x6f, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x20, 0x3d, 0x20, 0x6f, 0x6c, -0x64, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x6c, 0x64, 0x20, -0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x43, 0x41, 0x53, -0x28, 0x77, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x61, 0x73, 0x73, -0x75, 0x6d, 0x65, 0x64, 0x2c, 0x20, 0x5f, 0x5f, 0x64, 0x6f, 0x75, -0x62, 0x6c, 0x65, 0x5f, 0x61, 0x73, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, -0x6c, 0x6f, 0x6e, 0x67, 0x28, 0x76, 0x61, 0x6c, 0x20, 0x2b, 0x20, -0x5f, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x6c, 0x6f, 0x6e, 0x67, 0x5f, -0x61, 0x73, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x28, 0x61, -0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x29, 0x29, 0x29, 0x3b, 0x0a, -0x20, 0x20, 0x7d, 0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, -0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x20, 0x21, 0x3d, 0x20, -0x6f, 0x6c, 0x64, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, -0x75, 0x72, 0x6e, 0x20, 0x5f, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x6c, -0x6f, 0x6e, 0x67, 0x5f, 0x61, 0x73, 0x5f, 0x64, 0x6f, 0x75, 0x62, -0x6c, 0x65, 0x28, 0x6f, 0x6c, 0x64, 0x29, 0x3b, 0x0a, 0x7d, 0x0a, -0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, -0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x64, 0x6c, 0x28, 0x61, 0x2c, -0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, -0x64, 0x5f, 0x64, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, -0x23, 0x65, 0x6c, 0x73, 0x65, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, -0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, -0x5f, 0x64, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, -0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, -0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, -0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x64, -0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, -0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, -0x29, 0x0a, 0x23, 0x65, 0x6e, 0x64, 0x69, 0x66, 0x0a, 0x5f, 0x5f, -0x64, 0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, 0x67, 0x61, -0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x20, 0x61, 0x74, 0x6f, -0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x64, 0x67, 0x28, 0x67, -0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x20, 0x2a, 0x61, -0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, -0x62, 0x6c, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x0a, -0x20, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, -0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x72, -0x65, 0x73, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x73, 0x20, 0x3d, -0x20, 0x61, 
0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, -0x28, 0x28, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, -0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, -0x29, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x5f, 0x5f, 0x64, 0x6f, -0x75, 0x62, 0x6c, 0x65, 0x5f, 0x61, 0x73, 0x5f, 0x6c, 0x6f, 0x6e, -0x67, 0x6c, 0x6f, 0x6e, 0x67, 0x28, 0x76, 0x61, 0x6c, 0x29, 0x29, -0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, -0x5f, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x6c, 0x6f, 0x6e, 0x67, 0x5f, -0x61, 0x73, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x28, 0x72, -0x65, 0x73, 0x29, 0x3b, 0x0a, 0x7d, 0x0a, 0x23, 0x64, 0x65, 0x66, -0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, -0x68, 0x67, 0x5f, 0x64, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, -0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, -0x64, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, -0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x2f, -0x0a, 0x5f, 0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, -0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x61, 0x74, -0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x65, 0x67, 0x28, 0x67, -0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x61, 0x64, 0x64, -0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, -0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x67, 0x61, -0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x62, 0x61, 0x73, 0x65, -0x20, 0x3d, 0x20, 0x28, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, -0x20, 0x2a, 0x29, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, -0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x7e, 0x32, -0x29, 0x3b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, -0x74, 0x20, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x61, 0x73, 0x73, 0x75, -0x6d, 0x65, 0x64, 0x2c, 0x20, 0x73, 0x75, 0x6d, 0x2c, 0x20, 0x6e, -0x65, 0x77, 0x5f, 0x3b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, -0x61, 0x6c, 0x66, 0x20, 0x74, 0x6d, 0x70, 0x3b, 0x0a, 0x20, 0x20, -0x6f, 0x6c, 0x64, 0x20, 0x3d, 0x20, 0x2a, 0x62, 0x61, 0x73, 0x65, -0x3b, 0x0a, 0x20, 0x20, 0x64, 0x6f, 0x20, 0x7b, 0x0a, 0x20, 0x20, -0x20, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x20, 0x3d, -0x20, 0x6f, 0x6c, 0x64, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x74, -0x6d, 0x70, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x20, 0x3d, 0x20, 0x5f, -0x5f, 0x62, 0x79, 0x74, 0x65, 0x5f, 0x70, 0x65, 0x72, 0x6d, 0x28, -0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x30, 0x2c, 0x20, 0x28, 0x28, 0x67, -0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, -0x20, 0x26, 0x20, 0x32, 0x29, 0x20, 0x3f, 0x20, 0x30, 0x78, 0x34, -0x34, 0x33, 0x32, 0x20, 0x3a, 0x20, 0x30, 0x78, 0x34, 0x34, 0x31, -0x30, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x73, 0x75, 0x6d, -0x20, 0x3d, 0x20, 0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, -0x32, 0x68, 0x61, 0x6c, 0x66, 0x28, 0x67, 0x61, 0x5f, 0x68, 0x61, -0x6c, 0x66, 0x32, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x28, 0x76, 0x61, -0x6c, 0x29, 0x20, 0x2b, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, -0x66, 0x32, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x28, 0x74, 0x6d, 0x70, -0x29, 0x29, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x3b, 0x0a, 0x20, 0x20, -0x20, 0x20, 0x6e, 0x65, 0x77, 0x5f, 0x20, 0x3d, 0x20, 0x5f, 0x5f, -0x62, 0x79, 0x74, 0x65, 0x5f, 0x70, 0x65, 0x72, 0x6d, 0x28, 0x6f, -0x6c, 0x64, 0x2c, 0x20, 0x73, 0x75, 0x6d, 0x2c, 0x20, 0x28, 0x28, -0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, -0x72, 0x20, 0x26, 0x20, 0x32, 0x29, 0x20, 0x3f, 0x20, 0x30, 0x78, -0x35, 0x34, 0x31, 0x30, 0x20, 0x3a, 0x20, 0x30, 0x78, 0x33, 0x32, -0x35, 0x34, 
0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x6c, -0x64, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x43, -0x41, 0x53, 0x28, 0x62, 0x61, 0x73, 0x65, 0x2c, 0x20, 0x61, 0x73, -0x73, 0x75, 0x6d, 0x65, 0x64, 0x2c, 0x20, 0x6e, 0x65, 0x77, 0x5f, -0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x20, 0x77, 0x68, 0x69, 0x6c, -0x65, 0x20, 0x28, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x20, -0x21, 0x3d, 0x20, 0x6f, 0x6c, 0x64, 0x29, 0x3b, 0x0a, 0x20, 0x20, -0x74, 0x6d, 0x70, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x20, 0x3d, 0x20, -0x5f, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x5f, 0x70, 0x65, 0x72, 0x6d, -0x28, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x30, 0x2c, 0x20, 0x28, 0x28, -0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, -0x72, 0x20, 0x26, 0x20, 0x32, 0x29, 0x20, 0x3f, 0x20, 0x30, 0x78, -0x34, 0x34, 0x33, 0x32, 0x20, 0x3a, 0x20, 0x30, 0x78, 0x34, 0x34, -0x31, 0x30, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, -0x72, 0x6e, 0x20, 0x74, 0x6d, 0x70, 0x3b, 0x0a, 0x7d, 0x0a, 0x23, -0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, -0x5f, 0x61, 0x64, 0x64, 0x5f, 0x65, 0x6c, 0x28, 0x61, 0x2c, 0x20, -0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, -0x5f, 0x65, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x0a, -0x5f, 0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, -0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x61, 0x74, 0x6f, -0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x65, 0x67, 0x28, 0x67, -0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x61, 0x64, 0x64, -0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, -0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x67, 0x61, -0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x62, 0x61, 0x73, 0x65, -0x20, 0x3d, 0x20, 0x28, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, -0x20, 0x2a, 0x29, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, -0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x7e, 0x32, -0x29, 0x3b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, -0x74, 0x20, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x61, 0x73, 0x73, 0x75, -0x6d, 0x65, 0x64, 0x2c, 0x20, 0x6e, 0x65, 0x77, 0x5f, 0x3b, 0x0a, -0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x74, -0x6d, 0x70, 0x3b, 0x0a, 0x20, 0x20, 0x6f, 0x6c, 0x64, 0x20, 0x3d, -0x20, 0x2a, 0x62, 0x61, 0x73, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x64, -0x6f, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x61, 0x73, 0x73, -0x75, 0x6d, 0x65, 0x64, 0x20, 0x3d, 0x20, 0x6f, 0x6c, 0x64, 0x3b, -0x0a, 0x20, 0x20, 0x20, 0x20, 0x6e, 0x65, 0x77, 0x5f, 0x20, 0x3d, -0x20, 0x5f, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x5f, 0x70, 0x65, 0x72, -0x6d, 0x28, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x2e, -0x64, 0x61, 0x74, 0x61, 0x2c, 0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, +0x64, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x74, 0x6d, 0x70, 0x2e, +0x64, 0x61, 0x74, 0x61, 0x20, 0x3d, 0x20, 0x5f, 0x5f, 0x62, 0x79, +0x74, 0x65, 0x5f, 0x70, 0x65, 0x72, 0x6d, 0x28, 0x6f, 0x6c, 0x64, +0x2c, 0x20, 0x30, 0x2c, 0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, +0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, +0x32, 0x29, 0x20, 0x3f, 0x20, 0x30, 0x78, 0x34, 0x34, 0x33, 0x32, +0x20, 0x3a, 0x20, 0x30, 0x78, 0x34, 0x34, 0x31, 0x30, 0x29, 0x3b, +0x0a, 0x20, 0x20, 0x20, 0x20, 0x73, 0x75, 0x6d, 0x20, 0x3d, 0x20, +0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x32, 0x68, 0x61, +0x6c, 0x66, 0x28, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x32, +0x66, 0x6c, 0x6f, 0x61, 0x74, 0x28, 0x76, 0x61, 0x6c, 0x29, 0x20, +0x2b, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x32, 0x66, +0x6c, 0x6f, 
0x61, 0x74, 0x28, 0x74, 0x6d, 0x70, 0x29, 0x29, 0x2e, +0x64, 0x61, 0x74, 0x61, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6e, +0x65, 0x77, 0x5f, 0x20, 0x3d, 0x20, 0x5f, 0x5f, 0x62, 0x79, 0x74, +0x65, 0x5f, 0x70, 0x65, 0x72, 0x6d, 0x28, 0x6f, 0x6c, 0x64, 0x2c, +0x20, 0x73, 0x75, 0x6d, 0x2c, 0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x32, 0x29, 0x20, 0x3f, 0x20, 0x30, 0x78, 0x35, 0x34, 0x31, 0x30, 0x20, 0x3a, 0x20, 0x30, 0x78, 0x33, 0x32, 0x35, 0x34, 0x29, @@ -499,8 +458,52 @@ static const char cluda_cuda_h[] = { 0x32, 0x20, 0x3a, 0x20, 0x30, 0x78, 0x34, 0x34, 0x31, 0x30, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x6d, 0x70, 0x3b, 0x0a, 0x7d, 0x0a, 0x23, 0x64, 0x65, 0x66, -0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, -0x68, 0x67, 0x5f, 0x65, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, +0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, +0x64, 0x5f, 0x65, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, +0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x65, 0x67, +0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x0a, 0x5f, 0x5f, 0x64, +0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, 0x67, 0x61, 0x5f, +0x68, 0x61, 0x6c, 0x66, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, +0x63, 0x68, 0x67, 0x5f, 0x65, 0x67, 0x28, 0x67, 0x61, 0x5f, 0x68, +0x61, 0x6c, 0x66, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, +0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x76, 0x61, 0x6c, +0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, +0x6e, 0x74, 0x20, 0x2a, 0x62, 0x61, 0x73, 0x65, 0x20, 0x3d, 0x20, +0x28, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x29, +0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, +0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x7e, 0x32, 0x29, 0x3b, 0x0a, +0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x6f, +0x6c, 0x64, 0x2c, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, +0x2c, 0x20, 0x6e, 0x65, 0x77, 0x5f, 0x3b, 0x0a, 0x20, 0x20, 0x67, +0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x74, 0x6d, 0x70, 0x3b, +0x0a, 0x20, 0x20, 0x6f, 0x6c, 0x64, 0x20, 0x3d, 0x20, 0x2a, 0x62, +0x61, 0x73, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x64, 0x6f, 0x20, 0x7b, +0x0a, 0x20, 0x20, 0x20, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, +0x64, 0x20, 0x3d, 0x20, 0x6f, 0x6c, 0x64, 0x3b, 0x0a, 0x20, 0x20, +0x20, 0x20, 0x6e, 0x65, 0x77, 0x5f, 0x20, 0x3d, 0x20, 0x5f, 0x5f, +0x62, 0x79, 0x74, 0x65, 0x5f, 0x70, 0x65, 0x72, 0x6d, 0x28, 0x6f, +0x6c, 0x64, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x2e, 0x64, 0x61, 0x74, +0x61, 0x2c, 0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, +0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x32, 0x29, +0x20, 0x3f, 0x20, 0x30, 0x78, 0x35, 0x34, 0x31, 0x30, 0x20, 0x3a, +0x20, 0x30, 0x78, 0x33, 0x32, 0x35, 0x34, 0x29, 0x3b, 0x0a, 0x20, +0x20, 0x20, 0x20, 0x6f, 0x6c, 0x64, 0x20, 0x3d, 0x20, 0x61, 0x74, +0x6f, 0x6d, 0x69, 0x63, 0x43, 0x41, 0x53, 0x28, 0x62, 0x61, 0x73, +0x65, 0x2c, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x2c, +0x20, 0x6e, 0x65, 0x77, 0x5f, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, +0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, 0x61, 0x73, 0x73, +0x75, 0x6d, 0x65, 0x64, 0x20, 0x21, 0x3d, 0x20, 0x6f, 0x6c, 0x64, +0x29, 0x3b, 0x0a, 0x20, 0x20, 0x74, 0x6d, 0x70, 0x2e, 0x64, 0x61, +0x74, 0x61, 0x20, 0x3d, 0x20, 0x5f, 0x5f, 0x62, 0x79, 0x74, 0x65, +0x5f, 0x70, 0x65, 0x72, 0x6d, 0x28, 0x6f, 0x6c, 0x64, 0x2c, 0x20, +0x30, 0x2c, 0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, +0x65, 0x29, 0x61, 0x64, 0x64, 
0x72, 0x20, 0x26, 0x20, 0x32, 0x29, +0x20, 0x3f, 0x20, 0x30, 0x78, 0x34, 0x34, 0x33, 0x32, 0x20, 0x3a, +0x20, 0x30, 0x78, 0x34, 0x34, 0x31, 0x30, 0x29, 0x3b, 0x0a, 0x20, +0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x6d, 0x70, +0x3b, 0x0a, 0x7d, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, -0x65, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x65, -0x6e, 0x64, 0x69, 0x66, 0x0a, 0x00}; +0x65, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, +0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x65, 0x67, 0x28, +0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x65, 0x6e, 0x64, 0x69, +0x66, 0x0a, 0x00}; diff --git a/src/gpuarray/reduction.h b/src/gpuarray/reduction.h index 77043daa22..a536c26371 100644 --- a/src/gpuarray/reduction.h +++ b/src/gpuarray/reduction.h @@ -22,8 +22,10 @@ extern "C" { /* Data Structures */ +struct GpuReductionAttr; struct GpuReduction; -typedef struct GpuReduction GpuReduction; +typedef struct GpuReductionAttr GpuReductionAttr; +typedef struct GpuReduction GpuReduction; /** @@ -31,59 +33,87 @@ typedef struct GpuReduction GpuReduction; */ typedef enum _ga_reduce_op { - /* dst , dstArg */ - GA_REDUCE_SUM, /* + */ - GA_REDUCE_PROD, /* * */ - GA_REDUCE_PRODNZ, /* * (!=0) */ - GA_REDUCE_MIN, /* min() */ - GA_REDUCE_MAX, /* max() */ - GA_REDUCE_ARGMIN, /* argmin() */ - GA_REDUCE_ARGMAX, /* argmax() */ - GA_REDUCE_MINANDARGMIN, /* min() , argmin() */ - GA_REDUCE_MAXANDARGMAX, /* max() , argmax() */ - GA_REDUCE_AND, /* & */ - GA_REDUCE_OR, /* | */ - GA_REDUCE_XOR, /* ^ */ - GA_REDUCE_ALL, /* &&/all() */ - GA_REDUCE_ANY, /* ||/any() */ + /* d0 , d1 */ + GA_ELEMWISE, + GA_REDUCE_COPY=GA_ELEMWISE, /* (copy) */ + GA_REDUCE_SUM, /* + */ + GA_REDUCE_PROD, /* * */ + GA_REDUCE_PRODNZ, /* * (!=0) */ + GA_REDUCE_MIN, /* min() */ + GA_REDUCE_MAX, /* max() */ + GA_REDUCE_ARGMIN, /* argmin() */ + GA_REDUCE_ARGMAX, /* argmax() */ + GA_REDUCE_MINANDARGMIN, /* min() , argmin() */ + GA_REDUCE_MAXANDARGMAX, /* max() , argmax() */ + GA_REDUCE_AND, /* & */ + GA_REDUCE_OR, /* | */ + GA_REDUCE_XOR, /* ^ */ + GA_REDUCE_ALL, /* &&/all() */ + GA_REDUCE_ANY, /* ||/any() */ - GA_REDUCE_ENDSUPPORTED /* Must be last element in enum */ + GA_REDUCE_ENDSUPPORTED /* Must be last element in enum */ } ga_reduce_op; /* External Functions */ /** - * @brief Create a new GPU reduction operator over a list of axes to reduce. + * @brief Create, modify and free the attributes of a reduction operator. + * + * @param [out] grAttr The reduction operator attributes object. + * @param [in] op The reduction operation. + * @param [in] maxSrcDims The maximum number of supported source dimensions. + * @param [in] maxDstDims The maximum number of supported destination dimensions. + * @param [in] s0Typecode The typecode of the source tensor. + * @param [in] d0Typecode The typecode of the first destination tensor. + * @param [in] d1Typecode The typecode of the second destination tensor. + * @param [in] i0Typecode The typecode of the indices. 
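The attribute object is configured through the setters declared below, then handed to GpuReduction_new() and finally GpuReduction_call(). A minimal usage sketch for a max-and-argmax over axes 0 and 2 of a 3D float tensor; the names ctx, gaS0, gaD0 and gaD1 are assumed to be a valid gpucontext* and suitably-shaped GpuArrays (source, reduced values, reduced arguments):

    GpuReductionAttr* grAttr = NULL;
    GpuReduction*     gr     = NULL;
    const int         reduxList[] = {0, 2};      /* reduce axes 0 and 2           */

    GpuReductionAttr_new      (&grAttr, ctx);
    GpuReductionAttr_setop    (grAttr, GA_REDUCE_MAXANDARGMAX);
    GpuReductionAttr_setdims  (grAttr, 3, 1);    /* up to 3 source dims, 1 destination dim */
    GpuReductionAttr_sets0type(grAttr, GA_FLOAT);/* d0/d1/i0 keep their defaults  */
    GpuReduction_new          (&gr, grAttr);
    GpuReduction_call         (gr, &gaD0, &gaD1, &gaS0, 2, reduxList, 0);
    GpuReduction_free         (gr);
    GpuReductionAttr_free     (grAttr);

Error checking is omitted for brevity; each call returns GA_NO_ERROR on success. For this configuration GpuReductionAttr_appendopname() would name the generated kernel "MaxAndArgmax_3_1".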
+ */ + +GPUARRAY_PUBLIC int GpuReductionAttr_new (GpuReductionAttr** grAttr, + gpucontext* gpuCtx); +GPUARRAY_PUBLIC int GpuReductionAttr_setop (GpuReductionAttr* grAttr, + ga_reduce_op op); +GPUARRAY_PUBLIC int GpuReductionAttr_setdims (GpuReductionAttr* grAttr, + unsigned maxSrcDims, + unsigned maxDstDims); +GPUARRAY_PUBLIC int GpuReductionAttr_sets0type (GpuReductionAttr* grAttr, + int s0Typecode); +GPUARRAY_PUBLIC int GpuReductionAttr_setd0type (GpuReductionAttr* grAttr, + int d0Typecode); +GPUARRAY_PUBLIC int GpuReductionAttr_setd1type (GpuReductionAttr* grAttr, + int d1Typecode); +GPUARRAY_PUBLIC int GpuReductionAttr_seti0type (GpuReductionAttr* grAttr, + int i0Typecode); +GPUARRAY_PUBLIC int GpuReductionAttr_appendopname (GpuReductionAttr* grAttr, + size_t n, + char* name); +GPUARRAY_PUBLIC int GpuReductionAttr_issensitive (const GpuReductionAttr* grAttr); +GPUARRAY_PUBLIC int GpuReductionAttr_requiresS0 (const GpuReductionAttr* grAttr); +GPUARRAY_PUBLIC int GpuReductionAttr_requiresD0 (const GpuReductionAttr* grAttr); +GPUARRAY_PUBLIC int GpuReductionAttr_requiresD1 (const GpuReductionAttr* grAttr); +GPUARRAY_PUBLIC void GpuReductionAttr_free (GpuReductionAttr* grAttr); + +/** + * @brief Create a new GPU reduction operator with the given attributes. * * @param [out] gr The reduction operator. - * @param [in] gpuCtx The GPU context. - * @param [in] op The reduction operation to perform. - * @param [in] ndf The minimum number of free (destination) dimensions to support. - * @param [in] ndr The minimum number of reduction (source) dimensions to support. - * @param [in] s0TypeCode The data type of the source operand. - * @param [in] flags Reduction operator creation flags. Currently must be - * set to 0. + * @param [in] grAttr The GPU context. * * @return GA_NO_ERROR if the operator was created successfully - * GA_INVALID_ERROR if grOut is NULL, or some other argument was invalid + * GA_INVALID_ERROR if some argument was invalid * GA_NO_MEMORY if memory allocation failed anytime during creation * or other non-zero error codes otherwise. */ -GPUARRAY_PUBLIC int GpuReduction_new (GpuReduction** grOut, - gpucontext* gpuCtx, - ga_reduce_op op, - unsigned ndf, - unsigned ndr, - int s0TypeCode, - int flags); +GPUARRAY_PUBLIC int GpuReduction_new (GpuReduction** gr, + const GpuReductionAttr* grAttr); /** * @brief Deallocate an operator allocated by GpuReduction_new(). */ -GPUARRAY_PUBLIC void GpuReduction_free (GpuReduction* gr); +GPUARRAY_PUBLIC void GpuReduction_free (GpuReduction* gr); /** * @brief Invoke an operator allocated by GpuReduction_new() on a source tensor. @@ -123,13 +153,13 @@ GPUARRAY_PUBLIC void GpuReduction_free (GpuReduction* gr); * error code otherwise. */ -GPUARRAY_PUBLIC int GpuReduction_call (const GpuReduction* gr, - GpuArray* d0, - GpuArray* d1, - const GpuArray* s0, - unsigned reduxLen, - const int* reduxList, - int flags); +GPUARRAY_PUBLIC int GpuReduction_call (const GpuReduction* gr, + GpuArray* d0, + GpuArray* d1, + const GpuArray* s0, + unsigned reduxLen, + const int* reduxList, + int flags); #ifdef __cplusplus diff --git a/src/gpuarray_reduction.c b/src/gpuarray_reduction.c index 35518c0fbc..baead32518 100644 --- a/src/gpuarray_reduction.c +++ b/src/gpuarray_reduction.c @@ -77,7 +77,6 @@ typedef struct axis_desc axis_desc; struct redux_ctx{ /* Function Arguments. 
*/ const GpuReduction* gr; - ga_reduce_op op; GpuArray* d0; GpuArray* d1; const GpuArray* s0; @@ -90,8 +89,6 @@ struct redux_ctx{ int nds0r; /* # Reduced axes */ int ndd0; /* # Destination axes */ int ndfs0; /* # Flattened source axes */ - int ndfs0r; /* # Flattened source axes */ - int ndfd0; /* # Flattened source axes */ int ndib; /* # Intra-block axes */ int zeroAllAxes; /* # of zero-length axes in source tensor */ int zeroRdxAxes; /* # of zero-length reduction axes in source tensor */ @@ -145,6 +142,23 @@ struct redux_ctx{ typedef struct redux_ctx redux_ctx; + +/** + * Reduction Operator Attributes. + */ + +struct GpuReductionAttr{ + gpucontext* gpuCtx; + unsigned numProcs; + size_t maxLg, maxL0, maxGg, maxG0, maxLM; + + ga_reduce_op op; + int maxSrcDims; + int maxDstDims; + int s0Typecode, d0Typecode, d1Typecode, i0Typecode; +}; + + /** * Reduction Operator. * @@ -232,15 +246,12 @@ typedef struct redux_ctx redux_ctx; struct GpuReduction{ /* Function Arguments. */ + GpuReductionAttr grAttr; gpucontext* gpuCtx; ga_reduce_op op; + int nds; int ndd; int ndr; - int TS0tc; - int flags; - - /* Misc */ - int nds; /* Source code Generator. */ strb s; @@ -248,9 +259,11 @@ struct GpuReduction{ char kName[256]; char* kSourceCode; size_t kSourceCodeLen; + int TS0tc; int TPS0tc; int TD0tc; int TD1tc; + int TI0tc; int TS32tc; int TU32tc; int TS64tc; @@ -275,12 +288,6 @@ struct GpuReduction{ GpuKernel k; /* Scheduling */ - unsigned numProcs; - size_t maxLg; - size_t maxL0; - size_t maxGg; - size_t maxG0; - size_t maxLM; size_t maxLK; size_t maxBS; int log2MaxBS; @@ -304,8 +311,6 @@ static int reduxGetMinInit (int typecode, const char** pr static int reduxGetMaxInit (int typecode, const char** property); static int reduxGetAndInit (int typecode, const char** property); static int reduxGetOrInit (int typecode, const char** property); -static int reduxIsSensitive (int op); -static const char* reduxGetOpName (int op); static int reduxIsFloatingPoint (int typecode); static unsigned reduxCeilLog2 (uint64_t x); static uint64_t reduxNextPow2 (uint64_t x); @@ -471,41 +476,213 @@ static void reduxSortAxisPtrsBy (axis_desc** ptrs, /* Function Implementations */ /* Extern Functions */ -GPUARRAY_PUBLIC int GpuReduction_new (GpuReduction** grOut, - gpucontext* gpuCtx, - ga_reduce_op op, - unsigned ndf, - unsigned ndr, - int s0TypeCode, - int flags){ - if (!grOut){ +GPUARRAY_PUBLIC int GpuReductionAttr_new (GpuReductionAttr** grAttr, + gpucontext* gpuCtx){ + if(!grAttr){ + return GA_INVALID_ERROR; + } + if(!gpuCtx){ + *grAttr = NULL; + return GA_INVALID_ERROR; + } + *grAttr = calloc(1, sizeof(**grAttr)); + if(!*grAttr){ + return GA_MEMORY_ERROR; + } + + (*grAttr)->gpuCtx = gpuCtx; + if (gpucontext_property(gpuCtx, GA_CTX_PROP_NUMPROCS, &(*grAttr)->numProcs) != GA_NO_ERROR || + gpucontext_property(gpuCtx, GA_CTX_PROP_MAXLSIZE, &(*grAttr)->maxLg) != GA_NO_ERROR || + gpucontext_property(gpuCtx, GA_CTX_PROP_MAXLSIZE0, &(*grAttr)->maxL0) != GA_NO_ERROR || + gpucontext_property(gpuCtx, GA_CTX_PROP_MAXGSIZE, &(*grAttr)->maxGg) != GA_NO_ERROR || + gpucontext_property(gpuCtx, GA_CTX_PROP_MAXGSIZE0, &(*grAttr)->maxG0) != GA_NO_ERROR || + gpucontext_property(gpuCtx, GA_CTX_PROP_LMEMSIZE, &(*grAttr)->maxLM) != GA_NO_ERROR ){ + free(*grAttr); + return GA_INVALID_ERROR; + } + (*grAttr)->op = GA_REDUCE_SUM; + (*grAttr)->maxSrcDims = 1; + (*grAttr)->maxDstDims = 1; + (*grAttr)->s0Typecode = GA_FLOAT; + (*grAttr)->d0Typecode = GA_FLOAT; + (*grAttr)->d1Typecode = GA_ULONG; + (*grAttr)->i0Typecode = GA_ULONG; + + return GA_NO_ERROR; 
+} +GPUARRAY_PUBLIC int GpuReductionAttr_setop (GpuReductionAttr* grAttr, + ga_reduce_op op){ + grAttr->op = op; + + return GA_NO_ERROR; +} +GPUARRAY_PUBLIC int GpuReductionAttr_setdims (GpuReductionAttr* grAttr, + unsigned maxSrcDims, + unsigned maxDstDims){ + grAttr->maxSrcDims = maxSrcDims; + grAttr->maxDstDims = maxDstDims; + + return GA_NO_ERROR; +} +GPUARRAY_PUBLIC int GpuReductionAttr_sets0type (GpuReductionAttr* grAttr, + int s0Typecode){ + switch(grAttr->op){ + case GA_REDUCE_AND: + case GA_REDUCE_OR: + case GA_REDUCE_XOR: + if (reduxIsFloatingPoint(s0Typecode)){ + /* Bitwise operations not applicable to floating-point datatypes! */ + return GA_INVALID_ERROR; + } + break; + default: + break; + } + + grAttr->s0Typecode = s0Typecode; + + return GA_NO_ERROR; +} +GPUARRAY_PUBLIC int GpuReductionAttr_setd0type (GpuReductionAttr* grAttr, + int d0Typecode){ + grAttr->d0Typecode = d0Typecode; + + return GA_NO_ERROR; +} +GPUARRAY_PUBLIC int GpuReductionAttr_setd1type (GpuReductionAttr* grAttr, + int d1Typecode){ + grAttr->d1Typecode = d1Typecode; + + return GA_NO_ERROR; +} +GPUARRAY_PUBLIC int GpuReductionAttr_seti0type (GpuReductionAttr* grAttr, + int i0Typecode){ + grAttr->i0Typecode = i0Typecode; + + return GA_NO_ERROR; +} +GPUARRAY_PUBLIC int GpuReductionAttr_appendopname (GpuReductionAttr* grAttr, + size_t n, + char* name){ + switch(grAttr->op){ + case GA_REDUCE_COPY: return snprintf(name, n, "Copy_%d", grAttr->maxSrcDims); + case GA_REDUCE_SUM: return snprintf(name, n, "Sum_%d_%d", grAttr->maxSrcDims, grAttr->maxDstDims); + case GA_REDUCE_PROD: return snprintf(name, n, "Prod_%d_%d", grAttr->maxSrcDims, grAttr->maxDstDims); + case GA_REDUCE_PRODNZ: return snprintf(name, n, "ProdNonZero_%d_%d", grAttr->maxSrcDims, grAttr->maxDstDims); + case GA_REDUCE_MIN: return snprintf(name, n, "Min_%d_%d", grAttr->maxSrcDims, grAttr->maxDstDims); + case GA_REDUCE_MAX: return snprintf(name, n, "Max_%d_%d", grAttr->maxSrcDims, grAttr->maxDstDims); + case GA_REDUCE_ARGMIN: return snprintf(name, n, "Argmin_%d_%d", grAttr->maxSrcDims, grAttr->maxDstDims); + case GA_REDUCE_ARGMAX: return snprintf(name, n, "Argmax_%d_%d", grAttr->maxSrcDims, grAttr->maxDstDims); + case GA_REDUCE_MINANDARGMIN: return snprintf(name, n, "MinAndArgmin_%d_%d", grAttr->maxSrcDims, grAttr->maxDstDims); + case GA_REDUCE_MAXANDARGMAX: return snprintf(name, n, "MaxAndArgmax_%d_%d", grAttr->maxSrcDims, grAttr->maxDstDims); + case GA_REDUCE_AND: return snprintf(name, n, "And_%d_%d", grAttr->maxSrcDims, grAttr->maxDstDims); + case GA_REDUCE_OR: return snprintf(name, n, "Or_%d_%d", grAttr->maxSrcDims, grAttr->maxDstDims); + case GA_REDUCE_XOR: return snprintf(name, n, "Xor_%d_%d", grAttr->maxSrcDims, grAttr->maxDstDims); + case GA_REDUCE_ALL: return snprintf(name, n, "All_%d_%d", grAttr->maxSrcDims, grAttr->maxDstDims); + case GA_REDUCE_ANY: return snprintf(name, n, "Any_%d_%d", grAttr->maxSrcDims, grAttr->maxDstDims); + default: if(name && n>0){*name = '\0';} return GA_INVALID_ERROR; + } +} +GPUARRAY_PUBLIC int GpuReductionAttr_issensitive (const GpuReductionAttr* grAttr){ + /** + * @brief Returns whether the reduction is "sensitive". + * + * A reduction is sensitive when its output satisfies at least one of the + * following conditions: + * + * - It depends on the exact order of axes in the reduxList + * - It depends on exact signs of the strides of axes in the reduxList + * + * Such sensitivity may prevent a flattening of contiguous axes even when it + * would have been otherwise permitted. 
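A tiny worked case of such sensitivity, assuming an argmax over both axes of a 2x2 tensor (numbers purely illustrative):

    x = [[1, 9],
         [4, 2]]
    reduxList = [0, 1]:  flattened argument = 0*dims[1] + 1 = 1
    reduxList = [1, 0]:  flattened argument = 1*dims[0] + 0 = 2

The reduced value (9) is identical either way, but the reported argument is not, so the axis list cannot be reordered or reversed without changing the result.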
+ * + * For instance, ARGMIN/ARGMAX have this sensitivity, because the dstArg + * tensor's contents are flattened coordinates into the source tensor, and + * the flattening order is precisely reduxList. Permuting it would thus produce + * incorrect output. Moreover, if the strides of a reduction axis were to be + * reversed for the purpose of flattening the axis into another, the computed + * coordinate would again be incorrect. + * + * + * TL;DR: Reduction is sensitive if + * reduce(x, axis=axisList) != reduce(x, axis=axisList[::-1]) + * or + * reduce(x) != reduce(x[::-1]) + * . + */ + + switch (grAttr->op){ + case GA_REDUCE_MINANDARGMIN: + case GA_REDUCE_MAXANDARGMAX: + case GA_REDUCE_ARGMIN: + case GA_REDUCE_ARGMAX: + return 1; + default: + return 0; + } +} +GPUARRAY_PUBLIC int GpuReductionAttr_requiresS0 (const GpuReductionAttr* grAttr){ + switch(grAttr->op){ + default: return 1; + } +} +GPUARRAY_PUBLIC int GpuReductionAttr_requiresD0 (const GpuReductionAttr* grAttr){ + switch (grAttr->op){ + case GA_REDUCE_ARGMIN: + case GA_REDUCE_ARGMAX: + return 0; + default: + return 1; + } +} +GPUARRAY_PUBLIC int GpuReductionAttr_requiresD1 (const GpuReductionAttr* grAttr){ + switch (grAttr->op){ + case GA_REDUCE_MINANDARGMIN: + case GA_REDUCE_MAXANDARGMAX: + case GA_REDUCE_ARGMIN: + case GA_REDUCE_ARGMAX: + return 1; + default: + return 0; + } +} +GPUARRAY_PUBLIC void GpuReductionAttr_free (GpuReductionAttr* grAttr){ + free(grAttr); +} +GPUARRAY_PUBLIC int GpuReduction_new (GpuReduction** gr, + const GpuReductionAttr* grAttr){ + if (!gr){ + return GA_INVALID_ERROR; + } + if (!grAttr){ + *gr = NULL; return GA_INVALID_ERROR; } - *grOut = calloc(1, sizeof(**grOut)); - if (*grOut){ - (*grOut)->gpuCtx = gpuCtx; - (*grOut)->op = op; - (*grOut)->ndd = (int)ndf; - (*grOut)->ndr = (int)ndr; - (*grOut)->TS0tc = s0TypeCode; - (*grOut)->flags = flags; + *gr = calloc(1, sizeof(**gr)); + if (*gr){ + (*gr)->grAttr = *grAttr; + (*gr)->gpuCtx = grAttr->gpuCtx; + (*gr)->op = grAttr->op; + (*gr)->nds = (int)grAttr->maxSrcDims; + (*gr)->ndd = (int)grAttr->maxDstDims; + (*gr)->ndr = (int)(grAttr->maxSrcDims-grAttr->maxDstDims); - return reduxGenInit(*grOut); + return reduxGenInit(*gr); }else{ return GA_MEMORY_ERROR; } } -GPUARRAY_PUBLIC void GpuReduction_free (GpuReduction* gr){ +GPUARRAY_PUBLIC void GpuReduction_free (GpuReduction* gr){ reduxGenCleanup(gr, !GA_NO_ERROR); } -GPUARRAY_PUBLIC int GpuReduction_call (const GpuReduction* gr, - GpuArray* d0, - GpuArray* d1, - const GpuArray* s0, - unsigned reduxLen, - const int* reduxList, - int flags){ +GPUARRAY_PUBLIC int GpuReduction_call (const GpuReduction* gr, + GpuArray* d0, + GpuArray* d1, + const GpuArray* s0, + unsigned reduxLen, + const int* reduxList, + int flags){ redux_ctx ctxSTACK, *ctx = &ctxSTACK; memset(ctx, 0, sizeof(*ctx)); @@ -792,69 +969,6 @@ static int reduxGetOrInit (int typecode, const char** pro return GA_NO_ERROR; } -/** - * @brief Returns whether the reduction is "sensitive". - * - * A reduction is sensitive when its output satisfies at least one of the - * following conditions: - * - * - It depends on the exact order of axes in the reduxList - * - It depends on exact signs of the strides of axes in the reduxList - * - * Such sensitivity may prevent a flattening of contiguous axes even when it - * would have been otherwise permitted. - * - * For instance, ARGMIN/ARGMAX have this sensitivity, because the dstArg - * tensor's contents are flattened coordinates into the source tensor, and - * the flattening order is precisely reduxList. 
Permuting it would thus produce - * incorrect output. Moreover, if the strides of a reduction axis were to be - * reversed for the purpose of flattening the axis into another, the computed - * coordinate would again be incorrect. - * - * - * TL;DR: Reduction is sensitive if - * reduce(x, axis=axisList) != reduce(x, axis=axisList[::-1]) - * or - * reduce(x) != reduce(x[::-1]) - * . - */ - -static int reduxIsSensitive (int op){ - switch (op){ - case GA_REDUCE_MINANDARGMIN: - case GA_REDUCE_MAXANDARGMAX: - case GA_REDUCE_ARGMIN: - case GA_REDUCE_ARGMAX: - return 1; - default: - return 0; - } -} - -/** - * Get a name for the op, usable within a C identifier. - */ - -static const char* reduxGetOpName (int op){ - switch (op){ - case GA_REDUCE_SUM: return "Sum"; - case GA_REDUCE_PROD: return "Prod"; - case GA_REDUCE_PRODNZ: return "ProdNonZero"; - case GA_REDUCE_MIN: return "Min"; - case GA_REDUCE_MAX: return "Max"; - case GA_REDUCE_ARGMIN: return "Argmin"; - case GA_REDUCE_ARGMAX: return "Argmax"; - case GA_REDUCE_MINANDARGMIN: return "MinAndArgmin"; - case GA_REDUCE_MAXANDARGMAX: return "MaxAndArgmax"; - case GA_REDUCE_AND: return "And"; - case GA_REDUCE_OR: return "Or"; - case GA_REDUCE_XOR: return "Xor"; - case GA_REDUCE_ALL: return "All"; - case GA_REDUCE_ANY: return "Any"; - default: return NULL; - } -} - /** * Whether or not the typecode is a floating-point type. */ @@ -1361,7 +1475,7 @@ static int reduxTryFlattenInto (redux_ctx* ctx, reverseD0 = signD0 && reduxInvRequiresD0(ctx); reverseD1 = signD1 && reduxInvRequiresD1(ctx); - if (reduxIsSensitive(ctx->op)){ + if (GpuReductionAttr_issensitive(&ctx->gr->grAttr)){ if (reverseS0 || reverseD0 || reverseD1){ return 0; } @@ -1441,30 +1555,6 @@ static int reduxGenInit (GpuReduction* gr){ static int reduxGenInferProperties (GpuReduction* gr){ int i; - - /** - * Insane arguments? - */ - - if (gr->op < 0 || gr->op >= GA_REDUCE_ENDSUPPORTED){ - return reduxGenCleanupMsg(gr, GA_INVALID_ERROR, - "Unknown reduction operation!\n"); - } - if (gr->ndr <= 0){ - return reduxGenCleanupMsg(gr, GA_INVALID_ERROR, - "No reduction axes!\n"); - } - if (gr->ndd < 0){ - return reduxGenCleanupMsg(gr, GA_INVALID_ERROR, - "Destination tensor has less than 0 rank!\n"); - } - if (gr->flags != 0){ - return reduxGenCleanupMsg(gr, GA_INVALID_ERROR, - "\"flags\" must be set to 0!\n"); - } - gr->nds = gr->ndr+gr->ndd; - - /** * Source code buffer preallocation failed? */ @@ -1476,46 +1566,20 @@ static int reduxGenInferProperties (GpuReduction* gr){ srcbInit(&gr->srcGen, &gr->s); - /** - * GPU context non-existent, or cannot read its properties? - */ - - if (!gr->gpuCtx || - gpucontext_property(gr->gpuCtx, GA_CTX_PROP_NUMPROCS, &gr->numProcs) != GA_NO_ERROR || - gpucontext_property(gr->gpuCtx, GA_CTX_PROP_MAXLSIZE, &gr->maxLg) != GA_NO_ERROR || - gpucontext_property(gr->gpuCtx, GA_CTX_PROP_MAXLSIZE0, &gr->maxL0) != GA_NO_ERROR || - gpucontext_property(gr->gpuCtx, GA_CTX_PROP_MAXGSIZE, &gr->maxGg) != GA_NO_ERROR || - gpucontext_property(gr->gpuCtx, GA_CTX_PROP_MAXGSIZE0, &gr->maxG0) != GA_NO_ERROR || - gpucontext_property(gr->gpuCtx, GA_CTX_PROP_LMEMSIZE, &gr->maxLM) != GA_NO_ERROR ){ - return reduxGenCleanupMsg(gr, GA_INVALID_ERROR, - "Error obtaining one or more properties from GPU context!\n"); - } - - /** * Type management. * - * - Deal with the various typecodes. + * Read out the various typecodes from the attributes. 
*/ - - gr->TD0tc = gr->TS0tc; - gr->TD1tc = GA_SSIZE; + + gr->TS0tc = gr->grAttr.s0Typecode; + gr->TD0tc = gr->grAttr.d0Typecode; + gr->TD1tc = gr->grAttr.d1Typecode; + gr->TI0tc = gr->grAttr.i0Typecode; gr->TS32tc = GA_INT; gr->TU32tc = GA_UINT; gr->TS64tc = GA_LONG; gr->TU64tc = GA_ULONG; - switch(gr->op){ - case GA_REDUCE_AND: - case GA_REDUCE_OR: - case GA_REDUCE_XOR: - if (reduxIsFloatingPoint(gr->TS0tc)){ - return reduxGenCleanupMsg(gr, GA_INVALID_ERROR, - "Bitwise operations not applicable to floating-point datatypes!\n"); - } - break; - default: - break; - } reduxGenSetKTypes(gr); @@ -1545,9 +1609,9 @@ static int reduxGenInferProperties (GpuReduction* gr){ */ static void reduxGenSetMaxBS (GpuReduction* gr){ - gr->maxBS = gr->maxLM/reduxGenGetReduxStateSize(gr); - gr->maxBS = gr->maxBS < gr->maxLg ? gr->maxBS : gr->maxLg; - gr->maxBS = gr->maxBS < gr->maxL0 ? gr->maxBS : gr->maxL0; + gr->maxBS = gr->grAttr.maxLM/reduxGenGetReduxStateSize(gr); + gr->maxBS = gr->maxBS < gr->grAttr.maxLg ? gr->maxBS : gr->grAttr.maxLg; + gr->maxBS = gr->maxBS < gr->grAttr.maxL0 ? gr->maxBS : gr->grAttr.maxL0; /** * In practice we want a moderate amount of blocks, not just one monolith @@ -1588,8 +1652,9 @@ static void reduxGenSetMaxBS (GpuReduction* gr){ * In the future this might become wierder when the accumulator is a Kahan * summation, for instance, and then TK0 != promoted(TS0). * - * If the user guaranteed to us that TK1 can be made narrower than 64-bit - * unsigned through, perhaps, a flag, this is also where we set it. + * If the user guaranteed to us through gr->grAttr that TK1 can be made + * narrower than 64-bit, this is also where we'd take this into account. + * For now we default TK1 to exactly TI0. */ static void reduxGenSetKTypes (GpuReduction* gr){ @@ -1627,7 +1692,7 @@ static void reduxGenSetKTypes (GpuReduction* gr){ * they want. 
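The Kahan-summation accumulator mentioned above is one case where TK0 would stop being a plain promoted(TS0): the reduction state needs a second, compensation field. A purely illustrative sketch of such an accumulator, not part of this patch:

    typedef struct {
        float s;   /* running sum                */
        float c;   /* running compensation term  */
    } KahanF32;

    static void kahan_add(KahanF32* k, float v){
        float y = v - k->c;     /* apply the stored compensation         */
        float t = k->s + y;     /* add; low-order bits of y may be lost  */
        k->c = (t - k->s) - y;  /* recover the lost part                 */
        k->s = t;               /* commit the new running sum            */
    }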
*/ - switch (gr->op){ + switch (gr->grAttr.op){ case GA_REDUCE_SUM: TK0 = TPS0; reduxGetSumInit (TK0->typecode, &TK0init); @@ -1649,7 +1714,7 @@ static void reduxGenSetKTypes (GpuReduction* gr){ case GA_REDUCE_ARGMIN: case GA_REDUCE_MIN: TK0 = TPS0; - TK1 = gpuarray_get_type(GA_SIZE); + TK1 = gpuarray_get_type(gr->TI0tc); reduxGetMinInit (TK0->typecode, &TK0init); gr->TK0.align = TK0->align; gr->TK0.size = TK0->size; @@ -1664,7 +1729,7 @@ static void reduxGenSetKTypes (GpuReduction* gr){ case GA_REDUCE_ARGMAX: case GA_REDUCE_MAX: TK0 = TPS0; - TK1 = gpuarray_get_type(GA_SIZE); + TK1 = gpuarray_get_type(gr->TI0tc); reduxGetMaxInit (TK0->typecode, &TK0init); gr->TK0.align = TK0->align; gr->TK0.size = TK0->size; @@ -1807,8 +1872,7 @@ static void reduxGenIterArgs (const GpuReduction* gr, */ static int reduxGenSrc (GpuReduction* gr){ - sprintf(gr->kName, "reduxKernel%s_f%d_r%d", - reduxGetOpName(gr->op), gr->ndd, gr->ndr); + GpuReductionAttr_appendopname(&gr->grAttr, sizeof(gr->kName), gr->kName); reduxGenSrcAppend(gr); @@ -1855,6 +1919,9 @@ static void reduxGenSrcAppendMacroTypedefs(GpuReduction* gr){ if (reduxGenRequiresD1(gr)){ srcbAppendf(&gr->srcGen, "typedef %-20s TD1;\n", gpuarray_get_type(gr->TD1tc )->cluda_name); } + if (reduxGenKernelRequiresLatticeI0(gr)){ + srcbAppendf(&gr->srcGen, "typedef %-20s TI0;\n", gpuarray_get_type(gr->TI0tc )->cluda_name); + } srcbAppendf(&gr->srcGen, "typedef %-20s TS32;\n", gpuarray_get_type(gr->TS32tc)->cluda_name); srcbAppendf(&gr->srcGen, "typedef %-20s TU32;\n", gpuarray_get_type(gr->TU32tc)->cluda_name); srcbAppendf(&gr->srcGen, "typedef %-20s TS64;\n", gpuarray_get_type(gr->TS64tc)->cluda_name); @@ -1921,7 +1988,7 @@ static void reduxGenSrcAppendMacroTypedefs(GpuReduction* gr){ * flattened index i into reduction states V and I respectively. 
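The switch below emits one such REDUX definition per operation; the SUM case, shown in this hunk, is simply (V) += (v). For the max-and-argmax family the fold must update both the value and the index state; a sketch of what that expansion plausibly looks like (the exact generated text for the other ops is not part of this hunk):

    #define REDUX(V, I, v, i) do{        \
            if((v) > (V)){               \
                (V) = (v);               \
                (I) = (i);               \
            }                            \
        }while(0)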
*/ - switch (gr->op){ + switch (gr->grAttr.op){ case GA_REDUCE_SUM: srcbAppendf(&gr->srcGen, "#define REDUX(V, I, v, i) do{ \\\n" " (V) += (v); \\\n" @@ -2180,8 +2247,13 @@ static void reduxGenSrcAppendDecode (GpuReduction* gr){ int i; srcbAppends(&gr->srcGen, - " GA_DECL_SHARED_BODY(char, SHMEM)\n" - " DECLREDUXSTATE(tmpK0, I0)\n" + " GA_DECL_SHARED_BODY(char, SHMEM)\n"); + if (reduxGenKernelRequiresLatticeI0(gr)){ + srcbAppends(&gr->srcGen, + " TI0 I0;\n"); + } + srcbAppends(&gr->srcGen, + " TK0 tmpK0;\n" " DECLREDUXSTATE(K0, K1)\n" " INITREDUXSTATE(K0, K1);\n" " \n" @@ -2443,15 +2515,17 @@ static void reduxGenSrcAppendDecode (GpuReduction* gr){ } srcbEndList(&gr->srcGen); srcbAppends(&gr->srcGen, ";\n" + " local_barrier();\n" " if(perm < D){\n" - " ((TS64*)SHMEM)[perm] = D0Off;\n" + " ((TS64*)SHMEM)[perm] = D0Off;\n" " }\n" - " local_barrier();\n" - " if(LID_0 < D){\n" - " D0Off = ((TS64*)SHMEM)[LID_0];\n" + " if(LID_0 >= D){\n" + " ((TS64*)SHMEM)[LID_0] = 0;\n" " }\n" " local_barrier();\n" - " D0 += D0Off;\n"); + " D0Off = ((TS64*)SHMEM)[LID_0];\n" + " D0 += D0Off;\n" + " local_barrier();\n"); } @@ -2477,15 +2551,17 @@ static void reduxGenSrcAppendDecode (GpuReduction* gr){ } srcbEndList(&gr->srcGen); srcbAppends(&gr->srcGen, ";\n" + " local_barrier();\n" " if(perm < D){\n" - " ((TS64*)SHMEM)[perm] = D1Off;\n" + " ((TS64*)SHMEM)[perm] = D1Off;\n" " }\n" - " local_barrier();\n" - " if(LID_0 < D){\n" - " D1Off = ((TS64*)SHMEM)[LID_0];\n" + " if(LID_0 >= D){\n" + " ((TS64*)SHMEM)[LID_0] = 0;\n" " }\n" " local_barrier();\n" - " D1 += D1Off;\n"); + " D1Off = ((TS64*)SHMEM)[LID_0];\n" + " D1 += D1Off;\n" + " local_barrier();\n"); } @@ -2531,6 +2607,13 @@ static void reduxGenSrcAppendDecode (GpuReduction* gr){ " TK1* restrict const W1R = &W1[GDIM_0*D];\n" " TK1* restrict const SHMEMK1 = (TK1*)(SHMEM + SHMEMK1Off);\n"); } + srcbAppends(&gr->srcGen, + " INITREDUXSTATE(W0L[LID_0], W1L[LID_0]);\n" + " INITREDUXSTATE(W0R[LID_0], W1R[LID_0]);\n" + " if(D < LDIM_0 && LID_0+Dndd) ? "break" : "continue"; + const char* breakOrCont = (initial) && (axis < gr->ndd) ? "break " : "continue"; /* Pointer bumps */ srcbAppends(&gr->srcGen, " "); @@ -3023,7 +3106,7 @@ static size_t reduxGenEstimateParallelism (const GpuReduction* gr){ */ size_t marginFactor = 16; - return marginFactor * gr->numProcs * gr->maxLg; + return marginFactor * gr->grAttr.numProcs * gr->grAttr.maxLg; } /** @@ -3081,28 +3164,13 @@ static size_t reduxGenEstimateParallelism (const GpuReduction* gr){ */ static int reduxGenRequiresS0 (const GpuReduction* gr){ - (void)gr; - return 1; + return GpuReductionAttr_requiresS0(&gr->grAttr); } static int reduxGenRequiresD0 (const GpuReduction* gr){ - switch (gr->op){ - case GA_REDUCE_ARGMIN: - case GA_REDUCE_ARGMAX: - return 0; - default: - return 1; - } + return GpuReductionAttr_requiresD0(&gr->grAttr); } static int reduxGenRequiresD1 (const GpuReduction* gr){ - switch (gr->op){ - case GA_REDUCE_MINANDARGMIN: - case GA_REDUCE_MAXANDARGMAX: - case GA_REDUCE_ARGMIN: - case GA_REDUCE_ARGMAX: - return 1; - default: - return 0; - } + return GpuReductionAttr_requiresD1(&gr->grAttr); } static int reduxGenKernelRequiresLatticeS0(const GpuReduction* gr){ return reduxGenRequiresS0(gr); @@ -3254,10 +3322,6 @@ static size_t reduxGenGetWMEMK1Off (const GpuReduction* gr, size */ static int reduxInvInit (redux_ctx* ctx){ - /** - * We initialize certain parts of the context. 
- */ - ctx->L = ctx->Li = NULL; ctx->S0J = ctx->S0Si = NULL; ctx->D0J = ctx->D0Si = NULL; @@ -3281,7 +3345,7 @@ static int reduxInvInit (redux_ctx* ctx){ * @brief Begin inferring the properties of the reduction invocation. */ -static int reduxInvInferProperties (redux_ctx* ctx){ +static int reduxInvInferProperties (redux_ctx* ctx){ axis_desc* a; int i, j; size_t d; @@ -3320,7 +3384,8 @@ static int reduxInvInferProperties (redux_ctx* ctx){ ctx->nds0 = reduxInvRequiresS0(ctx) ? ctx->s0->nd : 0; ctx->nds0r = ctx->reduxLen; ctx->ndd0 = ctx->nds0 - ctx->nds0r; - ctx->ndfs0 = ctx->ndfs0r = ctx->ndfd0 = 0; + ctx->ndfs0 = ctx->nds0; + /* Insane reduxList? */ for (i=0;inds0r;i++){ @@ -3450,7 +3515,10 @@ static int reduxInvInferProperties (redux_ctx* ctx){ ctx->D1Off = ctx->d1->offset; } - return reduxInvFlattenSource(ctx); + + return ctx->flags & 0 ? //FIXME: Delete this hack after debugging. + reduxInvFlattenSource (ctx): + reduxInvComputeKernelArgs(ctx); } /** @@ -3464,8 +3532,6 @@ static int reduxInvFlattenSource (redux_ctx* ctx){ axis_desc* axis, *flatAxis, *sortAxis; int i, j, k, isSensitive; - ctx->ndfs0 = ctx->nds0; - /** * Pass 1: Flatten out 0- and 1-length axes. We already know that * @@ -3502,7 +3568,7 @@ static int reduxInvFlattenSource (redux_ctx* ctx){ */ k = ctx->ndfs0; - isSensitive = reduxIsSensitive(ctx->op); + isSensitive = GpuReductionAttr_issensitive(&ctx->gr->grAttr); qsort(ctx->xdSrc, ctx->ndfs0, sizeof(*ctx->xdSrc), isSensitive ? reduxSortFlatSensitive : reduxSortFlatInsensitive); for (i=j=1;indfs0;i++){ @@ -3517,19 +3583,6 @@ static int reduxInvFlattenSource (redux_ctx* ctx){ } ctx->ndfs0 = k; - - /** - * Compute number of flattened free and reduced axes. - */ - - for (ctx->ndfs0r=ctx->ndfd0=i=0;indfs0;i++){ - if (axisIsReduced(reduxInvGetSrcAxis(ctx, i))){ - ctx->ndfs0r++; - }else{ - ctx->ndfd0++; - } - } - return reduxInvComputeKernelArgs(ctx); } diff --git a/tests/check_reduction.c b/tests/check_reduction.c index 7a2141cfae..8e5eef93e4 100644 --- a/tests/check_reduction.c +++ b/tests/check_reduction.c @@ -16,6 +16,7 @@ void teardown(void); /* Defines */ +#define MAXERRPRINT 2 #define ga_assert_ok(e) ck_assert_int_eq(e, GA_NO_ERROR) @@ -74,18 +75,19 @@ START_TEST(test_maxandargmax_reduction){ * third dimensions. 
*/ + size_t errCnt = 0; size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; const int reduxList[] = {0,2}; - float *pSrc = calloc(sizeof(*pSrc), prodDims); - float *pMax = calloc(sizeof(*pMax), dims[1]); - unsigned long *pArgmax = calloc(sizeof(*pArgmax), dims[1]); + float* pS0 = calloc(1, sizeof(*pS0) * dims[0]*dims[1]*dims[2]); + float* pD0 = calloc(1, sizeof(*pD0) * dims[1] ); + size_t* pD1 = calloc(1, sizeof(*pD1) * dims[1] ); - ck_assert_ptr_ne(pSrc, NULL); - ck_assert_ptr_ne(pMax, NULL); - ck_assert_ptr_ne(pArgmax, NULL); + ck_assert_ptr_nonnull(pS0); + ck_assert_ptr_nonnull(pD0); + ck_assert_ptr_nonnull(pD1); /** @@ -93,7 +95,7 @@ START_TEST(test_maxandargmax_reduction){ */ for(i=0;i gtMax){ - gtMax = v; - gtArgmax = i*dims[2] + k; + if(v > gtD0){ + gtD0 = v; + gtD1 = i*dims[2] + k; } } } - if(gtMax != pMax[j]){ - fprintf(stderr, "Mismatch GT %f != %f UUT @ %zu!\n", - gtMax, pMax[j], j); - fflush(stderr); - } - if(gtArgmax != pArgmax[j]){ - fprintf(stderr, "Mismatch GT %zu != %zu UUT @ %zu!\n", - gtArgmax, pArgmax[j], j); - fflush(stderr); + if(gtD0 != pD0[j] || gtD1 != pD1[j]){ + errCnt++; + if(errCnt < MAXERRPRINT){ + fprintf(stderr, "%s:%d: Mismatch GT %f[%zu] != %f[%zu] UUT @ %zu!\n", + __func__, __LINE__, gtD0, gtD1, pD0[j], pD1[j], j); + fflush (stderr); + } } - ck_assert_msg(gtMax == pMax[j], "Max value mismatch!"); - ck_assert_msg(gtArgmax == pArgmax[j], "Argmax value mismatch!"); } + ck_assert_msg(errCnt == 0, "%zu mismatches!", errCnt); /** * Deallocate. */ - free(pSrc); - free(pMax); - free(pArgmax); - GpuArray_clear(&gaSrc); - GpuArray_clear(&gaMax); - GpuArray_clear(&gaArgmax); + free(pS0); + free(pD0); + free(pD1); + GpuArray_clear(&gaS0); + GpuArray_clear(&gaD0); + GpuArray_clear(&gaD1); }END_TEST START_TEST(test_maxandargmax_idxtranspose){ @@ -178,7 +185,8 @@ START_TEST(test_maxandargmax_idxtranspose){ * transposition of the argmax "coordinates" and thus a change in its * "flattened" output version. */ - + + size_t errCnt = 0; size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; @@ -186,13 +194,13 @@ START_TEST(test_maxandargmax_idxtranspose){ size_t rdxProdDims = rdxDims[0]; const int reduxList[] = {2,0}; - float *pSrc = calloc(sizeof(*pSrc), prodDims); - float *pMax = calloc(sizeof(*pMax), rdxProdDims); - unsigned long *pArgmax = calloc(sizeof(*pArgmax), rdxProdDims); + float* pS0 = calloc(1, sizeof(*pS0) * prodDims); + float* pD0 = calloc(1, sizeof(*pD0) * rdxProdDims); + size_t* pD1 = calloc(1, sizeof(*pD1) * rdxProdDims); - ck_assert_ptr_ne(pSrc, NULL); - ck_assert_ptr_ne(pMax, NULL); - ck_assert_ptr_ne(pArgmax, NULL); + ck_assert_ptr_nonnull(pS0); + ck_assert_ptr_nonnull(pD0); + ck_assert_ptr_nonnull(pD1); /** @@ -200,7 +208,7 @@ START_TEST(test_maxandargmax_idxtranspose){ */ for(i=0;i gtMax){ - gtMax = v; - gtArgmax = k*dims[0] + i; + if(v > gtD0){ + gtD0 = v; + gtD1 = k*dims[0] + i; } } } - ck_assert_msg(gtMax == pMax[j], "Max value mismatch!"); - ck_assert_msg(gtArgmax == pArgmax[j], "Argmax value mismatch!"); + if(gtD0 != pD0[j] || gtD1 != pD1[j]){ + errCnt++; + if(errCnt < MAXERRPRINT){ + fprintf(stderr, "%s:%d: Mismatch GT %f[%zu] != %f[%zu] UUT @ %zu!\n", + __func__, __LINE__, gtD0, gtD1, pD0[j], pD1[j], j); + fflush (stderr); + } + } } + ck_assert_msg(errCnt == 0, "%zu mismatches!", errCnt); /** * Deallocate. 
*/ - free(pSrc); - free(pMax); - free(pArgmax); - GpuArray_clear(&gaSrc); - GpuArray_clear(&gaMax); - GpuArray_clear(&gaArgmax); + free(pS0); + free(pD0); + free(pD1); + GpuArray_clear(&gaS0); + GpuArray_clear(&gaD0); + GpuArray_clear(&gaD1); }END_TEST START_TEST(test_maxandargmax_bigdestination){ @@ -273,19 +296,20 @@ START_TEST(test_maxandargmax_bigdestination){ * We test here a reduction of some random 3D tensor on the first and * third dimensions. */ - + + size_t errCnt = 0; size_t i,j; size_t dims[2] = {2,131072}; size_t prodDims = dims[0]*dims[1]; const int reduxList[] = {0}; - float* pSrc = calloc(1, sizeof(*pSrc) * dims[0]*dims[1]); - float* pMax = calloc(1, sizeof(*pMax) * dims[1]); - size_t* pArgmax = calloc(1, sizeof(*pArgmax) * dims[1]); + float* pS0 = calloc(1, sizeof(*pS0) * dims[0]*dims[1]); + float* pD0 = calloc(1, sizeof(*pD0) * dims[1]); + size_t* pD1 = calloc(1, sizeof(*pD1) * dims[1]); - ck_assert_ptr_ne(pSrc, NULL); - ck_assert_ptr_ne(pMax, NULL); - ck_assert_ptr_ne(pArgmax, NULL); + ck_assert_ptr_nonnull(pS0); + ck_assert_ptr_nonnull(pD0); + ck_assert_ptr_nonnull(pD1); /** @@ -293,7 +317,7 @@ START_TEST(test_maxandargmax_bigdestination){ */ for(i=0;i gtMax){ - gtMax = v; - gtArgmax = i; + if(v > gtD0){ + gtD0 = v; + gtD1 = i; } } - if(gtMax != pMax[j]){ - fprintf(stderr, "Mismatch GT %f != %f UUT @ %zu!\n", - gtMax, pMax[j], j); - fflush(stderr); - } - if(gtArgmax != pArgmax[j]){ - fprintf(stderr, "Mismatch GT %zu != %zu UUT @ %zu!\n", - gtArgmax, pArgmax[j], j); - fflush(stderr); + if(gtD0 != pD0[j] || gtD1 != pD1[j]){ + errCnt++; + if(errCnt < MAXERRPRINT){ + fprintf(stderr, "%s:%d: Mismatch GT %f[%zu] != %f[%zu] UUT @ %zu!\n", + __func__, __LINE__, gtD0, gtD1, pD0[j], pD1[j], j); + fflush (stderr); + } } - ck_assert_msg(gtMax == pMax[j], "Max value mismatch!"); - ck_assert_msg(gtArgmax == pArgmax[j], "Argmax value mismatch!"); } + ck_assert_msg(errCnt == 0, "%zu mismatches!", errCnt); /** * Deallocate. */ - free(pSrc); - free(pMax); - free(pArgmax); - GpuArray_clear(&gaSrc); - GpuArray_clear(&gaMax); - GpuArray_clear(&gaArgmax); + free(pS0); + free(pD0); + free(pD1); + GpuArray_clear(&gaS0); + GpuArray_clear(&gaD0); + GpuArray_clear(&gaD1); }END_TEST START_TEST(test_maxandargmax_veryhighrank){ @@ -374,6 +403,7 @@ START_TEST(test_maxandargmax_veryhighrank){ * Here we test a reduction of a random 8D tensor on four dimensions. 
*/ + size_t errCnt = 0; size_t i,j,k,l,m,n,o,p; size_t dims [8] = {1171,373,2,1,2,1,2,1}; size_t prodDims = dims[0]*dims[1]*dims[2]*dims[3]*dims[4]*dims[5]*dims[6]*dims[7]; @@ -381,13 +411,13 @@ START_TEST(test_maxandargmax_veryhighrank){ size_t rdxProdDims = rdxDims[0]*rdxDims[1]*rdxDims[2]*rdxDims[3]; const int reduxList[] = {2,4,7,5}; - float *pSrc = calloc(sizeof(*pSrc), prodDims); - float *pMax = calloc(sizeof(*pMax), rdxProdDims); - unsigned long *pArgmax = calloc(sizeof(*pArgmax), rdxProdDims); + float* pS0 = calloc(1, sizeof(*pS0) * prodDims); + float* pD0 = calloc(1, sizeof(*pD0) * rdxProdDims); + size_t* pD1 = calloc(1, sizeof(*pD1) * rdxProdDims); - ck_assert_ptr_ne(pSrc, NULL); - ck_assert_ptr_ne(pMax, NULL); - ck_assert_ptr_ne(pArgmax, NULL); + ck_assert_ptr_nonnull(pS0); + ck_assert_ptr_nonnull(pD0); + ck_assert_ptr_nonnull(pD1); /** @@ -395,7 +425,7 @@ START_TEST(test_maxandargmax_veryhighrank){ */ for(i=0;i gtMax){ - gtMax = v; - gtArgmax = (((k)*dims[4] + m)*dims[7] + p)*dims[5] + n; + if(v > gtD0){ + gtD0 = v; + gtD1 = (((k)*dims[4] + m)*dims[7] + p)*dims[5] + n; } } } @@ -453,24 +491,31 @@ START_TEST(test_maxandargmax_veryhighrank){ } size_t dstIdx = (((i)*dims[1] + j)*dims[3] + l)*dims[6] + o; - ck_assert_msg(gtMax == pMax[dstIdx], "Max value mismatch!"); - ck_assert_msg(gtArgmax == pArgmax[dstIdx], "Argmax value mismatch!"); + if(gtD0 != pD0[dstIdx] || gtD1 != pD1[dstIdx]){ + errCnt++; + if(errCnt < MAXERRPRINT){ + fprintf(stderr, "%s:%d: Mismatch GT %f[%zu] != %f[%zu] UUT @ %zu!\n", + __func__, __LINE__, gtD0, gtD1, pD0[dstIdx], pD1[dstIdx], dstIdx); + fflush (stderr); + } + } } } } } + ck_assert_msg(errCnt == 0, "%zu mismatches!", errCnt); /** * Deallocate. */ - free(pSrc); - free(pMax); - free(pArgmax); - GpuArray_clear(&gaSrc); - GpuArray_clear(&gaMax); - GpuArray_clear(&gaArgmax); + free(pS0); + free(pD0); + free(pD1); + GpuArray_clear(&gaS0); + GpuArray_clear(&gaD0); + GpuArray_clear(&gaD1); }END_TEST START_TEST(test_maxandargmax_alldimsreduced){ @@ -480,18 +525,19 @@ START_TEST(test_maxandargmax_alldimsreduced){ * We test here a reduction of some random 3D tensor on all dimensions. */ + size_t errCnt = 0; size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; const int reduxList[] = {0,1,2}; - float *pSrc = calloc(sizeof(*pSrc), prodDims); - float *pMax = calloc(1, sizeof(*pMax)); - unsigned long *pArgmax = calloc(1, sizeof(*pArgmax)); + float* pS0 = calloc(1, sizeof(*pS0) * dims[0]*dims[1]*dims[2]); + float* pD0 = calloc(1, sizeof(*pD0) ); + size_t* pD1 = calloc(1, sizeof(*pD1) ); - ck_assert_ptr_ne(pSrc, NULL); - ck_assert_ptr_ne(pMax, NULL); - ck_assert_ptr_ne(pArgmax, NULL); + ck_assert_ptr_nonnull(pS0); + ck_assert_ptr_nonnull(pD0); + ck_assert_ptr_nonnull(pD1); /** @@ -499,7 +545,7 @@ START_TEST(test_maxandargmax_alldimsreduced){ */ for(i=0;i gtMax){ - gtMax = v; - gtArgmax = (i*dims[1] + j)*dims[2] + k; + if(v > gtD0){ + gtD0 = v; + gtD1 = (i*dims[1] + j)*dims[2] + k; } } } } - - ck_assert_msg(gtMax == pMax[0], "Max value mismatch!"); - ck_assert_msg(gtArgmax == pArgmax[0], "Argmax value mismatch!"); + if(gtD0 != pD0[0] || gtD1 != pD1[0]){ + errCnt++; + if(errCnt < MAXERRPRINT){ + fprintf(stderr, "%s:%d: Mismatch GT %f[%zu] != %f[%zu] UUT @ %zu!\n", + __func__, __LINE__, gtD0, gtD1, pD0[0], pD1[0], (size_t)0); + fflush (stderr); + } + } + ck_assert_msg(errCnt == 0, "%zu mismatches!", errCnt); /** * Deallocate. 
*/ - free(pSrc); - free(pMax); - free(pArgmax); - GpuArray_clear(&gaSrc); - GpuArray_clear(&gaMax); - GpuArray_clear(&gaArgmax); + free(pS0); + free(pD0); + free(pD1); + GpuArray_clear(&gaS0); + GpuArray_clear(&gaD0); + GpuArray_clear(&gaD1); }END_TEST START_TEST(test_minandargmin_reduction){ @@ -573,18 +633,19 @@ START_TEST(test_minandargmin_reduction){ * third dimensions. */ + size_t errCnt = 0; size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; const int reduxList[] = {0,2}; - float* pSrc = calloc(1, sizeof(*pSrc) * dims[0]*dims[1]*dims[2]); - float* pMin = calloc(1, sizeof(*pMin) * dims[1] ); - size_t* pArgmin = calloc(1, sizeof(*pArgmin) * dims[1] ); + float* pS0 = calloc(1, sizeof(*pS0) * dims[0]*dims[1]*dims[2]); + float* pD0 = calloc(1, sizeof(*pD0) * dims[1] ); + size_t* pD1 = calloc(1, sizeof(*pD1) * dims[1] ); - ck_assert_ptr_ne(pSrc, NULL); - ck_assert_ptr_ne(pMin, NULL); - ck_assert_ptr_ne(pArgmin, NULL); + ck_assert_ptr_nonnull(pS0); + ck_assert_ptr_nonnull(pD0); + ck_assert_ptr_nonnull(pD1); /** @@ -592,7 +653,7 @@ START_TEST(test_minandargmin_reduction){ */ for(i=0;i gtMax){ - gtMax = v; - gtArgmax = i*dims[2] + k; + if(v > gtD0){ + gtD0 = v; + gtD1 = i*dims[2] + k; } } } - - ck_assert_msg(gtArgmax == pArgmax[j], "Argmax value mismatch!"); + + if(gtD1 != pD1[j]){ + errCnt++; + if(errCnt < MAXERRPRINT){ + fprintf(stderr, "%s:%d: Mismatch GT [%zu] != [%zu] UUT @ %zu!\n", + __func__, __LINE__, gtD1, pD1[j], j); + fflush (stderr); + } + } } + ck_assert_msg(errCnt == 0, "%zu mismatches!", errCnt); /** * Deallocate. */ - free(pSrc); - free(pMax); - free(pArgmax); - GpuArray_clear(&gaSrc); - GpuArray_clear(&gaArgmax); + free(pS0); + free(pD0); + free(pD1); + GpuArray_clear(&gaS0); + GpuArray_clear(&gaD1); }END_TEST START_TEST(test_argmax_veryhighrank){ @@ -950,6 +1073,7 @@ START_TEST(test_argmax_veryhighrank){ * Here we test a reduction of a random 8D tensor on four dimensions. */ + size_t errCnt = 0; size_t i,j,k,l,m,n,o,p; size_t dims [8] = {1171,373,2,1,2,1,2,1}; size_t prodDims = dims[0]*dims[1]*dims[2]*dims[3]*dims[4]*dims[5]*dims[6]*dims[7]; @@ -957,12 +1081,12 @@ START_TEST(test_argmax_veryhighrank){ size_t rdxProdDims = rdxDims[0]*rdxDims[1]*rdxDims[2]*rdxDims[3]; const int reduxList[] = {2,4,7,5}; - float* pSrc = calloc(1, sizeof(*pSrc) * prodDims); - float* pMax = calloc(1, sizeof(*pMax) * rdxProdDims); - size_t* pArgmax = calloc(1, sizeof(*pArgmax) * rdxProdDims); + float* pS0 = calloc(1, sizeof(*pS0) * prodDims); + float* pD0 = calloc(1, sizeof(*pD0) * rdxProdDims); + size_t* pD1 = calloc(1, sizeof(*pD1) * rdxProdDims); - ck_assert_ptr_ne(pSrc, NULL); - ck_assert_ptr_ne(pArgmax, NULL); + ck_assert_ptr_nonnull(pS0); + ck_assert_ptr_nonnull(pD1); /** @@ -970,7 +1094,7 @@ START_TEST(test_argmax_veryhighrank){ */ for(i=0;i gtMax){ - gtMax = v; - gtArgmax = (((k)*dims[4] + m)*dims[7] + p)*dims[5] + n; + if(v > gtD0){ + gtD0 = v; + gtD1 = (((k)*dims[4] + m)*dims[7] + p)*dims[5] + n; } } } @@ -1024,22 +1155,30 @@ START_TEST(test_argmax_veryhighrank){ } size_t dstIdx = (((i)*dims[1] + j)*dims[3] + l)*dims[6] + o; - ck_assert_msg(gtArgmax == pArgmax[dstIdx], "Argmax value mismatch!"); + if(gtD1 != pD1[dstIdx]){ + errCnt++; + if(errCnt < MAXERRPRINT){ + fprintf(stderr, "%s:%d: Mismatch GT [%zu] != [%zu] UUT @ %zu!\n", + __func__, __LINE__, gtD1, pD1[dstIdx], dstIdx); + fflush (stderr); + } + } } } } } + ck_assert_msg(errCnt == 0, "%zu mismatches!", errCnt); /** * Deallocate. 
*/ - free(pSrc); - free(pMax); - free(pArgmax); - GpuArray_clear(&gaSrc); - GpuArray_clear(&gaArgmax); + free(pS0); + free(pD0); + free(pD1); + GpuArray_clear(&gaS0); + GpuArray_clear(&gaD1); }END_TEST START_TEST(test_argmax_alldimsreduced){ @@ -1049,18 +1188,19 @@ START_TEST(test_argmax_alldimsreduced){ * We test here a reduction of some random 3D tensor on all dimensions. */ + size_t errCnt = 0; size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; const int reduxList[] = {0,1,2}; - float* pSrc = calloc(1, sizeof(*pSrc) * dims[0]*dims[1]*dims[2]); - float* pMax = calloc(1, sizeof(*pMax) ); - size_t* pArgmax = calloc(1, sizeof(*pArgmax) ); + float* pS0 = calloc(1, sizeof(*pS0) * dims[0]*dims[1]*dims[2]); + float* pD0 = calloc(1, sizeof(*pD0) ); + size_t* pD1 = calloc(1, sizeof(*pD1) ); - ck_assert_ptr_ne(pSrc, NULL); - ck_assert_ptr_ne(pMax, NULL); - ck_assert_ptr_ne(pArgmax, NULL); + ck_assert_ptr_nonnull(pS0); + ck_assert_ptr_nonnull(pD0); + ck_assert_ptr_nonnull(pD1); /** @@ -1068,7 +1208,7 @@ START_TEST(test_argmax_alldimsreduced){ */ for(i=0;i gtMax){ - gtMax = v; - gtArgmax = (i*dims[1] + j)*dims[2] + k; + if(v > gtD0){ + gtD0 = v; + gtD1 = (i*dims[1] + j)*dims[2] + k; } } } } - - ck_assert_msg(gtArgmax == pArgmax[0], "Argmax value mismatch!"); + if(gtD1 != pD1[0]){ + errCnt++; + if(errCnt < MAXERRPRINT){ + fprintf(stderr, "%s:%d: Mismatch GT [%zu] != [%zu] UUT @ %zu!\n", + __func__, __LINE__, gtD1, pD1[0], (size_t)0); + fflush (stderr); + } + } + ck_assert_msg(errCnt == 0, "%zu mismatches!", errCnt); /** * Deallocate. */ - free(pSrc); - free(pMax); - free(pArgmax); - GpuArray_clear(&gaSrc); - GpuArray_clear(&gaArgmax); + free(pS0); + free(pD0); + free(pD1); + GpuArray_clear(&gaS0); + GpuArray_clear(&gaD1); }END_TEST START_TEST(test_argmin_reduction){ @@ -1136,18 +1290,19 @@ START_TEST(test_argmin_reduction){ * third dimensions. */ + size_t errCnt = 0; size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; const int reduxList[] = {0,2}; - float* pSrc = calloc(1, sizeof(*pSrc) * dims[0]*dims[1]*dims[2]); - float* pMin = calloc(1, sizeof(*pMin) * dims[1] ); - size_t* pArgmin = calloc(1, sizeof(*pArgmin) * dims[1] ); + float* pS0 = calloc(1, sizeof(*pS0) * dims[0]*dims[1]*dims[2]); + float* pD0 = calloc(1, sizeof(*pD0) * dims[1] ); + size_t* pD1 = calloc(1, sizeof(*pD1) * dims[1] ); - ck_assert_ptr_ne(pSrc, NULL); - ck_assert_ptr_ne(pMin, NULL); - ck_assert_ptr_ne(pArgmin, NULL); + ck_assert_ptr_nonnull(pS0); + ck_assert_ptr_nonnull(pD0); + ck_assert_ptr_nonnull(pD1); /** @@ -1155,7 +1310,7 @@ START_TEST(test_argmin_reduction){ */ for(i=0;i gtMax){ - gtMax = v; + if(v > gtD0){ + gtD0 = v; } } } - - ck_assert_msg(gtMax == pMax[j], "Max value mismatch!"); + + if(gtD0 != pD0[j]){ + errCnt++; + if(errCnt < MAXERRPRINT){ + fprintf(stderr, "%s:%d: Mismatch GT %f != %f UUT @ %zu!\n", + __func__, __LINE__, gtD0, pD0[j], j); + fflush (stderr); + } + } } + ck_assert_msg(errCnt == 0, "%zu mismatches!", errCnt); /** * Deallocate. */ - free(pSrc); - free(pMax); - GpuArray_clear(&gaSrc); - GpuArray_clear(&gaMax); + free(pS0); + free(pD0); + GpuArray_clear(&gaS0); + GpuArray_clear(&gaD0); }END_TEST START_TEST(test_max_veryhighrank){ @@ -1488,6 +1705,7 @@ START_TEST(test_max_veryhighrank){ * Here we test a reduction of a random 8D tensor on four dimensions. 
*/ + size_t errCnt = 0; size_t i,j,k,l,m,n,o,p; size_t dims [8] = {1171,373,2,1,2,1,2,1}; size_t prodDims = dims[0]*dims[1]*dims[2]*dims[3]*dims[4]*dims[5]*dims[6]*dims[7]; @@ -1495,11 +1713,11 @@ START_TEST(test_max_veryhighrank){ size_t rdxProdDims = rdxDims[0]*rdxDims[1]*rdxDims[2]*rdxDims[3]; const int reduxList[] = {2,4,7,5}; - float* pSrc = calloc(1, sizeof(*pSrc) * prodDims); - float* pMax = calloc(1, sizeof(*pMax) * rdxProdDims); + float* pS0 = calloc(1, sizeof(*pS0) * prodDims); + float* pD0 = calloc(1, sizeof(*pD0) * rdxProdDims); - ck_assert_ptr_ne(pSrc, NULL); - ck_assert_ptr_ne(pMax, NULL); + ck_assert_ptr_nonnull(pS0); + ck_assert_ptr_nonnull(pD0); /** @@ -1507,7 +1725,7 @@ START_TEST(test_max_veryhighrank){ */ for(i=0;i gtMax){ - gtMax = v; + if(v > gtD0){ + gtD0 = v; } } } @@ -1559,21 +1784,29 @@ START_TEST(test_max_veryhighrank){ } size_t dstIdx = (((i)*dims[1] + j)*dims[3] + l)*dims[6] + o; - ck_assert_msg(gtMax == pMax[dstIdx], "Max value mismatch!"); + if(gtD0 != pD0[dstIdx]){ + errCnt++; + if(errCnt < MAXERRPRINT){ + fprintf(stderr, "%s:%d: Mismatch GT %f != %f UUT @ %zu!\n", + __func__, __LINE__, gtD0, pD0[dstIdx], dstIdx); + fflush (stderr); + } + } } } } } + ck_assert_msg(errCnt == 0, "%zu mismatches!", errCnt); /** * Deallocate. */ - free(pSrc); - free(pMax); - GpuArray_clear(&gaSrc); - GpuArray_clear(&gaMax); + free(pS0); + free(pD0); + GpuArray_clear(&gaS0); + GpuArray_clear(&gaD0); }END_TEST START_TEST(test_max_alldimsreduced){ @@ -1583,16 +1816,17 @@ START_TEST(test_max_alldimsreduced){ * We test here a reduction of some random 3D tensor on all dimensions. */ + size_t errCnt = 0; size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; const int reduxList[] = {0,1,2}; - float* pSrc = calloc(1, sizeof(*pSrc) * dims[0]*dims[1]*dims[2]); - float* pMax = calloc(1, sizeof(*pMax) ); + float* pS0 = calloc(1, sizeof(*pS0) * dims[0]*dims[1]*dims[2]); + float* pD0 = calloc(1, sizeof(*pD0) ); - ck_assert_ptr_ne(pSrc, NULL); - ck_assert_ptr_ne(pMax, NULL); + ck_assert_ptr_nonnull(pS0); + ck_assert_ptr_nonnull(pD0); /** @@ -1600,7 +1834,7 @@ START_TEST(test_max_alldimsreduced){ */ for(i=0;i gtMax){ - gtMax = v; + if(v > gtD0){ + gtD0 = v; } } } } - - ck_assert_msg(gtMax == pMax[0], "Max value mismatch!"); + if(gtD0 != pD0[0]){ + errCnt++; + if(errCnt < MAXERRPRINT){ + fprintf(stderr, "%s:%d: Mismatch GT %f != %f UUT @ %zu!\n", + __func__, __LINE__, gtD0, pD0[0], (size_t)0); + fflush (stderr); + } + } + ck_assert_msg(errCnt == 0, "%zu mismatches!", errCnt); /** * Deallocate. */ - free(pSrc); - free(pMax); - GpuArray_clear(&gaSrc); - GpuArray_clear(&gaMax); + free(pS0); + free(pD0); + GpuArray_clear(&gaS0); + GpuArray_clear(&gaD0); }END_TEST START_TEST(test_min_reduction){ @@ -1664,16 +1912,17 @@ START_TEST(test_min_reduction){ * We test here a reduction of some random 3D tensor on all dimensions. 
*/ + size_t errCnt = 0; size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; const int reduxList[] = {0,2}; - float* pSrc = calloc(1, sizeof(*pSrc) * dims[0]*dims[1]*dims[2]); - float* pMin = calloc(1, sizeof(*pMin) * dims[1] ); + float* pS0 = calloc(1, sizeof(*pS0) * dims[0]*dims[1]*dims[2]); + float* pD0 = calloc(1, sizeof(*pD0) * dims[1] ); - ck_assert_ptr_ne(pSrc, NULL); - ck_assert_ptr_ne(pMin, NULL); + ck_assert_ptr_nonnull(pS0); + ck_assert_ptr_nonnull(pD0); /** @@ -1681,7 +1930,7 @@ START_TEST(test_min_reduction){ */ for(i=0;i= TOL){ + errCnt++; + if(errCnt < MAXERRPRINT){ + fprintf(stderr, "%s:%d: Mismatch GT %f != %f UUT @ %zu (TOL=%f)!\n", + __func__, __LINE__, gtD0, pD0[j], j, TOL); + fflush (stderr); } } - - ck_assert_double_eq_tol(gtD, pD[j], TOL); } + ck_assert_msg(errCnt == 0, "%zu mismatches!", errCnt); /** * Deallocate. */ - free(pS); - free(pD); - GpuArray_clear(&gaS); - GpuArray_clear(&gaD); + free(pS0); + free(pD0); + GpuArray_clear(&gaS0); + GpuArray_clear(&gaD0); }END_TEST START_TEST(test_sum_veryhighrank){ @@ -2000,6 +2311,7 @@ START_TEST(test_sum_veryhighrank){ * Here we test a reduction of a random 8D tensor on four dimensions. */ + size_t errCnt = 0; size_t i,j,k,l,m,n,o,p; size_t dims [8] = {1171,373,2,1,2,1,2,1}; size_t prodDims = dims[0]*dims[1]*dims[2]*dims[3]*dims[4]*dims[5]*dims[6]*dims[7]; @@ -2008,11 +2320,11 @@ START_TEST(test_sum_veryhighrank){ const int reduxList[] = {2,4,7,5}; const float TOL = 1e-4; - float* pS = calloc(1, sizeof(*pS) * prodDims); - float* pD = calloc(1, sizeof(*pD) * rdxProdDims); + float* pS0 = calloc(1, sizeof(*pS0) * prodDims); + float* pD0 = calloc(1, sizeof(*pD0) * rdxProdDims); - ck_assert_ptr_ne(pS, NULL); - ck_assert_ptr_ne(pD, NULL); + ck_assert_ptr_nonnull(pS0); + ck_assert_ptr_nonnull(pD0); /** @@ -2020,7 +2332,7 @@ START_TEST(test_sum_veryhighrank){ */ for(i=0;i= TOL){ + errCnt++; + if(errCnt < MAXERRPRINT){ + fprintf(stderr, "%s:%d: Mismatch GT %f != %f UUT @ %zu (TOL=%f)!\n", + __func__, __LINE__, gtD0, pD0[dstIdx], dstIdx, TOL); + fflush (stderr); + } + } } } } } + ck_assert_msg(errCnt == 0, "%zu mismatches!", errCnt); /** * Deallocate. */ - free(pS); - free(pD); - GpuArray_clear(&gaS); - GpuArray_clear(&gaD); + free(pS0); + free(pD0); + GpuArray_clear(&gaS0); + GpuArray_clear(&gaD0); }END_TEST START_TEST(test_sum_alldimsreduced){ @@ -2093,17 +2420,18 @@ START_TEST(test_sum_alldimsreduced){ * We test here a reduction of some random 3D tensor on all dimensions. */ + size_t errCnt = 0; size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; const int reduxList[] = {0,1,2}; const float TOL = 1e-4; - float* pS = calloc(1, sizeof(*pS) * dims[0]*dims[1]*dims[2]); - float* pD = calloc(1, sizeof(*pD) ); + float* pS0 = calloc(1, sizeof(*pS0) * dims[0]*dims[1]*dims[2]); + float* pD0 = calloc(1, sizeof(*pD0) ); - ck_assert_ptr_ne(pS, NULL); - ck_assert_ptr_ne(pD, NULL); + ck_assert_ptr_nonnull(pS0); + ck_assert_ptr_nonnull(pD0); /** @@ -2111,7 +2439,7 @@ START_TEST(test_sum_alldimsreduced){ */ for(i=0;i= TOL){ + errCnt++; + if(errCnt < MAXERRPRINT){ + fprintf(stderr, "%s:%d: Mismatch GT %f != %f UUT @ %zu (TOL=%f)!\n", + __func__, __LINE__, gtD0, pD0[0], (size_t)0, TOL); + fflush (stderr); + } + } + ck_assert_msg(errCnt == 0, "%zu mismatches!", errCnt); /** * Deallocate. 
*/ - free(pS); - free(pD); - GpuArray_clear(&gaS); - GpuArray_clear(&gaD); + free(pS0); + free(pD0); + GpuArray_clear(&gaS0); + GpuArray_clear(&gaD0); }END_TEST START_TEST(test_sum_huge){ @@ -2172,17 +2514,18 @@ START_TEST(test_sum_huge){ * We test here a reduction of a huge 1D tensor on all dimensions. */ + size_t errCnt = 0; size_t i; size_t dims[1] = {100000000}; size_t prodDims = dims[0]; const int reduxList[] = {0}; const float TOL = 1e-2; - float* pS = calloc(1, sizeof(*pS) * dims[0]); - float* pD = calloc(1, sizeof(*pD)); + float* pS0 = calloc(1, sizeof(*pS0) * dims[0]); + float* pD0 = calloc(1, sizeof(*pD0)); - ck_assert_ptr_ne(pS, NULL); - ck_assert_ptr_ne(pD, NULL); + ck_assert_ptr_nonnull(pS0); + ck_assert_ptr_nonnull(pD0); /** @@ -2190,7 +2533,7 @@ START_TEST(test_sum_huge){ */ for(i=0;i= TOL){ + errCnt++; + if(errCnt < MAXERRPRINT){ + fprintf(stderr, "%s:%d: Mismatch GT %f != %f UUT @ %zu (TOL=%f)!\n", + __func__, __LINE__, gtD0, pD0[0], (size_t)0, TOL); + fflush (stderr); + } } - ck_assert_double_eq_tol(gtD, pD[0], TOL); + ck_assert_msg(errCnt == 0, "%zu mismatches!", errCnt); /** * Deallocate. */ - free(pS); - free(pD); - GpuArray_clear(&gaS); - GpuArray_clear(&gaD); + free(pS0); + free(pD0); + GpuArray_clear(&gaS0); + GpuArray_clear(&gaD0); }END_TEST START_TEST(test_prod_reduction){ @@ -2245,17 +2603,18 @@ START_TEST(test_prod_reduction){ * We test here a reduction of some random 3D tensor on all dimensions. */ + size_t errCnt = 0; size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; const int reduxList[] = {0,2}; const float TOL = 1e-4; - float* pS = calloc(1, sizeof(*pS) * dims[0]*dims[1]*dims[2]); - float* pD = calloc(1, sizeof(*pD) * dims[1] ); + float* pS0 = calloc(1, sizeof(*pS0) * dims[0]*dims[1]*dims[2]); + float* pD0 = calloc(1, sizeof(*pD0) * dims[1] ); - ck_assert_ptr_ne(pS, NULL); - ck_assert_ptr_ne(pD, NULL); + ck_assert_ptr_nonnull(pS0); + ck_assert_ptr_nonnull(pD0); /** @@ -2263,7 +2622,7 @@ START_TEST(test_prod_reduction){ */ for(i=0;i= TOL){ + errCnt++; + if(errCnt < MAXERRPRINT){ + fprintf(stderr, "%s:%d: Mismatch GT %f != %f UUT @ %zu (TOL=%f)!\n", + __func__, __LINE__, gtD0, pD0[j], j, TOL); + fflush (stderr); } } - - ck_assert_double_eq_tol(gtD, pD[j], TOL); } + ck_assert_msg(errCnt == 0, "%zu mismatches!", errCnt); /** * Deallocate. */ - free(pS); - free(pD); - GpuArray_clear(&gaS); - GpuArray_clear(&gaD); + free(pS0); + free(pD0); + GpuArray_clear(&gaS0); + GpuArray_clear(&gaD0); }END_TEST START_TEST(test_prod_veryhighrank){ @@ -2324,6 +2698,7 @@ START_TEST(test_prod_veryhighrank){ * Here we test a reduction of a random 8D tensor on four dimensions. 
*/ + size_t errCnt = 0; size_t i,j,k,l,m,n,o,p; size_t dims [8] = {1171,373,2,1,2,1,2,1}; size_t prodDims = dims[0]*dims[1]*dims[2]*dims[3]*dims[4]*dims[5]*dims[6]*dims[7]; @@ -2332,11 +2707,11 @@ START_TEST(test_prod_veryhighrank){ const int reduxList[] = {2,4,7,5}; const float TOL = 1e-4; - float* pS = calloc(1, sizeof(*pS) * prodDims); - float* pD = calloc(1, sizeof(*pD) * rdxProdDims); + float* pS0 = calloc(1, sizeof(*pS0) * prodDims); + float* pD0 = calloc(1, sizeof(*pD0) * rdxProdDims); - ck_assert_ptr_ne(pS, NULL); - ck_assert_ptr_ne(pD, NULL); + ck_assert_ptr_nonnull(pS0); + ck_assert_ptr_nonnull(pD0); /** @@ -2344,7 +2719,7 @@ START_TEST(test_prod_veryhighrank){ */ for(i=0;i= TOL){ + errCnt++; + if(errCnt < MAXERRPRINT){ + fprintf(stderr, "%s:%d: Mismatch GT %f != %f UUT @ %zu (TOL=%f)!\n", + __func__, __LINE__, gtD0, pD0[dstIdx], dstIdx, TOL); + fflush (stderr); + } + } } } } } + ck_assert_msg(errCnt == 0, "%zu mismatches!", errCnt); /** * Deallocate. */ - free(pS); - free(pD); - GpuArray_clear(&gaS); - GpuArray_clear(&gaD); + free(pS0); + free(pD0); + GpuArray_clear(&gaS0); + GpuArray_clear(&gaD0); }END_TEST START_TEST(test_prod_alldimsreduced){ @@ -2417,17 +2807,18 @@ START_TEST(test_prod_alldimsreduced){ * We test here a reduction of some random 3D tensor on all dimensions. */ + size_t errCnt = 0; size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; const int reduxList[] = {0,1,2}; const float TOL = 1e-4; - float* pS = calloc(1, sizeof(*pS) * dims[0]*dims[1]*dims[2]); - float* pD = calloc(1, sizeof(*pD) ); + float* pS0 = calloc(1, sizeof(*pS0) * dims[0]*dims[1]*dims[2]); + float* pD0 = calloc(1, sizeof(*pD0) ); - ck_assert_ptr_ne(pS, NULL); - ck_assert_ptr_ne(pD, NULL); + ck_assert_ptr_nonnull(pS0); + ck_assert_ptr_nonnull(pD0); /** @@ -2435,7 +2826,7 @@ START_TEST(test_prod_alldimsreduced){ */ for(i=0;i= TOL){ + errCnt++; + if(errCnt < MAXERRPRINT){ + fprintf(stderr, "%s:%d: Mismatch GT %f != %f UUT @ %zu (TOL=%f)!\n", + __func__, __LINE__, gtD0, pD0[0], (size_t)0, TOL); + fflush (stderr); + } + } + ck_assert_msg(errCnt == 0, "%zu mismatches!", errCnt); /** * Deallocate. */ - free(pS); - free(pD); - GpuArray_clear(&gaS); - GpuArray_clear(&gaD); + free(pS0); + free(pD0); + GpuArray_clear(&gaS0); + GpuArray_clear(&gaD0); }END_TEST START_TEST(test_prodnz_reduction){ @@ -2496,17 +2901,18 @@ START_TEST(test_prodnz_reduction){ * We test here a reduction of some random 3D tensor on all dimensions. */ + size_t errCnt = 0; size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; const int reduxList[] = {0,2}; const float TOL = 1e-4; - float* pS = calloc(1, sizeof(*pS) * dims[0]*dims[1]*dims[2]); - float* pD = calloc(1, sizeof(*pD) * dims[1] ); + float* pS0 = calloc(1, sizeof(*pS0) * dims[0]*dims[1]*dims[2]); + float* pD0 = calloc(1, sizeof(*pD0) * dims[1] ); - ck_assert_ptr_ne(pS, NULL); - ck_assert_ptr_ne(pD, NULL); + ck_assert_ptr_nonnull(pS0); + ck_assert_ptr_nonnull(pD0); /** @@ -2514,9 +2920,9 @@ START_TEST(test_prodnz_reduction){ */ for(i=0;i= TOL){ + errCnt++; + if(errCnt < MAXERRPRINT){ + fprintf(stderr, "%s:%d: Mismatch GT %f != %f UUT @ %zu (TOL=%f)!\n", + __func__, __LINE__, gtD0, pD0[j], j, TOL); + fflush (stderr); } } - - ck_assert_double_eq_tol(gtD, pD[j], TOL); } + ck_assert_msg(errCnt == 0, "%zu mismatches!", errCnt); /** * Deallocate. 
*/ - free(pS); - free(pD); - GpuArray_clear(&gaS); - GpuArray_clear(&gaD); + free(pS0); + free(pD0); + GpuArray_clear(&gaS0); + GpuArray_clear(&gaD0); }END_TEST START_TEST(test_prodnz_veryhighrank){ @@ -2578,6 +2999,7 @@ START_TEST(test_prodnz_veryhighrank){ * Here we test a reduction of a random 8D tensor on four dimensions. */ + size_t errCnt = 0; size_t i,j,k,l,m,n,o,p; size_t dims [8] = {1171,373,2,1,2,1,2,1}; size_t prodDims = dims[0]*dims[1]*dims[2]*dims[3]*dims[4]*dims[5]*dims[6]*dims[7]; @@ -2586,11 +3008,11 @@ START_TEST(test_prodnz_veryhighrank){ const int reduxList[] = {2,4,7,5}; const float TOL = 1e-4; - float* pS = calloc(1, sizeof(*pS) * prodDims); - float* pD = calloc(1, sizeof(*pD) * rdxProdDims); + float* pS0 = calloc(1, sizeof(*pS0) * prodDims); + float* pD0 = calloc(1, sizeof(*pD0) * rdxProdDims); - ck_assert_ptr_ne(pS, NULL); - ck_assert_ptr_ne(pD, NULL); + ck_assert_ptr_nonnull(pS0); + ck_assert_ptr_nonnull(pD0); /** @@ -2598,9 +3020,9 @@ START_TEST(test_prodnz_veryhighrank){ */ for(i=0;i= TOL){ + errCnt++; + if(errCnt < MAXERRPRINT){ + fprintf(stderr, "%s:%d: Mismatch GT %f != %f UUT @ %zu (TOL=%f)!\n", + __func__, __LINE__, gtD0, pD0[dstIdx], dstIdx, TOL); + fflush (stderr); + } + } } } } } + ck_assert_msg(errCnt == 0, "%zu mismatches!", errCnt); /** * Deallocate. */ - free(pS); - free(pD); - GpuArray_clear(&gaS); - GpuArray_clear(&gaD); + free(pS0); + free(pD0); + GpuArray_clear(&gaS0); + GpuArray_clear(&gaD0); }END_TEST START_TEST(test_prodnz_alldimsreduced){ @@ -2674,17 +3111,18 @@ START_TEST(test_prodnz_alldimsreduced){ * We test here a reduction of some random 3D tensor on all dimensions. */ + size_t errCnt = 0; size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; const int reduxList[] = {0,1,2}; const float TOL = 1e-4; - float* pS = calloc(1, sizeof(*pS) * dims[0]*dims[1]*dims[2]); - float* pD = calloc(1, sizeof(*pD) ); + float* pS0 = calloc(1, sizeof(*pS0) * dims[0]*dims[1]*dims[2]); + float* pD0 = calloc(1, sizeof(*pD0) ); - ck_assert_ptr_ne(pS, NULL); - ck_assert_ptr_ne(pD, NULL); + ck_assert_ptr_nonnull(pS0); + ck_assert_ptr_nonnull(pD0); /** @@ -2692,9 +3130,9 @@ START_TEST(test_prodnz_alldimsreduced){ */ for(i=0;i= TOL){ + errCnt++; + if(errCnt < MAXERRPRINT){ + fprintf(stderr, "%s:%d: Mismatch GT %f != %f UUT @ %zu (TOL=%f)!\n", + __func__, __LINE__, gtD0, pD0[0], (size_t)0, TOL); + fflush (stderr); + } + } + ck_assert_msg(errCnt == 0, "%zu mismatches!", errCnt); /** * Deallocate. */ - free(pS); - free(pD); - GpuArray_clear(&gaS); - GpuArray_clear(&gaD); + free(pS0); + free(pD0); + GpuArray_clear(&gaS0); + GpuArray_clear(&gaD0); }END_TEST START_TEST(test_and_reduction){ @@ -2756,16 +3208,17 @@ START_TEST(test_and_reduction){ * We test here a reduction of some random 3D tensor on all dimensions. */ + size_t errCnt = 0; size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; const int reduxList[] = {0,2}; - uint32_t* pS = calloc(1, sizeof(*pS) * dims[0]*dims[1]*dims[2]); - uint32_t* pD = calloc(1, sizeof(*pD) * dims[1] ); + uint32_t* pS0 = calloc(1, sizeof(*pS0) * dims[0]*dims[1]*dims[2]); + uint32_t* pD0 = calloc(1, sizeof(*pD0) * dims[1] ); - ck_assert_ptr_ne(pS, NULL); - ck_assert_ptr_ne(pD, NULL); + ck_assert_ptr_nonnull(pS0); + ck_assert_ptr_nonnull(pD0); /** @@ -2778,11 +3231,11 @@ START_TEST(test_and_reduction){ * probability. 
*/ - pS[i] = (uint32_t)(pcgRand01() * (uint32_t)-1); - pS[i] |= (uint32_t)(pcgRand01() * (uint32_t)-1); - pS[i] |= (uint32_t)(pcgRand01() * (uint32_t)-1); - pS[i] |= (uint32_t)(pcgRand01() * (uint32_t)-1); - pS[i] |= (uint32_t)(pcgRand01() * (uint32_t)-1); + pS0[i] = (uint32_t)(pcgRand01() * (uint32_t)-1); + pS0[i] |= (uint32_t)(pcgRand01() * (uint32_t)-1); + pS0[i] |= (uint32_t)(pcgRand01() * (uint32_t)-1); + pS0[i] |= (uint32_t)(pcgRand01() * (uint32_t)-1); + pS0[i] |= (uint32_t)(pcgRand01() * (uint32_t)-1); } @@ -2790,23 +3243,30 @@ START_TEST(test_and_reduction){ * Run the kernel. */ - GpuArray gaS; - GpuArray gaD; + GpuArray gaS0; + GpuArray gaD0; + GpuReductionAttr* grAttr; + GpuReduction* gr; - ga_assert_ok(GpuArray_empty (&gaS, ctx, GA_UINT, 3, &dims[0], GA_C_ORDER)); - ga_assert_ok(GpuArray_empty (&gaD, ctx, GA_UINT, 1, &dims[1], GA_C_ORDER)); + ga_assert_ok(GpuArray_empty (&gaS0, ctx, GA_UINT, 3, &dims[0], GA_C_ORDER)); + ga_assert_ok(GpuArray_empty (&gaD0, ctx, GA_UINT, 1, &dims[1], GA_C_ORDER)); - ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); - ga_assert_ok(GpuArray_memset(&gaD, -1)); - - GpuReduction* gr; - GpuReduction_new(&gr, GpuArray_context(&gaS), - GA_REDUCE_AND, 1, 2, gaS.typecode, 0); + ga_assert_ok(GpuArray_write (&gaS0, pS0, sizeof(*pS0)*prodDims)); + ga_assert_ok(GpuArray_memset(&gaD0, -1)); + + GpuReductionAttr_new(&grAttr, GpuArray_context(&gaS0)); + ck_assert_ptr_nonnull(grAttr); + GpuReductionAttr_setop (grAttr, GA_REDUCE_AND); + GpuReductionAttr_setdims (grAttr, gaS0.nd, gaD0.nd); + GpuReductionAttr_sets0type(grAttr, gaS0.typecode); + GpuReductionAttr_setd0type(grAttr, gaD0.typecode); + GpuReduction_new(&gr, grAttr); ck_assert_ptr_nonnull(gr); - ga_assert_ok(GpuReduction_call(gr, &gaD, NULL, &gaS, 2, reduxList, 0)); + ga_assert_ok(GpuReduction_call(gr, &gaD0, NULL, &gaS0, gaS0.nd-gaD0.nd, reduxList, 0)); GpuReduction_free(gr); + GpuReductionAttr_free(grAttr); - ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*dims[1], &gaD)); + ga_assert_ok(GpuArray_read (pD0, sizeof(*pD0)*dims[1], &gaD0)); /** @@ -2814,26 +3274,34 @@ START_TEST(test_and_reduction){ */ for(j=0;j 0.05; + pS0[i] = pcgRand01() > 0.05; } @@ -3862,23 +4503,30 @@ START_TEST(test_all_reduction){ * Run the kernel. 
*/ - GpuArray gaS; - GpuArray gaD; + GpuArray gaS0; + GpuArray gaD0; + GpuReductionAttr* grAttr; + GpuReduction* gr; - ga_assert_ok(GpuArray_empty (&gaS, ctx, GA_UINT, 3, &dims[0], GA_C_ORDER)); - ga_assert_ok(GpuArray_empty (&gaD, ctx, GA_UINT, 1, &dims[1], GA_C_ORDER)); + ga_assert_ok(GpuArray_empty (&gaS0, ctx, GA_UINT, 3, &dims[0], GA_C_ORDER)); + ga_assert_ok(GpuArray_empty (&gaD0, ctx, GA_UINT, 1, &dims[1], GA_C_ORDER)); - ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); - ga_assert_ok(GpuArray_memset(&gaD, -1)); - - GpuReduction* gr; - GpuReduction_new(&gr, GpuArray_context(&gaS), - GA_REDUCE_ALL, 1, 2, gaS.typecode, 0); + ga_assert_ok(GpuArray_write (&gaS0, pS0, sizeof(*pS0)*prodDims)); + ga_assert_ok(GpuArray_memset(&gaD0, -1)); + + GpuReductionAttr_new(&grAttr, GpuArray_context(&gaS0)); + ck_assert_ptr_nonnull(grAttr); + GpuReductionAttr_setop (grAttr, GA_REDUCE_ALL); + GpuReductionAttr_setdims (grAttr, gaS0.nd, gaD0.nd); + GpuReductionAttr_sets0type(grAttr, gaS0.typecode); + GpuReductionAttr_setd0type(grAttr, gaD0.typecode); + GpuReduction_new(&gr, grAttr); ck_assert_ptr_nonnull(gr); - ga_assert_ok(GpuReduction_call(gr, &gaD, NULL, &gaS, 2, reduxList, 0)); + ga_assert_ok(GpuReduction_call(gr, &gaD0, NULL, &gaS0, gaS0.nd-gaD0.nd, reduxList, 0)); GpuReduction_free(gr); + GpuReductionAttr_free(grAttr); - ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*dims[1], &gaD)); + ga_assert_ok(GpuArray_read (pD0, sizeof(*pD0)*dims[1], &gaD0)); /** @@ -3886,26 +4534,34 @@ START_TEST(test_all_reduction){ */ for(j=0;j 0.05; + pS0[i] = pcgRand01() > 0.05; } @@ -3947,23 +4604,30 @@ START_TEST(test_all_veryhighrank){ * Run the kernel. */ - GpuArray gaS; - GpuArray gaD; + GpuArray gaS0; + GpuArray gaD0; + GpuReductionAttr* grAttr; + GpuReduction* gr; - ga_assert_ok(GpuArray_empty (&gaS, ctx, GA_UINT, 8, dims, GA_C_ORDER)); - ga_assert_ok(GpuArray_empty (&gaD, ctx, GA_UINT, 4, rdxDims, GA_C_ORDER)); + ga_assert_ok(GpuArray_empty (&gaS0, ctx, GA_UINT, 8, dims, GA_C_ORDER)); + ga_assert_ok(GpuArray_empty (&gaD0, ctx, GA_UINT, 4, rdxDims, GA_C_ORDER)); - ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); - ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ - - GpuReduction* gr; - GpuReduction_new(&gr, GpuArray_context(&gaS), - GA_REDUCE_ALL, 4, 4, gaS.typecode, 0); + ga_assert_ok(GpuArray_write (&gaS0, pS0, sizeof(*pS0)*prodDims)); + ga_assert_ok(GpuArray_memset(&gaD0, -1)); /* 0xFFFFFFFF is a qNaN. */ + + GpuReductionAttr_new(&grAttr, GpuArray_context(&gaS0)); + ck_assert_ptr_nonnull(grAttr); + GpuReductionAttr_setop (grAttr, GA_REDUCE_ALL); + GpuReductionAttr_setdims (grAttr, gaS0.nd, gaD0.nd); + GpuReductionAttr_sets0type(grAttr, gaS0.typecode); + GpuReductionAttr_setd0type(grAttr, gaD0.typecode); + GpuReduction_new(&gr, grAttr); ck_assert_ptr_nonnull(gr); - ga_assert_ok(GpuReduction_call(gr, &gaD, NULL, &gaS, 4, reduxList, 0)); + ga_assert_ok(GpuReduction_call(gr, &gaD0, NULL, &gaS0, gaS0.nd-gaD0.nd, reduxList, 0)); GpuReduction_free(gr); + GpuReductionAttr_free(grAttr); - ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*rdxProdDims, &gaD)); + ga_assert_ok(GpuArray_read (pD0, sizeof(*pD0)*rdxProdDims, &gaD0)); /** @@ -3974,35 +4638,43 @@ START_TEST(test_all_veryhighrank){ for(j=0;j 0.05; + pS0[i] = pcgRand01() > 0.05; } @@ -4042,50 +4715,64 @@ START_TEST(test_all_alldimsreduced){ * Run the kernel. 
*/ - GpuArray gaS; - GpuArray gaD; - - ga_assert_ok(GpuArray_empty (&gaS, ctx, GA_UINT, 3, &dims[0], GA_C_ORDER)); - ga_assert_ok(GpuArray_empty (&gaD, ctx, GA_UINT, 0, NULL, GA_C_ORDER)); + GpuArray gaS0; + GpuArray gaD0; + GpuReductionAttr* grAttr; + GpuReduction* gr; - ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); - ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ + ga_assert_ok(GpuArray_empty (&gaS0, ctx, GA_UINT, 3, &dims[0], GA_C_ORDER)); + ga_assert_ok(GpuArray_empty (&gaD0, ctx, GA_UINT, 0, NULL, GA_C_ORDER)); - GpuReduction* gr; - GpuReduction_new(&gr, GpuArray_context(&gaS), - GA_REDUCE_ALL, 0, 3, gaS.typecode, 0); + ga_assert_ok(GpuArray_write (&gaS0, pS0, sizeof(*pS0)*prodDims)); + ga_assert_ok(GpuArray_memset(&gaD0, -1)); /* 0xFFFFFFFF is a qNaN. */ + + GpuReductionAttr_new(&grAttr, GpuArray_context(&gaS0)); + ck_assert_ptr_nonnull(grAttr); + GpuReductionAttr_setop (grAttr, GA_REDUCE_ALL); + GpuReductionAttr_setdims (grAttr, gaS0.nd, gaD0.nd); + GpuReductionAttr_sets0type(grAttr, gaS0.typecode); + GpuReductionAttr_setd0type(grAttr, gaD0.typecode); + GpuReduction_new(&gr, grAttr); ck_assert_ptr_nonnull(gr); - ga_assert_ok(GpuReduction_call(gr, &gaD, NULL, &gaS, 3, reduxList, 0)); + ga_assert_ok(GpuReduction_call(gr, &gaD0, NULL, &gaS0, gaS0.nd-gaD0.nd, reduxList, 0)); GpuReduction_free(gr); + GpuReductionAttr_free(grAttr); - ga_assert_ok(GpuArray_read (pD, sizeof(*pD), &gaD)); + ga_assert_ok(GpuArray_read (pD0, sizeof(*pD0), &gaD0)); /** * Check that the destination tensors are correct. */ - uint32_t gtD = 1; + uint32_t gtD0 = 1; for(i=0;i Date: Fri, 14 Jul 2017 12:03:26 -0400 Subject: [PATCH 23/34] Delete an "initialization" that should not be there. --- src/gpuarray_reduction.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/src/gpuarray_reduction.c b/src/gpuarray_reduction.c index baead32518..8c3c665252 100644 --- a/src/gpuarray_reduction.c +++ b/src/gpuarray_reduction.c @@ -112,7 +112,8 @@ struct redux_ctx{ uint32_t LSlice; uint64_t LPadded; - uint64_t* L, *Li; + uint64_t* L; + uint32_t* Li; gpudata* S0Data; int64_t S0Off; int64_t* S0J, *S0Si; @@ -2607,13 +2608,6 @@ static void reduxGenSrcAppendDecode (GpuReduction* gr){ " TK1* restrict const W1R = &W1[GDIM_0*D];\n" " TK1* restrict const SHMEMK1 = (TK1*)(SHMEM + SHMEMK1Off);\n"); } - srcbAppends(&gr->srcGen, - " INITREDUXSTATE(W0L[LID_0], W1L[LID_0]);\n" - " INITREDUXSTATE(W0R[LID_0], W1R[LID_0]);\n" - " if(D < LDIM_0 && LID_0+DL = ctx->Li = NULL; + ctx->L = NULL; + ctx->Li = NULL; ctx->S0J = ctx->S0Si = NULL; ctx->D0J = ctx->D0Si = NULL; ctx->D1J = ctx->D1Si = NULL; From 4a17f4835291062b1fd7fc2e2bd2fdf26a79a575 Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Fri, 14 Jul 2017 12:46:32 -0400 Subject: [PATCH 24/34] Added an initialization that WAS needed. --- src/gpuarray_reduction.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/gpuarray_reduction.c b/src/gpuarray_reduction.c index 8c3c665252..a4f19359cb 100644 --- a/src/gpuarray_reduction.c +++ b/src/gpuarray_reduction.c @@ -2608,6 +2608,12 @@ static void reduxGenSrcAppendDecode (GpuReduction* gr){ " TK1* restrict const W1R = &W1[GDIM_0*D];\n" " TK1* restrict const SHMEMK1 = (TK1*)(SHMEM + SHMEMK1Off);\n"); } + srcbAppends(&gr->srcGen, + " INITREDUXSTATE(SHMEMK0[LID_0], SHMEMK1[LID_0]);\n" + " if(Dflags & 0 ? //FIXME: Delete this hack after debugging. 
- reduxInvFlattenSource (ctx): - reduxInvComputeKernelArgs(ctx); + return reduxInvFlattenSource(ctx); } /** From 328c957210224134dae9f14f7ad35cdba8a9c10b Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Fri, 14 Jul 2017 13:26:31 -0400 Subject: [PATCH 25/34] Add a bunch of local_barrier()'s. They are overkill but seem to fix the problems with the testcases, at least so far. --- src/gpuarray_reduction.c | 3 ++ tests/check_reduction.c | 92 ++++++++++++++++++++-------------------- 2 files changed, 49 insertions(+), 46 deletions(-) diff --git a/src/gpuarray_reduction.c b/src/gpuarray_reduction.c index a4f19359cb..c8e64bd3a0 100644 --- a/src/gpuarray_reduction.c +++ b/src/gpuarray_reduction.c @@ -2609,6 +2609,7 @@ static void reduxGenSrcAppendDecode (GpuReduction* gr){ " TK1* restrict const SHMEMK1 = (TK1*)(SHMEM + SHMEMK1Off);\n"); } srcbAppends(&gr->srcGen, + " local_barrier();\n" " INITREDUXSTATE(SHMEMK0[LID_0], SHMEMK1[LID_0]);\n" " if(DsrcGen, " local_barrier();\n"); if (initial){ srcbAppends(&gr->srcGen, " if(LID_0 < D){\n" " SETREDUXSTATE(W0R[GID_0*D + LID_0],\n" @@ -2771,6 +2773,7 @@ static void reduxGenSrcAppendDstWrite (GpuReduction* gr, " }\n"); } } + srcbAppends(&gr->srcGen, " local_barrier();\n"); } static void reduxGenSrcAppendPhase1 (GpuReduction* gr){ /** diff --git a/tests/check_reduction.c b/tests/check_reduction.c index 8e5eef93e4..6a1e8c6a97 100644 --- a/tests/check_reduction.c +++ b/tests/check_reduction.c @@ -16,7 +16,7 @@ void teardown(void); /* Defines */ -#define MAXERRPRINT 2 +#define MAXERRPRINT 16 #define ga_assert_ok(e) ck_assert_int_eq(e, GA_NO_ERROR) @@ -155,7 +155,7 @@ START_TEST(test_maxandargmax_reduction){ if(gtD0 != pD0[j] || gtD1 != pD1[j]){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT %f[%zu] != %f[%zu] UUT @ %zu!\n", __func__, __LINE__, gtD0, gtD1, pD0[j], pD1[j], j); fflush (stderr); @@ -268,7 +268,7 @@ START_TEST(test_maxandargmax_idxtranspose){ if(gtD0 != pD0[j] || gtD1 != pD1[j]){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT %f[%zu] != %f[%zu] UUT @ %zu!\n", __func__, __LINE__, gtD0, gtD1, pD0[j], pD1[j], j); fflush (stderr); @@ -375,7 +375,7 @@ START_TEST(test_maxandargmax_bigdestination){ if(gtD0 != pD0[j] || gtD1 != pD1[j]){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT %f[%zu] != %f[%zu] UUT @ %zu!\n", __func__, __LINE__, gtD0, gtD1, pD0[j], pD1[j], j); fflush (stderr); @@ -493,7 +493,7 @@ START_TEST(test_maxandargmax_veryhighrank){ size_t dstIdx = (((i)*dims[1] + j)*dims[3] + l)*dims[6] + o; if(gtD0 != pD0[dstIdx] || gtD1 != pD1[dstIdx]){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT %f[%zu] != %f[%zu] UUT @ %zu!\n", __func__, __LINE__, gtD0, gtD1, pD0[dstIdx], pD1[dstIdx], dstIdx); fflush (stderr); @@ -605,7 +605,7 @@ START_TEST(test_maxandargmax_alldimsreduced){ } if(gtD0 != pD0[0] || gtD1 != pD1[0]){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT %f[%zu] != %f[%zu] UUT @ %zu!\n", __func__, __LINE__, gtD0, gtD1, pD0[0], pD1[0], (size_t)0); fflush (stderr); @@ -713,7 +713,7 @@ START_TEST(test_minandargmin_reduction){ if(gtD0 != pD0[j] || gtD1 != pD1[j]){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT %f[%zu] != %f[%zu] UUT @ %zu!\n", __func__, __LINE__, gtD0, gtD1, pD0[j], pD1[j], j); fflush (stderr); @@ 
-831,7 +831,7 @@ START_TEST(test_minandargmin_veryhighrank){ size_t dstIdx = (((i)*dims[1] + j)*dims[3] + l)*dims[6] + o; if(gtD0 != pD0[dstIdx] || gtD1 != pD1[dstIdx]){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT %f[%zu] != %f[%zu] UUT @ %zu!\n", __func__, __LINE__, gtD0, gtD1, pD0[dstIdx], pD1[dstIdx], dstIdx); fflush (stderr); @@ -943,7 +943,7 @@ START_TEST(test_minandargmin_alldimsreduced){ } if(gtD0 != pD0[0] || gtD1 != pD1[0]){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT %f[%zu] != %f[%zu] UUT @ %zu!\n", __func__, __LINE__, gtD0, gtD1, pD0[0], pD1[0], (size_t)0); fflush (stderr); @@ -1046,7 +1046,7 @@ START_TEST(test_argmax_reduction){ if(gtD1 != pD1[j]){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT [%zu] != [%zu] UUT @ %zu!\n", __func__, __LINE__, gtD1, pD1[j], j); fflush (stderr); @@ -1157,7 +1157,7 @@ START_TEST(test_argmax_veryhighrank){ size_t dstIdx = (((i)*dims[1] + j)*dims[3] + l)*dims[6] + o; if(gtD1 != pD1[dstIdx]){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT [%zu] != [%zu] UUT @ %zu!\n", __func__, __LINE__, gtD1, pD1[dstIdx], dstIdx); fflush (stderr); @@ -1263,7 +1263,7 @@ START_TEST(test_argmax_alldimsreduced){ } if(gtD1 != pD1[0]){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT [%zu] != [%zu] UUT @ %zu!\n", __func__, __LINE__, gtD1, pD1[0], (size_t)0); fflush (stderr); @@ -1365,7 +1365,7 @@ START_TEST(test_argmin_reduction){ if(gtD1 != pD1[j]){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT [%zu] != [%zu] UUT @ %zu!\n", __func__, __LINE__, gtD1, pD1[j], j); fflush (stderr); @@ -1476,7 +1476,7 @@ START_TEST(test_argmin_veryhighrank){ size_t dstIdx = (((i)*dims[1] + j)*dims[3] + l)*dims[6] + o; if(gtD1 != pD1[dstIdx]){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT [%zu] != [%zu] UUT @ %zu!\n", __func__, __LINE__, gtD1, pD1[dstIdx], dstIdx); fflush (stderr); @@ -1582,7 +1582,7 @@ START_TEST(test_argmin_alldimsreduced){ } if(gtD1 != pD1[0]){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT [%zu] != [%zu] UUT @ %zu!\n", __func__, __LINE__, gtD1, pD1[0], (size_t)0); fflush (stderr); @@ -1679,7 +1679,7 @@ START_TEST(test_max_reduction){ if(gtD0 != pD0[j]){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT %f != %f UUT @ %zu!\n", __func__, __LINE__, gtD0, pD0[j], j); fflush (stderr); @@ -1786,7 +1786,7 @@ START_TEST(test_max_veryhighrank){ size_t dstIdx = (((i)*dims[1] + j)*dims[3] + l)*dims[6] + o; if(gtD0 != pD0[dstIdx]){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT %f != %f UUT @ %zu!\n", __func__, __LINE__, gtD0, pD0[dstIdx], dstIdx); fflush (stderr); @@ -1887,7 +1887,7 @@ START_TEST(test_max_alldimsreduced){ } if(gtD0 != pD0[0]){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT %f != %f UUT @ %zu!\n", __func__, __LINE__, gtD0, pD0[0], (size_t)0); fflush (stderr); @@ -1983,7 +1983,7 @@ START_TEST(test_min_reduction){ if(gtD0 != pD0[j]){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT %f != %f UUT @ %zu!\n", 
__func__, __LINE__, gtD0, pD0[j], j); fflush (stderr); @@ -2090,7 +2090,7 @@ START_TEST(test_min_veryhighrank){ size_t dstIdx = (((i)*dims[1] + j)*dims[3] + l)*dims[6] + o; if(gtD0 != pD0[dstIdx]){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT %f != %f UUT @ %zu!\n", __func__, __LINE__, gtD0, pD0[dstIdx], dstIdx); fflush (stderr); @@ -2191,7 +2191,7 @@ START_TEST(test_min_alldimsreduced){ } if(gtD0 != pD0[0]){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT %f != %f UUT @ %zu!\n", __func__, __LINE__, gtD0, pD0[0], (size_t)0); fflush (stderr); @@ -2285,7 +2285,7 @@ START_TEST(test_sum_reduction){ if(fabs(gtD0-pD0[j]) >= TOL){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT %f != %f UUT @ %zu (TOL=%f)!\n", __func__, __LINE__, gtD0, pD0[j], j, TOL); fflush (stderr); @@ -2390,7 +2390,7 @@ START_TEST(test_sum_veryhighrank){ size_t dstIdx = (((i)*dims[1] + j)*dims[3] + l)*dims[6] + o; if(fabs(gtD0-pD0[dstIdx]) >= TOL){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT %f != %f UUT @ %zu (TOL=%f)!\n", __func__, __LINE__, gtD0, pD0[dstIdx], dstIdx, TOL); fflush (stderr); @@ -2489,7 +2489,7 @@ START_TEST(test_sum_alldimsreduced){ } if(fabs(gtD0-pD0[0]) >= TOL){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT %f != %f UUT @ %zu (TOL=%f)!\n", __func__, __LINE__, gtD0, pD0[0], (size_t)0, TOL); fflush (stderr); @@ -2578,7 +2578,7 @@ START_TEST(test_sum_huge){ } if(fabs(gtD0-pD0[0]) >= TOL){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT %f != %f UUT @ %zu (TOL=%f)!\n", __func__, __LINE__, gtD0, pD0[0], (size_t)0, TOL); fflush (stderr); @@ -2672,7 +2672,7 @@ START_TEST(test_prod_reduction){ if(fabs(gtD0-pD0[j]) >= TOL){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT %f != %f UUT @ %zu (TOL=%f)!\n", __func__, __LINE__, gtD0, pD0[j], j, TOL); fflush (stderr); @@ -2777,7 +2777,7 @@ START_TEST(test_prod_veryhighrank){ size_t dstIdx = (((i)*dims[1] + j)*dims[3] + l)*dims[6] + o; if(fabs(gtD0-pD0[dstIdx]) >= TOL){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT %f != %f UUT @ %zu (TOL=%f)!\n", __func__, __LINE__, gtD0, pD0[dstIdx], dstIdx, TOL); fflush (stderr); @@ -2876,7 +2876,7 @@ START_TEST(test_prod_alldimsreduced){ } if(fabs(gtD0-pD0[0]) >= TOL){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT %f != %f UUT @ %zu (TOL=%f)!\n", __func__, __LINE__, gtD0, pD0[0], (size_t)0, TOL); fflush (stderr); @@ -2973,7 +2973,7 @@ START_TEST(test_prodnz_reduction){ if(fabs(gtD0-pD0[j]) >= TOL){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT %f != %f UUT @ %zu (TOL=%f)!\n", __func__, __LINE__, gtD0, pD0[j], j, TOL); fflush (stderr); @@ -3081,7 +3081,7 @@ START_TEST(test_prodnz_veryhighrank){ size_t dstIdx = (((i)*dims[1] + j)*dims[3] + l)*dims[6] + o; if(fabs(gtD0-pD0[dstIdx]) >= TOL){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT %f != %f UUT @ %zu (TOL=%f)!\n", __func__, __LINE__, gtD0, pD0[dstIdx], dstIdx, TOL); fflush (stderr); @@ -3183,7 +3183,7 @@ START_TEST(test_prodnz_alldimsreduced){ } if(fabs(gtD0-pD0[0]) >= 
TOL){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT %f != %f UUT @ %zu (TOL=%f)!\n", __func__, __LINE__, gtD0, pD0[0], (size_t)0, TOL); fflush (stderr); @@ -3285,7 +3285,7 @@ START_TEST(test_and_reduction){ if(gtD0 != pD0[j]){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT %u != %u UUT @ %zu!\n", __func__, __LINE__, gtD0, pD0[j], j); fflush (stderr); @@ -3398,7 +3398,7 @@ START_TEST(test_and_veryhighrank){ size_t dstIdx = (((i)*dims[1] + j)*dims[3] + l)*dims[6] + o; if(gtD0 != pD0[dstIdx]){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT %u != %u UUT @ %zu!\n", __func__, __LINE__, gtD0, pD0[dstIdx], dstIdx); fflush (stderr); @@ -3505,7 +3505,7 @@ START_TEST(test_and_alldimsreduced){ } if(gtD0 != pD0[0]){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT %u != %u UUT @ %zu!\n", __func__, __LINE__, gtD0, pD0[0], (size_t)0); fflush (stderr); @@ -3607,7 +3607,7 @@ START_TEST(test_or_reduction){ if(gtD0 != pD0[j]){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT %u != %u UUT @ %zu!\n", __func__, __LINE__, gtD0, pD0[j], (size_t)j); fflush (stderr); @@ -3720,7 +3720,7 @@ START_TEST(test_or_veryhighrank){ size_t dstIdx = (((i)*dims[1] + j)*dims[3] + l)*dims[6] + o; if(gtD0 != pD0[dstIdx]){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT %u != %u UUT @ %zu!\n", __func__, __LINE__, gtD0, pD0[dstIdx], dstIdx); fflush (stderr); @@ -3827,7 +3827,7 @@ START_TEST(test_or_alldimsreduced){ } if(gtD0 != pD0[0]){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT %u != %u UUT @ %zu!\n", __func__, __LINE__, gtD0, pD0[0], (size_t)0); fflush (stderr); @@ -3925,7 +3925,7 @@ START_TEST(test_xor_reduction){ if(gtD0 != pD0[j]){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT %u != %u UUT @ %zu!\n", __func__, __LINE__, gtD0, pD0[j], (size_t)j); fflush (stderr); @@ -4034,7 +4034,7 @@ START_TEST(test_xor_veryhighrank){ size_t dstIdx = (((i)*dims[1] + j)*dims[3] + l)*dims[6] + o; if(gtD0 != pD0[dstIdx]){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT %u != %u UUT @ %zu!\n", __func__, __LINE__, gtD0, pD0[dstIdx], dstIdx); fflush (stderr); @@ -4137,7 +4137,7 @@ START_TEST(test_xor_alldimsreduced){ } if(gtD0 != pD0[0]){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT %u != %u UUT @ %zu!\n", __func__, __LINE__, gtD0, pD0[0], (size_t)0); fflush (stderr); @@ -4235,7 +4235,7 @@ START_TEST(test_any_reduction){ if(gtD0 != pD0[j]){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT %u != %u UUT @ %zu!\n", __func__, __LINE__, gtD0, pD0[j], (size_t)j); fflush (stderr); @@ -4344,7 +4344,7 @@ START_TEST(test_any_veryhighrank){ size_t dstIdx = (((i)*dims[1] + j)*dims[3] + l)*dims[6] + o; if(gtD0 != pD0[dstIdx]){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT %u != %u UUT @ %zu!\n", __func__, __LINE__, gtD0, pD0[dstIdx], dstIdx); fflush (stderr); @@ -4447,7 +4447,7 @@ START_TEST(test_any_alldimsreduced){ } if(gtD0 != pD0[0]){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= 
MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT %u != %u UUT @ %zu!\n", __func__, __LINE__, gtD0, pD0[0], (size_t)0); fflush (stderr); @@ -4545,7 +4545,7 @@ START_TEST(test_all_reduction){ if(gtD0 != pD0[j]){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT %u != %u UUT @ %zu!\n", __func__, __LINE__, gtD0, pD0[j], (size_t)j); fflush (stderr); @@ -4654,7 +4654,7 @@ START_TEST(test_all_veryhighrank){ size_t dstIdx = (((i)*dims[1] + j)*dims[3] + l)*dims[6] + o; if(gtD0 != pD0[dstIdx]){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT %u != %u UUT @ %zu!\n", __func__, __LINE__, gtD0, pD0[dstIdx], dstIdx); fflush (stderr); @@ -4757,7 +4757,7 @@ START_TEST(test_all_alldimsreduced){ } if(gtD0 != pD0[0]){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT %u != %u UUT @ %zu!\n", __func__, __LINE__, gtD0, pD0[0], (size_t)0); fflush (stderr); From 925688c4344c0e379bd1e8dd243ddb8fe3e3a841 Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Fri, 14 Jul 2017 14:03:09 -0400 Subject: [PATCH 26/34] Style fixes. --- src/gpuarray_reduction.c | 638 +++++++++++++++++++-------------------- 1 file changed, 319 insertions(+), 319 deletions(-) diff --git a/src/gpuarray_reduction.c b/src/gpuarray_reduction.c index c8e64bd3a0..6d2e6fc17f 100644 --- a/src/gpuarray_reduction.c +++ b/src/gpuarray_reduction.c @@ -306,188 +306,188 @@ typedef void (*GpuReductionIterFn)(const GpuReduction* gr, /* Static Function prototypes */ /* Utilities */ -static int reduxGetSumInit (int typecode, const char** property); -static int reduxGetProdInit (int typecode, const char** property); -static int reduxGetMinInit (int typecode, const char** property); -static int reduxGetMaxInit (int typecode, const char** property); -static int reduxGetAndInit (int typecode, const char** property); -static int reduxGetOrInit (int typecode, const char** property); -static int reduxIsFloatingPoint (int typecode); -static unsigned reduxCeilLog2 (uint64_t x); -static uint64_t reduxNextPow2 (uint64_t x); -static int reduxSortFlatSensitive (const void* a, const void* b); -static int reduxSortFlatInsensitive (const void* a, const void* b); -static int reduxSortPtrS0AbsStride (const void* a, const void* b); -static int reduxSortPtrByReduxNum (const void* a, const void* b); -static int reduxSortPtrD0WrSelect (const void* a, const void* b); -static int reduxSortPtrD1WrSelect (const void* a, const void* b); -static int reduxSortPtrInsertFinalOrder (const void* a, const void* b); +static int reduxGetSumInit (int typecode, const char** property); +static int reduxGetProdInit (int typecode, const char** property); +static int reduxGetMinInit (int typecode, const char** property); +static int reduxGetMaxInit (int typecode, const char** property); +static int reduxGetAndInit (int typecode, const char** property); +static int reduxGetOrInit (int typecode, const char** property); +static int reduxIsFloatingPoint (int typecode); +static unsigned reduxCeilLog2 (uint64_t x); +static uint64_t reduxNextPow2 (uint64_t x); +static int reduxSortFlatInsensitive (const void* a, const void* b); +static int reduxSortFlatSensitive (const void* a, const void* b); +static int reduxSortPtrS0AbsStride (const void* a, const void* b); +static int reduxSortPtrByReduxNum (const void* a, const void* b); +static int reduxSortPtrD0WrSelect (const void* a, const void* b); +static int reduxSortPtrD1WrSelect (const void* a, const 
void* b); +static int reduxSortPtrInsertFinalOrder (const void* a, const void* b); /* Axis Description API */ -static void axisInit (axis_desc* axis, - ssize_t len, - ssize_t s0S); -static void axisMarkReduced (axis_desc* axis, int reduxNum); -static void axisMarkIntraBlock (axis_desc* axis, - int ibNum, - size_t ibLen); -static int axisGetReduxNum (const axis_desc* axis); -static size_t axisGetLen (const axis_desc* axis); -static size_t axisGetIntraLen (const axis_desc* axis); -static size_t axisGetInterLen (const axis_desc* axis); -static size_t axisGetIntraInterLen (const axis_desc* axis); -static ssize_t axisGetS0Stride (const axis_desc* axis); -static size_t axisGetS0AbsStride (const axis_desc* axis); -static ssize_t axisGetD0Stride (const axis_desc* axis); -static size_t axisGetD0AbsStride (const axis_desc* axis); -static ssize_t axisGetD1Stride (const axis_desc* axis); -static size_t axisGetD1AbsStride (const axis_desc* axis); -static size_t axisGetI0Stride (const axis_desc* axis); -static void axisSetI0Stride (axis_desc* axis, - size_t pdim); -static unsigned axisGetPerm (const axis_desc* axis); -static int axisGetIBNum (const axis_desc* axis); -static void axisSetPerm (axis_desc* axis, - unsigned ibp); -static int axisIsReduced (const axis_desc* axis); -static int axisIsIntra (const axis_desc* axis); -static int axisIsInter (const axis_desc* axis); -static int axisIsSplit (const axis_desc* axis); +static void axisInit (axis_desc* axis, + ssize_t len, + ssize_t s0S); +static void axisMarkReduced (axis_desc* axis, int reduxNum); +static void axisMarkIntraBlock (axis_desc* axis, + int ibNum, + size_t ibLen); +static int axisGetReduxNum (const axis_desc* axis); +static size_t axisGetLen (const axis_desc* axis); +static size_t axisGetIntraLen (const axis_desc* axis); +static size_t axisGetInterLen (const axis_desc* axis); +static size_t axisGetIntraInterLen (const axis_desc* axis); +static ssize_t axisGetS0Stride (const axis_desc* axis); +static size_t axisGetS0AbsStride (const axis_desc* axis); +static ssize_t axisGetD0Stride (const axis_desc* axis); +static size_t axisGetD0AbsStride (const axis_desc* axis); +static ssize_t axisGetD1Stride (const axis_desc* axis); +static size_t axisGetD1AbsStride (const axis_desc* axis); +static size_t axisGetI0Stride (const axis_desc* axis); +static void axisSetI0Stride (axis_desc* axis, + size_t pdim); +static unsigned axisGetPerm (const axis_desc* axis); +static int axisGetIBNum (const axis_desc* axis); +static void axisSetPerm (axis_desc* axis, + unsigned ibp); +static int axisIsReduced (const axis_desc* axis); +static int axisIsIntra (const axis_desc* axis); +static int axisIsInter (const axis_desc* axis); +static int axisIsSplit (const axis_desc* axis); /* Reduction Context API */ /* Generator Control Flow */ -static int reduxGenInit (GpuReduction* gr); -static int reduxGenInferProperties (GpuReduction* gr); -static void reduxGenSetMaxBS (GpuReduction* gr); -static void reduxGenSetKTypes (GpuReduction* gr); -static void reduxGenIterArgs (const GpuReduction* gr, - GpuReductionIterFn fn, - void* user); -static int reduxGenSrc (GpuReduction* gr); -static void reduxGenSrcAppend (GpuReduction* gr); -static void reduxGenSrcAppendIncludes (GpuReduction* gr); -static void reduxGenSrcAppendMacroTypedefs (GpuReduction* gr); -static void reduxGenSrcAppendReduxKernel (GpuReduction* gr); -static void reduxGenSrcAppendPrototype (GpuReduction* gr); -static void reduxGenSrcAppendDecode (GpuReduction* gr); -static void reduxGenSrcAppendPhase0 (GpuReduction* gr, 
- uint32_t selector); -static void reduxGenSrcAppendLoop (GpuReduction* gr, - uint32_t selector, - int initial); -static void reduxGenSrcAppendVertical (GpuReduction* gr, - uint32_t selector); -static void reduxGenSrcAppendIncrement (GpuReduction* gr, - uint32_t selector, - int initial, - int axis); -static void reduxGenSrcAppendDstWrite (GpuReduction* gr, - uint32_t selector, - int initial); -static void reduxGenSrcAppendPhase1 (GpuReduction* gr); -static int reduxGenSrcAxisIsHuge (GpuReduction* gr, - uint32_t selector, - int axis); -static int reduxGenSrcAxisIsSplit (GpuReduction* gr, - uint32_t selector, - int axis); -static int reduxGenCompile (GpuReduction* gr); -static int reduxGenComputeLaunchBounds (GpuReduction* gr); -static int reduxGenCleanup (GpuReduction* gr, int ret); -static int reduxGenCleanupMsg (GpuReduction* gr, int ret, - const char* fmt, ...); +static int reduxGenInit (GpuReduction* gr); +static int reduxGenInferProperties (GpuReduction* gr); +static void reduxGenSetMaxBS (GpuReduction* gr); +static void reduxGenSetKTypes (GpuReduction* gr); +static void reduxGenIterArgs (const GpuReduction* gr, + GpuReductionIterFn fn, + void* user); +static int reduxGenSrc (GpuReduction* gr); +static void reduxGenSrcAppend (GpuReduction* gr); +static void reduxGenSrcAppendIncludes (GpuReduction* gr); +static void reduxGenSrcAppendMacroTypedefs (GpuReduction* gr); +static void reduxGenSrcAppendReduxKernel (GpuReduction* gr); +static void reduxGenSrcAppendPrototype (GpuReduction* gr); +static void reduxGenSrcAppendDecode (GpuReduction* gr); +static void reduxGenSrcAppendPhase0 (GpuReduction* gr, + uint32_t selector); +static void reduxGenSrcAppendLoop (GpuReduction* gr, + uint32_t selector, + int initial); +static void reduxGenSrcAppendVertical (GpuReduction* gr, + uint32_t selector); +static void reduxGenSrcAppendIncrement (GpuReduction* gr, + uint32_t selector, + int initial, + int axis); +static void reduxGenSrcAppendDstWrite (GpuReduction* gr, + uint32_t selector, + int initial); +static void reduxGenSrcAppendPhase1 (GpuReduction* gr); +static int reduxGenSrcAxisIsHuge (GpuReduction* gr, + uint32_t selector, + int axis); +static int reduxGenSrcAxisIsSplit (GpuReduction* gr, + uint32_t selector, + int axis); +static int reduxGenCompile (GpuReduction* gr); +static int reduxGenComputeLaunchBounds (GpuReduction* gr); +static int reduxGenCleanup (GpuReduction* gr, int ret); +static int reduxGenCleanupMsg (GpuReduction* gr, int ret, + const char* fmt, ...); /* Generator Utilities */ -static void reduxGenCountArgs (const GpuReduction* gr, - int typecode, - const char* typeName, - const char* baseName, - int num, - void* user); -static void reduxGenSaveArgTypecodes (const GpuReduction* gr, - int typecode, - const char* typeName, - const char* baseName, - int num, - void* user); -static void reduxGenAppendArg (const GpuReduction* gr, - int typecode, - const char* typeName, - const char* baseName, - int num, - void* user); -static void reduxInvMarshalArg (const GpuReduction* gr, - int typecode, - const char* typeName, - const char* baseName, - int num, - void* user); -static size_t reduxGenEstimateParallelism (const GpuReduction* gr); -static int reduxGenRequiresS0 (const GpuReduction* gr); -static int reduxGenRequiresD0 (const GpuReduction* gr); -static int reduxGenRequiresD1 (const GpuReduction* gr); -static int reduxGenKernelRequiresLatticeS0(const GpuReduction* gr); -static int reduxGenKernelRequiresLatticeD0(const GpuReduction* gr); -static int reduxGenKernelRequiresLatticeD1(const 
GpuReduction* gr); -static int reduxGenKernelRequiresLatticeI0(const GpuReduction* gr); -static int reduxGenKernelRequiresStateK0 (const GpuReduction* gr); -static int reduxGenKernelRequiresStateK1 (const GpuReduction* gr); -static int reduxGenKernelRequiresWspace (const GpuReduction* gr); -static size_t reduxGenGetK0Size (const GpuReduction* gr); -static size_t reduxGenGetK0Align (const GpuReduction* gr); -static size_t reduxGenGetK1Size (const GpuReduction* gr); -static size_t reduxGenGetK1Align (const GpuReduction* gr); -static size_t reduxGenGetReduxStateSize (const GpuReduction* gr); -static size_t reduxGenGetMaxLocalSize (const GpuReduction* gr); -static size_t reduxGenGetSHMEMSize (const GpuReduction* gr, size_t cells); -static size_t reduxGenGetSHMEMK0Off (const GpuReduction* gr, size_t cells); -static size_t reduxGenGetSHMEMK1Off (const GpuReduction* gr, size_t cells); -static size_t reduxGenGetWMEMSize (const GpuReduction* gr, size_t cells); -static size_t reduxGenGetWMEMK0Off (const GpuReduction* gr, size_t cells); -static size_t reduxGenGetWMEMK1Off (const GpuReduction* gr, size_t cells); +static void reduxGenCountArgs (const GpuReduction* gr, + int typecode, + const char* typeName, + const char* baseName, + int num, + void* user); +static void reduxGenSaveArgTypecodes (const GpuReduction* gr, + int typecode, + const char* typeName, + const char* baseName, + int num, + void* user); +static void reduxGenAppendArg (const GpuReduction* gr, + int typecode, + const char* typeName, + const char* baseName, + int num, + void* user); +static void reduxInvMarshalArg (const GpuReduction* gr, + int typecode, + const char* typeName, + const char* baseName, + int num, + void* user); +static size_t reduxGenEstimateParallelism (const GpuReduction* gr); +static int reduxGenRequiresS0 (const GpuReduction* gr); +static int reduxGenRequiresD0 (const GpuReduction* gr); +static int reduxGenRequiresD1 (const GpuReduction* gr); +static int reduxGenKernelRequiresLatticeS0 (const GpuReduction* gr); +static int reduxGenKernelRequiresLatticeD0 (const GpuReduction* gr); +static int reduxGenKernelRequiresLatticeD1 (const GpuReduction* gr); +static int reduxGenKernelRequiresLatticeI0 (const GpuReduction* gr); +static int reduxGenKernelRequiresStateK0 (const GpuReduction* gr); +static int reduxGenKernelRequiresStateK1 (const GpuReduction* gr); +static int reduxGenKernelRequiresWspace (const GpuReduction* gr); +static size_t reduxGenGetK0Size (const GpuReduction* gr); +static size_t reduxGenGetK0Align (const GpuReduction* gr); +static size_t reduxGenGetK1Size (const GpuReduction* gr); +static size_t reduxGenGetK1Align (const GpuReduction* gr); +static size_t reduxGenGetReduxStateSize (const GpuReduction* gr); +static size_t reduxGenGetMaxLocalSize (const GpuReduction* gr); +static size_t reduxGenGetSHMEMSize (const GpuReduction* gr, size_t cells); +static size_t reduxGenGetSHMEMK0Off (const GpuReduction* gr, size_t cells); +static size_t reduxGenGetSHMEMK1Off (const GpuReduction* gr, size_t cells); +static size_t reduxGenGetWMEMSize (const GpuReduction* gr, size_t cells); +static size_t reduxGenGetWMEMK0Off (const GpuReduction* gr, size_t cells); +static size_t reduxGenGetWMEMK1Off (const GpuReduction* gr, size_t cells); /* Invoker Control Flow */ -static int reduxInvInit (redux_ctx* ctx); -static int reduxInvInferProperties (redux_ctx* ctx); -static int reduxInvFlattenSource (redux_ctx* ctx); -static int reduxInvComputeKernelArgs (redux_ctx* ctx); -static int reduxInvSchedule (redux_ctx* ctx); -static int 
reduxInvoke (redux_ctx* ctx); -static int reduxInvCleanup (redux_ctx* ctx, int ret); -static int reduxInvCleanupMsg (redux_ctx* ctx, int ret, - const char* fmt, ...); +static int reduxInvInit (redux_ctx* ctx); +static int reduxInvInferProperties (redux_ctx* ctx); +static int reduxInvFlattenSource (redux_ctx* ctx); +static int reduxInvComputeKernelArgs (redux_ctx* ctx); +static int reduxInvSchedule (redux_ctx* ctx); +static int reduxInvoke (redux_ctx* ctx); +static int reduxInvCleanup (redux_ctx* ctx, int ret); +static int reduxInvCleanupMsg (redux_ctx* ctx, int ret, + const char* fmt, ...); /* Invoker Utilities */ -static size_t reduxInvEstimateParallelism (const redux_ctx* ctx); -static int reduxInvRequiresS0 (const redux_ctx* ctx); -static int reduxInvRequiresD0 (const redux_ctx* ctx); -static int reduxInvRequiresD1 (const redux_ctx* ctx); -static axis_desc* reduxInvGetSrcAxis (const redux_ctx* ctx, int i); -static axis_desc* reduxInvGetSrcSortAxis (const redux_ctx* ctx, int i); -static int reduxTryFlattenOut (const redux_ctx* ctx, - const axis_desc* axis); -static int reduxTryFlattenInto (redux_ctx* ctx, - axis_desc* into, - const axis_desc* from); -static void reduxSortAxisPtrsBy (axis_desc** ptrs, - axis_desc* axes, - size_t numAxes, - int(*fn)(const void*, const void*)); +static size_t reduxInvEstimateParallelism (const redux_ctx* ctx); +static int reduxInvRequiresS0 (const redux_ctx* ctx); +static int reduxInvRequiresD0 (const redux_ctx* ctx); +static int reduxInvRequiresD1 (const redux_ctx* ctx); +static axis_desc* reduxInvGetSrcAxis (const redux_ctx* ctx, int i); +static axis_desc* reduxInvGetSrcSortAxis (const redux_ctx* ctx, int i); +static int reduxTryFlattenOut (const redux_ctx* ctx, + const axis_desc* axis); +static int reduxTryFlattenInto (redux_ctx* ctx, + axis_desc* into, + const axis_desc* from); +static void reduxSortAxisPtrsBy (axis_desc** ptrs, + axis_desc* axes, + size_t numAxes, + int(*fn)(const void*, const void*)); /* Function Implementations */ /* Extern Functions */ GPUARRAY_PUBLIC int GpuReductionAttr_new (GpuReductionAttr** grAttr, gpucontext* gpuCtx){ - if(!grAttr){ + if (!grAttr){ return GA_INVALID_ERROR; } - if(!gpuCtx){ + if (!gpuCtx){ *grAttr = NULL; return GA_INVALID_ERROR; } *grAttr = calloc(1, sizeof(**grAttr)); - if(!*grAttr){ + if (!*grAttr){ return GA_MEMORY_ERROR; } @@ -527,7 +527,7 @@ GPUARRAY_PUBLIC int GpuReductionAttr_setdims (GpuReductionAttr* } GPUARRAY_PUBLIC int GpuReductionAttr_sets0type (GpuReductionAttr* grAttr, int s0Typecode){ - switch(grAttr->op){ + switch (grAttr->op){ case GA_REDUCE_AND: case GA_REDUCE_OR: case GA_REDUCE_XOR: @@ -565,7 +565,7 @@ GPUARRAY_PUBLIC int GpuReductionAttr_seti0type (GpuReductionAttr* GPUARRAY_PUBLIC int GpuReductionAttr_appendopname (GpuReductionAttr* grAttr, size_t n, char* name){ - switch(grAttr->op){ + switch (grAttr->op){ case GA_REDUCE_COPY: return snprintf(name, n, "Copy_%d", grAttr->maxSrcDims); case GA_REDUCE_SUM: return snprintf(name, n, "Sum_%d_%d", grAttr->maxSrcDims, grAttr->maxDstDims); case GA_REDUCE_PROD: return snprintf(name, n, "Prod_%d_%d", grAttr->maxSrcDims, grAttr->maxDstDims); @@ -581,7 +581,7 @@ GPUARRAY_PUBLIC int GpuReductionAttr_appendopname (GpuReductionAttr* case GA_REDUCE_XOR: return snprintf(name, n, "Xor_%d_%d", grAttr->maxSrcDims, grAttr->maxDstDims); case GA_REDUCE_ALL: return snprintf(name, n, "All_%d_%d", grAttr->maxSrcDims, grAttr->maxDstDims); case GA_REDUCE_ANY: return snprintf(name, n, "Any_%d_%d", grAttr->maxSrcDims, grAttr->maxDstDims); - default: if(name && 
n>0){*name = '\0';} return GA_INVALID_ERROR; + default: if (name && n>0){*name = '\0';} return GA_INVALID_ERROR; } } GPUARRAY_PUBLIC int GpuReductionAttr_issensitive (const GpuReductionAttr* grAttr){ @@ -623,7 +623,7 @@ GPUARRAY_PUBLIC int GpuReductionAttr_issensitive (const GpuReductionAttr* } } GPUARRAY_PUBLIC int GpuReductionAttr_requiresS0 (const GpuReductionAttr* grAttr){ - switch(grAttr->op){ + switch (grAttr->op){ default: return 1; } } @@ -712,7 +712,7 @@ GPUARRAY_PUBLIC int GpuReduction_call (const GpuReduction* * @return Zero if successful; Non-zero if the datatype is not supported. */ -static int reduxGetSumInit (int typecode, const char** property){ +static int reduxGetSumInit (int typecode, const char** property){ if (typecode == GA_POINTER || typecode == GA_BUFFER){ return GA_UNSUPPORTED_ERROR; @@ -732,7 +732,7 @@ static int reduxGetSumInit (int typecode, const char** pro * @return Zero if successful; Non-zero if the datatype is not supported. */ -static int reduxGetProdInit (int typecode, const char** property){ +static int reduxGetProdInit (int typecode, const char** property){ if (typecode == GA_POINTER || typecode == GA_BUFFER){ return GA_UNSUPPORTED_ERROR; @@ -752,7 +752,7 @@ static int reduxGetProdInit (int typecode, const char** pro * @return Zero if successful; Non-zero if the datatype is not supported. */ -static int reduxGetMinInit (int typecode, const char** property){ +static int reduxGetMinInit (int typecode, const char** property){ switch (typecode){ case GA_BYTE2: case GA_BYTE3: @@ -842,7 +842,7 @@ static int reduxGetMinInit (int typecode, const char** pro * @return Zero if successful; Non-zero if the datatype is not supported. */ -static int reduxGetMaxInit (int typecode, const char** property){ +static int reduxGetMaxInit (int typecode, const char** property){ switch (typecode){ case GA_BOOL: *property = "1"; @@ -941,7 +941,7 @@ static int reduxGetMaxInit (int typecode, const char** pro * @return Zero if successful; Non-zero if the datatype is not supported. */ -static int reduxGetAndInit (int typecode, const char** property){ +static int reduxGetAndInit (int typecode, const char** property){ if (typecode == GA_POINTER || typecode == GA_BUFFER){ return GA_UNSUPPORTED_ERROR; @@ -961,7 +961,7 @@ static int reduxGetAndInit (int typecode, const char** pro * @return Zero if successful; Non-zero if the datatype is not supported. */ -static int reduxGetOrInit (int typecode, const char** property){ +static int reduxGetOrInit (int typecode, const char** property){ if (typecode == GA_POINTER || typecode == GA_BUFFER){ return GA_UNSUPPORTED_ERROR; @@ -974,8 +974,8 @@ static int reduxGetOrInit (int typecode, const char** pro * Whether or not the typecode is a floating-point type. */ -static int reduxIsFloatingPoint (int typecode){ - switch(typecode){ +static int reduxIsFloatingPoint (int typecode){ + switch (typecode){ case GA_HALF: case GA_HALF2: case GA_HALF4: @@ -1005,7 +1005,7 @@ static int reduxIsFloatingPoint (int typecode){ * Compute ceil(log2(x)). */ -static unsigned reduxCeilLog2 (uint64_t x){ +static unsigned reduxCeilLog2 (uint64_t x){ int i; if (x <= 1){ @@ -1021,7 +1021,7 @@ static unsigned reduxCeilLog2 (uint64_t x){ * If x is a power of two already, return x. */ -static uint64_t reduxNextPow2 (uint64_t x){ +static uint64_t reduxNextPow2 (uint64_t x){ if (x & (x-1)){ x |= x >> 1; x |= x >> 2; @@ -1057,7 +1057,7 @@ static uint64_t reduxNextPow2 (uint64_t x){ * 5. then by increasing source axis number. 
*/ -static int reduxSortFlatInsensitive (const void* a, const void* b){ +static int reduxSortFlatInsensitive (const void* a, const void* b){ const axis_desc* xda = (const axis_desc*)a; const axis_desc* xdb = (const axis_desc*)b; @@ -1075,7 +1075,7 @@ static int reduxSortFlatInsensitive (const void* a, const void* b){ return 0; } -static int reduxSortFlatSensitive (const void* a, const void* b){ +static int reduxSortFlatSensitive (const void* a, const void* b){ const axis_desc* xda = (const axis_desc*)a; const axis_desc* xdb = (const axis_desc*)b; @@ -1104,7 +1104,7 @@ static int reduxSortFlatSensitive (const void* a, const void* b){ * This means ascending order of absolute stride. */ -static int reduxSortPtrS0AbsStride (const void* a, const void* b){ +static int reduxSortPtrS0AbsStride (const void* a, const void* b){ const axis_desc* xda = *(const axis_desc* const*)a; const axis_desc* xdb = *(const axis_desc* const*)b; @@ -1116,7 +1116,7 @@ static int reduxSortPtrS0AbsStride (const void* a, const void* b){ return 0; } -static int reduxSortPtrByReduxNum (const void* a, const void* b){ +static int reduxSortPtrByReduxNum (const void* a, const void* b){ const axis_desc* xda = *(const axis_desc* const*)a; const axis_desc* xdb = *(const axis_desc* const*)b; @@ -1134,7 +1134,7 @@ static int reduxSortPtrByReduxNum (const void* a, const void* b){ return 0; } -static int reduxSortPtrD0WrSelect (const void* a, const void* b){ +static int reduxSortPtrD0WrSelect (const void* a, const void* b){ const axis_desc* xda = *(const axis_desc* const*)a; const axis_desc* xdb = *(const axis_desc* const*)b; @@ -1168,7 +1168,7 @@ static int reduxSortPtrD0WrSelect (const void* a, const void* b){ return 0; } -static int reduxSortPtrD1WrSelect (const void* a, const void* b){ +static int reduxSortPtrD1WrSelect (const void* a, const void* b){ const axis_desc* xda = *(const axis_desc* const*)a; const axis_desc* xdb = *(const axis_desc* const*)b; @@ -1202,7 +1202,7 @@ static int reduxSortPtrD1WrSelect (const void* a, const void* b){ return 0; } -static int reduxSortPtrInsertFinalOrder (const void* a, const void* b){ +static int reduxSortPtrInsertFinalOrder (const void* a, const void* b){ const axis_desc* xda = *(const axis_desc* const*)a; const axis_desc* xdb = *(const axis_desc* const*)b; @@ -1257,9 +1257,9 @@ static int reduxSortPtrInsertFinalOrder (const void* a, const void* b){ * @brief Initialize Axis Description. */ -static void axisInit (axis_desc* axis, - ssize_t len, - ssize_t s0S){ +static void axisInit (axis_desc* axis, + ssize_t len, + ssize_t s0S){ memset(axis, 0, sizeof(*axis)); axis->reduxNum = -1; @@ -1278,7 +1278,7 @@ static void axisInit (axis_desc* axis, * @brief Mark axis as reduction axis, with position reduxNum in the axis list. */ -static void axisMarkReduced (axis_desc* axis, int reduxNum){ +static void axisMarkReduced (axis_desc* axis, int reduxNum){ axis->isReduced = 1; axis->reduxNum = reduxNum; } @@ -1287,9 +1287,9 @@ static void axisMarkReduced (axis_desc* axis, int * @brief Mark axis as (split) intrablock axis. */ -static void axisMarkIntraBlock (axis_desc* axis, - int ibNum, - size_t ibLen){ +static void axisMarkIntraBlock (axis_desc* axis, + int ibNum, + size_t ibLen){ axis->isIntra = 1; axis->ibNum = ibNum; axis->splitLen = ibLen; @@ -1299,13 +1299,13 @@ static void axisMarkIntraBlock (axis_desc* axis, * @brief Get properties of an axis. 
*/ -static int axisGetReduxNum (const axis_desc* axis){ +static int axisGetReduxNum (const axis_desc* axis){ return axis->reduxNum; } -static size_t axisGetLen (const axis_desc* axis){ +static size_t axisGetLen (const axis_desc* axis){ return axis->len; } -static size_t axisGetIntraLen (const axis_desc* axis){ +static size_t axisGetIntraLen (const axis_desc* axis){ if (axisIsSplit(axis)){ return axis->splitLen; }else if (axisIsIntra(axis)){ @@ -1314,7 +1314,7 @@ static size_t axisGetIntraLen (const axis_desc* axis){ return 1; } } -static size_t axisGetInterLen (const axis_desc* axis){ +static size_t axisGetInterLen (const axis_desc* axis){ if (axisIsSplit(axis)){ return DIVIDECEIL(axis->len, axis->splitLen); }else if (axisIsIntra(axis)){ @@ -1323,69 +1323,69 @@ static size_t axisGetInterLen (const axis_desc* axis){ return axis->len; } } -static size_t axisGetIntraInterLen (const axis_desc* axis){ +static size_t axisGetIntraInterLen (const axis_desc* axis){ return axisGetIntraLen(axis)*axisGetInterLen(axis); } -static ssize_t axisGetS0Stride (const axis_desc* axis){ +static ssize_t axisGetS0Stride (const axis_desc* axis){ return axisGetLen(axis) > 1 ? axis->s0S : 0; } -static size_t axisGetS0AbsStride (const axis_desc* axis){ +static size_t axisGetS0AbsStride (const axis_desc* axis){ return axisGetS0Stride(axis)<0 ? -(size_t)axisGetS0Stride(axis): +(size_t)axisGetS0Stride(axis); } -static ssize_t axisGetD0Stride (const axis_desc* axis){ +static ssize_t axisGetD0Stride (const axis_desc* axis){ return axisGetLen(axis) > 1 ? axis->d0S : 0; } -static size_t axisGetD0AbsStride (const axis_desc* axis){ +static size_t axisGetD0AbsStride (const axis_desc* axis){ return axisGetD0Stride(axis)<0 ? -(size_t)axisGetD0Stride(axis): +(size_t)axisGetD0Stride(axis); } -static ssize_t axisGetD1Stride (const axis_desc* axis){ +static ssize_t axisGetD1Stride (const axis_desc* axis){ return axisGetLen(axis) > 1 ? axis->d1S : 0; } -static size_t axisGetD1AbsStride (const axis_desc* axis){ +static size_t axisGetD1AbsStride (const axis_desc* axis){ return axisGetD1Stride(axis)<0 ? 
-(size_t)axisGetD1Stride(axis): +(size_t)axisGetD1Stride(axis); } -static size_t axisGetI0Stride (const axis_desc* axis){ +static size_t axisGetI0Stride (const axis_desc* axis){ return axis->i0S; } -static void axisSetI0Stride (axis_desc* axis, - size_t i0S){ +static void axisSetI0Stride (axis_desc* axis, + size_t i0S){ axis->i0S = i0S; } -static unsigned axisGetPerm (const axis_desc* axis){ +static unsigned axisGetPerm (const axis_desc* axis){ return axis->perm; } -static int axisGetIBNum (const axis_desc* axis){ +static int axisGetIBNum (const axis_desc* axis){ return axis->ibNum; } -static void axisSetPerm (axis_desc* axis, - unsigned perm){ +static void axisSetPerm (axis_desc* axis, + unsigned perm){ axis->perm = perm; } -static int axisIsReduced (const axis_desc* axis){ +static int axisIsReduced (const axis_desc* axis){ return axis->isReduced; } -static int axisIsIntra (const axis_desc* axis){ +static int axisIsIntra (const axis_desc* axis){ return axis->isIntra; } -static int axisIsInter (const axis_desc* axis){ +static int axisIsInter (const axis_desc* axis){ return !axisIsIntra(axis); } -static int axisIsSplit (const axis_desc* axis){ +static int axisIsSplit (const axis_desc* axis){ return axisIsIntra(axis) && axis->splitLen != axis->len; } -static size_t reduxInvEstimateParallelism (const redux_ctx* ctx){ +static size_t reduxInvEstimateParallelism (const redux_ctx* ctx){ return reduxGenEstimateParallelism(ctx->gr); } -static int reduxInvRequiresS0 (const redux_ctx* ctx){ +static int reduxInvRequiresS0 (const redux_ctx* ctx){ return reduxGenRequiresS0(ctx->gr); } -static int reduxInvRequiresD0 (const redux_ctx* ctx){ +static int reduxInvRequiresD0 (const redux_ctx* ctx){ return reduxGenRequiresD0(ctx->gr); } -static int reduxInvRequiresD1 (const redux_ctx* ctx){ +static int reduxInvRequiresD1 (const redux_ctx* ctx){ return reduxGenRequiresD1(ctx->gr); } @@ -1393,7 +1393,7 @@ static int reduxInvRequiresD1 (const redux_ctx* ctx){ * @brief Get description of source axis with given number. */ -static axis_desc* reduxInvGetSrcAxis (const redux_ctx* ctx, int i){ +static axis_desc* reduxInvGetSrcAxis (const redux_ctx* ctx, int i){ return &ctx->xdSrc[i]; } @@ -1401,7 +1401,7 @@ static axis_desc* reduxInvGetSrcAxis (const redux_ctx* ctx, int i){ * @brief Get description of source axis with given number in sort-order. */ -static axis_desc* reduxInvGetSrcSortAxis (const redux_ctx* ctx, int i){ +static axis_desc* reduxInvGetSrcSortAxis (const redux_ctx* ctx, int i){ return ctx->xdSrcPtrs[i]; } @@ -1417,8 +1417,8 @@ static axis_desc* reduxInvGetSrcSortAxis (const redux_ctx* ctx, int i){ * @return Non-zero if flattening attempt successful; Zero otherwise. */ -static int reduxTryFlattenOut (const redux_ctx* ctx, - const axis_desc* axis){ +static int reduxTryFlattenOut (const redux_ctx* ctx, + const axis_desc* axis){ if ((axisGetLen (axis) == 1 )|| (axisIsReduced(axis) && ctx->zeroRdxAxes > 0)){ return 1; @@ -1448,9 +1448,9 @@ static int reduxTryFlattenOut (const redux_ctx* ctx, * @return Non-zero if flattening attempt successful; Zero otherwise. */ -static int reduxTryFlattenInto (redux_ctx* ctx, - axis_desc* into, - const axis_desc* from){ +static int reduxTryFlattenInto (redux_ctx* ctx, + axis_desc* into, + const axis_desc* from){ int signS0 = 0, signD0 = 0, signD1 = 0, reverseS0 = 0, reverseD0 = 0, reverseD1 = 0; @@ -1520,10 +1520,10 @@ static int reduxTryFlattenInto (redux_ctx* ctx, * not touching the axes themselves. 
*/ -static void reduxSortAxisPtrsBy (axis_desc** ptrs, - axis_desc* axes, - size_t numAxes, - int(*fn)(const void*, const void*)){ +static void reduxSortAxisPtrsBy (axis_desc** ptrs, + axis_desc* axes, + size_t numAxes, + int(*fn)(const void*, const void*)){ size_t i; for (i=0;ikArgTypeCodes = NULL; gr->kSourceCode = NULL; gr->kErrorString = NULL; @@ -1553,7 +1553,7 @@ static int reduxGenInit (GpuReduction* gr){ * @brief Begin inferring the properties of the reduction operator. */ -static int reduxGenInferProperties (GpuReduction* gr){ +static int reduxGenInferProperties (GpuReduction* gr){ int i; /** @@ -1609,7 +1609,7 @@ static int reduxGenInferProperties (GpuReduction* gr){ * Compute maximum block size we shall support in generated kernels. */ -static void reduxGenSetMaxBS (GpuReduction* gr){ +static void reduxGenSetMaxBS (GpuReduction* gr){ gr->maxBS = gr->grAttr.maxLM/reduxGenGetReduxStateSize(gr); gr->maxBS = gr->maxBS < gr->grAttr.maxLg ? gr->maxBS : gr->grAttr.maxLg; gr->maxBS = gr->maxBS < gr->grAttr.maxL0 ? gr->maxBS : gr->grAttr.maxL0; @@ -1658,7 +1658,7 @@ static void reduxGenSetMaxBS (GpuReduction* gr){ * For now we default TK1 to exactly TI0. */ -static void reduxGenSetKTypes (GpuReduction* gr){ +static void reduxGenSetKTypes (GpuReduction* gr){ const gpuarray_type *TK0 = NULL, *TK1 = NULL, *TPS0 = NULL; const char* TK0init = NULL; @@ -1769,9 +1769,9 @@ static void reduxGenSetKTypes (GpuReduction* gr){ * Iterate over the arguments of the reduction operator. */ -static void reduxGenIterArgs (const GpuReduction* gr, - GpuReductionIterFn fn, - void* user){ +static void reduxGenIterArgs (const GpuReduction* gr, + GpuReductionIterFn fn, + void* user){ int k; /** @@ -1872,7 +1872,7 @@ static void reduxGenIterArgs (const GpuReduction* gr, * @return GA_MEMORY_ERROR if not enough memory left; GA_NO_ERROR otherwise. */ -static int reduxGenSrc (GpuReduction* gr){ +static int reduxGenSrc (GpuReduction* gr){ GpuReductionAttr_appendopname(&gr->grAttr, sizeof(gr->kName), gr->kName); reduxGenSrcAppend(gr); @@ -1893,19 +1893,19 @@ static int reduxGenSrc (GpuReduction* gr){ * @brief Append source code to the string buffer. */ -static void reduxGenSrcAppend (GpuReduction* gr){ +static void reduxGenSrcAppend (GpuReduction* gr){ reduxGenSrcAppendIncludes (gr); reduxGenSrcAppendMacroTypedefs(gr); reduxGenSrcAppendReduxKernel (gr); } -static void reduxGenSrcAppendIncludes (GpuReduction* gr){ +static void reduxGenSrcAppendIncludes (GpuReduction* gr){ srcbAppends(&gr->srcGen, "/* Includes */\n"); srcbAppends(&gr->srcGen, "#include \"cluda.h\"\n"); srcbAppends(&gr->srcGen, "\n"); srcbAppends(&gr->srcGen, "\n"); srcbAppends(&gr->srcGen, "\n"); } -static void reduxGenSrcAppendMacroTypedefs(GpuReduction* gr){ +static void reduxGenSrcAppendMacroTypedefs (GpuReduction* gr){ /** * Typedefs of various types. 
*/ @@ -2184,7 +2184,7 @@ static void reduxGenSrcAppendMacroTypedefs(GpuReduction* gr){ srcbAppends(&gr->srcGen, "#define DIVIDECEIL(a,b) (((a)+(b)-1)/(b))\n\n\n\n\n"); } -static void reduxGenSrcAppendReduxKernel (GpuReduction* gr){ +static void reduxGenSrcAppendReduxKernel (GpuReduction* gr){ reduxGenSrcAppendPrototype (gr); srcbAppends (&gr->srcGen, "{\n"); reduxGenSrcAppendDecode (gr); @@ -2229,7 +2229,7 @@ static void reduxGenSrcAppendReduxKernel (GpuReduction* gr){ srcbAppends (&gr->srcGen, " }\n"); srcbAppends (&gr->srcGen, "}\n"); } -static void reduxGenSrcAppendPrototype (GpuReduction* gr){ +static void reduxGenSrcAppendPrototype (GpuReduction* gr){ int i=0; srcbAppendf(&gr->srcGen, @@ -2244,7 +2244,7 @@ static void reduxGenSrcAppendPrototype (GpuReduction* gr){ reduxGenIterArgs(gr, reduxGenAppendArg, &i); srcbAppends(&gr->srcGen, ")"); } -static void reduxGenSrcAppendDecode (GpuReduction* gr){ +static void reduxGenSrcAppendDecode (GpuReduction* gr){ int i; srcbAppends(&gr->srcGen, @@ -2635,8 +2635,8 @@ static void reduxGenSrcAppendDecode (GpuReduction* gr){ " \n" " \n"); } -static void reduxGenSrcAppendPhase0 (GpuReduction* gr, - uint32_t selector){ +static void reduxGenSrcAppendPhase0 (GpuReduction* gr, + uint32_t selector){ int i; const char* type; @@ -2671,9 +2671,9 @@ static void reduxGenSrcAppendPhase0 (GpuReduction* gr, " }\n" " }\n"); } -static void reduxGenSrcAppendLoop (GpuReduction* gr, - uint32_t selector, - int initial){ +static void reduxGenSrcAppendLoop (GpuReduction* gr, + uint32_t selector, + int initial){ int i; srcbAppends(&gr->srcGen, " while(v > 0){v--;\n"); @@ -2690,8 +2690,8 @@ static void reduxGenSrcAppendLoop (GpuReduction* gr, srcbAppends(&gr->srcGen, " break;\n" " }\n"); } -static void reduxGenSrcAppendVertical (GpuReduction* gr, - uint32_t selector){ +static void reduxGenSrcAppendVertical (GpuReduction* gr, + uint32_t selector){ int i = (selector&SELECTOR_SPLIT_FREE) ? gr->ndd-1 : gr->nds-1; if (i >= 0){ @@ -2704,10 +2704,10 @@ static void reduxGenSrcAppendVertical (GpuReduction* gr, " REDUX(K0, K1, tmpK0, I0);\n"); } } -static void reduxGenSrcAppendIncrement (GpuReduction* gr, - uint32_t selector, - int initial, - int axis){ +static void reduxGenSrcAppendIncrement (GpuReduction* gr, + uint32_t selector, + int initial, + int axis){ const char* cast = reduxGenSrcAxisIsHuge(gr, selector, axis) ? "TS64" : "TS32"; const char* breakOrCont = (initial) && (axis < gr->ndd) ? 
"break " : "continue"; @@ -2743,9 +2743,9 @@ static void reduxGenSrcAppendIncrement (GpuReduction* gr, axis, cast, axis, breakOrCont, axis, axis); } } -static void reduxGenSrcAppendDstWrite (GpuReduction* gr, - uint32_t selector, - int initial){ +static void reduxGenSrcAppendDstWrite (GpuReduction* gr, + uint32_t selector, + int initial){ srcbAppends(&gr->srcGen, " local_barrier();\n"); if (initial){ srcbAppends(&gr->srcGen, " if(LID_0 < D){\n" @@ -2775,7 +2775,7 @@ static void reduxGenSrcAppendDstWrite (GpuReduction* gr, } srcbAppends(&gr->srcGen, " local_barrier();\n"); } -static void reduxGenSrcAppendPhase1 (GpuReduction* gr){ +static void reduxGenSrcAppendPhase1 (GpuReduction* gr){ /** * PHASE 1 * @@ -2814,9 +2814,9 @@ static void reduxGenSrcAppendPhase1 (GpuReduction* gr){ " }\n"); } } -static int reduxGenSrcAxisIsHuge (GpuReduction* gr, - uint32_t selector, - int axis){ +static int reduxGenSrcAxisIsHuge (GpuReduction* gr, + uint32_t selector, + int axis){ int hugeType = selector & SELECTOR_HUGE_AXIS; int isSplitFree = !!(selector & SELECTOR_SPLIT_FREE); int isAxisFree = axis < gr->ndd; @@ -2847,9 +2847,9 @@ static int reduxGenSrcAxisIsHuge (GpuReduction* gr, return 0; } } -static int reduxGenSrcAxisIsSplit (GpuReduction* gr, - uint32_t selector, - int axis){ +static int reduxGenSrcAxisIsSplit (GpuReduction* gr, + uint32_t selector, + int axis){ return ( (selector & SELECTOR_SPLIT_FREE) && axis == gr->ndd-1) || (!(selector & SELECTOR_SPLIT_FREE) && axis == gr->nds-1); } @@ -2858,7 +2858,7 @@ static int reduxGenSrcAxisIsSplit (GpuReduction* gr, * @brief Compile the generated kernel. */ -static int reduxGenCompile (GpuReduction* gr){ +static int reduxGenCompile (GpuReduction* gr){ int ret, flags = 0; flags |= GA_USE_CLUDA; @@ -2896,7 +2896,7 @@ static int reduxGenCompile (GpuReduction* gr){ * support launching. */ -static int reduxGenComputeLaunchBounds (GpuReduction* gr){ +static int reduxGenComputeLaunchBounds (GpuReduction* gr){ int ret; /** @@ -2919,7 +2919,7 @@ static int reduxGenComputeLaunchBounds (GpuReduction* gr){ * @brief Cleanup generator context. */ -static int reduxGenCleanup (GpuReduction* gr, int ret){ +static int reduxGenCleanup (GpuReduction* gr, int ret){ if (ret != GA_NO_ERROR){ free(gr->kArgTypeCodes); free(gr->kSourceCode); @@ -2931,8 +2931,8 @@ static int reduxGenCleanup (GpuReduction* gr, int ret) return ret; } -static int reduxGenCleanupMsg (GpuReduction* gr, int ret, - const char* fmt, ...){ +static int reduxGenCleanupMsg (GpuReduction* gr, int ret, + const char* fmt, ...){ #if DEBUG FILE* fp = stderr; @@ -2952,12 +2952,12 @@ static int reduxGenCleanupMsg (GpuReduction* gr, int ret, * Count # of arguments as determined by iterator. */ -static void reduxGenCountArgs (const GpuReduction* gr, - int typecode, - const char* typeName, - const char* baseName, - int num, - void* user){ +static void reduxGenCountArgs (const GpuReduction* gr, + int typecode, + const char* typeName, + const char* baseName, + int num, + void* user){ (void)gr; (void)typecode; (void)typeName; @@ -2971,12 +2971,12 @@ static void reduxGenCountArgs (const GpuReduction* gr, * Record the typecodes in the arguments typecode array. 
*/ -static void reduxGenSaveArgTypecodes (const GpuReduction* gr, - int typecode, - const char* typeName, - const char* baseName, - int num, - void* user){ +static void reduxGenSaveArgTypecodes (const GpuReduction* gr, + int typecode, + const char* typeName, + const char* baseName, + int num, + void* user){ (void)typeName; (void)baseName; (void)num; @@ -2989,12 +2989,12 @@ static void reduxGenSaveArgTypecodes (const GpuReduction* gr, * Append an argument declaration to prototype. */ -static void reduxGenAppendArg (const GpuReduction* gr, - int typecode, - const char* typeName, - const char* baseName, - int num, - void* user){ +static void reduxGenAppendArg (const GpuReduction* gr, + int typecode, + const char* typeName, + const char* baseName, + int num, + void* user){ (void)user; (void)typecode; @@ -3009,12 +3009,12 @@ static void reduxGenAppendArg (const GpuReduction* gr, * Marshall argument declaration during invocation. */ -static void reduxInvMarshalArg (const GpuReduction* gr, - int typecode, - const char* typeName, - const char* baseName, - int num, - void* user){ +static void reduxInvMarshalArg (const GpuReduction* gr, + int typecode, + const char* typeName, + const char* baseName, + int num, + void* user){ redux_ctx* ctx; int* i, k = num; @@ -3098,7 +3098,7 @@ static void reduxInvMarshalArg (const GpuReduction* gr, * device, plus some substantial margin. */ -static size_t reduxGenEstimateParallelism (const GpuReduction* gr){ +static size_t reduxGenEstimateParallelism (const GpuReduction* gr){ /** * An arbitrary margin factor ensuring there will be a few thread blocks * per SMX. @@ -3166,34 +3166,34 @@ static size_t reduxGenEstimateParallelism (const GpuReduction* gr){ * initialization operations, the above might not necessarily hold anymore. 
*/ -static int reduxGenRequiresS0 (const GpuReduction* gr){ +static int reduxGenRequiresS0 (const GpuReduction* gr){ return GpuReductionAttr_requiresS0(&gr->grAttr); } -static int reduxGenRequiresD0 (const GpuReduction* gr){ +static int reduxGenRequiresD0 (const GpuReduction* gr){ return GpuReductionAttr_requiresD0(&gr->grAttr); } -static int reduxGenRequiresD1 (const GpuReduction* gr){ +static int reduxGenRequiresD1 (const GpuReduction* gr){ return GpuReductionAttr_requiresD1(&gr->grAttr); } -static int reduxGenKernelRequiresLatticeS0(const GpuReduction* gr){ +static int reduxGenKernelRequiresLatticeS0 (const GpuReduction* gr){ return reduxGenRequiresS0(gr); } -static int reduxGenKernelRequiresLatticeD0(const GpuReduction* gr){ +static int reduxGenKernelRequiresLatticeD0 (const GpuReduction* gr){ return reduxGenRequiresD0(gr); } -static int reduxGenKernelRequiresLatticeD1(const GpuReduction* gr){ +static int reduxGenKernelRequiresLatticeD1 (const GpuReduction* gr){ return reduxGenRequiresD1(gr); } -static int reduxGenKernelRequiresLatticeI0(const GpuReduction* gr){ +static int reduxGenKernelRequiresLatticeI0 (const GpuReduction* gr){ return reduxGenRequiresD1(gr); } -static int reduxGenKernelRequiresStateK0 (const GpuReduction* gr){ +static int reduxGenKernelRequiresStateK0 (const GpuReduction* gr){ return reduxGenKernelRequiresLatticeS0(gr); } -static int reduxGenKernelRequiresStateK1 (const GpuReduction* gr){ +static int reduxGenKernelRequiresStateK1 (const GpuReduction* gr){ return reduxGenKernelRequiresLatticeI0(gr); } -static int reduxGenKernelRequiresWspace (const GpuReduction* gr){ +static int reduxGenKernelRequiresWspace (const GpuReduction* gr){ (void)gr; return 1; } @@ -3203,16 +3203,16 @@ static int reduxGenKernelRequiresWspace (const GpuReduction* gr){ * Get size and alignment requirements of K0 and K1 states. */ -static size_t reduxGenGetK0Size (const GpuReduction* gr){ +static size_t reduxGenGetK0Size (const GpuReduction* gr){ return gr->TK0.size; } -static size_t reduxGenGetK0Align (const GpuReduction* gr){ +static size_t reduxGenGetK0Align (const GpuReduction* gr){ return gr->TK0.align; } -static size_t reduxGenGetK1Size (const GpuReduction* gr){ +static size_t reduxGenGetK1Size (const GpuReduction* gr){ return gr->TK1.size; } -static size_t reduxGenGetK1Align (const GpuReduction* gr){ +static size_t reduxGenGetK1Align (const GpuReduction* gr){ return gr->TK1.align; } @@ -3220,7 +3220,7 @@ static size_t reduxGenGetK1Align (const GpuReduction* gr){ * @brief Get the number of bytes of workspace per (partial) reduction per thread. */ -static size_t reduxGenGetReduxStateSize (const GpuReduction* gr){ +static size_t reduxGenGetReduxStateSize (const GpuReduction* gr){ size_t total = 0, idxSize = gpuarray_get_elsize(gr->TS64tc); /* The accumulator and index types can be wider than dst/dstArg's types. */ @@ -3238,7 +3238,7 @@ static size_t reduxGenGetReduxStateSize (const GpuReduction* gr){ * @brief Get the maximum number of threads this operator's kernel can handle. */ -static size_t reduxGenGetMaxLocalSize (const GpuReduction* gr){ +static size_t reduxGenGetMaxLocalSize (const GpuReduction* gr){ return gr->maxLK; } @@ -3246,7 +3246,7 @@ static size_t reduxGenGetMaxLocalSize (const GpuReduction* gr){ * @brief Get the shared memory consumption for a given block size. 
*/ -static size_t reduxGenGetSHMEMSize (const GpuReduction* gr, size_t cells){ +static size_t reduxGenGetSHMEMSize (const GpuReduction* gr, size_t cells){ size_t total = 0, totalPermute; /* Compute size of SHMEM working space */ @@ -3264,7 +3264,7 @@ static size_t reduxGenGetSHMEMSize (const GpuReduction* gr, size * @brief Get the shared memory byte offset for the k0 and k1 states. */ -static size_t reduxGenGetSHMEMK0Off (const GpuReduction* gr, size_t cells){ +static size_t reduxGenGetSHMEMK0Off (const GpuReduction* gr, size_t cells){ if (!reduxGenKernelRequiresWspace (gr)|| !reduxGenKernelRequiresStateK0(gr)|| !reduxGenKernelRequiresStateK1(gr)){ @@ -3277,7 +3277,7 @@ static size_t reduxGenGetSHMEMK0Off (const GpuReduction* gr, size return cells*reduxGenGetK1Size(gr); } } -static size_t reduxGenGetSHMEMK1Off (const GpuReduction* gr, size_t cells){ +static size_t reduxGenGetSHMEMK1Off (const GpuReduction* gr, size_t cells){ if (!reduxGenKernelRequiresWspace (gr)|| !reduxGenKernelRequiresStateK0(gr)|| !reduxGenKernelRequiresStateK1(gr)){ @@ -3298,7 +3298,7 @@ static size_t reduxGenGetSHMEMK1Off (const GpuReduction* gr, size * intrablock offset permutes, for instance. */ -static size_t reduxGenGetWMEMSize (const GpuReduction* gr, size_t cells){ +static size_t reduxGenGetWMEMSize (const GpuReduction* gr, size_t cells){ size_t total = 0; total += reduxGenKernelRequiresStateK0(gr) ? cells*reduxGenGetK0Size(gr) : 0; @@ -3311,10 +3311,10 @@ static size_t reduxGenGetWMEMSize (const GpuReduction* gr, size * @brief Get the workspace memory byte offset for the k0 and k1 states. */ -static size_t reduxGenGetWMEMK0Off (const GpuReduction* gr, size_t cells){ +static size_t reduxGenGetWMEMK0Off (const GpuReduction* gr, size_t cells){ return reduxGenGetSHMEMK0Off(gr, cells); } -static size_t reduxGenGetWMEMK1Off (const GpuReduction* gr, size_t cells){ +static size_t reduxGenGetWMEMK1Off (const GpuReduction* gr, size_t cells){ return reduxGenGetSHMEMK1Off(gr, cells); } @@ -3324,7 +3324,7 @@ static size_t reduxGenGetWMEMK1Off (const GpuReduction* gr, size * After this function, calling reduxInvCleanup*() becomes safe. */ -static int reduxInvInit (redux_ctx* ctx){ +static int reduxInvInit (redux_ctx* ctx){ ctx->L = NULL; ctx->Li = NULL; ctx->S0J = ctx->S0Si = NULL; @@ -3349,7 +3349,7 @@ static int reduxInvInit (redux_ctx* ctx){ * @brief Begin inferring the properties of the reduction invocation. */ -static int reduxInvInferProperties (redux_ctx* ctx){ +static int reduxInvInferProperties (redux_ctx* ctx){ axis_desc* a; int i, j; size_t d; @@ -3530,7 +3530,7 @@ static int reduxInvInferProperties (redux_ctx* ctx){ * contiguous as possible. */ -static int reduxInvFlattenSource (redux_ctx* ctx){ +static int reduxInvFlattenSource (redux_ctx* ctx){ axis_desc* axis, *flatAxis, *sortAxis; int i, j, k, isSensitive; @@ -3595,7 +3595,7 @@ static int reduxInvFlattenSource (redux_ctx* ctx){ * criteria. */ -static int reduxInvComputeKernelArgs (redux_ctx* ctx){ +static int reduxInvComputeKernelArgs (redux_ctx* ctx){ axis_desc* axis, *prevAxis; size_t target, aL, aLS, perm, i0S; int i, j, haveSplitFreeAxis, haveSplitReducedAxis; @@ -4008,7 +4008,7 @@ static int reduxInvSchedule (redux_ctx* ctx){ * @brief Invoke the kernel. 
*/ -static int reduxInvoke (redux_ctx* ctx){ +static int reduxInvoke (redux_ctx* ctx){ int ret, i=0; void* ptrs[2] = {ctx, &i}; @@ -4049,7 +4049,7 @@ static int reduxInvoke (redux_ctx* ctx){ * Cleanup */ -static int reduxInvCleanup (redux_ctx* ctx, int ret){ +static int reduxInvCleanup (redux_ctx* ctx, int ret){ ctx->gr = NULL; ctx->s0 = NULL; ctx->d0 = NULL; @@ -4090,8 +4090,8 @@ static int reduxInvCleanup (redux_ctx* ctx, int ret) return ret; } -static int reduxInvCleanupMsg (redux_ctx* ctx, int ret, - const char* fmt, ...){ +static int reduxInvCleanupMsg (redux_ctx* ctx, int ret, + const char* fmt, ...){ #if DEBUG FILE* fp = stderr; From 8f5250e732f0e0054c323dd37a8f5d9e0c8c2c40 Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Sun, 23 Jul 2017 00:54:10 -0400 Subject: [PATCH 27/34] Muzzle -Wdeclaration-after-statement in check_reduction.c. There is now not a single -Wdeclaration-after-statement warning origination in that file. --- tests/check_reduction.c | 791 ++++++++++++++++++---------------------- 1 file changed, 360 insertions(+), 431 deletions(-) diff --git a/tests/check_reduction.c b/tests/check_reduction.c index 6a1e8c6a97..60411ead57 100644 --- a/tests/check_reduction.c +++ b/tests/check_reduction.c @@ -68,13 +68,16 @@ static double pcgRand01(void){ */ START_TEST(test_maxandargmax_reduction){ - pcgSeed(1); - /** * We test here a reduction of some random 3D tensor on the first and * third dimensions. */ - + + GpuArray gaS0; + GpuArray gaD0; + GpuArray gaD1; + GpuReductionAttr* grAttr; + GpuReduction* gr; size_t errCnt = 0; size_t i,j,k; size_t dims[3] = {32,50,79}; @@ -93,7 +96,8 @@ START_TEST(test_maxandargmax_reduction){ /** * Initialize source data. */ - + + pcgSeed(1); for(i=0;i= TOL){ errCnt++; if(errCnt <= MAXERRPRINT){ @@ -2305,13 +2273,15 @@ START_TEST(test_sum_reduction){ }END_TEST START_TEST(test_sum_veryhighrank){ - pcgSeed(1); - /** * Here we test a reduction of a random 8D tensor on four dimensions. */ - size_t errCnt = 0; + GpuArray gaS0; + GpuArray gaD0; + GpuReductionAttr* grAttr; + GpuReduction* gr; + size_t errCnt = 0, dstIdx; size_t i,j,k,l,m,n,o,p; size_t dims [8] = {1171,373,2,1,2,1,2,1}; size_t prodDims = dims[0]*dims[1]*dims[2]*dims[3]*dims[4]*dims[5]*dims[6]*dims[7]; @@ -2331,6 +2301,7 @@ START_TEST(test_sum_veryhighrank){ * Initialize source data. */ + pcgSeed(1); for(i=0;i= TOL){ errCnt++; if(errCnt <= MAXERRPRINT){ @@ -2414,18 +2380,21 @@ START_TEST(test_sum_veryhighrank){ }END_TEST START_TEST(test_sum_alldimsreduced){ - pcgSeed(1); - /** * We test here a reduction of some random 3D tensor on all dimensions. */ + GpuArray gaS0; + GpuArray gaD0; + GpuReductionAttr* grAttr; + GpuReduction* gr; size_t errCnt = 0; size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; const int reduxList[] = {0,1,2}; const float TOL = 1e-4; + float gtD0; float* pS0 = calloc(1, sizeof(*pS0) * dims[0]*dims[1]*dims[2]); float* pD0 = calloc(1, sizeof(*pD0) ); @@ -2438,6 +2407,7 @@ START_TEST(test_sum_alldimsreduced){ * Initialize source data. */ + pcgSeed(1); for(i=0;i= TOL){ errCnt++; if(errCnt <= MAXERRPRINT){ @@ -2692,13 +2654,15 @@ START_TEST(test_prod_reduction){ }END_TEST START_TEST(test_prod_veryhighrank){ - pcgSeed(1); - /** * Here we test a reduction of a random 8D tensor on four dimensions. 
*/ - size_t errCnt = 0; + GpuArray gaS0; + GpuArray gaD0; + GpuReductionAttr* grAttr; + GpuReduction* gr; + size_t errCnt = 0, dstIdx; size_t i,j,k,l,m,n,o,p; size_t dims [8] = {1171,373,2,1,2,1,2,1}; size_t prodDims = dims[0]*dims[1]*dims[2]*dims[3]*dims[4]*dims[5]*dims[6]*dims[7]; @@ -2718,6 +2682,7 @@ START_TEST(test_prod_veryhighrank){ * Initialize source data. */ + pcgSeed(1); for(i=0;i= TOL){ errCnt++; if(errCnt <= MAXERRPRINT){ @@ -2801,18 +2761,21 @@ START_TEST(test_prod_veryhighrank){ }END_TEST START_TEST(test_prod_alldimsreduced){ - pcgSeed(1); - /** * We test here a reduction of some random 3D tensor on all dimensions. */ + GpuArray gaS0; + GpuArray gaD0; + GpuReductionAttr* grAttr; + GpuReduction* gr; size_t errCnt = 0; size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; const int reduxList[] = {0,1,2}; const float TOL = 1e-4; + float gtD0; float* pS0 = calloc(1, sizeof(*pS0) * dims[0]*dims[1]*dims[2]); float* pD0 = calloc(1, sizeof(*pD0) ); @@ -2825,6 +2788,7 @@ START_TEST(test_prod_alldimsreduced){ * Initialize source data. */ + pcgSeed(1); for(i=0;i= TOL){ errCnt++; if(errCnt <= MAXERRPRINT){ @@ -2993,13 +2950,15 @@ START_TEST(test_prodnz_reduction){ }END_TEST START_TEST(test_prodnz_veryhighrank){ - pcgSeed(1); - /** * Here we test a reduction of a random 8D tensor on four dimensions. */ - size_t errCnt = 0; + GpuArray gaS0; + GpuArray gaD0; + GpuReductionAttr* grAttr; + GpuReduction* gr; + size_t errCnt = 0, dstIdx; size_t i,j,k,l,m,n,o,p; size_t dims [8] = {1171,373,2,1,2,1,2,1}; size_t prodDims = dims[0]*dims[1]*dims[2]*dims[3]*dims[4]*dims[5]*dims[6]*dims[7]; @@ -3019,6 +2978,7 @@ START_TEST(test_prodnz_veryhighrank){ * Initialize source data. */ + pcgSeed(1); for(i=0;i= TOL){ errCnt++; if(errCnt <= MAXERRPRINT){ @@ -3105,18 +3060,21 @@ START_TEST(test_prodnz_veryhighrank){ }END_TEST START_TEST(test_prodnz_alldimsreduced){ - pcgSeed(1); - /** * We test here a reduction of some random 3D tensor on all dimensions. */ + GpuArray gaS0; + GpuArray gaD0; + GpuReductionAttr* grAttr; + GpuReduction* gr; size_t errCnt = 0; size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; const int reduxList[] = {0,1,2}; const float TOL = 1e-4; + float gtD0; float* pS0 = calloc(1, sizeof(*pS0) * dims[0]*dims[1]*dims[2]); float* pD0 = calloc(1, sizeof(*pD0) ); @@ -3129,6 +3087,7 @@ START_TEST(test_prodnz_alldimsreduced){ * Initialize source data. */ + pcgSeed(1); for(i=0;i Date: Tue, 25 Jul 2017 15:42:36 -0400 Subject: [PATCH 28/34] Easy feedback fixes applied. --- src/gpuarray_reduction.c | 39 +++++++++++++++++---------------------- 1 file changed, 17 insertions(+), 22 deletions(-) diff --git a/src/gpuarray_reduction.c b/src/gpuarray_reduction.c index 6d2e6fc17f..092b8f8509 100644 --- a/src/gpuarray_reduction.c +++ b/src/gpuarray_reduction.c @@ -248,8 +248,6 @@ struct GpuReductionAttr{ struct GpuReduction{ /* Function Arguments. 
*/ GpuReductionAttr grAttr; - gpucontext* gpuCtx; - ga_reduce_op op; int nds; int ndd; int ndr; @@ -652,6 +650,8 @@ GPUARRAY_PUBLIC void GpuReductionAttr_free (GpuReductionAttr* } GPUARRAY_PUBLIC int GpuReduction_new (GpuReduction** gr, const GpuReductionAttr* grAttr){ + GpuReduction* grOut = NULL; + if (!gr){ return GA_INVALID_ERROR; } @@ -660,16 +660,14 @@ GPUARRAY_PUBLIC int GpuReduction_new (GpuReduction** return GA_INVALID_ERROR; } - *gr = calloc(1, sizeof(**gr)); - if (*gr){ - (*gr)->grAttr = *grAttr; - (*gr)->gpuCtx = grAttr->gpuCtx; - (*gr)->op = grAttr->op; - (*gr)->nds = (int)grAttr->maxSrcDims; - (*gr)->ndd = (int)grAttr->maxDstDims; - (*gr)->ndr = (int)(grAttr->maxSrcDims-grAttr->maxDstDims); + grOut = calloc(1, sizeof(*grOut)); + if (grOut){ + grOut->grAttr = *grAttr; + grOut->nds = (int)grAttr->maxSrcDims; + grOut->ndd = (int)grAttr->maxDstDims; + grOut->ndr = (int)(grAttr->maxSrcDims - grAttr->maxDstDims); - return reduxGenInit(*gr); + return reduxGenInit(grOut); }else{ return GA_MEMORY_ERROR; } @@ -684,7 +682,8 @@ GPUARRAY_PUBLIC int GpuReduction_call (const GpuReduction* unsigned reduxLen, const int* reduxList, int flags){ - redux_ctx ctxSTACK, *ctx = &ctxSTACK; + redux_ctx ctxSTACK; + redux_ctx *ctx = &ctxSTACK; memset(ctx, 0, sizeof(*ctx)); ctx->gr = gr; @@ -713,8 +712,7 @@ GPUARRAY_PUBLIC int GpuReduction_call (const GpuReduction* */ static int reduxGetSumInit (int typecode, const char** property){ - if (typecode == GA_POINTER || - typecode == GA_BUFFER){ + if (typecode < 0){ return GA_UNSUPPORTED_ERROR; } *property = "0"; @@ -733,8 +731,7 @@ static int reduxGetSumInit (int typecode, const char** */ static int reduxGetProdInit (int typecode, const char** property){ - if (typecode == GA_POINTER || - typecode == GA_BUFFER){ + if (typecode < 0){ return GA_UNSUPPORTED_ERROR; } *property = "1"; @@ -942,8 +939,7 @@ static int reduxGetMaxInit (int typecode, const char** */ static int reduxGetAndInit (int typecode, const char** property){ - if (typecode == GA_POINTER || - typecode == GA_BUFFER){ + if (typecode < 0){ return GA_UNSUPPORTED_ERROR; } *property = "~0"; @@ -962,8 +958,7 @@ static int reduxGetAndInit (int typecode, const char** */ static int reduxGetOrInit (int typecode, const char** property){ - if (typecode == GA_POINTER || - typecode == GA_BUFFER){ + if (typecode < 0){ return GA_UNSUPPORTED_ERROR; } *property = "0"; @@ -2867,7 +2862,7 @@ static int reduxGenCompile (GpuReduction* gr){ } ret = GpuKernel_init(&gr->k, - gr->gpuCtx, + gr->grAttr.gpuCtx, 1, (const char**)&gr->kSourceCode, &gr->kSourceCodeLen, @@ -3994,7 +3989,7 @@ static int reduxInvSchedule (redux_ctx* ctx){ ctx->W0Off = reduxGenGetWMEMK0Off(ctx->gr, 2*ctx->gs*ctx->D); ctx->W1Off = reduxGenGetWMEMK1Off(ctx->gr, 2*ctx->gs*ctx->D); WSPACESIZE = reduxGenGetWMEMSize (ctx->gr, 2*ctx->gs*ctx->D); - ctx->W = gpudata_alloc(ctx->gr->gpuCtx, WSPACESIZE, 0, flags, 0); + ctx->W = gpudata_alloc(ctx->gr->grAttr.gpuCtx, WSPACESIZE, 0, flags, 0); if (!ctx->W){ return reduxInvCleanupMsg(ctx, GA_MEMORY_ERROR, "Could not allocate %zu-byte workspace for reduction!\n", From f129c69073f0e1f3f0f8c6559a7aa68271323337 Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Fri, 4 Aug 2017 14:40:57 -0400 Subject: [PATCH 29/34] Add stdargs support to the error API. 
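
error_vfmt() takes an explicit va_list, so a function that accepts its own
variadic arguments can forward them to the error object in a single call,
without formatting into an intermediate buffer first. A minimal caller-side
sketch (the helper below is hypothetical and only illustrates the intended
forwarding pattern; it is not part of this patch):

    #include <stdarg.h>
    #include "util/error.h"

    static int report(error *e, int code, const char *fmt, ...){
        va_list ap;
        int     ret;

        va_start(ap, fmt);
        ret = error_vfmt(e, code, fmt, ap); /* records code + formatted message in *e */
        va_end(ap);
        return ret;
    }

error_fmt() itself is reworked below as exactly this kind of thin wrapper
around error_vfmt().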
--- src/util/error.c | 15 ++++++++++----- src/util/error.h | 1 + 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/src/util/error.c b/src/util/error.c index 19ce184363..2f570144e8 100644 --- a/src/util/error.c +++ b/src/util/error.c @@ -29,15 +29,20 @@ int error_set(error *e, int code, const char *msg) { return code; } -int error_fmt(error *e, int code, const char *fmt, ...) { - va_list ap; - +int error_vfmt(error *e, int code, const char *fmt, va_list ap) { e->code = code; - va_start(ap, fmt); vsnprintf(e->msg, ERROR_MSGBUF_LEN, fmt, ap); - va_end(ap); #ifdef DEBUG fprintf(stderr, "ERROR %d: %s\n", e->code, e->msg); #endif return code; } + +int error_fmt(error *e, int code, const char *fmt, ...) { + int ret; + va_list ap; + va_start(ap, fmt); + ret = error_vfmt(e, code, fmt, ap); + va_end(ap); + return ret; +} diff --git a/src/util/error.h b/src/util/error.h index fc1ecb1663..0f7651fec0 100644 --- a/src/util/error.h +++ b/src/util/error.h @@ -18,6 +18,7 @@ int error_alloc(error **e); void error_free(error *e); int error_set(error *e, int code, const char *msg); int error_fmt(error *e, int code, const char *fmt, ...); +int error_vfmt(error *e, int code, const char *fmt, va_list ap); extern error *global_err; From 76fd38ca3b94299090ca335b31dd5d745aa28484 Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Sat, 26 Aug 2017 19:57:52 -0400 Subject: [PATCH 30/34] Deleted recently-removed properties. --- src/gpuarray_reduction.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/gpuarray_reduction.c b/src/gpuarray_reduction.c index 092b8f8509..88d5d9e5cf 100644 --- a/src/gpuarray_reduction.c +++ b/src/gpuarray_reduction.c @@ -151,7 +151,7 @@ typedef struct redux_ctx redux_ctx; struct GpuReductionAttr{ gpucontext* gpuCtx; unsigned numProcs; - size_t maxLg, maxL0, maxGg, maxG0, maxLM; + size_t maxL0, maxG0, maxLM; ga_reduce_op op; int maxSrcDims; @@ -491,9 +491,7 @@ GPUARRAY_PUBLIC int GpuReductionAttr_new (GpuReductionAttr** (*grAttr)->gpuCtx = gpuCtx; if (gpucontext_property(gpuCtx, GA_CTX_PROP_NUMPROCS, &(*grAttr)->numProcs) != GA_NO_ERROR || - gpucontext_property(gpuCtx, GA_CTX_PROP_MAXLSIZE, &(*grAttr)->maxLg) != GA_NO_ERROR || gpucontext_property(gpuCtx, GA_CTX_PROP_MAXLSIZE0, &(*grAttr)->maxL0) != GA_NO_ERROR || - gpucontext_property(gpuCtx, GA_CTX_PROP_MAXGSIZE, &(*grAttr)->maxGg) != GA_NO_ERROR || gpucontext_property(gpuCtx, GA_CTX_PROP_MAXGSIZE0, &(*grAttr)->maxG0) != GA_NO_ERROR || gpucontext_property(gpuCtx, GA_CTX_PROP_LMEMSIZE, &(*grAttr)->maxLM) != GA_NO_ERROR ){ free(*grAttr); @@ -1606,7 +1604,6 @@ static int reduxGenInferProperties (GpuReduction* gr){ static void reduxGenSetMaxBS (GpuReduction* gr){ gr->maxBS = gr->grAttr.maxLM/reduxGenGetReduxStateSize(gr); - gr->maxBS = gr->maxBS < gr->grAttr.maxLg ? gr->maxBS : gr->grAttr.maxLg; gr->maxBS = gr->maxBS < gr->grAttr.maxL0 ? 
gr->maxBS : gr->grAttr.maxL0; /** @@ -2856,7 +2853,6 @@ static int reduxGenSrcAxisIsSplit (GpuReduction* gr, static int reduxGenCompile (GpuReduction* gr){ int ret, flags = 0; - flags |= GA_USE_CLUDA; if (gr->TS0tc == GA_HALF || gr->TD0tc == GA_HALF){ flags |= GA_USE_HALF|GA_USE_SMALL; } @@ -3104,7 +3100,7 @@ static size_t reduxGenEstimateParallelism (const GpuReduction* gr){ */ size_t marginFactor = 16; - return marginFactor * gr->grAttr.numProcs * gr->grAttr.maxLg; + return marginFactor * gr->grAttr.numProcs * gr->grAttr.maxL0; } /** From 0832fa1bfb954dc23b607bdc2d5303bd417c8109 Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Sat, 26 Aug 2017 19:58:11 -0400 Subject: [PATCH 31/34] Added missing header --- src/util/error.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/util/error.h b/src/util/error.h index 0f7651fec0..7577b4cee9 100644 --- a/src/util/error.h +++ b/src/util/error.h @@ -3,6 +3,7 @@ #include #include +#include #include From c679474767aa71b87e86e6f040af4230c6ff91ab Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Sat, 26 Aug 2017 20:23:33 -0400 Subject: [PATCH 32/34] For test purposes, create buffer of ULONG rather than unsupported SIZE. --- tests/check_reduction.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/tests/check_reduction.c b/tests/check_reduction.c index 60411ead57..ca52946fa5 100644 --- a/tests/check_reduction.c +++ b/tests/check_reduction.c @@ -109,7 +109,7 @@ START_TEST(test_maxandargmax_reduction){ ga_assert_ok(GpuArray_empty(&gaS0, ctx, GA_FLOAT, 3, &dims[0], GA_C_ORDER)); ga_assert_ok(GpuArray_empty(&gaD0, ctx, GA_FLOAT, 1, &dims[1], GA_C_ORDER)); - ga_assert_ok(GpuArray_empty(&gaD1, ctx, GA_SIZE, 1, &dims[1], GA_C_ORDER)); + ga_assert_ok(GpuArray_empty(&gaD1, ctx, GA_ULONG, 1, &dims[1], GA_C_ORDER)); ga_assert_ok(GpuArray_write(&gaS0, pS0, sizeof(*pS0)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD0, -1)); /* 0xFFFFFFFF is a qNaN. */ @@ -220,7 +220,7 @@ START_TEST(test_maxandargmax_idxtranspose){ ga_assert_ok(GpuArray_empty(&gaS0, ctx, GA_FLOAT, 3, &dims[0], GA_C_ORDER)); ga_assert_ok(GpuArray_empty(&gaD0, ctx, GA_FLOAT, 1, &dims[1], GA_C_ORDER)); - ga_assert_ok(GpuArray_empty(&gaD1, ctx, GA_SIZE, 1, &dims[1], GA_C_ORDER)); + ga_assert_ok(GpuArray_empty(&gaD1, ctx, GA_ULONG, 1, &dims[1], GA_C_ORDER)); ga_assert_ok(GpuArray_write(&gaS0, pS0, sizeof(*pS0)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD0, -1)); /* 0xFFFFFFFF is a qNaN. */ @@ -327,7 +327,7 @@ START_TEST(test_maxandargmax_bigdestination){ ga_assert_ok(GpuArray_empty(&gaS0, ctx, GA_FLOAT, 2, &dims[0], GA_C_ORDER)); ga_assert_ok(GpuArray_empty(&gaD0, ctx, GA_FLOAT, 1, &dims[1], GA_C_ORDER)); - ga_assert_ok(GpuArray_empty(&gaD1, ctx, GA_SIZE, 1, &dims[1], GA_C_ORDER)); + ga_assert_ok(GpuArray_empty(&gaD1, ctx, GA_ULONG, 1, &dims[1], GA_C_ORDER)); ga_assert_ok(GpuArray_write(&gaS0, pS0, sizeof(*pS0)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD0, -1)); /* 0xFFFFFFFF is a qNaN. */ @@ -433,7 +433,7 @@ START_TEST(test_maxandargmax_veryhighrank){ ga_assert_ok(GpuArray_empty(&gaS0, ctx, GA_FLOAT, 8, dims, GA_C_ORDER)); ga_assert_ok(GpuArray_empty(&gaD0, ctx, GA_FLOAT, 4, rdxDims, GA_C_ORDER)); - ga_assert_ok(GpuArray_empty(&gaD1, ctx, GA_SIZE, 4, rdxDims, GA_C_ORDER)); + ga_assert_ok(GpuArray_empty(&gaD1, ctx, GA_ULONG, 4, rdxDims, GA_C_ORDER)); ga_assert_ok(GpuArray_write(&gaS0, pS0, sizeof(*pS0)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD0, -1)); /* 0xFFFFFFFF is a qNaN. 
*/ @@ -553,7 +553,7 @@ START_TEST(test_maxandargmax_alldimsreduced){ ga_assert_ok(GpuArray_empty(&gaS0, ctx, GA_FLOAT, 3, &dims[0], GA_C_ORDER)); ga_assert_ok(GpuArray_empty(&gaD0, ctx, GA_FLOAT, 0, NULL, GA_C_ORDER)); - ga_assert_ok(GpuArray_empty(&gaD1, ctx, GA_SIZE, 0, NULL, GA_C_ORDER)); + ga_assert_ok(GpuArray_empty(&gaD1, ctx, GA_ULONG, 0, NULL, GA_C_ORDER)); ga_assert_ok(GpuArray_write(&gaS0, pS0, sizeof(*pS0)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD0, -1)); /* 0xFFFFFFFF is a qNaN. */ @@ -659,7 +659,7 @@ START_TEST(test_minandargmin_reduction){ ga_assert_ok(GpuArray_empty(&gaS0, ctx, GA_FLOAT, 3, &dims[0], GA_C_ORDER)); ga_assert_ok(GpuArray_empty(&gaD0, ctx, GA_FLOAT, 1, &dims[1], GA_C_ORDER)); - ga_assert_ok(GpuArray_empty(&gaD1, ctx, GA_SIZE, 1, &dims[1], GA_C_ORDER)); + ga_assert_ok(GpuArray_empty(&gaD1, ctx, GA_ULONG, 1, &dims[1], GA_C_ORDER)); ga_assert_ok(GpuArray_write(&gaS0, pS0, sizeof(*pS0)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD0, -1)); /* 0xFFFFFFFF is a qNaN. */ @@ -767,7 +767,7 @@ START_TEST(test_minandargmin_veryhighrank){ ga_assert_ok(GpuArray_empty(&gaS0, ctx, GA_FLOAT, 8, dims, GA_C_ORDER)); ga_assert_ok(GpuArray_empty(&gaD0, ctx, GA_FLOAT, 4, rdxDims, GA_C_ORDER)); - ga_assert_ok(GpuArray_empty(&gaD1, ctx, GA_SIZE, 4, rdxDims, GA_C_ORDER)); + ga_assert_ok(GpuArray_empty(&gaD1, ctx, GA_ULONG, 4, rdxDims, GA_C_ORDER)); ga_assert_ok(GpuArray_write(&gaS0, pS0, sizeof(*pS0)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD0, -1)); /* 0xFFFFFFFF is a qNaN. */ @@ -887,7 +887,7 @@ START_TEST(test_minandargmin_alldimsreduced){ ga_assert_ok(GpuArray_empty(&gaS0, ctx, GA_FLOAT, 3, &dims[0], GA_C_ORDER)); ga_assert_ok(GpuArray_empty(&gaD0, ctx, GA_FLOAT, 0, NULL, GA_C_ORDER)); - ga_assert_ok(GpuArray_empty(&gaD1, ctx, GA_SIZE, 0, NULL, GA_C_ORDER)); + ga_assert_ok(GpuArray_empty(&gaD1, ctx, GA_ULONG, 0, NULL, GA_C_ORDER)); ga_assert_ok(GpuArray_write(&gaS0, pS0, sizeof(*pS0)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD0, -1)); /* 0xFFFFFFFF is a qNaN. 
*/ @@ -991,7 +991,7 @@ START_TEST(test_argmax_reduction){ */ ga_assert_ok(GpuArray_empty(&gaS0, ctx, GA_FLOAT, 3, &dims[0], GA_C_ORDER)); - ga_assert_ok(GpuArray_empty(&gaD1, ctx, GA_SIZE, 1, &dims[1], GA_C_ORDER)); + ga_assert_ok(GpuArray_empty(&gaD1, ctx, GA_ULONG, 1, &dims[1], GA_C_ORDER)); ga_assert_ok(GpuArray_write(&gaS0, pS0, sizeof(*pS0)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD1, -1)); @@ -1092,7 +1092,7 @@ START_TEST(test_argmax_veryhighrank){ */ ga_assert_ok(GpuArray_empty(&gaS0, ctx, GA_FLOAT, 8, dims, GA_C_ORDER)); - ga_assert_ok(GpuArray_empty(&gaD1, ctx, GA_SIZE, 4, rdxDims, GA_C_ORDER)); + ga_assert_ok(GpuArray_empty(&gaD1, ctx, GA_ULONG, 4, rdxDims, GA_C_ORDER)); ga_assert_ok(GpuArray_write(&gaS0, pS0, sizeof(*pS0)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD1, -1)); @@ -1206,7 +1206,7 @@ START_TEST(test_argmax_alldimsreduced){ */ ga_assert_ok(GpuArray_empty(&gaS0, ctx, GA_FLOAT, 3, &dims[0], GA_C_ORDER)); - ga_assert_ok(GpuArray_empty(&gaD1, ctx, GA_SIZE, 0, NULL, GA_C_ORDER)); + ga_assert_ok(GpuArray_empty(&gaD1, ctx, GA_ULONG, 0, NULL, GA_C_ORDER)); ga_assert_ok(GpuArray_write(&gaS0, pS0, sizeof(*pS0)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD1, -1)); @@ -1306,7 +1306,7 @@ START_TEST(test_argmin_reduction){ */ ga_assert_ok(GpuArray_empty(&gaS0, ctx, GA_FLOAT, 3, &dims[0], GA_C_ORDER)); - ga_assert_ok(GpuArray_empty(&gaD1, ctx, GA_SIZE, 1, &dims[1], GA_C_ORDER)); + ga_assert_ok(GpuArray_empty(&gaD1, ctx, GA_ULONG, 1, &dims[1], GA_C_ORDER)); ga_assert_ok(GpuArray_write(&gaS0, pS0, sizeof(*pS0)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD1, -1)); @@ -1407,7 +1407,7 @@ START_TEST(test_argmin_veryhighrank){ */ ga_assert_ok(GpuArray_empty(&gaS0, ctx, GA_FLOAT, 8, dims, GA_C_ORDER)); - ga_assert_ok(GpuArray_empty(&gaD1, ctx, GA_SIZE, 4, rdxDims, GA_C_ORDER)); + ga_assert_ok(GpuArray_empty(&gaD1, ctx, GA_ULONG, 4, rdxDims, GA_C_ORDER)); ga_assert_ok(GpuArray_write(&gaS0, pS0, sizeof(*pS0)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD1, -1)); @@ -1521,7 +1521,7 @@ START_TEST(test_argmin_alldimsreduced){ */ ga_assert_ok(GpuArray_empty(&gaS0, ctx, GA_FLOAT, 3, &dims[0], GA_C_ORDER)); - ga_assert_ok(GpuArray_empty(&gaD1, ctx, GA_SIZE, 0, NULL, GA_C_ORDER)); + ga_assert_ok(GpuArray_empty(&gaD1, ctx, GA_ULONG, 0, NULL, GA_C_ORDER)); ga_assert_ok(GpuArray_write(&gaS0, pS0, sizeof(*pS0)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD1, -1)); From ecde75cbec916b493a9530a65696806ebcdb863e Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Sat, 26 Aug 2017 20:27:54 -0400 Subject: [PATCH 33/34] Bugfix in GpuReduction_new(). 
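
Build the reduction object in a local pointer and publish it to *gr only once
reduxGenInit() has succeeded; on failure, free the partially-constructed
object and set *gr to NULL, so callers never see (or leak) a half-initialized
reduction. The resulting caller-side contract looks roughly like this
(illustrative sketch only, not code from this patch):

    GpuReduction* gr = NULL;

    if (GpuReduction_new(&gr, grAttr) != GA_NO_ERROR){
        /* gr is guaranteed to be NULL here; there is nothing to free. */
        /* ... handle the error ... */
    }else{
        /* ... use gr ..., then release it with GpuReduction_free(gr). */
    }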
--- src/gpuarray_reduction.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/gpuarray_reduction.c b/src/gpuarray_reduction.c index 88d5d9e5cf..9763d18b40 100644 --- a/src/gpuarray_reduction.c +++ b/src/gpuarray_reduction.c @@ -648,6 +648,7 @@ GPUARRAY_PUBLIC void GpuReductionAttr_free (GpuReductionAttr* } GPUARRAY_PUBLIC int GpuReduction_new (GpuReduction** gr, const GpuReductionAttr* grAttr){ + int ret; GpuReduction* grOut = NULL; if (!gr){ @@ -665,8 +666,16 @@ GPUARRAY_PUBLIC int GpuReduction_new (GpuReduction** grOut->ndd = (int)grAttr->maxDstDims; grOut->ndr = (int)(grAttr->maxSrcDims - grAttr->maxDstDims); - return reduxGenInit(grOut); + ret = reduxGenInit(grOut); + if(ret == GA_NO_ERROR){ + *gr = grOut; + }else{ + GpuReduction_free(grOut); + *gr = NULL; + } + return ret; }else{ + *gr = NULL; return GA_MEMORY_ERROR; } } From 79d3649f53f352e363c67085686db8fe98052d08 Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Sat, 26 Aug 2017 20:40:58 -0400 Subject: [PATCH 34/34] Bugfixes in check_reduction.c --- tests/check_reduction.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/check_reduction.c b/tests/check_reduction.c index ca52946fa5..973a348299 100644 --- a/tests/check_reduction.c +++ b/tests/check_reduction.c @@ -7,6 +7,7 @@ #include #include #include +#include extern void *ctx; @@ -18,7 +19,9 @@ void teardown(void); /* Defines */ #define MAXERRPRINT 16 #define ga_assert_ok(e) ck_assert_int_eq(e, GA_NO_ERROR) - +#ifndef ck_assert_ptr_nonnull +#define ck_assert_ptr_nonnull(p) ck_assert_msg((p), "Null Pointer!") +#endif
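
The ck_assert_ptr_nonnull() fallback defined above only kicks in when the
installed Check library does not already provide that macro, and is expressed
in terms of the existing ck_assert_msg(). A hypothetical use in the tests
(the buffer name and size below are illustrative, following the existing test
style) would be:

    float* pS0 = calloc(1, sizeof(*pS0) * prodDims);

    ck_assert_ptr_nonnull(pS0); /* fails the test cleanly if the allocation failed */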