From c3ae76c2bf06e737ebcec478867cfb96c343c26c Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Mon, 23 Jan 2017 19:33:24 -0500 Subject: [PATCH 01/34] Current status of reduction generalization and small-destination support. --- src/gpuarray/array.h | 115 ++- src/gpuarray_reduction.c | 1586 ++++++++++++++++++++++++++++++-------- tests/check_reduction.c | 242 +++++- 3 files changed, 1570 insertions(+), 373 deletions(-) diff --git a/src/gpuarray/array.h b/src/gpuarray/array.h index a99366a7c4..5ea9377b9a 100644 --- a/src/gpuarray/array.h +++ b/src/gpuarray/array.h @@ -118,6 +118,27 @@ typedef enum _ga_order { GA_F_ORDER=1 } ga_order; +/** + * Supported array reduction operations. + */ + +typedef enum _ga_reduce_op { + GA_REDUCE_SUM, /* + */ + GA_REDUCE_PROD, /* * */ + GA_REDUCE_PRODNZ, /* * (!=0) */ + GA_REDUCE_MIN, /* min() */ + GA_REDUCE_MAX, /* max() */ + GA_REDUCE_ARGMIN, /* argmin() */ + GA_REDUCE_ARGMAX, /* argmax() */ + GA_REDUCE_MINANDARGMIN, /* min(), argmin() */ + GA_REDUCE_MAXANDARGMAX, /* max(), argmax() */ + GA_REDUCE_AND, /* & */ + GA_REDUCE_OR, /* | */ + GA_REDUCE_XOR, /* ^ */ + GA_REDUCE_ALL, /* &&/all() */ + GA_REDUCE_ANY, /* ||/any() */ +} ga_reduce_op; + /** * Checks if all the specified flags are set. * @@ -604,26 +625,31 @@ GPUARRAY_PUBLIC void GpuArray_fprintf(FILE *fd, const GpuArray *a); GPUARRAY_PUBLIC int GpuArray_fdump(FILE *fd, const GpuArray *a); + /** - * @brief Computes simultaneously the maxima and the arguments of maxima over - * specified axes of the tensor. + * @brief Compute a reduction sum (+), product (*), non-zero product (* != 0), + * min, max, argmin, argmax, min-and-argmin, max-and-argmax, and (&), + * or (|), xor (^), all (&&) or any (||) over a list of axes to reduce. * - * Returns two tensors of identical shape. Both tensors' axes are a subset of - * the axes of the original tensor. The axes to be reduced are specified by - * the caller, and the maxima and arguments of maxima are computed over them. + * Returns one (in the case of min-and-argmin/max-and-argmax, two) destination + * tensors. The destination tensor(s)' axes are a strict subset of the axes of the + * source tensor. The axes to be reduced are specified by the caller, and the + * reduction is performed over these axes, which are then removed in the + * destination. * - * @param [out] dstMax The resulting tensor of maxima - * @param [out] dstArgmax the resulting tensor of arguments at maxima + * @param [out] dst The destination tensor. Has the same type as the source. + * @param [out] dstArg For argument of minima/maxima operations. Has type int64. * @param [in] src The source tensor. * @param [in] reduxLen The number of axes reduced. Must be >= 1 and * <= src->nd. * @param [in] reduxList A list of integers of length reduxLen, indicating * the axes to be reduced. The order of the axes - * matters for dstArgmax index calculations. All - * entries in the list must be unique, >= 0 and - * < src->nd. + * matters for dstArg index calculations (GpuArray_argmin, + * GpuArray_argmax, GpuArray_minandargmin, + * GpuArray_maxandargmax). All entries in the list must be + * unique, >= 0 and < src->nd. * - * For example, if a 5D-tensor is reduced with an axis + * For example, if a 5D-tensor is max-reduced with an axis * list of [3,4,1], then reduxLen shall be 3, and the * index calculation in every point shall take the form * @@ -637,11 +663,74 @@ GPUARRAY_PUBLIC int GpuArray_fdump(FILE *fd, const GpuArray *a); * code otherwise. 
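 *
 *                        As a usage sketch (buffer names are hypothetical; dst
 *                        is assumed to have been created by the caller with the
 *                        reduced shape), a max-reduction of a 3D source tensor
 *                        over its last two axes could look like:
 *
 *                            unsigned axes[] = {1, 2};
 *                            int      err    = GpuArray_max(&dstMax, &src, 2, axes);
 *
 *                        which is shorthand for
 *
 *                            int err = GpuArray_reduction(GA_REDUCE_MAX,
 *                                                         &dstMax, NULL, &src,
 *                                                         2, axes);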
*/ -GPUARRAY_PUBLIC int GpuArray_maxandargmax(GpuArray* dstMax, - GpuArray* dstArgmax, +GPUARRAY_PUBLIC int GpuArray_sum (GpuArray* dst, const GpuArray* src, unsigned reduxLen, const unsigned* reduxList); +GPUARRAY_PUBLIC int GpuArray_prod (GpuArray* dst, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList); +GPUARRAY_PUBLIC int GpuArray_prodnz (GpuArray* dst, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList); +GPUARRAY_PUBLIC int GpuArray_min (GpuArray* dst, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList); +GPUARRAY_PUBLIC int GpuArray_max (GpuArray* dst, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList); +GPUARRAY_PUBLIC int GpuArray_argmin (GpuArray* dstArg, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList); +GPUARRAY_PUBLIC int GpuArray_argmax (GpuArray* dstArg, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList); +GPUARRAY_PUBLIC int GpuArray_minandargmin(GpuArray* dst, + GpuArray* dstArg, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList); +GPUARRAY_PUBLIC int GpuArray_maxandargmax(GpuArray* dst, + GpuArray* dstArg, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList); +GPUARRAY_PUBLIC int GpuArray_and (GpuArray* dst, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList); +GPUARRAY_PUBLIC int GpuArray_or (GpuArray* dst, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList); +GPUARRAY_PUBLIC int GpuArray_xor (GpuArray* dst, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList); +GPUARRAY_PUBLIC int GpuArray_all (GpuArray* dst, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList); +GPUARRAY_PUBLIC int GpuArray_any (GpuArray* dst, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList); +GPUARRAY_PUBLIC int GpuArray_reduction (ga_reduce_op op, + GpuArray* dst, + GpuArray* dstArg, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList); + + + + #ifdef __cplusplus } diff --git a/src/gpuarray_reduction.c b/src/gpuarray_reduction.c index fc4fc56975..8a6a2dc98b 100644 --- a/src/gpuarray_reduction.c +++ b/src/gpuarray_reduction.c @@ -21,106 +21,613 @@ #include "util/integerfactoring.h" +/* Defines */ +#define MAX_HW_DIMS 3 + + + /* Datatypes */ -struct maxandargmax_ctx{ + +/** + * Reduction Kernel Generator. + * + * The generator produces a kernel from one of two "code models": + * - Large + * - Small + * Which one is used depends on the size of the destination tensor and the + * number of reductions for each destination element. A destination tensor + * with more than SMALL_REDUX_THRESHOLD elements or more elements than + * reductions for each element will result in use of the large code model; + * Otherwise the small code model is used. + * + * + * LARGE CODE MODEL: + * + * In the large code model, each destination element is processed by a + * single thread. + * + * Each thread begins with an initial value in a register, reads from all + * source elements contributing to the reduction, computes the result and + * writes it to the destination element. + * + * A single kernel is generated that performs prescalar transformations, the + * reduction itself, postscalar transformations and the write to global memory. + * + * + * SMALL CODE MODEL: + * + * In the small code model, each destination element is processed by + * multiple threads. 
+ * + * The destination tensor is first initialized with the initial value. Then, + * one several threads cooperate to perform the reduction atomically on each + * destination element. Lastly, postscalar transformations are applied + * in-place. + * + * Two or three kernels are generated: The initialization kernel, the main + * kernel that performs prescalar transformations and the reduction itself, and + * possibly also a postscalar transformation kernel when it is required. + * + * + * Kernel Template: + * + * The following kernel code template displays the code generated for the + * small code model. For the large code model, no pre/postRedux() kernels + * are generated (since their functionality is incorporated within the main + * redux() kernel), no atomicRedux() function needs to be generated because + * writes to global memory are unconditional and not contended. + * + * + * //Includes + * #include + * #include + * #include + * + * + * //Typedefs: + * typedef float T + * typedef int64_t X + * + * + * //Initializer (in case initial T cannot be expressed as a literal) + * static T getInitVal(void){ + * return ... + * } + * + * + * //Reduce into global memory destination a value. + * static void atomicRedux(GLOBAL_MEM T* dst, T val){ + * ... + * } + * + * + * //Load data from source and apply pre-operations. + * static T loadVal(X i0, X i1, ..., X iN, + * const GLOBAL_MEM T* src, + * const GLOBAL_MEM X* srcSteps, + * ...?){ + * return ... + * } + * + * + * //Initialization kernel, + * KERNEL void preRedux(const GLOBAL_MEM X* srcSize, + * const GLOBAL_MEM X* chunkSize, + * GLOBAL_MEM T* dst, + * const X dstOff, + * const GLOBAL_MEM X* dstSteps){ + * //OFFSETS + * dst += dstOff; + * + * //Initialize + * dst[...] = getInitVal(); + * } + * + * + * //Reduction Kernel. + * KERNEL void redux(const GLOBAL_MEM T* src, + * const X srcOff, + * const GLOBAL_MEM X* srcSteps, + * const GLOBAL_MEM X* srcSize, + * const GLOBAL_MEM X* chunkSize, + * GLOBAL_MEM T* dst, + * const X dstOff, + * const GLOBAL_MEM X* dstSteps, + * GLOBAL_MEM X* dstArg, + * const X dstArgOff, + * const GLOBAL_MEM X* dstArgSteps){ + * //OFFSETS + * src += srcOff + * dst += dstOff + * dstArg += dstArgOff + * + * //Declare Indices + * //Compute Ranges + * + * //Define macros + * //Outer Loops + * //Inner Loops + * //Undefine macros + * } + * + * + * //Post-scalar kernel, + * KERNEL void postRedux(const GLOBAL_MEM X* srcSize, + * const GLOBAL_MEM X* chunkSize, + * GLOBAL_MEM T* dst, + * const X dstOff, + * const GLOBAL_MEM X* dstSteps){ + * //OFFSETS + * dst += dstOff; + * + * //Initialize + * dst[...] = getInitVal(); + * } + * + * + * Initial Reduction Values + * +--------------+-----+-----+---------+---------+-----+-----+-----+-----+-----+ + * | Type\Op | + | * | max | min | & | | | ^ | && | || | + * +--------------+-----+-----+---------+---------+-----+-----+-----+-----+-----+ + * | signed int | 0 | 1 | INT_MIN | INT_MAX | ~0 | 0 | 0 | ~0 | 0 | + * | unsigned int | 0 | 1 | 0 | ~0 | ~0 | 0 | 0 | ~0 | 0 | + * | floating | 0.0 | 1.0 | NAN | NAN | | | | | | + * +--------------+-----+-----+---------+---------+-----+-----+-----+-----+-----+ + */ + +struct redux_ctx{ /* Function Arguments. */ - GpuArray* dstMax; - GpuArray* dstArgmax; + ga_reduce_op op; + GpuArray* dst; + GpuArray* dstArg; const GpuArray* src; int reduxLen; const int* reduxList; /* General. */ - int ret; int* axisList; gpucontext* gpuCtx; /* Source code Generator. 
*/ - const char* dstMaxType; - const char* dstArgmaxType; + int srcTypeCode; + int dstTypeCode; + int dstArgTypeCode; + int idxTypeCode; + int accTypeCode; + const char* srcTypeStr; + const char* dstTypeStr; + const char* dstArgTypeStr; + const char* idxTypeStr; + const char* accTypeStr; + const char* initVal; int ndd; int ndr; int nds; int ndh; + int ndhd; + int ndhr; + int largeCodeModel; strb s; char* sourceCode; + GpuKernel preKernel; GpuKernel kernel; + GpuKernel postKernel; /* Scheduler */ - int hwAxisList[3]; - size_t blockSize [3]; - size_t gridSize [3]; - size_t chunkSize [3]; + int hwAxisList[MAX_HW_DIMS]; + size_t blockSize [MAX_HW_DIMS]; + size_t gridSize [MAX_HW_DIMS]; + size_t chunkSize [MAX_HW_DIMS]; /* Invoker */ gpudata* srcStepsGD; gpudata* srcSizeGD; gpudata* chunkSizeGD; - gpudata* dstMaxStepsGD; - gpudata* dstArgmaxStepsGD; + gpudata* dstStepsGD; + gpudata* dstArgStepsGD; }; -typedef struct maxandargmax_ctx maxandargmax_ctx; +typedef struct redux_ctx redux_ctx; /* Function prototypes */ -static int axisInSet (int v, - const int* set, - size_t setLen, - size_t* where); -static void appendIdxes (strb* s, - const char* prologue, - const char* prefix, - int startIdx, - int endIdx, - const char* suffix, - const char* epilogue); -static int maxandargmaxCheckargs (maxandargmax_ctx* ctx); -static int maxandargmaxSelectHwAxes (maxandargmax_ctx* ctx); -static int maxandargmaxGenSource (maxandargmax_ctx* ctx); -static void maxandargmaxAppendKernel (maxandargmax_ctx* ctx); -static void maxandargmaxAppendTypedefs (maxandargmax_ctx* ctx); -static void maxandargmaxAppendPrototype (maxandargmax_ctx* ctx); -static void maxandargmaxAppendOffsets (maxandargmax_ctx* ctx); -static void maxandargmaxAppendIndexDeclarations(maxandargmax_ctx* ctx); -static void maxandargmaxAppendRangeCalculations(maxandargmax_ctx* ctx); -static void maxandargmaxAppendLoops (maxandargmax_ctx* ctx); -static void maxandargmaxAppendLoopMacroDefs (maxandargmax_ctx* ctx); -static void maxandargmaxAppendLoopOuter (maxandargmax_ctx* ctx); -static void maxandargmaxAppendLoopInner (maxandargmax_ctx* ctx); -static void maxandargmaxAppendLoopMacroUndefs (maxandargmax_ctx* ctx); -static void maxandargmaxComputeAxisList (maxandargmax_ctx* ctx); -static int maxandargmaxCompile (maxandargmax_ctx* ctx); -static int maxandargmaxSchedule (maxandargmax_ctx* ctx); -static int maxandargmaxInvoke (maxandargmax_ctx* ctx); -static int maxandargmaxCleanup (maxandargmax_ctx* ctx); +static int reduxGetSumInit (int typecode, const char** property); +static int reduxGetProdInit (int typecode, const char** property); +static int reduxGetMinInit (int typecode, const char** property); +static int reduxGetMaxInit (int typecode, const char** property); +static int reduxGetAndInit (int typecode, const char** property); +static int reduxGetOrInit (int typecode, const char** property); +static int axisInSet (int v, + const int* set, + size_t setLen, + size_t* where); +static void appendIdxes (strb* s, + const char* prologue, + const char* prefix, + int startIdx, + int endIdx, + const char* suffix, + const char* epilogue); +static int reduxCheckargs (redux_ctx* ctx); +static void reduxSelectTypes (redux_ctx* ctx); +static int reduxSelectModel (redux_ctx* ctx); +static int reduxIsSmallCodeModel (redux_ctx* ctx); +static int reduxIsLargeCodeModel (redux_ctx* ctx); +static int reduxHasDst (redux_ctx* ctx); +static int reduxHasDstArg (redux_ctx* ctx); +static int reduxKernelRequiresDst (redux_ctx* ctx); +static int reduxKernelRequiresDstArg (redux_ctx* 
ctx); +static int reduxCanAppendHwAxis (redux_ctx* ctx, int wantReductionAxis); +static void reduxAppendLargestAxisToHwList(redux_ctx* ctx, int wantReductionAxis); +static int reduxSelectHwAxes (redux_ctx* ctx); +static int reduxComputeAxisList (redux_ctx* ctx); +static int reduxGenSource (redux_ctx* ctx); +static void reduxAppendSource (redux_ctx* ctx); +static void reduxAppendIncludes (redux_ctx* ctx); +static void reduxAppendTypedefs (redux_ctx* ctx); +static void reduxAppendFuncGetInitVal (redux_ctx* ctx); +static void reduxAppendFuncLoadVal (redux_ctx* ctx); +static void reduxAppendFuncReduxVal (redux_ctx* ctx); +static void reduxAppendFuncPreKernel (redux_ctx* ctx); +static void reduxAppendFuncKernel (redux_ctx* ctx); +static void reduxAppendFuncPostKernel (redux_ctx* ctx); +static void reduxAppendPrototype (redux_ctx* ctx); +static void reduxAppendOffsets (redux_ctx* ctx); +static void reduxAppendIndexDeclarations (redux_ctx* ctx); +static void reduxAppendRangeCalculations (redux_ctx* ctx); +static void reduxAppendLoops (redux_ctx* ctx); +static void reduxAppendLoopMacroDefs (redux_ctx* ctx); +static void reduxAppendLoopOuter (redux_ctx* ctx); +static void reduxAppendLoopInner (redux_ctx* ctx); +static void reduxAppendLoopMacroUndefs (redux_ctx* ctx); +static int reduxCompileLarge (redux_ctx* ctx); +static int reduxCompileSmall (redux_ctx* ctx); +static int reduxScheduleLarge (redux_ctx* ctx); +static int reduxInvokeLarge (redux_ctx* ctx); +static int reduxCleanup (redux_ctx* ctx, int ret); /* Function implementation */ -GPUARRAY_PUBLIC int GpuArray_maxandargmax (GpuArray* dstMax, - GpuArray* dstArgmax, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList){ - maxandargmax_ctx ctxSTACK = {0}; - maxandargmax_ctx *ctx = &ctxSTACK; - - ctxSTACK.dstMax = dstMax; - ctxSTACK.dstArgmax = dstArgmax; - ctxSTACK.src = src; - ctxSTACK.reduxLen = (int)reduxLen; - ctxSTACK.reduxList = (const int*)reduxList; - - if(maxandargmaxCheckargs (ctx) == GA_NO_ERROR && - maxandargmaxSelectHwAxes(ctx) == GA_NO_ERROR && - maxandargmaxGenSource (ctx) == GA_NO_ERROR && - maxandargmaxCompile (ctx) == GA_NO_ERROR && - maxandargmaxSchedule (ctx) == GA_NO_ERROR && - maxandargmaxInvoke (ctx) == GA_NO_ERROR){ - return maxandargmaxCleanup(ctx); - }else{ - return maxandargmaxCleanup(ctx); +GPUARRAY_PUBLIC int GpuArray_sum (GpuArray* dst, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList){ + return GpuArray_reduction(GA_REDUCE_SUM, + dst, NULL, src, reduxLen, reduxList); +} +GPUARRAY_PUBLIC int GpuArray_prod (GpuArray* dst, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList){ + return GpuArray_reduction(GA_REDUCE_PROD, + dst, NULL, src, reduxLen, reduxList); +} +GPUARRAY_PUBLIC int GpuArray_prodnz (GpuArray* dst, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList){ + return GpuArray_reduction(GA_REDUCE_PRODNZ, + dst, NULL, src, reduxLen, reduxList); +} +GPUARRAY_PUBLIC int GpuArray_min (GpuArray* dst, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList){ + return GpuArray_reduction(GA_REDUCE_MIN, + dst, NULL, src, reduxLen, reduxList); +} +GPUARRAY_PUBLIC int GpuArray_max (GpuArray* dst, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList){ + return GpuArray_reduction(GA_REDUCE_MAX, + dst, NULL, src, reduxLen, reduxList); +} +GPUARRAY_PUBLIC int GpuArray_argmin (GpuArray* dstArg, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList){ + return GpuArray_reduction(GA_REDUCE_ARGMIN, 
+ NULL, dstArg, src, reduxLen, reduxList); +} +GPUARRAY_PUBLIC int GpuArray_argmax (GpuArray* dstArg, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList){ + return GpuArray_reduction(GA_REDUCE_ARGMAX, + NULL, dstArg, src, reduxLen, reduxList); +} +GPUARRAY_PUBLIC int GpuArray_minandargmin(GpuArray* dst, + GpuArray* dstArg, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList){ + return GpuArray_reduction(GA_REDUCE_MINANDARGMIN, + dst, dstArg, src, reduxLen, reduxList); +} +GPUARRAY_PUBLIC int GpuArray_maxandargmax(GpuArray* dst, + GpuArray* dstArg, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList){ + return GpuArray_reduction(GA_REDUCE_MAXANDARGMAX, + dst, dstArg, src, reduxLen, reduxList); +} +GPUARRAY_PUBLIC int GpuArray_and (GpuArray* dst, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList){ + return GpuArray_reduction(GA_REDUCE_AND, + dst, NULL, src, reduxLen, reduxList); +} +GPUARRAY_PUBLIC int GpuArray_or (GpuArray* dst, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList){ + return GpuArray_reduction(GA_REDUCE_OR, + dst, NULL, src, reduxLen, reduxList); +} +GPUARRAY_PUBLIC int GpuArray_xor (GpuArray* dst, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList){ + return GpuArray_reduction(GA_REDUCE_XOR, + dst, NULL, src, reduxLen, reduxList); +} +GPUARRAY_PUBLIC int GpuArray_all (GpuArray* dst, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList){ + return GpuArray_reduction(GA_REDUCE_ALL, + dst, NULL, src, reduxLen, reduxList); +} +GPUARRAY_PUBLIC int GpuArray_any (GpuArray* dst, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList){ + return GpuArray_reduction(GA_REDUCE_ANY, + dst, NULL, src, reduxLen, reduxList); +} +GPUARRAY_PUBLIC int GpuArray_reduction (ga_reduce_op op, + GpuArray* dst, + GpuArray* dstArg, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList){ + redux_ctx ctxSTACK = {op, dst, dstArg, src, + (int)reduxLen, (const int*)reduxList}, + *ctx = &ctxSTACK; + + return reduxCheckargs(ctx); +} + +/** + * @brief Get an expression representing a suitable initialization value for + * the given datatype and a sum-reduction operation. + * + * @param [in] typecode Typecode of the type whose initializer is to be + * requested. + * @param [out] property A pointer to a string. On return it will be set to + * the initializer expression. + * @return Zero if successful; Non-zero if the datatype is not supported. + */ + +static int reduxGetSumInit (int typecode, const char** property){ + if(typecode == GA_POINTER || + typecode == GA_BUFFER){ + return GA_UNSUPPORTED_ERROR; + } + *property = "0"; + return GA_NO_ERROR; +} + +/** + * @brief Get an expression representing a suitable initialization value for + * the given datatype and a prod-reduction operation. + * + * @param [in] typecode Typecode of the type whose initializer is to be + * requested. + * @param [out] property A pointer to a string. On return it will be set to + * the initializer expression. + * @return Zero if successful; Non-zero if the datatype is not supported. + */ + +static int reduxGetProdInit (int typecode, const char** property){ + if(typecode == GA_POINTER || + typecode == GA_BUFFER){ + return GA_UNSUPPORTED_ERROR; + } + *property = "1"; + return GA_NO_ERROR; +} + +/** + * @brief Get an expression representing a suitable initialization value for + * the given datatype and a max-reduction operation. 
+ * + * @param [in] typecode Typecode of the type whose initializer is to be + * requested. + * @param [out] property A pointer to a string. On return it will be set to + * the initializer expression. + * @return Zero if successful; Non-zero if the datatype is not supported. + */ + +static int reduxGetMinInit (int typecode, const char** property){ + switch(typecode){ + case GA_BYTE2: + case GA_BYTE3: + case GA_BYTE4: + case GA_BYTE8: + case GA_BYTE16: + case GA_BYTE: *property = "SCHAR_MIN"; break; + case GA_SHORT2: + case GA_SHORT3: + case GA_SHORT4: + case GA_SHORT8: + case GA_SHORT16: + case GA_SHORT: *property = "SHRT_MIN"; break; + case GA_INT2: + case GA_INT3: + case GA_INT4: + case GA_INT8: + case GA_INT16: + case GA_INT: *property = "INT_MIN"; break; + case GA_LONG2: + case GA_LONG3: + case GA_LONG4: + case GA_LONG8: + case GA_LONG16: + case GA_LONG: *property = "LONG_MIN"; break; + case GA_LONGLONG: *property = "LLONG_MIN"; break; + case GA_BOOL: + case GA_UBYTE2: + case GA_UBYTE3: + case GA_UBYTE4: + case GA_UBYTE8: + case GA_UBYTE16: + case GA_UBYTE: + case GA_USHORT2: + case GA_USHORT3: + case GA_USHORT4: + case GA_USHORT8: + case GA_USHORT16: + case GA_USHORT: + case GA_UINT2: + case GA_UINT3: + case GA_UINT4: + case GA_UINT8: + case GA_UINT16: + case GA_UINT: + case GA_ULONG2: + case GA_ULONG3: + case GA_ULONG4: + case GA_ULONG8: + case GA_ULONG16: + case GA_ULONG: + case GA_ULONGLONG: + case GA_SIZE: *property = "0"; break; + case GA_HALF: + case GA_FLOAT: + case GA_DOUBLE: + case GA_QUAD: *property = "NAN"; break; + default: return GA_UNSUPPORTED_ERROR; + } + + return GA_NO_ERROR; +} + +/** + * @brief Get an expression representing a suitable initialization value for + * the given datatype and a min-reduction operation. + * + * @param [in] typecode Typecode of the type whose initializer is to be + * requested. + * @param [out] property A pointer to a string. On return it will be set to + * the initializer expression. + * @return Zero if successful; Non-zero if the datatype is not supported. 
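 *
 * For instance, for GA_INT the expression produced is "INT_MAX", and for the
 * floating-point types it is "NAN" (cf. the initial-values table above).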
+ */ + +static int reduxGetMaxInit (int typecode, const char** property){ + switch(typecode){ + case GA_BOOL: *property = "1"; break; + case GA_BYTE2: + case GA_BYTE3: + case GA_BYTE4: + case GA_BYTE8: + case GA_BYTE16: + case GA_BYTE: *property = "SCHAR_MAX"; break; + case GA_UBYTE2: + case GA_UBYTE3: + case GA_UBYTE4: + case GA_UBYTE8: + case GA_UBYTE16: + case GA_UBYTE: *property = "UCHAR_MAX"; break; + case GA_SHORT2: + case GA_SHORT3: + case GA_SHORT4: + case GA_SHORT8: + case GA_SHORT16: + case GA_SHORT: *property = "SHRT_MAX"; break; + case GA_USHORT2: + case GA_USHORT3: + case GA_USHORT4: + case GA_USHORT8: + case GA_USHORT16: + case GA_USHORT: *property = "USHRT_MAX"; break; + case GA_INT2: + case GA_INT3: + case GA_INT4: + case GA_INT8: + case GA_INT16: + case GA_INT: *property = "INT_MAX"; break; + case GA_UINT2: + case GA_UINT3: + case GA_UINT4: + case GA_UINT8: + case GA_UINT16: + case GA_UINT: *property = "UINT_MAX"; break; + case GA_LONG2: + case GA_LONG3: + case GA_LONG4: + case GA_LONG8: + case GA_LONG16: + case GA_LONG: *property = "LONG_MAX"; break; + case GA_ULONG2: + case GA_ULONG3: + case GA_ULONG4: + case GA_ULONG8: + case GA_ULONG16: + case GA_ULONG: *property = "ULONG_MAX"; break; + case GA_LONGLONG: *property = "LLONG_MAX"; break; + case GA_ULONGLONG: *property = "ULLONG_MAX"; break; + case GA_HALF: + case GA_FLOAT: + case GA_DOUBLE: + case GA_QUAD: *property = "NAN"; break; + default: return GA_UNSUPPORTED_ERROR; + } + + return GA_NO_ERROR; +} + +/** + * @brief Get an expression representing a suitable initialization value for + * the given datatype and a and-reduction operation. + * + * @param [in] typecode Typecode of the type whose initializer is to be + * requested. + * @param [out] property A pointer to a string. On return it will be set to + * the initializer expression. + * @return Zero if successful; Non-zero if the datatype is not supported. + */ + +static int reduxGetAndInit (int typecode, const char** property){ + if(typecode == GA_POINTER || + typecode == GA_BUFFER){ + return GA_UNSUPPORTED_ERROR; + } + *property = "~0"; + return GA_NO_ERROR; +} + +/** + * @brief Get an expression representing a suitable initialization value for + * the given datatype and a or-reduction operation. + * + * @param [in] typecode Typecode of the type whose initializer is to be + * requested. + * @param [out] property A pointer to a string. On return it will be set to + * the initializer expression. + * @return Zero if successful; Non-zero if the datatype is not supported. + */ + +static int reduxGetOrInit (int typecode, const char** property){ + if(typecode == GA_POINTER || + typecode == GA_BUFFER){ + return GA_UNSUPPORTED_ERROR; } + *property = "0"; + return GA_NO_ERROR; } /** @@ -133,10 +640,10 @@ GPUARRAY_PUBLIC int GpuArray_maxandargmax (GpuArray* dstMax, * @return Non-zero if the set is non-empty and v is in it; Zero otherwise. 
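 *
 * For example, with int axes[] = {0, 2, 3}: axisInSet(2, axes, 3, 0) returns
 * non-zero, while axisInSet(5, axes, 3, 0) returns zero.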
*/ -static int axisInSet (int v, - const int* set, - size_t setLen, - size_t* where){ +static int axisInSet (int v, + const int* set, + size_t setLen, + size_t* where){ size_t i; for(i=0;iret = GA_NO_ERROR; ctx->axisList = NULL; ctx->gpuCtx = NULL; - ctx->dstMaxType = ctx->dstArgmaxType = NULL; + ctx->srcTypeStr = ctx->dstTypeStr = ctx->dstArgTypeStr = + ctx->accTypeStr = ctx->idxTypeStr = NULL; + ctx->initVal = NULL; ctx->ndh = 0; + ctx->ndhd = 0; + ctx->ndhr = 0; ctx->sourceCode = NULL; + ctx->s = INIT_STRB; - ctx->hwAxisList[0] = ctx->hwAxisList[1] = ctx->hwAxisList[2] = 0; - ctx->blockSize [0] = ctx->blockSize [1] = ctx->blockSize [2] = 1; - ctx->gridSize [0] = ctx->gridSize [1] = ctx->gridSize [2] = 1; - ctx->chunkSize [0] = ctx->chunkSize [1] = ctx->chunkSize [2] = 1; + for(i=0;ihwAxisList[i] = 0; + ctx->blockSize [i] = 1; + ctx->gridSize [i] = 1; + ctx->chunkSize [i] = 1; + } - ctx->srcStepsGD = ctx->srcSizeGD = ctx->chunkSizeGD = - ctx->dstMaxStepsGD = ctx->dstArgmaxStepsGD = NULL; + ctx->srcStepsGD = ctx->srcSizeGD = ctx->chunkSizeGD = + ctx->dstStepsGD = ctx->dstArgStepsGD = NULL; + /* *** IT IS NOW SAFE TO CALL reduxCleanup() *** */ - /* Insane src or reduxLen? */ - if(!ctx->dstMax || !ctx->dstArgmax || !ctx->src || ctx->src->nd == 0 || - ctx->reduxLen == 0 || ctx->reduxLen > (int)ctx->src->nd){ - return ctx->ret=GA_INVALID_ERROR; + /* Insane src, reduxLen, dst or dstArg? */ + if(!ctx->src || ctx->src->nd <= 0 || ctx->reduxLen == 0 || + ctx->reduxLen > (int)ctx->src->nd){ + return reduxCleanup(ctx, GA_INVALID_ERROR); + } + if((reduxHasDst (ctx) && !ctx->dst) || + (reduxHasDstArg(ctx) && !ctx->dstArg)){ + return reduxCleanup(ctx, GA_INVALID_ERROR); } + /* Insane or duplicate list entry? */ for(i=0;ireduxLen;i++){ if(ctx->reduxList[i] < 0 || ctx->reduxList[i] >= (int)ctx->src->nd || axisInSet(ctx->reduxList[i], ctx->reduxList, i, 0)){ - return ctx->ret=GA_INVALID_ERROR; + return reduxCleanup(ctx, GA_INVALID_ERROR); } } - /* Unknown type? */ - ctx->dstMaxType = gpuarray_get_type(ctx->src->typecode)->cluda_name; - ctx->dstArgmaxType = gpuarray_get_type(GA_SSIZE) ->cluda_name; - if(!ctx->dstMaxType || !ctx->dstArgmaxType){ - return ctx->ret=GA_INVALID_ERROR; - } /* GPU context non-existent? */ - ctx->gpuCtx = GpuArray_context(ctx->src); + ctx->gpuCtx = GpuArray_context(ctx->src); if(!ctx->gpuCtx){ - return ctx->ret=GA_INVALID_ERROR; + return reduxCleanup(ctx, GA_INVALID_ERROR); + } + + + /* Unknown type? */ + reduxSelectTypes(ctx); + if(!ctx->srcTypeStr || !ctx->dstTypeStr || !ctx->dstArgTypeStr || + !ctx->accTypeStr){ + return reduxCleanup(ctx, GA_INVALID_ERROR); + } + + + /* Determine initializer, and error out if reduction unsupported. 
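	 * For example, GA_REDUCE_SUM initializes its accumulator to 0,
	 * GA_REDUCE_PROD/PRODNZ to 1 and GA_REDUCE_AND/ALL to ~0, per the
	 * initial-values table at the top of this file.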
*/ + switch(ctx->op){ + case GA_REDUCE_SUM: ret = reduxGetSumInit (ctx->accTypeCode, &ctx->initVal); break; + case GA_REDUCE_PRODNZ: + case GA_REDUCE_PROD: ret = reduxGetProdInit(ctx->accTypeCode, &ctx->initVal); break; + case GA_REDUCE_MINANDARGMIN: + case GA_REDUCE_ARGMIN: + case GA_REDUCE_MIN: ret = reduxGetMinInit (ctx->accTypeCode, &ctx->initVal); break; + case GA_REDUCE_MAXANDARGMAX: + case GA_REDUCE_ARGMAX: + case GA_REDUCE_MAX: ret = reduxGetMaxInit (ctx->accTypeCode, &ctx->initVal); break; + case GA_REDUCE_ALL: + case GA_REDUCE_AND: ret = reduxGetAndInit (ctx->accTypeCode, &ctx->initVal); break; + case GA_REDUCE_ANY: + case GA_REDUCE_XOR: + case GA_REDUCE_OR: ret = reduxGetOrInit (ctx->accTypeCode, &ctx->initVal); break; + default: ret = GA_UNSUPPORTED_ERROR; break; + } + if(ret != GA_NO_ERROR){ + return reduxCleanup(ctx, ret); } @@ -256,114 +801,493 @@ static int maxandargmaxCheckargs (maxandargmax_ctx* ctx){ ctx->nds = ctx->src->nd; ctx->ndr = ctx->reduxLen; ctx->ndd = ctx->nds - ctx->ndr; + strb_ensure(&ctx->s, 5*1024); + - return ctx->ret; + + return reduxSelectModel(ctx); } /** - * @brief Select which axes (up to 3) will be assigned to hardware - * dimensions. + * @brief Select types for the reduction kernel's implementation. + * + * There are 5 types of relevance: + * - Source (S=Source) + * - Destination (T=Target) + * - Destination Argument (A=Arg) + * - Index (X=indeX) + * - Accumulator (K=aKKumulator/reduction) */ -static int maxandargmaxSelectHwAxes (maxandargmax_ctx* ctx){ - int i, j, maxI = 0; - size_t maxV; +static void reduxSelectTypes (redux_ctx* ctx){ + /* Deal with the various typecodes. */ + ctx->srcTypeCode = ctx->src->typecode; + ctx->dstTypeCode = ctx->srcTypeCode; + ctx->dstArgTypeCode = GA_SSIZE; + ctx->idxTypeCode = GA_SSIZE; + switch(ctx->srcTypeCode){ + case GA_HALF: ctx->accTypeCode = GA_FLOAT; + case GA_HALF2: ctx->accTypeCode = GA_FLOAT2; + case GA_HALF4: ctx->accTypeCode = GA_FLOAT4; + case GA_HALF8: ctx->accTypeCode = GA_FLOAT8; + case GA_HALF16: ctx->accTypeCode = GA_FLOAT16; + default: ctx->accTypeCode = ctx->srcTypeCode; + } + + /* Get the string version as well. */ + ctx->srcTypeStr = gpuarray_get_type(ctx->srcTypeCode) ->cluda_name; + ctx->dstTypeStr = gpuarray_get_type(ctx->dstTypeCode) ->cluda_name; + ctx->dstArgTypeStr = gpuarray_get_type(ctx->dstArgTypeCode)->cluda_name; + ctx->idxTypeStr = gpuarray_get_type(ctx->idxTypeCode) ->cluda_name; + ctx->accTypeStr = gpuarray_get_type(ctx->accTypeCode) ->cluda_name; +} + +/** + * @brief Select which code model will be used: + * + * - Large (Destination tensor >= SMALL_REDUX_THRESHOLD elements, or + * destination tensor size >= # of reductions per destination + * tensor element): + * All destination elements have their own thread. + * - Small (otherwise): + * Multiple threads cooperate on a single destination element. + */ + +static int reduxSelectModel (redux_ctx* ctx){ + int i, ret; + unsigned numProcs; + size_t localSize; + size_t dstNumElem = 1, reduxPerElem = 1; - ctx->ndh = ctx->ndd<3 ? ctx->ndd : 3; /** - * The ctx->hwAxisLen largest axes are selected and assigned in - * descending order to X, Y, Z. + * Query device for approximate total level of parallelism. If destination + * tensor is so big it can keep all threads busy on individual elements, + * use large code model; Otherwise use small code model, where threads will + * have to cooperate. 
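	 *
	 * As a worked example on a hypothetical device with numProcs = 16 and
	 * localSize = 1024 (roughly 16384 concurrent threads): a (100000, 64)
	 * source reduced over axis 1 gives dstNumElem = 100000 >= 16384, so the
	 * large model is chosen; a (64, 100000) source reduced over axis 1 gives
	 * dstNumElem = 64, which is smaller than both 16384 and
	 * reduxPerElem = 100000, so the small model would be chosen (at present
	 * the selection is forced to the large model; see the BUG note below).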
*/ + + ret = gpucontext_property(ctx->gpuCtx, GA_CTX_PROP_NUMPROCS, &numProcs); + if(ret != GA_NO_ERROR){ + return reduxCleanup(ctx, ret); + } + ret = gpucontext_property(ctx->gpuCtx, GA_CTX_PROP_MAXLSIZE, &localSize); + if(ret != GA_NO_ERROR){ + return reduxCleanup(ctx, ret); + } - for(i=0;indh;i++){ - maxV = 0; - - for(j=0;jnds;j++){ - if(!axisInSet(j, ctx->hwAxisList, i, 0) && - !axisInSet(j, ctx->reduxList, ctx->ndr, 0) && - ctx->src->dimensions[j] >= maxV){ - maxV = ctx->src->dimensions[j]; - maxI = j; - } + + /** + * Compute #elems in dst and # reductions per dst element. + */ + + for(i=0;inds;i++){ + if(axisInSet(i, ctx->reduxList, ctx->nds, NULL)){ + reduxPerElem *= ctx->src->dimensions[i]; + }else{ + dstNumElem *= ctx->src->dimensions[i]; } + } + ctx->largeCodeModel = dstNumElem >= numProcs*localSize || + dstNumElem >= reduxPerElem + || 1;/* BUG: Erase when small code model implemented. */ + /** + * *** IT IS NOW SAFE TO CALL: *** + * - reduxIsLargeModel() + * - reduxIsSmallModel() + * - reduxKernelRequiresDst() + * - reduxKernelRequiresDstArg() + */ + + + return reduxSelectHwAxes(ctx); +} + +/** + * @brief Returns whether we are using the small code model or not. + */ - ctx->hwAxisList[i] = maxI; +static int reduxIsSmallCodeModel (redux_ctx* ctx){ + return !reduxIsLargeCodeModel(ctx); +} + +/** + * @brief Returns whether we are using the large code model or not. + */ + +static int reduxIsLargeCodeModel (redux_ctx* ctx){ + return ctx->largeCodeModel; +} + +/** + * @brief Returns whether the reduction interface requires a dst argument. + */ + +static int reduxHasDst (redux_ctx* ctx){ + switch(ctx->op){ + case GA_REDUCE_ARGMIN: + case GA_REDUCE_ARGMAX: return 0; + default: return 1; } +} + +/** + * @brief Returns whether the reduction interface requires a dstArg argument. + */ - return ctx->ret=GA_NO_ERROR; +static int reduxHasDstArg (redux_ctx* ctx){ + switch(ctx->op){ + case GA_REDUCE_MINANDARGMIN: + case GA_REDUCE_MAXANDARGMAX: + case GA_REDUCE_ARGMIN: + case GA_REDUCE_ARGMAX: return 1; + default: return 0; + } } /** - * @brief Generate the kernel code for MaxAndArgmax. - * - * @return GA_MEMORY_ERROR if not enough memory left; GA_NO_ERROR otherwise. + * @brief Returns whether the generated kernel internally requires a dst + * argument. + * + * This is semantically subtly different from reduxHasDst(). The main + * difference is in the implementation of the GA_REDUCE_ARGMIN/ARGMAX + * reductions; Either *might* require a dst buffer, which will have to be + * allocated, even though it will be discared. */ -static int maxandargmaxGenSource (maxandargmax_ctx* ctx){ - /* Compute internal axis remapping. */ +static int reduxKernelRequiresDst (redux_ctx* ctx){ + switch(ctx->op){ + case GA_REDUCE_ARGMIN: + case GA_REDUCE_ARGMAX: return reduxIsSmallCodeModel(ctx); + default: return 1; + } +} + +/** + * @brief Returns whether the generated kernel internally requires a dstArg + * argument. + * + * This is semantically subtly different from reduxHasDstArg(), since it asks + * whether the reduction, even though it does not accept a dstArg argument, + * still requires a dstArg internally. + */ + +static int reduxKernelRequiresDstArg (redux_ctx* ctx){ + /** + * At present there exists no reduction whose implementation requires + * a dstArg but whose interface does not. + * + * E.g. the max() and min() reductions do NOT currently require a temporary + * buffer for indexes, and will not in the foreseeable future. 
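	 *
	 * Contrast this with reduxKernelRequiresDst(): GA_REDUCE_ARGMIN/ARGMAX
	 * accept no dst at the interface, yet under the small code model the
	 * kernel still needs a scratch dst to hold the running extremum (which is
	 * discarded afterwards), so reduxKernelRequiresDst() returns true for
	 * them only in that case.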
+ */ + + return reduxHasDstArg(ctx); +} + +/** + * @brief Check whether we can add another reduction axis + * (wantReductionAxis=1) or destination axis (wantReductionAxis=0) to + * the hardware axis list. + */ + +static int reduxCanAppendHwAxis (redux_ctx* ctx, int wantReductionAxis){ + if(ctx->ndh >= MAX_HW_DIMS){ + return 0; + }else{ + return wantReductionAxis ? ctx->ndhr < ctx->ndr: + ctx->ndhd < ctx->ndd; + } +} + +/** + * @brief Append the largest reduction axis (wantReductionAxis=1) or + * destination axis (wantReductionAxis=0) that isn't yet in the hardware + * axis list into said hardware axis list. + */ + +static void reduxAppendLargestAxisToHwList(redux_ctx* ctx, int wantReductionAxis){ + int maxI = 0, i, isInHwList, isInReduxList, isInDesiredList, isLargestSoFar; + size_t maxV = 0; + + /* Find */ + for(i=0;inds;i++){ + isInHwList = axisInSet(i, ctx->hwAxisList, ctx->ndh, 0); + isInReduxList = axisInSet(i, ctx->reduxList, ctx->ndr, 0); + isInDesiredList = wantReductionAxis ? isInReduxList : !isInReduxList; + isLargestSoFar = ctx->src->dimensions[i] >= maxV; + + if(!isInHwList && isInDesiredList && isLargestSoFar){ + maxV = ctx->src->dimensions[i]; + maxI = i; + } + } + + /* Append */ + ctx->hwAxisList[ctx->ndh++] = maxI; + if(wantReductionAxis){ + ctx->ndhr++; + }else{ + ctx->ndhd++; + } +} + +/** + * @brief Select which axes (up to MAX_HW_DIMS) will be assigned to hardware + * dimensions. + * + * For the "large" code model: The up-to-MAX_HW_DIMS largest destination tensor + * dimensions are selected. + * For the "small" code model: Up to MAX_HW_DIMS reduction dimensions (largest- + * to-smallest) are selected. If less than + * MAX_HW_DIMS dimensions were selected, + * destination tensor dimensions are selected until + * MAX_HW_DIMS total dimensions are selected, or no + * destination tensors are left. + */ + +static int reduxSelectHwAxes (redux_ctx* ctx){ + if(reduxIsSmallCodeModel(ctx)){ + while(reduxCanAppendHwAxis(ctx, 1)){ + reduxAppendLargestAxisToHwList(ctx, 1); + } + } + + while(reduxCanAppendHwAxis(ctx, 0)){ + reduxAppendLargestAxisToHwList(ctx, 0); + } + + return reduxComputeAxisList(ctx); +} + +/** + * @brief Compute the axis list. + * + * The axis list describes the mapping between the nested loops of the kernel + * as well as their accompanying indices (i0*, i1*, ..., in*) on one hand, and + * the axes of the source tensor. The first axis in the list corresponds to the + * outermost loop and the last axis in the list to the innermost. + * + * The first ctx->ndd axes correspond to the outer loops that iterate over + * each destination element. The last ctx->ndr axes correspond to the inner + * loops that iterate over the dimensions of elements that are to be reduced. + * + * @return GA_MEMORY_ERROR if allocating the list failed; Otherwise, returns + * GA_NO_ERROR. + */ + +static int reduxComputeAxisList (redux_ctx* ctx){ + int i, f=0; + ctx->axisList = malloc(ctx->nds * sizeof(unsigned)); if(!ctx->axisList){ - return ctx->ret=GA_MEMORY_ERROR; + return reduxCleanup(ctx, GA_MEMORY_ERROR); } - maxandargmaxComputeAxisList(ctx); - /* Generate kernel proper. */ - strb_ensure(&ctx->s, 5*1024); - maxandargmaxAppendKernel(ctx); - free(ctx->axisList); - ctx->axisList = NULL; + for(i=0;inds;i++){ + if(!axisInSet(i, ctx->reduxList, ctx->ndr, 0)){ + ctx->axisList[f++] = i; + } + } + memcpy(&ctx->axisList[f], ctx->reduxList, ctx->ndr * sizeof(*ctx->reduxList)); + + + return reduxGenSource(ctx); +} + +/** + * @brief Generate the kernel code for the reduction. 
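 *
 * The generated loop nest follows ctx->axisList as computed above: for
 * example, a 5D source reduced over axes [3,4,1] yields an axis list of
 * [0,2,3,4,1], so indices i0 and i1 drive the destination (outer) loops over
 * source axes 0 and 2, while i2..i4 drive the reduction (inner) loops.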
+ * + * @return GA_MEMORY_ERROR if not enough memory left; GA_NO_ERROR otherwise. + */ + +static int reduxGenSource (redux_ctx* ctx){ + reduxAppendSource(ctx); ctx->sourceCode = strb_cstr(&ctx->s); if(!ctx->sourceCode){ - return ctx->ret=GA_MEMORY_ERROR; + return reduxCleanup(ctx, GA_MEMORY_ERROR); } - - /* Return it. */ - return ctx->ret=GA_NO_ERROR; + + return reduxIsLargeCodeModel(ctx) ? reduxCompileLarge(ctx): + reduxCompileSmall(ctx); } -static void maxandargmaxAppendKernel (maxandargmax_ctx* ctx){ - strb_appends (&ctx->s, "#include \"cluda.h\"\n"); - maxandargmaxAppendTypedefs (ctx); - maxandargmaxAppendPrototype (ctx); - strb_appends (&ctx->s, "{\n"); - maxandargmaxAppendOffsets (ctx); - maxandargmaxAppendIndexDeclarations(ctx); - maxandargmaxAppendRangeCalculations(ctx); - maxandargmaxAppendLoops (ctx); - strb_appends (&ctx->s, "}\n"); +static void reduxAppendSource (redux_ctx* ctx){ + reduxAppendIncludes (ctx); + reduxAppendTypedefs (ctx); + reduxAppendFuncGetInitVal (ctx); + reduxAppendFuncLoadVal (ctx); + reduxAppendFuncReduxVal (ctx); + reduxAppendFuncPreKernel (ctx); + reduxAppendFuncKernel (ctx); + reduxAppendFuncPostKernel (ctx); } -static void maxandargmaxAppendTypedefs (maxandargmax_ctx* ctx){ +static void reduxAppendIncludes (redux_ctx* ctx){ + strb_appends(&ctx->s, "/* Includes */\n"); + strb_appends(&ctx->s, "#include \"cluda.h\"\n"); + strb_appends(&ctx->s, "\n"); + strb_appends(&ctx->s, "\n"); + strb_appends(&ctx->s, "\n"); +} +static void reduxAppendTypedefs (redux_ctx* ctx){ strb_appends(&ctx->s, "/* Typedefs */\n"); - strb_appendf(&ctx->s, "typedef %s T;/* The type of the array being processed. */\n", ctx->dstMaxType); - strb_appendf(&ctx->s, "typedef %s X;/* Index type: signed 32/64-bit. */\n", ctx->dstArgmaxType); + strb_appendf(&ctx->s, "typedef %s S;/* The type of the source array. */\n", ctx->srcTypeStr); + strb_appendf(&ctx->s, "typedef %s T;/* The type of the destination array. */\n", ctx->dstTypeStr); + strb_appendf(&ctx->s, "typedef %s A;/* The type of the destination argument array. */\n", ctx->dstArgTypeStr); + strb_appendf(&ctx->s, "typedef %s X;/* The type of the indices: signed 32/64-bit. */\n", ctx->idxTypeStr); + strb_appendf(&ctx->s, "typedef %s K;/* The type of the accumulator variable. 
*/\n", ctx->accTypeStr); + strb_appends(&ctx->s, "\n"); + strb_appends(&ctx->s, "\n"); + strb_appends(&ctx->s, "\n"); +} +static void reduxAppendFuncGetInitVal (redux_ctx* ctx){ + strb_appends(&ctx->s, "/**\n"); + strb_appends(&ctx->s, " * Initial value function.\n"); + strb_appends(&ctx->s, " */\n"); + strb_appends(&ctx->s, "\n"); + strb_appends(&ctx->s, "WITHIN_KERNEL K getInitVal(void){\n"); + strb_appendf(&ctx->s, "\treturn (%s);\n", ctx->initVal); + strb_appends(&ctx->s, "}\n"); + strb_appends(&ctx->s, "\n"); + strb_appends(&ctx->s, "\n"); + strb_appends(&ctx->s, "\n"); +} +static void reduxAppendFuncLoadVal (redux_ctx* ctx){ + int i; + + strb_appends(&ctx->s, "/**\n"); + strb_appends(&ctx->s, " * Multidimensional source element loader.\n"); + strb_appends(&ctx->s, " *\n"); + strb_appends(&ctx->s, " * Also implements prescalar transformations if any.\n"); + strb_appends(&ctx->s, " */\n"); + strb_appends(&ctx->s, "\n"); + appendIdxes (&ctx->s, "WITHIN_KERNEL K loadVal(", "X i", 0, ctx->nds, "", ""); + if(ctx->nds > 0){ + strb_appends(&ctx->s, ", "); + } + strb_appends(&ctx->s, "const GLOBAL_MEM S* src, const GLOBAL_MEM X* srcSteps){\n"); + strb_appends(&ctx->s, "\tS v = (*(const GLOBAL_MEM S*)((const GLOBAL_MEM char*)src + "); + for(i=0;inds;i++){ + strb_appendf(&ctx->s, "i%d*srcSteps[%d] + \\\n\t ", i, ctx->axisList[i]); + } + strb_appends(&ctx->s, "0));\n"); + strb_appends(&ctx->s, "\treturn v;\n"); + strb_appends(&ctx->s, "}\n"); + strb_appends(&ctx->s, "\n"); strb_appends(&ctx->s, "\n"); strb_appends(&ctx->s, "\n"); +} +static void reduxAppendFuncReduxVal (redux_ctx* ctx){ + int i, anyArgsEmitted = 0; + + /* Function Signature. */ + strb_appends(&ctx->s, "/**\n"); + strb_appends(&ctx->s, " * Global memory value reduction function.\n"); + strb_appends(&ctx->s, " *\n"); + strb_appends(&ctx->s, " * Responsible for either:\n"); + strb_appends(&ctx->s, " * 1) Safe writeback of final value to memory, or\n"); + strb_appends(&ctx->s, " * 2) Safe atomic reduction of partial value into memory.\n"); + strb_appends(&ctx->s, " */\n"); + strb_appends(&ctx->s, "\n"); + appendIdxes (&ctx->s, "WITHIN_KERNEL void reduxVal(", "X i", 0, ctx->ndd, "", ""); + anyArgsEmitted = ctx->ndd>0; + if(anyArgsEmitted){ + strb_appends(&ctx->s, ", "); + } + if(reduxKernelRequiresDst (ctx)){ + anyArgsEmitted = 1; + strb_appends(&ctx->s, "GLOBAL_MEM T* dst, const GLOBAL_MEM X* dstSteps, K v"); + } + if(anyArgsEmitted){ + strb_appends(&ctx->s, ", "); + } + if(reduxKernelRequiresDstArg(ctx)){ + anyArgsEmitted = 1; + strb_appends(&ctx->s, "GLOBAL_MEM A* dstArg, const GLOBAL_MEM X* dstArgSteps, X i"); + } + strb_appends(&ctx->s, "){\n"); + + + /* Post-scalar transformations go here. */ + + + /* Write to memory. */ + if(reduxIsLargeCodeModel(ctx)){ + /* Large code model. Easy: just write out the data, since it's safe. */ + if(reduxKernelRequiresDst (ctx)){ + strb_appends(&ctx->s, "\t(*(GLOBAL_MEM T*)((GLOBAL_MEM char*)dst + "); + for(i=0;indd;i++){ + strb_appendf(&ctx->s, "i%d*dstSteps[%d] +\n\t ", i, i); + } + strb_appends(&ctx->s, "0)) = v;\n"); + } + if(reduxKernelRequiresDstArg(ctx)){ + strb_appends(&ctx->s, "\t(*(GLOBAL_MEM A*)((GLOBAL_MEM char*)dstArg + "); + for(i=0;indd;i++){ + strb_appendf(&ctx->s, "i%d*dstArgSteps[%d] +\n\t ", i, i); + } + strb_appends(&ctx->s, "0)) = i;\n"); + } + }else{ + /* BUG: Implement the atomic reduction, one or two CAS loops. 
*/ + if ( reduxKernelRequiresDst (ctx) && !reduxKernelRequiresDstArg(ctx)){ + + }else if(!reduxKernelRequiresDst (ctx) && reduxKernelRequiresDstArg(ctx)){ + + }else if( reduxKernelRequiresDst (ctx) && reduxKernelRequiresDstArg(ctx)){ + + } + } + + /* Close off function. */ + strb_appends(&ctx->s, "}\n"); + strb_appends(&ctx->s, "\n"); + strb_appends(&ctx->s, "\n"); + strb_appends(&ctx->s, "\n"); +} +static void reduxAppendFuncPreKernel (redux_ctx* ctx){ + +} +static void reduxAppendFuncKernel (redux_ctx* ctx){ + reduxAppendPrototype (ctx); + strb_appends (&ctx->s, "{\n"); + reduxAppendOffsets (ctx); + reduxAppendIndexDeclarations(ctx); + reduxAppendRangeCalculations(ctx); + reduxAppendLoops (ctx); + strb_appends (&ctx->s, "}\n"); +} +static void reduxAppendFuncPostKernel (redux_ctx* ctx){ + +} +static void reduxAppendPrototype (redux_ctx* ctx){ + strb_appends(&ctx->s, "/**\n"); + strb_appends(&ctx->s, " * Reduction Kernel.\n"); + strb_appends(&ctx->s, " *\n"); + strb_appends(&ctx->s, " * Implements actual reduction operation.\n"); + strb_appends(&ctx->s, " */\n"); strb_appends(&ctx->s, "\n"); + strb_appends(&ctx->s, "KERNEL void redux(const GLOBAL_MEM S* src,\n"); + strb_appends(&ctx->s, " const X srcOff,\n"); + strb_appends(&ctx->s, " const GLOBAL_MEM X* srcSteps,\n"); + strb_appends(&ctx->s, " const GLOBAL_MEM X* srcSize,\n"); + strb_appends(&ctx->s, " const GLOBAL_MEM X* chunkSize,\n"); + strb_appends(&ctx->s, " GLOBAL_MEM T* dst,\n"); + strb_appends(&ctx->s, " const X dstOff,\n"); + strb_appends(&ctx->s, " const GLOBAL_MEM X* dstSteps,\n"); + strb_appends(&ctx->s, " GLOBAL_MEM A* dstArg,\n"); + strb_appends(&ctx->s, " const X dstArgOff,\n"); + strb_appends(&ctx->s, " const GLOBAL_MEM X* dstArgSteps)"); } -static void maxandargmaxAppendPrototype (maxandargmax_ctx* ctx){ - strb_appends(&ctx->s, "KERNEL void maxandargmax(const GLOBAL_MEM T* src,\n"); - strb_appends(&ctx->s, " const X srcOff,\n"); - strb_appends(&ctx->s, " const GLOBAL_MEM X* srcSteps,\n"); - strb_appends(&ctx->s, " const GLOBAL_MEM X* srcSize,\n"); - strb_appends(&ctx->s, " const GLOBAL_MEM X* chunkSize,\n"); - strb_appends(&ctx->s, " GLOBAL_MEM T* dstMax,\n"); - strb_appends(&ctx->s, " const X dstMaxOff,\n"); - strb_appends(&ctx->s, " const GLOBAL_MEM X* dstMaxSteps,\n"); - strb_appends(&ctx->s, " GLOBAL_MEM X* dstArgmax,\n"); - strb_appends(&ctx->s, " const X dstArgmaxOff,\n"); - strb_appends(&ctx->s, " const GLOBAL_MEM X* dstArgmaxSteps)"); -} -static void maxandargmaxAppendOffsets (maxandargmax_ctx* ctx){ +static void reduxAppendOffsets (redux_ctx* ctx){ strb_appends(&ctx->s, "\t/* Add offsets */\n"); - strb_appends(&ctx->s, "\tsrc = (const GLOBAL_MEM T*)((const GLOBAL_MEM char*)src + srcOff);\n"); - strb_appends(&ctx->s, "\tdstMax = (GLOBAL_MEM T*) ((GLOBAL_MEM char*) dstMax + dstMaxOff);\n"); - strb_appends(&ctx->s, "\tdstArgmax = (GLOBAL_MEM X*) ((GLOBAL_MEM char*) dstArgmax + dstArgmaxOff);\n"); + strb_appends(&ctx->s, "\tsrc = (const GLOBAL_MEM T*)((const GLOBAL_MEM char*)src + srcOff);\n"); + strb_appends(&ctx->s, "\tdst = (GLOBAL_MEM T*) ((GLOBAL_MEM char*) dst + dstOff);\n"); + strb_appends(&ctx->s, "\tdstArg = (GLOBAL_MEM X*) ((GLOBAL_MEM char*) dstArg + dstArgOff);\n"); strb_appends(&ctx->s, "\t\n"); strb_appends(&ctx->s, "\t\n"); } -static void maxandargmaxAppendIndexDeclarations(maxandargmax_ctx* ctx){ +static void reduxAppendIndexDeclarations (redux_ctx* ctx){ int i; - strb_appends(&ctx->s, "\t/* GPU kernel coordinates. Always 3D. */\n"); + strb_appends(&ctx->s, "\t/* GPU kernel coordinates. 
Always 3D in OpenCL/CUDA. */\n"); strb_appends(&ctx->s, "\tX bi0 = GID_0, bi1 = GID_1, bi2 = GID_2;\n"); strb_appends(&ctx->s, "\tX bd0 = LDIM_0, bd1 = LDIM_1, bd2 = LDIM_2;\n"); @@ -393,7 +1317,7 @@ static void maxandargmaxAppendIndexDeclarations(maxandargmax_ctx* ctx){ strb_appends(&ctx->s, "\t\n"); strb_appends(&ctx->s, "\t\n"); } -static void maxandargmaxAppendRangeCalculations(maxandargmax_ctx* ctx){ +static void reduxAppendRangeCalculations (redux_ctx* ctx){ size_t hwDim; int i; @@ -407,10 +1331,10 @@ static void maxandargmaxAppendRangeCalculations(maxandargmax_ctx* ctx){ strb_appendf(&ctx->s, "\ti%dSStep = srcSteps[%d];\n", i, ctx->axisList[i]); } for(i=0;indd;i++){ - strb_appendf(&ctx->s, "\ti%dMStep = dstMaxSteps[%d];\n", i, i); + strb_appendf(&ctx->s, "\ti%dMStep = dstSteps[%d];\n", i, i); } for(i=0;indd;i++){ - strb_appendf(&ctx->s, "\ti%dAStep = dstArgmaxSteps[%d];\n", i, i); + strb_appendf(&ctx->s, "\ti%dAStep = dstArgSteps[%d];\n", i, i); } for(i=ctx->nds-1;i>=ctx->ndd;i--){ /** @@ -426,7 +1350,7 @@ static void maxandargmaxAppendRangeCalculations(maxandargmax_ctx* ctx){ } for(i=0;inds;i++){ /** - * Up to 3 dimensions get to rely on hardware loops. + * Up to MAX_HW_DIMS dimensions get to rely on hardware loops. * The others, if any, have to use software looping beginning at 0. */ @@ -438,7 +1362,7 @@ static void maxandargmaxAppendRangeCalculations(maxandargmax_ctx* ctx){ } for(i=0;inds;i++){ /** - * Up to 3 dimensions get to rely on hardware loops. + * Up to MAX_HW_DIMS dimensions get to rely on hardware loops. * The others, if any, have to use software looping beginning at 0. */ @@ -452,17 +1376,17 @@ static void maxandargmaxAppendRangeCalculations(maxandargmax_ctx* ctx){ strb_appends(&ctx->s, "\t\n"); strb_appends(&ctx->s, "\t\n"); } -static void maxandargmaxAppendLoops (maxandargmax_ctx* ctx){ +static void reduxAppendLoops (redux_ctx* ctx){ strb_appends(&ctx->s, "\t/**\n"); strb_appends(&ctx->s, "\t * FREE LOOPS.\n"); strb_appends(&ctx->s, "\t */\n"); strb_appends(&ctx->s, "\t\n"); - maxandargmaxAppendLoopMacroDefs (ctx); - maxandargmaxAppendLoopOuter (ctx); - maxandargmaxAppendLoopMacroUndefs(ctx); + reduxAppendLoopMacroDefs (ctx); + reduxAppendLoopOuter (ctx); + reduxAppendLoopMacroUndefs(ctx); } -static void maxandargmaxAppendLoopMacroDefs (maxandargmax_ctx* ctx){ +static void reduxAppendLoopMacroDefs (redux_ctx* ctx){ int i; /** @@ -477,16 +1401,6 @@ static void maxandargmaxAppendLoopMacroDefs (maxandargmax_ctx* ctx){ strb_appends(&ctx->s, "#define ESCAPE(idx) if(i##idx >= i##idx##Dim){continue;}\n"); - /** - * SRCINDEXER Macro - */ - - appendIdxes (&ctx->s, "#define SRCINDEXER(", "i", 0, ctx->nds, "", ") (*(GLOBAL_MEM T*)((GLOBAL_MEM char*)src + "); - for(i=0;inds;i++){ - strb_appendf(&ctx->s, "i%d*i%dSStep + \\\n ", i, i); - } - strb_appends(&ctx->s, "0))\n"); - /** * RDXINDEXER Macro */ @@ -496,28 +1410,8 @@ static void maxandargmaxAppendLoopMacroDefs (maxandargmax_ctx* ctx){ strb_appendf(&ctx->s, "i%d*i%dPDim + \\\n ", i, i); } strb_appends(&ctx->s, "0)\n"); - - /** - * DSTMINDEXER Macro - */ - - appendIdxes (&ctx->s, "#define DSTMINDEXER(", "i", 0, ctx->ndd, "", ") (*(GLOBAL_MEM T*)((GLOBAL_MEM char*)dstMax + "); - for(i=0;indd;i++){ - strb_appendf(&ctx->s, "i%d*i%dMStep + \\\n ", i, i); - } - strb_appends(&ctx->s, "0))\n"); - - /** - * DSTAINDEXER Macro - */ - - appendIdxes (&ctx->s, "#define DSTAINDEXER(", "i", 0, ctx->ndd, "", ") (*(GLOBAL_MEM X*)((GLOBAL_MEM char*)dstArgmax + "); - for(i=0;indd;i++){ - strb_appendf(&ctx->s, "i%d*i%dAStep + \\\n ", i, i); - } - 
strb_appends(&ctx->s, "0))\n"); } -static void maxandargmaxAppendLoopOuter (maxandargmax_ctx* ctx){ +static void reduxAppendLoopOuter (redux_ctx* ctx){ int i; /** @@ -532,7 +1426,7 @@ static void maxandargmaxAppendLoopOuter (maxandargmax_ctx* ctx){ * Inner Loop Generation */ - maxandargmaxAppendLoopInner(ctx); + reduxAppendLoopInner(ctx); /** * Outer Loop Trailer Generation @@ -542,87 +1436,111 @@ static void maxandargmaxAppendLoopOuter (maxandargmax_ctx* ctx){ strb_appends(&ctx->s, "\t}\n"); } } -static void maxandargmaxAppendLoopInner (maxandargmax_ctx* ctx){ +static void reduxAppendLoopInner (redux_ctx* ctx){ int i; /** * Inner Loop Prologue */ - strb_appends(&ctx->s, "\t/**\n"); - strb_appends(&ctx->s, "\t * Reduction initialization.\n"); - strb_appends(&ctx->s, "\t */\n"); - strb_appends(&ctx->s, "\t\n"); - - appendIdxes (&ctx->s, "\tT maxV = SRCINDEXER(", "i", 0, ctx->ndd, "", ""); - if(ctx->ndd && ctx->ndr){strb_appends(&ctx->s, ",");} - appendIdxes (&ctx->s, "", "i", ctx->ndd, ctx->nds, "Start", ");\n"); - - appendIdxes (&ctx->s, "\tX maxI = RDXINDEXER(", "i", ctx->ndd, ctx->nds, "Start", ");\n"); - - strb_appends(&ctx->s, "\t\n"); - strb_appends(&ctx->s, "\t/**\n"); - strb_appends(&ctx->s, "\t * REDUCTION LOOPS.\n"); - strb_appends(&ctx->s, "\t */\n"); - strb_appends(&ctx->s, "\t\n"); + strb_appends(&ctx->s, "\t\t/**\n"); + strb_appends(&ctx->s, "\t\t * Reduction initialization.\n"); + strb_appends(&ctx->s, "\t\t */\n"); + strb_appends(&ctx->s, "\t\t\n"); + strb_appends(&ctx->s, "\t\tK rdxV = getInitVal();\n"); + strb_appends(&ctx->s, "\t\tX argI = 0;\n"); + strb_appends(&ctx->s, "\t\t\n"); + strb_appends(&ctx->s, "\t\t/**\n"); + strb_appends(&ctx->s, "\t\t * REDUCTION LOOPS.\n"); + strb_appends(&ctx->s, "\t\t */\n"); + strb_appends(&ctx->s, "\t\t\n"); /** * Inner Loop Header Generation */ for(i=ctx->ndd;inds;i++){ - strb_appendf(&ctx->s, "\tFOROVER(%d){ESCAPE(%d)\n", i, i); + strb_appendf(&ctx->s, "\t\tFOROVER(%d){ESCAPE(%d)\n", i, i); } /** * Inner Loop Body Generation */ - appendIdxes (&ctx->s, "\tT V = SRCINDEXER(", "i", 0, ctx->nds, "", ");\n"); - strb_appends(&ctx->s, "\t\n"); - strb_appends(&ctx->s, "\tif(V > maxV){\n"); - strb_appends(&ctx->s, "\t\tmaxV = V;\n"); - appendIdxes (&ctx->s, "\t\tmaxI = RDXINDEXER(", "i", ctx->ndd, ctx->nds, "", ");\n"); - strb_appends(&ctx->s, "\t}\n"); + appendIdxes (&ctx->s, "\t\t\tK v = loadVal(", "i", 0, ctx->nds, "", ""); + if(ctx->nds > 0){ + strb_appends(&ctx->s, ", "); + } + strb_appends(&ctx->s, "src, srcSteps);\n"); + strb_appends(&ctx->s, "\t\t\t\n"); + switch(ctx->op){ + case GA_REDUCE_SUM: strb_appends(&ctx->s, "\t\t\trdxV += v;\n"); break; + case GA_REDUCE_PROD: strb_appends(&ctx->s, "\t\t\trdxV *= v;\n"); break; + case GA_REDUCE_PRODNZ: strb_appends(&ctx->s, "\t\t\trdxV *= v==0 ? 
getInitVal() : v;\n"); break; + case GA_REDUCE_MIN: strb_appends(&ctx->s, "\t\t\trdxV = min(rdxV, v);\n"); break; + case GA_REDUCE_MAX: strb_appends(&ctx->s, "\t\t\trdxV = max(rdxV, v);\n"); break; + case GA_REDUCE_ARGMIN: + case GA_REDUCE_MINANDARGMIN: + strb_appends(&ctx->s, "\t\t\trdxV = min(rdxV, v);\n"); + strb_appends(&ctx->s, "\t\t\tif(v == rdxV){\n"); + appendIdxes (&ctx->s, "\t\t\t\targI = RDXINDEXER(", "i", ctx->ndd, ctx->nds, "", ");\n"); + strb_appends(&ctx->s, "\t\t\t}\n"); + break; + case GA_REDUCE_ARGMAX: + case GA_REDUCE_MAXANDARGMAX: + strb_appends(&ctx->s, "\t\t\trdxV = max(rdxV, v);\n"); + strb_appends(&ctx->s, "\t\t\tif(v == rdxV){\n"); + appendIdxes (&ctx->s, "\t\t\t\targI = RDXINDEXER(", "i", ctx->ndd, ctx->nds, "", ");\n"); + strb_appends(&ctx->s, "\t\t\t}\n"); + break; + case GA_REDUCE_AND: strb_appends(&ctx->s, "\t\t\trdxV &= v;\n"); break; + case GA_REDUCE_OR: strb_appends(&ctx->s, "\t\t\trdxV |= v;\n"); break; + case GA_REDUCE_XOR: strb_appends(&ctx->s, "\t\t\trdxV ^= v;\n"); break; + case GA_REDUCE_ALL: strb_appends(&ctx->s, "\t\t\trdxV = rdxV && v;\n"); break; + case GA_REDUCE_ANY: strb_appends(&ctx->s, "\t\t\trdxV = rdxV || v;\n"); break; + } /** * Inner Loop Trailer Generation */ for(i=ctx->ndd;inds;i++){ - strb_appends(&ctx->s, "\t}\n"); + strb_appends(&ctx->s, "\t\t}\n"); } - strb_appends(&ctx->s, "\t\n"); + strb_appends(&ctx->s, "\t\t\n"); /** * Inner Loop Epilogue Generation */ - strb_appends(&ctx->s, "\t/**\n"); - strb_appends(&ctx->s, "\t * Destination writeback.\n"); - strb_appends(&ctx->s, "\t */\n"); - strb_appends(&ctx->s, "\t\n"); - appendIdxes (&ctx->s, "\tDSTMINDEXER(", "i", 0, ctx->ndd, "", ") = maxV;\n"); - appendIdxes (&ctx->s, "\tDSTAINDEXER(", "i", 0, ctx->ndd, "", ") = maxI;\n"); + strb_appends(&ctx->s, "\t\t/**\n"); + strb_appends(&ctx->s, "\t\t * Destination writeback.\n"); + strb_appends(&ctx->s, "\t\t */\n"); + strb_appends(&ctx->s, "\t\t\n"); + if ( reduxKernelRequiresDst (ctx) && !reduxKernelRequiresDstArg(ctx)){ + appendIdxes (&ctx->s, "\t\treduxVal(", "i", 0, ctx->ndd, "", ""); + if(ctx->ndd > 0){ + strb_appends(&ctx->s, ", "); + } + strb_appends(&ctx->s, "dst, dstSteps, rdxV);\n"); + }else if(!reduxKernelRequiresDst (ctx) && reduxKernelRequiresDstArg(ctx)){ + appendIdxes (&ctx->s, "\t\treduxVal(", "i", 0, ctx->ndd, "", ""); + if(ctx->ndd > 0){ + strb_appends(&ctx->s, ", "); + } + strb_appends(&ctx->s, "dstArg, dstArgSteps, argI);\n"); + }else if( reduxKernelRequiresDst (ctx) && reduxKernelRequiresDstArg(ctx)){ + appendIdxes (&ctx->s, "\t\treduxVal(", "i", 0, ctx->ndd, "", ""); + if(ctx->ndd > 0){ + strb_appends(&ctx->s, ", "); + } + strb_appends(&ctx->s, "dst, dstSteps, rdxV, dstArg, dstArgSteps, argI);\n"); + } } -static void maxandargmaxAppendLoopMacroUndefs (maxandargmax_ctx* ctx){ +static void reduxAppendLoopMacroUndefs (redux_ctx* ctx){ strb_appends(&ctx->s, "#undef FOROVER\n"); strb_appends(&ctx->s, "#undef ESCAPE\n"); - strb_appends(&ctx->s, "#undef SRCINDEXER\n"); strb_appends(&ctx->s, "#undef RDXINDEXER\n"); - strb_appends(&ctx->s, "#undef DSTMINDEXER\n"); - strb_appends(&ctx->s, "#undef DSTAINDEXER\n"); -} -static void maxandargmaxComputeAxisList (maxandargmax_ctx* ctx){ - int i, f=0; - - for(i=0;inds;i++){ - if(axisInSet(i, ctx->reduxList, ctx->ndr, 0)){ - continue; - } - ctx->axisList[f++] = i; - } - memcpy(&ctx->axisList[f], ctx->reduxList, ctx->ndr * sizeof(*ctx->reduxList)); } /** @@ -631,59 +1549,65 @@ static void maxandargmaxComputeAxisList (maxandargmax_ctx* ctx){ * @return */ -static int maxandargmaxCompile 
(maxandargmax_ctx* ctx){ +static int reduxCompileLarge (redux_ctx* ctx){ const int ARG_TYPECODES[] = { GA_BUFFER, /* src */ GA_SIZE, /* srcOff */ GA_BUFFER, /* srcSteps */ GA_BUFFER, /* srcSize */ GA_BUFFER, /* chnkSize */ - GA_BUFFER, /* dstMax */ - GA_SIZE, /* dstMaxOff */ - GA_BUFFER, /* dstMaxSteps */ - GA_BUFFER, /* dstArgmax */ - GA_SIZE, /* dstArgmaxOff */ - GA_BUFFER /* dstArgmaxSteps */ + GA_BUFFER, /* dst */ + GA_SIZE, /* dstOff */ + GA_BUFFER, /* dstSteps */ + GA_BUFFER, /* dstArg */ + GA_SIZE, /* dstArgOff */ + GA_BUFFER /* dstArgSteps */ }; - const unsigned int ARG_TYPECODES_LEN = sizeof(ARG_TYPECODES)/sizeof(*ARG_TYPECODES); - const char* SRCS[1]; - - SRCS[0] = ctx->sourceCode; - - ctx->ret = GpuKernel_init(&ctx->kernel, + const size_t ARG_TYPECODES_LEN = sizeof(ARG_TYPECODES)/sizeof(*ARG_TYPECODES); + const char* SRCS[1] = {ctx->sourceCode}; + const size_t SRC_LENS[1] = {strlen(ctx->sourceCode)}; + const size_t SRCS_LEN = sizeof(SRCS)/sizeof(*SRCS); + + int ret = GpuKernel_init(&ctx->kernel, ctx->gpuCtx, - 1, + SRCS_LEN, SRCS, - NULL, - "maxandargmax", + SRC_LENS, + "redux", ARG_TYPECODES_LEN, ARG_TYPECODES, 0, (char**)0); - free(ctx->sourceCode); - ctx->sourceCode = NULL; - return ctx->ret; + if(ret != GA_NO_ERROR){ + return reduxCleanup(ctx, ret); + }else{ + return reduxScheduleLarge(ctx); + } +} +static int reduxCompileSmall (redux_ctx* ctx){ + /* BUG: Implement small code model. */ + return reduxCompileLarge(ctx); } /** * Compute a good thread block size / grid size / software chunk size for Nvidia. */ -static int maxandargmaxSchedule (maxandargmax_ctx* ctx){ +static int reduxScheduleLarge (redux_ctx* ctx){ int i; size_t warpMod; size_t bestWarpMod = 1; unsigned bestWarpAxis = 0; uint64_t maxLg; - uint64_t maxLs[3]; + uint64_t maxLs[MAX_HW_DIMS]; uint64_t maxGg; - uint64_t maxGs[3]; - uint64_t dims [3]; - double slack[3]; - ga_factor_list factBS[3]; - ga_factor_list factGS[3]; - ga_factor_list factCS[3]; + uint64_t maxGs [MAX_HW_DIMS]; + uint64_t dims [MAX_HW_DIMS]; + double slack [MAX_HW_DIMS]; + ga_factor_list factBS[MAX_HW_DIMS]; + ga_factor_list factGS[MAX_HW_DIMS]; + ga_factor_list factCS[MAX_HW_DIMS]; /** @@ -772,76 +1696,78 @@ static int maxandargmaxSchedule (maxandargmax_ctx* ctx){ } /* Return. */ - return ctx->ret=GA_NO_ERROR; + return reduxInvokeLarge(ctx); } /** * Invoke the kernel. */ -static int maxandargmaxInvoke (maxandargmax_ctx* ctx){ +static int reduxInvokeLarge (redux_ctx* ctx){ void* args[11]; + int ret; /** * Argument Marshalling. This the grossest gross thing in here. 
*/ - const int flags = GA_BUFFER_READ_ONLY|GA_BUFFER_INIT; - ctx->srcStepsGD = gpudata_alloc(ctx->gpuCtx, ctx->nds * sizeof(size_t), - ctx->src->strides, flags, 0); - ctx->srcSizeGD = gpudata_alloc(ctx->gpuCtx, ctx->nds * sizeof(size_t), - ctx->src->dimensions, flags, 0); - ctx->chunkSizeGD = gpudata_alloc(ctx->gpuCtx, ctx->ndh * sizeof(size_t), - ctx->chunkSize, flags, 0); - ctx->dstMaxStepsGD = gpudata_alloc(ctx->gpuCtx, ctx->ndd * sizeof(size_t), - ctx->dstMax->strides, flags, 0); - ctx->dstArgmaxStepsGD = gpudata_alloc(ctx->gpuCtx, ctx->ndd * sizeof(size_t), - ctx->dstArgmax->strides, flags, 0); + const int flags = GA_BUFFER_READ_ONLY|GA_BUFFER_INIT; + ctx->srcStepsGD = gpudata_alloc(ctx->gpuCtx, ctx->nds * sizeof(size_t), + ctx->src->strides, flags, 0); + ctx->srcSizeGD = gpudata_alloc(ctx->gpuCtx, ctx->nds * sizeof(size_t), + ctx->src->dimensions, flags, 0); + ctx->chunkSizeGD = gpudata_alloc(ctx->gpuCtx, ctx->ndh * sizeof(size_t), + ctx->chunkSize, flags, 0); + ctx->dstStepsGD = gpudata_alloc(ctx->gpuCtx, ctx->ndd * sizeof(size_t), + ctx->dst->strides, flags, 0); + ctx->dstArgStepsGD = gpudata_alloc(ctx->gpuCtx, ctx->ndd * sizeof(size_t), + ctx->dstArg->strides, flags, 0); args[ 0] = (void*) ctx->src->data; args[ 1] = (void*)&ctx->src->offset; args[ 2] = (void*) ctx->srcStepsGD; args[ 3] = (void*) ctx->srcSizeGD; args[ 4] = (void*) ctx->chunkSizeGD; - args[ 5] = (void*) ctx->dstMax->data; - args[ 6] = (void*)&ctx->dstMax->offset; - args[ 7] = (void*) ctx->dstMaxStepsGD; - args[ 8] = (void*) ctx->dstArgmax->data; - args[ 9] = (void*)&ctx->dstArgmax->offset; - args[10] = (void*) ctx->dstArgmaxStepsGD; - - if(ctx->srcStepsGD && - ctx->srcSizeGD && - ctx->chunkSizeGD && - ctx->dstMaxStepsGD && - ctx->dstArgmaxStepsGD){ - ctx->ret = GpuKernel_call(&ctx->kernel, - ctx->ndh>0 ? ctx->ndh : 1, - ctx->gridSize, - ctx->blockSize, - 0, - args); + args[ 5] = (void*) ctx->dst->data; + args[ 6] = (void*)&ctx->dst->offset; + args[ 7] = (void*) ctx->dstStepsGD; + args[ 8] = (void*) ctx->dstArg->data; + args[ 9] = (void*)&ctx->dstArg->offset; + args[10] = (void*) ctx->dstArgStepsGD; + + if(ctx->srcStepsGD && + ctx->srcSizeGD && + ctx->chunkSizeGD && + ctx->dstStepsGD && + ctx->dstArgStepsGD){ + ret = GpuKernel_call(&ctx->kernel, + ctx->ndh>0 ? ctx->ndh : 1, + ctx->gridSize, + ctx->blockSize, + 0, + args); + return reduxCleanup(ctx, ret); }else{ - ctx->ret = GA_MEMORY_ERROR; + return reduxCleanup(ctx, GA_MEMORY_ERROR); } - - gpudata_release(ctx->srcStepsGD); - gpudata_release(ctx->srcSizeGD); - gpudata_release(ctx->chunkSizeGD); - gpudata_release(ctx->dstMaxStepsGD); - gpudata_release(ctx->dstArgmaxStepsGD); - - return ctx->ret; } /** * Cleanup */ -static int maxandargmaxCleanup (maxandargmax_ctx* ctx){ +static int reduxCleanup (redux_ctx* ctx, int ret){ free(ctx->axisList); free(ctx->sourceCode); - ctx->axisList = NULL; - ctx->sourceCode = NULL; + ctx->axisList = NULL; + ctx->sourceCode = NULL; + + gpudata_release(ctx->srcStepsGD); + gpudata_release(ctx->srcSizeGD); + gpudata_release(ctx->chunkSizeGD); + gpudata_release(ctx->dstStepsGD); + gpudata_release(ctx->dstArgStepsGD); + ctx->srcStepsGD = ctx->srcSizeGD = ctx->chunkSizeGD = + ctx->dstStepsGD = ctx->dstArgStepsGD = NULL; - return ctx->ret; + return ret; } diff --git a/tests/check_reduction.c b/tests/check_reduction.c index ca3f231bf4..d8c14aa572 100644 --- a/tests/check_reduction.c +++ b/tests/check_reduction.c @@ -67,15 +67,14 @@ static double pcgRand01(void){ * Test cases. 
*/ -START_TEST(test_reduction){ +START_TEST(test_maxandargmax_reduction){ + pcgSeed(1); + /** * We test here a reduction of some random 3D tensor on the first and * third dimensions. */ - GpuArray gaSrc; - GpuArray gaMax; - GpuArray gaArgmax; size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; @@ -94,7 +93,6 @@ START_TEST(test_reduction){ * Initialize source data. */ - pcgSeed(1); for(i=0;i Date: Wed, 25 Jan 2017 19:23:44 -0500 Subject: [PATCH 02/34] Add strb_init() function. It allows initializing at runtime an strb. This can't always be done at compile-time, for instance if it is dynamically allocated. --- src/util/strb.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/util/strb.h b/src/util/strb.h index 3289de5796..223145908e 100644 --- a/src/util/strb.h +++ b/src/util/strb.h @@ -77,6 +77,15 @@ static inline int strb_error(strb *sb) { return sb->l == (size_t)-1; } +/* + * Initialize at runtime an strb. + */ + +static inline void strb_init(strb* sb){ + const strb s = STRB_STATIC_INIT; + *sb = s; +} + /* * Clear any allocation the strb may have done and reset all of its From a21bcb575f1deb5b6ccfc6a66682c3b6edb2b0e3 Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Wed, 25 Jan 2017 19:50:58 -0500 Subject: [PATCH 03/34] Moved the reduction API to reduction.h. --- src/CMakeLists.txt | 1 + src/gpuarray/array.h | 125 ------------------------------- src/gpuarray/reduction.h | 157 +++++++++++++++++++++++++++++++++++++++ src/gpuarray_reduction.c | 71 ++++++++++++------ tests/check_reduction.c | 81 +++++++++++++++++++- 5 files changed, 284 insertions(+), 151 deletions(-) create mode 100644 src/gpuarray/reduction.h diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index b687e5da1a..1505014e5e 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -143,6 +143,7 @@ set(headers gpuarray/extension.h gpuarray/ext_cuda.h gpuarray/kernel.h + gpuarray/reduction.h gpuarray/types.h gpuarray/util.h ) diff --git a/src/gpuarray/array.h b/src/gpuarray/array.h index 5ea9377b9a..639d176489 100644 --- a/src/gpuarray/array.h +++ b/src/gpuarray/array.h @@ -118,27 +118,6 @@ typedef enum _ga_order { GA_F_ORDER=1 } ga_order; -/** - * Supported array reduction operations. - */ - -typedef enum _ga_reduce_op { - GA_REDUCE_SUM, /* + */ - GA_REDUCE_PROD, /* * */ - GA_REDUCE_PRODNZ, /* * (!=0) */ - GA_REDUCE_MIN, /* min() */ - GA_REDUCE_MAX, /* max() */ - GA_REDUCE_ARGMIN, /* argmin() */ - GA_REDUCE_ARGMAX, /* argmax() */ - GA_REDUCE_MINANDARGMIN, /* min(), argmin() */ - GA_REDUCE_MAXANDARGMAX, /* max(), argmax() */ - GA_REDUCE_AND, /* & */ - GA_REDUCE_OR, /* | */ - GA_REDUCE_XOR, /* ^ */ - GA_REDUCE_ALL, /* &&/all() */ - GA_REDUCE_ANY, /* ||/any() */ -} ga_reduce_op; - /** * Checks if all the specified flags are set. * @@ -626,110 +605,6 @@ GPUARRAY_PUBLIC void GpuArray_fprintf(FILE *fd, const GpuArray *a); GPUARRAY_PUBLIC int GpuArray_fdump(FILE *fd, const GpuArray *a); -/** - * @brief Compute a reduction sum (+), product (*), non-zero product (* != 0), - * min, max, argmin, argmax, min-and-argmin, max-and-argmax, and (&), - * or (|), xor (^), all (&&) or any (||) over a list of axes to reduce. - * - * Returns one (in the case of min-and-argmin/max-and-argmax, two) destination - * tensors. The destination tensor(s)' axes are a strict subset of the axes of the - * source tensor. The axes to be reduced are specified by the caller, and the - * reduction is performed over these axes, which are then removed in the - * destination. 
- * - * @param [out] dst The destination tensor. Has the same type as the source. - * @param [out] dstArg For argument of minima/maxima operations. Has type int64. - * @param [in] src The source tensor. - * @param [in] reduxLen The number of axes reduced. Must be >= 1 and - * <= src->nd. - * @param [in] reduxList A list of integers of length reduxLen, indicating - * the axes to be reduced. The order of the axes - * matters for dstArg index calculations (GpuArray_argmin, - * GpuArray_argmax, GpuArray_minandargmin, - * GpuArray_maxandargmax). All entries in the list must be - * unique, >= 0 and < src->nd. - * - * For example, if a 5D-tensor is max-reduced with an axis - * list of [3,4,1], then reduxLen shall be 3, and the - * index calculation in every point shall take the form - * - * dstArgmax[i0,i2] = i3 * src.shape[4] * src.shape[1] + - * i4 * src.shape[1] + - * i1 - * - * where (i3,i4,i1) are the coordinates of the maximum- - * valued element within subtensor [i0,:,i2,:,:] of src. - * @return GA_NO_ERROR if the operation was successful, or a non-zero error - * code otherwise. - */ - -GPUARRAY_PUBLIC int GpuArray_sum (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); -GPUARRAY_PUBLIC int GpuArray_prod (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); -GPUARRAY_PUBLIC int GpuArray_prodnz (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); -GPUARRAY_PUBLIC int GpuArray_min (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); -GPUARRAY_PUBLIC int GpuArray_max (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); -GPUARRAY_PUBLIC int GpuArray_argmin (GpuArray* dstArg, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); -GPUARRAY_PUBLIC int GpuArray_argmax (GpuArray* dstArg, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); -GPUARRAY_PUBLIC int GpuArray_minandargmin(GpuArray* dst, - GpuArray* dstArg, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); -GPUARRAY_PUBLIC int GpuArray_maxandargmax(GpuArray* dst, - GpuArray* dstArg, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); -GPUARRAY_PUBLIC int GpuArray_and (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); -GPUARRAY_PUBLIC int GpuArray_or (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); -GPUARRAY_PUBLIC int GpuArray_xor (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); -GPUARRAY_PUBLIC int GpuArray_all (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); -GPUARRAY_PUBLIC int GpuArray_any (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); -GPUARRAY_PUBLIC int GpuArray_reduction (ga_reduce_op op, - GpuArray* dst, - GpuArray* dstArg, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); - - - #ifdef __cplusplus diff --git a/src/gpuarray/reduction.h b/src/gpuarray/reduction.h new file mode 100644 index 0000000000..1db5664535 --- /dev/null +++ b/src/gpuarray/reduction.h @@ -0,0 +1,157 @@ +#ifndef GPUARRAY_REDUCTION_H +#define GPUARRAY_REDUCTION_H +/** + * \file reduction.h + * \brief Reduction functions. 
+ */ + +#include + +#ifdef _MSC_VER +#ifndef inline +#define inline __inline +#endif +#endif + +#ifdef __cplusplus +extern "C" { +#endif +#ifdef CONFUSE_EMACS +} +#endif + + +/** + * Supported array reduction operations. + */ + +typedef enum _ga_reduce_op { + GA_REDUCE_SUM, /* + */ + GA_REDUCE_PROD, /* * */ + GA_REDUCE_PRODNZ, /* * (!=0) */ + GA_REDUCE_MIN, /* min() */ + GA_REDUCE_MAX, /* max() */ + GA_REDUCE_ARGMIN, /* argmin() */ + GA_REDUCE_ARGMAX, /* argmax() */ + GA_REDUCE_MINANDARGMIN, /* min(), argmin() */ + GA_REDUCE_MAXANDARGMAX, /* max(), argmax() */ + GA_REDUCE_AND, /* & */ + GA_REDUCE_OR, /* | */ + GA_REDUCE_XOR, /* ^ */ + GA_REDUCE_ALL, /* &&/all() */ + GA_REDUCE_ANY, /* ||/any() */ +} ga_reduce_op; + + + +/** + * @brief Compute a reduction sum (+), product (*), non-zero product (* != 0), + * min, max, argmin, argmax, min-and-argmin, max-and-argmax, and (&), + * or (|), xor (^), all (&&) or any (||) over a list of axes to reduce. + * + * Returns one (in the case of min-and-argmin/max-and-argmax, two) destination + * tensors. The destination tensor(s)' axes are a strict subset of the axes of the + * source tensor. The axes to be reduced are specified by the caller, and the + * reduction is performed over these axes, which are then removed in the + * destination. + * + * @param [out] dst The destination tensor. Has the same type as the source. + * @param [out] dstArg For argument of minima/maxima operations. Has type int64. + * @param [in] src The source tensor. + * @param [in] reduxLen The number of axes reduced. Must be >= 1 and + * <= src->nd. + * @param [in] reduxList A list of integers of length reduxLen, indicating + * the axes to be reduced. The order of the axes + * matters for dstArg index calculations (GpuArray_argmin, + * GpuArray_argmax, GpuArray_minandargmin, + * GpuArray_maxandargmax). All entries in the list must be + * unique, >= 0 and < src->nd. + * + * For example, if a 5D-tensor is max-reduced with an axis + * list of [3,4,1], then reduxLen shall be 3, and the + * index calculation in every point shall take the form + * + * dstArgmax[i0,i2] = i3 * src.shape[4] * src.shape[1] + + * i4 * src.shape[1] + + * i1 + * + * where (i3,i4,i1) are the coordinates of the maximum- + * valued element within subtensor [i0,:,i2,:,:] of src. + * @return GA_NO_ERROR if the operation was successful, or a non-zero error + * code otherwise. 
+ */ + +GPUARRAY_PUBLIC int GpuArray_sum (GpuArray* dst, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList); +GPUARRAY_PUBLIC int GpuArray_prod (GpuArray* dst, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList); +GPUARRAY_PUBLIC int GpuArray_prodnz (GpuArray* dst, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList); +GPUARRAY_PUBLIC int GpuArray_min (GpuArray* dst, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList); +GPUARRAY_PUBLIC int GpuArray_max (GpuArray* dst, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList); +GPUARRAY_PUBLIC int GpuArray_argmin (GpuArray* dstArg, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList); +GPUARRAY_PUBLIC int GpuArray_argmax (GpuArray* dstArg, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList); +GPUARRAY_PUBLIC int GpuArray_minandargmin(GpuArray* dst, + GpuArray* dstArg, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList); +GPUARRAY_PUBLIC int GpuArray_maxandargmax(GpuArray* dst, + GpuArray* dstArg, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList); +GPUARRAY_PUBLIC int GpuArray_and (GpuArray* dst, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList); +GPUARRAY_PUBLIC int GpuArray_or (GpuArray* dst, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList); +GPUARRAY_PUBLIC int GpuArray_xor (GpuArray* dst, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList); +GPUARRAY_PUBLIC int GpuArray_all (GpuArray* dst, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList); +GPUARRAY_PUBLIC int GpuArray_any (GpuArray* dst, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList); +GPUARRAY_PUBLIC int GpuArray_reduction (ga_reduce_op op, + GpuArray* dst, + GpuArray* dstArg, + const GpuArray* src, + unsigned reduxLen, + const unsigned* reduxList); + + + + + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/gpuarray_reduction.c b/src/gpuarray_reduction.c index 8a6a2dc98b..859da8e272 100644 --- a/src/gpuarray_reduction.c +++ b/src/gpuarray_reduction.c @@ -12,9 +12,9 @@ #include #include "private.h" -#include "gpuarray/array.h" #include "gpuarray/error.h" #include "gpuarray/kernel.h" +#include "gpuarray/reduction.h" #include "gpuarray/util.h" #include "util/strb.h" @@ -704,7 +704,6 @@ static void appendIdxes (strb* s, static int reduxCheckargs (redux_ctx* ctx){ int i, ret; - const strb INIT_STRB = STRB_STATIC_INIT; /** * We initialize certain parts of the context. @@ -720,7 +719,7 @@ static int reduxCheckargs (redux_ctx* ctx){ ctx->ndhd = 0; ctx->ndhr = 0; ctx->sourceCode = NULL; - ctx->s = INIT_STRB; + strb_init(&ctx->s); for(i=0;ihwAxisList[i] = 0; @@ -1169,6 +1168,10 @@ static void reduxAppendFuncLoadVal (redux_ctx* ctx){ strb_appendf(&ctx->s, "i%d*srcSteps[%d] + \\\n\t ", i, ctx->axisList[i]); } strb_appends(&ctx->s, "0));\n"); + + /* Prescalar transformations go here... */ + + /* Return the value. 
*/ strb_appends(&ctx->s, "\treturn v;\n"); strb_appends(&ctx->s, "}\n"); strb_appends(&ctx->s, "\n"); @@ -1189,17 +1192,17 @@ static void reduxAppendFuncReduxVal (redux_ctx* ctx){ strb_appends(&ctx->s, "\n"); appendIdxes (&ctx->s, "WITHIN_KERNEL void reduxVal(", "X i", 0, ctx->ndd, "", ""); anyArgsEmitted = ctx->ndd>0; - if(anyArgsEmitted){ - strb_appends(&ctx->s, ", "); - } if(reduxKernelRequiresDst (ctx)){ + if(anyArgsEmitted){ + strb_appends(&ctx->s, ", "); + } anyArgsEmitted = 1; strb_appends(&ctx->s, "GLOBAL_MEM T* dst, const GLOBAL_MEM X* dstSteps, K v"); } - if(anyArgsEmitted){ - strb_appends(&ctx->s, ", "); - } if(reduxKernelRequiresDstArg(ctx)){ + if(anyArgsEmitted){ + strb_appends(&ctx->s, ", "); + } anyArgsEmitted = 1; strb_appends(&ctx->s, "GLOBAL_MEM A* dstArg, const GLOBAL_MEM X* dstArgSteps, X i"); } @@ -1248,12 +1251,12 @@ static void reduxAppendFuncPreKernel (redux_ctx* ctx){ } static void reduxAppendFuncKernel (redux_ctx* ctx){ reduxAppendPrototype (ctx); - strb_appends (&ctx->s, "{\n"); + strb_appends (&ctx->s, "{\n"); reduxAppendOffsets (ctx); reduxAppendIndexDeclarations(ctx); reduxAppendRangeCalculations(ctx); reduxAppendLoops (ctx); - strb_appends (&ctx->s, "}\n"); + strb_appends (&ctx->s, "}\n"); } static void reduxAppendFuncPostKernel (redux_ctx* ctx){ @@ -1280,8 +1283,12 @@ static void reduxAppendPrototype (redux_ctx* ctx){ static void reduxAppendOffsets (redux_ctx* ctx){ strb_appends(&ctx->s, "\t/* Add offsets */\n"); strb_appends(&ctx->s, "\tsrc = (const GLOBAL_MEM T*)((const GLOBAL_MEM char*)src + srcOff);\n"); - strb_appends(&ctx->s, "\tdst = (GLOBAL_MEM T*) ((GLOBAL_MEM char*) dst + dstOff);\n"); - strb_appends(&ctx->s, "\tdstArg = (GLOBAL_MEM X*) ((GLOBAL_MEM char*) dstArg + dstArgOff);\n"); + if(reduxKernelRequiresDst(ctx)){ + strb_appends(&ctx->s, "\tdst = (GLOBAL_MEM T*) ((GLOBAL_MEM char*) dst + dstOff);\n"); + } + if(reduxKernelRequiresDstArg(ctx)){ + strb_appends(&ctx->s, "\tdstArg = (GLOBAL_MEM X*) ((GLOBAL_MEM char*) dstArg + dstArgOff);\n"); + } strb_appends(&ctx->s, "\t\n"); strb_appends(&ctx->s, "\t\n"); } @@ -1448,7 +1455,9 @@ static void reduxAppendLoopInner (redux_ctx* ctx){ strb_appends(&ctx->s, "\t\t */\n"); strb_appends(&ctx->s, "\t\t\n"); strb_appends(&ctx->s, "\t\tK rdxV = getInitVal();\n"); - strb_appends(&ctx->s, "\t\tX argI = 0;\n"); + if(reduxKernelRequiresDstArg(ctx)){ + strb_appends(&ctx->s, "\t\tX argI = 0;\n"); + } strb_appends(&ctx->s, "\t\t\n"); strb_appends(&ctx->s, "\t\t/**\n"); strb_appends(&ctx->s, "\t\t * REDUCTION LOOPS.\n"); @@ -1718,21 +1727,35 @@ static int reduxInvokeLarge (redux_ctx* ctx){ ctx->src->dimensions, flags, 0); ctx->chunkSizeGD = gpudata_alloc(ctx->gpuCtx, ctx->ndh * sizeof(size_t), ctx->chunkSize, flags, 0); - ctx->dstStepsGD = gpudata_alloc(ctx->gpuCtx, ctx->ndd * sizeof(size_t), - ctx->dst->strides, flags, 0); - ctx->dstArgStepsGD = gpudata_alloc(ctx->gpuCtx, ctx->ndd * sizeof(size_t), - ctx->dstArg->strides, flags, 0); + if(reduxKernelRequiresDst(ctx)){ + ctx->dstStepsGD = gpudata_alloc(ctx->gpuCtx, ctx->ndd * sizeof(size_t), + ctx->dst->strides, flags, 0); + } + if(reduxKernelRequiresDstArg(ctx)){ + ctx->dstArgStepsGD = gpudata_alloc(ctx->gpuCtx, ctx->ndd * sizeof(size_t), + ctx->dstArg->strides, flags, 0); + } args[ 0] = (void*) ctx->src->data; args[ 1] = (void*)&ctx->src->offset; args[ 2] = (void*) ctx->srcStepsGD; args[ 3] = (void*) ctx->srcSizeGD; args[ 4] = (void*) ctx->chunkSizeGD; - args[ 5] = (void*) ctx->dst->data; - args[ 6] = (void*)&ctx->dst->offset; - args[ 7] = (void*) ctx->dstStepsGD; - 
args[ 8] = (void*) ctx->dstArg->data; - args[ 9] = (void*)&ctx->dstArg->offset; - args[10] = (void*) ctx->dstArgStepsGD; + if ( reduxKernelRequiresDst (ctx) && reduxKernelRequiresDstArg(ctx)){ + args[ 5] = (void*) ctx->dst->data; + args[ 6] = (void*)&ctx->dst->offset; + args[ 7] = (void*) ctx->dstStepsGD; + args[ 8] = (void*) ctx->dstArg->data; + args[ 9] = (void*)&ctx->dstArg->offset; + args[10] = (void*) ctx->dstArgStepsGD; + }else if( reduxKernelRequiresDst (ctx) && !reduxKernelRequiresDstArg(ctx)){ + args[ 5] = (void*) ctx->dst->data; + args[ 6] = (void*)&ctx->dst->offset; + args[ 7] = (void*) ctx->dstStepsGD; + }else if(!reduxKernelRequiresDst (ctx) && reduxKernelRequiresDstArg(ctx)){ + args[ 5] = (void*) ctx->dstArg->data; + args[ 6] = (void*)&ctx->dstArg->offset; + args[ 7] = (void*) ctx->dstArgStepsGD; + } if(ctx->srcStepsGD && ctx->srcSizeGD && diff --git a/tests/check_reduction.c b/tests/check_reduction.c index d8c14aa572..2d47d6541d 100644 --- a/tests/check_reduction.c +++ b/tests/check_reduction.c @@ -1,8 +1,7 @@ #include -#include -#include #include +#include #include #include @@ -610,6 +609,83 @@ START_TEST(test_minandargmin_alldimsreduced){ GpuArray_clear(&gaArgmin); }END_TEST +START_TEST(test_min_alldimsreduced){ + pcgSeed(1); + + /** + * We test here a reduction of some random 3D tensor on all dimensions. + */ + + size_t i,j,k; + size_t dims[3] = {32,50,79}; + size_t prodDims = dims[0]*dims[1]*dims[2]; + const unsigned reduxList[] = {0,1,2}; + + float* pSrc = calloc(1, sizeof(*pSrc) * dims[0]*dims[1]*dims[2]); + float* pMin = calloc(1, sizeof(*pMin) ); + + ck_assert_ptr_ne(pSrc, NULL); + ck_assert_ptr_ne(pMin, NULL); + + + /** + * Initialize source data. + */ + + for(i=0;i Date: Wed, 25 Jan 2017 20:09:18 -0500 Subject: [PATCH 04/34] Feedback Applied. Spaces after a bunch of keywords, do certain string appends more efficiently, no appending includes. 
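A minimal sketch of the two mechanical rewrites applied here, using only the
existing strb_appends() calls from the generator (illustrative sketch, not a
hunk from this patch):

    /* Before: keyword glued to the parenthesis, one append per newline. */
    if(ret != GA_NO_ERROR){
        strb_appends(&ctx->s, "\n");
        strb_appends(&ctx->s, "\n");
        strb_appends(&ctx->s, "\n");
    }

    /* After: space after the keyword, the newlines emitted in one append. */
    if (ret != GA_NO_ERROR){
        strb_appends(&ctx->s, "\n\n\n");
    }

Merging consecutive appends of literal strings produces the same generated
text while doing the strb bookkeeping once.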
--- src/gpuarray_reduction.c | 289 ++++++++++++++++++--------------------- 1 file changed, 136 insertions(+), 153 deletions(-) diff --git a/src/gpuarray_reduction.c b/src/gpuarray_reduction.c index 859da8e272..5534d84c36 100644 --- a/src/gpuarray_reduction.c +++ b/src/gpuarray_reduction.c @@ -395,8 +395,8 @@ GPUARRAY_PUBLIC int GpuArray_reduction (ga_reduce_op op, unsigned reduxLen, const unsigned* reduxList){ redux_ctx ctxSTACK = {op, dst, dstArg, src, - (int)reduxLen, (const int*)reduxList}, - *ctx = &ctxSTACK; + (int)reduxLen, (const int*)reduxList}; + redux_ctx *ctx = &ctxSTACK; return reduxCheckargs(ctx); } @@ -413,8 +413,8 @@ GPUARRAY_PUBLIC int GpuArray_reduction (ga_reduce_op op, */ static int reduxGetSumInit (int typecode, const char** property){ - if(typecode == GA_POINTER || - typecode == GA_BUFFER){ + if (typecode == GA_POINTER || + typecode == GA_BUFFER){ return GA_UNSUPPORTED_ERROR; } *property = "0"; @@ -433,8 +433,8 @@ static int reduxGetSumInit (int typecode, const char** property) */ static int reduxGetProdInit (int typecode, const char** property){ - if(typecode == GA_POINTER || - typecode == GA_BUFFER){ + if (typecode == GA_POINTER || + typecode == GA_BUFFER){ return GA_UNSUPPORTED_ERROR; } *property = "1"; @@ -453,7 +453,7 @@ static int reduxGetProdInit (int typecode, const char** property) */ static int reduxGetMinInit (int typecode, const char** property){ - switch(typecode){ + switch (typecode){ case GA_BYTE2: case GA_BYTE3: case GA_BYTE4: @@ -528,7 +528,7 @@ static int reduxGetMinInit (int typecode, const char** property) */ static int reduxGetMaxInit (int typecode, const char** property){ - switch(typecode){ + switch (typecode){ case GA_BOOL: *property = "1"; break; case GA_BYTE2: case GA_BYTE3: @@ -602,8 +602,8 @@ static int reduxGetMaxInit (int typecode, const char** property) */ static int reduxGetAndInit (int typecode, const char** property){ - if(typecode == GA_POINTER || - typecode == GA_BUFFER){ + if (typecode == GA_POINTER || + typecode == GA_BUFFER){ return GA_UNSUPPORTED_ERROR; } *property = "~0"; @@ -622,8 +622,8 @@ static int reduxGetAndInit (int typecode, const char** property) */ static int reduxGetOrInit (int typecode, const char** property){ - if(typecode == GA_POINTER || - typecode == GA_BUFFER){ + if (typecode == GA_POINTER || + typecode == GA_BUFFER){ return GA_UNSUPPORTED_ERROR; } *property = "0"; @@ -646,9 +646,9 @@ static int axisInSet (int v, size_t* where){ size_t i; - for(i=0;isourceCode = NULL; strb_init(&ctx->s); - for(i=0;ihwAxisList[i] = 0; ctx->blockSize [i] = 1; ctx->gridSize [i] = 1; @@ -734,21 +734,21 @@ static int reduxCheckargs (redux_ctx* ctx){ /* Insane src, reduxLen, dst or dstArg? */ - if(!ctx->src || ctx->src->nd <= 0 || ctx->reduxLen == 0 || - ctx->reduxLen > (int)ctx->src->nd){ + if (!ctx->src || ctx->src->nd <= 0 || ctx->reduxLen == 0 || + ctx->reduxLen > (int)ctx->src->nd){ return reduxCleanup(ctx, GA_INVALID_ERROR); } - if((reduxHasDst (ctx) && !ctx->dst) || - (reduxHasDstArg(ctx) && !ctx->dstArg)){ + if ((reduxHasDst (ctx) && !ctx->dst) || + (reduxHasDstArg(ctx) && !ctx->dstArg)){ return reduxCleanup(ctx, GA_INVALID_ERROR); } /* Insane or duplicate list entry? 
*/ - for(i=0;ireduxLen;i++){ - if(ctx->reduxList[i] < 0 || - ctx->reduxList[i] >= (int)ctx->src->nd || - axisInSet(ctx->reduxList[i], ctx->reduxList, i, 0)){ + for (i=0;ireduxLen;i++){ + if (ctx->reduxList[i] < 0 || + ctx->reduxList[i] >= (int)ctx->src->nd || + axisInSet(ctx->reduxList[i], ctx->reduxList, i, 0)){ return reduxCleanup(ctx, GA_INVALID_ERROR); } } @@ -756,21 +756,21 @@ static int reduxCheckargs (redux_ctx* ctx){ /* GPU context non-existent? */ ctx->gpuCtx = GpuArray_context(ctx->src); - if(!ctx->gpuCtx){ + if (!ctx->gpuCtx){ return reduxCleanup(ctx, GA_INVALID_ERROR); } /* Unknown type? */ reduxSelectTypes(ctx); - if(!ctx->srcTypeStr || !ctx->dstTypeStr || !ctx->dstArgTypeStr || - !ctx->accTypeStr){ + if (!ctx->srcTypeStr || !ctx->dstTypeStr || !ctx->dstArgTypeStr || + !ctx->accTypeStr){ return reduxCleanup(ctx, GA_INVALID_ERROR); } /* Determine initializer, and error out if reduction unsupported. */ - switch(ctx->op){ + switch (ctx->op){ case GA_REDUCE_SUM: ret = reduxGetSumInit (ctx->accTypeCode, &ctx->initVal); break; case GA_REDUCE_PRODNZ: case GA_REDUCE_PROD: ret = reduxGetProdInit(ctx->accTypeCode, &ctx->initVal); break; @@ -787,7 +787,7 @@ static int reduxCheckargs (redux_ctx* ctx){ case GA_REDUCE_OR: ret = reduxGetOrInit (ctx->accTypeCode, &ctx->initVal); break; default: ret = GA_UNSUPPORTED_ERROR; break; } - if(ret != GA_NO_ERROR){ + if (ret != GA_NO_ERROR){ return reduxCleanup(ctx, ret); } @@ -824,7 +824,7 @@ static void reduxSelectTypes (redux_ctx* ctx){ ctx->dstTypeCode = ctx->srcTypeCode; ctx->dstArgTypeCode = GA_SSIZE; ctx->idxTypeCode = GA_SSIZE; - switch(ctx->srcTypeCode){ + switch (ctx->srcTypeCode){ case GA_HALF: ctx->accTypeCode = GA_FLOAT; case GA_HALF2: ctx->accTypeCode = GA_FLOAT2; case GA_HALF4: ctx->accTypeCode = GA_FLOAT4; @@ -867,11 +867,11 @@ static int reduxSelectModel (redux_ctx* ctx){ */ ret = gpucontext_property(ctx->gpuCtx, GA_CTX_PROP_NUMPROCS, &numProcs); - if(ret != GA_NO_ERROR){ + if (ret != GA_NO_ERROR){ return reduxCleanup(ctx, ret); } ret = gpucontext_property(ctx->gpuCtx, GA_CTX_PROP_MAXLSIZE, &localSize); - if(ret != GA_NO_ERROR){ + if (ret != GA_NO_ERROR){ return reduxCleanup(ctx, ret); } @@ -880,8 +880,8 @@ static int reduxSelectModel (redux_ctx* ctx){ * Compute #elems in dst and # reductions per dst element. */ - for(i=0;inds;i++){ - if(axisInSet(i, ctx->reduxList, ctx->nds, NULL)){ + for (i=0;inds;i++){ + if (axisInSet(i, ctx->reduxList, ctx->nds, NULL)){ reduxPerElem *= ctx->src->dimensions[i]; }else{ dstNumElem *= ctx->src->dimensions[i]; @@ -923,7 +923,7 @@ static int reduxIsLargeCodeModel (redux_ctx* ctx){ */ static int reduxHasDst (redux_ctx* ctx){ - switch(ctx->op){ + switch (ctx->op){ case GA_REDUCE_ARGMIN: case GA_REDUCE_ARGMAX: return 0; default: return 1; @@ -935,7 +935,7 @@ static int reduxHasDst (redux_ctx* ctx){ */ static int reduxHasDstArg (redux_ctx* ctx){ - switch(ctx->op){ + switch (ctx->op){ case GA_REDUCE_MINANDARGMIN: case GA_REDUCE_MAXANDARGMAX: case GA_REDUCE_ARGMIN: @@ -955,7 +955,7 @@ static int reduxHasDstArg (redux_ctx* ctx){ */ static int reduxKernelRequiresDst (redux_ctx* ctx){ - switch(ctx->op){ + switch (ctx->op){ case GA_REDUCE_ARGMIN: case GA_REDUCE_ARGMAX: return reduxIsSmallCodeModel(ctx); default: return 1; @@ -990,7 +990,7 @@ static int reduxKernelRequiresDstArg (redux_ctx* ctx){ */ static int reduxCanAppendHwAxis (redux_ctx* ctx, int wantReductionAxis){ - if(ctx->ndh >= MAX_HW_DIMS){ + if (ctx->ndh >= MAX_HW_DIMS){ return 0; }else{ return wantReductionAxis ? 
ctx->ndhr < ctx->ndr: @@ -1009,13 +1009,13 @@ static void reduxAppendLargestAxisToHwList(redux_ctx* ctx, int wantReductionAxi size_t maxV = 0; /* Find */ - for(i=0;inds;i++){ + for (i=0;inds;i++){ isInHwList = axisInSet(i, ctx->hwAxisList, ctx->ndh, 0); isInReduxList = axisInSet(i, ctx->reduxList, ctx->ndr, 0); isInDesiredList = wantReductionAxis ? isInReduxList : !isInReduxList; isLargestSoFar = ctx->src->dimensions[i] >= maxV; - if(!isInHwList && isInDesiredList && isLargestSoFar){ + if (!isInHwList && isInDesiredList && isLargestSoFar){ maxV = ctx->src->dimensions[i]; maxI = i; } @@ -1023,7 +1023,7 @@ static void reduxAppendLargestAxisToHwList(redux_ctx* ctx, int wantReductionAxi /* Append */ ctx->hwAxisList[ctx->ndh++] = maxI; - if(wantReductionAxis){ + if (wantReductionAxis){ ctx->ndhr++; }else{ ctx->ndhd++; @@ -1045,7 +1045,7 @@ static void reduxAppendLargestAxisToHwList(redux_ctx* ctx, int wantReductionAxi */ static int reduxSelectHwAxes (redux_ctx* ctx){ - if(reduxIsSmallCodeModel(ctx)){ + if (reduxIsSmallCodeModel(ctx)){ while(reduxCanAppendHwAxis(ctx, 1)){ reduxAppendLargestAxisToHwList(ctx, 1); } @@ -1078,12 +1078,12 @@ static int reduxComputeAxisList (redux_ctx* ctx){ int i, f=0; ctx->axisList = malloc(ctx->nds * sizeof(unsigned)); - if(!ctx->axisList){ + if (!ctx->axisList){ return reduxCleanup(ctx, GA_MEMORY_ERROR); } - for(i=0;inds;i++){ - if(!axisInSet(i, ctx->reduxList, ctx->ndr, 0)){ + for (i=0;inds;i++){ + if (!axisInSet(i, ctx->reduxList, ctx->ndr, 0)){ ctx->axisList[f++] = i; } } @@ -1102,7 +1102,7 @@ static int reduxComputeAxisList (redux_ctx* ctx){ static int reduxGenSource (redux_ctx* ctx){ reduxAppendSource(ctx); ctx->sourceCode = strb_cstr(&ctx->s); - if(!ctx->sourceCode){ + if (!ctx->sourceCode){ return reduxCleanup(ctx, GA_MEMORY_ERROR); } @@ -1133,21 +1133,15 @@ static void reduxAppendTypedefs (redux_ctx* ctx){ strb_appendf(&ctx->s, "typedef %s A;/* The type of the destination argument array. */\n", ctx->dstArgTypeStr); strb_appendf(&ctx->s, "typedef %s X;/* The type of the indices: signed 32/64-bit. */\n", ctx->idxTypeStr); strb_appendf(&ctx->s, "typedef %s K;/* The type of the accumulator variable. 
*/\n", ctx->accTypeStr); - strb_appends(&ctx->s, "\n"); - strb_appends(&ctx->s, "\n"); - strb_appends(&ctx->s, "\n"); + strb_appends(&ctx->s, "\n\n\n"); } static void reduxAppendFuncGetInitVal (redux_ctx* ctx){ strb_appends(&ctx->s, "/**\n"); strb_appends(&ctx->s, " * Initial value function.\n"); - strb_appends(&ctx->s, " */\n"); - strb_appends(&ctx->s, "\n"); + strb_appends(&ctx->s, " */\n\n"); strb_appends(&ctx->s, "WITHIN_KERNEL K getInitVal(void){\n"); strb_appendf(&ctx->s, "\treturn (%s);\n", ctx->initVal); - strb_appends(&ctx->s, "}\n"); - strb_appends(&ctx->s, "\n"); - strb_appends(&ctx->s, "\n"); - strb_appends(&ctx->s, "\n"); + strb_appends(&ctx->s, "}\n\n\n\n"); } static void reduxAppendFuncLoadVal (redux_ctx* ctx){ int i; @@ -1159,12 +1153,12 @@ static void reduxAppendFuncLoadVal (redux_ctx* ctx){ strb_appends(&ctx->s, " */\n"); strb_appends(&ctx->s, "\n"); appendIdxes (&ctx->s, "WITHIN_KERNEL K loadVal(", "X i", 0, ctx->nds, "", ""); - if(ctx->nds > 0){ + if (ctx->nds > 0){ strb_appends(&ctx->s, ", "); } strb_appends(&ctx->s, "const GLOBAL_MEM S* src, const GLOBAL_MEM X* srcSteps){\n"); strb_appends(&ctx->s, "\tS v = (*(const GLOBAL_MEM S*)((const GLOBAL_MEM char*)src + "); - for(i=0;inds;i++){ + for (i=0;inds;i++){ strb_appendf(&ctx->s, "i%d*srcSteps[%d] + \\\n\t ", i, ctx->axisList[i]); } strb_appends(&ctx->s, "0));\n"); @@ -1173,10 +1167,7 @@ static void reduxAppendFuncLoadVal (redux_ctx* ctx){ /* Return the value. */ strb_appends(&ctx->s, "\treturn v;\n"); - strb_appends(&ctx->s, "}\n"); - strb_appends(&ctx->s, "\n"); - strb_appends(&ctx->s, "\n"); - strb_appends(&ctx->s, "\n"); + strb_appends(&ctx->s, "}\n\n\n\n"); } static void reduxAppendFuncReduxVal (redux_ctx* ctx){ int i, anyArgsEmitted = 0; @@ -1192,15 +1183,15 @@ static void reduxAppendFuncReduxVal (redux_ctx* ctx){ strb_appends(&ctx->s, "\n"); appendIdxes (&ctx->s, "WITHIN_KERNEL void reduxVal(", "X i", 0, ctx->ndd, "", ""); anyArgsEmitted = ctx->ndd>0; - if(reduxKernelRequiresDst (ctx)){ - if(anyArgsEmitted){ + if (reduxKernelRequiresDst (ctx)){ + if (anyArgsEmitted){ strb_appends(&ctx->s, ", "); } anyArgsEmitted = 1; strb_appends(&ctx->s, "GLOBAL_MEM T* dst, const GLOBAL_MEM X* dstSteps, K v"); } - if(reduxKernelRequiresDstArg(ctx)){ - if(anyArgsEmitted){ + if (reduxKernelRequiresDstArg(ctx)){ + if (anyArgsEmitted){ strb_appends(&ctx->s, ", "); } anyArgsEmitted = 1; @@ -1213,38 +1204,35 @@ static void reduxAppendFuncReduxVal (redux_ctx* ctx){ /* Write to memory. */ - if(reduxIsLargeCodeModel(ctx)){ + if (reduxIsLargeCodeModel(ctx)){ /* Large code model. Easy: just write out the data, since it's safe. */ - if(reduxKernelRequiresDst (ctx)){ + if (reduxKernelRequiresDst (ctx)){ strb_appends(&ctx->s, "\t(*(GLOBAL_MEM T*)((GLOBAL_MEM char*)dst + "); - for(i=0;indd;i++){ + for (i=0;indd;i++){ strb_appendf(&ctx->s, "i%d*dstSteps[%d] +\n\t ", i, i); } strb_appends(&ctx->s, "0)) = v;\n"); } - if(reduxKernelRequiresDstArg(ctx)){ + if (reduxKernelRequiresDstArg(ctx)){ strb_appends(&ctx->s, "\t(*(GLOBAL_MEM A*)((GLOBAL_MEM char*)dstArg + "); - for(i=0;indd;i++){ + for (i=0;indd;i++){ strb_appendf(&ctx->s, "i%d*dstArgSteps[%d] +\n\t ", i, i); } strb_appends(&ctx->s, "0)) = i;\n"); } }else{ /* BUG: Implement the atomic reduction, one or two CAS loops. 
*/ - if ( reduxKernelRequiresDst (ctx) && !reduxKernelRequiresDstArg(ctx)){ + if ( reduxKernelRequiresDst (ctx) && !reduxKernelRequiresDstArg(ctx)){ - }else if(!reduxKernelRequiresDst (ctx) && reduxKernelRequiresDstArg(ctx)){ + }else if (!reduxKernelRequiresDst (ctx) && reduxKernelRequiresDstArg(ctx)){ - }else if( reduxKernelRequiresDst (ctx) && reduxKernelRequiresDstArg(ctx)){ + }else if ( reduxKernelRequiresDst (ctx) && reduxKernelRequiresDstArg(ctx)){ } } /* Close off function. */ - strb_appends(&ctx->s, "}\n"); - strb_appends(&ctx->s, "\n"); - strb_appends(&ctx->s, "\n"); - strb_appends(&ctx->s, "\n"); + strb_appends(&ctx->s, "}\n\n\n\n"); } static void reduxAppendFuncPreKernel (redux_ctx* ctx){ @@ -1266,8 +1254,7 @@ static void reduxAppendPrototype (redux_ctx* ctx){ strb_appends(&ctx->s, " * Reduction Kernel.\n"); strb_appends(&ctx->s, " *\n"); strb_appends(&ctx->s, " * Implements actual reduction operation.\n"); - strb_appends(&ctx->s, " */\n"); - strb_appends(&ctx->s, "\n"); + strb_appends(&ctx->s, " */\n\n"); strb_appends(&ctx->s, "KERNEL void redux(const GLOBAL_MEM S* src,\n"); strb_appends(&ctx->s, " const X srcOff,\n"); strb_appends(&ctx->s, " const GLOBAL_MEM X* srcSteps,\n"); @@ -1283,14 +1270,13 @@ static void reduxAppendPrototype (redux_ctx* ctx){ static void reduxAppendOffsets (redux_ctx* ctx){ strb_appends(&ctx->s, "\t/* Add offsets */\n"); strb_appends(&ctx->s, "\tsrc = (const GLOBAL_MEM T*)((const GLOBAL_MEM char*)src + srcOff);\n"); - if(reduxKernelRequiresDst(ctx)){ + if (reduxKernelRequiresDst(ctx)){ strb_appends(&ctx->s, "\tdst = (GLOBAL_MEM T*) ((GLOBAL_MEM char*) dst + dstOff);\n"); } - if(reduxKernelRequiresDstArg(ctx)){ + if (reduxKernelRequiresDstArg(ctx)){ strb_appends(&ctx->s, "\tdstArg = (GLOBAL_MEM X*) ((GLOBAL_MEM char*) dstArg + dstArgOff);\n"); } - strb_appends(&ctx->s, "\t\n"); - strb_appends(&ctx->s, "\t\n"); + strb_appends(&ctx->s, "\t\n\t\n"); } static void reduxAppendIndexDeclarations (redux_ctx* ctx){ int i; @@ -1300,29 +1286,27 @@ static void reduxAppendIndexDeclarations (redux_ctx* ctx){ strb_appends(&ctx->s, "\tX bd0 = LDIM_0, bd1 = LDIM_1, bd2 = LDIM_2;\n"); strb_appends(&ctx->s, "\tX ti0 = LID_0, ti1 = LID_1, ti2 = LID_2;\n"); strb_appends(&ctx->s, "\tX gi0 = bi0*bd0+ti0, gi1 = bi1*bd1+ti1, gi2 = bi2*bd2+ti2;\n"); - if(ctx->ndh>0){ + if (ctx->ndh>0){ strb_appends(&ctx->s, "\tX "); - for(i=0;indh;i++){ + for (i=0;indh;i++){ strb_appendf(&ctx->s, "ci%u = chunkSize[%u]%s", i, i, (i==ctx->ndh-1) ? 
";\n" : ", "); } } - strb_appends(&ctx->s, "\t\n"); - strb_appends(&ctx->s, "\t\n"); + strb_appends(&ctx->s, "\t\n\t\n"); strb_appends(&ctx->s, "\t/* Free indices & Reduction indices */\n"); - if(ctx->nds > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "", ";\n");} - if(ctx->nds > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "Dim", ";\n");} - if(ctx->nds > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "Start", ";\n");} - if(ctx->nds > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "End", ";\n");} - if(ctx->nds > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "SStep", ";\n");} - if(ctx->ndd > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->ndd, "MStep", ";\n");} - if(ctx->ndd > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->ndd, "AStep", ";\n");} - if(ctx->nds > ctx->ndd){appendIdxes (&ctx->s, "\tX ", "i", ctx->ndd, ctx->nds, "PDim", ";\n");} + if (ctx->nds > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "", ";\n");} + if (ctx->nds > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "Dim", ";\n");} + if (ctx->nds > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "Start", ";\n");} + if (ctx->nds > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "End", ";\n");} + if (ctx->nds > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "SStep", ";\n");} + if (ctx->ndd > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->ndd, "MStep", ";\n");} + if (ctx->ndd > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->ndd, "AStep", ";\n");} + if (ctx->nds > ctx->ndd){appendIdxes (&ctx->s, "\tX ", "i", ctx->ndd, ctx->nds, "PDim", ";\n");} - strb_appends(&ctx->s, "\t\n"); - strb_appends(&ctx->s, "\t\n"); + strb_appends(&ctx->s, "\t\n\t\n"); } static void reduxAppendRangeCalculations (redux_ctx* ctx){ size_t hwDim; @@ -1331,57 +1315,56 @@ static void reduxAppendRangeCalculations (redux_ctx* ctx){ /* Use internal remapping when computing the ranges for this thread. */ strb_appends(&ctx->s, "\t/* Compute ranges for this thread. */\n"); - for(i=0;inds;i++){ + for (i=0;inds;i++){ strb_appendf(&ctx->s, "\ti%dDim = srcSize[%d];\n", i, ctx->axisList[i]); } - for(i=0;inds;i++){ + for (i=0;inds;i++){ strb_appendf(&ctx->s, "\ti%dSStep = srcSteps[%d];\n", i, ctx->axisList[i]); } - for(i=0;indd;i++){ + for (i=0;indd;i++){ strb_appendf(&ctx->s, "\ti%dMStep = dstSteps[%d];\n", i, i); } - for(i=0;indd;i++){ + for (i=0;indd;i++){ strb_appendf(&ctx->s, "\ti%dAStep = dstArgSteps[%d];\n", i, i); } - for(i=ctx->nds-1;i>=ctx->ndd;i--){ + for (i=ctx->nds-1;i>=ctx->ndd;i--){ /** * If this is the last index, it's the first cumulative dimension * product we generate, and thus we initialize to 1. */ - if(i == ctx->nds-1){ + if (i == ctx->nds-1){ strb_appendf(&ctx->s, "\ti%dPDim = 1;\n", i); }else{ strb_appendf(&ctx->s, "\ti%dPDim = i%dPDim * i%dDim;\n", i, i+1, i+1); } } - for(i=0;inds;i++){ + for (i=0;inds;i++){ /** * Up to MAX_HW_DIMS dimensions get to rely on hardware loops. * The others, if any, have to use software looping beginning at 0. */ - if(axisInSet(ctx->axisList[i], ctx->hwAxisList, ctx->ndh, &hwDim)){ + if (axisInSet(ctx->axisList[i], ctx->hwAxisList, ctx->ndh, &hwDim)){ strb_appendf(&ctx->s, "\ti%dStart = gi%d * ci%d;\n", i, hwDim, hwDim); }else{ strb_appendf(&ctx->s, "\ti%dStart = 0;\n", i); } } - for(i=0;inds;i++){ + for (i=0;inds;i++){ /** * Up to MAX_HW_DIMS dimensions get to rely on hardware loops. * The others, if any, have to use software looping beginning at 0. 
*/ - if(axisInSet(ctx->axisList[i], ctx->hwAxisList, ctx->ndh, &hwDim)){ + if (axisInSet(ctx->axisList[i], ctx->hwAxisList, ctx->ndh, &hwDim)){ strb_appendf(&ctx->s, "\ti%dEnd = i%dStart + ci%d;\n", i, i, hwDim); }else{ strb_appendf(&ctx->s, "\ti%dEnd = i%dStart + i%dDim;\n", i, i, i); } } - strb_appends(&ctx->s, "\t\n"); - strb_appends(&ctx->s, "\t\n"); + strb_appends(&ctx->s, "\t\n\t\n"); } static void reduxAppendLoops (redux_ctx* ctx){ strb_appends(&ctx->s, "\t/**\n"); @@ -1413,7 +1396,7 @@ static void reduxAppendLoopMacroDefs (redux_ctx* ctx){ */ appendIdxes (&ctx->s, "#define RDXINDEXER(", "i", ctx->ndd, ctx->nds, "", ") ("); - for(i=ctx->ndd;inds;i++){ + for (i=ctx->ndd;inds;i++){ strb_appendf(&ctx->s, "i%d*i%dPDim + \\\n ", i, i); } strb_appends(&ctx->s, "0)\n"); @@ -1425,7 +1408,7 @@ static void reduxAppendLoopOuter (redux_ctx* ctx){ * Outer Loop Header Generation */ - for(i=0;indd;i++){ + for (i=0;indd;i++){ strb_appendf(&ctx->s, "\tFOROVER(%d){ESCAPE(%d)\n", i, i); } @@ -1439,7 +1422,7 @@ static void reduxAppendLoopOuter (redux_ctx* ctx){ * Outer Loop Trailer Generation */ - for(i=0;indd;i++){ + for (i=0;indd;i++){ strb_appends(&ctx->s, "\t}\n"); } } @@ -1455,7 +1438,7 @@ static void reduxAppendLoopInner (redux_ctx* ctx){ strb_appends(&ctx->s, "\t\t */\n"); strb_appends(&ctx->s, "\t\t\n"); strb_appends(&ctx->s, "\t\tK rdxV = getInitVal();\n"); - if(reduxKernelRequiresDstArg(ctx)){ + if (reduxKernelRequiresDstArg(ctx)){ strb_appends(&ctx->s, "\t\tX argI = 0;\n"); } strb_appends(&ctx->s, "\t\t\n"); @@ -1468,7 +1451,7 @@ static void reduxAppendLoopInner (redux_ctx* ctx){ * Inner Loop Header Generation */ - for(i=ctx->ndd;inds;i++){ + for (i=ctx->ndd;inds;i++){ strb_appendf(&ctx->s, "\t\tFOROVER(%d){ESCAPE(%d)\n", i, i); } @@ -1477,12 +1460,12 @@ static void reduxAppendLoopInner (redux_ctx* ctx){ */ appendIdxes (&ctx->s, "\t\t\tK v = loadVal(", "i", 0, ctx->nds, "", ""); - if(ctx->nds > 0){ + if (ctx->nds > 0){ strb_appends(&ctx->s, ", "); } strb_appends(&ctx->s, "src, srcSteps);\n"); strb_appends(&ctx->s, "\t\t\t\n"); - switch(ctx->op){ + switch (ctx->op){ case GA_REDUCE_SUM: strb_appends(&ctx->s, "\t\t\trdxV += v;\n"); break; case GA_REDUCE_PROD: strb_appends(&ctx->s, "\t\t\trdxV *= v;\n"); break; case GA_REDUCE_PRODNZ: strb_appends(&ctx->s, "\t\t\trdxV *= v==0 ? 
getInitVal() : v;\n"); break; @@ -1513,7 +1496,7 @@ static void reduxAppendLoopInner (redux_ctx* ctx){ * Inner Loop Trailer Generation */ - for(i=ctx->ndd;inds;i++){ + for (i=ctx->ndd;inds;i++){ strb_appends(&ctx->s, "\t\t}\n"); } strb_appends(&ctx->s, "\t\t\n"); @@ -1526,21 +1509,21 @@ static void reduxAppendLoopInner (redux_ctx* ctx){ strb_appends(&ctx->s, "\t\t * Destination writeback.\n"); strb_appends(&ctx->s, "\t\t */\n"); strb_appends(&ctx->s, "\t\t\n"); - if ( reduxKernelRequiresDst (ctx) && !reduxKernelRequiresDstArg(ctx)){ + if ( reduxKernelRequiresDst (ctx) && !reduxKernelRequiresDstArg(ctx)){ appendIdxes (&ctx->s, "\t\treduxVal(", "i", 0, ctx->ndd, "", ""); - if(ctx->ndd > 0){ + if (ctx->ndd > 0){ strb_appends(&ctx->s, ", "); } strb_appends(&ctx->s, "dst, dstSteps, rdxV);\n"); - }else if(!reduxKernelRequiresDst (ctx) && reduxKernelRequiresDstArg(ctx)){ + }else if (!reduxKernelRequiresDst (ctx) && reduxKernelRequiresDstArg(ctx)){ appendIdxes (&ctx->s, "\t\treduxVal(", "i", 0, ctx->ndd, "", ""); - if(ctx->ndd > 0){ + if (ctx->ndd > 0){ strb_appends(&ctx->s, ", "); } strb_appends(&ctx->s, "dstArg, dstArgSteps, argI);\n"); - }else if( reduxKernelRequiresDst (ctx) && reduxKernelRequiresDstArg(ctx)){ + }else if ( reduxKernelRequiresDst (ctx) && reduxKernelRequiresDstArg(ctx)){ appendIdxes (&ctx->s, "\t\treduxVal(", "i", 0, ctx->ndd, "", ""); - if(ctx->ndd > 0){ + if (ctx->ndd > 0){ strb_appends(&ctx->s, ", "); } strb_appends(&ctx->s, "dst, dstSteps, rdxV, dstArg, dstArgSteps, argI);\n"); @@ -1560,23 +1543,23 @@ static void reduxAppendLoopMacroUndefs (redux_ctx* ctx){ static int reduxCompileLarge (redux_ctx* ctx){ const int ARG_TYPECODES[] = { - GA_BUFFER, /* src */ - GA_SIZE, /* srcOff */ - GA_BUFFER, /* srcSteps */ - GA_BUFFER, /* srcSize */ - GA_BUFFER, /* chnkSize */ - GA_BUFFER, /* dst */ - GA_SIZE, /* dstOff */ - GA_BUFFER, /* dstSteps */ - GA_BUFFER, /* dstArg */ - GA_SIZE, /* dstArgOff */ - GA_BUFFER /* dstArgSteps */ + GA_BUFFER, /* src */ + GA_SIZE, /* srcOff */ + GA_BUFFER, /* srcSteps */ + GA_BUFFER, /* srcSize */ + GA_BUFFER, /* chnkSize */ + GA_BUFFER, /* dst */ + GA_SIZE, /* dstOff */ + GA_BUFFER, /* dstSteps */ + GA_BUFFER, /* dstArg */ + GA_SIZE, /* dstArgOff */ + GA_BUFFER /* dstArgSteps */ }; const size_t ARG_TYPECODES_LEN = sizeof(ARG_TYPECODES)/sizeof(*ARG_TYPECODES); const char* SRCS[1] = {ctx->sourceCode}; const size_t SRC_LENS[1] = {strlen(ctx->sourceCode)}; const size_t SRCS_LEN = sizeof(SRCS)/sizeof(*SRCS); - + int ret = GpuKernel_init(&ctx->kernel, ctx->gpuCtx, SRCS_LEN, @@ -1588,7 +1571,7 @@ static int reduxCompileLarge (redux_ctx* ctx){ 0, (char**)0); - if(ret != GA_NO_ERROR){ + if (ret != GA_NO_ERROR){ return reduxCleanup(ctx, ret); }else{ return reduxScheduleLarge(ctx); @@ -1652,20 +1635,20 @@ static int reduxScheduleLarge (redux_ctx* ctx){ dims[0] = dims[1] = dims[2] = 1; slack[0] = slack[1] = slack[2] = 1.1; - for(i=0;indh;i++){ + for (i=0;indh;i++){ dims[i] = ctx->src->dimensions[ctx->hwAxisList[i]]; gaIFLInit(&factBS[i]); gaIFLInit(&factGS[i]); gaIFLInit(&factCS[i]); warpMod = dims[i]%warpSize; - if(bestWarpMod>0 && (warpMod==0 || warpMod>=bestWarpMod)){ + if (bestWarpMod>0 && (warpMod==0 || warpMod>=bestWarpMod)){ bestWarpAxis = i; bestWarpMod = warpMod; } } - if(ctx->ndh > 0){ + if (ctx->ndh > 0){ dims[bestWarpAxis] = (dims[bestWarpAxis] + warpSize - 1)/warpSize; gaIFactorize(warpSize, 0, 0, &factBS[bestWarpAxis]); } @@ -1676,8 +1659,8 @@ static int reduxScheduleLarge (redux_ctx* ctx){ * chunkSize. 
*/ - for(i=0;indh;i++){ - while(!gaIFactorize(dims[i], (uint64_t)(dims[i]*slack[i]), maxLs[i], &factCS[i])){ + for (i=0;indh;i++){ + while (!gaIFactorize(dims[i], (uint64_t)(dims[i]*slack[i]), maxLs[i], &factCS[i])){ /** * Error! Failed to factorize dimension i with given slack and * k-smoothness constraints! Increase slack. Once slack reaches @@ -1698,7 +1681,7 @@ static int reduxScheduleLarge (redux_ctx* ctx){ gaIFLSchedule(ctx->ndh, maxLg, maxLs, maxGg, maxGs, factBS, factGS, factCS); /* Output. */ - for(i=0;indh;i++){ + for (i=0;indh;i++){ ctx->blockSize[i] = gaIFLGetProduct(&factBS[i]); ctx->gridSize [i] = gaIFLGetProduct(&factGS[i]); ctx->chunkSize[i] = gaIFLGetProduct(&factCS[i]); @@ -1727,11 +1710,11 @@ static int reduxInvokeLarge (redux_ctx* ctx){ ctx->src->dimensions, flags, 0); ctx->chunkSizeGD = gpudata_alloc(ctx->gpuCtx, ctx->ndh * sizeof(size_t), ctx->chunkSize, flags, 0); - if(reduxKernelRequiresDst(ctx)){ + if (reduxKernelRequiresDst(ctx)){ ctx->dstStepsGD = gpudata_alloc(ctx->gpuCtx, ctx->ndd * sizeof(size_t), ctx->dst->strides, flags, 0); } - if(reduxKernelRequiresDstArg(ctx)){ + if (reduxKernelRequiresDstArg(ctx)){ ctx->dstArgStepsGD = gpudata_alloc(ctx->gpuCtx, ctx->ndd * sizeof(size_t), ctx->dstArg->strides, flags, 0); } @@ -1740,28 +1723,28 @@ static int reduxInvokeLarge (redux_ctx* ctx){ args[ 2] = (void*) ctx->srcStepsGD; args[ 3] = (void*) ctx->srcSizeGD; args[ 4] = (void*) ctx->chunkSizeGD; - if ( reduxKernelRequiresDst (ctx) && reduxKernelRequiresDstArg(ctx)){ + if ( reduxKernelRequiresDst (ctx) && reduxKernelRequiresDstArg(ctx)){ args[ 5] = (void*) ctx->dst->data; args[ 6] = (void*)&ctx->dst->offset; args[ 7] = (void*) ctx->dstStepsGD; args[ 8] = (void*) ctx->dstArg->data; args[ 9] = (void*)&ctx->dstArg->offset; args[10] = (void*) ctx->dstArgStepsGD; - }else if( reduxKernelRequiresDst (ctx) && !reduxKernelRequiresDstArg(ctx)){ + }else if ( reduxKernelRequiresDst (ctx) && !reduxKernelRequiresDstArg(ctx)){ args[ 5] = (void*) ctx->dst->data; args[ 6] = (void*)&ctx->dst->offset; args[ 7] = (void*) ctx->dstStepsGD; - }else if(!reduxKernelRequiresDst (ctx) && reduxKernelRequiresDstArg(ctx)){ + }else if (!reduxKernelRequiresDst (ctx) && reduxKernelRequiresDstArg(ctx)){ args[ 5] = (void*) ctx->dstArg->data; args[ 6] = (void*)&ctx->dstArg->offset; args[ 7] = (void*) ctx->dstArgStepsGD; } - if(ctx->srcStepsGD && - ctx->srcSizeGD && - ctx->chunkSizeGD && - ctx->dstStepsGD && - ctx->dstArgStepsGD){ + if (ctx->srcStepsGD && + ctx->srcSizeGD && + ctx->chunkSizeGD && + ctx->dstStepsGD && + ctx->dstArgStepsGD){ ret = GpuKernel_call(&ctx->kernel, ctx->ndh>0 ? ctx->ndh : 1, ctx->gridSize, From a0654c204ff70a54a7a56a25ba22de6463802daf Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Wed, 25 Jan 2017 20:30:35 -0500 Subject: [PATCH 05/34] More style fixes on switches. 
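A representative before/after for the switch layout, mirroring the hunks
below (sketch only):

    /* Before: label, statement and break packed onto one line. */
    case GA_BYTE:     *property = "SCHAR_MIN"; break;
    case GA_LONGLONG: *property = "LLONG_MIN"; break;

    /* After: one statement per line under each label. */
    case GA_BYTE:
        *property = "SCHAR_MIN";
        break;
    case GA_LONGLONG:
        *property = "LLONG_MIN";
        break;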
--- src/gpuarray_reduction.c | 290 +++++++++++++++++++++++++-------------- 1 file changed, 190 insertions(+), 100 deletions(-) diff --git a/src/gpuarray_reduction.c b/src/gpuarray_reduction.c index 5534d84c36..62502fb1fd 100644 --- a/src/gpuarray_reduction.c +++ b/src/gpuarray_reduction.c @@ -459,26 +459,36 @@ static int reduxGetMinInit (int typecode, const char** property) case GA_BYTE4: case GA_BYTE8: case GA_BYTE16: - case GA_BYTE: *property = "SCHAR_MIN"; break; + case GA_BYTE: + *property = "SCHAR_MIN"; + break; case GA_SHORT2: case GA_SHORT3: case GA_SHORT4: case GA_SHORT8: case GA_SHORT16: - case GA_SHORT: *property = "SHRT_MIN"; break; + case GA_SHORT: + *property = "SHRT_MIN"; + break; case GA_INT2: case GA_INT3: case GA_INT4: case GA_INT8: case GA_INT16: - case GA_INT: *property = "INT_MIN"; break; + case GA_INT: + *property = "INT_MIN"; + break; case GA_LONG2: case GA_LONG3: case GA_LONG4: case GA_LONG8: case GA_LONG16: - case GA_LONG: *property = "LONG_MIN"; break; - case GA_LONGLONG: *property = "LLONG_MIN"; break; + case GA_LONG: + *property = "LONG_MIN"; + break; + case GA_LONGLONG: + *property = "LLONG_MIN"; + break; case GA_BOOL: case GA_UBYTE2: case GA_UBYTE3: @@ -505,14 +515,19 @@ static int reduxGetMinInit (int typecode, const char** property) case GA_ULONG16: case GA_ULONG: case GA_ULONGLONG: - case GA_SIZE: *property = "0"; break; + case GA_SIZE: + *property = "0"; + break; case GA_HALF: case GA_FLOAT: case GA_DOUBLE: - case GA_QUAD: *property = "NAN"; break; - default: return GA_UNSUPPORTED_ERROR; + case GA_QUAD: + *property = "NAN"; + break; + default: + return GA_UNSUPPORTED_ERROR; } - + return GA_NO_ERROR; } @@ -529,64 +544,89 @@ static int reduxGetMinInit (int typecode, const char** property) static int reduxGetMaxInit (int typecode, const char** property){ switch (typecode){ - case GA_BOOL: *property = "1"; break; + case GA_BOOL: + *property = "1"; + break; case GA_BYTE2: case GA_BYTE3: case GA_BYTE4: case GA_BYTE8: case GA_BYTE16: - case GA_BYTE: *property = "SCHAR_MAX"; break; + case GA_BYTE: + *property = "SCHAR_MAX"; + break; case GA_UBYTE2: case GA_UBYTE3: case GA_UBYTE4: case GA_UBYTE8: case GA_UBYTE16: - case GA_UBYTE: *property = "UCHAR_MAX"; break; + case GA_UBYTE: + *property = "UCHAR_MAX"; + break; case GA_SHORT2: case GA_SHORT3: case GA_SHORT4: case GA_SHORT8: case GA_SHORT16: - case GA_SHORT: *property = "SHRT_MAX"; break; + case GA_SHORT: + *property = "SHRT_MAX"; + break; case GA_USHORT2: case GA_USHORT3: case GA_USHORT4: case GA_USHORT8: case GA_USHORT16: - case GA_USHORT: *property = "USHRT_MAX"; break; + case GA_USHORT: + *property = "USHRT_MAX"; + break; case GA_INT2: case GA_INT3: case GA_INT4: case GA_INT8: case GA_INT16: - case GA_INT: *property = "INT_MAX"; break; + case GA_INT: + *property = "INT_MAX"; + break; case GA_UINT2: case GA_UINT3: case GA_UINT4: case GA_UINT8: case GA_UINT16: - case GA_UINT: *property = "UINT_MAX"; break; + case GA_UINT: + *property = "UINT_MAX"; + break; case GA_LONG2: case GA_LONG3: case GA_LONG4: case GA_LONG8: case GA_LONG16: - case GA_LONG: *property = "LONG_MAX"; break; + case GA_LONG: + *property = "LONG_MAX"; + break; case GA_ULONG2: case GA_ULONG3: case GA_ULONG4: case GA_ULONG8: case GA_ULONG16: - case GA_ULONG: *property = "ULONG_MAX"; break; - case GA_LONGLONG: *property = "LLONG_MAX"; break; - case GA_ULONGLONG: *property = "ULLONG_MAX"; break; + case GA_ULONG: + *property = "ULONG_MAX"; + break; + case GA_LONGLONG: + *property = "LLONG_MAX"; + break; + case GA_ULONGLONG: + *property = "ULLONG_MAX"; 
+ break; case GA_HALF: case GA_FLOAT: case GA_DOUBLE: - case GA_QUAD: *property = "NAN"; break; - default: return GA_UNSUPPORTED_ERROR; + case GA_QUAD: + *property = "NAN"; + break; + default: + return GA_UNSUPPORTED_ERROR; } - + return GA_NO_ERROR; } @@ -771,21 +811,34 @@ static int reduxCheckargs (redux_ctx* ctx){ /* Determine initializer, and error out if reduction unsupported. */ switch (ctx->op){ - case GA_REDUCE_SUM: ret = reduxGetSumInit (ctx->accTypeCode, &ctx->initVal); break; + case GA_REDUCE_SUM: + ret = reduxGetSumInit (ctx->accTypeCode, &ctx->initVal); + break; case GA_REDUCE_PRODNZ: - case GA_REDUCE_PROD: ret = reduxGetProdInit(ctx->accTypeCode, &ctx->initVal); break; + case GA_REDUCE_PROD: + ret = reduxGetProdInit(ctx->accTypeCode, &ctx->initVal); + break; case GA_REDUCE_MINANDARGMIN: case GA_REDUCE_ARGMIN: - case GA_REDUCE_MIN: ret = reduxGetMinInit (ctx->accTypeCode, &ctx->initVal); break; + case GA_REDUCE_MIN: + ret = reduxGetMinInit (ctx->accTypeCode, &ctx->initVal); + break; case GA_REDUCE_MAXANDARGMAX: case GA_REDUCE_ARGMAX: - case GA_REDUCE_MAX: ret = reduxGetMaxInit (ctx->accTypeCode, &ctx->initVal); break; + case GA_REDUCE_MAX: + ret = reduxGetMaxInit (ctx->accTypeCode, &ctx->initVal); + break; case GA_REDUCE_ALL: - case GA_REDUCE_AND: ret = reduxGetAndInit (ctx->accTypeCode, &ctx->initVal); break; + case GA_REDUCE_AND: + ret = reduxGetAndInit (ctx->accTypeCode, &ctx->initVal); + break; case GA_REDUCE_ANY: case GA_REDUCE_XOR: - case GA_REDUCE_OR: ret = reduxGetOrInit (ctx->accTypeCode, &ctx->initVal); break; - default: ret = GA_UNSUPPORTED_ERROR; break; + case GA_REDUCE_OR: + ret = reduxGetOrInit (ctx->accTypeCode, &ctx->initVal); + break; + default: + ret = GA_UNSUPPORTED_ERROR; } if (ret != GA_NO_ERROR){ return reduxCleanup(ctx, ret); @@ -809,7 +862,7 @@ static int reduxCheckargs (redux_ctx* ctx){ /** * @brief Select types for the reduction kernel's implementation. - * + * * There are 5 types of relevance: * - Source (S=Source) * - Destination (T=Target) @@ -825,14 +878,25 @@ static void reduxSelectTypes (redux_ctx* ctx){ ctx->dstArgTypeCode = GA_SSIZE; ctx->idxTypeCode = GA_SSIZE; switch (ctx->srcTypeCode){ - case GA_HALF: ctx->accTypeCode = GA_FLOAT; - case GA_HALF2: ctx->accTypeCode = GA_FLOAT2; - case GA_HALF4: ctx->accTypeCode = GA_FLOAT4; - case GA_HALF8: ctx->accTypeCode = GA_FLOAT8; - case GA_HALF16: ctx->accTypeCode = GA_FLOAT16; - default: ctx->accTypeCode = ctx->srcTypeCode; - } - + case GA_HALF: + ctx->accTypeCode = GA_FLOAT; + break; + case GA_HALF2: + ctx->accTypeCode = GA_FLOAT2; + break; + case GA_HALF4: + ctx->accTypeCode = GA_FLOAT4; + break; + case GA_HALF8: + ctx->accTypeCode = GA_FLOAT8; + break; + case GA_HALF16: + ctx->accTypeCode = GA_FLOAT16; + break; + default: + ctx->accTypeCode = ctx->srcTypeCode; + } + /* Get the string version as well. */ ctx->srcTypeStr = gpuarray_get_type(ctx->srcTypeCode) ->cluda_name; ctx->dstTypeStr = gpuarray_get_type(ctx->dstTypeCode) ->cluda_name; @@ -843,7 +907,7 @@ static void reduxSelectTypes (redux_ctx* ctx){ /** * @brief Select which code model will be used: - * + * * - Large (Destination tensor >= SMALL_REDUX_THRESHOLD elements, or * destination tensor size >= # of reductions per destination * tensor element): @@ -865,7 +929,7 @@ static int reduxSelectModel (redux_ctx* ctx){ * use large code model; Otherwise use small code model, where threads will * have to cooperate. 
*/ - + ret = gpucontext_property(ctx->gpuCtx, GA_CTX_PROP_NUMPROCS, &numProcs); if (ret != GA_NO_ERROR){ return reduxCleanup(ctx, ret); @@ -897,8 +961,8 @@ static int reduxSelectModel (redux_ctx* ctx){ * - reduxKernelRequiresDst() * - reduxKernelRequiresDstArg() */ - - + + return reduxSelectHwAxes(ctx); } @@ -925,8 +989,10 @@ static int reduxIsLargeCodeModel (redux_ctx* ctx){ static int reduxHasDst (redux_ctx* ctx){ switch (ctx->op){ case GA_REDUCE_ARGMIN: - case GA_REDUCE_ARGMAX: return 0; - default: return 1; + case GA_REDUCE_ARGMAX: + return 0; + default: + return 1; } } @@ -939,15 +1005,17 @@ static int reduxHasDstArg (redux_ctx* ctx){ case GA_REDUCE_MINANDARGMIN: case GA_REDUCE_MAXANDARGMAX: case GA_REDUCE_ARGMIN: - case GA_REDUCE_ARGMAX: return 1; - default: return 0; + case GA_REDUCE_ARGMAX: + return 1; + default: + return 0; } } /** * @brief Returns whether the generated kernel internally requires a dst * argument. - * + * * This is semantically subtly different from reduxHasDst(). The main * difference is in the implementation of the GA_REDUCE_ARGMIN/ARGMAX * reductions; Either *might* require a dst buffer, which will have to be @@ -957,15 +1025,17 @@ static int reduxHasDstArg (redux_ctx* ctx){ static int reduxKernelRequiresDst (redux_ctx* ctx){ switch (ctx->op){ case GA_REDUCE_ARGMIN: - case GA_REDUCE_ARGMAX: return reduxIsSmallCodeModel(ctx); - default: return 1; + case GA_REDUCE_ARGMAX: + return reduxIsSmallCodeModel(ctx); + default: + return 1; } } /** * @brief Returns whether the generated kernel internally requires a dstArg * argument. - * + * * This is semantically subtly different from reduxHasDstArg(), since it asks * whether the reduction, even though it does not accept a dstArg argument, * still requires a dstArg internally. @@ -975,11 +1045,11 @@ static int reduxKernelRequiresDstArg (redux_ctx* ctx){ /** * At present there exists no reduction whose implementation requires * a dstArg but whose interface does not. - * + * * E.g. the max() and min() reductions do NOT currently require a temporary * buffer for indexes, and will not in the foreseeable future. */ - + return reduxHasDstArg(ctx); } @@ -1007,20 +1077,20 @@ static int reduxCanAppendHwAxis (redux_ctx* ctx, int wantReductionAxi static void reduxAppendLargestAxisToHwList(redux_ctx* ctx, int wantReductionAxis){ int maxI = 0, i, isInHwList, isInReduxList, isInDesiredList, isLargestSoFar; size_t maxV = 0; - + /* Find */ for (i=0;inds;i++){ isInHwList = axisInSet(i, ctx->hwAxisList, ctx->ndh, 0); isInReduxList = axisInSet(i, ctx->reduxList, ctx->ndr, 0); isInDesiredList = wantReductionAxis ? isInReduxList : !isInReduxList; isLargestSoFar = ctx->src->dimensions[i] >= maxV; - + if (!isInHwList && isInDesiredList && isLargestSoFar){ maxV = ctx->src->dimensions[i]; maxI = i; } } - + /* Append */ ctx->hwAxisList[ctx->ndh++] = maxI; if (wantReductionAxis){ @@ -1033,7 +1103,7 @@ static void reduxAppendLargestAxisToHwList(redux_ctx* ctx, int wantReductionAxi /** * @brief Select which axes (up to MAX_HW_DIMS) will be assigned to hardware * dimensions. - * + * * For the "large" code model: The up-to-MAX_HW_DIMS largest destination tensor * dimensions are selected. 
* For the "small" code model: Up to MAX_HW_DIMS reduction dimensions (largest- @@ -1046,37 +1116,37 @@ static void reduxAppendLargestAxisToHwList(redux_ctx* ctx, int wantReductionAxi static int reduxSelectHwAxes (redux_ctx* ctx){ if (reduxIsSmallCodeModel(ctx)){ - while(reduxCanAppendHwAxis(ctx, 1)){ + while (reduxCanAppendHwAxis(ctx, 1)){ reduxAppendLargestAxisToHwList(ctx, 1); } } - - while(reduxCanAppendHwAxis(ctx, 0)){ + + while (reduxCanAppendHwAxis(ctx, 0)){ reduxAppendLargestAxisToHwList(ctx, 0); } - + return reduxComputeAxisList(ctx); } /** * @brief Compute the axis list. - * + * * The axis list describes the mapping between the nested loops of the kernel * as well as their accompanying indices (i0*, i1*, ..., in*) on one hand, and * the axes of the source tensor. The first axis in the list corresponds to the * outermost loop and the last axis in the list to the innermost. - * + * * The first ctx->ndd axes correspond to the outer loops that iterate over * each destination element. The last ctx->ndr axes correspond to the inner * loops that iterate over the dimensions of elements that are to be reduced. - * + * * @return GA_MEMORY_ERROR if allocating the list failed; Otherwise, returns * GA_NO_ERROR. */ static int reduxComputeAxisList (redux_ctx* ctx){ int i, f=0; - + ctx->axisList = malloc(ctx->nds * sizeof(unsigned)); if (!ctx->axisList){ return reduxCleanup(ctx, GA_MEMORY_ERROR); @@ -1088,8 +1158,8 @@ static int reduxComputeAxisList (redux_ctx* ctx){ } } memcpy(&ctx->axisList[f], ctx->reduxList, ctx->ndr * sizeof(*ctx->reduxList)); - - + + return reduxGenSource(ctx); } @@ -1105,7 +1175,7 @@ static int reduxGenSource (redux_ctx* ctx){ if (!ctx->sourceCode){ return reduxCleanup(ctx, GA_MEMORY_ERROR); } - + return reduxIsLargeCodeModel(ctx) ? reduxCompileLarge(ctx): reduxCompileSmall(ctx); } @@ -1145,7 +1215,7 @@ static void reduxAppendFuncGetInitVal (redux_ctx* ctx){ } static void reduxAppendFuncLoadVal (redux_ctx* ctx){ int i; - + strb_appends(&ctx->s, "/**\n"); strb_appends(&ctx->s, " * Multidimensional source element loader.\n"); strb_appends(&ctx->s, " *\n"); @@ -1162,16 +1232,16 @@ static void reduxAppendFuncLoadVal (redux_ctx* ctx){ strb_appendf(&ctx->s, "i%d*srcSteps[%d] + \\\n\t ", i, ctx->axisList[i]); } strb_appends(&ctx->s, "0));\n"); - + /* Prescalar transformations go here... */ - + /* Return the value. */ strb_appends(&ctx->s, "\treturn v;\n"); strb_appends(&ctx->s, "}\n\n\n\n"); } static void reduxAppendFuncReduxVal (redux_ctx* ctx){ int i, anyArgsEmitted = 0; - + /* Function Signature. */ strb_appends(&ctx->s, "/**\n"); strb_appends(&ctx->s, " * Global memory value reduction function.\n"); @@ -1198,11 +1268,11 @@ static void reduxAppendFuncReduxVal (redux_ctx* ctx){ strb_appends(&ctx->s, "GLOBAL_MEM A* dstArg, const GLOBAL_MEM X* dstArgSteps, X i"); } strb_appends(&ctx->s, "){\n"); - - + + /* Post-scalar transformations go here. */ - - + + /* Write to memory. */ if (reduxIsLargeCodeModel(ctx)){ /* Large code model. Easy: just write out the data, since it's safe. */ @@ -1223,19 +1293,19 @@ static void reduxAppendFuncReduxVal (redux_ctx* ctx){ }else{ /* BUG: Implement the atomic reduction, one or two CAS loops. */ if ( reduxKernelRequiresDst (ctx) && !reduxKernelRequiresDstArg(ctx)){ - + }else if (!reduxKernelRequiresDst (ctx) && reduxKernelRequiresDstArg(ctx)){ - + }else if ( reduxKernelRequiresDst (ctx) && reduxKernelRequiresDstArg(ctx)){ - + } } - + /* Close off function. 
*/ strb_appends(&ctx->s, "}\n\n\n\n"); } static void reduxAppendFuncPreKernel (redux_ctx* ctx){ - + } static void reduxAppendFuncKernel (redux_ctx* ctx){ reduxAppendPrototype (ctx); @@ -1247,7 +1317,7 @@ static void reduxAppendFuncKernel (redux_ctx* ctx){ strb_appends (&ctx->s, "}\n"); } static void reduxAppendFuncPostKernel (redux_ctx* ctx){ - + } static void reduxAppendPrototype (redux_ctx* ctx){ strb_appends(&ctx->s, "/**\n"); @@ -1466,30 +1536,50 @@ static void reduxAppendLoopInner (redux_ctx* ctx){ strb_appends(&ctx->s, "src, srcSteps);\n"); strb_appends(&ctx->s, "\t\t\t\n"); switch (ctx->op){ - case GA_REDUCE_SUM: strb_appends(&ctx->s, "\t\t\trdxV += v;\n"); break; - case GA_REDUCE_PROD: strb_appends(&ctx->s, "\t\t\trdxV *= v;\n"); break; - case GA_REDUCE_PRODNZ: strb_appends(&ctx->s, "\t\t\trdxV *= v==0 ? getInitVal() : v;\n"); break; - case GA_REDUCE_MIN: strb_appends(&ctx->s, "\t\t\trdxV = min(rdxV, v);\n"); break; - case GA_REDUCE_MAX: strb_appends(&ctx->s, "\t\t\trdxV = max(rdxV, v);\n"); break; + case GA_REDUCE_SUM: + strb_appends(&ctx->s, "\t\t\trdxV += v;\n"); + break; + case GA_REDUCE_PROD: + strb_appends(&ctx->s, "\t\t\trdxV *= v;\n"); + break; + case GA_REDUCE_PRODNZ: + strb_appends(&ctx->s, "\t\t\trdxV *= v==0 ? getInitVal() : v;\n"); + break; + case GA_REDUCE_MIN: + strb_appends(&ctx->s, "\t\t\trdxV = min(rdxV, v);\n"); + break; + case GA_REDUCE_MAX: + strb_appends(&ctx->s, "\t\t\trdxV = max(rdxV, v);\n"); + break; case GA_REDUCE_ARGMIN: case GA_REDUCE_MINANDARGMIN: - strb_appends(&ctx->s, "\t\t\trdxV = min(rdxV, v);\n"); - strb_appends(&ctx->s, "\t\t\tif(v == rdxV){\n"); - appendIdxes (&ctx->s, "\t\t\t\targI = RDXINDEXER(", "i", ctx->ndd, ctx->nds, "", ");\n"); - strb_appends(&ctx->s, "\t\t\t}\n"); + strb_appends(&ctx->s, "\t\t\trdxV = min(rdxV, v);\n"); + strb_appends(&ctx->s, "\t\t\tif(v == rdxV){\n"); + appendIdxes (&ctx->s, "\t\t\t\targI = RDXINDEXER(", "i", ctx->ndd, ctx->nds, "", ");\n"); + strb_appends(&ctx->s, "\t\t\t}\n"); break; case GA_REDUCE_ARGMAX: case GA_REDUCE_MAXANDARGMAX: - strb_appends(&ctx->s, "\t\t\trdxV = max(rdxV, v);\n"); - strb_appends(&ctx->s, "\t\t\tif(v == rdxV){\n"); - appendIdxes (&ctx->s, "\t\t\t\targI = RDXINDEXER(", "i", ctx->ndd, ctx->nds, "", ");\n"); - strb_appends(&ctx->s, "\t\t\t}\n"); + strb_appends(&ctx->s, "\t\t\trdxV = max(rdxV, v);\n"); + strb_appends(&ctx->s, "\t\t\tif(v == rdxV){\n"); + appendIdxes (&ctx->s, "\t\t\t\targI = RDXINDEXER(", "i", ctx->ndd, ctx->nds, "", ");\n"); + strb_appends(&ctx->s, "\t\t\t}\n"); + break; + case GA_REDUCE_AND: + strb_appends(&ctx->s, "\t\t\trdxV &= v;\n"); + break; + case GA_REDUCE_OR: + strb_appends(&ctx->s, "\t\t\trdxV |= v;\n"); + break; + case GA_REDUCE_XOR: + strb_appends(&ctx->s, "\t\t\trdxV ^= v;\n"); + break; + case GA_REDUCE_ALL: + strb_appends(&ctx->s, "\t\t\trdxV = rdxV && v;\n"); + break; + case GA_REDUCE_ANY: + strb_appends(&ctx->s, "\t\t\trdxV = rdxV || v;\n"); break; - case GA_REDUCE_AND: strb_appends(&ctx->s, "\t\t\trdxV &= v;\n"); break; - case GA_REDUCE_OR: strb_appends(&ctx->s, "\t\t\trdxV |= v;\n"); break; - case GA_REDUCE_XOR: strb_appends(&ctx->s, "\t\t\trdxV ^= v;\n"); break; - case GA_REDUCE_ALL: strb_appends(&ctx->s, "\t\t\trdxV = rdxV && v;\n"); break; - case GA_REDUCE_ANY: strb_appends(&ctx->s, "\t\t\trdxV = rdxV || v;\n"); break; } /** From 67e163e99cb0f786c845be4cdc41aac5e76fb960 Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Fri, 3 Mar 2017 07:45:34 -0500 Subject: [PATCH 06/34] Refactoring of all non-code-gen-related functions. 
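For context, a caller-level sketch of how the refactored reduction machinery is driven through one of the public entry points; the shape, axis choice and error handling below are illustrative only:

        GpuArray src, dst, dstArg;  /* src is (64, 32, 16); dst and dstArg are     */
                                    /* pre-allocated, e.g. with GpuArray_empty(),  */
                                    /* with shape (64,); dstArg has type GA_SSIZE. */
        unsigned reduxList[2] = {1, 2};
        int      err;

        /* Max-and-argmax of src over axes 1 and 2, keeping axis 0. */
        err = GpuArray_maxandargmax(&dst, &dstArg, &src, 2, reduxList);
        if (err != GA_NO_ERROR){
                /* Handle the error. */
        }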
--- src/gpuarray_reduction.c | 782 ++++++++++++++++++++++++++------------- 1 file changed, 526 insertions(+), 256 deletions(-) diff --git a/src/gpuarray_reduction.c b/src/gpuarray_reduction.c index 62502fb1fd..a5940f504d 100644 --- a/src/gpuarray_reduction.c +++ b/src/gpuarray_reduction.c @@ -23,6 +23,10 @@ /* Defines */ #define MAX_HW_DIMS 3 +#define KERNEL_PRIMARY 0 +#define KERNEL_AUXILIARY 1 +#define AXIS_FREE 0 +#define AXIS_REDUX 1 @@ -185,7 +189,8 @@ struct redux_ctx{ const int* reduxList; /* General. */ - int* axisList; + int* srcAxisList; + int* dstAxisList; gpucontext* gpuCtx; /* Source code Generator. */ @@ -203,21 +208,40 @@ struct redux_ctx{ int ndd; int ndr; int nds; - int ndh; - int ndhd; - int ndhr; int largeCodeModel; strb s; char* sourceCode; + size_t sourceCodeLen; + char* errorString0; + char* errorString1; + char* errorString2; GpuKernel preKernel; GpuKernel kernel; GpuKernel postKernel; - /* Scheduler */ - int hwAxisList[MAX_HW_DIMS]; - size_t blockSize [MAX_HW_DIMS]; - size_t gridSize [MAX_HW_DIMS]; - size_t chunkSize [MAX_HW_DIMS]; + /** + * Scheduler + * + * There are two sets of kernels that may be scheduled: + * 1) The reduction kernel. This is the only kernel scheduled in the + * large code model. + * 2) The initialization and post-scalar kernels. These are scheduled + * only in the small code model. + * + * The reduction kernel is the "primary" kernel. The other two, if needed, + * are referred to as "auxiliary" kernels. + */ + + struct{ + int ndh; + int ndhd; + int ndhr; + int axisList [MAX_HW_DIMS]; + size_t bs [MAX_HW_DIMS]; + size_t gs [MAX_HW_DIMS]; + size_t cs [MAX_HW_DIMS]; + gpudata* chunkSizeGD; + } pri, aux; /* Invoker */ gpudata* srcStepsGD; @@ -257,8 +281,12 @@ static int reduxHasDst (redux_ctx* ctx); static int reduxHasDstArg (redux_ctx* ctx); static int reduxKernelRequiresDst (redux_ctx* ctx); static int reduxKernelRequiresDstArg (redux_ctx* ctx); -static int reduxCanAppendHwAxis (redux_ctx* ctx, int wantReductionAxis); -static void reduxAppendLargestAxisToHwList(redux_ctx* ctx, int wantReductionAxis); +static int reduxCanAppendHwAxis (redux_ctx* ctx, + int kernelType, + int axisType); +static void reduxAppendLargestAxisToHwList(redux_ctx* ctx, + int kernelType, + int axisType); static int reduxSelectHwAxes (redux_ctx* ctx); static int reduxComputeAxisList (redux_ctx* ctx); static int reduxGenSource (redux_ctx* ctx); @@ -280,10 +308,19 @@ static void reduxAppendLoopMacroDefs (redux_ctx* ctx); static void reduxAppendLoopOuter (redux_ctx* ctx); static void reduxAppendLoopInner (redux_ctx* ctx); static void reduxAppendLoopMacroUndefs (redux_ctx* ctx); -static int reduxCompileLarge (redux_ctx* ctx); -static int reduxCompileSmall (redux_ctx* ctx); -static int reduxScheduleLarge (redux_ctx* ctx); -static int reduxInvokeLarge (redux_ctx* ctx); +static int reduxCompile (redux_ctx* ctx); +static int reduxSchedule (redux_ctx* ctx); +static void reduxScheduleKernel (int ndims, + uint64_t* dims, + uint64_t warpSize, + uint64_t maxLg, + uint64_t* maxLs, + uint64_t maxGg, + uint64_t* maxGs, + uint64_t* bs, + uint64_t* gs, + uint64_t* cs); +static int reduxInvoke (redux_ctx* ctx); static int reduxCleanup (redux_ctx* ctx, int ret); @@ -749,27 +786,33 @@ static int reduxCheckargs (redux_ctx* ctx){ * We initialize certain parts of the context. 
*/ - ctx->axisList = NULL; + ctx->srcAxisList = NULL; + ctx->dstAxisList = NULL; ctx->gpuCtx = NULL; ctx->srcTypeStr = ctx->dstTypeStr = ctx->dstArgTypeStr = ctx->accTypeStr = ctx->idxTypeStr = NULL; ctx->initVal = NULL; - ctx->ndh = 0; - ctx->ndhd = 0; - ctx->ndhr = 0; + ctx->pri.ndh = ctx->aux.ndh = 0; + ctx->pri.ndhd = ctx->aux.ndhd = 0; + ctx->pri.ndhr = ctx->aux.ndhr = 0; ctx->sourceCode = NULL; + ctx->sourceCodeLen = 0; + ctx->errorString0 = NULL; + ctx->errorString1 = NULL; + ctx->errorString2 = NULL; strb_init(&ctx->s); for (i=0;ihwAxisList[i] = 0; - ctx->blockSize [i] = 1; - ctx->gridSize [i] = 1; - ctx->chunkSize [i] = 1; + ctx->aux.axisList[i] = ctx->pri.axisList[i] = 0; + ctx->aux.bs [i] = ctx->pri.bs [i] = 1; + ctx->aux.gs [i] = ctx->pri.gs [i] = 1; + ctx->aux.cs [i] = ctx->pri.cs [i] = 1; } - ctx->srcStepsGD = ctx->srcSizeGD = ctx->chunkSizeGD = - ctx->dstStepsGD = ctx->dstArgStepsGD = NULL; + ctx->srcStepsGD = ctx->srcSizeGD = + ctx->dstStepsGD = ctx->dstArgStepsGD = + ctx->pri.chunkSizeGD = ctx->aux.chunkSizeGD = NULL; /* *** IT IS NOW SAFE TO CALL reduxCleanup() *** */ @@ -1054,75 +1097,108 @@ static int reduxKernelRequiresDstArg (redux_ctx* ctx){ } /** - * @brief Check whether we can add another reduction axis - * (wantReductionAxis=1) or destination axis (wantReductionAxis=0) to - * the hardware axis list. + * @brief Check whether we can add another reduction axis or free axis + * to the hardware axis list for either the primary or secondary kernel. */ -static int reduxCanAppendHwAxis (redux_ctx* ctx, int wantReductionAxis){ - if (ctx->ndh >= MAX_HW_DIMS){ +static int reduxCanAppendHwAxis (redux_ctx* ctx, + int kernelType, + int axisType){ + int kernelNdh = kernelType == KERNEL_PRIMARY ? ctx->pri.ndh : ctx->aux.ndh; + int kernelNdhr = kernelType == KERNEL_PRIMARY ? ctx->pri.ndhr : ctx->aux.ndhr; + int kernelNdhd = kernelType == KERNEL_PRIMARY ? ctx->pri.ndhd : ctx->aux.ndhd; + + if(kernelNdh >= MAX_HW_DIMS){ return 0; }else{ - return wantReductionAxis ? ctx->ndhr < ctx->ndr: - ctx->ndhd < ctx->ndd; + return axisType == AXIS_REDUX ? kernelNdhr < ctx->ndr: + kernelNdhd < ctx->ndd; } } /** - * @brief Append the largest reduction axis (wantReductionAxis=1) or - * destination axis (wantReductionAxis=0) that isn't yet in the hardware - * axis list into said hardware axis list. + * @brief Append the largest reduction axis or free axis that isn't yet + * in the hardware axis list for either the primary or secondary kernel + * into said hardware axis list. */ -static void reduxAppendLargestAxisToHwList(redux_ctx* ctx, int wantReductionAxis){ +static void reduxAppendLargestAxisToHwList(redux_ctx* ctx, + int kernelType, + int axisType){ int maxI = 0, i, isInHwList, isInReduxList, isInDesiredList, isLargestSoFar; - size_t maxV = 0; + int* hwAxisList, * ndh, * ndhr, * ndhd; + size_t v, maxV = 0; + + /* Get pointers to the correct kernel's variables */ + hwAxisList = kernelType == KERNEL_PRIMARY ? ctx->pri.axisList: + ctx->aux.axisList; + ndh = kernelType == KERNEL_PRIMARY ? &ctx->pri.ndh: + &ctx->aux.ndh; + ndhr = kernelType == KERNEL_PRIMARY ? &ctx->pri.ndhr: + &ctx->aux.ndhr; + ndhd = kernelType == KERNEL_PRIMARY ? &ctx->pri.ndhd: + &ctx->aux.ndhd; /* Find */ for (i=0;inds;i++){ - isInHwList = axisInSet(i, ctx->hwAxisList, ctx->ndh, 0); - isInReduxList = axisInSet(i, ctx->reduxList, ctx->ndr, 0); - isInDesiredList = wantReductionAxis ? 
isInReduxList : !isInReduxList; - isLargestSoFar = ctx->src->dimensions[i] >= maxV; + isInHwList = axisInSet(i, hwAxisList, *ndh, 0); + isInReduxList = axisInSet(i, ctx->reduxList, ctx->ndr, 0); + isInDesiredList = axisType == AXIS_REDUX ? isInReduxList: + !isInReduxList; + v = ctx->src->dimensions[i]; + isLargestSoFar = v >= maxV; if (!isInHwList && isInDesiredList && isLargestSoFar){ - maxV = ctx->src->dimensions[i]; + maxV = v; maxI = i; } } /* Append */ - ctx->hwAxisList[ctx->ndh++] = maxI; - if (wantReductionAxis){ - ctx->ndhr++; + hwAxisList[(*ndh)++] = maxI; + if (axisType == AXIS_REDUX){ + (*ndhr)++; }else{ - ctx->ndhd++; + (*ndhd)++; } } /** * @brief Select which axes (up to MAX_HW_DIMS) will be assigned to hardware - * dimensions. + * dimensions for both the primary and auxiliary kernels. + * + * LARGE code model: Up to the MAX_HW_DIMS largest free axes are selected. + * Because the primary reduction kernel does everything, it's + * not necessary to compute an auxiliary kernel axis + * selection (or at least, one distinct from the primary + * kernel's). + * + * SMALL code model: For the primary reduction kernel, up to MAX_HW_DIMS + * reduction axes (largest-to-smallest) are selected. If less + * than MAX_HW_DIMS axes were selected, free axes are + * selected until MAX_HW_DIMS total axes are selected, or no + * free axes are left. * - * For the "large" code model: The up-to-MAX_HW_DIMS largest destination tensor - * dimensions are selected. - * For the "small" code model: Up to MAX_HW_DIMS reduction dimensions (largest- - * to-smallest) are selected. If less than - * MAX_HW_DIMS dimensions were selected, - * destination tensor dimensions are selected until - * MAX_HW_DIMS total dimensions are selected, or no - * destination tensors are left. + * For the auxiliary reduction kernel, up to the MAX_HW_DIMS + * largest free axes are selected. 
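 *
 * For instance (an illustrative shape), for a (64, 200, 8, 100) source
 * reduced over axes {1,3} with MAX_HW_DIMS == 3: the large code model maps
 * the free axes 0 and 2 onto hardware dimensions (largest first: 64, then 8);
 * the small code model's primary kernel takes the reduction axes 1 and 3
 * (200, then 100) and fills its last hardware dimension with the largest
 * free axis, 0, while its auxiliary kernels take the free axes 0 and 2.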
*/ static int reduxSelectHwAxes (redux_ctx* ctx){ - if (reduxIsSmallCodeModel(ctx)){ - while (reduxCanAppendHwAxis(ctx, 1)){ - reduxAppendLargestAxisToHwList(ctx, 1); + if (reduxIsLargeCodeModel(ctx)){ + while (reduxCanAppendHwAxis (ctx, KERNEL_PRIMARY, AXIS_FREE)){ + reduxAppendLargestAxisToHwList(ctx, KERNEL_PRIMARY, AXIS_FREE); + } + }else{ + while (reduxCanAppendHwAxis (ctx, KERNEL_PRIMARY, AXIS_REDUX)){ + reduxAppendLargestAxisToHwList(ctx, KERNEL_PRIMARY, AXIS_REDUX); + } + while (reduxCanAppendHwAxis (ctx, KERNEL_PRIMARY, AXIS_FREE)){ + reduxAppendLargestAxisToHwList(ctx, KERNEL_PRIMARY, AXIS_FREE); } - } - while (reduxCanAppendHwAxis(ctx, 0)){ - reduxAppendLargestAxisToHwList(ctx, 0); + while (reduxCanAppendHwAxis (ctx, KERNEL_AUXILIARY, AXIS_FREE)){ + reduxAppendLargestAxisToHwList(ctx, KERNEL_AUXILIARY, AXIS_FREE); + } } return reduxComputeAxisList(ctx); @@ -1147,17 +1223,17 @@ static int reduxSelectHwAxes (redux_ctx* ctx){ static int reduxComputeAxisList (redux_ctx* ctx){ int i, f=0; - ctx->axisList = malloc(ctx->nds * sizeof(unsigned)); - if (!ctx->axisList){ + ctx->srcAxisList = malloc(ctx->nds * sizeof(unsigned)); + if (!ctx->srcAxisList){ return reduxCleanup(ctx, GA_MEMORY_ERROR); } for (i=0;inds;i++){ if (!axisInSet(i, ctx->reduxList, ctx->ndr, 0)){ - ctx->axisList[f++] = i; + ctx->srcAxisList[f++] = i; } } - memcpy(&ctx->axisList[f], ctx->reduxList, ctx->ndr * sizeof(*ctx->reduxList)); + memcpy(&ctx->srcAxisList[f], ctx->reduxList, ctx->ndr * sizeof(*ctx->reduxList)); return reduxGenSource(ctx); @@ -1171,13 +1247,13 @@ static int reduxComputeAxisList (redux_ctx* ctx){ static int reduxGenSource (redux_ctx* ctx){ reduxAppendSource(ctx); - ctx->sourceCode = strb_cstr(&ctx->s); + ctx->sourceCodeLen = ctx->s.l; + ctx->sourceCode = strb_cstr(&ctx->s); if (!ctx->sourceCode){ return reduxCleanup(ctx, GA_MEMORY_ERROR); } - return reduxIsLargeCodeModel(ctx) ? reduxCompileLarge(ctx): - reduxCompileSmall(ctx); + return reduxCompile(ctx); } static void reduxAppendSource (redux_ctx* ctx){ reduxAppendIncludes (ctx); @@ -1185,9 +1261,11 @@ static void reduxAppendSource (redux_ctx* ctx){ reduxAppendFuncGetInitVal (ctx); reduxAppendFuncLoadVal (ctx); reduxAppendFuncReduxVal (ctx); - reduxAppendFuncPreKernel (ctx); + if(reduxIsSmallCodeModel(ctx)){ + reduxAppendFuncPreKernel (ctx); + reduxAppendFuncPostKernel (ctx); + } reduxAppendFuncKernel (ctx); - reduxAppendFuncPostKernel (ctx); } static void reduxAppendIncludes (redux_ctx* ctx){ strb_appends(&ctx->s, "/* Includes */\n"); @@ -1197,31 +1275,30 @@ static void reduxAppendIncludes (redux_ctx* ctx){ strb_appends(&ctx->s, "\n"); } static void reduxAppendTypedefs (redux_ctx* ctx){ - strb_appends(&ctx->s, "/* Typedefs */\n"); - strb_appendf(&ctx->s, "typedef %s S;/* The type of the source array. */\n", ctx->srcTypeStr); - strb_appendf(&ctx->s, "typedef %s T;/* The type of the destination array. */\n", ctx->dstTypeStr); - strb_appendf(&ctx->s, "typedef %s A;/* The type of the destination argument array. */\n", ctx->dstArgTypeStr); - strb_appendf(&ctx->s, "typedef %s X;/* The type of the indices: signed 32/64-bit. */\n", ctx->idxTypeStr); - strb_appendf(&ctx->s, "typedef %s K;/* The type of the accumulator variable. */\n", ctx->accTypeStr); - strb_appends(&ctx->s, "\n\n\n"); + strb_appendf(&ctx->s, "typedef %s S;\n", ctx->srcTypeStr); /* The type of the source array. */ + strb_appendf(&ctx->s, "typedef %s T;\n", ctx->dstTypeStr); /* The type of the destination array. 
*/ + strb_appendf(&ctx->s, "typedef %s A;\n", ctx->dstArgTypeStr);/* The type of the destination argument array. */ + strb_appendf(&ctx->s, "typedef %s X;\n", ctx->idxTypeStr); /* The type of the indices: signed 32/64-bit. */ + strb_appendf(&ctx->s, "typedef %s K;\n", ctx->accTypeStr); /* The type of the accumulator variable. */ } static void reduxAppendFuncGetInitVal (redux_ctx* ctx){ - strb_appends(&ctx->s, "/**\n"); - strb_appends(&ctx->s, " * Initial value function.\n"); - strb_appends(&ctx->s, " */\n\n"); - strb_appends(&ctx->s, "WITHIN_KERNEL K getInitVal(void){\n"); - strb_appendf(&ctx->s, "\treturn (%s);\n", ctx->initVal); - strb_appends(&ctx->s, "}\n\n\n\n"); + /** + * Initial value function. + */ + + strb_appendf(&ctx->s, "WITHIN_KERNEL K getInitVal(void){\n" + "\treturn (%s);\n" + "}\n\n\n\n", ctx->initVal); } static void reduxAppendFuncLoadVal (redux_ctx* ctx){ int i; - strb_appends(&ctx->s, "/**\n"); - strb_appends(&ctx->s, " * Multidimensional source element loader.\n"); - strb_appends(&ctx->s, " *\n"); - strb_appends(&ctx->s, " * Also implements prescalar transformations if any.\n"); - strb_appends(&ctx->s, " */\n"); - strb_appends(&ctx->s, "\n"); + /** + * Multidimensional source element loader. + * + * Also implements prescalar transformations if any. + */ + appendIdxes (&ctx->s, "WITHIN_KERNEL K loadVal(", "X i", 0, ctx->nds, "", ""); if (ctx->nds > 0){ strb_appends(&ctx->s, ", "); @@ -1229,7 +1306,7 @@ static void reduxAppendFuncLoadVal (redux_ctx* ctx){ strb_appends(&ctx->s, "const GLOBAL_MEM S* src, const GLOBAL_MEM X* srcSteps){\n"); strb_appends(&ctx->s, "\tS v = (*(const GLOBAL_MEM S*)((const GLOBAL_MEM char*)src + "); for (i=0;inds;i++){ - strb_appendf(&ctx->s, "i%d*srcSteps[%d] + \\\n\t ", i, ctx->axisList[i]); + strb_appendf(&ctx->s, "i%d*srcSteps[%d] + \\\n\t ", i, ctx->srcAxisList[i]); } strb_appends(&ctx->s, "0));\n"); @@ -1242,15 +1319,16 @@ static void reduxAppendFuncLoadVal (redux_ctx* ctx){ static void reduxAppendFuncReduxVal (redux_ctx* ctx){ int i, anyArgsEmitted = 0; - /* Function Signature. */ - strb_appends(&ctx->s, "/**\n"); - strb_appends(&ctx->s, " * Global memory value reduction function.\n"); - strb_appends(&ctx->s, " *\n"); - strb_appends(&ctx->s, " * Responsible for either:\n"); - strb_appends(&ctx->s, " * 1) Safe writeback of final value to memory, or\n"); - strb_appends(&ctx->s, " * 2) Safe atomic reduction of partial value into memory.\n"); - strb_appends(&ctx->s, " */\n"); - strb_appends(&ctx->s, "\n"); + /** + * Function Signature. + * + * Global memory value reduction function. + * + * Responsible for either: + * 1) Safe writeback of final value to memory, or + * 2) Safe atomic reduction of partial value into memory. + */ + appendIdxes (&ctx->s, "WITHIN_KERNEL void reduxVal(", "X i", 0, ctx->ndd, "", ""); anyArgsEmitted = ctx->ndd>0; if (reduxKernelRequiresDst (ctx)){ @@ -1356,11 +1434,11 @@ static void reduxAppendIndexDeclarations (redux_ctx* ctx){ strb_appends(&ctx->s, "\tX bd0 = LDIM_0, bd1 = LDIM_1, bd2 = LDIM_2;\n"); strb_appends(&ctx->s, "\tX ti0 = LID_0, ti1 = LID_1, ti2 = LID_2;\n"); strb_appends(&ctx->s, "\tX gi0 = bi0*bd0+ti0, gi1 = bi1*bd1+ti1, gi2 = bi2*bd2+ti2;\n"); - if (ctx->ndh>0){ + if (ctx->pri.ndh>0){ strb_appends(&ctx->s, "\tX "); - for (i=0;indh;i++){ + for (i=0;ipri.ndh;i++){ strb_appendf(&ctx->s, "ci%u = chunkSize[%u]%s", - i, i, (i==ctx->ndh-1) ? ";\n" : ", "); + i, i, (i==ctx->pri.ndh-1) ? 
";\n" : ", "); } } @@ -1386,10 +1464,10 @@ static void reduxAppendRangeCalculations (redux_ctx* ctx){ strb_appends(&ctx->s, "\t/* Compute ranges for this thread. */\n"); for (i=0;inds;i++){ - strb_appendf(&ctx->s, "\ti%dDim = srcSize[%d];\n", i, ctx->axisList[i]); + strb_appendf(&ctx->s, "\ti%dDim = srcSize[%d];\n", i, ctx->srcAxisList[i]); } for (i=0;inds;i++){ - strb_appendf(&ctx->s, "\ti%dSStep = srcSteps[%d];\n", i, ctx->axisList[i]); + strb_appendf(&ctx->s, "\ti%dSStep = srcSteps[%d];\n", i, ctx->srcAxisList[i]); } for (i=0;indd;i++){ strb_appendf(&ctx->s, "\ti%dMStep = dstSteps[%d];\n", i, i); @@ -1415,7 +1493,7 @@ static void reduxAppendRangeCalculations (redux_ctx* ctx){ * The others, if any, have to use software looping beginning at 0. */ - if (axisInSet(ctx->axisList[i], ctx->hwAxisList, ctx->ndh, &hwDim)){ + if (axisInSet(ctx->srcAxisList[i], ctx->pri.axisList, ctx->pri.ndh, &hwDim)){ strb_appendf(&ctx->s, "\ti%dStart = gi%d * ci%d;\n", i, hwDim, hwDim); }else{ strb_appendf(&ctx->s, "\ti%dStart = 0;\n", i); @@ -1427,7 +1505,7 @@ static void reduxAppendRangeCalculations (redux_ctx* ctx){ * The others, if any, have to use software looping beginning at 0. */ - if (axisInSet(ctx->axisList[i], ctx->hwAxisList, ctx->ndh, &hwDim)){ + if (axisInSet(ctx->srcAxisList[i], ctx->pri.axisList, ctx->pri.ndh, &hwDim)){ strb_appendf(&ctx->s, "\ti%dEnd = i%dStart + ci%d;\n", i, i, hwDim); }else{ strb_appendf(&ctx->s, "\ti%dEnd = i%dStart + i%dDim;\n", i, i, i); @@ -1627,119 +1705,255 @@ static void reduxAppendLoopMacroUndefs (redux_ctx* ctx){ /** * @brief Compile the kernel from source code. - * - * @return */ -static int reduxCompileLarge (redux_ctx* ctx){ - const int ARG_TYPECODES[] = { - GA_BUFFER, /* src */ - GA_SIZE, /* srcOff */ - GA_BUFFER, /* srcSteps */ - GA_BUFFER, /* srcSize */ - GA_BUFFER, /* chnkSize */ - GA_BUFFER, /* dst */ - GA_SIZE, /* dstOff */ - GA_BUFFER, /* dstSteps */ - GA_BUFFER, /* dstArg */ - GA_SIZE, /* dstArgOff */ - GA_BUFFER /* dstArgSteps */ - }; - const size_t ARG_TYPECODES_LEN = sizeof(ARG_TYPECODES)/sizeof(*ARG_TYPECODES); - const char* SRCS[1] = {ctx->sourceCode}; - const size_t SRC_LENS[1] = {strlen(ctx->sourceCode)}; - const size_t SRCS_LEN = sizeof(SRCS)/sizeof(*SRCS); - - int ret = GpuKernel_init(&ctx->kernel, - ctx->gpuCtx, - SRCS_LEN, - SRCS, - SRC_LENS, - "redux", - ARG_TYPECODES_LEN, - ARG_TYPECODES, - 0, - (char**)0); - - if (ret != GA_NO_ERROR){ - return reduxCleanup(ctx, ret); - }else{ - return reduxScheduleLarge(ctx); +static int reduxCompile (redux_ctx* ctx){ + int ret, i = 0; + int PRI_TYPECODES[11]; + size_t PRI_TYPECODES_LEN; + int* AUX_TYPECODES; + size_t AUX_TYPECODES_LEN; + + + /** + * Construct Argument Typecode Lists. + */ + + PRI_TYPECODES[i++] = GA_BUFFER; /* src */ + PRI_TYPECODES[i++] = GA_SIZE; /* srcOff */ + PRI_TYPECODES[i++] = GA_BUFFER; /* srcSteps */ + PRI_TYPECODES[i++] = GA_BUFFER; /* srcSize */ + PRI_TYPECODES[i++] = GA_BUFFER; /* chnkSize */ + if(reduxKernelRequiresDst(ctx)){ + PRI_TYPECODES[i++] = GA_BUFFER; /* dst */ + PRI_TYPECODES[i++] = GA_SIZE; /* dstOff */ + PRI_TYPECODES[i++] = GA_BUFFER; /* dstSteps */ + } + if(reduxKernelRequiresDstArg(ctx)){ + PRI_TYPECODES[i++] = GA_BUFFER; /* dstArg */ + PRI_TYPECODES[i++] = GA_SIZE; /* dstArgOff */ + PRI_TYPECODES[i++] = GA_BUFFER; /* dstArgSteps */ + } + PRI_TYPECODES_LEN = i; + AUX_TYPECODES = &PRI_TYPECODES[3]; + AUX_TYPECODES_LEN = PRI_TYPECODES_LEN-3; + + + /** + * Compile the kernels. 
+ */ + + { + ret = GpuKernel_init(&ctx->kernel, + ctx->gpuCtx, + 1, + (const char**)&ctx->sourceCode, + &ctx->sourceCodeLen, + "redux", + PRI_TYPECODES_LEN, + PRI_TYPECODES, + GA_USE_CLUDA, + &ctx->errorString0); + if (ret != GA_NO_ERROR){ + return reduxCleanup(ctx, ret); + } } -} -static int reduxCompileSmall (redux_ctx* ctx){ - /* BUG: Implement small code model. */ - return reduxCompileLarge(ctx); + if(reduxIsSmallCodeModel(ctx)){ + ret = GpuKernel_init(&ctx->kernel, + ctx->gpuCtx, + 1, + (const char**)&ctx->sourceCode, + &ctx->sourceCodeLen, + "preRedux", + AUX_TYPECODES_LEN, + AUX_TYPECODES, + GA_USE_CLUDA, + &ctx->errorString1); + if (ret != GA_NO_ERROR){ + return reduxCleanup(ctx, ret); + } + ret = GpuKernel_init(&ctx->kernel, + ctx->gpuCtx, + 1, + (const char**)&ctx->sourceCode, + &ctx->sourceCodeLen, + "postRedux", + AUX_TYPECODES_LEN, + AUX_TYPECODES, + GA_USE_CLUDA, + &ctx->errorString2); + if (ret != GA_NO_ERROR){ + return reduxCleanup(ctx, ret); + } + } + + return reduxSchedule(ctx); } /** - * Compute a good thread block size / grid size / software chunk size for Nvidia. + * @brief Compute a good thread block size / grid size / software chunk size + * for the primary/auxilliary kernels. */ -static int reduxScheduleLarge (redux_ctx* ctx){ - int i; - size_t warpMod; - size_t bestWarpMod = 1; - unsigned bestWarpAxis = 0; - uint64_t maxLg; - uint64_t maxLs[MAX_HW_DIMS]; - uint64_t maxGg; - uint64_t maxGs [MAX_HW_DIMS]; - uint64_t dims [MAX_HW_DIMS]; - double slack [MAX_HW_DIMS]; - ga_factor_list factBS[MAX_HW_DIMS]; - ga_factor_list factGS[MAX_HW_DIMS]; - ga_factor_list factCS[MAX_HW_DIMS]; - - +static int reduxSchedule (redux_ctx* ctx){ + int i, priNdims, auxNdims; + uint64_t maxLgRdx, maxLgPre, maxLgPost; + uint64_t maxLgPri, maxLgAux; + uint64_t maxLs [MAX_HW_DIMS]; + uint64_t maxGg; + uint64_t maxGs [MAX_HW_DIMS]; + uint64_t priDims[MAX_HW_DIMS]; + uint64_t auxDims[MAX_HW_DIMS]; + uint64_t bs [MAX_HW_DIMS]; + uint64_t gs [MAX_HW_DIMS]; + uint64_t cs [MAX_HW_DIMS]; + size_t warpSize, + maxL, maxL0, maxL1, maxL2, + maxG, maxG0, maxG1, maxG2; + + /** * Obtain the constraints of our problem. 
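One detail worth flagging in the compilation step above: in the small code model the "preRedux" and "postRedux" entry points presumably belong in their own kernel objects, since the scheduler and invoker below query and call ctx->preKernel and ctx->postKernel. A hedged sketch of the intended second call (and likewise &ctx->postKernel with "postRedux" and errorString2):

        ret = GpuKernel_init(&ctx->preKernel,      /* rather than &ctx->kernel */
                             ctx->gpuCtx,
                             1,
                             (const char**)&ctx->sourceCode,
                             &ctx->sourceCodeLen,
                             "preRedux",
                             AUX_TYPECODES_LEN,
                             AUX_TYPECODES,
                             GA_USE_CLUDA,
                             &ctx->errorString1);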
*/ - size_t warpSize, - maxL, maxL0, maxL1, maxL2, /* Maximum total and per-dimension thread/block sizes */ - maxG, maxG0, maxG1, maxG2; /* Maximum total and per-dimension block /grid sizes */ - gpukernel_property(ctx->kernel.k, GA_KERNEL_PROP_PREFLSIZE, &warpSize); - gpukernel_property(ctx->kernel.k, GA_KERNEL_PROP_MAXLSIZE, &maxL); - gpudata_property (ctx->src->data, GA_CTX_PROP_MAXLSIZE0, &maxL0); - gpudata_property (ctx->src->data, GA_CTX_PROP_MAXLSIZE1, &maxL1); - gpudata_property (ctx->src->data, GA_CTX_PROP_MAXLSIZE2, &maxL2); - gpudata_property (ctx->src->data, GA_CTX_PROP_MAXGSIZE0, &maxG0); - maxG = maxG0; - gpudata_property (ctx->src->data, GA_CTX_PROP_MAXGSIZE1, &maxG1); - gpudata_property (ctx->src->data, GA_CTX_PROP_MAXGSIZE2, &maxG2); - + gpudata_property (ctx->src->data, GA_CTX_PROP_MAXLSIZE0, &maxL0); + gpudata_property (ctx->src->data, GA_CTX_PROP_MAXLSIZE1, &maxL1); + gpudata_property (ctx->src->data, GA_CTX_PROP_MAXLSIZE2, &maxL2); + gpudata_property (ctx->src->data, GA_CTX_PROP_MAXGSIZE, &maxG); + gpudata_property (ctx->src->data, GA_CTX_PROP_MAXGSIZE0, &maxG0); + gpudata_property (ctx->src->data, GA_CTX_PROP_MAXGSIZE1, &maxG1); + gpudata_property (ctx->src->data, GA_CTX_PROP_MAXGSIZE2, &maxG2); + gpukernel_property(ctx->kernel.k, GA_KERNEL_PROP_PREFLSIZE, &warpSize); + gpukernel_property(ctx->kernel.k, GA_KERNEL_PROP_MAXLSIZE, &maxL); + maxLgRdx = maxL; + maxLgPri = maxLgRdx; + if(reduxIsSmallCodeModel(ctx)){ + gpukernel_property(ctx->preKernel.k, GA_KERNEL_PROP_MAXLSIZE, &maxL); + maxLgPre = maxL; + gpukernel_property(ctx->postKernel.k, GA_KERNEL_PROP_MAXLSIZE, &maxL); + maxLgPost = maxL; + maxLgAux = maxLgPrepri.ndh; + maxGs[0] = maxG0; + maxGs[1] = maxG1; + maxGs[2] = maxG2; + maxGg = maxG; + maxLs[0] = maxL0; + maxLs[1] = maxL1; + maxLs[2] = maxL2; + for (i=0;isrc->dimensions[ctx->pri.axisList[i]]; + } + if(reduxIsSmallCodeModel(ctx)){ + auxNdims = ctx->aux.ndh; + for (i=0;isrc->dimensions[ctx->aux.axisList[i]]; + } + } + + /** - * Prepare inputs to the solver. - * - * This involves, amongst others, - * - Initializing the blockSize, gridSize and chunkSize factor lists for all - * hardware dimensions. - * - Finding on which hardware axis is it optimal to place the warpSize factor. + * Apply the solver. */ + + { + reduxScheduleKernel(priNdims, + priDims, + warpSize, + maxLgPri, maxLs, + maxGg, maxGs, + bs, gs, cs); + for (i=0;ipri.bs[i] = bs[i]; + ctx->pri.gs[i] = gs[i]; + ctx->pri.cs[i] = cs[i]; + } + if (priNdims <= 0){ + ctx->pri.bs[i] = ctx->pri.gs[i] = ctx->pri.cs[i] = 1; + } + } + if (reduxIsSmallCodeModel(ctx)){ + reduxScheduleKernel(auxNdims, + auxDims, + warpSize, + maxLgAux, maxLs, + maxGg, maxGs, + bs, gs, cs); + for (i=0;iaux.bs[i] = bs[i]; + ctx->aux.gs[i] = gs[i]; + ctx->aux.cs[i] = cs[i]; + } + if (auxNdims <= 0){ + ctx->aux.bs[i] = ctx->aux.gs[i] = ctx->aux.cs[i] = 1; + } + } + + return reduxInvoke(ctx); +} + +/** + * @brief Given the parameters of a kernel scheduling problem, solve it as + * optimally as possible. + * + * NB: This is the only function in this entire file that should have + * anything to do with the integer factorization APIs. 
+ */ - maxLg = maxL; - maxLs[0] = maxL0, maxLs[1]=maxL1, maxLs[2]=maxL2; - maxGg = maxG; - maxGs[0] = maxG0, maxGs[1]=maxG1, maxGs[2]=maxG2; - dims[0] = dims[1] = dims[2] = 1; - slack[0] = slack[1] = slack[2] = 1.1; +static void reduxScheduleKernel (int ndims, + uint64_t* dims, + uint64_t warpSize, + uint64_t maxLg, + uint64_t* maxLs, + uint64_t maxGg, + uint64_t* maxGs, + uint64_t* bs, + uint64_t* gs, + uint64_t* cs){ + uint64_t warpMod, bestWarpMod = 1; + int i, bestWarpAxis = 0; + uint64_t roundedDims[MAX_HW_DIMS]; + double slack [MAX_HW_DIMS]; + ga_factor_list factBS [MAX_HW_DIMS]; + ga_factor_list factGS [MAX_HW_DIMS]; + ga_factor_list factCS [MAX_HW_DIMS]; + + + /** + * Quick check for scalar case. + */ + + if (ndims <= 0){ + return; + } + + + /** + * Identify the dimension to which the warp factor will be given. + * + * The current heuristic is to find the dimension that is either + * 1) Evenly divided by the warp size, or + * 2) As close to filling the last warp as possible. + */ - for (i=0;indh;i++){ - dims[i] = ctx->src->dimensions[ctx->hwAxisList[i]]; + for (i=0;i0 && (warpMod==0 || warpMod>=bestWarpMod)){ bestWarpAxis = i; bestWarpMod = warpMod; } } - if (ctx->ndh > 0){ - dims[bestWarpAxis] = (dims[bestWarpAxis] + warpSize - 1)/warpSize; + if (ndims > 0){ + roundedDims[bestWarpAxis] = (roundedDims[bestWarpAxis] + warpSize - 1)/warpSize; gaIFactorize(warpSize, 0, 0, &factBS[bestWarpAxis]); } @@ -1749,8 +1963,11 @@ static int reduxScheduleLarge (redux_ctx* ctx){ * chunkSize. */ - for (i=0;indh;i++){ - while (!gaIFactorize(dims[i], (uint64_t)(dims[i]*slack[i]), maxLs[i], &factCS[i])){ + for (i=0;indh, maxLg, maxLs, maxGg, maxGs, factBS, factGS, factCS); - - /* Output. */ - for (i=0;indh;i++){ - ctx->blockSize[i] = gaIFLGetProduct(&factBS[i]); - ctx->gridSize [i] = gaIFLGetProduct(&factGS[i]); - ctx->chunkSize[i] = gaIFLGetProduct(&factCS[i]); + gaIFLSchedule(ndims, maxLg, maxLs, maxGg, maxGs, factBS, factGS, factCS); + for (i=0;isrcStepsGD = gpudata_alloc(ctx->gpuCtx, ctx->nds * sizeof(size_t), - ctx->src->strides, flags, 0); - ctx->srcSizeGD = gpudata_alloc(ctx->gpuCtx, ctx->nds * sizeof(size_t), - ctx->src->dimensions, flags, 0); - ctx->chunkSizeGD = gpudata_alloc(ctx->gpuCtx, ctx->ndh * sizeof(size_t), - ctx->chunkSize, flags, 0); - if (reduxKernelRequiresDst(ctx)){ - ctx->dstStepsGD = gpudata_alloc(ctx->gpuCtx, ctx->ndd * sizeof(size_t), - ctx->dst->strides, flags, 0); + const int flags = GA_BUFFER_READ_ONLY|GA_BUFFER_INIT; + ctx->srcStepsGD = gpudata_alloc(ctx->gpuCtx, ctx->nds * sizeof(size_t), + ctx->src->strides, flags, 0); + ctx->srcSizeGD = gpudata_alloc(ctx->gpuCtx, ctx->nds * sizeof(size_t), + ctx->src->dimensions, flags, 0); + ctx->pri.chunkSizeGD = gpudata_alloc(ctx->gpuCtx, ctx->pri.ndh * sizeof(size_t), + ctx->pri.cs, flags, 0); + + priArgs[i++] = (void*) ctx->src->data; + priArgs[i++] = (void*)&ctx->src->offset; + priArgs[i++] = (void*) ctx->srcStepsGD; + priArgs[i++] = (void*) ctx->srcSizeGD; + priArgs[i++] = (void*) ctx->pri.chunkSizeGD; + if (reduxKernelRequiresDst (ctx)){ + ctx->dstStepsGD = gpudata_alloc(ctx->gpuCtx, ctx->ndd * sizeof(size_t), + ctx->dst->strides, flags, 0); + priArgs[i++] = (void*) ctx->dst->data; + priArgs[i++] = (void*)&ctx->dst->offset; + priArgs[i++] = (void*) ctx->dstStepsGD; + failedDstSteps = !ctx->dstStepsGD; } if (reduxKernelRequiresDstArg(ctx)){ - ctx->dstArgStepsGD = gpudata_alloc(ctx->gpuCtx, ctx->ndd * sizeof(size_t), - ctx->dstArg->strides, flags, 0); - } - args[ 0] = (void*) ctx->src->data; - args[ 1] = 
(void*)&ctx->src->offset; - args[ 2] = (void*) ctx->srcStepsGD; - args[ 3] = (void*) ctx->srcSizeGD; - args[ 4] = (void*) ctx->chunkSizeGD; - if ( reduxKernelRequiresDst (ctx) && reduxKernelRequiresDstArg(ctx)){ - args[ 5] = (void*) ctx->dst->data; - args[ 6] = (void*)&ctx->dst->offset; - args[ 7] = (void*) ctx->dstStepsGD; - args[ 8] = (void*) ctx->dstArg->data; - args[ 9] = (void*)&ctx->dstArg->offset; - args[10] = (void*) ctx->dstArgStepsGD; - }else if ( reduxKernelRequiresDst (ctx) && !reduxKernelRequiresDstArg(ctx)){ - args[ 5] = (void*) ctx->dst->data; - args[ 6] = (void*)&ctx->dst->offset; - args[ 7] = (void*) ctx->dstStepsGD; - }else if (!reduxKernelRequiresDst (ctx) && reduxKernelRequiresDstArg(ctx)){ - args[ 5] = (void*) ctx->dstArg->data; - args[ 6] = (void*)&ctx->dstArg->offset; - args[ 7] = (void*) ctx->dstArgStepsGD; + ctx->dstArgStepsGD = gpudata_alloc(ctx->gpuCtx, ctx->ndd * sizeof(size_t), + ctx->dstArg->strides, flags, 0); + priArgs[i++] = (void*) ctx->dstArg->data; + priArgs[i++] = (void*)&ctx->dstArg->offset; + priArgs[i++] = (void*) ctx->dstArgStepsGD; + failedDstArgSteps = !ctx->dstArgStepsGD; + } + if (reduxIsSmallCodeModel(ctx)){ + /** + * The auxiliary kernel's args are identical to the primary kernel's, + * except that the first three arguments are deleted and the fifth + * argument (now second), called chunkSize, is different. + */ + + memcpy(auxArgs, &priArgs[3], sizeof(auxArgs)); + ctx->aux.chunkSizeGD = gpudata_alloc(ctx->gpuCtx, ctx->aux.ndh * sizeof(size_t), + ctx->aux.cs, flags, 0); + auxArgs[ 1 ] = (void*) ctx->aux.chunkSizeGD; + failedAuxChunkSize = !ctx->aux.chunkSizeGD; } - if (ctx->srcStepsGD && - ctx->srcSizeGD && - ctx->chunkSizeGD && - ctx->dstStepsGD && - ctx->dstArgStepsGD){ + + /** + * One or three kernels is now invoked, depending on the code model. + */ + + if (ctx->srcStepsGD && + ctx->srcSizeGD && + ctx->pri.chunkSizeGD && + !failedDstSteps && + !failedDstArgSteps && + !failedAuxChunkSize){ + /* Pre-kernel invocation, if necessary */ + if(reduxIsSmallCodeModel(ctx)){ + ret = GpuKernel_call(&ctx->preKernel, + ctx->aux.ndh>0 ? ctx->aux.ndh : 1, + ctx->aux.gs, + ctx->aux.bs, + 0, + auxArgs); + if (ret != GA_NO_ERROR){ + return reduxCleanup(ctx, ret); + } + } + + /* Reduction kernel invocation */ ret = GpuKernel_call(&ctx->kernel, - ctx->ndh>0 ? ctx->ndh : 1, - ctx->gridSize, - ctx->blockSize, + ctx->pri.ndh>0 ? ctx->pri.ndh : 1, + ctx->pri.gs, + ctx->pri.bs, 0, - args); + priArgs); + if (ret != GA_NO_ERROR){ + return reduxCleanup(ctx, ret); + } + + /* Post-kernel invocation, if necessary */ + if(reduxIsSmallCodeModel(ctx)){ + ret = GpuKernel_call(&ctx->postKernel, + ctx->aux.ndh>0 ? 
ctx->aux.ndh : 1, + ctx->aux.gs, + ctx->aux.bs, + 0, + auxArgs); + if (ret != GA_NO_ERROR){ + return reduxCleanup(ctx, ret); + } + } + return reduxCleanup(ctx, ret); }else{ return reduxCleanup(ctx, GA_MEMORY_ERROR); @@ -1852,18 +2112,28 @@ static int reduxInvokeLarge (redux_ctx* ctx){ */ static int reduxCleanup (redux_ctx* ctx, int ret){ - free(ctx->axisList); + free(ctx->srcAxisList); + free(ctx->dstAxisList); free(ctx->sourceCode); - ctx->axisList = NULL; - ctx->sourceCode = NULL; + free(ctx->errorString0); + free(ctx->errorString1); + free(ctx->errorString2); + ctx->srcAxisList = NULL; + ctx->dstAxisList = NULL; + ctx->sourceCode = NULL; + ctx->errorString0 = NULL; + ctx->errorString1 = NULL; + ctx->errorString2 = NULL; gpudata_release(ctx->srcStepsGD); gpudata_release(ctx->srcSizeGD); - gpudata_release(ctx->chunkSizeGD); gpudata_release(ctx->dstStepsGD); gpudata_release(ctx->dstArgStepsGD); - ctx->srcStepsGD = ctx->srcSizeGD = ctx->chunkSizeGD = - ctx->dstStepsGD = ctx->dstArgStepsGD = NULL; + gpudata_release(ctx->pri.chunkSizeGD); + gpudata_release(ctx->aux.chunkSizeGD); + ctx->srcStepsGD = ctx->srcSizeGD = + ctx->dstStepsGD = ctx->dstArgStepsGD = + ctx->pri.chunkSizeGD = ctx->aux.chunkSizeGD = NULL; return ret; } From b88ae5716d661c6b05a13d2675018d698daba18d Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Fri, 3 Mar 2017 17:10:50 -0500 Subject: [PATCH 07/34] Added variadic string append function strb_appendv(). --- src/util/strb.c | 32 ++++++++++++++++++++++---------- src/util/strb.h | 12 ++++++++++++ 2 files changed, 34 insertions(+), 10 deletions(-) diff --git a/src/util/strb.c b/src/util/strb.c index dda9dcdfc2..ddf50924ca 100644 --- a/src/util/strb.c +++ b/src/util/strb.c @@ -43,25 +43,29 @@ int strb_grow(strb *sb, size_t n) { return 0; } -void strb_appendf(strb *sb, const char *f, ...) { - va_list ap; - int s; +void strb_appendv(strb *sb, const char *f, va_list ap) { + va_list apSave; + int s; - va_start(ap, f); #ifdef _MSC_VER - s = _vscprintf(f, ap); + /** + * va_copy() is a C99 novelty that a particular company should have started + * supporting a long time ago, to their undying shame. + */ + + apSave = ap; + s = _vscprintf(f, apSave); #else - s = vsnprintf(NULL, 0, f, ap); + va_copy(apSave, ap); + s = vsnprintf(NULL, 0, f, apSave); #endif - va_end(ap); - + va_end(apSave); + if (s < 0) { strb_seterror(sb); return; } s += 1; if (strb_ensure(sb, s)) return; - va_start(ap, f); s = vsnprintf(sb->s+sb->l, s, f, ap); - va_end(ap); sb->l += s; } @@ -100,3 +104,11 @@ int strb_write(int fd, strb *sb) { } return 0; } + +void strb_appendf(strb *sb, const char *f, ...) { + va_list ap; + va_start(ap, f); + strb_appendv(sb, f, ap); + va_end(ap); +} + diff --git a/src/util/strb.h b/src/util/strb.h index 223145908e..88a2c08794 100644 --- a/src/util/strb.h +++ b/src/util/strb.h @@ -2,6 +2,7 @@ #define STRB_H #include "private_config.h" +#include #ifdef __cplusplus extern "C" { @@ -187,6 +188,17 @@ void strb_read(strb *sb, int fd, size_t sz); */ int strb_write(int fd, strb *sb); +/* + * Appends the result of a sprintf using the format string `f` and + * following variadic arguments list, excluding terminating nul. + * + * Unlike sprintf, this function makes sure not to run off the end of + * memory and behaves like asprintf in that respect. + * + * A format error will place the strb in error mode. + */ +void strb_appendv(strb *, const char *f, va_list ap); + /* * Returns a C string from the content of the strb. 
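As a usage note for the new strb_appendv(): it makes it straightforward to layer further printf-like helpers on top of the string builder. A minimal sketch follows; the helper name srcAppendLinef is hypothetical and not part of this patch:

#include <stdarg.h>
#include "util/strb.h"

/* Append a formatted line, followed by a newline, to a string builder. */
static void srcAppendLinef(strb* sb, const char* fmt, ...){
        va_list ap;

        va_start(ap, fmt);
        strb_appendv(sb, fmt, ap);
        va_end(ap);
        strb_appends(sb, "\n");
}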
* From 0949626935955fb8a2427b4e3bb97c753f0bc6b7 Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Sat, 4 Mar 2017 19:29:50 -0500 Subject: [PATCH 08/34] Massive refactor of kernel codegen. --- src/gpuarray_reduction.c | 984 +++++++++++++++++++-------------------- src/util/srcgen.h | 106 +++++ 2 files changed, 596 insertions(+), 494 deletions(-) create mode 100644 src/util/srcgen.h diff --git a/src/gpuarray_reduction.c b/src/gpuarray_reduction.c index a5940f504d..61f1688a4f 100644 --- a/src/gpuarray_reduction.c +++ b/src/gpuarray_reduction.c @@ -18,6 +18,7 @@ #include "gpuarray/util.h" #include "util/strb.h" +#include "util/srcgen.h" #include "util/integerfactoring.h" @@ -34,7 +35,7 @@ /** * Reduction Kernel Generator. - * + * * The generator produces a kernel from one of two "code models": * - Large * - Small @@ -43,132 +44,142 @@ * with more than SMALL_REDUX_THRESHOLD elements or more elements than * reductions for each element will result in use of the large code model; * Otherwise the small code model is used. - * - * + * + * * LARGE CODE MODEL: - * + * * In the large code model, each destination element is processed by a * single thread. - * + * * Each thread begins with an initial value in a register, reads from all * source elements contributing to the reduction, computes the result and * writes it to the destination element. - * + * * A single kernel is generated that performs prescalar transformations, the * reduction itself, postscalar transformations and the write to global memory. - * - * + * + * * SMALL CODE MODEL: - * + * * In the small code model, each destination element is processed by * multiple threads. - * + * * The destination tensor is first initialized with the initial value. Then, * one several threads cooperate to perform the reduction atomically on each * destination element. Lastly, postscalar transformations are applied * in-place. - * + * * Two or three kernels are generated: The initialization kernel, the main * kernel that performs prescalar transformations and the reduction itself, and * possibly also a postscalar transformation kernel when it is required. - * - * + * + * * Kernel Template: - * + * * The following kernel code template displays the code generated for the * small code model. For the large code model, no pre/postRedux() kernels * are generated (since their functionality is incorporated within the main * redux() kernel), no atomicRedux() function needs to be generated because * writes to global memory are unconditional and not contended. - * - * - * //Includes - * #include - * #include - * #include - * - * + * + * + * //Macros + * #define FOROVER + * #define ESCAPE + * #define srcVal //Indexer + * #define dstVal //Indexer + * #define dstArgVal //Indexer + * #define rdxIdx //Special reduction index computer + * + * * //Typedefs: - * typedef float T - * typedef int64_t X - * - * - * //Initializer (in case initial T cannot be expressed as a literal) - * static T getInitVal(void){ + * typedef float S //The type of the source array. + * typedef float T //The type of the destination array. + * typedef ssize_t A //The type of the destination argument array. + * typedef ssize_t X //The type of the indices: signed 32/64-bit. + * typedef float K //The type of the accumulator variable. + * + * + * //Initializer (in case initial value of accumulator cannot be expressed + * //as a literal) + * static K getInitValTFn(void){ * return ... * } - * - * + * static K getInitValKFn(void){ + * return ... 
+ * } + * + * * //Reduce into global memory destination a value. - * static void atomicRedux(GLOBAL_MEM T* dst, T val){ - * ... + * static void writeBackFn(GLOBAL_MEM T* d_, T d, + * GLOBAL_MEM A* a_, A a){ + * //Large code model: + * *dPtr = d; + * *aPtr = a; + * + * //Small code model: + * // Something complex possibly involving CAS loops * } - * - * - * //Load data from source and apply pre-operations. - * static T loadVal(X i0, X i1, ..., X iN, - * const GLOBAL_MEM T* src, - * const GLOBAL_MEM X* srcSteps, - * ...?){ + * + * + * //Load data from source and apply pre-operations, coercing the type to + * //the accumulator type K. + * static K loadValFn(X i0, X i1, ..., X iN, + * const GLOBAL_MEM S* srcPtr, + * const X srcOff, + * const GLOBAL_MEM X* srcSteps, + * ...?){ * return ... * } - * - * - * //Initialization kernel, - * KERNEL void preRedux(const GLOBAL_MEM X* srcSize, - * const GLOBAL_MEM X* chunkSize, - * GLOBAL_MEM T* dst, - * const X dstOff, - * const GLOBAL_MEM X* dstSteps){ - * //OFFSETS - * dst += dstOff; - * - * //Initialize - * dst[...] = getInitVal(); + * + * + * //Initialization kernel + * KERNEL void initKer(const GLOBAL_MEM X* srcSize, + * const GLOBAL_MEM X* chunkSize, + * GLOBAL_MEM T* dstPtr, + * const X dstOff, + * const GLOBAL_MEM X* dstSteps){ + * dstVal = getInitValTFn(); * } - * - * + * + * * //Reduction Kernel. - * KERNEL void redux(const GLOBAL_MEM T* src, - * const X srcOff, - * const GLOBAL_MEM X* srcSteps, - * const GLOBAL_MEM X* srcSize, - * const GLOBAL_MEM X* chunkSize, - * GLOBAL_MEM T* dst, - * const X dstOff, - * const GLOBAL_MEM X* dstSteps, - * GLOBAL_MEM X* dstArg, - * const X dstArgOff, - * const GLOBAL_MEM X* dstArgSteps){ - * //OFFSETS - * src += srcOff - * dst += dstOff - * dstArg += dstArgOff - * + * KERNEL void reduxKer(GLOBAL_MEM S* srcPtr, + * const X srcOff, + * const GLOBAL_MEM X* srcSteps, + * const GLOBAL_MEM X* srcSize, + * const GLOBAL_MEM X* chunkSize, + * GLOBAL_MEM T* dstPtr, + * const X dstOff, + * const GLOBAL_MEM X* dstSteps, + * GLOBAL_MEM A* dstArgPtr, + * const X dstArgOff, + * const GLOBAL_MEM X* dstArgSteps){ * //Declare Indices * //Compute Ranges - * - * //Define macros + * * //Outer Loops + * K rdxK = getInitValKFn(); + * A rdxA = 0; * //Inner Loops - * //Undefine macros + * K k = loadValFn(indices..., srcPtr, srcOff, srcSteps) + * rdxK = k + * rdxA = rdxIdx + * writeBackFn(&dstVal, d, &dstArgVal, a); * } - * - * + * + * * //Post-scalar kernel, - * KERNEL void postRedux(const GLOBAL_MEM X* srcSize, - * const GLOBAL_MEM X* chunkSize, - * GLOBAL_MEM T* dst, - * const X dstOff, - * const GLOBAL_MEM X* dstSteps){ - * //OFFSETS - * dst += dstOff; - * - * //Initialize - * dst[...] = getInitVal(); + * KERNEL void postKer(const GLOBAL_MEM X* srcSize, + * const GLOBAL_MEM X* chunkSize, + * GLOBAL_MEM T* dst, + * const X dstOff, + * const GLOBAL_MEM X* dstSteps){ + * //Default: Nothing. + * dstVal = dstVal * } - * - * + * + * * Initial Reduction Values * +--------------+-----+-----+---------+---------+-----+-----+-----+-----+-----+ * | Type\Op | + | * | max | min | & | | | ^ | && | || | @@ -189,8 +200,10 @@ struct redux_ctx{ const int* reduxList; /* General. */ + GpuArray* wsDst; + GpuArray* wsDstArg; int* srcAxisList; - int* dstAxisList; + size_t* dstDims; gpucontext* gpuCtx; /* Source code Generator. 
*/ @@ -204,12 +217,14 @@ struct redux_ctx{ const char* dstArgTypeStr; const char* idxTypeStr; const char* accTypeStr; - const char* initVal; + const char* initValT; + const char* initValK; int ndd; int ndr; int nds; int largeCodeModel; strb s; + srcb srcGen; char* sourceCode; size_t sourceCodeLen; char* errorString0; @@ -274,11 +289,10 @@ static void appendIdxes (strb* s, const char* epilogue); static int reduxCheckargs (redux_ctx* ctx); static void reduxSelectTypes (redux_ctx* ctx); -static int reduxSelectModel (redux_ctx* ctx); static int reduxIsSmallCodeModel (redux_ctx* ctx); static int reduxIsLargeCodeModel (redux_ctx* ctx); -static int reduxHasDst (redux_ctx* ctx); -static int reduxHasDstArg (redux_ctx* ctx); +static int reduxRequiresDst (redux_ctx* ctx); +static int reduxRequiresDstArg (redux_ctx* ctx); static int reduxKernelRequiresDst (redux_ctx* ctx); static int reduxKernelRequiresDstArg (redux_ctx* ctx); static int reduxCanAppendHwAxis (redux_ctx* ctx, @@ -292,22 +306,22 @@ static int reduxComputeAxisList (redux_ctx* ctx); static int reduxGenSource (redux_ctx* ctx); static void reduxAppendSource (redux_ctx* ctx); static void reduxAppendIncludes (redux_ctx* ctx); +static void reduxAppendTensorDeclArgs (redux_ctx* ctx, + const char* type, + const char* baseName); +static void reduxAppendTensorCallArgs (redux_ctx* ctx, + const char* baseName); +static void reduxAppendMacroDefs (redux_ctx* ctx); static void reduxAppendTypedefs (redux_ctx* ctx); -static void reduxAppendFuncGetInitVal (redux_ctx* ctx); -static void reduxAppendFuncLoadVal (redux_ctx* ctx); -static void reduxAppendFuncReduxVal (redux_ctx* ctx); -static void reduxAppendFuncPreKernel (redux_ctx* ctx); -static void reduxAppendFuncKernel (redux_ctx* ctx); -static void reduxAppendFuncPostKernel (redux_ctx* ctx); +static void reduxAppendGetInitValFns (redux_ctx* ctx); +static void reduxAppendWriteBackFn (redux_ctx* ctx); +static void reduxAppendReduxKernel (redux_ctx* ctx); static void reduxAppendPrototype (redux_ctx* ctx); -static void reduxAppendOffsets (redux_ctx* ctx); static void reduxAppendIndexDeclarations (redux_ctx* ctx); static void reduxAppendRangeCalculations (redux_ctx* ctx); static void reduxAppendLoops (redux_ctx* ctx); -static void reduxAppendLoopMacroDefs (redux_ctx* ctx); -static void reduxAppendLoopOuter (redux_ctx* ctx); -static void reduxAppendLoopInner (redux_ctx* ctx); -static void reduxAppendLoopMacroUndefs (redux_ctx* ctx); +static void reduxAppendInitKernel (redux_ctx* ctx); +static void reduxAppendPostKernel (redux_ctx* ctx); static int reduxCompile (redux_ctx* ctx); static int reduxSchedule (redux_ctx* ctx); static void reduxScheduleKernel (int ndims, @@ -771,28 +785,36 @@ static void appendIdxes (strb* s, } /** - * @brief Check the sanity of the arguments, in agreement with the + * @brief Check the sanity of the arguments in agreement with the * documentation for GpuArray_reduction(). * - * Also initialize certain parts of the context. + * Also initialize certain parts of the context, allocate memory + * buffers and fail out if at any point the environment gives us + * a problem. * - * @return GA_INVALID_ERROR if arguments invalid; GA_NO_ERROR otherwise. + * @return GA_INVALID_ERROR if arguments invalid; GA_NO_MEMORY if out of + * memory, GA_NO_ERROR otherwise. */ static int reduxCheckargs (redux_ctx* ctx){ - int i, ret; + int i, j, ret, retT, retK; + unsigned numProcs; + size_t localSize; + size_t dstNumElem = 1, reduxPerElem = 1; /** * We initialize certain parts of the context. 
*/ + ctx->wsDst = NULL; + ctx->wsDstArg = NULL; ctx->srcAxisList = NULL; - ctx->dstAxisList = NULL; + ctx->dstDims = NULL; ctx->gpuCtx = NULL; ctx->srcTypeStr = ctx->dstTypeStr = ctx->dstArgTypeStr = ctx->accTypeStr = ctx->idxTypeStr = NULL; - ctx->initVal = NULL; + ctx->initValK = NULL; ctx->pri.ndh = ctx->aux.ndh = 0; ctx->pri.ndhd = ctx->aux.ndhd = 0; ctx->pri.ndhr = ctx->aux.ndhr = 0; @@ -802,6 +824,7 @@ static int reduxCheckargs (redux_ctx* ctx){ ctx->errorString1 = NULL; ctx->errorString2 = NULL; strb_init(&ctx->s); + srcbInit (&ctx->srcGen, &ctx->s); for (i=0;iaux.axisList[i] = ctx->pri.axisList[i] = 0; @@ -817,12 +840,14 @@ static int reduxCheckargs (redux_ctx* ctx){ /* Insane src, reduxLen, dst or dstArg? */ - if (!ctx->src || ctx->src->nd <= 0 || ctx->reduxLen == 0 || - ctx->reduxLen > (int)ctx->src->nd){ - return reduxCleanup(ctx, GA_INVALID_ERROR); - } - if ((reduxHasDst (ctx) && !ctx->dst) || - (reduxHasDstArg(ctx) && !ctx->dstArg)){ + if (!ctx->src || + (reduxRequiresDst (ctx) && !ctx->dst) || + (reduxRequiresDstArg(ctx) && !ctx->dstArg) || + (ctx->src->nd <= 0) || + (ctx->reduxLen <= 0) || + (ctx->src->nd < (unsigned)ctx->reduxLen) || + (ctx->dst && ctx->dst->nd +ctx->reduxLen != ctx->src->nd) || + (ctx->dstArg && ctx->dstArg->nd+ctx->reduxLen != ctx->src->nd) ){ return reduxCleanup(ctx, GA_INVALID_ERROR); } @@ -855,36 +880,46 @@ static int reduxCheckargs (redux_ctx* ctx){ /* Determine initializer, and error out if reduction unsupported. */ switch (ctx->op){ case GA_REDUCE_SUM: - ret = reduxGetSumInit (ctx->accTypeCode, &ctx->initVal); + retT = reduxGetSumInit (ctx->dstTypeCode, &ctx->initValT); + retK = reduxGetSumInit (ctx->accTypeCode, &ctx->initValK); break; case GA_REDUCE_PRODNZ: case GA_REDUCE_PROD: - ret = reduxGetProdInit(ctx->accTypeCode, &ctx->initVal); + retT = reduxGetProdInit(ctx->dstTypeCode, &ctx->initValT); + retK = reduxGetProdInit(ctx->accTypeCode, &ctx->initValK); break; case GA_REDUCE_MINANDARGMIN: case GA_REDUCE_ARGMIN: case GA_REDUCE_MIN: - ret = reduxGetMinInit (ctx->accTypeCode, &ctx->initVal); + retT = reduxGetMinInit (ctx->dstTypeCode, &ctx->initValT); + retK = reduxGetMinInit (ctx->accTypeCode, &ctx->initValK); break; case GA_REDUCE_MAXANDARGMAX: case GA_REDUCE_ARGMAX: case GA_REDUCE_MAX: - ret = reduxGetMaxInit (ctx->accTypeCode, &ctx->initVal); + retT = reduxGetMaxInit (ctx->dstTypeCode, &ctx->initValT); + retK = reduxGetMaxInit (ctx->accTypeCode, &ctx->initValK); break; case GA_REDUCE_ALL: case GA_REDUCE_AND: - ret = reduxGetAndInit (ctx->accTypeCode, &ctx->initVal); + retT = reduxGetAndInit (ctx->dstTypeCode, &ctx->initValT); + retK = reduxGetAndInit (ctx->accTypeCode, &ctx->initValK); break; case GA_REDUCE_ANY: case GA_REDUCE_XOR: case GA_REDUCE_OR: - ret = reduxGetOrInit (ctx->accTypeCode, &ctx->initVal); + retT = reduxGetOrInit (ctx->dstTypeCode, &ctx->initValT); + retK = reduxGetOrInit (ctx->accTypeCode, &ctx->initValK); break; default: - ret = GA_UNSUPPORTED_ERROR; + retT = GA_UNSUPPORTED_ERROR; + retK = GA_UNSUPPORTED_ERROR; } - if (ret != GA_NO_ERROR){ - return reduxCleanup(ctx, ret); + if (retT != GA_NO_ERROR){ + return reduxCleanup(ctx, retT); + } + if (retK != GA_NO_ERROR){ + return reduxCleanup(ctx, retK); } @@ -896,11 +931,109 @@ static int reduxCheckargs (redux_ctx* ctx){ ctx->nds = ctx->src->nd; ctx->ndr = ctx->reduxLen; ctx->ndd = ctx->nds - ctx->ndr; - strb_ensure(&ctx->s, 5*1024); + strb_ensure(&ctx->s, 3*1024); + + + /** + * And make a few small dynamic memory allocations for the benefit of the + * rest of the code, allowing 
error checking to happen early and fail fast. + */ + + ctx->srcAxisList = malloc(ctx->nds * sizeof(unsigned)); + ctx->dstDims = malloc(ctx->ndd * sizeof(size_t)); + if (!ctx->srcAxisList || + !ctx->dstDims ){ + return reduxCleanup(ctx, GA_MEMORY_ERROR); + } + + + /** + * Query device for approximate total level of parallelism. If destination + * tensor is so big it can keep all threads busy on individual elements, + * use large code model; Otherwise use small code model, where threads will + * have to cooperate. + * + * - Large (Destination tensor >= SMALL_REDUX_THRESHOLD elements, or + * destination tensor size >= # of reductions per destination + * tensor element): + * All destination elements have their own thread. + * - Small (otherwise): + * Multiple threads cooperate on a single destination element. + */ + ret = gpucontext_property(ctx->gpuCtx, GA_CTX_PROP_NUMPROCS, &numProcs); + if (ret != GA_NO_ERROR){ + return reduxCleanup(ctx, ret); + } + ret = gpucontext_property(ctx->gpuCtx, GA_CTX_PROP_MAXLSIZE, &localSize); + if (ret != GA_NO_ERROR){ + return reduxCleanup(ctx, ret); + } + for (i=j=0;inds;i++){ + if (axisInSet(i, ctx->reduxList, ctx->nds, NULL)){ + reduxPerElem *= ctx->src->dimensions[i]; + }else{ + dstNumElem *= ctx->src->dimensions[i]; + ctx->dstDims[j++] = ctx->src->dimensions[i];; + } + } - return reduxSelectModel(ctx); + ctx->largeCodeModel = dstNumElem >= numProcs*localSize || + dstNumElem >= reduxPerElem + || 1;/* BUG: Erase when small code model implemented. */ + /** + * *** IT IS NOW SAFE TO CALL: *** + * - reduxIsLargeModel() + * - reduxIsSmallModel() + * - reduxKernelRequiresDst() + * - reduxKernelRequiresDstArg() + */ + + + /** + * Allocate workspaces. + * + * Certain reductions may require a workspace that isn't provided by the user. + * For instance, **when using the small code model**, argmin/argmax require + * a dst buffer, but the user didn't supply one (as he would have for + * maxandargmax/minandargmin). We must allocate and deallocate it ourselves. + * + * Otherwise we use the user-supplied buffers. + */ + + if (!reduxRequiresDst (ctx) && reduxKernelRequiresDst(ctx)){ + ctx->wsDst = malloc(sizeof(*ctx->wsDst)); + if (!ctx->wsDst){ + return reduxCleanup(ctx, GA_MEMORY_ERROR); + } + + ret = GpuArray_empty(ctx->wsDst, ctx->gpuCtx, ctx->dstTypeCode, + ctx->ndd, ctx->dstDims, GA_C_ORDER); + if(ret != GA_NO_ERROR){ + return reduxCleanup(ctx, ret); + } + }else{ + ctx->wsDst = ctx->dst; + } + if (!reduxRequiresDstArg(ctx) && reduxKernelRequiresDstArg(ctx)){ + ctx->wsDstArg = malloc(sizeof(*ctx->wsDstArg)); + if (!ctx->wsDstArg){ + return reduxCleanup(ctx, GA_MEMORY_ERROR); + } + + ret = GpuArray_empty(ctx->wsDstArg, ctx->gpuCtx, ctx->dstArgTypeCode, + ctx->ndd, ctx->dstDims, GA_C_ORDER); + if(ret != GA_NO_ERROR){ + return reduxCleanup(ctx, ret); + } + }else{ + ctx->wsDstArg = ctx->dstArg; + } + + + + return reduxSelectHwAxes(ctx); } /** @@ -948,67 +1081,6 @@ static void reduxSelectTypes (redux_ctx* ctx){ ctx->accTypeStr = gpuarray_get_type(ctx->accTypeCode) ->cluda_name; } -/** - * @brief Select which code model will be used: - * - * - Large (Destination tensor >= SMALL_REDUX_THRESHOLD elements, or - * destination tensor size >= # of reductions per destination - * tensor element): - * All destination elements have their own thread. - * - Small (otherwise): - * Multiple threads cooperate on a single destination element. 
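 *
 * (A purely illustrative example of the selection rule computed above, with
 *  made-up device numbers: if numProcs = 16 and localSize = 1024, then
 *  numProcs*localSize = 16384. Reducing a (1000000, 64) tensor over axis 1
 *  gives dstNumElem = 1000000 >= 16384, so the large code model is selected.
 *  Reducing a (4, 1000000) tensor over axis 1 gives dstNumElem = 4, smaller
 *  than both 16384 and the 1000000 reductions per destination element, so
 *  the small code model would be selected once it is implemented.)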
- */ - -static int reduxSelectModel (redux_ctx* ctx){ - int i, ret; - unsigned numProcs; - size_t localSize; - size_t dstNumElem = 1, reduxPerElem = 1; - - - /** - * Query device for approximate total level of parallelism. If destination - * tensor is so big it can keep all threads busy on individual elements, - * use large code model; Otherwise use small code model, where threads will - * have to cooperate. - */ - - ret = gpucontext_property(ctx->gpuCtx, GA_CTX_PROP_NUMPROCS, &numProcs); - if (ret != GA_NO_ERROR){ - return reduxCleanup(ctx, ret); - } - ret = gpucontext_property(ctx->gpuCtx, GA_CTX_PROP_MAXLSIZE, &localSize); - if (ret != GA_NO_ERROR){ - return reduxCleanup(ctx, ret); - } - - - /** - * Compute #elems in dst and # reductions per dst element. - */ - - for (i=0;inds;i++){ - if (axisInSet(i, ctx->reduxList, ctx->nds, NULL)){ - reduxPerElem *= ctx->src->dimensions[i]; - }else{ - dstNumElem *= ctx->src->dimensions[i]; - } - } - ctx->largeCodeModel = dstNumElem >= numProcs*localSize || - dstNumElem >= reduxPerElem - || 1;/* BUG: Erase when small code model implemented. */ - /** - * *** IT IS NOW SAFE TO CALL: *** - * - reduxIsLargeModel() - * - reduxIsSmallModel() - * - reduxKernelRequiresDst() - * - reduxKernelRequiresDstArg() - */ - - - return reduxSelectHwAxes(ctx); -} - /** * @brief Returns whether we are using the small code model or not. */ @@ -1029,7 +1101,7 @@ static int reduxIsLargeCodeModel (redux_ctx* ctx){ * @brief Returns whether the reduction interface requires a dst argument. */ -static int reduxHasDst (redux_ctx* ctx){ +static int reduxRequiresDst (redux_ctx* ctx){ switch (ctx->op){ case GA_REDUCE_ARGMIN: case GA_REDUCE_ARGMAX: @@ -1043,7 +1115,7 @@ static int reduxHasDst (redux_ctx* ctx){ * @brief Returns whether the reduction interface requires a dstArg argument. */ -static int reduxHasDstArg (redux_ctx* ctx){ +static int reduxRequiresDstArg (redux_ctx* ctx){ switch (ctx->op){ case GA_REDUCE_MINANDARGMIN: case GA_REDUCE_MAXANDARGMAX: @@ -1093,7 +1165,7 @@ static int reduxKernelRequiresDstArg (redux_ctx* ctx){ * buffer for indexes, and will not in the foreseeable future. */ - return reduxHasDstArg(ctx); + return reduxRequiresDstArg(ctx); } /** @@ -1107,8 +1179,8 @@ static int reduxCanAppendHwAxis (redux_ctx* ctx, int kernelNdh = kernelType == KERNEL_PRIMARY ? ctx->pri.ndh : ctx->aux.ndh; int kernelNdhr = kernelType == KERNEL_PRIMARY ? ctx->pri.ndhr : ctx->aux.ndhr; int kernelNdhd = kernelType == KERNEL_PRIMARY ? ctx->pri.ndhd : ctx->aux.ndhd; - - if(kernelNdh >= MAX_HW_DIMS){ + + if (kernelNdh >= MAX_HW_DIMS){ return 0; }else{ return axisType == AXIS_REDUX ? kernelNdhr < ctx->ndr: @@ -1215,19 +1287,11 @@ static int reduxSelectHwAxes (redux_ctx* ctx){ * The first ctx->ndd axes correspond to the outer loops that iterate over * each destination element. The last ctx->ndr axes correspond to the inner * loops that iterate over the dimensions of elements that are to be reduced. - * - * @return GA_MEMORY_ERROR if allocating the list failed; Otherwise, returns - * GA_NO_ERROR. 
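 *
 * (For illustration: with a 5-dimensional source and reduxList = {3,4,1},
 *  ndd = 2 and ndr = 3, so the axis list computed below would be expected to
 *  come out as {0, 2, 3, 4, 1} -- free axes first, reduced axes last, the
 *  reduced axes keeping the caller-supplied order since that order matters
 *  for dstArg index calculations.)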
*/ static int reduxComputeAxisList (redux_ctx* ctx){ int i, f=0; - ctx->srcAxisList = malloc(ctx->nds * sizeof(unsigned)); - if (!ctx->srcAxisList){ - return reduxCleanup(ctx, GA_MEMORY_ERROR); - } - for (i=0;inds;i++){ if (!axisInSet(i, ctx->reduxList, ctx->ndr, 0)){ ctx->srcAxisList[f++] = i; @@ -1257,15 +1321,81 @@ static int reduxGenSource (redux_ctx* ctx){ } static void reduxAppendSource (redux_ctx* ctx){ reduxAppendIncludes (ctx); + reduxAppendMacroDefs (ctx); reduxAppendTypedefs (ctx); - reduxAppendFuncGetInitVal (ctx); - reduxAppendFuncLoadVal (ctx); - reduxAppendFuncReduxVal (ctx); - if(reduxIsSmallCodeModel(ctx)){ - reduxAppendFuncPreKernel (ctx); - reduxAppendFuncPostKernel (ctx); - } - reduxAppendFuncKernel (ctx); + reduxAppendGetInitValFns (ctx); + reduxAppendWriteBackFn (ctx); + reduxAppendReduxKernel (ctx); + if (reduxIsSmallCodeModel(ctx)){ + reduxAppendInitKernel (ctx); + reduxAppendPostKernel (ctx); + } +} +static void reduxAppendTensorDeclArgs (redux_ctx* ctx, + const char* type, + const char* baseName){ + srcbAppendElemf(&ctx->srcGen, "%s* %sPtr", type, baseName); + srcbAppendElemf(&ctx->srcGen, "const X %sOff", baseName); + srcbAppendElemf(&ctx->srcGen, "const GLOBAL_MEM X* %sSteps", baseName); + (void)reduxAppendTensorCallArgs;/* Silence unused warning */ +} +static void reduxAppendTensorCallArgs (redux_ctx* ctx, + const char* baseName){ + srcbAppendElemf(&ctx->srcGen, "%sPtr", baseName); + srcbAppendElemf(&ctx->srcGen, "%sOff", baseName); + srcbAppendElemf(&ctx->srcGen, "%sSteps", baseName); +} +static void reduxAppendMacroDefs (redux_ctx* ctx){ + int i; + + srcbAppends (&ctx->srcGen, "#define FOROVER(idx) for(i##idx = i##idx##Start; i##idx < i##idx##End; i##idx++)\n"); + srcbAppends (&ctx->srcGen, "#define ESCAPE(idx) if(i##idx >= i##idx##Dim){continue;}\n"); + + /* srcVal indexer */ + srcbAppends (&ctx->srcGen, "#define srcVal (*(const GLOBAL_MEM S*)("); + srcbBeginList (&ctx->srcGen, "+", "0"); + srcbAppendElemf(&ctx->srcGen, "(const GLOBAL_MEM char*)srcPtr"); + srcbAppendElemf(&ctx->srcGen, "srcOff"); + for (i=0;inds;i++){ + srcbAppendElemf(&ctx->srcGen, "i%d*i%dSStep", i, i); + } + srcbEndList (&ctx->srcGen); + srcbAppends (&ctx->srcGen, "))\n"); + + /* dstVal indexer */ + if (reduxKernelRequiresDst(ctx)){ + srcbAppends (&ctx->srcGen, "#define dstVal (*(GLOBAL_MEM T*)("); + srcbBeginList (&ctx->srcGen, "+", "0"); + srcbAppendElemf(&ctx->srcGen, "(GLOBAL_MEM char*)dstPtr"); + srcbAppendElemf(&ctx->srcGen, "dstOff"); + for (i=0;indd;i++){ + srcbAppendElemf(&ctx->srcGen, "i%d*i%dDStep", i, i); + } + srcbEndList (&ctx->srcGen); + srcbAppends (&ctx->srcGen, "))\n"); + } + + /* dstArgVal indexer */ + if (reduxKernelRequiresDstArg(ctx)){ + srcbAppends (&ctx->srcGen, "#define dstArgVal (*(GLOBAL_MEM A*)("); + srcbBeginList (&ctx->srcGen, "+", "0"); + srcbAppendElemf(&ctx->srcGen, "(GLOBAL_MEM char*)dstArgPtr"); + srcbAppendElemf(&ctx->srcGen, "dstArgOff"); + for (i=0;indd;i++){ + srcbAppendElemf(&ctx->srcGen, "i%d*i%dAStep", i, i); + } + srcbEndList (&ctx->srcGen); + srcbAppends (&ctx->srcGen, "))\n"); + } + + /* rdxIdx indexer */ + srcbAppends (&ctx->srcGen, "#define rdxIdx ("); + srcbBeginList (&ctx->srcGen, "+", "0"); + for (i=ctx->ndd;inds;i++){ + srcbAppendElemf(&ctx->srcGen, "i%d*i%dPDim", i, i); + } + srcbEndList (&ctx->srcGen); + srcbAppends (&ctx->srcGen, ")\n"); } static void reduxAppendIncludes (redux_ctx* ctx){ strb_appends(&ctx->s, "/* Includes */\n"); @@ -1281,47 +1411,20 @@ static void reduxAppendTypedefs (redux_ctx* ctx){ strb_appendf(&ctx->s, "typedef 
%s X;\n", ctx->idxTypeStr); /* The type of the indices: signed 32/64-bit. */ strb_appendf(&ctx->s, "typedef %s K;\n", ctx->accTypeStr); /* The type of the accumulator variable. */ } -static void reduxAppendFuncGetInitVal (redux_ctx* ctx){ +static void reduxAppendGetInitValFns (redux_ctx* ctx){ /** - * Initial value function. + * Initial value functions. */ - strb_appendf(&ctx->s, "WITHIN_KERNEL K getInitVal(void){\n" + strb_appendf(&ctx->s, "WITHIN_KERNEL T getInitValTFn(void){\n" "\treturn (%s);\n" - "}\n\n\n\n", ctx->initVal); -} -static void reduxAppendFuncLoadVal (redux_ctx* ctx){ - int i; - - /** - * Multidimensional source element loader. - * - * Also implements prescalar transformations if any. - */ - - appendIdxes (&ctx->s, "WITHIN_KERNEL K loadVal(", "X i", 0, ctx->nds, "", ""); - if (ctx->nds > 0){ - strb_appends(&ctx->s, ", "); - } - strb_appends(&ctx->s, "const GLOBAL_MEM S* src, const GLOBAL_MEM X* srcSteps){\n"); - strb_appends(&ctx->s, "\tS v = (*(const GLOBAL_MEM S*)((const GLOBAL_MEM char*)src + "); - for (i=0;inds;i++){ - strb_appendf(&ctx->s, "i%d*srcSteps[%d] + \\\n\t ", i, ctx->srcAxisList[i]); - } - strb_appends(&ctx->s, "0));\n"); - - /* Prescalar transformations go here... */ - - /* Return the value. */ - strb_appends(&ctx->s, "\treturn v;\n"); - strb_appends(&ctx->s, "}\n\n\n\n"); + "}\n\n\n\n" + "WITHIN_KERNEL K getInitValKFn(void){\n" + "\treturn (%s);\n" + "}\n\n\n\n", ctx->initValT, ctx->initValK); } -static void reduxAppendFuncReduxVal (redux_ctx* ctx){ - int i, anyArgsEmitted = 0; - +static void reduxAppendWriteBackFn (redux_ctx* ctx){ /** - * Function Signature. - * * Global memory value reduction function. * * Responsible for either: @@ -1329,44 +1432,25 @@ static void reduxAppendFuncReduxVal (redux_ctx* ctx){ * 2) Safe atomic reduction of partial value into memory. */ - appendIdxes (&ctx->s, "WITHIN_KERNEL void reduxVal(", "X i", 0, ctx->ndd, "", ""); - anyArgsEmitted = ctx->ndd>0; - if (reduxKernelRequiresDst (ctx)){ - if (anyArgsEmitted){ - strb_appends(&ctx->s, ", "); - } - anyArgsEmitted = 1; - strb_appends(&ctx->s, "GLOBAL_MEM T* dst, const GLOBAL_MEM X* dstSteps, K v"); + srcbAppends (&ctx->srcGen, "WITHIN_KERNEL void writeBackFn("); + srcbBeginList (&ctx->srcGen, ", ", "void"); + if (reduxKernelRequiresDst(ctx)){ + srcbAppendElemf(&ctx->srcGen, "GLOBAL_MEM T* d_"); + srcbAppendElemf(&ctx->srcGen, "T d"); } if (reduxKernelRequiresDstArg(ctx)){ - if (anyArgsEmitted){ - strb_appends(&ctx->s, ", "); - } - anyArgsEmitted = 1; - strb_appends(&ctx->s, "GLOBAL_MEM A* dstArg, const GLOBAL_MEM X* dstArgSteps, X i"); + srcbAppendElemf(&ctx->srcGen, "GLOBAL_MEM A* a_"); + srcbAppendElemf(&ctx->srcGen, "A a"); } - strb_appends(&ctx->s, "){\n"); - + srcbEndList (&ctx->srcGen); + srcbAppends (&ctx->srcGen, "){\n"); - /* Post-scalar transformations go here. */ - - - /* Write to memory. */ if (reduxIsLargeCodeModel(ctx)){ - /* Large code model. Easy: just write out the data, since it's safe. 
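 * (For instance, for a max-and-argmax reduction, which needs both dst and
 *  dstArg, the emitted function boils down to
 *
 *      WITHIN_KERNEL void writeBackFn(GLOBAL_MEM T* d_, T d,
 *                                     GLOBAL_MEM A* a_, A a){
 *          *d_ = d;
 *          *a_ = a;
 *      }
 *
 *  an unconditional store, which is safe here because in the large code
 *  model each destination element is owned by exactly one thread.)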
*/ if (reduxKernelRequiresDst (ctx)){ - strb_appends(&ctx->s, "\t(*(GLOBAL_MEM T*)((GLOBAL_MEM char*)dst + "); - for (i=0;indd;i++){ - strb_appendf(&ctx->s, "i%d*dstSteps[%d] +\n\t ", i, i); - } - strb_appends(&ctx->s, "0)) = v;\n"); + srcbAppends (&ctx->srcGen, "\t*d_ = d;\n"); } if (reduxKernelRequiresDstArg(ctx)){ - strb_appends(&ctx->s, "\t(*(GLOBAL_MEM A*)((GLOBAL_MEM char*)dstArg + "); - for (i=0;indd;i++){ - strb_appendf(&ctx->s, "i%d*dstArgSteps[%d] +\n\t ", i, i); - } - strb_appends(&ctx->s, "0)) = i;\n"); + srcbAppends (&ctx->srcGen, "\t*a_ = a;\n"); } }else{ /* BUG: Implement the atomic reduction, one or two CAS loops. */ @@ -1382,49 +1466,28 @@ static void reduxAppendFuncReduxVal (redux_ctx* ctx){ /* Close off function. */ strb_appends(&ctx->s, "}\n\n\n\n"); } -static void reduxAppendFuncPreKernel (redux_ctx* ctx){ - -} -static void reduxAppendFuncKernel (redux_ctx* ctx){ +static void reduxAppendReduxKernel (redux_ctx* ctx){ reduxAppendPrototype (ctx); strb_appends (&ctx->s, "{\n"); - reduxAppendOffsets (ctx); reduxAppendIndexDeclarations(ctx); reduxAppendRangeCalculations(ctx); reduxAppendLoops (ctx); strb_appends (&ctx->s, "}\n"); -} -static void reduxAppendFuncPostKernel (redux_ctx* ctx){ - } static void reduxAppendPrototype (redux_ctx* ctx){ - strb_appends(&ctx->s, "/**\n"); - strb_appends(&ctx->s, " * Reduction Kernel.\n"); - strb_appends(&ctx->s, " *\n"); - strb_appends(&ctx->s, " * Implements actual reduction operation.\n"); - strb_appends(&ctx->s, " */\n\n"); - strb_appends(&ctx->s, "KERNEL void redux(const GLOBAL_MEM S* src,\n"); - strb_appends(&ctx->s, " const X srcOff,\n"); - strb_appends(&ctx->s, " const GLOBAL_MEM X* srcSteps,\n"); - strb_appends(&ctx->s, " const GLOBAL_MEM X* srcSize,\n"); - strb_appends(&ctx->s, " const GLOBAL_MEM X* chunkSize,\n"); - strb_appends(&ctx->s, " GLOBAL_MEM T* dst,\n"); - strb_appends(&ctx->s, " const X dstOff,\n"); - strb_appends(&ctx->s, " const GLOBAL_MEM X* dstSteps,\n"); - strb_appends(&ctx->s, " GLOBAL_MEM A* dstArg,\n"); - strb_appends(&ctx->s, " const X dstArgOff,\n"); - strb_appends(&ctx->s, " const GLOBAL_MEM X* dstArgSteps)"); -} -static void reduxAppendOffsets (redux_ctx* ctx){ - strb_appends(&ctx->s, "\t/* Add offsets */\n"); - strb_appends(&ctx->s, "\tsrc = (const GLOBAL_MEM T*)((const GLOBAL_MEM char*)src + srcOff);\n"); - if (reduxKernelRequiresDst(ctx)){ - strb_appends(&ctx->s, "\tdst = (GLOBAL_MEM T*) ((GLOBAL_MEM char*) dst + dstOff);\n"); + srcbAppends (&ctx->srcGen, "KERNEL void reduxKer("); + srcbBeginList (&ctx->srcGen, ", ", "void"); + reduxAppendTensorDeclArgs(ctx, "S", "src"); + srcbAppendElemf(&ctx->srcGen, "const GLOBAL_MEM X* srcSize"); + srcbAppendElemf(&ctx->srcGen, "const GLOBAL_MEM X* chunkSize"); + if(reduxKernelRequiresDst(ctx)){ + reduxAppendTensorDeclArgs(ctx, "T", "dst"); } - if (reduxKernelRequiresDstArg(ctx)){ - strb_appends(&ctx->s, "\tdstArg = (GLOBAL_MEM X*) ((GLOBAL_MEM char*) dstArg + dstArgOff);\n"); + if(reduxKernelRequiresDstArg(ctx)){ + reduxAppendTensorDeclArgs(ctx, "A", "dstArg"); } - strb_appends(&ctx->s, "\t\n\t\n"); + srcbEndList (&ctx->srcGen); + srcbAppends (&ctx->srcGen, ")"); } static void reduxAppendIndexDeclarations (redux_ctx* ctx){ int i; @@ -1441,39 +1504,39 @@ static void reduxAppendIndexDeclarations (redux_ctx* ctx){ i, i, (i==ctx->pri.ndh-1) ? 
";\n" : ", "); } } - strb_appends(&ctx->s, "\t\n\t\n"); strb_appends(&ctx->s, "\t/* Free indices & Reduction indices */\n"); - if (ctx->nds > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "", ";\n");} if (ctx->nds > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "Dim", ";\n");} if (ctx->nds > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "Start", ";\n");} if (ctx->nds > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "End", ";\n");} if (ctx->nds > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "SStep", ";\n");} - if (ctx->ndd > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->ndd, "MStep", ";\n");} + if (ctx->ndd > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->ndd, "DStep", ";\n");} if (ctx->ndd > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->ndd, "AStep", ";\n");} if (ctx->nds > ctx->ndd){appendIdxes (&ctx->s, "\tX ", "i", ctx->ndd, ctx->nds, "PDim", ";\n");} - strb_appends(&ctx->s, "\t\n\t\n"); } static void reduxAppendRangeCalculations (redux_ctx* ctx){ size_t hwDim; int i; - /* Use internal remapping when computing the ranges for this thread. */ strb_appends(&ctx->s, "\t/* Compute ranges for this thread. */\n"); for (i=0;inds;i++){ - strb_appendf(&ctx->s, "\ti%dDim = srcSize[%d];\n", i, ctx->srcAxisList[i]); + strb_appendf(&ctx->s, "\ti%dDim = srcSize[%d];\n", i, ctx->srcAxisList[i]); } for (i=0;inds;i++){ strb_appendf(&ctx->s, "\ti%dSStep = srcSteps[%d];\n", i, ctx->srcAxisList[i]); } - for (i=0;indd;i++){ - strb_appendf(&ctx->s, "\ti%dMStep = dstSteps[%d];\n", i, i); + if(reduxKernelRequiresDst(ctx)){ + for (i=0;indd;i++){ + strb_appendf(&ctx->s, "\ti%dDStep = dstSteps[%d];\n", i, i); + } } - for (i=0;indd;i++){ - strb_appendf(&ctx->s, "\ti%dAStep = dstArgSteps[%d];\n", i, i); + if(reduxKernelRequiresDstArg(ctx)){ + for (i=0;indd;i++){ + strb_appendf(&ctx->s, "\ti%dAStep = dstArgSteps[%d];\n", i, i); + } } for (i=ctx->nds-1;i>=ctx->ndd;i--){ /** @@ -1515,192 +1578,114 @@ static void reduxAppendRangeCalculations (redux_ctx* ctx){ strb_appends(&ctx->s, "\t\n\t\n"); } static void reduxAppendLoops (redux_ctx* ctx){ - strb_appends(&ctx->s, "\t/**\n"); - strb_appends(&ctx->s, "\t * FREE LOOPS.\n"); - strb_appends(&ctx->s, "\t */\n"); - strb_appends(&ctx->s, "\t\n"); - - reduxAppendLoopMacroDefs (ctx); - reduxAppendLoopOuter (ctx); - reduxAppendLoopMacroUndefs(ctx); -} -static void reduxAppendLoopMacroDefs (redux_ctx* ctx){ - int i; - - /** - * FOROVER Macro - */ - - strb_appends(&ctx->s, "#define FOROVER(idx) for(i##idx = i##idx##Start; i##idx < i##idx##End; i##idx++)\n"); - - /** - * ESCAPE Macro - */ - - strb_appends(&ctx->s, "#define ESCAPE(idx) if(i##idx >= i##idx##Dim){continue;}\n"); - - /** - * RDXINDEXER Macro - */ - - appendIdxes (&ctx->s, "#define RDXINDEXER(", "i", ctx->ndd, ctx->nds, "", ") ("); - for (i=ctx->ndd;inds;i++){ - strb_appendf(&ctx->s, "i%d*i%dPDim + \\\n ", i, i); - } - strb_appends(&ctx->s, "0)\n"); -} -static void reduxAppendLoopOuter (redux_ctx* ctx){ int i; - /** - * Outer Loop Header Generation - */ - - for (i=0;indd;i++){ - strb_appendf(&ctx->s, "\tFOROVER(%d){ESCAPE(%d)\n", i, i); - } - - /** - * Inner Loop Generation - */ - - reduxAppendLoopInner(ctx); - - /** - * Outer Loop Trailer Generation - */ - for (i=0;indd;i++){ - strb_appends(&ctx->s, "\t}\n"); + srcbAppendf(&ctx->srcGen, "\tFOROVER(%d){ESCAPE(%d)\n", i, i); } -} -static void reduxAppendLoopInner (redux_ctx* ctx){ - int i; - /** - * Inner Loop Prologue - */ - - strb_appends(&ctx->s, "\t\t/**\n"); - strb_appends(&ctx->s, "\t\t * Reduction initialization.\n"); - 
strb_appends(&ctx->s, "\t\t */\n"); - strb_appends(&ctx->s, "\t\t\n"); - strb_appends(&ctx->s, "\t\tK rdxV = getInitVal();\n"); + srcbAppends (&ctx->srcGen, "\t\tT rdxT;\n"); + srcbAppends (&ctx->srcGen, "\t\tK rdxK = getInitValKFn();\n"); if (reduxKernelRequiresDstArg(ctx)){ - strb_appends(&ctx->s, "\t\tX argI = 0;\n"); + srcbAppends(&ctx->srcGen, "\t\tX rdxA = 0;\n"); } - strb_appends(&ctx->s, "\t\t\n"); - strb_appends(&ctx->s, "\t\t/**\n"); - strb_appends(&ctx->s, "\t\t * REDUCTION LOOPS.\n"); - strb_appends(&ctx->s, "\t\t */\n"); - strb_appends(&ctx->s, "\t\t\n"); - - /** - * Inner Loop Header Generation - */ + srcbAppends (&ctx->srcGen, "\t\t\n"); for (i=ctx->ndd;inds;i++){ - strb_appendf(&ctx->s, "\t\tFOROVER(%d){ESCAPE(%d)\n", i, i); + srcbAppendf (&ctx->srcGen, "\t\tFOROVER(%d){ESCAPE(%d)\n", i, i); } + srcbAppends (&ctx->srcGen, "\t\t\tS s = srcVal;\n"); + /** - * Inner Loop Body Generation + * Prescalar transformations go here. They transform and coerce the S-typed + * value s into the K-typed value k. */ - appendIdxes (&ctx->s, "\t\t\tK v = loadVal(", "i", 0, ctx->nds, "", ""); - if (ctx->nds > 0){ - strb_appends(&ctx->s, ", "); - } - strb_appends(&ctx->s, "src, srcSteps);\n"); - strb_appends(&ctx->s, "\t\t\t\n"); + srcbAppends (&ctx->srcGen, "\t\t\tK k = s;\n"); + switch (ctx->op){ case GA_REDUCE_SUM: - strb_appends(&ctx->s, "\t\t\trdxV += v;\n"); + srcbAppends(&ctx->srcGen, "\t\t\trdxK += k;\n"); break; case GA_REDUCE_PROD: - strb_appends(&ctx->s, "\t\t\trdxV *= v;\n"); + srcbAppends(&ctx->srcGen, "\t\t\trdxK *= k;\n"); break; case GA_REDUCE_PRODNZ: - strb_appends(&ctx->s, "\t\t\trdxV *= v==0 ? getInitVal() : v;\n"); + srcbAppends(&ctx->srcGen, "\t\t\trdxK *= k==0 ? getInitValKFn() : k;\n"); break; case GA_REDUCE_MIN: - strb_appends(&ctx->s, "\t\t\trdxV = min(rdxV, v);\n"); + srcbAppends(&ctx->srcGen, "\t\t\trdxK = min(rdxK, k);\n"); break; case GA_REDUCE_MAX: - strb_appends(&ctx->s, "\t\t\trdxV = max(rdxV, v);\n"); + srcbAppends(&ctx->srcGen, "\t\t\trdxK = max(rdxK, k);\n"); break; case GA_REDUCE_ARGMIN: case GA_REDUCE_MINANDARGMIN: - strb_appends(&ctx->s, "\t\t\trdxV = min(rdxV, v);\n"); - strb_appends(&ctx->s, "\t\t\tif(v == rdxV){\n"); - appendIdxes (&ctx->s, "\t\t\t\targI = RDXINDEXER(", "i", ctx->ndd, ctx->nds, "", ");\n"); - strb_appends(&ctx->s, "\t\t\t}\n"); + srcbAppends(&ctx->srcGen, "\t\t\trdxK = min(rdxK, k);\n" + "\t\t\tif(rdxK == k){\n" + "\t\t\t\trdxA = rdxIdx;\n" + "\t\t\t}\n"); break; case GA_REDUCE_ARGMAX: case GA_REDUCE_MAXANDARGMAX: - strb_appends(&ctx->s, "\t\t\trdxV = max(rdxV, v);\n"); - strb_appends(&ctx->s, "\t\t\tif(v == rdxV){\n"); - appendIdxes (&ctx->s, "\t\t\t\targI = RDXINDEXER(", "i", ctx->ndd, ctx->nds, "", ");\n"); - strb_appends(&ctx->s, "\t\t\t}\n"); + srcbAppends(&ctx->srcGen, "\t\t\trdxK = max(rdxK, k);\n" + "\t\t\tif(rdxK == k){\n" + "\t\t\t\trdxA = rdxIdx;\n" + "\t\t\t}\n"); break; case GA_REDUCE_AND: - strb_appends(&ctx->s, "\t\t\trdxV &= v;\n"); + srcbAppends(&ctx->srcGen, "\t\t\trdxK &= k;\n"); break; case GA_REDUCE_OR: - strb_appends(&ctx->s, "\t\t\trdxV |= v;\n"); + srcbAppends(&ctx->srcGen, "\t\t\trdxK |= k;\n"); break; case GA_REDUCE_XOR: - strb_appends(&ctx->s, "\t\t\trdxV ^= v;\n"); + srcbAppends(&ctx->srcGen, "\t\t\trdxK ^= k;\n"); break; case GA_REDUCE_ALL: - strb_appends(&ctx->s, "\t\t\trdxV = rdxV && v;\n"); + srcbAppends(&ctx->srcGen, "\t\t\trdxK = rdxK && k;\n"); break; case GA_REDUCE_ANY: - strb_appends(&ctx->s, "\t\t\trdxV = rdxV || v;\n"); + srcbAppends(&ctx->srcGen, "\t\t\trdxK = rdxK || k;\n"); break; } - /** - * 
Inner Loop Trailer Generation - */ - for (i=ctx->ndd;inds;i++){ - strb_appends(&ctx->s, "\t\t}\n"); + srcbAppends(&ctx->srcGen, "\t\t}\n"); } - strb_appends(&ctx->s, "\t\t\n"); + srcbAppends(&ctx->srcGen, "\t\t\n"); /** - * Inner Loop Epilogue Generation + * Large code model: Postscalar transformations go here, coercing the + * K-typed value rdxK to the T-typed value rdxT */ - strb_appends(&ctx->s, "\t\t/**\n"); - strb_appends(&ctx->s, "\t\t * Destination writeback.\n"); - strb_appends(&ctx->s, "\t\t */\n"); - strb_appends(&ctx->s, "\t\t\n"); - if ( reduxKernelRequiresDst (ctx) && !reduxKernelRequiresDstArg(ctx)){ - appendIdxes (&ctx->s, "\t\treduxVal(", "i", 0, ctx->ndd, "", ""); - if (ctx->ndd > 0){ - strb_appends(&ctx->s, ", "); - } - strb_appends(&ctx->s, "dst, dstSteps, rdxV);\n"); - }else if (!reduxKernelRequiresDst (ctx) && reduxKernelRequiresDstArg(ctx)){ - appendIdxes (&ctx->s, "\t\treduxVal(", "i", 0, ctx->ndd, "", ""); - if (ctx->ndd > 0){ - strb_appends(&ctx->s, ", "); - } - strb_appends(&ctx->s, "dstArg, dstArgSteps, argI);\n"); - }else if ( reduxKernelRequiresDst (ctx) && reduxKernelRequiresDstArg(ctx)){ - appendIdxes (&ctx->s, "\t\treduxVal(", "i", 0, ctx->ndd, "", ""); - if (ctx->ndd > 0){ - strb_appends(&ctx->s, ", "); - } - strb_appends(&ctx->s, "dst, dstSteps, rdxV, dstArg, dstArgSteps, argI);\n"); + srcbAppends (&ctx->srcGen, "\t\trdxT = rdxK;\n"); + + /* Final writeback. */ + srcbAppends (&ctx->srcGen, "\t\twriteBackFn("); + srcbBeginList (&ctx->srcGen, ", ", ""); + if (reduxKernelRequiresDst(ctx)){ + srcbAppendElemf(&ctx->srcGen, "&dstVal"); + srcbAppendElemf(&ctx->srcGen, "rdxT"); + } + if (reduxKernelRequiresDstArg(ctx)){ + srcbAppendElemf(&ctx->srcGen, "&dstArgVal"); + srcbAppendElemf(&ctx->srcGen, "rdxA"); + } + srcbEndList (&ctx->srcGen); + srcbAppends (&ctx->srcGen, ");\n"); + + for (i=0;indd;i++){ + srcbAppends(&ctx->srcGen, "\t}\n"); } } -static void reduxAppendLoopMacroUndefs (redux_ctx* ctx){ - strb_appends(&ctx->s, "#undef FOROVER\n"); - strb_appends(&ctx->s, "#undef ESCAPE\n"); - strb_appends(&ctx->s, "#undef RDXINDEXER\n"); +static void reduxAppendInitKernel (redux_ctx* ctx){ + /* BUG: Implement this for small code model. */ +} +static void reduxAppendPostKernel (redux_ctx* ctx){ + /* BUG: Implement this for small code model. */ } /** @@ -1713,43 +1698,43 @@ static int reduxCompile (redux_ctx* ctx){ size_t PRI_TYPECODES_LEN; int* AUX_TYPECODES; size_t AUX_TYPECODES_LEN; - - + + /** * Construct Argument Typecode Lists. */ - - PRI_TYPECODES[i++] = GA_BUFFER; /* src */ + + PRI_TYPECODES[i++] = GA_BUFFER; /* srcPtr */ PRI_TYPECODES[i++] = GA_SIZE; /* srcOff */ PRI_TYPECODES[i++] = GA_BUFFER; /* srcSteps */ PRI_TYPECODES[i++] = GA_BUFFER; /* srcSize */ PRI_TYPECODES[i++] = GA_BUFFER; /* chnkSize */ - if(reduxKernelRequiresDst(ctx)){ - PRI_TYPECODES[i++] = GA_BUFFER; /* dst */ + if (reduxKernelRequiresDst(ctx)){ + PRI_TYPECODES[i++] = GA_BUFFER; /* dstPtr */ PRI_TYPECODES[i++] = GA_SIZE; /* dstOff */ PRI_TYPECODES[i++] = GA_BUFFER; /* dstSteps */ } - if(reduxKernelRequiresDstArg(ctx)){ - PRI_TYPECODES[i++] = GA_BUFFER; /* dstArg */ + if (reduxKernelRequiresDstArg(ctx)){ + PRI_TYPECODES[i++] = GA_BUFFER; /* dstArgPtr */ PRI_TYPECODES[i++] = GA_SIZE; /* dstArgOff */ PRI_TYPECODES[i++] = GA_BUFFER; /* dstArgSteps */ } PRI_TYPECODES_LEN = i; AUX_TYPECODES = &PRI_TYPECODES[3]; AUX_TYPECODES_LEN = PRI_TYPECODES_LEN-3; - - + + /** * Compile the kernels. 
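 *
 * (For illustration, a max-and-argmax reduction needs both dst and dstArg,
 *  so the primary typecode list assembled above is, in order:
 *      GA_BUFFER, GA_SIZE, GA_BUFFER,    srcPtr/srcOff/srcSteps
 *      GA_BUFFER, GA_BUFFER,             srcSize/chunkSize
 *      GA_BUFFER, GA_SIZE, GA_BUFFER,    dstPtr/dstOff/dstSteps
 *      GA_BUFFER, GA_SIZE, GA_BUFFER     dstArgPtr/dstArgOff/dstArgSteps
 *  for a total of 11 arguments, while AUX_TYPECODES aliases the same list
 *  starting at srcSize.)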
*/ - + { ret = GpuKernel_init(&ctx->kernel, ctx->gpuCtx, 1, (const char**)&ctx->sourceCode, &ctx->sourceCodeLen, - "redux", + "reduxKer", PRI_TYPECODES_LEN, PRI_TYPECODES, GA_USE_CLUDA, @@ -1758,13 +1743,13 @@ static int reduxCompile (redux_ctx* ctx){ return reduxCleanup(ctx, ret); } } - if(reduxIsSmallCodeModel(ctx)){ + if (reduxIsSmallCodeModel(ctx)){ ret = GpuKernel_init(&ctx->kernel, ctx->gpuCtx, 1, (const char**)&ctx->sourceCode, &ctx->sourceCodeLen, - "preRedux", + "initKer", AUX_TYPECODES_LEN, AUX_TYPECODES, GA_USE_CLUDA, @@ -1777,7 +1762,7 @@ static int reduxCompile (redux_ctx* ctx){ 1, (const char**)&ctx->sourceCode, &ctx->sourceCodeLen, - "postRedux", + "postKer", AUX_TYPECODES_LEN, AUX_TYPECODES, GA_USE_CLUDA, @@ -1810,8 +1795,8 @@ static int reduxSchedule (redux_ctx* ctx){ size_t warpSize, maxL, maxL0, maxL1, maxL2, maxG, maxG0, maxG1, maxG2; - - + + /** * Obtain the constraints of our problem. */ @@ -1827,14 +1812,14 @@ static int reduxSchedule (redux_ctx* ctx){ gpukernel_property(ctx->kernel.k, GA_KERNEL_PROP_MAXLSIZE, &maxL); maxLgRdx = maxL; maxLgPri = maxLgRdx; - if(reduxIsSmallCodeModel(ctx)){ + if (reduxIsSmallCodeModel(ctx)){ gpukernel_property(ctx->preKernel.k, GA_KERNEL_PROP_MAXLSIZE, &maxL); maxLgPre = maxL; gpukernel_property(ctx->postKernel.k, GA_KERNEL_PROP_MAXLSIZE, &maxL); maxLgPost = maxL; maxLgAux = maxLgPrepri.ndh; maxGs[0] = maxG0; maxGs[1] = maxG1; @@ -1846,18 +1831,18 @@ static int reduxSchedule (redux_ctx* ctx){ for (i=0;isrc->dimensions[ctx->pri.axisList[i]]; } - if(reduxIsSmallCodeModel(ctx)){ + if (reduxIsSmallCodeModel(ctx)){ auxNdims = ctx->aux.ndh; for (i=0;isrc->dimensions[ctx->aux.axisList[i]]; } } - - + + /** * Apply the solver. */ - + { reduxScheduleKernel(priNdims, priDims, @@ -1890,14 +1875,14 @@ static int reduxSchedule (redux_ctx* ctx){ ctx->aux.bs[i] = ctx->aux.gs[i] = ctx->aux.cs[i] = 1; } } - + return reduxInvoke(ctx); } /** * @brief Given the parameters of a kernel scheduling problem, solve it as * optimally as possible. - * + * * NB: This is the only function in this entire file that should have * anything to do with the integer factorization APIs. */ @@ -1919,20 +1904,20 @@ static void reduxScheduleKernel (int ndims, ga_factor_list factBS [MAX_HW_DIMS]; ga_factor_list factGS [MAX_HW_DIMS]; ga_factor_list factCS [MAX_HW_DIMS]; - - + + /** * Quick check for scalar case. */ - + if (ndims <= 0){ return; } - - + + /** * Identify the dimension to which the warp factor will be given. - * + * * The current heuristic is to find the dimension that is either * 1) Evenly divided by the warp size, or * 2) As close to filling the last warp as possible. 
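A minimal sketch of that warp-axis heuristic, assuming only that dims[] holds
the candidate hardware-axis lengths (ndims >= 1, as guaranteed by the scalar
check above) and that warpSize is the device warp width; the tie-breaking
details here are illustrative guesses rather than lifted from this patch:

    #include <stdint.h>

    /* Return the index of the axis that should receive the warp factor:
     * prefer an axis evenly divided by warpSize; failing that, pick the
     * axis whose final, partial warp would be the fullest. */
    static int pickWarpAxis(int ndims, const uint64_t* dims, uint64_t warpSize){
        int      i, best = 0;
        uint64_t bestRem = dims[0] % warpSize;   /* 0 == evenly divided */

        for (i=1;i<ndims;i++){
            uint64_t rem = dims[i] % warpSize;
            if ((rem == 0 && bestRem != 0) ||                 /* Criterion 1 */
                (rem != 0 && bestRem != 0 && rem > bestRem)){ /* Criterion 2 */
                best    = i;
                bestRem = rem;
            }
        }
        return best;
    }

Presumably the chosen axis is then the one whose block-size factor list is
given the warpSize factor before the remaining capacity is distributed; the
exact bookkeeping lives in reduxScheduleKernel and the integer-factoring
helpers.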
@@ -2017,7 +2002,7 @@ static int reduxInvoke (redux_ctx* ctx){ ctx->src->dimensions, flags, 0); ctx->pri.chunkSizeGD = gpudata_alloc(ctx->gpuCtx, ctx->pri.ndh * sizeof(size_t), ctx->pri.cs, flags, 0); - + priArgs[i++] = (void*) ctx->src->data; priArgs[i++] = (void*)&ctx->src->offset; priArgs[i++] = (void*) ctx->srcStepsGD; @@ -2025,21 +2010,21 @@ static int reduxInvoke (redux_ctx* ctx){ priArgs[i++] = (void*) ctx->pri.chunkSizeGD; if (reduxKernelRequiresDst (ctx)){ ctx->dstStepsGD = gpudata_alloc(ctx->gpuCtx, ctx->ndd * sizeof(size_t), - ctx->dst->strides, flags, 0); - priArgs[i++] = (void*) ctx->dst->data; - priArgs[i++] = (void*)&ctx->dst->offset; + ctx->wsDst->strides, flags, 0); + priArgs[i++] = (void*) ctx->wsDst->data; + priArgs[i++] = (void*)&ctx->wsDst->offset; priArgs[i++] = (void*) ctx->dstStepsGD; failedDstSteps = !ctx->dstStepsGD; } if (reduxKernelRequiresDstArg(ctx)){ ctx->dstArgStepsGD = gpudata_alloc(ctx->gpuCtx, ctx->ndd * sizeof(size_t), - ctx->dstArg->strides, flags, 0); - priArgs[i++] = (void*) ctx->dstArg->data; - priArgs[i++] = (void*)&ctx->dstArg->offset; + ctx->wsDstArg->strides, flags, 0); + priArgs[i++] = (void*) ctx->wsDstArg->data; + priArgs[i++] = (void*)&ctx->wsDstArg->offset; priArgs[i++] = (void*) ctx->dstArgStepsGD; failedDstArgSteps = !ctx->dstArgStepsGD; } - if (reduxIsSmallCodeModel(ctx)){ + if (reduxIsSmallCodeModel (ctx)){ /** * The auxiliary kernel's args are identical to the primary kernel's, * except that the first three arguments are deleted and the fifth @@ -2065,7 +2050,7 @@ static int reduxInvoke (redux_ctx* ctx){ !failedDstArgSteps && !failedAuxChunkSize){ /* Pre-kernel invocation, if necessary */ - if(reduxIsSmallCodeModel(ctx)){ + if (reduxIsSmallCodeModel(ctx)){ ret = GpuKernel_call(&ctx->preKernel, ctx->aux.ndh>0 ? ctx->aux.ndh : 1, ctx->aux.gs, @@ -2089,7 +2074,7 @@ static int reduxInvoke (redux_ctx* ctx){ } /* Post-kernel invocation, if necessary */ - if(reduxIsSmallCodeModel(ctx)){ + if (reduxIsSmallCodeModel(ctx)){ ret = GpuKernel_call(&ctx->postKernel, ctx->aux.ndh>0 ? ctx->aux.ndh : 1, ctx->aux.gs, @@ -2112,14 +2097,25 @@ static int reduxInvoke (redux_ctx* ctx){ */ static int reduxCleanup (redux_ctx* ctx, int ret){ + if (ctx->dst != ctx->wsDst){ + GpuArray_clear(ctx->wsDst); + free(ctx->wsDst); + ctx->wsDst = NULL; + } + if (ctx->dstArg != ctx->wsDstArg){ + GpuArray_clear(ctx->wsDstArg); + free(ctx->wsDstArg); + ctx->wsDstArg = NULL; + } + free(ctx->srcAxisList); - free(ctx->dstAxisList); + free(ctx->dstDims); free(ctx->sourceCode); free(ctx->errorString0); free(ctx->errorString1); free(ctx->errorString2); ctx->srcAxisList = NULL; - ctx->dstAxisList = NULL; + ctx->dstDims = NULL; ctx->sourceCode = NULL; ctx->errorString0 = NULL; ctx->errorString1 = NULL; diff --git a/src/util/srcgen.h b/src/util/srcgen.h new file mode 100644 index 0000000000..c577b47c72 --- /dev/null +++ b/src/util/srcgen.h @@ -0,0 +1,106 @@ +/* Include Guards */ +#ifndef SRCGEN_H +#define SRCGEN_H + + +/* Includes */ +#include "util/strb.h" + + +/* Extern "C" Guard */ +#ifdef __cplusplus +extern "C" { +#endif +#ifdef CONFUSE_EMACS +} +#endif + + + +/* Data Structure Prototypes & Typedefs */ +struct srcb; +typedef struct srcb srcb; + + + +/* Enumerations */ +enum srcb_state{ + SRCB_STATE_NONE, + SRCB_STATE_INLIST, +}; +typedef enum srcb_state srcb_state; + + + +/* Data Structures */ + +/** + * @brief The srcb struct + * + * The Source Code Buffer. Augments strb with C-like language generation tools. 
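 *
 * Illustrative usage (not taken from this file), emitting an argument list:
 *
 *     strb sb; strb_init(&sb);
 *     srcb sg; srcbInit(&sg, &sb);
 *     srcbAppends    (&sg, "KERNEL void foo(");
 *     srcbBeginList  (&sg, ", ", "void");
 *     srcbAppendElemf(&sg, "GLOBAL_MEM %s* x", "float");
 *     srcbAppendElemf(&sg, "const X xOff");
 *     srcbEndList    (&sg);
 *     srcbAppends    (&sg, ")");
 *
 * leaves "KERNEL void foo(GLOBAL_MEM float* x, const X xOff)" in sb; had no
 * element been appended between srcbBeginList() and srcbEndList(), the
 * "empty" string ("void") would have been emitted in place of the list.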
+ */ + +struct srcb{ + strb* s; + srcb_state state; + int numElems; + const char* sep; + const char* empty; +}; + + + +/* Functions */ +static inline void srcbInit (srcb* s, strb* sb){ + s->s = sb; + s->state = SRCB_STATE_NONE; + s->numElems = 0; +} +static inline void srcbBeginList(srcb* s, const char* sep, const char* empty){ + s->state = SRCB_STATE_INLIST; + s->numElems = 0; + s->sep = sep; + s->empty = empty; +} +static inline void srcbEndList(srcb* s){ + if(s->numElems == 0){ + strb_appends(s->s, s->empty); + } + + s->state = SRCB_STATE_NONE; + s->numElems = 0; + s->sep = ""; + s->empty = ""; +} +static inline void srcbAppendElemv(srcb* s, const char *f, va_list ap){ + if(s->numElems > 0){ + strb_appends(s->s, s->sep); + } + + strb_appendv(s->s, f, ap); + + s->numElems++; +} +static inline void srcbAppendElemf(srcb* s, const char *f, ...){ + va_list ap; + va_start(ap, f); + srcbAppendElemv(s, f, ap); + va_end(ap); +} +static inline void srcbAppends(srcb* s, const char *f){ + strb_appends(s->s, f); +} +static inline void srcbAppendf(srcb* s, const char *f, ...){ + va_list ap; + va_start(ap, f); + strb_appendv(s->s, f, ap); + va_end(ap); +} + + +/* End Extern "C" Guard */ +#ifdef __cplusplus +} +#endif + +#endif From 8fe9083490e2f65e8e7f25cd1ba72d290879f50c Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Sun, 5 Mar 2017 01:24:21 -0500 Subject: [PATCH 09/34] Added testcases for all reductions. All tests pass, but currently the codegen is locked to the large code model (the small code model has most of the groundwork laid down but has several extra complexities which haven't yet been implemented, like atomic reduction operators. --- tests/check_reduction.c | 3120 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 3086 insertions(+), 34 deletions(-) diff --git a/tests/check_reduction.c b/tests/check_reduction.c index 2d47d6541d..370f074167 100644 --- a/tests/check_reduction.c +++ b/tests/check_reduction.c @@ -23,7 +23,7 @@ void teardown(void); /** * PRNG based on PCG XSH RR 64/32 (LCG) - * + * * Used to generate random data for the kernel tests. */ @@ -44,15 +44,15 @@ static void pcgSeed (uint64_t seed){ } static uint32_t pcgRand (void){ pcgS = pcgS*pcgM + pcgA; - + /** * PCG does something akin to an unbalanced Feistel round to blind the LCG * state: - * + * * The rightmost 59 bits are involved in an xorshift by 18. * The leftmost 5 bits select a rotation of the 32 bits 58:27. */ - + return pcgRor32((pcgS^(pcgS>>18))>>27, pcgS>>59); } static double pcgRand01(void){ @@ -522,6 +522,107 @@ START_TEST(test_minandargmin_reduction){ GpuArray_clear(&gaArgmin); }END_TEST +START_TEST(test_minandargmin_veryhighrank){ + pcgSeed(1); + + /** + * Here we test a reduction of a random 8D tensor on four dimensions. + */ + + size_t i,j,k,l,m,n,o,p; + size_t dims [8] = {1171,373,2,1,2,1,2,1}; + size_t prodDims = dims[0]*dims[1]*dims[2]*dims[3]*dims[4]*dims[5]*dims[6]*dims[7]; + size_t rdxDims[4] = {1171,373,1,2}; + size_t rdxProdDims = rdxDims[0]*rdxDims[1]*rdxDims[2]*rdxDims[3]; + const unsigned reduxList[] = {2,4,7,5}; + + float* pSrc = calloc(1, sizeof(*pSrc) * prodDims); + float* pMin = calloc(1, sizeof(*pMin) * rdxProdDims); + size_t* pArgmin = calloc(1, sizeof(*pArgmin) * rdxProdDims); + + ck_assert_ptr_ne(pSrc, NULL); + ck_assert_ptr_ne(pMin, NULL); + ck_assert_ptr_ne(pArgmin, NULL); + + + /** + * Initialize source data. 
+ */ + + for(i=0;i gtMax){ + gtMax = v; + gtArgmax = i*dims[2] + k; + } + } + } + + ck_assert_msg(gtArgmax == pArgmax[j], "Argmax value mismatch!"); + } + + /** + * Deallocate. + */ + + free(pSrc); + free(pMax); + free(pArgmax); + GpuArray_clear(&gaSrc); + GpuArray_clear(&gaArgmax); +}END_TEST + +START_TEST(test_argmax_veryhighrank){ + pcgSeed(1); + + /** + * Here we test a reduction of a random 8D tensor on four dimensions. + */ + + size_t i,j,k,l,m,n,o,p; + size_t dims [8] = {1171,373,2,1,2,1,2,1}; + size_t prodDims = dims[0]*dims[1]*dims[2]*dims[3]*dims[4]*dims[5]*dims[6]*dims[7]; + size_t rdxDims[4] = {1171,373,1,2}; + size_t rdxProdDims = rdxDims[0]*rdxDims[1]*rdxDims[2]*rdxDims[3]; + const unsigned reduxList[] = {2,4,7,5}; + + float* pSrc = calloc(1, sizeof(*pSrc) * prodDims); + float* pMax = calloc(1, sizeof(*pMax) * rdxProdDims); + size_t* pArgmax = calloc(1, sizeof(*pArgmax) * rdxProdDims); + + ck_assert_ptr_ne(pSrc, NULL); + ck_assert_ptr_ne(pArgmax, NULL); + + + /** + * Initialize source data. + */ + + for(i=0;i gtMax){ + gtMax = v; + gtArgmax = (((k)*dims[4] + m)*dims[7] + p)*dims[5] + n; + } + } + } + } + } + + size_t dstIdx = (((i)*dims[1] + j)*dims[3] + l)*dims[6] + o; + ck_assert_msg(gtArgmax == pArgmax[dstIdx], "Argmax value mismatch!"); } } } } - ck_assert_msg(gtMin == pMin[0], "Min value mismatch!"); /** * Deallocate. */ free(pSrc); - free(pMin); + free(pMax); + free(pArgmax); GpuArray_clear(&gaSrc); - GpuArray_clear(&gaMin); + GpuArray_clear(&gaArgmax); }END_TEST +START_TEST(test_argmax_alldimsreduced){ + pcgSeed(1); -Suite *get_suite(void) { - Suite *s = suite_create("reduction"); - TCase *tc = tcase_create("basic"); - tcase_add_checked_fixture(tc, setup, teardown); - tcase_set_timeout(tc, 15.0); + /** + * We test here a reduction of some random 3D tensor on all dimensions. + */ - tcase_add_test(tc, test_maxandargmax_reduction); - tcase_add_test(tc, test_maxandargmax_idxtranspose); - tcase_add_test(tc, test_maxandargmax_veryhighrank); - tcase_add_test(tc, test_maxandargmax_alldimsreduced); - tcase_add_test(tc, test_minandargmin_reduction); - tcase_add_test(tc, test_minandargmin_alldimsreduced); - tcase_add_test(tc, test_min_alldimsreduced); + size_t i,j,k; + size_t dims[3] = {32,50,79}; + size_t prodDims = dims[0]*dims[1]*dims[2]; + const unsigned reduxList[] = {0,1,2}; + + float* pSrc = calloc(1, sizeof(*pSrc) * dims[0]*dims[1]*dims[2]); + float* pMax = calloc(1, sizeof(*pMax) ); + size_t* pArgmax = calloc(1, sizeof(*pArgmax) ); + + ck_assert_ptr_ne(pSrc, NULL); + ck_assert_ptr_ne(pMax, NULL); + ck_assert_ptr_ne(pArgmax, NULL); + + + /** + * Initialize source data. + */ + + for(i=0;i gtMax){ + gtMax = v; + gtArgmax = (i*dims[1] + j)*dims[2] + k; + } + } + } + } + + ck_assert_msg(gtArgmax == pArgmax[0], "Argmax value mismatch!"); + + /** + * Deallocate. + */ + + free(pSrc); + free(pMax); + free(pArgmax); + GpuArray_clear(&gaSrc); + GpuArray_clear(&gaArgmax); +}END_TEST + +START_TEST(test_argmin_reduction){ + pcgSeed(1); + + /** + * We test here a reduction of some random 3D tensor on the first and + * third dimensions. + */ + + size_t i,j,k; + size_t dims[3] = {32,50,79}; + size_t prodDims = dims[0]*dims[1]*dims[2]; + const unsigned reduxList[] = {0,2}; + + float* pSrc = calloc(1, sizeof(*pSrc) * dims[0]*dims[1]*dims[2]); + float* pMin = calloc(1, sizeof(*pMin) * dims[1] ); + size_t* pArgmin = calloc(1, sizeof(*pArgmin) * dims[1] ); + + ck_assert_ptr_ne(pSrc, NULL); + ck_assert_ptr_ne(pMin, NULL); + ck_assert_ptr_ne(pArgmin, NULL); + + + /** + * Initialize source data. 
+ */ + + for(i=0;i gtMax){ + gtMax = v; + } + } + } + + ck_assert_msg(gtMax == pMax[j], "Max value mismatch!"); + } + + /** + * Deallocate. + */ + + free(pSrc); + free(pMax); + GpuArray_clear(&gaSrc); + GpuArray_clear(&gaMax); +}END_TEST + +START_TEST(test_max_veryhighrank){ + pcgSeed(1); + + /** + * Here we test a reduction of a random 8D tensor on four dimensions. + */ + + size_t i,j,k,l,m,n,o,p; + size_t dims [8] = {1171,373,2,1,2,1,2,1}; + size_t prodDims = dims[0]*dims[1]*dims[2]*dims[3]*dims[4]*dims[5]*dims[6]*dims[7]; + size_t rdxDims[4] = {1171,373,1,2}; + size_t rdxProdDims = rdxDims[0]*rdxDims[1]*rdxDims[2]*rdxDims[3]; + const unsigned reduxList[] = {2,4,7,5}; + + float* pSrc = calloc(1, sizeof(*pSrc) * prodDims); + float* pMax = calloc(1, sizeof(*pMax) * rdxProdDims); + + ck_assert_ptr_ne(pSrc, NULL); + ck_assert_ptr_ne(pMax, NULL); + + + /** + * Initialize source data. + */ + + for(i=0;i gtMax){ + gtMax = v; + } + } + } + } + } + + size_t dstIdx = (((i)*dims[1] + j)*dims[3] + l)*dims[6] + o; + ck_assert_msg(gtMax == pMax[dstIdx], "Max value mismatch!"); + } + } + } + } + + + /** + * Deallocate. + */ + + free(pSrc); + free(pMax); + GpuArray_clear(&gaSrc); + GpuArray_clear(&gaMax); +}END_TEST + +START_TEST(test_max_alldimsreduced){ + pcgSeed(1); + + /** + * We test here a reduction of some random 3D tensor on all dimensions. + */ + + size_t i,j,k; + size_t dims[3] = {32,50,79}; + size_t prodDims = dims[0]*dims[1]*dims[2]; + const unsigned reduxList[] = {0,1,2}; + + float* pSrc = calloc(1, sizeof(*pSrc) * dims[0]*dims[1]*dims[2]); + float* pMax = calloc(1, sizeof(*pMax) ); + + ck_assert_ptr_ne(pSrc, NULL); + ck_assert_ptr_ne(pMax, NULL); + + + /** + * axitialize source data. + */ + + for(i=0;i gtMax){ + gtMax = v; + } + } + } + } + + ck_assert_msg(gtMax == pMax[0], "Max value mismatch!"); + + /** + * Deallocate. + */ + + free(pSrc); + free(pMax); + GpuArray_clear(&gaSrc); + GpuArray_clear(&gaMax); +}END_TEST + +START_TEST(test_min_reduction){ + pcgSeed(1); + + /** + * We test here a reduction of some random 3D tensor on all dimensions. + */ + + size_t i,j,k; + size_t dims[3] = {32,50,79}; + size_t prodDims = dims[0]*dims[1]*dims[2]; + const unsigned reduxList[] = {0,2}; + + float* pSrc = calloc(1, sizeof(*pSrc) * dims[0]*dims[1]*dims[2]); + float* pMin = calloc(1, sizeof(*pMin) * dims[1] ); + + ck_assert_ptr_ne(pSrc, NULL); + ck_assert_ptr_ne(pMin, NULL); + + + /** + * Initialize source data. + */ + + for(i=0;i 0.05; + } + + + /** + * Run the kernel. + */ + + GpuArray gaS; + GpuArray gaD; + + ga_assert_ok(GpuArray_empty (&gaS, ctx, GA_UINT, 3, &dims[0], GA_C_ORDER)); + ga_assert_ok(GpuArray_empty (&gaD, ctx, GA_UINT, 1, &dims[1], GA_C_ORDER)); + + ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); + ga_assert_ok(GpuArray_memset(&gaD, -1)); + + ga_assert_ok(GpuArray_all (&gaD, &gaS, 2, reduxList)); + + ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*dims[1], &gaD)); + + + /** + * Check that the destination tensors are correct. + */ + + for(j=0;j 0.05; + } + + + /** + * Run the kernel. + */ + + GpuArray gaS; + GpuArray gaD; + + ga_assert_ok(GpuArray_empty (&gaS, ctx, GA_UINT, 8, dims, GA_C_ORDER)); + ga_assert_ok(GpuArray_empty (&gaD, ctx, GA_UINT, 4, rdxDims, GA_C_ORDER)); + + ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); + ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. 
*/ + + ga_assert_ok(GpuArray_all (&gaD, &gaS, 4, reduxList)); + + ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*rdxProdDims, &gaD)); + + + /** + * Check that the destination tensors are correct. + */ + + for(i=0;i 0.05; + } + + + /** + * Run the kernel. + */ + + GpuArray gaS; + GpuArray gaD; + + ga_assert_ok(GpuArray_empty (&gaS, ctx, GA_UINT, 3, &dims[0], GA_C_ORDER)); + ga_assert_ok(GpuArray_empty (&gaD, ctx, GA_UINT, 0, NULL, GA_C_ORDER)); + + ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); + ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ + + ga_assert_ok(GpuArray_all (&gaD, &gaS, 3, reduxList)); + + ga_assert_ok(GpuArray_read (pD, sizeof(*pD), &gaD)); + + + /** + * Check that the destination tensors are correct. + */ + + uint32_t gtD = 1; + + for(i=0;i Date: Sun, 5 Mar 2017 01:43:37 -0500 Subject: [PATCH 10/34] Muzzle incorrect GCC maybe-uninitialized diagnostic. Clang and MSVC correctly recognize that all paths to the allegedly- uninitialized variables are, in fact, dominated by their initialization. --- src/gpuarray_reduction.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/gpuarray_reduction.c b/src/gpuarray_reduction.c index 61f1688a4f..5bab3a65f7 100644 --- a/src/gpuarray_reduction.c +++ b/src/gpuarray_reduction.c @@ -1781,9 +1781,9 @@ static int reduxCompile (redux_ctx* ctx){ */ static int reduxSchedule (redux_ctx* ctx){ - int i, priNdims, auxNdims; - uint64_t maxLgRdx, maxLgPre, maxLgPost; - uint64_t maxLgPri, maxLgAux; + int i, priNdims = 0, auxNdims = 0; + uint64_t maxLgRdx = 0, maxLgPre = 0, maxLgPost = 0; + uint64_t maxLgPri = 0, maxLgAux = 0; uint64_t maxLs [MAX_HW_DIMS]; uint64_t maxGg; uint64_t maxGs [MAX_HW_DIMS]; From 19bd9390937e394b2fa264ea3a37e05aa981bd1c Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Mon, 15 May 2017 14:43:26 -0400 Subject: [PATCH 11/34] Current State --- src/gpuarray/reduction.h | 91 +- src/gpuarray_reduction.c | 1829 ++++++++++++++++++++++++++------------ tests/check_reduction.c | 86 +- 3 files changed, 1310 insertions(+), 696 deletions(-) diff --git a/src/gpuarray/reduction.h b/src/gpuarray/reduction.h index 1db5664535..f6638c9a83 100644 --- a/src/gpuarray/reduction.h +++ b/src/gpuarray/reduction.h @@ -26,28 +26,26 @@ extern "C" { */ typedef enum _ga_reduce_op { - GA_REDUCE_SUM, /* + */ - GA_REDUCE_PROD, /* * */ - GA_REDUCE_PRODNZ, /* * (!=0) */ - GA_REDUCE_MIN, /* min() */ - GA_REDUCE_MAX, /* max() */ - GA_REDUCE_ARGMIN, /* argmin() */ - GA_REDUCE_ARGMAX, /* argmax() */ - GA_REDUCE_MINANDARGMIN, /* min(), argmin() */ - GA_REDUCE_MAXANDARGMAX, /* max(), argmax() */ - GA_REDUCE_AND, /* & */ - GA_REDUCE_OR, /* | */ - GA_REDUCE_XOR, /* ^ */ - GA_REDUCE_ALL, /* &&/all() */ - GA_REDUCE_ANY, /* ||/any() */ + GA_REDUCE_SUM, /* + */ + GA_REDUCE_PROD, /* * */ + GA_REDUCE_PRODNZ, /* * (!=0) */ + GA_REDUCE_MIN, /* min() */ + GA_REDUCE_MAX, /* max() */ + GA_REDUCE_ARGMIN, /* argmin() */ + GA_REDUCE_ARGMAX, /* argmax() */ + GA_REDUCE_MINANDARGMIN, /* min(), argmin() */ + GA_REDUCE_MAXANDARGMAX, /* max(), argmax() */ + GA_REDUCE_AND, /* & */ + GA_REDUCE_OR, /* | */ + GA_REDUCE_XOR, /* ^ */ + GA_REDUCE_ALL, /* &&/all() */ + GA_REDUCE_ANY, /* ||/any() */ } ga_reduce_op; /** - * @brief Compute a reduction sum (+), product (*), non-zero product (* != 0), - * min, max, argmin, argmax, min-and-argmin, max-and-argmax, and (&), - * or (|), xor (^), all (&&) or any (||) over a list of axes to reduce. + * @brief Compute a reduction over a list of axes to reduce. 
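 *
 * For illustration, a max-and-argmax over axes 0 and 2 of a 3-dimensional
 * source would be requested roughly as follows (error handling omitted):
 *
 *     const unsigned reduxList[] = {0, 2};
 *     GpuArray_reduction(GA_REDUCE_MAXANDARGMAX, &gaMax, &gaArgmax,
 *                        &gaSrc, 2, reduxList);
 *
 * where gaMax and gaArgmax have been allocated with the source's shape minus
 * the reduced axes (here, a 1-D tensor of length src.dimensions[1]).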
* * Returns one (in the case of min-and-argmin/max-and-argmax, two) destination * tensors. The destination tensor(s)' axes are a strict subset of the axes of the @@ -55,6 +53,7 @@ typedef enum _ga_reduce_op { * reduction is performed over these axes, which are then removed in the * destination. * + * @param [in] op The reduction operation to perform. * @param [out] dst The destination tensor. Has the same type as the source. * @param [out] dstArg For argument of minima/maxima operations. Has type int64. * @param [in] src The source tensor. @@ -81,64 +80,6 @@ typedef enum _ga_reduce_op { * code otherwise. */ -GPUARRAY_PUBLIC int GpuArray_sum (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); -GPUARRAY_PUBLIC int GpuArray_prod (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); -GPUARRAY_PUBLIC int GpuArray_prodnz (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); -GPUARRAY_PUBLIC int GpuArray_min (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); -GPUARRAY_PUBLIC int GpuArray_max (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); -GPUARRAY_PUBLIC int GpuArray_argmin (GpuArray* dstArg, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); -GPUARRAY_PUBLIC int GpuArray_argmax (GpuArray* dstArg, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); -GPUARRAY_PUBLIC int GpuArray_minandargmin(GpuArray* dst, - GpuArray* dstArg, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); -GPUARRAY_PUBLIC int GpuArray_maxandargmax(GpuArray* dst, - GpuArray* dstArg, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); -GPUARRAY_PUBLIC int GpuArray_and (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); -GPUARRAY_PUBLIC int GpuArray_or (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); -GPUARRAY_PUBLIC int GpuArray_xor (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); -GPUARRAY_PUBLIC int GpuArray_all (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); -GPUARRAY_PUBLIC int GpuArray_any (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); GPUARRAY_PUBLIC int GpuArray_reduction (ga_reduce_op op, GpuArray* dst, GpuArray* dstArg, diff --git a/src/gpuarray_reduction.c b/src/gpuarray_reduction.c index 5bab3a65f7..072f1e2685 100644 --- a/src/gpuarray_reduction.c +++ b/src/gpuarray_reduction.c @@ -6,6 +6,7 @@ #include #include #include +#include #include "gpuarray/config.h" #include #include @@ -33,47 +34,96 @@ /* Datatypes */ +/** + * @brief Axis Description. + */ + +struct axis_desc{ + int reduxNum; + unsigned isReduced : 1; + unsigned isHW : 1; + unsigned isSW : 1; + size_t warpLen; + size_t len; + ssize_t srcStride, srcOffset; + ssize_t dstStride, dstOffset; + ssize_t dstArgStride, dstArgOffset; + ssize_t tmpDstStride, tmpDstOffset; + ssize_t tmpDstArgStride, tmpDstArgOffset; +}; +typedef struct axis_desc axis_desc; + /** * Reduction Kernel Generator. - * - * The generator produces a kernel from one of two "code models": - * - Large - * - Small - * Which one is used depends on the size of the destination tensor and the - * number of reductions for each destination element. 
A destination tensor - * with more than SMALL_REDUX_THRESHOLD elements or more elements than - * reductions for each element will result in use of the large code model; - * Otherwise the small code model is used. - * - * - * LARGE CODE MODEL: - * - * In the large code model, each destination element is processed by a - * single thread. - * - * Each thread begins with an initial value in a register, reads from all - * source elements contributing to the reduction, computes the result and - * writes it to the destination element. - * - * A single kernel is generated that performs prescalar transformations, the - * reduction itself, postscalar transformations and the write to global memory. - * - * - * SMALL CODE MODEL: - * - * In the small code model, each destination element is processed by - * multiple threads. - * - * The destination tensor is first initialized with the initial value. Then, - * one several threads cooperate to perform the reduction atomically on each - * destination element. Lastly, postscalar transformations are applied - * in-place. - * - * Two or three kernels are generated: The initialization kernel, the main - * kernel that performs prescalar transformations and the reduction itself, and - * possibly also a postscalar transformation kernel when it is required. - * - * + * + * INTRO + * + * Generates the source code for a reduction kernel over arbitrarily-dimensioned, + * -shaped and -typed tensors. + * + * + * GOALS + * + * The generator has the following goals: + * + * 1. Maximizing the use of coalesced memory loads within a warp. + * 2. Maximizing the # of useful threads within a warp. + * 3. Maximizing the number of warps within a block. + * + * NOTE: It is possible to guarantee for any tensor problem of at least + * 2*WARP_SIZE in scale that either + * 1. All warp blocks in the X dimension have more than 50% threads + * active 100% of the time, or + * 2. The warp blocks in the X dimension have 100% threads active more + * than 50% of the time. + * + * 4. Ensuring there are no more blocks than are permitted by the warp + * configuration and 2nd-stage workspace size (if required). + * 5. Ensuring there are no more than 5 blocks per multiprocessor. + * 6. Minimizing the 2nd-stage workspace (if it is required). + * 7. Striding the 2nd-stage workspace for maximum convenience (if it is + * required). Make it contiguous. + * + * + * NOTES + * + * Information elements required to perform reduction. + * + * 1. Ndim, shape and dtype of src tensor + * 2. Ndim, shape and dtype of dst/dstArg tensors + * 3. GPU context + * 4. Number of processors + * 5. Warp size + * 6. Maximum size of block + * 7. Maximum size of block dimension X, Y, Z + * 8. Maximum size of grid + * 9. Maximum size of grid dimension X, Y, Z + * 10. Dtype and initializer of accumulator + * 11. Sorted src axes for contiguous memory accesses + * 12. Ndim, shape and dtype of flattened src tensor + * 13. Number of stages (1 or 2) + * 14. Ndim, shape and dtype of workspace tensor + * 15. Warp axes + * 16. Hardware axes + * 17. Software axes + * 18. Source code + * + * Rationale for dependencies: + * + * 1) Get the GPU context and its properties immediately, since an invalid + * context is a likely error and we want to fail fast. + * 2) The type and initializer of the accumulator should be determined after + * the context's properties have been retrieved since they provide + * information about the device's natively-supported types and operations. 
+ * + * REFERENCES + * + * http://lpgpu.org/wp/wp-content/uploads/2013/05/poster_andresch_acaces2014.pdf + * + * + * + * + * * Kernel Template: * * The following kernel code template displays the code generated for the @@ -200,11 +250,41 @@ struct redux_ctx{ const int* reduxList; /* General. */ + int nds; /* # Source dimensions */ + int ndr; /* # Reduced dimensions */ + int ndd; /* # Destination dimensions */ + int ndw; /* # Warp dimensions */ + int ndp; /* # Partial warp dimensions */ + int ndf; /* # Flattened source dimensions */ + int ndt; /* # Temporary workspace dimensions */ + int zeroAllAxes; /* # of zero-length axes in source tensor */ + int zeroRdxAxes; /* # of zero-length reduction axes in source tensor */ + size_t prodAllAxes; /* Product of length of all axes in source tensor */ + size_t prodRdxAxes; /* Product of length of all reduction axes in source tensor */ + size_t prodFreeAxes; /* Product of length of all free axes in source tensor */ + size_t prodWarpAxes; /* Number of active threads per warp. Strictly <= warpSize. */ + int splitWarpAxis;/* Index of the split warp axis within the source tensor's shape; -1 otherwise. */ + + gpucontext* gpuCtx; + unsigned numProcs; + size_t warpSize; + size_t maxLg; + size_t maxLs[MAX_HW_DIMS]; + size_t maxGg; + size_t maxGs[MAX_HW_DIMS]; + + axis_desc* xdSrc; + axis_desc* xdSrcFlat; + axis_desc* xdTmp; + + axis_desc** xdSrcPtrs; + + int numStages; + GpuArray* wsDst; GpuArray* wsDstArg; int* srcAxisList; size_t* dstDims; - gpucontext* gpuCtx; /* Source code Generator. */ int srcTypeCode; @@ -219,9 +299,6 @@ struct redux_ctx{ const char* accTypeStr; const char* initValT; const char* initValK; - int ndd; - int ndr; - int nds; int largeCodeModel; strb s; srcb srcGen; @@ -269,187 +346,134 @@ typedef struct redux_ctx redux_ctx; -/* Function prototypes */ -static int reduxGetSumInit (int typecode, const char** property); -static int reduxGetProdInit (int typecode, const char** property); -static int reduxGetMinInit (int typecode, const char** property); -static int reduxGetMaxInit (int typecode, const char** property); -static int reduxGetAndInit (int typecode, const char** property); -static int reduxGetOrInit (int typecode, const char** property); -static int axisInSet (int v, - const int* set, - size_t setLen, - size_t* where); -static void appendIdxes (strb* s, - const char* prologue, - const char* prefix, - int startIdx, - int endIdx, - const char* suffix, - const char* epilogue); -static int reduxCheckargs (redux_ctx* ctx); -static void reduxSelectTypes (redux_ctx* ctx); -static int reduxIsSmallCodeModel (redux_ctx* ctx); -static int reduxIsLargeCodeModel (redux_ctx* ctx); -static int reduxRequiresDst (redux_ctx* ctx); -static int reduxRequiresDstArg (redux_ctx* ctx); -static int reduxKernelRequiresDst (redux_ctx* ctx); -static int reduxKernelRequiresDstArg (redux_ctx* ctx); -static int reduxCanAppendHwAxis (redux_ctx* ctx, - int kernelType, - int axisType); -static void reduxAppendLargestAxisToHwList(redux_ctx* ctx, - int kernelType, - int axisType); -static int reduxSelectHwAxes (redux_ctx* ctx); -static int reduxComputeAxisList (redux_ctx* ctx); -static int reduxGenSource (redux_ctx* ctx); -static void reduxAppendSource (redux_ctx* ctx); -static void reduxAppendIncludes (redux_ctx* ctx); -static void reduxAppendTensorDeclArgs (redux_ctx* ctx, - const char* type, - const char* baseName); -static void reduxAppendTensorCallArgs (redux_ctx* ctx, - const char* baseName); -static void reduxAppendMacroDefs (redux_ctx* ctx); -static void 
reduxAppendTypedefs (redux_ctx* ctx); -static void reduxAppendGetInitValFns (redux_ctx* ctx); -static void reduxAppendWriteBackFn (redux_ctx* ctx); -static void reduxAppendReduxKernel (redux_ctx* ctx); -static void reduxAppendPrototype (redux_ctx* ctx); -static void reduxAppendIndexDeclarations (redux_ctx* ctx); -static void reduxAppendRangeCalculations (redux_ctx* ctx); -static void reduxAppendLoops (redux_ctx* ctx); -static void reduxAppendInitKernel (redux_ctx* ctx); -static void reduxAppendPostKernel (redux_ctx* ctx); -static int reduxCompile (redux_ctx* ctx); -static int reduxSchedule (redux_ctx* ctx); -static void reduxScheduleKernel (int ndims, - uint64_t* dims, - uint64_t warpSize, - uint64_t maxLg, - uint64_t* maxLs, - uint64_t maxGg, - uint64_t* maxGs, - uint64_t* bs, - uint64_t* gs, - uint64_t* cs); -static int reduxInvoke (redux_ctx* ctx); -static int reduxCleanup (redux_ctx* ctx, int ret); +/* Static Function prototypes */ +/* Utilities */ +static int reduxGetSumInit (int typecode, const char** property); +static int reduxGetProdInit (int typecode, const char** property); +static int reduxGetMinInit (int typecode, const char** property); +static int reduxGetMaxInit (int typecode, const char** property); +static int reduxGetAndInit (int typecode, const char** property); +static int reduxGetOrInit (int typecode, const char** property); +static int reduxSortFlatSensitive (const void* a, const void* b); +static int reduxSortFlatInsensitive (const void* a, const void* b); +static int reduxSortWarp (const void* a, const void* b); +static int axisInSet (int v, + const int* set, + size_t setLen, + size_t* where); +static void appendIdxes (strb* s, + const char* prologue, + const char* prefix, + int startIdx, + int endIdx, + const char* suffix, + const char* epilogue); + +/* Axis Description API */ +static void axisInit (axis_desc* axis, + ssize_t len, + ssize_t srcStride); +static void axisMarkReduced (axis_desc* axis, int reduxNum); +static void axisMarkWarp (axis_desc* axis, size_t partialSlice); +static int axisGetReduxNum (const axis_desc* axis); +static size_t axisGetLen (const axis_desc* axis); +static ssize_t axisGetSrcStride (const axis_desc* axis); +static size_t axisGetSrcAbsStride (const axis_desc* axis); +static ssize_t axisGetSrcOffset (const axis_desc* axis); +static ssize_t axisGetDstStride (const axis_desc* axis); +static size_t axisGetDstAbsStride (const axis_desc* axis); +static ssize_t axisGetDstOffset (const axis_desc* axis); +static ssize_t axisGetDstArgStride (const axis_desc* axis); +static size_t axisGetDstArgAbsStride (const axis_desc* axis); +static ssize_t axisGetDstArgOffset (const axis_desc* axis); +static int axisIsReduced (const axis_desc* axis); +static int axisIsWarp (const axis_desc* axis); +static int axisIsPartialWarp (const axis_desc* axis); + +/* Reduction Context API */ +/* Utilities */ +static int reduxRequiresDst (const redux_ctx* ctx); +static int reduxRequiresDstArg (const redux_ctx* ctx); +static int reduxKernelRequiresDst (const redux_ctx* ctx); +static int reduxKernelRequiresDstArg (const redux_ctx* ctx); +static int reduxIsSensitive (const redux_ctx* ctx); +static int reduxIsSmallCodeModel (const redux_ctx* ctx); +static int reduxIsLargeCodeModel (const redux_ctx* ctx); +static axis_desc* reduxGetSrcAxis (const redux_ctx* ctx, int i); +static axis_desc* reduxGetSrcSortAxis (const redux_ctx* ctx, int i); +static axis_desc* reduxGetSrcFlatAxis (const redux_ctx* ctx, int i); +static int reduxTryFlattenInto (const redux_ctx* ctx, + axis_desc* 
into, + const axis_desc* from); +static int reduxCanAppendHwAxis (redux_ctx* ctx, + int kernelType, + int axisType); +static void reduxAppendLargestAxisToHwList(redux_ctx* ctx, + int kernelType, + int axisType); +/* Control Flow */ +static int reduxInit (redux_ctx* ctx); +static int reduxInferProperties (redux_ctx* ctx); +static int reduxFlattenSource (redux_ctx* ctx); +static int reduxSelectWarpAxes (redux_ctx* ctx); +static int reduxSelectNumStages (redux_ctx* ctx); +static int reduxSelectHwAxes (redux_ctx* ctx); +static int reduxComputeAxisList (redux_ctx* ctx); +static int reduxGenSource (redux_ctx* ctx); +static void reduxAppendSource (redux_ctx* ctx); +static void reduxAppendIncludes (redux_ctx* ctx); +static void reduxAppendTensorDeclArgs (redux_ctx* ctx, + const char* type, + const char* baseName); +static void reduxAppendTensorCallArgs (redux_ctx* ctx, + const char* baseName); +static void reduxAppendMacroDefs (redux_ctx* ctx); +static void reduxAppendTypedefs (redux_ctx* ctx); +static void reduxAppendGetInitValFns (redux_ctx* ctx); +static void reduxAppendWriteBackFn (redux_ctx* ctx); +static void reduxAppendReduxKernel (redux_ctx* ctx); +static void reduxAppendPrototype (redux_ctx* ctx); +static void reduxAppendIndexDeclarations (redux_ctx* ctx); +static void reduxAppendRangeCalculations (redux_ctx* ctx); +static void reduxAppendLoops (redux_ctx* ctx); +static void reduxAppendInitKernel (redux_ctx* ctx); +static void reduxAppendPostKernel (redux_ctx* ctx); +static int reduxCompile (redux_ctx* ctx); +static int reduxSchedule (redux_ctx* ctx); +static void reduxScheduleKernel (int ndims, + uint64_t* dims, + uint64_t warpSize, + uint64_t maxLg, + uint64_t* maxLs, + uint64_t maxGg, + uint64_t* maxGs, + uint64_t* bs, + uint64_t* gs, + uint64_t* cs); +static int reduxInvoke (redux_ctx* ctx); +static int reduxCleanup (redux_ctx* ctx, int ret); +static int reduxCleanupMsg (redux_ctx* ctx, int ret, + const char* fmt, ...); /* Function implementation */ -GPUARRAY_PUBLIC int GpuArray_sum (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList){ - return GpuArray_reduction(GA_REDUCE_SUM, - dst, NULL, src, reduxLen, reduxList); -} -GPUARRAY_PUBLIC int GpuArray_prod (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList){ - return GpuArray_reduction(GA_REDUCE_PROD, - dst, NULL, src, reduxLen, reduxList); -} -GPUARRAY_PUBLIC int GpuArray_prodnz (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList){ - return GpuArray_reduction(GA_REDUCE_PRODNZ, - dst, NULL, src, reduxLen, reduxList); -} -GPUARRAY_PUBLIC int GpuArray_min (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList){ - return GpuArray_reduction(GA_REDUCE_MIN, - dst, NULL, src, reduxLen, reduxList); -} -GPUARRAY_PUBLIC int GpuArray_max (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList){ - return GpuArray_reduction(GA_REDUCE_MAX, - dst, NULL, src, reduxLen, reduxList); -} -GPUARRAY_PUBLIC int GpuArray_argmin (GpuArray* dstArg, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList){ - return GpuArray_reduction(GA_REDUCE_ARGMIN, - NULL, dstArg, src, reduxLen, reduxList); -} -GPUARRAY_PUBLIC int GpuArray_argmax (GpuArray* dstArg, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList){ - return GpuArray_reduction(GA_REDUCE_ARGMAX, - NULL, dstArg, src, reduxLen, reduxList); -} -GPUARRAY_PUBLIC int GpuArray_minandargmin(GpuArray* dst, - 
GpuArray* dstArg, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList){ - return GpuArray_reduction(GA_REDUCE_MINANDARGMIN, - dst, dstArg, src, reduxLen, reduxList); -} -GPUARRAY_PUBLIC int GpuArray_maxandargmax(GpuArray* dst, - GpuArray* dstArg, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList){ - return GpuArray_reduction(GA_REDUCE_MAXANDARGMAX, - dst, dstArg, src, reduxLen, reduxList); -} -GPUARRAY_PUBLIC int GpuArray_and (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList){ - return GpuArray_reduction(GA_REDUCE_AND, - dst, NULL, src, reduxLen, reduxList); -} -GPUARRAY_PUBLIC int GpuArray_or (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList){ - return GpuArray_reduction(GA_REDUCE_OR, - dst, NULL, src, reduxLen, reduxList); -} -GPUARRAY_PUBLIC int GpuArray_xor (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList){ - return GpuArray_reduction(GA_REDUCE_XOR, - dst, NULL, src, reduxLen, reduxList); -} -GPUARRAY_PUBLIC int GpuArray_all (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList){ - return GpuArray_reduction(GA_REDUCE_ALL, - dst, NULL, src, reduxLen, reduxList); -} -GPUARRAY_PUBLIC int GpuArray_any (GpuArray* dst, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList){ - return GpuArray_reduction(GA_REDUCE_ANY, - dst, NULL, src, reduxLen, reduxList); -} GPUARRAY_PUBLIC int GpuArray_reduction (ga_reduce_op op, GpuArray* dst, GpuArray* dstArg, const GpuArray* src, unsigned reduxLen, const unsigned* reduxList){ - redux_ctx ctxSTACK = {op, dst, dstArg, src, - (int)reduxLen, (const int*)reduxList}; - redux_ctx *ctx = &ctxSTACK; + redux_ctx ctxSTACK, *ctx = &ctxSTACK; + memset(ctx, 0, sizeof(*ctx)); - return reduxCheckargs(ctx); + ctx->op = op; + ctx->dst = dst; + ctx->dstArg = dstArg; + ctx->src = src; + ctx->reduxLen = reduxLen; + ctx->reduxList = (const int*)reduxList; + + return reduxInit(ctx); } /** @@ -463,7 +487,7 @@ GPUARRAY_PUBLIC int GpuArray_reduction (ga_reduce_op op, * @return Zero if successful; Non-zero if the datatype is not supported. */ -static int reduxGetSumInit (int typecode, const char** property){ +static int reduxGetSumInit (int typecode, const char** property){ if (typecode == GA_POINTER || typecode == GA_BUFFER){ return GA_UNSUPPORTED_ERROR; @@ -483,7 +507,7 @@ static int reduxGetSumInit (int typecode, const char** property) * @return Zero if successful; Non-zero if the datatype is not supported. */ -static int reduxGetProdInit (int typecode, const char** property){ +static int reduxGetProdInit (int typecode, const char** property){ if (typecode == GA_POINTER || typecode == GA_BUFFER){ return GA_UNSUPPORTED_ERROR; @@ -503,7 +527,7 @@ static int reduxGetProdInit (int typecode, const char** property) * @return Zero if successful; Non-zero if the datatype is not supported. */ -static int reduxGetMinInit (int typecode, const char** property){ +static int reduxGetMinInit (int typecode, const char** property){ switch (typecode){ case GA_BYTE2: case GA_BYTE3: @@ -593,7 +617,7 @@ static int reduxGetMinInit (int typecode, const char** property) * @return Zero if successful; Non-zero if the datatype is not supported. 
*/ -static int reduxGetMaxInit (int typecode, const char** property){ +static int reduxGetMaxInit (int typecode, const char** property){ switch (typecode){ case GA_BOOL: *property = "1"; @@ -692,7 +716,7 @@ static int reduxGetMaxInit (int typecode, const char** property) * @return Zero if successful; Non-zero if the datatype is not supported. */ -static int reduxGetAndInit (int typecode, const char** property){ +static int reduxGetAndInit (int typecode, const char** property){ if (typecode == GA_POINTER || typecode == GA_BUFFER){ return GA_UNSUPPORTED_ERROR; @@ -712,7 +736,7 @@ static int reduxGetAndInit (int typecode, const char** property) * @return Zero if successful; Non-zero if the datatype is not supported. */ -static int reduxGetOrInit (int typecode, const char** property){ +static int reduxGetOrInit (int typecode, const char** property){ if (typecode == GA_POINTER || typecode == GA_BUFFER){ return GA_UNSUPPORTED_ERROR; @@ -721,6 +745,110 @@ static int reduxGetOrInit (int typecode, const char** property) return GA_NO_ERROR; } +/** + * @brief Sort the axes into optimal order for flattening. + * + * Two orderings exist: "Sensitive" and "Insensitive", for reductions that are + * sensitive (or not) to indexing. + * + * In all cases: + * + * 1. Free axes are sorted before reduction axes. + * 2. Free axes are sorted by decreasing absolute stride. + * 3. then by increasing source axis number. + * + * In the sensitive case: + * + * 4. Reduction axes are sorted by their position in reduxList. + * + * In the insensitive case: + * + * 4. Reduction axes are sorted by decreasing absolute stride. + * 5. then by increasing source axis number. + */ + +static int reduxSortFlatInsensitive (const void* a, const void* b){ + const axis_desc* xda = (const axis_desc*)a; + const axis_desc* xdb = (const axis_desc*)b; + + if ( axisIsReduced(xda) && !axisIsReduced(xdb)){ + return +1; + }else if (!axisIsReduced(xda) && axisIsReduced(xdb)){ + return -1; + } + + if (axisGetSrcAbsStride(xda) < axisGetSrcAbsStride(xdb)){ + return +1; + }else if (axisGetSrcAbsStride(xda) > axisGetSrcAbsStride(xdb)){ + return -1; + } + + return 0; +} +static int reduxSortFlatSensitive (const void* a, const void* b){ + const axis_desc* xda = (const axis_desc*)a; + const axis_desc* xdb = (const axis_desc*)b; + + if ( axisIsReduced(xda) && !axisIsReduced(xdb)){ + return +1; + }else if (!axisIsReduced(xda) && axisIsReduced(xdb)){ + return -1; + } + + if (axisIsReduced(xda)){ + return axisGetReduxNum(xda) < axisGetReduxNum(xdb) ? -1 : +1; + }else{ + if (axisGetSrcAbsStride(xda) < axisGetSrcAbsStride(xdb)){ + return +1; + }else if (axisGetSrcAbsStride(xda) > axisGetSrcAbsStride(xdb)){ + return -1; + } + + return 0; + } +} + +/** + * @brief Sort axes in preferred order for integration into warp. + * + * The axes with stride != 0 are sorted by lowest absolute + * stride. Picking the few axes with the lowest absolute stride (while + * keeping the product of their dimensions <= warpSize) should maximize + * memory bandwidth of the warp. + * + * The restriction stride != 0 is intended to avoid waste of memory + * bandwidth. Once a memory transaction is necessary, it typically operates at + * far greater granularity than just 32 bits (4 bytes). + * + * Sorting by absolute stride should result, in the case of a packed tensor, in + * the memory accesses being close to perfectly contiguous. 
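+ *
+ * Illustrative example (contrived, not from any test): for a packed
+ * C-contiguous float32 tensor of shape (7,5,32), the strides in bytes are
+ * (640,128,4). The length-32 axis (absolute stride 4) therefore sorts first
+ * and on its own already fills a 32-thread warp, while a broadcast axis of
+ * stride 0 would sort last.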
+ */ + +static int reduxSortWarp (const void* a, const void* b){ + const axis_desc* xda = *(const axis_desc* const *)a; + const axis_desc* xdb = *(const axis_desc* const *)b; + + if ( axisGetSrcStride(xda) && !axisGetSrcStride(xdb)){ + return -1; + }else if (!axisGetSrcStride(xda) && axisGetSrcStride(xdb)){ + return +1; + } + + if (axisGetSrcAbsStride(xda) < axisGetSrcAbsStride(xdb)){ + return -1; + }else if (axisGetSrcAbsStride(xda) > axisGetSrcAbsStride(xdb)){ + return +1; + } + + if ( axisIsReduced(xda) && !axisIsReduced(xdb)){ + return -1; + }else if (!axisIsReduced(xda) && axisIsReduced(xdb)){ + return +1; + } + + return 0; +} + /** * @brief Check whether axis numbered v is already in the given set of axes. * @@ -731,10 +859,10 @@ static int reduxGetOrInit (int typecode, const char** property) * @return Non-zero if the set is non-empty and v is in it; Zero otherwise. */ -static int axisInSet (int v, - const int* set, - size_t setLen, - size_t* where){ +static int axisInSet (int v, + const int* set, + size_t setLen, + size_t* where){ size_t i; for (i=0;ireduxNum = -1; + axis->warpLen = 0; + axis->len = len; + + axis->srcStride = srcStride; + axis->srcOffset = 0; + + axis->dstStride = 0; + axis->dstOffset = 0; + + axis->dstArgStride = 0; + axis->dstArgOffset = 0; + + axis->tmpDstStride = 0; + axis->tmpDstOffset = 0; + + axis->tmpDstArgStride = 0; + axis->tmpDstArgOffset = 0; +} + /** - * @brief Check the sanity of the arguments in agreement with the - * documentation for GpuArray_reduction(). + * @brief Mark axis as reduction axis, with position reduxNum in the axis list. + */ + +static void axisMarkReduced (axis_desc* axis, int reduxNum){ + axis->isReduced = 1; + axis->reduxNum = reduxNum; +} + +/** + * @brief Mark axis as warp axis. + */ + +static void axisMarkWarp (axis_desc* axis, size_t warpLen){ + axis->warpLen = warpLen; +} + +/** + * @brief Get properties of an axis. + */ + +static int axisGetReduxNum (const axis_desc* axis){ + return axis->reduxNum; +} +static size_t axisGetLen (const axis_desc* axis){ + return axis->len; +} +static ssize_t axisGetSrcStride (const axis_desc* axis){ + return axisGetLen(axis) > 1 ? axis->srcStride : 0; +} +static size_t axisGetSrcAbsStride (const axis_desc* axis){ + return axisGetSrcStride(axis)<0 ? -(size_t)axisGetSrcStride(axis): + +(size_t)axisGetSrcStride(axis); +} +static ssize_t axisGetSrcOffset (const axis_desc* axis){ + return axis->srcOffset; +} +static ssize_t axisGetDstStride (const axis_desc* axis){ + return axisGetLen(axis) > 1 ? axis->dstStride : 0; +} +static size_t axisGetDstAbsStride (const axis_desc* axis){ + return axisGetDstStride(axis)<0 ? -(size_t)axisGetDstStride(axis): + +(size_t)axisGetDstStride(axis); +} +static ssize_t axisGetDstOffset (const axis_desc* axis){ + return axis->dstOffset; +} +static ssize_t axisGetDstArgStride (const axis_desc* axis){ + return axisGetLen(axis) > 1 ? axis->dstArgStride : 0; +} +static size_t axisGetDstArgAbsStride (const axis_desc* axis){ + return axisGetDstArgStride(axis)<0 ? 
-(size_t)axisGetDstArgStride(axis): + +(size_t)axisGetDstArgStride(axis); +} +static ssize_t axisGetDstArgOffset (const axis_desc* axis){ + return axis->dstArgOffset; +} +static int axisIsReduced (const axis_desc* axis){ + return axis->isReduced; +} +static int axisIsWarp (const axis_desc* axis){ + return !!axis->warpLen; +} +static int axisIsPartialWarp (const axis_desc* axis){ + return axis->warpLen > 0 && axis->warpLen != axis->len; +} + +/** + * @brief Returns whether the reduction interface requires a dst argument. + */ + +static int reduxRequiresDst (const redux_ctx* ctx){ + switch (ctx->op){ + case GA_REDUCE_ARGMIN: + case GA_REDUCE_ARGMAX: + return 0; + default: + return 1; + } +} + +/** + * @brief Returns whether the reduction interface requires a dstArg argument. + */ + +static int reduxRequiresDstArg (const redux_ctx* ctx){ + switch (ctx->op){ + case GA_REDUCE_MINANDARGMIN: + case GA_REDUCE_MAXANDARGMAX: + case GA_REDUCE_ARGMIN: + case GA_REDUCE_ARGMAX: + return 1; + default: + return 0; + } +} + +/** + * @brief Returns whether the generated kernel internally requires a dst + * argument. * - * Also initialize certain parts of the context, allocate memory - * buffers and fail out if at any point the environment gives us - * a problem. + * This is semantically subtly different from reduxHasDst(). The main + * difference is in the implementation of the GA_REDUCE_ARGMIN/ARGMAX + * reductions; Either *might* require a dst buffer, which will have to be + * allocated, even though it will be discared. + */ + +static int reduxKernelRequiresDst (const redux_ctx* ctx){ + switch (ctx->op){ + case GA_REDUCE_ARGMIN: + case GA_REDUCE_ARGMAX: + return reduxIsSmallCodeModel(ctx); + default: + return 1; + } +} + +/** + * @brief Returns whether the generated kernel internally requires a dstArg + * argument. * - * @return GA_INVALID_ERROR if arguments invalid; GA_NO_MEMORY if out of - * memory, GA_NO_ERROR otherwise. + * This is semantically subtly different from reduxHasDstArg(), since it asks + * whether the reduction, even though it does not accept a dstArg argument, + * still requires a dstArg internally. + */ + +static int reduxKernelRequiresDstArg (const redux_ctx* ctx){ + /** + * At present there exists no reduction whose implementation requires + * a dstArg but whose interface does not. + * + * E.g. the max() and min() reductions do NOT currently require a temporary + * buffer for indexes, and will not in the foreseeable future. + */ + + return reduxRequiresDstArg(ctx); +} + +/** + * @brief Returns whether the reduction is sensitive. + * + * A reduction is sensitive when its output satisfies at least one of the + * following conditions: + * + * - It depends on the exact order of axes in the reduxList + * - It depends on exact signs of the strides of axes in the reduxList + * + * Such sensitivity may prevent a flattening of contiguous axes even when it + * would have been otherwise permitted. + * + * For instance, ARGMIN/ARGMAX have this sensitivity, because the dstArg + * tensor's contents are flattened coordinates into the source tensor, and + * the flattening order is precisely reduxList. Permuting it would thus produce + * incorrect output. Moreover, if the strides of a reduction axis were to be + * reversed for the purpose of flattening the axis into another, the computed + * coordinate would again be incorrect. + * + * + * TL;DR: Reduction is sensitive if + * reduce(x, axis=axisList) != reduce(x, axis=axisList[::-1]) + * or + * reduce(x) != reduce(x[::-1]) + * . 
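+ *
+ *       Concrete (made-up) example: argmax([3,1,7]) == 2, but over the
+ *       reversed data argmax([7,1,3]) == 0, even though max() returns 7
+ *       either way. Flipping the sign of a reduction axis' stride, or
+ *       permuting reduxList, can therefore change dstArg, which is what
+ *       makes these reductions sensitive.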
+ */ + +static int reduxIsSensitive (const redux_ctx* ctx){ + switch (ctx->op){ + case GA_REDUCE_MINANDARGMIN: + case GA_REDUCE_MAXANDARGMAX: + case GA_REDUCE_ARGMIN: + case GA_REDUCE_ARGMAX: + return 1; + default: + return 0; + } +} + +/** + * @brief Returns whether we are using the small code model or not. + */ + +static int reduxIsSmallCodeModel (const redux_ctx* ctx){ + return !reduxIsLargeCodeModel(ctx); +} + +/** + * @brief Returns whether we are using the large code model or not. + */ + +static int reduxIsLargeCodeModel (const redux_ctx* ctx){ + return ctx->largeCodeModel; +} + +/** + * @brief Get description of source axis with given number. + */ + +static axis_desc* reduxGetSrcAxis (const redux_ctx* ctx, int i){ + return &ctx->xdSrc[i]; +} + +/** + * @brief Get description of source axis with given number in sort-order. */ -static int reduxCheckargs (redux_ctx* ctx){ - int i, j, ret, retT, retK; - unsigned numProcs; - size_t localSize; - size_t dstNumElem = 1, reduxPerElem = 1; +static axis_desc* reduxGetSrcSortAxis (const redux_ctx* ctx, int i){ + return ctx->xdSrcPtrs[i]; +} + +/** + * @brief Get description of flattened source axis with given number. + */ + +static axis_desc* reduxGetSrcFlatAxis (const redux_ctx* ctx, int i){ + return &ctx->xdSrcFlat[i]; +} + +/** + * @brief Attempt to flatten an axis `from` into an axis `into`. + * + * An axis can be considered for flattening into the previous one if ALL of + * the following conditions hold: + * + * 1. The product of the previous axis' length by its stride exactly + * matches the current axis' stride. + * 2. Both axes are reduced. + * + * For reductions where axis order matters (e.g. those that compute + * indices, like argmax/argmin), ALL of the following additional conditions + * must hold: + * + * 3. The sign of the strides must match. + * 4. The axis numbers must follow consecutively in the reduction list + * (this is ensured by the "sensitive" sort order) + * + * @return Non-zero if flattening attempt successful; Zero otherwise. + */ + +static int reduxTryFlattenInto (const redux_ctx* ctx, + axis_desc* into, + const axis_desc* from){ + int signSrc = 0, signDst = 0, signDstArg = 0, + reverseSrc = 0, reverseDst = 0, reverseDstArg = 0; + + if (axisIsReduced (into) != axisIsReduced (from) || + axisGetSrcAbsStride (into) != axisGetSrcAbsStride (from)*axisGetLen(from)){ + return 0; + } + + if (reduxRequiresDst(ctx) && + axisGetDstAbsStride (into) != axisGetDstAbsStride (from)*axisGetLen(from)){ + return 0; + } + + if (reduxRequiresDstArg(ctx) && + axisGetDstArgAbsStride(into) != axisGetDstArgAbsStride(from)*axisGetLen(from)){ + return 0; + } + + signSrc = (axisGetSrcStride (into)^axisGetSrcStride (from)) < 0; + signDst = (axisGetDstStride (into)^axisGetDstStride (from)) < 0; + signDstArg = (axisGetDstArgStride(into)^axisGetDstArgStride(from)) < 0; + reverseSrc = signSrc; + reverseDst = signDst && reduxRequiresDst (ctx); + reverseDstArg = signDstArg && reduxRequiresDstArg(ctx); + + if (reduxIsSensitive(ctx)){ + if(reverseSrc || reverseDst || reverseDstArg){ + return 0; + } + } + + if (reduxRequiresDst (ctx) && + reduxRequiresDstArg(ctx) && + reverseDst != reverseDstArg){ + /* Either both, or neither, of dst and dstArg must require reversal. 
*/ + return 0; + } + + if (reverseSrc){ + into->srcOffset += (ssize_t)(axisGetLen(from)-1)*axisGetSrcStride(from); + into->srcStride = -axisGetSrcStride (from); + }else{ + into->srcStride = axisGetSrcStride (from); + } + + if (reverseDst){ + into->dstOffset += (ssize_t)(axisGetLen(from)-1)*axisGetDstStride(from); + into->dstStride = -axisGetDstStride (from); + }else{ + into->dstStride = axisGetDstStride (from); + } + + if (reverseDstArg){ + into->dstArgOffset += (ssize_t)(axisGetLen(from)-1)*axisGetDstArgStride(from); + into->dstArgStride = -axisGetDstArgStride(from); + }else{ + into->dstArgStride = axisGetDstArgStride(from); + } + + into->srcOffset += axisGetSrcOffset (from); + into->dstOffset += axisGetDstOffset (from); + into->dstArgOffset += axisGetDstArgOffset(from); + into->len *= axisGetLen (from); + + return 1; +} + +/** + * @brief Check whether we can add another reduction axis or free axis + * to the hardware axis list for either the primary or secondary kernel. + */ + +static int reduxCanAppendHwAxis (redux_ctx* ctx, + int kernelType, + int axisType){ + int kernelNdh = kernelType == KERNEL_PRIMARY ? ctx->pri.ndh : ctx->aux.ndh; + int kernelNdhr = kernelType == KERNEL_PRIMARY ? ctx->pri.ndhr : ctx->aux.ndhr; + int kernelNdhd = kernelType == KERNEL_PRIMARY ? ctx->pri.ndhd : ctx->aux.ndhd; + + if (kernelNdh >= MAX_HW_DIMS){ + return 0; + }else{ + return axisType == AXIS_REDUX ? kernelNdhr < ctx->ndr: + kernelNdhd < ctx->ndd; + } +} + +/** + * @brief Append the largest reduction axis or free axis that isn't yet + * in the hardware axis list for either the primary or secondary kernel + * into said hardware axis list. + */ + +static void reduxAppendLargestAxisToHwList(redux_ctx* ctx, + int kernelType, + int axisType){ + int maxI = 0, i, isInHwList, isInReduxList, isInDesiredList, isLargestSoFar; + int* hwAxisList, * ndh, * ndhr, * ndhd; + size_t v, maxV = 0; + + /* Get pointers to the correct kernel's variables */ + hwAxisList = kernelType == KERNEL_PRIMARY ? ctx->pri.axisList: + ctx->aux.axisList; + ndh = kernelType == KERNEL_PRIMARY ? &ctx->pri.ndh: + &ctx->aux.ndh; + ndhr = kernelType == KERNEL_PRIMARY ? &ctx->pri.ndhr: + &ctx->aux.ndhr; + ndhd = kernelType == KERNEL_PRIMARY ? &ctx->pri.ndhd: + &ctx->aux.ndhd; + + /* Find */ + for (i=0;inds;i++){ + isInHwList = axisInSet(i, hwAxisList, *ndh, 0); + isInReduxList = axisInSet(i, ctx->reduxList, ctx->ndr, 0); + isInDesiredList = axisType == AXIS_REDUX ? isInReduxList: + !isInReduxList; + v = ctx->src->dimensions[i]; + isLargestSoFar = v >= maxV; + + if (!isInHwList && isInDesiredList && isLargestSoFar){ + maxV = v; + maxI = i; + } + } + + /* Append */ + hwAxisList[(*ndh)++] = maxI; + if (axisType == AXIS_REDUX){ + (*ndhr)++; + }else{ + (*ndhd)++; + } +} + +/** + * @brief Initialize the context. + * + * After this function, calling reduxCleanup() becomes safe. + */ + +static int reduxInit (redux_ctx* ctx){ + int i; /** * We initialize certain parts of the context. 
@@ -814,15 +1339,16 @@ static int reduxCheckargs (redux_ctx* ctx){ ctx->srcTypeStr = ctx->dstTypeStr = ctx->dstArgTypeStr = ctx->accTypeStr = ctx->idxTypeStr = NULL; - ctx->initValK = NULL; - ctx->pri.ndh = ctx->aux.ndh = 0; - ctx->pri.ndhd = ctx->aux.ndhd = 0; - ctx->pri.ndhr = ctx->aux.ndhr = 0; + ctx->initValK = NULL; ctx->sourceCode = NULL; - ctx->sourceCodeLen = 0; ctx->errorString0 = NULL; ctx->errorString1 = NULL; ctx->errorString2 = NULL; + + ctx->splitWarpAxis = -1; + ctx->numStages = 1; + ctx->prodWarpAxes = 1; + ctx->prodAllAxes = ctx->prodRdxAxes = ctx->prodFreeAxes = 1; strb_init(&ctx->s); srcbInit (&ctx->srcGen, &ctx->s); @@ -836,219 +1362,134 @@ static int reduxCheckargs (redux_ctx* ctx){ ctx->srcStepsGD = ctx->srcSizeGD = ctx->dstStepsGD = ctx->dstArgStepsGD = ctx->pri.chunkSizeGD = ctx->aux.chunkSizeGD = NULL; - /* *** IT IS NOW SAFE TO CALL reduxCleanup() *** */ - - /* Insane src, reduxLen, dst or dstArg? */ - if (!ctx->src || - (reduxRequiresDst (ctx) && !ctx->dst) || - (reduxRequiresDstArg(ctx) && !ctx->dstArg) || - (ctx->src->nd <= 0) || - (ctx->reduxLen <= 0) || - (ctx->src->nd < (unsigned)ctx->reduxLen) || - (ctx->dst && ctx->dst->nd +ctx->reduxLen != ctx->src->nd) || - (ctx->dstArg && ctx->dstArg->nd+ctx->reduxLen != ctx->src->nd) ){ - return reduxCleanup(ctx, GA_INVALID_ERROR); - } - - - /* Insane or duplicate list entry? */ - for (i=0;ireduxLen;i++){ - if (ctx->reduxList[i] < 0 || - ctx->reduxList[i] >= (int)ctx->src->nd || - axisInSet(ctx->reduxList[i], ctx->reduxList, i, 0)){ - return reduxCleanup(ctx, GA_INVALID_ERROR); - } - } + return reduxInferProperties(ctx); +} +/** + * @brief Begin inferring the properties of the reduction. + */ - /* GPU context non-existent? */ - ctx->gpuCtx = GpuArray_context(ctx->src); - if (!ctx->gpuCtx){ - return reduxCleanup(ctx, GA_INVALID_ERROR); - } +static int reduxInferProperties (redux_ctx* ctx){ + axis_desc* a; + int i, j, retT, retK; + size_t d; - /* Unknown type? */ - reduxSelectTypes(ctx); - if (!ctx->srcTypeStr || !ctx->dstTypeStr || !ctx->dstArgTypeStr || - !ctx->accTypeStr){ - return reduxCleanup(ctx, GA_INVALID_ERROR); + /* Source code buffer preallocation failed? */ + if (strb_ensure(&ctx->s, 4*1024) != 0){ + return reduxCleanupMsg(ctx, GA_MEMORY_ERROR, + "Could not preallocate source code buffer!\n"); } - /* Determine initializer, and error out if reduction unsupported. 
*/ - switch (ctx->op){ - case GA_REDUCE_SUM: - retT = reduxGetSumInit (ctx->dstTypeCode, &ctx->initValT); - retK = reduxGetSumInit (ctx->accTypeCode, &ctx->initValK); - break; - case GA_REDUCE_PRODNZ: - case GA_REDUCE_PROD: - retT = reduxGetProdInit(ctx->dstTypeCode, &ctx->initValT); - retK = reduxGetProdInit(ctx->accTypeCode, &ctx->initValK); - break; - case GA_REDUCE_MINANDARGMIN: - case GA_REDUCE_ARGMIN: - case GA_REDUCE_MIN: - retT = reduxGetMinInit (ctx->dstTypeCode, &ctx->initValT); - retK = reduxGetMinInit (ctx->accTypeCode, &ctx->initValK); - break; - case GA_REDUCE_MAXANDARGMAX: - case GA_REDUCE_ARGMAX: - case GA_REDUCE_MAX: - retT = reduxGetMaxInit (ctx->dstTypeCode, &ctx->initValT); - retK = reduxGetMaxInit (ctx->accTypeCode, &ctx->initValK); - break; - case GA_REDUCE_ALL: - case GA_REDUCE_AND: - retT = reduxGetAndInit (ctx->dstTypeCode, &ctx->initValT); - retK = reduxGetAndInit (ctx->accTypeCode, &ctx->initValK); - break; - case GA_REDUCE_ANY: - case GA_REDUCE_XOR: - case GA_REDUCE_OR: - retT = reduxGetOrInit (ctx->dstTypeCode, &ctx->initValT); - retK = reduxGetOrInit (ctx->accTypeCode, &ctx->initValK); - break; - default: - retT = GA_UNSUPPORTED_ERROR; - retK = GA_UNSUPPORTED_ERROR; - } - if (retT != GA_NO_ERROR){ - return reduxCleanup(ctx, retT); - } - if (retK != GA_NO_ERROR){ - return reduxCleanup(ctx, retK); + /* Insane src, reduxLen, dst or dstArg? */ + if (!ctx->src){ + return reduxCleanupMsg(ctx, GA_INVALID_ERROR, + "src is NULL!\n"); + }else if (ctx->src->nd <= 0){ + return reduxCleanupMsg(ctx, GA_INVALID_ERROR, + "src has less than 1 dimensions!\n"); + }else if (ctx->reduxLen <= 0){ + return reduxCleanupMsg(ctx, GA_INVALID_ERROR, + "List of dimensions to be reduced is empty!\n"); + }else if (ctx->src->nd < (unsigned)ctx->reduxLen){ + return reduxCleanupMsg(ctx, GA_INVALID_ERROR, + "src has fewer dimensions than there are dimensions to reduce!\n"); + }else if (reduxRequiresDst (ctx) && !ctx->dst){ + return reduxCleanupMsg(ctx, GA_INVALID_ERROR, + "dst is NULL, but reduction requires it!\n"); + }else if (reduxRequiresDstArg(ctx) && !ctx->dstArg){ + return reduxCleanupMsg(ctx, GA_INVALID_ERROR, + "dstArg is NULL, but reduction requires it!\n"); + }else if (ctx->dst && ctx->dst->nd +ctx->reduxLen != ctx->src->nd){ + return reduxCleanupMsg(ctx, GA_INVALID_ERROR, + "dst is of incorrect dimensionality for this reduction!\n"); + }else if (ctx->dstArg && ctx->dstArg->nd+ctx->reduxLen != ctx->src->nd){ + return reduxCleanupMsg(ctx, GA_INVALID_ERROR, + "dstArg is of incorrect dimensionality for this reduction!\n"); } - - - /** - * We initialize some more parts of the context, using the guarantees - * we now have about the sanity of the arguments. - */ - ctx->nds = ctx->src->nd; ctx->ndr = ctx->reduxLen; ctx->ndd = ctx->nds - ctx->ndr; - strb_ensure(&ctx->s, 3*1024); - - - /** - * And make a few small dynamic memory allocations for the benefit of the - * rest of the code, allowing error checking to happen early and fail fast. - */ - - ctx->srcAxisList = malloc(ctx->nds * sizeof(unsigned)); - ctx->dstDims = malloc(ctx->ndd * sizeof(size_t)); - if (!ctx->srcAxisList || - !ctx->dstDims ){ - return reduxCleanup(ctx, GA_MEMORY_ERROR); + ctx->ndw = 0; + ctx->ndp = 0; + ctx->ndf = 0; + ctx->ndt = ctx->ndd + 1; + + /* Insane reduxList? */ + for (i=0;indr;i++){ + j = ctx->reduxList[i]; + if (j < -ctx->nds || j >= ctx->nds){ + return reduxCleanupMsg(ctx, GA_INVALID_ERROR, + "Insane axis number %d! Should be [%d, %d)!\n", + j, -ctx->nds, ctx->nds); + } + j = j<0 ? 
ctx->nds+j : j; + d = ctx->src->dimensions[j]; + ctx->zeroRdxAxes += !d; + ctx->prodRdxAxes *= d?d:1; } /** - * Query device for approximate total level of parallelism. If destination - * tensor is so big it can keep all threads busy on individual elements, - * use large code model; Otherwise use small code model, where threads will - * have to cooperate. - * - * - Large (Destination tensor >= SMALL_REDUX_THRESHOLD elements, or - * destination tensor size >= # of reductions per destination - * tensor element): - * All destination elements have their own thread. - * - Small (otherwise): - * Multiple threads cooperate on a single destination element. + * Insane shape? + * + * The source tensor is allowed to be empty (its shape may contain 0s). + * However, all axes that are of length 0 must be reduction axes. + * + * The reason for this is that a reduction cannot store any output into an + * empty destination tensor (whose dimensions are the free axes), because + * it has 0 space. The operation cannot then fulfill its contract. + * + * On the other hand, when some or all reduction axes of a tensor are of + * length 0, the reduction can be interpreted as initializing the + * destination tensor to the identity value of the operation. For lack of a + * better idea, the destination argument tensor can then be zeroed. */ - ret = gpucontext_property(ctx->gpuCtx, GA_CTX_PROP_NUMPROCS, &numProcs); - if (ret != GA_NO_ERROR){ - return reduxCleanup(ctx, ret); + for (i=0;inds;i++){ + d = ctx->src->dimensions[i]; + ctx->zeroAllAxes += !d; + ctx->prodAllAxes *= d?d:1; } - ret = gpucontext_property(ctx->gpuCtx, GA_CTX_PROP_MAXLSIZE, &localSize); - if (ret != GA_NO_ERROR){ - return reduxCleanup(ctx, ret); + if (ctx->zeroAllAxes != ctx->zeroRdxAxes){ + return reduxCleanupMsg(ctx, GA_INVALID_ERROR, + "Source tensor has length-0 dimensions that are not reduced!"); } + ctx->prodFreeAxes = ctx->prodAllAxes/ctx->prodRdxAxes; - for (i=j=0;inds;i++){ - if (axisInSet(i, ctx->reduxList, ctx->nds, NULL)){ - reduxPerElem *= ctx->src->dimensions[i]; - }else{ - dstNumElem *= ctx->src->dimensions[i]; - ctx->dstDims[j++] = ctx->src->dimensions[i];; - } - } - ctx->largeCodeModel = dstNumElem >= numProcs*localSize || - dstNumElem >= reduxPerElem - || 1;/* BUG: Erase when small code model implemented. */ /** - * *** IT IS NOW SAFE TO CALL: *** - * - reduxIsLargeModel() - * - reduxIsSmallModel() - * - reduxKernelRequiresDst() - * - reduxKernelRequiresDstArg() + * GPU context non-existent, or cannot read its properties? */ - - /** - * Allocate workspaces. - * - * Certain reductions may require a workspace that isn't provided by the user. - * For instance, **when using the small code model**, argmin/argmax require - * a dst buffer, but the user didn't supply one (as he would have for - * maxandargmax/minandargmin). We must allocate and deallocate it ourselves. - * - * Otherwise we use the user-supplied buffers. 
- */ - - if (!reduxRequiresDst (ctx) && reduxKernelRequiresDst(ctx)){ - ctx->wsDst = malloc(sizeof(*ctx->wsDst)); - if (!ctx->wsDst){ - return reduxCleanup(ctx, GA_MEMORY_ERROR); - } - - ret = GpuArray_empty(ctx->wsDst, ctx->gpuCtx, ctx->dstTypeCode, - ctx->ndd, ctx->dstDims, GA_C_ORDER); - if(ret != GA_NO_ERROR){ - return reduxCleanup(ctx, ret); - } - }else{ - ctx->wsDst = ctx->dst; - } - if (!reduxRequiresDstArg(ctx) && reduxKernelRequiresDstArg(ctx)){ - ctx->wsDstArg = malloc(sizeof(*ctx->wsDstArg)); - if (!ctx->wsDstArg){ - return reduxCleanup(ctx, GA_MEMORY_ERROR); - } - - ret = GpuArray_empty(ctx->wsDstArg, ctx->gpuCtx, ctx->dstArgTypeCode, - ctx->ndd, ctx->dstDims, GA_C_ORDER); - if(ret != GA_NO_ERROR){ - return reduxCleanup(ctx, ret); - } - }else{ - ctx->wsDstArg = ctx->dstArg; + ctx->gpuCtx = GpuArray_context(ctx->src); + if (!ctx->gpuCtx || + gpucontext_property(ctx->gpuCtx, GA_CTX_PROP_NUMPROCS, &ctx->numProcs) != GA_NO_ERROR || + gpucontext_property(ctx->gpuCtx, GA_CTX_PROP_MAXLSIZE, &ctx->maxLg) != GA_NO_ERROR || + gpudata_property(ctx->src->data, GA_CTX_PROP_MAXLSIZE0, &ctx->maxLs[0]) != GA_NO_ERROR || + gpudata_property(ctx->src->data, GA_CTX_PROP_MAXLSIZE1, &ctx->maxLs[1]) != GA_NO_ERROR || + gpudata_property(ctx->src->data, GA_CTX_PROP_MAXLSIZE2, &ctx->maxLs[2]) != GA_NO_ERROR || + gpudata_property(ctx->src->data, GA_CTX_PROP_MAXGSIZE, &ctx->maxGg) != GA_NO_ERROR || + gpudata_property(ctx->src->data, GA_CTX_PROP_MAXGSIZE0, &ctx->maxGs[0]) != GA_NO_ERROR || + gpudata_property(ctx->src->data, GA_CTX_PROP_MAXGSIZE1, &ctx->maxGs[1]) != GA_NO_ERROR || + gpudata_property(ctx->src->data, GA_CTX_PROP_MAXGSIZE2, &ctx->maxGs[2]) != GA_NO_ERROR ){ + /* gpukernel_property(ctx->kernel.k, GA_KERNEL_PROP_PREFLSIZE, &warpSize); */ + return reduxCleanupMsg(ctx, GA_INVALID_ERROR, + "Error obtaining one or more properties from GPU context!\n"); } + ctx->warpSize = 32; + /** + * Type management. + * + * - Deal with the various typecodes. + * - Determine initializer and error out if reduction unsupported on that + * datatype. + */ - return reduxSelectHwAxes(ctx); -} - -/** - * @brief Select types for the reduction kernel's implementation. - * - * There are 5 types of relevance: - * - Source (S=Source) - * - Destination (T=Target) - * - Destination Argument (A=Arg) - * - Index (X=indeX) - * - Accumulator (K=aKKumulator/reduction) - */ - -static void reduxSelectTypes (redux_ctx* ctx){ - /* Deal with the various typecodes. 
*/ ctx->srcTypeCode = ctx->src->typecode; ctx->dstTypeCode = ctx->srcTypeCode; ctx->dstArgTypeCode = GA_SSIZE; @@ -1060,179 +1501,330 @@ static void reduxSelectTypes (redux_ctx* ctx){ case GA_HALF2: ctx->accTypeCode = GA_FLOAT2; break; - case GA_HALF4: - ctx->accTypeCode = GA_FLOAT4; + case GA_HALF4: + ctx->accTypeCode = GA_FLOAT4; + break; + case GA_HALF8: + ctx->accTypeCode = GA_FLOAT8; + break; + case GA_HALF16: + ctx->accTypeCode = GA_FLOAT16; + break; + default: + ctx->accTypeCode = ctx->srcTypeCode; + } + ctx->srcTypeStr = gpuarray_get_type(ctx->srcTypeCode) ->cluda_name; + ctx->dstTypeStr = gpuarray_get_type(ctx->dstTypeCode) ->cluda_name; + ctx->dstArgTypeStr = gpuarray_get_type(ctx->dstArgTypeCode)->cluda_name; + ctx->idxTypeStr = gpuarray_get_type(ctx->idxTypeCode) ->cluda_name; + ctx->accTypeStr = gpuarray_get_type(ctx->accTypeCode) ->cluda_name; + if (!ctx->srcTypeStr || + !ctx->dstTypeStr || + !ctx->dstArgTypeStr || + !ctx->idxTypeStr || + !ctx->accTypeStr ){ + return reduxCleanup(ctx, GA_INVALID_ERROR); + } + switch (ctx->op){ + case GA_REDUCE_SUM: + retT = reduxGetSumInit (ctx->dstTypeCode, &ctx->initValT); + retK = reduxGetSumInit (ctx->accTypeCode, &ctx->initValK); + break; + case GA_REDUCE_PRODNZ: + case GA_REDUCE_PROD: + retT = reduxGetProdInit(ctx->dstTypeCode, &ctx->initValT); + retK = reduxGetProdInit(ctx->accTypeCode, &ctx->initValK); + break; + case GA_REDUCE_MINANDARGMIN: + case GA_REDUCE_ARGMIN: + case GA_REDUCE_MIN: + retT = reduxGetMinInit (ctx->dstTypeCode, &ctx->initValT); + retK = reduxGetMinInit (ctx->accTypeCode, &ctx->initValK); + break; + case GA_REDUCE_MAXANDARGMAX: + case GA_REDUCE_ARGMAX: + case GA_REDUCE_MAX: + retT = reduxGetMaxInit (ctx->dstTypeCode, &ctx->initValT); + retK = reduxGetMaxInit (ctx->accTypeCode, &ctx->initValK); break; - case GA_HALF8: - ctx->accTypeCode = GA_FLOAT8; + case GA_REDUCE_ALL: + case GA_REDUCE_AND: + retT = reduxGetAndInit (ctx->dstTypeCode, &ctx->initValT); + retK = reduxGetAndInit (ctx->accTypeCode, &ctx->initValK); break; - case GA_HALF16: - ctx->accTypeCode = GA_FLOAT16; + case GA_REDUCE_ANY: + case GA_REDUCE_XOR: + case GA_REDUCE_OR: + retT = reduxGetOrInit (ctx->dstTypeCode, &ctx->initValT); + retK = reduxGetOrInit (ctx->accTypeCode, &ctx->initValK); break; default: - ctx->accTypeCode = ctx->srcTypeCode; + retT = GA_UNSUPPORTED_ERROR; + retK = GA_UNSUPPORTED_ERROR; + } + if (retT != GA_NO_ERROR){ + return reduxCleanupMsg(ctx, retT, + "Problem selecting types to be used in reduction!\n"); + } + if (retK != GA_NO_ERROR){ + return reduxCleanupMsg(ctx, retK, + "Problem selecting types to be used in reduction!\n"); } - /* Get the string version as well. */ - ctx->srcTypeStr = gpuarray_get_type(ctx->srcTypeCode) ->cluda_name; - ctx->dstTypeStr = gpuarray_get_type(ctx->dstTypeCode) ->cluda_name; - ctx->dstArgTypeStr = gpuarray_get_type(ctx->dstArgTypeCode)->cluda_name; - ctx->idxTypeStr = gpuarray_get_type(ctx->idxTypeCode) ->cluda_name; - ctx->accTypeStr = gpuarray_get_type(ctx->accTypeCode) ->cluda_name; -} - -/** - * @brief Returns whether we are using the small code model or not. - */ -static int reduxIsSmallCodeModel (redux_ctx* ctx){ - return !reduxIsLargeCodeModel(ctx); -} + /** + * Allocate and construct source-tensor axis-description lists. + * + * While constructing the descriptions of each axis, verify that: + * + * 1. reduxLen has no duplicates. + * 2. dst and/or dstArg's dimensions match src's dimensions, stripped of + * the reduction axes. 
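+	 *
+	 *      For example (hypothetical shapes): a src of shape (5,3,7) reduced
+	 *      over axes {0,2} must be paired with dst and/or dstArg of shape (3);
+	 *      supplying a dst of shape (5,3) fails the length checks below.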
+ */ -/** - * @brief Returns whether we are using the large code model or not. - */ + ctx->xdSrc = calloc(ctx->nds, sizeof(*ctx->xdSrc)); + ctx->xdSrcPtrs = calloc(ctx->nds, sizeof(*ctx->xdSrcPtrs)); + ctx->xdSrcFlat = calloc(ctx->nds, sizeof(*ctx->xdSrcFlat)); + ctx->xdTmp = calloc(ctx->ndt, sizeof(*ctx->xdTmp)); + if (!ctx->xdSrc || !ctx->xdSrcPtrs || !ctx->xdSrcFlat || !ctx->xdTmp){ + return reduxCleanup(ctx, GA_MEMORY_ERROR); + } + for (i=0;inds;i++){ + axisInit(&ctx->xdSrc[i], + ctx->src->dimensions[i], + ctx->src->strides[i]); + } + for (i=0;indr;i++){ + j = ctx->reduxList[i]; + j = j<0 ? ctx->nds+j : j; + a = reduxGetSrcAxis(ctx, j); + if (axisIsReduced(a)){ + return reduxCleanupMsg(ctx, GA_INVALID_ERROR, + "Axis %d appears multiple times in the " + "reduction axis list!\n", + j); + } + axisMarkReduced(a, i); + } + for (i=j=0;inds;i++){ + axis_desc* a = reduxGetSrcAxis(ctx, i); + size_t srcLen = axisGetLen(a), dstLen, dstArgLen; + + if (axisIsReduced(a)){continue;} + if (reduxRequiresDst(ctx)){ + dstLen = ctx->dst->dimensions[j]; + + if(srcLen != dstLen){ + return reduxCleanupMsg(ctx, GA_INVALID_ERROR, + "Source axis %d has length %zu, but " + "corresponding destination axis %d has length %zu!\n", + i, srcLen, j, dstLen); + } + + a->dstStride = ctx->dst->strides[j]; + } + if (reduxRequiresDstArg(ctx)){ + dstArgLen = ctx->dstArg->dimensions[j]; + + if(srcLen != dstArgLen){ + return reduxCleanupMsg(ctx, GA_INVALID_ERROR, + "Source axis %d has length %zu, but " + "corresponding destination-argument axis %d has length %zu!\n", + i, srcLen, j, dstArgLen); + } + + a->dstArgStride = ctx->dstArg->strides[j]; + } + + j++; + } -static int reduxIsLargeCodeModel (redux_ctx* ctx){ - return ctx->largeCodeModel; -} -/** - * @brief Returns whether the reduction interface requires a dst argument. - */ + /** + * Begin flattening the source tensor. + */ -static int reduxRequiresDst (redux_ctx* ctx){ - switch (ctx->op){ - case GA_REDUCE_ARGMIN: - case GA_REDUCE_ARGMAX: - return 0; - default: - return 1; - } + return reduxFlattenSource(ctx); } /** - * @brief Returns whether the reduction interface requires a dstArg argument. + * @brief Flatten the source tensor as much as is practical. + * + * This makes the axis lengths as long as possible and the tensor itself as + * contiguous as possible. */ -static int reduxRequiresDstArg (redux_ctx* ctx){ - switch (ctx->op){ - case GA_REDUCE_MINANDARGMIN: - case GA_REDUCE_MAXANDARGMAX: - case GA_REDUCE_ARGMIN: - case GA_REDUCE_ARGMAX: - return 1; - default: - return 0; +static int reduxFlattenSource (redux_ctx* ctx){ + axis_desc* axis, *flatAxis, *sortAxis; + int i, j, isSensitive; + + /** + * Copy source axis descriptions list to flattened source axis description + * list, in preparation for attempts at flattening. + */ + + memcpy(ctx->xdSrcFlat, ctx->xdSrc, ctx->nds*sizeof(*ctx->xdSrcFlat)); + ctx->ndf = ctx->nds; + + /** + * Pass 1: Flatten out 0-length dimensions. We already know that + * + * a) There are no 0-length free dimensions, because that + * constitutes an invalid input, and + * b) How many 0-length reduction dimensions there are, because + * we counted them in the error-checking code. + * + * So if there are any 0-length axes, we can delete all reduction axes and + * replace them with a single one. 
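+	 *
+	 *      e.g. (hypothetical) a src of shape (4,0,3,5) with axes 1 and 3
+	 *      reduced keeps its free axes (4,3) and collapses all reduction axes
+	 *      into a single length-0 reduced axis, since every destination
+	 *      element is then simply the initial (identity) value.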
+ */ + + if (ctx->zeroRdxAxes > 0){ + for (i=j=0;indf;i++){ + axis = reduxGetSrcFlatAxis(ctx, i); + + if (!axisIsReduced(axis)){ + *reduxGetSrcFlatAxis(ctx, j++) = *axis; + } + } + + axisInit (reduxGetSrcFlatAxis(ctx, j), 0, 0); + axisMarkReduced(reduxGetSrcFlatAxis(ctx, j), 0); + j++; + ctx->ndf = j; + } + + /** + * Pass 2: Flatten out 1-length dimensions, since they can always be + * ignored; They are always indexed at [0]. + */ + + for (i=j=0;indf;i++){ + axis = reduxGetSrcFlatAxis(ctx, i); + + if (axisGetLen(axis) != 1){ + *reduxGetSrcFlatAxis(ctx, j++) = *axis; + } + } + ctx->ndf = j; + + /** + * Pass 3: Flatten out continuous dimensions, where strides and sensitivity + * allows it. + */ + + isSensitive = reduxIsSensitive(ctx); + + qsort(ctx->xdSrcFlat, ctx->ndf, sizeof(*ctx->xdSrcFlat), + isSensitive ? reduxSortFlatSensitive : reduxSortFlatInsensitive); + + for (i=j=1;indf;i++){ + flatAxis = reduxGetSrcFlatAxis(ctx, j-1); + sortAxis = reduxGetSrcFlatAxis(ctx, i); + + if (!reduxTryFlattenInto(ctx, flatAxis, sortAxis)){ + *reduxGetSrcFlatAxis(ctx, j++) = *sortAxis; + } } + ctx->ndf = j; + + return reduxSelectWarpAxes(ctx); } /** - * @brief Returns whether the generated kernel internally requires a dst - * argument. - * - * This is semantically subtly different from reduxHasDst(). The main - * difference is in the implementation of the GA_REDUCE_ARGMIN/ARGMAX - * reductions; Either *might* require a dst buffer, which will have to be - * allocated, even though it will be discared. + * @brief Select the warp axes in such a way as to maximize memory bandwidth. */ -static int reduxKernelRequiresDst (redux_ctx* ctx){ - switch (ctx->op){ - case GA_REDUCE_ARGMIN: - case GA_REDUCE_ARGMAX: - return reduxIsSmallCodeModel(ctx); - default: - return 1; - } -} +static int reduxSelectWarpAxes (redux_ctx* ctx){ + axis_desc* a; + int i; + size_t aL; -/** - * @brief Returns whether the generated kernel internally requires a dstArg - * argument. - * - * This is semantically subtly different from reduxHasDstArg(), since it asks - * whether the reduction, even though it does not accept a dstArg argument, - * still requires a dstArg internally. - */ -static int reduxKernelRequiresDstArg (redux_ctx* ctx){ /** - * At present there exists no reduction whose implementation requires - * a dstArg but whose interface does not. - * - * E.g. the max() and min() reductions do NOT currently require a temporary - * buffer for indexes, and will not in the foreseeable future. + * NOTE: At this point it is possible for there to be no axes + * (ctx->ndf == 0), but this will only occur if all axes of the original + * tensor were length-1 (i.e., if this was a scalar masquerading as a + * multidimensional tensor). + * + * We check for this case and simulate a 1-dimensional, 1-length tensor. */ - return reduxRequiresDstArg(ctx); -} + if(ctx->ndf == 0){ + axisInit (reduxGetSrcFlatAxis(ctx, ctx->ndf), 1, 0); + axisMarkReduced(reduxGetSrcFlatAxis(ctx, ctx->ndf), 0); + ctx->ndf = 1; + } -/** - * @brief Check whether we can add another reduction axis or free axis - * to the hardware axis list for either the primary or secondary kernel. - */ -static int reduxCanAppendHwAxis (redux_ctx* ctx, - int kernelType, - int axisType){ - int kernelNdh = kernelType == KERNEL_PRIMARY ? ctx->pri.ndh : ctx->aux.ndh; - int kernelNdhr = kernelType == KERNEL_PRIMARY ? ctx->pri.ndhr : ctx->aux.ndhr; - int kernelNdhd = kernelType == KERNEL_PRIMARY ? ctx->pri.ndhd : ctx->aux.ndhd; + /** + * Select Warp Axes. 
+ * + * Using a particular heuristic order (*), sort the axis list by + * suitability for belonging to the warp. Then, pick the first few axes, + * until the product of their lengths exceeds the warp size. + * + * (*) See documentation of value-comparison function. + */ - if (kernelNdh >= MAX_HW_DIMS){ - return 0; - }else{ - return axisType == AXIS_REDUX ? kernelNdhr < ctx->ndr: - kernelNdhd < ctx->ndd; + for(i=0;indf;i++){ + ctx->xdSrcPtrs[i] = reduxGetSrcFlatAxis(ctx, i); } -} -/** - * @brief Append the largest reduction axis or free axis that isn't yet - * in the hardware axis list for either the primary or secondary kernel - * into said hardware axis list. - */ + qsort(ctx->xdSrcPtrs, ctx->ndf, sizeof(*ctx->xdSrcPtrs), reduxSortWarp); -static void reduxAppendLargestAxisToHwList(redux_ctx* ctx, - int kernelType, - int axisType){ - int maxI = 0, i, isInHwList, isInReduxList, isInDesiredList, isLargestSoFar; - int* hwAxisList, * ndh, * ndhr, * ndhd; - size_t v, maxV = 0; + for (i=0;indf;i++){ + a = reduxGetSrcSortAxis(ctx, i); + aL = axisGetLen(a); + if (aL <= 1){break;} + + ctx->prodWarpAxes *= aL; + if (ctx->prodWarpAxes <= ctx->warpSize){ + axisMarkWarp(a, aL); + ctx->ndw++; + }else{ + /** + * The product of warp lengths just exceeded warpSize. We backtrack + * by undoing the multiplication by aL. We then check whether we + * can "split" this axis by extracting at least a factor of 2 into + * warpLen. If yes, we mark is as the (only) warp axis that is + * split by setting its warpLen to something neither 0 nor len. + */ + + ctx->prodWarpAxes /= aL; + aL = ctx->warpSize/ctx->prodWarpAxes; + if (aL >= 2){ + axisMarkWarp(a, aL); + ctx->prodWarpAxes *= aL; + ctx->splitWarpAxis = i; + ctx->ndw++; + ctx->ndp++; + } + break; + } + } - /* Get pointers to the correct kernel's variables */ - hwAxisList = kernelType == KERNEL_PRIMARY ? ctx->pri.axisList: - ctx->aux.axisList; - ndh = kernelType == KERNEL_PRIMARY ? &ctx->pri.ndh: - &ctx->aux.ndh; - ndhr = kernelType == KERNEL_PRIMARY ? &ctx->pri.ndhr: - &ctx->aux.ndhr; - ndhd = kernelType == KERNEL_PRIMARY ? &ctx->pri.ndhd: - &ctx->aux.ndhd; - /* Find */ - for (i=0;inds;i++){ - isInHwList = axisInSet(i, hwAxisList, *ndh, 0); - isInReduxList = axisInSet(i, ctx->reduxList, ctx->ndr, 0); - isInDesiredList = axisType == AXIS_REDUX ? isInReduxList: - !isInReduxList; - v = ctx->src->dimensions[i]; - isLargestSoFar = v >= maxV; + return reduxSelectNumStages(ctx); +} - if (!isInHwList && isInDesiredList && isLargestSoFar){ - maxV = v; - maxI = i; - } - } +/** + * @brief Select the number of stages of the reduction. + * + * This depends a lot on the GPU and the specific size of the reduction. + */ - /* Append */ - hwAxisList[(*ndh)++] = maxI; - if (axisType == AXIS_REDUX){ - (*ndhr)++; +static int reduxSelectNumStages (redux_ctx* ctx){ + size_t parallelism = 2 * ctx->numProcs * ctx->maxLg; + + if(ctx->zeroRdxAxes || /* Reduction is empty? */ + ctx->prodFreeAxes > ctx->prodRdxAxes || /* Large # of destination elements? */ + ctx->prodFreeAxes > parallelism ){ /* # of destination elements large enough to fill available parallelism? */ + ctx->numStages = 1; }else{ - (*ndhd)++; + ctx->numStages = 2; } + + return reduxSelectHwAxes(ctx); } /** @@ -1255,7 +1847,67 @@ static void reduxAppendLargestAxisToHwList(redux_ctx* ctx, * largest free axes are selected. 
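 *
 *        e.g. (hypothetical lengths) in the large code model, with flattened
 *        free axes of lengths (64, 5, 1000, 2), the largest free axes (1000,
 *        then 64, then 5, ...) are appended one at a time until MAX_HW_DIMS
 *        hardware axes have been chosen.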
*/ -static int reduxSelectHwAxes (redux_ctx* ctx){ +static int reduxSelectHwAxes (redux_ctx* ctx){ + int ret; + + ctx->srcAxisList = malloc(ctx->nds * sizeof(unsigned)); + ctx->dstDims = malloc(ctx->ndd * sizeof(size_t)); + if (!ctx->srcAxisList || + !ctx->dstDims ){ + return reduxCleanup(ctx, GA_MEMORY_ERROR); + } + + ctx->largeCodeModel = 1;/* BUG: Erase when small code model fixed. */ + /** + * *** IT IS NOW SAFE TO CALL: *** + * - reduxIsLargeModel() + * - reduxIsSmallModel() + * - reduxKernelRequiresDst() + * - reduxKernelRequiresDstArg() + */ + + + /** + * Allocate workspaces. + * + * Certain reductions may require a workspace that isn't provided by the user. + * For instance, **when using the small code model**, argmin/argmax require + * a dst buffer, but the user didn't supply one (as he would have for + * maxandargmax/minandargmin). We must allocate and deallocate it ourselves. + * + * Otherwise we use the user-supplied buffers. + */ + + if (!reduxRequiresDst (ctx) && reduxKernelRequiresDst(ctx)){ + ctx->wsDst = malloc(sizeof(*ctx->wsDst)); + if (!ctx->wsDst){ + return reduxCleanup(ctx, GA_MEMORY_ERROR); + } + + ret = GpuArray_empty(ctx->wsDst, ctx->gpuCtx, ctx->dstTypeCode, + ctx->ndd, ctx->dstDims, GA_C_ORDER); + if (ret != GA_NO_ERROR){ + return reduxCleanup(ctx, ret); + } + }else{ + ctx->wsDst = ctx->dst; + } + if (!reduxRequiresDstArg(ctx) && reduxKernelRequiresDstArg(ctx)){ + ctx->wsDstArg = malloc(sizeof(*ctx->wsDstArg)); + if (!ctx->wsDstArg){ + return reduxCleanup(ctx, GA_MEMORY_ERROR); + } + + ret = GpuArray_empty(ctx->wsDstArg, ctx->gpuCtx, ctx->dstArgTypeCode, + ctx->ndd, ctx->dstDims, GA_C_ORDER); + if (ret != GA_NO_ERROR){ + return reduxCleanup(ctx, ret); + } + }else{ + ctx->wsDstArg = ctx->dstArg; + } + + if (reduxIsLargeCodeModel(ctx)){ while (reduxCanAppendHwAxis (ctx, KERNEL_PRIMARY, AXIS_FREE)){ reduxAppendLargestAxisToHwList(ctx, KERNEL_PRIMARY, AXIS_FREE); @@ -1348,8 +2000,8 @@ static void reduxAppendTensorCallArgs (redux_ctx* ctx, static void reduxAppendMacroDefs (redux_ctx* ctx){ int i; - srcbAppends (&ctx->srcGen, "#define FOROVER(idx) for(i##idx = i##idx##Start; i##idx < i##idx##End; i##idx++)\n"); - srcbAppends (&ctx->srcGen, "#define ESCAPE(idx) if(i##idx >= i##idx##Dim){continue;}\n"); + srcbAppends (&ctx->srcGen, "#define FOROVER(idx) for (i##idx = i##idx##Start; i##idx < i##idx##End; i##idx++)\n"); + srcbAppends (&ctx->srcGen, "#define ESCAPE(idx) if (i##idx >= i##idx##Dim){continue;}\n"); /* srcVal indexer */ srcbAppends (&ctx->srcGen, "#define srcVal (*(const GLOBAL_MEM S*)("); @@ -1480,10 +2132,10 @@ static void reduxAppendPrototype (redux_ctx* ctx){ reduxAppendTensorDeclArgs(ctx, "S", "src"); srcbAppendElemf(&ctx->srcGen, "const GLOBAL_MEM X* srcSize"); srcbAppendElemf(&ctx->srcGen, "const GLOBAL_MEM X* chunkSize"); - if(reduxKernelRequiresDst(ctx)){ + if (reduxKernelRequiresDst(ctx)){ reduxAppendTensorDeclArgs(ctx, "T", "dst"); } - if(reduxKernelRequiresDstArg(ctx)){ + if (reduxKernelRequiresDstArg(ctx)){ reduxAppendTensorDeclArgs(ctx, "A", "dstArg"); } srcbEndList (&ctx->srcGen); @@ -1528,12 +2180,12 @@ static void reduxAppendRangeCalculations (redux_ctx* ctx){ for (i=0;inds;i++){ strb_appendf(&ctx->s, "\ti%dSStep = srcSteps[%d];\n", i, ctx->srcAxisList[i]); } - if(reduxKernelRequiresDst(ctx)){ + if (reduxKernelRequiresDst(ctx)){ for (i=0;indd;i++){ strb_appendf(&ctx->s, "\ti%dDStep = dstSteps[%d];\n", i, i); } } - if(reduxKernelRequiresDstArg(ctx)){ + if (reduxKernelRequiresDstArg(ctx)){ for (i=0;indd;i++){ strb_appendf(&ctx->s, "\ti%dAStep = 
dstArgSteps[%d];\n", i, i); } @@ -1623,14 +2275,14 @@ static void reduxAppendLoops (redux_ctx* ctx){ case GA_REDUCE_ARGMIN: case GA_REDUCE_MINANDARGMIN: srcbAppends(&ctx->srcGen, "\t\t\trdxK = min(rdxK, k);\n" - "\t\t\tif(rdxK == k){\n" + "\t\t\tif (rdxK == k){\n" "\t\t\t\trdxA = rdxIdx;\n" "\t\t\t}\n"); break; case GA_REDUCE_ARGMAX: case GA_REDUCE_MAXANDARGMAX: srcbAppends(&ctx->srcGen, "\t\t\trdxK = max(rdxK, k);\n" - "\t\t\tif(rdxK == k){\n" + "\t\t\tif (rdxK == k){\n" "\t\t\t\trdxA = rdxIdx;\n" "\t\t\t}\n"); break; @@ -2096,14 +2748,18 @@ static int reduxInvoke (redux_ctx* ctx){ * Cleanup */ -static int reduxCleanup (redux_ctx* ctx, int ret){ +static int reduxCleanup (redux_ctx* ctx, int ret){ if (ctx->dst != ctx->wsDst){ - GpuArray_clear(ctx->wsDst); + if(ctx->wsDst){ + GpuArray_clear(ctx->wsDst); + } free(ctx->wsDst); ctx->wsDst = NULL; } if (ctx->dstArg != ctx->wsDstArg){ - GpuArray_clear(ctx->wsDstArg); + if(ctx->wsDstArg){ + GpuArray_clear(ctx->wsDstArg); + } free(ctx->wsDstArg); ctx->wsDstArg = NULL; } @@ -2133,3 +2789,20 @@ static int reduxCleanup (redux_ctx* ctx, int ret){ return ret; } + +static int reduxCleanupMsg (redux_ctx* ctx, int ret, + const char* fmt, ...){ +#if DEBUG + FILE* fp = stderr; + + va_list ap; + va_start(ap, fmt); + vfprintf(fp, fmt, ap); + va_end(ap); + fflush(fp); +#else + (void)fmt; +#endif + + return reduxCleanup(ctx, ret); +} diff --git a/tests/check_reduction.c b/tests/check_reduction.c index 370f074167..18cf9e7615 100644 --- a/tests/check_reduction.c +++ b/tests/check_reduction.c @@ -113,7 +113,7 @@ START_TEST(test_maxandargmax_reduction){ ga_assert_ok(GpuArray_memset(&gaMax, -1)); /* 0xFFFFFFFF is a qNaN. */ ga_assert_ok(GpuArray_memset(&gaArgmax, -1)); - ga_assert_ok(GpuArray_maxandargmax(&gaMax, &gaArgmax, &gaSrc, 2, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_MAXANDARGMAX, &gaMax, &gaArgmax, &gaSrc, 2, reduxList)); ga_assert_ok(GpuArray_read(pMax, sizeof(*pMax) *dims[1], &gaMax)); ga_assert_ok(GpuArray_read(pArgmax, sizeof(*pArgmax)*dims[1], &gaArgmax)); @@ -205,7 +205,7 @@ START_TEST(test_maxandargmax_idxtranspose){ ga_assert_ok(GpuArray_memset(&gaMax, -1)); /* 0xFFFFFFFF is a qNaN. */ ga_assert_ok(GpuArray_memset(&gaArgmax, -1)); - ga_assert_ok(GpuArray_maxandargmax(&gaMax, &gaArgmax, &gaSrc, 2, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_MAXANDARGMAX, &gaMax, &gaArgmax, &gaSrc, 2, reduxList)); ga_assert_ok(GpuArray_read(pMax, sizeof(*pMax) *rdxProdDims, &gaMax)); ga_assert_ok(GpuArray_read(pArgmax, sizeof(*pArgmax)*rdxProdDims, &gaArgmax)); @@ -294,7 +294,7 @@ START_TEST(test_maxandargmax_veryhighrank){ ga_assert_ok(GpuArray_memset(&gaMax, -1)); /* 0xFFFFFFFF is a qNaN. */ ga_assert_ok(GpuArray_memset(&gaArgmax, -1)); - ga_assert_ok(GpuArray_maxandargmax(&gaMax, &gaArgmax, &gaSrc, 4, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_MAXANDARGMAX, &gaMax, &gaArgmax, &gaSrc, 4, reduxList)); ga_assert_ok(GpuArray_read(pMax, sizeof(*pMax) *rdxProdDims, &gaMax)); ga_assert_ok(GpuArray_read(pArgmax, sizeof(*pArgmax)*rdxProdDims, &gaArgmax)); @@ -393,7 +393,7 @@ START_TEST(test_maxandargmax_alldimsreduced){ ga_assert_ok(GpuArray_memset(&gaMax, -1)); /* 0xFFFFFFFF is a qNaN. 
*/ ga_assert_ok(GpuArray_memset(&gaArgmax, -1)); - ga_assert_ok(GpuArray_maxandargmax(&gaMax, &gaArgmax, &gaSrc, 3, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_MAXANDARGMAX, &gaMax, &gaArgmax, &gaSrc, 3, reduxList)); ga_assert_ok(GpuArray_read(pMax, sizeof(*pMax), &gaMax)); ga_assert_ok(GpuArray_read(pArgmax, sizeof(*pArgmax), &gaArgmax)); @@ -481,7 +481,7 @@ START_TEST(test_minandargmin_reduction){ ga_assert_ok(GpuArray_memset(&gaMin, -1)); /* 0xFFFFFFFF is a qNaN. */ ga_assert_ok(GpuArray_memset(&gaArgmin, -1)); - ga_assert_ok(GpuArray_minandargmin(&gaMin, &gaArgmin, &gaSrc, 2, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_MINANDARGMIN, &gaMin, &gaArgmin, &gaSrc, 2, reduxList)); ga_assert_ok(GpuArray_read(pMin, sizeof(*pMin) *dims[1], &gaMin)); ga_assert_ok(GpuArray_read(pArgmin, sizeof(*pArgmin)*dims[1], &gaArgmin)); @@ -570,7 +570,7 @@ START_TEST(test_minandargmin_veryhighrank){ ga_assert_ok(GpuArray_memset(&gaMin, -1)); /* 0xFFFFFFFF is a qNaN. */ ga_assert_ok(GpuArray_memset(&gaArgmin, -1)); - ga_assert_ok(GpuArray_minandargmin(&gaMin, &gaArgmin, &gaSrc, 4, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_MINANDARGMIN, &gaMin, &gaArgmin, &gaSrc, 4, reduxList)); ga_assert_ok(GpuArray_read(pMin, sizeof(*pMin) *rdxProdDims, &gaMin)); ga_assert_ok(GpuArray_read(pArgmin, sizeof(*pArgmin)*rdxProdDims, &gaArgmin)); @@ -669,7 +669,7 @@ START_TEST(test_minandargmin_alldimsreduced){ ga_assert_ok(GpuArray_memset(&gaMin, -1)); /* 0xFFFFFFFF is a qNaN. */ ga_assert_ok(GpuArray_memset(&gaArgmin, -1)); - ga_assert_ok(GpuArray_minandargmin(&gaMin, &gaArgmin, &gaSrc, 3, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_MINANDARGMIN, &gaMin, &gaArgmin, &gaSrc, 3, reduxList)); ga_assert_ok(GpuArray_read(pMin, sizeof(*pMin), &gaMin)); ga_assert_ok(GpuArray_read(pArgmin, sizeof(*pArgmin), &gaArgmin)); @@ -754,7 +754,7 @@ START_TEST(test_argmax_reduction){ ga_assert_ok(GpuArray_write(&gaSrc, pSrc, sizeof(*pSrc)*prodDims)); ga_assert_ok(GpuArray_memset(&gaArgmax, -1)); - ga_assert_ok(GpuArray_argmax(&gaArgmax, &gaSrc, 2, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_ARGMAX, NULL, &gaArgmax, &gaSrc, 2, reduxList)); ga_assert_ok(GpuArray_read(pArgmax, sizeof(*pArgmax)*dims[1], &gaArgmax)); @@ -836,7 +836,7 @@ START_TEST(test_argmax_veryhighrank){ ga_assert_ok(GpuArray_write(&gaSrc, pSrc, sizeof(*pSrc)*prodDims)); ga_assert_ok(GpuArray_memset(&gaArgmax, -1)); - ga_assert_ok(GpuArray_argmax(&gaArgmax, &gaSrc, 4, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_ARGMAX, NULL, &gaArgmax, &gaSrc, 4, reduxList)); ga_assert_ok(GpuArray_read(pArgmax, sizeof(*pArgmax)*rdxProdDims, &gaArgmax)); @@ -929,7 +929,7 @@ START_TEST(test_argmax_alldimsreduced){ ga_assert_ok(GpuArray_write(&gaSrc, pSrc, sizeof(*pSrc)*prodDims)); ga_assert_ok(GpuArray_memset(&gaArgmax, -1)); - ga_assert_ok(GpuArray_argmax(&gaArgmax, &gaSrc, 3, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_ARGMAX, NULL, &gaArgmax, &gaSrc, 3, reduxList)); ga_assert_ok(GpuArray_read(pArgmax, sizeof(*pArgmax), &gaArgmax)); @@ -1011,7 +1011,7 @@ START_TEST(test_argmin_reduction){ ga_assert_ok(GpuArray_write(&gaSrc, pSrc, sizeof(*pSrc)*prodDims)); ga_assert_ok(GpuArray_memset(&gaArgmin, -1)); - ga_assert_ok(GpuArray_argmin(&gaArgmin, &gaSrc, 2, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_ARGMIN, NULL, &gaArgmin, &gaSrc, 2, reduxList)); ga_assert_ok(GpuArray_read(pArgmin, sizeof(*pArgmin)*dims[1], &gaArgmin)); @@ -1093,7 +1093,7 @@ START_TEST(test_argmin_veryhighrank){ 
ga_assert_ok(GpuArray_write(&gaSrc, pSrc, sizeof(*pSrc)*prodDims)); ga_assert_ok(GpuArray_memset(&gaArgmin, -1)); - ga_assert_ok(GpuArray_argmin(&gaArgmin, &gaSrc, 4, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_ARGMIN, NULL, &gaArgmin, &gaSrc, 4, reduxList)); ga_assert_ok(GpuArray_read(pArgmin, sizeof(*pArgmin)*rdxProdDims, &gaArgmin)); @@ -1186,7 +1186,7 @@ START_TEST(test_argmin_alldimsreduced){ ga_assert_ok(GpuArray_write(&gaSrc, pSrc, sizeof(*pSrc)*prodDims)); ga_assert_ok(GpuArray_memset(&gaArgmin, -1)); - ga_assert_ok(GpuArray_argmin(&gaArgmin, &gaSrc, 3, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_ARGMIN, NULL, &gaArgmin, &gaSrc, 3, reduxList)); ga_assert_ok(GpuArray_read(pArgmin, sizeof(*pArgmin), &gaArgmin)); @@ -1265,7 +1265,7 @@ START_TEST(test_max_reduction){ ga_assert_ok(GpuArray_write(&gaSrc, pSrc, sizeof(*pSrc)*prodDims)); ga_assert_ok(GpuArray_memset(&gaMax, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_max(&gaMax, &gaSrc, 2, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_MAX, &gaMax, NULL, &gaSrc, 2, reduxList)); ga_assert_ok(GpuArray_read(pMax, sizeof(*pMax) *dims[1], &gaMax)); @@ -1343,7 +1343,7 @@ START_TEST(test_max_veryhighrank){ ga_assert_ok(GpuArray_write(&gaSrc, pSrc, sizeof(*pSrc)*prodDims)); ga_assert_ok(GpuArray_memset(&gaMax, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_max(&gaMax, &gaSrc, 4, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_MAX, &gaMax, NULL, &gaSrc, 4, reduxList)); ga_assert_ok(GpuArray_read(pMax, sizeof(*pMax) *rdxProdDims, &gaMax)); @@ -1431,7 +1431,7 @@ START_TEST(test_max_alldimsreduced){ ga_assert_ok(GpuArray_write(&gaSrc, pSrc, sizeof(*pSrc)*prodDims)); ga_assert_ok(GpuArray_memset(&gaMax, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_max(&gaMax, &gaSrc, 3, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_MAX, &gaMax, NULL, &gaSrc, 3, reduxList)); ga_assert_ok(GpuArray_read(pMax, sizeof(*pMax), &gaMax)); @@ -1507,7 +1507,7 @@ START_TEST(test_min_reduction){ ga_assert_ok(GpuArray_write(&gaSrc, pSrc, sizeof(*pSrc)*prodDims)); ga_assert_ok(GpuArray_memset(&gaMin, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_min(&gaMin, &gaSrc, 2, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_MIN, &gaMin, NULL, &gaSrc, 2, reduxList)); ga_assert_ok(GpuArray_read(pMin, sizeof(*pMin) *dims[1], &gaMin)); @@ -1585,7 +1585,7 @@ START_TEST(test_min_veryhighrank){ ga_assert_ok(GpuArray_write(&gaSrc, pSrc, sizeof(*pSrc)*prodDims)); ga_assert_ok(GpuArray_memset(&gaMin, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_min(&gaMin, &gaSrc, 4, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_MIN, &gaMin, NULL, &gaSrc, 4, reduxList)); ga_assert_ok(GpuArray_read(pMin, sizeof(*pMin) *rdxProdDims, &gaMin)); @@ -1673,7 +1673,7 @@ START_TEST(test_min_alldimsreduced){ ga_assert_ok(GpuArray_write(&gaSrc, pSrc, sizeof(*pSrc)*prodDims)); ga_assert_ok(GpuArray_memset(&gaMin, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_min(&gaMin, &gaSrc, 3, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_MIN, &gaMin, NULL, &gaSrc, 3, reduxList)); ga_assert_ok(GpuArray_read(pMin, sizeof(*pMin), &gaMin)); @@ -1750,7 +1750,7 @@ START_TEST(test_sum_reduction){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. 
*/ - ga_assert_ok(GpuArray_sum (&gaD, &gaS, 2, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_SUM, &gaD, NULL, &gaS, 2, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*dims[1], &gaD)); @@ -1826,7 +1826,7 @@ START_TEST(test_sum_veryhighrank){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_sum (&gaD, &gaS, 4, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_SUM, &gaD, NULL, &gaS, 4, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*rdxProdDims, &gaD)); @@ -1912,7 +1912,7 @@ START_TEST(test_sum_alldimsreduced){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_sum (&gaD, &gaS, 3, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_SUM, &gaD, NULL, &gaS, 3, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD), &gaD)); @@ -1986,7 +1986,7 @@ START_TEST(test_prod_reduction){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_prod (&gaD, &gaS, 2, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_PROD, &gaD, NULL, &gaS, 2, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*dims[1], &gaD)); @@ -2062,7 +2062,7 @@ START_TEST(test_prod_veryhighrank){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_prod (&gaD, &gaS, 4, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_PROD, &gaD, NULL, &gaS, 4, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*rdxProdDims, &gaD)); @@ -2148,7 +2148,7 @@ START_TEST(test_prod_alldimsreduced){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_prod (&gaD, &gaS, 3, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_PROD, &gaD, NULL, &gaS, 3, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD), &gaD)); @@ -2225,7 +2225,7 @@ START_TEST(test_prodnz_reduction){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_prodnz(&gaD, &gaS, 2, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_PRODNZ, &gaD, NULL, &gaS, 2, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*dims[1], &gaD)); @@ -2304,7 +2304,7 @@ START_TEST(test_prodnz_veryhighrank){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_prodnz(&gaD, &gaS, 4, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_PRODNZ, &gaD, NULL, &gaS, 4, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*rdxProdDims, &gaD)); @@ -2393,7 +2393,7 @@ START_TEST(test_prodnz_alldimsreduced){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. 
*/ - ga_assert_ok(GpuArray_prodnz(&gaD, &gaS, 3, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_PRODNZ, &gaD, NULL, &gaS, 3, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD), &gaD)); @@ -2475,7 +2475,7 @@ START_TEST(test_and_reduction){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); - ga_assert_ok(GpuArray_and (&gaD, &gaS, 2, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_AND, &gaD, NULL, &gaS, 2, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*dims[1], &gaD)); @@ -2559,7 +2559,7 @@ START_TEST(test_and_veryhighrank){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_and (&gaD, &gaS, 4, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_AND, &gaD, NULL, &gaS, 4, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*rdxProdDims, &gaD)); @@ -2653,7 +2653,7 @@ START_TEST(test_and_alldimsreduced){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_and (&gaD, &gaS, 3, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_AND, &gaD, NULL, &gaS, 3, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD), &gaD)); @@ -2735,7 +2735,7 @@ START_TEST(test_or_reduction){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); - ga_assert_ok(GpuArray_or (&gaD, &gaS, 2, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_OR, &gaD, NULL, &gaS, 2, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*dims[1], &gaD)); @@ -2819,7 +2819,7 @@ START_TEST(test_or_veryhighrank){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_or (&gaD, &gaS, 4, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_OR, &gaD, NULL, &gaS, 4, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*rdxProdDims, &gaD)); @@ -2913,7 +2913,7 @@ START_TEST(test_or_alldimsreduced){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_or (&gaD, &gaS, 3, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_OR, &gaD, NULL, &gaS, 3, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD), &gaD)); @@ -2991,7 +2991,7 @@ START_TEST(test_xor_reduction){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); - ga_assert_ok(GpuArray_xor (&gaD, &gaS, 2, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_XOR, &gaD, NULL, &gaS, 2, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*dims[1], &gaD)); @@ -3071,7 +3071,7 @@ START_TEST(test_xor_veryhighrank){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_xor (&gaD, &gaS, 4, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_XOR, &gaD, NULL, &gaS, 4, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*rdxProdDims, &gaD)); @@ -3161,7 +3161,7 @@ START_TEST(test_xor_alldimsreduced){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. 
*/ - ga_assert_ok(GpuArray_xor (&gaD, &gaS, 3, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_XOR, &gaD, NULL, &gaS, 3, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD), &gaD)); @@ -3239,7 +3239,7 @@ START_TEST(test_any_reduction){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); - ga_assert_ok(GpuArray_any (&gaD, &gaS, 2, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_ANY, &gaD, NULL, &gaS, 2, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*dims[1], &gaD)); @@ -3319,7 +3319,7 @@ START_TEST(test_any_veryhighrank){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_any (&gaD, &gaS, 4, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_ANY, &gaD, NULL, &gaS, 4, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*rdxProdDims, &gaD)); @@ -3409,7 +3409,7 @@ START_TEST(test_any_alldimsreduced){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_any (&gaD, &gaS, 3, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_ANY, &gaD, NULL, &gaS, 3, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD), &gaD)); @@ -3487,7 +3487,7 @@ START_TEST(test_all_reduction){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); - ga_assert_ok(GpuArray_all (&gaD, &gaS, 2, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_ALL, &gaD, NULL, &gaS, 2, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*dims[1], &gaD)); @@ -3567,7 +3567,7 @@ START_TEST(test_all_veryhighrank){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_all (&gaD, &gaS, 4, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_ALL, &gaD, NULL, &gaS, 4, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*rdxProdDims, &gaD)); @@ -3657,7 +3657,7 @@ START_TEST(test_all_alldimsreduced){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. 
*/ - ga_assert_ok(GpuArray_all (&gaD, &gaS, 3, reduxList)); + ga_assert_ok(GpuArray_reduction(GA_REDUCE_ALL, &gaD, NULL, &gaS, 3, reduxList)); ga_assert_ok(GpuArray_read (pD, sizeof(*pD), &gaD)); From 1a2df8dc54926d161d13b9813fbc0349302b3f5d Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Tue, 13 Jun 2017 17:06:02 -0400 Subject: [PATCH 12/34] Current State --- src/gpuarray_reduction.c | 167 +++++++++++++++++++++++++++------------ 1 file changed, 118 insertions(+), 49 deletions(-) diff --git a/src/gpuarray_reduction.c b/src/gpuarray_reduction.c index 072f1e2685..1c1721ee4f 100644 --- a/src/gpuarray_reduction.c +++ b/src/gpuarray_reduction.c @@ -41,10 +41,7 @@ struct axis_desc{ int reduxNum; unsigned isReduced : 1; - unsigned isHW : 1; - unsigned isSW : 1; - size_t warpLen; - size_t len; + size_t len, warpLen, sliceLen; ssize_t srcStride, srcOffset; ssize_t dstStride, dstOffset; ssize_t dstArgStride, dstArgOffset; @@ -392,6 +389,7 @@ static int axisIsPartialWarp (const axis_desc* axis); /* Reduction Context API */ /* Utilities */ +static size_t reduxEstimateParallelism (const redux_ctx* ctx); static int reduxRequiresDst (const redux_ctx* ctx); static int reduxRequiresDstArg (const redux_ctx* ctx); static int reduxKernelRequiresDst (const redux_ctx* ctx); @@ -417,6 +415,8 @@ static int reduxInferProperties (redux_ctx* ctx); static int reduxFlattenSource (redux_ctx* ctx); static int reduxSelectWarpAxes (redux_ctx* ctx); static int reduxSelectNumStages (redux_ctx* ctx); +static int reduxPlan1Stage (redux_ctx* ctx); +static int reduxPlan2Stage (redux_ctx* ctx); static int reduxSelectHwAxes (redux_ctx* ctx); static int reduxComputeAxisList (redux_ctx* ctx); static int reduxGenSource (redux_ctx* ctx); @@ -1010,6 +1010,28 @@ static int axisIsPartialWarp (const axis_desc* axis){ return axis->warpLen > 0 && axis->warpLen != axis->len; } +/** + * @brief Estimate the level of parallelism in the device. + * + * This is a rough target number of threads. It would definitely fill the + * device, plus some substantial margin. + */ + +static size_t reduxEstimateParallelism (const redux_ctx* ctx){ + /** + * An arbitrary margin factor ensuring there will be a few thread blocks + * per SMX. + * + * E.g. on Kepler, each SMX can handle up to two 1024-thread blocks + * simultaneously, so a margin of 6/SMX should ensure with very high + * likelyhood that all SMXes will be fed and kept busy. + */ + + size_t marginFactor = 6; + + return marginFactor*ctx->numProcs*ctx->maxLg; +} + /** * @brief Returns whether the reduction interface requires a dst argument. */ @@ -1582,10 +1604,10 @@ static int reduxInferProperties (redux_ctx* ctx){ * the reduction axes. */ - ctx->xdSrc = calloc(ctx->nds, sizeof(*ctx->xdSrc)); - ctx->xdSrcPtrs = calloc(ctx->nds, sizeof(*ctx->xdSrcPtrs)); - ctx->xdSrcFlat = calloc(ctx->nds, sizeof(*ctx->xdSrcFlat)); - ctx->xdTmp = calloc(ctx->ndt, sizeof(*ctx->xdTmp)); + ctx->xdSrc = calloc(ctx->nds, sizeof(*ctx->xdSrc)); + ctx->xdSrcPtrs = calloc(ctx->nds+1, sizeof(*ctx->xdSrcPtrs)); + ctx->xdSrcFlat = calloc(ctx->nds+1, sizeof(*ctx->xdSrcFlat)); + ctx->xdTmp = calloc(ctx->ndt, sizeof(*ctx->xdTmp)); if (!ctx->xdSrc || !ctx->xdSrcPtrs || !ctx->xdSrcFlat || !ctx->xdTmp){ return reduxCleanup(ctx, GA_MEMORY_ERROR); } @@ -1814,15 +1836,62 @@ static int reduxSelectWarpAxes (redux_ctx* ctx){ */ static int reduxSelectNumStages (redux_ctx* ctx){ - size_t parallelism = 2 * ctx->numProcs * ctx->maxLg; + size_t parallelism = reduxEstimateParallelism(ctx); - if(ctx->zeroRdxAxes || /* Reduction is empty? 
*/ - ctx->prodFreeAxes > ctx->prodRdxAxes || /* Large # of destination elements? */ - ctx->prodFreeAxes > parallelism ){ /* # of destination elements large enough to fill available parallelism? */ - ctx->numStages = 1; + if (ctx->zeroRdxAxes || /* Reduction over 0 elements? */ + ctx->prodAllAxes <= ctx->maxLg || /* Reduction over few elements? */ + ctx->prodFreeAxes >= ctx->prodRdxAxes || /* More destinations than reductions? */ + ctx->prodFreeAxes >= parallelism ){ /* Destination very large? */ + return reduxPlan1Stage(ctx); }else{ - ctx->numStages = 2; + return reduxPlan2Stage(ctx); } +} + +/** + * @brief Plan a 1-stage reduction. + * + * Inputs: ctx->xdSrcFlat[0...ctx->ndf-1] + * + * This plan involves a direct write to the destinations, and does not require + * working space. + * + * Because the reduction is deterministic, all reductions required for any + * destination element must be performed within a single thread block. + * + * In this implementation we choose to perform only intra-warp reductions, + * insulating ourselves from having to worry about the interplay between block + * size and kernel source code (A kernel's max block size is limited by + * numerous factors including its own source code, but the specific kernel we + * pick and generate requires foreknowledge of its block size. Chicken or egg). + */ + +static int reduxPlan1Stage (redux_ctx* ctx){ + ctx->numStages = 1; + + + + return reduxSelectHwAxes(ctx); +} + +/** + * @brief Plan a 2-stage reduction. + * + * Inputs: ctx->xdSrcFlat[0...ctx->ndf-1] + * + * This plan involves splitting the reduction into two stages: + * + * Stage 1: A reduction by approximately R = sqrt(prodRdxAxes) elements per + * destination elements into allocated temporary workspace(s) + * of approximate size dst.shape + (prodRdxAxes/R,) + * Stage 2: A reduction by approximately prodRdxAxes/R elements into the + * final destination. + */ + +static int reduxPlan2Stage (redux_ctx* ctx){ + ctx->numStages = 2; + + /* NOTE: Use gpuarray_get_elsize(typecode) */ return reduxSelectHwAxes(ctx); } @@ -1941,7 +2010,7 @@ static int reduxSelectHwAxes (redux_ctx* ctx){ * loops that iterate over the dimensions of elements that are to be reduced. */ -static int reduxComputeAxisList (redux_ctx* ctx){ +static int reduxComputeAxisList (redux_ctx* ctx){ int i, f=0; for (i=0;inds;i++){ @@ -1961,7 +2030,7 @@ static int reduxComputeAxisList (redux_ctx* ctx){ * @return GA_MEMORY_ERROR if not enough memory left; GA_NO_ERROR otherwise. 
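 *
 * As a hedged illustration only (not verbatim generator output; the concrete
 * types, initial values and loop body depend on ctx->op and on the tensor
 * types, and float/long/ssize_t below are placeholders), the source
 * assembled by the reduxAppend*() helpers below is shaped roughly like this
 * for a single-stage max-and-argmax reduction:
 *
 *     #include "cluda.h"
 *     typedef float   S;   // source type
 *     typedef float   T;   // destination type
 *     typedef long    A;   // destination argument (index) type
 *     typedef ssize_t X;   // index/stride type
 *     typedef float   K;   // accumulator type
 *     #define FOROVER(idx) for (i##idx = i##idx##Start; i##idx < i##idx##End; i##idx++)
 *     #define ESCAPE(idx)  if (i##idx >= i##idx##Dim){continue;}
 *     // ... srcVal, dstVal, dstArgVal and rdxIdx indexer macros ...
 *     KERNEL void reduxKer(S* srcPtr,    const X srcOff,    const GLOBAL_MEM X* srcSteps,
 *                          const GLOBAL_MEM X* srcSize,
 *                          const GLOBAL_MEM X* chunkSize,
 *                          T* dstPtr,    const X dstOff,    const GLOBAL_MEM X* dstSteps,
 *                          A* dstArgPtr, const X dstArgOff, const GLOBAL_MEM X* dstArgSteps){
 *         // FOROVER() loops over the free axes; the inner loops over the
 *         // reduced axes fold each srcVal into register accumulators
 *         // (rdxK, plus rdxA for the argument index), then a writeback
 *         // helper stores the result through dstVal/dstArgVal.
 *     }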
*/ -static int reduxGenSource (redux_ctx* ctx){ +static int reduxGenSource (redux_ctx* ctx){ reduxAppendSource(ctx); ctx->sourceCodeLen = ctx->s.l; ctx->sourceCode = strb_cstr(&ctx->s); @@ -1971,7 +2040,7 @@ static int reduxGenSource (redux_ctx* ctx){ return reduxCompile(ctx); } -static void reduxAppendSource (redux_ctx* ctx){ +static void reduxAppendSource (redux_ctx* ctx){ reduxAppendIncludes (ctx); reduxAppendMacroDefs (ctx); reduxAppendTypedefs (ctx); @@ -1983,21 +2052,21 @@ static void reduxAppendSource (redux_ctx* ctx){ reduxAppendPostKernel (ctx); } } -static void reduxAppendTensorDeclArgs (redux_ctx* ctx, - const char* type, - const char* baseName){ +static void reduxAppendTensorDeclArgs (redux_ctx* ctx, + const char* type, + const char* baseName){ srcbAppendElemf(&ctx->srcGen, "%s* %sPtr", type, baseName); srcbAppendElemf(&ctx->srcGen, "const X %sOff", baseName); srcbAppendElemf(&ctx->srcGen, "const GLOBAL_MEM X* %sSteps", baseName); (void)reduxAppendTensorCallArgs;/* Silence unused warning */ } -static void reduxAppendTensorCallArgs (redux_ctx* ctx, - const char* baseName){ +static void reduxAppendTensorCallArgs (redux_ctx* ctx, + const char* baseName){ srcbAppendElemf(&ctx->srcGen, "%sPtr", baseName); srcbAppendElemf(&ctx->srcGen, "%sOff", baseName); srcbAppendElemf(&ctx->srcGen, "%sSteps", baseName); } -static void reduxAppendMacroDefs (redux_ctx* ctx){ +static void reduxAppendMacroDefs (redux_ctx* ctx){ int i; srcbAppends (&ctx->srcGen, "#define FOROVER(idx) for (i##idx = i##idx##Start; i##idx < i##idx##End; i##idx++)\n"); @@ -2049,21 +2118,21 @@ static void reduxAppendMacroDefs (redux_ctx* ctx){ srcbEndList (&ctx->srcGen); srcbAppends (&ctx->srcGen, ")\n"); } -static void reduxAppendIncludes (redux_ctx* ctx){ +static void reduxAppendIncludes (redux_ctx* ctx){ strb_appends(&ctx->s, "/* Includes */\n"); strb_appends(&ctx->s, "#include \"cluda.h\"\n"); strb_appends(&ctx->s, "\n"); strb_appends(&ctx->s, "\n"); strb_appends(&ctx->s, "\n"); } -static void reduxAppendTypedefs (redux_ctx* ctx){ +static void reduxAppendTypedefs (redux_ctx* ctx){ strb_appendf(&ctx->s, "typedef %s S;\n", ctx->srcTypeStr); /* The type of the source array. */ strb_appendf(&ctx->s, "typedef %s T;\n", ctx->dstTypeStr); /* The type of the destination array. */ strb_appendf(&ctx->s, "typedef %s A;\n", ctx->dstArgTypeStr);/* The type of the destination argument array. */ strb_appendf(&ctx->s, "typedef %s X;\n", ctx->idxTypeStr); /* The type of the indices: signed 32/64-bit. */ strb_appendf(&ctx->s, "typedef %s K;\n", ctx->accTypeStr); /* The type of the accumulator variable. */ } -static void reduxAppendGetInitValFns (redux_ctx* ctx){ +static void reduxAppendGetInitValFns (redux_ctx* ctx){ /** * Initial value functions. */ @@ -2075,7 +2144,7 @@ static void reduxAppendGetInitValFns (redux_ctx* ctx){ "\treturn (%s);\n" "}\n\n\n\n", ctx->initValT, ctx->initValK); } -static void reduxAppendWriteBackFn (redux_ctx* ctx){ +static void reduxAppendWriteBackFn (redux_ctx* ctx){ /** * Global memory value reduction function. * @@ -2118,7 +2187,7 @@ static void reduxAppendWriteBackFn (redux_ctx* ctx){ /* Close off function. 
*/ strb_appends(&ctx->s, "}\n\n\n\n"); } -static void reduxAppendReduxKernel (redux_ctx* ctx){ +static void reduxAppendReduxKernel (redux_ctx* ctx){ reduxAppendPrototype (ctx); strb_appends (&ctx->s, "{\n"); reduxAppendIndexDeclarations(ctx); @@ -2126,7 +2195,7 @@ static void reduxAppendReduxKernel (redux_ctx* ctx){ reduxAppendLoops (ctx); strb_appends (&ctx->s, "}\n"); } -static void reduxAppendPrototype (redux_ctx* ctx){ +static void reduxAppendPrototype (redux_ctx* ctx){ srcbAppends (&ctx->srcGen, "KERNEL void reduxKer("); srcbBeginList (&ctx->srcGen, ", ", "void"); reduxAppendTensorDeclArgs(ctx, "S", "src"); @@ -2141,7 +2210,7 @@ static void reduxAppendPrototype (redux_ctx* ctx){ srcbEndList (&ctx->srcGen); srcbAppends (&ctx->srcGen, ")"); } -static void reduxAppendIndexDeclarations (redux_ctx* ctx){ +static void reduxAppendIndexDeclarations (redux_ctx* ctx){ int i; strb_appends(&ctx->s, "\t/* GPU kernel coordinates. Always 3D in OpenCL/CUDA. */\n"); @@ -2168,7 +2237,7 @@ static void reduxAppendIndexDeclarations (redux_ctx* ctx){ if (ctx->nds > ctx->ndd){appendIdxes (&ctx->s, "\tX ", "i", ctx->ndd, ctx->nds, "PDim", ";\n");} strb_appends(&ctx->s, "\t\n\t\n"); } -static void reduxAppendRangeCalculations (redux_ctx* ctx){ +static void reduxAppendRangeCalculations (redux_ctx* ctx){ size_t hwDim; int i; @@ -2229,7 +2298,7 @@ static void reduxAppendRangeCalculations (redux_ctx* ctx){ strb_appends(&ctx->s, "\t\n\t\n"); } -static void reduxAppendLoops (redux_ctx* ctx){ +static void reduxAppendLoops (redux_ctx* ctx){ int i; for (i=0;indd;i++){ @@ -2333,10 +2402,10 @@ static void reduxAppendLoops (redux_ctx* ctx){ srcbAppends(&ctx->srcGen, "\t}\n"); } } -static void reduxAppendInitKernel (redux_ctx* ctx){ +static void reduxAppendInitKernel (redux_ctx* ctx){ /* BUG: Implement this for small code model. */ } -static void reduxAppendPostKernel (redux_ctx* ctx){ +static void reduxAppendPostKernel (redux_ctx* ctx){ /* BUG: Implement this for small code model. */ } @@ -2344,7 +2413,7 @@ static void reduxAppendPostKernel (redux_ctx* ctx){ * @brief Compile the kernel from source code. */ -static int reduxCompile (redux_ctx* ctx){ +static int reduxCompile (redux_ctx* ctx){ int ret, i = 0; int PRI_TYPECODES[11]; size_t PRI_TYPECODES_LEN; @@ -2432,7 +2501,7 @@ static int reduxCompile (redux_ctx* ctx){ * for the primary/auxilliary kernels. */ -static int reduxSchedule (redux_ctx* ctx){ +static int reduxSchedule (redux_ctx* ctx){ int i, priNdims = 0, auxNdims = 0; uint64_t maxLgRdx = 0, maxLgPre = 0, maxLgPost = 0; uint64_t maxLgPri = 0, maxLgAux = 0; @@ -2539,16 +2608,16 @@ static int reduxSchedule (redux_ctx* ctx){ * anything to do with the integer factorization APIs. */ -static void reduxScheduleKernel (int ndims, - uint64_t* dims, - uint64_t warpSize, - uint64_t maxLg, - uint64_t* maxLs, - uint64_t maxGg, - uint64_t* maxGs, - uint64_t* bs, - uint64_t* gs, - uint64_t* cs){ +static void reduxScheduleKernel (int ndims, + uint64_t* dims, + uint64_t warpSize, + uint64_t maxLg, + uint64_t* maxLs, + uint64_t maxGg, + uint64_t* maxGs, + uint64_t* bs, + uint64_t* gs, + uint64_t* cs){ uint64_t warpMod, bestWarpMod = 1; int i, bestWarpAxis = 0; uint64_t roundedDims[MAX_HW_DIMS]; @@ -2634,7 +2703,7 @@ static void reduxScheduleKernel (int ndims, * Invoke the kernel. 
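 *
 * An illustrative note on the argument marshalling (an inference from the
 * prototype built by reduxAppendPrototype() above, not a quote of the code
 * below): when the kernel needs both dst and dstArg, reduxKer takes 11
 * arguments, which is why priArgs is sized [11]. The expected order is:
 *
 *     priArgs[0..2]  = srcPtr, srcOff, srcSteps
 *     priArgs[3]     = srcSize
 *     priArgs[4]     = chunkSize
 *     priArgs[5..7]  = dstPtr, dstOff, dstSteps          (only if dst is needed)
 *     priArgs[8..10] = dstArgPtr, dstArgOff, dstArgSteps (only if dstArg is needed)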
*/ -static int reduxInvoke (redux_ctx* ctx){ +static int reduxInvoke (redux_ctx* ctx){ void* priArgs[11]; void* auxArgs[ 8]; int ret, i = 0; @@ -2790,8 +2859,8 @@ static int reduxCleanup (redux_ctx* ctx, int ret){ return ret; } -static int reduxCleanupMsg (redux_ctx* ctx, int ret, - const char* fmt, ...){ +static int reduxCleanupMsg (redux_ctx* ctx, int ret, + const char* fmt, ...){ #if DEBUG FILE* fp = stderr; From fffd323e8efa735a69fb36ddc2037d77efae9c9b Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Wed, 14 Jun 2017 11:07:12 -0400 Subject: [PATCH 13/34] Remove warp axis select. --- src/gpuarray_reduction.c | 129 --------------------------------------- 1 file changed, 129 deletions(-) diff --git a/src/gpuarray_reduction.c b/src/gpuarray_reduction.c index 1c1721ee4f..123c059964 100644 --- a/src/gpuarray_reduction.c +++ b/src/gpuarray_reduction.c @@ -250,8 +250,6 @@ struct redux_ctx{ int nds; /* # Source dimensions */ int ndr; /* # Reduced dimensions */ int ndd; /* # Destination dimensions */ - int ndw; /* # Warp dimensions */ - int ndp; /* # Partial warp dimensions */ int ndf; /* # Flattened source dimensions */ int ndt; /* # Temporary workspace dimensions */ int zeroAllAxes; /* # of zero-length axes in source tensor */ @@ -259,8 +257,6 @@ struct redux_ctx{ size_t prodAllAxes; /* Product of length of all axes in source tensor */ size_t prodRdxAxes; /* Product of length of all reduction axes in source tensor */ size_t prodFreeAxes; /* Product of length of all free axes in source tensor */ - size_t prodWarpAxes; /* Number of active threads per warp. Strictly <= warpSize. */ - int splitWarpAxis;/* Index of the split warp axis within the source tensor's shape; -1 otherwise. */ gpucontext* gpuCtx; unsigned numProcs; @@ -353,7 +349,6 @@ static int reduxGetAndInit (int typecode, const char** prop static int reduxGetOrInit (int typecode, const char** property); static int reduxSortFlatSensitive (const void* a, const void* b); static int reduxSortFlatInsensitive (const void* a, const void* b); -static int reduxSortWarp (const void* a, const void* b); static int axisInSet (int v, const int* set, size_t setLen, @@ -371,7 +366,6 @@ static void axisInit (axis_desc* axis, ssize_t len, ssize_t srcStride); static void axisMarkReduced (axis_desc* axis, int reduxNum); -static void axisMarkWarp (axis_desc* axis, size_t partialSlice); static int axisGetReduxNum (const axis_desc* axis); static size_t axisGetLen (const axis_desc* axis); static ssize_t axisGetSrcStride (const axis_desc* axis); @@ -384,8 +378,6 @@ static ssize_t axisGetDstArgStride (const axis_desc* axis); static size_t axisGetDstArgAbsStride (const axis_desc* axis); static ssize_t axisGetDstArgOffset (const axis_desc* axis); static int axisIsReduced (const axis_desc* axis); -static int axisIsWarp (const axis_desc* axis); -static int axisIsPartialWarp (const axis_desc* axis); /* Reduction Context API */ /* Utilities */ @@ -413,7 +405,6 @@ static void reduxAppendLargestAxisToHwList(redux_ctx* ctx, static int reduxInit (redux_ctx* ctx); static int reduxInferProperties (redux_ctx* ctx); static int reduxFlattenSource (redux_ctx* ctx); -static int reduxSelectWarpAxes (redux_ctx* ctx); static int reduxSelectNumStages (redux_ctx* ctx); static int reduxPlan1Stage (redux_ctx* ctx); static int reduxPlan2Stage (redux_ctx* ctx); @@ -808,47 +799,6 @@ static int reduxSortFlatSensitive (const void* a, const void* b){ } } -/** - * @brief Sort axes in preferred order for integration into warp. 
- * - * The axes with stride != 0 are sorted by lowest absolute - * stride. Picking the few axes with the lowest absolute stride (while - * keeping the product of their dimensions <= warpSize) should maximize - * memory bandwidth of the warp. - * - * The restriction stride != 0 is intended to avoid waste of memory - * bandwidth. Once a memory transaction is necessary, it typically operates at - * far greater granularity than just 32 bits (4 bytes). - * - * Sorting by absolute stride should result, in the case of a packed tensor, in - * the memory accesses being close to perfectly contiguous. - */ - -static int reduxSortWarp (const void* a, const void* b){ - const axis_desc* xda = *(const axis_desc* const *)a; - const axis_desc* xdb = *(const axis_desc* const *)b; - - if ( axisGetSrcStride(xda) && !axisGetSrcStride(xdb)){ - return -1; - }else if (!axisGetSrcStride(xda) && axisGetSrcStride(xdb)){ - return +1; - } - - if (axisGetSrcAbsStride(xda) < axisGetSrcAbsStride(xdb)){ - return -1; - }else if (axisGetSrcAbsStride(xda) > axisGetSrcAbsStride(xdb)){ - return +1; - } - - if ( axisIsReduced(xda) && !axisIsReduced(xdb)){ - return -1; - }else if (!axisIsReduced(xda) && axisIsReduced(xdb)){ - return +1; - } - - return 0; -} - /** * @brief Check whether axis numbered v is already in the given set of axes. * @@ -952,14 +902,6 @@ static void axisMarkReduced (axis_desc* axis, int r axis->reduxNum = reduxNum; } -/** - * @brief Mark axis as warp axis. - */ - -static void axisMarkWarp (axis_desc* axis, size_t warpLen){ - axis->warpLen = warpLen; -} - /** * @brief Get properties of an axis. */ @@ -1003,12 +945,6 @@ static ssize_t axisGetDstArgOffset (const axis_desc* axis){ static int axisIsReduced (const axis_desc* axis){ return axis->isReduced; } -static int axisIsWarp (const axis_desc* axis){ - return !!axis->warpLen; -} -static int axisIsPartialWarp (const axis_desc* axis){ - return axis->warpLen > 0 && axis->warpLen != axis->len; -} /** * @brief Estimate the level of parallelism in the device. @@ -1367,9 +1303,7 @@ static int reduxInit (redux_ctx* ctx){ ctx->errorString1 = NULL; ctx->errorString2 = NULL; - ctx->splitWarpAxis = -1; ctx->numStages = 1; - ctx->prodWarpAxes = 1; ctx->prodAllAxes = ctx->prodRdxAxes = ctx->prodFreeAxes = 1; strb_init(&ctx->s); srcbInit (&ctx->srcGen, &ctx->s); @@ -1434,8 +1368,6 @@ static int reduxInferProperties (redux_ctx* ctx){ ctx->nds = ctx->src->nd; ctx->ndr = ctx->reduxLen; ctx->ndd = ctx->nds - ctx->ndr; - ctx->ndw = 0; - ctx->ndp = 0; ctx->ndf = 0; ctx->ndt = ctx->ndd + 1; @@ -1749,18 +1681,6 @@ static int reduxFlattenSource (redux_ctx* ctx){ } ctx->ndf = j; - return reduxSelectWarpAxes(ctx); -} - -/** - * @brief Select the warp axes in such a way as to maximize memory bandwidth. - */ - -static int reduxSelectWarpAxes (redux_ctx* ctx){ - axis_desc* a; - int i; - size_t aL; - /** * NOTE: At this point it is possible for there to be no axes @@ -1777,55 +1697,6 @@ static int reduxSelectWarpAxes (redux_ctx* ctx){ ctx->ndf = 1; } - - /** - * Select Warp Axes. - * - * Using a particular heuristic order (*), sort the axis list by - * suitability for belonging to the warp. Then, pick the first few axes, - * until the product of their lengths exceeds the warp size. - * - * (*) See documentation of value-comparison function. 
- */ - - for(i=0;indf;i++){ - ctx->xdSrcPtrs[i] = reduxGetSrcFlatAxis(ctx, i); - } - - qsort(ctx->xdSrcPtrs, ctx->ndf, sizeof(*ctx->xdSrcPtrs), reduxSortWarp); - - for (i=0;indf;i++){ - a = reduxGetSrcSortAxis(ctx, i); - aL = axisGetLen(a); - if (aL <= 1){break;} - - ctx->prodWarpAxes *= aL; - if (ctx->prodWarpAxes <= ctx->warpSize){ - axisMarkWarp(a, aL); - ctx->ndw++; - }else{ - /** - * The product of warp lengths just exceeded warpSize. We backtrack - * by undoing the multiplication by aL. We then check whether we - * can "split" this axis by extracting at least a factor of 2 into - * warpLen. If yes, we mark is as the (only) warp axis that is - * split by setting its warpLen to something neither 0 nor len. - */ - - ctx->prodWarpAxes /= aL; - aL = ctx->warpSize/ctx->prodWarpAxes; - if (aL >= 2){ - axisMarkWarp(a, aL); - ctx->prodWarpAxes *= aL; - ctx->splitWarpAxis = i; - ctx->ndw++; - ctx->ndp++; - } - break; - } - } - - return reduxSelectNumStages(ctx); } From 1cfe552d8e71c447529de0603d865e11dd118a37 Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Wed, 14 Jun 2017 16:11:19 -0400 Subject: [PATCH 14/34] Massive cleanup. --- src/gpuarray_reduction.c | 975 +++++++++++++++++---------------------- 1 file changed, 418 insertions(+), 557 deletions(-) diff --git a/src/gpuarray_reduction.c b/src/gpuarray_reduction.c index 123c059964..24243f78ca 100644 --- a/src/gpuarray_reduction.c +++ b/src/gpuarray_reduction.c @@ -41,7 +41,8 @@ struct axis_desc{ int reduxNum; unsigned isReduced : 1; - size_t len, warpLen, sliceLen; + int hwAxisStage0, hwAxisStage1; + size_t len, tmpLen, sliceLen; ssize_t srcStride, srcOffset; ssize_t dstStride, dstOffset; ssize_t dstArgStride, dstArgOffset; @@ -250,7 +251,9 @@ struct redux_ctx{ int nds; /* # Source dimensions */ int ndr; /* # Reduced dimensions */ int ndd; /* # Destination dimensions */ - int ndf; /* # Flattened source dimensions */ + int ndfs; /* # Flattened source dimensions */ + int ndfr; /* # Flattened source dimensions */ + int ndfd; /* # Flattened source dimensions */ int ndt; /* # Temporary workspace dimensions */ int zeroAllAxes; /* # of zero-length axes in source tensor */ int zeroRdxAxes; /* # of zero-length reduction axes in source tensor */ @@ -258,6 +261,7 @@ struct redux_ctx{ size_t prodRdxAxes; /* Product of length of all reduction axes in source tensor */ size_t prodFreeAxes; /* Product of length of all free axes in source tensor */ + /* GPU Context & Device */ gpucontext* gpuCtx; unsigned numProcs; size_t warpSize; @@ -266,18 +270,33 @@ struct redux_ctx{ size_t maxGg; size_t maxGs[MAX_HW_DIMS]; + /* Flattening */ axis_desc* xdSrc; axis_desc* xdSrcFlat; - axis_desc* xdTmp; - axis_desc** xdSrcPtrs; + size_t* flatSrcDimensions; + ssize_t* flatSrcStrides; + gpudata* flatSrcData; + ssize_t flatSrcOffset; + ssize_t* flatDstStrides; + gpudata* flatDstData; + ssize_t flatDstOffset; + ssize_t* flatDstArgStrides; + gpudata* flatDstArgData; + ssize_t flatDstArgOffset; + + /* Select number of stages */ int numStages; - GpuArray* wsDst; - GpuArray* wsDstArg; - int* srcAxisList; - size_t* dstDims; + /* Workspaces, in the case of 2-stage reduction */ + size_t* tmpSrcDimensions; + ssize_t* tmpDstStrides; + gpudata* tmpDstData; + ssize_t tmpDstOffset; + ssize_t* tmpDstArgStrides; + gpudata* tmpDstArgData; + ssize_t tmpDstArgOffset; /* Source code Generator. 
*/ int srcTypeCode; @@ -292,14 +311,12 @@ struct redux_ctx{ const char* accTypeStr; const char* initValT; const char* initValK; - int largeCodeModel; strb s; srcb srcGen; char* sourceCode; size_t sourceCodeLen; char* errorString0; char* errorString1; - char* errorString2; GpuKernel preKernel; GpuKernel kernel; GpuKernel postKernel; @@ -319,14 +336,14 @@ struct redux_ctx{ struct{ int ndh; + int ndhp; int ndhd; int ndhr; - int axisList [MAX_HW_DIMS]; size_t bs [MAX_HW_DIMS]; size_t gs [MAX_HW_DIMS]; size_t cs [MAX_HW_DIMS]; gpudata* chunkSizeGD; - } pri, aux; + } st1, st2; /* Invoker */ gpudata* srcStepsGD; @@ -349,10 +366,8 @@ static int reduxGetAndInit (int typecode, const char** prop static int reduxGetOrInit (int typecode, const char** property); static int reduxSortFlatSensitive (const void* a, const void* b); static int reduxSortFlatInsensitive (const void* a, const void* b); -static int axisInSet (int v, - const int* set, - size_t setLen, - size_t* where); +static int reduxSortPlan1Stage (const void* a, const void* b); +static int reduxSortPlan2Stage0 (const void* a, const void* b); static void appendIdxes (strb* s, const char* prologue, const char* prefix, @@ -362,22 +377,24 @@ static void appendIdxes (strb* s, const char* epilogue); /* Axis Description API */ -static void axisInit (axis_desc* axis, - ssize_t len, - ssize_t srcStride); -static void axisMarkReduced (axis_desc* axis, int reduxNum); -static int axisGetReduxNum (const axis_desc* axis); -static size_t axisGetLen (const axis_desc* axis); -static ssize_t axisGetSrcStride (const axis_desc* axis); -static size_t axisGetSrcAbsStride (const axis_desc* axis); -static ssize_t axisGetSrcOffset (const axis_desc* axis); -static ssize_t axisGetDstStride (const axis_desc* axis); -static size_t axisGetDstAbsStride (const axis_desc* axis); -static ssize_t axisGetDstOffset (const axis_desc* axis); -static ssize_t axisGetDstArgStride (const axis_desc* axis); -static size_t axisGetDstArgAbsStride (const axis_desc* axis); -static ssize_t axisGetDstArgOffset (const axis_desc* axis); -static int axisIsReduced (const axis_desc* axis); +static void axisInit (axis_desc* axis, + ssize_t len, + ssize_t srcStride); +static void axisMarkReduced (axis_desc* axis, int reduxNum); +static int axisGetReduxNum (const axis_desc* axis); +static size_t axisGetLen (const axis_desc* axis); +static ssize_t axisGetSrcStride (const axis_desc* axis); +static size_t axisGetSrcAbsStride (const axis_desc* axis); +static ssize_t axisGetSrcOffset (const axis_desc* axis); +static ssize_t axisGetDstStride (const axis_desc* axis); +static size_t axisGetDstAbsStride (const axis_desc* axis); +static ssize_t axisGetDstOffset (const axis_desc* axis); +static ssize_t axisGetDstArgStride (const axis_desc* axis); +static size_t axisGetDstArgAbsStride (const axis_desc* axis); +static ssize_t axisGetDstArgOffset (const axis_desc* axis); +static int axisIsReduced (const axis_desc* axis); +static int axisIsHW (const axis_desc* axis, int stage); +static int axisGetHWAxisNum (const axis_desc* axis, int stage); /* Reduction Context API */ /* Utilities */ @@ -387,64 +404,58 @@ static int reduxRequiresDstArg (const redux_ctx* ctx); static int reduxKernelRequiresDst (const redux_ctx* ctx); static int reduxKernelRequiresDstArg (const redux_ctx* ctx); static int reduxIsSensitive (const redux_ctx* ctx); -static int reduxIsSmallCodeModel (const redux_ctx* ctx); -static int reduxIsLargeCodeModel (const redux_ctx* ctx); +static int reduxIs1Stage (const redux_ctx* ctx); +static int reduxIs2Stage 
(const redux_ctx* ctx); static axis_desc* reduxGetSrcAxis (const redux_ctx* ctx, int i); static axis_desc* reduxGetSrcSortAxis (const redux_ctx* ctx, int i); static axis_desc* reduxGetSrcFlatAxis (const redux_ctx* ctx, int i); -static int reduxTryFlattenInto (const redux_ctx* ctx, - axis_desc* into, - const axis_desc* from); -static int reduxCanAppendHwAxis (redux_ctx* ctx, - int kernelType, - int axisType); -static void reduxAppendLargestAxisToHwList(redux_ctx* ctx, - int kernelType, - int axisType); +static int reduxTryFlattenInto (const redux_ctx* ctx, + axis_desc* into, + const axis_desc* from); +static void reduxSortAxisPtrsBy (axis_desc** ptrs, + axis_desc* axes, + size_t numAxes, + int(*fn)(const void*, const void*)); /* Control Flow */ -static int reduxInit (redux_ctx* ctx); -static int reduxInferProperties (redux_ctx* ctx); -static int reduxFlattenSource (redux_ctx* ctx); -static int reduxSelectNumStages (redux_ctx* ctx); -static int reduxPlan1Stage (redux_ctx* ctx); -static int reduxPlan2Stage (redux_ctx* ctx); -static int reduxSelectHwAxes (redux_ctx* ctx); -static int reduxComputeAxisList (redux_ctx* ctx); -static int reduxGenSource (redux_ctx* ctx); -static void reduxAppendSource (redux_ctx* ctx); -static void reduxAppendIncludes (redux_ctx* ctx); -static void reduxAppendTensorDeclArgs (redux_ctx* ctx, - const char* type, - const char* baseName); -static void reduxAppendTensorCallArgs (redux_ctx* ctx, - const char* baseName); -static void reduxAppendMacroDefs (redux_ctx* ctx); -static void reduxAppendTypedefs (redux_ctx* ctx); -static void reduxAppendGetInitValFns (redux_ctx* ctx); -static void reduxAppendWriteBackFn (redux_ctx* ctx); -static void reduxAppendReduxKernel (redux_ctx* ctx); -static void reduxAppendPrototype (redux_ctx* ctx); -static void reduxAppendIndexDeclarations (redux_ctx* ctx); -static void reduxAppendRangeCalculations (redux_ctx* ctx); -static void reduxAppendLoops (redux_ctx* ctx); -static void reduxAppendInitKernel (redux_ctx* ctx); -static void reduxAppendPostKernel (redux_ctx* ctx); -static int reduxCompile (redux_ctx* ctx); -static int reduxSchedule (redux_ctx* ctx); -static void reduxScheduleKernel (int ndims, - uint64_t* dims, - uint64_t warpSize, - uint64_t maxLg, - uint64_t* maxLs, - uint64_t maxGg, - uint64_t* maxGs, - uint64_t* bs, - uint64_t* gs, - uint64_t* cs); -static int reduxInvoke (redux_ctx* ctx); -static int reduxCleanup (redux_ctx* ctx, int ret); -static int reduxCleanupMsg (redux_ctx* ctx, int ret, - const char* fmt, ...); +static int reduxInit (redux_ctx* ctx); +static int reduxInferProperties (redux_ctx* ctx); +static int reduxFlattenSource (redux_ctx* ctx); +static int reduxSelectNumStages (redux_ctx* ctx); +static int reduxPlan1Stage (redux_ctx* ctx); +static int reduxPlan2Stage (redux_ctx* ctx); +static int reduxGenSource (redux_ctx* ctx); +static void reduxAppendSource (redux_ctx* ctx); +static void reduxAppendIncludes (redux_ctx* ctx); +static void reduxAppendTensorDeclArgs (redux_ctx* ctx, + const char* type, + const char* baseName); +static void reduxAppendTensorCallArgs (redux_ctx* ctx, + const char* baseName); +static void reduxAppendMacroDefs (redux_ctx* ctx); +static void reduxAppendTypedefs (redux_ctx* ctx); +static void reduxAppendGetInitValFns (redux_ctx* ctx); +static void reduxAppendWriteBackFn (redux_ctx* ctx); +static void reduxAppendReduxKernel (redux_ctx* ctx); +static void reduxAppendPrototype (redux_ctx* ctx); +static void reduxAppendIndexDeclarations (redux_ctx* ctx); +static void reduxAppendRangeCalculations 
(redux_ctx* ctx); +static void reduxAppendLoops (redux_ctx* ctx); +static int reduxCompile (redux_ctx* ctx); +static int reduxSchedule (redux_ctx* ctx); +static void reduxScheduleKernel (int ndims, + uint64_t* dims, + uint64_t warpSize, + uint64_t maxLg, + uint64_t* maxLs, + uint64_t maxGg, + uint64_t* maxGs, + uint64_t* bs, + uint64_t* gs, + uint64_t* cs); +static int reduxInvoke (redux_ctx* ctx); +static int reduxCleanup (redux_ctx* ctx, int ret); +static int reduxCleanupMsg (redux_ctx* ctx, int ret, + const char* fmt, ...); /* Function implementation */ @@ -800,29 +811,39 @@ static int reduxSortFlatSensitive (const void* a, const void* b){ } /** - * @brief Check whether axis numbered v is already in the given set of axes. - * - * @param [in] v - * @param [in] set - * @param [in] setLen - * @param [out] where - * @return Non-zero if the set is non-empty and v is in it; Zero otherwise. + * For the plan of a 1-stage reduction, we need to sort the free axes by + * decreasing length. */ -static int axisInSet (int v, - const int* set, - size_t setLen, - size_t* where){ - size_t i; +static int reduxSortPlan1Stage (const void* a, const void* b){ + const axis_desc* xda = *(const axis_desc* const*)a; + const axis_desc* xdb = *(const axis_desc* const*)b; - for (i=0;ireduxNum = -1; - axis->warpLen = 0; + axis->hwAxisStage0 = axis->hwAxisStage1 = -1; axis->len = len; + axis->tmpLen = 0; + axis->sliceLen = 0; axis->srcStride = srcStride; axis->srcOffset = 0; @@ -945,6 +968,12 @@ static ssize_t axisGetDstArgOffset (const axis_desc* axis){ static int axisIsReduced (const axis_desc* axis){ return axis->isReduced; } +static int axisIsHW (const axis_desc* axis, int stage){ + return (stage == 0 ? axis->hwAxisStage0 : axis->hwAxisStage1) >= 0; +} +static int axisGetHWAxisNum (const axis_desc* axis, int stage){ + return stage == 0 ? axis->hwAxisStage0 : axis->hwAxisStage1; +} /** * @brief Estimate the level of parallelism in the device. @@ -1012,7 +1041,7 @@ static int reduxKernelRequiresDst (const redux_ctx* ctx){ switch (ctx->op){ case GA_REDUCE_ARGMIN: case GA_REDUCE_ARGMAX: - return reduxIsSmallCodeModel(ctx); + return reduxIs2Stage(ctx); default: return 1; } @@ -1079,19 +1108,19 @@ static int reduxIsSensitive (const redux_ctx* ctx){ } /** - * @brief Returns whether we are using the small code model or not. + * @brief Is the reduction 1-stage? */ -static int reduxIsSmallCodeModel (const redux_ctx* ctx){ - return !reduxIsLargeCodeModel(ctx); +static int reduxIs1Stage (const redux_ctx* ctx){ + return ctx->numStages == 1; } /** - * @brief Returns whether we are using the large code model or not. + * @brief Is the reduction 2-stage? */ -static int reduxIsLargeCodeModel (const redux_ctx* ctx){ - return ctx->largeCodeModel; +static int reduxIs2Stage (const redux_ctx* ctx){ + return !reduxIs1Stage(ctx); } /** @@ -1139,9 +1168,9 @@ static axis_desc* reduxGetSrcFlatAxis (const redux_ctx* ctx, int i){ * @return Non-zero if flattening attempt successful; Zero otherwise. */ -static int reduxTryFlattenInto (const redux_ctx* ctx, - axis_desc* into, - const axis_desc* from){ +static int reduxTryFlattenInto (const redux_ctx* ctx, + axis_desc* into, + const axis_desc* from){ int signSrc = 0, signDst = 0, signDstArg = 0, reverseSrc = 0, reverseDst = 0, reverseDstArg = 0; @@ -1210,70 +1239,21 @@ static int reduxTryFlattenInto (const redux_ctx* ctx, } /** - * @brief Check whether we can add another reduction axis or free axis - * to the hardware axis list for either the primary or secondary kernel. 
+ * Sort an array of *pointers* to axes by the given comparison function, while + * not touching the axes themselves. */ -static int reduxCanAppendHwAxis (redux_ctx* ctx, - int kernelType, - int axisType){ - int kernelNdh = kernelType == KERNEL_PRIMARY ? ctx->pri.ndh : ctx->aux.ndh; - int kernelNdhr = kernelType == KERNEL_PRIMARY ? ctx->pri.ndhr : ctx->aux.ndhr; - int kernelNdhd = kernelType == KERNEL_PRIMARY ? ctx->pri.ndhd : ctx->aux.ndhd; - - if (kernelNdh >= MAX_HW_DIMS){ - return 0; - }else{ - return axisType == AXIS_REDUX ? kernelNdhr < ctx->ndr: - kernelNdhd < ctx->ndd; - } -} - -/** - * @brief Append the largest reduction axis or free axis that isn't yet - * in the hardware axis list for either the primary or secondary kernel - * into said hardware axis list. - */ - -static void reduxAppendLargestAxisToHwList(redux_ctx* ctx, - int kernelType, - int axisType){ - int maxI = 0, i, isInHwList, isInReduxList, isInDesiredList, isLargestSoFar; - int* hwAxisList, * ndh, * ndhr, * ndhd; - size_t v, maxV = 0; - - /* Get pointers to the correct kernel's variables */ - hwAxisList = kernelType == KERNEL_PRIMARY ? ctx->pri.axisList: - ctx->aux.axisList; - ndh = kernelType == KERNEL_PRIMARY ? &ctx->pri.ndh: - &ctx->aux.ndh; - ndhr = kernelType == KERNEL_PRIMARY ? &ctx->pri.ndhr: - &ctx->aux.ndhr; - ndhd = kernelType == KERNEL_PRIMARY ? &ctx->pri.ndhd: - &ctx->aux.ndhd; - - /* Find */ - for (i=0;inds;i++){ - isInHwList = axisInSet(i, hwAxisList, *ndh, 0); - isInReduxList = axisInSet(i, ctx->reduxList, ctx->ndr, 0); - isInDesiredList = axisType == AXIS_REDUX ? isInReduxList: - !isInReduxList; - v = ctx->src->dimensions[i]; - isLargestSoFar = v >= maxV; - - if (!isInHwList && isInDesiredList && isLargestSoFar){ - maxV = v; - maxI = i; - } - } - - /* Append */ - hwAxisList[(*ndh)++] = maxI; - if (axisType == AXIS_REDUX){ - (*ndhr)++; - }else{ - (*ndhd)++; +static void reduxSortAxisPtrsBy (axis_desc** ptrs, + axis_desc* axes, + size_t numAxes, + int(*fn)(const void*, const void*)){ + size_t i; + + for(i=0;iwsDst = NULL; - ctx->wsDstArg = NULL; - ctx->srcAxisList = NULL; - ctx->dstDims = NULL; ctx->gpuCtx = NULL; ctx->srcTypeStr = ctx->dstTypeStr = ctx->dstArgTypeStr = @@ -1301,7 +1277,6 @@ static int reduxInit (redux_ctx* ctx){ ctx->sourceCode = NULL; ctx->errorString0 = NULL; ctx->errorString1 = NULL; - ctx->errorString2 = NULL; ctx->numStages = 1; ctx->prodAllAxes = ctx->prodRdxAxes = ctx->prodFreeAxes = 1; @@ -1309,15 +1284,14 @@ static int reduxInit (redux_ctx* ctx){ srcbInit (&ctx->srcGen, &ctx->s); for (i=0;iaux.axisList[i] = ctx->pri.axisList[i] = 0; - ctx->aux.bs [i] = ctx->pri.bs [i] = 1; - ctx->aux.gs [i] = ctx->pri.gs [i] = 1; - ctx->aux.cs [i] = ctx->pri.cs [i] = 1; + ctx->st2.bs [i] = ctx->st1.bs [i] = 1; + ctx->st2.gs [i] = ctx->st1.gs [i] = 1; + ctx->st2.cs [i] = ctx->st1.cs [i] = 1; } ctx->srcStepsGD = ctx->srcSizeGD = ctx->dstStepsGD = ctx->dstArgStepsGD = - ctx->pri.chunkSizeGD = ctx->aux.chunkSizeGD = NULL; + ctx->st1.chunkSizeGD = ctx->st2.chunkSizeGD = NULL; return reduxInferProperties(ctx); } @@ -1365,11 +1339,11 @@ static int reduxInferProperties (redux_ctx* ctx){ return reduxCleanupMsg(ctx, GA_INVALID_ERROR, "dstArg is of incorrect dimensionality for this reduction!\n"); } - ctx->nds = ctx->src->nd; - ctx->ndr = ctx->reduxLen; - ctx->ndd = ctx->nds - ctx->ndr; - ctx->ndf = 0; - ctx->ndt = ctx->ndd + 1; + ctx->nds = ctx->src->nd; + ctx->ndr = ctx->reduxLen; + ctx->ndd = ctx->nds - ctx->ndr; + ctx->ndfs = ctx->ndfr = ctx->ndfd = 0; + ctx->ndt = ctx->ndd + 1; /* Insane reduxList? 
*/ for (i=0;indr;i++){ @@ -1539,8 +1513,7 @@ static int reduxInferProperties (redux_ctx* ctx){ ctx->xdSrc = calloc(ctx->nds, sizeof(*ctx->xdSrc)); ctx->xdSrcPtrs = calloc(ctx->nds+1, sizeof(*ctx->xdSrcPtrs)); ctx->xdSrcFlat = calloc(ctx->nds+1, sizeof(*ctx->xdSrcFlat)); - ctx->xdTmp = calloc(ctx->ndt, sizeof(*ctx->xdTmp)); - if (!ctx->xdSrc || !ctx->xdSrcPtrs || !ctx->xdSrcFlat || !ctx->xdTmp){ + if (!ctx->xdSrc || !ctx->xdSrcPtrs || !ctx->xdSrcFlat){ return reduxCleanup(ctx, GA_MEMORY_ERROR); } for (i=0;inds;i++){ @@ -1618,7 +1591,7 @@ static int reduxFlattenSource (redux_ctx* ctx){ */ memcpy(ctx->xdSrcFlat, ctx->xdSrc, ctx->nds*sizeof(*ctx->xdSrcFlat)); - ctx->ndf = ctx->nds; + ctx->ndfs = ctx->nds; /** * Pass 1: Flatten out 0-length dimensions. We already know that @@ -1633,7 +1606,7 @@ static int reduxFlattenSource (redux_ctx* ctx){ */ if (ctx->zeroRdxAxes > 0){ - for (i=j=0;indf;i++){ + for (i=j=0;indfs;i++){ axis = reduxGetSrcFlatAxis(ctx, i); if (!axisIsReduced(axis)){ @@ -1644,7 +1617,7 @@ static int reduxFlattenSource (redux_ctx* ctx){ axisInit (reduxGetSrcFlatAxis(ctx, j), 0, 0); axisMarkReduced(reduxGetSrcFlatAxis(ctx, j), 0); j++; - ctx->ndf = j; + ctx->ndfs = j; } /** @@ -1652,14 +1625,14 @@ static int reduxFlattenSource (redux_ctx* ctx){ * ignored; They are always indexed at [0]. */ - for (i=j=0;indf;i++){ + for (i=j=0;indfs;i++){ axis = reduxGetSrcFlatAxis(ctx, i); if (axisGetLen(axis) != 1){ *reduxGetSrcFlatAxis(ctx, j++) = *axis; } } - ctx->ndf = j; + ctx->ndfs = j; /** * Pass 3: Flatten out continuous dimensions, where strides and sensitivity @@ -1668,10 +1641,10 @@ static int reduxFlattenSource (redux_ctx* ctx){ isSensitive = reduxIsSensitive(ctx); - qsort(ctx->xdSrcFlat, ctx->ndf, sizeof(*ctx->xdSrcFlat), + qsort(ctx->xdSrcFlat, ctx->ndfs, sizeof(*ctx->xdSrcFlat), isSensitive ? reduxSortFlatSensitive : reduxSortFlatInsensitive); - for (i=j=1;indf;i++){ + for (i=j=1;indfs;i++){ flatAxis = reduxGetSrcFlatAxis(ctx, j-1); sortAxis = reduxGetSrcFlatAxis(ctx, i); @@ -1679,7 +1652,7 @@ static int reduxFlattenSource (redux_ctx* ctx){ *reduxGetSrcFlatAxis(ctx, j++) = *sortAxis; } } - ctx->ndf = j; + ctx->ndfs = j; /** @@ -1691,10 +1664,72 @@ static int reduxFlattenSource (redux_ctx* ctx){ * We check for this case and simulate a 1-dimensional, 1-length tensor. */ - if(ctx->ndf == 0){ - axisInit (reduxGetSrcFlatAxis(ctx, ctx->ndf), 1, 0); - axisMarkReduced(reduxGetSrcFlatAxis(ctx, ctx->ndf), 0); - ctx->ndf = 1; + if(ctx->ndfs == 0){ + axisInit (reduxGetSrcFlatAxis(ctx, ctx->ndfs), 1, 0); + axisMarkReduced(reduxGetSrcFlatAxis(ctx, ctx->ndfs), 0); + ctx->ndfs = 1; + } + + + /** + * Having flattened the tensor to the very best of our ability, allocate + * and/or compute + * + * ctx->ndfr + * ctx->ndfd + * ctx->flatSrcDimensions + * ctx->flatSrcStrides + * ctx->flatSrcData + * ctx->flatSrcOffset + axis offsets + * ctx->flatDstStrides + * ctx->flatDstData + * ctx->flatDstOffset + axis offsets + * ctx->flatDstArgStrides + * ctx->flatDstArgData + * ctx->flatDstArgOffset + axis offsets + * + * and suchlike data that will be used post-flatten. 
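+ *
+ * A worked example with hypothetical shapes (not drawn from the caller): a
+ * C-contiguous float32 source of shape (7, 1, 5, 6), reduced over axes
+ * {2, 3}, flattens to ndfs == 2 axes: a free axis of length 7 (stride 120
+ * bytes) and a reduced axis of length 30 (stride 4 bytes). Pass 2 drops the
+ * length-1 axis and pass 3 merges the two contiguous reduced axes, while a
+ * free axis is never merged with a reduced one. Hence ndfd == 1, ndfr == 1,
+ * and flatSrcDimensions holds the lengths {7, 30} (in whatever order the
+ * flattening sort left them).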
+ */ + + ctx->flatSrcDimensions = malloc(ctx->ndfs * sizeof(*ctx->flatSrcDimensions)); + ctx->flatSrcStrides = malloc(ctx->ndfs * sizeof(*ctx->flatSrcStrides)); + ctx->flatDstStrides = malloc(ctx->ndfs * sizeof(*ctx->flatDstStrides)); + ctx->flatDstArgStrides = malloc(ctx->ndfs * sizeof(*ctx->flatDstArgStrides)); + if(!ctx->flatSrcDimensions || !ctx->flatSrcStrides || + !ctx->flatDstStrides || !ctx->flatDstArgStrides){ + return reduxCleanup(ctx, GA_MEMORY_ERROR); + } + + ctx->flatSrcData = ctx->src->data; + ctx->flatSrcOffset = ctx->src->offset; + if(reduxRequiresDst(ctx)){ + ctx->flatDstData = ctx->dst->data; + ctx->flatDstOffset = ctx->dst->offset; + } + if(reduxRequiresDstArg(ctx)){ + ctx->flatDstArgData = ctx->dstArg->data; + ctx->flatDstArgOffset = ctx->dstArg->offset; + } + for(ctx->ndfd=ctx->ndfr=i=0;indfs;i++){ + axis = reduxGetSrcFlatAxis(ctx, i); + if(axisIsReduced(axis)){ + ctx->ndfr++; + }else{ + if(reduxRequiresDst(ctx)){ + ctx->flatDstStrides[ctx->ndfd] = axisGetDstStride(axis); + ctx->flatDstOffset += axisGetDstOffset(axis); + } + if(reduxRequiresDstArg(ctx)){ + ctx->flatDstArgStrides[ctx->ndfd] = axisGetDstArgStride(axis); + ctx->flatDstArgOffset += axisGetDstArgOffset(axis); + } + + ctx->ndfd++; + } + + ctx->flatSrcDimensions[i] = axisGetLen (axis); + ctx->flatSrcStrides[i] = axisGetSrcStride(axis); + ctx->flatSrcOffset += axisGetSrcOffset(axis); } return reduxSelectNumStages(ctx); @@ -1713,9 +1748,13 @@ static int reduxSelectNumStages (redux_ctx* ctx){ ctx->prodAllAxes <= ctx->maxLg || /* Reduction over few elements? */ ctx->prodFreeAxes >= ctx->prodRdxAxes || /* More destinations than reductions? */ ctx->prodFreeAxes >= parallelism ){ /* Destination very large? */ + ctx->numStages = 1; return reduxPlan1Stage(ctx); }else{ - return reduxPlan2Stage(ctx); + /* BUG: Switch to 2Stage when small code model fixed. */ + (void)reduxPlan2Stage; + ctx->numStages = 1; + return reduxPlan1Stage(ctx); } } @@ -1738,11 +1777,25 @@ static int reduxSelectNumStages (redux_ctx* ctx){ */ static int reduxPlan1Stage (redux_ctx* ctx){ - ctx->numStages = 1; + int i; + axis_desc* axis; + + reduxSortAxisPtrsBy(ctx->xdSrcPtrs, ctx->xdSrcFlat, ctx->ndfs, + reduxSortPlan1Stage); + ctx->st1.ndh = 0; + ctx->st1.ndhp = 0; + ctx->st1.ndhr = 0; + for (i=0;indfd && ihwAxisStage0 = i; + + ctx->st1.ndh++; + } + ctx->st1.ndhd = ctx->st1.ndh; - return reduxSelectHwAxes(ctx); + return reduxGenSource(ctx); } /** @@ -1752,146 +1805,77 @@ static int reduxPlan1Stage (redux_ctx* ctx){ * * This plan involves splitting the reduction into two stages: * - * Stage 1: A reduction by approximately R = sqrt(prodRdxAxes) elements per - * destination elements into allocated temporary workspace(s) - * of approximate size dst.shape + (prodRdxAxes/R,) - * Stage 2: A reduction by approximately prodRdxAxes/R elements into the - * final destination. + * Stage 0: A huge reduction only along reduction axes into a workspace. + * Stage 1: A small reduction into the destination. + * + * We select only reduction axes in the first stage. */ static int reduxPlan2Stage (redux_ctx* ctx){ - ctx->numStages = 2; - - /* NOTE: Use gpuarray_get_elsize(typecode) */ - - return reduxSelectHwAxes(ctx); -} - -/** - * @brief Select which axes (up to MAX_HW_DIMS) will be assigned to hardware - * dimensions for both the primary and auxiliary kernels. - * - * LARGE code model: Up to the MAX_HW_DIMS largest free axes are selected. 
- * Because the primary reduction kernel does everything, it's - * not necessary to compute an auxiliary kernel axis - * selection (or at least, one distinct from the primary - * kernel's). - * - * SMALL code model: For the primary reduction kernel, up to MAX_HW_DIMS - * reduction axes (largest-to-smallest) are selected. If less - * than MAX_HW_DIMS axes were selected, free axes are - * selected until MAX_HW_DIMS total axes are selected, or no - * free axes are left. - * - * For the auxiliary reduction kernel, up to the MAX_HW_DIMS - * largest free axes are selected. - */ - -static int reduxSelectHwAxes (redux_ctx* ctx){ - int ret; + int i; + axis_desc* axis; + size_t a = 1, aL, aPartial, target = ctx->maxLg; - ctx->srcAxisList = malloc(ctx->nds * sizeof(unsigned)); - ctx->dstDims = malloc(ctx->ndd * sizeof(size_t)); - if (!ctx->srcAxisList || - !ctx->dstDims ){ - return reduxCleanup(ctx, GA_MEMORY_ERROR); - } - - ctx->largeCodeModel = 1;/* BUG: Erase when small code model fixed. */ /** - * *** IT IS NOW SAFE TO CALL: *** - * - reduxIsLargeModel() - * - reduxIsSmallModel() - * - reduxKernelRequiresDst() - * - reduxKernelRequiresDstArg() + * Sort axis descriptions reduction-axes-first then longest-first, and + * select up to 3 reduction axes, splitting them s.t. their product does + * not exceed the max block size. */ - - - /** - * Allocate workspaces. - * - * Certain reductions may require a workspace that isn't provided by the user. - * For instance, **when using the small code model**, argmin/argmax require - * a dst buffer, but the user didn't supply one (as he would have for - * maxandargmax/minandargmin). We must allocate and deallocate it ourselves. - * - * Otherwise we use the user-supplied buffers. - */ - - if (!reduxRequiresDst (ctx) && reduxKernelRequiresDst(ctx)){ - ctx->wsDst = malloc(sizeof(*ctx->wsDst)); - if (!ctx->wsDst){ - return reduxCleanup(ctx, GA_MEMORY_ERROR); - } - - ret = GpuArray_empty(ctx->wsDst, ctx->gpuCtx, ctx->dstTypeCode, - ctx->ndd, ctx->dstDims, GA_C_ORDER); - if (ret != GA_NO_ERROR){ - return reduxCleanup(ctx, ret); - } - }else{ - ctx->wsDst = ctx->dst; - } - if (!reduxRequiresDstArg(ctx) && reduxKernelRequiresDstArg(ctx)){ - ctx->wsDstArg = malloc(sizeof(*ctx->wsDstArg)); - if (!ctx->wsDstArg){ - return reduxCleanup(ctx, GA_MEMORY_ERROR); + + reduxSortAxisPtrsBy(ctx->xdSrcPtrs, ctx->xdSrcFlat, ctx->ndfs, + reduxSortPlan2Stage0); + + ctx->st1.ndh = 0; + ctx->st1.ndhp = 0; + ctx->st1.ndhr = 0; + ctx->st1.ndhd = 0; + + for(i=0;indfs && iwsDstArg, ctx->gpuCtx, ctx->dstArgTypeCode, - ctx->ndd, ctx->dstDims, GA_C_ORDER); - if (ret != GA_NO_ERROR){ - return reduxCleanup(ctx, ret); + + aL = axisGetLen(axis); + a *= aL; + if(a <= target){ + axis->hwAxisStage0 = i; + axis->sliceLen = aL; + axis->tmpLen = (axis->len+axis->sliceLen-1)/axis->sliceLen; + + ctx->st1.ndh++; + }else{ + a /= aL; + aPartial = target/a; + if(aPartial >= 2){ + a *= aPartial; + + axis->hwAxisStage0 = i++; + axis->sliceLen = aPartial; + axis->tmpLen = (axis->len+axis->sliceLen-1)/axis->sliceLen; + + ctx->st1.ndh++; + ctx->st1.ndhp++; + } + break; } - }else{ - ctx->wsDstArg = ctx->dstArg; } - - - if (reduxIsLargeCodeModel(ctx)){ - while (reduxCanAppendHwAxis (ctx, KERNEL_PRIMARY, AXIS_FREE)){ - reduxAppendLargestAxisToHwList(ctx, KERNEL_PRIMARY, AXIS_FREE); - } - }else{ - while (reduxCanAppendHwAxis (ctx, KERNEL_PRIMARY, AXIS_REDUX)){ - reduxAppendLargestAxisToHwList(ctx, KERNEL_PRIMARY, AXIS_REDUX); - } - while (reduxCanAppendHwAxis (ctx, KERNEL_PRIMARY, AXIS_FREE)){ - 
reduxAppendLargestAxisToHwList(ctx, KERNEL_PRIMARY, AXIS_FREE); - } - - while (reduxCanAppendHwAxis (ctx, KERNEL_AUXILIARY, AXIS_FREE)){ - reduxAppendLargestAxisToHwList(ctx, KERNEL_AUXILIARY, AXIS_FREE); - } + ctx->st1.ndhr = ctx->st1.ndh; + + /** + * We now have enough information to allocate the workspaces. + */ + + if(!reduxRequiresDst (ctx) && reduxKernelRequiresDst(ctx)){ + } - - return reduxComputeAxisList(ctx); -} - -/** - * @brief Compute the axis list. - * - * The axis list describes the mapping between the nested loops of the kernel - * as well as their accompanying indices (i0*, i1*, ..., in*) on one hand, and - * the axes of the source tensor. The first axis in the list corresponds to the - * outermost loop and the last axis in the list to the innermost. - * - * The first ctx->ndd axes correspond to the outer loops that iterate over - * each destination element. The last ctx->ndr axes correspond to the inner - * loops that iterate over the dimensions of elements that are to be reduced. - */ - -static int reduxComputeAxisList (redux_ctx* ctx){ - int i, f=0; - - for (i=0;inds;i++){ - if (!axisInSet(i, ctx->reduxList, ctx->ndr, 0)){ - ctx->srcAxisList[f++] = i; - } + if(!reduxRequiresDstArg(ctx) && reduxKernelRequiresDstArg(ctx)){ + } - memcpy(&ctx->srcAxisList[f], ctx->reduxList, ctx->ndr * sizeof(*ctx->reduxList)); - - + + + /* NOTE: Use gpuarray_get_elsize(typecode) */ + return reduxGenSource(ctx); } @@ -1918,10 +1902,6 @@ static void reduxAppendSource (redux_ctx* ctx){ reduxAppendGetInitValFns (ctx); reduxAppendWriteBackFn (ctx); reduxAppendReduxKernel (ctx); - if (reduxIsSmallCodeModel(ctx)){ - reduxAppendInitKernel (ctx); - reduxAppendPostKernel (ctx); - } } static void reduxAppendTensorDeclArgs (redux_ctx* ctx, const char* type, @@ -1948,7 +1928,7 @@ static void reduxAppendMacroDefs (redux_ctx* ctx){ srcbBeginList (&ctx->srcGen, "+", "0"); srcbAppendElemf(&ctx->srcGen, "(const GLOBAL_MEM char*)srcPtr"); srcbAppendElemf(&ctx->srcGen, "srcOff"); - for (i=0;inds;i++){ + for (i=0;indfs;i++){ srcbAppendElemf(&ctx->srcGen, "i%d*i%dSStep", i, i); } srcbEndList (&ctx->srcGen); @@ -1960,7 +1940,7 @@ static void reduxAppendMacroDefs (redux_ctx* ctx){ srcbBeginList (&ctx->srcGen, "+", "0"); srcbAppendElemf(&ctx->srcGen, "(GLOBAL_MEM char*)dstPtr"); srcbAppendElemf(&ctx->srcGen, "dstOff"); - for (i=0;indd;i++){ + for (i=0;indfd;i++){ srcbAppendElemf(&ctx->srcGen, "i%d*i%dDStep", i, i); } srcbEndList (&ctx->srcGen); @@ -1973,7 +1953,7 @@ static void reduxAppendMacroDefs (redux_ctx* ctx){ srcbBeginList (&ctx->srcGen, "+", "0"); srcbAppendElemf(&ctx->srcGen, "(GLOBAL_MEM char*)dstArgPtr"); srcbAppendElemf(&ctx->srcGen, "dstArgOff"); - for (i=0;indd;i++){ + for (i=0;indfd;i++){ srcbAppendElemf(&ctx->srcGen, "i%d*i%dAStep", i, i); } srcbEndList (&ctx->srcGen); @@ -1983,7 +1963,7 @@ static void reduxAppendMacroDefs (redux_ctx* ctx){ /* rdxIdx indexer */ srcbAppends (&ctx->srcGen, "#define rdxIdx ("); srcbBeginList (&ctx->srcGen, "+", "0"); - for (i=ctx->ndd;inds;i++){ + for (i=ctx->ndfd;indfs;i++){ srcbAppendElemf(&ctx->srcGen, "i%d*i%dPDim", i, i); } srcbEndList (&ctx->srcGen); @@ -2037,7 +2017,7 @@ static void reduxAppendWriteBackFn (redux_ctx* ctx){ srcbEndList (&ctx->srcGen); srcbAppends (&ctx->srcGen, "){\n"); - if (reduxIsLargeCodeModel(ctx)){ + if (reduxIs1Stage(ctx)){ if (reduxKernelRequiresDst (ctx)){ srcbAppends (&ctx->srcGen, "\t*d_ = d;\n"); } @@ -2089,78 +2069,85 @@ static void reduxAppendIndexDeclarations (redux_ctx* ctx){ strb_appends(&ctx->s, "\tX bd0 = LDIM_0, bd1 = LDIM_1, 
bd2 = LDIM_2;\n"); strb_appends(&ctx->s, "\tX ti0 = LID_0, ti1 = LID_1, ti2 = LID_2;\n"); strb_appends(&ctx->s, "\tX gi0 = bi0*bd0+ti0, gi1 = bi1*bd1+ti1, gi2 = bi2*bd2+ti2;\n"); - if (ctx->pri.ndh>0){ + if (ctx->st1.ndh>0){ strb_appends(&ctx->s, "\tX "); - for (i=0;ipri.ndh;i++){ + for (i=0;ist1.ndh;i++){ strb_appendf(&ctx->s, "ci%u = chunkSize[%u]%s", - i, i, (i==ctx->pri.ndh-1) ? ";\n" : ", "); + i, i, (i==ctx->st1.ndh-1) ? ";\n" : ", "); } } strb_appends(&ctx->s, "\t\n\t\n"); strb_appends(&ctx->s, "\t/* Free indices & Reduction indices */\n"); - if (ctx->nds > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "", ";\n");} - if (ctx->nds > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "Dim", ";\n");} - if (ctx->nds > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "Start", ";\n");} - if (ctx->nds > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "End", ";\n");} - if (ctx->nds > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->nds, "SStep", ";\n");} - if (ctx->ndd > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->ndd, "DStep", ";\n");} - if (ctx->ndd > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->ndd, "AStep", ";\n");} - if (ctx->nds > ctx->ndd){appendIdxes (&ctx->s, "\tX ", "i", ctx->ndd, ctx->nds, "PDim", ";\n");} + if (ctx->ndfs > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->ndfs, "", ";\n");} + if (ctx->ndfs > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->ndfs, "Dim", ";\n");} + if (ctx->ndfs > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->ndfs, "Start", ";\n");} + if (ctx->ndfs > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->ndfs, "End", ";\n");} + if (ctx->ndfs > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->ndfs, "SStep", ";\n");} + if (ctx->ndfd > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->ndfd, "DStep", ";\n");} + if (ctx->ndfd > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->ndfd, "AStep", ";\n");} + if (ctx->ndfs > ctx->ndfd){appendIdxes (&ctx->s, "\tX ", "i", ctx->ndfd, ctx->ndfs, "PDim", ";\n");} strb_appends(&ctx->s, "\t\n\t\n"); } static void reduxAppendRangeCalculations (redux_ctx* ctx){ - size_t hwDim; - int i; + axis_desc* axis; + size_t hwDim; + int i; strb_appends(&ctx->s, "\t/* Compute ranges for this thread. */\n"); - for (i=0;inds;i++){ - strb_appendf(&ctx->s, "\ti%dDim = srcSize[%d];\n", i, ctx->srcAxisList[i]); + for (i=0;indfs;i++){ + strb_appendf(&ctx->s, "\ti%dDim = srcSize[%d];\n", i, i); } - for (i=0;inds;i++){ - strb_appendf(&ctx->s, "\ti%dSStep = srcSteps[%d];\n", i, ctx->srcAxisList[i]); + for (i=0;indfs;i++){ + strb_appendf(&ctx->s, "\ti%dSStep = srcSteps[%d];\n", i, i); } if (reduxKernelRequiresDst(ctx)){ - for (i=0;indd;i++){ + for (i=0;indfd;i++){ strb_appendf(&ctx->s, "\ti%dDStep = dstSteps[%d];\n", i, i); } } if (reduxKernelRequiresDstArg(ctx)){ - for (i=0;indd;i++){ + for (i=0;indfd;i++){ strb_appendf(&ctx->s, "\ti%dAStep = dstArgSteps[%d];\n", i, i); } } - for (i=ctx->nds-1;i>=ctx->ndd;i--){ + for (i=ctx->ndfs-1;i>=ctx->ndfd;i--){ /** * If this is the last index, it's the first cumulative dimension * product we generate, and thus we initialize to 1. */ - if (i == ctx->nds-1){ + if (i == ctx->ndfs-1){ strb_appendf(&ctx->s, "\ti%dPDim = 1;\n", i); }else{ strb_appendf(&ctx->s, "\ti%dPDim = i%dPDim * i%dDim;\n", i, i+1, i+1); } } - for (i=0;inds;i++){ + for (i=0;indfs;i++){ /** * Up to MAX_HW_DIMS dimensions get to rely on hardware loops. * The others, if any, have to use software looping beginning at 0. 
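 *
 * For instance, for a flattened axis i mapped to hardware dimension h, the
 * emitted source looks like (a sketch, not a verbatim quote):
 *
 *     iStart = gi_h * ci_h;      <- this block's chunk along axis i
 *     iEnd   = iStart + ci_h;
 *
 * whereas a software-looped axis spans its full length:
 *
 *     iStart = 0;
 *     iEnd   = iStart + iDim;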
*/ - if (axisInSet(ctx->srcAxisList[i], ctx->pri.axisList, ctx->pri.ndh, &hwDim)){ + axis = reduxGetSrcFlatAxis(ctx, i); + if (axisIsHW(axis, 0)){ + hwDim = axisGetHWAxisNum(axis, 0); + //axisInSet(i, ctx->st1.axisList, ctx->st1.ndh, &hwDim); strb_appendf(&ctx->s, "\ti%dStart = gi%d * ci%d;\n", i, hwDim, hwDim); }else{ strb_appendf(&ctx->s, "\ti%dStart = 0;\n", i); } } - for (i=0;inds;i++){ + for (i=0;indfs;i++){ /** * Up to MAX_HW_DIMS dimensions get to rely on hardware loops. * The others, if any, have to use software looping beginning at 0. */ - if (axisInSet(ctx->srcAxisList[i], ctx->pri.axisList, ctx->pri.ndh, &hwDim)){ + axis = reduxGetSrcFlatAxis(ctx, i); + if (axisIsHW(axis, 0)){ + hwDim = axisGetHWAxisNum(axis, 0); + //axisInSet(i, ctx->st1.axisList, ctx->st1.ndh, &hwDim); strb_appendf(&ctx->s, "\ti%dEnd = i%dStart + ci%d;\n", i, i, hwDim); }else{ strb_appendf(&ctx->s, "\ti%dEnd = i%dStart + i%dDim;\n", i, i, i); @@ -2172,7 +2159,7 @@ static void reduxAppendRangeCalculations (redux_ctx* ctx){ static void reduxAppendLoops (redux_ctx* ctx){ int i; - for (i=0;indd;i++){ + for (i=0;indfd;i++){ srcbAppendf(&ctx->srcGen, "\tFOROVER(%d){ESCAPE(%d)\n", i, i); } @@ -2183,7 +2170,7 @@ static void reduxAppendLoops (redux_ctx* ctx){ } srcbAppends (&ctx->srcGen, "\t\t\n"); - for (i=ctx->ndd;inds;i++){ + for (i=ctx->ndfd;indfs;i++){ srcbAppendf (&ctx->srcGen, "\t\tFOROVER(%d){ESCAPE(%d)\n", i, i); } @@ -2243,7 +2230,7 @@ static void reduxAppendLoops (redux_ctx* ctx){ break; } - for (i=ctx->ndd;inds;i++){ + for (i=ctx->ndfd;indfs;i++){ srcbAppends(&ctx->srcGen, "\t\t}\n"); } srcbAppends(&ctx->srcGen, "\t\t\n"); @@ -2269,16 +2256,10 @@ static void reduxAppendLoops (redux_ctx* ctx){ srcbEndList (&ctx->srcGen); srcbAppends (&ctx->srcGen, ");\n"); - for (i=0;indd;i++){ + for (i=0;indfd;i++){ srcbAppends(&ctx->srcGen, "\t}\n"); } } -static void reduxAppendInitKernel (redux_ctx* ctx){ - /* BUG: Implement this for small code model. */ -} -static void reduxAppendPostKernel (redux_ctx* ctx){ - /* BUG: Implement this for small code model. */ -} /** * @brief Compile the kernel from source code. 
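 *
 * (The typecode list assembled here mirrors the argument order used later
 *  in reduxInvoke(): srcPtr, srcOff, srcSteps, srcSize, chunkSize, then,
 *  only when the kernel requires them, dstPtr/dstOff/dstSteps and
 *  dstArgPtr/dstArgOff/dstArgSteps.)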
@@ -2288,8 +2269,6 @@ static int reduxCompile (redux_ctx* ctx){ int ret, i = 0; int PRI_TYPECODES[11]; size_t PRI_TYPECODES_LEN; - int* AUX_TYPECODES; - size_t AUX_TYPECODES_LEN; /** @@ -2312,8 +2291,6 @@ static int reduxCompile (redux_ctx* ctx){ PRI_TYPECODES[i++] = GA_BUFFER; /* dstArgSteps */ } PRI_TYPECODES_LEN = i; - AUX_TYPECODES = &PRI_TYPECODES[3]; - AUX_TYPECODES_LEN = PRI_TYPECODES_LEN-3; /** @@ -2335,34 +2312,6 @@ static int reduxCompile (redux_ctx* ctx){ return reduxCleanup(ctx, ret); } } - if (reduxIsSmallCodeModel(ctx)){ - ret = GpuKernel_init(&ctx->kernel, - ctx->gpuCtx, - 1, - (const char**)&ctx->sourceCode, - &ctx->sourceCodeLen, - "initKer", - AUX_TYPECODES_LEN, - AUX_TYPECODES, - GA_USE_CLUDA, - &ctx->errorString1); - if (ret != GA_NO_ERROR){ - return reduxCleanup(ctx, ret); - } - ret = GpuKernel_init(&ctx->kernel, - ctx->gpuCtx, - 1, - (const char**)&ctx->sourceCode, - &ctx->sourceCodeLen, - "postKer", - AUX_TYPECODES_LEN, - AUX_TYPECODES, - GA_USE_CLUDA, - &ctx->errorString2); - if (ret != GA_NO_ERROR){ - return reduxCleanup(ctx, ret); - } - } return reduxSchedule(ctx); } @@ -2373,60 +2322,41 @@ static int reduxCompile (redux_ctx* ctx){ */ static int reduxSchedule (redux_ctx* ctx){ - int i, priNdims = 0, auxNdims = 0; - uint64_t maxLgRdx = 0, maxLgPre = 0, maxLgPost = 0; - uint64_t maxLgPri = 0, maxLgAux = 0; + int i, priNdims = 0; + uint64_t maxLgRdx = 0; + uint64_t maxLgPri = 0; uint64_t maxLs [MAX_HW_DIMS]; uint64_t maxGg; uint64_t maxGs [MAX_HW_DIMS]; uint64_t priDims[MAX_HW_DIMS]; - uint64_t auxDims[MAX_HW_DIMS]; uint64_t bs [MAX_HW_DIMS]; uint64_t gs [MAX_HW_DIMS]; uint64_t cs [MAX_HW_DIMS]; - size_t warpSize, - maxL, maxL0, maxL1, maxL2, - maxG, maxG0, maxG1, maxG2; + size_t warpSize, maxL; + axis_desc* axis; /** * Obtain the constraints of our problem. 
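 *
 * (Presumably the per-dimension grid/block limits were already captured in
 *  ctx->maxGs/maxLs/maxGg when the context properties were queried; only
 *  the kernel-specific preferred warp size and maximum block size are read
 *  here, through GA_KERNEL_PROP_PREFLSIZE and GA_KERNEL_PROP_MAXLSIZE.)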
*/ - - gpudata_property (ctx->src->data, GA_CTX_PROP_MAXLSIZE0, &maxL0); - gpudata_property (ctx->src->data, GA_CTX_PROP_MAXLSIZE1, &maxL1); - gpudata_property (ctx->src->data, GA_CTX_PROP_MAXLSIZE2, &maxL2); - gpudata_property (ctx->src->data, GA_CTX_PROP_MAXGSIZE, &maxG); - gpudata_property (ctx->src->data, GA_CTX_PROP_MAXGSIZE0, &maxG0); - gpudata_property (ctx->src->data, GA_CTX_PROP_MAXGSIZE1, &maxG1); - gpudata_property (ctx->src->data, GA_CTX_PROP_MAXGSIZE2, &maxG2); + gpukernel_property(ctx->kernel.k, GA_KERNEL_PROP_PREFLSIZE, &warpSize); gpukernel_property(ctx->kernel.k, GA_KERNEL_PROP_MAXLSIZE, &maxL); maxLgRdx = maxL; maxLgPri = maxLgRdx; - if (reduxIsSmallCodeModel(ctx)){ - gpukernel_property(ctx->preKernel.k, GA_KERNEL_PROP_MAXLSIZE, &maxL); - maxLgPre = maxL; - gpukernel_property(ctx->postKernel.k, GA_KERNEL_PROP_MAXLSIZE, &maxL); - maxLgPost = maxL; - maxLgAux = maxLgPrepri.ndh; - maxGs[0] = maxG0; - maxGs[1] = maxG1; - maxGs[2] = maxG2; - maxGg = maxG; - maxLs[0] = maxL0; - maxLs[1] = maxL1; - maxLs[2] = maxL2; - for (i=0;isrc->dimensions[ctx->pri.axisList[i]]; - } - if (reduxIsSmallCodeModel(ctx)){ - auxNdims = ctx->aux.ndh; - for (i=0;isrc->dimensions[ctx->aux.axisList[i]]; + + priNdims = ctx->st1.ndh; + maxGs[0] = ctx->maxGs[0]; + maxGs[1] = ctx->maxGs[1]; + maxGs[2] = ctx->maxGs[2]; + maxGg = ctx->maxGg; + maxLs[0] = ctx->maxLs[0]; + maxLs[1] = ctx->maxLs[1]; + maxLs[2] = ctx->maxLs[2]; + for (i=0;indfs;i++){ + axis = reduxGetSrcFlatAxis(ctx, i); + if(axisIsHW(axis, 0)){ + priDims[axisGetHWAxisNum(axis, 0)] = axisGetLen(axis); } } @@ -2443,28 +2373,12 @@ static int reduxSchedule (redux_ctx* ctx){ maxGg, maxGs, bs, gs, cs); for (i=0;ipri.bs[i] = bs[i]; - ctx->pri.gs[i] = gs[i]; - ctx->pri.cs[i] = cs[i]; + ctx->st1.bs[i] = bs[i]; + ctx->st1.gs[i] = gs[i]; + ctx->st1.cs[i] = cs[i]; } if (priNdims <= 0){ - ctx->pri.bs[i] = ctx->pri.gs[i] = ctx->pri.cs[i] = 1; - } - } - if (reduxIsSmallCodeModel(ctx)){ - reduxScheduleKernel(auxNdims, - auxDims, - warpSize, - maxLgAux, maxLs, - maxGg, maxGs, - bs, gs, cs); - for (i=0;iaux.bs[i] = bs[i]; - ctx->aux.gs[i] = gs[i]; - ctx->aux.cs[i] = cs[i]; - } - if (auxNdims <= 0){ - ctx->aux.bs[i] = ctx->aux.gs[i] = ctx->aux.cs[i] = 1; + ctx->st1.bs[i] = ctx->st1.gs[i] = ctx->st1.cs[i] = 1; } } @@ -2576,7 +2490,6 @@ static void reduxScheduleKernel (int ndims, static int reduxInvoke (redux_ctx* ctx){ void* priArgs[11]; - void* auxArgs[ 8]; int ret, i = 0; int failedDstSteps = 0; int failedDstArgSteps = 0; @@ -2588,47 +2501,34 @@ static int reduxInvoke (redux_ctx* ctx){ */ const int flags = GA_BUFFER_READ_ONLY|GA_BUFFER_INIT; - ctx->srcStepsGD = gpudata_alloc(ctx->gpuCtx, ctx->nds * sizeof(size_t), - ctx->src->strides, flags, 0); - ctx->srcSizeGD = gpudata_alloc(ctx->gpuCtx, ctx->nds * sizeof(size_t), - ctx->src->dimensions, flags, 0); - ctx->pri.chunkSizeGD = gpudata_alloc(ctx->gpuCtx, ctx->pri.ndh * sizeof(size_t), - ctx->pri.cs, flags, 0); - - priArgs[i++] = (void*) ctx->src->data; - priArgs[i++] = (void*)&ctx->src->offset; + ctx->srcStepsGD = gpudata_alloc(ctx->gpuCtx, ctx->ndfs * sizeof(size_t), + ctx->flatSrcStrides, flags, 0); + ctx->srcSizeGD = gpudata_alloc(ctx->gpuCtx, ctx->ndfs * sizeof(size_t), + ctx->flatSrcDimensions, flags, 0); + ctx->st1.chunkSizeGD = gpudata_alloc(ctx->gpuCtx, ctx->st1.ndh * sizeof(size_t), + ctx->st1.cs, flags, 0); + + priArgs[i++] = (void*) ctx->flatSrcData; + priArgs[i++] = (void*)&ctx->flatSrcOffset; priArgs[i++] = (void*) ctx->srcStepsGD; priArgs[i++] = (void*) ctx->srcSizeGD; - priArgs[i++] = (void*) 
ctx->pri.chunkSizeGD; + priArgs[i++] = (void*) ctx->st1.chunkSizeGD; if (reduxKernelRequiresDst (ctx)){ - ctx->dstStepsGD = gpudata_alloc(ctx->gpuCtx, ctx->ndd * sizeof(size_t), - ctx->wsDst->strides, flags, 0); - priArgs[i++] = (void*) ctx->wsDst->data; - priArgs[i++] = (void*)&ctx->wsDst->offset; + ctx->dstStepsGD = gpudata_alloc(ctx->gpuCtx, ctx->ndfd * sizeof(size_t), + ctx->flatDstStrides, flags, 0); + priArgs[i++] = (void*) ctx->flatDstData; + priArgs[i++] = (void*)&ctx->flatDstOffset; priArgs[i++] = (void*) ctx->dstStepsGD; failedDstSteps = !ctx->dstStepsGD; } if (reduxKernelRequiresDstArg(ctx)){ - ctx->dstArgStepsGD = gpudata_alloc(ctx->gpuCtx, ctx->ndd * sizeof(size_t), - ctx->wsDstArg->strides, flags, 0); - priArgs[i++] = (void*) ctx->wsDstArg->data; - priArgs[i++] = (void*)&ctx->wsDstArg->offset; + ctx->dstArgStepsGD = gpudata_alloc(ctx->gpuCtx, ctx->ndfd * sizeof(size_t), + ctx->flatDstArgStrides, flags, 0); + priArgs[i++] = (void*) ctx->flatDstArgData; + priArgs[i++] = (void*)&ctx->flatDstArgOffset; priArgs[i++] = (void*) ctx->dstArgStepsGD; failedDstArgSteps = !ctx->dstArgStepsGD; } - if (reduxIsSmallCodeModel (ctx)){ - /** - * The auxiliary kernel's args are identical to the primary kernel's, - * except that the first three arguments are deleted and the fifth - * argument (now second), called chunkSize, is different. - */ - - memcpy(auxArgs, &priArgs[3], sizeof(auxArgs)); - ctx->aux.chunkSizeGD = gpudata_alloc(ctx->gpuCtx, ctx->aux.ndh * sizeof(size_t), - ctx->aux.cs, flags, 0); - auxArgs[ 1 ] = (void*) ctx->aux.chunkSizeGD; - failedAuxChunkSize = !ctx->aux.chunkSizeGD; - } /** @@ -2637,47 +2537,21 @@ static int reduxInvoke (redux_ctx* ctx){ if (ctx->srcStepsGD && ctx->srcSizeGD && - ctx->pri.chunkSizeGD && + ctx->st1.chunkSizeGD && !failedDstSteps && !failedDstArgSteps && !failedAuxChunkSize){ - /* Pre-kernel invocation, if necessary */ - if (reduxIsSmallCodeModel(ctx)){ - ret = GpuKernel_call(&ctx->preKernel, - ctx->aux.ndh>0 ? ctx->aux.ndh : 1, - ctx->aux.gs, - ctx->aux.bs, - 0, - auxArgs); - if (ret != GA_NO_ERROR){ - return reduxCleanup(ctx, ret); - } - } - /* Reduction kernel invocation */ ret = GpuKernel_call(&ctx->kernel, - ctx->pri.ndh>0 ? ctx->pri.ndh : 1, - ctx->pri.gs, - ctx->pri.bs, + ctx->st1.ndh>0 ? ctx->st1.ndh : 1, + ctx->st1.gs, + ctx->st1.bs, 0, priArgs); if (ret != GA_NO_ERROR){ return reduxCleanup(ctx, ret); } - /* Post-kernel invocation, if necessary */ - if (reduxIsSmallCodeModel(ctx)){ - ret = GpuKernel_call(&ctx->postKernel, - ctx->aux.ndh>0 ? 
ctx->aux.ndh : 1, - ctx->aux.gs, - ctx->aux.bs, - 0, - auxArgs); - if (ret != GA_NO_ERROR){ - return reduxCleanup(ctx, ret); - } - } - return reduxCleanup(ctx, ret); }else{ return reduxCleanup(ctx, GA_MEMORY_ERROR); @@ -2689,43 +2563,30 @@ static int reduxInvoke (redux_ctx* ctx){ */ static int reduxCleanup (redux_ctx* ctx, int ret){ - if (ctx->dst != ctx->wsDst){ - if(ctx->wsDst){ - GpuArray_clear(ctx->wsDst); - } - free(ctx->wsDst); - ctx->wsDst = NULL; - } - if (ctx->dstArg != ctx->wsDstArg){ - if(ctx->wsDstArg){ - GpuArray_clear(ctx->wsDstArg); - } - free(ctx->wsDstArg); - ctx->wsDstArg = NULL; - } - - free(ctx->srcAxisList); - free(ctx->dstDims); + free(ctx->flatSrcDimensions); + free(ctx->flatSrcStrides); + free(ctx->flatDstStrides); + free(ctx->flatDstArgStrides); free(ctx->sourceCode); free(ctx->errorString0); free(ctx->errorString1); - free(ctx->errorString2); - ctx->srcAxisList = NULL; - ctx->dstDims = NULL; + ctx->flatSrcDimensions = NULL; + ctx->flatSrcStrides = NULL; + ctx->flatDstStrides = NULL; + ctx->flatDstArgStrides = NULL; ctx->sourceCode = NULL; ctx->errorString0 = NULL; ctx->errorString1 = NULL; - ctx->errorString2 = NULL; gpudata_release(ctx->srcStepsGD); gpudata_release(ctx->srcSizeGD); gpudata_release(ctx->dstStepsGD); gpudata_release(ctx->dstArgStepsGD); - gpudata_release(ctx->pri.chunkSizeGD); - gpudata_release(ctx->aux.chunkSizeGD); + gpudata_release(ctx->st1.chunkSizeGD); + gpudata_release(ctx->st2.chunkSizeGD); ctx->srcStepsGD = ctx->srcSizeGD = ctx->dstStepsGD = ctx->dstArgStepsGD = - ctx->pri.chunkSizeGD = ctx->aux.chunkSizeGD = NULL; + ctx->st1.chunkSizeGD = ctx->st2.chunkSizeGD = NULL; return ret; } From 2317ca1e864045856e7d21ba6ed393b585c651f5 Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Wed, 14 Jun 2017 18:34:44 -0400 Subject: [PATCH 15/34] More planning for 2-stage reduction. --- src/gpuarray_reduction.c | 116 +++++++++++++++++++++++++++++++-------- 1 file changed, 93 insertions(+), 23 deletions(-) diff --git a/src/gpuarray_reduction.c b/src/gpuarray_reduction.c index 24243f78ca..8f0f22f2fc 100644 --- a/src/gpuarray_reduction.c +++ b/src/gpuarray_reduction.c @@ -46,8 +46,6 @@ struct axis_desc{ ssize_t srcStride, srcOffset; ssize_t dstStride, dstOffset; ssize_t dstArgStride, dstArgOffset; - ssize_t tmpDstStride, tmpDstOffset; - ssize_t tmpDstArgStride, tmpDstArgOffset; }; typedef struct axis_desc axis_desc; @@ -274,6 +272,7 @@ struct redux_ctx{ axis_desc* xdSrc; axis_desc* xdSrcFlat; axis_desc** xdSrcPtrs; + axis_desc** xdTmpPtrs; size_t* flatSrcDimensions; ssize_t* flatSrcStrides; @@ -290,13 +289,11 @@ struct redux_ctx{ int numStages; /* Workspaces, in the case of 2-stage reduction */ - size_t* tmpSrcDimensions; + size_t* tmpDstDimensions; ssize_t* tmpDstStrides; gpudata* tmpDstData; - ssize_t tmpDstOffset; ssize_t* tmpDstArgStrides; gpudata* tmpDstArgData; - ssize_t tmpDstArgOffset; /* Source code Generator. 
*/ int srcTypeCode; @@ -383,6 +380,8 @@ static void axisInit (axis_desc* axis, static void axisMarkReduced (axis_desc* axis, int reduxNum); static int axisGetReduxNum (const axis_desc* axis); static size_t axisGetLen (const axis_desc* axis); +static size_t axisGetTmpLen (const axis_desc* axis); +static size_t axisGetSliceLen (const axis_desc* axis); static ssize_t axisGetSrcStride (const axis_desc* axis); static size_t axisGetSrcAbsStride (const axis_desc* axis); static ssize_t axisGetSrcOffset (const axis_desc* axis); @@ -409,6 +408,7 @@ static int reduxIs2Stage (const redux_ctx* ctx); static axis_desc* reduxGetSrcAxis (const redux_ctx* ctx, int i); static axis_desc* reduxGetSrcSortAxis (const redux_ctx* ctx, int i); static axis_desc* reduxGetSrcFlatAxis (const redux_ctx* ctx, int i); +static axis_desc* reduxGetTmpAxis (const redux_ctx* ctx, int i); static int reduxTryFlattenInto (const redux_ctx* ctx, axis_desc* into, const axis_desc* from); @@ -908,12 +908,6 @@ static void axisInit (axis_desc* axis, axis->dstArgStride = 0; axis->dstArgOffset = 0; - - axis->tmpDstStride = 0; - axis->tmpDstOffset = 0; - - axis->tmpDstArgStride = 0; - axis->tmpDstArgOffset = 0; } /** @@ -935,6 +929,12 @@ static int axisGetReduxNum (const axis_desc* axis){ static size_t axisGetLen (const axis_desc* axis){ return axis->len; } +static size_t axisGetTmpLen (const axis_desc* axis){ + return axis->tmpLen; +} +static size_t axisGetSliceLen (const axis_desc* axis){ + return axis->sliceLen; +} static ssize_t axisGetSrcStride (const axis_desc* axis){ return axisGetLen(axis) > 1 ? axis->srcStride : 0; } @@ -971,6 +971,9 @@ static int axisIsReduced (const axis_desc* axis){ static int axisIsHW (const axis_desc* axis, int stage){ return (stage == 0 ? axis->hwAxisStage0 : axis->hwAxisStage1) >= 0; } +static int axisIsPartialHW (const axis_desc* axis, int stage){ + return axisIsHW(axis, stage) && axis->sliceLen != axis->len; +} static int axisGetHWAxisNum (const axis_desc* axis, int stage){ return stage == 0 ? axis->hwAxisStage0 : axis->hwAxisStage1; } @@ -1034,7 +1037,7 @@ static int reduxRequiresDstArg (const redux_ctx* ctx){ * This is semantically subtly different from reduxHasDst(). The main * difference is in the implementation of the GA_REDUCE_ARGMIN/ARGMAX * reductions; Either *might* require a dst buffer, which will have to be - * allocated, even though it will be discared. + * allocated, even though it will be discarded. */ static int reduxKernelRequiresDst (const redux_ctx* ctx){ @@ -1147,6 +1150,14 @@ static axis_desc* reduxGetSrcFlatAxis (const redux_ctx* ctx, int i){ return &ctx->xdSrcFlat[i]; } +/** + * @brief Get description of temporary axis with given number. + */ + +static axis_desc* reduxGetTmpAxis (const redux_ctx* ctx, int i){ + return ctx->xdTmpPtrs[i]; +} + /** * @brief Attempt to flatten an axis `from` into an axis `into`. * @@ -1343,7 +1354,6 @@ static int reduxInferProperties (redux_ctx* ctx){ ctx->ndr = ctx->reduxLen; ctx->ndd = ctx->nds - ctx->ndr; ctx->ndfs = ctx->ndfr = ctx->ndfd = 0; - ctx->ndt = ctx->ndd + 1; /* Insane reduxList? */ for (i=0;indr;i++){ @@ -1749,13 +1759,11 @@ static int reduxSelectNumStages (redux_ctx* ctx){ ctx->prodFreeAxes >= ctx->prodRdxAxes || /* More destinations than reductions? */ ctx->prodFreeAxes >= parallelism ){ /* Destination very large? */ ctx->numStages = 1; - return reduxPlan1Stage(ctx); }else{ /* BUG: Switch to 2Stage when small code model fixed. */ - (void)reduxPlan2Stage; ctx->numStages = 1; - return reduxPlan1Stage(ctx); } + return ctx->numStages == 1 ? 
reduxPlan1Stage(ctx) : reduxPlan2Stage(ctx); } /** @@ -1812,11 +1820,13 @@ static int reduxPlan1Stage (redux_ctx* ctx){ */ static int reduxPlan2Stage (redux_ctx* ctx){ - int i; + int i, j, ret = 0; axis_desc* axis; - size_t a = 1, aL, aPartial, target = ctx->maxLg; + size_t a = 1, aL, aPartial, target = reduxEstimateParallelism(ctx), sz; /** + * Plan Stage 0. + * * Sort axis descriptions reduction-axes-first then longest-first, and * select up to 3 reduction axes, splitting them s.t. their product does * not exceed the max block size. @@ -1841,7 +1851,7 @@ static int reduxPlan2Stage (redux_ctx* ctx){ if(a <= target){ axis->hwAxisStage0 = i; axis->sliceLen = aL; - axis->tmpLen = (axis->len+axis->sliceLen-1)/axis->sliceLen; + axis->tmpLen = 1; ctx->st1.ndh++; }else{ @@ -1866,15 +1876,67 @@ static int reduxPlan2Stage (redux_ctx* ctx){ * We now have enough information to allocate the workspaces. */ - if(!reduxRequiresDst (ctx) && reduxKernelRequiresDst(ctx)){ + ctx->ndt = ctx->ndfs - ctx->st1.ndh + ctx->st1.ndhp; + ctx->xdTmpPtrs = malloc(ctx->ndt*sizeof(*ctx->xdTmpPtrs)); + ctx->tmpDstDimensions = malloc(ctx->ndt*sizeof(*ctx->tmpDstDimensions)); + ctx->tmpDstStrides = malloc(ctx->ndt*sizeof(*ctx->tmpDstStrides)); + ctx->tmpDstArgStrides = malloc(ctx->ndt*sizeof(*ctx->tmpDstArgStrides)); + if(!ctx->xdTmpPtrs || !ctx->tmpDstDimensions || !ctx->tmpDstStrides || + !ctx->tmpDstArgStrides){ + return reduxCleanup(ctx, GA_MEMORY_ERROR); + } + for(i=j=0;indfs;i++){ + axis = reduxGetSrcFlatAxis(ctx, i); + if(!axisIsHW(axis, 0) || axisIsPartialHW(axis, 0)){ + ctx->xdTmpPtrs [j] = axis; + ctx->tmpDstDimensions[j] = axisGetTmpLen(axis); + } + } + + if (reduxKernelRequiresDst(ctx)){ + sz = gpuarray_get_elsize(ctx->dstTypeCode); + for(i=ctx->ndt-1;i>=0;i--){ + ctx->tmpDstStrides[i] = sz; + sz *= ctx->tmpDstDimensions[i]; + } + + ctx->tmpDstData = gpudata_alloc(ctx->gpuCtx, sz, 0, 0, &ret); + if(ret != GA_NO_ERROR){ + return reduxCleanup(ctx, ret); + } } - if(!reduxRequiresDstArg(ctx) && reduxKernelRequiresDstArg(ctx)){ + if (reduxKernelRequiresDstArg(ctx)){ + sz = gpuarray_get_elsize(ctx->dstArgTypeCode); + for(i=ctx->ndt-1;i>=0;i--){ + ctx->tmpDstArgStrides[i] = sz; + sz *= ctx->tmpDstDimensions[i]; + } + + ctx->tmpDstArgData = gpudata_alloc(ctx->gpuCtx, sz, 0, 0, &ret); + if(ret != GA_NO_ERROR){ + return reduxCleanup(ctx, ret); + } } + /** + * Plan Stage 1. 
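+ *
+ * Stage 1 reduces the temporary workspace produced by stage 0 into the
+ * final destination. As in the 1-stage plan, the workspace axes are
+ * sorted free-axes-first by decreasing length, and up to MAX_HW_DIMS
+ * free axes become hardware axes; the remaining (reduced) workspace
+ * axes are left to the kernel's software loops.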
+ */ + + qsort(ctx->xdTmpPtrs, ctx->ndt, sizeof(*ctx->xdTmpPtrs), reduxSortPlan1Stage); - /* NOTE: Use gpuarray_get_elsize(typecode) */ + ctx->st2.ndh = 0; + ctx->st2.ndhp = 0; + ctx->st2.ndhr = 0; + + for (i=0;indfd && ihwAxisStage1 = i; + + ctx->st2.ndh++; + } + ctx->st2.ndhd = ctx->st2.ndh; return reduxGenSource(ctx); } @@ -2567,6 +2629,9 @@ static int reduxCleanup (redux_ctx* ctx, int ret){ free(ctx->flatSrcStrides); free(ctx->flatDstStrides); free(ctx->flatDstArgStrides); + free(ctx->tmpDstDimensions); + free(ctx->tmpDstStrides); + free(ctx->tmpDstArgStrides); free(ctx->sourceCode); free(ctx->errorString0); free(ctx->errorString1); @@ -2574,10 +2639,15 @@ static int reduxCleanup (redux_ctx* ctx, int ret){ ctx->flatSrcStrides = NULL; ctx->flatDstStrides = NULL; ctx->flatDstArgStrides = NULL; + ctx->tmpDstDimensions = NULL; + ctx->tmpDstStrides = NULL; + ctx->tmpDstArgStrides = NULL; ctx->sourceCode = NULL; ctx->errorString0 = NULL; ctx->errorString1 = NULL; - + + gpudata_release(ctx->tmpDstData); + gpudata_release(ctx->tmpDstArgData); gpudata_release(ctx->srcStepsGD); gpudata_release(ctx->srcSizeGD); gpudata_release(ctx->dstStepsGD); From c3977d8c1f28dd2156ae010d6306a41fd140b905 Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Tue, 27 Jun 2017 19:03:32 -0400 Subject: [PATCH 16/34] Near-complete rewrite based on 1/2-phase code model with workspace. --- src/gpuarray/reduction.h | 94 +- src/gpuarray_reduction.c | 4167 +++++++++++++++++++++++--------------- tests/check_reduction.c | 393 +++- 3 files changed, 2912 insertions(+), 1742 deletions(-) diff --git a/src/gpuarray/reduction.h b/src/gpuarray/reduction.h index f6638c9a83..c8508b841d 100644 --- a/src/gpuarray/reduction.h +++ b/src/gpuarray/reduction.h @@ -21,39 +21,76 @@ extern "C" { #endif +/* Data Structures */ +struct GpuReduction; +typedef struct GpuReduction GpuReduction; + + /** * Supported array reduction operations. */ typedef enum _ga_reduce_op { - GA_REDUCE_SUM, /* + */ - GA_REDUCE_PROD, /* * */ - GA_REDUCE_PRODNZ, /* * (!=0) */ - GA_REDUCE_MIN, /* min() */ - GA_REDUCE_MAX, /* max() */ - GA_REDUCE_ARGMIN, /* argmin() */ - GA_REDUCE_ARGMAX, /* argmax() */ - GA_REDUCE_MINANDARGMIN, /* min(), argmin() */ - GA_REDUCE_MAXANDARGMAX, /* max(), argmax() */ - GA_REDUCE_AND, /* & */ - GA_REDUCE_OR, /* | */ - GA_REDUCE_XOR, /* ^ */ - GA_REDUCE_ALL, /* &&/all() */ - GA_REDUCE_ANY, /* ||/any() */ + /* dst , dstArg */ + GA_REDUCE_SUM, /* + */ + GA_REDUCE_PROD, /* * */ + GA_REDUCE_PRODNZ, /* * (!=0) */ + GA_REDUCE_MIN, /* min() */ + GA_REDUCE_MAX, /* max() */ + GA_REDUCE_ARGMIN, /* argmin() */ + GA_REDUCE_ARGMAX, /* argmax() */ + GA_REDUCE_MINANDARGMIN, /* min() , argmin() */ + GA_REDUCE_MAXANDARGMAX, /* max() , argmax() */ + GA_REDUCE_AND, /* & */ + GA_REDUCE_OR, /* | */ + GA_REDUCE_XOR, /* ^ */ + GA_REDUCE_ALL, /* &&/all() */ + GA_REDUCE_ANY, /* ||/any() */ } ga_reduce_op; +/* External Functions */ /** - * @brief Compute a reduction over a list of axes to reduce. + * @brief Create a new GPU reduction operator over a list of axes to reduce. + * + * @param [out] gr The reduction operator. + * @param [in] gpuCtx The GPU context. + * @param [in] op The reduction operation to perform. + * @param [in] ndf The minimum number of destination dimensions to support. + * @param [in] ndr The minimum number of reduction dimensions to support. + * @param [in] srcTypeCode The data type of the source operand. + * @param [in] flags Reduction operator creation flags. Currently must be + * set to 0. 
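+ *
+ * A minimal usage sketch (hypothetical names; it assumes a valid
+ * gpucontext* gpuCtx and already-allocated GpuArrays dst and src, with
+ * src summed over its axis 1):
+ *
+ *     GpuReduction* gr;
+ *     int axes[1] = {1};
+ *     int err = GpuReduction_new(&gr, gpuCtx, GA_REDUCE_SUM,
+ *                                src.nd-1, 1, src.typecode, 0);
+ *     if (err == GA_NO_ERROR){
+ *         err = GpuReduction_call(gr, &dst, NULL, &src, 1, axes, 0);
+ *         GpuReduction_free(gr);
+ *     }
+ *
+ * (NULL is passed for dstArg since a sum produces no argument tensor.)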
+ * + * @return GA_NO_ERROR if the operator was created successfully, or a non-zero + * error code otherwise. + */ + +GPUARRAY_PUBLIC int GpuReduction_new (GpuReduction** grOut, + gpucontext* gpuCtx, + ga_reduce_op op, + unsigned ndf, + unsigned ndr, + int srcTypeCode, + int flags); + +/** + * @brief Deallocate an operator allocated by GpuReduction_new(). + */ + +GPUARRAY_PUBLIC void GpuReduction_free (GpuReduction* gr); + +/** + * @brief Invoke an operator allocated by GpuReduction_new() on a source tensor. * * Returns one (in the case of min-and-argmin/max-and-argmax, two) destination * tensors. The destination tensor(s)' axes are a strict subset of the axes of the * source tensor. The axes to be reduced are specified by the caller, and the * reduction is performed over these axes, which are then removed in the * destination. - * - * @param [in] op The reduction operation to perform. + * + * @param [in] gr The reduction operator. * @param [out] dst The destination tensor. Has the same type as the source. * @param [out] dstArg For argument of minima/maxima operations. Has type int64. * @param [in] src The source tensor. @@ -76,19 +113,20 @@ typedef enum _ga_reduce_op { * * where (i3,i4,i1) are the coordinates of the maximum- * valued element within subtensor [i0,:,i2,:,:] of src. - * @return GA_NO_ERROR if the operation was successful, or a non-zero error - * code otherwise. + * @param [in] flags Reduction operator invocation flags. Currently must be + * set to 0. + * + * @return GA_NO_ERROR if the operator was invoked successfully, or a non-zero + * error code otherwise. */ -GPUARRAY_PUBLIC int GpuArray_reduction (ga_reduce_op op, - GpuArray* dst, - GpuArray* dstArg, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList); - - - +GPUARRAY_PUBLIC int GpuReduction_call (GpuReduction* gr, + GpuArray* dst, + GpuArray* dstArg, + const GpuArray* src, + unsigned reduxLen, + const int* reduxList, + int flags); #ifdef __cplusplus diff --git a/src/gpuarray_reduction.c b/src/gpuarray_reduction.c index 8f0f22f2fc..81376b93b8 100644 --- a/src/gpuarray_reduction.c +++ b/src/gpuarray_reduction.c @@ -24,11 +24,8 @@ /* Defines */ +#define DIVIDECEIL(a,b) (((a)+(b)-1)/(b)) #define MAX_HW_DIMS 3 -#define KERNEL_PRIMARY 0 -#define KERNEL_AUXILIARY 1 -#define AXIS_FREE 0 -#define AXIS_REDUX 1 @@ -40,17 +37,21 @@ struct axis_desc{ int reduxNum; - unsigned isReduced : 1; - int hwAxisStage0, hwAxisStage1; - size_t len, tmpLen, sliceLen; - ssize_t srcStride, srcOffset; - ssize_t dstStride, dstOffset; - ssize_t dstArgStride, dstArgOffset; + int ibNum; + unsigned ibp; + unsigned isReduced : 1; + unsigned isIntra : 1; + size_t len; + size_t splitLen; + size_t pdim; + ssize_t srcStride; + ssize_t dstStride; + ssize_t dstArgStride; }; typedef struct axis_desc axis_desc; /** - * Reduction Kernel Generator. + * Reduction Kernel Invoker. * * INTRO * @@ -65,20 +66,8 @@ typedef struct axis_desc axis_desc; * 1. Maximizing the use of coalesced memory loads within a warp. * 2. Maximizing the # of useful threads within a warp. * 3. Maximizing the number of warps within a block. - * - * NOTE: It is possible to guarantee for any tensor problem of at least - * 2*WARP_SIZE in scale that either - * 1. All warp blocks in the X dimension have more than 50% threads - * active 100% of the time, or - * 2. The warp blocks in the X dimension have 100% threads active more - * than 50% of the time. - * - * 4. 
Ensuring there are no more blocks than are permitted by the warp - * configuration and 2nd-stage workspace size (if required). - * 5. Ensuring there are no more than 5 blocks per multiprocessor. - * 6. Minimizing the 2nd-stage workspace (if it is required). - * 7. Striding the 2nd-stage workspace for maximum convenience (if it is - * required). Make it contiguous. + * 4. Ensuring there are no more than 5 blocks per multiprocessor. + * 5. Minimizing the workspace size (if it is required). * * * NOTES @@ -98,11 +87,9 @@ typedef struct axis_desc axis_desc; * 11. Sorted src axes for contiguous memory accesses * 12. Ndim, shape and dtype of flattened src tensor * 13. Number of stages (1 or 2) - * 14. Ndim, shape and dtype of workspace tensor - * 15. Warp axes - * 16. Hardware axes - * 17. Software axes - * 18. Source code + * 14. Size of workspace tensor + * 15. Intrablock/split/free/reduced axes + * 16. Source code * * Rationale for dependencies: * @@ -110,140 +97,20 @@ typedef struct axis_desc axis_desc; * context is a likely error and we want to fail fast. * 2) The type and initializer of the accumulator should be determined after * the context's properties have been retrieved since they provide - * information about the device's natively-supported types and operations. - * - * REFERENCES - * - * http://lpgpu.org/wp/wp-content/uploads/2013/05/poster_andresch_acaces2014.pdf - * - * - * - * - * - * Kernel Template: - * - * The following kernel code template displays the code generated for the - * small code model. For the large code model, no pre/postRedux() kernels - * are generated (since their functionality is incorporated within the main - * redux() kernel), no atomicRedux() function needs to be generated because - * writes to global memory are unconditional and not contended. - * - * - * //Macros - * #define FOROVER - * #define ESCAPE - * #define srcVal //Indexer - * #define dstVal //Indexer - * #define dstArgVal //Indexer - * #define rdxIdx //Special reduction index computer - * - * - * //Typedefs: - * typedef float S //The type of the source array. - * typedef float T //The type of the destination array. - * typedef ssize_t A //The type of the destination argument array. - * typedef ssize_t X //The type of the indices: signed 32/64-bit. - * typedef float K //The type of the accumulator variable. - * - * - * //Initializer (in case initial value of accumulator cannot be expressed - * //as a literal) - * static K getInitValTFn(void){ - * return ... - * } - * static K getInitValKFn(void){ - * return ... - * } - * - * - * //Reduce into global memory destination a value. - * static void writeBackFn(GLOBAL_MEM T* d_, T d, - * GLOBAL_MEM A* a_, A a){ - * //Large code model: - * *dPtr = d; - * *aPtr = a; - * - * //Small code model: - * // Something complex possibly involving CAS loops - * } - * - * - * //Load data from source and apply pre-operations, coercing the type to - * //the accumulator type K. - * static K loadValFn(X i0, X i1, ..., X iN, - * const GLOBAL_MEM S* srcPtr, - * const X srcOff, - * const GLOBAL_MEM X* srcSteps, - * ...?){ - * return ... - * } - * - * - * //Initialization kernel - * KERNEL void initKer(const GLOBAL_MEM X* srcSize, - * const GLOBAL_MEM X* chunkSize, - * GLOBAL_MEM T* dstPtr, - * const X dstOff, - * const GLOBAL_MEM X* dstSteps){ - * dstVal = getInitValTFn(); - * } - * - * - * //Reduction Kernel. 
- * KERNEL void reduxKer(GLOBAL_MEM S* srcPtr, - * const X srcOff, - * const GLOBAL_MEM X* srcSteps, - * const GLOBAL_MEM X* srcSize, - * const GLOBAL_MEM X* chunkSize, - * GLOBAL_MEM T* dstPtr, - * const X dstOff, - * const GLOBAL_MEM X* dstSteps, - * GLOBAL_MEM A* dstArgPtr, - * const X dstArgOff, - * const GLOBAL_MEM X* dstArgSteps){ - * //Declare Indices - * //Compute Ranges - * - * //Outer Loops - * K rdxK = getInitValKFn(); - * A rdxA = 0; - * //Inner Loops - * K k = loadValFn(indices..., srcPtr, srcOff, srcSteps) - * rdxK = k - * rdxA = rdxIdx - * writeBackFn(&dstVal, d, &dstArgVal, a); - * } - * - * - * //Post-scalar kernel, - * KERNEL void postKer(const GLOBAL_MEM X* srcSize, - * const GLOBAL_MEM X* chunkSize, - * GLOBAL_MEM T* dst, - * const X dstOff, - * const GLOBAL_MEM X* dstSteps){ - * //Default: Nothing. - * dstVal = dstVal - * } - * - * - * Initial Reduction Values - * +--------------+-----+-----+---------+---------+-----+-----+-----+-----+-----+ - * | Type\Op | + | * | max | min | & | | | ^ | && | || | - * +--------------+-----+-----+---------+---------+-----+-----+-----+-----+-----+ - * | signed int | 0 | 1 | INT_MIN | INT_MAX | ~0 | 0 | 0 | ~0 | 0 | - * | unsigned int | 0 | 1 | 0 | ~0 | ~0 | 0 | 0 | ~0 | 0 | - * | floating | 0.0 | 1.0 | NAN | NAN | | | | | | - * +--------------+-----+-----+---------+---------+-----+-----+-----+-----+-----+ + * information about the device's natively-supported types and operations + * (e.g. half-precision float) */ struct redux_ctx{ /* Function Arguments. */ + GpuReduction* gr; ga_reduce_op op; GpuArray* dst; GpuArray* dstArg; const GpuArray* src; int reduxLen; const int* reduxList; + int flags; /* General. */ int nds; /* # Source dimensions */ @@ -252,105 +119,146 @@ struct redux_ctx{ int ndfs; /* # Flattened source dimensions */ int ndfr; /* # Flattened source dimensions */ int ndfd; /* # Flattened source dimensions */ - int ndt; /* # Temporary workspace dimensions */ + int ndib; /* # Intra-block dimensions */ int zeroAllAxes; /* # of zero-length axes in source tensor */ int zeroRdxAxes; /* # of zero-length reduction axes in source tensor */ size_t prodAllAxes; /* Product of length of all axes in source tensor */ size_t prodRdxAxes; /* Product of length of all reduction axes in source tensor */ size_t prodFreeAxes; /* Product of length of all free axes in source tensor */ - /* GPU Context & Device */ - gpucontext* gpuCtx; - unsigned numProcs; - size_t warpSize; - size_t maxLg; - size_t maxLs[MAX_HW_DIMS]; - size_t maxGg; - size_t maxGs[MAX_HW_DIMS]; - /* Flattening */ axis_desc* xdSrc; - axis_desc* xdSrcFlat; axis_desc** xdSrcPtrs; axis_desc** xdTmpPtrs; + + /* Invoker */ + int phase; + size_t U; + size_t V; + size_t B; + unsigned D; + unsigned H; + unsigned splitReduce; + unsigned splitFree; + + axis_desc* xdSplit; + + size_t* l; + size_t* lPDim; + ssize_t* sJ; + ssize_t* dJ; + ssize_t* aJ; - size_t* flatSrcDimensions; - ssize_t* flatSrcStrides; gpudata* flatSrcData; ssize_t flatSrcOffset; - ssize_t* flatDstStrides; gpudata* flatDstData; ssize_t flatDstOffset; - ssize_t* flatDstArgStrides; gpudata* flatDstArgData; ssize_t flatDstArgOffset; - /* Select number of stages */ - int numStages; + gpudata* w; + size_t SHMEM; + ssize_t wdOff; + ssize_t pdOff; + ssize_t waOff; + ssize_t paOff; - /* Workspaces, in the case of 2-stage reduction */ - size_t* tmpDstDimensions; - ssize_t* tmpDstStrides; - gpudata* tmpDstData; - ssize_t* tmpDstArgStrides; - gpudata* tmpDstArgData; - - /* Source code Generator. 
*/ - int srcTypeCode; - int dstTypeCode; - int dstArgTypeCode; - int idxTypeCode; - int accTypeCode; - const char* srcTypeStr; - const char* dstTypeStr; - const char* dstArgTypeStr; - const char* idxTypeStr; - const char* accTypeStr; - const char* initValT; - const char* initValK; - strb s; - srcb srcGen; - char* sourceCode; - size_t sourceCodeLen; - char* errorString0; - char* errorString1; - GpuKernel preKernel; - GpuKernel kernel; - GpuKernel postKernel; + unsigned* ibs; + unsigned* ibp; + size_t* iblPDim; + ssize_t* ibsOff; + ssize_t* ibdOff; + ssize_t* ibaOff; + + void** kArgs; + + + /* Scheduler */ + size_t bs; + size_t gs; +}; +typedef struct redux_ctx redux_ctx; - /** - * Scheduler - * - * There are two sets of kernels that may be scheduled: - * 1) The reduction kernel. This is the only kernel scheduled in the - * large code model. - * 2) The initialization and post-scalar kernels. These are scheduled - * only in the small code model. - * - * The reduction kernel is the "primary" kernel. The other two, if needed, - * are referred to as "auxiliary" kernels. - */ - struct{ - int ndh; - int ndhp; - int ndhd; - int ndhr; - size_t bs [MAX_HW_DIMS]; - size_t gs [MAX_HW_DIMS]; - size_t cs [MAX_HW_DIMS]; - gpudata* chunkSizeGD; - } st1, st2; +/** + * Reduction Operator. + * + * INTRO + * + * Generates the source code for a reduction kernel over arbitrarily-dimensioned, + * -shaped and -typed tensors. + * + * + * GOALS + * + * The generator has the following goals: + * + * 1. Maximizing the use of coalesced memory loads within a warp. + * 2. Maximizing the # of useful threads within a warp. + * 3. Maximizing the number of warps within a block. + * 4. Ensuring there are no more than 5 blocks per multiprocessor. + * 5. Minimizing the workspace size (if it is required). + * + * + * REFERENCES + * + * http://lpgpu.org/wp/wp-content/uploads/2013/05/poster_andresch_acaces2014.pdf + * + * + * Initial Reduction Values + * +--------------+-----+-----+---------+---------+-----+-----+-----+-----+-----+ + * | Type\Op | + | * | max | min | & | | | ^ | && | || | + * +--------------+-----+-----+---------+---------+-----+-----+-----+-----+-----+ + * | signed int | 0 | 1 | INT_MIN | INT_MAX | ~0 | 0 | 0 | ~0 | 0 | + * | unsigned int | 0 | 1 | 0 | ~0 | ~0 | 0 | 0 | ~0 | 0 | + * | floating | 0.0 | 1.0 | NAN | NAN | | | | | | + * +--------------+-----+-----+---------+---------+-----+-----+-----+-----+-----+ + */ - /* Invoker */ - gpudata* srcStepsGD; - gpudata* srcSizeGD; - gpudata* chunkSizeGD; - gpudata* dstStepsGD; - gpudata* dstArgStepsGD; +struct GpuReduction{ + /* Function Arguments. */ + gpucontext* gpuCtx; + ga_reduce_op op; + int ndd; + int ndr; + int srcTypeCode; + int flags; + + /* Misc */ + int nds; + + /* Source code Generator. 
*/ + strb s; + srcb srcGen; + char* kSourceCode; + size_t kSourceCodeLen; + int dstTypeCode; + int dstArgTypeCode; + int idxTypeCode; + int accTypeCode; + const char* srcTypeStr; + const char* dstTypeStr; + const char* dstArgTypeStr; + const char* idxTypeStr; + const char* accTypeStr; + const char* initVal; + + /* Compile */ + int log2MaxL; + int kNumArgs; + int* kArgTypeCodes; + char* kErrorString; + GpuKernel k; + + /* Scheduling */ + unsigned numProcs; + size_t maxLg; + size_t maxL0; + size_t maxGg; + size_t maxG0; + size_t maxLM; + size_t maxLK; }; -typedef struct redux_ctx redux_ctx; - /* Static Function prototypes */ @@ -361,123 +269,184 @@ static int reduxGetMinInit (int typecode, const char** prop static int reduxGetMaxInit (int typecode, const char** property); static int reduxGetAndInit (int typecode, const char** property); static int reduxGetOrInit (int typecode, const char** property); +static int reduxIsSensitive (int typecode); static int reduxSortFlatSensitive (const void* a, const void* b); static int reduxSortFlatInsensitive (const void* a, const void* b); -static int reduxSortPlan1Stage (const void* a, const void* b); -static int reduxSortPlan2Stage0 (const void* a, const void* b); -static void appendIdxes (strb* s, - const char* prologue, - const char* prefix, - int startIdx, - int endIdx, - const char* suffix, - const char* epilogue); +static int reduxSortPtrIBSrcRdSelect (const void* a, const void* b); +static int reduxSortPtrByReduxNum (const void* a, const void* b); +static int reduxSortPtrIBDstWrSelect (const void* a, const void* b); +static int reduxSortPtrIBDstArgWrSelect (const void* a, const void* b); +static int reduxSortPtrFinalOrder (const void* a, const void* b); /* Axis Description API */ -static void axisInit (axis_desc* axis, - ssize_t len, - ssize_t srcStride); -static void axisMarkReduced (axis_desc* axis, int reduxNum); -static int axisGetReduxNum (const axis_desc* axis); -static size_t axisGetLen (const axis_desc* axis); -static size_t axisGetTmpLen (const axis_desc* axis); -static size_t axisGetSliceLen (const axis_desc* axis); -static ssize_t axisGetSrcStride (const axis_desc* axis); -static size_t axisGetSrcAbsStride (const axis_desc* axis); -static ssize_t axisGetSrcOffset (const axis_desc* axis); -static ssize_t axisGetDstStride (const axis_desc* axis); -static size_t axisGetDstAbsStride (const axis_desc* axis); -static ssize_t axisGetDstOffset (const axis_desc* axis); -static ssize_t axisGetDstArgStride (const axis_desc* axis); -static size_t axisGetDstArgAbsStride (const axis_desc* axis); -static ssize_t axisGetDstArgOffset (const axis_desc* axis); -static int axisIsReduced (const axis_desc* axis); -static int axisIsHW (const axis_desc* axis, int stage); -static int axisGetHWAxisNum (const axis_desc* axis, int stage); +static void axisInit (axis_desc* axis, + ssize_t len, + ssize_t srcStride); +static void axisMarkReduced (axis_desc* axis, int reduxNum); +static void axisMarkIntraBlock (axis_desc* axis, + int ibNum, + size_t ibLen); +static int axisGetReduxNum (const axis_desc* axis); +static size_t axisGetLen (const axis_desc* axis); +static size_t axisGetIntraLen (const axis_desc* axis); +static size_t axisGetInterLen (const axis_desc* axis); +static ssize_t axisGetSrcStride (const axis_desc* axis); +static size_t axisGetSrcAbsStride (const axis_desc* axis); +static ssize_t axisGetDstStride (const axis_desc* axis); +static size_t axisGetDstAbsStride (const axis_desc* axis); +static ssize_t axisGetDstArgStride (const axis_desc* axis); +static 
size_t axisGetDstArgAbsStride (const axis_desc* axis); +static unsigned axisGetIBP (const axis_desc* axis); +static int axisGetIBNum (const axis_desc* axis); +static void axisSetIBP (axis_desc* axis, + unsigned ibp); +static size_t axisGetPDim (const axis_desc* axis); +static void axisSetPDim (axis_desc* axis, + size_t pdim); +static int axisIsReduced (const axis_desc* axis); +static int axisIsIntra (const axis_desc* axis); +static int axisIsInter (const axis_desc* axis); +static int axisIsSplit (const axis_desc* axis); /* Reduction Context API */ -/* Utilities */ -static size_t reduxEstimateParallelism (const redux_ctx* ctx); -static int reduxRequiresDst (const redux_ctx* ctx); -static int reduxRequiresDstArg (const redux_ctx* ctx); -static int reduxKernelRequiresDst (const redux_ctx* ctx); -static int reduxKernelRequiresDstArg (const redux_ctx* ctx); -static int reduxIsSensitive (const redux_ctx* ctx); -static int reduxIs1Stage (const redux_ctx* ctx); -static int reduxIs2Stage (const redux_ctx* ctx); -static axis_desc* reduxGetSrcAxis (const redux_ctx* ctx, int i); -static axis_desc* reduxGetSrcSortAxis (const redux_ctx* ctx, int i); -static axis_desc* reduxGetSrcFlatAxis (const redux_ctx* ctx, int i); -static axis_desc* reduxGetTmpAxis (const redux_ctx* ctx, int i); -static int reduxTryFlattenInto (const redux_ctx* ctx, +/* Generator Control Flow */ +static int reduxGenInit (GpuReduction* gr); +static int reduxGenInferProperties (GpuReduction* gr); +static int reduxGenSrc (GpuReduction* gr); +static void reduxGenSrcAppend (GpuReduction* gr); +static void reduxGenSrcAppendIncludes (GpuReduction* gr); +static void reduxGenSrcAppendMacroDefs (GpuReduction* gr); +static void reduxGenSrcAppendTypedefs (GpuReduction* gr); +static void reduxGenSrcAppendReduxKernel (GpuReduction* gr); +static void reduxGenSrcAppendPrototype (GpuReduction* gr); +static void reduxGenSrcAppendBlockDecode (GpuReduction* gr); +static void reduxGenSrcAppendThreadDecode (GpuReduction* gr); +static void reduxGenSrcAppendPhase0 (GpuReduction* gr); +static void reduxGenSrcAppendLoops (GpuReduction* gr, + int freeMaybeSplit, + int reduceMaybeSplit); +static void reduxGenSrcAppendLoop (GpuReduction* gr, + int initial, + int freeMaybeSplit, + int reduceMaybeSplit); +static void reduxGenSrcAppendDecrement (GpuReduction* gr); +static void reduxGenSrcAppendVertical (GpuReduction* gr, + int freeMaybeSplit, + int reduceMaybeSplit); +static void reduxGenSrcAppendIncrement (GpuReduction* gr, + int axis, + int initial, + int freeMaybeSplit, + int reduceMaybeSplit); +static void reduxGenSrcAppendDstWrite (GpuReduction* gr, + int initial, + int freeMaybeSplit, + int reduceMaybeSplit); +static void reduxGenSrcAppendPhase1 (GpuReduction* gr); +static int reduxGenCompile (GpuReduction* gr); +static int reduxGenComputeLaunchBounds (GpuReduction* gr); +static int reduxGenCleanup (GpuReduction* gr, int ret); +static int reduxGenCleanupMsg (GpuReduction* gr, int ret, + const char* fmt, ...); + +/* Generator Utilities */ +static size_t reduxGenEstimateParallelism (const GpuReduction* gr); +static int reduxGenRequiresDst (const GpuReduction* gr); +static int reduxGenRequiresDstArg (const GpuReduction* gr); +static int reduxGenKernelRequiresDst (const GpuReduction* gr); +static int reduxGenKernelRequiresDstArg (const GpuReduction* gr); +static int reduxGenAxisMaybeSplit (const GpuReduction* gr, int axis); +static size_t reduxGenGetReduxStateSize (const GpuReduction* gr); +static size_t reduxGenGetMaxLocalSize (const GpuReduction* gr); +static 
size_t reduxGenGetSHMEMSize (const GpuReduction* gr, size_t bs); +static size_t reduxGenGetSHMEMDstOff (const GpuReduction* gr, size_t bs); +static size_t reduxGenGetSHMEMDstArgOff (const GpuReduction* gr, size_t bs); + +/* Invoker Control Flow */ +static int reduxInvInit (redux_ctx* ctx); +static int reduxInvInferProperties (redux_ctx* ctx); +static int reduxInvFlattenSource (redux_ctx* ctx); +static int reduxInvComputeKArgs (redux_ctx* ctx); +static int reduxInvSchedule (redux_ctx* ctx); +static int reduxInvoke (redux_ctx* ctx); +static int reduxInvCleanup (redux_ctx* ctx, int ret); +static int reduxInvCleanupMsg (redux_ctx* ctx, int ret, + const char* fmt, ...); + +/* Invoker Utilities */ +static size_t reduxInvEstimateParallelism (const redux_ctx* ctx); +static int reduxInvRequiresDst (const redux_ctx* ctx); +static int reduxInvRequiresDstArg (const redux_ctx* ctx); +static int reduxInvKernelRequiresDst (const redux_ctx* ctx); +static unsigned reduxInvGetSplitFree (const redux_ctx* ctx); +static unsigned reduxInvGetSplitReduce (const redux_ctx* ctx); +static axis_desc* reduxInvGetSrcAxis (const redux_ctx* ctx, int i); +static axis_desc* reduxInvGetSrcSortAxis (const redux_ctx* ctx, int i); +static int reduxTryFlattenOut (const redux_ctx* ctx, + const axis_desc* out); +static int reduxTryFlattenInto (redux_ctx* ctx, axis_desc* into, const axis_desc* from); static void reduxSortAxisPtrsBy (axis_desc** ptrs, axis_desc* axes, size_t numAxes, int(*fn)(const void*, const void*)); -/* Control Flow */ -static int reduxInit (redux_ctx* ctx); -static int reduxInferProperties (redux_ctx* ctx); -static int reduxFlattenSource (redux_ctx* ctx); -static int reduxSelectNumStages (redux_ctx* ctx); -static int reduxPlan1Stage (redux_ctx* ctx); -static int reduxPlan2Stage (redux_ctx* ctx); -static int reduxGenSource (redux_ctx* ctx); -static void reduxAppendSource (redux_ctx* ctx); -static void reduxAppendIncludes (redux_ctx* ctx); -static void reduxAppendTensorDeclArgs (redux_ctx* ctx, - const char* type, - const char* baseName); -static void reduxAppendTensorCallArgs (redux_ctx* ctx, - const char* baseName); -static void reduxAppendMacroDefs (redux_ctx* ctx); -static void reduxAppendTypedefs (redux_ctx* ctx); -static void reduxAppendGetInitValFns (redux_ctx* ctx); -static void reduxAppendWriteBackFn (redux_ctx* ctx); -static void reduxAppendReduxKernel (redux_ctx* ctx); -static void reduxAppendPrototype (redux_ctx* ctx); -static void reduxAppendIndexDeclarations (redux_ctx* ctx); -static void reduxAppendRangeCalculations (redux_ctx* ctx); -static void reduxAppendLoops (redux_ctx* ctx); -static int reduxCompile (redux_ctx* ctx); -static int reduxSchedule (redux_ctx* ctx); -static void reduxScheduleKernel (int ndims, - uint64_t* dims, - uint64_t warpSize, - uint64_t maxLg, - uint64_t* maxLs, - uint64_t maxGg, - uint64_t* maxGs, - uint64_t* bs, - uint64_t* gs, - uint64_t* cs); -static int reduxInvoke (redux_ctx* ctx); -static int reduxCleanup (redux_ctx* ctx, int ret); -static int reduxCleanupMsg (redux_ctx* ctx, int ret, - const char* fmt, ...); - - -/* Function implementation */ -GPUARRAY_PUBLIC int GpuArray_reduction (ga_reduce_op op, - GpuArray* dst, - GpuArray* dstArg, - const GpuArray* src, - unsigned reduxLen, - const unsigned* reduxList){ + + +/* Function Implementations */ +/* Extern Functions */ +GPUARRAY_PUBLIC int GpuReduction_new (GpuReduction** grOut, + gpucontext* gpuCtx, + ga_reduce_op op, + unsigned ndf, + unsigned ndr, + int srcTypeCode, + int flags){ + if(!grOut){ + return 
GA_INVALID_ERROR; + } + + *grOut = calloc(1, sizeof(**grOut)); + if(*grOut){ + (*grOut)->gpuCtx = gpuCtx; + (*grOut)->op = op; + (*grOut)->ndd = (int)ndf; + (*grOut)->ndr = (int)ndr; + (*grOut)->srcTypeCode = srcTypeCode; + (*grOut)->flags = flags; + + return reduxGenInit(*grOut); + }else{ + return GA_MEMORY_ERROR; + } +} +GPUARRAY_PUBLIC void GpuReduction_free (GpuReduction* gr){ + reduxGenCleanup(gr, !GA_NO_ERROR); +} +GPUARRAY_PUBLIC int GpuReduction_call (GpuReduction* gr, + GpuArray* dst, + GpuArray* dstArg, + const GpuArray* src, + unsigned reduxLen, + const int* reduxList, + int flags){ redux_ctx ctxSTACK, *ctx = &ctxSTACK; memset(ctx, 0, sizeof(*ctx)); - ctx->op = op; + ctx->gr = gr; ctx->dst = dst; ctx->dstArg = dstArg; ctx->src = src; ctx->reduxLen = reduxLen; - ctx->reduxList = (const int*)reduxList; + ctx->reduxList = reduxList; + ctx->flags = flags; - return reduxInit(ctx); + return reduxInvInit(ctx); } + +/* Static Functions */ + /** * @brief Get an expression representing a suitable initialization value for * the given datatype and a sum-reduction operation. @@ -747,6 +716,45 @@ static int reduxGetOrInit (int typecode, const char** prop return GA_NO_ERROR; } +/** + * @brief Returns whether the reduction is sensitive. + * + * A reduction is sensitive when its output satisfies at least one of the + * following conditions: + * + * - It depends on the exact order of axes in the reduxList + * - It depends on exact signs of the strides of axes in the reduxList + * + * Such sensitivity may prevent a flattening of contiguous axes even when it + * would have been otherwise permitted. + * + * For instance, ARGMIN/ARGMAX have this sensitivity, because the dstArg + * tensor's contents are flattened coordinates into the source tensor, and + * the flattening order is precisely reduxList. Permuting it would thus produce + * incorrect output. Moreover, if the strides of a reduction axis were to be + * reversed for the purpose of flattening the axis into another, the computed + * coordinate would again be incorrect. + * + * + * TL;DR: Reduction is sensitive if + * reduce(x, axis=axisList) != reduce(x, axis=axisList[::-1]) + * or + * reduce(x) != reduce(x[::-1]) + * . + */ + +static int reduxIsSensitive (int typecode){ + switch (typecode){ + case GA_REDUCE_MINANDARGMIN: + case GA_REDUCE_MAXANDARGMAX: + case GA_REDUCE_ARGMIN: + case GA_REDUCE_ARGMAX: + return 1; + default: + return 0; + } +} + /** * @brief Sort the axes into optimal order for flattening. * @@ -811,78 +819,155 @@ static int reduxSortFlatSensitive (const void* a, const void* b){ } /** - * For the plan of a 1-stage reduction, we need to sort the free axes by - * decreasing length. + * @brief Sort the axes into optimal order for contiguous memory access. + * + * This means ascending order of absolute stride. 
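+ *
+ * For example (hypothetical byte strides), three axes with absolute
+ * source strides {512, 4, 64} sort to {4, 64, 512}, so the axis walked
+ * fastest is the one whose elements lie closest together in memory.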
*/ -static int reduxSortPlan1Stage (const void* a, const void* b){ +static int reduxSortPtrIBSrcRdSelect (const void* a, const void* b){ const axis_desc* xda = *(const axis_desc* const*)a; const axis_desc* xdb = *(const axis_desc* const*)b; + + if (axisGetSrcAbsStride(xda) < axisGetSrcAbsStride(xdb)){ + return -1; + }else if (axisGetSrcAbsStride(xda) > axisGetSrcAbsStride(xdb)){ + return +1; + } - if ( axisIsReduced(xda) && !axisIsReduced(xdb)){ + return 0; +} +static int reduxSortPtrByReduxNum (const void* a, const void* b){ + const axis_desc* xda = *(const axis_desc* const*)a; + const axis_desc* xdb = *(const axis_desc* const*)b; + + if ( axisIsReduced(xda) && !axisIsReduced(xdb)){ + return -1; + }else if (!axisIsReduced(xda) && axisIsReduced(xdb)){ return +1; - }else if (!axisIsReduced(xda) && axisIsReduced(xdb)){ + } + + if (axisGetReduxNum(xda) < axisGetReduxNum(xdb)){ + return +1; + }else if (axisGetReduxNum(xda) > axisGetReduxNum(xdb)){ return -1; } - return axisGetLen(xda) axisGetDstAbsStride(xdb)){ return +1; } - return axisGetLen(xda) axisGetDstArgAbsStride(xdb)){ + return +1; + } -/** - * @brief Append a comma-separated list of indices, whose name contains an - * incrementing integer, to a string buffer. - * - * - * @param [in] s The string buffer to which to append. - * @param [in] prologue Text that is prepended in front and NOT repeated. - * @param [in] prefix Text that is prepended in front of the integer and - * repeated. - * @param [in] startIdx First value of the integer (inclusive) - * @param [in] endIdx Last value of the integer (exclusive) - * @param [in] suffix Text that is appended after the integer, followed by - * a comma if it isn't the last index, and repeated. - * @param [in] epilogue Text that is appended and NOT repeated. - */ - -static void appendIdxes (strb* s, - const char* prologue, - const char* prefix, - int startIdx, - int endIdx, - const char* suffix, - const char* epilogue){ - int i; - - prologue = prologue ? prologue : ""; - prefix = prefix ? prefix : ""; - suffix = suffix ? suffix : ""; - epilogue = epilogue ? epilogue : ""; - - strb_appends(s, prologue); - for (i=startIdx;i axisGetIBNum(xdb)){ + return -1; + } + + return 0; + }else{ + /* All free inter axes go first (i{0..3}) */ + if ( axisIsReduced(xda) && !axisIsReduced(xdb)){ + return +1; + }else if (!axisIsReduced(xda) && axisIsReduced(xdb)){ + return -1; + } + + /* Otherwise it's sort by descending source argument absolute stride. */ + if (axisGetSrcAbsStride(xda) < axisGetSrcAbsStride(xdb)){ + return +1; + }else if (axisGetSrcAbsStride(xda) > axisGetSrcAbsStride(xdb)){ + return -1; + } } - strb_appends(s, epilogue); + + return 0; } + /* Axis Description API */ /** @@ -895,19 +980,15 @@ static void axisInit (axis_desc* axis, memset(axis, 0, sizeof(*axis)); axis->reduxNum = -1; - axis->hwAxisStage0 = axis->hwAxisStage1 = -1; + axis->ibNum = -1; + axis->ibp = 0; axis->len = len; - axis->tmpLen = 0; - axis->sliceLen = 0; + axis->splitLen = 1; + axis->pdim = 0; axis->srcStride = srcStride; - axis->srcOffset = 0; - axis->dstStride = 0; - axis->dstOffset = 0; - axis->dstArgStride = 0; - axis->dstArgOffset = 0; } /** @@ -919,6 +1000,18 @@ static void axisMarkReduced (axis_desc* axis, int r axis->reduxNum = reduxNum; } +/** + * @brief Mark axis as (split) intrablock axis. + */ + +static void axisMarkIntraBlock (axis_desc* axis, + int ibNum, + size_t ibLen){ + axis->isIntra = 1; + axis->ibNum = ibNum; + axis->splitLen = ibLen; +} + /** * @brief Get properties of an axis. 
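 *
 * Illustrative example (hypothetical values): an axis of length 100 that was
 * split by axisMarkIntraBlock(axis, ibNum, 32) reports
 *
 *     axisGetIntraLen(axis) == 32                        (per-block share)
 *     axisGetInterLen(axis) == DIVIDECEIL(100, 32) == 4  (inter-block steps)
 *
 * whereas an unsplit intra axis reports intraLen == len and interLen == 1,
 * and an inter (non-intra) axis reports intraLen == 1 and interLen == len.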
*/ @@ -929,11 +1022,23 @@ static int axisGetReduxNum (const axis_desc* axis){ static size_t axisGetLen (const axis_desc* axis){ return axis->len; } -static size_t axisGetTmpLen (const axis_desc* axis){ - return axis->tmpLen; +static size_t axisGetIntraLen (const axis_desc* axis){ + if (axisIsSplit(axis)){ + return axis->splitLen; + }else if (axisIsIntra(axis)){ + return axis->len; + }else{ + return 1; + } } -static size_t axisGetSliceLen (const axis_desc* axis){ - return axis->sliceLen; +static size_t axisGetInterLen (const axis_desc* axis){ + if (axisIsSplit(axis)){ + return DIVIDECEIL(axis->len, axis->splitLen); + }else if (axisIsIntra(axis)){ + return 1; + }else{ + return axis->len; + } } static ssize_t axisGetSrcStride (const axis_desc* axis){ return axisGetLen(axis) > 1 ? axis->srcStride : 0; @@ -942,9 +1047,6 @@ static size_t axisGetSrcAbsStride (const axis_desc* axis){ return axisGetSrcStride(axis)<0 ? -(size_t)axisGetSrcStride(axis): +(size_t)axisGetSrcStride(axis); } -static ssize_t axisGetSrcOffset (const axis_desc* axis){ - return axis->srcOffset; -} static ssize_t axisGetDstStride (const axis_desc* axis){ return axisGetLen(axis) > 1 ? axis->dstStride : 0; } @@ -952,9 +1054,6 @@ static size_t axisGetDstAbsStride (const axis_desc* axis){ return axisGetDstStride(axis)<0 ? -(size_t)axisGetDstStride(axis): +(size_t)axisGetDstStride(axis); } -static ssize_t axisGetDstOffset (const axis_desc* axis){ - return axis->dstOffset; -} static ssize_t axisGetDstArgStride (const axis_desc* axis){ return axisGetLen(axis) > 1 ? axis->dstArgStride : 0; } @@ -962,175 +1061,70 @@ static size_t axisGetDstArgAbsStride (const axis_desc* axis){ return axisGetDstArgStride(axis)<0 ? -(size_t)axisGetDstArgStride(axis): +(size_t)axisGetDstArgStride(axis); } -static ssize_t axisGetDstArgOffset (const axis_desc* axis){ - return axis->dstArgOffset; +static unsigned axisGetIBP (const axis_desc* axis){ + return axis->ibp; } -static int axisIsReduced (const axis_desc* axis){ +static int axisGetIBNum (const axis_desc* axis){ + return axis->ibNum; +} +static void axisSetIBP (axis_desc* axis, + unsigned ibp){ + axis->ibp = ibp; +} +static size_t axisGetPDim (const axis_desc* axis){ + return axis->pdim; +} +static void axisSetPDim (axis_desc* axis, + size_t pdim){ + axis->pdim = pdim; +} +static int axisIsReduced (const axis_desc* axis){ return axis->isReduced; } -static int axisIsHW (const axis_desc* axis, int stage){ - return (stage == 0 ? axis->hwAxisStage0 : axis->hwAxisStage1) >= 0; +static int axisIsIntra (const axis_desc* axis){ + return axis->isIntra; } -static int axisIsPartialHW (const axis_desc* axis, int stage){ - return axisIsHW(axis, stage) && axis->sliceLen != axis->len; +static int axisIsInter (const axis_desc* axis){ + return !axisIsIntra(axis); } -static int axisGetHWAxisNum (const axis_desc* axis, int stage){ - return stage == 0 ? axis->hwAxisStage0 : axis->hwAxisStage1; +static int axisIsSplit (const axis_desc* axis){ + return axisIsIntra(axis) && axis->splitLen != axis->len; } - -/** - * @brief Estimate the level of parallelism in the device. - * - * This is a rough target number of threads. It would definitely fill the - * device, plus some substantial margin. - */ - -static size_t reduxEstimateParallelism (const redux_ctx* ctx){ - /** - * An arbitrary margin factor ensuring there will be a few thread blocks - * per SMX. - * - * E.g. 
on Kepler, each SMX can handle up to two 1024-thread blocks - * simultaneously, so a margin of 6/SMX should ensure with very high - * likelyhood that all SMXes will be fed and kept busy. - */ - - size_t marginFactor = 6; - - return marginFactor*ctx->numProcs*ctx->maxLg; +static size_t reduxInvEstimateParallelism (const redux_ctx* ctx){ + return reduxGenEstimateParallelism(ctx->gr); } - -/** - * @brief Returns whether the reduction interface requires a dst argument. - */ - -static int reduxRequiresDst (const redux_ctx* ctx){ - switch (ctx->op){ - case GA_REDUCE_ARGMIN: - case GA_REDUCE_ARGMAX: - return 0; - default: - return 1; - } +static int reduxInvRequiresDst (const redux_ctx* ctx){ + return reduxGenRequiresDst(ctx->gr); } - -/** - * @brief Returns whether the reduction interface requires a dstArg argument. - */ - -static int reduxRequiresDstArg (const redux_ctx* ctx){ - switch (ctx->op){ - case GA_REDUCE_MINANDARGMIN: - case GA_REDUCE_MAXANDARGMAX: - case GA_REDUCE_ARGMIN: - case GA_REDUCE_ARGMAX: - return 1; - default: - return 0; - } +static int reduxInvRequiresDstArg (const redux_ctx* ctx){ + return reduxGenRequiresDstArg(ctx->gr); } - -/** - * @brief Returns whether the generated kernel internally requires a dst - * argument. - * - * This is semantically subtly different from reduxHasDst(). The main - * difference is in the implementation of the GA_REDUCE_ARGMIN/ARGMAX - * reductions; Either *might* require a dst buffer, which will have to be - * allocated, even though it will be discarded. - */ - -static int reduxKernelRequiresDst (const redux_ctx* ctx){ - switch (ctx->op){ - case GA_REDUCE_ARGMIN: - case GA_REDUCE_ARGMAX: - return reduxIs2Stage(ctx); - default: - return 1; - } +static int reduxInvKernelRequiresDst (const redux_ctx* ctx){ + return reduxGenKernelRequiresDst(ctx->gr); } - -/** - * @brief Returns whether the generated kernel internally requires a dstArg - * argument. - * - * This is semantically subtly different from reduxHasDstArg(), since it asks - * whether the reduction, even though it does not accept a dstArg argument, - * still requires a dstArg internally. - */ - -static int reduxKernelRequiresDstArg (const redux_ctx* ctx){ - /** - * At present there exists no reduction whose implementation requires - * a dstArg but whose interface does not. - * - * E.g. the max() and min() reductions do NOT currently require a temporary - * buffer for indexes, and will not in the foreseeable future. - */ - - return reduxRequiresDstArg(ctx); +static int reduxInvKernelRequiresDstArg (const redux_ctx* ctx){ + return reduxGenKernelRequiresDstArg(ctx->gr); } - -/** - * @brief Returns whether the reduction is sensitive. - * - * A reduction is sensitive when its output satisfies at least one of the - * following conditions: - * - * - It depends on the exact order of axes in the reduxList - * - It depends on exact signs of the strides of axes in the reduxList - * - * Such sensitivity may prevent a flattening of contiguous axes even when it - * would have been otherwise permitted. - * - * For instance, ARGMIN/ARGMAX have this sensitivity, because the dstArg - * tensor's contents are flattened coordinates into the source tensor, and - * the flattening order is precisely reduxList. Permuting it would thus produce - * incorrect output. Moreover, if the strides of a reduction axis were to be - * reversed for the purpose of flattening the axis into another, the computed - * coordinate would again be incorrect. 
- * - * - * TL;DR: Reduction is sensitive if - * reduce(x, axis=axisList) != reduce(x, axis=axisList[::-1]) - * or - * reduce(x) != reduce(x[::-1]) - * . - */ - -static int reduxIsSensitive (const redux_ctx* ctx){ - switch (ctx->op){ - case GA_REDUCE_MINANDARGMIN: - case GA_REDUCE_MAXANDARGMAX: - case GA_REDUCE_ARGMIN: - case GA_REDUCE_ARGMAX: - return 1; - default: - return 0; +static unsigned reduxInvGetSplitFree (const redux_ctx* ctx){ + if(ctx->xdSplit && !axisIsReduced(ctx->xdSplit)){ + return axisGetIntraLen(ctx->xdSplit); + }else{ + return 1; } } - -/** - * @brief Is the reduction 1-stage? - */ - -static int reduxIs1Stage (const redux_ctx* ctx){ - return ctx->numStages == 1; -} - -/** - * @brief Is the reduction 2-stage? - */ - -static int reduxIs2Stage (const redux_ctx* ctx){ - return !reduxIs1Stage(ctx); +static unsigned reduxInvGetSplitReduce (const redux_ctx* ctx){ + if(ctx->xdSplit && axisIsReduced(ctx->xdSplit)){ + return axisGetIntraLen(ctx->xdSplit); + }else{ + return 1; + } } /** * @brief Get description of source axis with given number. */ -static axis_desc* reduxGetSrcAxis (const redux_ctx* ctx, int i){ +static axis_desc* reduxInvGetSrcAxis (const redux_ctx* ctx, int i){ return &ctx->xdSrc[i]; } @@ -1138,24 +1132,30 @@ static axis_desc* reduxGetSrcAxis (const redux_ctx* ctx, int i){ * @brief Get description of source axis with given number in sort-order. */ -static axis_desc* reduxGetSrcSortAxis (const redux_ctx* ctx, int i){ +static axis_desc* reduxInvGetSrcSortAxis (const redux_ctx* ctx, int i){ return ctx->xdSrcPtrs[i]; } /** - * @brief Get description of flattened source axis with given number. - */ - -static axis_desc* reduxGetSrcFlatAxis (const redux_ctx* ctx, int i){ - return &ctx->xdSrcFlat[i]; -} - -/** - * @brief Get description of temporary axis with given number. + * @brief Attempt to flatten out an axis from the context. + * + * An axis can be flattened out if: + * + * 1. The axis is of length 1. + * 2. The axis is a reduction axis, and there exists at least one reduction + * axis of length 0 in the source tensor. + * + * @return Non-zero if flattening attempt successful; Zero otherwise. */ -static axis_desc* reduxGetTmpAxis (const redux_ctx* ctx, int i){ - return ctx->xdTmpPtrs[i]; +static int reduxTryFlattenOut (const redux_ctx* ctx, + const axis_desc* out){ + if ((axisGetLen (out) == 1 )|| + (axisIsReduced(out) && ctx->zeroRdxAxes > 0)){ + return 1; + }else{ + return 0; + } } /** @@ -1179,7 +1179,7 @@ static axis_desc* reduxGetTmpAxis (const redux_ctx* ctx, int i){ * @return Non-zero if flattening attempt successful; Zero otherwise. 
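 *
 * Worked example (hypothetical axes): flattening a "from" axis of length 10
 * and source stride 4 into an "into" axis of length 3 and source stride 40
 * is permitted because |40| == |4|*10; the result is a single axis of
 * length 30 with stride 4. If the two strides had had opposite signs (and
 * the reduction were not order-sensitive), the "from" axis would be
 * logically reversed and the displacement folded into ctx->flatSrcOffset
 * (and likewise flatDstOffset/flatDstArgOffset for the destination tensors).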
*/ -static int reduxTryFlattenInto (const redux_ctx* ctx, +static int reduxTryFlattenInto (redux_ctx* ctx, axis_desc* into, const axis_desc* from){ int signSrc = 0, signDst = 0, signDstArg = 0, @@ -1190,12 +1190,12 @@ static int reduxTryFlattenInto (const redux_ctx* ctx, return 0; } - if (reduxRequiresDst(ctx) && + if (reduxInvRequiresDst (ctx) && axisGetDstAbsStride (into) != axisGetDstAbsStride (from)*axisGetLen(from)){ return 0; } - if (reduxRequiresDstArg(ctx) && + if (reduxInvRequiresDstArg(ctx) && axisGetDstArgAbsStride(into) != axisGetDstArgAbsStride(from)*axisGetLen(from)){ return 0; } @@ -1204,47 +1204,44 @@ static int reduxTryFlattenInto (const redux_ctx* ctx, signDst = (axisGetDstStride (into)^axisGetDstStride (from)) < 0; signDstArg = (axisGetDstArgStride(into)^axisGetDstArgStride(from)) < 0; reverseSrc = signSrc; - reverseDst = signDst && reduxRequiresDst (ctx); - reverseDstArg = signDstArg && reduxRequiresDstArg(ctx); + reverseDst = signDst && reduxInvRequiresDst (ctx); + reverseDstArg = signDstArg && reduxInvRequiresDstArg(ctx); - if (reduxIsSensitive(ctx)){ + if (reduxIsSensitive(ctx->op)){ if(reverseSrc || reverseDst || reverseDstArg){ return 0; } } - if (reduxRequiresDst (ctx) && - reduxRequiresDstArg(ctx) && + if (reduxInvRequiresDst (ctx) && + reduxInvRequiresDstArg(ctx) && reverseDst != reverseDstArg){ /* Either both, or neither, of dst and dstArg must require reversal. */ return 0; } if (reverseSrc){ - into->srcOffset += (ssize_t)(axisGetLen(from)-1)*axisGetSrcStride(from); - into->srcStride = -axisGetSrcStride (from); + ctx->flatSrcOffset += (ssize_t)(axisGetLen(from)-1)*axisGetSrcStride(from); + into->srcStride = -axisGetSrcStride (from); }else{ - into->srcStride = axisGetSrcStride (from); + into->srcStride = axisGetSrcStride (from); } if (reverseDst){ - into->dstOffset += (ssize_t)(axisGetLen(from)-1)*axisGetDstStride(from); - into->dstStride = -axisGetDstStride (from); + ctx->flatDstOffset += (ssize_t)(axisGetLen(from)-1)*axisGetDstStride(from); + into->dstStride = -axisGetDstStride (from); }else{ - into->dstStride = axisGetDstStride (from); + into->dstStride = axisGetDstStride (from); } if (reverseDstArg){ - into->dstArgOffset += (ssize_t)(axisGetLen(from)-1)*axisGetDstArgStride(from); - into->dstArgStride = -axisGetDstArgStride(from); + ctx->flatDstArgOffset += (ssize_t)(axisGetLen(from)-1)*axisGetDstArgStride(from); + into->dstArgStride = -axisGetDstArgStride(from); }else{ - into->dstArgStride = axisGetDstArgStride(from); + into->dstArgStride = axisGetDstArgStride(from); } - into->srcOffset += axisGetSrcOffset (from); - into->dstOffset += axisGetDstOffset (from); - into->dstArgOffset += axisGetDstArgOffset(from); - into->len *= axisGetLen (from); + into->len *= axisGetLen(from); return 1; } @@ -1267,159 +1264,76 @@ static void reduxSortAxisPtrsBy (axis_desc** ptrs, qsort(ptrs, numAxes, sizeof(*ptrs), fn); } + /** - * @brief Initialize the context. + * @brief Initialize generator context. * - * After this function, calling reduxCleanup() becomes safe. + * After this function, calling reduxGenCleanup*() becomes safe. */ -static int reduxInit (redux_ctx* ctx){ - int i; - - /** - * We initialize certain parts of the context. 
- */ - - ctx->gpuCtx = NULL; - - ctx->srcTypeStr = ctx->dstTypeStr = ctx->dstArgTypeStr = - ctx->accTypeStr = ctx->idxTypeStr = NULL; - ctx->initValK = NULL; - ctx->sourceCode = NULL; - ctx->errorString0 = NULL; - ctx->errorString1 = NULL; - - ctx->numStages = 1; - ctx->prodAllAxes = ctx->prodRdxAxes = ctx->prodFreeAxes = 1; - strb_init(&ctx->s); - srcbInit (&ctx->srcGen, &ctx->s); - - for (i=0;ist2.bs [i] = ctx->st1.bs [i] = 1; - ctx->st2.gs [i] = ctx->st1.gs [i] = 1; - ctx->st2.cs [i] = ctx->st1.cs [i] = 1; - } - - ctx->srcStepsGD = ctx->srcSizeGD = - ctx->dstStepsGD = ctx->dstArgStepsGD = - ctx->st1.chunkSizeGD = ctx->st2.chunkSizeGD = NULL; - - return reduxInferProperties(ctx); +static int reduxGenInit (GpuReduction* gr){ + gr->kArgTypeCodes = NULL; + gr->kSourceCode = NULL; + gr->kErrorString = NULL; + + return reduxGenInferProperties(gr); } /** - * @brief Begin inferring the properties of the reduction. + * @brief Begin inferring the properties of the reduction operator. */ -static int reduxInferProperties (redux_ctx* ctx){ - axis_desc* a; - int i, j, retT, retK; - size_t d; - - - /* Source code buffer preallocation failed? */ - if (strb_ensure(&ctx->s, 4*1024) != 0){ - return reduxCleanupMsg(ctx, GA_MEMORY_ERROR, - "Could not preallocate source code buffer!\n"); +static int reduxGenInferProperties (GpuReduction* gr){ + int i, ret; + int k; + + + /** + * Insane arguments? + */ + + if(gr->ndr <= 0){ + return reduxGenCleanupMsg(gr, GA_INVALID_ERROR, + "No reduction axes!\n"); } - - - /* Insane src, reduxLen, dst or dstArg? */ - if (!ctx->src){ - return reduxCleanupMsg(ctx, GA_INVALID_ERROR, - "src is NULL!\n"); - }else if (ctx->src->nd <= 0){ - return reduxCleanupMsg(ctx, GA_INVALID_ERROR, - "src has less than 1 dimensions!\n"); - }else if (ctx->reduxLen <= 0){ - return reduxCleanupMsg(ctx, GA_INVALID_ERROR, - "List of dimensions to be reduced is empty!\n"); - }else if (ctx->src->nd < (unsigned)ctx->reduxLen){ - return reduxCleanupMsg(ctx, GA_INVALID_ERROR, - "src has fewer dimensions than there are dimensions to reduce!\n"); - }else if (reduxRequiresDst (ctx) && !ctx->dst){ - return reduxCleanupMsg(ctx, GA_INVALID_ERROR, - "dst is NULL, but reduction requires it!\n"); - }else if (reduxRequiresDstArg(ctx) && !ctx->dstArg){ - return reduxCleanupMsg(ctx, GA_INVALID_ERROR, - "dstArg is NULL, but reduction requires it!\n"); - }else if (ctx->dst && ctx->dst->nd +ctx->reduxLen != ctx->src->nd){ - return reduxCleanupMsg(ctx, GA_INVALID_ERROR, - "dst is of incorrect dimensionality for this reduction!\n"); - }else if (ctx->dstArg && ctx->dstArg->nd+ctx->reduxLen != ctx->src->nd){ - return reduxCleanupMsg(ctx, GA_INVALID_ERROR, - "dstArg is of incorrect dimensionality for this reduction!\n"); + if(gr->ndd < 0){ + return reduxGenCleanupMsg(gr, GA_INVALID_ERROR, + "Destination has less than 0 dimensions!\n"); } - ctx->nds = ctx->src->nd; - ctx->ndr = ctx->reduxLen; - ctx->ndd = ctx->nds - ctx->ndr; - ctx->ndfs = ctx->ndfr = ctx->ndfd = 0; - - /* Insane reduxList? */ - for (i=0;indr;i++){ - j = ctx->reduxList[i]; - if (j < -ctx->nds || j >= ctx->nds){ - return reduxCleanupMsg(ctx, GA_INVALID_ERROR, - "Insane axis number %d! Should be [%d, %d)!\n", - j, -ctx->nds, ctx->nds); - } - j = j<0 ? ctx->nds+j : j; - d = ctx->src->dimensions[j]; - ctx->zeroRdxAxes += !d; - ctx->prodRdxAxes *= d?d:1; + if(gr->flags != 0){ + return reduxGenCleanupMsg(gr, GA_INVALID_ERROR, + "\"flags\" must be set to 0!\n"); } - - + gr->nds = gr->ndr+gr->ndd; + + /** - * Insane shape? 
- * - * The source tensor is allowed to be empty (its shape may contain 0s). - * However, all axes that are of length 0 must be reduction axes. - * - * The reason for this is that a reduction cannot store any output into an - * empty destination tensor (whose dimensions are the free axes), because - * it has 0 space. The operation cannot then fulfill its contract. - * - * On the other hand, when some or all reduction axes of a tensor are of - * length 0, the reduction can be interpreted as initializing the - * destination tensor to the identity value of the operation. For lack of a - * better idea, the destination argument tensor can then be zeroed. + * Source code buffer preallocation failed? */ - - for (i=0;inds;i++){ - d = ctx->src->dimensions[i]; - ctx->zeroAllAxes += !d; - ctx->prodAllAxes *= d?d:1; - } - if (ctx->zeroAllAxes != ctx->zeroRdxAxes){ - return reduxCleanupMsg(ctx, GA_INVALID_ERROR, - "Source tensor has length-0 dimensions that are not reduced!"); + + if (strb_ensure(&gr->s, 32*1024) != 0){ + return reduxGenCleanupMsg(gr, GA_MEMORY_ERROR, + "Could not preallocate source code buffer!\n"); } - ctx->prodFreeAxes = ctx->prodAllAxes/ctx->prodRdxAxes; - - + srcbInit(&gr->srcGen, &gr->s); + + /** * GPU context non-existent, or cannot read its properties? */ - - ctx->gpuCtx = GpuArray_context(ctx->src); - if (!ctx->gpuCtx || - gpucontext_property(ctx->gpuCtx, GA_CTX_PROP_NUMPROCS, &ctx->numProcs) != GA_NO_ERROR || - gpucontext_property(ctx->gpuCtx, GA_CTX_PROP_MAXLSIZE, &ctx->maxLg) != GA_NO_ERROR || - gpudata_property(ctx->src->data, GA_CTX_PROP_MAXLSIZE0, &ctx->maxLs[0]) != GA_NO_ERROR || - gpudata_property(ctx->src->data, GA_CTX_PROP_MAXLSIZE1, &ctx->maxLs[1]) != GA_NO_ERROR || - gpudata_property(ctx->src->data, GA_CTX_PROP_MAXLSIZE2, &ctx->maxLs[2]) != GA_NO_ERROR || - gpudata_property(ctx->src->data, GA_CTX_PROP_MAXGSIZE, &ctx->maxGg) != GA_NO_ERROR || - gpudata_property(ctx->src->data, GA_CTX_PROP_MAXGSIZE0, &ctx->maxGs[0]) != GA_NO_ERROR || - gpudata_property(ctx->src->data, GA_CTX_PROP_MAXGSIZE1, &ctx->maxGs[1]) != GA_NO_ERROR || - gpudata_property(ctx->src->data, GA_CTX_PROP_MAXGSIZE2, &ctx->maxGs[2]) != GA_NO_ERROR ){ - /* gpukernel_property(ctx->kernel.k, GA_KERNEL_PROP_PREFLSIZE, &warpSize); */ - return reduxCleanupMsg(ctx, GA_INVALID_ERROR, - "Error obtaining one or more properties from GPU context!\n"); + + if (!gr->gpuCtx || + gpucontext_property(gr->gpuCtx, GA_CTX_PROP_NUMPROCS, &gr->numProcs) != GA_NO_ERROR || + gpucontext_property(gr->gpuCtx, GA_CTX_PROP_MAXLSIZE, &gr->maxLg) != GA_NO_ERROR || + gpucontext_property(gr->gpuCtx, GA_CTX_PROP_MAXLSIZE0, &gr->maxL0) != GA_NO_ERROR || + gpucontext_property(gr->gpuCtx, GA_CTX_PROP_MAXGSIZE, &gr->maxGg) != GA_NO_ERROR || + gpucontext_property(gr->gpuCtx, GA_CTX_PROP_MAXGSIZE0, &gr->maxG0) != GA_NO_ERROR || + gpucontext_property(gr->gpuCtx, GA_CTX_PROP_LMEMSIZE, &gr->maxLM) != GA_NO_ERROR ){ + return reduxGenCleanupMsg(gr, GA_INVALID_ERROR, + "Error obtaining one or more properties from GPU context!\n"); } - ctx->warpSize = 32; - - + + /** * Type management. * @@ -1428,1025 +1342,1948 @@ static int reduxInferProperties (redux_ctx* ctx){ * datatype. 
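 *
 * For example, as implemented below:
 *
 *     srcTypeCode == GA_HALF   ->  accTypeCode == GA_FLOAT
 *     srcTypeCode == GA_HALF8  ->  accTypeCode == GA_FLOAT8
 *     srcTypeCode == GA_FLOAT  ->  accTypeCode == GA_FLOAT
 *
 * while dstTypeCode mirrors the source type and both dstArgTypeCode and
 * idxTypeCode are GA_SSIZE.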
*/ - ctx->srcTypeCode = ctx->src->typecode; - ctx->dstTypeCode = ctx->srcTypeCode; - ctx->dstArgTypeCode = GA_SSIZE; - ctx->idxTypeCode = GA_SSIZE; - switch (ctx->srcTypeCode){ + gr->dstTypeCode = gr->srcTypeCode; + gr->dstArgTypeCode = GA_SSIZE; + gr->idxTypeCode = GA_SSIZE; + switch (gr->srcTypeCode){ case GA_HALF: - ctx->accTypeCode = GA_FLOAT; + gr->accTypeCode = GA_FLOAT; break; case GA_HALF2: - ctx->accTypeCode = GA_FLOAT2; + gr->accTypeCode = GA_FLOAT2; break; case GA_HALF4: - ctx->accTypeCode = GA_FLOAT4; + gr->accTypeCode = GA_FLOAT4; break; case GA_HALF8: - ctx->accTypeCode = GA_FLOAT8; + gr->accTypeCode = GA_FLOAT8; break; case GA_HALF16: - ctx->accTypeCode = GA_FLOAT16; + gr->accTypeCode = GA_FLOAT16; break; default: - ctx->accTypeCode = ctx->srcTypeCode; - } - ctx->srcTypeStr = gpuarray_get_type(ctx->srcTypeCode) ->cluda_name; - ctx->dstTypeStr = gpuarray_get_type(ctx->dstTypeCode) ->cluda_name; - ctx->dstArgTypeStr = gpuarray_get_type(ctx->dstArgTypeCode)->cluda_name; - ctx->idxTypeStr = gpuarray_get_type(ctx->idxTypeCode) ->cluda_name; - ctx->accTypeStr = gpuarray_get_type(ctx->accTypeCode) ->cluda_name; - if (!ctx->srcTypeStr || - !ctx->dstTypeStr || - !ctx->dstArgTypeStr || - !ctx->idxTypeStr || - !ctx->accTypeStr ){ - return reduxCleanup(ctx, GA_INVALID_ERROR); - } - switch (ctx->op){ + gr->accTypeCode = gr->srcTypeCode; + } + gr->srcTypeStr = gpuarray_get_type(gr->srcTypeCode) ->cluda_name; + gr->dstTypeStr = gpuarray_get_type(gr->dstTypeCode) ->cluda_name; + gr->dstArgTypeStr = gpuarray_get_type(gr->dstArgTypeCode)->cluda_name; + gr->idxTypeStr = gpuarray_get_type(gr->idxTypeCode) ->cluda_name; + gr->accTypeStr = gpuarray_get_type(gr->accTypeCode) ->cluda_name; + if (!gr->srcTypeStr || + !gr->dstTypeStr || + !gr->dstArgTypeStr || + !gr->idxTypeStr || + !gr->accTypeStr ){ + return reduxGenCleanupMsg(gr, GA_INVALID_ERROR, + "Have typecode with no CLUDA name!\n"); + } + switch (gr->op){ case GA_REDUCE_SUM: - retT = reduxGetSumInit (ctx->dstTypeCode, &ctx->initValT); - retK = reduxGetSumInit (ctx->accTypeCode, &ctx->initValK); + ret = reduxGetSumInit (gr->accTypeCode, &gr->initVal); break; case GA_REDUCE_PRODNZ: case GA_REDUCE_PROD: - retT = reduxGetProdInit(ctx->dstTypeCode, &ctx->initValT); - retK = reduxGetProdInit(ctx->accTypeCode, &ctx->initValK); + ret = reduxGetProdInit(gr->accTypeCode, &gr->initVal); break; case GA_REDUCE_MINANDARGMIN: case GA_REDUCE_ARGMIN: case GA_REDUCE_MIN: - retT = reduxGetMinInit (ctx->dstTypeCode, &ctx->initValT); - retK = reduxGetMinInit (ctx->accTypeCode, &ctx->initValK); + ret = reduxGetMinInit (gr->accTypeCode, &gr->initVal); break; case GA_REDUCE_MAXANDARGMAX: case GA_REDUCE_ARGMAX: case GA_REDUCE_MAX: - retT = reduxGetMaxInit (ctx->dstTypeCode, &ctx->initValT); - retK = reduxGetMaxInit (ctx->accTypeCode, &ctx->initValK); + ret = reduxGetMaxInit (gr->accTypeCode, &gr->initVal); break; case GA_REDUCE_ALL: case GA_REDUCE_AND: - retT = reduxGetAndInit (ctx->dstTypeCode, &ctx->initValT); - retK = reduxGetAndInit (ctx->accTypeCode, &ctx->initValK); + ret = reduxGetAndInit (gr->accTypeCode, &gr->initVal); break; case GA_REDUCE_ANY: case GA_REDUCE_XOR: case GA_REDUCE_OR: - retT = reduxGetOrInit (ctx->dstTypeCode, &ctx->initValT); - retK = reduxGetOrInit (ctx->accTypeCode, &ctx->initValK); + ret = reduxGetOrInit (gr->accTypeCode, &gr->initVal); break; default: - retT = GA_UNSUPPORTED_ERROR; - retK = GA_UNSUPPORTED_ERROR; + ret = GA_UNSUPPORTED_ERROR; } - if (retT != GA_NO_ERROR){ - return reduxCleanupMsg(ctx, retT, - "Problem selecting types to 
be used in reduction!\n"); + if (ret != GA_NO_ERROR){ + return reduxGenCleanupMsg(gr, ret, + "Problem selecting types to be used in reduction!\n"); } - if (retK != GA_NO_ERROR){ - return reduxCleanupMsg(ctx, retK, - "Problem selecting types to be used in reduction!\n"); + + /* Compute floor(log2(gr->log2MaxL)). */ + gr->log2MaxL = gr->maxLg-1; + for(i=1;gr->log2MaxL & (gr->log2MaxL+1);i*=2){ + gr->log2MaxL |= gr->log2MaxL>>i; } - - - /** - * Allocate and construct source-tensor axis-description lists. - * - * While constructing the descriptions of each axis, verify that: - * - * 1. reduxLen has no duplicates. - * 2. dst and/or dstArg's dimensions match src's dimensions, stripped of - * the reduction axes. - */ - - ctx->xdSrc = calloc(ctx->nds, sizeof(*ctx->xdSrc)); - ctx->xdSrcPtrs = calloc(ctx->nds+1, sizeof(*ctx->xdSrcPtrs)); - ctx->xdSrcFlat = calloc(ctx->nds+1, sizeof(*ctx->xdSrcFlat)); - if (!ctx->xdSrc || !ctx->xdSrcPtrs || !ctx->xdSrcFlat){ - return reduxCleanup(ctx, GA_MEMORY_ERROR); + for(i=0;gr->log2MaxL;i++){ + gr->log2MaxL >>= 1; } - for (i=0;inds;i++){ - axisInit(&ctx->xdSrc[i], - ctx->src->dimensions[i], - ctx->src->strides[i]); + gr->log2MaxL = i; + + /* Compute number of kernel arguments. */ + gr->kNumArgs = 6 /* phase, U, V, B, D, H */ + + 2 /* splitFree, splitReduce */ + + gr->nds /* l{0..n} */ + + reduxGenRequiresDstArg(gr)*gr->ndr /* l{m..n}PDim */ + + 1 /* s */ + + 1 /* sOff */ + + gr->nds /* sJ{0..n} */ + + reduxGenRequiresDst (gr) /* d */ + + reduxGenRequiresDst (gr) /* dOff */ + + reduxGenRequiresDst (gr)*gr->ndd /* dJ{0..m} */ + + reduxGenRequiresDstArg(gr) /* a */ + + reduxGenRequiresDstArg(gr) /* aOff */ + + reduxGenRequiresDstArg(gr)*gr->ndd /* aJ{0..m} */ + + 1 /* w */ + + reduxGenKernelRequiresDst (gr)*2 /* wdOff, pdOff */ + + reduxGenKernelRequiresDstArg(gr)*2 /* waOff, paOff */ + + gr->log2MaxL /* bs{0..p} */ + + gr->log2MaxL /* bp{0..p} */ + + reduxGenRequiresDstArg(gr)*gr->log2MaxL /* bi{0..p} */ + + gr->log2MaxL /* bsOff{0..p} */ + + reduxGenRequiresDst (gr)*gr->log2MaxL /* bdOff{0..p} */ + + reduxGenRequiresDstArg(gr)*gr->log2MaxL;/* baOff{0..p} */ + + + /* Construct kernel argument typecode list */ + gr->kArgTypeCodes = calloc(gr->kNumArgs, sizeof(*gr->kArgTypeCodes)); + if(!gr->kArgTypeCodes){ + return reduxGenCleanupMsg(gr, GA_MEMORY_ERROR, + "Failed to allocate memory for kernel arguments " + "typecode list!\n"); } - for (i=0;indr;i++){ - j = ctx->reduxList[i]; - j = j<0 ? 
ctx->nds+j : j; - a = reduxGetSrcAxis(ctx, j); - if (axisIsReduced(a)){ - return reduxCleanupMsg(ctx, GA_INVALID_ERROR, - "Axis %d appears multiple times in the " - "reduction axis list!\n", - j); + + i = 0; + gr->kArgTypeCodes[i++] = GA_INT; /* phase */ + gr->kArgTypeCodes[i++] = GA_SIZE; /* U */ + gr->kArgTypeCodes[i++] = GA_SIZE; /* V */ + gr->kArgTypeCodes[i++] = GA_SIZE; /* B */ + gr->kArgTypeCodes[i++] = GA_UINT; /* D */ + gr->kArgTypeCodes[i++] = GA_UINT; /* H */ + gr->kArgTypeCodes[i++] = GA_UINT; /* splitFree */ + gr->kArgTypeCodes[i++] = GA_UINT; /* splitReduce */ + for(k=0;k < gr->nds;k++){ + gr->kArgTypeCodes[i++] = GA_SIZE; /* lN */ + } + for(k=0;k < gr->ndr && reduxGenRequiresDstArg(gr);k++){ + gr->kArgTypeCodes[i++] = GA_SIZE; /* lNPDim */ + } + gr->kArgTypeCodes[i++] = GA_BUFFER;/* s */ + gr->kArgTypeCodes[i++] = GA_SSIZE; /* sOff */ + for(k=0;k < gr->nds;k++){ + gr->kArgTypeCodes[i++] = GA_SSIZE; /* sJN */ + } + if(reduxGenRequiresDst (gr)){ + gr->kArgTypeCodes[i++] = GA_BUFFER;/* d */ + gr->kArgTypeCodes[i++] = GA_SSIZE; /* dOff */ + for(k=0;k < gr->ndd;k++){ + gr->kArgTypeCodes[i++] = GA_SSIZE; /* dJN */ } - axisMarkReduced(a, i); } - for (i=j=0;inds;i++){ - axis_desc* a = reduxGetSrcAxis(ctx, i); - size_t srcLen = axisGetLen(a), dstLen, dstArgLen; - - if (axisIsReduced(a)){continue;} - if (reduxRequiresDst(ctx)){ - dstLen = ctx->dst->dimensions[j]; - - if(srcLen != dstLen){ - return reduxCleanupMsg(ctx, GA_INVALID_ERROR, - "Source axis %d has length %zu, but " - "corresponding destination axis %d has length %zu!\n", - i, srcLen, j, dstLen); - } - - a->dstStride = ctx->dst->strides[j]; - } - if (reduxRequiresDstArg(ctx)){ - dstArgLen = ctx->dstArg->dimensions[j]; - - if(srcLen != dstArgLen){ - return reduxCleanupMsg(ctx, GA_INVALID_ERROR, - "Source axis %d has length %zu, but " - "corresponding destination-argument axis %d has length %zu!\n", - i, srcLen, j, dstArgLen); - } - - a->dstArgStride = ctx->dstArg->strides[j]; + if(reduxGenRequiresDstArg(gr)){ + gr->kArgTypeCodes[i++] = GA_BUFFER;/* a */ + gr->kArgTypeCodes[i++] = GA_SSIZE; /* aOff */ + for(k=0;k < gr->ndd;k++){ + gr->kArgTypeCodes[i++] = GA_SSIZE; /* aJN */ } - - j++; } + gr->kArgTypeCodes[i++] = GA_BUFFER;/* w */ + if(reduxGenKernelRequiresDst (gr)){ + gr->kArgTypeCodes[i++] = GA_SSIZE; /* wdOff */ + gr->kArgTypeCodes[i++] = GA_SSIZE; /* pdOff */ + } + if(reduxGenKernelRequiresDstArg(gr)){ + gr->kArgTypeCodes[i++] = GA_SSIZE; /* waOff */ + gr->kArgTypeCodes[i++] = GA_SSIZE; /* paOff */ + } + for(k=0;k < gr->log2MaxL;k++){ + gr->kArgTypeCodes[i++] = GA_UINT; /* ibsN */ + } + for(k=0;k < gr->log2MaxL;k++){ + gr->kArgTypeCodes[i++] = GA_UINT; /* ibpN */ + } + for(k=0;k < gr->log2MaxL && reduxGenRequiresDstArg(gr);k++){ + gr->kArgTypeCodes[i++] = GA_SIZE; /* iblNPDim */ + } + for(k=0;k < gr->log2MaxL;k++){ + gr->kArgTypeCodes[i++] = GA_SSIZE; /* ibsOffN */ + } + for(k=0;k < gr->log2MaxL && reduxGenRequiresDst (gr);k++){ + gr->kArgTypeCodes[i++] = GA_SSIZE; /* ibdOffN */ + } + for(k=0;k < gr->log2MaxL && reduxGenRequiresDstArg(gr);k++){ + gr->kArgTypeCodes[i++] = GA_SSIZE; /* ibaOffN */ + } + + return reduxGenSrc(gr); +} +/** + * @brief Generate the kernel source code for the reduction. + * + * @return GA_MEMORY_ERROR if not enough memory left; GA_NO_ERROR otherwise. + */ + +static int reduxGenSrc (GpuReduction* gr){ + reduxGenSrcAppend(gr); - /** - * Begin flattening the source tensor. 
- */ - - return reduxFlattenSource(ctx); + gr->kSourceCodeLen = gr->s.l; + gr->kSourceCode = strb_cstr(&gr->s); + + if (gr->kSourceCode){ + return reduxGenCompile(gr); + }else{ + return reduxGenCleanupMsg(gr, GA_MEMORY_ERROR, + "Failure in source code string buffer allocation " + "during codegen!\n"); + } } /** - * @brief Flatten the source tensor as much as is practical. - * - * This makes the axis lengths as long as possible and the tensor itself as - * contiguous as possible. + * @brief Append source code to the string buffer. */ -static int reduxFlattenSource (redux_ctx* ctx){ - axis_desc* axis, *flatAxis, *sortAxis; - int i, j, isSensitive; +static void reduxGenSrcAppend (GpuReduction* gr){ + reduxGenSrcAppendIncludes (gr); + reduxGenSrcAppendMacroDefs (gr); + reduxGenSrcAppendTypedefs (gr); + reduxGenSrcAppendReduxKernel (gr); +} +static void reduxGenSrcAppendIncludes (GpuReduction* gr){ + srcbAppends(&gr->srcGen, "/* Includes */\n"); + srcbAppends(&gr->srcGen, "#include \"cluda.h\"\n"); + srcbAppends(&gr->srcGen, "\n"); + srcbAppends(&gr->srcGen, "\n"); + srcbAppends(&gr->srcGen, "\n"); +} +static void reduxGenSrcAppendMacroDefs (GpuReduction* gr){ + int i; /** - * Copy source axis descriptions list to flattened source axis description - * list, in preparation for attempts at flattening. + * DECLREDUXSTATE, INITREDUXSTATE and SETREDUXSTATE macros. */ - memcpy(ctx->xdSrcFlat, ctx->xdSrc, ctx->nds*sizeof(*ctx->xdSrcFlat)); - ctx->ndfs = ctx->nds; - + if ( reduxGenKernelRequiresDst(gr) && reduxGenKernelRequiresDstArg(gr)){ + srcbAppendf(&gr->srcGen, + "#define DECLREDUXSTATE(V, I) TK V;TX I;\n" + "#define INITREDUXSTATE(V, I) do{(V) = %s;(I) = 0;}while(0)\n" + "#define SETREDUXSTATE(V, I, v, i) do{(V) = (v);(I) = (i);}while(0)\n", + gr->initVal); + }else if ( reduxGenKernelRequiresDst(gr) && !reduxGenKernelRequiresDstArg(gr)){ + srcbAppendf(&gr->srcGen, + "#define DECLREDUXSTATE(V, I) TK V;\n" + "#define INITREDUXSTATE(V, I) do{(V) = %s;}while(0)\n" + "#define SETREDUXSTATE(V, I, v, i) do{(V) = (v);}while(0)\n", + gr->initVal); + }else if (!reduxGenKernelRequiresDst(gr) && reduxGenKernelRequiresDstArg(gr)){ + srcbAppendf(&gr->srcGen, + "#define DECLREDUXSTATE(V, I) TX I;\n" + "#define INITREDUXSTATE(V, I) do{(I) = 0;}while(0)\n" + "#define SETREDUXSTATE(V, I, v, i) do{(I) = (i);}while(0)\n"); + } + + /** - * Pass 1: Flatten out 0-length dimensions. We already know that + * LOADS(v, p) macro. * - * a) There are no 0-length free dimensions, because that - * constitutes an invalid input, and - * b) How many 0-length reduction dimensions there are, because - * we counted them in the error-checking code. - * - * So if there are any 0-length axes, we can delete all reduction axes and - * replace them with a single one. + * Loads a TK-typed value v from a TS-typed source pointer p. */ - if (ctx->zeroRdxAxes > 0){ - for (i=j=0;indfs;i++){ - axis = reduxGetSrcFlatAxis(ctx, i); - - if (!axisIsReduced(axis)){ - *reduxGetSrcFlatAxis(ctx, j++) = *axis; - } - } - - axisInit (reduxGetSrcFlatAxis(ctx, j), 0, 0); - axisMarkReduced(reduxGetSrcFlatAxis(ctx, j), 0); - j++; - ctx->ndfs = j; + if (gr->srcTypeCode == GA_HALF && gr->accTypeCode == GA_FLOAT){ + srcbAppends(&gr->srcGen, "#define LOADS(v, p) do{(v) = (TK)load_half((TS*)(p));}while(0)\n"); + }else{ + srcbAppends(&gr->srcGen, "#define LOADS(v, p) do{(v) = (TK)*(TS*)(p);}while(0)\n"); } + /** - * Pass 2: Flatten out 1-length dimensions, since they can always be - * ignored; They are always indexed at [0]. + * GETIDX macro. 
+ * + * Expands to the current flattened index. */ - for (i=j=0;indfs;i++){ - axis = reduxGetSrcFlatAxis(ctx, i); - - if (axisGetLen(axis) != 1){ - *reduxGetSrcFlatAxis(ctx, j++) = *axis; - } + srcbAppends (&gr->srcGen, "#define GETIDX ("); + srcbBeginList (&gr->srcGen, " + ", "0"); + srcbAppendElemf(&gr->srcGen, "ti"); + for(i=gr->ndd;inds;i++){ + srcbAppendElemf(&gr->srcGen, "i%d*l%dPDim", i, i); + } + srcbEndList (&gr->srcGen); + srcbAppends (&gr->srcGen, ")\n"); + + /** + * REDUX macro. + * + * Performs a reduction operation, jointly reducing a datum v and its + * flattened index i into reduction states V and I respectively. + */ + + srcbAppends(&gr->srcGen, "#define REDUX(V, I, v, i) do{ \\\n"); + switch (gr->op){ + case GA_REDUCE_SUM: + srcbAppendf(&gr->srcGen, " (V) += (v); \\\n"); + break; + case GA_REDUCE_PROD: + srcbAppendf(&gr->srcGen, " (V) *= (v); \\\n"); + break; + case GA_REDUCE_PRODNZ: + srcbAppendf(&gr->srcGen, " (V) *= ((v) == 0 ? (%s) : (v)); \\\n", gr->initVal); + break; + case GA_REDUCE_MIN: + srcbAppendf(&gr->srcGen, " (V) = min((V), (v)); \\\n"); + break; + case GA_REDUCE_MAX: + srcbAppendf(&gr->srcGen, " (V) = max((V), (v)); \\\n"); + break; + case GA_REDUCE_ARGMIN: + case GA_REDUCE_MINANDARGMIN: + srcbAppendf(&gr->srcGen, " (V) = min((V), (v)); \\\n" + " if((V) == (v)){ \\\n" + " (I) = (i); \\\n" + " } \\\n"); + break; + case GA_REDUCE_ARGMAX: + case GA_REDUCE_MAXANDARGMAX: + srcbAppendf(&gr->srcGen, " (V) = max((V), (v)); \\\n" + " if((V) == (v)){ \\\n" + " (I) = (i); \\\n" + " } \\\n"); + break; + case GA_REDUCE_AND: + srcbAppendf(&gr->srcGen, " (V) &= (v); \\\n"); + break; + case GA_REDUCE_OR: + srcbAppendf(&gr->srcGen, " (V) |= (v); \\\n"); + break; + case GA_REDUCE_XOR: + srcbAppendf(&gr->srcGen, " (V) ^= (v); \\\n"); + break; + case GA_REDUCE_ALL: + srcbAppendf(&gr->srcGen, " (V) = (V) && (v); \\\n"); + break; + case GA_REDUCE_ANY: + srcbAppendf(&gr->srcGen, " (V) = (V) || (v); \\\n"); + break; } - ctx->ndfs = j; + srcbAppends(&gr->srcGen, " }while(0)\n"); + /** - * Pass 3: Flatten out continuous dimensions, where strides and sensitivity - * allows it. + * HREDUX macro. + * + * Performs a horizontal reduction operation, first intra-block permuting + * the data and its index and then reducing it till done. */ - isSensitive = reduxIsSensitive(ctx); + srcbAppends(&gr->srcGen, + "#define HREDUX(pd, pa, tp, V, I) \\\n" + " do{ \\\n" + " /* Horizontal Reduction */ \\\n" + " SETREDUXSTATE(pd[tp], pa[tp], accV, accI); \\\n" + " local_barrier(); \\\n" + " \\\n" + " h = H; \\\n" + " while(h>1){ \\\n" + " if((h&1) && (LID_0 < D)){ \\\n" + " REDUX(pd[LID_0], pa[LID_0], pd[LID_0 + D*h-D], pa[LID_0 + D*h-D]); \\\n" + " } \\\n" + " h >>= 1; \\\n" + " if(LID_0 < D*h){ \\\n" + " REDUX(pd[LID_0], pa[LID_0], pd[LID_0 + D*h ], pa[LID_0 + D*h ]); \\\n" + " } \\\n" + " local_barrier(); \\\n" + " } \\\n" + " }while(0)\n"); - qsort(ctx->xdSrcFlat, ctx->ndfs, sizeof(*ctx->xdSrcFlat), - isSensitive ? reduxSortFlatSensitive : reduxSortFlatInsensitive); + /** + * STORED macro. + * + * Stores a TK-typed value v into a TS-typed destination pointer p. 
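+ *
+ * As a sketch of the emitted definition (assuming a half-precision
+ * destination with a float accumulator, per the branch below):
+ *
+ *     #define STORED(p, v) do{store_half((TD*)(p), (v));}while(0)
+ *
+ * so a final write such as STORED(td, pd[LID_0]) converts the accumulator
+ * value back to the destination type as it is stored.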
+ */ - for (i=j=1;indfs;i++){ - flatAxis = reduxGetSrcFlatAxis(ctx, j-1); - sortAxis = reduxGetSrcFlatAxis(ctx, i); - - if (!reduxTryFlattenInto(ctx, flatAxis, sortAxis)){ - *reduxGetSrcFlatAxis(ctx, j++) = *sortAxis; + if (reduxGenRequiresDst(gr)){ + if (gr->dstTypeCode == GA_HALF && gr->accTypeCode == GA_FLOAT){ + srcbAppends(&gr->srcGen, "#define STORED(p, v) do{store_half((TD*)(p), (v));}while(0)\n"); + }else{ + srcbAppends(&gr->srcGen, "#define STORED(p, v) do{*(TD*)(p) = (v);}while(0)\n"); } + }else{ + srcbAppends(&gr->srcGen, "#define STORED(p, v) do{}while(0)\n"); } - ctx->ndfs = j; - - + + /** - * NOTE: At this point it is possible for there to be no axes - * (ctx->ndf == 0), but this will only occur if all axes of the original - * tensor were length-1 (i.e., if this was a scalar masquerading as a - * multidimensional tensor). + * STOREA macro. * - * We check for this case and simulate a 1-dimensional, 1-length tensor. + * Stores a TX-typed value v into a TA-typed destination pointer p. */ - - if(ctx->ndfs == 0){ - axisInit (reduxGetSrcFlatAxis(ctx, ctx->ndfs), 1, 0); - axisMarkReduced(reduxGetSrcFlatAxis(ctx, ctx->ndfs), 0); - ctx->ndfs = 1; + + if (reduxGenRequiresDstArg(gr)){ + srcbAppends(&gr->srcGen, "#define STOREA(p, v) do{*(TA*)(p) = (v);}while(0)\n"); + }else{ + srcbAppends(&gr->srcGen, "#define STOREA(p, v) do{}while(0)\n"); } - - + + /** - * Having flattened the tensor to the very best of our ability, allocate - * and/or compute - * - * ctx->ndfr - * ctx->ndfd - * ctx->flatSrcDimensions - * ctx->flatSrcStrides - * ctx->flatSrcData - * ctx->flatSrcOffset + axis offsets - * ctx->flatDstStrides - * ctx->flatDstData - * ctx->flatDstOffset + axis offsets - * ctx->flatDstArgStrides - * ctx->flatDstArgData - * ctx->flatDstArgOffset + axis offsets - * - * and suchlike data that will be used post-flatten. + * DIVIDECEIL macro. 
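+ *
+ * Integer division, rounding up: DIVIDECEIL(10, 4) == 3. The generated
+ * kernel uses it, for example, to size the inter-block extent of a
+ * possibly-split axis, as in DIVIDECEIL(lN, splitFree) or
+ * DIVIDECEIL(lN, splitReduce) in the block-decode code.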
*/ - ctx->flatSrcDimensions = malloc(ctx->ndfs * sizeof(*ctx->flatSrcDimensions)); - ctx->flatSrcStrides = malloc(ctx->ndfs * sizeof(*ctx->flatSrcStrides)); - ctx->flatDstStrides = malloc(ctx->ndfs * sizeof(*ctx->flatDstStrides)); - ctx->flatDstArgStrides = malloc(ctx->ndfs * sizeof(*ctx->flatDstArgStrides)); - if(!ctx->flatSrcDimensions || !ctx->flatSrcStrides || - !ctx->flatDstStrides || !ctx->flatDstArgStrides){ - return reduxCleanup(ctx, GA_MEMORY_ERROR); + srcbAppends(&gr->srcGen, "#define DIVIDECEIL(a,b) (((a)+(b)-1)/(b))\n"); + + srcbAppends(&gr->srcGen, "\n\n\n\n"); +} +static void reduxGenSrcAppendTypedefs (GpuReduction* gr){ + srcbAppendf(&gr->srcGen, "typedef %-20s TS;\n", gr->srcTypeStr); + srcbAppendf(&gr->srcGen, "typedef %-20s TD;\n", gr->dstTypeStr); + srcbAppendf(&gr->srcGen, "typedef %-20s TA;\n", gr->dstArgTypeStr); + srcbAppendf(&gr->srcGen, "typedef %-20s TX;\n", gr->idxTypeStr); + srcbAppendf(&gr->srcGen, "typedef %-20s TK;\n", gr->accTypeStr); + srcbAppendf(&gr->srcGen, "\n\n\n\n"); +} +static void reduxGenSrcAppendReduxKernel (GpuReduction* gr){ + reduxGenSrcAppendPrototype (gr); + srcbAppends (&gr->srcGen, "{\n"); + reduxGenSrcAppendBlockDecode (gr); + reduxGenSrcAppendThreadDecode(gr); + srcbAppends (&gr->srcGen, " /**\n" + " * PERFORM REDUCTION.\n" + " * \n" + " * We either perform Phase 0 or Phase 1 according to our argument.\n" + " * \n" + " * Phase 0 is the primary worker and, in special cases, is the only necessary phase.\n" + " * However, it may occasionally do only part of a reduction, in which case it leaves\n" + " * the partial reduction results in a workspace that is then read by Phase 1.\n" + " * \n" + " * Phase 1 is a fixup phase that collects any partial reduction results from Phase 0\n" + " * and completes the reduction before writing to the final destination.\n" + " */\n" + " \n" + " if(phase==0){\n"); + reduxGenSrcAppendPhase0 (gr); + srcbAppends (&gr->srcGen, " }else{\n"); + reduxGenSrcAppendPhase1 (gr); + srcbAppends (&gr->srcGen, " }\n"); + srcbAppends (&gr->srcGen, "}\n"); +} +static void reduxGenSrcAppendPrototype (GpuReduction* gr){ + int i; + + srcbAppends (&gr->srcGen, "KERNEL void redux(int phase,\n" + " TX U,\n" + " TX V,\n" + " TX B,\n" + " unsigned D,\n" + " unsigned H,\n" + " unsigned splitFree,\n" + " unsigned splitReduce,\n"); + srcbBeginList (&gr->srcGen, ",\n", "void"); + for(i=0;i<(int)(gr->ndd+gr->ndr);i++){ + srcbAppendElemf (&gr->srcGen, " TX l%d", i); + } + for(i=gr->ndd;i<(int)(gr->ndd+gr->ndr);i++){ + srcbAppendElemf (&gr->srcGen, " TX l%dPDim", i); + } + srcbAppendElemf (&gr->srcGen, " const GLOBAL_MEM char* s"); + srcbAppendElemf (&gr->srcGen, " TX sOff"); + for(i=0;i<(int)(gr->ndd+gr->ndr);i++){ + srcbAppendElemf (&gr->srcGen, " TX sJ%d", i); + } + if (reduxGenRequiresDst(gr)){ + srcbAppendElemf (&gr->srcGen, " GLOBAL_MEM char* d"); + srcbAppendElemf (&gr->srcGen, " TX dOff"); + for(i=0;i<(int)(gr->ndd);i++){ + srcbAppendElemf(&gr->srcGen, " TX dJ%d", i); + } + } + if (reduxGenRequiresDstArg(gr)){ + srcbAppendElemf (&gr->srcGen, " GLOBAL_MEM char* a"); + srcbAppendElemf (&gr->srcGen, " TX aOff"); + for(i=0;i<(int)(gr->ndd);i++){ + srcbAppendElemf(&gr->srcGen, " TX aJ%d", i); + } + } + srcbAppendElemf (&gr->srcGen, " GLOBAL_MEM char* w"); + if (reduxGenKernelRequiresDst(gr)){ + srcbAppendElemf (&gr->srcGen, " TX wdOff"); + srcbAppendElemf (&gr->srcGen, " TX pdOff"); + } + if (reduxGenKernelRequiresDstArg(gr)){ + srcbAppendElemf (&gr->srcGen, " TX waOff"); + srcbAppendElemf (&gr->srcGen, " TX paOff"); + } + 
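+	/**
+	 * Per-thread intra-block decode arguments, log2MaxL of each: ibsN
+	 * (intra-block coordinate sizes), ibpN (permute-target coefficients),
+	 * iblNPDim (flattened-index coefficients, present only for argument
+	 * reductions), and the per-thread byte offsets ibsOffN/ibdOffN/ibaOffN
+	 * into the source, destination and destination-argument tensors, as
+	 * consumed by the THREAD-decode code emitted further below.
+	 */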
for(i=0;i<(int)(gr->log2MaxL);i++){ + srcbAppendElemf (&gr->srcGen, " unsigned ibs%d", i); } + for(i=0;i<(int)(gr->log2MaxL);i++){ + srcbAppendElemf (&gr->srcGen, " unsigned ibp%d", i); + } + for(i=0;i<(int)(gr->log2MaxL) && reduxGenRequiresDstArg(gr);i++){ + srcbAppendElemf (&gr->srcGen, " TX ibl%dPDim", i); + } + for(i=0;i<(int)(gr->log2MaxL);i++){ + srcbAppendElemf (&gr->srcGen, " TX ibsOff%d", i); + } + for(i=0;i<(int)(gr->log2MaxL) && reduxGenRequiresDst (gr);i++){ + srcbAppendElemf (&gr->srcGen, " TX ibdOff%d", i); + } + for(i=0;i<(int)(gr->log2MaxL) && reduxGenRequiresDstArg(gr);i++){ + srcbAppendElemf (&gr->srcGen, " TX ibaOff%d", i); + } + srcbEndList (&gr->srcGen); + srcbAppends (&gr->srcGen, ")"); +} +static void reduxGenSrcAppendBlockDecode (GpuReduction* gr){ + int i; - ctx->flatSrcData = ctx->src->data; - ctx->flatSrcOffset = ctx->src->offset; - if(reduxRequiresDst(ctx)){ - ctx->flatDstData = ctx->dst->data; - ctx->flatDstOffset = ctx->dst->offset; + srcbAppends(&gr->srcGen, + " GA_DECL_SHARED_BODY(char, SHMEM)\n" + " DECLREDUXSTATE(accV, accI)\n" + " DECLREDUXSTATE(tmpV, tmpI)\n" + " INITREDUXSTATE(accV, accI);\n" + " \n" + " /**\n" + " * +-------------+-------------+------------+---------------------------------+\n" + " * | misalignL | misalignR | doFinish | DESCRIPTION |\n" + " * +-------------+-------------+------------+---------------------------------+\n" + " * | 0 | 0 | 0 | Impossible unless v == 0, |\n" + " * | | | | which is forbidden. |\n" + " * | | | | |\n" + " * | 0 | 0 | 1 | V % B == 0. Each block |\n" + " * | | | | handles integer number of |\n" + " * | | | | destination elements, no |\n" + " * | | | | partial results are required, |\n" + " * | | | | workspace is unused. |\n" + " * | | | | |\n" + " * | 0 | 1 | 0 | V < B. Block begins aligned |\n" + " * | | | | but ends misaligned, before |\n" + " * | | | | the end of its first element. |\n" + " * | | | | Partial result written to |\n" + " * | | | | right-half of array. |\n" + " * | | | | |\n" + " * | 0 | 1 | 1 | V > B, V % B != 0. Block |\n" + " * | | | | begins aligned but ends |\n" + " * | | | | misaligned, after the end of |\n" + " * | | | | its first element. |\n" + " * | | | | First 1 or more complete |\n" + " * | | | | elements written out directly |\n" + " * | | | | to destination. |\n" + " * | | | | Partial result of last element |\n" + " * | | | | written to right-half of array.|\n" + " * | | | | |\n" + " * | 1 | 0 | 0 | Impossible unless v == 0, |\n" + " * | | | | which is forbidden. |\n" + " * | | | | |\n" + " * | 1 | 0 | 1 | V % B != 0. Partial result of |\n" + " * | | | | first element written to left- |\n" + " * | | | | half of array. Zero or more |\n" + " * | | | | complete reductions performed |\n" + " * | | | | and written directly to |\n" + " * | | | | destination. Block ends |\n" + " * | | | | aligned. |\n" + " * | | | | |\n" + " * | 1 | 1 | 0 | V < B. Block begins misaligned |\n" + " * | | | | and ends misaligned, before |\n" + " * | | | | the end of its first element. |\n" + " * | | | | Partial result written to at |\n" + " * | | | | least right-half of array. |\n" + " * | | | | |\n" + " * | 1 | 1 | 1 | V % B != 0. Block begins |\n" + " * | | | | misaligned and ends misaligned,|\n" + " * | | | | after the end of its first |\n" + " * | | | | element. |\n" + " * | | | | Partial result of first element|\n" + " * | | | | written to left-half of array. 
|\n" + " * | | | | Partial result of last element |\n" + " * | | | | written to right-half of array.|\n" + " * | | | | 0 or more complete elements |\n" + " * | | | | written out directly to |\n" + " * | | | | destination. |\n" + " * +-------------+-------------+------------+---------------------------------+\n" + " * \n" + " * Possible configurations of blocks:\n" + " * If V % B == 0: 001\n" + " * If V < B: 010, 110, 111, 101\n" + " * If V > B: 011, 111, 101\n" + " * \n" + " * Possible configurations for collector blocks (responsible for gathering of\n" + " * results to the left):\n" + " * 101, 111 (misalignL && doFinish)\n" + " * \n" + " * Possible configurations for left-neighbours of collector blocks\n" + " * 110 (any number 0+), then exactly one of:\n" + " * 010, 011, 111\n" + " * \n" + " * Conclusion:\n" + " * - In Phase 0:\n" + " * - Always make a right-write if misalignR (010, 011, 110, 111).\n" + " * - Make a left -write at least if collector block (101, 111).\n" + " * - In Phase 1:\n" + " * - Exit if not collector block (101, 111)\n" + " * - If collector block,\n" + " * - Left -read from self\n" + " * - Right-read from all left-neighbours with same write-target.\n" + " * \n" + " * Code Structure perfectly satisfying conclusion:\n" + " * \n" + " * if(misalignL){\n" + " * while(v > 0){\n" + " * v--;\n" + " * REDUX();\n" + " * ReduxLoopIncs_CONTINUE;\n" + " * HREDUX();\n" + " * WSLeftWrite();\n" + " * REINIT();\n" + " * FreeLoopIncs_BREAK;\n" + " * BREAK;\n" + " * }\n" + " * }\n" + " * while(v > 0){\n" + " * v--;\n" + " * REDUX();\n" + " * ReduxLoopIncs_CONTINUE;\n" + " * HREDUX();\n" + " * DstWrite();\n" + " * REINIT();\n" + " * FreeLoopIncs_CONTINUE;\n" + " * BREAK;\n" + " * }\n" + " * if(misalignR){\n" + " * HREDUX();\n" + " * WSRightWrite();\n" + " * }\n" + " * \n" + " * Code Walkthrough:\n" + " * \n" + " * 000, 100: Impossible, can be ignored.\n" + " * 001: Only master loop entered, handles exact integer number of destinations.\n" + " * 010: Master loop entered but broken on vcount before HREDUX() reached.\n" + " * No reinit executed on breakout. HREDUX(), followed by WSRightWrite() of\n" + " * partial result.\n" + " * 011: Master loop entered for at least 1 full destination, then broken on\n" + " * vcount before HREDUX() reached. No reinit executed on breakout. HREDUX()\n" + " * followed by WSRightWrite() of partial result.\n" + " * 101: Left-misalign loop entered and completes a reduction. HREDUX()\n" + " * performed, WSLeftWrite() performed, reinitialization, bump of outer\n" + " * loop counters, then breakout. Master loop entered for 0 or more complete\n" + " * destination elements involving full writeouts to destination and reinit.\n" + " * Aligned on both misalignL and master loop breakouts. No entry into\n" + " * misalignR fixup.\n" + " * 110: Left-misalign loop entered, breaks on vcount before HREDUX(). No reinit\n" + " * executed on breakout. Master loop not entered. HREDUX(), followed by\n" + " * WSRightWrite() of partial result.\n" + " * 111: Left-misalign loop entered and completes a reduction. HREDUX() performed,\n" + " * WSLeftWrite() performed, reinit, bump of outer loop counters, breakout.\n" + " * Master loop entered for 0 or more complete destination elements\n" + " * involving full writeout to destination and reinit.\n" + " * Master loop broken on vcount before HREDUX(). misalignR fixup entered,\n" + " * HREDUX(), WSRightWrite().\n" + " */\n" + " \n" + " TX start = GID_0 * V;\n" + " if(start >= U){return;}\n" + " TX v = U-start < V ? 
U-start : V;\n" + " \n" + " int misalignL = (start+0)%B != 0;\n" + " int misalignR = (start+v)%B != 0;\n" + " int doFinish = (start+0)/B != (start+v)/B;\n" + " \n" + " /**\n" + " * Decode BLOCK start point.\n" + " * \n" + " * For the purpose of decoding the start point, the split axis's \"length\"\n" + " * is divided by either splitReduce or splitFree and rounded up. Therefore,\n" + " * for those axes the true computed initial starting point must be\n" + " * multiplied by either splitReduce or splitFree.\n" + " * \n" + " * Since we provide not strides but \"jumps\" to the kernel (to move as many\n" + " * things as possible into constant memory and out of the fast path), we\n" + " * must also convert jumps to strides in preparation for offsetting the\n" + " * base pointers to their starting point.\n" + " */\n" + " \n" + " unsigned Dunit = D/splitFree;\n"); + if(gr->ndd > 0){ + srcbAppendf(&gr->srcGen, + " TX l%dMul = DIVIDECEIL(l%d, splitFree);\n", + gr->ndd-1, gr->ndd-1); + } + if(gr->ndr > 0){ + srcbAppendf(&gr->srcGen, + " TX l%dMul = DIVIDECEIL(l%d, splitReduce);\n", + gr->nds-1, gr->nds-1); + } + srcbAppends(&gr->srcGen, " \n"); + for(i=gr->nds-1;i>=0;i--){ + if(i == gr->nds-1){ + srcbAppendf(&gr->srcGen, + " TX i%d = start %% l%dMul;\n", + i, i); + + }else{ + srcbAppendf(&gr->srcGen, + " TX i%d = i%d / l%d%s %% l%d%s;\n", + i, i+1, + i+1, + reduxGenAxisMaybeSplit(gr, i+1) ? "Mul" : "", + i, + reduxGenAxisMaybeSplit(gr, i) ? "Mul" : ""); + } } - if(reduxRequiresDstArg(ctx)){ - ctx->flatDstArgData = ctx->dstArg->data; - ctx->flatDstArgOffset = ctx->dstArg->offset; + srcbAppends(&gr->srcGen, " \n"); + if(gr->ndd > 0){ + srcbAppendf(&gr->srcGen, + " i%d *= splitFree;\n", + gr->ndd-1); + } + if(gr->ndr > 0){ + srcbAppendf(&gr->srcGen, + " i%d *= splitReduce;\n", + gr->nds-1); + } + srcbAppends(&gr->srcGen, " \n"); + for(i=gr->nds-1;i>=0;i--){ + if(i == gr->nds-1){ + srcbAppendf(&gr->srcGen, + " TX sS%d = (sJ%d ) / splitReduce;\n", + i, i); + }else{ + srcbAppendf(&gr->srcGen, + " TX sS%d = (sJ%d + (TX)l%d*sS%d)%s;\n", + i, i, i+1, i+1, + i == gr->ndd-1 ? 
" / splitFree" : ""); + } } - for(ctx->ndfd=ctx->ndfr=i=0;indfs;i++){ - axis = reduxGetSrcFlatAxis(ctx, i); - if(axisIsReduced(axis)){ - ctx->ndfr++; + if (reduxGenRequiresDst(gr)){ + srcbAppends(&gr->srcGen, " \n"); + for(i=gr->ndd-1;i>=0;i--){ + if(i == gr->ndd-1){ + srcbAppendf(&gr->srcGen, + " TX dS%d = (dJ%d ) / splitFree;\n", + i, i); + }else{ + srcbAppendf(&gr->srcGen, + " TX dS%d = (dJ%d + (TX)l%d*dS%d);\n", + i, i, i+1, i+1); + } + } + } + if (reduxGenRequiresDstArg(gr)){ + srcbAppends(&gr->srcGen, " \n"); + for(i=gr->ndd-1;i>=0;i--){ + if(i == gr->ndd-1){ + srcbAppendf(&gr->srcGen, + " TX aS%d = (aJ%d ) / splitFree;\n", + i, i); + }else{ + srcbAppendf(&gr->srcGen, + " TX aS%d = (aJ%d + (TX)l%d*aS%d);\n", + i, i, i+1, i+1); + } + } + } + srcbAppends(&gr->srcGen, " \n"); + srcbAppends(&gr->srcGen, " sOff += "); + srcbBeginList(&gr->srcGen, " + ", "0"); + for(i=0;inds;i++){ + srcbAppendElemf(&gr->srcGen, "(TX)i%d*sS%d", i, i); + } + srcbEndList(&gr->srcGen); + srcbAppends(&gr->srcGen, ";\n"); + if (reduxGenRequiresDst(gr)){ + srcbAppends(&gr->srcGen, " dOff += "); + srcbBeginList(&gr->srcGen, " + ", "0"); + for(i=0;indd;i++){ + srcbAppendElemf(&gr->srcGen, "(TX)i%d*dS%d", i, i); + } + srcbEndList(&gr->srcGen); + srcbAppends(&gr->srcGen, ";\n"); + } + if (reduxGenRequiresDstArg(gr)){ + srcbAppends(&gr->srcGen, " aOff += "); + srcbBeginList(&gr->srcGen, " + ", "0"); + for(i=0;indd;i++){ + srcbAppendElemf(&gr->srcGen, "(TX)i%d*aS%d", i, i); + } + srcbEndList(&gr->srcGen); + srcbAppends(&gr->srcGen, ";\n"); + } + srcbAppends(&gr->srcGen, " \n"); + if(reduxGenKernelRequiresDst(gr)){ + srcbAppends(&gr->srcGen, + " TK* wd = (TK*)(w + wdOff);\n" + " TK* wdL = &wd[0];\n" + " TK* wdR = &wd[GDIM_0*D];\n" + " TK* pd = (TK*)(SHMEM + pdOff);\n"); + } + if(reduxGenKernelRequiresDstArg(gr)){ + srcbAppends(&gr->srcGen, + " TA* wa = (TA*)(w + waOff);\n" + " TA* waL = &wa[0];\n" + " TA* waR = &wa[GDIM_0*D];\n" + " TA* pa = (TA*)(SHMEM + paOff);\n"); + } + srcbAppends(&gr->srcGen, + " \n" + " TX h, k;\n" + " \n"); +} +static void reduxGenSrcAppendThreadDecode (GpuReduction* gr){ + int i; + + srcbAppends(&gr->srcGen, + " /**\n" + " * Decode THREAD start point.\n" + " * \n" + " * This involves computing the intra-block coordinate of a thread in a\n" + " * up-to-log2(MAX_BLOCK_THREADS)-dimensional coordinate system, then using\n" + " * those coordinates to compute private source/destination/destination\n" + " * argument pointers, argument indices and permute targets.\n" + " */\n" + " \n" + " unsigned iSplit = LID_0/(LDIM_0/(splitFree*splitReduce));\n"); + + for(i=gr->log2MaxL-1;i>=0;i--){ + if(i == gr->log2MaxL-1){ + srcbAppendf(&gr->srcGen, + " int t%d = (unsigned)LID_0 %% ibs%d;\n", + i, i); }else{ - if(reduxRequiresDst(ctx)){ - ctx->flatDstStrides[ctx->ndfd] = axisGetDstStride(axis); - ctx->flatDstOffset += axisGetDstOffset(axis); + srcbAppendf(&gr->srcGen, + " int t%d = (unsigned)t%d / ibs%d %% ibs%d;\n", + i, i+1, i+1, i); + } + } + if(reduxGenRequiresDstArg(gr)){ + srcbAppends(&gr->srcGen, " TX ti = "); + srcbBeginList(&gr->srcGen, " + ", "0"); + for(i=0;ilog2MaxL;i++){ + srcbAppendElemf(&gr->srcGen, "t%d*ibl%dPDim", i, i); + } + srcbEndList(&gr->srcGen); + srcbAppends(&gr->srcGen, ";\n"); + } + srcbAppends(&gr->srcGen, " unsigned tp = "); + srcbBeginList(&gr->srcGen, " + ", "0"); + for(i=0;ilog2MaxL;i++){ + srcbAppendElemf(&gr->srcGen, "t%d* ibp%d", i, i); + } + srcbEndList(&gr->srcGen); + srcbAppends(&gr->srcGen, ";\n"); + + + + + srcbAppends(&gr->srcGen, " \n" + " sOff += "); + 
srcbBeginList(&gr->srcGen, " + ", "0"); + for(i=0;ilog2MaxL;i++){ + srcbAppendElemf(&gr->srcGen, "t%d*ibsOff%d ", i, i); + } + srcbEndList(&gr->srcGen); + srcbAppends(&gr->srcGen, ";\n"); + if(reduxGenRequiresDst(gr)){ + srcbAppends(&gr->srcGen, " \n" + " dOff += "); + srcbBeginList(&gr->srcGen, " + ", "0"); + for(i=0;ilog2MaxL;i++){ + srcbAppendElemf(&gr->srcGen, "t%d*ibdOff%d ", i, i); + } + srcbEndList(&gr->srcGen); + srcbAppends(&gr->srcGen, ";\n"); + srcbAppends(&gr->srcGen, " ((TX*)SHMEM)[tp] = dOff;\n" + " local_barrier();\n" + " dOff = ((TX*)SHMEM)[LID_0];\n" + " local_barrier();\n"); + } + if(reduxGenRequiresDstArg(gr)){ + srcbAppends(&gr->srcGen, " \n" + " aOff += "); + srcbBeginList(&gr->srcGen, " + ", "0"); + for(i=0;ilog2MaxL;i++){ + srcbAppendElemf(&gr->srcGen, "t%d*ibaOff%d ", i, i); + } + srcbEndList(&gr->srcGen); + srcbAppends(&gr->srcGen, ";\n"); + srcbAppends(&gr->srcGen, " ((TX*)SHMEM)[tp] = aOff;\n" + " local_barrier();\n" + " aOff = ((TX*)SHMEM)[LID_0];\n" + " local_barrier();\n"); + } + srcbAppends(&gr->srcGen, " \n" + " const char* ts = s + sOff;\n"); + if(reduxGenRequiresDst(gr)){ + srcbAppends(&gr->srcGen, " char* td = d + dOff;\n"); + } + if(reduxGenRequiresDstArg(gr)){ + srcbAppends(&gr->srcGen, " char* ta = a + aOff;\n"); + } + srcbAppends(&gr->srcGen, " \n" + " \n"); +} +static void reduxGenSrcAppendPhase0 (GpuReduction* gr){ + srcbAppends(&gr->srcGen, + " /* PHASE 0 */\n" + " \n" + " /* Loop Cores. */\n"); + if (gr->ndd == 0){ + /** + * Special case: If ndd == 0, we know this is an all-reduce or nearly, so + * we know that the only split axis, if any, is going to be a reduction axis. + * Therefore, splitFree will always be 1, and we only need to generate one + * set of loops. + */ + + reduxGenSrcAppendLoops(gr, 0, 1); + }else{ + srcbAppends(&gr->srcGen, " if(splitReduce == 1){\n" + " /* Free axis possibly split. */\n"); + reduxGenSrcAppendLoops(gr, 1, 0); + srcbAppends(&gr->srcGen, " }else{\n" + " /* Reduce axis possibly split. */\n"); + reduxGenSrcAppendLoops(gr, 0, 1); + srcbAppends(&gr->srcGen, " }\n"); + } +} +static void reduxGenSrcAppendLoops (GpuReduction* gr, + int freeMaybeSplit, + int reduceMaybeSplit){ + srcbAppends(&gr->srcGen, " if(misalignL){\n"); + reduxGenSrcAppendLoop(gr, 1, freeMaybeSplit, reduceMaybeSplit); + srcbAppends(&gr->srcGen, " }\n"); + reduxGenSrcAppendLoop(gr, 0, freeMaybeSplit, reduceMaybeSplit); + srcbAppends(&gr->srcGen, + " \n" + " /**\n" + " * Are we misaligned on the right? If so, we have a partial reduction\n" + " * to save.\n" + " */\n" + " \n" + " if(misalignR){\n" + " HREDUX(pd, pa, tp, accV, accI);\n" + " \n" + " /* Right-write partial reduction to workspace. 
*/\n" + " if(LID_0 < D){\n" + " SETREDUXSTATE(wdR[GID_0*D+LID_0], waR[GID_0*D+LID_0], pd[LID_0], pa[LID_0]);\n" + " }\n" + " }\n"); +} +static void reduxGenSrcAppendLoop (GpuReduction* gr, + int initial, + int freeMaybeSplit, + int reduceMaybeSplit){ + int i; + + srcbAppends(&gr->srcGen, " while(v > 0){\n"); + reduxGenSrcAppendDecrement(gr); + reduxGenSrcAppendVertical (gr, freeMaybeSplit, reduceMaybeSplit); + srcbAppends(&gr->srcGen, " /* Reduction Increments */\n"); + for(i=gr->nds-1;i >= gr->ndd;i--){ + reduxGenSrcAppendIncrement(gr, i, initial, freeMaybeSplit, reduceMaybeSplit); + } + srcbAppends(&gr->srcGen, " /* Horizontal Reduction */\n" + " HREDUX(pd, pa, tp, accV, accI);\n" + " \n"); + reduxGenSrcAppendDstWrite(gr, initial, freeMaybeSplit, reduceMaybeSplit); + srcbAppends(&gr->srcGen, " /* Reinitialize accumulators */\n" + " INITREDUXSTATE(accV, accI);\n" + " \n"); + srcbAppends(&gr->srcGen, " /* Free Increments */\n"); + for(i=gr->ndd-1;i >= 0;i--){ + reduxGenSrcAppendIncrement(gr, i, initial, freeMaybeSplit, reduceMaybeSplit); + } + srcbAppends(&gr->srcGen, " /* Exit loop */\n" + " break;\n" + " }\n"); +} +static void reduxGenSrcAppendDecrement (GpuReduction* gr){ + srcbAppends(&gr->srcGen, " /* Decrement. */\n" + " v--;\n" + " \n"); +} +static void reduxGenSrcAppendVertical (GpuReduction* gr, + int freeMaybeSplit, + int reduceMaybeSplit){ + int i; + + if(!freeMaybeSplit && !reduceMaybeSplit){ + srcbAppends(&gr->srcGen, " /* Vertical Reductions */\n" + " LOADS(tmpV, ts);\n" + " REDUX(accV, accI, tmpV, GETIDX);\n" + " \n"); + }else{ + i = freeMaybeSplit ? gr->ndd-1 : gr->nds-1; + srcbAppendf(&gr->srcGen, " /* Vertical Reductions */\n" + " if(i%d+iSplit < l%d){\n" + " LOADS(tmpV, ts);\n" + " REDUX(accV, accI, tmpV, GETIDX);\n" + " }\n" + " \n", i, i); + } +} +static void reduxGenSrcAppendIncrement (GpuReduction* gr, + int axis, + int initial, + int freeMaybeSplit, + int reduceMaybeSplit){ + const char* breakOrCont = (initial) && (axis < gr->ndd) ? 
"break" : "continue"; + + if (freeMaybeSplit && axis == gr->ndd-1){ + srcbAppendf(&gr->srcGen, + " i%d += splitFree;\n" + " ts += sJ%d;", + axis, axis); + if(reduxGenRequiresDst(gr)){ + srcbAppendf(&gr->srcGen, "td += dJ%d;", axis); + } + if(reduxGenRequiresDstArg(gr)){ + srcbAppendf(&gr->srcGen, "ta += aJ%d;", axis); + } + srcbAppends(&gr->srcGen, "\n"); + srcbAppendf(&gr->srcGen, + " if (i%d < l%d){%s;}\n" + " else {i%d = 0;}\n" + " \n", + axis, axis, breakOrCont, axis); + }else if (reduceMaybeSplit && axis == gr->nds-1){ + srcbAppendf(&gr->srcGen, + " i%d += splitReduce;\n" + " ts += sJ%d;\n" + " if (i%d < l%d){%s;}\n" + " else {i%d = 0;}\n" + " \n", + axis, axis, axis, axis, breakOrCont, axis); + }else{ + srcbAppendf(&gr->srcGen, + " i%d++;\n" + " ts += sJ%d;", + axis, axis); + if(axis < gr->ndd){ + if(reduxGenRequiresDst(gr)){ + srcbAppendf(&gr->srcGen, "td += dJ%d;", axis); } - if(reduxRequiresDstArg(ctx)){ - ctx->flatDstArgStrides[ctx->ndfd] = axisGetDstArgStride(axis); - ctx->flatDstArgOffset += axisGetDstArgOffset(axis); + if(reduxGenRequiresDstArg(gr)){ + srcbAppendf(&gr->srcGen, "ta += aJ%d;", axis); } - - ctx->ndfd++; } + srcbAppends(&gr->srcGen, "\n"); + srcbAppendf(&gr->srcGen, + " if (i%d < l%d){%s;}\n" + " else {i%d = 0;}\n" + " \n", + axis, axis, breakOrCont, axis); + } +} +static void reduxGenSrcAppendDstWrite (GpuReduction* gr, + int initial, + int freeMaybeSplit, + int reduceMaybeSplit){ + if(initial){ + srcbAppends(&gr->srcGen, " /* Workspace Left-Write */\n" + " if(LID_0 < D){\n" + " SETREDUXSTATE(wdL[GID_0*D + LID_0], waL[GID_0*D + LID_0], pd[LID_0], pa[LID_0]);\n" + " }\n" + " \n"); + }else{ + if(!freeMaybeSplit){ + srcbAppends(&gr->srcGen, " /* Destination Write */\n" + " if(LID_0 < D){\n" + " STORED(td, pd[LID_0]);\n" + " STOREA(ta, pa[LID_0]);\n" + " }\n" + " \n"); + }else{ + if(gr->ndd > 0){ + srcbAppendf(&gr->srcGen, " /* Destination Write */\n" + " if(LID_0 < (l%d-i%dndd-1, gr->ndd-1, gr->ndd-1, gr->ndd-1); + }else{ + srcbAppendf(&gr->srcGen, " STORED(td, pd[LID_0]);\n" + " STOREA(ta, pa[LID_0]);\n"); + } + } + } +} +static void reduxGenSrcAppendPhase1 (GpuReduction* gr){ + srcbAppends(&gr->srcGen, + " /* PHASE 1 */\n" + " \n" + " /**\n" + " * If we are a collector block, gather all partial results for the\n" + " * same point to the left of the current position in our workspace\n" + " * and accumulate them into our partial result, then write out to\n" + " * destination/destination argument.\n" + " * We perform a left-read of our workspace and a right-read of the\n" + " * other blocks' workspace.\n" + " */\n" + " \n" + " if(misalignL && doFinish && LID_0 < D){\n" + " SETREDUXSTATE(accV, accI, wdL[(GID_0+0)*D+LID_0], waL[(GID_0+0)*D+LID_0]);\n" + " \n" + " for(k=-1; /* Starting with the first block to our left... */\n" + " (start +0)/B == /* Is our write target the same as that of */\n" + " (start+k*V+V-1)/B; /* the target k blocks to our left? */\n" + " k--){ /* Try moving one more to the left. */\n" + " REDUX(accV, accI, wdR[(GID_0+k)*D+LID_0], waR[(GID_0+k)*D+LID_0]);\n" + " }\n" + " \n"); + if(gr->ndd > 0){ + srcbAppendf(&gr->srcGen, + " if(LID_0 < (l%d-i%dndd-1, gr->ndd-1, gr->ndd-1, gr->ndd-1); + }else{ + srcbAppends(&gr->srcGen, + " STORED(td, accV);\n" + " STOREA(ta, accI);\n"); + } + srcbAppends(&gr->srcGen, + " }\n"); +} - ctx->flatSrcDimensions[i] = axisGetLen (axis); - ctx->flatSrcStrides[i] = axisGetSrcStride(axis); - ctx->flatSrcOffset += axisGetSrcOffset(axis); +/** + * @brief Compile the generated kernel. 
+ */
+
+static int        reduxGenCompile             (GpuReduction*  gr){
+        int ret;
+
+        ret  = GpuKernel_init(&gr->k,
+                              gr->gpuCtx,
+                              1,
+                              (const char**)&gr->kSourceCode,
+                              &gr->kSourceCodeLen,
+                              "redux",
+                              gr->kNumArgs,
+                              gr->kArgTypeCodes,
+                              GA_USE_CLUDA,
+                              &gr->kErrorString);
+
+        if (ret != GA_NO_ERROR){
+                return reduxGenCleanupMsg(gr, ret,
+                       "Failed to compile reduction kernel!\n"
+                       "Error code is: %d\n"
+                       "Error string is:\n"
+                       "%s\n"
+                       "Source code is:\n"
+                       "%s\n",
+                       ret, gr->kErrorString, gr->kSourceCode);
 	}
+
+        return reduxGenComputeLaunchBounds(gr);
+}
+
+/**
+ * @brief Compute the maximum number of threads this reduction operator will
+ *        support launching.
+ */
+
+static int        reduxGenComputeLaunchBounds (GpuReduction*  gr){
+        int    ret;
+        size_t a,b,c;
+
+        /**
+         * Compute the maximum number of threads this kernel will support,
+         * since this is critical to the scheduling and will not change now
+         * that the kernel is compiled.
+         *
+         * This depends on several exhaustible resources and isn't necessarily
+         * trivial to compute due to the complicated rules we must follow to
+         * align shared memory, possibly slightly increasing consumption.
+         */
+
+        ret = gpukernel_property(gr->k.k, GA_KERNEL_PROP_MAXLSIZE, &gr->maxLK);
+        if(ret != GA_NO_ERROR){
+                return reduxGenCleanupMsg(gr, ret,
+                       "Failed to read max local size for compiled kernel!\n");
+        }
+        a = gr->maxL0;
+        b = gr->maxLg;
+        c = gr->maxLM/reduxGenGetReduxStateSize(gr);
+        /* Kernel register use */
+        gr->maxLK = gr->maxLK<a ? gr->maxLK : a;/* Maximum block size on axis 0 */
+        gr->maxLK = gr->maxLK<b ? gr->maxLK : b;/* Maximum total block size     */
+        gr->maxLK = gr->maxLK<c ? gr->maxLK : c;/* Shared memory per thread.    */
+
+        /**
+         * We now have a tight bound on the maximum block size, but due to memory
+         * alignment rules the memory consumption may be slightly higher than we
+         * initially computed, and thus the shared memory use can still be
+         * excessive. The following loop will almost certainly decrement at most
+         * once, unless type alignments are very weird.
+         */
+
+        while(reduxGenGetSHMEMSize(gr, gr->maxLK) > gr->maxLM){
+                gr->maxLK--;
+        }
+
+        return reduxGenCleanup(gr, GA_NO_ERROR);
 }
 
 /**
- * @brief Select the number of stages of the reduction.
+ * @brief Cleanup generator context.
+ */
+
+static int        reduxGenCleanup             (GpuReduction*  gr, int ret){
+        if(ret != GA_NO_ERROR){
+                free(gr->kArgTypeCodes);
+                free(gr->kSourceCode);
+                free(gr->kErrorString);
+
+                memset(gr, 0, sizeof(*gr));
+                free(gr);
+        }
+
+        return ret;
+}
+static int        reduxGenCleanupMsg          (GpuReduction*  gr, int ret,
+                                               const char*    fmt, ...){
+#if DEBUG
+        FILE* fp = stderr;
+
+        va_list ap;
+        va_start(ap, fmt);
+        vfprintf(fp, fmt, ap);
+        va_end(ap);
+        fflush(fp);
+#else
+        (void)fmt;
+#endif
+
+        return reduxGenCleanup(gr, ret);
+}
+
+/**
+ * @brief Estimate the level of parallelism available in the GPU context of
+ *        this reduction operator.
  *
- * This depends a lot on the GPU and the specific size of the reduction.
+ * This is a rough target number of threads. It would definitely fill the
+ * device, plus some substantial margin.
  */
 
-static int reduxSelectNumStages          (redux_ctx*  ctx){
-	size_t parallelism = reduxEstimateParallelism(ctx);
+static size_t     reduxGenEstimateParallelism (const GpuReduction*  gr){
+        /**
+         * An arbitrary margin factor ensuring there will be a few thread blocks
+         * per SMX.
+         *
+         * E.g. on Kepler, each SMX can handle up to two 1024-thread blocks
+         * simultaneously, so a margin of 6/SMX should ensure with very high
+         * likelihood that all SMXes will be fed and kept busy.
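+         *
+         * For example, on a hypothetical 13-SMX device with maxLg = 1024, the
+         * estimate works out to 6 * 13 * 1024 = 79872 target threads.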
+ */ - if (ctx->zeroRdxAxes || /* Reduction over 0 elements? */ - ctx->prodAllAxes <= ctx->maxLg || /* Reduction over few elements? */ - ctx->prodFreeAxes >= ctx->prodRdxAxes || /* More destinations than reductions? */ - ctx->prodFreeAxes >= parallelism ){ /* Destination very large? */ - ctx->numStages = 1; - }else{ - /* BUG: Switch to 2Stage when small code model fixed. */ - ctx->numStages = 1; + size_t marginFactor = 6; + return marginFactor * gr->numProcs * gr->maxLg; +} + +/** + * @brief Returns whether the reduction interface requires a dst argument. + */ + +static int reduxGenRequiresDst (const GpuReduction* gr){ + switch (gr->op){ + case GA_REDUCE_ARGMIN: + case GA_REDUCE_ARGMAX: + return 0; + default: + return 1; + } +} + +/** + * @brief Returns whether the reduction interface requires a dstArg argument. + */ + +static int reduxGenRequiresDstArg (const GpuReduction* gr){ + switch (gr->op){ + case GA_REDUCE_MINANDARGMIN: + case GA_REDUCE_MAXANDARGMAX: + case GA_REDUCE_ARGMIN: + case GA_REDUCE_ARGMAX: + return 1; + default: + return 0; } - return ctx->numStages == 1 ? reduxPlan1Stage(ctx) : reduxPlan2Stage(ctx); } /** - * @brief Plan a 1-stage reduction. + * @brief Returns whether the generated kernel internally requires a dst + * workspace. + * + * This is semantically subtly different from reduxGenRequiresDst(). The main + * difference is in the implementation of the GA_REDUCE_ARGMIN/ARGMAX + * reductions; both require a dst workspace buffer for the min/max values + * associated with the indices that they return, even though they will be + * discarded. * - * Inputs: ctx->xdSrcFlat[0...ctx->ndf-1] + * As of now, all reductions use a dst workspace internally. + */ + +static int reduxGenKernelRequiresDst (const GpuReduction* gr){ + return 1; +} + +/** + * @brief Returns whether the generated kernel internally requires a dstArg + * workspace. + * + * This is semantically subtly different from reduxHasDstArg(), since it asks + * whether the reduction, even though it might not accept a dstArg argument, + * still requires a dstArg workspace internally. * - * This plan involves a direct write to the destinations, and does not require - * working space. + * Currently, there exist no operations that require a dstArg workspace + * internally but which is not also part of the external interface. + */ + +static int reduxGenKernelRequiresDstArg (const GpuReduction* gr){ + return reduxGenRequiresDstArg(gr); +} + +/** + * @brief Whether or not an axis is maybe split. * - * Because the reduction is deterministic, all reductions required for any - * destination element must be performed within a single thread block. + * An axis is possibly split if it is the last free or last reduction axis. + */ + +static int reduxGenAxisMaybeSplit (const GpuReduction* gr, int axis){ + return axis == gr->ndd-1 || axis == gr->nds-1; +} + +/** + * @brief Get the number of bytes of workspace per (partial) reduction per thread. + */ + +static size_t reduxGenGetReduxStateSize (const GpuReduction* gr){ + size_t total = 0, idxSize = gpuarray_get_elsize(gr->idxTypeCode); + + /* The accumulator and index types can be wider than dst/dstArg's types. */ + total += reduxGenKernelRequiresDst(gr) ? + gpuarray_get_elsize(gr->accTypeCode) : + 0; + total += reduxGenKernelRequiresDstArg(gr) ? + gpuarray_get_elsize(gr->idxTypeCode) : + 0; + + /* At minimum, there must be space for the offset permute. */ + total = total < idxSize ? idxSize : total; + + + /* Return the calculated amount of space. 
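+           For example, assuming a float32 accumulator and a 64-bit index
+           type, a max-and-argmax reduction keeps 4 + 8 = 12 bytes of state
+           per thread, while a plain sum keeps max(4, 8) = 8 bytes, since the
+           state may never be smaller than the index type used for the offset
+           permute.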
*/ + return total; +} + +/** + * @brief Get the maximum number of threads this operator's kernel can handle. + */ + +static size_t reduxGenGetMaxLocalSize (const GpuReduction* gr){ + return gr->maxLK; +} + +/** + * @brief Get the shared memory consumption for a given block size. * - * In this implementation we choose to perform only intra-warp reductions, - * insulating ourselves from having to worry about the interplay between block - * size and kernel source code (A kernel's max block size is limited by - * numerous factors including its own source code, but the specific kernel we - * pick and generate requires foreknowledge of its block size. Chicken or egg). + * This is non-trivial since it requires ensuring alignment of datatypes. */ -static int reduxPlan1Stage (redux_ctx* ctx){ - int i; - axis_desc* axis; +static size_t reduxGenGetSHMEMSize (const GpuReduction* gr, size_t bs){ + const gpuarray_type* type; + size_t total = 0; - reduxSortAxisPtrsBy(ctx->xdSrcPtrs, ctx->xdSrcFlat, ctx->ndfs, - reduxSortPlan1Stage); + if(reduxGenKernelRequiresDst(gr)){ + type = gpuarray_get_type(gr->accTypeCode); + total = DIVIDECEIL(total, type->align)*type->align; + total += bs*type->size; + } + if(reduxGenKernelRequiresDstArg(gr)){ + type = gpuarray_get_type(gr->idxTypeCode); + total = DIVIDECEIL(total, type->align)*type->align; + total += bs*type->size; + } - ctx->st1.ndh = 0; - ctx->st1.ndhp = 0; - ctx->st1.ndhr = 0; + return total; +} + +/** + * @brief Get the shared memory byte offset for dst. + */ + +static size_t reduxGenGetSHMEMDstOff (const GpuReduction* gr, size_t bs){ + return 0; +} + +/** + * @brief Get the shared memory byte offset for dstArg. + */ + +static size_t reduxGenGetSHMEMDstArgOff (const GpuReduction* gr, size_t bs){ + const gpuarray_type* type; + size_t total = 0; - for (i=0;indfd && ihwAxisStage0 = i; + if(reduxGenKernelRequiresDst(gr) && reduxGenKernelRequiresDstArg(gr)){ + type = gpuarray_get_type(gr->accTypeCode); + total = DIVIDECEIL(total, type->align)*type->align; + total += bs*type->size; + type = gpuarray_get_type(gr->idxTypeCode); + total = DIVIDECEIL(total, type->align)*type->align; - ctx->st1.ndh++; + return total; + }else{ + return 0; } - ctx->st1.ndhd = ctx->st1.ndh; - - return reduxGenSource(ctx); } /** - * @brief Plan a 2-stage reduction. - * - * Inputs: ctx->xdSrcFlat[0...ctx->ndf-1] - * - * This plan involves splitting the reduction into two stages: - * - * Stage 0: A huge reduction only along reduction axes into a workspace. - * Stage 1: A small reduction into the destination. + * @brief Initialize the context. * - * We select only reduction axes in the first stage. + * After this function, calling reduxInvCleanup*() becomes safe. */ -static int reduxPlan2Stage (redux_ctx* ctx){ - int i, j, ret = 0; - axis_desc* axis; - size_t a = 1, aL, aPartial, target = reduxEstimateParallelism(ctx), sz; - +static int reduxInvInit (redux_ctx* ctx){ /** - * Plan Stage 0. - * - * Sort axis descriptions reduction-axes-first then longest-first, and - * select up to 3 reduction axes, splitting them s.t. their product does - * not exceed the max block size. + * We initialize certain parts of the context. 
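+         * In particular, every pointer member is nulled and every product or
+         * size is reset to a neutral value, so that reduxInvCleanup() can
+         * unconditionally free()/release whatever was allocated, even after a
+         * failure partway through the invocation.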
*/ - reduxSortAxisPtrsBy(ctx->xdSrcPtrs, ctx->xdSrcFlat, ctx->ndfs, - reduxSortPlan2Stage0); + ctx->l = NULL; + ctx->lPDim = NULL; + ctx->sJ = NULL; + ctx->dJ = NULL; + ctx->aJ = NULL; + ctx->ibs = NULL; + ctx->ibp = NULL; + ctx->iblPDim = NULL; + ctx->ibsOff = NULL; + ctx->ibdOff = NULL; + ctx->ibaOff = NULL; + ctx->kArgs = NULL; + ctx->xdSrc = NULL; + ctx->xdSrcPtrs = NULL; + ctx->xdTmpPtrs = NULL; + ctx->xdSplit = NULL; - ctx->st1.ndh = 0; - ctx->st1.ndhp = 0; - ctx->st1.ndhr = 0; - ctx->st1.ndhd = 0; + ctx->w = NULL; - for(i=0;indfs && ihwAxisStage0 = i; - axis->sliceLen = aL; - axis->tmpLen = 1; - - ctx->st1.ndh++; - }else{ - a /= aL; - aPartial = target/a; - if(aPartial >= 2){ - a *= aPartial; - - axis->hwAxisStage0 = i++; - axis->sliceLen = aPartial; - axis->tmpLen = (axis->len+axis->sliceLen-1)/axis->sliceLen; - - ctx->st1.ndh++; - ctx->st1.ndhp++; - } - break; - } + ctx->prodAllAxes = ctx->prodRdxAxes = ctx->prodFreeAxes = 1; + ctx->bs = ctx->gs = 1; + + return reduxInvInferProperties(ctx); +} + +/** + * @brief Begin inferring the properties of the reduction invocation. + */ + +static int reduxInvInferProperties (redux_ctx* ctx){ + axis_desc* a; + int i, j; + size_t d; + + + /* Insane src, reduxLen, dst or dstArg? */ + if(!ctx->reduxList){ + ctx->reduxLen = ctx->src->nd; } - ctx->st1.ndhr = ctx->st1.ndh; + if (!ctx->src){ + return reduxInvCleanupMsg(ctx, GA_INVALID_ERROR, + "src is NULL!\n"); + }else if (ctx->src->nd <= 0){ + return reduxInvCleanupMsg(ctx, GA_INVALID_ERROR, + "src is a scalar, cannot reduce it!\n"); + }else if (ctx->reduxLen < 0){ + return reduxInvCleanupMsg(ctx, GA_INVALID_ERROR, + "Length of list of dimensions to be reduced is less than 0!\n"); + }else if (ctx->src->nd < (unsigned)ctx->reduxLen){ + return reduxInvCleanupMsg(ctx, GA_INVALID_ERROR, + "src has fewer dimensions than there are dimensions to reduce!\n"); + }else if (reduxInvRequiresDst (ctx) && !ctx->dst){ + return reduxInvCleanupMsg(ctx, GA_INVALID_ERROR, + "dst is NULL, but reduction requires it!\n"); + }else if (reduxInvRequiresDstArg(ctx) && !ctx->dstArg){ + return reduxInvCleanupMsg(ctx, GA_INVALID_ERROR, + "dstArg is NULL, but reduction requires it!\n"); + }else if (ctx->dst && ctx->dst->nd +ctx->reduxLen != ctx->src->nd){ + return reduxInvCleanupMsg(ctx, GA_INVALID_ERROR, + "dst is of incorrect dimensionality for this reduction!\n"); + }else if (ctx->dstArg && ctx->dstArg->nd+ctx->reduxLen != ctx->src->nd){ + return reduxInvCleanupMsg(ctx, GA_INVALID_ERROR, + "dstArg is of incorrect dimensionality for this reduction!\n"); + } + ctx->nds = ctx->src->nd; + ctx->ndr = ctx->reduxLen; + ctx->ndd = ctx->nds - ctx->ndr; + ctx->ndfs = ctx->ndfr = ctx->ndfd = 0; + /* Insane reduxList? */ + for (i=0;indr;i++){ + j = ctx->reduxList ? ctx->reduxList[i] : i; + if (j < -ctx->nds || j >= ctx->nds){ + return reduxInvCleanupMsg(ctx, GA_INVALID_ERROR, + "Insane axis number %d! Should be [%d, %d)!\n", + j, -ctx->nds, ctx->nds); + } + j = j<0 ? ctx->nds+j : j; + d = ctx->src->dimensions[j]; + ctx->zeroRdxAxes += !d; + ctx->prodRdxAxes *= d?d:1; + } + + /** - * We now have enough information to allocate the workspaces. + * Insane shape? + * + * The source tensor is allowed to be empty (its shape may contain 0s). + * However, all axes that are of length 0 must be reduction axes. + * + * The reason for this is that a reduction cannot store any output into an + * empty destination tensor (whose dimensions are the free axes), because + * it has 0 space. The operation cannot then fulfill its contract. 
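+         * For example, summing a (4, 0)-shaped tensor over axis 0 alone is
+         * rejected: the surviving length-0 axis would make the destination
+         * empty, leaving nowhere to store the 0-element column sums.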
+ * + * On the other hand, when some or all reduction axes of a tensor are of + * length 0, the reduction can be interpreted as initializing the + * destination tensor to the identity value of the operation. For lack of a + * better idea, the destination argument tensor can then be zeroed. */ - - ctx->ndt = ctx->ndfs - ctx->st1.ndh + ctx->st1.ndhp; - ctx->xdTmpPtrs = malloc(ctx->ndt*sizeof(*ctx->xdTmpPtrs)); - ctx->tmpDstDimensions = malloc(ctx->ndt*sizeof(*ctx->tmpDstDimensions)); - ctx->tmpDstStrides = malloc(ctx->ndt*sizeof(*ctx->tmpDstStrides)); - ctx->tmpDstArgStrides = malloc(ctx->ndt*sizeof(*ctx->tmpDstArgStrides)); - if(!ctx->xdTmpPtrs || !ctx->tmpDstDimensions || !ctx->tmpDstStrides || - !ctx->tmpDstArgStrides){ - return reduxCleanup(ctx, GA_MEMORY_ERROR); - } - for(i=j=0;indfs;i++){ - axis = reduxGetSrcFlatAxis(ctx, i); - if(!axisIsHW(axis, 0) || axisIsPartialHW(axis, 0)){ - ctx->xdTmpPtrs [j] = axis; - ctx->tmpDstDimensions[j] = axisGetTmpLen(axis); - } + + for (i=0;inds;i++){ + d = ctx->src->dimensions[i]; + ctx->zeroAllAxes += !d; + ctx->prodAllAxes *= d?d:1; } - - if (reduxKernelRequiresDst(ctx)){ - sz = gpuarray_get_elsize(ctx->dstTypeCode); - - for(i=ctx->ndt-1;i>=0;i--){ - ctx->tmpDstStrides[i] = sz; - sz *= ctx->tmpDstDimensions[i]; - } - - ctx->tmpDstData = gpudata_alloc(ctx->gpuCtx, sz, 0, 0, &ret); - if(ret != GA_NO_ERROR){ - return reduxCleanup(ctx, ret); + if (ctx->zeroAllAxes != ctx->zeroRdxAxes){ + return reduxInvCleanupMsg(ctx, GA_INVALID_ERROR, + "Source tensor has length-0 dimensions that are not reduced!\n"); + } + ctx->prodFreeAxes = ctx->prodAllAxes/ctx->prodRdxAxes; + + + /** + * Allocate and construct source-tensor axis-description lists. + * + * While constructing the descriptions of each axis, verify that: + * + * 1. reduxLen has no duplicates. + * 2. dst and/or dstArg's dimensions match src's dimensions, stripped of + * the reduction axes. + */ + + ctx->xdSrc = calloc(ctx->nds, sizeof(*ctx->xdSrc)); + ctx->xdSrcPtrs = calloc(ctx->nds+1, sizeof(*ctx->xdSrcPtrs)); + if (!ctx->xdSrc || !ctx->xdSrcPtrs){ + return reduxInvCleanup(ctx, GA_MEMORY_ERROR); + } + for (i=0;inds;i++){ + axisInit(&ctx->xdSrc[i], + ctx->src->dimensions[i], + ctx->src->strides[i]); + } + for (i=0;indr;i++){ + j = ctx->reduxList ? ctx->reduxList[i] : i; + j = j<0 ? 
ctx->nds+j : j; + a = reduxInvGetSrcAxis(ctx, j); + if (axisIsReduced(a)){ + return reduxInvCleanupMsg(ctx, GA_INVALID_ERROR, + "Axis %d appears multiple times in the " + "reduction axis list!\n", + j); } + axisMarkReduced(a, i); } - if (reduxKernelRequiresDstArg(ctx)){ - sz = gpuarray_get_elsize(ctx->dstArgTypeCode); + for (i=j=0;inds;i++){ + axis_desc* a = reduxInvGetSrcAxis(ctx, i); + size_t srcLen = axisGetLen(a), dstLen, dstArgLen; - for(i=ctx->ndt-1;i>=0;i--){ - ctx->tmpDstArgStrides[i] = sz; - sz *= ctx->tmpDstDimensions[i]; + if (axisIsReduced(a)){continue;} + if (reduxInvRequiresDst(ctx)){ + dstLen = ctx->dst->dimensions[j]; + + if(srcLen != dstLen){ + return reduxInvCleanupMsg(ctx, GA_INVALID_ERROR, + "Source axis %d has length %zu, but " + "corresponding destination axis %d has length %zu!\n", + i, srcLen, j, dstLen); + } + + a->dstStride = ctx->dst->strides[j]; } - - ctx->tmpDstArgData = gpudata_alloc(ctx->gpuCtx, sz, 0, 0, &ret); - if(ret != GA_NO_ERROR){ - return reduxCleanup(ctx, ret); + if (reduxInvRequiresDstArg(ctx)){ + dstArgLen = ctx->dstArg->dimensions[j]; + + if(srcLen != dstArgLen){ + return reduxInvCleanupMsg(ctx, GA_INVALID_ERROR, + "Source axis %d has length %zu, but " + "corresponding destination-argument axis %d has length %zu!\n", + i, srcLen, j, dstArgLen); + } + + a->dstArgStride = ctx->dstArg->strides[j]; } + + j++; } + /** - * Plan Stage 1. + * Grab gpudata buffers and byte offsets before we begin flattening the + * tensors. As we flatten the tensor, we may reverse some axes, leading to + * a bump of the byte offset. */ - qsort(ctx->xdTmpPtrs, ctx->ndt, sizeof(*ctx->xdTmpPtrs), reduxSortPlan1Stage); - - ctx->st2.ndh = 0; - ctx->st2.ndhp = 0; - ctx->st2.ndhr = 0; - - for (i=0;indfd && ihwAxisStage1 = i; - - ctx->st2.ndh++; + ctx->flatSrcData = ctx->src->data; + ctx->flatSrcOffset = ctx->src->offset; + if(reduxInvRequiresDst(ctx)){ + ctx->flatDstData = ctx->dst->data; + ctx->flatDstOffset = ctx->dst->offset; } - ctx->st2.ndhd = ctx->st2.ndh; - - return reduxGenSource(ctx); + if(reduxInvRequiresDstArg(ctx)){ + ctx->flatDstArgData = ctx->dstArg->data; + ctx->flatDstArgOffset = ctx->dstArg->offset; + } + + return reduxInvFlattenSource(ctx); } /** - * @brief Generate the kernel code for the reduction. - * - * @return GA_MEMORY_ERROR if not enough memory left; GA_NO_ERROR otherwise. + * @brief Flatten the source tensor as much as is practical. + * + * This makes the axis lengths as long as possible and the tensor itself as + * contiguous as possible. 
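+ *
+ *        For example, if a C-contiguous (3, 4, 5) tensor is reduced over
+ *        axes {1, 2}, the two reduction axes are adjacent in memory and can
+ *        be merged, so the flattened view behaves like a (3, 20) tensor with
+ *        a single reduction axis.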
*/ -static int reduxGenSource (redux_ctx* ctx){ - reduxAppendSource(ctx); - ctx->sourceCodeLen = ctx->s.l; - ctx->sourceCode = strb_cstr(&ctx->s); - if (!ctx->sourceCode){ - return reduxCleanup(ctx, GA_MEMORY_ERROR); - } - - return reduxCompile(ctx); -} -static void reduxAppendSource (redux_ctx* ctx){ - reduxAppendIncludes (ctx); - reduxAppendMacroDefs (ctx); - reduxAppendTypedefs (ctx); - reduxAppendGetInitValFns (ctx); - reduxAppendWriteBackFn (ctx); - reduxAppendReduxKernel (ctx); -} -static void reduxAppendTensorDeclArgs (redux_ctx* ctx, - const char* type, - const char* baseName){ - srcbAppendElemf(&ctx->srcGen, "%s* %sPtr", type, baseName); - srcbAppendElemf(&ctx->srcGen, "const X %sOff", baseName); - srcbAppendElemf(&ctx->srcGen, "const GLOBAL_MEM X* %sSteps", baseName); - (void)reduxAppendTensorCallArgs;/* Silence unused warning */ -} -static void reduxAppendTensorCallArgs (redux_ctx* ctx, - const char* baseName){ - srcbAppendElemf(&ctx->srcGen, "%sPtr", baseName); - srcbAppendElemf(&ctx->srcGen, "%sOff", baseName); - srcbAppendElemf(&ctx->srcGen, "%sSteps", baseName); -} -static void reduxAppendMacroDefs (redux_ctx* ctx){ - int i; - - srcbAppends (&ctx->srcGen, "#define FOROVER(idx) for (i##idx = i##idx##Start; i##idx < i##idx##End; i##idx++)\n"); - srcbAppends (&ctx->srcGen, "#define ESCAPE(idx) if (i##idx >= i##idx##Dim){continue;}\n"); +static int reduxInvFlattenSource (redux_ctx* ctx){ + axis_desc* axis, *flatAxis, *sortAxis; + int i, j, k, isSensitive; - /* srcVal indexer */ - srcbAppends (&ctx->srcGen, "#define srcVal (*(const GLOBAL_MEM S*)("); - srcbBeginList (&ctx->srcGen, "+", "0"); - srcbAppendElemf(&ctx->srcGen, "(const GLOBAL_MEM char*)srcPtr"); - srcbAppendElemf(&ctx->srcGen, "srcOff"); - for (i=0;indfs;i++){ - srcbAppendElemf(&ctx->srcGen, "i%d*i%dSStep", i, i); - } - srcbEndList (&ctx->srcGen); - srcbAppends (&ctx->srcGen, "))\n"); - - /* dstVal indexer */ - if (reduxKernelRequiresDst(ctx)){ - srcbAppends (&ctx->srcGen, "#define dstVal (*(GLOBAL_MEM T*)("); - srcbBeginList (&ctx->srcGen, "+", "0"); - srcbAppendElemf(&ctx->srcGen, "(GLOBAL_MEM char*)dstPtr"); - srcbAppendElemf(&ctx->srcGen, "dstOff"); - for (i=0;indfd;i++){ - srcbAppendElemf(&ctx->srcGen, "i%d*i%dDStep", i, i); - } - srcbEndList (&ctx->srcGen); - srcbAppends (&ctx->srcGen, "))\n"); - } - - /* dstArgVal indexer */ - if (reduxKernelRequiresDstArg(ctx)){ - srcbAppends (&ctx->srcGen, "#define dstArgVal (*(GLOBAL_MEM A*)("); - srcbBeginList (&ctx->srcGen, "+", "0"); - srcbAppendElemf(&ctx->srcGen, "(GLOBAL_MEM char*)dstArgPtr"); - srcbAppendElemf(&ctx->srcGen, "dstArgOff"); - for (i=0;indfd;i++){ - srcbAppendElemf(&ctx->srcGen, "i%d*i%dAStep", i, i); - } - srcbEndList (&ctx->srcGen); - srcbAppends (&ctx->srcGen, "))\n"); - } - - /* rdxIdx indexer */ - srcbAppends (&ctx->srcGen, "#define rdxIdx ("); - srcbBeginList (&ctx->srcGen, "+", "0"); - for (i=ctx->ndfd;indfs;i++){ - srcbAppendElemf(&ctx->srcGen, "i%d*i%dPDim", i, i); - } - srcbEndList (&ctx->srcGen); - srcbAppends (&ctx->srcGen, ")\n"); -} -static void reduxAppendIncludes (redux_ctx* ctx){ - strb_appends(&ctx->s, "/* Includes */\n"); - strb_appends(&ctx->s, "#include \"cluda.h\"\n"); - strb_appends(&ctx->s, "\n"); - strb_appends(&ctx->s, "\n"); - strb_appends(&ctx->s, "\n"); -} -static void reduxAppendTypedefs (redux_ctx* ctx){ - strb_appendf(&ctx->s, "typedef %s S;\n", ctx->srcTypeStr); /* The type of the source array. */ - strb_appendf(&ctx->s, "typedef %s T;\n", ctx->dstTypeStr); /* The type of the destination array. 
*/ - strb_appendf(&ctx->s, "typedef %s A;\n", ctx->dstArgTypeStr);/* The type of the destination argument array. */ - strb_appendf(&ctx->s, "typedef %s X;\n", ctx->idxTypeStr); /* The type of the indices: signed 32/64-bit. */ - strb_appendf(&ctx->s, "typedef %s K;\n", ctx->accTypeStr); /* The type of the accumulator variable. */ -} -static void reduxAppendGetInitValFns (redux_ctx* ctx){ - /** - * Initial value functions. - */ + ctx->ndfs = ctx->nds; - strb_appendf(&ctx->s, "WITHIN_KERNEL T getInitValTFn(void){\n" - "\treturn (%s);\n" - "}\n\n\n\n" - "WITHIN_KERNEL K getInitValKFn(void){\n" - "\treturn (%s);\n" - "}\n\n\n\n", ctx->initValT, ctx->initValK); -} -static void reduxAppendWriteBackFn (redux_ctx* ctx){ /** - * Global memory value reduction function. - * - * Responsible for either: - * 1) Safe writeback of final value to memory, or - * 2) Safe atomic reduction of partial value into memory. + * Pass 1: Flatten out 0- and 1-length dimensions. We already know that + * + * a) There are no 0-length free dimensions, because that + * constitutes an invalid input, and + * b) How many 0-length reduction dimensions there are, because + * we counted them in the error-checking code. + * + * So if there are any 0-length axes, we can delete all reduction axes and + * replace them with a single one. + * + * We can also delete 1-length axes outright, since they can always be + * ignored; They are always indexed at [0]. */ - srcbAppends (&ctx->srcGen, "WITHIN_KERNEL void writeBackFn("); - srcbBeginList (&ctx->srcGen, ", ", "void"); - if (reduxKernelRequiresDst(ctx)){ - srcbAppendElemf(&ctx->srcGen, "GLOBAL_MEM T* d_"); - srcbAppendElemf(&ctx->srcGen, "T d"); - } - if (reduxKernelRequiresDstArg(ctx)){ - srcbAppendElemf(&ctx->srcGen, "GLOBAL_MEM A* a_"); - srcbAppendElemf(&ctx->srcGen, "A a"); - } - srcbEndList (&ctx->srcGen); - srcbAppends (&ctx->srcGen, "){\n"); - - if (reduxIs1Stage(ctx)){ - if (reduxKernelRequiresDst (ctx)){ - srcbAppends (&ctx->srcGen, "\t*d_ = d;\n"); - } - if (reduxKernelRequiresDstArg(ctx)){ - srcbAppends (&ctx->srcGen, "\t*a_ = a;\n"); - } - }else{ - /* BUG: Implement the atomic reduction, one or two CAS loops. */ - if ( reduxKernelRequiresDst (ctx) && !reduxKernelRequiresDstArg(ctx)){ - - }else if (!reduxKernelRequiresDst (ctx) && reduxKernelRequiresDstArg(ctx)){ - - }else if ( reduxKernelRequiresDst (ctx) && reduxKernelRequiresDstArg(ctx)){ - - } - } - - /* Close off function. */ - strb_appends(&ctx->s, "}\n\n\n\n"); -} -static void reduxAppendReduxKernel (redux_ctx* ctx){ - reduxAppendPrototype (ctx); - strb_appends (&ctx->s, "{\n"); - reduxAppendIndexDeclarations(ctx); - reduxAppendRangeCalculations(ctx); - reduxAppendLoops (ctx); - strb_appends (&ctx->s, "}\n"); -} -static void reduxAppendPrototype (redux_ctx* ctx){ - srcbAppends (&ctx->srcGen, "KERNEL void reduxKer("); - srcbBeginList (&ctx->srcGen, ", ", "void"); - reduxAppendTensorDeclArgs(ctx, "S", "src"); - srcbAppendElemf(&ctx->srcGen, "const GLOBAL_MEM X* srcSize"); - srcbAppendElemf(&ctx->srcGen, "const GLOBAL_MEM X* chunkSize"); - if (reduxKernelRequiresDst(ctx)){ - reduxAppendTensorDeclArgs(ctx, "T", "dst"); - } - if (reduxKernelRequiresDstArg(ctx)){ - reduxAppendTensorDeclArgs(ctx, "A", "dstArg"); - } - srcbEndList (&ctx->srcGen); - srcbAppends (&ctx->srcGen, ")"); -} -static void reduxAppendIndexDeclarations (redux_ctx* ctx){ - int i; - strb_appends(&ctx->s, "\t/* GPU kernel coordinates. Always 3D in OpenCL/CUDA. 
*/\n"); - - strb_appends(&ctx->s, "\tX bi0 = GID_0, bi1 = GID_1, bi2 = GID_2;\n"); - strb_appends(&ctx->s, "\tX bd0 = LDIM_0, bd1 = LDIM_1, bd2 = LDIM_2;\n"); - strb_appends(&ctx->s, "\tX ti0 = LID_0, ti1 = LID_1, ti2 = LID_2;\n"); - strb_appends(&ctx->s, "\tX gi0 = bi0*bd0+ti0, gi1 = bi1*bd1+ti1, gi2 = bi2*bd2+ti2;\n"); - if (ctx->st1.ndh>0){ - strb_appends(&ctx->s, "\tX "); - for (i=0;ist1.ndh;i++){ - strb_appendf(&ctx->s, "ci%u = chunkSize[%u]%s", - i, i, (i==ctx->st1.ndh-1) ? ";\n" : ", "); - } - } - strb_appends(&ctx->s, "\t\n\t\n"); - strb_appends(&ctx->s, "\t/* Free indices & Reduction indices */\n"); - if (ctx->ndfs > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->ndfs, "", ";\n");} - if (ctx->ndfs > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->ndfs, "Dim", ";\n");} - if (ctx->ndfs > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->ndfs, "Start", ";\n");} - if (ctx->ndfs > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->ndfs, "End", ";\n");} - if (ctx->ndfs > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->ndfs, "SStep", ";\n");} - if (ctx->ndfd > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->ndfd, "DStep", ";\n");} - if (ctx->ndfd > 0){appendIdxes (&ctx->s, "\tX ", "i", 0, ctx->ndfd, "AStep", ";\n");} - if (ctx->ndfs > ctx->ndfd){appendIdxes (&ctx->s, "\tX ", "i", ctx->ndfd, ctx->ndfs, "PDim", ";\n");} - strb_appends(&ctx->s, "\t\n\t\n"); -} -static void reduxAppendRangeCalculations (redux_ctx* ctx){ - axis_desc* axis; - size_t hwDim; - int i; - - strb_appends(&ctx->s, "\t/* Compute ranges for this thread. */\n"); - - for (i=0;indfs;i++){ - strb_appendf(&ctx->s, "\ti%dDim = srcSize[%d];\n", i, i); - } - for (i=0;indfs;i++){ - strb_appendf(&ctx->s, "\ti%dSStep = srcSteps[%d];\n", i, i); - } - if (reduxKernelRequiresDst(ctx)){ - for (i=0;indfd;i++){ - strb_appendf(&ctx->s, "\ti%dDStep = dstSteps[%d];\n", i, i); - } - } - if (reduxKernelRequiresDstArg(ctx)){ - for (i=0;indfd;i++){ - strb_appendf(&ctx->s, "\ti%dAStep = dstArgSteps[%d];\n", i, i); - } - } - for (i=ctx->ndfs-1;i>=ctx->ndfd;i--){ - /** - * If this is the last index, it's the first cumulative dimension - * product we generate, and thus we initialize to 1. - */ - - if (i == ctx->ndfs-1){ - strb_appendf(&ctx->s, "\ti%dPDim = 1;\n", i); - }else{ - strb_appendf(&ctx->s, "\ti%dPDim = i%dPDim * i%dDim;\n", i, i+1, i+1); - } - } - for (i=0;indfs;i++){ - /** - * Up to MAX_HW_DIMS dimensions get to rely on hardware loops. - * The others, if any, have to use software looping beginning at 0. - */ - - axis = reduxGetSrcFlatAxis(ctx, i); - if (axisIsHW(axis, 0)){ - hwDim = axisGetHWAxisNum(axis, 0); - //axisInSet(i, ctx->st1.axisList, ctx->st1.ndh, &hwDim); - strb_appendf(&ctx->s, "\ti%dStart = gi%d * ci%d;\n", i, hwDim, hwDim); - }else{ - strb_appendf(&ctx->s, "\ti%dStart = 0;\n", i); - } - } - for (i=0;indfs;i++){ - /** - * Up to MAX_HW_DIMS dimensions get to rely on hardware loops. - * The others, if any, have to use software looping beginning at 0. 
- */ - - axis = reduxGetSrcFlatAxis(ctx, i); - if (axisIsHW(axis, 0)){ - hwDim = axisGetHWAxisNum(axis, 0); - //axisInSet(i, ctx->st1.axisList, ctx->st1.ndh, &hwDim); - strb_appendf(&ctx->s, "\ti%dEnd = i%dStart + ci%d;\n", i, i, hwDim); - }else{ - strb_appendf(&ctx->s, "\ti%dEnd = i%dStart + i%dDim;\n", i, i, i); + for (i=j=0;indfs;i++){ + axis = reduxInvGetSrcAxis(ctx, i); + if (!reduxTryFlattenOut(ctx, axis)){ + *reduxInvGetSrcAxis(ctx, j++) = *axis; } } - - strb_appends(&ctx->s, "\t\n\t\n"); -} -static void reduxAppendLoops (redux_ctx* ctx){ - int i; - - for (i=0;indfd;i++){ - srcbAppendf(&ctx->srcGen, "\tFOROVER(%d){ESCAPE(%d)\n", i, i); - } - - srcbAppends (&ctx->srcGen, "\t\tT rdxT;\n"); - srcbAppends (&ctx->srcGen, "\t\tK rdxK = getInitValKFn();\n"); - if (reduxKernelRequiresDstArg(ctx)){ - srcbAppends(&ctx->srcGen, "\t\tX rdxA = 0;\n"); - } - srcbAppends (&ctx->srcGen, "\t\t\n"); - - for (i=ctx->ndfd;indfs;i++){ - srcbAppendf (&ctx->srcGen, "\t\tFOROVER(%d){ESCAPE(%d)\n", i, i); + if(ctx->zeroRdxAxes > 0){ + /* New reduction axis of 0 length. */ + axisInit (reduxInvGetSrcAxis(ctx, j), 0, 0); + axisMarkReduced(reduxInvGetSrcAxis(ctx, j), 0); + j++; } + ctx->ndfs = j; - srcbAppends (&ctx->srcGen, "\t\t\tS s = srcVal;\n"); /** - * Prescalar transformations go here. They transform and coerce the S-typed - * value s into the K-typed value k. + * Pass 2: Flatten out continuous dimensions, where strides and sensitivity + * allows it. */ - - srcbAppends (&ctx->srcGen, "\t\t\tK k = s;\n"); - - switch (ctx->op){ - case GA_REDUCE_SUM: - srcbAppends(&ctx->srcGen, "\t\t\trdxK += k;\n"); - break; - case GA_REDUCE_PROD: - srcbAppends(&ctx->srcGen, "\t\t\trdxK *= k;\n"); - break; - case GA_REDUCE_PRODNZ: - srcbAppends(&ctx->srcGen, "\t\t\trdxK *= k==0 ? getInitValKFn() : k;\n"); - break; - case GA_REDUCE_MIN: - srcbAppends(&ctx->srcGen, "\t\t\trdxK = min(rdxK, k);\n"); - break; - case GA_REDUCE_MAX: - srcbAppends(&ctx->srcGen, "\t\t\trdxK = max(rdxK, k);\n"); - break; - case GA_REDUCE_ARGMIN: - case GA_REDUCE_MINANDARGMIN: - srcbAppends(&ctx->srcGen, "\t\t\trdxK = min(rdxK, k);\n" - "\t\t\tif (rdxK == k){\n" - "\t\t\t\trdxA = rdxIdx;\n" - "\t\t\t}\n"); - break; - case GA_REDUCE_ARGMAX: - case GA_REDUCE_MAXANDARGMAX: - srcbAppends(&ctx->srcGen, "\t\t\trdxK = max(rdxK, k);\n" - "\t\t\tif (rdxK == k){\n" - "\t\t\t\trdxA = rdxIdx;\n" - "\t\t\t}\n"); - break; - case GA_REDUCE_AND: - srcbAppends(&ctx->srcGen, "\t\t\trdxK &= k;\n"); - break; - case GA_REDUCE_OR: - srcbAppends(&ctx->srcGen, "\t\t\trdxK |= k;\n"); - break; - case GA_REDUCE_XOR: - srcbAppends(&ctx->srcGen, "\t\t\trdxK ^= k;\n"); - break; - case GA_REDUCE_ALL: - srcbAppends(&ctx->srcGen, "\t\t\trdxK = rdxK && k;\n"); - break; - case GA_REDUCE_ANY: - srcbAppends(&ctx->srcGen, "\t\t\trdxK = rdxK || k;\n"); - break; + + k = ctx->ndfs; + isSensitive = reduxIsSensitive(ctx->op); + qsort(ctx->xdSrc, ctx->ndfs, sizeof(*ctx->xdSrc), + isSensitive ? reduxSortFlatSensitive : reduxSortFlatInsensitive); + for (i=j=1;indfs;i++){ + flatAxis = reduxInvGetSrcAxis(ctx, j-1); + sortAxis = reduxInvGetSrcAxis(ctx, i); + + if (reduxTryFlattenInto(ctx, flatAxis, sortAxis)){ + k--; + }else{ + *reduxInvGetSrcAxis(ctx, j++) = *sortAxis; + } } + ctx->ndfs = k; - for (i=ctx->ndfd;indfs;i++){ - srcbAppends(&ctx->srcGen, "\t\t}\n"); - } - srcbAppends(&ctx->srcGen, "\t\t\n"); /** - * Large code model: Postscalar transformations go here, coercing the - * K-typed value rdxK to the T-typed value rdxT + * Compute number of free and reduced dimensions. 
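+         * After the two flattening passes above, ndfr + ndfd == ndfs, and it
+         * is these flattened counts, not the original nds/ndr/ndd, that the
+         * argument-computation step below iterates over.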
*/ - srcbAppends (&ctx->srcGen, "\t\trdxT = rdxK;\n"); - - /* Final writeback. */ - srcbAppends (&ctx->srcGen, "\t\twriteBackFn("); - srcbBeginList (&ctx->srcGen, ", ", ""); - if (reduxKernelRequiresDst(ctx)){ - srcbAppendElemf(&ctx->srcGen, "&dstVal"); - srcbAppendElemf(&ctx->srcGen, "rdxT"); - } - if (reduxKernelRequiresDstArg(ctx)){ - srcbAppendElemf(&ctx->srcGen, "&dstArgVal"); - srcbAppendElemf(&ctx->srcGen, "rdxA"); + for(ctx->ndfr=ctx->ndfd=i=0;indfs;i++){ + if(axisIsReduced(reduxInvGetSrcAxis(ctx, i))){ + ctx->ndfr++; + }else{ + ctx->ndfd++; + } } - srcbEndList (&ctx->srcGen); - srcbAppends (&ctx->srcGen, ");\n"); - for (i=0;indfd;i++){ - srcbAppends(&ctx->srcGen, "\t}\n"); - } + return reduxInvComputeKArgs(ctx); } /** - * @brief Compile the kernel from source code. + * @brief Compute the arguments to the kernel. + * + * This is a multistep process and involves a lot of axis sorting on various + * criteria. */ -static int reduxCompile (redux_ctx* ctx){ - int ret, i = 0; - int PRI_TYPECODES[11]; - size_t PRI_TYPECODES_LEN; +static int reduxInvComputeKArgs (redux_ctx* ctx){ + axis_desc* axis, *prevAxis; + size_t target, aL, aLS; + int i, j; /** - * Construct Argument Typecode Lists. + * STEP 0: Default Kernel Argument Values. + * + * They should be valid for a "scalar" job. In particular, for any + * non-existent axis, assume length 1. */ - - PRI_TYPECODES[i++] = GA_BUFFER; /* srcPtr */ - PRI_TYPECODES[i++] = GA_SIZE; /* srcOff */ - PRI_TYPECODES[i++] = GA_BUFFER; /* srcSteps */ - PRI_TYPECODES[i++] = GA_BUFFER; /* srcSize */ - PRI_TYPECODES[i++] = GA_BUFFER; /* chnkSize */ - if (reduxKernelRequiresDst(ctx)){ - PRI_TYPECODES[i++] = GA_BUFFER; /* dstPtr */ - PRI_TYPECODES[i++] = GA_SIZE; /* dstOff */ - PRI_TYPECODES[i++] = GA_BUFFER; /* dstSteps */ + + ctx->phase = 0; + ctx->U = 1; + ctx->V = 1; + ctx->B = 1; + ctx->D = 1; + ctx->H = 1; + ctx->splitFree = 1; + ctx->splitReduce = 1; + ctx->xdSplit = NULL; + ctx->l = calloc(ctx->gr->nds, sizeof(*ctx->l)); + ctx->lPDim = calloc(ctx->gr->ndr, sizeof(*ctx->lPDim)); + ctx->sJ = calloc(ctx->gr->nds, sizeof(*ctx->sJ)); + ctx->dJ = calloc(ctx->gr->ndd, sizeof(*ctx->dJ)); + ctx->aJ = calloc(ctx->gr->ndd, sizeof(*ctx->aJ)); + ctx->wdOff = 0; + ctx->pdOff = 0; + ctx->waOff = 0; + ctx->paOff = 0; + ctx->ibs = calloc(ctx->gr->log2MaxL, sizeof(*ctx->ibs)); + ctx->ibp = calloc(ctx->gr->log2MaxL, sizeof(*ctx->ibp)); + ctx->iblPDim = calloc(ctx->gr->log2MaxL, sizeof(*ctx->iblPDim)); + ctx->ibsOff = calloc(ctx->gr->log2MaxL, sizeof(*ctx->ibsOff)); + ctx->ibdOff = calloc(ctx->gr->log2MaxL, sizeof(*ctx->ibdOff)); + ctx->ibaOff = calloc(ctx->gr->log2MaxL, sizeof(*ctx->ibaOff)); + ctx->bs = 1; + ctx->gs = 1; + ctx->kArgs = calloc(ctx->gr->kNumArgs, sizeof(*ctx->kArgs)); + + if(!ctx->l || !ctx->lPDim || !ctx->sJ || !ctx->dJ || + !ctx->aJ || !ctx->ibs || !ctx->ibp || !ctx->iblPDim || + !ctx->ibsOff || !ctx->ibdOff || !ctx->ibaOff || !ctx->kArgs){ + return reduxInvCleanupMsg(ctx, GA_MEMORY_ERROR, + "Failed to allocate memory for kernel invocation arguments!\n"); + } + for(i=0;igr->nds;i++){ + ctx->l[i] = 1; + } + for(i=0;igr->ndr;i++){ + ctx->lPDim[i] = 1; } - if (reduxKernelRequiresDstArg(ctx)){ - PRI_TYPECODES[i++] = GA_BUFFER; /* dstArgPtr */ - PRI_TYPECODES[i++] = GA_SIZE; /* dstArgOff */ - PRI_TYPECODES[i++] = GA_BUFFER; /* dstArgSteps */ + for(i=0;igr->log2MaxL;i++){ + ctx->ibs[i] = 1; } - PRI_TYPECODES_LEN = i; /** - * Compile the kernels. + * STEP 1: Select Intra-Block Axes. 
+ * + * Sort the axes in the order likely to maximize contiguity of source + * memory accesses, then tag them to the kernel block size limit, possibly + * splitting an axis in the process. */ - - { - ret = GpuKernel_init(&ctx->kernel, - ctx->gpuCtx, - 1, - (const char**)&ctx->sourceCode, - &ctx->sourceCodeLen, - "reduxKer", - PRI_TYPECODES_LEN, - PRI_TYPECODES, - GA_USE_CLUDA, - &ctx->errorString0); - if (ret != GA_NO_ERROR){ - return reduxCleanup(ctx, ret); + + reduxSortAxisPtrsBy(ctx->xdSrcPtrs, ctx->xdSrc, ctx->ndfs, + reduxSortPtrIBSrcRdSelect); + target = reduxGenGetMaxLocalSize(ctx->gr); + + for(i=0;indfs && igr->log2MaxL;i++){ + axis = reduxInvGetSrcSortAxis(ctx, i); + aL = axisGetLen(axis); + + if(ctx->bs*aL <= target){ + ctx->bs *= aL; + axisMarkIntraBlock(axis, i, aL); + }else{ + if(target/ctx->bs >= 2){ + aLS = target/ctx->bs; + ctx->bs *= aLS; + axisMarkIntraBlock(axis, i++, aLS); + ctx->xdSplit = axis; + } + break; } } + ctx->ndib = i; - return reduxSchedule(ctx); -} -/** - * @brief Compute a good thread block size / grid size / software chunk size - * for the primary/auxilliary kernels. - */ + /** + * STEP 2: Compute values dependent only on the intrablock axis selection. + * + * For instance, the splitFree/splitReduce factors depend only on the split + * axis, if any. + * + * The shared memory consumption and shared memory offsets depend only + * on block size. + */ -static int reduxSchedule (redux_ctx* ctx){ - int i, priNdims = 0; - uint64_t maxLgRdx = 0; - uint64_t maxLgPri = 0; - uint64_t maxLs [MAX_HW_DIMS]; - uint64_t maxGg; - uint64_t maxGs [MAX_HW_DIMS]; - uint64_t priDims[MAX_HW_DIMS]; - uint64_t bs [MAX_HW_DIMS]; - uint64_t gs [MAX_HW_DIMS]; - uint64_t cs [MAX_HW_DIMS]; - size_t warpSize, maxL; - axis_desc* axis; + ctx->splitFree = reduxInvGetSplitFree (ctx); + ctx->splitReduce = reduxInvGetSplitReduce (ctx); + ctx->SHMEM = reduxGenGetSHMEMSize (ctx->gr, ctx->bs); + ctx->pdOff = reduxGenGetSHMEMDstOff (ctx->gr, ctx->bs); + ctx->paOff = reduxGenGetSHMEMDstArgOff(ctx->gr, ctx->bs); /** - * Obtain the constraints of our problem. + * STEP 3: Compute U, B, D, H + */ + + for (i=0;indfs;i++){ + axis = reduxInvGetSrcAxis(ctx, i); + ctx->U *= axisGetInterLen(axis); + ctx->B *= axisIsReduced(axis) ? axisGetInterLen(axis) : 1; + ctx->H *= axisIsReduced(axis) ? axisGetIntraLen(axis) : 1; + } + ctx->D = ctx->bs/ctx->H; + + + /** + * STEP 4: Compute PDim values. + * + * This will be used for index calculation. */ - gpukernel_property(ctx->kernel.k, GA_KERNEL_PROP_PREFLSIZE, &warpSize); - gpukernel_property(ctx->kernel.k, GA_KERNEL_PROP_MAXLSIZE, &maxL); - maxLgRdx = maxL; - maxLgPri = maxLgRdx; - - priNdims = ctx->st1.ndh; - maxGs[0] = ctx->maxGs[0]; - maxGs[1] = ctx->maxGs[1]; - maxGs[2] = ctx->maxGs[2]; - maxGg = ctx->maxGg; - maxLs[0] = ctx->maxLs[0]; - maxLs[1] = ctx->maxLs[1]; - maxLs[2] = ctx->maxLs[2]; + reduxSortAxisPtrsBy(ctx->xdSrcPtrs, ctx->xdSrc, ctx->ndfs, + reduxSortPtrByReduxNum); for (i=0;indfs;i++){ - axis = reduxGetSrcFlatAxis(ctx, i); - if(axisIsHW(axis, 0)){ - priDims[axisGetHWAxisNum(axis, 0)] = axisGetLen(axis); + axis = reduxInvGetSrcSortAxis(ctx, i); + + if(axisIsReduced(axis)){ + if(i==0){ + axisSetPDim(axis, 1); + }else{ + prevAxis = reduxInvGetSrcSortAxis(ctx, i-1); + axisSetPDim(prevAxis, axisGetPDim(axis)*axisGetLen(prevAxis)); + } } } - - + + /** - * Apply the solver. + * STEP 5: Compute Intra-Block Permute Core. 
+ * + * Sort the axes in the order most likely to maximize contiguity of + * destination/destination argument memory accesses, then compute the + * permutation that achieves the highest-bandwidth, + * post-horizontal-reduction destination writes. */ - - { - reduxScheduleKernel(priNdims, - priDims, - warpSize, - maxLgPri, maxLs, - maxGg, maxGs, - bs, gs, cs); - for (i=0;ist1.bs[i] = bs[i]; - ctx->st1.gs[i] = gs[i]; - ctx->st1.cs[i] = cs[i]; + + reduxSortAxisPtrsBy(ctx->xdSrcPtrs, ctx->xdSrc, ctx->ndfs, + reduxInvRequiresDst(ctx) ? + reduxSortPtrIBDstWrSelect : + reduxSortPtrIBDstArgWrSelect); + for(i=0;indfs;i++){ + axis = reduxInvGetSrcSortAxis(ctx, i); + + if(axisIsIntra(axis)){ + if(i==0){ + axisSetIBP(axis, 1); + }else{ + prevAxis = reduxInvGetSrcSortAxis(ctx, i-1); + axisSetIBP(axis, axisGetIBP(prevAxis)*axisGetLen(prevAxis)); + } } - if (priNdims <= 0){ - ctx->st1.bs[i] = ctx->st1.gs[i] = ctx->st1.cs[i] = 1; + } + + /** + * STEP 6. Place the axes in final loop order and perform final placement + * of: + * lN, lPDim, sJN, dJN, aJN, + * ibs, ibp, iblPDim, ibsOff, ibdOff, ibaOff + */ + + reduxSortAxisPtrsBy(ctx->xdSrcPtrs, ctx->xdSrc, ctx->ndfs, + reduxSortPtrFinalOrder); + for(i=0,j=0;indfs;i++){ + axis = reduxInvGetSrcSortAxis(ctx, i); + + if (axisIsSplit(axis) && !axisIsReduced(axis)){ + /* Split Free Axis? */ + ctx->ibs [ 0] = axisGetIntraLen(axis); + ctx->ibp [ 0] = axisGetIntraLen(axis); + ctx->iblPDim[ 0] = axisGetIntraLen(axis); + ctx->ibsOff [ 0] = axisGetSrcStride(axis); + ctx->ibdOff [ 0] = axisGetDstStride(axis); + ctx->ibaOff [ 0] = axisGetDstArgStride(axis); + + ctx->l [ctx->gr->ndd-1] = axisGetInterLen(axis); + ctx->lPDim [ctx->gr->ndd-1] = axisGetPDim (axis); + ctx->sJ [ctx->gr->ndd-1] = 0; + ctx->dJ [ctx->gr->ndd-1] = 0; + ctx->aJ [ctx->gr->ndd-1] = 0; + }else if (axisIsSplit(axis) && axisIsReduced(axis)){ + /* Split Reduced Axis? */ + ctx->ibs [ 0] = axisGetIntraLen(axis); + ctx->ibp [ 0] = axisGetIntraLen(axis); + ctx->iblPDim[ 0] = axisGetIntraLen(axis); + ctx->ibsOff [ 0] = axisGetSrcStride(axis); + ctx->ibdOff [ 0] = axisGetDstStride(axis); + ctx->ibaOff [ 0] = axisGetDstArgStride(axis); + + ctx->l [ctx->gr->nds-1] = axisGetInterLen(axis); + ctx->lPDim [ctx->gr->nds-1] = axisGetPDim (axis); + ctx->sJ [ctx->gr->nds-1] = 0; + ctx->dJ [ctx->gr->nds-1] = 0; + ctx->aJ [ctx->gr->nds-1] = 0; + }else if (axisIsInter(axis) && !axisIsReduced(axis)){ + /* Inter Free Axis? */ + ctx->l [ j] = axisGetInterLen(axis); + ctx->lPDim [ j] = axisGetPDim (axis); + ctx->sJ [ j] = 0; + ctx->dJ [ j] = 0; + ctx->aJ [ j] = 0; + }else if (axisIsInter(axis) && axisIsReduced(axis)){ + /* Inter Reduced Axis? */ + ctx->l [ j] = axisGetInterLen(axis); + ctx->lPDim [ j] = axisGetPDim (axis); + ctx->sJ [ j] = 0; + ctx->dJ [ j] = 0; + ctx->aJ [ j] = 0; + }else{ + /* Intra Axis? */ + ctx->ibs [ 0] = axisGetIntraLen(axis); + ctx->ibp [ 0] = axisGetIntraLen(axis); + ctx->iblPDim[ 0] = axisGetIntraLen(axis); + ctx->ibsOff [ 0] = axisGetSrcStride(axis); + ctx->ibdOff [ 0] = axisGetDstStride(axis); + ctx->ibaOff [ 0] = axisGetDstArgStride(axis); } } - return reduxInvoke(ctx); + return reduxInvSchedule(ctx); } +#if 0 +static void reduxScheduleKernel (int ndims, + uint64_t* dims, + uint64_t warpSize, + uint64_t maxLg, + uint64_t* maxLs, + uint64_t maxGg, + uint64_t* maxGs, + uint64_t* bs, + uint64_t* gs, + uint64_t* cs); + /** * @brief Given the parameters of a kernel scheduling problem, solve it as * optimally as possible. 
@@ -2545,124 +3382,204 @@ static void reduxScheduleKernel (int ndims, cs[i] = gaIFLGetProduct(&factCS[i]); } } +#endif /** - * Invoke the kernel. + * @brief With nearly all parameters of the kernel computed, schedule the + * kernel for maximum performance. + * + * The thread block size has already been chosen; We only have to choose + * + * 1. ctx->gs: The grid size, which is the number of thread blocks. + * 2. ctx->V: The number of vertical reductions per thread block. + * + * Two factors drive the scheduling: + * + * 1. We want to keep all multiprocessors of the device busy; For this we use + * an estimate of the level of parallelism of the device. + * 2. If V can be chosen such that V % B == 0, then only a single kernel + * phase is necessary. + * + * Once the scheduling is performed, the workspace can be allocated and + * workspace offsets can be computed. */ -static int reduxInvoke (redux_ctx* ctx){ - void* priArgs[11]; - int ret, i = 0; - int failedDstSteps = 0; - int failedDstArgSteps = 0; - int failedAuxChunkSize = 0; +static int reduxInvSchedule (redux_ctx* ctx){ + const int flags = GA_BUFFER_READ_WRITE; + size_t WSPACESIZE; + + /** + * Get enough blocks to fill available device parallelism to capacity. + * Then, compute corresponding V. + */ + + ctx->gs = DIVIDECEIL(reduxInvEstimateParallelism(ctx), + reduxGenGetMaxLocalSize(ctx->gr)); + ctx->V = DIVIDECEIL(ctx->U, ctx->gs); + + /** + * Allocate required workspace. + */ + + ctx->wdOff = reduxGenGetSHMEMDstOff (ctx->gr, 2*ctx->gs*ctx->D); + ctx->waOff = reduxGenGetSHMEMDstArgOff(ctx->gr, 2*ctx->gs*ctx->D); + WSPACESIZE = reduxGenGetSHMEMSize (ctx->gr, 2*ctx->gs*ctx->D); + ctx->w = gpudata_alloc(ctx->gr->gpuCtx, WSPACESIZE, 0, flags, 0); + if(!ctx->w){ + return reduxInvCleanupMsg(ctx, GA_MEMORY_ERROR, + "Could not allocate %zu-byte workspace for reduction!\n", + WSPACESIZE); + } + + return reduxInvoke(ctx); +} +/** + * @brief Invoke the kernel. + */ +static int reduxInvoke (redux_ctx* ctx){ + int ret, i, k; + /** - * Argument Marshalling. This the grossest gross thing in here. + * Argument Marshalling. 
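+         *
+         * The order in which arguments are packed below must match, one for
+         * one, the formal parameter list of the generated kernel; a mismatch
+         * would silently corrupt the launch rather than fail loudly.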
*/ - - const int flags = GA_BUFFER_READ_ONLY|GA_BUFFER_INIT; - ctx->srcStepsGD = gpudata_alloc(ctx->gpuCtx, ctx->ndfs * sizeof(size_t), - ctx->flatSrcStrides, flags, 0); - ctx->srcSizeGD = gpudata_alloc(ctx->gpuCtx, ctx->ndfs * sizeof(size_t), - ctx->flatSrcDimensions, flags, 0); - ctx->st1.chunkSizeGD = gpudata_alloc(ctx->gpuCtx, ctx->st1.ndh * sizeof(size_t), - ctx->st1.cs, flags, 0); - - priArgs[i++] = (void*) ctx->flatSrcData; - priArgs[i++] = (void*)&ctx->flatSrcOffset; - priArgs[i++] = (void*) ctx->srcStepsGD; - priArgs[i++] = (void*) ctx->srcSizeGD; - priArgs[i++] = (void*) ctx->st1.chunkSizeGD; - if (reduxKernelRequiresDst (ctx)){ - ctx->dstStepsGD = gpudata_alloc(ctx->gpuCtx, ctx->ndfd * sizeof(size_t), - ctx->flatDstStrides, flags, 0); - priArgs[i++] = (void*) ctx->flatDstData; - priArgs[i++] = (void*)&ctx->flatDstOffset; - priArgs[i++] = (void*) ctx->dstStepsGD; - failedDstSteps = !ctx->dstStepsGD; - } - if (reduxKernelRequiresDstArg(ctx)){ - ctx->dstArgStepsGD = gpudata_alloc(ctx->gpuCtx, ctx->ndfd * sizeof(size_t), - ctx->flatDstArgStrides, flags, 0); - priArgs[i++] = (void*) ctx->flatDstArgData; - priArgs[i++] = (void*)&ctx->flatDstArgOffset; - priArgs[i++] = (void*) ctx->dstArgStepsGD; - failedDstArgSteps = !ctx->dstArgStepsGD; + + i = 0; + ctx->kArgs[i++] = (void*)&ctx->phase; + ctx->kArgs[i++] = (void*)&ctx->U; + ctx->kArgs[i++] = (void*)&ctx->V; + ctx->kArgs[i++] = (void*)&ctx->B; + ctx->kArgs[i++] = (void*)&ctx->D; + ctx->kArgs[i++] = (void*)&ctx->H; + ctx->kArgs[i++] = (void*)&ctx->splitFree; + ctx->kArgs[i++] = (void*)&ctx->splitReduce; + for(k=0;k < ctx->gr->nds;k++){ + ctx->kArgs[i++] = (void*)&ctx->l[k]; + } + for(k=0;k < ctx->gr->ndr && reduxInvRequiresDstArg(ctx);k++){ + ctx->kArgs[i++] = (void*)&ctx->lPDim[k]; + } + ctx->kArgs[i++] = (void*) ctx->flatSrcData; + ctx->kArgs[i++] = (void*)&ctx->flatSrcOffset; + for(k=0;k < ctx->gr->nds;k++){ + ctx->kArgs[i++] = (void*)&ctx->sJ[k]; + } + if(reduxInvRequiresDst (ctx)){ + ctx->kArgs[i++] = (void*) ctx->flatDstData; + ctx->kArgs[i++] = (void*)&ctx->flatDstOffset; + for(k=0;k < ctx->gr->ndd;k++){ + ctx->kArgs[i++] = (void*)&ctx->dJ[k]; + } + } + if(reduxInvRequiresDstArg(ctx)){ + ctx->kArgs[i++] = (void*) ctx->flatDstArgData; + ctx->kArgs[i++] = (void*)&ctx->flatDstArgOffset; + for(k=0;k < ctx->gr->ndd;k++){ + ctx->kArgs[i++] = (void*)&ctx->aJ[k]; + } + } + ctx->kArgs[i++] = (void*) ctx->w; + if(reduxInvKernelRequiresDst (ctx)){ + ctx->kArgs[i++] = (void*)&ctx->wdOff; + ctx->kArgs[i++] = (void*)&ctx->pdOff; + } + if(reduxInvKernelRequiresDstArg(ctx)){ + ctx->kArgs[i++] = (void*)&ctx->waOff; + ctx->kArgs[i++] = (void*)&ctx->paOff; + } + for(k=0;k < ctx->gr->log2MaxL;k++){ + ctx->kArgs[i++] = (void*)&ctx->ibs[k]; } + for(k=0;k < ctx->gr->log2MaxL;k++){ + ctx->kArgs[i++] = (void*)&ctx->ibp[k]; + } + for(k=0;k < ctx->gr->log2MaxL && reduxInvRequiresDstArg(ctx);k++){ + ctx->kArgs[i++] = (void*)&ctx->iblPDim[k]; + } + for(k=0;k < ctx->gr->log2MaxL;k++){ + ctx->kArgs[i++] = (void*)&ctx->ibsOff[k]; + } + for(k=0;k < ctx->gr->log2MaxL && reduxInvRequiresDst (ctx);k++){ + ctx->kArgs[i++] = (void*)&ctx->ibdOff[k]; + } + for(k=0;k < ctx->gr->log2MaxL && reduxInvRequiresDstArg(ctx);k++){ + ctx->kArgs[i++] = (void*)&ctx->ibaOff[k]; + } + /** - * One or three kernels is now invoked, depending on the code model. + * The kernel is now invoked once or twice, for phase 0 or 1. + * + * Phase 1 is sometimes optional. 
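+         * Specifically, when V is a multiple of B, every block starts and
+         * ends exactly on a destination-element boundary, no partial
+         * reductions are left behind in the workspace, and the phase-1
+         * gather is skipped.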
*/ - if (ctx->srcStepsGD && - ctx->srcSizeGD && - ctx->st1.chunkSizeGD && - !failedDstSteps && - !failedDstArgSteps && - !failedAuxChunkSize){ - /* Reduction kernel invocation */ - ret = GpuKernel_call(&ctx->kernel, - ctx->st1.ndh>0 ? ctx->st1.ndh : 1, - ctx->st1.gs, - ctx->st1.bs, - 0, - priArgs); + ctx->phase = 0; + ret = GpuKernel_call(&ctx->gr->k, 1, &ctx->gs, &ctx->bs, ctx->SHMEM, ctx->kArgs); + if (ret != GA_NO_ERROR){ + return reduxInvCleanupMsg(ctx, ret, + "Failure in kernel call, Phase 0!\n"); + } + + if(ctx->V%ctx->B != 0){ + ctx->phase = 1; + ret = GpuKernel_call(&ctx->gr->k, 1, &ctx->gs, &ctx->bs, ctx->SHMEM, ctx->kArgs); if (ret != GA_NO_ERROR){ - return reduxCleanup(ctx, ret); + return reduxInvCleanupMsg(ctx, ret, + "Failure in kernel call, Phase 1!\n"); } - - return reduxCleanup(ctx, ret); - }else{ - return reduxCleanup(ctx, GA_MEMORY_ERROR); } + + /* Success! */ + return reduxInvCleanup(ctx, GA_NO_ERROR); } /** * Cleanup */ -static int reduxCleanup (redux_ctx* ctx, int ret){ - free(ctx->flatSrcDimensions); - free(ctx->flatSrcStrides); - free(ctx->flatDstStrides); - free(ctx->flatDstArgStrides); - free(ctx->tmpDstDimensions); - free(ctx->tmpDstStrides); - free(ctx->tmpDstArgStrides); - free(ctx->sourceCode); - free(ctx->errorString0); - free(ctx->errorString1); - ctx->flatSrcDimensions = NULL; - ctx->flatSrcStrides = NULL; - ctx->flatDstStrides = NULL; - ctx->flatDstArgStrides = NULL; - ctx->tmpDstDimensions = NULL; - ctx->tmpDstStrides = NULL; - ctx->tmpDstArgStrides = NULL; - ctx->sourceCode = NULL; - ctx->errorString0 = NULL; - ctx->errorString1 = NULL; - - gpudata_release(ctx->tmpDstData); - gpudata_release(ctx->tmpDstArgData); - gpudata_release(ctx->srcStepsGD); - gpudata_release(ctx->srcSizeGD); - gpudata_release(ctx->dstStepsGD); - gpudata_release(ctx->dstArgStepsGD); - gpudata_release(ctx->st1.chunkSizeGD); - gpudata_release(ctx->st2.chunkSizeGD); - ctx->srcStepsGD = ctx->srcSizeGD = - ctx->dstStepsGD = ctx->dstArgStepsGD = - ctx->st1.chunkSizeGD = ctx->st2.chunkSizeGD = NULL; +static int reduxInvCleanup (redux_ctx* ctx, int ret){ + free(ctx->l); + free(ctx->lPDim); + free(ctx->sJ); + free(ctx->dJ); + free(ctx->aJ); + free(ctx->ibs); + free(ctx->ibp); + free(ctx->iblPDim); + free(ctx->ibsOff); + free(ctx->ibdOff); + free(ctx->ibaOff); + free(ctx->kArgs); + free(ctx->xdSrc); + free(ctx->xdSrcPtrs); + free(ctx->xdTmpPtrs); + + gpudata_release(ctx->w); + + ctx->l = NULL; + ctx->lPDim = NULL; + ctx->sJ = NULL; + ctx->dJ = NULL; + ctx->aJ = NULL; + ctx->ibs = NULL; + ctx->ibp = NULL; + ctx->iblPDim = NULL; + ctx->ibsOff = NULL; + ctx->ibdOff = NULL; + ctx->ibaOff = NULL; + ctx->kArgs = NULL; + ctx->xdSrc = NULL; + ctx->xdSrcPtrs = NULL; + ctx->xdTmpPtrs = NULL; + + ctx->w = NULL; return ret; } - -static int reduxCleanupMsg (redux_ctx* ctx, int ret, - const char* fmt, ...){ +static int reduxInvCleanupMsg (redux_ctx* ctx, int ret, + const char* fmt, ...){ #if DEBUG FILE* fp = stderr; @@ -2675,5 +3592,5 @@ static int reduxCleanupMsg (redux_ctx* ctx, int ret, (void)fmt; #endif - return reduxCleanup(ctx, ret); + return reduxInvCleanup(ctx, ret); } diff --git a/tests/check_reduction.c b/tests/check_reduction.c index 18cf9e7615..567f384aaf 100644 --- a/tests/check_reduction.c +++ b/tests/check_reduction.c @@ -77,7 +77,7 @@ START_TEST(test_maxandargmax_reduction){ size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; - const unsigned reduxList[] = {0,2}; + const int reduxList[] = {0,2}; float *pSrc = calloc(sizeof(*pSrc), prodDims); float *pMax = 
calloc(sizeof(*pMax), dims[1]); @@ -113,7 +113,12 @@ START_TEST(test_maxandargmax_reduction){ ga_assert_ok(GpuArray_memset(&gaMax, -1)); /* 0xFFFFFFFF is a qNaN. */ ga_assert_ok(GpuArray_memset(&gaArgmax, -1)); - ga_assert_ok(GpuArray_reduction(GA_REDUCE_MAXANDARGMAX, &gaMax, &gaArgmax, &gaSrc, 2, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaSrc), + GA_REDUCE_MAXANDARGMAX, 1, 2, gaSrc.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, &gaMax, &gaArgmax, &gaSrc, 2, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read(pMax, sizeof(*pMax) *dims[1], &gaMax)); ga_assert_ok(GpuArray_read(pArgmax, sizeof(*pArgmax)*dims[1], &gaArgmax)); @@ -169,7 +174,7 @@ START_TEST(test_maxandargmax_idxtranspose){ size_t prodDims = dims[0]*dims[1]*dims[2]; size_t rdxDims[1] = {50}; size_t rdxProdDims = rdxDims[0]; - const unsigned reduxList[] = {2,0}; + const int reduxList[] = {2,0}; float *pSrc = calloc(sizeof(*pSrc), prodDims); float *pMax = calloc(sizeof(*pMax), rdxProdDims); @@ -205,7 +210,12 @@ START_TEST(test_maxandargmax_idxtranspose){ ga_assert_ok(GpuArray_memset(&gaMax, -1)); /* 0xFFFFFFFF is a qNaN. */ ga_assert_ok(GpuArray_memset(&gaArgmax, -1)); - ga_assert_ok(GpuArray_reduction(GA_REDUCE_MAXANDARGMAX, &gaMax, &gaArgmax, &gaSrc, 2, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaSrc), + GA_REDUCE_MAXANDARGMAX, 1, 2, gaSrc.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, &gaMax, &gaArgmax, &gaSrc, 2, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read(pMax, sizeof(*pMax) *rdxProdDims, &gaMax)); ga_assert_ok(GpuArray_read(pArgmax, sizeof(*pArgmax)*rdxProdDims, &gaArgmax)); @@ -258,7 +268,7 @@ START_TEST(test_maxandargmax_veryhighrank){ size_t prodDims = dims[0]*dims[1]*dims[2]*dims[3]*dims[4]*dims[5]*dims[6]*dims[7]; size_t rdxDims[4] = {1171,373,1,2}; size_t rdxProdDims = rdxDims[0]*rdxDims[1]*rdxDims[2]*rdxDims[3]; - const unsigned reduxList[] = {2,4,7,5}; + const int reduxList[] = {2,4,7,5}; float *pSrc = calloc(sizeof(*pSrc), prodDims); float *pMax = calloc(sizeof(*pMax), rdxProdDims); @@ -294,7 +304,12 @@ START_TEST(test_maxandargmax_veryhighrank){ ga_assert_ok(GpuArray_memset(&gaMax, -1)); /* 0xFFFFFFFF is a qNaN. */ ga_assert_ok(GpuArray_memset(&gaArgmax, -1)); - ga_assert_ok(GpuArray_reduction(GA_REDUCE_MAXANDARGMAX, &gaMax, &gaArgmax, &gaSrc, 4, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaSrc), + GA_REDUCE_MAXANDARGMAX, 4, 4, gaSrc.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, &gaMax, &gaArgmax, &gaSrc, 4, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read(pMax, sizeof(*pMax) *rdxProdDims, &gaMax)); ga_assert_ok(GpuArray_read(pArgmax, sizeof(*pArgmax)*rdxProdDims, &gaArgmax)); @@ -357,7 +372,7 @@ START_TEST(test_maxandargmax_alldimsreduced){ size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; - const unsigned reduxList[] = {0,1,2}; + const int reduxList[] = {0,1,2}; float *pSrc = calloc(sizeof(*pSrc), prodDims); float *pMax = calloc(1, sizeof(*pMax)); @@ -393,7 +408,12 @@ START_TEST(test_maxandargmax_alldimsreduced){ ga_assert_ok(GpuArray_memset(&gaMax, -1)); /* 0xFFFFFFFF is a qNaN. 
*/ ga_assert_ok(GpuArray_memset(&gaArgmax, -1)); - ga_assert_ok(GpuArray_reduction(GA_REDUCE_MAXANDARGMAX, &gaMax, &gaArgmax, &gaSrc, 3, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaSrc), + GA_REDUCE_MAXANDARGMAX, 0, 3, gaSrc.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, &gaMax, &gaArgmax, &gaSrc, 3, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read(pMax, sizeof(*pMax), &gaMax)); ga_assert_ok(GpuArray_read(pArgmax, sizeof(*pArgmax), &gaArgmax)); @@ -445,7 +465,7 @@ START_TEST(test_minandargmin_reduction){ size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; - const unsigned reduxList[] = {0,2}; + const int reduxList[] = {0,2}; float* pSrc = calloc(1, sizeof(*pSrc) * dims[0]*dims[1]*dims[2]); float* pMin = calloc(1, sizeof(*pMin) * dims[1] ); @@ -481,7 +501,12 @@ START_TEST(test_minandargmin_reduction){ ga_assert_ok(GpuArray_memset(&gaMin, -1)); /* 0xFFFFFFFF is a qNaN. */ ga_assert_ok(GpuArray_memset(&gaArgmin, -1)); - ga_assert_ok(GpuArray_reduction(GA_REDUCE_MINANDARGMIN, &gaMin, &gaArgmin, &gaSrc, 2, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaSrc), + GA_REDUCE_MINANDARGMIN, 1, 2, gaSrc.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, &gaMin, &gaArgmin, &gaSrc, 2, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read(pMin, sizeof(*pMin) *dims[1], &gaMin)); ga_assert_ok(GpuArray_read(pArgmin, sizeof(*pArgmin)*dims[1], &gaArgmin)); @@ -534,7 +559,7 @@ START_TEST(test_minandargmin_veryhighrank){ size_t prodDims = dims[0]*dims[1]*dims[2]*dims[3]*dims[4]*dims[5]*dims[6]*dims[7]; size_t rdxDims[4] = {1171,373,1,2}; size_t rdxProdDims = rdxDims[0]*rdxDims[1]*rdxDims[2]*rdxDims[3]; - const unsigned reduxList[] = {2,4,7,5}; + const int reduxList[] = {2,4,7,5}; float* pSrc = calloc(1, sizeof(*pSrc) * prodDims); float* pMin = calloc(1, sizeof(*pMin) * rdxProdDims); @@ -570,7 +595,12 @@ START_TEST(test_minandargmin_veryhighrank){ ga_assert_ok(GpuArray_memset(&gaMin, -1)); /* 0xFFFFFFFF is a qNaN. */ ga_assert_ok(GpuArray_memset(&gaArgmin, -1)); - ga_assert_ok(GpuArray_reduction(GA_REDUCE_MINANDARGMIN, &gaMin, &gaArgmin, &gaSrc, 4, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaSrc), + GA_REDUCE_MINANDARGMIN, 4, 4, gaSrc.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, &gaMin, &gaArgmin, &gaSrc, 4, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read(pMin, sizeof(*pMin) *rdxProdDims, &gaMin)); ga_assert_ok(GpuArray_read(pArgmin, sizeof(*pArgmin)*rdxProdDims, &gaArgmin)); @@ -633,7 +663,7 @@ START_TEST(test_minandargmin_alldimsreduced){ size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; - const unsigned reduxList[] = {0,1,2}; + const int reduxList[] = {0,1,2}; float* pSrc = calloc(1, sizeof(*pSrc) * dims[0]*dims[1]*dims[2]); float* pMin = calloc(1, sizeof(*pMin) ); @@ -669,7 +699,12 @@ START_TEST(test_minandargmin_alldimsreduced){ ga_assert_ok(GpuArray_memset(&gaMin, -1)); /* 0xFFFFFFFF is a qNaN. 
*/ ga_assert_ok(GpuArray_memset(&gaArgmin, -1)); - ga_assert_ok(GpuArray_reduction(GA_REDUCE_MINANDARGMIN, &gaMin, &gaArgmin, &gaSrc, 3, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaSrc), + GA_REDUCE_MINANDARGMIN, 0, 3, gaSrc.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, &gaMin, &gaArgmin, &gaSrc, 3, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read(pMin, sizeof(*pMin), &gaMin)); ga_assert_ok(GpuArray_read(pArgmin, sizeof(*pArgmin), &gaArgmin)); @@ -721,7 +756,7 @@ START_TEST(test_argmax_reduction){ size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; - const unsigned reduxList[] = {0,2}; + const int reduxList[] = {0,2}; float* pSrc = calloc(1, sizeof(*pSrc) * dims[0]*dims[1]*dims[2]); float* pMax = calloc(1, sizeof(*pMax) * dims[1] ); @@ -754,7 +789,12 @@ START_TEST(test_argmax_reduction){ ga_assert_ok(GpuArray_write(&gaSrc, pSrc, sizeof(*pSrc)*prodDims)); ga_assert_ok(GpuArray_memset(&gaArgmax, -1)); - ga_assert_ok(GpuArray_reduction(GA_REDUCE_ARGMAX, NULL, &gaArgmax, &gaSrc, 2, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaSrc), + GA_REDUCE_ARGMAX, 1, 2, gaSrc.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, NULL, &gaArgmax, &gaSrc, 2, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read(pArgmax, sizeof(*pArgmax)*dims[1], &gaArgmax)); @@ -804,7 +844,7 @@ START_TEST(test_argmax_veryhighrank){ size_t prodDims = dims[0]*dims[1]*dims[2]*dims[3]*dims[4]*dims[5]*dims[6]*dims[7]; size_t rdxDims[4] = {1171,373,1,2}; size_t rdxProdDims = rdxDims[0]*rdxDims[1]*rdxDims[2]*rdxDims[3]; - const unsigned reduxList[] = {2,4,7,5}; + const int reduxList[] = {2,4,7,5}; float* pSrc = calloc(1, sizeof(*pSrc) * prodDims); float* pMax = calloc(1, sizeof(*pMax) * rdxProdDims); @@ -836,7 +876,12 @@ START_TEST(test_argmax_veryhighrank){ ga_assert_ok(GpuArray_write(&gaSrc, pSrc, sizeof(*pSrc)*prodDims)); ga_assert_ok(GpuArray_memset(&gaArgmax, -1)); - ga_assert_ok(GpuArray_reduction(GA_REDUCE_ARGMAX, NULL, &gaArgmax, &gaSrc, 4, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaSrc), + GA_REDUCE_ARGMAX, 4, 4, gaSrc.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, NULL, &gaArgmax, &gaSrc, 4, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read(pArgmax, sizeof(*pArgmax)*rdxProdDims, &gaArgmax)); @@ -896,7 +941,7 @@ START_TEST(test_argmax_alldimsreduced){ size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; - const unsigned reduxList[] = {0,1,2}; + const int reduxList[] = {0,1,2}; float* pSrc = calloc(1, sizeof(*pSrc) * dims[0]*dims[1]*dims[2]); float* pMax = calloc(1, sizeof(*pMax) ); @@ -929,7 +974,12 @@ START_TEST(test_argmax_alldimsreduced){ ga_assert_ok(GpuArray_write(&gaSrc, pSrc, sizeof(*pSrc)*prodDims)); ga_assert_ok(GpuArray_memset(&gaArgmax, -1)); - ga_assert_ok(GpuArray_reduction(GA_REDUCE_ARGMAX, NULL, &gaArgmax, &gaSrc, 3, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaSrc), + GA_REDUCE_ARGMAX, 0, 3, gaSrc.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, NULL, &gaArgmax, &gaSrc, 3, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read(pArgmax, sizeof(*pArgmax), &gaArgmax)); @@ -978,7 +1028,7 @@ START_TEST(test_argmin_reduction){ size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; - const unsigned 
reduxList[] = {0,2}; + const int reduxList[] = {0,2}; float* pSrc = calloc(1, sizeof(*pSrc) * dims[0]*dims[1]*dims[2]); float* pMin = calloc(1, sizeof(*pMin) * dims[1] ); @@ -1011,7 +1061,12 @@ START_TEST(test_argmin_reduction){ ga_assert_ok(GpuArray_write(&gaSrc, pSrc, sizeof(*pSrc)*prodDims)); ga_assert_ok(GpuArray_memset(&gaArgmin, -1)); - ga_assert_ok(GpuArray_reduction(GA_REDUCE_ARGMIN, NULL, &gaArgmin, &gaSrc, 2, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaSrc), + GA_REDUCE_ARGMIN, 1, 2, gaSrc.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, NULL, &gaArgmin, &gaSrc, 2, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read(pArgmin, sizeof(*pArgmin)*dims[1], &gaArgmin)); @@ -1061,7 +1116,7 @@ START_TEST(test_argmin_veryhighrank){ size_t prodDims = dims[0]*dims[1]*dims[2]*dims[3]*dims[4]*dims[5]*dims[6]*dims[7]; size_t rdxDims[4] = {1171,373,1,2}; size_t rdxProdDims = rdxDims[0]*rdxDims[1]*rdxDims[2]*rdxDims[3]; - const unsigned reduxList[] = {2,4,7,5}; + const int reduxList[] = {2,4,7,5}; float* pSrc = calloc(1, sizeof(*pSrc) * prodDims); float* pMin = calloc(1, sizeof(*pMin) * rdxProdDims); @@ -1093,7 +1148,12 @@ START_TEST(test_argmin_veryhighrank){ ga_assert_ok(GpuArray_write(&gaSrc, pSrc, sizeof(*pSrc)*prodDims)); ga_assert_ok(GpuArray_memset(&gaArgmin, -1)); - ga_assert_ok(GpuArray_reduction(GA_REDUCE_ARGMIN, NULL, &gaArgmin, &gaSrc, 4, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaSrc), + GA_REDUCE_ARGMIN, 4, 4, gaSrc.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, NULL, &gaArgmin, &gaSrc, 4, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read(pArgmin, sizeof(*pArgmin)*rdxProdDims, &gaArgmin)); @@ -1153,7 +1213,7 @@ START_TEST(test_argmin_alldimsreduced){ size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; - const unsigned reduxList[] = {0,1,2}; + const int reduxList[] = {0,1,2}; float* pSrc = calloc(1, sizeof(*pSrc) * dims[0]*dims[1]*dims[2]); float* pMin = calloc(1, sizeof(*pMin) ); @@ -1186,7 +1246,12 @@ START_TEST(test_argmin_alldimsreduced){ ga_assert_ok(GpuArray_write(&gaSrc, pSrc, sizeof(*pSrc)*prodDims)); ga_assert_ok(GpuArray_memset(&gaArgmin, -1)); - ga_assert_ok(GpuArray_reduction(GA_REDUCE_ARGMIN, NULL, &gaArgmin, &gaSrc, 3, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaSrc), + GA_REDUCE_ARGMIN, 0, 3, gaSrc.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, NULL, &gaArgmin, &gaSrc, 3, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read(pArgmin, sizeof(*pArgmin), &gaArgmin)); @@ -1234,7 +1299,7 @@ START_TEST(test_max_reduction){ size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; - const unsigned reduxList[] = {0,2}; + const int reduxList[] = {0,2}; float* pSrc = calloc(1, sizeof(*pSrc) * dims[0]*dims[1]*dims[2]); float* pMax = calloc(1, sizeof(*pMax) * dims[1] ); @@ -1265,7 +1330,12 @@ START_TEST(test_max_reduction){ ga_assert_ok(GpuArray_write(&gaSrc, pSrc, sizeof(*pSrc)*prodDims)); ga_assert_ok(GpuArray_memset(&gaMax, -1)); /* 0xFFFFFFFF is a qNaN. 
*/ - ga_assert_ok(GpuArray_reduction(GA_REDUCE_MAX, &gaMax, NULL, &gaSrc, 2, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaSrc), + GA_REDUCE_MAX, 1, 2, gaSrc.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, &gaMax, NULL, &gaSrc, 2, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read(pMax, sizeof(*pMax) *dims[1], &gaMax)); @@ -1312,7 +1382,7 @@ START_TEST(test_max_veryhighrank){ size_t prodDims = dims[0]*dims[1]*dims[2]*dims[3]*dims[4]*dims[5]*dims[6]*dims[7]; size_t rdxDims[4] = {1171,373,1,2}; size_t rdxProdDims = rdxDims[0]*rdxDims[1]*rdxDims[2]*rdxDims[3]; - const unsigned reduxList[] = {2,4,7,5}; + const int reduxList[] = {2,4,7,5}; float* pSrc = calloc(1, sizeof(*pSrc) * prodDims); float* pMax = calloc(1, sizeof(*pMax) * rdxProdDims); @@ -1343,7 +1413,12 @@ START_TEST(test_max_veryhighrank){ ga_assert_ok(GpuArray_write(&gaSrc, pSrc, sizeof(*pSrc)*prodDims)); ga_assert_ok(GpuArray_memset(&gaMax, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_reduction(GA_REDUCE_MAX, &gaMax, NULL, &gaSrc, 4, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaSrc), + GA_REDUCE_MAX, 4, 4, gaSrc.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, &gaMax, NULL, &gaSrc, 4, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read(pMax, sizeof(*pMax) *rdxProdDims, &gaMax)); @@ -1400,7 +1475,7 @@ START_TEST(test_max_alldimsreduced){ size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; - const unsigned reduxList[] = {0,1,2}; + const int reduxList[] = {0,1,2}; float* pSrc = calloc(1, sizeof(*pSrc) * dims[0]*dims[1]*dims[2]); float* pMax = calloc(1, sizeof(*pMax) ); @@ -1431,7 +1506,12 @@ START_TEST(test_max_alldimsreduced){ ga_assert_ok(GpuArray_write(&gaSrc, pSrc, sizeof(*pSrc)*prodDims)); ga_assert_ok(GpuArray_memset(&gaMax, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_reduction(GA_REDUCE_MAX, &gaMax, NULL, &gaSrc, 3, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaSrc), + GA_REDUCE_MAX, 0, 3, gaSrc.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, &gaMax, NULL, &gaSrc, 3, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read(pMax, sizeof(*pMax), &gaMax)); @@ -1476,7 +1556,7 @@ START_TEST(test_min_reduction){ size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; - const unsigned reduxList[] = {0,2}; + const int reduxList[] = {0,2}; float* pSrc = calloc(1, sizeof(*pSrc) * dims[0]*dims[1]*dims[2]); float* pMin = calloc(1, sizeof(*pMin) * dims[1] ); @@ -1507,7 +1587,12 @@ START_TEST(test_min_reduction){ ga_assert_ok(GpuArray_write(&gaSrc, pSrc, sizeof(*pSrc)*prodDims)); ga_assert_ok(GpuArray_memset(&gaMin, -1)); /* 0xFFFFFFFF is a qNaN. 
*/ - ga_assert_ok(GpuArray_reduction(GA_REDUCE_MIN, &gaMin, NULL, &gaSrc, 2, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaSrc), + GA_REDUCE_MIN, 1, 2, gaSrc.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, &gaMin, NULL, &gaSrc, 2, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read(pMin, sizeof(*pMin) *dims[1], &gaMin)); @@ -1554,7 +1639,7 @@ START_TEST(test_min_veryhighrank){ size_t prodDims = dims[0]*dims[1]*dims[2]*dims[3]*dims[4]*dims[5]*dims[6]*dims[7]; size_t rdxDims[4] = {1171,373,1,2}; size_t rdxProdDims = rdxDims[0]*rdxDims[1]*rdxDims[2]*rdxDims[3]; - const unsigned reduxList[] = {2,4,7,5}; + const int reduxList[] = {2,4,7,5}; float* pSrc = calloc(1, sizeof(*pSrc) * prodDims); float* pMin = calloc(1, sizeof(*pMin) * rdxProdDims); @@ -1585,7 +1670,12 @@ START_TEST(test_min_veryhighrank){ ga_assert_ok(GpuArray_write(&gaSrc, pSrc, sizeof(*pSrc)*prodDims)); ga_assert_ok(GpuArray_memset(&gaMin, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_reduction(GA_REDUCE_MIN, &gaMin, NULL, &gaSrc, 4, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaSrc), + GA_REDUCE_MIN, 4, 4, gaSrc.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, &gaMin, NULL, &gaSrc, 4, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read(pMin, sizeof(*pMin) *rdxProdDims, &gaMin)); @@ -1642,7 +1732,7 @@ START_TEST(test_min_alldimsreduced){ size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; - const unsigned reduxList[] = {0,1,2}; + const int reduxList[] = {0,1,2}; float* pSrc = calloc(1, sizeof(*pSrc) * dims[0]*dims[1]*dims[2]); float* pMin = calloc(1, sizeof(*pMin) ); @@ -1673,7 +1763,12 @@ START_TEST(test_min_alldimsreduced){ ga_assert_ok(GpuArray_write(&gaSrc, pSrc, sizeof(*pSrc)*prodDims)); ga_assert_ok(GpuArray_memset(&gaMin, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_reduction(GA_REDUCE_MIN, &gaMin, NULL, &gaSrc, 3, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaSrc), + GA_REDUCE_MIN, 0, 3, gaSrc.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, &gaMin, NULL, &gaSrc, 3, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read(pMin, sizeof(*pMin), &gaMin)); @@ -1718,7 +1813,7 @@ START_TEST(test_sum_reduction){ size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; - const unsigned reduxList[] = {0,2}; + const int reduxList[] = {0,2}; const float TOL = 1e-5; float* pS = calloc(1, sizeof(*pS) * dims[0]*dims[1]*dims[2]); @@ -1750,7 +1845,12 @@ START_TEST(test_sum_reduction){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. 
*/ - ga_assert_ok(GpuArray_reduction(GA_REDUCE_SUM, &gaD, NULL, &gaS, 2, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaS), + GA_REDUCE_SUM, 1, 2, gaS.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, &gaD, NULL, &gaS, 2, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*dims[1], &gaD)); @@ -1794,7 +1894,7 @@ START_TEST(test_sum_veryhighrank){ size_t prodDims = dims[0]*dims[1]*dims[2]*dims[3]*dims[4]*dims[5]*dims[6]*dims[7]; size_t rdxDims[4] = {1171,373,1,2}; size_t rdxProdDims = rdxDims[0]*rdxDims[1]*rdxDims[2]*rdxDims[3]; - const unsigned reduxList[] = {2,4,7,5}; + const int reduxList[] = {2,4,7,5}; const float TOL = 1e-5; float* pS = calloc(1, sizeof(*pS) * prodDims); @@ -1826,7 +1926,12 @@ START_TEST(test_sum_veryhighrank){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_reduction(GA_REDUCE_SUM, &gaD, NULL, &gaS, 4, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaS), + GA_REDUCE_SUM, 4, 4, gaS.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, &gaD, NULL, &gaS, 4, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*rdxProdDims, &gaD)); @@ -1880,7 +1985,7 @@ START_TEST(test_sum_alldimsreduced){ size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; - const unsigned reduxList[] = {0,1,2}; + const int reduxList[] = {0,1,2}; const float TOL = 1e-5; float* pS = calloc(1, sizeof(*pS) * dims[0]*dims[1]*dims[2]); @@ -1912,7 +2017,12 @@ START_TEST(test_sum_alldimsreduced){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_reduction(GA_REDUCE_SUM, &gaD, NULL, &gaS, 3, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaS), + GA_REDUCE_SUM, 0, 3, gaS.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, &gaD, NULL, &gaS, 3, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read (pD, sizeof(*pD), &gaD)); @@ -1954,7 +2064,7 @@ START_TEST(test_prod_reduction){ size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; - const unsigned reduxList[] = {0,2}; + const int reduxList[] = {0,2}; const float TOL = 1e-5; float* pS = calloc(1, sizeof(*pS) * dims[0]*dims[1]*dims[2]); @@ -1986,7 +2096,12 @@ START_TEST(test_prod_reduction){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. 
*/ - ga_assert_ok(GpuArray_reduction(GA_REDUCE_PROD, &gaD, NULL, &gaS, 2, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaS), + GA_REDUCE_PROD, 1, 2, gaS.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, &gaD, NULL, &gaS, 2, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*dims[1], &gaD)); @@ -2030,7 +2145,7 @@ START_TEST(test_prod_veryhighrank){ size_t prodDims = dims[0]*dims[1]*dims[2]*dims[3]*dims[4]*dims[5]*dims[6]*dims[7]; size_t rdxDims[4] = {1171,373,1,2}; size_t rdxProdDims = rdxDims[0]*rdxDims[1]*rdxDims[2]*rdxDims[3]; - const unsigned reduxList[] = {2,4,7,5}; + const int reduxList[] = {2,4,7,5}; const float TOL = 1e-5; float* pS = calloc(1, sizeof(*pS) * prodDims); @@ -2062,7 +2177,12 @@ START_TEST(test_prod_veryhighrank){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_reduction(GA_REDUCE_PROD, &gaD, NULL, &gaS, 4, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaS), + GA_REDUCE_PROD, 4, 4, gaS.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, &gaD, NULL, &gaS, 4, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*rdxProdDims, &gaD)); @@ -2116,7 +2236,7 @@ START_TEST(test_prod_alldimsreduced){ size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; - const unsigned reduxList[] = {0,1,2}; + const int reduxList[] = {0,1,2}; const float TOL = 1e-5; float* pS = calloc(1, sizeof(*pS) * dims[0]*dims[1]*dims[2]); @@ -2148,7 +2268,12 @@ START_TEST(test_prod_alldimsreduced){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_reduction(GA_REDUCE_PROD, &gaD, NULL, &gaS, 3, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaS), + GA_REDUCE_PROD, 0, 3, gaS.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, &gaD, NULL, &gaS, 3, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read (pD, sizeof(*pD), &gaD)); @@ -2190,7 +2315,7 @@ START_TEST(test_prodnz_reduction){ size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; - const unsigned reduxList[] = {0,2}; + const int reduxList[] = {0,2}; const float TOL = 1e-5; float* pS = calloc(1, sizeof(*pS) * dims[0]*dims[1]*dims[2]); @@ -2225,7 +2350,12 @@ START_TEST(test_prodnz_reduction){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. 
*/ - ga_assert_ok(GpuArray_reduction(GA_REDUCE_PRODNZ, &gaD, NULL, &gaS, 2, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaS), + GA_REDUCE_PRODNZ, 1, 2, gaS.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, &gaD, NULL, &gaS, 2, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*dims[1], &gaD)); @@ -2269,7 +2399,7 @@ START_TEST(test_prodnz_veryhighrank){ size_t prodDims = dims[0]*dims[1]*dims[2]*dims[3]*dims[4]*dims[5]*dims[6]*dims[7]; size_t rdxDims[4] = {1171,373,1,2}; size_t rdxProdDims = rdxDims[0]*rdxDims[1]*rdxDims[2]*rdxDims[3]; - const unsigned reduxList[] = {2,4,7,5}; + const int reduxList[] = {2,4,7,5}; const float TOL = 1e-5; float* pS = calloc(1, sizeof(*pS) * prodDims); @@ -2304,7 +2434,12 @@ START_TEST(test_prodnz_veryhighrank){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_reduction(GA_REDUCE_PRODNZ, &gaD, NULL, &gaS, 4, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaS), + GA_REDUCE_PRODNZ, 4, 4, gaS.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, &gaD, NULL, &gaS, 4, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*rdxProdDims, &gaD)); @@ -2358,7 +2493,7 @@ START_TEST(test_prodnz_alldimsreduced){ size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; - const unsigned reduxList[] = {0,1,2}; + const int reduxList[] = {0,1,2}; const float TOL = 1e-5; float* pS = calloc(1, sizeof(*pS) * dims[0]*dims[1]*dims[2]); @@ -2393,7 +2528,12 @@ START_TEST(test_prodnz_alldimsreduced){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. 
*/ - ga_assert_ok(GpuArray_reduction(GA_REDUCE_PRODNZ, &gaD, NULL, &gaS, 3, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaS), + GA_REDUCE_PRODNZ, 0, 3, gaS.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, &gaD, NULL, &gaS, 3, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read (pD, sizeof(*pD), &gaD)); @@ -2435,7 +2575,7 @@ START_TEST(test_and_reduction){ size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; - const unsigned reduxList[] = {0,2}; + const int reduxList[] = {0,2}; uint32_t* pS = calloc(1, sizeof(*pS) * dims[0]*dims[1]*dims[2]); uint32_t* pD = calloc(1, sizeof(*pD) * dims[1] ); @@ -2475,7 +2615,12 @@ START_TEST(test_and_reduction){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); - ga_assert_ok(GpuArray_reduction(GA_REDUCE_AND, &gaD, NULL, &gaS, 2, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaS), + GA_REDUCE_AND, 1, 2, gaS.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, &gaD, NULL, &gaS, 2, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*dims[1], &gaD)); @@ -2519,7 +2664,7 @@ START_TEST(test_and_veryhighrank){ size_t prodDims = dims[0]*dims[1]*dims[2]*dims[3]*dims[4]*dims[5]*dims[6]*dims[7]; size_t rdxDims[4] = {1171,373,1,2}; size_t rdxProdDims = rdxDims[0]*rdxDims[1]*rdxDims[2]*rdxDims[3]; - const unsigned reduxList[] = {2,4,7,5}; + const int reduxList[] = {2,4,7,5}; uint32_t* pS = calloc(1, sizeof(*pS) * prodDims); uint32_t* pD = calloc(1, sizeof(*pD) * rdxProdDims); @@ -2559,7 +2704,12 @@ START_TEST(test_and_veryhighrank){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_reduction(GA_REDUCE_AND, &gaD, NULL, &gaS, 4, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaS), + GA_REDUCE_AND, 4, 4, gaS.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, &gaD, NULL, &gaS, 4, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*rdxProdDims, &gaD)); @@ -2613,7 +2763,7 @@ START_TEST(test_and_alldimsreduced){ size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; - const unsigned reduxList[] = {0,1,2}; + const int reduxList[] = {0,1,2}; uint32_t* pS = calloc(1, sizeof(*pS) * dims[0]*dims[1]*dims[2]); uint32_t* pD = calloc(1, sizeof(*pD) ); @@ -2653,7 +2803,12 @@ START_TEST(test_and_alldimsreduced){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. 
*/ - ga_assert_ok(GpuArray_reduction(GA_REDUCE_AND, &gaD, NULL, &gaS, 3, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaS), + GA_REDUCE_AND, 0, 3, gaS.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, &gaD, NULL, &gaS, 3, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read (pD, sizeof(*pD), &gaD)); @@ -2695,7 +2850,7 @@ START_TEST(test_or_reduction){ size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; - const unsigned reduxList[] = {0,2}; + const int reduxList[] = {0,2}; uint32_t* pS = calloc(1, sizeof(*pS) * dims[0]*dims[1]*dims[2]); uint32_t* pD = calloc(1, sizeof(*pD) * dims[1] ); @@ -2735,7 +2890,12 @@ START_TEST(test_or_reduction){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); - ga_assert_ok(GpuArray_reduction(GA_REDUCE_OR, &gaD, NULL, &gaS, 2, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaS), + GA_REDUCE_OR, 1, 2, gaS.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, &gaD, NULL, &gaS, 2, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*dims[1], &gaD)); @@ -2779,7 +2939,7 @@ START_TEST(test_or_veryhighrank){ size_t prodDims = dims[0]*dims[1]*dims[2]*dims[3]*dims[4]*dims[5]*dims[6]*dims[7]; size_t rdxDims[4] = {1171,373,1,2}; size_t rdxProdDims = rdxDims[0]*rdxDims[1]*rdxDims[2]*rdxDims[3]; - const unsigned reduxList[] = {2,4,7,5}; + const int reduxList[] = {2,4,7,5}; uint32_t* pS = calloc(1, sizeof(*pS) * prodDims); uint32_t* pD = calloc(1, sizeof(*pD) * rdxProdDims); @@ -2819,7 +2979,12 @@ START_TEST(test_or_veryhighrank){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_reduction(GA_REDUCE_OR, &gaD, NULL, &gaS, 4, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaS), + GA_REDUCE_OR, 4, 4, gaS.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, &gaD, NULL, &gaS, 4, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*rdxProdDims, &gaD)); @@ -2873,7 +3038,7 @@ START_TEST(test_or_alldimsreduced){ size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; - const unsigned reduxList[] = {0,1,2}; + const int reduxList[] = {0,1,2}; uint32_t* pS = calloc(1, sizeof(*pS) * dims[0]*dims[1]*dims[2]); uint32_t* pD = calloc(1, sizeof(*pD) ); @@ -2913,7 +3078,12 @@ START_TEST(test_or_alldimsreduced){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. 
*/ - ga_assert_ok(GpuArray_reduction(GA_REDUCE_OR, &gaD, NULL, &gaS, 3, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaS), + GA_REDUCE_OR, 0, 3, gaS.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, &gaD, NULL, &gaS, 3, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read (pD, sizeof(*pD), &gaD)); @@ -2955,7 +3125,7 @@ START_TEST(test_xor_reduction){ size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; - const unsigned reduxList[] = {0,2}; + const int reduxList[] = {0,2}; uint32_t* pS = calloc(1, sizeof(*pS) * dims[0]*dims[1]*dims[2]); uint32_t* pD = calloc(1, sizeof(*pD) * dims[1] ); @@ -2991,7 +3161,12 @@ START_TEST(test_xor_reduction){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); - ga_assert_ok(GpuArray_reduction(GA_REDUCE_XOR, &gaD, NULL, &gaS, 2, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaS), + GA_REDUCE_XOR, 1, 2, gaS.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, &gaD, NULL, &gaS, 2, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*dims[1], &gaD)); @@ -3035,7 +3210,7 @@ START_TEST(test_xor_veryhighrank){ size_t prodDims = dims[0]*dims[1]*dims[2]*dims[3]*dims[4]*dims[5]*dims[6]*dims[7]; size_t rdxDims[4] = {1171,373,1,2}; size_t rdxProdDims = rdxDims[0]*rdxDims[1]*rdxDims[2]*rdxDims[3]; - const unsigned reduxList[] = {2,4,7,5}; + const int reduxList[] = {2,4,7,5}; uint32_t* pS = calloc(1, sizeof(*pS) * prodDims); uint32_t* pD = calloc(1, sizeof(*pD) * rdxProdDims); @@ -3071,7 +3246,12 @@ START_TEST(test_xor_veryhighrank){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_reduction(GA_REDUCE_XOR, &gaD, NULL, &gaS, 4, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaS), + GA_REDUCE_XOR, 4, 4, gaS.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, &gaD, NULL, &gaS, 4, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*rdxProdDims, &gaD)); @@ -3125,7 +3305,7 @@ START_TEST(test_xor_alldimsreduced){ size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; - const unsigned reduxList[] = {0,1,2}; + const int reduxList[] = {0,1,2}; uint32_t* pS = calloc(1, sizeof(*pS) * dims[0]*dims[1]*dims[2]); uint32_t* pD = calloc(1, sizeof(*pD) ); @@ -3161,7 +3341,12 @@ START_TEST(test_xor_alldimsreduced){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. 
*/ - ga_assert_ok(GpuArray_reduction(GA_REDUCE_XOR, &gaD, NULL, &gaS, 3, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaS), + GA_REDUCE_XOR, 0, 3, gaS.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, &gaD, NULL, &gaS, 3, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read (pD, sizeof(*pD), &gaD)); @@ -3203,7 +3388,7 @@ START_TEST(test_any_reduction){ size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; - const unsigned reduxList[] = {0,2}; + const int reduxList[] = {0,2}; uint32_t* pS = calloc(1, sizeof(*pS) * dims[0]*dims[1]*dims[2]); uint32_t* pD = calloc(1, sizeof(*pD) * dims[1] ); @@ -3239,7 +3424,12 @@ START_TEST(test_any_reduction){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); - ga_assert_ok(GpuArray_reduction(GA_REDUCE_ANY, &gaD, NULL, &gaS, 2, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaS), + GA_REDUCE_ANY, 1, 2, gaS.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, &gaD, NULL, &gaS, 2, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*dims[1], &gaD)); @@ -3283,7 +3473,7 @@ START_TEST(test_any_veryhighrank){ size_t prodDims = dims[0]*dims[1]*dims[2]*dims[3]*dims[4]*dims[5]*dims[6]*dims[7]; size_t rdxDims[4] = {1171,373,1,2}; size_t rdxProdDims = rdxDims[0]*rdxDims[1]*rdxDims[2]*rdxDims[3]; - const unsigned reduxList[] = {2,4,7,5}; + const int reduxList[] = {2,4,7,5}; uint32_t* pS = calloc(1, sizeof(*pS) * prodDims); uint32_t* pD = calloc(1, sizeof(*pD) * rdxProdDims); @@ -3319,7 +3509,12 @@ START_TEST(test_any_veryhighrank){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_reduction(GA_REDUCE_ANY, &gaD, NULL, &gaS, 4, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaS), + GA_REDUCE_ANY, 4, 4, gaS.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, &gaD, NULL, &gaS, 4, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*rdxProdDims, &gaD)); @@ -3373,7 +3568,7 @@ START_TEST(test_any_alldimsreduced){ size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; - const unsigned reduxList[] = {0,1,2}; + const int reduxList[] = {0,1,2}; uint32_t* pS = calloc(1, sizeof(*pS) * dims[0]*dims[1]*dims[2]); uint32_t* pD = calloc(1, sizeof(*pD) ); @@ -3409,7 +3604,12 @@ START_TEST(test_any_alldimsreduced){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. 
*/ - ga_assert_ok(GpuArray_reduction(GA_REDUCE_ANY, &gaD, NULL, &gaS, 3, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaS), + GA_REDUCE_ANY, 0, 3, gaS.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, &gaD, NULL, &gaS, 3, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read (pD, sizeof(*pD), &gaD)); @@ -3451,7 +3651,7 @@ START_TEST(test_all_reduction){ size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; - const unsigned reduxList[] = {0,2}; + const int reduxList[] = {0,2}; uint32_t* pS = calloc(1, sizeof(*pS) * dims[0]*dims[1]*dims[2]); uint32_t* pD = calloc(1, sizeof(*pD) * dims[1] ); @@ -3487,7 +3687,12 @@ START_TEST(test_all_reduction){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); - ga_assert_ok(GpuArray_reduction(GA_REDUCE_ALL, &gaD, NULL, &gaS, 2, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaS), + GA_REDUCE_ALL, 1, 2, gaS.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, &gaD, NULL, &gaS, 2, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*dims[1], &gaD)); @@ -3531,7 +3736,7 @@ START_TEST(test_all_veryhighrank){ size_t prodDims = dims[0]*dims[1]*dims[2]*dims[3]*dims[4]*dims[5]*dims[6]*dims[7]; size_t rdxDims[4] = {1171,373,1,2}; size_t rdxProdDims = rdxDims[0]*rdxDims[1]*rdxDims[2]*rdxDims[3]; - const unsigned reduxList[] = {2,4,7,5}; + const int reduxList[] = {2,4,7,5}; uint32_t* pS = calloc(1, sizeof(*pS) * prodDims); uint32_t* pD = calloc(1, sizeof(*pD) * rdxProdDims); @@ -3567,7 +3772,12 @@ START_TEST(test_all_veryhighrank){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ - ga_assert_ok(GpuArray_reduction(GA_REDUCE_ALL, &gaD, NULL, &gaS, 4, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaS), + GA_REDUCE_ALL, 4, 4, gaS.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, &gaD, NULL, &gaS, 4, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*rdxProdDims, &gaD)); @@ -3621,7 +3831,7 @@ START_TEST(test_all_alldimsreduced){ size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; - const unsigned reduxList[] = {0,1,2}; + const int reduxList[] = {0,1,2}; uint32_t* pS = calloc(1, sizeof(*pS) * dims[0]*dims[1]*dims[2]); uint32_t* pD = calloc(1, sizeof(*pD) ); @@ -3657,7 +3867,12 @@ START_TEST(test_all_alldimsreduced){ ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. 
*/ - ga_assert_ok(GpuArray_reduction(GA_REDUCE_ALL, &gaD, NULL, &gaS, 3, reduxList)); + GpuReduction* gr; + GpuReduction_new(&gr, GpuArray_context(&gaS), + GA_REDUCE_ALL, 0, 3, gaS.typecode, 0); + ck_assert_ptr_nonnull(gr); + ga_assert_ok(GpuReduction_call(gr, &gaD, NULL, &gaS, 3, reduxList, 0)); + GpuReduction_free(gr); ga_assert_ok(GpuArray_read (pD, sizeof(*pD), &gaD)); @@ -3694,11 +3909,11 @@ Suite *get_suite(void) { TCase *tc = tcase_create("basic"); tcase_add_checked_fixture(tc, setup, teardown); tcase_set_timeout(tc, 120.0); - - tcase_add_test(tc, test_maxandargmax_reduction); - tcase_add_test(tc, test_maxandargmax_idxtranspose); + tcase_add_test(tc, test_maxandargmax_veryhighrank); tcase_add_test(tc, test_maxandargmax_alldimsreduced); + tcase_add_test(tc, test_maxandargmax_reduction); + tcase_add_test(tc, test_maxandargmax_idxtranspose); tcase_add_test(tc, test_minandargmin_reduction); tcase_add_test(tc, test_minandargmin_veryhighrank); From 8fc792bca3d08387f7deb8315814843156804db3 Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Mon, 3 Jul 2017 23:00:39 -0400 Subject: [PATCH 17/34] More fixes. 40% of tests still failing, and the code has a wierd smell to it that I really don't appreciate. --- src/gpuarray_reduction.c | 796 +++++++++++++++++++++++---------------- tests/check_reduction.c | 6 +- 2 files changed, 479 insertions(+), 323 deletions(-) diff --git a/src/gpuarray_reduction.c b/src/gpuarray_reduction.c index 81376b93b8..f8cd15abfb 100644 --- a/src/gpuarray_reduction.c +++ b/src/gpuarray_reduction.c @@ -261,6 +261,15 @@ struct GpuReduction{ }; +/* Typedefs */ +typedef void (*GpuReductionIterFn)(GpuReduction* gr, + int typecode, + const char* typeName, + const char* baseName, + int num, + void* user); + + /* Static Function prototypes */ /* Utilities */ static int reduxGetSumInit (int typecode, const char** property); @@ -276,7 +285,7 @@ static int reduxSortPtrIBSrcRdSelect (const void* a, const void* b); static int reduxSortPtrByReduxNum (const void* a, const void* b); static int reduxSortPtrIBDstWrSelect (const void* a, const void* b); static int reduxSortPtrIBDstArgWrSelect (const void* a, const void* b); -static int reduxSortPtrFinalOrder (const void* a, const void* b); +static int reduxSortPtrInsertFinalOrder (const void* a, const void* b); /* Axis Description API */ static void axisInit (axis_desc* axis, @@ -290,6 +299,7 @@ static int axisGetReduxNum (const axis_desc* axis); static size_t axisGetLen (const axis_desc* axis); static size_t axisGetIntraLen (const axis_desc* axis); static size_t axisGetInterLen (const axis_desc* axis); +static size_t axisGetIntraInterLen (const axis_desc* axis); static ssize_t axisGetSrcStride (const axis_desc* axis); static size_t axisGetSrcAbsStride (const axis_desc* axis); static ssize_t axisGetDstStride (const axis_desc* axis); @@ -312,6 +322,9 @@ static int axisIsSplit (const axis_desc* axis); /* Generator Control Flow */ static int reduxGenInit (GpuReduction* gr); static int reduxGenInferProperties (GpuReduction* gr); +static void reduxGenIterArgs (GpuReduction* gr, + GpuReductionIterFn fn, + void* user); static int reduxGenSrc (GpuReduction* gr); static void reduxGenSrcAppend (GpuReduction* gr); static void reduxGenSrcAppendIncludes (GpuReduction* gr); @@ -350,6 +363,30 @@ static int reduxGenCleanupMsg (GpuReduction* gr, int r const char* fmt, ...); /* Generator Utilities */ +static void reduxGenCountArgs (GpuReduction* gr, + int typecode, + const char* typeName, + const char* baseName, + int num, + void* user); +static void 
reduxGenSaveArgTypecodes (GpuReduction* gr, + int typecode, + const char* typeName, + const char* baseName, + int num, + void* user); +static void reduxGenAppendArg (GpuReduction* gr, + int typecode, + const char* typeName, + const char* baseName, + int num, + void* user); +static void reduxInvMarshalArg (GpuReduction* gr, + int typecode, + const char* typeName, + const char* baseName, + int num, + void* user); static size_t reduxGenEstimateParallelism (const GpuReduction* gr); static int reduxGenRequiresDst (const GpuReduction* gr); static int reduxGenRequiresDstArg (const GpuReduction* gr); @@ -361,6 +398,9 @@ static size_t reduxGenGetMaxLocalSize (const GpuReduction* gr); static size_t reduxGenGetSHMEMSize (const GpuReduction* gr, size_t bs); static size_t reduxGenGetSHMEMDstOff (const GpuReduction* gr, size_t bs); static size_t reduxGenGetSHMEMDstArgOff (const GpuReduction* gr, size_t bs); +static size_t reduxGenGetWMEMSize (const GpuReduction* gr, size_t bs); +static size_t reduxGenGetWMEMDstOff (const GpuReduction* gr, size_t bs); +static size_t reduxGenGetWMEMDstArgOff (const GpuReduction* gr, size_t bs); /* Invoker Control Flow */ static int reduxInvInit (redux_ctx* ctx); @@ -922,23 +962,21 @@ static int reduxSortPtrIBDstArgWrSelect (const void* a, const void* b){ return 0; } -static int reduxSortPtrFinalOrder (const void* a, const void* b){ +static int reduxSortPtrInsertFinalOrder (const void* a, const void* b){ const axis_desc* xda = *(const axis_desc* const*)a; const axis_desc* xdb = *(const axis_desc* const*)b; - /* All intra axes go last. */ + + /* All intra axes go first. */ if (axisIsIntra(xda) && axisIsInter(xdb)){ - return +1; - }else if (axisIsInter(xda) && axisIsIntra(xdb)){ return -1; + }else if (axisIsInter(xda) && axisIsIntra(xdb)){ + return +1; } - if(axisIsIntra(xda)){ /** * Intra axes sort between themselves by descending intra axis number. - * The split axis is always intra, and since it has the highest intra - * axis number it will always sort first. */ if (axisGetIBNum(xda) < axisGetIBNum(xdb)){ @@ -949,18 +987,23 @@ static int reduxSortPtrFinalOrder (const void* a, const void* b){ return 0; }else{ - /* All free inter axes go first (i{0..3}) */ + /** + * Inter axes sort between themselves + * + * - Reduced axes first + * - Then by ascending source tensor stride + */ + if ( axisIsReduced(xda) && !axisIsReduced(xdb)){ - return +1; - }else if (!axisIsReduced(xda) && axisIsReduced(xdb)){ return -1; + }else if (!axisIsReduced(xda) && axisIsReduced(xdb)){ + return +1; } - /* Otherwise it's sort by descending source argument absolute stride. */ if (axisGetSrcAbsStride(xda) < axisGetSrcAbsStride(xdb)){ - return +1; - }else if (axisGetSrcAbsStride(xda) > axisGetSrcAbsStride(xdb)){ return -1; + }else if (axisGetSrcAbsStride(xda) > axisGetSrcAbsStride(xdb)){ + return +1; } } @@ -1040,6 +1083,9 @@ static size_t axisGetInterLen (const axis_desc* axis){ return axis->len; } } +static size_t axisGetIntraInterLen (const axis_desc* axis){ + return axisGetIntraLen(axis)*axisGetInterLen(axis); +} static ssize_t axisGetSrcStride (const axis_desc* axis){ return axisGetLen(axis) > 1 ? 
axis->srcStride : 0; } @@ -1275,6 +1321,7 @@ static int reduxGenInit (GpuReduction* gr){ gr->kArgTypeCodes = NULL; gr->kSourceCode = NULL; gr->kErrorString = NULL; + gr->kNumArgs = 0; return reduxGenInferProperties(gr); } @@ -1285,7 +1332,6 @@ static int reduxGenInit (GpuReduction* gr){ static int reduxGenInferProperties (GpuReduction* gr){ int i, ret; - int k; /** @@ -1412,6 +1458,7 @@ static int reduxGenInferProperties (GpuReduction* gr){ "Problem selecting types to be used in reduction!\n"); } + /* Compute floor(log2(gr->log2MaxL)). */ gr->log2MaxL = gr->maxLg-1; for(i=1;gr->log2MaxL & (gr->log2MaxL+1);i*=2){ @@ -1420,104 +1467,98 @@ static int reduxGenInferProperties (GpuReduction* gr){ for(i=0;gr->log2MaxL;i++){ gr->log2MaxL >>= 1; } - gr->log2MaxL = i; - - /* Compute number of kernel arguments. */ - gr->kNumArgs = 6 /* phase, U, V, B, D, H */ - + 2 /* splitFree, splitReduce */ - + gr->nds /* l{0..n} */ - + reduxGenRequiresDstArg(gr)*gr->ndr /* l{m..n}PDim */ - + 1 /* s */ - + 1 /* sOff */ - + gr->nds /* sJ{0..n} */ - + reduxGenRequiresDst (gr) /* d */ - + reduxGenRequiresDst (gr) /* dOff */ - + reduxGenRequiresDst (gr)*gr->ndd /* dJ{0..m} */ - + reduxGenRequiresDstArg(gr) /* a */ - + reduxGenRequiresDstArg(gr) /* aOff */ - + reduxGenRequiresDstArg(gr)*gr->ndd /* aJ{0..m} */ - + 1 /* w */ - + reduxGenKernelRequiresDst (gr)*2 /* wdOff, pdOff */ - + reduxGenKernelRequiresDstArg(gr)*2 /* waOff, paOff */ - + gr->log2MaxL /* bs{0..p} */ - + gr->log2MaxL /* bp{0..p} */ - + reduxGenRequiresDstArg(gr)*gr->log2MaxL /* bi{0..p} */ - + gr->log2MaxL /* bsOff{0..p} */ - + reduxGenRequiresDst (gr)*gr->log2MaxL /* bdOff{0..p} */ - + reduxGenRequiresDstArg(gr)*gr->log2MaxL;/* baOff{0..p} */ - - - /* Construct kernel argument typecode list */ + gr->log2MaxL = i?i:1; + + + /** + * Compute number of kernel arguments and construct kernel argument + * typecode list. + */ + + reduxGenIterArgs(gr, reduxGenCountArgs, 0); gr->kArgTypeCodes = calloc(gr->kNumArgs, sizeof(*gr->kArgTypeCodes)); if(!gr->kArgTypeCodes){ return reduxGenCleanupMsg(gr, GA_MEMORY_ERROR, "Failed to allocate memory for kernel arguments " "typecode list!\n"); } - i = 0; - gr->kArgTypeCodes[i++] = GA_INT; /* phase */ - gr->kArgTypeCodes[i++] = GA_SIZE; /* U */ - gr->kArgTypeCodes[i++] = GA_SIZE; /* V */ - gr->kArgTypeCodes[i++] = GA_SIZE; /* B */ - gr->kArgTypeCodes[i++] = GA_UINT; /* D */ - gr->kArgTypeCodes[i++] = GA_UINT; /* H */ - gr->kArgTypeCodes[i++] = GA_UINT; /* splitFree */ - gr->kArgTypeCodes[i++] = GA_UINT; /* splitReduce */ + reduxGenIterArgs(gr, reduxGenSaveArgTypecodes, &i); + + + /* Generate source code. */ + return reduxGenSrc(gr); +} + +/** + * Iterate over the arguments of the reduction operator. 
+ */ + +static void reduxGenIterArgs (GpuReduction* gr, + GpuReductionIterFn fn, + void* user){ + int k; + + fn(gr, GA_INT, "int", "phase", 0, user); + fn(gr, GA_SIZE, "TX", "U", 0, user); + fn(gr, GA_SIZE, "TX", "V", 0, user); + fn(gr, GA_SIZE, "TX", "B", 0, user); + fn(gr, GA_UINT, "unsigned", "D", 0, user); + fn(gr, GA_UINT, "unsigned", "H", 0, user); + fn(gr, GA_UINT, "unsigned", "splitFree", 0, user); + fn(gr, GA_UINT, "unsigned", "splitReduce", 0, user); for(k=0;k < gr->nds;k++){ - gr->kArgTypeCodes[i++] = GA_SIZE; /* lN */ + fn(gr, GA_SIZE, "TX", "l%d", k, user); } - for(k=0;k < gr->ndr && reduxGenRequiresDstArg(gr);k++){ - gr->kArgTypeCodes[i++] = GA_SIZE; /* lNPDim */ + for(k=gr->ndd;k < gr->nds && reduxGenRequiresDstArg(gr);k++){ + fn(gr, GA_SIZE, "TX", "l%dPDim", k, user); } - gr->kArgTypeCodes[i++] = GA_BUFFER;/* s */ - gr->kArgTypeCodes[i++] = GA_SSIZE; /* sOff */ + fn(gr, GA_BUFFER, "const GLOBAL_MEM char*", "s", 0, user); + fn(gr, GA_SSIZE, "TX", "sOff", 0, user); for(k=0;k < gr->nds;k++){ - gr->kArgTypeCodes[i++] = GA_SSIZE; /* sJN */ + fn(gr, GA_SIZE, "TX", "sJ%d", k, user); } if(reduxGenRequiresDst (gr)){ - gr->kArgTypeCodes[i++] = GA_BUFFER;/* d */ - gr->kArgTypeCodes[i++] = GA_SSIZE; /* dOff */ + fn(gr, GA_BUFFER, "GLOBAL_MEM char*", "d", 0, user); + fn(gr, GA_SSIZE, "TX", "dOff", 0, user); for(k=0;k < gr->ndd;k++){ - gr->kArgTypeCodes[i++] = GA_SSIZE; /* dJN */ + fn(gr, GA_SIZE, "TX", "dJ%d", k, user); } } if(reduxGenRequiresDstArg(gr)){ - gr->kArgTypeCodes[i++] = GA_BUFFER;/* a */ - gr->kArgTypeCodes[i++] = GA_SSIZE; /* aOff */ + fn(gr, GA_BUFFER, "GLOBAL_MEM char*", "a", 0, user); + fn(gr, GA_SSIZE, "TX", "aOff", 0, user); for(k=0;k < gr->ndd;k++){ - gr->kArgTypeCodes[i++] = GA_SSIZE; /* aJN */ + fn(gr, GA_SIZE, "TX", "aJ%d", k, user); } } - gr->kArgTypeCodes[i++] = GA_BUFFER;/* w */ + fn(gr, GA_BUFFER, "GLOBAL_MEM char*", "w", 0, user); if(reduxGenKernelRequiresDst (gr)){ - gr->kArgTypeCodes[i++] = GA_SSIZE; /* wdOff */ - gr->kArgTypeCodes[i++] = GA_SSIZE; /* pdOff */ + fn(gr, GA_SSIZE, "TX", "wdOff", 0, user); + fn(gr, GA_SSIZE, "TX", "pdOff", 0, user); } if(reduxGenKernelRequiresDstArg(gr)){ - gr->kArgTypeCodes[i++] = GA_SSIZE; /* waOff */ - gr->kArgTypeCodes[i++] = GA_SSIZE; /* paOff */ + fn(gr, GA_SSIZE, "TX", "waOff", 0, user); + fn(gr, GA_SSIZE, "TX", "paOff", 0, user); } for(k=0;k < gr->log2MaxL;k++){ - gr->kArgTypeCodes[i++] = GA_UINT; /* ibsN */ + fn(gr, GA_UINT, "unsigned", "ibs%d", k, user); } for(k=0;k < gr->log2MaxL;k++){ - gr->kArgTypeCodes[i++] = GA_UINT; /* ibpN */ + fn(gr, GA_UINT, "unsigned", "ibp%d", k, user); } for(k=0;k < gr->log2MaxL && reduxGenRequiresDstArg(gr);k++){ - gr->kArgTypeCodes[i++] = GA_SIZE; /* iblNPDim */ + fn(gr, GA_SIZE, "TX", "ibl%dPDim", k, user); } for(k=0;k < gr->log2MaxL;k++){ - gr->kArgTypeCodes[i++] = GA_SSIZE; /* ibsOffN */ + fn(gr, GA_SSIZE, "TX", "ibsOff%d", k, user); } for(k=0;k < gr->log2MaxL && reduxGenRequiresDst (gr);k++){ - gr->kArgTypeCodes[i++] = GA_SSIZE; /* ibdOffN */ + fn(gr, GA_SSIZE, "TX", "ibdOff%d", k, user); } for(k=0;k < gr->log2MaxL && reduxGenRequiresDstArg(gr);k++){ - gr->kArgTypeCodes[i++] = GA_SSIZE; /* ibaOffN */ + fn(gr, GA_SSIZE, "TX", "ibaOff%d", k, user); } - - return reduxGenSrc(gr); } /** @@ -1769,70 +1810,10 @@ static void reduxGenSrcAppendReduxKernel (GpuReduction* gr){ srcbAppends (&gr->srcGen, "}\n"); } static void reduxGenSrcAppendPrototype (GpuReduction* gr){ - int i; + int i=0; - srcbAppends (&gr->srcGen, "KERNEL void redux(int phase,\n" - " TX U,\n" - " TX V,\n" - " TX B,\n" - " 
unsigned D,\n" - " unsigned H,\n" - " unsigned splitFree,\n" - " unsigned splitReduce,\n"); - srcbBeginList (&gr->srcGen, ",\n", "void"); - for(i=0;i<(int)(gr->ndd+gr->ndr);i++){ - srcbAppendElemf (&gr->srcGen, " TX l%d", i); - } - for(i=gr->ndd;i<(int)(gr->ndd+gr->ndr);i++){ - srcbAppendElemf (&gr->srcGen, " TX l%dPDim", i); - } - srcbAppendElemf (&gr->srcGen, " const GLOBAL_MEM char* s"); - srcbAppendElemf (&gr->srcGen, " TX sOff"); - for(i=0;i<(int)(gr->ndd+gr->ndr);i++){ - srcbAppendElemf (&gr->srcGen, " TX sJ%d", i); - } - if (reduxGenRequiresDst(gr)){ - srcbAppendElemf (&gr->srcGen, " GLOBAL_MEM char* d"); - srcbAppendElemf (&gr->srcGen, " TX dOff"); - for(i=0;i<(int)(gr->ndd);i++){ - srcbAppendElemf(&gr->srcGen, " TX dJ%d", i); - } - } - if (reduxGenRequiresDstArg(gr)){ - srcbAppendElemf (&gr->srcGen, " GLOBAL_MEM char* a"); - srcbAppendElemf (&gr->srcGen, " TX aOff"); - for(i=0;i<(int)(gr->ndd);i++){ - srcbAppendElemf(&gr->srcGen, " TX aJ%d", i); - } - } - srcbAppendElemf (&gr->srcGen, " GLOBAL_MEM char* w"); - if (reduxGenKernelRequiresDst(gr)){ - srcbAppendElemf (&gr->srcGen, " TX wdOff"); - srcbAppendElemf (&gr->srcGen, " TX pdOff"); - } - if (reduxGenKernelRequiresDstArg(gr)){ - srcbAppendElemf (&gr->srcGen, " TX waOff"); - srcbAppendElemf (&gr->srcGen, " TX paOff"); - } - for(i=0;i<(int)(gr->log2MaxL);i++){ - srcbAppendElemf (&gr->srcGen, " unsigned ibs%d", i); - } - for(i=0;i<(int)(gr->log2MaxL);i++){ - srcbAppendElemf (&gr->srcGen, " unsigned ibp%d", i); - } - for(i=0;i<(int)(gr->log2MaxL) && reduxGenRequiresDstArg(gr);i++){ - srcbAppendElemf (&gr->srcGen, " TX ibl%dPDim", i); - } - for(i=0;i<(int)(gr->log2MaxL);i++){ - srcbAppendElemf (&gr->srcGen, " TX ibsOff%d", i); - } - for(i=0;i<(int)(gr->log2MaxL) && reduxGenRequiresDst (gr);i++){ - srcbAppendElemf (&gr->srcGen, " TX ibdOff%d", i); - } - for(i=0;i<(int)(gr->log2MaxL) && reduxGenRequiresDstArg(gr);i++){ - srcbAppendElemf (&gr->srcGen, " TX ibaOff%d", i); - } - srcbEndList (&gr->srcGen); + srcbAppends (&gr->srcGen, "KERNEL void redux("); + reduxGenIterArgs(gr, reduxGenAppendArg, &i); srcbAppends (&gr->srcGen, ")"); } static void reduxGenSrcAppendBlockDecode (GpuReduction* gr){ @@ -2004,56 +1985,43 @@ static void reduxGenSrcAppendBlockDecode (GpuReduction* gr){ " * base pointers to their starting point.\n" " */\n" " \n" + " TX z, h, k;\n" " unsigned Dunit = D/splitFree;\n"); if(gr->ndd > 0){ srcbAppendf(&gr->srcGen, - " TX l%dMul = DIVIDECEIL(l%d, splitFree);\n", + " TX l%dDiv = DIVIDECEIL(l%d, splitFree);\n", gr->ndd-1, gr->ndd-1); } if(gr->ndr > 0){ srcbAppendf(&gr->srcGen, - " TX l%dMul = DIVIDECEIL(l%d, splitReduce);\n", + " TX l%dDiv = DIVIDECEIL(l%d, splitReduce);\n", gr->nds-1, gr->nds-1); } - srcbAppends(&gr->srcGen, " \n"); + srcbAppends(&gr->srcGen, + " \n" + " z = start;\n"); for(i=gr->nds-1;i>=0;i--){ - if(i == gr->nds-1){ + if(i == gr->nds-1 || i == gr->ndd-1){ srcbAppendf(&gr->srcGen, - " TX i%d = start %% l%dMul;\n", - i, i); - + " TX i%d = z %% l%dDiv;z /= l%dDiv;\n", + i, i, i); }else{ srcbAppendf(&gr->srcGen, - " TX i%d = i%d / l%d%s %% l%d%s;\n", - i, i+1, - i+1, - reduxGenAxisMaybeSplit(gr, i+1) ? "Mul" : "", - i, - reduxGenAxisMaybeSplit(gr, i) ? 
"Mul" : ""); + " TX i%d = z %% l%d; z /= l%d;\n", + i, i, i); } } srcbAppends(&gr->srcGen, " \n"); - if(gr->ndd > 0){ - srcbAppendf(&gr->srcGen, - " i%d *= splitFree;\n", - gr->ndd-1); - } - if(gr->ndr > 0){ - srcbAppendf(&gr->srcGen, - " i%d *= splitReduce;\n", - gr->nds-1); - } - srcbAppends(&gr->srcGen, " \n"); for(i=gr->nds-1;i>=0;i--){ if(i == gr->nds-1){ srcbAppendf(&gr->srcGen, - " TX sS%d = (sJ%d ) / splitReduce;\n", + " TX sS%d = sJ%d;\n", i, i); }else{ srcbAppendf(&gr->srcGen, - " TX sS%d = (sJ%d + (TX)l%d*sS%d)%s;\n", - i, i, i+1, i+1, - i == gr->ndd-1 ? " / splitFree" : ""); + " TX sS%d = sJ%d + l%d%s*sS%d;\n", + i, i, i+1, + reduxGenAxisMaybeSplit(gr, i+1) ? "Div" : " ", i+1); } } if (reduxGenRequiresDst(gr)){ @@ -2061,12 +2029,13 @@ static void reduxGenSrcAppendBlockDecode (GpuReduction* gr){ for(i=gr->ndd-1;i>=0;i--){ if(i == gr->ndd-1){ srcbAppendf(&gr->srcGen, - " TX dS%d = (dJ%d ) / splitFree;\n", + " TX dS%d = dJ%d;\n", i, i); }else{ srcbAppendf(&gr->srcGen, - " TX dS%d = (dJ%d + (TX)l%d*dS%d);\n", - i, i, i+1, i+1); + " TX dS%d = dJ%d + l%d%s*dS%d;\n", + i, i, i+1, + reduxGenAxisMaybeSplit(gr, i+1) ? "Div" : " ", i+1); } } } @@ -2075,12 +2044,13 @@ static void reduxGenSrcAppendBlockDecode (GpuReduction* gr){ for(i=gr->ndd-1;i>=0;i--){ if(i == gr->ndd-1){ srcbAppendf(&gr->srcGen, - " TX aS%d = (aJ%d ) / splitFree;\n", + " TX aS%d = aJ%d;\n", i, i); }else{ srcbAppendf(&gr->srcGen, - " TX aS%d = (aJ%d + (TX)l%d*aS%d);\n", - i, i, i+1, i+1); + " TX aS%d = aJ%d + l%d%s*aS%d;\n", + i, i, i+1, + reduxGenAxisMaybeSplit(gr, i+1) ? "Div" : " ", i+1); } } } @@ -2111,6 +2081,17 @@ static void reduxGenSrcAppendBlockDecode (GpuReduction* gr){ srcbAppends(&gr->srcGen, ";\n"); } srcbAppends(&gr->srcGen, " \n"); + if(gr->ndd > 0){ + srcbAppendf(&gr->srcGen, + " i%d *= splitFree;\n", + gr->ndd-1); + } + if(gr->ndr > 0){ + srcbAppendf(&gr->srcGen, + " i%d *= splitReduce;\n", + gr->nds-1); + } + srcbAppends(&gr->srcGen, " \n"); if(reduxGenKernelRequiresDst(gr)){ srcbAppends(&gr->srcGen, " TK* wd = (TK*)(w + wdOff);\n" @@ -2125,10 +2106,7 @@ static void reduxGenSrcAppendBlockDecode (GpuReduction* gr){ " TA* waR = &wa[GDIM_0*D];\n" " TA* pa = (TA*)(SHMEM + paOff);\n"); } - srcbAppends(&gr->srcGen, - " \n" - " TX h, k;\n" - " \n"); + srcbAppends(&gr->srcGen, " \n"); } static void reduxGenSrcAppendThreadDecode (GpuReduction* gr){ int i; @@ -2143,18 +2121,13 @@ static void reduxGenSrcAppendThreadDecode (GpuReduction* gr){ " * argument pointers, argument indices and permute targets.\n" " */\n" " \n" - " unsigned iSplit = LID_0/(LDIM_0/(splitFree*splitReduce));\n"); + " unsigned iSplit = LID_0/(LDIM_0/(splitFree*splitReduce));\n" + " z = LID_0;\n"); for(i=gr->log2MaxL-1;i>=0;i--){ - if(i == gr->log2MaxL-1){ - srcbAppendf(&gr->srcGen, - " int t%d = (unsigned)LID_0 %% ibs%d;\n", - i, i); - }else{ - srcbAppendf(&gr->srcGen, - " int t%d = (unsigned)t%d / ibs%d %% ibs%d;\n", - i, i+1, i+1, i); - } + srcbAppendf(&gr->srcGen, + " int t%d = z %% ibs%d;z /= ibs%d;\n", + i, i, i); } if(reduxGenRequiresDstArg(gr)){ srcbAppends(&gr->srcGen, " TX ti = "); @@ -2172,10 +2145,6 @@ static void reduxGenSrcAppendThreadDecode (GpuReduction* gr){ } srcbEndList(&gr->srcGen); srcbAppends(&gr->srcGen, ";\n"); - - - - srcbAppends(&gr->srcGen, " \n" " sOff += "); srcbBeginList(&gr->srcGen, " + ", "0"); @@ -2561,6 +2530,146 @@ static int reduxGenCleanupMsg (GpuReduction* gr, int ret, return reduxGenCleanup(gr, ret); } +/** + * Count # of arguments as determined by iterator. 
+ */ + +static void reduxGenCountArgs (GpuReduction* gr, + int typecode, + const char* typeName, + const char* baseName, + int num, + void* user){ + (void)typecode; + (void)typeName; + (void)baseName; + (void)num; + (void)user; + + gr->kNumArgs++; +} + +/** + * Record the typecodes in the arguments typecode array. + */ + +static void reduxGenSaveArgTypecodes (GpuReduction* gr, + int typecode, + const char* typeName, + const char* baseName, + int num, + void* user){ + (void)typeName; + (void)baseName; + (void)num; + (void)user; + + gr->kArgTypeCodes[(*(int*)user)++] = typecode; +} + +/** + * Append an argument declaration to prototype. + */ + +static void reduxGenAppendArg (GpuReduction* gr, + int typecode, + const char* typeName, + const char* baseName, + int num, + void* user){ + (void)user; + (void)typecode; + + if((*(int*)user)++ > 0){ + srcbAppends(&gr->srcGen, ",\n "); + } + srcbAppendf(&gr->srcGen, "%-25s ", typeName); + srcbAppendf(&gr->srcGen, baseName, num); +} + +/** + * Marshall argument declaration during invocation. + */ + +static void reduxInvMarshalArg (GpuReduction* gr, + int typecode, + const char* typeName, + const char* baseName, + int k, + void* user){ + redux_ctx* ctx; + int* i; + + (void)typecode; + (void)typeName; + + ctx = (redux_ctx*)(((void**)user)[0]); + i = (int *)(((void**)user)[1]); + + if (strcmp(baseName, "phase") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->phase; + }else if (strcmp(baseName, "U") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->U; + }else if (strcmp(baseName, "V") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->V; + }else if (strcmp(baseName, "B") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->B; + }else if (strcmp(baseName, "D") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->D; + }else if (strcmp(baseName, "H") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->H; + }else if (strcmp(baseName, "splitFree") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->splitFree; + }else if (strcmp(baseName, "splitReduce") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->splitReduce; + }else if (strcmp(baseName, "l%d") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->l[k]; + }else if (strcmp(baseName, "l%dPDim") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->lPDim[k-gr->ndd]; + }else if (strcmp(baseName, "s") == 0){ + ctx->kArgs[(*i)++] = (void*) ctx->flatSrcData; + }else if (strcmp(baseName, "sOff") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->flatSrcOffset; + }else if (strcmp(baseName, "sJ%d") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->sJ[k]; + }else if (strcmp(baseName, "d") == 0){ + ctx->kArgs[(*i)++] = (void*) ctx->flatDstData; + }else if (strcmp(baseName, "dOff") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->flatDstOffset; + }else if (strcmp(baseName, "dJ%d") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->dJ[k]; + }else if (strcmp(baseName, "a") == 0){ + ctx->kArgs[(*i)++] = (void*) ctx->flatDstArgData; + }else if (strcmp(baseName, "aOff") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->flatDstArgOffset; + }else if (strcmp(baseName, "aJ%d") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->aJ[k]; + }else if (strcmp(baseName, "w") == 0){ + ctx->kArgs[(*i)++] = (void*) ctx->w; + }else if (strcmp(baseName, "wdOff") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->wdOff; + }else if (strcmp(baseName, "pdOff") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->pdOff; + }else if (strcmp(baseName, "waOff") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->waOff; + }else if (strcmp(baseName, "paOff") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->paOff; + }else if (strcmp(baseName, "ibs%d") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->ibs[k]; + }else if 
(strcmp(baseName, "ibp%d") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->ibp[k]; + }else if (strcmp(baseName, "ibl%dPDim") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->iblPDim[k]; + }else if (strcmp(baseName, "ibsOff%d") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->ibsOff[k]; + }else if (strcmp(baseName, "ibdOff%d") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->ibdOff[k]; + }else if (strcmp(baseName, "ibaOff%d") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->ibaOff[k]; + } +} + + /** * @brief Estimate the level of parallelism available in the GPU context of * this reduction operator. @@ -2695,7 +2804,7 @@ static size_t reduxGenGetMaxLocalSize (const GpuReduction* gr){ static size_t reduxGenGetSHMEMSize (const GpuReduction* gr, size_t bs){ const gpuarray_type* type; - size_t total = 0; + size_t total = 0, permuteSpace; if(reduxGenKernelRequiresDst(gr)){ type = gpuarray_get_type(gr->accTypeCode); @@ -2708,6 +2817,12 @@ static size_t reduxGenGetSHMEMSize (const GpuReduction* gr, size_t total += bs*type->size; } + /* Ensure space for pointer permute. */ + permuteSpace = gpuarray_get_type(gr->idxTypeCode)->size * bs; + if(total < permuteSpace){ + total = permuteSpace; + } + return total; } @@ -2740,6 +2855,47 @@ static size_t reduxGenGetSHMEMDstArgOff (const GpuReduction* gr, size_t } } +/** + * Get the amount of Workspace memory required. + * + * NOT necessarily the same as amount of SHMEM! The workspace is NOT used for + * intrablock offset permutes, for instance. + */ + +static size_t reduxGenGetWMEMSize (const GpuReduction* gr, size_t bs){ + const gpuarray_type* type; + size_t total = 0; + + if(reduxGenKernelRequiresDst(gr)){ + type = gpuarray_get_type(gr->accTypeCode); + total = DIVIDECEIL(total, type->align)*type->align; + total += bs*type->size; + } + if(reduxGenKernelRequiresDstArg(gr)){ + type = gpuarray_get_type(gr->idxTypeCode); + total = DIVIDECEIL(total, type->align)*type->align; + total += bs*type->size; + } + + return total; +} + +/** + * @brief Get the workspace memory byte offset for dst. + */ + +static size_t reduxGenGetWMEMDstOff (const GpuReduction* gr, size_t bs){ + return reduxGenGetSHMEMDstOff(gr, bs); +} + +/** + * @brief Get the workspace memory byte offset for dstArg. + */ + +static size_t reduxGenGetWMEMDstArgOff (const GpuReduction* gr, size_t bs){ + return reduxGenGetSHMEMDstArgOff(gr, bs); +} + /** * @brief Initialize the context. 
* @@ -3039,7 +3195,7 @@ static int reduxInvFlattenSource (redux_ctx* ctx){ static int reduxInvComputeKArgs (redux_ctx* ctx){ axis_desc* axis, *prevAxis; size_t target, aL, aLS; - int i, j; + int i, j, k, haveSplitFreeAxis, haveSplitReducedAxis; /** @@ -3086,9 +3242,6 @@ static int reduxInvComputeKArgs (redux_ctx* ctx){ for(i=0;igr->nds;i++){ ctx->l[i] = 1; } - for(i=0;igr->ndr;i++){ - ctx->lPDim[i] = 1; - } for(i=0;igr->log2MaxL;i++){ ctx->ibs[i] = 1; } @@ -3117,8 +3270,9 @@ static int reduxInvComputeKArgs (redux_ctx* ctx){ if(target/ctx->bs >= 2){ aLS = target/ctx->bs; ctx->bs *= aLS; - axisMarkIntraBlock(axis, i++, aLS); + axisMarkIntraBlock(axis, i, aLS); ctx->xdSplit = axis; + i++; } break; } @@ -3172,7 +3326,7 @@ static int reduxInvComputeKArgs (redux_ctx* ctx){ axisSetPDim(axis, 1); }else{ prevAxis = reduxInvGetSrcSortAxis(ctx, i-1); - axisSetPDim(prevAxis, axisGetPDim(axis)*axisGetLen(prevAxis)); + axisSetPDim(axis, axisGetPDim(prevAxis)*axisGetLen(prevAxis)); } } } @@ -3199,73 +3353,134 @@ static int reduxInvComputeKArgs (redux_ctx* ctx){ axisSetIBP(axis, 1); }else{ prevAxis = reduxInvGetSrcSortAxis(ctx, i-1); - axisSetIBP(axis, axisGetIBP(prevAxis)*axisGetLen(prevAxis)); + axisSetIBP(axis, axisGetIBP(prevAxis)*axisGetIntraLen(prevAxis)); } } } /** - * STEP 6. Place the axes in final loop order and perform final placement - * of: - * lN, lPDim, sJN, dJN, aJN, + * STEP 6. Place the intra axis arguments + * * ibs, ibp, iblPDim, ibsOff, ibdOff, ibaOff + * + * For this we need the axes in final order of insertion. */ reduxSortAxisPtrsBy(ctx->xdSrcPtrs, ctx->xdSrc, ctx->ndfs, - reduxSortPtrFinalOrder); - for(i=0,j=0;indfs;i++){ + reduxSortPtrInsertFinalOrder); + for(i=0;indib;i++){ + axis = reduxInvGetSrcSortAxis(ctx, i); + + ctx->ibs [i] = axisGetIntraLen (axis); + ctx->ibp [i] = axisGetIBP (axis); + ctx->iblPDim[i] = axisGetPDim (axis); + ctx->ibsOff [i] = axisGetSrcStride (axis); + ctx->ibdOff [i] = axisGetDstStride (axis); + ctx->ibaOff [i] = axisGetDstArgStride(axis); + } + + /** + * STEP 7. Place the inter axis arguments + * + * lN, lNPDim, sJN, dJN, aJN + * + * , where N in [0, ctx->gr->ndd) are free axes, + * N in [ctx->gr->ndd, ctx->gr->nds) are reduced axes, + * and ctx->xdSrcPtr[...] are sorted in the reverse of that order for + * insertion, and excludes any split axis. + * + * How precisely the insertion is done depends closely on whether there is + * a split axis and if so whether it is free or reduced. + * + * - If there is a split axis and it is free, then it should be inserted as + * the first free axis. Its jumps should be + * sJN = -sSM*intrainterLenM + sSN*splitFree + * dJN = -dSM*intrainterLenM + dSN*splitFree + * aJN = -aSM*intrainterLenM + aSN*splitFree + * - If there is a split axis and it is reduced, then it should be inserted + * as the first reduced axis. Its jump should be + * sJN = -sSM*intrainterLenM + sSN*splitReduced + * - If there is no split axis, proceed normally in filling the axes. + */ + + haveSplitFreeAxis = ctx->xdSplit && !axisIsReduced(ctx->xdSplit); + haveSplitReducedAxis = ctx->xdSplit && axisIsReduced(ctx->xdSplit); + + /* If we have a reduced split axis, insert it before any other reduced axis. 
*/ + j = ctx->gr->nds-1; + k = ctx->gr->ndr-1; + if(haveSplitReducedAxis && k>=0){ + ctx->l [j] = axisGetLen (ctx->xdSplit); + ctx->lPDim [k] = axisGetPDim (ctx->xdSplit); + ctx->sJ [j] += (ssize_t)axisGetSrcStride (ctx->xdSplit)* + (ssize_t)axisGetIntraLen (ctx->xdSplit); + if(j>0){ + ctx->sJ [j-1] -= (ssize_t)axisGetSrcStride (ctx->xdSplit)* + (ssize_t)axisGetIntraInterLen(ctx->xdSplit); + } + j--; + k--; + } + + /* Insert rest of reduced axes. */ + for(;indfs && k>=0;i++,j--,k--){ axis = reduxInvGetSrcSortAxis(ctx, i); + if(!axisIsReduced(axis)){ + break; + } - if (axisIsSplit(axis) && !axisIsReduced(axis)){ - /* Split Free Axis? */ - ctx->ibs [ 0] = axisGetIntraLen(axis); - ctx->ibp [ 0] = axisGetIntraLen(axis); - ctx->iblPDim[ 0] = axisGetIntraLen(axis); - ctx->ibsOff [ 0] = axisGetSrcStride(axis); - ctx->ibdOff [ 0] = axisGetDstStride(axis); - ctx->ibaOff [ 0] = axisGetDstArgStride(axis); - - ctx->l [ctx->gr->ndd-1] = axisGetInterLen(axis); - ctx->lPDim [ctx->gr->ndd-1] = axisGetPDim (axis); - ctx->sJ [ctx->gr->ndd-1] = 0; - ctx->dJ [ctx->gr->ndd-1] = 0; - ctx->aJ [ctx->gr->ndd-1] = 0; - }else if (axisIsSplit(axis) && axisIsReduced(axis)){ - /* Split Reduced Axis? */ - ctx->ibs [ 0] = axisGetIntraLen(axis); - ctx->ibp [ 0] = axisGetIntraLen(axis); - ctx->iblPDim[ 0] = axisGetIntraLen(axis); - ctx->ibsOff [ 0] = axisGetSrcStride(axis); - ctx->ibdOff [ 0] = axisGetDstStride(axis); - ctx->ibaOff [ 0] = axisGetDstArgStride(axis); - - ctx->l [ctx->gr->nds-1] = axisGetInterLen(axis); - ctx->lPDim [ctx->gr->nds-1] = axisGetPDim (axis); - ctx->sJ [ctx->gr->nds-1] = 0; - ctx->dJ [ctx->gr->nds-1] = 0; - ctx->aJ [ctx->gr->nds-1] = 0; - }else if (axisIsInter(axis) && !axisIsReduced(axis)){ - /* Inter Free Axis? */ - ctx->l [ j] = axisGetInterLen(axis); - ctx->lPDim [ j] = axisGetPDim (axis); - ctx->sJ [ j] = 0; - ctx->dJ [ j] = 0; - ctx->aJ [ j] = 0; - }else if (axisIsInter(axis) && axisIsReduced(axis)){ - /* Inter Reduced Axis? */ - ctx->l [ j] = axisGetInterLen(axis); - ctx->lPDim [ j] = axisGetPDim (axis); - ctx->sJ [ j] = 0; - ctx->dJ [ j] = 0; - ctx->aJ [ j] = 0; - }else{ - /* Intra Axis? */ - ctx->ibs [ 0] = axisGetIntraLen(axis); - ctx->ibp [ 0] = axisGetIntraLen(axis); - ctx->iblPDim[ 0] = axisGetIntraLen(axis); - ctx->ibsOff [ 0] = axisGetSrcStride(axis); - ctx->ibdOff [ 0] = axisGetDstStride(axis); - ctx->ibaOff [ 0] = axisGetDstArgStride(axis); + ctx->l [j] = axisGetLen (axis); + ctx->lPDim [k] = axisGetPDim (axis); + ctx->sJ [j] += (ssize_t)axisGetSrcStride (axis)* + (ssize_t)axisGetIntraLen (axis); + if(j>0){ + ctx->sJ [j-1] -= (ssize_t)axisGetSrcStride (axis)* + (ssize_t)axisGetIntraInterLen(axis); + } + } + + /* If we have a free split axis, insert it before any other free axis. */ + k = ctx->gr->ndd-1; + if(haveSplitFreeAxis && k>=0){ + ctx->l [k] = axisGetLen (ctx->xdSplit); + ctx->sJ [k] += (ssize_t)axisGetSrcStride (ctx->xdSplit)* + (ssize_t)axisGetIntraLen (ctx->xdSplit); + ctx->dJ [k] += (ssize_t)axisGetDstStride (ctx->xdSplit)* + (ssize_t)axisGetIntraLen (ctx->xdSplit); + ctx->aJ [k] += (ssize_t)axisGetDstArgStride (ctx->xdSplit)* + (ssize_t)axisGetIntraLen (ctx->xdSplit); + if(k>0){ + ctx->sJ [k-1] -= (ssize_t)axisGetSrcStride (ctx->xdSplit)* + (ssize_t)axisGetIntraInterLen(ctx->xdSplit); + ctx->dJ [k-1] -= (ssize_t)axisGetDstStride (ctx->xdSplit)* + (ssize_t)axisGetIntraInterLen(ctx->xdSplit); + ctx->aJ [k-1] -= (ssize_t)axisGetDstArgStride (ctx->xdSplit)* + (ssize_t)axisGetIntraInterLen(ctx->xdSplit); + } + k--; + } + + /* Insert rest of free axes. 
*/ + for(;indfs && k>=0;i++,k--){ + axis = reduxInvGetSrcSortAxis(ctx, i); + if(axisIsReduced(axis)){ + break; + } + + ctx->l [k] = axisGetLen (axis); + ctx->sJ [k] += (ssize_t)axisGetSrcStride (axis)* + (ssize_t)axisGetIntraLen (axis); + ctx->dJ [k] += (ssize_t)axisGetDstStride (axis)* + (ssize_t)axisGetIntraLen (axis); + ctx->aJ [k] += (ssize_t)axisGetDstArgStride (axis)* + (ssize_t)axisGetIntraLen (axis); + if(k>0){ + ctx->sJ [k-1] -= (ssize_t)axisGetSrcStride (axis)* + (ssize_t)axisGetIntraInterLen(axis); + ctx->dJ [k-1] -= (ssize_t)axisGetDstStride (axis)* + (ssize_t)axisGetIntraInterLen(axis); + ctx->aJ [k-1] -= (ssize_t)axisGetDstArgStride (axis)* + (ssize_t)axisGetIntraInterLen(axis); } } @@ -3421,9 +3636,9 @@ static int reduxInvSchedule (redux_ctx* ctx){ * Allocate required workspace. */ - ctx->wdOff = reduxGenGetSHMEMDstOff (ctx->gr, 2*ctx->gs*ctx->D); - ctx->waOff = reduxGenGetSHMEMDstArgOff(ctx->gr, 2*ctx->gs*ctx->D); - WSPACESIZE = reduxGenGetSHMEMSize (ctx->gr, 2*ctx->gs*ctx->D); + ctx->wdOff = reduxGenGetWMEMDstOff (ctx->gr, 2*ctx->gs*ctx->D); + ctx->waOff = reduxGenGetWMEMDstArgOff(ctx->gr, 2*ctx->gs*ctx->D); + WSPACESIZE = reduxGenGetWMEMSize (ctx->gr, 2*ctx->gs*ctx->D); ctx->w = gpudata_alloc(ctx->gr->gpuCtx, WSPACESIZE, 0, flags, 0); if(!ctx->w){ return reduxInvCleanupMsg(ctx, GA_MEMORY_ERROR, @@ -3439,73 +3654,14 @@ static int reduxInvSchedule (redux_ctx* ctx){ */ static int reduxInvoke (redux_ctx* ctx){ - int ret, i, k; + int ret, i=0; + void* ptrs[2] = {ctx, &i}; /** * Argument Marshalling. */ - i = 0; - ctx->kArgs[i++] = (void*)&ctx->phase; - ctx->kArgs[i++] = (void*)&ctx->U; - ctx->kArgs[i++] = (void*)&ctx->V; - ctx->kArgs[i++] = (void*)&ctx->B; - ctx->kArgs[i++] = (void*)&ctx->D; - ctx->kArgs[i++] = (void*)&ctx->H; - ctx->kArgs[i++] = (void*)&ctx->splitFree; - ctx->kArgs[i++] = (void*)&ctx->splitReduce; - for(k=0;k < ctx->gr->nds;k++){ - ctx->kArgs[i++] = (void*)&ctx->l[k]; - } - for(k=0;k < ctx->gr->ndr && reduxInvRequiresDstArg(ctx);k++){ - ctx->kArgs[i++] = (void*)&ctx->lPDim[k]; - } - ctx->kArgs[i++] = (void*) ctx->flatSrcData; - ctx->kArgs[i++] = (void*)&ctx->flatSrcOffset; - for(k=0;k < ctx->gr->nds;k++){ - ctx->kArgs[i++] = (void*)&ctx->sJ[k]; - } - if(reduxInvRequiresDst (ctx)){ - ctx->kArgs[i++] = (void*) ctx->flatDstData; - ctx->kArgs[i++] = (void*)&ctx->flatDstOffset; - for(k=0;k < ctx->gr->ndd;k++){ - ctx->kArgs[i++] = (void*)&ctx->dJ[k]; - } - } - if(reduxInvRequiresDstArg(ctx)){ - ctx->kArgs[i++] = (void*) ctx->flatDstArgData; - ctx->kArgs[i++] = (void*)&ctx->flatDstArgOffset; - for(k=0;k < ctx->gr->ndd;k++){ - ctx->kArgs[i++] = (void*)&ctx->aJ[k]; - } - } - ctx->kArgs[i++] = (void*) ctx->w; - if(reduxInvKernelRequiresDst (ctx)){ - ctx->kArgs[i++] = (void*)&ctx->wdOff; - ctx->kArgs[i++] = (void*)&ctx->pdOff; - } - if(reduxInvKernelRequiresDstArg(ctx)){ - ctx->kArgs[i++] = (void*)&ctx->waOff; - ctx->kArgs[i++] = (void*)&ctx->paOff; - } - for(k=0;k < ctx->gr->log2MaxL;k++){ - ctx->kArgs[i++] = (void*)&ctx->ibs[k]; - } - for(k=0;k < ctx->gr->log2MaxL;k++){ - ctx->kArgs[i++] = (void*)&ctx->ibp[k]; - } - for(k=0;k < ctx->gr->log2MaxL && reduxInvRequiresDstArg(ctx);k++){ - ctx->kArgs[i++] = (void*)&ctx->iblPDim[k]; - } - for(k=0;k < ctx->gr->log2MaxL;k++){ - ctx->kArgs[i++] = (void*)&ctx->ibsOff[k]; - } - for(k=0;k < ctx->gr->log2MaxL && reduxInvRequiresDst (ctx);k++){ - ctx->kArgs[i++] = (void*)&ctx->ibdOff[k]; - } - for(k=0;k < ctx->gr->log2MaxL && reduxInvRequiresDstArg(ctx);k++){ - ctx->kArgs[i++] = (void*)&ctx->ibaOff[k]; - } + 
reduxGenIterArgs(ctx->gr, reduxInvMarshalArg, ptrs); diff --git a/tests/check_reduction.c b/tests/check_reduction.c index 567f384aaf..a961bd8f40 100644 --- a/tests/check_reduction.c +++ b/tests/check_reduction.c @@ -3909,11 +3909,11 @@ Suite *get_suite(void) { TCase *tc = tcase_create("basic"); tcase_add_checked_fixture(tc, setup, teardown); tcase_set_timeout(tc, 120.0); - - tcase_add_test(tc, test_maxandargmax_veryhighrank); - tcase_add_test(tc, test_maxandargmax_alldimsreduced); + tcase_add_test(tc, test_maxandargmax_reduction); tcase_add_test(tc, test_maxandargmax_idxtranspose); + tcase_add_test(tc, test_maxandargmax_veryhighrank); + tcase_add_test(tc, test_maxandargmax_alldimsreduced); tcase_add_test(tc, test_minandargmin_reduction); tcase_add_test(tc, test_minandargmin_veryhighrank); From 8debf2d2c279f5552fd05a320032dbdfa6a6b12f Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Tue, 4 Jul 2017 01:05:45 -0400 Subject: [PATCH 18/34] Really dumb division bug fixed. All tests now pass except summation, which fails to meet tolerance. --- src/gpuarray_reduction.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/gpuarray_reduction.c b/src/gpuarray_reduction.c index f8cd15abfb..af8051cddb 100644 --- a/src/gpuarray_reduction.c +++ b/src/gpuarray_reduction.c @@ -2395,10 +2395,11 @@ static void reduxGenSrcAppendPhase1 (GpuReduction* gr){ " if(misalignL && doFinish && LID_0 < D){\n" " SETREDUXSTATE(accV, accI, wdL[(GID_0+0)*D+LID_0], waL[(GID_0+0)*D+LID_0]);\n" " \n" - " for(k=-1; /* Starting with the first block to our left... */\n" - " (start +0)/B == /* Is our write target the same as that of */\n" - " (start+k*V+V-1)/B; /* the target k blocks to our left? */\n" - " k--){ /* Try moving one more to the left. */\n" + " /* vvv-- NOTA BENE: The +B hack is REALLY NECESSARY, since C division is rounding to zero: (-1)/B == (B-1)/B for B>1. */\n" + " for(k=-1; /* Starting with the first block to our left... */\n" + " (start +B)/B == /* Is our write target the same as that of */\n" + " (start+k*V+V-1+B)/B; /* the target k blocks to our left? */\n" + " k--){ /* Try moving one more to the left. */\n" " REDUX(accV, accI, wdR[(GID_0+k)*D+LID_0], waR[(GID_0+k)*D+LID_0]);\n" " }\n" " \n"); From 5f4ec4ed0f66616cb2072edf915078411e9305f2 Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Tue, 4 Jul 2017 01:24:14 -0400 Subject: [PATCH 19/34] Fix summation tests: - Subtract 0.5 from random numbers, so they sum to 0 in expectation. - Increase tolerance from 1e-5 to 1e-4 just for summation. --- tests/check_reduction.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/check_reduction.c b/tests/check_reduction.c index a961bd8f40..b5e9ba604d 100644 --- a/tests/check_reduction.c +++ b/tests/check_reduction.c @@ -1814,7 +1814,7 @@ START_TEST(test_sum_reduction){ size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; const int reduxList[] = {0,2}; - const float TOL = 1e-5; + const float TOL = 1e-4; float* pS = calloc(1, sizeof(*pS) * dims[0]*dims[1]*dims[2]); float* pD = calloc(1, sizeof(*pD) * dims[1] ); @@ -1828,7 +1828,7 @@ START_TEST(test_sum_reduction){ */ for(i=0;i Date: Tue, 4 Jul 2017 04:26:10 -0400 Subject: [PATCH 20/34] Add huge sum-reduction and pepper kernel with `restrict` keyword, it doubles the speed. 
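
`restrict` matters here because every tensor pointer reaching the kernel is a
plain `GLOBAL_MEM char*`: without the qualifier the compiler must assume that
the destination and workspace pointers may alias the source, and it re-reads
memory around every store. A minimal sketch of the effect, assuming a
C99/CUDA-style compiler (the function and variable names below are
illustrative only; they are not code from this patch):

    /* With unqualified pointers, the store through *d may alias s[i], so the
     * compiler must re-load s[i] and *d on every iteration.                  */
    void sum_maybe_aliased (float* d, const float* s, int n){
        int i;
        for (i = 0; i < n; i++){
            *d += s[i];
        }
    }

    /* With restrict-qualified pointers the compiler may keep the running sum
     * in a register and write it back to *d only once.                       */
    void sum_never_aliased (float* restrict d, const float* restrict s, int n){
        int i;
        for (i = 0; i < n; i++){
            *d += s[i];
        }
    }

On CUDA the qualifier is spelled __restrict__, hence the new
`#define restrict __restrict__` in cluda_cuda.h.
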
--- src/cluda_cuda.h | 1 + src/gpuarray_reduction.c | 47 +++++++++++-------------- tests/check_reduction.c | 74 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 95 insertions(+), 27 deletions(-) diff --git a/src/cluda_cuda.h b/src/cluda_cuda.h index ed20a8eb1c..e0ad08b90d 100644 --- a/src/cluda_cuda.h +++ b/src/cluda_cuda.h @@ -60,6 +60,7 @@ #define GA_DECL_SHARED_PARAM(type, name) #define GA_DECL_SHARED_BODY(type, name) extern __shared__ type name[]; #define GA_WARP_SIZE warpSize +#define restrict __restrict__ struct ga_half { ga_ushort data; diff --git a/src/gpuarray_reduction.c b/src/gpuarray_reduction.c index af8051cddb..9b0d74c9dc 100644 --- a/src/gpuarray_reduction.c +++ b/src/gpuarray_reduction.c @@ -417,7 +417,6 @@ static int reduxInvCleanupMsg (redux_ctx* ctx, int r static size_t reduxInvEstimateParallelism (const redux_ctx* ctx); static int reduxInvRequiresDst (const redux_ctx* ctx); static int reduxInvRequiresDstArg (const redux_ctx* ctx); -static int reduxInvKernelRequiresDst (const redux_ctx* ctx); static unsigned reduxInvGetSplitFree (const redux_ctx* ctx); static unsigned reduxInvGetSplitReduce (const redux_ctx* ctx); static axis_desc* reduxInvGetSrcAxis (const redux_ctx* ctx, int i); @@ -1145,12 +1144,6 @@ static int reduxInvRequiresDst (const redux_ctx* ctx){ static int reduxInvRequiresDstArg (const redux_ctx* ctx){ return reduxGenRequiresDstArg(ctx->gr); } -static int reduxInvKernelRequiresDst (const redux_ctx* ctx){ - return reduxGenKernelRequiresDst(ctx->gr); -} -static int reduxInvKernelRequiresDstArg (const redux_ctx* ctx){ - return reduxGenKernelRequiresDstArg(ctx->gr); -} static unsigned reduxInvGetSplitFree (const redux_ctx* ctx){ if(ctx->xdSplit && !axisIsReduced(ctx->xdSplit)){ return axisGetIntraLen(ctx->xdSplit); @@ -1513,26 +1506,26 @@ static void reduxGenIterArgs (GpuReduction* gr, for(k=gr->ndd;k < gr->nds && reduxGenRequiresDstArg(gr);k++){ fn(gr, GA_SIZE, "TX", "l%dPDim", k, user); } - fn(gr, GA_BUFFER, "const GLOBAL_MEM char*", "s", 0, user); + fn(gr, GA_BUFFER, "const GLOBAL_MEM char* restrict", "s", 0, user); fn(gr, GA_SSIZE, "TX", "sOff", 0, user); for(k=0;k < gr->nds;k++){ fn(gr, GA_SIZE, "TX", "sJ%d", k, user); } if(reduxGenRequiresDst (gr)){ - fn(gr, GA_BUFFER, "GLOBAL_MEM char*", "d", 0, user); + fn(gr, GA_BUFFER, "GLOBAL_MEM char* restrict", "d", 0, user); fn(gr, GA_SSIZE, "TX", "dOff", 0, user); for(k=0;k < gr->ndd;k++){ fn(gr, GA_SIZE, "TX", "dJ%d", k, user); } } if(reduxGenRequiresDstArg(gr)){ - fn(gr, GA_BUFFER, "GLOBAL_MEM char*", "a", 0, user); + fn(gr, GA_BUFFER, "GLOBAL_MEM char* restrict", "a", 0, user); fn(gr, GA_SSIZE, "TX", "aOff", 0, user); for(k=0;k < gr->ndd;k++){ fn(gr, GA_SIZE, "TX", "aJ%d", k, user); } } - fn(gr, GA_BUFFER, "GLOBAL_MEM char*", "w", 0, user); + fn(gr, GA_BUFFER, "GLOBAL_MEM char* restrict", "w", 0, user); if(reduxGenKernelRequiresDst (gr)){ fn(gr, GA_SSIZE, "TX", "wdOff", 0, user); fn(gr, GA_SSIZE, "TX", "pdOff", 0, user); @@ -1633,9 +1626,9 @@ static void reduxGenSrcAppendMacroDefs (GpuReduction* gr){ */ if (gr->srcTypeCode == GA_HALF && gr->accTypeCode == GA_FLOAT){ - srcbAppends(&gr->srcGen, "#define LOADS(v, p) do{(v) = (TK)load_half((TS*)(p));}while(0)\n"); + srcbAppends(&gr->srcGen, "#define LOADS(v, p) do{(v) = (TK)load_half((const TS* restrict)(p));}while(0)\n"); }else{ - srcbAppends(&gr->srcGen, "#define LOADS(v, p) do{(v) = (TK)*(TS*)(p);}while(0)\n"); + srcbAppends(&gr->srcGen, "#define LOADS(v, p) do{(v) = (TK)*(const TS* restrict)(p);}while(0)\n"); } @@ -1746,9 +1739,9 @@ static void 
reduxGenSrcAppendMacroDefs (GpuReduction* gr){ if (reduxGenRequiresDst(gr)){ if (gr->dstTypeCode == GA_HALF && gr->accTypeCode == GA_FLOAT){ - srcbAppends(&gr->srcGen, "#define STORED(p, v) do{store_half((TD*)(p), (v));}while(0)\n"); + srcbAppends(&gr->srcGen, "#define STORED(p, v) do{store_half((TD* restrict)(p), (v));}while(0)\n"); }else{ - srcbAppends(&gr->srcGen, "#define STORED(p, v) do{*(TD*)(p) = (v);}while(0)\n"); + srcbAppends(&gr->srcGen, "#define STORED(p, v) do{*(TD* restrict)(p) = (v);}while(0)\n"); } }else{ srcbAppends(&gr->srcGen, "#define STORED(p, v) do{}while(0)\n"); @@ -1762,7 +1755,7 @@ static void reduxGenSrcAppendMacroDefs (GpuReduction* gr){ */ if (reduxGenRequiresDstArg(gr)){ - srcbAppends(&gr->srcGen, "#define STOREA(p, v) do{*(TA*)(p) = (v);}while(0)\n"); + srcbAppends(&gr->srcGen, "#define STOREA(p, v) do{*(TA* restrict)(p) = (v);}while(0)\n"); }else{ srcbAppends(&gr->srcGen, "#define STOREA(p, v) do{}while(0)\n"); } @@ -2094,17 +2087,17 @@ static void reduxGenSrcAppendBlockDecode (GpuReduction* gr){ srcbAppends(&gr->srcGen, " \n"); if(reduxGenKernelRequiresDst(gr)){ srcbAppends(&gr->srcGen, - " TK* wd = (TK*)(w + wdOff);\n" - " TK* wdL = &wd[0];\n" - " TK* wdR = &wd[GDIM_0*D];\n" - " TK* pd = (TK*)(SHMEM + pdOff);\n"); + " TK* restrict wd = (TK* restrict)(w + wdOff);\n" + " TK* restrict wdL = &wd[0];\n" + " TK* restrict wdR = &wd[GDIM_0*D];\n" + " TK* restrict pd = (TK* restrict)(SHMEM + pdOff);\n"); } if(reduxGenKernelRequiresDstArg(gr)){ srcbAppends(&gr->srcGen, - " TA* wa = (TA*)(w + waOff);\n" - " TA* waL = &wa[0];\n" - " TA* waR = &wa[GDIM_0*D];\n" - " TA* pa = (TA*)(SHMEM + paOff);\n"); + " TA* restrict wa = (TA* restrict)(w + waOff);\n" + " TA* restrict waL = &wa[0];\n" + " TA* restrict waR = &wa[GDIM_0*D];\n" + " TA* restrict pa = (TA* restrict)(SHMEM + paOff);\n"); } srcbAppends(&gr->srcGen, " \n"); } @@ -2182,12 +2175,12 @@ static void reduxGenSrcAppendThreadDecode (GpuReduction* gr){ " local_barrier();\n"); } srcbAppends(&gr->srcGen, " \n" - " const char* ts = s + sOff;\n"); + " const char* restrict ts = s + sOff;\n"); if(reduxGenRequiresDst(gr)){ - srcbAppends(&gr->srcGen, " char* td = d + dOff;\n"); + srcbAppends(&gr->srcGen, " char* restrict td = d + dOff;\n"); } if(reduxGenRequiresDstArg(gr)){ - srcbAppends(&gr->srcGen, " char* ta = a + aOff;\n"); + srcbAppends(&gr->srcGen, " char* restrict ta = a + aOff;\n"); } srcbAppends(&gr->srcGen, " \n" " \n"); diff --git a/tests/check_reduction.c b/tests/check_reduction.c index b5e9ba604d..12a99ded30 100644 --- a/tests/check_reduction.c +++ b/tests/check_reduction.c @@ -2054,6 +2054,79 @@ START_TEST(test_sum_alldimsreduced){ GpuArray_clear(&gaD); }END_TEST +START_TEST(test_sum_huge){ + pcgSeed(1); + + /** + * We test here a reduction of a huge 1D tensor on all dimensions. + */ + + size_t i; + size_t dims[1] = {100000000}; + size_t prodDims = dims[0]; + const int reduxList[] = {0}; + const float TOL = 1e-2; + + float* pS = calloc(1, sizeof(*pS) * dims[0]); + float* pD = calloc(1, sizeof(*pD)); + + ck_assert_ptr_ne(pS, NULL); + ck_assert_ptr_ne(pD, NULL); + + + /** + * Initialize source data. + */ + + for(i=0;i Date: Wed, 12 Jul 2017 12:50:32 -0400 Subject: [PATCH 21/34] Massive Refactor into effectively a lattice engine. 
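
The public API after this refactor compiles the operator once and reuses it
across calls. A rough usage sketch under the new names (d0/d1/s0 and the
surrounding tensor setup are assumed to be provided by the caller; the helper
name and the include paths are illustrative, not taken from this patch):

    #include <gpuarray/array.h>
    #include <gpuarray/reduction.h>

    /* Max-and-argmax over axes 0 and 2 of a 3D float tensor s0, writing the
     * maxima to d0 and the flattened argument indices to d1.                 */
    static int maxandargmax_02 (gpucontext* gpuCtx,
                                GpuArray* d0, GpuArray* d1, const GpuArray* s0){
        GpuReduction* gr = NULL;
        const int     reduxList[] = {0, 2};
        int ret;

        ret = GpuReduction_new (&gr, gpuCtx, GA_REDUCE_MAXANDARGMAX,
                                1,        /* ndf: free (destination) axes */
                                2,        /* ndr: reduction axes          */
                                GA_FLOAT, /* s0TypeCode                   */
                                0);       /* flags                        */
        if (ret != GA_NO_ERROR){
            return ret;
        }
        ret = GpuReduction_call(gr, d0, d1, s0, 2, reduxList, 0);
        GpuReduction_free(gr);
        return ret;
    }
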
--- src/gpuarray/reduction.h | 69 +- src/gpuarray_reduction.c | 4198 ++++++++++++++++++++------------------ tests/check_reduction.c | 128 +- 3 files changed, 2409 insertions(+), 1986 deletions(-) diff --git a/src/gpuarray/reduction.h b/src/gpuarray/reduction.h index c8508b841d..77043daa22 100644 --- a/src/gpuarray/reduction.h +++ b/src/gpuarray/reduction.h @@ -46,6 +46,8 @@ typedef enum _ga_reduce_op { GA_REDUCE_XOR, /* ^ */ GA_REDUCE_ALL, /* &&/all() */ GA_REDUCE_ANY, /* ||/any() */ + + GA_REDUCE_ENDSUPPORTED /* Must be last element in enum */ } ga_reduce_op; @@ -57,29 +59,31 @@ typedef enum _ga_reduce_op { * @param [out] gr The reduction operator. * @param [in] gpuCtx The GPU context. * @param [in] op The reduction operation to perform. - * @param [in] ndf The minimum number of destination dimensions to support. - * @param [in] ndr The minimum number of reduction dimensions to support. - * @param [in] srcTypeCode The data type of the source operand. + * @param [in] ndf The minimum number of free (destination) dimensions to support. + * @param [in] ndr The minimum number of reduction (source) dimensions to support. + * @param [in] s0TypeCode The data type of the source operand. * @param [in] flags Reduction operator creation flags. Currently must be * set to 0. * - * @return GA_NO_ERROR if the operator was created successfully, or a non-zero - * error code otherwise. + * @return GA_NO_ERROR if the operator was created successfully + * GA_INVALID_ERROR if grOut is NULL, or some other argument was invalid + * GA_NO_MEMORY if memory allocation failed anytime during creation + * or other non-zero error codes otherwise. */ -GPUARRAY_PUBLIC int GpuReduction_new (GpuReduction** grOut, - gpucontext* gpuCtx, - ga_reduce_op op, - unsigned ndf, - unsigned ndr, - int srcTypeCode, - int flags); +GPUARRAY_PUBLIC int GpuReduction_new (GpuReduction** grOut, + gpucontext* gpuCtx, + ga_reduce_op op, + unsigned ndf, + unsigned ndr, + int s0TypeCode, + int flags); /** * @brief Deallocate an operator allocated by GpuReduction_new(). */ -GPUARRAY_PUBLIC void GpuReduction_free (GpuReduction* gr); +GPUARRAY_PUBLIC void GpuReduction_free (GpuReduction* gr); /** * @brief Invoke an operator allocated by GpuReduction_new() on a source tensor. @@ -91,28 +95,27 @@ GPUARRAY_PUBLIC void GpuReduction_free (GpuReduction* gr); * destination. * * @param [in] gr The reduction operator. - * @param [out] dst The destination tensor. Has the same type as the source. - * @param [out] dstArg For argument of minima/maxima operations. Has type int64. - * @param [in] src The source tensor. + * @param [out] d0 The destination tensor. + * @param [out] d1 The second destination tensor, for argmin/argmax operations. + * @param [in] s0 The source tensor. * @param [in] reduxLen The number of axes reduced. Must be >= 1 and - * <= src->nd. + * <= s0->nd. * @param [in] reduxList A list of integers of length reduxLen, indicating * the axes to be reduced. The order of the axes - * matters for dstArg index calculations (GpuArray_argmin, - * GpuArray_argmax, GpuArray_minandargmin, - * GpuArray_maxandargmax). All entries in the list must be + * matters for dstArg index calculations (argmin, argmax, + * minandargmin, maxandargmax). All entries in the list must be * unique, >= 0 and < src->nd. 
* - * For example, if a 5D-tensor is max-reduced with an axis - * list of [3,4,1], then reduxLen shall be 3, and the + * For example, if a 5D-tensor is maxandargmax-reduced with an + * axis list of [3,4,1], then reduxLen shall be 3, and the * index calculation in every point shall take the form * - * dstArgmax[i0,i2] = i3 * src.shape[4] * src.shape[1] + - * i4 * src.shape[1] + - * i1 + * d1[i0,i2] = i3 * s0.shape[4] * s0.shape[1] + + * i4 * s0.shape[1] + + * i1 * * where (i3,i4,i1) are the coordinates of the maximum- - * valued element within subtensor [i0,:,i2,:,:] of src. + * valued element within subtensor [i0,:,i2,:,:] of s0. * @param [in] flags Reduction operator invocation flags. Currently must be * set to 0. * @@ -120,13 +123,13 @@ GPUARRAY_PUBLIC void GpuReduction_free (GpuReduction* gr); * error code otherwise. */ -GPUARRAY_PUBLIC int GpuReduction_call (GpuReduction* gr, - GpuArray* dst, - GpuArray* dstArg, - const GpuArray* src, - unsigned reduxLen, - const int* reduxList, - int flags); +GPUARRAY_PUBLIC int GpuReduction_call (const GpuReduction* gr, + GpuArray* d0, + GpuArray* d1, + const GpuArray* s0, + unsigned reduxLen, + const int* reduxList, + int flags); #ifdef __cplusplus diff --git a/src/gpuarray_reduction.c b/src/gpuarray_reduction.c index 9b0d74c9dc..35518c0fbc 100644 --- a/src/gpuarray_reduction.c +++ b/src/gpuarray_reduction.c @@ -25,8 +25,28 @@ /* Defines */ #define DIVIDECEIL(a,b) (((a)+(b)-1)/(b)) -#define MAX_HW_DIMS 3 +/** + * Template Selector + * + * This is a bitfield interpreted as follows: + * + * 0b000x: Phase 1 processing (Phase 0) + * 0b00x0: Split axis is free (Reduced) + * 0bxx00: Huge axis is: + * 00: Nonexistent + * 01: Same as split axis + * 10: Same type (free/reduced) as split axis + * 11: Opposite type (free/reduced) to split axis + */ + +#define SELECTOR_PHASE1 0x01 +#define SELECTOR_SPLIT_FREE 0x02 +#define SELECTOR_HUGE_AXIS 0x0C +#define SELECTOR_HUGE_NONE 0x00 +#define SELECTOR_HUGE_IS_SPLIT 0x04 +#define SELECTOR_HUGE_SAME_TYPE 0x08 +#define SELECTOR_HUGE_OPPOSITE_TYPE 0x0C /* Datatypes */ @@ -38,60 +58,154 @@ struct axis_desc{ int reduxNum; int ibNum; - unsigned ibp; + unsigned perm; unsigned isReduced : 1; unsigned isIntra : 1; size_t len; size_t splitLen; - size_t pdim; - ssize_t srcStride; - ssize_t dstStride; - ssize_t dstArgStride; + ssize_t s0S; + ssize_t d0S; + ssize_t d1S; + size_t i0S; }; typedef struct axis_desc axis_desc; /** * Reduction Kernel Invoker. + */ + +struct redux_ctx{ + /* Function Arguments. */ + const GpuReduction* gr; + ga_reduce_op op; + GpuArray* d0; + GpuArray* d1; + const GpuArray* s0; + int reduxLen; + const int* reduxList; + int flags; + + /* General. 
*/ + int nds0; /* # Source axes */ + int nds0r; /* # Reduced axes */ + int ndd0; /* # Destination axes */ + int ndfs0; /* # Flattened source axes */ + int ndfs0r; /* # Flattened source axes */ + int ndfd0; /* # Flattened source axes */ + int ndib; /* # Intra-block axes */ + int zeroAllAxes; /* # of zero-length axes in source tensor */ + int zeroRdxAxes; /* # of zero-length reduction axes in source tensor */ + size_t prodAllAxes; /* Product of length of all axes in source tensor */ + size_t prodRdxAxes; /* Product of length of all reduction axes in source tensor */ + size_t prodFreeAxes; /* Product of length of all free axes in source tensor */ + + /* Flattening */ + axis_desc* xdSrc; + axis_desc** xdSrcPtrs; + axis_desc* xdSplit; + + /* Invoker */ + uint32_t selector; + uint64_t U; + uint64_t V; + uint64_t B; + uint32_t D; + uint32_t Dunit; + uint32_t H; + + uint32_t LSlice; + uint64_t LPadded; + uint64_t* L, *Li; + gpudata* S0Data; + int64_t S0Off; + int64_t* S0J, *S0Si; + gpudata* D0Data; + int64_t D0Off; + int64_t* D0J, *D0Si; + gpudata* D1Data; + int64_t D1Off; + int64_t* D1J, *D1Si; + int64_t* I0J, *I0Si; + + gpudata* W; + int64_t W0Off; + ssize_t W1Off; + size_t shmemBytes; + ssize_t SHMEMK0Off; + ssize_t SHMEMK1Off; + + unsigned* perm; + + void** kArgs; + + /* Scheduler */ + size_t bs; + size_t gs; +}; +typedef struct redux_ctx redux_ctx; + + +/** + * Reduction Operator. * * INTRO * - * Generates the source code for a reduction kernel over arbitrarily-dimensioned, + * Generates the source code for a reduction kernel over arbitrarily-ranked, * -shaped and -typed tensors. * + * It is assumed that at most one axis will ever be of length > 2**31-1. The + * assumption is believed safe because no GPU or similar accelerator presently + * on Earth has the capacity to store or process 2**62-element tensors. + * * - * GOALS + * TYPE NAMES * - * The generator has the following goals: + * TS0: Type of s0 tensor + * TPS0: Promoted type of s0 tensor + * TD0: Type of d0 tensor + * TD1: Type of d1 tensor + * TS32: Type of 32-bit narrow, signed, 2's complement integer + * TU32: Type of 32-bit narrow, unsigned, 2's complement integer + * TS64: Type of 64-bit wide, signed, 2's complement integer + * TU64: Type of 64-bit wide, unsigned, 2's complement integer + * TK0: Type of reduction accumulator + * TK1: Type of flattened index * - * 1. Maximizing the use of coalesced memory loads within a warp. - * 2. Maximizing the # of useful threads within a warp. - * 3. Maximizing the number of warps within a block. - * 4. Ensuring there are no more than 5 blocks per multiprocessor. - * 5. Minimizing the workspace size (if it is required). + * But note however that: + * - TS0 is not necessarily the same as TPS0/TD0/TD1 + * - TD1 is not necessarily TS32/TU32/TS64/TU64/TK1 + * - TK1 is not necessarily TU64 + * - TK0 is not necessarily the same as TS0 or TPS0. Moreover, since it may + * be a "custom" type that exists only within the kernel, it might not + * necessarily have a gpuarray_type typecode associated with it. + * + * Example 1: TK0 might eventually become a double-TS0 struct for Kahan + * compensated summation. No typecode exists for a struct of two TS0 + * values. + * + * Example 2: If doing a Kahan summation of a GA_HALF array, the + * following might be the case: + * TS0 == GA_HALF + * TPS0 == GA_FLOAT + * TK0 == struct{GA_FLOAT,GA_FLOAT} * * * NOTES * - * Information elements required to perform reduction. + * Information elements required to generate source code: * - * 1. 
Ndim, shape and dtype of src tensor - * 2. Ndim, shape and dtype of dst/dstArg tensors + * 1. Maximum rank and dtype of s0 tensor + * 2. Maximum rank and dtype of d0/d1 tensors * 3. GPU context * 4. Number of processors * 5. Warp size * 6. Maximum size of block - * 7. Maximum size of block dimension X, Y, Z + * 7. Maximum size of block axis X * 8. Maximum size of grid - * 9. Maximum size of grid dimension X, Y, Z + * 9. Maximum size of grid axis X * 10. Dtype and initializer of accumulator - * 11. Sorted src axes for contiguous memory accesses - * 12. Ndim, shape and dtype of flattened src tensor - * 13. Number of stages (1 or 2) - * 14. Size of workspace tensor - * 15. Intrablock/split/free/reduced axes - * 16. Source code * - * Rationale for dependencies: + * Rationale for some dependencies: * * 1) Get the GPU context and its properties immediately, since an invalid * context is a likely error and we want to fail fast. @@ -99,105 +213,6 @@ typedef struct axis_desc axis_desc; * the context's properties have been retrieved since they provide * information about the device's natively-supported types and operations * (e.g. half-precision float) - */ - -struct redux_ctx{ - /* Function Arguments. */ - GpuReduction* gr; - ga_reduce_op op; - GpuArray* dst; - GpuArray* dstArg; - const GpuArray* src; - int reduxLen; - const int* reduxList; - int flags; - - /* General. */ - int nds; /* # Source dimensions */ - int ndr; /* # Reduced dimensions */ - int ndd; /* # Destination dimensions */ - int ndfs; /* # Flattened source dimensions */ - int ndfr; /* # Flattened source dimensions */ - int ndfd; /* # Flattened source dimensions */ - int ndib; /* # Intra-block dimensions */ - int zeroAllAxes; /* # of zero-length axes in source tensor */ - int zeroRdxAxes; /* # of zero-length reduction axes in source tensor */ - size_t prodAllAxes; /* Product of length of all axes in source tensor */ - size_t prodRdxAxes; /* Product of length of all reduction axes in source tensor */ - size_t prodFreeAxes; /* Product of length of all free axes in source tensor */ - - /* Flattening */ - axis_desc* xdSrc; - axis_desc** xdSrcPtrs; - axis_desc** xdTmpPtrs; - - /* Invoker */ - int phase; - size_t U; - size_t V; - size_t B; - unsigned D; - unsigned H; - unsigned splitReduce; - unsigned splitFree; - - axis_desc* xdSplit; - - size_t* l; - size_t* lPDim; - ssize_t* sJ; - ssize_t* dJ; - ssize_t* aJ; - - gpudata* flatSrcData; - ssize_t flatSrcOffset; - gpudata* flatDstData; - ssize_t flatDstOffset; - gpudata* flatDstArgData; - ssize_t flatDstArgOffset; - - gpudata* w; - size_t SHMEM; - ssize_t wdOff; - ssize_t pdOff; - ssize_t waOff; - ssize_t paOff; - - unsigned* ibs; - unsigned* ibp; - size_t* iblPDim; - ssize_t* ibsOff; - ssize_t* ibdOff; - ssize_t* ibaOff; - - void** kArgs; - - - /* Scheduler */ - size_t bs; - size_t gs; -}; -typedef struct redux_ctx redux_ctx; - - -/** - * Reduction Operator. - * - * INTRO - * - * Generates the source code for a reduction kernel over arbitrarily-dimensioned, - * -shaped and -typed tensors. - * - * - * GOALS - * - * The generator has the following goals: - * - * 1. Maximizing the use of coalesced memory loads within a warp. - * 2. Maximizing the # of useful threads within a warp. - * 3. Maximizing the number of warps within a block. - * 4. Ensuring there are no more than 5 blocks per multiprocessor. - * 5. Minimizing the workspace size (if it is required). 
* * * REFERENCES @@ -221,7 +236,7 @@ struct GpuReduction{ ga_reduce_op op; int ndd; int ndr; - int srcTypeCode; + int TS0tc; int flags; /* Misc */ @@ -230,21 +245,30 @@ struct GpuReduction{ /* Source code Generator. */ strb s; srcb srcGen; + char kName[256]; char* kSourceCode; size_t kSourceCodeLen; - int dstTypeCode; - int dstArgTypeCode; + int TPS0tc; + int TD0tc; + int TD1tc; + int TS32tc; + int TU32tc; + int TS64tc; + int TU64tc; + struct{ + size_t size; + size_t align; + char defn[256]; + char init[256]; + } TK0, TK1; int idxTypeCode; int accTypeCode; const char* srcTypeStr; const char* dstTypeStr; const char* dstArgTypeStr; const char* idxTypeStr; - const char* accTypeStr; - const char* initVal; /* Compile */ - int log2MaxL; int kNumArgs; int* kArgTypeCodes; char* kErrorString; @@ -258,224 +282,237 @@ struct GpuReduction{ size_t maxG0; size_t maxLM; size_t maxLK; + size_t maxBS; + int log2MaxBS; }; /* Typedefs */ -typedef void (*GpuReductionIterFn)(GpuReduction* gr, - int typecode, - const char* typeName, - const char* baseName, - int num, - void* user); +typedef void (*GpuReductionIterFn)(const GpuReduction* gr, + int typecode, + const char* typeName, + const char* baseName, + int num, + void* user); /* Static Function prototypes */ /* Utilities */ -static int reduxGetSumInit (int typecode, const char** property); -static int reduxGetProdInit (int typecode, const char** property); -static int reduxGetMinInit (int typecode, const char** property); -static int reduxGetMaxInit (int typecode, const char** property); -static int reduxGetAndInit (int typecode, const char** property); -static int reduxGetOrInit (int typecode, const char** property); -static int reduxIsSensitive (int typecode); -static int reduxSortFlatSensitive (const void* a, const void* b); -static int reduxSortFlatInsensitive (const void* a, const void* b); -static int reduxSortPtrIBSrcRdSelect (const void* a, const void* b); -static int reduxSortPtrByReduxNum (const void* a, const void* b); -static int reduxSortPtrIBDstWrSelect (const void* a, const void* b); -static int reduxSortPtrIBDstArgWrSelect (const void* a, const void* b); -static int reduxSortPtrInsertFinalOrder (const void* a, const void* b); +static int reduxGetSumInit (int typecode, const char** property); +static int reduxGetProdInit (int typecode, const char** property); +static int reduxGetMinInit (int typecode, const char** property); +static int reduxGetMaxInit (int typecode, const char** property); +static int reduxGetAndInit (int typecode, const char** property); +static int reduxGetOrInit (int typecode, const char** property); +static int reduxIsSensitive (int op); +static const char* reduxGetOpName (int op); +static int reduxIsFloatingPoint (int typecode); +static unsigned reduxCeilLog2 (uint64_t x); +static uint64_t reduxNextPow2 (uint64_t x); +static int reduxSortFlatSensitive (const void* a, const void* b); +static int reduxSortFlatInsensitive (const void* a, const void* b); +static int reduxSortPtrS0AbsStride (const void* a, const void* b); +static int reduxSortPtrByReduxNum (const void* a, const void* b); +static int reduxSortPtrD0WrSelect (const void* a, const void* b); +static int reduxSortPtrD1WrSelect (const void* a, const void* b); +static int reduxSortPtrInsertFinalOrder (const void* a, const void* b); /* Axis Description API */ -static void axisInit (axis_desc* axis, - ssize_t len, - ssize_t srcStride); -static void axisMarkReduced (axis_desc* axis, int reduxNum); -static void axisMarkIntraBlock (axis_desc* axis, - int ibNum, - size_t 
ibLen); -static int axisGetReduxNum (const axis_desc* axis); -static size_t axisGetLen (const axis_desc* axis); -static size_t axisGetIntraLen (const axis_desc* axis); -static size_t axisGetInterLen (const axis_desc* axis); -static size_t axisGetIntraInterLen (const axis_desc* axis); -static ssize_t axisGetSrcStride (const axis_desc* axis); -static size_t axisGetSrcAbsStride (const axis_desc* axis); -static ssize_t axisGetDstStride (const axis_desc* axis); -static size_t axisGetDstAbsStride (const axis_desc* axis); -static ssize_t axisGetDstArgStride (const axis_desc* axis); -static size_t axisGetDstArgAbsStride (const axis_desc* axis); -static unsigned axisGetIBP (const axis_desc* axis); -static int axisGetIBNum (const axis_desc* axis); -static void axisSetIBP (axis_desc* axis, - unsigned ibp); -static size_t axisGetPDim (const axis_desc* axis); -static void axisSetPDim (axis_desc* axis, - size_t pdim); -static int axisIsReduced (const axis_desc* axis); -static int axisIsIntra (const axis_desc* axis); -static int axisIsInter (const axis_desc* axis); -static int axisIsSplit (const axis_desc* axis); +static void axisInit (axis_desc* axis, + ssize_t len, + ssize_t s0S); +static void axisMarkReduced (axis_desc* axis, int reduxNum); +static void axisMarkIntraBlock (axis_desc* axis, + int ibNum, + size_t ibLen); +static int axisGetReduxNum (const axis_desc* axis); +static size_t axisGetLen (const axis_desc* axis); +static size_t axisGetIntraLen (const axis_desc* axis); +static size_t axisGetInterLen (const axis_desc* axis); +static size_t axisGetIntraInterLen (const axis_desc* axis); +static ssize_t axisGetS0Stride (const axis_desc* axis); +static size_t axisGetS0AbsStride (const axis_desc* axis); +static ssize_t axisGetD0Stride (const axis_desc* axis); +static size_t axisGetD0AbsStride (const axis_desc* axis); +static ssize_t axisGetD1Stride (const axis_desc* axis); +static size_t axisGetD1AbsStride (const axis_desc* axis); +static size_t axisGetI0Stride (const axis_desc* axis); +static void axisSetI0Stride (axis_desc* axis, + size_t pdim); +static unsigned axisGetPerm (const axis_desc* axis); +static int axisGetIBNum (const axis_desc* axis); +static void axisSetPerm (axis_desc* axis, + unsigned ibp); +static int axisIsReduced (const axis_desc* axis); +static int axisIsIntra (const axis_desc* axis); +static int axisIsInter (const axis_desc* axis); +static int axisIsSplit (const axis_desc* axis); /* Reduction Context API */ /* Generator Control Flow */ -static int reduxGenInit (GpuReduction* gr); -static int reduxGenInferProperties (GpuReduction* gr); -static void reduxGenIterArgs (GpuReduction* gr, - GpuReductionIterFn fn, - void* user); -static int reduxGenSrc (GpuReduction* gr); -static void reduxGenSrcAppend (GpuReduction* gr); -static void reduxGenSrcAppendIncludes (GpuReduction* gr); -static void reduxGenSrcAppendMacroDefs (GpuReduction* gr); -static void reduxGenSrcAppendTypedefs (GpuReduction* gr); -static void reduxGenSrcAppendReduxKernel (GpuReduction* gr); -static void reduxGenSrcAppendPrototype (GpuReduction* gr); -static void reduxGenSrcAppendBlockDecode (GpuReduction* gr); -static void reduxGenSrcAppendThreadDecode (GpuReduction* gr); -static void reduxGenSrcAppendPhase0 (GpuReduction* gr); -static void reduxGenSrcAppendLoops (GpuReduction* gr, - int freeMaybeSplit, - int reduceMaybeSplit); -static void reduxGenSrcAppendLoop (GpuReduction* gr, - int initial, - int freeMaybeSplit, - int reduceMaybeSplit); -static void reduxGenSrcAppendDecrement (GpuReduction* gr); -static void 
reduxGenSrcAppendVertical (GpuReduction* gr, - int freeMaybeSplit, - int reduceMaybeSplit); -static void reduxGenSrcAppendIncrement (GpuReduction* gr, - int axis, - int initial, - int freeMaybeSplit, - int reduceMaybeSplit); -static void reduxGenSrcAppendDstWrite (GpuReduction* gr, - int initial, - int freeMaybeSplit, - int reduceMaybeSplit); -static void reduxGenSrcAppendPhase1 (GpuReduction* gr); -static int reduxGenCompile (GpuReduction* gr); -static int reduxGenComputeLaunchBounds (GpuReduction* gr); -static int reduxGenCleanup (GpuReduction* gr, int ret); -static int reduxGenCleanupMsg (GpuReduction* gr, int ret, - const char* fmt, ...); +static int reduxGenInit (GpuReduction* gr); +static int reduxGenInferProperties (GpuReduction* gr); +static void reduxGenSetMaxBS (GpuReduction* gr); +static void reduxGenSetKTypes (GpuReduction* gr); +static void reduxGenIterArgs (const GpuReduction* gr, + GpuReductionIterFn fn, + void* user); +static int reduxGenSrc (GpuReduction* gr); +static void reduxGenSrcAppend (GpuReduction* gr); +static void reduxGenSrcAppendIncludes (GpuReduction* gr); +static void reduxGenSrcAppendMacroTypedefs (GpuReduction* gr); +static void reduxGenSrcAppendReduxKernel (GpuReduction* gr); +static void reduxGenSrcAppendPrototype (GpuReduction* gr); +static void reduxGenSrcAppendDecode (GpuReduction* gr); +static void reduxGenSrcAppendPhase0 (GpuReduction* gr, + uint32_t selector); +static void reduxGenSrcAppendLoop (GpuReduction* gr, + uint32_t selector, + int initial); +static void reduxGenSrcAppendVertical (GpuReduction* gr, + uint32_t selector); +static void reduxGenSrcAppendIncrement (GpuReduction* gr, + uint32_t selector, + int initial, + int axis); +static void reduxGenSrcAppendDstWrite (GpuReduction* gr, + uint32_t selector, + int initial); +static void reduxGenSrcAppendPhase1 (GpuReduction* gr); +static int reduxGenSrcAxisIsHuge (GpuReduction* gr, + uint32_t selector, + int axis); +static int reduxGenSrcAxisIsSplit (GpuReduction* gr, + uint32_t selector, + int axis); +static int reduxGenCompile (GpuReduction* gr); +static int reduxGenComputeLaunchBounds (GpuReduction* gr); +static int reduxGenCleanup (GpuReduction* gr, int ret); +static int reduxGenCleanupMsg (GpuReduction* gr, int ret, + const char* fmt, ...); /* Generator Utilities */ -static void reduxGenCountArgs (GpuReduction* gr, - int typecode, - const char* typeName, - const char* baseName, - int num, - void* user); -static void reduxGenSaveArgTypecodes (GpuReduction* gr, - int typecode, - const char* typeName, - const char* baseName, - int num, - void* user); -static void reduxGenAppendArg (GpuReduction* gr, - int typecode, - const char* typeName, - const char* baseName, - int num, - void* user); -static void reduxInvMarshalArg (GpuReduction* gr, - int typecode, - const char* typeName, - const char* baseName, - int num, - void* user); -static size_t reduxGenEstimateParallelism (const GpuReduction* gr); -static int reduxGenRequiresDst (const GpuReduction* gr); -static int reduxGenRequiresDstArg (const GpuReduction* gr); -static int reduxGenKernelRequiresDst (const GpuReduction* gr); -static int reduxGenKernelRequiresDstArg (const GpuReduction* gr); -static int reduxGenAxisMaybeSplit (const GpuReduction* gr, int axis); -static size_t reduxGenGetReduxStateSize (const GpuReduction* gr); -static size_t reduxGenGetMaxLocalSize (const GpuReduction* gr); -static size_t reduxGenGetSHMEMSize (const GpuReduction* gr, size_t bs); -static size_t reduxGenGetSHMEMDstOff (const GpuReduction* gr, size_t bs); -static 
size_t reduxGenGetSHMEMDstArgOff (const GpuReduction* gr, size_t bs); -static size_t reduxGenGetWMEMSize (const GpuReduction* gr, size_t bs); -static size_t reduxGenGetWMEMDstOff (const GpuReduction* gr, size_t bs); -static size_t reduxGenGetWMEMDstArgOff (const GpuReduction* gr, size_t bs); +static void reduxGenCountArgs (const GpuReduction* gr, + int typecode, + const char* typeName, + const char* baseName, + int num, + void* user); +static void reduxGenSaveArgTypecodes (const GpuReduction* gr, + int typecode, + const char* typeName, + const char* baseName, + int num, + void* user); +static void reduxGenAppendArg (const GpuReduction* gr, + int typecode, + const char* typeName, + const char* baseName, + int num, + void* user); +static void reduxInvMarshalArg (const GpuReduction* gr, + int typecode, + const char* typeName, + const char* baseName, + int num, + void* user); +static size_t reduxGenEstimateParallelism (const GpuReduction* gr); +static int reduxGenRequiresS0 (const GpuReduction* gr); +static int reduxGenRequiresD0 (const GpuReduction* gr); +static int reduxGenRequiresD1 (const GpuReduction* gr); +static int reduxGenKernelRequiresLatticeS0(const GpuReduction* gr); +static int reduxGenKernelRequiresLatticeD0(const GpuReduction* gr); +static int reduxGenKernelRequiresLatticeD1(const GpuReduction* gr); +static int reduxGenKernelRequiresLatticeI0(const GpuReduction* gr); +static int reduxGenKernelRequiresStateK0 (const GpuReduction* gr); +static int reduxGenKernelRequiresStateK1 (const GpuReduction* gr); +static int reduxGenKernelRequiresWspace (const GpuReduction* gr); +static size_t reduxGenGetK0Size (const GpuReduction* gr); +static size_t reduxGenGetK0Align (const GpuReduction* gr); +static size_t reduxGenGetK1Size (const GpuReduction* gr); +static size_t reduxGenGetK1Align (const GpuReduction* gr); +static size_t reduxGenGetReduxStateSize (const GpuReduction* gr); +static size_t reduxGenGetMaxLocalSize (const GpuReduction* gr); +static size_t reduxGenGetSHMEMSize (const GpuReduction* gr, size_t cells); +static size_t reduxGenGetSHMEMK0Off (const GpuReduction* gr, size_t cells); +static size_t reduxGenGetSHMEMK1Off (const GpuReduction* gr, size_t cells); +static size_t reduxGenGetWMEMSize (const GpuReduction* gr, size_t cells); +static size_t reduxGenGetWMEMK0Off (const GpuReduction* gr, size_t cells); +static size_t reduxGenGetWMEMK1Off (const GpuReduction* gr, size_t cells); /* Invoker Control Flow */ -static int reduxInvInit (redux_ctx* ctx); -static int reduxInvInferProperties (redux_ctx* ctx); -static int reduxInvFlattenSource (redux_ctx* ctx); -static int reduxInvComputeKArgs (redux_ctx* ctx); -static int reduxInvSchedule (redux_ctx* ctx); -static int reduxInvoke (redux_ctx* ctx); -static int reduxInvCleanup (redux_ctx* ctx, int ret); -static int reduxInvCleanupMsg (redux_ctx* ctx, int ret, - const char* fmt, ...); +static int reduxInvInit (redux_ctx* ctx); +static int reduxInvInferProperties (redux_ctx* ctx); +static int reduxInvFlattenSource (redux_ctx* ctx); +static int reduxInvComputeKernelArgs (redux_ctx* ctx); +static int reduxInvSchedule (redux_ctx* ctx); +static int reduxInvoke (redux_ctx* ctx); +static int reduxInvCleanup (redux_ctx* ctx, int ret); +static int reduxInvCleanupMsg (redux_ctx* ctx, int ret, + const char* fmt, ...); /* Invoker Utilities */ -static size_t reduxInvEstimateParallelism (const redux_ctx* ctx); -static int reduxInvRequiresDst (const redux_ctx* ctx); -static int reduxInvRequiresDstArg (const redux_ctx* ctx); -static unsigned 
reduxInvGetSplitFree (const redux_ctx* ctx); -static unsigned reduxInvGetSplitReduce (const redux_ctx* ctx); -static axis_desc* reduxInvGetSrcAxis (const redux_ctx* ctx, int i); -static axis_desc* reduxInvGetSrcSortAxis (const redux_ctx* ctx, int i); -static int reduxTryFlattenOut (const redux_ctx* ctx, - const axis_desc* out); -static int reduxTryFlattenInto (redux_ctx* ctx, - axis_desc* into, - const axis_desc* from); -static void reduxSortAxisPtrsBy (axis_desc** ptrs, - axis_desc* axes, - size_t numAxes, - int(*fn)(const void*, const void*)); +static size_t reduxInvEstimateParallelism (const redux_ctx* ctx); +static int reduxInvRequiresS0 (const redux_ctx* ctx); +static int reduxInvRequiresD0 (const redux_ctx* ctx); +static int reduxInvRequiresD1 (const redux_ctx* ctx); +static axis_desc* reduxInvGetSrcAxis (const redux_ctx* ctx, int i); +static axis_desc* reduxInvGetSrcSortAxis (const redux_ctx* ctx, int i); +static int reduxTryFlattenOut (const redux_ctx* ctx, + const axis_desc* axis); +static int reduxTryFlattenInto (redux_ctx* ctx, + axis_desc* into, + const axis_desc* from); +static void reduxSortAxisPtrsBy (axis_desc** ptrs, + axis_desc* axes, + size_t numAxes, + int(*fn)(const void*, const void*)); /* Function Implementations */ /* Extern Functions */ -GPUARRAY_PUBLIC int GpuReduction_new (GpuReduction** grOut, - gpucontext* gpuCtx, - ga_reduce_op op, - unsigned ndf, - unsigned ndr, - int srcTypeCode, - int flags){ - if(!grOut){ +GPUARRAY_PUBLIC int GpuReduction_new (GpuReduction** grOut, + gpucontext* gpuCtx, + ga_reduce_op op, + unsigned ndf, + unsigned ndr, + int s0TypeCode, + int flags){ + if (!grOut){ return GA_INVALID_ERROR; } *grOut = calloc(1, sizeof(**grOut)); - if(*grOut){ - (*grOut)->gpuCtx = gpuCtx; - (*grOut)->op = op; - (*grOut)->ndd = (int)ndf; - (*grOut)->ndr = (int)ndr; - (*grOut)->srcTypeCode = srcTypeCode; - (*grOut)->flags = flags; + if (*grOut){ + (*grOut)->gpuCtx = gpuCtx; + (*grOut)->op = op; + (*grOut)->ndd = (int)ndf; + (*grOut)->ndr = (int)ndr; + (*grOut)->TS0tc = s0TypeCode; + (*grOut)->flags = flags; return reduxGenInit(*grOut); }else{ return GA_MEMORY_ERROR; } } -GPUARRAY_PUBLIC void GpuReduction_free (GpuReduction* gr){ +GPUARRAY_PUBLIC void GpuReduction_free (GpuReduction* gr){ reduxGenCleanup(gr, !GA_NO_ERROR); } -GPUARRAY_PUBLIC int GpuReduction_call (GpuReduction* gr, - GpuArray* dst, - GpuArray* dstArg, - const GpuArray* src, - unsigned reduxLen, - const int* reduxList, - int flags){ +GPUARRAY_PUBLIC int GpuReduction_call (const GpuReduction* gr, + GpuArray* d0, + GpuArray* d1, + const GpuArray* s0, + unsigned reduxLen, + const int* reduxList, + int flags){ redux_ctx ctxSTACK, *ctx = &ctxSTACK; memset(ctx, 0, sizeof(*ctx)); ctx->gr = gr; - ctx->dst = dst; - ctx->dstArg = dstArg; - ctx->src = src; + ctx->d0 = d0; + ctx->d1 = d1; + ctx->s0 = s0; ctx->reduxLen = reduxLen; ctx->reduxList = reduxList; ctx->flags = flags; @@ -497,7 +534,7 @@ GPUARRAY_PUBLIC int GpuReduction_call (GpuReduction* gr, * @return Zero if successful; Non-zero if the datatype is not supported. */ -static int reduxGetSumInit (int typecode, const char** property){ +static int reduxGetSumInit (int typecode, const char** property){ if (typecode == GA_POINTER || typecode == GA_BUFFER){ return GA_UNSUPPORTED_ERROR; @@ -517,7 +554,7 @@ static int reduxGetSumInit (int typecode, const char** prop * @return Zero if successful; Non-zero if the datatype is not supported. 
*/ -static int reduxGetProdInit (int typecode, const char** property){ +static int reduxGetProdInit (int typecode, const char** property){ if (typecode == GA_POINTER || typecode == GA_BUFFER){ return GA_UNSUPPORTED_ERROR; @@ -537,7 +574,7 @@ static int reduxGetProdInit (int typecode, const char** prop * @return Zero if successful; Non-zero if the datatype is not supported. */ -static int reduxGetMinInit (int typecode, const char** property){ +static int reduxGetMinInit (int typecode, const char** property){ switch (typecode){ case GA_BYTE2: case GA_BYTE3: @@ -627,7 +664,7 @@ static int reduxGetMinInit (int typecode, const char** prop * @return Zero if successful; Non-zero if the datatype is not supported. */ -static int reduxGetMaxInit (int typecode, const char** property){ +static int reduxGetMaxInit (int typecode, const char** property){ switch (typecode){ case GA_BOOL: *property = "1"; @@ -726,7 +763,7 @@ static int reduxGetMaxInit (int typecode, const char** prop * @return Zero if successful; Non-zero if the datatype is not supported. */ -static int reduxGetAndInit (int typecode, const char** property){ +static int reduxGetAndInit (int typecode, const char** property){ if (typecode == GA_POINTER || typecode == GA_BUFFER){ return GA_UNSUPPORTED_ERROR; @@ -746,7 +783,7 @@ static int reduxGetAndInit (int typecode, const char** prop * @return Zero if successful; Non-zero if the datatype is not supported. */ -static int reduxGetOrInit (int typecode, const char** property){ +static int reduxGetOrInit (int typecode, const char** property){ if (typecode == GA_POINTER || typecode == GA_BUFFER){ return GA_UNSUPPORTED_ERROR; @@ -756,7 +793,7 @@ static int reduxGetOrInit (int typecode, const char** prop } /** - * @brief Returns whether the reduction is sensitive. + * @brief Returns whether the reduction is "sensitive". * * A reduction is sensitive when its output satisfies at least one of the * following conditions: @@ -782,8 +819,8 @@ static int reduxGetOrInit (int typecode, const char** prop * . */ -static int reduxIsSensitive (int typecode){ - switch (typecode){ +static int reduxIsSensitive (int op){ + switch (op){ case GA_REDUCE_MINANDARGMIN: case GA_REDUCE_MAXANDARGMAX: case GA_REDUCE_ARGMIN: @@ -794,6 +831,95 @@ static int reduxIsSensitive (int typecode){ } } +/** + * Get a name for the op, usable within a C identifier. + */ + +static const char* reduxGetOpName (int op){ + switch (op){ + case GA_REDUCE_SUM: return "Sum"; + case GA_REDUCE_PROD: return "Prod"; + case GA_REDUCE_PRODNZ: return "ProdNonZero"; + case GA_REDUCE_MIN: return "Min"; + case GA_REDUCE_MAX: return "Max"; + case GA_REDUCE_ARGMIN: return "Argmin"; + case GA_REDUCE_ARGMAX: return "Argmax"; + case GA_REDUCE_MINANDARGMIN: return "MinAndArgmin"; + case GA_REDUCE_MAXANDARGMAX: return "MaxAndArgmax"; + case GA_REDUCE_AND: return "And"; + case GA_REDUCE_OR: return "Or"; + case GA_REDUCE_XOR: return "Xor"; + case GA_REDUCE_ALL: return "All"; + case GA_REDUCE_ANY: return "Any"; + default: return NULL; + } +} + +/** + * Whether or not the typecode is a floating-point type. 
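 * For example (illustrative, following the switch below):
 *
 *     reduxIsFloatingPoint(GA_HALF)   -> 1
 *     reduxIsFloatingPoint(GA_CFLOAT) -> 1
 *     reduxIsFloatingPoint(GA_INT)    -> 0
 *
 * reduxGenInferProperties() uses this predicate to reject the bitwise
 * reductions (GA_REDUCE_AND/OR/XOR) on floating-point source types.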
+ */ + +static int reduxIsFloatingPoint (int typecode){ + switch(typecode){ + case GA_HALF: + case GA_HALF2: + case GA_HALF4: + case GA_HALF8: + case GA_HALF16: + case GA_FLOAT: + case GA_FLOAT2: + case GA_FLOAT4: + case GA_FLOAT8: + case GA_FLOAT16: + case GA_DOUBLE: + case GA_DOUBLE2: + case GA_DOUBLE4: + case GA_DOUBLE8: + case GA_DOUBLE16: + case GA_QUAD: + case GA_CFLOAT: + case GA_CDOUBLE: + case GA_CQUAD: + return 1; + default: + return 0; + } +} + +/** + * Compute ceil(log2(x)). + */ + +static unsigned reduxCeilLog2 (uint64_t x){ + int i; + + if (x <= 1){ + return 1; + } + for (i=0,x--;x;i++,x>>=1){} + return i; +} + +/** + * Compute next power of 2. + * + * If x is a power of two already, return x. + */ + +static uint64_t reduxNextPow2 (uint64_t x){ + if (x & (x-1)){ + x |= x >> 1; + x |= x >> 2; + x |= x >> 4; + x |= x >> 8; + x |= x >> 16; + x |= x >> 32; + return x+1; + }else{ + return x; + } +} + /** * @brief Sort the axes into optimal order for flattening. * @@ -816,7 +942,7 @@ static int reduxIsSensitive (int typecode){ * 5. then by increasing source axis number. */ -static int reduxSortFlatInsensitive (const void* a, const void* b){ +static int reduxSortFlatInsensitive (const void* a, const void* b){ const axis_desc* xda = (const axis_desc*)a; const axis_desc* xdb = (const axis_desc*)b; @@ -826,15 +952,15 @@ static int reduxSortFlatInsensitive (const void* a, const void* b){ return -1; } - if (axisGetSrcAbsStride(xda) < axisGetSrcAbsStride(xdb)){ + if (axisGetS0AbsStride(xda) < axisGetS0AbsStride(xdb)){ return +1; - }else if (axisGetSrcAbsStride(xda) > axisGetSrcAbsStride(xdb)){ + }else if (axisGetS0AbsStride(xda) > axisGetS0AbsStride(xdb)){ return -1; } return 0; } -static int reduxSortFlatSensitive (const void* a, const void* b){ +static int reduxSortFlatSensitive (const void* a, const void* b){ const axis_desc* xda = (const axis_desc*)a; const axis_desc* xdb = (const axis_desc*)b; @@ -847,9 +973,9 @@ static int reduxSortFlatSensitive (const void* a, const void* b){ if (axisIsReduced(xda)){ return axisGetReduxNum(xda) axisGetSrcAbsStride(xdb)){ + }else if (axisGetS0AbsStride(xda) > axisGetS0AbsStride(xdb)){ return -1; } @@ -863,19 +989,19 @@ static int reduxSortFlatSensitive (const void* a, const void* b){ * This means ascending order of absolute stride. 
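 * A small worked example (editorial; strides in bytes): for a C-contiguous
 * GA_FLOAT source of shape (2, 3, 5),
 *
 *     strides = (60, 20, 4)
 *     order   = axis 2 (4), then axis 1 (20), then axis 0 (60)
 *
 * i.e. the fastest-varying (smallest-stride) axis is considered first.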
*/ -static int reduxSortPtrIBSrcRdSelect (const void* a, const void* b){ +static int reduxSortPtrS0AbsStride (const void* a, const void* b){ const axis_desc* xda = *(const axis_desc* const*)a; const axis_desc* xdb = *(const axis_desc* const*)b; - if (axisGetSrcAbsStride(xda) < axisGetSrcAbsStride(xdb)){ + if (axisGetS0AbsStride(xda) < axisGetS0AbsStride(xdb)){ return -1; - }else if (axisGetSrcAbsStride(xda) > axisGetSrcAbsStride(xdb)){ + }else if (axisGetS0AbsStride(xda) > axisGetS0AbsStride(xdb)){ return +1; } return 0; } -static int reduxSortPtrByReduxNum (const void* a, const void* b){ +static int reduxSortPtrByReduxNum (const void* a, const void* b){ const axis_desc* xda = *(const axis_desc* const*)a; const axis_desc* xdb = *(const axis_desc* const*)b; @@ -893,7 +1019,7 @@ static int reduxSortPtrByReduxNum (const void* a, const void* b){ return 0; } -static int reduxSortPtrIBDstWrSelect (const void* a, const void* b){ +static int reduxSortPtrD0WrSelect (const void* a, const void* b){ const axis_desc* xda = *(const axis_desc* const*)a; const axis_desc* xdb = *(const axis_desc* const*)b; @@ -919,15 +1045,15 @@ static int reduxSortPtrIBDstWrSelect (const void* a, const void* b){ } /* Otherwise it's sort by destination absolute stride. */ - if (axisGetDstAbsStride(xda) < axisGetDstAbsStride(xdb)){ + if (axisGetD0AbsStride(xda) < axisGetD0AbsStride(xdb)){ return -1; - }else if (axisGetDstAbsStride(xda) > axisGetDstAbsStride(xdb)){ + }else if (axisGetD0AbsStride(xda) > axisGetD0AbsStride(xdb)){ return +1; } return 0; } -static int reduxSortPtrIBDstArgWrSelect (const void* a, const void* b){ +static int reduxSortPtrD1WrSelect (const void* a, const void* b){ const axis_desc* xda = *(const axis_desc* const*)a; const axis_desc* xdb = *(const axis_desc* const*)b; @@ -953,15 +1079,15 @@ static int reduxSortPtrIBDstArgWrSelect (const void* a, const void* b){ } /* Otherwise it's sort by destination argument absolute stride. */ - if (axisGetDstArgAbsStride(xda) < axisGetDstArgAbsStride(xdb)){ + if (axisGetD1AbsStride(xda) < axisGetD1AbsStride(xdb)){ return -1; - }else if (axisGetDstArgAbsStride(xda) > axisGetDstArgAbsStride(xdb)){ + }else if (axisGetD1AbsStride(xda) > axisGetD1AbsStride(xdb)){ return +1; } return 0; } -static int reduxSortPtrInsertFinalOrder (const void* a, const void* b){ +static int reduxSortPtrInsertFinalOrder (const void* a, const void* b){ const axis_desc* xda = *(const axis_desc* const*)a; const axis_desc* xdb = *(const axis_desc* const*)b; @@ -973,7 +1099,7 @@ static int reduxSortPtrInsertFinalOrder (const void* a, const void* b){ return +1; } - if(axisIsIntra(xda)){ + if (axisIsIntra(xda)){ /** * Intra axes sort between themselves by descending intra axis number. */ @@ -999,9 +1125,9 @@ static int reduxSortPtrInsertFinalOrder (const void* a, const void* b){ return +1; } - if (axisGetSrcAbsStride(xda) < axisGetSrcAbsStride(xdb)){ + if (axisGetS0AbsStride(xda) < axisGetS0AbsStride(xdb)){ return -1; - }else if (axisGetSrcAbsStride(xda) > axisGetSrcAbsStride(xdb)){ + }else if (axisGetS0AbsStride(xda) > axisGetS0AbsStride(xdb)){ return +1; } } @@ -1016,28 +1142,28 @@ static int reduxSortPtrInsertFinalOrder (const void* a, const void* b){ * @brief Initialize Axis Description. 
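 * For instance (editorial sketch using the accessors defined below), after
 *
 *     axis_desc ax;
 *     axisInit(&ax, 128, 512);   // length 128, source stride 512 bytes
 *
 * one has axisGetLen(&ax) == 128, axisGetS0Stride(&ax) == 512, and
 * axisIsReduced(&ax) == 0 until axisMarkReduced() is called on the axis.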
*/ -static void axisInit (axis_desc* axis, - ssize_t len, - ssize_t srcStride){ +static void axisInit (axis_desc* axis, + ssize_t len, + ssize_t s0S){ memset(axis, 0, sizeof(*axis)); - axis->reduxNum = -1; - axis->ibNum = -1; - axis->ibp = 0; - axis->len = len; - axis->splitLen = 1; - axis->pdim = 0; + axis->reduxNum = -1; + axis->ibNum = -1; + axis->perm = 0; + axis->len = len; + axis->splitLen = 1; + axis->i0S = 0; - axis->srcStride = srcStride; - axis->dstStride = 0; - axis->dstArgStride = 0; + axis->s0S = s0S; + axis->d0S = 0; + axis->d1S = 0; } /** * @brief Mark axis as reduction axis, with position reduxNum in the axis list. */ -static void axisMarkReduced (axis_desc* axis, int reduxNum){ +static void axisMarkReduced (axis_desc* axis, int reduxNum){ axis->isReduced = 1; axis->reduxNum = reduxNum; } @@ -1046,9 +1172,9 @@ static void axisMarkReduced (axis_desc* axis, int r * @brief Mark axis as (split) intrablock axis. */ -static void axisMarkIntraBlock (axis_desc* axis, - int ibNum, - size_t ibLen){ +static void axisMarkIntraBlock (axis_desc* axis, + int ibNum, + size_t ibLen){ axis->isIntra = 1; axis->ibNum = ibNum; axis->splitLen = ibLen; @@ -1058,13 +1184,13 @@ static void axisMarkIntraBlock (axis_desc* axis, * @brief Get properties of an axis. */ -static int axisGetReduxNum (const axis_desc* axis){ +static int axisGetReduxNum (const axis_desc* axis){ return axis->reduxNum; } -static size_t axisGetLen (const axis_desc* axis){ +static size_t axisGetLen (const axis_desc* axis){ return axis->len; } -static size_t axisGetIntraLen (const axis_desc* axis){ +static size_t axisGetIntraLen (const axis_desc* axis){ if (axisIsSplit(axis)){ return axis->splitLen; }else if (axisIsIntra(axis)){ @@ -1073,7 +1199,7 @@ static size_t axisGetIntraLen (const axis_desc* axis){ return 1; } } -static size_t axisGetInterLen (const axis_desc* axis){ +static size_t axisGetInterLen (const axis_desc* axis){ if (axisIsSplit(axis)){ return DIVIDECEIL(axis->len, axis->splitLen); }else if (axisIsIntra(axis)){ @@ -1082,88 +1208,77 @@ static size_t axisGetInterLen (const axis_desc* axis){ return axis->len; } } -static size_t axisGetIntraInterLen (const axis_desc* axis){ +static size_t axisGetIntraInterLen (const axis_desc* axis){ return axisGetIntraLen(axis)*axisGetInterLen(axis); } -static ssize_t axisGetSrcStride (const axis_desc* axis){ - return axisGetLen(axis) > 1 ? axis->srcStride : 0; +static ssize_t axisGetS0Stride (const axis_desc* axis){ + return axisGetLen(axis) > 1 ? axis->s0S : 0; } -static size_t axisGetSrcAbsStride (const axis_desc* axis){ - return axisGetSrcStride(axis)<0 ? -(size_t)axisGetSrcStride(axis): - +(size_t)axisGetSrcStride(axis); +static size_t axisGetS0AbsStride (const axis_desc* axis){ + return axisGetS0Stride(axis)<0 ? -(size_t)axisGetS0Stride(axis): + +(size_t)axisGetS0Stride(axis); } -static ssize_t axisGetDstStride (const axis_desc* axis){ - return axisGetLen(axis) > 1 ? axis->dstStride : 0; +static ssize_t axisGetD0Stride (const axis_desc* axis){ + return axisGetLen(axis) > 1 ? axis->d0S : 0; } -static size_t axisGetDstAbsStride (const axis_desc* axis){ - return axisGetDstStride(axis)<0 ? -(size_t)axisGetDstStride(axis): - +(size_t)axisGetDstStride(axis); +static size_t axisGetD0AbsStride (const axis_desc* axis){ + return axisGetD0Stride(axis)<0 ? -(size_t)axisGetD0Stride(axis): + +(size_t)axisGetD0Stride(axis); } -static ssize_t axisGetDstArgStride (const axis_desc* axis){ - return axisGetLen(axis) > 1 ? 
axis->dstArgStride : 0; +static ssize_t axisGetD1Stride (const axis_desc* axis){ + return axisGetLen(axis) > 1 ? axis->d1S : 0; } -static size_t axisGetDstArgAbsStride (const axis_desc* axis){ - return axisGetDstArgStride(axis)<0 ? -(size_t)axisGetDstArgStride(axis): - +(size_t)axisGetDstArgStride(axis); +static size_t axisGetD1AbsStride (const axis_desc* axis){ + return axisGetD1Stride(axis)<0 ? -(size_t)axisGetD1Stride(axis): + +(size_t)axisGetD1Stride(axis); } -static unsigned axisGetIBP (const axis_desc* axis){ - return axis->ibp; +static size_t axisGetI0Stride (const axis_desc* axis){ + return axis->i0S; } -static int axisGetIBNum (const axis_desc* axis){ - return axis->ibNum; +static void axisSetI0Stride (axis_desc* axis, + size_t i0S){ + axis->i0S = i0S; } -static void axisSetIBP (axis_desc* axis, - unsigned ibp){ - axis->ibp = ibp; +static unsigned axisGetPerm (const axis_desc* axis){ + return axis->perm; } -static size_t axisGetPDim (const axis_desc* axis){ - return axis->pdim; +static int axisGetIBNum (const axis_desc* axis){ + return axis->ibNum; } -static void axisSetPDim (axis_desc* axis, - size_t pdim){ - axis->pdim = pdim; +static void axisSetPerm (axis_desc* axis, + unsigned perm){ + axis->perm = perm; } -static int axisIsReduced (const axis_desc* axis){ +static int axisIsReduced (const axis_desc* axis){ return axis->isReduced; } -static int axisIsIntra (const axis_desc* axis){ +static int axisIsIntra (const axis_desc* axis){ return axis->isIntra; } -static int axisIsInter (const axis_desc* axis){ +static int axisIsInter (const axis_desc* axis){ return !axisIsIntra(axis); } -static int axisIsSplit (const axis_desc* axis){ +static int axisIsSplit (const axis_desc* axis){ return axisIsIntra(axis) && axis->splitLen != axis->len; } -static size_t reduxInvEstimateParallelism (const redux_ctx* ctx){ +static size_t reduxInvEstimateParallelism (const redux_ctx* ctx){ return reduxGenEstimateParallelism(ctx->gr); } -static int reduxInvRequiresDst (const redux_ctx* ctx){ - return reduxGenRequiresDst(ctx->gr); +static int reduxInvRequiresS0 (const redux_ctx* ctx){ + return reduxGenRequiresS0(ctx->gr); } -static int reduxInvRequiresDstArg (const redux_ctx* ctx){ - return reduxGenRequiresDstArg(ctx->gr); +static int reduxInvRequiresD0 (const redux_ctx* ctx){ + return reduxGenRequiresD0(ctx->gr); } -static unsigned reduxInvGetSplitFree (const redux_ctx* ctx){ - if(ctx->xdSplit && !axisIsReduced(ctx->xdSplit)){ - return axisGetIntraLen(ctx->xdSplit); - }else{ - return 1; - } -} -static unsigned reduxInvGetSplitReduce (const redux_ctx* ctx){ - if(ctx->xdSplit && axisIsReduced(ctx->xdSplit)){ - return axisGetIntraLen(ctx->xdSplit); - }else{ - return 1; - } +static int reduxInvRequiresD1 (const redux_ctx* ctx){ + return reduxGenRequiresD1(ctx->gr); } /** * @brief Get description of source axis with given number. */ -static axis_desc* reduxInvGetSrcAxis (const redux_ctx* ctx, int i){ +static axis_desc* reduxInvGetSrcAxis (const redux_ctx* ctx, int i){ return &ctx->xdSrc[i]; } @@ -1171,7 +1286,7 @@ static axis_desc* reduxInvGetSrcAxis (const redux_ctx* ctx, int i){ * @brief Get description of source axis with given number in sort-order. */ -static axis_desc* reduxInvGetSrcSortAxis (const redux_ctx* ctx, int i){ +static axis_desc* reduxInvGetSrcSortAxis (const redux_ctx* ctx, int i){ return ctx->xdSrcPtrs[i]; } @@ -1187,10 +1302,10 @@ static axis_desc* reduxInvGetSrcSortAxis (const redux_ctx* ctx, int i){ * @return Non-zero if flattening attempt successful; Zero otherwise. 
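 * For example (editorial): an axis of length 1 contributes nothing to any
 * index computation, so it can always be dropped; a source of shape
 * (4, 1, 5) is flattened as if it were (4, 5). Likewise, when any reduction
 * axis has length 0 (ctx->zeroRdxAxes > 0), every reduction axis can be
 * dropped, since the reduction then runs over an empty set and the
 * destination presumably just receives the reduction's initial value.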
*/ -static int reduxTryFlattenOut (const redux_ctx* ctx, - const axis_desc* out){ - if ((axisGetLen (out) == 1 )|| - (axisIsReduced(out) && ctx->zeroRdxAxes > 0)){ +static int reduxTryFlattenOut (const redux_ctx* ctx, + const axis_desc* axis){ + if ((axisGetLen (axis) == 1 )|| + (axisIsReduced(axis) && ctx->zeroRdxAxes > 0)){ return 1; }else{ return 0; @@ -1218,66 +1333,66 @@ static int reduxTryFlattenOut (const redux_ctx* ctx, * @return Non-zero if flattening attempt successful; Zero otherwise. */ -static int reduxTryFlattenInto (redux_ctx* ctx, - axis_desc* into, - const axis_desc* from){ - int signSrc = 0, signDst = 0, signDstArg = 0, - reverseSrc = 0, reverseDst = 0, reverseDstArg = 0; +static int reduxTryFlattenInto (redux_ctx* ctx, + axis_desc* into, + const axis_desc* from){ + int signS0 = 0, signD0 = 0, signD1 = 0, + reverseS0 = 0, reverseD0 = 0, reverseD1 = 0; - if (axisIsReduced (into) != axisIsReduced (from) || - axisGetSrcAbsStride (into) != axisGetSrcAbsStride (from)*axisGetLen(from)){ + if (axisIsReduced (into) != axisIsReduced (from) || + axisGetS0AbsStride(into) != axisGetS0AbsStride(from)*axisGetLen(from)){ return 0; } - if (reduxInvRequiresDst (ctx) && - axisGetDstAbsStride (into) != axisGetDstAbsStride (from)*axisGetLen(from)){ + if (reduxInvRequiresD0(ctx) && + axisGetD0AbsStride(into) != axisGetD0AbsStride(from)*axisGetLen(from)){ return 0; } - if (reduxInvRequiresDstArg(ctx) && - axisGetDstArgAbsStride(into) != axisGetDstArgAbsStride(from)*axisGetLen(from)){ + if (reduxInvRequiresD1(ctx) && + axisGetD1AbsStride(into) != axisGetD1AbsStride(from)*axisGetLen(from)){ return 0; } - signSrc = (axisGetSrcStride (into)^axisGetSrcStride (from)) < 0; - signDst = (axisGetDstStride (into)^axisGetDstStride (from)) < 0; - signDstArg = (axisGetDstArgStride(into)^axisGetDstArgStride(from)) < 0; - reverseSrc = signSrc; - reverseDst = signDst && reduxInvRequiresDst (ctx); - reverseDstArg = signDstArg && reduxInvRequiresDstArg(ctx); + signS0 = (axisGetS0Stride(into)^axisGetS0Stride(from)) < 0; + signD0 = (axisGetD0Stride(into)^axisGetD0Stride(from)) < 0; + signD1 = (axisGetD1Stride(into)^axisGetD1Stride(from)) < 0; + reverseS0 = signS0; + reverseD0 = signD0 && reduxInvRequiresD0(ctx); + reverseD1 = signD1 && reduxInvRequiresD1(ctx); if (reduxIsSensitive(ctx->op)){ - if(reverseSrc || reverseDst || reverseDstArg){ + if (reverseS0 || reverseD0 || reverseD1){ return 0; } } - if (reduxInvRequiresDst (ctx) && - reduxInvRequiresDstArg(ctx) && - reverseDst != reverseDstArg){ + if (reduxInvRequiresD0(ctx) && + reduxInvRequiresD1(ctx) && + reverseD0 != reverseD1){ /* Either both, or neither, of dst and dstArg must require reversal. 
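 * (Editorial worked example of the checks above; strides in bytes. For a
 *  C-contiguous GA_FLOAT source of shape (4, 5), axis 0 has stride 20 and
 *  axis 1 has stride 4. With into = axis 0 and from = axis 1 the test
 *  20 == 4*5 passes, so the two merge into one axis of length 20 with
 *  stride 4. Had the strides carried opposite signs, one side would first
 *  be "reversed" by adding (len-1)*stride to the flattened offset and
 *  negating the stride, exactly as the code below does.)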
*/ return 0; } - if (reverseSrc){ - ctx->flatSrcOffset += (ssize_t)(axisGetLen(from)-1)*axisGetSrcStride(from); - into->srcStride = -axisGetSrcStride (from); + if (reverseS0){ + ctx->S0Off += (ssize_t)(axisGetLen(from)-1)*axisGetS0Stride(from); + into->s0S = -axisGetS0Stride(from); }else{ - into->srcStride = axisGetSrcStride (from); + into->s0S = axisGetS0Stride(from); } - if (reverseDst){ - ctx->flatDstOffset += (ssize_t)(axisGetLen(from)-1)*axisGetDstStride(from); - into->dstStride = -axisGetDstStride (from); + if (reverseD0){ + ctx->D0Off += (ssize_t)(axisGetLen(from)-1)*axisGetD0Stride(from); + into->d0S = -axisGetD0Stride(from); }else{ - into->dstStride = axisGetDstStride (from); + into->d0S = axisGetD0Stride(from); } - if (reverseDstArg){ - ctx->flatDstArgOffset += (ssize_t)(axisGetLen(from)-1)*axisGetDstArgStride(from); - into->dstArgStride = -axisGetDstArgStride(from); + if (reverseD1){ + ctx->D1Off += (ssize_t)(axisGetLen(from)-1)*axisGetD1Stride(from); + into->d1S = -axisGetD1Stride(from); }else{ - into->dstArgStride = axisGetDstArgStride(from); + into->d1S = axisGetD1Stride(from); } into->len *= axisGetLen(from); @@ -1290,13 +1405,13 @@ static int reduxTryFlattenInto (redux_ctx* ctx, * not touching the axes themselves. */ -static void reduxSortAxisPtrsBy (axis_desc** ptrs, - axis_desc* axes, - size_t numAxes, - int(*fn)(const void*, const void*)){ +static void reduxSortAxisPtrsBy (axis_desc** ptrs, + axis_desc* axes, + size_t numAxes, + int(*fn)(const void*, const void*)){ size_t i; - for(i=0;ikArgTypeCodes = NULL; gr->kSourceCode = NULL; gr->kErrorString = NULL; @@ -1323,23 +1438,27 @@ static int reduxGenInit (GpuReduction* gr){ * @brief Begin inferring the properties of the reduction operator. */ -static int reduxGenInferProperties (GpuReduction* gr){ - int i, ret; +static int reduxGenInferProperties (GpuReduction* gr){ + int i; /** * Insane arguments? */ - if(gr->ndr <= 0){ + if (gr->op < 0 || gr->op >= GA_REDUCE_ENDSUPPORTED){ + return reduxGenCleanupMsg(gr, GA_INVALID_ERROR, + "Unknown reduction operation!\n"); + } + if (gr->ndr <= 0){ return reduxGenCleanupMsg(gr, GA_INVALID_ERROR, "No reduction axes!\n"); } - if(gr->ndd < 0){ + if (gr->ndd < 0){ return reduxGenCleanupMsg(gr, GA_INVALID_ERROR, - "Destination has less than 0 dimensions!\n"); + "Destination tensor has less than 0 rank!\n"); } - if(gr->flags != 0){ + if (gr->flags != 0){ return reduxGenCleanupMsg(gr, GA_INVALID_ERROR, "\"flags\" must be set to 0!\n"); } @@ -1377,180 +1496,307 @@ static int reduxGenInferProperties (GpuReduction* gr){ * Type management. * * - Deal with the various typecodes. - * - Determine initializer and error out if reduction unsupported on that - * datatype. */ - gr->dstTypeCode = gr->srcTypeCode; - gr->dstArgTypeCode = GA_SSIZE; - gr->idxTypeCode = GA_SSIZE; - switch (gr->srcTypeCode){ + gr->TD0tc = gr->TS0tc; + gr->TD1tc = GA_SSIZE; + gr->TS32tc = GA_INT; + gr->TU32tc = GA_UINT; + gr->TS64tc = GA_LONG; + gr->TU64tc = GA_ULONG; + switch(gr->op){ + case GA_REDUCE_AND: + case GA_REDUCE_OR: + case GA_REDUCE_XOR: + if (reduxIsFloatingPoint(gr->TS0tc)){ + return reduxGenCleanupMsg(gr, GA_INVALID_ERROR, + "Bitwise operations not applicable to floating-point datatypes!\n"); + } + break; + default: + break; + } + reduxGenSetKTypes(gr); + + + /** + * Compute number of kernel arguments and construct kernel argument + * typecode list. 
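 * (Editorial sketch: reduxGenCountArgs and reduxGenSaveArgTypecodes are
 *  declared above and defined later in this file; given the iterator
 *  callback signature they are presumably along these lines, with `user'
 *  pointing at an int counter.)
 *
 *     static void reduxGenCountArgs       (const GpuReduction* gr,
 *                                          int typecode, const char* typeName,
 *                                          const char* baseName, int num,
 *                                          void* user){
 *         (*(int*)user)++;                       // one more kernel argument
 *     }
 *     static void reduxGenSaveArgTypecodes(const GpuReduction* gr,
 *                                          int typecode, const char* typeName,
 *                                          const char* baseName, int num,
 *                                          void* user){
 *         gr->kArgTypeCodes[(*(int*)user)++] = typecode;
 *     }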
+ */
+
+        reduxGenSetMaxBS(gr);
+        reduxGenIterArgs(gr, reduxGenCountArgs, &gr->kNumArgs);
+        gr->kArgTypeCodes = calloc(gr->kNumArgs, sizeof(*gr->kArgTypeCodes));
+        if (!gr->kArgTypeCodes){
+                return reduxGenCleanupMsg(gr, GA_MEMORY_ERROR,
+                                          "Failed to allocate memory for kernel arguments "
+                                          "typecode list!\n");
+        }
+        i = 0;
+        reduxGenIterArgs(gr, reduxGenSaveArgTypecodes, &i);
+
+
+        /* Generate source code. */
+        return reduxGenSrc(gr);
+}
+
+/**
+ * Compute maximum block size we shall support in generated kernels.
+ */
+
+static void reduxGenSetMaxBS (GpuReduction* gr){
+        gr->maxBS = gr->maxLM/reduxGenGetReduxStateSize(gr);
+        gr->maxBS = gr->maxBS < gr->maxLg ? gr->maxBS : gr->maxLg;
+        gr->maxBS = gr->maxBS < gr->maxL0 ? gr->maxBS : gr->maxL0;
+
+        /**
+         * In practice we want a moderate number of blocks, not just one monolith
+         * that occupies a processor for its entire lifetime. E.g., an NVIDIA GPU
+         * supports 1024 threads / block, but we shall gun for less than that.
+         *
+         * Our heuristic shall be to divide the maximum number of threads per
+         * block by 4, so that there are 4 times more blocks than there normally
+         * would be. This helps on many fronts:
+         *
+         *   - A smaller "tail effect" when the last huge block must wait its turn
+         *     and then delays the completion of the entire grid.
+         *   - The horizontal reductions take less time per block, and sometimes
+         *     horizontal reduction time can dominate performance.
+         *   - Less time taken for across-thread synchronization; and whenever a
+         *     block's threads are stalled waiting for synchronization, another
+         *     block's threads can fill in with their global memory requests.
+         */
+
+        if (gr->maxBS >= 16){
+                gr->maxBS /= 4;
+        }
+
+        /* Since ceil(log2(maxBS)) is also heavily used, compute it here. */
+        gr->log2MaxBS = reduxCeilLog2(gr->maxBS);
+}
+
+/**
+ * Decide on the TK* accumulator types and initializers we will use.
+ *
+ * Currently, the only special thing we do is to promote the accumulator type
+ * to GA_FLOATx if the source type is GA_HALFx:
+ *
+ *     TPS0 = promotion(TS0)
+ *
+ * Therefore, it is currently always the case that TK0 == TPS0.
+ *
+ * In the future this might become weirder when the accumulator is a Kahan
+ * summation, for instance, and then TK0 != promoted(TS0).
+ *
+ * If the user guarantees to us (perhaps through a flag) that TK1 can be made
+ * narrower than 64-bit unsigned, this is also where we would set it.
+ */
+
+static void reduxGenSetKTypes (GpuReduction* gr){
+        const gpuarray_type *TK0 = NULL, *TK1 = NULL, *TPS0 = NULL;
+        const char* TK0init = NULL;
+
+        /**
+         * Handle TPS0 type promotion....
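 * (Editorial summary of the promotion switch below and of the per-operator
 *  cases that follow it:)
 *
 *     TS0 = GA_HALF,  op = GA_REDUCE_SUM          ->  TPS0 = TK0 = GA_FLOAT, no TK1
 *     TS0 = GA_FLOAT, op = GA_REDUCE_MAXANDARGMAX ->  TK0 = GA_FLOAT, TK1 = GA_SIZE
 *     TS0 = GA_INT,   op = GA_REDUCE_PROD         ->  TPS0 = TK0 = GA_INT,  no TK1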
+ */ + + switch (gr->TS0tc){ case GA_HALF: - gr->accTypeCode = GA_FLOAT; + TPS0 = gpuarray_get_type(GA_FLOAT); break; case GA_HALF2: - gr->accTypeCode = GA_FLOAT2; + TPS0 = gpuarray_get_type(GA_FLOAT2); break; case GA_HALF4: - gr->accTypeCode = GA_FLOAT4; + TPS0 = gpuarray_get_type(GA_FLOAT4); break; case GA_HALF8: - gr->accTypeCode = GA_FLOAT8; + TPS0 = gpuarray_get_type(GA_FLOAT8); break; case GA_HALF16: - gr->accTypeCode = GA_FLOAT16; + TPS0 = gpuarray_get_type(GA_FLOAT16); break; default: - gr->accTypeCode = gr->srcTypeCode; - } - gr->srcTypeStr = gpuarray_get_type(gr->srcTypeCode) ->cluda_name; - gr->dstTypeStr = gpuarray_get_type(gr->dstTypeCode) ->cluda_name; - gr->dstArgTypeStr = gpuarray_get_type(gr->dstArgTypeCode)->cluda_name; - gr->idxTypeStr = gpuarray_get_type(gr->idxTypeCode) ->cluda_name; - gr->accTypeStr = gpuarray_get_type(gr->accTypeCode) ->cluda_name; - if (!gr->srcTypeStr || - !gr->dstTypeStr || - !gr->dstArgTypeStr || - !gr->idxTypeStr || - !gr->accTypeStr ){ - return reduxGenCleanupMsg(gr, GA_INVALID_ERROR, - "Have typecode with no CLUDA name!\n"); + TPS0 = gpuarray_get_type(gr->TS0tc); } + gr->TPS0tc = TPS0->typecode; + + + /** + * Each operator may define and initialize TK0 and/or TK1 any way + * they want. + */ + switch (gr->op){ case GA_REDUCE_SUM: - ret = reduxGetSumInit (gr->accTypeCode, &gr->initVal); + TK0 = TPS0; + reduxGetSumInit (TK0->typecode, &TK0init); + gr->TK0.align = TK0->align; + gr->TK0.size = TK0->size; + sprintf(gr->TK0.defn, "%s", TK0->cluda_name); + sprintf(gr->TK0.init, "%s", TK0init); break; case GA_REDUCE_PRODNZ: case GA_REDUCE_PROD: - ret = reduxGetProdInit(gr->accTypeCode, &gr->initVal); + TK0 = TPS0; + reduxGetProdInit(TK0->typecode, &TK0init); + gr->TK0.align = TK0->align; + gr->TK0.size = TK0->size; + sprintf(gr->TK0.defn, "%s", TK0->cluda_name); + sprintf(gr->TK0.init, "%s", TK0init); break; case GA_REDUCE_MINANDARGMIN: case GA_REDUCE_ARGMIN: case GA_REDUCE_MIN: - ret = reduxGetMinInit (gr->accTypeCode, &gr->initVal); + TK0 = TPS0; + TK1 = gpuarray_get_type(GA_SIZE); + reduxGetMinInit (TK0->typecode, &TK0init); + gr->TK0.align = TK0->align; + gr->TK0.size = TK0->size; + sprintf(gr->TK0.defn, "%s", TK0->cluda_name); + sprintf(gr->TK0.init, "%s", TK0init); + gr->TK1.align = TK1->align; + gr->TK1.size = TK1->size; + sprintf(gr->TK1.defn, "%s", TK1->cluda_name); + sprintf(gr->TK1.init, "0"); break; case GA_REDUCE_MAXANDARGMAX: case GA_REDUCE_ARGMAX: case GA_REDUCE_MAX: - ret = reduxGetMaxInit (gr->accTypeCode, &gr->initVal); + TK0 = TPS0; + TK1 = gpuarray_get_type(GA_SIZE); + reduxGetMaxInit (TK0->typecode, &TK0init); + gr->TK0.align = TK0->align; + gr->TK0.size = TK0->size; + sprintf(gr->TK0.defn, "%s", TK0->cluda_name); + sprintf(gr->TK0.init, "%s", TK0init); + gr->TK1.align = TK1->align; + gr->TK1.size = TK1->size; + sprintf(gr->TK1.defn, "%s", TK1->cluda_name); + sprintf(gr->TK1.init, "0"); break; case GA_REDUCE_ALL: case GA_REDUCE_AND: - ret = reduxGetAndInit (gr->accTypeCode, &gr->initVal); + TK0 = TPS0; + reduxGetAndInit (TK0->typecode, &TK0init); + gr->TK0.align = TK0->align; + gr->TK0.size = TK0->size; + sprintf(gr->TK0.defn, "%s", TK0->cluda_name); + sprintf(gr->TK0.init, "%s", TK0init); break; case GA_REDUCE_ANY: case GA_REDUCE_XOR: case GA_REDUCE_OR: - ret = reduxGetOrInit (gr->accTypeCode, &gr->initVal); + TK0 = TPS0; + reduxGetOrInit (TK0->typecode, &TK0init); + gr->TK0.align = TK0->align; + gr->TK0.size = TK0->size; + sprintf(gr->TK0.defn, "%s", TK0->cluda_name); + sprintf(gr->TK0.init, "%s", TK0init); break; default: - ret 
= GA_UNSUPPORTED_ERROR; - } - if (ret != GA_NO_ERROR){ - return reduxGenCleanupMsg(gr, ret, - "Problem selecting types to be used in reduction!\n"); - } - - - /* Compute floor(log2(gr->log2MaxL)). */ - gr->log2MaxL = gr->maxLg-1; - for(i=1;gr->log2MaxL & (gr->log2MaxL+1);i*=2){ - gr->log2MaxL |= gr->log2MaxL>>i; - } - for(i=0;gr->log2MaxL;i++){ - gr->log2MaxL >>= 1; - } - gr->log2MaxL = i?i:1; - - - /** - * Compute number of kernel arguments and construct kernel argument - * typecode list. - */ - - reduxGenIterArgs(gr, reduxGenCountArgs, 0); - gr->kArgTypeCodes = calloc(gr->kNumArgs, sizeof(*gr->kArgTypeCodes)); - if(!gr->kArgTypeCodes){ - return reduxGenCleanupMsg(gr, GA_MEMORY_ERROR, - "Failed to allocate memory for kernel arguments " - "typecode list!\n"); + ;/* Unreachable */ } - i = 0; - reduxGenIterArgs(gr, reduxGenSaveArgTypecodes, &i); - - - /* Generate source code. */ - return reduxGenSrc(gr); } /** * Iterate over the arguments of the reduction operator. */ -static void reduxGenIterArgs (GpuReduction* gr, - GpuReductionIterFn fn, - void* user){ +static void reduxGenIterArgs (const GpuReduction* gr, + GpuReductionIterFn fn, + void* user){ int k; - fn(gr, GA_INT, "int", "phase", 0, user); - fn(gr, GA_SIZE, "TX", "U", 0, user); - fn(gr, GA_SIZE, "TX", "V", 0, user); - fn(gr, GA_SIZE, "TX", "B", 0, user); - fn(gr, GA_UINT, "unsigned", "D", 0, user); - fn(gr, GA_UINT, "unsigned", "H", 0, user); - fn(gr, GA_UINT, "unsigned", "splitFree", 0, user); - fn(gr, GA_UINT, "unsigned", "splitReduce", 0, user); - for(k=0;k < gr->nds;k++){ - fn(gr, GA_SIZE, "TX", "l%d", k, user); - } - for(k=gr->ndd;k < gr->nds && reduxGenRequiresDstArg(gr);k++){ - fn(gr, GA_SIZE, "TX", "l%dPDim", k, user); - } - fn(gr, GA_BUFFER, "const GLOBAL_MEM char* restrict", "s", 0, user); - fn(gr, GA_SSIZE, "TX", "sOff", 0, user); - for(k=0;k < gr->nds;k++){ - fn(gr, GA_SIZE, "TX", "sJ%d", k, user); - } - if(reduxGenRequiresDst (gr)){ - fn(gr, GA_BUFFER, "GLOBAL_MEM char* restrict", "d", 0, user); - fn(gr, GA_SSIZE, "TX", "dOff", 0, user); - for(k=0;k < gr->ndd;k++){ - fn(gr, GA_SIZE, "TX", "dJ%d", k, user); + /** + * Template selector + */ + + fn(gr, gr->TU32tc, "TU32", "selector", 0, user); + + /** + * "Universal" parameters describing the partitioning of the problem. 
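 * (Editorial note, inferred from the block-decode logic later in this file:
 *  U is the total flattened work volume, V the volume assigned to a single
 *  block, and B the reduction volume per destination element, so multiples
 *  of B are destination-element boundaries. For instance, with
 *
 *     U = 1000000, V = 4096, B = 500, GID_0 = 3
 *     left = 3*4096 = 12288, v = 4096, range = [12288, 16384)
 *
 *  the block is left-misaligned (12288 % 500 != 0), right-misaligned
 *  (16384 % 500 != 0) and still finishes complete destination elements
 *  (12288/500 = 24 != 16384/500 = 32), i.e. the "111" case of the table
 *  further down.)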
+ */ + + fn(gr, gr->TU64tc, "TU64", "U", 0, user); + fn(gr, gr->TU64tc, "TU64", "V", 0, user); + fn(gr, gr->TU64tc, "TU64", "B", 0, user); + fn(gr, gr->TU32tc, "TU32", "D", 0, user); + fn(gr, gr->TU32tc, "TU32", "Dunit", 0, user); + fn(gr, gr->TU32tc, "TU32", "H", 0, user); + + /* Global Lattice Coordinates */ + fn(gr, gr->TU32tc, "TU32", "LSlice", 0, user); + fn(gr, gr->TU32tc, "TU64", "LPadded", 0, user); + for (k=0;k < gr->nds;k++){ + fn(gr, gr->TU64tc, "TU64", "L%d", k, user); + } + for (k=0;k < gr->log2MaxBS;k++){ + fn(gr, gr->TU32tc, "TU32", "L%di", k, user); + } + + /* S0 Lattice */ + if (reduxGenKernelRequiresLatticeS0(gr)){ + fn(gr, GA_BUFFER, "const GLOBAL_MEM char* restrict", "S0", 0, user); + fn(gr, gr->TS64tc, "TS64", "S0Off", 0, user); + for (k=0;k < gr->nds;k++){ + fn(gr, gr->TS64tc, "TS64", "S0J%d", k, user); } - } - if(reduxGenRequiresDstArg(gr)){ - fn(gr, GA_BUFFER, "GLOBAL_MEM char* restrict", "a", 0, user); - fn(gr, GA_SSIZE, "TX", "aOff", 0, user); - for(k=0;k < gr->ndd;k++){ - fn(gr, GA_SIZE, "TX", "aJ%d", k, user); + for (k=0;k < gr->log2MaxBS;k++){ + fn(gr, gr->TS64tc, "TS64", "S0S%di", k, user); } } - fn(gr, GA_BUFFER, "GLOBAL_MEM char* restrict", "w", 0, user); - if(reduxGenKernelRequiresDst (gr)){ - fn(gr, GA_SSIZE, "TX", "wdOff", 0, user); - fn(gr, GA_SSIZE, "TX", "pdOff", 0, user); - } - if(reduxGenKernelRequiresDstArg(gr)){ - fn(gr, GA_SSIZE, "TX", "waOff", 0, user); - fn(gr, GA_SSIZE, "TX", "paOff", 0, user); - } - for(k=0;k < gr->log2MaxL;k++){ - fn(gr, GA_UINT, "unsigned", "ibs%d", k, user); - } - for(k=0;k < gr->log2MaxL;k++){ - fn(gr, GA_UINT, "unsigned", "ibp%d", k, user); + + /* d0 Lattice */ + if (reduxGenKernelRequiresLatticeD0(gr)){ + fn(gr, GA_BUFFER, "GLOBAL_MEM char* restrict", "D0", 0, user); + fn(gr, gr->TS64tc, "TS64", "D0Off", 0, user); + for (k=0;k < gr->ndd;k++){ + fn(gr, gr->TS64tc, "TS64", "D0J%d", k, user); + } + for (k=0;k < gr->log2MaxBS;k++){ + fn(gr, gr->TS64tc, "TS64", "D0S%di", k, user); + } } - for(k=0;k < gr->log2MaxL && reduxGenRequiresDstArg(gr);k++){ - fn(gr, GA_SIZE, "TX", "ibl%dPDim", k, user); + + /* D1 Lattice */ + if (reduxGenKernelRequiresLatticeD1(gr)){ + fn(gr, GA_BUFFER, "GLOBAL_MEM char* restrict", "D1", 0, user); + fn(gr, gr->TS64tc, "TS64", "D1Off", 0, user); + for (k=0;k < gr->ndd;k++){ + fn(gr, gr->TS64tc, "TS64", "D1J%d", k, user); + } + for (k=0;k < gr->log2MaxBS;k++){ + fn(gr, gr->TS64tc, "TS64", "D1S%di", k, user); + } } - for(k=0;k < gr->log2MaxL;k++){ - fn(gr, GA_SSIZE, "TX", "ibsOff%d", k, user); + + /* I0 Lattice */ + if (reduxGenKernelRequiresLatticeI0(gr)){ + for (k=0;k < gr->nds;k++){ + fn(gr, gr->TS64tc, "TS64", "I0J%d", k, user); + } + for (k=0;k < gr->log2MaxBS;k++){ + fn(gr, gr->TS64tc, "TS64", "I0S%di", k, user); + } } - for(k=0;k < gr->log2MaxL && reduxGenRequiresDst (gr);k++){ - fn(gr, GA_SSIZE, "TX", "ibdOff%d", k, user); + + /* Workspace */ + if (reduxGenKernelRequiresWspace(gr)){ + fn(gr, GA_BUFFER, "GLOBAL_MEM char* restrict", "W", 0, user); + if (reduxGenKernelRequiresStateK0(gr)){ + fn(gr, gr->TS64tc, "TS64", "W0Off", 0, user); + fn(gr, gr->TS64tc, "TS64", "SHMEMK0Off", 0, user); + } + if (reduxGenKernelRequiresStateK1(gr)){ + fn(gr, gr->TS64tc, "TS64", "W1Off", 0, user); + fn(gr, gr->TS64tc, "TS64", "SHMEMK1Off", 0, user); + } } - for(k=0;k < gr->log2MaxL && reduxGenRequiresDstArg(gr);k++){ - fn(gr, GA_SSIZE, "TX", "ibaOff%d", k, user); + + /* Intra-Block Permute Core */ + for (k=0;k < gr->log2MaxBS;k++){ + fn(gr, gr->TU32tc, "TU32", "perm%di", k, user); } } @@ -1561,6 +1807,9 @@ static 
void reduxGenIterArgs (GpuReduction* gr, */ static int reduxGenSrc (GpuReduction* gr){ + sprintf(gr->kName, "reduxKernel%s_f%d_r%d", + reduxGetOpName(gr->op), gr->ndd, gr->ndr); + reduxGenSrcAppend(gr); gr->kSourceCodeLen = gr->s.l; @@ -1580,10 +1829,9 @@ static int reduxGenSrc (GpuReduction* gr){ */ static void reduxGenSrcAppend (GpuReduction* gr){ - reduxGenSrcAppendIncludes (gr); - reduxGenSrcAppendMacroDefs (gr); - reduxGenSrcAppendTypedefs (gr); - reduxGenSrcAppendReduxKernel (gr); + reduxGenSrcAppendIncludes (gr); + reduxGenSrcAppendMacroTypedefs(gr); + reduxGenSrcAppendReduxKernel (gr); } static void reduxGenSrcAppendIncludes (GpuReduction* gr){ srcbAppends(&gr->srcGen, "/* Includes */\n"); @@ -1592,61 +1840,80 @@ static void reduxGenSrcAppendIncludes (GpuReduction* gr){ srcbAppends(&gr->srcGen, "\n"); srcbAppends(&gr->srcGen, "\n"); } -static void reduxGenSrcAppendMacroDefs (GpuReduction* gr){ - int i; +static void reduxGenSrcAppendMacroTypedefs(GpuReduction* gr){ + /** + * Typedefs of various types. + */ + + if (reduxGenRequiresS0(gr)){ + srcbAppendf(&gr->srcGen, "typedef %-20s TS0;\n", gpuarray_get_type(gr->TS0tc )->cluda_name); + srcbAppendf(&gr->srcGen, "typedef %-20s TPS0;\n", gpuarray_get_type(gr->TPS0tc)->cluda_name); + } + if (reduxGenRequiresD0(gr)){ + srcbAppendf(&gr->srcGen, "typedef %-20s TD0;\n", gpuarray_get_type(gr->TD0tc )->cluda_name); + } + if (reduxGenRequiresD1(gr)){ + srcbAppendf(&gr->srcGen, "typedef %-20s TD1;\n", gpuarray_get_type(gr->TD1tc )->cluda_name); + } + srcbAppendf(&gr->srcGen, "typedef %-20s TS32;\n", gpuarray_get_type(gr->TS32tc)->cluda_name); + srcbAppendf(&gr->srcGen, "typedef %-20s TU32;\n", gpuarray_get_type(gr->TU32tc)->cluda_name); + srcbAppendf(&gr->srcGen, "typedef %-20s TS64;\n", gpuarray_get_type(gr->TS64tc)->cluda_name); + srcbAppendf(&gr->srcGen, "typedef %-20s TU64;\n", gpuarray_get_type(gr->TU64tc)->cluda_name); + if (reduxGenKernelRequiresStateK0(gr)){ + srcbAppendf(&gr->srcGen, "typedef %-20s TK0;\n", gr->TK0.defn); + } + if (reduxGenKernelRequiresStateK1(gr)){ + srcbAppendf(&gr->srcGen, "typedef %-20s TK1;\n", gr->TK1.defn); + } + srcbAppendf(&gr->srcGen, "\n\n\n\n"); + /** * DECLREDUXSTATE, INITREDUXSTATE and SETREDUXSTATE macros. 
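 * (Editorial illustration: for a GA_FLOAT sum reductor with ndd = 2 and
 *  ndr = 1, the typedefs above plus the macros below would emit roughly the
 *  following prelude into the kernel named reduxKernelSum_f2_r1; exact CLUDA
 *  type spellings may differ.)
 *
 *     typedef float TS0;   typedef float TPS0;  typedef float TD0;
 *     typedef int   TS32;  typedef uint  TU32;
 *     typedef long  TS64;  typedef ulong TU64;
 *     typedef float TK0;
 *     #define DECLREDUXSTATE(V, I)      TK0 V;
 *     #define INITREDUXSTATE(V, I)      do{(V) = (0);}while(0)
 *     #define SETREDUXSTATE(V, I, v, i) do{(V) = (v);}while(0)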
*/ - if ( reduxGenKernelRequiresDst(gr) && reduxGenKernelRequiresDstArg(gr)){ + if ( reduxGenKernelRequiresStateK0(gr) && reduxGenKernelRequiresStateK1(gr)){ srcbAppendf(&gr->srcGen, - "#define DECLREDUXSTATE(V, I) TK V;TX I;\n" - "#define INITREDUXSTATE(V, I) do{(V) = %s;(I) = 0;}while(0)\n" + "#define DECLREDUXSTATE(V, I) TK0 V;TK1 I;\n" + "#define INITREDUXSTATE(V, I) do{(V) = (%s);(I) = (%s);}while(0)\n" "#define SETREDUXSTATE(V, I, v, i) do{(V) = (v);(I) = (i);}while(0)\n", - gr->initVal); - }else if ( reduxGenKernelRequiresDst(gr) && !reduxGenKernelRequiresDstArg(gr)){ + gr->TK0.init, gr->TK1.init); + }else if ( reduxGenKernelRequiresStateK0(gr) && !reduxGenKernelRequiresStateK1(gr)){ srcbAppendf(&gr->srcGen, - "#define DECLREDUXSTATE(V, I) TK V;\n" - "#define INITREDUXSTATE(V, I) do{(V) = %s;}while(0)\n" + "#define DECLREDUXSTATE(V, I) TK0 V;\n" + "#define INITREDUXSTATE(V, I) do{(V) = (%s);}while(0)\n" "#define SETREDUXSTATE(V, I, v, i) do{(V) = (v);}while(0)\n", - gr->initVal); - }else if (!reduxGenKernelRequiresDst(gr) && reduxGenKernelRequiresDstArg(gr)){ + gr->TK0.init); + }else if (!reduxGenKernelRequiresStateK0(gr) && reduxGenKernelRequiresStateK1(gr)){ srcbAppendf(&gr->srcGen, - "#define DECLREDUXSTATE(V, I) TX I;\n" - "#define INITREDUXSTATE(V, I) do{(I) = 0;}while(0)\n" - "#define SETREDUXSTATE(V, I, v, i) do{(I) = (i);}while(0)\n"); + "#define DECLREDUXSTATE(V, I) TK1 I;\n" + "#define INITREDUXSTATE(V, I) do{(I) = (%s);}while(0)\n" + "#define SETREDUXSTATE(V, I, v, i) do{(I) = (i);}while(0)\n", + gr->TK1.init); } /** - * LOADS(v, p) macro. + * LOADS0(v, p) macro. + * + * Loads a TK0-typed value v from a TS-typed source pointer p, promoting + * through type TPS0. * - * Loads a TK-typed value v from a TS-typed source pointer p. + * In some future, TK0 will not equal TPS0, and so a cast as done below will not + * necessarily be valid. Instead it may require an assignment to a struct member. */ - if (gr->srcTypeCode == GA_HALF && gr->accTypeCode == GA_FLOAT){ - srcbAppends(&gr->srcGen, "#define LOADS(v, p) do{(v) = (TK)load_half((const TS* restrict)(p));}while(0)\n"); + if (reduxGenKernelRequiresLatticeS0(gr)){ + if (gr->TS0tc == GA_HALF && gr->TPS0tc == GA_FLOAT){ + srcbAppends(&gr->srcGen, "#define LOADS0(v, p) do{(v) = (TK0)(TPS0)load_half((const TS0* restrict)(p));}while(0)\n"); + }else{ + srcbAppends(&gr->srcGen, "#define LOADS0(v, p) do{(v) = (TK0)(TPS0)*(const TS0* restrict)(p);}while(0)\n"); + } }else{ - srcbAppends(&gr->srcGen, "#define LOADS(v, p) do{(v) = (TK)*(const TS* restrict)(p);}while(0)\n"); + srcbAppends(&gr->srcGen, "#define LOADS0(p, v) do{}while(0)\n"); } - /** - * GETIDX macro. - * - * Expands to the current flattened index. - */ - - srcbAppends (&gr->srcGen, "#define GETIDX ("); - srcbBeginList (&gr->srcGen, " + ", "0"); - srcbAppendElemf(&gr->srcGen, "ti"); - for(i=gr->ndd;inds;i++){ - srcbAppendElemf(&gr->srcGen, "i%d*l%dPDim", i, i); - } - srcbEndList (&gr->srcGen); - srcbAppends (&gr->srcGen, ")\n"); - /** * REDUX macro. * @@ -1654,54 +1921,79 @@ static void reduxGenSrcAppendMacroDefs (GpuReduction* gr){ * flattened index i into reduction states V and I respectively. 
*/ - srcbAppends(&gr->srcGen, "#define REDUX(V, I, v, i) do{ \\\n"); switch (gr->op){ case GA_REDUCE_SUM: - srcbAppendf(&gr->srcGen, " (V) += (v); \\\n"); + srcbAppendf(&gr->srcGen, "#define REDUX(V, I, v, i) do{ \\\n" + " (V) += (v); \\\n" + " }while(0)\n"); break; case GA_REDUCE_PROD: - srcbAppendf(&gr->srcGen, " (V) *= (v); \\\n"); + srcbAppendf(&gr->srcGen, "#define REDUX(V, I, v, i) do{ \\\n" + " (V) *= (v); \\\n" + " }while(0)\n"); break; case GA_REDUCE_PRODNZ: - srcbAppendf(&gr->srcGen, " (V) *= ((v) == 0 ? (%s) : (v)); \\\n", gr->initVal); + srcbAppendf(&gr->srcGen, "#define REDUX(V, I, v, i) do{ \\\n" + " if((v) != 0){(V) *= (v);} \\\n" + " }while(0)\n"); break; case GA_REDUCE_MIN: - srcbAppendf(&gr->srcGen, " (V) = min((V), (v)); \\\n"); + srcbAppendf(&gr->srcGen, "#define REDUX(V, I, v, i) do{ \\\n" + " (V) = min((V), (v)); \\\n" + " }while(0)\n"); break; case GA_REDUCE_MAX: - srcbAppendf(&gr->srcGen, " (V) = max((V), (v)); \\\n"); + srcbAppendf(&gr->srcGen, "#define REDUX(V, I, v, i) do{ \\\n" + " (V) = max((V), (v)); \\\n" + " }while(0)\n"); break; case GA_REDUCE_ARGMIN: case GA_REDUCE_MINANDARGMIN: - srcbAppendf(&gr->srcGen, " (V) = min((V), (v)); \\\n" + srcbAppendf(&gr->srcGen, "#define REDUX(V, I, v, i) do{ \\\n" + " (V) = min((V), (v)); \\\n" " if((V) == (v)){ \\\n" " (I) = (i); \\\n" - " } \\\n"); + " } \\\n" + " }while(0)\n"); break; case GA_REDUCE_ARGMAX: case GA_REDUCE_MAXANDARGMAX: - srcbAppendf(&gr->srcGen, " (V) = max((V), (v)); \\\n" + srcbAppendf(&gr->srcGen, "#define REDUX(V, I, v, i) do{ \\\n" + " (V) = max((V), (v)); \\\n" " if((V) == (v)){ \\\n" " (I) = (i); \\\n" - " } \\\n"); + " } \\\n" + " }while(0)\n"); break; case GA_REDUCE_AND: - srcbAppendf(&gr->srcGen, " (V) &= (v); \\\n"); + srcbAppendf(&gr->srcGen, "#define REDUX(V, I, v, i) do{ \\\n" + " (V) &= (v); \\\n" + " }while(0)\n"); break; case GA_REDUCE_OR: - srcbAppendf(&gr->srcGen, " (V) |= (v); \\\n"); + srcbAppendf(&gr->srcGen, "#define REDUX(V, I, v, i) do{ \\\n" + " (V) |= (v); \\\n" + " }while(0)\n"); break; case GA_REDUCE_XOR: - srcbAppendf(&gr->srcGen, " (V) ^= (v); \\\n"); + srcbAppendf(&gr->srcGen, "#define REDUX(V, I, v, i) do{ \\\n" + " (V) ^= (v); \\\n" + " }while(0)\n"); break; case GA_REDUCE_ALL: - srcbAppendf(&gr->srcGen, " (V) = (V) && (v); \\\n"); + srcbAppendf(&gr->srcGen, "#define REDUX(V, I, v, i) do{ \\\n" + " (V) = (V) && (v); \\\n" + " }while(0)\n"); break; case GA_REDUCE_ANY: - srcbAppendf(&gr->srcGen, " (V) = (V) || (v); \\\n"); + srcbAppendf(&gr->srcGen, "#define REDUX(V, I, v, i) do{ \\\n" + " (V) = (V) || (v); \\\n" + " }while(0)\n"); + break; + default: + /* Unreachable */ break; } - srcbAppends(&gr->srcGen, " }while(0)\n"); /** @@ -1709,55 +2001,112 @@ static void reduxGenSrcAppendMacroDefs (GpuReduction* gr){ * * Performs a horizontal reduction operation, first intra-block permuting * the data and its index and then reducing it till done. - */ - + * + * - If D==LDIM_0, then no horizontal (across-block) reductions are + * really needed. In this case, the permutation tp: + * - Is fully in-bounds (tp < LDIM_0 for all threads) + * - Exists firstly to make it easy to mask writes (hard). + * - Exists secondly to optimize memory write bandwidth (soft). + * and the value H should be equal to D and to LDIM_0 + * - If D= LDIM_0 for some threads) + * - Exists firstly to make it easy to mask writes (hard). + * - Exists secondly to enable a tree reduction (hard). + * - Exists thirdly to optimize memory write bandwidth (soft). 
+ * and the value H must be a power of 2 and shall be set to nextPow2(bs). + * + * E.g. Suppose that a block configuration was D=999, H=1 (bs=999). A + * permutation we might want is + * [0,...,332,333,...,665,666,...,998] + * and we want H = 999. + * E.g. Suppose that a block configuration was D=257, H=3 (bs=771). A + * permutation we might want is + * [0,...,256,512,...,768,1024,...,1280] + * and we want H = 1024. + * E.g. Suppose that a block configuration was D=33, H=17 (bs=561). A + * permutation we might want is + * [0,...,32,64,...,96,128,...,160,...,960,...,992,1024,...,1056] + * and we want H = 1024. + * E.g. Suppose that a block configuration was D=16, H=16 (bs=256). A + * permutation we might want is + * [0,...255] + * and we want H = 256. + * + */ + srcbAppends(&gr->srcGen, - "#define HREDUX(pd, pa, tp, V, I) \\\n" - " do{ \\\n" - " /* Horizontal Reduction */ \\\n" - " SETREDUXSTATE(pd[tp], pa[tp], accV, accI); \\\n" - " local_barrier(); \\\n" - " \\\n" - " h = H; \\\n" - " while(h>1){ \\\n" - " if((h&1) && (LID_0 < D)){ \\\n" - " REDUX(pd[LID_0], pa[LID_0], pd[LID_0 + D*h-D], pa[LID_0 + D*h-D]); \\\n" - " } \\\n" - " h >>= 1; \\\n" - " if(LID_0 < D*h){ \\\n" - " REDUX(pd[LID_0], pa[LID_0], pd[LID_0 + D*h ], pa[LID_0 + D*h ]); \\\n" - " } \\\n" - " local_barrier(); \\\n" - " } \\\n" + "#define HREDUX(SHMEMK0, SHMEMK1, perm, k0, k1) \\\n" + " do{ \\\n" + " if(D < LDIM_0){ \\\n" + " /* SPECIAL FIRST REDUCTION: */ \\\n" + " h = H; \\\n" + " \\\n" + " /* LO Half */ \\\n" + " if(perm < h){ \\\n" + " SETREDUXSTATE(SHMEMK0[perm], \\\n" + " SHMEMK1[perm], \\\n" + " k0, \\\n" + " k1); \\\n" + " } \\\n" + " local_barrier(); \\\n" + " \\\n" + " /* HI Half */ \\\n" + " if(perm >= h){ \\\n" + " REDUX (SHMEMK0[perm-h], \\\n" + " SHMEMK1[perm-h], \\\n" + " k0, \\\n" + " k1); \\\n" + " } \\\n" + " local_barrier(); \\\n" + " \\\n" + " /* Follow-up reductions */ \\\n" + " while((h >>= 1) >= D){ \\\n" + " if(LID_0 < h){ \\\n" + " REDUX(SHMEMK0[LID_0], \\\n" + " SHMEMK1[LID_0], \\\n" + " SHMEMK0[LID_0+h], \\\n" + " SHMEMK1[LID_0+h]); \\\n" + " } \\\n" + " local_barrier(); \\\n" + " } \\\n" + " }else{ \\\n" + " /* All-permute */ \\\n" + " SETREDUXSTATE(SHMEMK0[perm], \\\n" + " SHMEMK1[perm], \\\n" + " k0, \\\n" + " k1); \\\n" + " local_barrier(); \\\n" + " } \\\n" " }while(0)\n"); /** - * STORED macro. + * STORED0 macro. * - * Stores a TK-typed value v into a TS-typed destination pointer p. + * Stores a TK0-typed value v into a TD0-typed destination pointer p. */ - if (reduxGenRequiresDst(gr)){ - if (gr->dstTypeCode == GA_HALF && gr->accTypeCode == GA_FLOAT){ - srcbAppends(&gr->srcGen, "#define STORED(p, v) do{store_half((TD* restrict)(p), (v));}while(0)\n"); + if (reduxGenKernelRequiresLatticeD0(gr)){ + if (gr->TD0tc == GA_HALF && gr->TPS0tc == GA_FLOAT){ + srcbAppends(&gr->srcGen, "#define STORED0(p, v) do{store_half((TD0* restrict)(p), (v));}while(0)\n"); }else{ - srcbAppends(&gr->srcGen, "#define STORED(p, v) do{*(TD* restrict)(p) = (v);}while(0)\n"); + srcbAppends(&gr->srcGen, "#define STORED0(p, v) do{*(TD0* restrict)(p) = (v);}while(0)\n"); } }else{ - srcbAppends(&gr->srcGen, "#define STORED(p, v) do{}while(0)\n"); + srcbAppends(&gr->srcGen, "#define STORED0(p, v) do{}while(0)\n"); } /** - * STOREA macro. + * STORED1 macro. * - * Stores a TX-typed value v into a TA-typed destination pointer p. + * Stores a TK1-typed value v into a TD1-typed destination pointer p. 
*/ - if (reduxGenRequiresDstArg(gr)){ - srcbAppends(&gr->srcGen, "#define STOREA(p, v) do{*(TA* restrict)(p) = (v);}while(0)\n"); + if (reduxGenKernelRequiresLatticeD1(gr)){ + srcbAppends(&gr->srcGen, "#define STORED1(p, v) do{*(TD1* restrict)(p) = (v);}while(0)\n"); }else{ - srcbAppends(&gr->srcGen, "#define STOREA(p, v) do{}while(0)\n"); + srcbAppends(&gr->srcGen, "#define STORED1(p, v) do{}while(0)\n"); } @@ -1765,682 +2114,694 @@ static void reduxGenSrcAppendMacroDefs (GpuReduction* gr){ * DIVIDECEIL macro. */ - srcbAppends(&gr->srcGen, "#define DIVIDECEIL(a,b) (((a)+(b)-1)/(b))\n"); - - srcbAppends(&gr->srcGen, "\n\n\n\n"); -} -static void reduxGenSrcAppendTypedefs (GpuReduction* gr){ - srcbAppendf(&gr->srcGen, "typedef %-20s TS;\n", gr->srcTypeStr); - srcbAppendf(&gr->srcGen, "typedef %-20s TD;\n", gr->dstTypeStr); - srcbAppendf(&gr->srcGen, "typedef %-20s TA;\n", gr->dstArgTypeStr); - srcbAppendf(&gr->srcGen, "typedef %-20s TX;\n", gr->idxTypeStr); - srcbAppendf(&gr->srcGen, "typedef %-20s TK;\n", gr->accTypeStr); - srcbAppendf(&gr->srcGen, "\n\n\n\n"); + srcbAppends(&gr->srcGen, "#define DIVIDECEIL(a,b) (((a)+(b)-1)/(b))\n\n\n\n\n"); } static void reduxGenSrcAppendReduxKernel (GpuReduction* gr){ reduxGenSrcAppendPrototype (gr); srcbAppends (&gr->srcGen, "{\n"); - reduxGenSrcAppendBlockDecode (gr); - reduxGenSrcAppendThreadDecode(gr); - srcbAppends (&gr->srcGen, " /**\n" - " * PERFORM REDUCTION.\n" - " * \n" - " * We either perform Phase 0 or Phase 1 according to our argument.\n" - " * \n" - " * Phase 0 is the primary worker and, in special cases, is the only necessary phase.\n" - " * However, it may occasionally do only part of a reduction, in which case it leaves\n" - " * the partial reduction results in a workspace that is then read by Phase 1.\n" - " * \n" - " * Phase 1 is a fixup phase that collects any partial reduction results from Phase 0\n" - " * and completes the reduction before writing to the final destination.\n" - " */\n" - " \n" - " if(phase==0){\n"); - reduxGenSrcAppendPhase0 (gr); - srcbAppends (&gr->srcGen, " }else{\n"); + reduxGenSrcAppendDecode (gr); + + /** + * PERFORM REDUCTION. + * + * We either perform Phase 0 or Phase 1 according to the selector argument. + * + * Phase 0 is the primary worker and, in special cases, is the only + * necessary phase. However, it may occasionally do only part of a + * reduction, in which case it leaves the partial reduction results in a + * workspace that is then read by Phase 1. + * + * Phase 1 is a fixup phase that collects any partial reduction results + * from Phase 0 and completes the reduction before writing to the final + * destination. + * + * The template selector indicates one of several specialized versions of + * the kernel to be executed. It indicates phase, which is the split axis, + * and which axis if any is "huge". 
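 * (Editorial note: judging from the dispatch just below and from the decode
 *  code, bit 0 of the selector selects Phase 1, bit 1 selects whether the
 *  innermost free axis or the innermost reduction axis carries the
 *  intra-block split (see the DIVIDECEIL(L..., LSlice) terms in the decode),
 *  and the remaining even values 4..14 select the further specializations
 *  mentioned above, such as the "huge axis" variants.)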
+ */ + + srcbAppends (&gr->srcGen, " if(selector&1){\n"); reduxGenSrcAppendPhase1 (gr); + srcbAppends (&gr->srcGen, " }else if(selector == 0){\n"); + reduxGenSrcAppendPhase0 (gr, 0); + srcbAppends (&gr->srcGen, " }else if(selector == 2){\n"); + reduxGenSrcAppendPhase0 (gr, 2); + srcbAppends (&gr->srcGen, " }else if(selector == 4){\n"); + reduxGenSrcAppendPhase0 (gr, 4); + srcbAppends (&gr->srcGen, " }else if(selector == 6){\n"); + reduxGenSrcAppendPhase0 (gr, 6); + srcbAppends (&gr->srcGen, " }else if(selector == 8){\n"); + reduxGenSrcAppendPhase0 (gr, 8); + srcbAppends (&gr->srcGen, " }else if(selector == 10){\n"); + reduxGenSrcAppendPhase0 (gr, 10); + srcbAppends (&gr->srcGen, " }else if(selector == 12){\n"); + reduxGenSrcAppendPhase0 (gr, 12); + srcbAppends (&gr->srcGen, " }else if(selector == 14){\n"); + reduxGenSrcAppendPhase0 (gr, 14); srcbAppends (&gr->srcGen, " }\n"); srcbAppends (&gr->srcGen, "}\n"); } static void reduxGenSrcAppendPrototype (GpuReduction* gr){ int i=0; - - srcbAppends (&gr->srcGen, "KERNEL void redux("); + + srcbAppendf(&gr->srcGen, + "KERNEL void\n" + "#if defined(__CUDACC__)\n" + "__launch_bounds__(%d, 8)\n" + "#endif\n", + gr->maxBS); + srcbAppendf(&gr->srcGen, + "%s(\n ", + gr->kName); reduxGenIterArgs(gr, reduxGenAppendArg, &i); - srcbAppends (&gr->srcGen, ")"); + srcbAppends(&gr->srcGen, ")"); } -static void reduxGenSrcAppendBlockDecode (GpuReduction* gr){ +static void reduxGenSrcAppendDecode (GpuReduction* gr){ int i; - + srcbAppends(&gr->srcGen, " GA_DECL_SHARED_BODY(char, SHMEM)\n" - " DECLREDUXSTATE(accV, accI)\n" - " DECLREDUXSTATE(tmpV, tmpI)\n" - " INITREDUXSTATE(accV, accI);\n" - " \n" - " /**\n" - " * +-------------+-------------+------------+---------------------------------+\n" - " * | misalignL | misalignR | doFinish | DESCRIPTION |\n" - " * +-------------+-------------+------------+---------------------------------+\n" - " * | 0 | 0 | 0 | Impossible unless v == 0, |\n" - " * | | | | which is forbidden. |\n" - " * | | | | |\n" - " * | 0 | 0 | 1 | V % B == 0. Each block |\n" - " * | | | | handles integer number of |\n" - " * | | | | destination elements, no |\n" - " * | | | | partial results are required, |\n" - " * | | | | workspace is unused. |\n" - " * | | | | |\n" - " * | 0 | 1 | 0 | V < B. Block begins aligned |\n" - " * | | | | but ends misaligned, before |\n" - " * | | | | the end of its first element. |\n" - " * | | | | Partial result written to |\n" - " * | | | | right-half of array. |\n" - " * | | | | |\n" - " * | 0 | 1 | 1 | V > B, V % B != 0. Block |\n" - " * | | | | begins aligned but ends |\n" - " * | | | | misaligned, after the end of |\n" - " * | | | | its first element. |\n" - " * | | | | First 1 or more complete |\n" - " * | | | | elements written out directly |\n" - " * | | | | to destination. |\n" - " * | | | | Partial result of last element |\n" - " * | | | | written to right-half of array.|\n" - " * | | | | |\n" - " * | 1 | 0 | 0 | Impossible unless v == 0, |\n" - " * | | | | which is forbidden. |\n" - " * | | | | |\n" - " * | 1 | 0 | 1 | V % B != 0. Partial result of |\n" - " * | | | | first element written to left- |\n" - " * | | | | half of array. Zero or more |\n" - " * | | | | complete reductions performed |\n" - " * | | | | and written directly to |\n" - " * | | | | destination. Block ends |\n" - " * | | | | aligned. |\n" - " * | | | | |\n" - " * | 1 | 1 | 0 | V < B. Block begins misaligned |\n" - " * | | | | and ends misaligned, before |\n" - " * | | | | the end of its first element. 
|\n" - " * | | | | Partial result written to at |\n" - " * | | | | least right-half of array. |\n" - " * | | | | |\n" - " * | 1 | 1 | 1 | V % B != 0. Block begins |\n" - " * | | | | misaligned and ends misaligned,|\n" - " * | | | | after the end of its first |\n" - " * | | | | element. |\n" - " * | | | | Partial result of first element|\n" - " * | | | | written to left-half of array. |\n" - " * | | | | Partial result of last element |\n" - " * | | | | written to right-half of array.|\n" - " * | | | | 0 or more complete elements |\n" - " * | | | | written out directly to |\n" - " * | | | | destination. |\n" - " * +-------------+-------------+------------+---------------------------------+\n" - " * \n" - " * Possible configurations of blocks:\n" - " * If V % B == 0: 001\n" - " * If V < B: 010, 110, 111, 101\n" - " * If V > B: 011, 111, 101\n" - " * \n" - " * Possible configurations for collector blocks (responsible for gathering of\n" - " * results to the left):\n" - " * 101, 111 (misalignL && doFinish)\n" - " * \n" - " * Possible configurations for left-neighbours of collector blocks\n" - " * 110 (any number 0+), then exactly one of:\n" - " * 010, 011, 111\n" - " * \n" - " * Conclusion:\n" - " * - In Phase 0:\n" - " * - Always make a right-write if misalignR (010, 011, 110, 111).\n" - " * - Make a left -write at least if collector block (101, 111).\n" - " * - In Phase 1:\n" - " * - Exit if not collector block (101, 111)\n" - " * - If collector block,\n" - " * - Left -read from self\n" - " * - Right-read from all left-neighbours with same write-target.\n" - " * \n" - " * Code Structure perfectly satisfying conclusion:\n" - " * \n" - " * if(misalignL){\n" - " * while(v > 0){\n" - " * v--;\n" - " * REDUX();\n" - " * ReduxLoopIncs_CONTINUE;\n" - " * HREDUX();\n" - " * WSLeftWrite();\n" - " * REINIT();\n" - " * FreeLoopIncs_BREAK;\n" - " * BREAK;\n" - " * }\n" - " * }\n" - " * while(v > 0){\n" - " * v--;\n" - " * REDUX();\n" - " * ReduxLoopIncs_CONTINUE;\n" - " * HREDUX();\n" - " * DstWrite();\n" - " * REINIT();\n" - " * FreeLoopIncs_CONTINUE;\n" - " * BREAK;\n" - " * }\n" - " * if(misalignR){\n" - " * HREDUX();\n" - " * WSRightWrite();\n" - " * }\n" - " * \n" - " * Code Walkthrough:\n" - " * \n" - " * 000, 100: Impossible, can be ignored.\n" - " * 001: Only master loop entered, handles exact integer number of destinations.\n" - " * 010: Master loop entered but broken on vcount before HREDUX() reached.\n" - " * No reinit executed on breakout. HREDUX(), followed by WSRightWrite() of\n" - " * partial result.\n" - " * 011: Master loop entered for at least 1 full destination, then broken on\n" - " * vcount before HREDUX() reached. No reinit executed on breakout. HREDUX()\n" - " * followed by WSRightWrite() of partial result.\n" - " * 101: Left-misalign loop entered and completes a reduction. HREDUX()\n" - " * performed, WSLeftWrite() performed, reinitialization, bump of outer\n" - " * loop counters, then breakout. Master loop entered for 0 or more complete\n" - " * destination elements involving full writeouts to destination and reinit.\n" - " * Aligned on both misalignL and master loop breakouts. No entry into\n" - " * misalignR fixup.\n" - " * 110: Left-misalign loop entered, breaks on vcount before HREDUX(). No reinit\n" - " * executed on breakout. Master loop not entered. HREDUX(), followed by\n" - " * WSRightWrite() of partial result.\n" - " * 111: Left-misalign loop entered and completes a reduction. 
HREDUX() performed,\n" - " * WSLeftWrite() performed, reinit, bump of outer loop counters, breakout.\n" - " * Master loop entered for 0 or more complete destination elements\n" - " * involving full writeout to destination and reinit.\n" - " * Master loop broken on vcount before HREDUX(). misalignR fixup entered,\n" - " * HREDUX(), WSRightWrite().\n" - " */\n" + " DECLREDUXSTATE(tmpK0, I0)\n" + " DECLREDUXSTATE(K0, K1)\n" + " INITREDUXSTATE(K0, K1);\n" " \n" - " TX start = GID_0 * V;\n" - " if(start >= U){return;}\n" - " TX v = U-start < V ? U-start : V;\n" + " TU64 z, h, k;\n" " \n" - " int misalignL = (start+0)%B != 0;\n" - " int misalignR = (start+v)%B != 0;\n" - " int doFinish = (start+0)/B != (start+v)/B;\n" + /** + * +-------------+-------------+------------+---------------------------------+ + * | misalignL | misalignR | doFinish | DESCRIPTION | + * +-------------+-------------+------------+---------------------------------+ + * | 0 | 0 | 0 | Impossible unless v == 0, | + * | | | | which is forbidden. | + * | | | | | + * | 0 | 0 | 1 | V % B == 0. Each block | + * | | | | handles integer number of | + * | | | | destination elements, no | + * | | | | partial results are required, | + * | | | | workspace is unused. | + * | | | | | + * | 0 | 1 | 0 | V < B. Block begins aligned | + * | | | | but ends misaligned, before | + * | | | | the end of its first element. | + * | | | | Partial result written to | + * | | | | right-half of array. | + * | | | | | + * | 0 | 1 | 1 | V > B, V % B != 0. Block | + * | | | | begins aligned but ends | + * | | | | misaligned, after the end of | + * | | | | its first element. | + * | | | | First 1 or more complete | + * | | | | elements written out directly | + * | | | | to destination. | + * | | | | Partial result of last element | + * | | | | written to right-half of array.| + * | | | | | + * | 1 | 0 | 0 | Impossible unless v == 0, | + * | | | | which is forbidden. | + * | | | | | + * | 1 | 0 | 1 | V % B != 0. Partial result of | + * | | | | first element written to left- | + * | | | | half of array. Zero or more | + * | | | | complete reductions performed | + * | | | | and written directly to | + * | | | | destination. Block ends | + * | | | | aligned. | + * | | | | | + * | 1 | 1 | 0 | V < B. Block begins misaligned | + * | | | | and ends misaligned, before | + * | | | | the end of its first element. | + * | | | | Partial result written to at | + * | | | | least right-half of array. | + * | | | | | + * | 1 | 1 | 1 | V % B != 0. Block begins | + * | | | | misaligned and ends misaligned,| + * | | | | after the end of its first | + * | | | | element. | + * | | | | Partial result of first element| + * | | | | written to left-half of array. | + * | | | | Partial result of last element | + * | | | | written to right-half of array.| + * | | | | 0 or more complete elements | + * | | | | written out directly to | + * | | | | destination. | + * +-------------+-------------+------------+---------------------------------+ + * + * Possible configurations of blocks: + * If V % B == 0: 001 + * If V < B: 010, 110, 111, 101 + * If V > B: 011, 111, 101 + * + * Possible configurations for collector blocks (responsible for gathering of + * results to their right): + * 010, 011, 111 (misalignR && (!misalignL || doFinish)) + * + * Possible configurations for right-neighbours of collector blocks + * 110 (any number 0+), then exactly one of: + * 101, 111 + * + * Conclusion: + * - In Phase 0: + * - Always make a right-write if collector block (010, 011, 111). 
+ * - Always make a left -write if misalignL (101, 110, 111). + * - In Phase 1: + * - Exit if not collector block (010, 011, 111) + * - If collector block, + * - Right-read from self + * - Left -read from all right-neighbours with same write-target. + * + * Code Structure perfectly satisfying conclusion: + * + * if(misalignR){ + * while(v > 0){ + * v--; + * REDUX(); + * ReduxLoopDecs_CONTINUE; + * HREDUX(); + * WSRightWrite(); + * REINIT(); + * FreeLoopDecs_BREAK; + * BREAK; + * } + * } + * while(v > 0){ + * v--; + * REDUX(); + * ReduxLoopDecs_CONTINUE; + * HREDUX(); + * DstWrite(); + * REINIT(); + * FreeLoopDecs_CONTINUE; + * BREAK; + * } + * if(misalignL){ + * HREDUX(); + * WSLeftWrite(); + * } + * + * Code Walkthrough: + * + * 000, 100: -- Impossible, can be ignored. + * 001: -- Only master loop entered, handles exact integer number of destinations. + * 010: -R Right-misalign loop entered, completes a reduction. HREDUX, partial + * result right-written to workspace, reinit, bump of free loop counters, + * break simultaneously on vcount and free loop breaks. + * Master loop not entered. Left-misalign fixup not entered. + * 011: -R Right-misalign loop entered, completes a reduction. HREDUX, partial + * result right-written to workspace, reinit, bump of free loop counters, + * break on free loop breaks. Master loop entered for 1+ complete + * destination elements written direct to destination. Break on vcount. + * Left-misalign fixup not entered. + * 101: L- Master loop entered for 0+ complete destination elements written + * directly to destination. Master loop broken on vcount. Left-misalign + * fixup entered, HREDUX, partial result left-written to workspace. + * 110: L- Right-misalign loop entered, broken on vcount before HREDUX. No + * reinit. Master loop not entered. Left-misalign fixup entered, HREDUX, + * partial result left-written to workspace. + * 111: LR Right-misalign loop entered and completes a reduction. HREDUX, partial + * result right-written to workspace, reinit, bump of free loop counters, + * breakout. Master loop entered for 0 or more complete destination + * elements written directly to destination. Master loop broken on vcount + * before HREDUX. Right-misalign fixup entered, HREDUX, partial result + * left-written to workspace. + */ " \n" - " /**\n" - " * Decode BLOCK start point.\n" - " * \n" - " * For the purpose of decoding the start point, the split axis's \"length\"\n" - " * is divided by either splitReduce or splitFree and rounded up. Therefore,\n" - " * for those axes the true computed initial starting point must be\n" - " * multiplied by either splitReduce or splitFree.\n" - " * \n" - " * Since we provide not strides but \"jumps\" to the kernel (to move as many\n" - " * things as possible into constant memory and out of the fast path), we\n" - " * must also convert jumps to strides in preparation for offsetting the\n" - " * base pointers to their starting point.\n" - " */\n" + " TU64 left = GID_0 * V;\n" + " if(left >= U){return;}\n" + " TU64 v = U-left < V ? 
U-left : V;\n" " \n" - " TX z, h, k;\n" - " unsigned Dunit = D/splitFree;\n"); - if(gr->ndd > 0){ - srcbAppendf(&gr->srcGen, - " TX l%dDiv = DIVIDECEIL(l%d, splitFree);\n", - gr->ndd-1, gr->ndd-1); - } - if(gr->ndr > 0){ - srcbAppendf(&gr->srcGen, - " TX l%dDiv = DIVIDECEIL(l%d, splitReduce);\n", - gr->nds-1, gr->nds-1); - } - srcbAppends(&gr->srcGen, + " TS32 misalignL = (left+0)%B != 0;\n" + " TS32 misalignR = (left+v)%B != 0;\n" + " TS32 doFinish = (left+0)/B != (left+v)/B;\n" + " TS32 collector = misalignR && (!misalignL || doFinish);\n" " \n" - " z = start;\n"); - for(i=gr->nds-1;i>=0;i--){ - if(i == gr->nds-1 || i == gr->ndd-1){ + " TU32 iSplit = LID_0/(LDIM_0/LSlice);\n" + " \n"); + /** + * Decode Intra-/Inter-Block start point. + * + * For the purpose of decoding the start point, the split axis's \"length\" + * is divided by either splitReduce or splitFree and rounded up. Therefore, + * for those axes the true computed initial starting point must be + * multiplied by either splitReduce or splitFree. + * + * Since we provide not strides but \"jumps\" to the kernel (to move as many + * things as possible into constant memory and out of the fast path), we + * must also convert jumps to strides in preparation for offsetting the + * base pointers to their starting point. + * + * This also involves computing the intra-block coordinate of a thread in a + * up-to-log2(MAX_BLOCK_THREADS)-rank coordinate system, then using + * those coordinates to compute intrablock S0/D0/D1/I0/permute targets. + */ + + for (i=gr->nds-1;i>=0;i--){ + if (i == gr->nds-1 && i == gr->ndd-1){ srcbAppendf(&gr->srcGen, - " TX i%d = z %% l%dDiv;z /= l%dDiv;\n", - i, i, i); - }else{ + " TU64 _L%d = DIVIDECEIL(L%d, LSlice);\n", i, i); + }else if (i == gr->nds-1){ srcbAppendf(&gr->srcGen, - " TX i%d = z %% l%d; z /= l%d;\n", - i, i, i); - } - } - srcbAppends(&gr->srcGen, " \n"); - for(i=gr->nds-1;i>=0;i--){ - if(i == gr->nds-1){ + " TU64 _L%d = DIVIDECEIL(L%d, (selector&2) ? 1 : LSlice);\n", i, i); + }else if (i == gr->ndd-1){ srcbAppendf(&gr->srcGen, - " TX sS%d = sJ%d;\n", - i, i); + " TU64 _L%d = DIVIDECEIL(L%d, (selector&2) ? LSlice : 1);\n", i, i); }else{ srcbAppendf(&gr->srcGen, - " TX sS%d = sJ%d + l%d%s*sS%d;\n", - i, i, i+1, - reduxGenAxisMaybeSplit(gr, i+1) ? "Div" : " ", i+1); + " TU64 _L%d = L%d;\n", i, i); } } - if (reduxGenRequiresDst(gr)){ + srcbAppends(&gr->srcGen, + " \n" + " z = left+v-1;\n"); + for (i=gr->nds-1;i>=0;i--){ + srcbAppendf(&gr->srcGen, + " TS64 _i%d = z %% _L%d; z /= _L%d;\n", i, i, i); + } + srcbAppends(&gr->srcGen, + " z = LID_0;\n"); + for (i=gr->log2MaxBS-1;i>=0;i--){ + srcbAppendf(&gr->srcGen, + " TS32 _i%di = z %% L%di; z /= L%di;\n", i, i, i); + } + + + /* Compute Intrablock Permute Core, since it will be used soon */ + srcbAppends(&gr->srcGen, " \n"); + srcbAppends(&gr->srcGen, " const TU32 perm = "); + srcbBeginList(&gr->srcGen, " + ", "0"); + for (i=0;ilog2MaxBS;i++){ + srcbAppendElemf(&gr->srcGen, "_i%di*perm%di", i, i); + } + srcbEndList(&gr->srcGen); + srcbAppends(&gr->srcGen, ";\n"); + + + /* S0 Lattice */ + if (reduxGenKernelRequiresLatticeS0(gr)){ srcbAppends(&gr->srcGen, " \n"); - for(i=gr->ndd-1;i>=0;i--){ - if(i == gr->ndd-1){ + for (i=gr->nds-1;i>=0;i--){ + if (i == gr->nds-1){ srcbAppendf(&gr->srcGen, - " TX dS%d = dJ%d;\n", - i, i); + " TS64 _S0S%d = S0J%d;\n", i, i); }else{ srcbAppendf(&gr->srcGen, - " TX dS%d = dJ%d + l%d%s*dS%d;\n", - i, i, i+1, - reduxGenAxisMaybeSplit(gr, i+1) ? 
"Div" : " ", i+1); + " TS64 _S0S%d = S0J%d + _L%d*_S0S%d;\n", i, i, i+1, i+1); } } + srcbAppends(&gr->srcGen, " S0Off += "); + srcbBeginList(&gr->srcGen, " + ", "0"); + for (i=0;inds;i++){ + srcbAppendElemf(&gr->srcGen, "_i%d*_S0S%d", i, i); + } + for (i=0;ilog2MaxBS;i++){ + srcbAppendElemf(&gr->srcGen, "_i%di*S0S%di", i, i); + } + srcbEndList(&gr->srcGen); + srcbAppends(&gr->srcGen, ";\n" + " S0 += S0Off;\n"); } - if (reduxGenRequiresDstArg(gr)){ + + + /* D0 Lattice */ + if (reduxGenKernelRequiresLatticeD0(gr)){ srcbAppends(&gr->srcGen, " \n"); - for(i=gr->ndd-1;i>=0;i--){ - if(i == gr->ndd-1){ + for (i=gr->ndd-1;i>=0;i--){ + if (i == gr->ndd-1){ srcbAppendf(&gr->srcGen, - " TX aS%d = aJ%d;\n", - i, i); + " TS64 _D0S%d = D0J%d;\n", i, i); }else{ srcbAppendf(&gr->srcGen, - " TX aS%d = aJ%d + l%d%s*aS%d;\n", - i, i, i+1, - reduxGenAxisMaybeSplit(gr, i+1) ? "Div" : " ", i+1); + " TS64 _D0S%d = D0J%d + _L%d*_D0S%d;\n", i, i, i+1, i+1); } } - } - srcbAppends(&gr->srcGen, " \n"); - srcbAppends(&gr->srcGen, " sOff += "); - srcbBeginList(&gr->srcGen, " + ", "0"); - for(i=0;inds;i++){ - srcbAppendElemf(&gr->srcGen, "(TX)i%d*sS%d", i, i); - } - srcbEndList(&gr->srcGen); - srcbAppends(&gr->srcGen, ";\n"); - if (reduxGenRequiresDst(gr)){ - srcbAppends(&gr->srcGen, " dOff += "); + srcbAppends(&gr->srcGen, " D0Off += "); srcbBeginList(&gr->srcGen, " + ", "0"); - for(i=0;indd;i++){ - srcbAppendElemf(&gr->srcGen, "(TX)i%d*dS%d", i, i); + for (i=0;indd;i++){ + srcbAppendElemf(&gr->srcGen, "_i%d*_D0S%d", i, i); } - srcbEndList(&gr->srcGen); - srcbAppends(&gr->srcGen, ";\n"); - } - if (reduxGenRequiresDstArg(gr)){ - srcbAppends(&gr->srcGen, " aOff += "); - srcbBeginList(&gr->srcGen, " + ", "0"); - for(i=0;indd;i++){ - srcbAppendElemf(&gr->srcGen, "(TX)i%d*aS%d", i, i); + for (i=0;ilog2MaxBS;i++){ + srcbAppendElemf(&gr->srcGen, "_i%di*D0S%di", i, i); } srcbEndList(&gr->srcGen); - srcbAppends(&gr->srcGen, ";\n"); - } - srcbAppends(&gr->srcGen, " \n"); - if(gr->ndd > 0){ - srcbAppendf(&gr->srcGen, - " i%d *= splitFree;\n", - gr->ndd-1); - } - if(gr->ndr > 0){ - srcbAppendf(&gr->srcGen, - " i%d *= splitReduce;\n", - gr->nds-1); - } - srcbAppends(&gr->srcGen, " \n"); - if(reduxGenKernelRequiresDst(gr)){ - srcbAppends(&gr->srcGen, - " TK* restrict wd = (TK* restrict)(w + wdOff);\n" - " TK* restrict wdL = &wd[0];\n" - " TK* restrict wdR = &wd[GDIM_0*D];\n" - " TK* restrict pd = (TK* restrict)(SHMEM + pdOff);\n"); - } - if(reduxGenKernelRequiresDstArg(gr)){ - srcbAppends(&gr->srcGen, - " TA* restrict wa = (TA* restrict)(w + waOff);\n" - " TA* restrict waL = &wa[0];\n" - " TA* restrict waR = &wa[GDIM_0*D];\n" - " TA* restrict pa = (TA* restrict)(SHMEM + paOff);\n"); + srcbAppends(&gr->srcGen, ";\n" + " if(perm < D){\n" + " ((TS64*)SHMEM)[perm] = D0Off;\n" + " }\n" + " local_barrier();\n" + " if(LID_0 < D){\n" + " D0Off = ((TS64*)SHMEM)[LID_0];\n" + " }\n" + " local_barrier();\n" + " D0 += D0Off;\n"); } - srcbAppends(&gr->srcGen, " \n"); -} -static void reduxGenSrcAppendThreadDecode (GpuReduction* gr){ - int i; - srcbAppends(&gr->srcGen, - " /**\n" - " * Decode THREAD start point.\n" - " * \n" - " * This involves computing the intra-block coordinate of a thread in a\n" - " * up-to-log2(MAX_BLOCK_THREADS)-dimensional coordinate system, then using\n" - " * those coordinates to compute private source/destination/destination\n" - " * argument pointers, argument indices and permute targets.\n" - " */\n" - " \n" - " unsigned iSplit = LID_0/(LDIM_0/(splitFree*splitReduce));\n" - " z = LID_0;\n"); - - 
for(i=gr->log2MaxL-1;i>=0;i--){ - srcbAppendf(&gr->srcGen, - " int t%d = z %% ibs%d;z /= ibs%d;\n", - i, i, i); - } - if(reduxGenRequiresDstArg(gr)){ - srcbAppends(&gr->srcGen, " TX ti = "); - srcbBeginList(&gr->srcGen, " + ", "0"); - for(i=0;ilog2MaxL;i++){ - srcbAppendElemf(&gr->srcGen, "t%d*ibl%dPDim", i, i); + + /* D1 Lattice */ + if (reduxGenKernelRequiresLatticeD1(gr)){ + srcbAppends(&gr->srcGen, " \n"); + for (i=gr->ndd-1;i>=0;i--){ + if (i == gr->ndd-1){ + srcbAppendf(&gr->srcGen, + " TS64 _D1S%d = D1J%d;\n", i, i); + }else{ + srcbAppendf(&gr->srcGen, + " TS64 _D1S%d = D1J%d + _L%d*_D1S%d;\n", i, i, i+1, i+1); + } } - srcbEndList(&gr->srcGen); - srcbAppends(&gr->srcGen, ";\n"); - } - srcbAppends(&gr->srcGen, " unsigned tp = "); - srcbBeginList(&gr->srcGen, " + ", "0"); - for(i=0;ilog2MaxL;i++){ - srcbAppendElemf(&gr->srcGen, "t%d* ibp%d", i, i); - } - srcbEndList(&gr->srcGen); - srcbAppends(&gr->srcGen, ";\n"); - srcbAppends(&gr->srcGen, " \n" - " sOff += "); - srcbBeginList(&gr->srcGen, " + ", "0"); - for(i=0;ilog2MaxL;i++){ - srcbAppendElemf(&gr->srcGen, "t%d*ibsOff%d ", i, i); - } - srcbEndList(&gr->srcGen); - srcbAppends(&gr->srcGen, ";\n"); - if(reduxGenRequiresDst(gr)){ - srcbAppends(&gr->srcGen, " \n" - " dOff += "); + srcbAppends(&gr->srcGen, " D1Off += "); srcbBeginList(&gr->srcGen, " + ", "0"); - for(i=0;ilog2MaxL;i++){ - srcbAppendElemf(&gr->srcGen, "t%d*ibdOff%d ", i, i); + for (i=0;indd;i++){ + srcbAppendElemf(&gr->srcGen, "_i%d*_D1S%d", i, i); + } + for (i=0;ilog2MaxBS;i++){ + srcbAppendElemf(&gr->srcGen, "_i%di*D1S%di", i, i); } srcbEndList(&gr->srcGen); - srcbAppends(&gr->srcGen, ";\n"); - srcbAppends(&gr->srcGen, " ((TX*)SHMEM)[tp] = dOff;\n" + srcbAppends(&gr->srcGen, ";\n" + " if(perm < D){\n" + " ((TS64*)SHMEM)[perm] = D1Off;\n" + " }\n" + " local_barrier();\n" + " if(LID_0 < D){\n" + " D1Off = ((TS64*)SHMEM)[LID_0];\n" + " }\n" " local_barrier();\n" - " dOff = ((TX*)SHMEM)[LID_0];\n" - " local_barrier();\n"); + " D1 += D1Off;\n"); } - if(reduxGenRequiresDstArg(gr)){ - srcbAppends(&gr->srcGen, " \n" - " aOff += "); + + + /* I0 Lattice */ + if (reduxGenKernelRequiresLatticeI0(gr)){ + srcbAppends(&gr->srcGen, " \n"); + for (i=gr->nds-1;i>=0;i--){ + if (i == gr->nds-1){ + srcbAppendf(&gr->srcGen, + " TS64 _I0S%d = I0J%d;\n", i, i); + }else{ + srcbAppendf(&gr->srcGen, + " TS64 _I0S%d = I0J%d + _L%d*_I0S%d;\n", i, i, i+1, i+1); + } + } + srcbAppends(&gr->srcGen, " I0 = "); srcbBeginList(&gr->srcGen, " + ", "0"); - for(i=0;ilog2MaxL;i++){ - srcbAppendElemf(&gr->srcGen, "t%d*ibaOff%d ", i, i); + for (i=0;inds;i++){ + srcbAppendElemf(&gr->srcGen, "_i%d*_I0S%d", i, i); + } + for (i=0;ilog2MaxBS;i++){ + srcbAppendElemf(&gr->srcGen, "_i%di*I0S%di", i, i); } srcbEndList(&gr->srcGen); srcbAppends(&gr->srcGen, ";\n"); - srcbAppends(&gr->srcGen, " ((TX*)SHMEM)[tp] = aOff;\n" - " local_barrier();\n" - " aOff = ((TX*)SHMEM)[LID_0];\n" - " local_barrier();\n"); } - srcbAppends(&gr->srcGen, " \n" - " const char* restrict ts = s + sOff;\n"); - if(reduxGenRequiresDst(gr)){ - srcbAppends(&gr->srcGen, " char* restrict td = d + dOff;\n"); + + + /* Workspace */ + if (reduxGenKernelRequiresWspace(gr)){ + srcbAppends(&gr->srcGen, " \n"); + if (reduxGenKernelRequiresStateK0(gr)){ + srcbAppends(&gr->srcGen, + " TK0* restrict const W0 = (TK0*)(W + W0Off);\n" + " TK0* restrict const W0L = &W0[0];\n" + " TK0* restrict const W0R = &W0[GDIM_0*D];\n" + " TK0* restrict const SHMEMK0 = (TK0*)(SHMEM + SHMEMK0Off);\n"); + } + if (reduxGenKernelRequiresStateK1(gr)){ + srcbAppends(&gr->srcGen, + " TK1* 
restrict const W1 = (TK1*)(W + W1Off);\n" + " TK1* restrict const W1L = &W1[0];\n" + " TK1* restrict const W1R = &W1[GDIM_0*D];\n" + " TK1* restrict const SHMEMK1 = (TK1*)(SHMEM + SHMEMK1Off);\n"); + } + } + + + /* Fixup the division we did to one of the dimensions. */ + srcbAppendf(&gr->srcGen, " \n"); + if (gr->nds>0){ + srcbAppendf(&gr->srcGen, + " _i%d *= (selector&2) ? 1 : LSlice;\n", gr->nds-1); } - if(reduxGenRequiresDstArg(gr)){ - srcbAppends(&gr->srcGen, " char* restrict ta = a + aOff;\n"); + if (gr->ndd>0){ + srcbAppendf(&gr->srcGen, + " _i%d *= (selector&2) ? LSlice : 1;\n", gr->ndd-1); } - srcbAppends(&gr->srcGen, " \n" - " \n"); -} -static void reduxGenSrcAppendPhase0 (GpuReduction* gr){ + + + /* Add a couple newlines before next section */ srcbAppends(&gr->srcGen, - " /* PHASE 0 */\n" - " \n" - " /* Loop Cores. */\n"); - if (gr->ndd == 0){ - /** - * Special case: If ndd == 0, we know this is an all-reduce or nearly, so - * we know that the only split axis, if any, is going to be a reduction axis. - * Therefore, splitFree will always be 1, and we only need to generate one - * set of loops. - */ - - reduxGenSrcAppendLoops(gr, 0, 1); - }else{ - srcbAppends(&gr->srcGen, " if(splitReduce == 1){\n" - " /* Free axis possibly split. */\n"); - reduxGenSrcAppendLoops(gr, 1, 0); - srcbAppends(&gr->srcGen, " }else{\n" - " /* Reduce axis possibly split. */\n"); - reduxGenSrcAppendLoops(gr, 0, 1); - srcbAppends(&gr->srcGen, " }\n"); - } -} -static void reduxGenSrcAppendLoops (GpuReduction* gr, - int freeMaybeSplit, - int reduceMaybeSplit){ - srcbAppends(&gr->srcGen, " if(misalignL){\n"); - reduxGenSrcAppendLoop(gr, 1, freeMaybeSplit, reduceMaybeSplit); + " \n" + " \n"); +} +static void reduxGenSrcAppendPhase0 (GpuReduction* gr, + uint32_t selector){ + int i; + const char* type; + + /** + * Convert index types depending on the template selected by the selector. + * + * If misaligned on the right, write partial reduction to right-half. + * If misaligned on the left, write partial reduction to left-half. + * + * The Phase 1 collector blocks will take care of reading the partial + * reduction results and combining them. + */ + + srcbAppends(&gr->srcGen, " "); + for (i=0;inds;i++){ + type = reduxGenSrcAxisIsHuge(gr, selector, i) ? "TU64" : "TU32"; + srcbAppendf(&gr->srcGen, "%s i%d = _i%d;", type, i, i); + } + srcbAppends(&gr->srcGen, "\n" + " \n" + " if(misalignR){\n"); + reduxGenSrcAppendLoop(gr, selector, 1); srcbAppends(&gr->srcGen, " }\n"); - reduxGenSrcAppendLoop(gr, 0, freeMaybeSplit, reduceMaybeSplit); - srcbAppends(&gr->srcGen, - " \n" - " /**\n" - " * Are we misaligned on the right? If so, we have a partial reduction\n" - " * to save.\n" - " */\n" - " \n" - " if(misalignR){\n" - " HREDUX(pd, pa, tp, accV, accI);\n" - " \n" - " /* Right-write partial reduction to workspace. 
*/\n" - " if(LID_0 < D){\n" - " SETREDUXSTATE(wdR[GID_0*D+LID_0], waR[GID_0*D+LID_0], pd[LID_0], pa[LID_0]);\n" - " }\n" - " }\n"); -} -static void reduxGenSrcAppendLoop (GpuReduction* gr, - int initial, - int freeMaybeSplit, - int reduceMaybeSplit){ - int i; - - srcbAppends(&gr->srcGen, " while(v > 0){\n"); - reduxGenSrcAppendDecrement(gr); - reduxGenSrcAppendVertical (gr, freeMaybeSplit, reduceMaybeSplit); - srcbAppends(&gr->srcGen, " /* Reduction Increments */\n"); - for(i=gr->nds-1;i >= gr->ndd;i--){ - reduxGenSrcAppendIncrement(gr, i, initial, freeMaybeSplit, reduceMaybeSplit); - } - srcbAppends(&gr->srcGen, " /* Horizontal Reduction */\n" - " HREDUX(pd, pa, tp, accV, accI);\n" - " \n"); - reduxGenSrcAppendDstWrite(gr, initial, freeMaybeSplit, reduceMaybeSplit); - srcbAppends(&gr->srcGen, " /* Reinitialize accumulators */\n" - " INITREDUXSTATE(accV, accI);\n" - " \n"); - srcbAppends(&gr->srcGen, " /* Free Increments */\n"); - for(i=gr->ndd-1;i >= 0;i--){ - reduxGenSrcAppendIncrement(gr, i, initial, freeMaybeSplit, reduceMaybeSplit); - } - srcbAppends(&gr->srcGen, " /* Exit loop */\n" - " break;\n" + reduxGenSrcAppendLoop(gr, selector, 0); + srcbAppends(&gr->srcGen, " if(misalignL){\n" + " HREDUX(SHMEMK0, SHMEMK1, perm, K0, K1);\n" + " if(LID_0 < D){\n" + " SETREDUXSTATE(W0L[GID_0*D+LID_0],\n" + " W1L[GID_0*D+LID_0],\n" + " SHMEMK0[LID_0],\n" + " SHMEMK1[LID_0]);\n" + " }\n" " }\n"); } -static void reduxGenSrcAppendDecrement (GpuReduction* gr){ - srcbAppends(&gr->srcGen, " /* Decrement. */\n" - " v--;\n" - " \n"); -} -static void reduxGenSrcAppendVertical (GpuReduction* gr, - int freeMaybeSplit, - int reduceMaybeSplit){ +static void reduxGenSrcAppendLoop (GpuReduction* gr, + uint32_t selector, + int initial){ int i; - - if(!freeMaybeSplit && !reduceMaybeSplit){ - srcbAppends(&gr->srcGen, " /* Vertical Reductions */\n" - " LOADS(tmpV, ts);\n" - " REDUX(accV, accI, tmpV, GETIDX);\n" - " \n"); + + srcbAppends(&gr->srcGen, " while(v > 0){v--;\n"); + reduxGenSrcAppendVertical (gr, selector); + for (i=gr->nds-1;i >= gr->ndd;i--){ + reduxGenSrcAppendIncrement(gr, selector, initial, i); + } + srcbAppends(&gr->srcGen, " HREDUX(SHMEMK0, SHMEMK1, perm, K0, K1);\n"); + reduxGenSrcAppendDstWrite(gr, selector, initial); + srcbAppends(&gr->srcGen, " INITREDUXSTATE(K0, K1);\n"); + for (i=gr->ndd-1;i >= 0;i--){ + reduxGenSrcAppendIncrement(gr, selector, initial, i); + } + srcbAppends(&gr->srcGen, " break;\n" + " }\n"); +} +static void reduxGenSrcAppendVertical (GpuReduction* gr, + uint32_t selector){ + int i = (selector&SELECTOR_SPLIT_FREE) ? gr->ndd-1 : gr->nds-1; + + if (i >= 0){ + srcbAppendf(&gr->srcGen, " if(i%d+iSplit < L%d){\n" + " LOADS0(tmpK0, S0);\n" + " REDUX(K0, K1, tmpK0, I0);\n" + " }\n", i, i); }else{ - i = freeMaybeSplit ? gr->ndd-1 : gr->nds-1; - srcbAppendf(&gr->srcGen, " /* Vertical Reductions */\n" - " if(i%d+iSplit < l%d){\n" - " LOADS(tmpV, ts);\n" - " REDUX(accV, accI, tmpV, GETIDX);\n" - " }\n" - " \n", i, i); - } -} -static void reduxGenSrcAppendIncrement (GpuReduction* gr, - int axis, - int initial, - int freeMaybeSplit, - int reduceMaybeSplit){ + srcbAppends(&gr->srcGen, " LOADS0(tmpK0, S0);\n" + " REDUX(K0, K1, tmpK0, I0);\n"); + } +} +static void reduxGenSrcAppendIncrement (GpuReduction* gr, + uint32_t selector, + int initial, + int axis){ + const char* cast = reduxGenSrcAxisIsHuge(gr, selector, axis) ? "TS64" : "TS32"; const char* breakOrCont = (initial) && (axis < gr->ndd) ? 
"break" : "continue"; - - if (freeMaybeSplit && axis == gr->ndd-1){ - srcbAppendf(&gr->srcGen, - " i%d += splitFree;\n" - " ts += sJ%d;", - axis, axis); - if(reduxGenRequiresDst(gr)){ - srcbAppendf(&gr->srcGen, "td += dJ%d;", axis); - } - if(reduxGenRequiresDstArg(gr)){ - srcbAppendf(&gr->srcGen, "ta += aJ%d;", axis); - } - srcbAppends(&gr->srcGen, "\n"); - srcbAppendf(&gr->srcGen, - " if (i%d < l%d){%s;}\n" - " else {i%d = 0;}\n" - " \n", - axis, axis, breakOrCont, axis); - }else if (reduceMaybeSplit && axis == gr->nds-1){ - srcbAppendf(&gr->srcGen, - " i%d += splitReduce;\n" - " ts += sJ%d;\n" - " if (i%d < l%d){%s;}\n" - " else {i%d = 0;}\n" - " \n", - axis, axis, axis, axis, breakOrCont, axis); + + /* Pointer bumps */ + srcbAppends(&gr->srcGen, " "); + if (reduxGenKernelRequiresLatticeS0(gr)){ + srcbAppendf(&gr->srcGen, "S0 -= S0J%d;", axis); }else{ - srcbAppendf(&gr->srcGen, - " i%d++;\n" - " ts += sJ%d;", - axis, axis); - if(axis < gr->ndd){ - if(reduxGenRequiresDst(gr)){ - srcbAppendf(&gr->srcGen, "td += dJ%d;", axis); - } - if(reduxGenRequiresDstArg(gr)){ - srcbAppendf(&gr->srcGen, "ta += aJ%d;", axis); - } - } - srcbAppends(&gr->srcGen, "\n"); - srcbAppendf(&gr->srcGen, - " if (i%d < l%d){%s;}\n" - " else {i%d = 0;}\n" - " \n", - axis, axis, breakOrCont, axis); - } -} -static void reduxGenSrcAppendDstWrite (GpuReduction* gr, - int initial, - int freeMaybeSplit, - int reduceMaybeSplit){ - if(initial){ - srcbAppends(&gr->srcGen, " /* Workspace Left-Write */\n" - " if(LID_0 < D){\n" - " SETREDUXSTATE(wdL[GID_0*D + LID_0], waL[GID_0*D + LID_0], pd[LID_0], pa[LID_0]);\n" - " }\n" - " \n"); + srcbAppends(&gr->srcGen, " "); + } + if (reduxGenKernelRequiresLatticeD0(gr) && axis < gr->ndd){ + srcbAppendf(&gr->srcGen, "D0 -= D0J%d;", axis); }else{ - if(!freeMaybeSplit){ - srcbAppends(&gr->srcGen, " /* Destination Write */\n" - " if(LID_0 < D){\n" - " STORED(td, pd[LID_0]);\n" - " STOREA(ta, pa[LID_0]);\n" - " }\n" - " \n"); - }else{ - if(gr->ndd > 0){ - srcbAppendf(&gr->srcGen, " /* Destination Write */\n" - " if(LID_0 < (l%d-i%dsrcGen, " "); + } + if (reduxGenKernelRequiresLatticeD1(gr) && axis < gr->ndd){ + srcbAppendf(&gr->srcGen, "D1 -= D1J%d;", axis); + }else{ + srcbAppends(&gr->srcGen, " "); + } + if (reduxGenKernelRequiresLatticeI0(gr)){ + srcbAppendf(&gr->srcGen, "I0 -= I0J%d;", axis); + }else{ + srcbAppends(&gr->srcGen, " "); + } + + /* Index Check */ + if (reduxGenSrcAxisIsSplit(gr, selector, axis)){ + srcbAppendf(&gr->srcGen, "i%d-=LSlice;if((%s)i%d >= 0){%s;}else{i%d+=LPadded;}\n", + axis, cast, axis, breakOrCont, axis); + }else{ + srcbAppendf(&gr->srcGen, "i%d--; if((%s)i%d >= 0){%s;}else{i%d+=L%d;}\n", + axis, cast, axis, breakOrCont, axis, axis); + } +} +static void reduxGenSrcAppendDstWrite (GpuReduction* gr, + uint32_t selector, + int initial){ + if (initial){ + srcbAppends(&gr->srcGen, " if(LID_0 < D){\n" + " SETREDUXSTATE(W0R[GID_0*D + LID_0],\n" + " W1R[GID_0*D + LID_0],\n" + " SHMEMK0[LID_0],\n" + " SHMEMK1[LID_0]);\n" + " }\n"); + }else{ + if (selector & SELECTOR_SPLIT_FREE){ + if (gr->ndd > 0){ + srcbAppendf(&gr->srcGen, " if(LID_0 < ((L%d-i%d)ndd-1, gr->ndd-1, gr->ndd-1, gr->ndd-1); }else{ - srcbAppendf(&gr->srcGen, " STORED(td, pd[LID_0]);\n" - " STOREA(ta, pa[LID_0]);\n"); + srcbAppendf(&gr->srcGen, " STORED0(D0, SHMEMK0[LID_0]);\n" + " STORED1(D1, SHMEMK1[LID_0]);\n"); } + }else{ + srcbAppends(&gr->srcGen, " if(LID_0 < D){\n" + " STORED0(D0, SHMEMK0[LID_0]);\n" + " STORED1(D1, SHMEMK1[LID_0]);\n" + " }\n"); } } } -static void reduxGenSrcAppendPhase1 (GpuReduction* 
gr){ +static void reduxGenSrcAppendPhase1 (GpuReduction* gr){ + /** + * PHASE 1 + * + * If we are a collector block, gather all partial results for the + * same points to the right of the current position in our workspace + * and accumulate them into our partial result, then write out to + * destination/destination argument. + * + * We perform a right-read of our workspace and a left-read of the + * other blocks' workspace. + */ + srcbAppends(&gr->srcGen, - " /* PHASE 1 */\n" - " \n" - " /**\n" - " * If we are a collector block, gather all partial results for the\n" - " * same point to the left of the current position in our workspace\n" - " * and accumulate them into our partial result, then write out to\n" - " * destination/destination argument.\n" - " * We perform a left-read of our workspace and a right-read of the\n" - " * other blocks' workspace.\n" - " */\n" - " \n" - " if(misalignL && doFinish && LID_0 < D){\n" - " SETREDUXSTATE(accV, accI, wdL[(GID_0+0)*D+LID_0], waL[(GID_0+0)*D+LID_0]);\n" + " if(collector && LID_0 < D){\n" + " SETREDUXSTATE(K0, K1, W0R[(GID_0+0)*D+LID_0], W1R[(GID_0+0)*D+LID_0]);\n" " \n" - " /* vvv-- NOTA BENE: The +B hack is REALLY NECESSARY, since C division is rounding to zero: (-1)/B == (B-1)/B for B>1. */\n" - " for(k=-1; /* Starting with the first block to our left... */\n" - " (start +B)/B == /* Is our write target the same as that of */\n" - " (start+k*V+V-1+B)/B; /* the target k blocks to our left? */\n" - " k--){ /* Try moving one more to the left. */\n" - " REDUX(accV, accI, wdR[(GID_0+k)*D+LID_0], waR[(GID_0+k)*D+LID_0]);\n" + " for(k=1,v=left+v-1,z=v+1; /* Starting with the first block to our right... */\n" + " v/B == z/B; /* Is our write target the same as that of */\n" + " /* the target k blocks to our right? */\n" + " k++,z+=V){ /* Try moving one more to the right. */\n" + " REDUX(K0, K1, W0L[(GID_0+k)*D+LID_0], W1L[(GID_0+k)*D+LID_0]);\n" " }\n" " \n"); - if(gr->ndd > 0){ + if (gr->ndd > 0){ srcbAppendf(&gr->srcGen, - " if(LID_0 < (l%d-i%dndd-1, gr->ndd-1, gr->ndd-1, gr->ndd-1); }else{ srcbAppends(&gr->srcGen, - " STORED(td, accV);\n" - " STOREA(ta, accI);\n"); + " STORED0(D0, K0);\n" + " STORED1(D1, K1);\n" + " }\n"); + } +} +static int reduxGenSrcAxisIsHuge (GpuReduction* gr, + uint32_t selector, + int axis){ + int hugeType = selector & SELECTOR_HUGE_AXIS; + int isSplitFree = !!(selector & SELECTOR_SPLIT_FREE); + int isAxisFree = axis < gr->ndd; + + if (hugeType == SELECTOR_HUGE_IS_SPLIT){ + return reduxGenSrcAxisIsSplit(gr, selector, axis); + }else if (hugeType == SELECTOR_HUGE_SAME_TYPE){ + if (isSplitFree == isAxisFree){ + if (isAxisFree){ + return axis == gr->ndd-2; + }else{ + return axis == gr->nds-2; + } + }else{ + return 0; + } + }else if (hugeType == SELECTOR_HUGE_OPPOSITE_TYPE){ + if (isSplitFree != isAxisFree){ + if (isAxisFree){ + return axis == gr->ndd-1; + }else{ + return axis == gr->nds-1; + } + }else{ + return 0; + } + }else{ + return 0; } - srcbAppends(&gr->srcGen, - " }\n"); +} +static int reduxGenSrcAxisIsSplit (GpuReduction* gr, + uint32_t selector, + int axis){ + return ( (selector & SELECTOR_SPLIT_FREE) && axis == gr->ndd-1) || + (!(selector & SELECTOR_SPLIT_FREE) && axis == gr->nds-1); } /** * @brief Compile the generated kernel. 
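+ *
+ *        The kernel is always built with the GA_USE_CLUDA flag; when either the
+ *        source type (TS0) or the destination type (TD0) is half-precision, the
+ *        GA_USE_HALF and GA_USE_SMALL capability flags are requested in addition
+ *        (see the flags computation at the top of the function).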
 */
-static int        reduxGenCompile             (GpuReduction*        gr){
-	int ret;
-	
+static int        reduxGenCompile             (GpuReduction*  gr){
+	int    ret, flags = 0;
+	
+	flags |= GA_USE_CLUDA;
+	if (gr->TS0tc == GA_HALF || gr->TD0tc == GA_HALF){
+		flags |= GA_USE_HALF|GA_USE_SMALL;
+	}
+	
 	ret = GpuKernel_init(&gr->k,
 	                     gr->gpuCtx,
 	                     1,
 	                     (const char**)&gr->kSourceCode,
 	                     &gr->kSourceCodeLen,
-	                     "redux",
+	                     gr->kName,
 	                     gr->kNumArgs,
 	                     gr->kArgTypeCodes,
-	                     GA_USE_CLUDA,
+	                     flags,
 	                     &gr->kErrorString);
 	if (ret != GA_NO_ERROR){
 		return reduxGenCleanupMsg(gr, ret,
-		                          "Failed to compile reduction kernel!\n"
+		                          "Failed to compile reduction kernel \"%s\"!\n"
 		                          "Error code is: %d\n"
 		                          "Error string is:\n"
 		                          "%s\n"
 		                          "Source code is:\n"
 		                          "%s\n",
-		                          ret, gr->kErrorString, gr->kSourceCode);
+		                          gr->kName, ret, gr->kErrorString, gr->kSourceCode);
 	}
-	
+	
 	return reduxGenComputeLaunchBounds(gr);
 }
@@ -2451,43 +2812,20 @@ static int        reduxGenCompile             (GpuReduction*        gr){
 static int        reduxGenComputeLaunchBounds (GpuReduction*        gr){
 	int    ret;
-	size_t a,b,c;
-	
+	
 	/**
 	 * Compute the maximum number of threads this kernel will support,
 	 * since this is critical to the scheduling and will not change now
 	 * that the kernel is compiled.
-	 *
-	 * This depends on several exhaustible resources and isn't necessarily
-	 * trivial to compute due to the complicated rules we must follow to
-	 * align shared memory, possibly slightly increasing consumption.
 	 */
-	
-	ret = gpukernel_property(gr->k.k, GA_KERNEL_PROP_MAXLSIZE, &gr->maxLK);
-	if(ret != GA_NO_ERROR){
+	
+	ret = gpukernel_property(gr->k.k, GA_KERNEL_PROP_MAXLSIZE, &gr->maxLK);
+	if (ret != GA_NO_ERROR){
 		return reduxGenCleanupMsg(gr, ret,
 		       "Failed to read max local size for compiled kernel!\n");
 	}
-	a         = gr->maxL0;
-	b         = gr->maxLg;
-	c         = gr->maxLM/reduxGenGetReduxStateSize(gr);   /* Kernel register use */
-	gr->maxLK = gr->maxLK<a ? gr->maxLK : a;/* Maximum block size on axis 0 */
-	gr->maxLK = gr->maxLK<b ? gr->maxLK : b;/* Maximum total block size */
-	gr->maxLK = gr->maxLK<c ? gr->maxLK : c;/* Shared memory per thread. */
-	
-	/**
-	 * We now have a tight bound on the maximum block size, but due to memory
-	 * alignment rules the memory consumption may be slightly higher than we
-	 * initially computed, and thus the shared memory use can still be
-	 * excessive. The following loop will almost certainly decrement at most
-	 * once, unless type alignments are very wierd.
-	 */
-	
-	while(reduxGenGetSHMEMSize(gr, gr->maxLK) > gr->maxLM){
-		gr->maxLK--;
-	}
-	
+	gr->maxLK = gr->maxLK<gr->maxBS ? gr->maxLK : gr->maxBS;
+	
 	return reduxGenCleanup(gr, GA_NO_ERROR);
 }
@@ -2496,11 +2834,11 @@ static int        reduxGenComputeLaunchBounds (GpuReduction*        gr){
  */
 static int        reduxGenCleanup             (GpuReduction*  gr, int ret){
-	if(ret != GA_NO_ERROR){
+	if (ret != GA_NO_ERROR){
 		free(gr->kArgTypeCodes);
 		free(gr->kSourceCode);
 		free(gr->kErrorString);
-		
+		
 		memset(gr, 0, sizeof(*gr));
 		free(gr);
 	}
@@ -2511,7 +2849,7 @@ static int        reduxGenCleanupMsg          (GpuReduction*  gr, int ret,
                                                const char*    fmt, ...){
 #if DEBUG
 	FILE* fp = stderr;
-	
+	
 	va_list ap;
 	va_start(ap, fmt);
 	vfprintf(fp, fmt, ap);
@@ -2520,7 +2858,7 @@ static int        reduxGenCleanupMsg          (GpuReduction*  gr, int ret,
 #else
 	(void)fmt;
 #endif
-	
+	
 	return reduxGenCleanup(gr, ret);
 }
@@ -2528,26 +2866,26 @@ static int        reduxGenCleanupMsg          (GpuReduction*  gr, int ret,
  * Count # of arguments as determined by iterator.
*/ -static void reduxGenCountArgs (GpuReduction* gr, +static void reduxGenCountArgs (const GpuReduction* gr, int typecode, const char* typeName, const char* baseName, int num, void* user){ + (void)gr; (void)typecode; (void)typeName; (void)baseName; (void)num; - (void)user; - - gr->kNumArgs++; + + (*(int*)user)++; } /** * Record the typecodes in the arguments typecode array. */ -static void reduxGenSaveArgTypecodes (GpuReduction* gr, +static void reduxGenSaveArgTypecodes (const GpuReduction* gr, int typecode, const char* typeName, const char* baseName, @@ -2557,7 +2895,7 @@ static void reduxGenSaveArgTypecodes (GpuReduction* gr, (void)baseName; (void)num; (void)user; - + gr->kArgTypeCodes[(*(int*)user)++] = typecode; } @@ -2565,7 +2903,7 @@ static void reduxGenSaveArgTypecodes (GpuReduction* gr, * Append an argument declaration to prototype. */ -static void reduxGenAppendArg (GpuReduction* gr, +static void reduxGenAppendArg (const GpuReduction* gr, int typecode, const char* typeName, const char* baseName, @@ -2573,35 +2911,35 @@ static void reduxGenAppendArg (GpuReduction* gr, void* user){ (void)user; (void)typecode; - - if((*(int*)user)++ > 0){ - srcbAppends(&gr->srcGen, ",\n "); + + if ((*(int*)user)++ > 0){ + srcbAppends(&((GpuReduction*)gr)->srcGen, ",\n "); } - srcbAppendf(&gr->srcGen, "%-25s ", typeName); - srcbAppendf(&gr->srcGen, baseName, num); + srcbAppendf(&((GpuReduction*)gr)->srcGen, "%-35s ", typeName); + srcbAppendf(&((GpuReduction*)gr)->srcGen, baseName, num); } /** * Marshall argument declaration during invocation. */ -static void reduxInvMarshalArg (GpuReduction* gr, +static void reduxInvMarshalArg (const GpuReduction* gr, int typecode, const char* typeName, const char* baseName, - int k, + int num, void* user){ redux_ctx* ctx; - int* i; - + int* i, k = num; + (void)typecode; (void)typeName; - + ctx = (redux_ctx*)(((void**)user)[0]); i = (int *)(((void**)user)[1]); - - if (strcmp(baseName, "phase") == 0){ - ctx->kArgs[(*i)++] = (void*)&ctx->phase; + + if (strcmp(baseName, "selector") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->selector; }else if (strcmp(baseName, "U") == 0){ ctx->kArgs[(*i)++] = (void*)&ctx->U; }else if (strcmp(baseName, "V") == 0){ @@ -2610,56 +2948,58 @@ static void reduxInvMarshalArg (GpuReduction* gr, ctx->kArgs[(*i)++] = (void*)&ctx->B; }else if (strcmp(baseName, "D") == 0){ ctx->kArgs[(*i)++] = (void*)&ctx->D; + }else if (strcmp(baseName, "Dunit") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->Dunit; }else if (strcmp(baseName, "H") == 0){ ctx->kArgs[(*i)++] = (void*)&ctx->H; - }else if (strcmp(baseName, "splitFree") == 0){ - ctx->kArgs[(*i)++] = (void*)&ctx->splitFree; - }else if (strcmp(baseName, "splitReduce") == 0){ - ctx->kArgs[(*i)++] = (void*)&ctx->splitReduce; - }else if (strcmp(baseName, "l%d") == 0){ - ctx->kArgs[(*i)++] = (void*)&ctx->l[k]; - }else if (strcmp(baseName, "l%dPDim") == 0){ - ctx->kArgs[(*i)++] = (void*)&ctx->lPDim[k-gr->ndd]; - }else if (strcmp(baseName, "s") == 0){ - ctx->kArgs[(*i)++] = (void*) ctx->flatSrcData; - }else if (strcmp(baseName, "sOff") == 0){ - ctx->kArgs[(*i)++] = (void*)&ctx->flatSrcOffset; - }else if (strcmp(baseName, "sJ%d") == 0){ - ctx->kArgs[(*i)++] = (void*)&ctx->sJ[k]; - }else if (strcmp(baseName, "d") == 0){ - ctx->kArgs[(*i)++] = (void*) ctx->flatDstData; - }else if (strcmp(baseName, "dOff") == 0){ - ctx->kArgs[(*i)++] = (void*)&ctx->flatDstOffset; - }else if (strcmp(baseName, "dJ%d") == 0){ - ctx->kArgs[(*i)++] = (void*)&ctx->dJ[k]; - }else if (strcmp(baseName, "a") == 0){ - ctx->kArgs[(*i)++] = (void*) 
ctx->flatDstArgData; - }else if (strcmp(baseName, "aOff") == 0){ - ctx->kArgs[(*i)++] = (void*)&ctx->flatDstArgOffset; - }else if (strcmp(baseName, "aJ%d") == 0){ - ctx->kArgs[(*i)++] = (void*)&ctx->aJ[k]; - }else if (strcmp(baseName, "w") == 0){ - ctx->kArgs[(*i)++] = (void*) ctx->w; - }else if (strcmp(baseName, "wdOff") == 0){ - ctx->kArgs[(*i)++] = (void*)&ctx->wdOff; - }else if (strcmp(baseName, "pdOff") == 0){ - ctx->kArgs[(*i)++] = (void*)&ctx->pdOff; - }else if (strcmp(baseName, "waOff") == 0){ - ctx->kArgs[(*i)++] = (void*)&ctx->waOff; - }else if (strcmp(baseName, "paOff") == 0){ - ctx->kArgs[(*i)++] = (void*)&ctx->paOff; - }else if (strcmp(baseName, "ibs%d") == 0){ - ctx->kArgs[(*i)++] = (void*)&ctx->ibs[k]; - }else if (strcmp(baseName, "ibp%d") == 0){ - ctx->kArgs[(*i)++] = (void*)&ctx->ibp[k]; - }else if (strcmp(baseName, "ibl%dPDim") == 0){ - ctx->kArgs[(*i)++] = (void*)&ctx->iblPDim[k]; - }else if (strcmp(baseName, "ibsOff%d") == 0){ - ctx->kArgs[(*i)++] = (void*)&ctx->ibsOff[k]; - }else if (strcmp(baseName, "ibdOff%d") == 0){ - ctx->kArgs[(*i)++] = (void*)&ctx->ibdOff[k]; - }else if (strcmp(baseName, "ibaOff%d") == 0){ - ctx->kArgs[(*i)++] = (void*)&ctx->ibaOff[k]; + }else if (strcmp(baseName, "LSlice") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->LSlice; + }else if (strcmp(baseName, "LPadded") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->LPadded; + }else if (strcmp(baseName, "L%d") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->L[k]; + }else if (strcmp(baseName, "L%di") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->Li[k]; + }else if (strcmp(baseName, "S0") == 0){ + ctx->kArgs[(*i)++] = (void*) ctx->S0Data; + }else if (strcmp(baseName, "S0Off") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->S0Off; + }else if (strcmp(baseName, "S0J%d") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->S0J[k]; + }else if (strcmp(baseName, "S0S%di") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->S0Si[k]; + }else if (strcmp(baseName, "D0") == 0){ + ctx->kArgs[(*i)++] = (void*) ctx->D0Data; + }else if (strcmp(baseName, "D0Off") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->D0Off; + }else if (strcmp(baseName, "D0J%d") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->D0J[k]; + }else if (strcmp(baseName, "D0S%di") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->D0Si[k]; + }else if (strcmp(baseName, "D1") == 0){ + ctx->kArgs[(*i)++] = (void*) ctx->D1Data; + }else if (strcmp(baseName, "D1Off") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->D1Off; + }else if (strcmp(baseName, "D1J%d") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->D1J[k]; + }else if (strcmp(baseName, "D1S%di") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->D1Si[k]; + }else if (strcmp(baseName, "I0J%d") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->I0J[k]; + }else if (strcmp(baseName, "I0S%di") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->I0Si[k]; + }else if (strcmp(baseName, "W") == 0){ + ctx->kArgs[(*i)++] = (void*) ctx->W; + }else if (strcmp(baseName, "W0Off") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->W0Off; + }else if (strcmp(baseName, "SHMEMK0Off") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->SHMEMK0Off; + }else if (strcmp(baseName, "W1Off") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->W1Off; + }else if (strcmp(baseName, "SHMEMK1Off") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->SHMEMK1Off; + }else if (strcmp(baseName, "perm%di") == 0){ + ctx->kArgs[(*i)++] = (void*)&ctx->perm[k]; } } @@ -2667,7 +3007,7 @@ static void reduxInvMarshalArg (GpuReduction* gr, /** * @brief Estimate the level of parallelism available in the GPU context of * this reduction operator. 
- *
+ *
 * This is a rough target number of threads. It would definitely fill the
 * device, plus some substantial margin.
 */
@@ -2676,21 +3016,75 @@ static size_t     reduxGenEstimateParallelism (const GpuReduction*  gr){
 	/**
 	 * An arbitrary margin factor ensuring there will be a few thread blocks
 	 * per SMX.
-	 *
+	 *
 	 * E.g. on Kepler, each SMX can handle up to two 1024-thread blocks
-	 * simultaneously, so a margin of 6/SMX should ensure with very high
+	 * simultaneously, so a margin of 16/SMX should ensure with very high
 	 * likelihood that all SMXes will be fed and kept busy.
 	 */
-	
-	size_t marginFactor = 6;
+	
+	size_t marginFactor = 16;
 	return marginFactor * gr->numProcs * gr->maxLg;
 }
 /**
- * @brief Returns whether the reduction interface requires a dst argument.
+ * @brief Return whether or not the reduction operator's interface or kernel
+ *        requires a specific argument, lattice or storage.
+ *
+ * Specifically, check whether the reduction operator's:
+ *   - Interface (reduxGenRequires*())              requires the passing of an s0/d0/d1 argument
+ *   - Kernel    (reduxGenKernelRequiresLattice*()) requires the walking of an s0/d0/d1/i0 lattice
+ *   - Kernel    (reduxGenKernelRequiresState*())   contains a k0/k1 state
+ *   - Kernel    (reduxGenKernelRequiresWspace())   requires workspaces named w* for states k*.
+ *
+ * The reduction operator's interface, kernel and state are semantically
+ * subtly different. The interface asks whether the GpuReduction_call(), and
+ * therefore the generated kernel, must receive a specific argument:
+ *
+ *   - Argument s0 (Typically the source tensor)
+ *   - Argument d0 (Typically the destination tensor)
+ *   - Argument d1 (Typically the destination argument tensor)
+ *
+ * The kernel asks whether it must internally walk over a specific lattice, where:
+ *
+ *   - Lattice s0 is the lattice of pointers into the s0 tensor.
+ *   - Lattice d0 is the lattice of pointers into the d0 tensor.
+ *   - Lattice d1 is the lattice of pointers into the d1 tensor.
+ *   - Lattice i0 is the lattice of flattened indices into the s0 tensor.
+ *
+ * The state asks whether it should contain:
+ *
+ *   - State k0 (Typically for accumulator states typed `TK` over the s0 lattice
+ *     and written to the d0 lattice)
+ *   - State k1 (Typically for indexes typed `TI` from the i0 lattice and written
+ *     to the d1 lattice)
+ *
+ * The workspace asks whether it is required in order to save partial reduction
+ * states k* computed during Phase 0.
+ *
+ * Currently:
+ *
+ *   - All GpuReductions require an s0 argument.
+ *   - All GpuReductions except argmin/argmax require a d0 argument.
+ *   - Only the argmin/argmax/minandargmin/maxandargmax GpuReductions require a d1 argument.
+ *   - All and only the GpuReductions requiring an s0 argument require walking over the s0 lattice.
+ *   - All and only the GpuReductions requiring a  d0 argument require walking over the d0 lattice.
+ *   - All and only the GpuReductions requiring a  d1 argument require walking over the d1 lattice.
+ *   - All and only the GpuReductions requiring a  d1 argument require walking over the i0 lattice.
+ *   - All and only the GpuReductions requiring an s0 lattice walk require a k0 state.
+ *   - All and only the GpuReductions requiring an i0 lattice walk require a k1 state.
+ *   - All GpuReductions potentially require a workspace for their states.
+ *
+ * However, if this reduction engine were generalized to multi-reduction, elemwise or
+ * initialization operations, the above might not necessarily hold anymore.
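+ *
+ * For example, under the rules above, a GA_REDUCE_SUM reduction takes s0 and
+ * d0 arguments, walks only the s0/d0 lattices and carries only a k0 state; a
+ * GA_REDUCE_ARGMAX reduction takes s0 and d1 arguments, walks the s0/d1/i0
+ * lattices and carries both k0 and k1 states; and a GA_REDUCE_MAXANDARGMAX
+ * reduction takes s0, d0 and d1 arguments, walks all four lattices and
+ * carries both states.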
*/ -static int reduxGenRequiresDst (const GpuReduction* gr){ +static int reduxGenRequiresS0 (const GpuReduction* gr){ + (void)gr; + return 1; +} +static int reduxGenRequiresD0 (const GpuReduction* gr){ switch (gr->op){ case GA_REDUCE_ARGMIN: case GA_REDUCE_ARGMAX: @@ -2699,12 +3093,7 @@ static int reduxGenRequiresDst (const GpuReduction* gr){ return 1; } } - -/** - * @brief Returns whether the reduction interface requires a dstArg argument. - */ - -static int reduxGenRequiresDstArg (const GpuReduction* gr){ +static int reduxGenRequiresD1 (const GpuReduction* gr){ switch (gr->op){ case GA_REDUCE_MINANDARGMIN: case GA_REDUCE_MAXANDARGMAX: @@ -2715,48 +3104,45 @@ static int reduxGenRequiresDstArg (const GpuReduction* gr){ return 0; } } - -/** - * @brief Returns whether the generated kernel internally requires a dst - * workspace. - * - * This is semantically subtly different from reduxGenRequiresDst(). The main - * difference is in the implementation of the GA_REDUCE_ARGMIN/ARGMAX - * reductions; both require a dst workspace buffer for the min/max values - * associated with the indices that they return, even though they will be - * discarded. - * - * As of now, all reductions use a dst workspace internally. - */ - -static int reduxGenKernelRequiresDst (const GpuReduction* gr){ +static int reduxGenKernelRequiresLatticeS0(const GpuReduction* gr){ + return reduxGenRequiresS0(gr); +} +static int reduxGenKernelRequiresLatticeD0(const GpuReduction* gr){ + return reduxGenRequiresD0(gr); +} +static int reduxGenKernelRequiresLatticeD1(const GpuReduction* gr){ + return reduxGenRequiresD1(gr); +} +static int reduxGenKernelRequiresLatticeI0(const GpuReduction* gr){ + return reduxGenRequiresD1(gr); +} +static int reduxGenKernelRequiresStateK0 (const GpuReduction* gr){ + return reduxGenKernelRequiresLatticeS0(gr); +} +static int reduxGenKernelRequiresStateK1 (const GpuReduction* gr){ + return reduxGenKernelRequiresLatticeI0(gr); +} +static int reduxGenKernelRequiresWspace (const GpuReduction* gr){ + (void)gr; return 1; } -/** - * @brief Returns whether the generated kernel internally requires a dstArg - * workspace. - * - * This is semantically subtly different from reduxHasDstArg(), since it asks - * whether the reduction, even though it might not accept a dstArg argument, - * still requires a dstArg workspace internally. - * - * Currently, there exist no operations that require a dstArg workspace - * internally but which is not also part of the external interface. - */ - -static int reduxGenKernelRequiresDstArg (const GpuReduction* gr){ - return reduxGenRequiresDstArg(gr); -} /** - * @brief Whether or not an axis is maybe split. - * - * An axis is possibly split if it is the last free or last reduction axis. + * Get size and alignment requirements of K0 and K1 states. 
*/ -static int reduxGenAxisMaybeSplit (const GpuReduction* gr, int axis){ - return axis == gr->ndd-1 || axis == gr->nds-1; +static size_t reduxGenGetK0Size (const GpuReduction* gr){ + return gr->TK0.size; +} +static size_t reduxGenGetK0Align (const GpuReduction* gr){ + return gr->TK0.align; +} +static size_t reduxGenGetK1Size (const GpuReduction* gr){ + return gr->TK1.size; +} +static size_t reduxGenGetK1Align (const GpuReduction* gr){ + return gr->TK1.align; } /** @@ -2764,20 +3150,15 @@ static int reduxGenAxisMaybeSplit (const GpuReduction* gr, int ax */ static size_t reduxGenGetReduxStateSize (const GpuReduction* gr){ - size_t total = 0, idxSize = gpuarray_get_elsize(gr->idxTypeCode); - + size_t total = 0, idxSize = gpuarray_get_elsize(gr->TS64tc); + /* The accumulator and index types can be wider than dst/dstArg's types. */ - total += reduxGenKernelRequiresDst(gr) ? - gpuarray_get_elsize(gr->accTypeCode) : - 0; - total += reduxGenKernelRequiresDstArg(gr) ? - gpuarray_get_elsize(gr->idxTypeCode) : - 0; - + total += reduxGenKernelRequiresStateK0(gr) ? reduxGenGetK0Size(gr) : 0; + total += reduxGenKernelRequiresStateK1(gr) ? reduxGenGetK1Size(gr) : 0; + /* At minimum, there must be space for the offset permute. */ total = total < idxSize ? idxSize : total; - - + /* Return the calculated amount of space. */ return total; } @@ -2785,143 +3166,113 @@ static size_t reduxGenGetReduxStateSize (const GpuReduction* gr){ /** * @brief Get the maximum number of threads this operator's kernel can handle. */ - -static size_t reduxGenGetMaxLocalSize (const GpuReduction* gr){ - return gr->maxLK; -} - -/** - * @brief Get the shared memory consumption for a given block size. - * - * This is non-trivial since it requires ensuring alignment of datatypes. - */ - -static size_t reduxGenGetSHMEMSize (const GpuReduction* gr, size_t bs){ - const gpuarray_type* type; - size_t total = 0, permuteSpace; - - if(reduxGenKernelRequiresDst(gr)){ - type = gpuarray_get_type(gr->accTypeCode); - total = DIVIDECEIL(total, type->align)*type->align; - total += bs*type->size; - } - if(reduxGenKernelRequiresDstArg(gr)){ - type = gpuarray_get_type(gr->idxTypeCode); - total = DIVIDECEIL(total, type->align)*type->align; - total += bs*type->size; - } - - /* Ensure space for pointer permute. */ - permuteSpace = gpuarray_get_type(gr->idxTypeCode)->size * bs; - if(total < permuteSpace){ - total = permuteSpace; - } - - return total; + +static size_t reduxGenGetMaxLocalSize (const GpuReduction* gr){ + return gr->maxLK; } /** - * @brief Get the shared memory byte offset for dst. + * @brief Get the shared memory consumption for a given block size. */ -static size_t reduxGenGetSHMEMDstOff (const GpuReduction* gr, size_t bs){ - return 0; +static size_t reduxGenGetSHMEMSize (const GpuReduction* gr, size_t cells){ + size_t total = 0, totalPermute; + + /* Compute size of SHMEM working space */ + total += reduxGenKernelRequiresStateK0(gr) ? cells*reduxGenGetK0Size(gr) : 0; + total += reduxGenKernelRequiresStateK1(gr) ? cells*reduxGenGetK1Size(gr) : 0; + + /* But ensure space for pointer offset permute at beginning of kernel. */ + totalPermute = cells*gpuarray_get_type(gr->TS64tc)->size; + total = total < totalPermute ? totalPermute : total; + + return total; } /** - * @brief Get the shared memory byte offset for dstArg. + * @brief Get the shared memory byte offset for the k0 and k1 states. 
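+ *
+ *        The state with the stricter alignment requirement is laid out first,
+ *        so that (for the usual power-of-two alignments) the second region
+ *        starts on a suitably aligned boundary with no padding in between.
+ *        As an illustrative sketch, assuming a float32 k0 state (4-byte size
+ *        and alignment) and an int64 k1 state (8-byte size and alignment),
+ *        for `cells` cells the k1 region occupies bytes [0, cells*8) and the
+ *        k0 region begins at byte offset cells*8.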
*/ -static size_t reduxGenGetSHMEMDstArgOff (const GpuReduction* gr, size_t bs){ - const gpuarray_type* type; - size_t total = 0; - - if(reduxGenKernelRequiresDst(gr) && reduxGenKernelRequiresDstArg(gr)){ - type = gpuarray_get_type(gr->accTypeCode); - total = DIVIDECEIL(total, type->align)*type->align; - total += bs*type->size; - type = gpuarray_get_type(gr->idxTypeCode); - total = DIVIDECEIL(total, type->align)*type->align; - - return total; +static size_t reduxGenGetSHMEMK0Off (const GpuReduction* gr, size_t cells){ + if (!reduxGenKernelRequiresWspace (gr)|| + !reduxGenKernelRequiresStateK0(gr)|| + !reduxGenKernelRequiresStateK1(gr)){ + return 0; + } + + if (reduxGenGetK0Align(gr) > reduxGenGetK1Align(gr)){ + return 0; + }else{ + return cells*reduxGenGetK1Size(gr); + } +} +static size_t reduxGenGetSHMEMK1Off (const GpuReduction* gr, size_t cells){ + if (!reduxGenKernelRequiresWspace (gr)|| + !reduxGenKernelRequiresStateK0(gr)|| + !reduxGenKernelRequiresStateK1(gr)){ + return 0; + } + + if (reduxGenGetK0Align(gr) > reduxGenGetK1Align(gr)){ + return cells*reduxGenGetK0Size(gr); }else{ return 0; } } /** - * Get the amount of Workspace memory required. - * + * Get the amount of workspace memory required. + * * NOT necessarily the same as amount of SHMEM! The workspace is NOT used for * intrablock offset permutes, for instance. */ -static size_t reduxGenGetWMEMSize (const GpuReduction* gr, size_t bs){ - const gpuarray_type* type; +static size_t reduxGenGetWMEMSize (const GpuReduction* gr, size_t cells){ size_t total = 0; - - if(reduxGenKernelRequiresDst(gr)){ - type = gpuarray_get_type(gr->accTypeCode); - total = DIVIDECEIL(total, type->align)*type->align; - total += bs*type->size; - } - if(reduxGenKernelRequiresDstArg(gr)){ - type = gpuarray_get_type(gr->idxTypeCode); - total = DIVIDECEIL(total, type->align)*type->align; - total += bs*type->size; - } - - return total; -} -/** - * @brief Get the workspace memory byte offset for dst. - */ + total += reduxGenKernelRequiresStateK0(gr) ? cells*reduxGenGetK0Size(gr) : 0; + total += reduxGenKernelRequiresStateK1(gr) ? cells*reduxGenGetK1Size(gr) : 0; -static size_t reduxGenGetWMEMDstOff (const GpuReduction* gr, size_t bs){ - return reduxGenGetSHMEMDstOff(gr, bs); + return total; } /** - * @brief Get the workspace memory byte offset for dstArg. + * @brief Get the workspace memory byte offset for the k0 and k1 states. */ -static size_t reduxGenGetWMEMDstArgOff (const GpuReduction* gr, size_t bs){ - return reduxGenGetSHMEMDstArgOff(gr, bs); +static size_t reduxGenGetWMEMK0Off (const GpuReduction* gr, size_t cells){ + return reduxGenGetSHMEMK0Off(gr, cells); +} +static size_t reduxGenGetWMEMK1Off (const GpuReduction* gr, size_t cells){ + return reduxGenGetSHMEMK1Off(gr, cells); } /** * @brief Initialize the context. - * + * * After this function, calling reduxInvCleanup*() becomes safe. */ -static int reduxInvInit (redux_ctx* ctx){ +static int reduxInvInit (redux_ctx* ctx){ /** * We initialize certain parts of the context. 
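+	 *
+	 * In particular, every pointer member is set to NULL below, so that any
+	 * later error path can hand the context to reduxInvCleanup*() and have
+	 * it released safely even if some allocations were never performed.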
*/ - - ctx->l = NULL; - ctx->lPDim = NULL; - ctx->sJ = NULL; - ctx->dJ = NULL; - ctx->aJ = NULL; - ctx->ibs = NULL; - ctx->ibp = NULL; - ctx->iblPDim = NULL; - ctx->ibsOff = NULL; - ctx->ibdOff = NULL; - ctx->ibaOff = NULL; - ctx->kArgs = NULL; - ctx->xdSrc = NULL; - ctx->xdSrcPtrs = NULL; - ctx->xdTmpPtrs = NULL; - ctx->xdSplit = NULL; - - ctx->w = NULL; - - ctx->prodAllAxes = ctx->prodRdxAxes = ctx->prodFreeAxes = 1; - ctx->bs = ctx->gs = 1; + + ctx->L = ctx->Li = NULL; + ctx->S0J = ctx->S0Si = NULL; + ctx->D0J = ctx->D0Si = NULL; + ctx->D1J = ctx->D1Si = NULL; + ctx->I0J = ctx->I0Si = NULL; + ctx->perm = NULL; + ctx->kArgs = NULL; + ctx->xdSrc = NULL; + ctx->xdSrcPtrs = NULL; + ctx->xdSplit = NULL; + + ctx->W = NULL; + + ctx->prodAllAxes = ctx->prodRdxAxes = ctx->prodFreeAxes = 1; + ctx->bs = ctx->gs = 1; return reduxInvInferProperties(ctx); } @@ -2936,50 +3287,51 @@ static int reduxInvInferProperties (redux_ctx* ctx){ size_t d; - /* Insane src, reduxLen, dst or dstArg? */ - if(!ctx->reduxList){ - ctx->reduxLen = ctx->src->nd; - } - if (!ctx->src){ + /* Insane s0, reduxLen, d0 or d1? */ + if (reduxInvRequiresS0(ctx) && !ctx->s0){ return reduxInvCleanupMsg(ctx, GA_INVALID_ERROR, - "src is NULL!\n"); - }else if (ctx->src->nd <= 0){ + "s0 is NULL, but reduction requires it!\n"); + } + if (!ctx->reduxList){ + ctx->reduxLen = reduxInvRequiresS0(ctx) ? ctx->s0->nd : 0; + } + if (reduxInvRequiresS0(ctx) && ctx->s0->nd <= 0){ return reduxInvCleanupMsg(ctx, GA_INVALID_ERROR, - "src is a scalar, cannot reduce it!\n"); - }else if (ctx->reduxLen < 0){ + "s0 is a scalar, cannot reduce it further!\n"); + }else if (reduxInvRequiresS0(ctx) && ctx->reduxLen < 0){ return reduxInvCleanupMsg(ctx, GA_INVALID_ERROR, - "Length of list of dimensions to be reduced is less than 0!\n"); - }else if (ctx->src->nd < (unsigned)ctx->reduxLen){ + "Length of list of axes to be reduced is less than 0!\n"); + }else if (reduxInvRequiresS0(ctx) && ctx->s0->nd < (unsigned)ctx->reduxLen){ return reduxInvCleanupMsg(ctx, GA_INVALID_ERROR, - "src has fewer dimensions than there are dimensions to reduce!\n"); - }else if (reduxInvRequiresDst (ctx) && !ctx->dst){ + "s0 has fewer axes than there are axes to reduce!\n"); + }else if (reduxInvRequiresD0(ctx) && !ctx->d0){ return reduxInvCleanupMsg(ctx, GA_INVALID_ERROR, - "dst is NULL, but reduction requires it!\n"); - }else if (reduxInvRequiresDstArg(ctx) && !ctx->dstArg){ + "d0 is NULL, but reduction requires it!\n"); + }else if (reduxInvRequiresD1(ctx) && !ctx->d1){ return reduxInvCleanupMsg(ctx, GA_INVALID_ERROR, - "dstArg is NULL, but reduction requires it!\n"); - }else if (ctx->dst && ctx->dst->nd +ctx->reduxLen != ctx->src->nd){ + "d1 is NULL, but reduction requires it!\n"); + }else if (reduxInvRequiresD0(ctx) && reduxInvRequiresS0(ctx) && ctx->d0->nd+ctx->reduxLen != ctx->s0->nd){ return reduxInvCleanupMsg(ctx, GA_INVALID_ERROR, - "dst is of incorrect dimensionality for this reduction!\n"); - }else if (ctx->dstArg && ctx->dstArg->nd+ctx->reduxLen != ctx->src->nd){ + "d0 is of incorrect rank for this reduction!\n"); + }else if (reduxInvRequiresD1(ctx) && reduxInvRequiresS0(ctx) && ctx->d1->nd+ctx->reduxLen != ctx->s0->nd){ return reduxInvCleanupMsg(ctx, GA_INVALID_ERROR, - "dstArg is of incorrect dimensionality for this reduction!\n"); + "d1 is of incorrect rank for this reduction!\n"); } - ctx->nds = ctx->src->nd; - ctx->ndr = ctx->reduxLen; - ctx->ndd = ctx->nds - ctx->ndr; - ctx->ndfs = ctx->ndfr = ctx->ndfd = 0; - + ctx->nds0 = reduxInvRequiresS0(ctx) ? 
ctx->s0->nd : 0; + ctx->nds0r = ctx->reduxLen; + ctx->ndd0 = ctx->nds0 - ctx->nds0r; + ctx->ndfs0 = ctx->ndfs0r = ctx->ndfd0 = 0; + /* Insane reduxList? */ - for (i=0;indr;i++){ + for (i=0;inds0r;i++){ j = ctx->reduxList ? ctx->reduxList[i] : i; - if (j < -ctx->nds || j >= ctx->nds){ + if (j < -ctx->nds0 || j >= ctx->nds0){ return reduxInvCleanupMsg(ctx, GA_INVALID_ERROR, "Insane axis number %d! Should be [%d, %d)!\n", - j, -ctx->nds, ctx->nds); + j, -ctx->nds0, ctx->nds0); } - j = j<0 ? ctx->nds+j : j; - d = ctx->src->dimensions[j]; + j = j<0 ? ctx->nds0+j : j; + d = ctx->s0->dimensions[j]; ctx->zeroRdxAxes += !d; ctx->prodRdxAxes *= d?d:1; } @@ -2987,55 +3339,55 @@ static int reduxInvInferProperties (redux_ctx* ctx){ /** * Insane shape? - * + * * The source tensor is allowed to be empty (its shape may contain 0s). * However, all axes that are of length 0 must be reduction axes. - * + * * The reason for this is that a reduction cannot store any output into an - * empty destination tensor (whose dimensions are the free axes), because + * empty destination tensor (whose axes are the free axes), because * it has 0 space. The operation cannot then fulfill its contract. - * + * * On the other hand, when some or all reduction axes of a tensor are of * length 0, the reduction can be interpreted as initializing the * destination tensor to the identity value of the operation. For lack of a * better idea, the destination argument tensor can then be zeroed. */ - for (i=0;inds;i++){ - d = ctx->src->dimensions[i]; + for (i=0;inds0;i++){ + d = ctx->s0->dimensions[i]; ctx->zeroAllAxes += !d; ctx->prodAllAxes *= d?d:1; } if (ctx->zeroAllAxes != ctx->zeroRdxAxes){ return reduxInvCleanupMsg(ctx, GA_INVALID_ERROR, - "Source tensor has length-0 dimensions that are not reduced!\n"); + "Source tensor has length-0 axes that are not reduced!\n"); } ctx->prodFreeAxes = ctx->prodAllAxes/ctx->prodRdxAxes; /** * Allocate and construct source-tensor axis-description lists. - * + * * While constructing the descriptions of each axis, verify that: - * + * * 1. reduxLen has no duplicates. - * 2. dst and/or dstArg's dimensions match src's dimensions, stripped of + * 2. d0 and/or d1's axes match s0's axes when stripped of * the reduction axes. */ - ctx->xdSrc = calloc(ctx->nds, sizeof(*ctx->xdSrc)); - ctx->xdSrcPtrs = calloc(ctx->nds+1, sizeof(*ctx->xdSrcPtrs)); + ctx->xdSrc = calloc(ctx->nds0, sizeof(*ctx->xdSrc)); + ctx->xdSrcPtrs = calloc(ctx->nds0+1, sizeof(*ctx->xdSrcPtrs)); if (!ctx->xdSrc || !ctx->xdSrcPtrs){ return reduxInvCleanup(ctx, GA_MEMORY_ERROR); } - for (i=0;inds;i++){ + for (i=0;inds0;i++){ axisInit(&ctx->xdSrc[i], - ctx->src->dimensions[i], - ctx->src->strides[i]); + ctx->s0->dimensions[i], + ctx->s0->strides[i]); } - for (i=0;indr;i++){ + for (i=0;inds0r;i++){ j = ctx->reduxList ? ctx->reduxList[i] : i; - j = j<0 ? ctx->nds+j : j; + j = j<0 ? 
ctx->nds0+j : j; a = reduxInvGetSrcAxis(ctx, j); if (axisIsReduced(a)){ return reduxInvCleanupMsg(ctx, GA_INVALID_ERROR, @@ -3045,55 +3397,57 @@ static int reduxInvInferProperties (redux_ctx* ctx){ } axisMarkReduced(a, i); } - for (i=j=0;inds;i++){ - axis_desc* a = reduxInvGetSrcAxis(ctx, i); - size_t srcLen = axisGetLen(a), dstLen, dstArgLen; - + for (i=j=0;inds0;i++){ + axis_desc* a = reduxInvGetSrcAxis(ctx, i); + size_t s0Len = axisGetLen(a), d0Len, d1Len; + if (axisIsReduced(a)){continue;} - if (reduxInvRequiresDst(ctx)){ - dstLen = ctx->dst->dimensions[j]; - - if(srcLen != dstLen){ + if (reduxInvRequiresD0(ctx)){ + d0Len = ctx->d0->dimensions[j]; + + if (s0Len != d0Len){ return reduxInvCleanupMsg(ctx, GA_INVALID_ERROR, - "Source axis %d has length %zu, but " - "corresponding destination axis %d has length %zu!\n", - i, srcLen, j, dstLen); + "s0 axis %d has length %zu, but " + "corresponding d0 axis %d has length %zu!\n", + i, s0Len, j, d0Len); } - - a->dstStride = ctx->dst->strides[j]; + + a->d0S = ctx->d0->strides[j]; } - if (reduxInvRequiresDstArg(ctx)){ - dstArgLen = ctx->dstArg->dimensions[j]; - - if(srcLen != dstArgLen){ + if (reduxInvRequiresD1(ctx)){ + d1Len = ctx->d1->dimensions[j]; + + if (s0Len != d1Len){ return reduxInvCleanupMsg(ctx, GA_INVALID_ERROR, - "Source axis %d has length %zu, but " - "corresponding destination-argument axis %d has length %zu!\n", - i, srcLen, j, dstArgLen); + "s0 axis %d has length %zu, but " + "corresponding d1 axis %d has length %zu!\n", + i, s0Len, j, d1Len); } - - a->dstArgStride = ctx->dstArg->strides[j]; + + a->d1S = ctx->d1->strides[j]; } - + j++; } - - + + /** * Grab gpudata buffers and byte offsets before we begin flattening the * tensors. As we flatten the tensor, we may reverse some axes, leading to * a bump of the byte offset. */ - - ctx->flatSrcData = ctx->src->data; - ctx->flatSrcOffset = ctx->src->offset; - if(reduxInvRequiresDst(ctx)){ - ctx->flatDstData = ctx->dst->data; - ctx->flatDstOffset = ctx->dst->offset; + + if (reduxInvRequiresS0(ctx)){ + ctx->S0Data = ctx->s0->data; + ctx->S0Off = ctx->s0->offset; + } + if (reduxInvRequiresD0(ctx)){ + ctx->D0Data = ctx->d0->data; + ctx->D0Off = ctx->d0->offset; } - if(reduxInvRequiresDstArg(ctx)){ - ctx->flatDstArgData = ctx->dstArg->data; - ctx->flatDstArgOffset = ctx->dstArg->offset; + if (reduxInvRequiresD1(ctx)){ + ctx->D1Data = ctx->d1->data; + ctx->D1Off = ctx->d1->offset; } return reduxInvFlattenSource(ctx); @@ -3101,7 +3455,7 @@ static int reduxInvInferProperties (redux_ctx* ctx){ /** * @brief Flatten the source tensor as much as is practical. - * + * * This makes the axis lengths as long as possible and the tensor itself as * contiguous as possible. */ @@ -3110,158 +3464,158 @@ static int reduxInvFlattenSource (redux_ctx* ctx){ axis_desc* axis, *flatAxis, *sortAxis; int i, j, k, isSensitive; - ctx->ndfs = ctx->nds; + ctx->ndfs0 = ctx->nds0; /** - * Pass 1: Flatten out 0- and 1-length dimensions. We already know that - * - * a) There are no 0-length free dimensions, because that + * Pass 1: Flatten out 0- and 1-length axes. We already know that + * + * a) There are no 0-length free axes, because that * constitutes an invalid input, and - * b) How many 0-length reduction dimensions there are, because + * b) How many 0-length reduction axes there are, because * we counted them in the error-checking code. - * + * * So if there are any 0-length axes, we can delete all reduction axes and * replace them with a single one. 
- * + * * We can also delete 1-length axes outright, since they can always be * ignored; They are always indexed at [0]. */ - for (i=j=0;indfs;i++){ + for (i=j=0;indfs0;i++){ axis = reduxInvGetSrcAxis(ctx, i); if (!reduxTryFlattenOut(ctx, axis)){ *reduxInvGetSrcAxis(ctx, j++) = *axis; } } - if(ctx->zeroRdxAxes > 0){ + if (ctx->zeroRdxAxes > 0){ /* New reduction axis of 0 length. */ axisInit (reduxInvGetSrcAxis(ctx, j), 0, 0); axisMarkReduced(reduxInvGetSrcAxis(ctx, j), 0); j++; } - ctx->ndfs = j; + ctx->ndfs0 = j; /** - * Pass 2: Flatten out continuous dimensions, where strides and sensitivity + * Pass 2: Flatten out continuous axes, where strides and sensitivity * allows it. */ - - k = ctx->ndfs; + + k = ctx->ndfs0; isSensitive = reduxIsSensitive(ctx->op); - qsort(ctx->xdSrc, ctx->ndfs, sizeof(*ctx->xdSrc), + qsort(ctx->xdSrc, ctx->ndfs0, sizeof(*ctx->xdSrc), isSensitive ? reduxSortFlatSensitive : reduxSortFlatInsensitive); - for (i=j=1;indfs;i++){ + for (i=j=1;indfs0;i++){ flatAxis = reduxInvGetSrcAxis(ctx, j-1); sortAxis = reduxInvGetSrcAxis(ctx, i); - + if (reduxTryFlattenInto(ctx, flatAxis, sortAxis)){ k--; }else{ *reduxInvGetSrcAxis(ctx, j++) = *sortAxis; } } - ctx->ndfs = k; + ctx->ndfs0 = k; /** - * Compute number of free and reduced dimensions. + * Compute number of flattened free and reduced axes. */ - for(ctx->ndfr=ctx->ndfd=i=0;indfs;i++){ - if(axisIsReduced(reduxInvGetSrcAxis(ctx, i))){ - ctx->ndfr++; + for (ctx->ndfs0r=ctx->ndfd0=i=0;indfs0;i++){ + if (axisIsReduced(reduxInvGetSrcAxis(ctx, i))){ + ctx->ndfs0r++; }else{ - ctx->ndfd++; + ctx->ndfd0++; } } - return reduxInvComputeKArgs(ctx); + return reduxInvComputeKernelArgs(ctx); } /** * @brief Compute the arguments to the kernel. - * + * * This is a multistep process and involves a lot of axis sorting on various * criteria. */ -static int reduxInvComputeKArgs (redux_ctx* ctx){ +static int reduxInvComputeKernelArgs (redux_ctx* ctx){ axis_desc* axis, *prevAxis; - size_t target, aL, aLS; - int i, j, k, haveSplitFreeAxis, haveSplitReducedAxis; + size_t target, aL, aLS, perm, i0S; + int i, j, haveSplitFreeAxis, haveSplitReducedAxis; /** * STEP 0: Default Kernel Argument Values. - * + * * They should be valid for a "scalar" job. In particular, for any * non-existent axis, assume length 1. 
*/ - - ctx->phase = 0; + + ctx->selector = 0; ctx->U = 1; ctx->V = 1; ctx->B = 1; ctx->D = 1; ctx->H = 1; - ctx->splitFree = 1; - ctx->splitReduce = 1; - ctx->xdSplit = NULL; - ctx->l = calloc(ctx->gr->nds, sizeof(*ctx->l)); - ctx->lPDim = calloc(ctx->gr->ndr, sizeof(*ctx->lPDim)); - ctx->sJ = calloc(ctx->gr->nds, sizeof(*ctx->sJ)); - ctx->dJ = calloc(ctx->gr->ndd, sizeof(*ctx->dJ)); - ctx->aJ = calloc(ctx->gr->ndd, sizeof(*ctx->aJ)); - ctx->wdOff = 0; - ctx->pdOff = 0; - ctx->waOff = 0; - ctx->paOff = 0; - ctx->ibs = calloc(ctx->gr->log2MaxL, sizeof(*ctx->ibs)); - ctx->ibp = calloc(ctx->gr->log2MaxL, sizeof(*ctx->ibp)); - ctx->iblPDim = calloc(ctx->gr->log2MaxL, sizeof(*ctx->iblPDim)); - ctx->ibsOff = calloc(ctx->gr->log2MaxL, sizeof(*ctx->ibsOff)); - ctx->ibdOff = calloc(ctx->gr->log2MaxL, sizeof(*ctx->ibdOff)); - ctx->ibaOff = calloc(ctx->gr->log2MaxL, sizeof(*ctx->ibaOff)); + ctx->LSlice = 1; + ctx->LPadded = 1; + ctx->L = calloc(ctx->gr->nds, sizeof(*ctx->L)); + ctx->Li = calloc(ctx->gr->log2MaxBS, sizeof(*ctx->Li)); + ctx->S0J = calloc(ctx->gr->nds, sizeof(*ctx->S0J)); + ctx->S0Si = calloc(ctx->gr->log2MaxBS, sizeof(*ctx->S0Si)); + ctx->D0J = calloc(ctx->gr->ndd, sizeof(*ctx->D0J)); + ctx->D0Si = calloc(ctx->gr->log2MaxBS, sizeof(*ctx->D0Si)); + ctx->D1J = calloc(ctx->gr->ndd, sizeof(*ctx->D1J)); + ctx->D1Si = calloc(ctx->gr->log2MaxBS, sizeof(*ctx->D1Si)); + ctx->I0J = calloc(ctx->gr->nds, sizeof(*ctx->I0J)); + ctx->I0Si = calloc(ctx->gr->log2MaxBS, sizeof(*ctx->I0Si)); + ctx->W0Off = 0; + ctx->SHMEMK0Off = 0; + ctx->W1Off = 0; + ctx->SHMEMK1Off = 0; + ctx->perm = calloc(ctx->gr->log2MaxBS, sizeof(*ctx->perm)); ctx->bs = 1; ctx->gs = 1; ctx->kArgs = calloc(ctx->gr->kNumArgs, sizeof(*ctx->kArgs)); - - if(!ctx->l || !ctx->lPDim || !ctx->sJ || !ctx->dJ || - !ctx->aJ || !ctx->ibs || !ctx->ibp || !ctx->iblPDim || - !ctx->ibsOff || !ctx->ibdOff || !ctx->ibaOff || !ctx->kArgs){ + + if (!ctx->L || !ctx->Li || !ctx->S0J || !ctx->S0Si || + !ctx->D0J || !ctx->D0Si || !ctx->D1J || !ctx->D1Si || + !ctx->I0J || !ctx->I0Si || !ctx->perm || !ctx->kArgs){ return reduxInvCleanupMsg(ctx, GA_MEMORY_ERROR, "Failed to allocate memory for kernel invocation arguments!\n"); } - for(i=0;igr->nds;i++){ - ctx->l[i] = 1; + + for (i=0;igr->nds;i++){ + ctx->L[i] = 1; } - for(i=0;igr->log2MaxL;i++){ - ctx->ibs[i] = 1; + for (i=0;igr->log2MaxBS;i++){ + ctx->Li[i] = 1; } /** * STEP 1: Select Intra-Block Axes. - * + * * Sort the axes in the order likely to maximize contiguity of source * memory accesses, then tag them to the kernel block size limit, possibly * splitting an axis in the process. */ - - reduxSortAxisPtrsBy(ctx->xdSrcPtrs, ctx->xdSrc, ctx->ndfs, - reduxSortPtrIBSrcRdSelect); + + reduxSortAxisPtrsBy(ctx->xdSrcPtrs, ctx->xdSrc, ctx->ndfs0, + reduxSortPtrS0AbsStride); target = reduxGenGetMaxLocalSize(ctx->gr); - - for(i=0;indfs && igr->log2MaxL;i++){ + + for (i=0;indfs0 && igr->log2MaxBS;i++){ axis = reduxInvGetSrcSortAxis(ctx, i); aL = axisGetLen(axis); - - if(ctx->bs*aL <= target){ + + if (ctx->bs*aL <= target){ ctx->bs *= aL; axisMarkIntraBlock(axis, i, aL); }else{ - if(target/ctx->bs >= 2){ + if (target/ctx->bs >= 2){ aLS = target/ctx->bs; ctx->bs *= aLS; axisMarkIntraBlock(axis, i, aLS); @@ -3271,344 +3625,292 @@ static int reduxInvComputeKArgs (redux_ctx* ctx){ break; } } - ctx->ndib = i; + ctx->ndib = i; + ctx->LSlice = ctx->xdSplit ? axisGetIntraLen(ctx->xdSplit) : 1; /** - * STEP 2: Compute values dependent only on the intrablock axis selection. 
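	 *
	 * A hypothetical illustration of the greedy selection above (numbers
	 * are illustrative only, not from any test): with target = 256 and
	 * flattened axes visited in the order produced by the stride sort with
	 * lengths {32, 16, 4}, the 32-length axis is taken whole (bs = 32);
	 * the 16-length axis does not fit (32*16 > 256), but target/bs = 8 >= 2,
	 * so only an intra-block slice of length 8 of it is taken (bs = 256,
	 * LSlice = 8) and that axis becomes the split axis; the 4-length axis
	 * no longer fits at all and stays purely inter-block.
	 *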
- * - * For instance, the splitFree/splitReduce factors depend only on the split - * axis, if any. - * - * The shared memory consumption and shared memory offsets depend only - * on block size. + * STEP 2: Compute U, B, D, Dunit, H */ - ctx->splitFree = reduxInvGetSplitFree (ctx); - ctx->splitReduce = reduxInvGetSplitReduce (ctx); - ctx->SHMEM = reduxGenGetSHMEMSize (ctx->gr, ctx->bs); - ctx->pdOff = reduxGenGetSHMEMDstOff (ctx->gr, ctx->bs); - ctx->paOff = reduxGenGetSHMEMDstArgOff(ctx->gr, ctx->bs); - - - /** - * STEP 3: Compute U, B, D, H - */ - - for (i=0;indfs;i++){ + for (i=0;indfs0;i++){ axis = reduxInvGetSrcAxis(ctx, i); ctx->U *= axisGetInterLen(axis); ctx->B *= axisIsReduced(axis) ? axisGetInterLen(axis) : 1; - ctx->H *= axisIsReduced(axis) ? axisGetIntraLen(axis) : 1; + ctx->D *=!axisIsReduced(axis) ? axisGetIntraLen(axis) : 1; } - ctx->D = ctx->bs/ctx->H; - - + ctx->H = ctx->Dbs ? reduxNextPow2(ctx->bs) : ctx->bs; + ctx->Dunit = ctx->D/ctx->LSlice; + + /** - * STEP 4: Compute PDim values. - * + * STEP 3: Compute shared memory parameters. + */ + + ctx->shmemBytes = reduxGenGetSHMEMSize (ctx->gr, ctx->H); + ctx->SHMEMK0Off = reduxGenGetSHMEMK0Off(ctx->gr, ctx->H); + ctx->SHMEMK1Off = reduxGenGetSHMEMK1Off(ctx->gr, ctx->H); + + + /** + * STEP 4: Compute I0 stride values. + * * This will be used for index calculation. */ - - reduxSortAxisPtrsBy(ctx->xdSrcPtrs, ctx->xdSrc, ctx->ndfs, + + reduxSortAxisPtrsBy(ctx->xdSrcPtrs, ctx->xdSrc, ctx->ndfs0, reduxSortPtrByReduxNum); - for (i=0;indfs;i++){ + for (i=0,i0S=1;indfs0;i++){ axis = reduxInvGetSrcSortAxis(ctx, i); - - if(axisIsReduced(axis)){ - if(i==0){ - axisSetPDim(axis, 1); - }else{ - prevAxis = reduxInvGetSrcSortAxis(ctx, i-1); - axisSetPDim(axis, axisGetPDim(prevAxis)*axisGetLen(prevAxis)); - } + + if (axisIsReduced(axis)){ + axisSetI0Stride(axis, i0S); + i0S *= axisGetLen(axis); } } - - + + /** * STEP 5: Compute Intra-Block Permute Core. - * + * * Sort the axes in the order most likely to maximize contiguity of * destination/destination argument memory accesses, then compute the * permutation that achieves the highest-bandwidth, * post-horizontal-reduction destination writes. */ - - reduxSortAxisPtrsBy(ctx->xdSrcPtrs, ctx->xdSrc, ctx->ndfs, - reduxInvRequiresDst(ctx) ? - reduxSortPtrIBDstWrSelect : - reduxSortPtrIBDstArgWrSelect); - for(i=0;indfs;i++){ + + reduxSortAxisPtrsBy(ctx->xdSrcPtrs, ctx->xdSrc, ctx->ndfs0, + reduxInvRequiresD0(ctx)? + reduxSortPtrD0WrSelect : + reduxSortPtrD1WrSelect); + for (i=0,perm=1;indfs0;i++){ axis = reduxInvGetSrcSortAxis(ctx, i); - - if(axisIsIntra(axis)){ - if(i==0){ - axisSetIBP(axis, 1); - }else{ + + if (axisIsIntra(axis)){ + if (i>0 && axisIsReduced(axis)){ prevAxis = reduxInvGetSrcSortAxis(ctx, i-1); - axisSetIBP(axis, axisGetIBP(prevAxis)*axisGetIntraLen(prevAxis)); + if (!axisIsReduced(prevAxis)){ + /** + * The permute stride of the lowest-absolute-stride + * reduced axis must be a power of two to make horizontal + * reduction easier. + */ + + perm = reduxNextPow2(perm); + } } + axisSetPerm(axis, perm); + perm *= axisGetIntraLen(axis); } } - + + /** * STEP 6. Place the intra axis arguments - * - * ibs, ibp, iblPDim, ibsOff, ibdOff, ibaOff - * + * + * LN, perm, S0SNi, D0SNi, D1SNi, I0SNi + * * For this we need the axes in final order of insertion. 
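	 *
	 * The power-of-two rounding applied to H and to the permute stride of
	 * the lowest-stride reduced axis above exists so that the horizontal
	 * (intra-block) reduction can proceed as a simple halving tree.  A
	 * hedged serial model of that tree follows (hypothetical helper name,
	 * not the code the generator emits; max is used as the example op):
	 *
	 *     // Serial model of a power-of-two halving tree over H
	 *     // shared-memory slots; slots past the real data are assumed
	 *     // to hold the operation's identity value.
	 *     static void treeReduceModel(float* k0, size_t H){
	 *         size_t h, t;
	 *         for (h = H/2; h > 0; h >>= 1){   // one pass == one barrier
	 *             for (t = 0; t < h; t++){     // t stands in for a thread
	 *                 k0[t] = k0[t] > k0[t+h] ? k0[t] : k0[t+h];
	 *             }
	 *         }
	 *         // k0[0] now holds the reduction of all H inputs.
	 *     }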
*/ - - reduxSortAxisPtrsBy(ctx->xdSrcPtrs, ctx->xdSrc, ctx->ndfs, + + reduxSortAxisPtrsBy(ctx->xdSrcPtrs, ctx->xdSrc, ctx->ndfs0, reduxSortPtrInsertFinalOrder); - for(i=0;indib;i++){ + for (i=0;indib;i++){ axis = reduxInvGetSrcSortAxis(ctx, i); - - ctx->ibs [i] = axisGetIntraLen (axis); - ctx->ibp [i] = axisGetIBP (axis); - ctx->iblPDim[i] = axisGetPDim (axis); - ctx->ibsOff [i] = axisGetSrcStride (axis); - ctx->ibdOff [i] = axisGetDstStride (axis); - ctx->ibaOff [i] = axisGetDstArgStride(axis); + + ctx->Li [i] = axisGetIntraLen(axis); + ctx->perm[i] = axisGetPerm (axis); + ctx->S0Si[i] = axisGetS0Stride(axis); + ctx->D0Si[i] = axisGetD0Stride(axis); + ctx->D1Si[i] = axisGetD1Stride(axis); + ctx->I0Si[i] = axisGetI0Stride(axis); } - + + /** * STEP 7. Place the inter axis arguments - * - * lN, lNPDim, sJN, dJN, aJN - * + * + * LN, S0JN, D0JN, D1JN, I0JN + * * , where N in [0, ctx->gr->ndd) are free axes, * N in [ctx->gr->ndd, ctx->gr->nds) are reduced axes, * and ctx->xdSrcPtr[...] are sorted in the reverse of that order for - * insertion, and excludes any split axis. - * + * insertion, and excludes any intra axis (including the split one). + * * How precisely the insertion is done depends closely on whether there is * a split axis and if so whether it is free or reduced. - * + * * - If there is a split axis and it is free, then it should be inserted as * the first free axis. Its jumps should be - * sJN = -sSM*intrainterLenM + sSN*splitFree - * dJN = -dSM*intrainterLenM + dSN*splitFree - * aJN = -aSM*intrainterLenM + aSN*splitFree + * S0JN = -S0SM*intrainterLenM + S0SN*splitFree + * D0JN = -D0SM*intrainterLenM + D0SN*splitFree + * D1JN = -D1SM*intrainterLenM + D1SN*splitFree + * I0JN = -I0SM*intrainterLenM + I0SN*splitFree * - If there is a split axis and it is reduced, then it should be inserted * as the first reduced axis. Its jump should be - * sJN = -sSM*intrainterLenM + sSN*splitReduced + * S0JN = -S0SM*intrainterLenM + S0SN*splitReduced + * I0JN = -I0SM*intrainterLenM + I0SN*splitReduced * - If there is no split axis, proceed normally in filling the axes. */ - + haveSplitFreeAxis = ctx->xdSplit && !axisIsReduced(ctx->xdSplit); haveSplitReducedAxis = ctx->xdSplit && axisIsReduced(ctx->xdSplit); - + j = ctx->gr->nds-1; + /* If we have a reduced split axis, insert it before any other reduced axis. */ - j = ctx->gr->nds-1; - k = ctx->gr->ndr-1; - if(haveSplitReducedAxis && k>=0){ - ctx->l [j] = axisGetLen (ctx->xdSplit); - ctx->lPDim [k] = axisGetPDim (ctx->xdSplit); - ctx->sJ [j] += (ssize_t)axisGetSrcStride (ctx->xdSplit)* - (ssize_t)axisGetIntraLen (ctx->xdSplit); - if(j>0){ - ctx->sJ [j-1] -= (ssize_t)axisGetSrcStride (ctx->xdSplit)* - (ssize_t)axisGetIntraInterLen(ctx->xdSplit); + if (haveSplitReducedAxis && j>=ctx->gr->ndd){ + ctx->L [j] = axisGetLen (ctx->xdSplit); + ctx->S0J[j] += (ssize_t)axisGetS0Stride(ctx->xdSplit)* + (ssize_t)axisGetIntraLen(ctx->xdSplit); + ctx->I0J[j] += (ssize_t)axisGetI0Stride(ctx->xdSplit)* + (ssize_t)axisGetIntraLen(ctx->xdSplit); + if (j>0){ + ctx->S0J[j-1] -= (ssize_t)axisGetS0Stride (ctx->xdSplit)* + (ssize_t)axisGetIntraInterLen(ctx->xdSplit); + ctx->I0J[j-1] -= (ssize_t)axisGetI0Stride (ctx->xdSplit)* + (ssize_t)axisGetIntraInterLen(ctx->xdSplit); } j--; - k--; } - + /* Insert rest of reduced axes. 
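	 *
	 * A hedged serial model of how jump tables built this way are consumed
	 * (hypothetical helper, not the generated kernel): because each
	 * S0J[n-1] already folds in the rewind of axis n, stepping any index
	 * costs exactly one pointer bump.  trip[n] stands for the inter-block
	 * step count of axis n; the real kernel also bounds the walk by V and
	 * by the phase, which this model ignores.
	 *
	 *     // p starts at the first element; i[] are the odometer digits.
	 *     static size_t walkModel(const char* p, const size_t* trip,
	 *                             const ssize_t* S0J, int nds){
	 *         size_t i[8] = {0}, visited = 0;   // assumes nds <= 8 here
	 *         int    n;
	 *         for (;;){
	 *             visited++;                    // "visit" the element at p
	 *             for (n = nds-1; n >= 0; n--){
	 *                 p += S0J[n];              // advance axis n one step
	 *                 if (++i[n] < trip[n]){break;}
	 *                 i[n] = 0;                 // wrapped; the S0J[n-1] added
	 *             }                             // next contains axis n's rewind
	 *             if (n < 0){break;}            // every axis wrapped: done
	 *         }
	 *         return visited;
	 *     }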
*/ - for(;indfs && k>=0;i++,j--,k--){ + for (;indfs0 && j>=ctx->gr->ndd;i++,j--){ axis = reduxInvGetSrcSortAxis(ctx, i); - if(!axisIsReduced(axis)){ + if (!axisIsReduced(axis)){ break; } - - ctx->l [j] = axisGetLen (axis); - ctx->lPDim [k] = axisGetPDim (axis); - ctx->sJ [j] += (ssize_t)axisGetSrcStride (axis)* - (ssize_t)axisGetIntraLen (axis); - if(j>0){ - ctx->sJ [j-1] -= (ssize_t)axisGetSrcStride (axis)* - (ssize_t)axisGetIntraInterLen(axis); + + ctx->L [j] = axisGetLen (axis); + ctx->S0J[j] += (ssize_t)axisGetS0Stride(axis)* + (ssize_t)axisGetIntraLen(axis); + ctx->I0J[j] += (ssize_t)axisGetI0Stride(axis)* + (ssize_t)axisGetIntraLen(axis); + if (j>0){ + ctx->S0J[j-1] -= (ssize_t)axisGetS0Stride (axis)* + (ssize_t)axisGetIntraInterLen(axis); + ctx->I0J[j-1] -= (ssize_t)axisGetI0Stride (axis)* + (ssize_t)axisGetIntraInterLen(axis); } } - + /* If we have a free split axis, insert it before any other free axis. */ - k = ctx->gr->ndd-1; - if(haveSplitFreeAxis && k>=0){ - ctx->l [k] = axisGetLen (ctx->xdSplit); - ctx->sJ [k] += (ssize_t)axisGetSrcStride (ctx->xdSplit)* - (ssize_t)axisGetIntraLen (ctx->xdSplit); - ctx->dJ [k] += (ssize_t)axisGetDstStride (ctx->xdSplit)* - (ssize_t)axisGetIntraLen (ctx->xdSplit); - ctx->aJ [k] += (ssize_t)axisGetDstArgStride (ctx->xdSplit)* - (ssize_t)axisGetIntraLen (ctx->xdSplit); - if(k>0){ - ctx->sJ [k-1] -= (ssize_t)axisGetSrcStride (ctx->xdSplit)* - (ssize_t)axisGetIntraInterLen(ctx->xdSplit); - ctx->dJ [k-1] -= (ssize_t)axisGetDstStride (ctx->xdSplit)* - (ssize_t)axisGetIntraInterLen(ctx->xdSplit); - ctx->aJ [k-1] -= (ssize_t)axisGetDstArgStride (ctx->xdSplit)* - (ssize_t)axisGetIntraInterLen(ctx->xdSplit); + j = ctx->gr->ndd-1; + if (haveSplitFreeAxis && j>=0){ + ctx->L [j] = axisGetLen (ctx->xdSplit); + ctx->S0J[j] += (ssize_t)axisGetS0Stride(ctx->xdSplit)* + (ssize_t)axisGetIntraLen(ctx->xdSplit); + ctx->D0J[j] += (ssize_t)axisGetD0Stride(ctx->xdSplit)* + (ssize_t)axisGetIntraLen(ctx->xdSplit); + ctx->D1J[j] += (ssize_t)axisGetD1Stride(ctx->xdSplit)* + (ssize_t)axisGetIntraLen(ctx->xdSplit); + ctx->I0J[j] += (ssize_t)axisGetI0Stride(ctx->xdSplit)* + (ssize_t)axisGetIntraLen(ctx->xdSplit); + if (j>0){ + ctx->S0J[j-1] -= (ssize_t)axisGetS0Stride (ctx->xdSplit)* + (ssize_t)axisGetIntraInterLen(ctx->xdSplit); + ctx->D0J[j-1] -= (ssize_t)axisGetD0Stride (ctx->xdSplit)* + (ssize_t)axisGetIntraInterLen(ctx->xdSplit); + ctx->D1J[j-1] -= (ssize_t)axisGetD1Stride (ctx->xdSplit)* + (ssize_t)axisGetIntraInterLen(ctx->xdSplit); + ctx->I0J[j-1] -= (ssize_t)axisGetI0Stride (ctx->xdSplit)* + (ssize_t)axisGetIntraInterLen(ctx->xdSplit); } - k--; + j--; } - + /* Insert rest of free axes. 
*/ - for(;indfs && k>=0;i++,k--){ + for (;indfs0 && j>=0;i++,j--){ axis = reduxInvGetSrcSortAxis(ctx, i); - if(axisIsReduced(axis)){ + if (axisIsReduced(axis)){ break; } - - ctx->l [k] = axisGetLen (axis); - ctx->sJ [k] += (ssize_t)axisGetSrcStride (axis)* - (ssize_t)axisGetIntraLen (axis); - ctx->dJ [k] += (ssize_t)axisGetDstStride (axis)* - (ssize_t)axisGetIntraLen (axis); - ctx->aJ [k] += (ssize_t)axisGetDstArgStride (axis)* - (ssize_t)axisGetIntraLen (axis); - if(k>0){ - ctx->sJ [k-1] -= (ssize_t)axisGetSrcStride (axis)* - (ssize_t)axisGetIntraInterLen(axis); - ctx->dJ [k-1] -= (ssize_t)axisGetDstStride (axis)* - (ssize_t)axisGetIntraInterLen(axis); - ctx->aJ [k-1] -= (ssize_t)axisGetDstArgStride (axis)* - (ssize_t)axisGetIntraInterLen(axis); + + ctx->L [j] = axisGetLen (axis); + ctx->S0J[j] += (ssize_t)axisGetS0Stride(axis)* + (ssize_t)axisGetIntraLen(axis); + ctx->D0J[j] += (ssize_t)axisGetD0Stride(axis)* + (ssize_t)axisGetIntraLen(axis); + ctx->D1J[j] += (ssize_t)axisGetD1Stride(axis)* + (ssize_t)axisGetIntraLen(axis); + ctx->I0J[j] += (ssize_t)axisGetI0Stride(axis)* + (ssize_t)axisGetIntraLen(axis); + if (j>0){ + ctx->S0J[j-1] -= (ssize_t)axisGetS0Stride (axis)* + (ssize_t)axisGetIntraInterLen(axis); + ctx->D0J[j-1] -= (ssize_t)axisGetD0Stride (axis)* + (ssize_t)axisGetIntraInterLen(axis); + ctx->D1J[j-1] -= (ssize_t)axisGetD1Stride (axis)* + (ssize_t)axisGetIntraInterLen(axis); + ctx->I0J[j-1] -= (ssize_t)axisGetI0Stride (axis)* + (ssize_t)axisGetIntraInterLen(axis); } } - return reduxInvSchedule(ctx); -} - -#if 0 -static void reduxScheduleKernel (int ndims, - uint64_t* dims, - uint64_t warpSize, - uint64_t maxLg, - uint64_t* maxLs, - uint64_t maxGg, - uint64_t* maxGs, - uint64_t* bs, - uint64_t* gs, - uint64_t* cs); - -/** - * @brief Given the parameters of a kernel scheduling problem, solve it as - * optimally as possible. - * - * NB: This is the only function in this entire file that should have - * anything to do with the integer factorization APIs. - */ - -static void reduxScheduleKernel (int ndims, - uint64_t* dims, - uint64_t warpSize, - uint64_t maxLg, - uint64_t* maxLs, - uint64_t maxGg, - uint64_t* maxGs, - uint64_t* bs, - uint64_t* gs, - uint64_t* cs){ - uint64_t warpMod, bestWarpMod = 1; - int i, bestWarpAxis = 0; - uint64_t roundedDims[MAX_HW_DIMS]; - double slack [MAX_HW_DIMS]; - ga_factor_list factBS [MAX_HW_DIMS]; - ga_factor_list factGS [MAX_HW_DIMS]; - ga_factor_list factCS [MAX_HW_DIMS]; - /** - * Quick check for scalar case. + * STEP 8. Compute the template selector. Requires finding the huge axis, + * if any. Then, compute LPadded, which depends on the selector + * value we choose. */ - if (ndims <= 0){ - return; + if (ctx->xdSplit && !axisIsReduced(ctx->xdSplit)){ + ctx->selector |= SELECTOR_SPLIT_FREE; } + for (i=0;indfs0;i++){ + axis = reduxInvGetSrcAxis(ctx, i); - - /** - * Identify the dimension to which the warp factor will be given. - * - * The current heuristic is to find the dimension that is either - * 1) Evenly divided by the warp size, or - * 2) As close to filling the last warp as possible. 
- */ - - for (i=0;i0 && (warpMod==0 || warpMod>=bestWarpMod)){ - bestWarpAxis = i; - bestWarpMod = warpMod; + if (axisGetLen(axis) >= ((uint64_t)1<<31)){ + if (axis == ctx->xdSplit){ + ctx->selector |= SELECTOR_HUGE_IS_SPLIT; + }else if (axisIsReduced(axis) == axisIsReduced(ctx->xdSplit)){ + ctx->selector |= SELECTOR_HUGE_SAME_TYPE; + }else{ + ctx->selector |= SELECTOR_HUGE_OPPOSITE_TYPE; + } } } - - if (ndims > 0){ - roundedDims[bestWarpAxis] = (roundedDims[bestWarpAxis] + warpSize - 1)/warpSize; - gaIFactorize(warpSize, 0, 0, &factBS[bestWarpAxis]); - } - - /** - * Factorization job. We'll steadily increase the slack in case of failure - * in order to ensure we do get a factorization, which we place into - * chunkSize. - */ - - for (i=0;iselector & SELECTOR_SPLIT_FREE){ + if (ctx->gr->ndd>0){ + ctx->LPadded = ctx->L[ctx->gr->ndd-1]; + } + }else{ + if (ctx->gr->nds>0){ + ctx->LPadded = ctx->L[ctx->gr->nds-1]; } } + ctx->LPadded = DIVIDECEIL(ctx->LPadded, ctx->LSlice)*ctx->LSlice; - /** - * Invoke the scheduler. - * - * The scheduler will move some factors from chunkSize into blockSize and - * gridSize, improving performance. - */ - gaIFLSchedule(ndims, maxLg, maxLs, maxGg, maxGs, factBS, factGS, factCS); - for (i=0;igs: The grid size, which is the number of thread blocks. * 2. ctx->V: The number of vertical reductions per thread block. - * + * * Two factors drive the scheduling: - * + * * 1. We want to keep all multiprocessors of the device busy; For this we use * an estimate of the level of parallelism of the device. * 2. If V can be chosen such that V % B == 0, then only a single kernel * phase is necessary. - * + * + * To do this, we first choose gs to be the number of blocks that roughly fills + * the available parallelism given the block size, but reduce it to at most U + * (The universal amount of vertical reductions to be done). + * + * We then select V as the minimum number of vertical reductions per block + * that will cover the universe U. + * + * Lastly, iff there exists a value V <= V' <= 2*V such that V' % B == 0, then + * increase V to the smallest such V' and recompute ctx->gs. + * * Once the scheduling is performed, the workspace can be allocated and * workspace offsets can be computed. */ @@ -3616,30 +3918,34 @@ static void reduxScheduleKernel (int ndims, static int reduxInvSchedule (redux_ctx* ctx){ const int flags = GA_BUFFER_READ_WRITE; size_t WSPACESIZE; - + /** - * Get enough blocks to fill available device parallelism to capacity. - * Then, compute corresponding V. + * Scheduling */ - - ctx->gs = DIVIDECEIL(reduxInvEstimateParallelism(ctx), - reduxGenGetMaxLocalSize(ctx->gr)); - ctx->V = DIVIDECEIL(ctx->U, ctx->gs); - + + ctx->gs = DIVIDECEIL(reduxInvEstimateParallelism(ctx), + reduxGenGetMaxLocalSize(ctx->gr)); + ctx->gs = ctx->gs > ctx->U ? ctx->U : ctx->gs; + ctx->V = DIVIDECEIL(ctx->U, ctx->gs); + if (ctx->V%ctx->B != 0 && ctx->V*2 >= ctx->B){ + ctx->V = DIVIDECEIL(ctx->V, ctx->B)*ctx->B; + } + ctx->gs = DIVIDECEIL(ctx->U, ctx->V); + /** * Allocate required workspace. 
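	 *
	 * A hypothetical worked example of the scheduling above (illustrative
	 * numbers, not from any device): suppose the parallelism estimate is
	 * 20480 and the maximum local size is 1024, so gs = 20; with U = 1000
	 * this stays at 20 and V = DIVIDECEIL(1000, 20) = 50.  If B = 8, then
	 * 50 % 8 != 0 but 2*50 >= 8, so V is rounded up to 56 and gs becomes
	 * DIVIDECEIL(1000, 56) = 18; since V % B == 0 afterwards,
	 * reduxInvoke() below launches phase 0 only.  Had B exceeded 2*V, V
	 * would have been left as-is and a phase-1 launch would later fold the
	 * partial results left in the workspace, which in either case is sized
	 * below using 2*ctx->gs*ctx->D.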
*/ - - ctx->wdOff = reduxGenGetWMEMDstOff (ctx->gr, 2*ctx->gs*ctx->D); - ctx->waOff = reduxGenGetWMEMDstArgOff(ctx->gr, 2*ctx->gs*ctx->D); - WSPACESIZE = reduxGenGetWMEMSize (ctx->gr, 2*ctx->gs*ctx->D); - ctx->w = gpudata_alloc(ctx->gr->gpuCtx, WSPACESIZE, 0, flags, 0); - if(!ctx->w){ + + ctx->W0Off = reduxGenGetWMEMK0Off(ctx->gr, 2*ctx->gs*ctx->D); + ctx->W1Off = reduxGenGetWMEMK1Off(ctx->gr, 2*ctx->gs*ctx->D); + WSPACESIZE = reduxGenGetWMEMSize (ctx->gr, 2*ctx->gs*ctx->D); + ctx->W = gpudata_alloc(ctx->gr->gpuCtx, WSPACESIZE, 0, flags, 0); + if (!ctx->W){ return reduxInvCleanupMsg(ctx, GA_MEMORY_ERROR, "Could not allocate %zu-byte workspace for reduction!\n", WSPACESIZE); } - + return reduxInvoke(ctx); } @@ -3650,37 +3956,36 @@ static int reduxInvSchedule (redux_ctx* ctx){ static int reduxInvoke (redux_ctx* ctx){ int ret, i=0; void* ptrs[2] = {ctx, &i}; - + /** * Argument Marshalling. */ - + reduxGenIterArgs(ctx->gr, reduxInvMarshalArg, ptrs); /** * The kernel is now invoked once or twice, for phase 0 or 1. - * - * Phase 1 is sometimes optional. + * + * Phase 1 is optional iff V%B == 0. */ - ctx->phase = 0; - ret = GpuKernel_call(&ctx->gr->k, 1, &ctx->gs, &ctx->bs, ctx->SHMEM, ctx->kArgs); + ret = GpuKernel_call((GpuKernel*)&ctx->gr->k, 1, &ctx->gs, &ctx->bs, ctx->shmemBytes, ctx->kArgs); if (ret != GA_NO_ERROR){ return reduxInvCleanupMsg(ctx, ret, "Failure in kernel call, Phase 0!\n"); } - - if(ctx->V%ctx->B != 0){ - ctx->phase = 1; - ret = GpuKernel_call(&ctx->gr->k, 1, &ctx->gs, &ctx->bs, ctx->SHMEM, ctx->kArgs); + + if (ctx->V % ctx->B != 0){ + ctx->selector |= SELECTOR_PHASE1; + ret = GpuKernel_call((GpuKernel*)&ctx->gr->k, 1, &ctx->gs, &ctx->bs, ctx->shmemBytes, ctx->kArgs); if (ret != GA_NO_ERROR){ return reduxInvCleanupMsg(ctx, ret, "Failure in kernel call, Phase 1!\n"); } } - + /* Success! 
*/ return reduxInvCleanup(ctx, GA_NO_ERROR); } @@ -3690,41 +3995,43 @@ static int reduxInvoke (redux_ctx* ctx){ */ static int reduxInvCleanup (redux_ctx* ctx, int ret){ - free(ctx->l); - free(ctx->lPDim); - free(ctx->sJ); - free(ctx->dJ); - free(ctx->aJ); - free(ctx->ibs); - free(ctx->ibp); - free(ctx->iblPDim); - free(ctx->ibsOff); - free(ctx->ibdOff); - free(ctx->ibaOff); - free(ctx->kArgs); + ctx->gr = NULL; + ctx->s0 = NULL; + ctx->d0 = NULL; + ctx->d1 = NULL; + ctx->reduxList = NULL; + free(ctx->xdSrc); free(ctx->xdSrcPtrs); - free(ctx->xdTmpPtrs); - - gpudata_release(ctx->w); - - ctx->l = NULL; - ctx->lPDim = NULL; - ctx->sJ = NULL; - ctx->dJ = NULL; - ctx->aJ = NULL; - ctx->ibs = NULL; - ctx->ibp = NULL; - ctx->iblPDim = NULL; - ctx->ibsOff = NULL; - ctx->ibdOff = NULL; - ctx->ibaOff = NULL; - ctx->kArgs = NULL; + free(ctx->L); + free(ctx->Li); + free(ctx->S0J); + free(ctx->S0Si); + free(ctx->D0J); + free(ctx->D0Si); + free(ctx->D1J); + free(ctx->D1Si); + free(ctx->I0J); + free(ctx->I0Si); + free(ctx->perm); + free(ctx->kArgs); + gpudata_release(ctx->W); + ctx->xdSrc = NULL; ctx->xdSrcPtrs = NULL; - ctx->xdTmpPtrs = NULL; - - ctx->w = NULL; + ctx->L = NULL; + ctx->Li = NULL; + ctx->S0J = NULL; + ctx->S0Si = NULL; + ctx->D0J = NULL; + ctx->D0Si = NULL; + ctx->D1J = NULL; + ctx->D1Si = NULL; + ctx->I0J = NULL; + ctx->I0Si = NULL; + ctx->perm = NULL; + ctx->kArgs = NULL; + ctx->W = NULL; return ret; } @@ -3732,7 +4039,7 @@ static int reduxInvCleanupMsg (redux_ctx* ctx, int ret, const char* fmt, ...){ #if DEBUG FILE* fp = stderr; - + va_list ap; va_start(ap, fmt); vfprintf(fp, fmt, ap); @@ -3741,6 +4048,7 @@ static int reduxInvCleanupMsg (redux_ctx* ctx, int ret, #else (void)fmt; #endif - + return reduxInvCleanup(ctx, ret); } + diff --git a/tests/check_reduction.c b/tests/check_reduction.c index 12a99ded30..7a2141cfae 100644 --- a/tests/check_reduction.c +++ b/tests/check_reduction.c @@ -142,7 +142,17 @@ START_TEST(test_maxandargmax_reduction){ } } } - + + if(gtMax != pMax[j]){ + fprintf(stderr, "Mismatch GT %f != %f UUT @ %zu!\n", + gtMax, pMax[j], j); + fflush(stderr); + } + if(gtArgmax != pArgmax[j]){ + fprintf(stderr, "Mismatch GT %zu != %zu UUT @ %zu!\n", + gtArgmax, pArgmax[j], j); + fflush(stderr); + } ck_assert_msg(gtMax == pMax[j], "Max value mismatch!"); ck_assert_msg(gtArgmax == pArgmax[j], "Argmax value mismatch!"); } @@ -256,6 +266,107 @@ START_TEST(test_maxandargmax_idxtranspose){ GpuArray_clear(&gaArgmax); }END_TEST +START_TEST(test_maxandargmax_bigdestination){ + pcgSeed(1); + + /** + * We test here a reduction of some random 3D tensor on the first and + * third dimensions. + */ + + size_t i,j; + size_t dims[2] = {2,131072}; + size_t prodDims = dims[0]*dims[1]; + const int reduxList[] = {0}; + + float* pSrc = calloc(1, sizeof(*pSrc) * dims[0]*dims[1]); + float* pMax = calloc(1, sizeof(*pMax) * dims[1]); + size_t* pArgmax = calloc(1, sizeof(*pArgmax) * dims[1]); + + ck_assert_ptr_ne(pSrc, NULL); + ck_assert_ptr_ne(pMax, NULL); + ck_assert_ptr_ne(pArgmax, NULL); + + + /** + * Initialize source data. + */ + + for(i=0;i gtMax){ + gtMax = v; + gtArgmax = i; + } + } + + if(gtMax != pMax[j]){ + fprintf(stderr, "Mismatch GT %f != %f UUT @ %zu!\n", + gtMax, pMax[j], j); + fflush(stderr); + } + if(gtArgmax != pArgmax[j]){ + fprintf(stderr, "Mismatch GT %zu != %zu UUT @ %zu!\n", + gtArgmax, pArgmax[j], j); + fflush(stderr); + } + ck_assert_msg(gtMax == pMax[j], "Max value mismatch!"); + ck_assert_msg(gtArgmax == pArgmax[j], "Argmax value mismatch!"); + } + + /** + * Deallocate. 
+ */ + + free(pSrc); + free(pMax); + free(pArgmax); + GpuArray_clear(&gaSrc); + GpuArray_clear(&gaMax); + GpuArray_clear(&gaArgmax); +}END_TEST + START_TEST(test_maxandargmax_veryhighrank){ pcgSeed(1); @@ -2138,7 +2249,7 @@ START_TEST(test_prod_reduction){ size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; const int reduxList[] = {0,2}; - const float TOL = 1e-5; + const float TOL = 1e-4; float* pS = calloc(1, sizeof(*pS) * dims[0]*dims[1]*dims[2]); float* pD = calloc(1, sizeof(*pD) * dims[1] ); @@ -2219,7 +2330,7 @@ START_TEST(test_prod_veryhighrank){ size_t rdxDims[4] = {1171,373,1,2}; size_t rdxProdDims = rdxDims[0]*rdxDims[1]*rdxDims[2]*rdxDims[3]; const int reduxList[] = {2,4,7,5}; - const float TOL = 1e-5; + const float TOL = 1e-4; float* pS = calloc(1, sizeof(*pS) * prodDims); float* pD = calloc(1, sizeof(*pD) * rdxProdDims); @@ -2310,7 +2421,7 @@ START_TEST(test_prod_alldimsreduced){ size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; const int reduxList[] = {0,1,2}; - const float TOL = 1e-5; + const float TOL = 1e-4; float* pS = calloc(1, sizeof(*pS) * dims[0]*dims[1]*dims[2]); float* pD = calloc(1, sizeof(*pD) ); @@ -2389,7 +2500,7 @@ START_TEST(test_prodnz_reduction){ size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; const int reduxList[] = {0,2}; - const float TOL = 1e-5; + const float TOL = 1e-4; float* pS = calloc(1, sizeof(*pS) * dims[0]*dims[1]*dims[2]); float* pD = calloc(1, sizeof(*pD) * dims[1] ); @@ -2473,7 +2584,7 @@ START_TEST(test_prodnz_veryhighrank){ size_t rdxDims[4] = {1171,373,1,2}; size_t rdxProdDims = rdxDims[0]*rdxDims[1]*rdxDims[2]*rdxDims[3]; const int reduxList[] = {2,4,7,5}; - const float TOL = 1e-5; + const float TOL = 1e-4; float* pS = calloc(1, sizeof(*pS) * prodDims); float* pD = calloc(1, sizeof(*pD) * rdxProdDims); @@ -2567,7 +2678,7 @@ START_TEST(test_prodnz_alldimsreduced){ size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; const int reduxList[] = {0,1,2}; - const float TOL = 1e-5; + const float TOL = 1e-4; float* pS = calloc(1, sizeof(*pS) * dims[0]*dims[1]*dims[2]); float* pD = calloc(1, sizeof(*pD) ); @@ -3982,9 +4093,10 @@ Suite *get_suite(void) { TCase *tc = tcase_create("basic"); tcase_add_checked_fixture(tc, setup, teardown); tcase_set_timeout(tc, 120.0); - + tcase_add_test(tc, test_maxandargmax_reduction); tcase_add_test(tc, test_maxandargmax_idxtranspose); + tcase_add_test(tc, test_maxandargmax_bigdestination); tcase_add_test(tc, test_maxandargmax_veryhighrank); tcase_add_test(tc, test_maxandargmax_alldimsreduced); From c9a0389495b3eecb638792bb560102413dea1569 Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Thu, 13 Jul 2017 23:18:21 -0400 Subject: [PATCH 22/34] More refactoring. Now, all the veryhighrank tests pass and the others fail for an unknown reason. 
--- src/cluda_cuda.h.c | 551 +++---- src/gpuarray/reduction.h | 112 +- src/gpuarray_reduction.c | 507 +++--- tests/check_reduction.c | 3369 +++++++++++++++++++++++--------------- 4 files changed, 2656 insertions(+), 1883 deletions(-) diff --git a/src/cluda_cuda.h.c b/src/cluda_cuda.h.c index ba3f88cadc..319074542e 100644 --- a/src/cluda_cuda.h.c +++ b/src/cluda_cuda.h.c @@ -174,77 +174,171 @@ static const char cluda_cuda_h[] = { 0x70, 0x65, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x5b, 0x5d, 0x3b, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x47, 0x41, 0x5f, 0x57, 0x41, 0x52, 0x50, 0x5f, 0x53, 0x49, 0x5a, 0x45, 0x20, 0x77, -0x61, 0x72, 0x70, 0x53, 0x69, 0x7a, 0x65, 0x0a, 0x0a, 0x73, 0x74, -0x72, 0x75, 0x63, 0x74, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, -0x66, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x73, -0x68, 0x6f, 0x72, 0x74, 0x20, 0x64, 0x61, 0x74, 0x61, 0x3b, 0x0a, -0x7d, 0x3b, 0x0a, 0x0a, 0x73, 0x74, 0x61, 0x74, 0x69, 0x63, 0x20, -0x5f, 0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, -0x69, 0x6e, 0x6c, 0x69, 0x6e, 0x65, 0x20, 0x66, 0x6c, 0x6f, 0x61, -0x74, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x32, 0x66, -0x6c, 0x6f, 0x61, 0x74, 0x28, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, -0x66, 0x20, 0x68, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x66, 0x6c, -0x6f, 0x61, 0x74, 0x20, 0x72, 0x3b, 0x0a, 0x20, 0x20, 0x61, 0x73, -0x6d, 0x28, 0x22, 0x7b, 0x20, 0x63, 0x76, 0x74, 0x2e, 0x66, 0x33, -0x32, 0x2e, 0x66, 0x31, 0x36, 0x20, 0x25, 0x30, 0x2c, 0x20, 0x25, -0x31, 0x3b, 0x20, 0x7d, 0x5c, 0x6e, 0x22, 0x20, 0x3a, 0x20, 0x22, -0x3d, 0x66, 0x22, 0x28, 0x72, 0x29, 0x20, 0x3a, 0x20, 0x22, 0x68, -0x22, 0x28, 0x68, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x29, 0x29, 0x3b, -0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x72, -0x3b, 0x0a, 0x7d, 0x0a, 0x73, 0x74, 0x61, 0x74, 0x69, 0x63, 0x20, -0x5f, 0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, -0x69, 0x6e, 0x6c, 0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x68, -0x61, 0x6c, 0x66, 0x20, 0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, -0x74, 0x32, 0x68, 0x61, 0x6c, 0x66, 0x28, 0x66, 0x6c, 0x6f, 0x61, -0x74, 0x20, 0x66, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x67, 0x61, -0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x72, 0x3b, 0x0a, 0x20, 0x20, -0x61, 0x73, 0x6d, 0x28, 0x22, 0x7b, 0x20, 0x63, 0x76, 0x74, 0x2e, -0x72, 0x6e, 0x2e, 0x66, 0x31, 0x36, 0x2e, 0x66, 0x33, 0x32, 0x20, -0x25, 0x30, 0x2c, 0x20, 0x25, 0x31, 0x3b, 0x20, 0x7d, 0x5c, 0x6e, -0x22, 0x20, 0x3a, 0x20, 0x22, 0x3d, 0x68, 0x22, 0x28, 0x72, 0x2e, -0x64, 0x61, 0x74, 0x61, 0x29, 0x20, 0x3a, 0x20, 0x22, 0x66, 0x22, -0x28, 0x66, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, -0x75, 0x72, 0x6e, 0x20, 0x72, 0x3b, 0x0a, 0x7d, 0x0a, 0x0a, 0x2f, -0x2a, 0x20, 0x67, 0x61, 0x5f, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x2f, +0x61, 0x72, 0x70, 0x53, 0x69, 0x7a, 0x65, 0x0a, 0x23, 0x64, 0x65, +0x66, 0x69, 0x6e, 0x65, 0x20, 0x72, 0x65, 0x73, 0x74, 0x72, 0x69, +0x63, 0x74, 0x20, 0x5f, 0x5f, 0x72, 0x65, 0x73, 0x74, 0x72, 0x69, +0x63, 0x74, 0x5f, 0x5f, 0x0a, 0x0a, 0x73, 0x74, 0x72, 0x75, 0x63, +0x74, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x7b, +0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x73, 0x68, 0x6f, 0x72, +0x74, 0x20, 0x64, 0x61, 0x74, 0x61, 0x3b, 0x0a, 0x7d, 0x3b, 0x0a, +0x0a, 0x73, 0x74, 0x61, 0x74, 0x69, 0x63, 0x20, 0x5f, 0x5f, 0x64, +0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, 0x69, 0x6e, 0x6c, +0x69, 0x6e, 0x65, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x67, +0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x32, 0x66, 0x6c, 0x6f, 0x61, +0x74, 0x28, 0x67, 0x61, 0x5f, 0x68, 0x61, 
0x6c, 0x66, 0x20, 0x68, +0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, +0x20, 0x72, 0x3b, 0x0a, 0x20, 0x20, 0x61, 0x73, 0x6d, 0x28, 0x22, +0x7b, 0x20, 0x63, 0x76, 0x74, 0x2e, 0x66, 0x33, 0x32, 0x2e, 0x66, +0x31, 0x36, 0x20, 0x25, 0x30, 0x2c, 0x20, 0x25, 0x31, 0x3b, 0x20, +0x7d, 0x5c, 0x6e, 0x22, 0x20, 0x3a, 0x20, 0x22, 0x3d, 0x66, 0x22, +0x28, 0x72, 0x29, 0x20, 0x3a, 0x20, 0x22, 0x68, 0x22, 0x28, 0x68, +0x2e, 0x64, 0x61, 0x74, 0x61, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, +0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x72, 0x3b, 0x0a, 0x7d, +0x0a, 0x73, 0x74, 0x61, 0x74, 0x69, 0x63, 0x20, 0x5f, 0x5f, 0x64, +0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, 0x69, 0x6e, 0x6c, +0x69, 0x6e, 0x65, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, +0x20, 0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x32, 0x68, +0x61, 0x6c, 0x66, 0x28, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x66, +0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, +0x6c, 0x66, 0x20, 0x72, 0x3b, 0x0a, 0x20, 0x20, 0x61, 0x73, 0x6d, +0x28, 0x22, 0x7b, 0x20, 0x63, 0x76, 0x74, 0x2e, 0x72, 0x6e, 0x2e, +0x66, 0x31, 0x36, 0x2e, 0x66, 0x33, 0x32, 0x20, 0x25, 0x30, 0x2c, +0x20, 0x25, 0x31, 0x3b, 0x20, 0x7d, 0x5c, 0x6e, 0x22, 0x20, 0x3a, +0x20, 0x22, 0x3d, 0x68, 0x22, 0x28, 0x72, 0x2e, 0x64, 0x61, 0x74, +0x61, 0x29, 0x20, 0x3a, 0x20, 0x22, 0x66, 0x22, 0x28, 0x66, 0x29, +0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, +0x20, 0x72, 0x3b, 0x0a, 0x7d, 0x0a, 0x0a, 0x2f, 0x2a, 0x20, 0x67, +0x61, 0x5f, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, +0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, +0x61, 0x64, 0x64, 0x5f, 0x69, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, +0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, +0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, +0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, +0x64, 0x5f, 0x69, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, +0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, +0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, +0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, +0x5f, 0x69, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, +0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x61, +0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, +0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, +0x5f, 0x69, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, +0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x61, +0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, +0x75, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, +0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, +0x64, 0x64, 0x5f, 0x49, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, +0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, +0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, +0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, +0x5f, 0x49, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, +0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, +0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, +0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, +0x49, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, +0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, +0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, +0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 
0x63, 0x68, 0x67, 0x5f, +0x49, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, +0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, +0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x6c, +0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x2f, 0x0a, 0x5f, 0x5f, 0x64, 0x65, +0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, 0x67, 0x61, 0x5f, 0x6c, +0x6f, 0x6e, 0x67, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, +0x64, 0x5f, 0x6c, 0x67, 0x28, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, +0x67, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, +0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, +0x7b, 0x0a, 0x20, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, +0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, +0x20, 0x2a, 0x77, 0x61, 0x64, 0x64, 0x72, 0x20, 0x3d, 0x20, 0x28, +0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, +0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x29, 0x61, +0x64, 0x64, 0x72, 0x3b, 0x0a, 0x20, 0x20, 0x75, 0x6e, 0x73, 0x69, +0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, +0x6f, 0x6e, 0x67, 0x20, 0x6f, 0x6c, 0x64, 0x20, 0x3d, 0x20, 0x2a, +0x77, 0x61, 0x64, 0x64, 0x72, 0x3b, 0x0a, 0x20, 0x20, 0x75, 0x6e, +0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, +0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, +0x65, 0x64, 0x3b, 0x0a, 0x20, 0x20, 0x64, 0x6f, 0x20, 0x7b, 0x0a, +0x20, 0x20, 0x20, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, +0x20, 0x3d, 0x20, 0x6f, 0x6c, 0x64, 0x3b, 0x0a, 0x20, 0x20, 0x20, +0x20, 0x6f, 0x6c, 0x64, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, +0x69, 0x63, 0x43, 0x41, 0x53, 0x28, 0x77, 0x61, 0x64, 0x64, 0x72, +0x2c, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x2c, 0x20, +0x28, 0x76, 0x61, 0x6c, 0x20, 0x2b, 0x20, 0x28, 0x67, 0x61, 0x5f, +0x6c, 0x6f, 0x6e, 0x67, 0x29, 0x28, 0x61, 0x73, 0x73, 0x75, 0x6d, +0x65, 0x64, 0x29, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x20, +0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, 0x61, 0x73, 0x73, 0x75, +0x6d, 0x65, 0x64, 0x20, 0x21, 0x3d, 0x20, 0x6f, 0x6c, 0x64, 0x29, +0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, +0x28, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x29, 0x6f, 0x6c, +0x64, 0x3b, 0x0a, 0x7d, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, +0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, +0x6c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, +0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x6c, 0x67, 0x28, 0x61, +0x2c, 0x20, 0x62, 0x29, 0x0a, 0x5f, 0x5f, 0x64, 0x65, 0x76, 0x69, +0x63, 0x65, 0x5f, 0x5f, 0x20, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, +0x67, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, +0x5f, 0x6c, 0x67, 0x28, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, +0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, +0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, +0x0a, 0x20, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, +0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, +0x72, 0x65, 0x73, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x73, 0x20, +0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, +0x68, 0x28, 0x28, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, +0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, +0x2a, 0x29, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x76, 0x61, 0x6c, +0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, +0x20, 0x28, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x29, 0x72, +0x65, 0x73, 0x3b, 0x0a, 0x7d, 0x0a, 0x23, 
0x64, 0x65, 0x66, 0x69, +0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, +0x67, 0x5f, 0x6c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, +0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x6c, +0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, +0x67, 0x61, 0x5f, 0x75, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, -0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x69, 0x67, 0x28, 0x61, +0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x4c, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, -0x5f, 0x61, 0x64, 0x64, 0x5f, 0x69, 0x6c, 0x28, 0x61, 0x2c, 0x20, +0x5f, 0x61, 0x64, 0x64, 0x5f, 0x4c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, -0x63, 0x68, 0x67, 0x5f, 0x69, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, +0x63, 0x68, 0x67, 0x5f, 0x4c, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, -0x63, 0x68, 0x67, 0x5f, 0x69, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, +0x63, 0x68, 0x67, 0x5f, 0x4c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, -0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x2f, 0x0a, -0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, -0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x49, 0x67, 0x28, 0x61, 0x2c, -0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, -0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, -0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, -0x61, 0x64, 0x64, 0x5f, 0x49, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, -0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, -0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, -0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, -0x68, 0x67, 0x5f, 0x49, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, -0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, -0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, -0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, -0x68, 0x67, 0x5f, 0x49, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, -0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, -0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, -0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x2f, 0x0a, 0x5f, -0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, 0x67, -0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x61, 0x74, 0x6f, 0x6d, -0x5f, 0x61, 0x64, 0x64, 0x5f, 0x6c, 0x67, 0x28, 0x67, 0x61, 0x5f, -0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, -0x20, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x76, 0x61, +0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x2a, 0x2f, +0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, +0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x66, 0x67, 0x28, 0x61, +0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, +0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 
0x0a, 0x23, +0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, +0x5f, 0x61, 0x64, 0x64, 0x5f, 0x66, 0x6c, 0x28, 0x61, 0x2c, 0x20, +0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, +0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, +0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, +0x63, 0x68, 0x67, 0x5f, 0x66, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, +0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, +0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, +0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, +0x63, 0x68, 0x67, 0x5f, 0x66, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, +0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, +0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, +0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x20, 0x2a, +0x2f, 0x0a, 0x23, 0x69, 0x66, 0x20, 0x5f, 0x5f, 0x43, 0x55, 0x44, +0x41, 0x5f, 0x41, 0x52, 0x43, 0x48, 0x5f, 0x5f, 0x20, 0x3c, 0x20, +0x36, 0x30, 0x30, 0x0a, 0x5f, 0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, +0x65, 0x5f, 0x5f, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, +0x6c, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, +0x5f, 0x64, 0x67, 0x28, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, +0x6c, 0x65, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, +0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x77, 0x61, 0x64, 0x64, 0x72, 0x20, @@ -262,225 +356,90 @@ static const char cluda_cuda_h[] = { 0x20, 0x20, 0x20, 0x20, 0x6f, 0x6c, 0x64, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x43, 0x41, 0x53, 0x28, 0x77, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, -0x64, 0x2c, 0x20, 0x28, 0x76, 0x61, 0x6c, 0x20, 0x2b, 0x20, 0x28, -0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x29, 0x28, 0x61, 0x73, -0x73, 0x75, 0x6d, 0x65, 0x64, 0x29, 0x29, 0x29, 0x3b, 0x0a, 0x20, -0x20, 0x7d, 0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, 0x61, -0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x20, 0x21, 0x3d, 0x20, 0x6f, -0x6c, 0x64, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, -0x72, 0x6e, 0x20, 0x28, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, -0x29, 0x6f, 0x6c, 0x64, 0x3b, 0x0a, 0x7d, 0x0a, 0x23, 0x64, 0x65, +0x64, 0x2c, 0x20, 0x5f, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, +0x5f, 0x61, 0x73, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x6c, 0x6f, 0x6e, +0x67, 0x28, 0x76, 0x61, 0x6c, 0x20, 0x2b, 0x20, 0x5f, 0x5f, 0x6c, +0x6f, 0x6e, 0x67, 0x6c, 0x6f, 0x6e, 0x67, 0x5f, 0x61, 0x73, 0x5f, +0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x28, 0x61, 0x73, 0x73, 0x75, +0x6d, 0x65, 0x64, 0x29, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, +0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, 0x61, 0x73, 0x73, +0x75, 0x6d, 0x65, 0x64, 0x20, 0x21, 0x3d, 0x20, 0x6f, 0x6c, 0x64, +0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, +0x20, 0x5f, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x6c, 0x6f, 0x6e, 0x67, +0x5f, 0x61, 0x73, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x28, +0x6f, 0x6c, 0x64, 0x29, 0x3b, 0x0a, 0x7d, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, -0x64, 0x64, 0x5f, 0x6c, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, -0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x6c, -0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x5f, 0x5f, 0x64, -0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, 0x67, 0x61, 0x5f, 
-0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, -0x63, 0x68, 0x67, 0x5f, 0x6c, 0x67, 0x28, 0x67, 0x61, 0x5f, 0x6c, -0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, -0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x76, 0x61, 0x6c, -0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, -0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, -0x6e, 0x67, 0x20, 0x72, 0x65, 0x73, 0x3b, 0x0a, 0x20, 0x20, 0x72, -0x65, 0x73, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, -0x45, 0x78, 0x63, 0x68, 0x28, 0x28, 0x75, 0x6e, 0x73, 0x69, 0x67, -0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, -0x6e, 0x67, 0x20, 0x2a, 0x29, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, -0x76, 0x61, 0x6c, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, -0x75, 0x72, 0x6e, 0x20, 0x28, 0x67, 0x61, 0x5f, 0x6c, 0x6f, 0x6e, -0x67, 0x29, 0x72, 0x65, 0x73, 0x3b, 0x0a, 0x7d, 0x0a, 0x23, 0x64, -0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, -0x78, 0x63, 0x68, 0x67, 0x5f, 0x6c, 0x6c, 0x28, 0x61, 0x2c, 0x20, -0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, -0x67, 0x5f, 0x6c, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, -0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x6c, 0x6f, 0x6e, 0x67, -0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, -0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x4c, -0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, -0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, -0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, -0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x4c, 0x6c, 0x28, -0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, -0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, -0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, -0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x4c, 0x67, 0x28, 0x61, -0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, -0x45, 0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, -0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, -0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x4c, 0x6c, 0x28, 0x61, -0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, -0x45, 0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, -0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, -0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, -0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x66, -0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, -0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, -0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, -0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x66, 0x6c, 0x28, -0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, -0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, -0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, -0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x66, 0x67, 0x28, 0x61, -0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, -0x45, 0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, -0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, -0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x66, 0x6c, 0x28, 0x61, +0x64, 0x64, 0x5f, 0x64, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, +0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x64, +0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x65, 0x6c, 
+0x73, 0x65, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, +0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x64, 0x67, +0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, +0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, +0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, +0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x64, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, -0x45, 0x78, 0x63, 0x68, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, -0x2f, 0x2a, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, -0x65, 0x20, 0x2a, 0x2f, 0x0a, 0x23, 0x69, 0x66, 0x20, 0x5f, 0x5f, -0x43, 0x55, 0x44, 0x41, 0x5f, 0x41, 0x52, 0x43, 0x48, 0x5f, 0x5f, -0x20, 0x3c, 0x20, 0x36, 0x30, 0x30, 0x0a, 0x5f, 0x5f, 0x64, 0x65, -0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, 0x67, 0x61, 0x5f, 0x64, -0x6f, 0x75, 0x62, 0x6c, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, -0x61, 0x64, 0x64, 0x5f, 0x64, 0x67, 0x28, 0x67, 0x61, 0x5f, 0x64, +0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, +0x65, 0x6e, 0x64, 0x69, 0x66, 0x0a, 0x5f, 0x5f, 0x64, 0x65, 0x76, +0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, +0x75, 0x62, 0x6c, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, +0x63, 0x68, 0x67, 0x5f, 0x64, 0x67, 0x28, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, 0x6e, -0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x77, 0x61, 0x64, -0x64, 0x72, 0x20, 0x3d, 0x20, 0x28, 0x75, 0x6e, 0x73, 0x69, 0x67, -0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, -0x6e, 0x67, 0x20, 0x2a, 0x29, 0x61, 0x64, 0x64, 0x72, 0x3b, 0x0a, -0x20, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, -0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6f, -0x6c, 0x64, 0x20, 0x3d, 0x20, 0x2a, 0x77, 0x61, 0x64, 0x64, 0x72, -0x3b, 0x0a, 0x20, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, -0x64, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, -0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x3b, 0x0a, 0x20, +0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x72, 0x65, 0x73, 0x3b, +0x0a, 0x20, 0x20, 0x72, 0x65, 0x73, 0x20, 0x3d, 0x20, 0x61, 0x74, +0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, 0x28, 0x28, 0x75, +0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, 0x6c, 0x6f, 0x6e, +0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, 0x29, 0x61, 0x64, +0x64, 0x72, 0x2c, 0x20, 0x5f, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, +0x65, 0x5f, 0x61, 0x73, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x6c, 0x6f, +0x6e, 0x67, 0x28, 0x76, 0x61, 0x6c, 0x29, 0x29, 0x3b, 0x0a, 0x20, +0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x5f, 0x5f, 0x6c, +0x6f, 0x6e, 0x67, 0x6c, 0x6f, 0x6e, 0x67, 0x5f, 0x61, 0x73, 0x5f, +0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x28, 0x72, 0x65, 0x73, 0x29, +0x3b, 0x0a, 0x7d, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, +0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, +0x64, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, +0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x64, 0x67, 0x28, +0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, 0x20, 0x67, 0x61, +0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x2f, 0x0a, 0x5f, 0x5f, +0x64, 0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, 0x67, 0x61, +0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, +0x61, 
0x64, 0x64, 0x5f, 0x65, 0x67, 0x28, 0x67, 0x61, 0x5f, 0x68, +0x61, 0x6c, 0x66, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, +0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x76, 0x61, 0x6c, +0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, +0x6e, 0x74, 0x20, 0x2a, 0x62, 0x61, 0x73, 0x65, 0x20, 0x3d, 0x20, +0x28, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x29, +0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, +0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x7e, 0x32, 0x29, 0x3b, 0x0a, +0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x6f, +0x6c, 0x64, 0x2c, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, +0x2c, 0x20, 0x73, 0x75, 0x6d, 0x2c, 0x20, 0x6e, 0x65, 0x77, 0x5f, +0x3b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, +0x20, 0x74, 0x6d, 0x70, 0x3b, 0x0a, 0x20, 0x20, 0x6f, 0x6c, 0x64, +0x20, 0x3d, 0x20, 0x2a, 0x62, 0x61, 0x73, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x64, 0x6f, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x20, 0x3d, 0x20, 0x6f, 0x6c, -0x64, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x6c, 0x64, 0x20, -0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x43, 0x41, 0x53, -0x28, 0x77, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x61, 0x73, 0x73, -0x75, 0x6d, 0x65, 0x64, 0x2c, 0x20, 0x5f, 0x5f, 0x64, 0x6f, 0x75, -0x62, 0x6c, 0x65, 0x5f, 0x61, 0x73, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, -0x6c, 0x6f, 0x6e, 0x67, 0x28, 0x76, 0x61, 0x6c, 0x20, 0x2b, 0x20, -0x5f, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x6c, 0x6f, 0x6e, 0x67, 0x5f, -0x61, 0x73, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x28, 0x61, -0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x29, 0x29, 0x29, 0x3b, 0x0a, -0x20, 0x20, 0x7d, 0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, -0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x20, 0x21, 0x3d, 0x20, -0x6f, 0x6c, 0x64, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, -0x75, 0x72, 0x6e, 0x20, 0x5f, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x6c, -0x6f, 0x6e, 0x67, 0x5f, 0x61, 0x73, 0x5f, 0x64, 0x6f, 0x75, 0x62, -0x6c, 0x65, 0x28, 0x6f, 0x6c, 0x64, 0x29, 0x3b, 0x0a, 0x7d, 0x0a, -0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, -0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x64, 0x6c, 0x28, 0x61, 0x2c, -0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, -0x64, 0x5f, 0x64, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, -0x23, 0x65, 0x6c, 0x73, 0x65, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, -0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, -0x5f, 0x64, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, -0x74, 0x6f, 0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, -0x20, 0x62, 0x29, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, -0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x64, -0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, -0x6d, 0x69, 0x63, 0x41, 0x64, 0x64, 0x28, 0x61, 0x2c, 0x20, 0x62, -0x29, 0x0a, 0x23, 0x65, 0x6e, 0x64, 0x69, 0x66, 0x0a, 0x5f, 0x5f, -0x64, 0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, 0x67, 0x61, -0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x20, 0x61, 0x74, 0x6f, -0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x64, 0x67, 0x28, 0x67, -0x61, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x20, 0x2a, 0x61, -0x64, 0x64, 0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x64, 0x6f, 0x75, -0x62, 0x6c, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x0a, -0x20, 0x20, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, -0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x72, -0x65, 0x73, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x73, 0x20, 0x3d, -0x20, 0x61, 
0x74, 0x6f, 0x6d, 0x69, 0x63, 0x45, 0x78, 0x63, 0x68, -0x28, 0x28, 0x75, 0x6e, 0x73, 0x69, 0x67, 0x6e, 0x65, 0x64, 0x20, -0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x2a, -0x29, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, 0x5f, 0x5f, 0x64, 0x6f, -0x75, 0x62, 0x6c, 0x65, 0x5f, 0x61, 0x73, 0x5f, 0x6c, 0x6f, 0x6e, -0x67, 0x6c, 0x6f, 0x6e, 0x67, 0x28, 0x76, 0x61, 0x6c, 0x29, 0x29, -0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, -0x5f, 0x5f, 0x6c, 0x6f, 0x6e, 0x67, 0x6c, 0x6f, 0x6e, 0x67, 0x5f, -0x61, 0x73, 0x5f, 0x64, 0x6f, 0x75, 0x62, 0x6c, 0x65, 0x28, 0x72, -0x65, 0x73, 0x29, 0x3b, 0x0a, 0x7d, 0x0a, 0x23, 0x64, 0x65, 0x66, -0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, -0x68, 0x67, 0x5f, 0x64, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, -0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, -0x64, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x2f, 0x2a, -0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x2f, -0x0a, 0x5f, 0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, -0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x61, 0x74, -0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x65, 0x67, 0x28, 0x67, -0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x61, 0x64, 0x64, -0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, -0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x67, 0x61, -0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x62, 0x61, 0x73, 0x65, -0x20, 0x3d, 0x20, 0x28, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, -0x20, 0x2a, 0x29, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, -0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x7e, 0x32, -0x29, 0x3b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, -0x74, 0x20, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x61, 0x73, 0x73, 0x75, -0x6d, 0x65, 0x64, 0x2c, 0x20, 0x73, 0x75, 0x6d, 0x2c, 0x20, 0x6e, -0x65, 0x77, 0x5f, 0x3b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, -0x61, 0x6c, 0x66, 0x20, 0x74, 0x6d, 0x70, 0x3b, 0x0a, 0x20, 0x20, -0x6f, 0x6c, 0x64, 0x20, 0x3d, 0x20, 0x2a, 0x62, 0x61, 0x73, 0x65, -0x3b, 0x0a, 0x20, 0x20, 0x64, 0x6f, 0x20, 0x7b, 0x0a, 0x20, 0x20, -0x20, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x20, 0x3d, -0x20, 0x6f, 0x6c, 0x64, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x74, -0x6d, 0x70, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x20, 0x3d, 0x20, 0x5f, -0x5f, 0x62, 0x79, 0x74, 0x65, 0x5f, 0x70, 0x65, 0x72, 0x6d, 0x28, -0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x30, 0x2c, 0x20, 0x28, 0x28, 0x67, -0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, -0x20, 0x26, 0x20, 0x32, 0x29, 0x20, 0x3f, 0x20, 0x30, 0x78, 0x34, -0x34, 0x33, 0x32, 0x20, 0x3a, 0x20, 0x30, 0x78, 0x34, 0x34, 0x31, -0x30, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x73, 0x75, 0x6d, -0x20, 0x3d, 0x20, 0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, -0x32, 0x68, 0x61, 0x6c, 0x66, 0x28, 0x67, 0x61, 0x5f, 0x68, 0x61, -0x6c, 0x66, 0x32, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x28, 0x76, 0x61, -0x6c, 0x29, 0x20, 0x2b, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, -0x66, 0x32, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x28, 0x74, 0x6d, 0x70, -0x29, 0x29, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x3b, 0x0a, 0x20, 0x20, -0x20, 0x20, 0x6e, 0x65, 0x77, 0x5f, 0x20, 0x3d, 0x20, 0x5f, 0x5f, -0x62, 0x79, 0x74, 0x65, 0x5f, 0x70, 0x65, 0x72, 0x6d, 0x28, 0x6f, -0x6c, 0x64, 0x2c, 0x20, 0x73, 0x75, 0x6d, 0x2c, 0x20, 0x28, 0x28, -0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, -0x72, 0x20, 0x26, 0x20, 0x32, 0x29, 0x20, 0x3f, 0x20, 0x30, 0x78, -0x35, 0x34, 0x31, 0x30, 0x20, 0x3a, 0x20, 0x30, 0x78, 0x33, 0x32, -0x35, 0x34, 
0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x6c, -0x64, 0x20, 0x3d, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x69, 0x63, 0x43, -0x41, 0x53, 0x28, 0x62, 0x61, 0x73, 0x65, 0x2c, 0x20, 0x61, 0x73, -0x73, 0x75, 0x6d, 0x65, 0x64, 0x2c, 0x20, 0x6e, 0x65, 0x77, 0x5f, -0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x20, 0x77, 0x68, 0x69, 0x6c, -0x65, 0x20, 0x28, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x20, -0x21, 0x3d, 0x20, 0x6f, 0x6c, 0x64, 0x29, 0x3b, 0x0a, 0x20, 0x20, -0x74, 0x6d, 0x70, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x20, 0x3d, 0x20, -0x5f, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x5f, 0x70, 0x65, 0x72, 0x6d, -0x28, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x30, 0x2c, 0x20, 0x28, 0x28, -0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, -0x72, 0x20, 0x26, 0x20, 0x32, 0x29, 0x20, 0x3f, 0x20, 0x30, 0x78, -0x34, 0x34, 0x33, 0x32, 0x20, 0x3a, 0x20, 0x30, 0x78, 0x34, 0x34, -0x31, 0x30, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, -0x72, 0x6e, 0x20, 0x74, 0x6d, 0x70, 0x3b, 0x0a, 0x7d, 0x0a, 0x23, -0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, -0x5f, 0x61, 0x64, 0x64, 0x5f, 0x65, 0x6c, 0x28, 0x61, 0x2c, 0x20, -0x62, 0x29, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, -0x5f, 0x65, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x0a, -0x5f, 0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, -0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x61, 0x74, 0x6f, -0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x65, 0x67, 0x28, 0x67, -0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x2a, 0x61, 0x64, 0x64, -0x72, 0x2c, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, -0x76, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x67, 0x61, -0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x62, 0x61, 0x73, 0x65, -0x20, 0x3d, 0x20, 0x28, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, -0x20, 0x2a, 0x29, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, -0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x7e, 0x32, -0x29, 0x3b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, -0x74, 0x20, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x61, 0x73, 0x73, 0x75, -0x6d, 0x65, 0x64, 0x2c, 0x20, 0x6e, 0x65, 0x77, 0x5f, 0x3b, 0x0a, -0x20, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x74, -0x6d, 0x70, 0x3b, 0x0a, 0x20, 0x20, 0x6f, 0x6c, 0x64, 0x20, 0x3d, -0x20, 0x2a, 0x62, 0x61, 0x73, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x64, -0x6f, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x61, 0x73, 0x73, -0x75, 0x6d, 0x65, 0x64, 0x20, 0x3d, 0x20, 0x6f, 0x6c, 0x64, 0x3b, -0x0a, 0x20, 0x20, 0x20, 0x20, 0x6e, 0x65, 0x77, 0x5f, 0x20, 0x3d, -0x20, 0x5f, 0x5f, 0x62, 0x79, 0x74, 0x65, 0x5f, 0x70, 0x65, 0x72, -0x6d, 0x28, 0x6f, 0x6c, 0x64, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x2e, -0x64, 0x61, 0x74, 0x61, 0x2c, 0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, +0x64, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x74, 0x6d, 0x70, 0x2e, +0x64, 0x61, 0x74, 0x61, 0x20, 0x3d, 0x20, 0x5f, 0x5f, 0x62, 0x79, +0x74, 0x65, 0x5f, 0x70, 0x65, 0x72, 0x6d, 0x28, 0x6f, 0x6c, 0x64, +0x2c, 0x20, 0x30, 0x2c, 0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, +0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, +0x32, 0x29, 0x20, 0x3f, 0x20, 0x30, 0x78, 0x34, 0x34, 0x33, 0x32, +0x20, 0x3a, 0x20, 0x30, 0x78, 0x34, 0x34, 0x31, 0x30, 0x29, 0x3b, +0x0a, 0x20, 0x20, 0x20, 0x20, 0x73, 0x75, 0x6d, 0x20, 0x3d, 0x20, +0x67, 0x61, 0x5f, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x32, 0x68, 0x61, +0x6c, 0x66, 0x28, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x32, +0x66, 0x6c, 0x6f, 0x61, 0x74, 0x28, 0x76, 0x61, 0x6c, 0x29, 0x20, +0x2b, 0x20, 0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x32, 0x66, +0x6c, 0x6f, 
0x61, 0x74, 0x28, 0x74, 0x6d, 0x70, 0x29, 0x29, 0x2e, +0x64, 0x61, 0x74, 0x61, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6e, +0x65, 0x77, 0x5f, 0x20, 0x3d, 0x20, 0x5f, 0x5f, 0x62, 0x79, 0x74, +0x65, 0x5f, 0x70, 0x65, 0x72, 0x6d, 0x28, 0x6f, 0x6c, 0x64, 0x2c, +0x20, 0x73, 0x75, 0x6d, 0x2c, 0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x32, 0x29, 0x20, 0x3f, 0x20, 0x30, 0x78, 0x35, 0x34, 0x31, 0x30, 0x20, 0x3a, 0x20, 0x30, 0x78, 0x33, 0x32, 0x35, 0x34, 0x29, @@ -499,8 +458,52 @@ static const char cluda_cuda_h[] = { 0x32, 0x20, 0x3a, 0x20, 0x30, 0x78, 0x34, 0x34, 0x31, 0x30, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x6d, 0x70, 0x3b, 0x0a, 0x7d, 0x0a, 0x23, 0x64, 0x65, 0x66, -0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, -0x68, 0x67, 0x5f, 0x65, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, +0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, +0x64, 0x5f, 0x65, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, +0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x61, 0x64, 0x64, 0x5f, 0x65, 0x67, +0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x0a, 0x5f, 0x5f, 0x64, +0x65, 0x76, 0x69, 0x63, 0x65, 0x5f, 0x5f, 0x20, 0x67, 0x61, 0x5f, +0x68, 0x61, 0x6c, 0x66, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, +0x63, 0x68, 0x67, 0x5f, 0x65, 0x67, 0x28, 0x67, 0x61, 0x5f, 0x68, +0x61, 0x6c, 0x66, 0x20, 0x2a, 0x61, 0x64, 0x64, 0x72, 0x2c, 0x20, +0x67, 0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x76, 0x61, 0x6c, +0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, +0x6e, 0x74, 0x20, 0x2a, 0x62, 0x61, 0x73, 0x65, 0x20, 0x3d, 0x20, +0x28, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x2a, 0x29, +0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x29, 0x61, +0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x7e, 0x32, 0x29, 0x3b, 0x0a, +0x20, 0x20, 0x67, 0x61, 0x5f, 0x75, 0x69, 0x6e, 0x74, 0x20, 0x6f, +0x6c, 0x64, 0x2c, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, +0x2c, 0x20, 0x6e, 0x65, 0x77, 0x5f, 0x3b, 0x0a, 0x20, 0x20, 0x67, +0x61, 0x5f, 0x68, 0x61, 0x6c, 0x66, 0x20, 0x74, 0x6d, 0x70, 0x3b, +0x0a, 0x20, 0x20, 0x6f, 0x6c, 0x64, 0x20, 0x3d, 0x20, 0x2a, 0x62, +0x61, 0x73, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x64, 0x6f, 0x20, 0x7b, +0x0a, 0x20, 0x20, 0x20, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, +0x64, 0x20, 0x3d, 0x20, 0x6f, 0x6c, 0x64, 0x3b, 0x0a, 0x20, 0x20, +0x20, 0x20, 0x6e, 0x65, 0x77, 0x5f, 0x20, 0x3d, 0x20, 0x5f, 0x5f, +0x62, 0x79, 0x74, 0x65, 0x5f, 0x70, 0x65, 0x72, 0x6d, 0x28, 0x6f, +0x6c, 0x64, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x2e, 0x64, 0x61, 0x74, +0x61, 0x2c, 0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, +0x65, 0x29, 0x61, 0x64, 0x64, 0x72, 0x20, 0x26, 0x20, 0x32, 0x29, +0x20, 0x3f, 0x20, 0x30, 0x78, 0x35, 0x34, 0x31, 0x30, 0x20, 0x3a, +0x20, 0x30, 0x78, 0x33, 0x32, 0x35, 0x34, 0x29, 0x3b, 0x0a, 0x20, +0x20, 0x20, 0x20, 0x6f, 0x6c, 0x64, 0x20, 0x3d, 0x20, 0x61, 0x74, +0x6f, 0x6d, 0x69, 0x63, 0x43, 0x41, 0x53, 0x28, 0x62, 0x61, 0x73, +0x65, 0x2c, 0x20, 0x61, 0x73, 0x73, 0x75, 0x6d, 0x65, 0x64, 0x2c, +0x20, 0x6e, 0x65, 0x77, 0x5f, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, +0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, 0x61, 0x73, 0x73, +0x75, 0x6d, 0x65, 0x64, 0x20, 0x21, 0x3d, 0x20, 0x6f, 0x6c, 0x64, +0x29, 0x3b, 0x0a, 0x20, 0x20, 0x74, 0x6d, 0x70, 0x2e, 0x64, 0x61, +0x74, 0x61, 0x20, 0x3d, 0x20, 0x5f, 0x5f, 0x62, 0x79, 0x74, 0x65, +0x5f, 0x70, 0x65, 0x72, 0x6d, 0x28, 0x6f, 0x6c, 0x64, 0x2c, 0x20, +0x30, 0x2c, 0x20, 0x28, 0x28, 0x67, 0x61, 0x5f, 0x73, 0x69, 0x7a, +0x65, 0x29, 0x61, 0x64, 0x64, 
0x72, 0x20, 0x26, 0x20, 0x32, 0x29, +0x20, 0x3f, 0x20, 0x30, 0x78, 0x34, 0x34, 0x33, 0x32, 0x20, 0x3a, +0x20, 0x30, 0x78, 0x34, 0x34, 0x31, 0x30, 0x29, 0x3b, 0x0a, 0x20, +0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x6d, 0x70, +0x3b, 0x0a, 0x7d, 0x0a, 0x23, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x20, 0x61, 0x74, 0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, -0x65, 0x67, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x65, -0x6e, 0x64, 0x69, 0x66, 0x0a, 0x00}; +0x65, 0x6c, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x61, 0x74, +0x6f, 0x6d, 0x5f, 0x78, 0x63, 0x68, 0x67, 0x5f, 0x65, 0x67, 0x28, +0x61, 0x2c, 0x20, 0x62, 0x29, 0x0a, 0x23, 0x65, 0x6e, 0x64, 0x69, +0x66, 0x0a, 0x00}; diff --git a/src/gpuarray/reduction.h b/src/gpuarray/reduction.h index 77043daa22..a536c26371 100644 --- a/src/gpuarray/reduction.h +++ b/src/gpuarray/reduction.h @@ -22,8 +22,10 @@ extern "C" { /* Data Structures */ +struct GpuReductionAttr; struct GpuReduction; -typedef struct GpuReduction GpuReduction; +typedef struct GpuReductionAttr GpuReductionAttr; +typedef struct GpuReduction GpuReduction; /** @@ -31,59 +33,87 @@ typedef struct GpuReduction GpuReduction; */ typedef enum _ga_reduce_op { - /* dst , dstArg */ - GA_REDUCE_SUM, /* + */ - GA_REDUCE_PROD, /* * */ - GA_REDUCE_PRODNZ, /* * (!=0) */ - GA_REDUCE_MIN, /* min() */ - GA_REDUCE_MAX, /* max() */ - GA_REDUCE_ARGMIN, /* argmin() */ - GA_REDUCE_ARGMAX, /* argmax() */ - GA_REDUCE_MINANDARGMIN, /* min() , argmin() */ - GA_REDUCE_MAXANDARGMAX, /* max() , argmax() */ - GA_REDUCE_AND, /* & */ - GA_REDUCE_OR, /* | */ - GA_REDUCE_XOR, /* ^ */ - GA_REDUCE_ALL, /* &&/all() */ - GA_REDUCE_ANY, /* ||/any() */ + /* d0 , d1 */ + GA_ELEMWISE, + GA_REDUCE_COPY=GA_ELEMWISE, /* (copy) */ + GA_REDUCE_SUM, /* + */ + GA_REDUCE_PROD, /* * */ + GA_REDUCE_PRODNZ, /* * (!=0) */ + GA_REDUCE_MIN, /* min() */ + GA_REDUCE_MAX, /* max() */ + GA_REDUCE_ARGMIN, /* argmin() */ + GA_REDUCE_ARGMAX, /* argmax() */ + GA_REDUCE_MINANDARGMIN, /* min() , argmin() */ + GA_REDUCE_MAXANDARGMAX, /* max() , argmax() */ + GA_REDUCE_AND, /* & */ + GA_REDUCE_OR, /* | */ + GA_REDUCE_XOR, /* ^ */ + GA_REDUCE_ALL, /* &&/all() */ + GA_REDUCE_ANY, /* ||/any() */ - GA_REDUCE_ENDSUPPORTED /* Must be last element in enum */ + GA_REDUCE_ENDSUPPORTED /* Must be last element in enum */ } ga_reduce_op; /* External Functions */ /** - * @brief Create a new GPU reduction operator over a list of axes to reduce. + * @brief Create, modify and free the attributes of a reduction operator. + * + * @param [out] grAttr The reduction operator attributes object. + * @param [in] op The reduction operation. + * @param [in] maxSrcDims The maximum number of supported source dimensions. + * @param [in] maxDstDims The maximum number of supported destination dimensions. + * @param [in] s0Typecode The typecode of the source tensor. + * @param [in] d0Typecode The typecode of the first destination tensor. + * @param [in] d1Typecode The typecode of the second destination tensor. + * @param [in] i0Typecode The typecode of the indices. 
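The attribute object is configured through the setters declared below, then handed to GpuReduction_new() and finally GpuReduction_call(). A minimal usage sketch for a max-and-argmax over axes 0 and 2 of a 3D float tensor; the names ctx, gaS0, gaD0 and gaD1 are assumed to be a valid gpucontext* and suitably-shaped GpuArrays (source, reduced values, reduced arguments):

    GpuReductionAttr* grAttr = NULL;
    GpuReduction*     gr     = NULL;
    const int         reduxList[] = {0, 2};      /* reduce axes 0 and 2           */

    GpuReductionAttr_new      (&grAttr, ctx);
    GpuReductionAttr_setop    (grAttr, GA_REDUCE_MAXANDARGMAX);
    GpuReductionAttr_setdims  (grAttr, 3, 1);    /* up to 3 source dims, 1 destination dim */
    GpuReductionAttr_sets0type(grAttr, GA_FLOAT);/* d0/d1/i0 keep their defaults  */
    GpuReduction_new          (&gr, grAttr);
    GpuReduction_call         (gr, &gaD0, &gaD1, &gaS0, 2, reduxList, 0);
    GpuReduction_free         (gr);
    GpuReductionAttr_free     (grAttr);

Error checking is omitted for brevity; each call returns GA_NO_ERROR on success. For this configuration GpuReductionAttr_appendopname() would name the generated kernel "MaxAndArgmax_3_1".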
+ */ + +GPUARRAY_PUBLIC int GpuReductionAttr_new (GpuReductionAttr** grAttr, + gpucontext* gpuCtx); +GPUARRAY_PUBLIC int GpuReductionAttr_setop (GpuReductionAttr* grAttr, + ga_reduce_op op); +GPUARRAY_PUBLIC int GpuReductionAttr_setdims (GpuReductionAttr* grAttr, + unsigned maxSrcDims, + unsigned maxDstDims); +GPUARRAY_PUBLIC int GpuReductionAttr_sets0type (GpuReductionAttr* grAttr, + int s0Typecode); +GPUARRAY_PUBLIC int GpuReductionAttr_setd0type (GpuReductionAttr* grAttr, + int d0Typecode); +GPUARRAY_PUBLIC int GpuReductionAttr_setd1type (GpuReductionAttr* grAttr, + int d1Typecode); +GPUARRAY_PUBLIC int GpuReductionAttr_seti0type (GpuReductionAttr* grAttr, + int i0Typecode); +GPUARRAY_PUBLIC int GpuReductionAttr_appendopname (GpuReductionAttr* grAttr, + size_t n, + char* name); +GPUARRAY_PUBLIC int GpuReductionAttr_issensitive (const GpuReductionAttr* grAttr); +GPUARRAY_PUBLIC int GpuReductionAttr_requiresS0 (const GpuReductionAttr* grAttr); +GPUARRAY_PUBLIC int GpuReductionAttr_requiresD0 (const GpuReductionAttr* grAttr); +GPUARRAY_PUBLIC int GpuReductionAttr_requiresD1 (const GpuReductionAttr* grAttr); +GPUARRAY_PUBLIC void GpuReductionAttr_free (GpuReductionAttr* grAttr); + +/** + * @brief Create a new GPU reduction operator with the given attributes. * * @param [out] gr The reduction operator. - * @param [in] gpuCtx The GPU context. - * @param [in] op The reduction operation to perform. - * @param [in] ndf The minimum number of free (destination) dimensions to support. - * @param [in] ndr The minimum number of reduction (source) dimensions to support. - * @param [in] s0TypeCode The data type of the source operand. - * @param [in] flags Reduction operator creation flags. Currently must be - * set to 0. + * @param [in] grAttr The GPU context. * * @return GA_NO_ERROR if the operator was created successfully - * GA_INVALID_ERROR if grOut is NULL, or some other argument was invalid + * GA_INVALID_ERROR if some argument was invalid * GA_NO_MEMORY if memory allocation failed anytime during creation * or other non-zero error codes otherwise. */ -GPUARRAY_PUBLIC int GpuReduction_new (GpuReduction** grOut, - gpucontext* gpuCtx, - ga_reduce_op op, - unsigned ndf, - unsigned ndr, - int s0TypeCode, - int flags); +GPUARRAY_PUBLIC int GpuReduction_new (GpuReduction** gr, + const GpuReductionAttr* grAttr); /** * @brief Deallocate an operator allocated by GpuReduction_new(). */ -GPUARRAY_PUBLIC void GpuReduction_free (GpuReduction* gr); +GPUARRAY_PUBLIC void GpuReduction_free (GpuReduction* gr); /** * @brief Invoke an operator allocated by GpuReduction_new() on a source tensor. @@ -123,13 +153,13 @@ GPUARRAY_PUBLIC void GpuReduction_free (GpuReduction* gr); * error code otherwise. */ -GPUARRAY_PUBLIC int GpuReduction_call (const GpuReduction* gr, - GpuArray* d0, - GpuArray* d1, - const GpuArray* s0, - unsigned reduxLen, - const int* reduxList, - int flags); +GPUARRAY_PUBLIC int GpuReduction_call (const GpuReduction* gr, + GpuArray* d0, + GpuArray* d1, + const GpuArray* s0, + unsigned reduxLen, + const int* reduxList, + int flags); #ifdef __cplusplus diff --git a/src/gpuarray_reduction.c b/src/gpuarray_reduction.c index 35518c0fbc..baead32518 100644 --- a/src/gpuarray_reduction.c +++ b/src/gpuarray_reduction.c @@ -77,7 +77,6 @@ typedef struct axis_desc axis_desc; struct redux_ctx{ /* Function Arguments. 
*/ const GpuReduction* gr; - ga_reduce_op op; GpuArray* d0; GpuArray* d1; const GpuArray* s0; @@ -90,8 +89,6 @@ struct redux_ctx{ int nds0r; /* # Reduced axes */ int ndd0; /* # Destination axes */ int ndfs0; /* # Flattened source axes */ - int ndfs0r; /* # Flattened source axes */ - int ndfd0; /* # Flattened source axes */ int ndib; /* # Intra-block axes */ int zeroAllAxes; /* # of zero-length axes in source tensor */ int zeroRdxAxes; /* # of zero-length reduction axes in source tensor */ @@ -145,6 +142,23 @@ struct redux_ctx{ typedef struct redux_ctx redux_ctx; + +/** + * Reduction Operator Attributes. + */ + +struct GpuReductionAttr{ + gpucontext* gpuCtx; + unsigned numProcs; + size_t maxLg, maxL0, maxGg, maxG0, maxLM; + + ga_reduce_op op; + int maxSrcDims; + int maxDstDims; + int s0Typecode, d0Typecode, d1Typecode, i0Typecode; +}; + + /** * Reduction Operator. * @@ -232,15 +246,12 @@ typedef struct redux_ctx redux_ctx; struct GpuReduction{ /* Function Arguments. */ + GpuReductionAttr grAttr; gpucontext* gpuCtx; ga_reduce_op op; + int nds; int ndd; int ndr; - int TS0tc; - int flags; - - /* Misc */ - int nds; /* Source code Generator. */ strb s; @@ -248,9 +259,11 @@ struct GpuReduction{ char kName[256]; char* kSourceCode; size_t kSourceCodeLen; + int TS0tc; int TPS0tc; int TD0tc; int TD1tc; + int TI0tc; int TS32tc; int TU32tc; int TS64tc; @@ -275,12 +288,6 @@ struct GpuReduction{ GpuKernel k; /* Scheduling */ - unsigned numProcs; - size_t maxLg; - size_t maxL0; - size_t maxGg; - size_t maxG0; - size_t maxLM; size_t maxLK; size_t maxBS; int log2MaxBS; @@ -304,8 +311,6 @@ static int reduxGetMinInit (int typecode, const char** pr static int reduxGetMaxInit (int typecode, const char** property); static int reduxGetAndInit (int typecode, const char** property); static int reduxGetOrInit (int typecode, const char** property); -static int reduxIsSensitive (int op); -static const char* reduxGetOpName (int op); static int reduxIsFloatingPoint (int typecode); static unsigned reduxCeilLog2 (uint64_t x); static uint64_t reduxNextPow2 (uint64_t x); @@ -471,41 +476,213 @@ static void reduxSortAxisPtrsBy (axis_desc** ptrs, /* Function Implementations */ /* Extern Functions */ -GPUARRAY_PUBLIC int GpuReduction_new (GpuReduction** grOut, - gpucontext* gpuCtx, - ga_reduce_op op, - unsigned ndf, - unsigned ndr, - int s0TypeCode, - int flags){ - if (!grOut){ +GPUARRAY_PUBLIC int GpuReductionAttr_new (GpuReductionAttr** grAttr, + gpucontext* gpuCtx){ + if(!grAttr){ + return GA_INVALID_ERROR; + } + if(!gpuCtx){ + *grAttr = NULL; + return GA_INVALID_ERROR; + } + *grAttr = calloc(1, sizeof(**grAttr)); + if(!*grAttr){ + return GA_MEMORY_ERROR; + } + + (*grAttr)->gpuCtx = gpuCtx; + if (gpucontext_property(gpuCtx, GA_CTX_PROP_NUMPROCS, &(*grAttr)->numProcs) != GA_NO_ERROR || + gpucontext_property(gpuCtx, GA_CTX_PROP_MAXLSIZE, &(*grAttr)->maxLg) != GA_NO_ERROR || + gpucontext_property(gpuCtx, GA_CTX_PROP_MAXLSIZE0, &(*grAttr)->maxL0) != GA_NO_ERROR || + gpucontext_property(gpuCtx, GA_CTX_PROP_MAXGSIZE, &(*grAttr)->maxGg) != GA_NO_ERROR || + gpucontext_property(gpuCtx, GA_CTX_PROP_MAXGSIZE0, &(*grAttr)->maxG0) != GA_NO_ERROR || + gpucontext_property(gpuCtx, GA_CTX_PROP_LMEMSIZE, &(*grAttr)->maxLM) != GA_NO_ERROR ){ + free(*grAttr); + return GA_INVALID_ERROR; + } + (*grAttr)->op = GA_REDUCE_SUM; + (*grAttr)->maxSrcDims = 1; + (*grAttr)->maxDstDims = 1; + (*grAttr)->s0Typecode = GA_FLOAT; + (*grAttr)->d0Typecode = GA_FLOAT; + (*grAttr)->d1Typecode = GA_ULONG; + (*grAttr)->i0Typecode = GA_ULONG; + + return GA_NO_ERROR; 
+} +GPUARRAY_PUBLIC int GpuReductionAttr_setop (GpuReductionAttr* grAttr, + ga_reduce_op op){ + grAttr->op = op; + + return GA_NO_ERROR; +} +GPUARRAY_PUBLIC int GpuReductionAttr_setdims (GpuReductionAttr* grAttr, + unsigned maxSrcDims, + unsigned maxDstDims){ + grAttr->maxSrcDims = maxSrcDims; + grAttr->maxDstDims = maxDstDims; + + return GA_NO_ERROR; +} +GPUARRAY_PUBLIC int GpuReductionAttr_sets0type (GpuReductionAttr* grAttr, + int s0Typecode){ + switch(grAttr->op){ + case GA_REDUCE_AND: + case GA_REDUCE_OR: + case GA_REDUCE_XOR: + if (reduxIsFloatingPoint(s0Typecode)){ + /* Bitwise operations not applicable to floating-point datatypes! */ + return GA_INVALID_ERROR; + } + break; + default: + break; + } + + grAttr->s0Typecode = s0Typecode; + + return GA_NO_ERROR; +} +GPUARRAY_PUBLIC int GpuReductionAttr_setd0type (GpuReductionAttr* grAttr, + int d0Typecode){ + grAttr->d0Typecode = d0Typecode; + + return GA_NO_ERROR; +} +GPUARRAY_PUBLIC int GpuReductionAttr_setd1type (GpuReductionAttr* grAttr, + int d1Typecode){ + grAttr->d1Typecode = d1Typecode; + + return GA_NO_ERROR; +} +GPUARRAY_PUBLIC int GpuReductionAttr_seti0type (GpuReductionAttr* grAttr, + int i0Typecode){ + grAttr->i0Typecode = i0Typecode; + + return GA_NO_ERROR; +} +GPUARRAY_PUBLIC int GpuReductionAttr_appendopname (GpuReductionAttr* grAttr, + size_t n, + char* name){ + switch(grAttr->op){ + case GA_REDUCE_COPY: return snprintf(name, n, "Copy_%d", grAttr->maxSrcDims); + case GA_REDUCE_SUM: return snprintf(name, n, "Sum_%d_%d", grAttr->maxSrcDims, grAttr->maxDstDims); + case GA_REDUCE_PROD: return snprintf(name, n, "Prod_%d_%d", grAttr->maxSrcDims, grAttr->maxDstDims); + case GA_REDUCE_PRODNZ: return snprintf(name, n, "ProdNonZero_%d_%d", grAttr->maxSrcDims, grAttr->maxDstDims); + case GA_REDUCE_MIN: return snprintf(name, n, "Min_%d_%d", grAttr->maxSrcDims, grAttr->maxDstDims); + case GA_REDUCE_MAX: return snprintf(name, n, "Max_%d_%d", grAttr->maxSrcDims, grAttr->maxDstDims); + case GA_REDUCE_ARGMIN: return snprintf(name, n, "Argmin_%d_%d", grAttr->maxSrcDims, grAttr->maxDstDims); + case GA_REDUCE_ARGMAX: return snprintf(name, n, "Argmax_%d_%d", grAttr->maxSrcDims, grAttr->maxDstDims); + case GA_REDUCE_MINANDARGMIN: return snprintf(name, n, "MinAndArgmin_%d_%d", grAttr->maxSrcDims, grAttr->maxDstDims); + case GA_REDUCE_MAXANDARGMAX: return snprintf(name, n, "MaxAndArgmax_%d_%d", grAttr->maxSrcDims, grAttr->maxDstDims); + case GA_REDUCE_AND: return snprintf(name, n, "And_%d_%d", grAttr->maxSrcDims, grAttr->maxDstDims); + case GA_REDUCE_OR: return snprintf(name, n, "Or_%d_%d", grAttr->maxSrcDims, grAttr->maxDstDims); + case GA_REDUCE_XOR: return snprintf(name, n, "Xor_%d_%d", grAttr->maxSrcDims, grAttr->maxDstDims); + case GA_REDUCE_ALL: return snprintf(name, n, "All_%d_%d", grAttr->maxSrcDims, grAttr->maxDstDims); + case GA_REDUCE_ANY: return snprintf(name, n, "Any_%d_%d", grAttr->maxSrcDims, grAttr->maxDstDims); + default: if(name && n>0){*name = '\0';} return GA_INVALID_ERROR; + } +} +GPUARRAY_PUBLIC int GpuReductionAttr_issensitive (const GpuReductionAttr* grAttr){ + /** + * @brief Returns whether the reduction is "sensitive". + * + * A reduction is sensitive when its output satisfies at least one of the + * following conditions: + * + * - It depends on the exact order of axes in the reduxList + * - It depends on exact signs of the strides of axes in the reduxList + * + * Such sensitivity may prevent a flattening of contiguous axes even when it + * would have been otherwise permitted. 
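A tiny worked case of such sensitivity, assuming an argmax over both axes of a 2x2 tensor (numbers purely illustrative):

    x = [[1, 9],
         [4, 2]]
    reduxList = [0, 1]:  flattened argument = 0*dims[1] + 1 = 1
    reduxList = [1, 0]:  flattened argument = 1*dims[0] + 0 = 2

The reduced value (9) is identical either way, but the reported argument is not, so the axis list cannot be reordered or reversed without changing the result.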
+ * + * For instance, ARGMIN/ARGMAX have this sensitivity, because the dstArg + * tensor's contents are flattened coordinates into the source tensor, and + * the flattening order is precisely reduxList. Permuting it would thus produce + * incorrect output. Moreover, if the strides of a reduction axis were to be + * reversed for the purpose of flattening the axis into another, the computed + * coordinate would again be incorrect. + * + * + * TL;DR: Reduction is sensitive if + * reduce(x, axis=axisList) != reduce(x, axis=axisList[::-1]) + * or + * reduce(x) != reduce(x[::-1]) + * . + */ + + switch (grAttr->op){ + case GA_REDUCE_MINANDARGMIN: + case GA_REDUCE_MAXANDARGMAX: + case GA_REDUCE_ARGMIN: + case GA_REDUCE_ARGMAX: + return 1; + default: + return 0; + } +} +GPUARRAY_PUBLIC int GpuReductionAttr_requiresS0 (const GpuReductionAttr* grAttr){ + switch(grAttr->op){ + default: return 1; + } +} +GPUARRAY_PUBLIC int GpuReductionAttr_requiresD0 (const GpuReductionAttr* grAttr){ + switch (grAttr->op){ + case GA_REDUCE_ARGMIN: + case GA_REDUCE_ARGMAX: + return 0; + default: + return 1; + } +} +GPUARRAY_PUBLIC int GpuReductionAttr_requiresD1 (const GpuReductionAttr* grAttr){ + switch (grAttr->op){ + case GA_REDUCE_MINANDARGMIN: + case GA_REDUCE_MAXANDARGMAX: + case GA_REDUCE_ARGMIN: + case GA_REDUCE_ARGMAX: + return 1; + default: + return 0; + } +} +GPUARRAY_PUBLIC void GpuReductionAttr_free (GpuReductionAttr* grAttr){ + free(grAttr); +} +GPUARRAY_PUBLIC int GpuReduction_new (GpuReduction** gr, + const GpuReductionAttr* grAttr){ + if (!gr){ + return GA_INVALID_ERROR; + } + if (!grAttr){ + *gr = NULL; return GA_INVALID_ERROR; } - *grOut = calloc(1, sizeof(**grOut)); - if (*grOut){ - (*grOut)->gpuCtx = gpuCtx; - (*grOut)->op = op; - (*grOut)->ndd = (int)ndf; - (*grOut)->ndr = (int)ndr; - (*grOut)->TS0tc = s0TypeCode; - (*grOut)->flags = flags; + *gr = calloc(1, sizeof(**gr)); + if (*gr){ + (*gr)->grAttr = *grAttr; + (*gr)->gpuCtx = grAttr->gpuCtx; + (*gr)->op = grAttr->op; + (*gr)->nds = (int)grAttr->maxSrcDims; + (*gr)->ndd = (int)grAttr->maxDstDims; + (*gr)->ndr = (int)(grAttr->maxSrcDims-grAttr->maxDstDims); - return reduxGenInit(*grOut); + return reduxGenInit(*gr); }else{ return GA_MEMORY_ERROR; } } -GPUARRAY_PUBLIC void GpuReduction_free (GpuReduction* gr){ +GPUARRAY_PUBLIC void GpuReduction_free (GpuReduction* gr){ reduxGenCleanup(gr, !GA_NO_ERROR); } -GPUARRAY_PUBLIC int GpuReduction_call (const GpuReduction* gr, - GpuArray* d0, - GpuArray* d1, - const GpuArray* s0, - unsigned reduxLen, - const int* reduxList, - int flags){ +GPUARRAY_PUBLIC int GpuReduction_call (const GpuReduction* gr, + GpuArray* d0, + GpuArray* d1, + const GpuArray* s0, + unsigned reduxLen, + const int* reduxList, + int flags){ redux_ctx ctxSTACK, *ctx = &ctxSTACK; memset(ctx, 0, sizeof(*ctx)); @@ -792,69 +969,6 @@ static int reduxGetOrInit (int typecode, const char** pro return GA_NO_ERROR; } -/** - * @brief Returns whether the reduction is "sensitive". - * - * A reduction is sensitive when its output satisfies at least one of the - * following conditions: - * - * - It depends on the exact order of axes in the reduxList - * - It depends on exact signs of the strides of axes in the reduxList - * - * Such sensitivity may prevent a flattening of contiguous axes even when it - * would have been otherwise permitted. - * - * For instance, ARGMIN/ARGMAX have this sensitivity, because the dstArg - * tensor's contents are flattened coordinates into the source tensor, and - * the flattening order is precisely reduxList. 
Permuting it would thus produce - * incorrect output. Moreover, if the strides of a reduction axis were to be - * reversed for the purpose of flattening the axis into another, the computed - * coordinate would again be incorrect. - * - * - * TL;DR: Reduction is sensitive if - * reduce(x, axis=axisList) != reduce(x, axis=axisList[::-1]) - * or - * reduce(x) != reduce(x[::-1]) - * . - */ - -static int reduxIsSensitive (int op){ - switch (op){ - case GA_REDUCE_MINANDARGMIN: - case GA_REDUCE_MAXANDARGMAX: - case GA_REDUCE_ARGMIN: - case GA_REDUCE_ARGMAX: - return 1; - default: - return 0; - } -} - -/** - * Get a name for the op, usable within a C identifier. - */ - -static const char* reduxGetOpName (int op){ - switch (op){ - case GA_REDUCE_SUM: return "Sum"; - case GA_REDUCE_PROD: return "Prod"; - case GA_REDUCE_PRODNZ: return "ProdNonZero"; - case GA_REDUCE_MIN: return "Min"; - case GA_REDUCE_MAX: return "Max"; - case GA_REDUCE_ARGMIN: return "Argmin"; - case GA_REDUCE_ARGMAX: return "Argmax"; - case GA_REDUCE_MINANDARGMIN: return "MinAndArgmin"; - case GA_REDUCE_MAXANDARGMAX: return "MaxAndArgmax"; - case GA_REDUCE_AND: return "And"; - case GA_REDUCE_OR: return "Or"; - case GA_REDUCE_XOR: return "Xor"; - case GA_REDUCE_ALL: return "All"; - case GA_REDUCE_ANY: return "Any"; - default: return NULL; - } -} - /** * Whether or not the typecode is a floating-point type. */ @@ -1361,7 +1475,7 @@ static int reduxTryFlattenInto (redux_ctx* ctx, reverseD0 = signD0 && reduxInvRequiresD0(ctx); reverseD1 = signD1 && reduxInvRequiresD1(ctx); - if (reduxIsSensitive(ctx->op)){ + if (GpuReductionAttr_issensitive(&ctx->gr->grAttr)){ if (reverseS0 || reverseD0 || reverseD1){ return 0; } @@ -1441,30 +1555,6 @@ static int reduxGenInit (GpuReduction* gr){ static int reduxGenInferProperties (GpuReduction* gr){ int i; - - /** - * Insane arguments? - */ - - if (gr->op < 0 || gr->op >= GA_REDUCE_ENDSUPPORTED){ - return reduxGenCleanupMsg(gr, GA_INVALID_ERROR, - "Unknown reduction operation!\n"); - } - if (gr->ndr <= 0){ - return reduxGenCleanupMsg(gr, GA_INVALID_ERROR, - "No reduction axes!\n"); - } - if (gr->ndd < 0){ - return reduxGenCleanupMsg(gr, GA_INVALID_ERROR, - "Destination tensor has less than 0 rank!\n"); - } - if (gr->flags != 0){ - return reduxGenCleanupMsg(gr, GA_INVALID_ERROR, - "\"flags\" must be set to 0!\n"); - } - gr->nds = gr->ndr+gr->ndd; - - /** * Source code buffer preallocation failed? */ @@ -1476,46 +1566,20 @@ static int reduxGenInferProperties (GpuReduction* gr){ srcbInit(&gr->srcGen, &gr->s); - /** - * GPU context non-existent, or cannot read its properties? - */ - - if (!gr->gpuCtx || - gpucontext_property(gr->gpuCtx, GA_CTX_PROP_NUMPROCS, &gr->numProcs) != GA_NO_ERROR || - gpucontext_property(gr->gpuCtx, GA_CTX_PROP_MAXLSIZE, &gr->maxLg) != GA_NO_ERROR || - gpucontext_property(gr->gpuCtx, GA_CTX_PROP_MAXLSIZE0, &gr->maxL0) != GA_NO_ERROR || - gpucontext_property(gr->gpuCtx, GA_CTX_PROP_MAXGSIZE, &gr->maxGg) != GA_NO_ERROR || - gpucontext_property(gr->gpuCtx, GA_CTX_PROP_MAXGSIZE0, &gr->maxG0) != GA_NO_ERROR || - gpucontext_property(gr->gpuCtx, GA_CTX_PROP_LMEMSIZE, &gr->maxLM) != GA_NO_ERROR ){ - return reduxGenCleanupMsg(gr, GA_INVALID_ERROR, - "Error obtaining one or more properties from GPU context!\n"); - } - - /** * Type management. * - * - Deal with the various typecodes. + * Read out the various typecodes from the attributes. 
*/ - - gr->TD0tc = gr->TS0tc; - gr->TD1tc = GA_SSIZE; + + gr->TS0tc = gr->grAttr.s0Typecode; + gr->TD0tc = gr->grAttr.d0Typecode; + gr->TD1tc = gr->grAttr.d1Typecode; + gr->TI0tc = gr->grAttr.i0Typecode; gr->TS32tc = GA_INT; gr->TU32tc = GA_UINT; gr->TS64tc = GA_LONG; gr->TU64tc = GA_ULONG; - switch(gr->op){ - case GA_REDUCE_AND: - case GA_REDUCE_OR: - case GA_REDUCE_XOR: - if (reduxIsFloatingPoint(gr->TS0tc)){ - return reduxGenCleanupMsg(gr, GA_INVALID_ERROR, - "Bitwise operations not applicable to floating-point datatypes!\n"); - } - break; - default: - break; - } reduxGenSetKTypes(gr); @@ -1545,9 +1609,9 @@ static int reduxGenInferProperties (GpuReduction* gr){ */ static void reduxGenSetMaxBS (GpuReduction* gr){ - gr->maxBS = gr->maxLM/reduxGenGetReduxStateSize(gr); - gr->maxBS = gr->maxBS < gr->maxLg ? gr->maxBS : gr->maxLg; - gr->maxBS = gr->maxBS < gr->maxL0 ? gr->maxBS : gr->maxL0; + gr->maxBS = gr->grAttr.maxLM/reduxGenGetReduxStateSize(gr); + gr->maxBS = gr->maxBS < gr->grAttr.maxLg ? gr->maxBS : gr->grAttr.maxLg; + gr->maxBS = gr->maxBS < gr->grAttr.maxL0 ? gr->maxBS : gr->grAttr.maxL0; /** * In practice we want a moderate amount of blocks, not just one monolith @@ -1588,8 +1652,9 @@ static void reduxGenSetMaxBS (GpuReduction* gr){ * In the future this might become wierder when the accumulator is a Kahan * summation, for instance, and then TK0 != promoted(TS0). * - * If the user guaranteed to us that TK1 can be made narrower than 64-bit - * unsigned through, perhaps, a flag, this is also where we set it. + * If the user guaranteed to us through gr->grAttr that TK1 can be made + * narrower than 64-bit, this is also where we'd take this into account. + * For now we default TK1 to exactly TI0. */ static void reduxGenSetKTypes (GpuReduction* gr){ @@ -1627,7 +1692,7 @@ static void reduxGenSetKTypes (GpuReduction* gr){ * they want. 
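The Kahan-summation accumulator mentioned above is one case where TK0 would stop being a plain promoted(TS0): the reduction state needs a second, compensation field. A purely illustrative sketch of such an accumulator, not part of this patch:

    typedef struct {
        float s;   /* running sum                */
        float c;   /* running compensation term  */
    } KahanF32;

    static void kahan_add(KahanF32* k, float v){
        float y = v - k->c;     /* apply the stored compensation         */
        float t = k->s + y;     /* add; low-order bits of y may be lost  */
        k->c = (t - k->s) - y;  /* recover the lost part                 */
        k->s = t;               /* commit the new running sum            */
    }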
*/ - switch (gr->op){ + switch (gr->grAttr.op){ case GA_REDUCE_SUM: TK0 = TPS0; reduxGetSumInit (TK0->typecode, &TK0init); @@ -1649,7 +1714,7 @@ static void reduxGenSetKTypes (GpuReduction* gr){ case GA_REDUCE_ARGMIN: case GA_REDUCE_MIN: TK0 = TPS0; - TK1 = gpuarray_get_type(GA_SIZE); + TK1 = gpuarray_get_type(gr->TI0tc); reduxGetMinInit (TK0->typecode, &TK0init); gr->TK0.align = TK0->align; gr->TK0.size = TK0->size; @@ -1664,7 +1729,7 @@ static void reduxGenSetKTypes (GpuReduction* gr){ case GA_REDUCE_ARGMAX: case GA_REDUCE_MAX: TK0 = TPS0; - TK1 = gpuarray_get_type(GA_SIZE); + TK1 = gpuarray_get_type(gr->TI0tc); reduxGetMaxInit (TK0->typecode, &TK0init); gr->TK0.align = TK0->align; gr->TK0.size = TK0->size; @@ -1807,8 +1872,7 @@ static void reduxGenIterArgs (const GpuReduction* gr, */ static int reduxGenSrc (GpuReduction* gr){ - sprintf(gr->kName, "reduxKernel%s_f%d_r%d", - reduxGetOpName(gr->op), gr->ndd, gr->ndr); + GpuReductionAttr_appendopname(&gr->grAttr, sizeof(gr->kName), gr->kName); reduxGenSrcAppend(gr); @@ -1855,6 +1919,9 @@ static void reduxGenSrcAppendMacroTypedefs(GpuReduction* gr){ if (reduxGenRequiresD1(gr)){ srcbAppendf(&gr->srcGen, "typedef %-20s TD1;\n", gpuarray_get_type(gr->TD1tc )->cluda_name); } + if (reduxGenKernelRequiresLatticeI0(gr)){ + srcbAppendf(&gr->srcGen, "typedef %-20s TI0;\n", gpuarray_get_type(gr->TI0tc )->cluda_name); + } srcbAppendf(&gr->srcGen, "typedef %-20s TS32;\n", gpuarray_get_type(gr->TS32tc)->cluda_name); srcbAppendf(&gr->srcGen, "typedef %-20s TU32;\n", gpuarray_get_type(gr->TU32tc)->cluda_name); srcbAppendf(&gr->srcGen, "typedef %-20s TS64;\n", gpuarray_get_type(gr->TS64tc)->cluda_name); @@ -1921,7 +1988,7 @@ static void reduxGenSrcAppendMacroTypedefs(GpuReduction* gr){ * flattened index i into reduction states V and I respectively. 
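The switch below emits one such REDUX definition per operation; the SUM case, shown in this hunk, is simply (V) += (v). For the max-and-argmax family the fold must update both the value and the index state; a sketch of what that expansion plausibly looks like (the exact generated text for the other ops is not part of this hunk):

    #define REDUX(V, I, v, i) do{        \
            if((v) > (V)){               \
                (V) = (v);               \
                (I) = (i);               \
            }                            \
        }while(0)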
*/ - switch (gr->op){ + switch (gr->grAttr.op){ case GA_REDUCE_SUM: srcbAppendf(&gr->srcGen, "#define REDUX(V, I, v, i) do{ \\\n" " (V) += (v); \\\n" @@ -2180,8 +2247,13 @@ static void reduxGenSrcAppendDecode (GpuReduction* gr){ int i; srcbAppends(&gr->srcGen, - " GA_DECL_SHARED_BODY(char, SHMEM)\n" - " DECLREDUXSTATE(tmpK0, I0)\n" + " GA_DECL_SHARED_BODY(char, SHMEM)\n"); + if (reduxGenKernelRequiresLatticeI0(gr)){ + srcbAppends(&gr->srcGen, + " TI0 I0;\n"); + } + srcbAppends(&gr->srcGen, + " TK0 tmpK0;\n" " DECLREDUXSTATE(K0, K1)\n" " INITREDUXSTATE(K0, K1);\n" " \n" @@ -2443,15 +2515,17 @@ static void reduxGenSrcAppendDecode (GpuReduction* gr){ } srcbEndList(&gr->srcGen); srcbAppends(&gr->srcGen, ";\n" + " local_barrier();\n" " if(perm < D){\n" - " ((TS64*)SHMEM)[perm] = D0Off;\n" + " ((TS64*)SHMEM)[perm] = D0Off;\n" " }\n" - " local_barrier();\n" - " if(LID_0 < D){\n" - " D0Off = ((TS64*)SHMEM)[LID_0];\n" + " if(LID_0 >= D){\n" + " ((TS64*)SHMEM)[LID_0] = 0;\n" " }\n" " local_barrier();\n" - " D0 += D0Off;\n"); + " D0Off = ((TS64*)SHMEM)[LID_0];\n" + " D0 += D0Off;\n" + " local_barrier();\n"); } @@ -2477,15 +2551,17 @@ static void reduxGenSrcAppendDecode (GpuReduction* gr){ } srcbEndList(&gr->srcGen); srcbAppends(&gr->srcGen, ";\n" + " local_barrier();\n" " if(perm < D){\n" - " ((TS64*)SHMEM)[perm] = D1Off;\n" + " ((TS64*)SHMEM)[perm] = D1Off;\n" " }\n" - " local_barrier();\n" - " if(LID_0 < D){\n" - " D1Off = ((TS64*)SHMEM)[LID_0];\n" + " if(LID_0 >= D){\n" + " ((TS64*)SHMEM)[LID_0] = 0;\n" " }\n" " local_barrier();\n" - " D1 += D1Off;\n"); + " D1Off = ((TS64*)SHMEM)[LID_0];\n" + " D1 += D1Off;\n" + " local_barrier();\n"); } @@ -2531,6 +2607,13 @@ static void reduxGenSrcAppendDecode (GpuReduction* gr){ " TK1* restrict const W1R = &W1[GDIM_0*D];\n" " TK1* restrict const SHMEMK1 = (TK1*)(SHMEM + SHMEMK1Off);\n"); } + srcbAppends(&gr->srcGen, + " INITREDUXSTATE(W0L[LID_0], W1L[LID_0]);\n" + " INITREDUXSTATE(W0R[LID_0], W1R[LID_0]);\n" + " if(D < LDIM_0 && LID_0+Dndd) ? "break" : "continue"; + const char* breakOrCont = (initial) && (axis < gr->ndd) ? "break " : "continue"; /* Pointer bumps */ srcbAppends(&gr->srcGen, " "); @@ -3023,7 +3106,7 @@ static size_t reduxGenEstimateParallelism (const GpuReduction* gr){ */ size_t marginFactor = 16; - return marginFactor * gr->numProcs * gr->maxLg; + return marginFactor * gr->grAttr.numProcs * gr->grAttr.maxLg; } /** @@ -3081,28 +3164,13 @@ static size_t reduxGenEstimateParallelism (const GpuReduction* gr){ */ static int reduxGenRequiresS0 (const GpuReduction* gr){ - (void)gr; - return 1; + return GpuReductionAttr_requiresS0(&gr->grAttr); } static int reduxGenRequiresD0 (const GpuReduction* gr){ - switch (gr->op){ - case GA_REDUCE_ARGMIN: - case GA_REDUCE_ARGMAX: - return 0; - default: - return 1; - } + return GpuReductionAttr_requiresD0(&gr->grAttr); } static int reduxGenRequiresD1 (const GpuReduction* gr){ - switch (gr->op){ - case GA_REDUCE_MINANDARGMIN: - case GA_REDUCE_MAXANDARGMAX: - case GA_REDUCE_ARGMIN: - case GA_REDUCE_ARGMAX: - return 1; - default: - return 0; - } + return GpuReductionAttr_requiresD1(&gr->grAttr); } static int reduxGenKernelRequiresLatticeS0(const GpuReduction* gr){ return reduxGenRequiresS0(gr); @@ -3254,10 +3322,6 @@ static size_t reduxGenGetWMEMK1Off (const GpuReduction* gr, size */ static int reduxInvInit (redux_ctx* ctx){ - /** - * We initialize certain parts of the context. 
- */ - ctx->L = ctx->Li = NULL; ctx->S0J = ctx->S0Si = NULL; ctx->D0J = ctx->D0Si = NULL; @@ -3281,7 +3345,7 @@ static int reduxInvInit (redux_ctx* ctx){ * @brief Begin inferring the properties of the reduction invocation. */ -static int reduxInvInferProperties (redux_ctx* ctx){ +static int reduxInvInferProperties (redux_ctx* ctx){ axis_desc* a; int i, j; size_t d; @@ -3320,7 +3384,8 @@ static int reduxInvInferProperties (redux_ctx* ctx){ ctx->nds0 = reduxInvRequiresS0(ctx) ? ctx->s0->nd : 0; ctx->nds0r = ctx->reduxLen; ctx->ndd0 = ctx->nds0 - ctx->nds0r; - ctx->ndfs0 = ctx->ndfs0r = ctx->ndfd0 = 0; + ctx->ndfs0 = ctx->nds0; + /* Insane reduxList? */ for (i=0;inds0r;i++){ @@ -3450,7 +3515,10 @@ static int reduxInvInferProperties (redux_ctx* ctx){ ctx->D1Off = ctx->d1->offset; } - return reduxInvFlattenSource(ctx); + + return ctx->flags & 0 ? //FIXME: Delete this hack after debugging. + reduxInvFlattenSource (ctx): + reduxInvComputeKernelArgs(ctx); } /** @@ -3464,8 +3532,6 @@ static int reduxInvFlattenSource (redux_ctx* ctx){ axis_desc* axis, *flatAxis, *sortAxis; int i, j, k, isSensitive; - ctx->ndfs0 = ctx->nds0; - /** * Pass 1: Flatten out 0- and 1-length axes. We already know that * @@ -3502,7 +3568,7 @@ static int reduxInvFlattenSource (redux_ctx* ctx){ */ k = ctx->ndfs0; - isSensitive = reduxIsSensitive(ctx->op); + isSensitive = GpuReductionAttr_issensitive(&ctx->gr->grAttr); qsort(ctx->xdSrc, ctx->ndfs0, sizeof(*ctx->xdSrc), isSensitive ? reduxSortFlatSensitive : reduxSortFlatInsensitive); for (i=j=1;indfs0;i++){ @@ -3517,19 +3583,6 @@ static int reduxInvFlattenSource (redux_ctx* ctx){ } ctx->ndfs0 = k; - - /** - * Compute number of flattened free and reduced axes. - */ - - for (ctx->ndfs0r=ctx->ndfd0=i=0;indfs0;i++){ - if (axisIsReduced(reduxInvGetSrcAxis(ctx, i))){ - ctx->ndfs0r++; - }else{ - ctx->ndfd0++; - } - } - return reduxInvComputeKernelArgs(ctx); } diff --git a/tests/check_reduction.c b/tests/check_reduction.c index 7a2141cfae..8e5eef93e4 100644 --- a/tests/check_reduction.c +++ b/tests/check_reduction.c @@ -16,6 +16,7 @@ void teardown(void); /* Defines */ +#define MAXERRPRINT 2 #define ga_assert_ok(e) ck_assert_int_eq(e, GA_NO_ERROR) @@ -74,18 +75,19 @@ START_TEST(test_maxandargmax_reduction){ * third dimensions. 
*/ + size_t errCnt = 0; size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; const int reduxList[] = {0,2}; - float *pSrc = calloc(sizeof(*pSrc), prodDims); - float *pMax = calloc(sizeof(*pMax), dims[1]); - unsigned long *pArgmax = calloc(sizeof(*pArgmax), dims[1]); + float* pS0 = calloc(1, sizeof(*pS0) * dims[0]*dims[1]*dims[2]); + float* pD0 = calloc(1, sizeof(*pD0) * dims[1] ); + size_t* pD1 = calloc(1, sizeof(*pD1) * dims[1] ); - ck_assert_ptr_ne(pSrc, NULL); - ck_assert_ptr_ne(pMax, NULL); - ck_assert_ptr_ne(pArgmax, NULL); + ck_assert_ptr_nonnull(pS0); + ck_assert_ptr_nonnull(pD0); + ck_assert_ptr_nonnull(pD1); /** @@ -93,7 +95,7 @@ START_TEST(test_maxandargmax_reduction){ */ for(i=0;i gtMax){ - gtMax = v; - gtArgmax = i*dims[2] + k; + if(v > gtD0){ + gtD0 = v; + gtD1 = i*dims[2] + k; } } } - if(gtMax != pMax[j]){ - fprintf(stderr, "Mismatch GT %f != %f UUT @ %zu!\n", - gtMax, pMax[j], j); - fflush(stderr); - } - if(gtArgmax != pArgmax[j]){ - fprintf(stderr, "Mismatch GT %zu != %zu UUT @ %zu!\n", - gtArgmax, pArgmax[j], j); - fflush(stderr); + if(gtD0 != pD0[j] || gtD1 != pD1[j]){ + errCnt++; + if(errCnt < MAXERRPRINT){ + fprintf(stderr, "%s:%d: Mismatch GT %f[%zu] != %f[%zu] UUT @ %zu!\n", + __func__, __LINE__, gtD0, gtD1, pD0[j], pD1[j], j); + fflush (stderr); + } } - ck_assert_msg(gtMax == pMax[j], "Max value mismatch!"); - ck_assert_msg(gtArgmax == pArgmax[j], "Argmax value mismatch!"); } + ck_assert_msg(errCnt == 0, "%zu mismatches!", errCnt); /** * Deallocate. */ - free(pSrc); - free(pMax); - free(pArgmax); - GpuArray_clear(&gaSrc); - GpuArray_clear(&gaMax); - GpuArray_clear(&gaArgmax); + free(pS0); + free(pD0); + free(pD1); + GpuArray_clear(&gaS0); + GpuArray_clear(&gaD0); + GpuArray_clear(&gaD1); }END_TEST START_TEST(test_maxandargmax_idxtranspose){ @@ -178,7 +185,8 @@ START_TEST(test_maxandargmax_idxtranspose){ * transposition of the argmax "coordinates" and thus a change in its * "flattened" output version. */ - + + size_t errCnt = 0; size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; @@ -186,13 +194,13 @@ START_TEST(test_maxandargmax_idxtranspose){ size_t rdxProdDims = rdxDims[0]; const int reduxList[] = {2,0}; - float *pSrc = calloc(sizeof(*pSrc), prodDims); - float *pMax = calloc(sizeof(*pMax), rdxProdDims); - unsigned long *pArgmax = calloc(sizeof(*pArgmax), rdxProdDims); + float* pS0 = calloc(1, sizeof(*pS0) * prodDims); + float* pD0 = calloc(1, sizeof(*pD0) * rdxProdDims); + size_t* pD1 = calloc(1, sizeof(*pD1) * rdxProdDims); - ck_assert_ptr_ne(pSrc, NULL); - ck_assert_ptr_ne(pMax, NULL); - ck_assert_ptr_ne(pArgmax, NULL); + ck_assert_ptr_nonnull(pS0); + ck_assert_ptr_nonnull(pD0); + ck_assert_ptr_nonnull(pD1); /** @@ -200,7 +208,7 @@ START_TEST(test_maxandargmax_idxtranspose){ */ for(i=0;i gtMax){ - gtMax = v; - gtArgmax = k*dims[0] + i; + if(v > gtD0){ + gtD0 = v; + gtD1 = k*dims[0] + i; } } } - ck_assert_msg(gtMax == pMax[j], "Max value mismatch!"); - ck_assert_msg(gtArgmax == pArgmax[j], "Argmax value mismatch!"); + if(gtD0 != pD0[j] || gtD1 != pD1[j]){ + errCnt++; + if(errCnt < MAXERRPRINT){ + fprintf(stderr, "%s:%d: Mismatch GT %f[%zu] != %f[%zu] UUT @ %zu!\n", + __func__, __LINE__, gtD0, gtD1, pD0[j], pD1[j], j); + fflush (stderr); + } + } } + ck_assert_msg(errCnt == 0, "%zu mismatches!", errCnt); /** * Deallocate. 
*/ - free(pSrc); - free(pMax); - free(pArgmax); - GpuArray_clear(&gaSrc); - GpuArray_clear(&gaMax); - GpuArray_clear(&gaArgmax); + free(pS0); + free(pD0); + free(pD1); + GpuArray_clear(&gaS0); + GpuArray_clear(&gaD0); + GpuArray_clear(&gaD1); }END_TEST START_TEST(test_maxandargmax_bigdestination){ @@ -273,19 +296,20 @@ START_TEST(test_maxandargmax_bigdestination){ * We test here a reduction of some random 3D tensor on the first and * third dimensions. */ - + + size_t errCnt = 0; size_t i,j; size_t dims[2] = {2,131072}; size_t prodDims = dims[0]*dims[1]; const int reduxList[] = {0}; - float* pSrc = calloc(1, sizeof(*pSrc) * dims[0]*dims[1]); - float* pMax = calloc(1, sizeof(*pMax) * dims[1]); - size_t* pArgmax = calloc(1, sizeof(*pArgmax) * dims[1]); + float* pS0 = calloc(1, sizeof(*pS0) * dims[0]*dims[1]); + float* pD0 = calloc(1, sizeof(*pD0) * dims[1]); + size_t* pD1 = calloc(1, sizeof(*pD1) * dims[1]); - ck_assert_ptr_ne(pSrc, NULL); - ck_assert_ptr_ne(pMax, NULL); - ck_assert_ptr_ne(pArgmax, NULL); + ck_assert_ptr_nonnull(pS0); + ck_assert_ptr_nonnull(pD0); + ck_assert_ptr_nonnull(pD1); /** @@ -293,7 +317,7 @@ START_TEST(test_maxandargmax_bigdestination){ */ for(i=0;i gtMax){ - gtMax = v; - gtArgmax = i; + if(v > gtD0){ + gtD0 = v; + gtD1 = i; } } - if(gtMax != pMax[j]){ - fprintf(stderr, "Mismatch GT %f != %f UUT @ %zu!\n", - gtMax, pMax[j], j); - fflush(stderr); - } - if(gtArgmax != pArgmax[j]){ - fprintf(stderr, "Mismatch GT %zu != %zu UUT @ %zu!\n", - gtArgmax, pArgmax[j], j); - fflush(stderr); + if(gtD0 != pD0[j] || gtD1 != pD1[j]){ + errCnt++; + if(errCnt < MAXERRPRINT){ + fprintf(stderr, "%s:%d: Mismatch GT %f[%zu] != %f[%zu] UUT @ %zu!\n", + __func__, __LINE__, gtD0, gtD1, pD0[j], pD1[j], j); + fflush (stderr); + } } - ck_assert_msg(gtMax == pMax[j], "Max value mismatch!"); - ck_assert_msg(gtArgmax == pArgmax[j], "Argmax value mismatch!"); } + ck_assert_msg(errCnt == 0, "%zu mismatches!", errCnt); /** * Deallocate. */ - free(pSrc); - free(pMax); - free(pArgmax); - GpuArray_clear(&gaSrc); - GpuArray_clear(&gaMax); - GpuArray_clear(&gaArgmax); + free(pS0); + free(pD0); + free(pD1); + GpuArray_clear(&gaS0); + GpuArray_clear(&gaD0); + GpuArray_clear(&gaD1); }END_TEST START_TEST(test_maxandargmax_veryhighrank){ @@ -374,6 +403,7 @@ START_TEST(test_maxandargmax_veryhighrank){ * Here we test a reduction of a random 8D tensor on four dimensions. 
*/ + size_t errCnt = 0; size_t i,j,k,l,m,n,o,p; size_t dims [8] = {1171,373,2,1,2,1,2,1}; size_t prodDims = dims[0]*dims[1]*dims[2]*dims[3]*dims[4]*dims[5]*dims[6]*dims[7]; @@ -381,13 +411,13 @@ START_TEST(test_maxandargmax_veryhighrank){ size_t rdxProdDims = rdxDims[0]*rdxDims[1]*rdxDims[2]*rdxDims[3]; const int reduxList[] = {2,4,7,5}; - float *pSrc = calloc(sizeof(*pSrc), prodDims); - float *pMax = calloc(sizeof(*pMax), rdxProdDims); - unsigned long *pArgmax = calloc(sizeof(*pArgmax), rdxProdDims); + float* pS0 = calloc(1, sizeof(*pS0) * prodDims); + float* pD0 = calloc(1, sizeof(*pD0) * rdxProdDims); + size_t* pD1 = calloc(1, sizeof(*pD1) * rdxProdDims); - ck_assert_ptr_ne(pSrc, NULL); - ck_assert_ptr_ne(pMax, NULL); - ck_assert_ptr_ne(pArgmax, NULL); + ck_assert_ptr_nonnull(pS0); + ck_assert_ptr_nonnull(pD0); + ck_assert_ptr_nonnull(pD1); /** @@ -395,7 +425,7 @@ START_TEST(test_maxandargmax_veryhighrank){ */ for(i=0;i gtMax){ - gtMax = v; - gtArgmax = (((k)*dims[4] + m)*dims[7] + p)*dims[5] + n; + if(v > gtD0){ + gtD0 = v; + gtD1 = (((k)*dims[4] + m)*dims[7] + p)*dims[5] + n; } } } @@ -453,24 +491,31 @@ START_TEST(test_maxandargmax_veryhighrank){ } size_t dstIdx = (((i)*dims[1] + j)*dims[3] + l)*dims[6] + o; - ck_assert_msg(gtMax == pMax[dstIdx], "Max value mismatch!"); - ck_assert_msg(gtArgmax == pArgmax[dstIdx], "Argmax value mismatch!"); + if(gtD0 != pD0[dstIdx] || gtD1 != pD1[dstIdx]){ + errCnt++; + if(errCnt < MAXERRPRINT){ + fprintf(stderr, "%s:%d: Mismatch GT %f[%zu] != %f[%zu] UUT @ %zu!\n", + __func__, __LINE__, gtD0, gtD1, pD0[dstIdx], pD1[dstIdx], dstIdx); + fflush (stderr); + } + } } } } } + ck_assert_msg(errCnt == 0, "%zu mismatches!", errCnt); /** * Deallocate. */ - free(pSrc); - free(pMax); - free(pArgmax); - GpuArray_clear(&gaSrc); - GpuArray_clear(&gaMax); - GpuArray_clear(&gaArgmax); + free(pS0); + free(pD0); + free(pD1); + GpuArray_clear(&gaS0); + GpuArray_clear(&gaD0); + GpuArray_clear(&gaD1); }END_TEST START_TEST(test_maxandargmax_alldimsreduced){ @@ -480,18 +525,19 @@ START_TEST(test_maxandargmax_alldimsreduced){ * We test here a reduction of some random 3D tensor on all dimensions. */ + size_t errCnt = 0; size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; const int reduxList[] = {0,1,2}; - float *pSrc = calloc(sizeof(*pSrc), prodDims); - float *pMax = calloc(1, sizeof(*pMax)); - unsigned long *pArgmax = calloc(1, sizeof(*pArgmax)); + float* pS0 = calloc(1, sizeof(*pS0) * dims[0]*dims[1]*dims[2]); + float* pD0 = calloc(1, sizeof(*pD0) ); + size_t* pD1 = calloc(1, sizeof(*pD1) ); - ck_assert_ptr_ne(pSrc, NULL); - ck_assert_ptr_ne(pMax, NULL); - ck_assert_ptr_ne(pArgmax, NULL); + ck_assert_ptr_nonnull(pS0); + ck_assert_ptr_nonnull(pD0); + ck_assert_ptr_nonnull(pD1); /** @@ -499,7 +545,7 @@ START_TEST(test_maxandargmax_alldimsreduced){ */ for(i=0;i gtMax){ - gtMax = v; - gtArgmax = (i*dims[1] + j)*dims[2] + k; + if(v > gtD0){ + gtD0 = v; + gtD1 = (i*dims[1] + j)*dims[2] + k; } } } } - - ck_assert_msg(gtMax == pMax[0], "Max value mismatch!"); - ck_assert_msg(gtArgmax == pArgmax[0], "Argmax value mismatch!"); + if(gtD0 != pD0[0] || gtD1 != pD1[0]){ + errCnt++; + if(errCnt < MAXERRPRINT){ + fprintf(stderr, "%s:%d: Mismatch GT %f[%zu] != %f[%zu] UUT @ %zu!\n", + __func__, __LINE__, gtD0, gtD1, pD0[0], pD1[0], (size_t)0); + fflush (stderr); + } + } + ck_assert_msg(errCnt == 0, "%zu mismatches!", errCnt); /** * Deallocate. 
*/ - free(pSrc); - free(pMax); - free(pArgmax); - GpuArray_clear(&gaSrc); - GpuArray_clear(&gaMax); - GpuArray_clear(&gaArgmax); + free(pS0); + free(pD0); + free(pD1); + GpuArray_clear(&gaS0); + GpuArray_clear(&gaD0); + GpuArray_clear(&gaD1); }END_TEST START_TEST(test_minandargmin_reduction){ @@ -573,18 +633,19 @@ START_TEST(test_minandargmin_reduction){ * third dimensions. */ + size_t errCnt = 0; size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; const int reduxList[] = {0,2}; - float* pSrc = calloc(1, sizeof(*pSrc) * dims[0]*dims[1]*dims[2]); - float* pMin = calloc(1, sizeof(*pMin) * dims[1] ); - size_t* pArgmin = calloc(1, sizeof(*pArgmin) * dims[1] ); + float* pS0 = calloc(1, sizeof(*pS0) * dims[0]*dims[1]*dims[2]); + float* pD0 = calloc(1, sizeof(*pD0) * dims[1] ); + size_t* pD1 = calloc(1, sizeof(*pD1) * dims[1] ); - ck_assert_ptr_ne(pSrc, NULL); - ck_assert_ptr_ne(pMin, NULL); - ck_assert_ptr_ne(pArgmin, NULL); + ck_assert_ptr_nonnull(pS0); + ck_assert_ptr_nonnull(pD0); + ck_assert_ptr_nonnull(pD1); /** @@ -592,7 +653,7 @@ START_TEST(test_minandargmin_reduction){ */ for(i=0;i gtMax){ - gtMax = v; - gtArgmax = i*dims[2] + k; + if(v > gtD0){ + gtD0 = v; + gtD1 = i*dims[2] + k; } } } - - ck_assert_msg(gtArgmax == pArgmax[j], "Argmax value mismatch!"); + + if(gtD1 != pD1[j]){ + errCnt++; + if(errCnt < MAXERRPRINT){ + fprintf(stderr, "%s:%d: Mismatch GT [%zu] != [%zu] UUT @ %zu!\n", + __func__, __LINE__, gtD1, pD1[j], j); + fflush (stderr); + } + } } + ck_assert_msg(errCnt == 0, "%zu mismatches!", errCnt); /** * Deallocate. */ - free(pSrc); - free(pMax); - free(pArgmax); - GpuArray_clear(&gaSrc); - GpuArray_clear(&gaArgmax); + free(pS0); + free(pD0); + free(pD1); + GpuArray_clear(&gaS0); + GpuArray_clear(&gaD1); }END_TEST START_TEST(test_argmax_veryhighrank){ @@ -950,6 +1073,7 @@ START_TEST(test_argmax_veryhighrank){ * Here we test a reduction of a random 8D tensor on four dimensions. */ + size_t errCnt = 0; size_t i,j,k,l,m,n,o,p; size_t dims [8] = {1171,373,2,1,2,1,2,1}; size_t prodDims = dims[0]*dims[1]*dims[2]*dims[3]*dims[4]*dims[5]*dims[6]*dims[7]; @@ -957,12 +1081,12 @@ START_TEST(test_argmax_veryhighrank){ size_t rdxProdDims = rdxDims[0]*rdxDims[1]*rdxDims[2]*rdxDims[3]; const int reduxList[] = {2,4,7,5}; - float* pSrc = calloc(1, sizeof(*pSrc) * prodDims); - float* pMax = calloc(1, sizeof(*pMax) * rdxProdDims); - size_t* pArgmax = calloc(1, sizeof(*pArgmax) * rdxProdDims); + float* pS0 = calloc(1, sizeof(*pS0) * prodDims); + float* pD0 = calloc(1, sizeof(*pD0) * rdxProdDims); + size_t* pD1 = calloc(1, sizeof(*pD1) * rdxProdDims); - ck_assert_ptr_ne(pSrc, NULL); - ck_assert_ptr_ne(pArgmax, NULL); + ck_assert_ptr_nonnull(pS0); + ck_assert_ptr_nonnull(pD1); /** @@ -970,7 +1094,7 @@ START_TEST(test_argmax_veryhighrank){ */ for(i=0;i gtMax){ - gtMax = v; - gtArgmax = (((k)*dims[4] + m)*dims[7] + p)*dims[5] + n; + if(v > gtD0){ + gtD0 = v; + gtD1 = (((k)*dims[4] + m)*dims[7] + p)*dims[5] + n; } } } @@ -1024,22 +1155,30 @@ START_TEST(test_argmax_veryhighrank){ } size_t dstIdx = (((i)*dims[1] + j)*dims[3] + l)*dims[6] + o; - ck_assert_msg(gtArgmax == pArgmax[dstIdx], "Argmax value mismatch!"); + if(gtD1 != pD1[dstIdx]){ + errCnt++; + if(errCnt < MAXERRPRINT){ + fprintf(stderr, "%s:%d: Mismatch GT [%zu] != [%zu] UUT @ %zu!\n", + __func__, __LINE__, gtD1, pD1[dstIdx], dstIdx); + fflush (stderr); + } + } } } } } + ck_assert_msg(errCnt == 0, "%zu mismatches!", errCnt); /** * Deallocate. 
*/ - free(pSrc); - free(pMax); - free(pArgmax); - GpuArray_clear(&gaSrc); - GpuArray_clear(&gaArgmax); + free(pS0); + free(pD0); + free(pD1); + GpuArray_clear(&gaS0); + GpuArray_clear(&gaD1); }END_TEST START_TEST(test_argmax_alldimsreduced){ @@ -1049,18 +1188,19 @@ START_TEST(test_argmax_alldimsreduced){ * We test here a reduction of some random 3D tensor on all dimensions. */ + size_t errCnt = 0; size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; const int reduxList[] = {0,1,2}; - float* pSrc = calloc(1, sizeof(*pSrc) * dims[0]*dims[1]*dims[2]); - float* pMax = calloc(1, sizeof(*pMax) ); - size_t* pArgmax = calloc(1, sizeof(*pArgmax) ); + float* pS0 = calloc(1, sizeof(*pS0) * dims[0]*dims[1]*dims[2]); + float* pD0 = calloc(1, sizeof(*pD0) ); + size_t* pD1 = calloc(1, sizeof(*pD1) ); - ck_assert_ptr_ne(pSrc, NULL); - ck_assert_ptr_ne(pMax, NULL); - ck_assert_ptr_ne(pArgmax, NULL); + ck_assert_ptr_nonnull(pS0); + ck_assert_ptr_nonnull(pD0); + ck_assert_ptr_nonnull(pD1); /** @@ -1068,7 +1208,7 @@ START_TEST(test_argmax_alldimsreduced){ */ for(i=0;i gtMax){ - gtMax = v; - gtArgmax = (i*dims[1] + j)*dims[2] + k; + if(v > gtD0){ + gtD0 = v; + gtD1 = (i*dims[1] + j)*dims[2] + k; } } } } - - ck_assert_msg(gtArgmax == pArgmax[0], "Argmax value mismatch!"); + if(gtD1 != pD1[0]){ + errCnt++; + if(errCnt < MAXERRPRINT){ + fprintf(stderr, "%s:%d: Mismatch GT [%zu] != [%zu] UUT @ %zu!\n", + __func__, __LINE__, gtD1, pD1[0], (size_t)0); + fflush (stderr); + } + } + ck_assert_msg(errCnt == 0, "%zu mismatches!", errCnt); /** * Deallocate. */ - free(pSrc); - free(pMax); - free(pArgmax); - GpuArray_clear(&gaSrc); - GpuArray_clear(&gaArgmax); + free(pS0); + free(pD0); + free(pD1); + GpuArray_clear(&gaS0); + GpuArray_clear(&gaD1); }END_TEST START_TEST(test_argmin_reduction){ @@ -1136,18 +1290,19 @@ START_TEST(test_argmin_reduction){ * third dimensions. */ + size_t errCnt = 0; size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; const int reduxList[] = {0,2}; - float* pSrc = calloc(1, sizeof(*pSrc) * dims[0]*dims[1]*dims[2]); - float* pMin = calloc(1, sizeof(*pMin) * dims[1] ); - size_t* pArgmin = calloc(1, sizeof(*pArgmin) * dims[1] ); + float* pS0 = calloc(1, sizeof(*pS0) * dims[0]*dims[1]*dims[2]); + float* pD0 = calloc(1, sizeof(*pD0) * dims[1] ); + size_t* pD1 = calloc(1, sizeof(*pD1) * dims[1] ); - ck_assert_ptr_ne(pSrc, NULL); - ck_assert_ptr_ne(pMin, NULL); - ck_assert_ptr_ne(pArgmin, NULL); + ck_assert_ptr_nonnull(pS0); + ck_assert_ptr_nonnull(pD0); + ck_assert_ptr_nonnull(pD1); /** @@ -1155,7 +1310,7 @@ START_TEST(test_argmin_reduction){ */ for(i=0;i gtMax){ - gtMax = v; + if(v > gtD0){ + gtD0 = v; } } } - - ck_assert_msg(gtMax == pMax[j], "Max value mismatch!"); + + if(gtD0 != pD0[j]){ + errCnt++; + if(errCnt < MAXERRPRINT){ + fprintf(stderr, "%s:%d: Mismatch GT %f != %f UUT @ %zu!\n", + __func__, __LINE__, gtD0, pD0[j], j); + fflush (stderr); + } + } } + ck_assert_msg(errCnt == 0, "%zu mismatches!", errCnt); /** * Deallocate. */ - free(pSrc); - free(pMax); - GpuArray_clear(&gaSrc); - GpuArray_clear(&gaMax); + free(pS0); + free(pD0); + GpuArray_clear(&gaS0); + GpuArray_clear(&gaD0); }END_TEST START_TEST(test_max_veryhighrank){ @@ -1488,6 +1705,7 @@ START_TEST(test_max_veryhighrank){ * Here we test a reduction of a random 8D tensor on four dimensions. 
*/ + size_t errCnt = 0; size_t i,j,k,l,m,n,o,p; size_t dims [8] = {1171,373,2,1,2,1,2,1}; size_t prodDims = dims[0]*dims[1]*dims[2]*dims[3]*dims[4]*dims[5]*dims[6]*dims[7]; @@ -1495,11 +1713,11 @@ START_TEST(test_max_veryhighrank){ size_t rdxProdDims = rdxDims[0]*rdxDims[1]*rdxDims[2]*rdxDims[3]; const int reduxList[] = {2,4,7,5}; - float* pSrc = calloc(1, sizeof(*pSrc) * prodDims); - float* pMax = calloc(1, sizeof(*pMax) * rdxProdDims); + float* pS0 = calloc(1, sizeof(*pS0) * prodDims); + float* pD0 = calloc(1, sizeof(*pD0) * rdxProdDims); - ck_assert_ptr_ne(pSrc, NULL); - ck_assert_ptr_ne(pMax, NULL); + ck_assert_ptr_nonnull(pS0); + ck_assert_ptr_nonnull(pD0); /** @@ -1507,7 +1725,7 @@ START_TEST(test_max_veryhighrank){ */ for(i=0;i gtMax){ - gtMax = v; + if(v > gtD0){ + gtD0 = v; } } } @@ -1559,21 +1784,29 @@ START_TEST(test_max_veryhighrank){ } size_t dstIdx = (((i)*dims[1] + j)*dims[3] + l)*dims[6] + o; - ck_assert_msg(gtMax == pMax[dstIdx], "Max value mismatch!"); + if(gtD0 != pD0[dstIdx]){ + errCnt++; + if(errCnt < MAXERRPRINT){ + fprintf(stderr, "%s:%d: Mismatch GT %f != %f UUT @ %zu!\n", + __func__, __LINE__, gtD0, pD0[dstIdx], dstIdx); + fflush (stderr); + } + } } } } } + ck_assert_msg(errCnt == 0, "%zu mismatches!", errCnt); /** * Deallocate. */ - free(pSrc); - free(pMax); - GpuArray_clear(&gaSrc); - GpuArray_clear(&gaMax); + free(pS0); + free(pD0); + GpuArray_clear(&gaS0); + GpuArray_clear(&gaD0); }END_TEST START_TEST(test_max_alldimsreduced){ @@ -1583,16 +1816,17 @@ START_TEST(test_max_alldimsreduced){ * We test here a reduction of some random 3D tensor on all dimensions. */ + size_t errCnt = 0; size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; const int reduxList[] = {0,1,2}; - float* pSrc = calloc(1, sizeof(*pSrc) * dims[0]*dims[1]*dims[2]); - float* pMax = calloc(1, sizeof(*pMax) ); + float* pS0 = calloc(1, sizeof(*pS0) * dims[0]*dims[1]*dims[2]); + float* pD0 = calloc(1, sizeof(*pD0) ); - ck_assert_ptr_ne(pSrc, NULL); - ck_assert_ptr_ne(pMax, NULL); + ck_assert_ptr_nonnull(pS0); + ck_assert_ptr_nonnull(pD0); /** @@ -1600,7 +1834,7 @@ START_TEST(test_max_alldimsreduced){ */ for(i=0;i gtMax){ - gtMax = v; + if(v > gtD0){ + gtD0 = v; } } } } - - ck_assert_msg(gtMax == pMax[0], "Max value mismatch!"); + if(gtD0 != pD0[0]){ + errCnt++; + if(errCnt < MAXERRPRINT){ + fprintf(stderr, "%s:%d: Mismatch GT %f != %f UUT @ %zu!\n", + __func__, __LINE__, gtD0, pD0[0], (size_t)0); + fflush (stderr); + } + } + ck_assert_msg(errCnt == 0, "%zu mismatches!", errCnt); /** * Deallocate. */ - free(pSrc); - free(pMax); - GpuArray_clear(&gaSrc); - GpuArray_clear(&gaMax); + free(pS0); + free(pD0); + GpuArray_clear(&gaS0); + GpuArray_clear(&gaD0); }END_TEST START_TEST(test_min_reduction){ @@ -1664,16 +1912,17 @@ START_TEST(test_min_reduction){ * We test here a reduction of some random 3D tensor on all dimensions. 
*/ + size_t errCnt = 0; size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; const int reduxList[] = {0,2}; - float* pSrc = calloc(1, sizeof(*pSrc) * dims[0]*dims[1]*dims[2]); - float* pMin = calloc(1, sizeof(*pMin) * dims[1] ); + float* pS0 = calloc(1, sizeof(*pS0) * dims[0]*dims[1]*dims[2]); + float* pD0 = calloc(1, sizeof(*pD0) * dims[1] ); - ck_assert_ptr_ne(pSrc, NULL); - ck_assert_ptr_ne(pMin, NULL); + ck_assert_ptr_nonnull(pS0); + ck_assert_ptr_nonnull(pD0); /** @@ -1681,7 +1930,7 @@ START_TEST(test_min_reduction){ */ for(i=0;i= TOL){ + errCnt++; + if(errCnt < MAXERRPRINT){ + fprintf(stderr, "%s:%d: Mismatch GT %f != %f UUT @ %zu (TOL=%f)!\n", + __func__, __LINE__, gtD0, pD0[j], j, TOL); + fflush (stderr); } } - - ck_assert_double_eq_tol(gtD, pD[j], TOL); } + ck_assert_msg(errCnt == 0, "%zu mismatches!", errCnt); /** * Deallocate. */ - free(pS); - free(pD); - GpuArray_clear(&gaS); - GpuArray_clear(&gaD); + free(pS0); + free(pD0); + GpuArray_clear(&gaS0); + GpuArray_clear(&gaD0); }END_TEST START_TEST(test_sum_veryhighrank){ @@ -2000,6 +2311,7 @@ START_TEST(test_sum_veryhighrank){ * Here we test a reduction of a random 8D tensor on four dimensions. */ + size_t errCnt = 0; size_t i,j,k,l,m,n,o,p; size_t dims [8] = {1171,373,2,1,2,1,2,1}; size_t prodDims = dims[0]*dims[1]*dims[2]*dims[3]*dims[4]*dims[5]*dims[6]*dims[7]; @@ -2008,11 +2320,11 @@ START_TEST(test_sum_veryhighrank){ const int reduxList[] = {2,4,7,5}; const float TOL = 1e-4; - float* pS = calloc(1, sizeof(*pS) * prodDims); - float* pD = calloc(1, sizeof(*pD) * rdxProdDims); + float* pS0 = calloc(1, sizeof(*pS0) * prodDims); + float* pD0 = calloc(1, sizeof(*pD0) * rdxProdDims); - ck_assert_ptr_ne(pS, NULL); - ck_assert_ptr_ne(pD, NULL); + ck_assert_ptr_nonnull(pS0); + ck_assert_ptr_nonnull(pD0); /** @@ -2020,7 +2332,7 @@ START_TEST(test_sum_veryhighrank){ */ for(i=0;i= TOL){ + errCnt++; + if(errCnt < MAXERRPRINT){ + fprintf(stderr, "%s:%d: Mismatch GT %f != %f UUT @ %zu (TOL=%f)!\n", + __func__, __LINE__, gtD0, pD0[dstIdx], dstIdx, TOL); + fflush (stderr); + } + } } } } } + ck_assert_msg(errCnt == 0, "%zu mismatches!", errCnt); /** * Deallocate. */ - free(pS); - free(pD); - GpuArray_clear(&gaS); - GpuArray_clear(&gaD); + free(pS0); + free(pD0); + GpuArray_clear(&gaS0); + GpuArray_clear(&gaD0); }END_TEST START_TEST(test_sum_alldimsreduced){ @@ -2093,17 +2420,18 @@ START_TEST(test_sum_alldimsreduced){ * We test here a reduction of some random 3D tensor on all dimensions. */ + size_t errCnt = 0; size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; const int reduxList[] = {0,1,2}; const float TOL = 1e-4; - float* pS = calloc(1, sizeof(*pS) * dims[0]*dims[1]*dims[2]); - float* pD = calloc(1, sizeof(*pD) ); + float* pS0 = calloc(1, sizeof(*pS0) * dims[0]*dims[1]*dims[2]); + float* pD0 = calloc(1, sizeof(*pD0) ); - ck_assert_ptr_ne(pS, NULL); - ck_assert_ptr_ne(pD, NULL); + ck_assert_ptr_nonnull(pS0); + ck_assert_ptr_nonnull(pD0); /** @@ -2111,7 +2439,7 @@ START_TEST(test_sum_alldimsreduced){ */ for(i=0;i= TOL){ + errCnt++; + if(errCnt < MAXERRPRINT){ + fprintf(stderr, "%s:%d: Mismatch GT %f != %f UUT @ %zu (TOL=%f)!\n", + __func__, __LINE__, gtD0, pD0[0], (size_t)0, TOL); + fflush (stderr); + } + } + ck_assert_msg(errCnt == 0, "%zu mismatches!", errCnt); /** * Deallocate. 
*/ - free(pS); - free(pD); - GpuArray_clear(&gaS); - GpuArray_clear(&gaD); + free(pS0); + free(pD0); + GpuArray_clear(&gaS0); + GpuArray_clear(&gaD0); }END_TEST START_TEST(test_sum_huge){ @@ -2172,17 +2514,18 @@ START_TEST(test_sum_huge){ * We test here a reduction of a huge 1D tensor on all dimensions. */ + size_t errCnt = 0; size_t i; size_t dims[1] = {100000000}; size_t prodDims = dims[0]; const int reduxList[] = {0}; const float TOL = 1e-2; - float* pS = calloc(1, sizeof(*pS) * dims[0]); - float* pD = calloc(1, sizeof(*pD)); + float* pS0 = calloc(1, sizeof(*pS0) * dims[0]); + float* pD0 = calloc(1, sizeof(*pD0)); - ck_assert_ptr_ne(pS, NULL); - ck_assert_ptr_ne(pD, NULL); + ck_assert_ptr_nonnull(pS0); + ck_assert_ptr_nonnull(pD0); /** @@ -2190,7 +2533,7 @@ START_TEST(test_sum_huge){ */ for(i=0;i= TOL){ + errCnt++; + if(errCnt < MAXERRPRINT){ + fprintf(stderr, "%s:%d: Mismatch GT %f != %f UUT @ %zu (TOL=%f)!\n", + __func__, __LINE__, gtD0, pD0[0], (size_t)0, TOL); + fflush (stderr); + } } - ck_assert_double_eq_tol(gtD, pD[0], TOL); + ck_assert_msg(errCnt == 0, "%zu mismatches!", errCnt); /** * Deallocate. */ - free(pS); - free(pD); - GpuArray_clear(&gaS); - GpuArray_clear(&gaD); + free(pS0); + free(pD0); + GpuArray_clear(&gaS0); + GpuArray_clear(&gaD0); }END_TEST START_TEST(test_prod_reduction){ @@ -2245,17 +2603,18 @@ START_TEST(test_prod_reduction){ * We test here a reduction of some random 3D tensor on all dimensions. */ + size_t errCnt = 0; size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; const int reduxList[] = {0,2}; const float TOL = 1e-4; - float* pS = calloc(1, sizeof(*pS) * dims[0]*dims[1]*dims[2]); - float* pD = calloc(1, sizeof(*pD) * dims[1] ); + float* pS0 = calloc(1, sizeof(*pS0) * dims[0]*dims[1]*dims[2]); + float* pD0 = calloc(1, sizeof(*pD0) * dims[1] ); - ck_assert_ptr_ne(pS, NULL); - ck_assert_ptr_ne(pD, NULL); + ck_assert_ptr_nonnull(pS0); + ck_assert_ptr_nonnull(pD0); /** @@ -2263,7 +2622,7 @@ START_TEST(test_prod_reduction){ */ for(i=0;i= TOL){ + errCnt++; + if(errCnt < MAXERRPRINT){ + fprintf(stderr, "%s:%d: Mismatch GT %f != %f UUT @ %zu (TOL=%f)!\n", + __func__, __LINE__, gtD0, pD0[j], j, TOL); + fflush (stderr); } } - - ck_assert_double_eq_tol(gtD, pD[j], TOL); } + ck_assert_msg(errCnt == 0, "%zu mismatches!", errCnt); /** * Deallocate. */ - free(pS); - free(pD); - GpuArray_clear(&gaS); - GpuArray_clear(&gaD); + free(pS0); + free(pD0); + GpuArray_clear(&gaS0); + GpuArray_clear(&gaD0); }END_TEST START_TEST(test_prod_veryhighrank){ @@ -2324,6 +2698,7 @@ START_TEST(test_prod_veryhighrank){ * Here we test a reduction of a random 8D tensor on four dimensions. 
*/ + size_t errCnt = 0; size_t i,j,k,l,m,n,o,p; size_t dims [8] = {1171,373,2,1,2,1,2,1}; size_t prodDims = dims[0]*dims[1]*dims[2]*dims[3]*dims[4]*dims[5]*dims[6]*dims[7]; @@ -2332,11 +2707,11 @@ START_TEST(test_prod_veryhighrank){ const int reduxList[] = {2,4,7,5}; const float TOL = 1e-4; - float* pS = calloc(1, sizeof(*pS) * prodDims); - float* pD = calloc(1, sizeof(*pD) * rdxProdDims); + float* pS0 = calloc(1, sizeof(*pS0) * prodDims); + float* pD0 = calloc(1, sizeof(*pD0) * rdxProdDims); - ck_assert_ptr_ne(pS, NULL); - ck_assert_ptr_ne(pD, NULL); + ck_assert_ptr_nonnull(pS0); + ck_assert_ptr_nonnull(pD0); /** @@ -2344,7 +2719,7 @@ START_TEST(test_prod_veryhighrank){ */ for(i=0;i= TOL){ + errCnt++; + if(errCnt < MAXERRPRINT){ + fprintf(stderr, "%s:%d: Mismatch GT %f != %f UUT @ %zu (TOL=%f)!\n", + __func__, __LINE__, gtD0, pD0[dstIdx], dstIdx, TOL); + fflush (stderr); + } + } } } } } + ck_assert_msg(errCnt == 0, "%zu mismatches!", errCnt); /** * Deallocate. */ - free(pS); - free(pD); - GpuArray_clear(&gaS); - GpuArray_clear(&gaD); + free(pS0); + free(pD0); + GpuArray_clear(&gaS0); + GpuArray_clear(&gaD0); }END_TEST START_TEST(test_prod_alldimsreduced){ @@ -2417,17 +2807,18 @@ START_TEST(test_prod_alldimsreduced){ * We test here a reduction of some random 3D tensor on all dimensions. */ + size_t errCnt = 0; size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; const int reduxList[] = {0,1,2}; const float TOL = 1e-4; - float* pS = calloc(1, sizeof(*pS) * dims[0]*dims[1]*dims[2]); - float* pD = calloc(1, sizeof(*pD) ); + float* pS0 = calloc(1, sizeof(*pS0) * dims[0]*dims[1]*dims[2]); + float* pD0 = calloc(1, sizeof(*pD0) ); - ck_assert_ptr_ne(pS, NULL); - ck_assert_ptr_ne(pD, NULL); + ck_assert_ptr_nonnull(pS0); + ck_assert_ptr_nonnull(pD0); /** @@ -2435,7 +2826,7 @@ START_TEST(test_prod_alldimsreduced){ */ for(i=0;i= TOL){ + errCnt++; + if(errCnt < MAXERRPRINT){ + fprintf(stderr, "%s:%d: Mismatch GT %f != %f UUT @ %zu (TOL=%f)!\n", + __func__, __LINE__, gtD0, pD0[0], (size_t)0, TOL); + fflush (stderr); + } + } + ck_assert_msg(errCnt == 0, "%zu mismatches!", errCnt); /** * Deallocate. */ - free(pS); - free(pD); - GpuArray_clear(&gaS); - GpuArray_clear(&gaD); + free(pS0); + free(pD0); + GpuArray_clear(&gaS0); + GpuArray_clear(&gaD0); }END_TEST START_TEST(test_prodnz_reduction){ @@ -2496,17 +2901,18 @@ START_TEST(test_prodnz_reduction){ * We test here a reduction of some random 3D tensor on all dimensions. */ + size_t errCnt = 0; size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; const int reduxList[] = {0,2}; const float TOL = 1e-4; - float* pS = calloc(1, sizeof(*pS) * dims[0]*dims[1]*dims[2]); - float* pD = calloc(1, sizeof(*pD) * dims[1] ); + float* pS0 = calloc(1, sizeof(*pS0) * dims[0]*dims[1]*dims[2]); + float* pD0 = calloc(1, sizeof(*pD0) * dims[1] ); - ck_assert_ptr_ne(pS, NULL); - ck_assert_ptr_ne(pD, NULL); + ck_assert_ptr_nonnull(pS0); + ck_assert_ptr_nonnull(pD0); /** @@ -2514,9 +2920,9 @@ START_TEST(test_prodnz_reduction){ */ for(i=0;i= TOL){ + errCnt++; + if(errCnt < MAXERRPRINT){ + fprintf(stderr, "%s:%d: Mismatch GT %f != %f UUT @ %zu (TOL=%f)!\n", + __func__, __LINE__, gtD0, pD0[j], j, TOL); + fflush (stderr); } } - - ck_assert_double_eq_tol(gtD, pD[j], TOL); } + ck_assert_msg(errCnt == 0, "%zu mismatches!", errCnt); /** * Deallocate. 
*/ - free(pS); - free(pD); - GpuArray_clear(&gaS); - GpuArray_clear(&gaD); + free(pS0); + free(pD0); + GpuArray_clear(&gaS0); + GpuArray_clear(&gaD0); }END_TEST START_TEST(test_prodnz_veryhighrank){ @@ -2578,6 +2999,7 @@ START_TEST(test_prodnz_veryhighrank){ * Here we test a reduction of a random 8D tensor on four dimensions. */ + size_t errCnt = 0; size_t i,j,k,l,m,n,o,p; size_t dims [8] = {1171,373,2,1,2,1,2,1}; size_t prodDims = dims[0]*dims[1]*dims[2]*dims[3]*dims[4]*dims[5]*dims[6]*dims[7]; @@ -2586,11 +3008,11 @@ START_TEST(test_prodnz_veryhighrank){ const int reduxList[] = {2,4,7,5}; const float TOL = 1e-4; - float* pS = calloc(1, sizeof(*pS) * prodDims); - float* pD = calloc(1, sizeof(*pD) * rdxProdDims); + float* pS0 = calloc(1, sizeof(*pS0) * prodDims); + float* pD0 = calloc(1, sizeof(*pD0) * rdxProdDims); - ck_assert_ptr_ne(pS, NULL); - ck_assert_ptr_ne(pD, NULL); + ck_assert_ptr_nonnull(pS0); + ck_assert_ptr_nonnull(pD0); /** @@ -2598,9 +3020,9 @@ START_TEST(test_prodnz_veryhighrank){ */ for(i=0;i= TOL){ + errCnt++; + if(errCnt < MAXERRPRINT){ + fprintf(stderr, "%s:%d: Mismatch GT %f != %f UUT @ %zu (TOL=%f)!\n", + __func__, __LINE__, gtD0, pD0[dstIdx], dstIdx, TOL); + fflush (stderr); + } + } } } } } + ck_assert_msg(errCnt == 0, "%zu mismatches!", errCnt); /** * Deallocate. */ - free(pS); - free(pD); - GpuArray_clear(&gaS); - GpuArray_clear(&gaD); + free(pS0); + free(pD0); + GpuArray_clear(&gaS0); + GpuArray_clear(&gaD0); }END_TEST START_TEST(test_prodnz_alldimsreduced){ @@ -2674,17 +3111,18 @@ START_TEST(test_prodnz_alldimsreduced){ * We test here a reduction of some random 3D tensor on all dimensions. */ + size_t errCnt = 0; size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; const int reduxList[] = {0,1,2}; const float TOL = 1e-4; - float* pS = calloc(1, sizeof(*pS) * dims[0]*dims[1]*dims[2]); - float* pD = calloc(1, sizeof(*pD) ); + float* pS0 = calloc(1, sizeof(*pS0) * dims[0]*dims[1]*dims[2]); + float* pD0 = calloc(1, sizeof(*pD0) ); - ck_assert_ptr_ne(pS, NULL); - ck_assert_ptr_ne(pD, NULL); + ck_assert_ptr_nonnull(pS0); + ck_assert_ptr_nonnull(pD0); /** @@ -2692,9 +3130,9 @@ START_TEST(test_prodnz_alldimsreduced){ */ for(i=0;i= TOL){ + errCnt++; + if(errCnt < MAXERRPRINT){ + fprintf(stderr, "%s:%d: Mismatch GT %f != %f UUT @ %zu (TOL=%f)!\n", + __func__, __LINE__, gtD0, pD0[0], (size_t)0, TOL); + fflush (stderr); + } + } + ck_assert_msg(errCnt == 0, "%zu mismatches!", errCnt); /** * Deallocate. */ - free(pS); - free(pD); - GpuArray_clear(&gaS); - GpuArray_clear(&gaD); + free(pS0); + free(pD0); + GpuArray_clear(&gaS0); + GpuArray_clear(&gaD0); }END_TEST START_TEST(test_and_reduction){ @@ -2756,16 +3208,17 @@ START_TEST(test_and_reduction){ * We test here a reduction of some random 3D tensor on all dimensions. */ + size_t errCnt = 0; size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; const int reduxList[] = {0,2}; - uint32_t* pS = calloc(1, sizeof(*pS) * dims[0]*dims[1]*dims[2]); - uint32_t* pD = calloc(1, sizeof(*pD) * dims[1] ); + uint32_t* pS0 = calloc(1, sizeof(*pS0) * dims[0]*dims[1]*dims[2]); + uint32_t* pD0 = calloc(1, sizeof(*pD0) * dims[1] ); - ck_assert_ptr_ne(pS, NULL); - ck_assert_ptr_ne(pD, NULL); + ck_assert_ptr_nonnull(pS0); + ck_assert_ptr_nonnull(pD0); /** @@ -2778,11 +3231,11 @@ START_TEST(test_and_reduction){ * probability. 
*/ - pS[i] = (uint32_t)(pcgRand01() * (uint32_t)-1); - pS[i] |= (uint32_t)(pcgRand01() * (uint32_t)-1); - pS[i] |= (uint32_t)(pcgRand01() * (uint32_t)-1); - pS[i] |= (uint32_t)(pcgRand01() * (uint32_t)-1); - pS[i] |= (uint32_t)(pcgRand01() * (uint32_t)-1); + pS0[i] = (uint32_t)(pcgRand01() * (uint32_t)-1); + pS0[i] |= (uint32_t)(pcgRand01() * (uint32_t)-1); + pS0[i] |= (uint32_t)(pcgRand01() * (uint32_t)-1); + pS0[i] |= (uint32_t)(pcgRand01() * (uint32_t)-1); + pS0[i] |= (uint32_t)(pcgRand01() * (uint32_t)-1); } @@ -2790,23 +3243,30 @@ START_TEST(test_and_reduction){ * Run the kernel. */ - GpuArray gaS; - GpuArray gaD; + GpuArray gaS0; + GpuArray gaD0; + GpuReductionAttr* grAttr; + GpuReduction* gr; - ga_assert_ok(GpuArray_empty (&gaS, ctx, GA_UINT, 3, &dims[0], GA_C_ORDER)); - ga_assert_ok(GpuArray_empty (&gaD, ctx, GA_UINT, 1, &dims[1], GA_C_ORDER)); + ga_assert_ok(GpuArray_empty (&gaS0, ctx, GA_UINT, 3, &dims[0], GA_C_ORDER)); + ga_assert_ok(GpuArray_empty (&gaD0, ctx, GA_UINT, 1, &dims[1], GA_C_ORDER)); - ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); - ga_assert_ok(GpuArray_memset(&gaD, -1)); - - GpuReduction* gr; - GpuReduction_new(&gr, GpuArray_context(&gaS), - GA_REDUCE_AND, 1, 2, gaS.typecode, 0); + ga_assert_ok(GpuArray_write (&gaS0, pS0, sizeof(*pS0)*prodDims)); + ga_assert_ok(GpuArray_memset(&gaD0, -1)); + + GpuReductionAttr_new(&grAttr, GpuArray_context(&gaS0)); + ck_assert_ptr_nonnull(grAttr); + GpuReductionAttr_setop (grAttr, GA_REDUCE_AND); + GpuReductionAttr_setdims (grAttr, gaS0.nd, gaD0.nd); + GpuReductionAttr_sets0type(grAttr, gaS0.typecode); + GpuReductionAttr_setd0type(grAttr, gaD0.typecode); + GpuReduction_new(&gr, grAttr); ck_assert_ptr_nonnull(gr); - ga_assert_ok(GpuReduction_call(gr, &gaD, NULL, &gaS, 2, reduxList, 0)); + ga_assert_ok(GpuReduction_call(gr, &gaD0, NULL, &gaS0, gaS0.nd-gaD0.nd, reduxList, 0)); GpuReduction_free(gr); + GpuReductionAttr_free(grAttr); - ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*dims[1], &gaD)); + ga_assert_ok(GpuArray_read (pD0, sizeof(*pD0)*dims[1], &gaD0)); /** @@ -2814,26 +3274,34 @@ START_TEST(test_and_reduction){ */ for(j=0;j 0.05; + pS0[i] = pcgRand01() > 0.05; } @@ -3862,23 +4503,30 @@ START_TEST(test_all_reduction){ * Run the kernel. 
*/ - GpuArray gaS; - GpuArray gaD; + GpuArray gaS0; + GpuArray gaD0; + GpuReductionAttr* grAttr; + GpuReduction* gr; - ga_assert_ok(GpuArray_empty (&gaS, ctx, GA_UINT, 3, &dims[0], GA_C_ORDER)); - ga_assert_ok(GpuArray_empty (&gaD, ctx, GA_UINT, 1, &dims[1], GA_C_ORDER)); + ga_assert_ok(GpuArray_empty (&gaS0, ctx, GA_UINT, 3, &dims[0], GA_C_ORDER)); + ga_assert_ok(GpuArray_empty (&gaD0, ctx, GA_UINT, 1, &dims[1], GA_C_ORDER)); - ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); - ga_assert_ok(GpuArray_memset(&gaD, -1)); - - GpuReduction* gr; - GpuReduction_new(&gr, GpuArray_context(&gaS), - GA_REDUCE_ALL, 1, 2, gaS.typecode, 0); + ga_assert_ok(GpuArray_write (&gaS0, pS0, sizeof(*pS0)*prodDims)); + ga_assert_ok(GpuArray_memset(&gaD0, -1)); + + GpuReductionAttr_new(&grAttr, GpuArray_context(&gaS0)); + ck_assert_ptr_nonnull(grAttr); + GpuReductionAttr_setop (grAttr, GA_REDUCE_ALL); + GpuReductionAttr_setdims (grAttr, gaS0.nd, gaD0.nd); + GpuReductionAttr_sets0type(grAttr, gaS0.typecode); + GpuReductionAttr_setd0type(grAttr, gaD0.typecode); + GpuReduction_new(&gr, grAttr); ck_assert_ptr_nonnull(gr); - ga_assert_ok(GpuReduction_call(gr, &gaD, NULL, &gaS, 2, reduxList, 0)); + ga_assert_ok(GpuReduction_call(gr, &gaD0, NULL, &gaS0, gaS0.nd-gaD0.nd, reduxList, 0)); GpuReduction_free(gr); + GpuReductionAttr_free(grAttr); - ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*dims[1], &gaD)); + ga_assert_ok(GpuArray_read (pD0, sizeof(*pD0)*dims[1], &gaD0)); /** @@ -3886,26 +4534,34 @@ START_TEST(test_all_reduction){ */ for(j=0;j 0.05; + pS0[i] = pcgRand01() > 0.05; } @@ -3947,23 +4604,30 @@ START_TEST(test_all_veryhighrank){ * Run the kernel. */ - GpuArray gaS; - GpuArray gaD; + GpuArray gaS0; + GpuArray gaD0; + GpuReductionAttr* grAttr; + GpuReduction* gr; - ga_assert_ok(GpuArray_empty (&gaS, ctx, GA_UINT, 8, dims, GA_C_ORDER)); - ga_assert_ok(GpuArray_empty (&gaD, ctx, GA_UINT, 4, rdxDims, GA_C_ORDER)); + ga_assert_ok(GpuArray_empty (&gaS0, ctx, GA_UINT, 8, dims, GA_C_ORDER)); + ga_assert_ok(GpuArray_empty (&gaD0, ctx, GA_UINT, 4, rdxDims, GA_C_ORDER)); - ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); - ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ - - GpuReduction* gr; - GpuReduction_new(&gr, GpuArray_context(&gaS), - GA_REDUCE_ALL, 4, 4, gaS.typecode, 0); + ga_assert_ok(GpuArray_write (&gaS0, pS0, sizeof(*pS0)*prodDims)); + ga_assert_ok(GpuArray_memset(&gaD0, -1)); /* 0xFFFFFFFF is a qNaN. */ + + GpuReductionAttr_new(&grAttr, GpuArray_context(&gaS0)); + ck_assert_ptr_nonnull(grAttr); + GpuReductionAttr_setop (grAttr, GA_REDUCE_ALL); + GpuReductionAttr_setdims (grAttr, gaS0.nd, gaD0.nd); + GpuReductionAttr_sets0type(grAttr, gaS0.typecode); + GpuReductionAttr_setd0type(grAttr, gaD0.typecode); + GpuReduction_new(&gr, grAttr); ck_assert_ptr_nonnull(gr); - ga_assert_ok(GpuReduction_call(gr, &gaD, NULL, &gaS, 4, reduxList, 0)); + ga_assert_ok(GpuReduction_call(gr, &gaD0, NULL, &gaS0, gaS0.nd-gaD0.nd, reduxList, 0)); GpuReduction_free(gr); + GpuReductionAttr_free(grAttr); - ga_assert_ok(GpuArray_read (pD, sizeof(*pD)*rdxProdDims, &gaD)); + ga_assert_ok(GpuArray_read (pD0, sizeof(*pD0)*rdxProdDims, &gaD0)); /** @@ -3974,35 +4638,43 @@ START_TEST(test_all_veryhighrank){ for(j=0;j 0.05; + pS0[i] = pcgRand01() > 0.05; } @@ -4042,50 +4715,64 @@ START_TEST(test_all_alldimsreduced){ * Run the kernel. 
*/ - GpuArray gaS; - GpuArray gaD; - - ga_assert_ok(GpuArray_empty (&gaS, ctx, GA_UINT, 3, &dims[0], GA_C_ORDER)); - ga_assert_ok(GpuArray_empty (&gaD, ctx, GA_UINT, 0, NULL, GA_C_ORDER)); + GpuArray gaS0; + GpuArray gaD0; + GpuReductionAttr* grAttr; + GpuReduction* gr; - ga_assert_ok(GpuArray_write (&gaS, pS, sizeof(*pS)*prodDims)); - ga_assert_ok(GpuArray_memset(&gaD, -1)); /* 0xFFFFFFFF is a qNaN. */ + ga_assert_ok(GpuArray_empty (&gaS0, ctx, GA_UINT, 3, &dims[0], GA_C_ORDER)); + ga_assert_ok(GpuArray_empty (&gaD0, ctx, GA_UINT, 0, NULL, GA_C_ORDER)); - GpuReduction* gr; - GpuReduction_new(&gr, GpuArray_context(&gaS), - GA_REDUCE_ALL, 0, 3, gaS.typecode, 0); + ga_assert_ok(GpuArray_write (&gaS0, pS0, sizeof(*pS0)*prodDims)); + ga_assert_ok(GpuArray_memset(&gaD0, -1)); /* 0xFFFFFFFF is a qNaN. */ + + GpuReductionAttr_new(&grAttr, GpuArray_context(&gaS0)); + ck_assert_ptr_nonnull(grAttr); + GpuReductionAttr_setop (grAttr, GA_REDUCE_ALL); + GpuReductionAttr_setdims (grAttr, gaS0.nd, gaD0.nd); + GpuReductionAttr_sets0type(grAttr, gaS0.typecode); + GpuReductionAttr_setd0type(grAttr, gaD0.typecode); + GpuReduction_new(&gr, grAttr); ck_assert_ptr_nonnull(gr); - ga_assert_ok(GpuReduction_call(gr, &gaD, NULL, &gaS, 3, reduxList, 0)); + ga_assert_ok(GpuReduction_call(gr, &gaD0, NULL, &gaS0, gaS0.nd-gaD0.nd, reduxList, 0)); GpuReduction_free(gr); + GpuReductionAttr_free(grAttr); - ga_assert_ok(GpuArray_read (pD, sizeof(*pD), &gaD)); + ga_assert_ok(GpuArray_read (pD0, sizeof(*pD0), &gaD0)); /** * Check that the destination tensors are correct. */ - uint32_t gtD = 1; + uint32_t gtD0 = 1; for(i=0;i Date: Fri, 14 Jul 2017 12:03:26 -0400 Subject: [PATCH 23/34] Delete an "initialization" that should not be there. --- src/gpuarray_reduction.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/src/gpuarray_reduction.c b/src/gpuarray_reduction.c index baead32518..8c3c665252 100644 --- a/src/gpuarray_reduction.c +++ b/src/gpuarray_reduction.c @@ -112,7 +112,8 @@ struct redux_ctx{ uint32_t LSlice; uint64_t LPadded; - uint64_t* L, *Li; + uint64_t* L; + uint32_t* Li; gpudata* S0Data; int64_t S0Off; int64_t* S0J, *S0Si; @@ -2607,13 +2608,6 @@ static void reduxGenSrcAppendDecode (GpuReduction* gr){ " TK1* restrict const W1R = &W1[GDIM_0*D];\n" " TK1* restrict const SHMEMK1 = (TK1*)(SHMEM + SHMEMK1Off);\n"); } - srcbAppends(&gr->srcGen, - " INITREDUXSTATE(W0L[LID_0], W1L[LID_0]);\n" - " INITREDUXSTATE(W0R[LID_0], W1R[LID_0]);\n" - " if(D < LDIM_0 && LID_0+DL = ctx->Li = NULL; + ctx->L = NULL; + ctx->Li = NULL; ctx->S0J = ctx->S0Si = NULL; ctx->D0J = ctx->D0Si = NULL; ctx->D1J = ctx->D1Si = NULL; From 4a17f4835291062b1fd7fc2e2bd2fdf26a79a575 Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Fri, 14 Jul 2017 12:46:32 -0400 Subject: [PATCH 24/34] Added an initialization that WAS needed. --- src/gpuarray_reduction.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/gpuarray_reduction.c b/src/gpuarray_reduction.c index 8c3c665252..a4f19359cb 100644 --- a/src/gpuarray_reduction.c +++ b/src/gpuarray_reduction.c @@ -2608,6 +2608,12 @@ static void reduxGenSrcAppendDecode (GpuReduction* gr){ " TK1* restrict const W1R = &W1[GDIM_0*D];\n" " TK1* restrict const SHMEMK1 = (TK1*)(SHMEM + SHMEMK1Off);\n"); } + srcbAppends(&gr->srcGen, + " INITREDUXSTATE(SHMEMK0[LID_0], SHMEMK1[LID_0]);\n" + " if(Dflags & 0 ? //FIXME: Delete this hack after debugging. 
- reduxInvFlattenSource (ctx): - reduxInvComputeKernelArgs(ctx); + return reduxInvFlattenSource(ctx); } /** From 328c957210224134dae9f14f7ad35cdba8a9c10b Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Fri, 14 Jul 2017 13:26:31 -0400 Subject: [PATCH 25/34] Add a bunch of local_barrier()'s. They are overkill but seem to fix the problems with the testcases, at least so far. --- src/gpuarray_reduction.c | 3 ++ tests/check_reduction.c | 92 ++++++++++++++++++++-------------------- 2 files changed, 49 insertions(+), 46 deletions(-) diff --git a/src/gpuarray_reduction.c b/src/gpuarray_reduction.c index a4f19359cb..c8e64bd3a0 100644 --- a/src/gpuarray_reduction.c +++ b/src/gpuarray_reduction.c @@ -2609,6 +2609,7 @@ static void reduxGenSrcAppendDecode (GpuReduction* gr){ " TK1* restrict const SHMEMK1 = (TK1*)(SHMEM + SHMEMK1Off);\n"); } srcbAppends(&gr->srcGen, + " local_barrier();\n" " INITREDUXSTATE(SHMEMK0[LID_0], SHMEMK1[LID_0]);\n" " if(DsrcGen, " local_barrier();\n"); if (initial){ srcbAppends(&gr->srcGen, " if(LID_0 < D){\n" " SETREDUXSTATE(W0R[GID_0*D + LID_0],\n" @@ -2771,6 +2773,7 @@ static void reduxGenSrcAppendDstWrite (GpuReduction* gr, " }\n"); } } + srcbAppends(&gr->srcGen, " local_barrier();\n"); } static void reduxGenSrcAppendPhase1 (GpuReduction* gr){ /** diff --git a/tests/check_reduction.c b/tests/check_reduction.c index 8e5eef93e4..6a1e8c6a97 100644 --- a/tests/check_reduction.c +++ b/tests/check_reduction.c @@ -16,7 +16,7 @@ void teardown(void); /* Defines */ -#define MAXERRPRINT 2 +#define MAXERRPRINT 16 #define ga_assert_ok(e) ck_assert_int_eq(e, GA_NO_ERROR) @@ -155,7 +155,7 @@ START_TEST(test_maxandargmax_reduction){ if(gtD0 != pD0[j] || gtD1 != pD1[j]){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT %f[%zu] != %f[%zu] UUT @ %zu!\n", __func__, __LINE__, gtD0, gtD1, pD0[j], pD1[j], j); fflush (stderr); @@ -268,7 +268,7 @@ START_TEST(test_maxandargmax_idxtranspose){ if(gtD0 != pD0[j] || gtD1 != pD1[j]){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT %f[%zu] != %f[%zu] UUT @ %zu!\n", __func__, __LINE__, gtD0, gtD1, pD0[j], pD1[j], j); fflush (stderr); @@ -375,7 +375,7 @@ START_TEST(test_maxandargmax_bigdestination){ if(gtD0 != pD0[j] || gtD1 != pD1[j]){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT %f[%zu] != %f[%zu] UUT @ %zu!\n", __func__, __LINE__, gtD0, gtD1, pD0[j], pD1[j], j); fflush (stderr); @@ -493,7 +493,7 @@ START_TEST(test_maxandargmax_veryhighrank){ size_t dstIdx = (((i)*dims[1] + j)*dims[3] + l)*dims[6] + o; if(gtD0 != pD0[dstIdx] || gtD1 != pD1[dstIdx]){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT %f[%zu] != %f[%zu] UUT @ %zu!\n", __func__, __LINE__, gtD0, gtD1, pD0[dstIdx], pD1[dstIdx], dstIdx); fflush (stderr); @@ -605,7 +605,7 @@ START_TEST(test_maxandargmax_alldimsreduced){ } if(gtD0 != pD0[0] || gtD1 != pD1[0]){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT %f[%zu] != %f[%zu] UUT @ %zu!\n", __func__, __LINE__, gtD0, gtD1, pD0[0], pD1[0], (size_t)0); fflush (stderr); @@ -713,7 +713,7 @@ START_TEST(test_minandargmin_reduction){ if(gtD0 != pD0[j] || gtD1 != pD1[j]){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT %f[%zu] != %f[%zu] UUT @ %zu!\n", __func__, __LINE__, gtD0, gtD1, pD0[j], pD1[j], j); fflush (stderr); @@ 
-831,7 +831,7 @@ START_TEST(test_minandargmin_veryhighrank){ size_t dstIdx = (((i)*dims[1] + j)*dims[3] + l)*dims[6] + o; if(gtD0 != pD0[dstIdx] || gtD1 != pD1[dstIdx]){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT %f[%zu] != %f[%zu] UUT @ %zu!\n", __func__, __LINE__, gtD0, gtD1, pD0[dstIdx], pD1[dstIdx], dstIdx); fflush (stderr); @@ -943,7 +943,7 @@ START_TEST(test_minandargmin_alldimsreduced){ } if(gtD0 != pD0[0] || gtD1 != pD1[0]){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT %f[%zu] != %f[%zu] UUT @ %zu!\n", __func__, __LINE__, gtD0, gtD1, pD0[0], pD1[0], (size_t)0); fflush (stderr); @@ -1046,7 +1046,7 @@ START_TEST(test_argmax_reduction){ if(gtD1 != pD1[j]){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT [%zu] != [%zu] UUT @ %zu!\n", __func__, __LINE__, gtD1, pD1[j], j); fflush (stderr); @@ -1157,7 +1157,7 @@ START_TEST(test_argmax_veryhighrank){ size_t dstIdx = (((i)*dims[1] + j)*dims[3] + l)*dims[6] + o; if(gtD1 != pD1[dstIdx]){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT [%zu] != [%zu] UUT @ %zu!\n", __func__, __LINE__, gtD1, pD1[dstIdx], dstIdx); fflush (stderr); @@ -1263,7 +1263,7 @@ START_TEST(test_argmax_alldimsreduced){ } if(gtD1 != pD1[0]){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT [%zu] != [%zu] UUT @ %zu!\n", __func__, __LINE__, gtD1, pD1[0], (size_t)0); fflush (stderr); @@ -1365,7 +1365,7 @@ START_TEST(test_argmin_reduction){ if(gtD1 != pD1[j]){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT [%zu] != [%zu] UUT @ %zu!\n", __func__, __LINE__, gtD1, pD1[j], j); fflush (stderr); @@ -1476,7 +1476,7 @@ START_TEST(test_argmin_veryhighrank){ size_t dstIdx = (((i)*dims[1] + j)*dims[3] + l)*dims[6] + o; if(gtD1 != pD1[dstIdx]){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT [%zu] != [%zu] UUT @ %zu!\n", __func__, __LINE__, gtD1, pD1[dstIdx], dstIdx); fflush (stderr); @@ -1582,7 +1582,7 @@ START_TEST(test_argmin_alldimsreduced){ } if(gtD1 != pD1[0]){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT [%zu] != [%zu] UUT @ %zu!\n", __func__, __LINE__, gtD1, pD1[0], (size_t)0); fflush (stderr); @@ -1679,7 +1679,7 @@ START_TEST(test_max_reduction){ if(gtD0 != pD0[j]){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT %f != %f UUT @ %zu!\n", __func__, __LINE__, gtD0, pD0[j], j); fflush (stderr); @@ -1786,7 +1786,7 @@ START_TEST(test_max_veryhighrank){ size_t dstIdx = (((i)*dims[1] + j)*dims[3] + l)*dims[6] + o; if(gtD0 != pD0[dstIdx]){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT %f != %f UUT @ %zu!\n", __func__, __LINE__, gtD0, pD0[dstIdx], dstIdx); fflush (stderr); @@ -1887,7 +1887,7 @@ START_TEST(test_max_alldimsreduced){ } if(gtD0 != pD0[0]){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT %f != %f UUT @ %zu!\n", __func__, __LINE__, gtD0, pD0[0], (size_t)0); fflush (stderr); @@ -1983,7 +1983,7 @@ START_TEST(test_min_reduction){ if(gtD0 != pD0[j]){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT %f != %f UUT @ %zu!\n", 
__func__, __LINE__, gtD0, pD0[j], j); fflush (stderr); @@ -2090,7 +2090,7 @@ START_TEST(test_min_veryhighrank){ size_t dstIdx = (((i)*dims[1] + j)*dims[3] + l)*dims[6] + o; if(gtD0 != pD0[dstIdx]){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT %f != %f UUT @ %zu!\n", __func__, __LINE__, gtD0, pD0[dstIdx], dstIdx); fflush (stderr); @@ -2191,7 +2191,7 @@ START_TEST(test_min_alldimsreduced){ } if(gtD0 != pD0[0]){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT %f != %f UUT @ %zu!\n", __func__, __LINE__, gtD0, pD0[0], (size_t)0); fflush (stderr); @@ -2285,7 +2285,7 @@ START_TEST(test_sum_reduction){ if(fabs(gtD0-pD0[j]) >= TOL){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT %f != %f UUT @ %zu (TOL=%f)!\n", __func__, __LINE__, gtD0, pD0[j], j, TOL); fflush (stderr); @@ -2390,7 +2390,7 @@ START_TEST(test_sum_veryhighrank){ size_t dstIdx = (((i)*dims[1] + j)*dims[3] + l)*dims[6] + o; if(fabs(gtD0-pD0[dstIdx]) >= TOL){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT %f != %f UUT @ %zu (TOL=%f)!\n", __func__, __LINE__, gtD0, pD0[dstIdx], dstIdx, TOL); fflush (stderr); @@ -2489,7 +2489,7 @@ START_TEST(test_sum_alldimsreduced){ } if(fabs(gtD0-pD0[0]) >= TOL){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT %f != %f UUT @ %zu (TOL=%f)!\n", __func__, __LINE__, gtD0, pD0[0], (size_t)0, TOL); fflush (stderr); @@ -2578,7 +2578,7 @@ START_TEST(test_sum_huge){ } if(fabs(gtD0-pD0[0]) >= TOL){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT %f != %f UUT @ %zu (TOL=%f)!\n", __func__, __LINE__, gtD0, pD0[0], (size_t)0, TOL); fflush (stderr); @@ -2672,7 +2672,7 @@ START_TEST(test_prod_reduction){ if(fabs(gtD0-pD0[j]) >= TOL){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT %f != %f UUT @ %zu (TOL=%f)!\n", __func__, __LINE__, gtD0, pD0[j], j, TOL); fflush (stderr); @@ -2777,7 +2777,7 @@ START_TEST(test_prod_veryhighrank){ size_t dstIdx = (((i)*dims[1] + j)*dims[3] + l)*dims[6] + o; if(fabs(gtD0-pD0[dstIdx]) >= TOL){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT %f != %f UUT @ %zu (TOL=%f)!\n", __func__, __LINE__, gtD0, pD0[dstIdx], dstIdx, TOL); fflush (stderr); @@ -2876,7 +2876,7 @@ START_TEST(test_prod_alldimsreduced){ } if(fabs(gtD0-pD0[0]) >= TOL){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT %f != %f UUT @ %zu (TOL=%f)!\n", __func__, __LINE__, gtD0, pD0[0], (size_t)0, TOL); fflush (stderr); @@ -2973,7 +2973,7 @@ START_TEST(test_prodnz_reduction){ if(fabs(gtD0-pD0[j]) >= TOL){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT %f != %f UUT @ %zu (TOL=%f)!\n", __func__, __LINE__, gtD0, pD0[j], j, TOL); fflush (stderr); @@ -3081,7 +3081,7 @@ START_TEST(test_prodnz_veryhighrank){ size_t dstIdx = (((i)*dims[1] + j)*dims[3] + l)*dims[6] + o; if(fabs(gtD0-pD0[dstIdx]) >= TOL){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT %f != %f UUT @ %zu (TOL=%f)!\n", __func__, __LINE__, gtD0, pD0[dstIdx], dstIdx, TOL); fflush (stderr); @@ -3183,7 +3183,7 @@ START_TEST(test_prodnz_alldimsreduced){ } if(fabs(gtD0-pD0[0]) >= 
TOL){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT %f != %f UUT @ %zu (TOL=%f)!\n", __func__, __LINE__, gtD0, pD0[0], (size_t)0, TOL); fflush (stderr); @@ -3285,7 +3285,7 @@ START_TEST(test_and_reduction){ if(gtD0 != pD0[j]){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT %u != %u UUT @ %zu!\n", __func__, __LINE__, gtD0, pD0[j], j); fflush (stderr); @@ -3398,7 +3398,7 @@ START_TEST(test_and_veryhighrank){ size_t dstIdx = (((i)*dims[1] + j)*dims[3] + l)*dims[6] + o; if(gtD0 != pD0[dstIdx]){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT %u != %u UUT @ %zu!\n", __func__, __LINE__, gtD0, pD0[dstIdx], dstIdx); fflush (stderr); @@ -3505,7 +3505,7 @@ START_TEST(test_and_alldimsreduced){ } if(gtD0 != pD0[0]){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT %u != %u UUT @ %zu!\n", __func__, __LINE__, gtD0, pD0[0], (size_t)0); fflush (stderr); @@ -3607,7 +3607,7 @@ START_TEST(test_or_reduction){ if(gtD0 != pD0[j]){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT %u != %u UUT @ %zu!\n", __func__, __LINE__, gtD0, pD0[j], (size_t)j); fflush (stderr); @@ -3720,7 +3720,7 @@ START_TEST(test_or_veryhighrank){ size_t dstIdx = (((i)*dims[1] + j)*dims[3] + l)*dims[6] + o; if(gtD0 != pD0[dstIdx]){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT %u != %u UUT @ %zu!\n", __func__, __LINE__, gtD0, pD0[dstIdx], dstIdx); fflush (stderr); @@ -3827,7 +3827,7 @@ START_TEST(test_or_alldimsreduced){ } if(gtD0 != pD0[0]){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT %u != %u UUT @ %zu!\n", __func__, __LINE__, gtD0, pD0[0], (size_t)0); fflush (stderr); @@ -3925,7 +3925,7 @@ START_TEST(test_xor_reduction){ if(gtD0 != pD0[j]){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT %u != %u UUT @ %zu!\n", __func__, __LINE__, gtD0, pD0[j], (size_t)j); fflush (stderr); @@ -4034,7 +4034,7 @@ START_TEST(test_xor_veryhighrank){ size_t dstIdx = (((i)*dims[1] + j)*dims[3] + l)*dims[6] + o; if(gtD0 != pD0[dstIdx]){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT %u != %u UUT @ %zu!\n", __func__, __LINE__, gtD0, pD0[dstIdx], dstIdx); fflush (stderr); @@ -4137,7 +4137,7 @@ START_TEST(test_xor_alldimsreduced){ } if(gtD0 != pD0[0]){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT %u != %u UUT @ %zu!\n", __func__, __LINE__, gtD0, pD0[0], (size_t)0); fflush (stderr); @@ -4235,7 +4235,7 @@ START_TEST(test_any_reduction){ if(gtD0 != pD0[j]){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT %u != %u UUT @ %zu!\n", __func__, __LINE__, gtD0, pD0[j], (size_t)j); fflush (stderr); @@ -4344,7 +4344,7 @@ START_TEST(test_any_veryhighrank){ size_t dstIdx = (((i)*dims[1] + j)*dims[3] + l)*dims[6] + o; if(gtD0 != pD0[dstIdx]){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT %u != %u UUT @ %zu!\n", __func__, __LINE__, gtD0, pD0[dstIdx], dstIdx); fflush (stderr); @@ -4447,7 +4447,7 @@ START_TEST(test_any_alldimsreduced){ } if(gtD0 != pD0[0]){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= 
MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT %u != %u UUT @ %zu!\n", __func__, __LINE__, gtD0, pD0[0], (size_t)0); fflush (stderr); @@ -4545,7 +4545,7 @@ START_TEST(test_all_reduction){ if(gtD0 != pD0[j]){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT %u != %u UUT @ %zu!\n", __func__, __LINE__, gtD0, pD0[j], (size_t)j); fflush (stderr); @@ -4654,7 +4654,7 @@ START_TEST(test_all_veryhighrank){ size_t dstIdx = (((i)*dims[1] + j)*dims[3] + l)*dims[6] + o; if(gtD0 != pD0[dstIdx]){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT %u != %u UUT @ %zu!\n", __func__, __LINE__, gtD0, pD0[dstIdx], dstIdx); fflush (stderr); @@ -4757,7 +4757,7 @@ START_TEST(test_all_alldimsreduced){ } if(gtD0 != pD0[0]){ errCnt++; - if(errCnt < MAXERRPRINT){ + if(errCnt <= MAXERRPRINT){ fprintf(stderr, "%s:%d: Mismatch GT %u != %u UUT @ %zu!\n", __func__, __LINE__, gtD0, pD0[0], (size_t)0); fflush (stderr); From 925688c4344c0e379bd1e8dd243ddb8fe3e3a841 Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Fri, 14 Jul 2017 14:03:09 -0400 Subject: [PATCH 26/34] Style fixes. --- src/gpuarray_reduction.c | 638 +++++++++++++++++++-------------------- 1 file changed, 319 insertions(+), 319 deletions(-) diff --git a/src/gpuarray_reduction.c b/src/gpuarray_reduction.c index c8e64bd3a0..6d2e6fc17f 100644 --- a/src/gpuarray_reduction.c +++ b/src/gpuarray_reduction.c @@ -306,188 +306,188 @@ typedef void (*GpuReductionIterFn)(const GpuReduction* gr, /* Static Function prototypes */ /* Utilities */ -static int reduxGetSumInit (int typecode, const char** property); -static int reduxGetProdInit (int typecode, const char** property); -static int reduxGetMinInit (int typecode, const char** property); -static int reduxGetMaxInit (int typecode, const char** property); -static int reduxGetAndInit (int typecode, const char** property); -static int reduxGetOrInit (int typecode, const char** property); -static int reduxIsFloatingPoint (int typecode); -static unsigned reduxCeilLog2 (uint64_t x); -static uint64_t reduxNextPow2 (uint64_t x); -static int reduxSortFlatSensitive (const void* a, const void* b); -static int reduxSortFlatInsensitive (const void* a, const void* b); -static int reduxSortPtrS0AbsStride (const void* a, const void* b); -static int reduxSortPtrByReduxNum (const void* a, const void* b); -static int reduxSortPtrD0WrSelect (const void* a, const void* b); -static int reduxSortPtrD1WrSelect (const void* a, const void* b); -static int reduxSortPtrInsertFinalOrder (const void* a, const void* b); +static int reduxGetSumInit (int typecode, const char** property); +static int reduxGetProdInit (int typecode, const char** property); +static int reduxGetMinInit (int typecode, const char** property); +static int reduxGetMaxInit (int typecode, const char** property); +static int reduxGetAndInit (int typecode, const char** property); +static int reduxGetOrInit (int typecode, const char** property); +static int reduxIsFloatingPoint (int typecode); +static unsigned reduxCeilLog2 (uint64_t x); +static uint64_t reduxNextPow2 (uint64_t x); +static int reduxSortFlatInsensitive (const void* a, const void* b); +static int reduxSortFlatSensitive (const void* a, const void* b); +static int reduxSortPtrS0AbsStride (const void* a, const void* b); +static int reduxSortPtrByReduxNum (const void* a, const void* b); +static int reduxSortPtrD0WrSelect (const void* a, const void* b); +static int reduxSortPtrD1WrSelect (const void* a, const 
void* b); +static int reduxSortPtrInsertFinalOrder (const void* a, const void* b); /* Axis Description API */ -static void axisInit (axis_desc* axis, - ssize_t len, - ssize_t s0S); -static void axisMarkReduced (axis_desc* axis, int reduxNum); -static void axisMarkIntraBlock (axis_desc* axis, - int ibNum, - size_t ibLen); -static int axisGetReduxNum (const axis_desc* axis); -static size_t axisGetLen (const axis_desc* axis); -static size_t axisGetIntraLen (const axis_desc* axis); -static size_t axisGetInterLen (const axis_desc* axis); -static size_t axisGetIntraInterLen (const axis_desc* axis); -static ssize_t axisGetS0Stride (const axis_desc* axis); -static size_t axisGetS0AbsStride (const axis_desc* axis); -static ssize_t axisGetD0Stride (const axis_desc* axis); -static size_t axisGetD0AbsStride (const axis_desc* axis); -static ssize_t axisGetD1Stride (const axis_desc* axis); -static size_t axisGetD1AbsStride (const axis_desc* axis); -static size_t axisGetI0Stride (const axis_desc* axis); -static void axisSetI0Stride (axis_desc* axis, - size_t pdim); -static unsigned axisGetPerm (const axis_desc* axis); -static int axisGetIBNum (const axis_desc* axis); -static void axisSetPerm (axis_desc* axis, - unsigned ibp); -static int axisIsReduced (const axis_desc* axis); -static int axisIsIntra (const axis_desc* axis); -static int axisIsInter (const axis_desc* axis); -static int axisIsSplit (const axis_desc* axis); +static void axisInit (axis_desc* axis, + ssize_t len, + ssize_t s0S); +static void axisMarkReduced (axis_desc* axis, int reduxNum); +static void axisMarkIntraBlock (axis_desc* axis, + int ibNum, + size_t ibLen); +static int axisGetReduxNum (const axis_desc* axis); +static size_t axisGetLen (const axis_desc* axis); +static size_t axisGetIntraLen (const axis_desc* axis); +static size_t axisGetInterLen (const axis_desc* axis); +static size_t axisGetIntraInterLen (const axis_desc* axis); +static ssize_t axisGetS0Stride (const axis_desc* axis); +static size_t axisGetS0AbsStride (const axis_desc* axis); +static ssize_t axisGetD0Stride (const axis_desc* axis); +static size_t axisGetD0AbsStride (const axis_desc* axis); +static ssize_t axisGetD1Stride (const axis_desc* axis); +static size_t axisGetD1AbsStride (const axis_desc* axis); +static size_t axisGetI0Stride (const axis_desc* axis); +static void axisSetI0Stride (axis_desc* axis, + size_t pdim); +static unsigned axisGetPerm (const axis_desc* axis); +static int axisGetIBNum (const axis_desc* axis); +static void axisSetPerm (axis_desc* axis, + unsigned ibp); +static int axisIsReduced (const axis_desc* axis); +static int axisIsIntra (const axis_desc* axis); +static int axisIsInter (const axis_desc* axis); +static int axisIsSplit (const axis_desc* axis); /* Reduction Context API */ /* Generator Control Flow */ -static int reduxGenInit (GpuReduction* gr); -static int reduxGenInferProperties (GpuReduction* gr); -static void reduxGenSetMaxBS (GpuReduction* gr); -static void reduxGenSetKTypes (GpuReduction* gr); -static void reduxGenIterArgs (const GpuReduction* gr, - GpuReductionIterFn fn, - void* user); -static int reduxGenSrc (GpuReduction* gr); -static void reduxGenSrcAppend (GpuReduction* gr); -static void reduxGenSrcAppendIncludes (GpuReduction* gr); -static void reduxGenSrcAppendMacroTypedefs (GpuReduction* gr); -static void reduxGenSrcAppendReduxKernel (GpuReduction* gr); -static void reduxGenSrcAppendPrototype (GpuReduction* gr); -static void reduxGenSrcAppendDecode (GpuReduction* gr); -static void reduxGenSrcAppendPhase0 (GpuReduction* gr, 
- uint32_t selector); -static void reduxGenSrcAppendLoop (GpuReduction* gr, - uint32_t selector, - int initial); -static void reduxGenSrcAppendVertical (GpuReduction* gr, - uint32_t selector); -static void reduxGenSrcAppendIncrement (GpuReduction* gr, - uint32_t selector, - int initial, - int axis); -static void reduxGenSrcAppendDstWrite (GpuReduction* gr, - uint32_t selector, - int initial); -static void reduxGenSrcAppendPhase1 (GpuReduction* gr); -static int reduxGenSrcAxisIsHuge (GpuReduction* gr, - uint32_t selector, - int axis); -static int reduxGenSrcAxisIsSplit (GpuReduction* gr, - uint32_t selector, - int axis); -static int reduxGenCompile (GpuReduction* gr); -static int reduxGenComputeLaunchBounds (GpuReduction* gr); -static int reduxGenCleanup (GpuReduction* gr, int ret); -static int reduxGenCleanupMsg (GpuReduction* gr, int ret, - const char* fmt, ...); +static int reduxGenInit (GpuReduction* gr); +static int reduxGenInferProperties (GpuReduction* gr); +static void reduxGenSetMaxBS (GpuReduction* gr); +static void reduxGenSetKTypes (GpuReduction* gr); +static void reduxGenIterArgs (const GpuReduction* gr, + GpuReductionIterFn fn, + void* user); +static int reduxGenSrc (GpuReduction* gr); +static void reduxGenSrcAppend (GpuReduction* gr); +static void reduxGenSrcAppendIncludes (GpuReduction* gr); +static void reduxGenSrcAppendMacroTypedefs (GpuReduction* gr); +static void reduxGenSrcAppendReduxKernel (GpuReduction* gr); +static void reduxGenSrcAppendPrototype (GpuReduction* gr); +static void reduxGenSrcAppendDecode (GpuReduction* gr); +static void reduxGenSrcAppendPhase0 (GpuReduction* gr, + uint32_t selector); +static void reduxGenSrcAppendLoop (GpuReduction* gr, + uint32_t selector, + int initial); +static void reduxGenSrcAppendVertical (GpuReduction* gr, + uint32_t selector); +static void reduxGenSrcAppendIncrement (GpuReduction* gr, + uint32_t selector, + int initial, + int axis); +static void reduxGenSrcAppendDstWrite (GpuReduction* gr, + uint32_t selector, + int initial); +static void reduxGenSrcAppendPhase1 (GpuReduction* gr); +static int reduxGenSrcAxisIsHuge (GpuReduction* gr, + uint32_t selector, + int axis); +static int reduxGenSrcAxisIsSplit (GpuReduction* gr, + uint32_t selector, + int axis); +static int reduxGenCompile (GpuReduction* gr); +static int reduxGenComputeLaunchBounds (GpuReduction* gr); +static int reduxGenCleanup (GpuReduction* gr, int ret); +static int reduxGenCleanupMsg (GpuReduction* gr, int ret, + const char* fmt, ...); /* Generator Utilities */ -static void reduxGenCountArgs (const GpuReduction* gr, - int typecode, - const char* typeName, - const char* baseName, - int num, - void* user); -static void reduxGenSaveArgTypecodes (const GpuReduction* gr, - int typecode, - const char* typeName, - const char* baseName, - int num, - void* user); -static void reduxGenAppendArg (const GpuReduction* gr, - int typecode, - const char* typeName, - const char* baseName, - int num, - void* user); -static void reduxInvMarshalArg (const GpuReduction* gr, - int typecode, - const char* typeName, - const char* baseName, - int num, - void* user); -static size_t reduxGenEstimateParallelism (const GpuReduction* gr); -static int reduxGenRequiresS0 (const GpuReduction* gr); -static int reduxGenRequiresD0 (const GpuReduction* gr); -static int reduxGenRequiresD1 (const GpuReduction* gr); -static int reduxGenKernelRequiresLatticeS0(const GpuReduction* gr); -static int reduxGenKernelRequiresLatticeD0(const GpuReduction* gr); -static int reduxGenKernelRequiresLatticeD1(const 
GpuReduction* gr); -static int reduxGenKernelRequiresLatticeI0(const GpuReduction* gr); -static int reduxGenKernelRequiresStateK0 (const GpuReduction* gr); -static int reduxGenKernelRequiresStateK1 (const GpuReduction* gr); -static int reduxGenKernelRequiresWspace (const GpuReduction* gr); -static size_t reduxGenGetK0Size (const GpuReduction* gr); -static size_t reduxGenGetK0Align (const GpuReduction* gr); -static size_t reduxGenGetK1Size (const GpuReduction* gr); -static size_t reduxGenGetK1Align (const GpuReduction* gr); -static size_t reduxGenGetReduxStateSize (const GpuReduction* gr); -static size_t reduxGenGetMaxLocalSize (const GpuReduction* gr); -static size_t reduxGenGetSHMEMSize (const GpuReduction* gr, size_t cells); -static size_t reduxGenGetSHMEMK0Off (const GpuReduction* gr, size_t cells); -static size_t reduxGenGetSHMEMK1Off (const GpuReduction* gr, size_t cells); -static size_t reduxGenGetWMEMSize (const GpuReduction* gr, size_t cells); -static size_t reduxGenGetWMEMK0Off (const GpuReduction* gr, size_t cells); -static size_t reduxGenGetWMEMK1Off (const GpuReduction* gr, size_t cells); +static void reduxGenCountArgs (const GpuReduction* gr, + int typecode, + const char* typeName, + const char* baseName, + int num, + void* user); +static void reduxGenSaveArgTypecodes (const GpuReduction* gr, + int typecode, + const char* typeName, + const char* baseName, + int num, + void* user); +static void reduxGenAppendArg (const GpuReduction* gr, + int typecode, + const char* typeName, + const char* baseName, + int num, + void* user); +static void reduxInvMarshalArg (const GpuReduction* gr, + int typecode, + const char* typeName, + const char* baseName, + int num, + void* user); +static size_t reduxGenEstimateParallelism (const GpuReduction* gr); +static int reduxGenRequiresS0 (const GpuReduction* gr); +static int reduxGenRequiresD0 (const GpuReduction* gr); +static int reduxGenRequiresD1 (const GpuReduction* gr); +static int reduxGenKernelRequiresLatticeS0 (const GpuReduction* gr); +static int reduxGenKernelRequiresLatticeD0 (const GpuReduction* gr); +static int reduxGenKernelRequiresLatticeD1 (const GpuReduction* gr); +static int reduxGenKernelRequiresLatticeI0 (const GpuReduction* gr); +static int reduxGenKernelRequiresStateK0 (const GpuReduction* gr); +static int reduxGenKernelRequiresStateK1 (const GpuReduction* gr); +static int reduxGenKernelRequiresWspace (const GpuReduction* gr); +static size_t reduxGenGetK0Size (const GpuReduction* gr); +static size_t reduxGenGetK0Align (const GpuReduction* gr); +static size_t reduxGenGetK1Size (const GpuReduction* gr); +static size_t reduxGenGetK1Align (const GpuReduction* gr); +static size_t reduxGenGetReduxStateSize (const GpuReduction* gr); +static size_t reduxGenGetMaxLocalSize (const GpuReduction* gr); +static size_t reduxGenGetSHMEMSize (const GpuReduction* gr, size_t cells); +static size_t reduxGenGetSHMEMK0Off (const GpuReduction* gr, size_t cells); +static size_t reduxGenGetSHMEMK1Off (const GpuReduction* gr, size_t cells); +static size_t reduxGenGetWMEMSize (const GpuReduction* gr, size_t cells); +static size_t reduxGenGetWMEMK0Off (const GpuReduction* gr, size_t cells); +static size_t reduxGenGetWMEMK1Off (const GpuReduction* gr, size_t cells); /* Invoker Control Flow */ -static int reduxInvInit (redux_ctx* ctx); -static int reduxInvInferProperties (redux_ctx* ctx); -static int reduxInvFlattenSource (redux_ctx* ctx); -static int reduxInvComputeKernelArgs (redux_ctx* ctx); -static int reduxInvSchedule (redux_ctx* ctx); -static int 
reduxInvoke (redux_ctx* ctx); -static int reduxInvCleanup (redux_ctx* ctx, int ret); -static int reduxInvCleanupMsg (redux_ctx* ctx, int ret, - const char* fmt, ...); +static int reduxInvInit (redux_ctx* ctx); +static int reduxInvInferProperties (redux_ctx* ctx); +static int reduxInvFlattenSource (redux_ctx* ctx); +static int reduxInvComputeKernelArgs (redux_ctx* ctx); +static int reduxInvSchedule (redux_ctx* ctx); +static int reduxInvoke (redux_ctx* ctx); +static int reduxInvCleanup (redux_ctx* ctx, int ret); +static int reduxInvCleanupMsg (redux_ctx* ctx, int ret, + const char* fmt, ...); /* Invoker Utilities */ -static size_t reduxInvEstimateParallelism (const redux_ctx* ctx); -static int reduxInvRequiresS0 (const redux_ctx* ctx); -static int reduxInvRequiresD0 (const redux_ctx* ctx); -static int reduxInvRequiresD1 (const redux_ctx* ctx); -static axis_desc* reduxInvGetSrcAxis (const redux_ctx* ctx, int i); -static axis_desc* reduxInvGetSrcSortAxis (const redux_ctx* ctx, int i); -static int reduxTryFlattenOut (const redux_ctx* ctx, - const axis_desc* axis); -static int reduxTryFlattenInto (redux_ctx* ctx, - axis_desc* into, - const axis_desc* from); -static void reduxSortAxisPtrsBy (axis_desc** ptrs, - axis_desc* axes, - size_t numAxes, - int(*fn)(const void*, const void*)); +static size_t reduxInvEstimateParallelism (const redux_ctx* ctx); +static int reduxInvRequiresS0 (const redux_ctx* ctx); +static int reduxInvRequiresD0 (const redux_ctx* ctx); +static int reduxInvRequiresD1 (const redux_ctx* ctx); +static axis_desc* reduxInvGetSrcAxis (const redux_ctx* ctx, int i); +static axis_desc* reduxInvGetSrcSortAxis (const redux_ctx* ctx, int i); +static int reduxTryFlattenOut (const redux_ctx* ctx, + const axis_desc* axis); +static int reduxTryFlattenInto (redux_ctx* ctx, + axis_desc* into, + const axis_desc* from); +static void reduxSortAxisPtrsBy (axis_desc** ptrs, + axis_desc* axes, + size_t numAxes, + int(*fn)(const void*, const void*)); /* Function Implementations */ /* Extern Functions */ GPUARRAY_PUBLIC int GpuReductionAttr_new (GpuReductionAttr** grAttr, gpucontext* gpuCtx){ - if(!grAttr){ + if (!grAttr){ return GA_INVALID_ERROR; } - if(!gpuCtx){ + if (!gpuCtx){ *grAttr = NULL; return GA_INVALID_ERROR; } *grAttr = calloc(1, sizeof(**grAttr)); - if(!*grAttr){ + if (!*grAttr){ return GA_MEMORY_ERROR; } @@ -527,7 +527,7 @@ GPUARRAY_PUBLIC int GpuReductionAttr_setdims (GpuReductionAttr* } GPUARRAY_PUBLIC int GpuReductionAttr_sets0type (GpuReductionAttr* grAttr, int s0Typecode){ - switch(grAttr->op){ + switch (grAttr->op){ case GA_REDUCE_AND: case GA_REDUCE_OR: case GA_REDUCE_XOR: @@ -565,7 +565,7 @@ GPUARRAY_PUBLIC int GpuReductionAttr_seti0type (GpuReductionAttr* GPUARRAY_PUBLIC int GpuReductionAttr_appendopname (GpuReductionAttr* grAttr, size_t n, char* name){ - switch(grAttr->op){ + switch (grAttr->op){ case GA_REDUCE_COPY: return snprintf(name, n, "Copy_%d", grAttr->maxSrcDims); case GA_REDUCE_SUM: return snprintf(name, n, "Sum_%d_%d", grAttr->maxSrcDims, grAttr->maxDstDims); case GA_REDUCE_PROD: return snprintf(name, n, "Prod_%d_%d", grAttr->maxSrcDims, grAttr->maxDstDims); @@ -581,7 +581,7 @@ GPUARRAY_PUBLIC int GpuReductionAttr_appendopname (GpuReductionAttr* case GA_REDUCE_XOR: return snprintf(name, n, "Xor_%d_%d", grAttr->maxSrcDims, grAttr->maxDstDims); case GA_REDUCE_ALL: return snprintf(name, n, "All_%d_%d", grAttr->maxSrcDims, grAttr->maxDstDims); case GA_REDUCE_ANY: return snprintf(name, n, "Any_%d_%d", grAttr->maxSrcDims, grAttr->maxDstDims); - default: if(name && 
n>0){*name = '\0';} return GA_INVALID_ERROR; + default: if (name && n>0){*name = '\0';} return GA_INVALID_ERROR; } } GPUARRAY_PUBLIC int GpuReductionAttr_issensitive (const GpuReductionAttr* grAttr){ @@ -623,7 +623,7 @@ GPUARRAY_PUBLIC int GpuReductionAttr_issensitive (const GpuReductionAttr* } } GPUARRAY_PUBLIC int GpuReductionAttr_requiresS0 (const GpuReductionAttr* grAttr){ - switch(grAttr->op){ + switch (grAttr->op){ default: return 1; } } @@ -712,7 +712,7 @@ GPUARRAY_PUBLIC int GpuReduction_call (const GpuReduction* * @return Zero if successful; Non-zero if the datatype is not supported. */ -static int reduxGetSumInit (int typecode, const char** property){ +static int reduxGetSumInit (int typecode, const char** property){ if (typecode == GA_POINTER || typecode == GA_BUFFER){ return GA_UNSUPPORTED_ERROR; @@ -732,7 +732,7 @@ static int reduxGetSumInit (int typecode, const char** pro * @return Zero if successful; Non-zero if the datatype is not supported. */ -static int reduxGetProdInit (int typecode, const char** property){ +static int reduxGetProdInit (int typecode, const char** property){ if (typecode == GA_POINTER || typecode == GA_BUFFER){ return GA_UNSUPPORTED_ERROR; @@ -752,7 +752,7 @@ static int reduxGetProdInit (int typecode, const char** pro * @return Zero if successful; Non-zero if the datatype is not supported. */ -static int reduxGetMinInit (int typecode, const char** property){ +static int reduxGetMinInit (int typecode, const char** property){ switch (typecode){ case GA_BYTE2: case GA_BYTE3: @@ -842,7 +842,7 @@ static int reduxGetMinInit (int typecode, const char** pro * @return Zero if successful; Non-zero if the datatype is not supported. */ -static int reduxGetMaxInit (int typecode, const char** property){ +static int reduxGetMaxInit (int typecode, const char** property){ switch (typecode){ case GA_BOOL: *property = "1"; @@ -941,7 +941,7 @@ static int reduxGetMaxInit (int typecode, const char** pro * @return Zero if successful; Non-zero if the datatype is not supported. */ -static int reduxGetAndInit (int typecode, const char** property){ +static int reduxGetAndInit (int typecode, const char** property){ if (typecode == GA_POINTER || typecode == GA_BUFFER){ return GA_UNSUPPORTED_ERROR; @@ -961,7 +961,7 @@ static int reduxGetAndInit (int typecode, const char** pro * @return Zero if successful; Non-zero if the datatype is not supported. */ -static int reduxGetOrInit (int typecode, const char** property){ +static int reduxGetOrInit (int typecode, const char** property){ if (typecode == GA_POINTER || typecode == GA_BUFFER){ return GA_UNSUPPORTED_ERROR; @@ -974,8 +974,8 @@ static int reduxGetOrInit (int typecode, const char** pro * Whether or not the typecode is a floating-point type. */ -static int reduxIsFloatingPoint (int typecode){ - switch(typecode){ +static int reduxIsFloatingPoint (int typecode){ + switch (typecode){ case GA_HALF: case GA_HALF2: case GA_HALF4: @@ -1005,7 +1005,7 @@ static int reduxIsFloatingPoint (int typecode){ * Compute ceil(log2(x)). */ -static unsigned reduxCeilLog2 (uint64_t x){ +static unsigned reduxCeilLog2 (uint64_t x){ int i; if (x <= 1){ @@ -1021,7 +1021,7 @@ static unsigned reduxCeilLog2 (uint64_t x){ * If x is a power of two already, return x. */ -static uint64_t reduxNextPow2 (uint64_t x){ +static uint64_t reduxNextPow2 (uint64_t x){ if (x & (x-1)){ x |= x >> 1; x |= x >> 2; @@ -1057,7 +1057,7 @@ static uint64_t reduxNextPow2 (uint64_t x){ * 5. then by increasing source axis number. 
*/ -static int reduxSortFlatInsensitive (const void* a, const void* b){ +static int reduxSortFlatInsensitive (const void* a, const void* b){ const axis_desc* xda = (const axis_desc*)a; const axis_desc* xdb = (const axis_desc*)b; @@ -1075,7 +1075,7 @@ static int reduxSortFlatInsensitive (const void* a, const void* b){ return 0; } -static int reduxSortFlatSensitive (const void* a, const void* b){ +static int reduxSortFlatSensitive (const void* a, const void* b){ const axis_desc* xda = (const axis_desc*)a; const axis_desc* xdb = (const axis_desc*)b; @@ -1104,7 +1104,7 @@ static int reduxSortFlatSensitive (const void* a, const void* b){ * This means ascending order of absolute stride. */ -static int reduxSortPtrS0AbsStride (const void* a, const void* b){ +static int reduxSortPtrS0AbsStride (const void* a, const void* b){ const axis_desc* xda = *(const axis_desc* const*)a; const axis_desc* xdb = *(const axis_desc* const*)b; @@ -1116,7 +1116,7 @@ static int reduxSortPtrS0AbsStride (const void* a, const void* b){ return 0; } -static int reduxSortPtrByReduxNum (const void* a, const void* b){ +static int reduxSortPtrByReduxNum (const void* a, const void* b){ const axis_desc* xda = *(const axis_desc* const*)a; const axis_desc* xdb = *(const axis_desc* const*)b; @@ -1134,7 +1134,7 @@ static int reduxSortPtrByReduxNum (const void* a, const void* b){ return 0; } -static int reduxSortPtrD0WrSelect (const void* a, const void* b){ +static int reduxSortPtrD0WrSelect (const void* a, const void* b){ const axis_desc* xda = *(const axis_desc* const*)a; const axis_desc* xdb = *(const axis_desc* const*)b; @@ -1168,7 +1168,7 @@ static int reduxSortPtrD0WrSelect (const void* a, const void* b){ return 0; } -static int reduxSortPtrD1WrSelect (const void* a, const void* b){ +static int reduxSortPtrD1WrSelect (const void* a, const void* b){ const axis_desc* xda = *(const axis_desc* const*)a; const axis_desc* xdb = *(const axis_desc* const*)b; @@ -1202,7 +1202,7 @@ static int reduxSortPtrD1WrSelect (const void* a, const void* b){ return 0; } -static int reduxSortPtrInsertFinalOrder (const void* a, const void* b){ +static int reduxSortPtrInsertFinalOrder (const void* a, const void* b){ const axis_desc* xda = *(const axis_desc* const*)a; const axis_desc* xdb = *(const axis_desc* const*)b; @@ -1257,9 +1257,9 @@ static int reduxSortPtrInsertFinalOrder (const void* a, const void* b){ * @brief Initialize Axis Description. */ -static void axisInit (axis_desc* axis, - ssize_t len, - ssize_t s0S){ +static void axisInit (axis_desc* axis, + ssize_t len, + ssize_t s0S){ memset(axis, 0, sizeof(*axis)); axis->reduxNum = -1; @@ -1278,7 +1278,7 @@ static void axisInit (axis_desc* axis, * @brief Mark axis as reduction axis, with position reduxNum in the axis list. */ -static void axisMarkReduced (axis_desc* axis, int reduxNum){ +static void axisMarkReduced (axis_desc* axis, int reduxNum){ axis->isReduced = 1; axis->reduxNum = reduxNum; } @@ -1287,9 +1287,9 @@ static void axisMarkReduced (axis_desc* axis, int * @brief Mark axis as (split) intrablock axis. */ -static void axisMarkIntraBlock (axis_desc* axis, - int ibNum, - size_t ibLen){ +static void axisMarkIntraBlock (axis_desc* axis, + int ibNum, + size_t ibLen){ axis->isIntra = 1; axis->ibNum = ibNum; axis->splitLen = ibLen; @@ -1299,13 +1299,13 @@ static void axisMarkIntraBlock (axis_desc* axis, * @brief Get properties of an axis. 
*/ -static int axisGetReduxNum (const axis_desc* axis){ +static int axisGetReduxNum (const axis_desc* axis){ return axis->reduxNum; } -static size_t axisGetLen (const axis_desc* axis){ +static size_t axisGetLen (const axis_desc* axis){ return axis->len; } -static size_t axisGetIntraLen (const axis_desc* axis){ +static size_t axisGetIntraLen (const axis_desc* axis){ if (axisIsSplit(axis)){ return axis->splitLen; }else if (axisIsIntra(axis)){ @@ -1314,7 +1314,7 @@ static size_t axisGetIntraLen (const axis_desc* axis){ return 1; } } -static size_t axisGetInterLen (const axis_desc* axis){ +static size_t axisGetInterLen (const axis_desc* axis){ if (axisIsSplit(axis)){ return DIVIDECEIL(axis->len, axis->splitLen); }else if (axisIsIntra(axis)){ @@ -1323,69 +1323,69 @@ static size_t axisGetInterLen (const axis_desc* axis){ return axis->len; } } -static size_t axisGetIntraInterLen (const axis_desc* axis){ +static size_t axisGetIntraInterLen (const axis_desc* axis){ return axisGetIntraLen(axis)*axisGetInterLen(axis); } -static ssize_t axisGetS0Stride (const axis_desc* axis){ +static ssize_t axisGetS0Stride (const axis_desc* axis){ return axisGetLen(axis) > 1 ? axis->s0S : 0; } -static size_t axisGetS0AbsStride (const axis_desc* axis){ +static size_t axisGetS0AbsStride (const axis_desc* axis){ return axisGetS0Stride(axis)<0 ? -(size_t)axisGetS0Stride(axis): +(size_t)axisGetS0Stride(axis); } -static ssize_t axisGetD0Stride (const axis_desc* axis){ +static ssize_t axisGetD0Stride (const axis_desc* axis){ return axisGetLen(axis) > 1 ? axis->d0S : 0; } -static size_t axisGetD0AbsStride (const axis_desc* axis){ +static size_t axisGetD0AbsStride (const axis_desc* axis){ return axisGetD0Stride(axis)<0 ? -(size_t)axisGetD0Stride(axis): +(size_t)axisGetD0Stride(axis); } -static ssize_t axisGetD1Stride (const axis_desc* axis){ +static ssize_t axisGetD1Stride (const axis_desc* axis){ return axisGetLen(axis) > 1 ? axis->d1S : 0; } -static size_t axisGetD1AbsStride (const axis_desc* axis){ +static size_t axisGetD1AbsStride (const axis_desc* axis){ return axisGetD1Stride(axis)<0 ? 
-(size_t)axisGetD1Stride(axis): +(size_t)axisGetD1Stride(axis); } -static size_t axisGetI0Stride (const axis_desc* axis){ +static size_t axisGetI0Stride (const axis_desc* axis){ return axis->i0S; } -static void axisSetI0Stride (axis_desc* axis, - size_t i0S){ +static void axisSetI0Stride (axis_desc* axis, + size_t i0S){ axis->i0S = i0S; } -static unsigned axisGetPerm (const axis_desc* axis){ +static unsigned axisGetPerm (const axis_desc* axis){ return axis->perm; } -static int axisGetIBNum (const axis_desc* axis){ +static int axisGetIBNum (const axis_desc* axis){ return axis->ibNum; } -static void axisSetPerm (axis_desc* axis, - unsigned perm){ +static void axisSetPerm (axis_desc* axis, + unsigned perm){ axis->perm = perm; } -static int axisIsReduced (const axis_desc* axis){ +static int axisIsReduced (const axis_desc* axis){ return axis->isReduced; } -static int axisIsIntra (const axis_desc* axis){ +static int axisIsIntra (const axis_desc* axis){ return axis->isIntra; } -static int axisIsInter (const axis_desc* axis){ +static int axisIsInter (const axis_desc* axis){ return !axisIsIntra(axis); } -static int axisIsSplit (const axis_desc* axis){ +static int axisIsSplit (const axis_desc* axis){ return axisIsIntra(axis) && axis->splitLen != axis->len; } -static size_t reduxInvEstimateParallelism (const redux_ctx* ctx){ +static size_t reduxInvEstimateParallelism (const redux_ctx* ctx){ return reduxGenEstimateParallelism(ctx->gr); } -static int reduxInvRequiresS0 (const redux_ctx* ctx){ +static int reduxInvRequiresS0 (const redux_ctx* ctx){ return reduxGenRequiresS0(ctx->gr); } -static int reduxInvRequiresD0 (const redux_ctx* ctx){ +static int reduxInvRequiresD0 (const redux_ctx* ctx){ return reduxGenRequiresD0(ctx->gr); } -static int reduxInvRequiresD1 (const redux_ctx* ctx){ +static int reduxInvRequiresD1 (const redux_ctx* ctx){ return reduxGenRequiresD1(ctx->gr); } @@ -1393,7 +1393,7 @@ static int reduxInvRequiresD1 (const redux_ctx* ctx){ * @brief Get description of source axis with given number. */ -static axis_desc* reduxInvGetSrcAxis (const redux_ctx* ctx, int i){ +static axis_desc* reduxInvGetSrcAxis (const redux_ctx* ctx, int i){ return &ctx->xdSrc[i]; } @@ -1401,7 +1401,7 @@ static axis_desc* reduxInvGetSrcAxis (const redux_ctx* ctx, int i){ * @brief Get description of source axis with given number in sort-order. */ -static axis_desc* reduxInvGetSrcSortAxis (const redux_ctx* ctx, int i){ +static axis_desc* reduxInvGetSrcSortAxis (const redux_ctx* ctx, int i){ return ctx->xdSrcPtrs[i]; } @@ -1417,8 +1417,8 @@ static axis_desc* reduxInvGetSrcSortAxis (const redux_ctx* ctx, int i){ * @return Non-zero if flattening attempt successful; Zero otherwise. */ -static int reduxTryFlattenOut (const redux_ctx* ctx, - const axis_desc* axis){ +static int reduxTryFlattenOut (const redux_ctx* ctx, + const axis_desc* axis){ if ((axisGetLen (axis) == 1 )|| (axisIsReduced(axis) && ctx->zeroRdxAxes > 0)){ return 1; @@ -1448,9 +1448,9 @@ static int reduxTryFlattenOut (const redux_ctx* ctx, * @return Non-zero if flattening attempt successful; Zero otherwise. */ -static int reduxTryFlattenInto (redux_ctx* ctx, - axis_desc* into, - const axis_desc* from){ +static int reduxTryFlattenInto (redux_ctx* ctx, + axis_desc* into, + const axis_desc* from){ int signS0 = 0, signD0 = 0, signD1 = 0, reverseS0 = 0, reverseD0 = 0, reverseD1 = 0; @@ -1520,10 +1520,10 @@ static int reduxTryFlattenInto (redux_ctx* ctx, * not touching the axes themselves. 
*/ -static void reduxSortAxisPtrsBy (axis_desc** ptrs, - axis_desc* axes, - size_t numAxes, - int(*fn)(const void*, const void*)){ +static void reduxSortAxisPtrsBy (axis_desc** ptrs, + axis_desc* axes, + size_t numAxes, + int(*fn)(const void*, const void*)){ size_t i; for (i=0;ikArgTypeCodes = NULL; gr->kSourceCode = NULL; gr->kErrorString = NULL; @@ -1553,7 +1553,7 @@ static int reduxGenInit (GpuReduction* gr){ * @brief Begin inferring the properties of the reduction operator. */ -static int reduxGenInferProperties (GpuReduction* gr){ +static int reduxGenInferProperties (GpuReduction* gr){ int i; /** @@ -1609,7 +1609,7 @@ static int reduxGenInferProperties (GpuReduction* gr){ * Compute maximum block size we shall support in generated kernels. */ -static void reduxGenSetMaxBS (GpuReduction* gr){ +static void reduxGenSetMaxBS (GpuReduction* gr){ gr->maxBS = gr->grAttr.maxLM/reduxGenGetReduxStateSize(gr); gr->maxBS = gr->maxBS < gr->grAttr.maxLg ? gr->maxBS : gr->grAttr.maxLg; gr->maxBS = gr->maxBS < gr->grAttr.maxL0 ? gr->maxBS : gr->grAttr.maxL0; @@ -1658,7 +1658,7 @@ static void reduxGenSetMaxBS (GpuReduction* gr){ * For now we default TK1 to exactly TI0. */ -static void reduxGenSetKTypes (GpuReduction* gr){ +static void reduxGenSetKTypes (GpuReduction* gr){ const gpuarray_type *TK0 = NULL, *TK1 = NULL, *TPS0 = NULL; const char* TK0init = NULL; @@ -1769,9 +1769,9 @@ static void reduxGenSetKTypes (GpuReduction* gr){ * Iterate over the arguments of the reduction operator. */ -static void reduxGenIterArgs (const GpuReduction* gr, - GpuReductionIterFn fn, - void* user){ +static void reduxGenIterArgs (const GpuReduction* gr, + GpuReductionIterFn fn, + void* user){ int k; /** @@ -1872,7 +1872,7 @@ static void reduxGenIterArgs (const GpuReduction* gr, * @return GA_MEMORY_ERROR if not enough memory left; GA_NO_ERROR otherwise. */ -static int reduxGenSrc (GpuReduction* gr){ +static int reduxGenSrc (GpuReduction* gr){ GpuReductionAttr_appendopname(&gr->grAttr, sizeof(gr->kName), gr->kName); reduxGenSrcAppend(gr); @@ -1893,19 +1893,19 @@ static int reduxGenSrc (GpuReduction* gr){ * @brief Append source code to the string buffer. */ -static void reduxGenSrcAppend (GpuReduction* gr){ +static void reduxGenSrcAppend (GpuReduction* gr){ reduxGenSrcAppendIncludes (gr); reduxGenSrcAppendMacroTypedefs(gr); reduxGenSrcAppendReduxKernel (gr); } -static void reduxGenSrcAppendIncludes (GpuReduction* gr){ +static void reduxGenSrcAppendIncludes (GpuReduction* gr){ srcbAppends(&gr->srcGen, "/* Includes */\n"); srcbAppends(&gr->srcGen, "#include \"cluda.h\"\n"); srcbAppends(&gr->srcGen, "\n"); srcbAppends(&gr->srcGen, "\n"); srcbAppends(&gr->srcGen, "\n"); } -static void reduxGenSrcAppendMacroTypedefs(GpuReduction* gr){ +static void reduxGenSrcAppendMacroTypedefs (GpuReduction* gr){ /** * Typedefs of various types. 
*/ @@ -2184,7 +2184,7 @@ static void reduxGenSrcAppendMacroTypedefs(GpuReduction* gr){ srcbAppends(&gr->srcGen, "#define DIVIDECEIL(a,b) (((a)+(b)-1)/(b))\n\n\n\n\n"); } -static void reduxGenSrcAppendReduxKernel (GpuReduction* gr){ +static void reduxGenSrcAppendReduxKernel (GpuReduction* gr){ reduxGenSrcAppendPrototype (gr); srcbAppends (&gr->srcGen, "{\n"); reduxGenSrcAppendDecode (gr); @@ -2229,7 +2229,7 @@ static void reduxGenSrcAppendReduxKernel (GpuReduction* gr){ srcbAppends (&gr->srcGen, " }\n"); srcbAppends (&gr->srcGen, "}\n"); } -static void reduxGenSrcAppendPrototype (GpuReduction* gr){ +static void reduxGenSrcAppendPrototype (GpuReduction* gr){ int i=0; srcbAppendf(&gr->srcGen, @@ -2244,7 +2244,7 @@ static void reduxGenSrcAppendPrototype (GpuReduction* gr){ reduxGenIterArgs(gr, reduxGenAppendArg, &i); srcbAppends(&gr->srcGen, ")"); } -static void reduxGenSrcAppendDecode (GpuReduction* gr){ +static void reduxGenSrcAppendDecode (GpuReduction* gr){ int i; srcbAppends(&gr->srcGen, @@ -2635,8 +2635,8 @@ static void reduxGenSrcAppendDecode (GpuReduction* gr){ " \n" " \n"); } -static void reduxGenSrcAppendPhase0 (GpuReduction* gr, - uint32_t selector){ +static void reduxGenSrcAppendPhase0 (GpuReduction* gr, + uint32_t selector){ int i; const char* type; @@ -2671,9 +2671,9 @@ static void reduxGenSrcAppendPhase0 (GpuReduction* gr, " }\n" " }\n"); } -static void reduxGenSrcAppendLoop (GpuReduction* gr, - uint32_t selector, - int initial){ +static void reduxGenSrcAppendLoop (GpuReduction* gr, + uint32_t selector, + int initial){ int i; srcbAppends(&gr->srcGen, " while(v > 0){v--;\n"); @@ -2690,8 +2690,8 @@ static void reduxGenSrcAppendLoop (GpuReduction* gr, srcbAppends(&gr->srcGen, " break;\n" " }\n"); } -static void reduxGenSrcAppendVertical (GpuReduction* gr, - uint32_t selector){ +static void reduxGenSrcAppendVertical (GpuReduction* gr, + uint32_t selector){ int i = (selector&SELECTOR_SPLIT_FREE) ? gr->ndd-1 : gr->nds-1; if (i >= 0){ @@ -2704,10 +2704,10 @@ static void reduxGenSrcAppendVertical (GpuReduction* gr, " REDUX(K0, K1, tmpK0, I0);\n"); } } -static void reduxGenSrcAppendIncrement (GpuReduction* gr, - uint32_t selector, - int initial, - int axis){ +static void reduxGenSrcAppendIncrement (GpuReduction* gr, + uint32_t selector, + int initial, + int axis){ const char* cast = reduxGenSrcAxisIsHuge(gr, selector, axis) ? "TS64" : "TS32"; const char* breakOrCont = (initial) && (axis < gr->ndd) ? 
"break " : "continue"; @@ -2743,9 +2743,9 @@ static void reduxGenSrcAppendIncrement (GpuReduction* gr, axis, cast, axis, breakOrCont, axis, axis); } } -static void reduxGenSrcAppendDstWrite (GpuReduction* gr, - uint32_t selector, - int initial){ +static void reduxGenSrcAppendDstWrite (GpuReduction* gr, + uint32_t selector, + int initial){ srcbAppends(&gr->srcGen, " local_barrier();\n"); if (initial){ srcbAppends(&gr->srcGen, " if(LID_0 < D){\n" @@ -2775,7 +2775,7 @@ static void reduxGenSrcAppendDstWrite (GpuReduction* gr, } srcbAppends(&gr->srcGen, " local_barrier();\n"); } -static void reduxGenSrcAppendPhase1 (GpuReduction* gr){ +static void reduxGenSrcAppendPhase1 (GpuReduction* gr){ /** * PHASE 1 * @@ -2814,9 +2814,9 @@ static void reduxGenSrcAppendPhase1 (GpuReduction* gr){ " }\n"); } } -static int reduxGenSrcAxisIsHuge (GpuReduction* gr, - uint32_t selector, - int axis){ +static int reduxGenSrcAxisIsHuge (GpuReduction* gr, + uint32_t selector, + int axis){ int hugeType = selector & SELECTOR_HUGE_AXIS; int isSplitFree = !!(selector & SELECTOR_SPLIT_FREE); int isAxisFree = axis < gr->ndd; @@ -2847,9 +2847,9 @@ static int reduxGenSrcAxisIsHuge (GpuReduction* gr, return 0; } } -static int reduxGenSrcAxisIsSplit (GpuReduction* gr, - uint32_t selector, - int axis){ +static int reduxGenSrcAxisIsSplit (GpuReduction* gr, + uint32_t selector, + int axis){ return ( (selector & SELECTOR_SPLIT_FREE) && axis == gr->ndd-1) || (!(selector & SELECTOR_SPLIT_FREE) && axis == gr->nds-1); } @@ -2858,7 +2858,7 @@ static int reduxGenSrcAxisIsSplit (GpuReduction* gr, * @brief Compile the generated kernel. */ -static int reduxGenCompile (GpuReduction* gr){ +static int reduxGenCompile (GpuReduction* gr){ int ret, flags = 0; flags |= GA_USE_CLUDA; @@ -2896,7 +2896,7 @@ static int reduxGenCompile (GpuReduction* gr){ * support launching. */ -static int reduxGenComputeLaunchBounds (GpuReduction* gr){ +static int reduxGenComputeLaunchBounds (GpuReduction* gr){ int ret; /** @@ -2919,7 +2919,7 @@ static int reduxGenComputeLaunchBounds (GpuReduction* gr){ * @brief Cleanup generator context. */ -static int reduxGenCleanup (GpuReduction* gr, int ret){ +static int reduxGenCleanup (GpuReduction* gr, int ret){ if (ret != GA_NO_ERROR){ free(gr->kArgTypeCodes); free(gr->kSourceCode); @@ -2931,8 +2931,8 @@ static int reduxGenCleanup (GpuReduction* gr, int ret) return ret; } -static int reduxGenCleanupMsg (GpuReduction* gr, int ret, - const char* fmt, ...){ +static int reduxGenCleanupMsg (GpuReduction* gr, int ret, + const char* fmt, ...){ #if DEBUG FILE* fp = stderr; @@ -2952,12 +2952,12 @@ static int reduxGenCleanupMsg (GpuReduction* gr, int ret, * Count # of arguments as determined by iterator. */ -static void reduxGenCountArgs (const GpuReduction* gr, - int typecode, - const char* typeName, - const char* baseName, - int num, - void* user){ +static void reduxGenCountArgs (const GpuReduction* gr, + int typecode, + const char* typeName, + const char* baseName, + int num, + void* user){ (void)gr; (void)typecode; (void)typeName; @@ -2971,12 +2971,12 @@ static void reduxGenCountArgs (const GpuReduction* gr, * Record the typecodes in the arguments typecode array. 
*/ -static void reduxGenSaveArgTypecodes (const GpuReduction* gr, - int typecode, - const char* typeName, - const char* baseName, - int num, - void* user){ +static void reduxGenSaveArgTypecodes (const GpuReduction* gr, + int typecode, + const char* typeName, + const char* baseName, + int num, + void* user){ (void)typeName; (void)baseName; (void)num; @@ -2989,12 +2989,12 @@ static void reduxGenSaveArgTypecodes (const GpuReduction* gr, * Append an argument declaration to prototype. */ -static void reduxGenAppendArg (const GpuReduction* gr, - int typecode, - const char* typeName, - const char* baseName, - int num, - void* user){ +static void reduxGenAppendArg (const GpuReduction* gr, + int typecode, + const char* typeName, + const char* baseName, + int num, + void* user){ (void)user; (void)typecode; @@ -3009,12 +3009,12 @@ static void reduxGenAppendArg (const GpuReduction* gr, * Marshall argument declaration during invocation. */ -static void reduxInvMarshalArg (const GpuReduction* gr, - int typecode, - const char* typeName, - const char* baseName, - int num, - void* user){ +static void reduxInvMarshalArg (const GpuReduction* gr, + int typecode, + const char* typeName, + const char* baseName, + int num, + void* user){ redux_ctx* ctx; int* i, k = num; @@ -3098,7 +3098,7 @@ static void reduxInvMarshalArg (const GpuReduction* gr, * device, plus some substantial margin. */ -static size_t reduxGenEstimateParallelism (const GpuReduction* gr){ +static size_t reduxGenEstimateParallelism (const GpuReduction* gr){ /** * An arbitrary margin factor ensuring there will be a few thread blocks * per SMX. @@ -3166,34 +3166,34 @@ static size_t reduxGenEstimateParallelism (const GpuReduction* gr){ * initialization operations, the above might not necessarily hold anymore. 
*/ -static int reduxGenRequiresS0 (const GpuReduction* gr){ +static int reduxGenRequiresS0 (const GpuReduction* gr){ return GpuReductionAttr_requiresS0(&gr->grAttr); } -static int reduxGenRequiresD0 (const GpuReduction* gr){ +static int reduxGenRequiresD0 (const GpuReduction* gr){ return GpuReductionAttr_requiresD0(&gr->grAttr); } -static int reduxGenRequiresD1 (const GpuReduction* gr){ +static int reduxGenRequiresD1 (const GpuReduction* gr){ return GpuReductionAttr_requiresD1(&gr->grAttr); } -static int reduxGenKernelRequiresLatticeS0(const GpuReduction* gr){ +static int reduxGenKernelRequiresLatticeS0 (const GpuReduction* gr){ return reduxGenRequiresS0(gr); } -static int reduxGenKernelRequiresLatticeD0(const GpuReduction* gr){ +static int reduxGenKernelRequiresLatticeD0 (const GpuReduction* gr){ return reduxGenRequiresD0(gr); } -static int reduxGenKernelRequiresLatticeD1(const GpuReduction* gr){ +static int reduxGenKernelRequiresLatticeD1 (const GpuReduction* gr){ return reduxGenRequiresD1(gr); } -static int reduxGenKernelRequiresLatticeI0(const GpuReduction* gr){ +static int reduxGenKernelRequiresLatticeI0 (const GpuReduction* gr){ return reduxGenRequiresD1(gr); } -static int reduxGenKernelRequiresStateK0 (const GpuReduction* gr){ +static int reduxGenKernelRequiresStateK0 (const GpuReduction* gr){ return reduxGenKernelRequiresLatticeS0(gr); } -static int reduxGenKernelRequiresStateK1 (const GpuReduction* gr){ +static int reduxGenKernelRequiresStateK1 (const GpuReduction* gr){ return reduxGenKernelRequiresLatticeI0(gr); } -static int reduxGenKernelRequiresWspace (const GpuReduction* gr){ +static int reduxGenKernelRequiresWspace (const GpuReduction* gr){ (void)gr; return 1; } @@ -3203,16 +3203,16 @@ static int reduxGenKernelRequiresWspace (const GpuReduction* gr){ * Get size and alignment requirements of K0 and K1 states. */ -static size_t reduxGenGetK0Size (const GpuReduction* gr){ +static size_t reduxGenGetK0Size (const GpuReduction* gr){ return gr->TK0.size; } -static size_t reduxGenGetK0Align (const GpuReduction* gr){ +static size_t reduxGenGetK0Align (const GpuReduction* gr){ return gr->TK0.align; } -static size_t reduxGenGetK1Size (const GpuReduction* gr){ +static size_t reduxGenGetK1Size (const GpuReduction* gr){ return gr->TK1.size; } -static size_t reduxGenGetK1Align (const GpuReduction* gr){ +static size_t reduxGenGetK1Align (const GpuReduction* gr){ return gr->TK1.align; } @@ -3220,7 +3220,7 @@ static size_t reduxGenGetK1Align (const GpuReduction* gr){ * @brief Get the number of bytes of workspace per (partial) reduction per thread. */ -static size_t reduxGenGetReduxStateSize (const GpuReduction* gr){ +static size_t reduxGenGetReduxStateSize (const GpuReduction* gr){ size_t total = 0, idxSize = gpuarray_get_elsize(gr->TS64tc); /* The accumulator and index types can be wider than dst/dstArg's types. */ @@ -3238,7 +3238,7 @@ static size_t reduxGenGetReduxStateSize (const GpuReduction* gr){ * @brief Get the maximum number of threads this operator's kernel can handle. */ -static size_t reduxGenGetMaxLocalSize (const GpuReduction* gr){ +static size_t reduxGenGetMaxLocalSize (const GpuReduction* gr){ return gr->maxLK; } @@ -3246,7 +3246,7 @@ static size_t reduxGenGetMaxLocalSize (const GpuReduction* gr){ * @brief Get the shared memory consumption for a given block size. 
*/ -static size_t reduxGenGetSHMEMSize (const GpuReduction* gr, size_t cells){ +static size_t reduxGenGetSHMEMSize (const GpuReduction* gr, size_t cells){ size_t total = 0, totalPermute; /* Compute size of SHMEM working space */ @@ -3264,7 +3264,7 @@ static size_t reduxGenGetSHMEMSize (const GpuReduction* gr, size * @brief Get the shared memory byte offset for the k0 and k1 states. */ -static size_t reduxGenGetSHMEMK0Off (const GpuReduction* gr, size_t cells){ +static size_t reduxGenGetSHMEMK0Off (const GpuReduction* gr, size_t cells){ if (!reduxGenKernelRequiresWspace (gr)|| !reduxGenKernelRequiresStateK0(gr)|| !reduxGenKernelRequiresStateK1(gr)){ @@ -3277,7 +3277,7 @@ static size_t reduxGenGetSHMEMK0Off (const GpuReduction* gr, size return cells*reduxGenGetK1Size(gr); } } -static size_t reduxGenGetSHMEMK1Off (const GpuReduction* gr, size_t cells){ +static size_t reduxGenGetSHMEMK1Off (const GpuReduction* gr, size_t cells){ if (!reduxGenKernelRequiresWspace (gr)|| !reduxGenKernelRequiresStateK0(gr)|| !reduxGenKernelRequiresStateK1(gr)){ @@ -3298,7 +3298,7 @@ static size_t reduxGenGetSHMEMK1Off (const GpuReduction* gr, size * intrablock offset permutes, for instance. */ -static size_t reduxGenGetWMEMSize (const GpuReduction* gr, size_t cells){ +static size_t reduxGenGetWMEMSize (const GpuReduction* gr, size_t cells){ size_t total = 0; total += reduxGenKernelRequiresStateK0(gr) ? cells*reduxGenGetK0Size(gr) : 0; @@ -3311,10 +3311,10 @@ static size_t reduxGenGetWMEMSize (const GpuReduction* gr, size * @brief Get the workspace memory byte offset for the k0 and k1 states. */ -static size_t reduxGenGetWMEMK0Off (const GpuReduction* gr, size_t cells){ +static size_t reduxGenGetWMEMK0Off (const GpuReduction* gr, size_t cells){ return reduxGenGetSHMEMK0Off(gr, cells); } -static size_t reduxGenGetWMEMK1Off (const GpuReduction* gr, size_t cells){ +static size_t reduxGenGetWMEMK1Off (const GpuReduction* gr, size_t cells){ return reduxGenGetSHMEMK1Off(gr, cells); } @@ -3324,7 +3324,7 @@ static size_t reduxGenGetWMEMK1Off (const GpuReduction* gr, size * After this function, calling reduxInvCleanup*() becomes safe. */ -static int reduxInvInit (redux_ctx* ctx){ +static int reduxInvInit (redux_ctx* ctx){ ctx->L = NULL; ctx->Li = NULL; ctx->S0J = ctx->S0Si = NULL; @@ -3349,7 +3349,7 @@ static int reduxInvInit (redux_ctx* ctx){ * @brief Begin inferring the properties of the reduction invocation. */ -static int reduxInvInferProperties (redux_ctx* ctx){ +static int reduxInvInferProperties (redux_ctx* ctx){ axis_desc* a; int i, j; size_t d; @@ -3530,7 +3530,7 @@ static int reduxInvInferProperties (redux_ctx* ctx){ * contiguous as possible. */ -static int reduxInvFlattenSource (redux_ctx* ctx){ +static int reduxInvFlattenSource (redux_ctx* ctx){ axis_desc* axis, *flatAxis, *sortAxis; int i, j, k, isSensitive; @@ -3595,7 +3595,7 @@ static int reduxInvFlattenSource (redux_ctx* ctx){ * criteria. */ -static int reduxInvComputeKernelArgs (redux_ctx* ctx){ +static int reduxInvComputeKernelArgs (redux_ctx* ctx){ axis_desc* axis, *prevAxis; size_t target, aL, aLS, perm, i0S; int i, j, haveSplitFreeAxis, haveSplitReducedAxis; @@ -4008,7 +4008,7 @@ static int reduxInvSchedule (redux_ctx* ctx){ * @brief Invoke the kernel. 
*/ -static int reduxInvoke (redux_ctx* ctx){ +static int reduxInvoke (redux_ctx* ctx){ int ret, i=0; void* ptrs[2] = {ctx, &i}; @@ -4049,7 +4049,7 @@ static int reduxInvoke (redux_ctx* ctx){ * Cleanup */ -static int reduxInvCleanup (redux_ctx* ctx, int ret){ +static int reduxInvCleanup (redux_ctx* ctx, int ret){ ctx->gr = NULL; ctx->s0 = NULL; ctx->d0 = NULL; @@ -4090,8 +4090,8 @@ static int reduxInvCleanup (redux_ctx* ctx, int ret) return ret; } -static int reduxInvCleanupMsg (redux_ctx* ctx, int ret, - const char* fmt, ...){ +static int reduxInvCleanupMsg (redux_ctx* ctx, int ret, + const char* fmt, ...){ #if DEBUG FILE* fp = stderr; From 8f5250e732f0e0054c323dd37a8f5d9e0c8c2c40 Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Sun, 23 Jul 2017 00:54:10 -0400 Subject: [PATCH 27/34] Muzzle -Wdeclaration-after-statement in check_reduction.c. There is now not a single -Wdeclaration-after-statement warning origination in that file. --- tests/check_reduction.c | 791 ++++++++++++++++++---------------------- 1 file changed, 360 insertions(+), 431 deletions(-) diff --git a/tests/check_reduction.c b/tests/check_reduction.c index 6a1e8c6a97..60411ead57 100644 --- a/tests/check_reduction.c +++ b/tests/check_reduction.c @@ -68,13 +68,16 @@ static double pcgRand01(void){ */ START_TEST(test_maxandargmax_reduction){ - pcgSeed(1); - /** * We test here a reduction of some random 3D tensor on the first and * third dimensions. */ - + + GpuArray gaS0; + GpuArray gaD0; + GpuArray gaD1; + GpuReductionAttr* grAttr; + GpuReduction* gr; size_t errCnt = 0; size_t i,j,k; size_t dims[3] = {32,50,79}; @@ -93,7 +96,8 @@ START_TEST(test_maxandargmax_reduction){ /** * Initialize source data. */ - + + pcgSeed(1); for(i=0;i= TOL){ errCnt++; if(errCnt <= MAXERRPRINT){ @@ -2305,13 +2273,15 @@ START_TEST(test_sum_reduction){ }END_TEST START_TEST(test_sum_veryhighrank){ - pcgSeed(1); - /** * Here we test a reduction of a random 8D tensor on four dimensions. */ - size_t errCnt = 0; + GpuArray gaS0; + GpuArray gaD0; + GpuReductionAttr* grAttr; + GpuReduction* gr; + size_t errCnt = 0, dstIdx; size_t i,j,k,l,m,n,o,p; size_t dims [8] = {1171,373,2,1,2,1,2,1}; size_t prodDims = dims[0]*dims[1]*dims[2]*dims[3]*dims[4]*dims[5]*dims[6]*dims[7]; @@ -2331,6 +2301,7 @@ START_TEST(test_sum_veryhighrank){ * Initialize source data. */ + pcgSeed(1); for(i=0;i= TOL){ errCnt++; if(errCnt <= MAXERRPRINT){ @@ -2414,18 +2380,21 @@ START_TEST(test_sum_veryhighrank){ }END_TEST START_TEST(test_sum_alldimsreduced){ - pcgSeed(1); - /** * We test here a reduction of some random 3D tensor on all dimensions. */ + GpuArray gaS0; + GpuArray gaD0; + GpuReductionAttr* grAttr; + GpuReduction* gr; size_t errCnt = 0; size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; const int reduxList[] = {0,1,2}; const float TOL = 1e-4; + float gtD0; float* pS0 = calloc(1, sizeof(*pS0) * dims[0]*dims[1]*dims[2]); float* pD0 = calloc(1, sizeof(*pD0) ); @@ -2438,6 +2407,7 @@ START_TEST(test_sum_alldimsreduced){ * Initialize source data. */ + pcgSeed(1); for(i=0;i= TOL){ errCnt++; if(errCnt <= MAXERRPRINT){ @@ -2692,13 +2654,15 @@ START_TEST(test_prod_reduction){ }END_TEST START_TEST(test_prod_veryhighrank){ - pcgSeed(1); - /** * Here we test a reduction of a random 8D tensor on four dimensions. 
*/ - size_t errCnt = 0; + GpuArray gaS0; + GpuArray gaD0; + GpuReductionAttr* grAttr; + GpuReduction* gr; + size_t errCnt = 0, dstIdx; size_t i,j,k,l,m,n,o,p; size_t dims [8] = {1171,373,2,1,2,1,2,1}; size_t prodDims = dims[0]*dims[1]*dims[2]*dims[3]*dims[4]*dims[5]*dims[6]*dims[7]; @@ -2718,6 +2682,7 @@ START_TEST(test_prod_veryhighrank){ * Initialize source data. */ + pcgSeed(1); for(i=0;i= TOL){ errCnt++; if(errCnt <= MAXERRPRINT){ @@ -2801,18 +2761,21 @@ START_TEST(test_prod_veryhighrank){ }END_TEST START_TEST(test_prod_alldimsreduced){ - pcgSeed(1); - /** * We test here a reduction of some random 3D tensor on all dimensions. */ + GpuArray gaS0; + GpuArray gaD0; + GpuReductionAttr* grAttr; + GpuReduction* gr; size_t errCnt = 0; size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; const int reduxList[] = {0,1,2}; const float TOL = 1e-4; + float gtD0; float* pS0 = calloc(1, sizeof(*pS0) * dims[0]*dims[1]*dims[2]); float* pD0 = calloc(1, sizeof(*pD0) ); @@ -2825,6 +2788,7 @@ START_TEST(test_prod_alldimsreduced){ * Initialize source data. */ + pcgSeed(1); for(i=0;i= TOL){ errCnt++; if(errCnt <= MAXERRPRINT){ @@ -2993,13 +2950,15 @@ START_TEST(test_prodnz_reduction){ }END_TEST START_TEST(test_prodnz_veryhighrank){ - pcgSeed(1); - /** * Here we test a reduction of a random 8D tensor on four dimensions. */ - size_t errCnt = 0; + GpuArray gaS0; + GpuArray gaD0; + GpuReductionAttr* grAttr; + GpuReduction* gr; + size_t errCnt = 0, dstIdx; size_t i,j,k,l,m,n,o,p; size_t dims [8] = {1171,373,2,1,2,1,2,1}; size_t prodDims = dims[0]*dims[1]*dims[2]*dims[3]*dims[4]*dims[5]*dims[6]*dims[7]; @@ -3019,6 +2978,7 @@ START_TEST(test_prodnz_veryhighrank){ * Initialize source data. */ + pcgSeed(1); for(i=0;i= TOL){ errCnt++; if(errCnt <= MAXERRPRINT){ @@ -3105,18 +3060,21 @@ START_TEST(test_prodnz_veryhighrank){ }END_TEST START_TEST(test_prodnz_alldimsreduced){ - pcgSeed(1); - /** * We test here a reduction of some random 3D tensor on all dimensions. */ + GpuArray gaS0; + GpuArray gaD0; + GpuReductionAttr* grAttr; + GpuReduction* gr; size_t errCnt = 0; size_t i,j,k; size_t dims[3] = {32,50,79}; size_t prodDims = dims[0]*dims[1]*dims[2]; const int reduxList[] = {0,1,2}; const float TOL = 1e-4; + float gtD0; float* pS0 = calloc(1, sizeof(*pS0) * dims[0]*dims[1]*dims[2]); float* pD0 = calloc(1, sizeof(*pD0) ); @@ -3129,6 +3087,7 @@ START_TEST(test_prodnz_alldimsreduced){ * Initialize source data. */ + pcgSeed(1); for(i=0;i Date: Tue, 25 Jul 2017 15:42:36 -0400 Subject: [PATCH 28/34] Easy feedback fixes applied. --- src/gpuarray_reduction.c | 39 +++++++++++++++++---------------------- 1 file changed, 17 insertions(+), 22 deletions(-) diff --git a/src/gpuarray_reduction.c b/src/gpuarray_reduction.c index 6d2e6fc17f..092b8f8509 100644 --- a/src/gpuarray_reduction.c +++ b/src/gpuarray_reduction.c @@ -248,8 +248,6 @@ struct GpuReductionAttr{ struct GpuReduction{ /* Function Arguments. 
*/ GpuReductionAttr grAttr; - gpucontext* gpuCtx; - ga_reduce_op op; int nds; int ndd; int ndr; @@ -652,6 +650,8 @@ GPUARRAY_PUBLIC void GpuReductionAttr_free (GpuReductionAttr* } GPUARRAY_PUBLIC int GpuReduction_new (GpuReduction** gr, const GpuReductionAttr* grAttr){ + GpuReduction* grOut = NULL; + if (!gr){ return GA_INVALID_ERROR; } @@ -660,16 +660,14 @@ GPUARRAY_PUBLIC int GpuReduction_new (GpuReduction** return GA_INVALID_ERROR; } - *gr = calloc(1, sizeof(**gr)); - if (*gr){ - (*gr)->grAttr = *grAttr; - (*gr)->gpuCtx = grAttr->gpuCtx; - (*gr)->op = grAttr->op; - (*gr)->nds = (int)grAttr->maxSrcDims; - (*gr)->ndd = (int)grAttr->maxDstDims; - (*gr)->ndr = (int)(grAttr->maxSrcDims-grAttr->maxDstDims); + grOut = calloc(1, sizeof(*grOut)); + if (grOut){ + grOut->grAttr = *grAttr; + grOut->nds = (int)grAttr->maxSrcDims; + grOut->ndd = (int)grAttr->maxDstDims; + grOut->ndr = (int)(grAttr->maxSrcDims - grAttr->maxDstDims); - return reduxGenInit(*gr); + return reduxGenInit(grOut); }else{ return GA_MEMORY_ERROR; } @@ -684,7 +682,8 @@ GPUARRAY_PUBLIC int GpuReduction_call (const GpuReduction* unsigned reduxLen, const int* reduxList, int flags){ - redux_ctx ctxSTACK, *ctx = &ctxSTACK; + redux_ctx ctxSTACK; + redux_ctx *ctx = &ctxSTACK; memset(ctx, 0, sizeof(*ctx)); ctx->gr = gr; @@ -713,8 +712,7 @@ GPUARRAY_PUBLIC int GpuReduction_call (const GpuReduction* */ static int reduxGetSumInit (int typecode, const char** property){ - if (typecode == GA_POINTER || - typecode == GA_BUFFER){ + if (typecode < 0){ return GA_UNSUPPORTED_ERROR; } *property = "0"; @@ -733,8 +731,7 @@ static int reduxGetSumInit (int typecode, const char** */ static int reduxGetProdInit (int typecode, const char** property){ - if (typecode == GA_POINTER || - typecode == GA_BUFFER){ + if (typecode < 0){ return GA_UNSUPPORTED_ERROR; } *property = "1"; @@ -942,8 +939,7 @@ static int reduxGetMaxInit (int typecode, const char** */ static int reduxGetAndInit (int typecode, const char** property){ - if (typecode == GA_POINTER || - typecode == GA_BUFFER){ + if (typecode < 0){ return GA_UNSUPPORTED_ERROR; } *property = "~0"; @@ -962,8 +958,7 @@ static int reduxGetAndInit (int typecode, const char** */ static int reduxGetOrInit (int typecode, const char** property){ - if (typecode == GA_POINTER || - typecode == GA_BUFFER){ + if (typecode < 0){ return GA_UNSUPPORTED_ERROR; } *property = "0"; @@ -2867,7 +2862,7 @@ static int reduxGenCompile (GpuReduction* gr){ } ret = GpuKernel_init(&gr->k, - gr->gpuCtx, + gr->grAttr.gpuCtx, 1, (const char**)&gr->kSourceCode, &gr->kSourceCodeLen, @@ -3994,7 +3989,7 @@ static int reduxInvSchedule (redux_ctx* ctx){ ctx->W0Off = reduxGenGetWMEMK0Off(ctx->gr, 2*ctx->gs*ctx->D); ctx->W1Off = reduxGenGetWMEMK1Off(ctx->gr, 2*ctx->gs*ctx->D); WSPACESIZE = reduxGenGetWMEMSize (ctx->gr, 2*ctx->gs*ctx->D); - ctx->W = gpudata_alloc(ctx->gr->gpuCtx, WSPACESIZE, 0, flags, 0); + ctx->W = gpudata_alloc(ctx->gr->grAttr.gpuCtx, WSPACESIZE, 0, flags, 0); if (!ctx->W){ return reduxInvCleanupMsg(ctx, GA_MEMORY_ERROR, "Could not allocate %zu-byte workspace for reduction!\n", From f129c69073f0e1f3f0f8c6559a7aa68271323337 Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Fri, 4 Aug 2017 14:40:57 -0400 Subject: [PATCH 29/34] Add stdargs support to the error API. 
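
error_vfmt() takes an explicit va_list, so a function that accepts its own
variadic arguments can forward them to the error object in a single call,
without formatting into an intermediate buffer first. A minimal caller-side
sketch (the helper below is hypothetical and only illustrates the intended
forwarding pattern; it is not part of this patch):

    #include <stdarg.h>
    #include "util/error.h"

    static int report(error *e, int code, const char *fmt, ...){
        va_list ap;
        int     ret;

        va_start(ap, fmt);
        ret = error_vfmt(e, code, fmt, ap); /* records code + formatted message in *e */
        va_end(ap);
        return ret;
    }

error_fmt() itself is reworked below as exactly this kind of thin wrapper
around error_vfmt().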
--- src/util/error.c | 15 ++++++++++----- src/util/error.h | 1 + 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/src/util/error.c b/src/util/error.c index 19ce184363..2f570144e8 100644 --- a/src/util/error.c +++ b/src/util/error.c @@ -29,15 +29,20 @@ int error_set(error *e, int code, const char *msg) { return code; } -int error_fmt(error *e, int code, const char *fmt, ...) { - va_list ap; - +int error_vfmt(error *e, int code, const char *fmt, va_list ap) { e->code = code; - va_start(ap, fmt); vsnprintf(e->msg, ERROR_MSGBUF_LEN, fmt, ap); - va_end(ap); #ifdef DEBUG fprintf(stderr, "ERROR %d: %s\n", e->code, e->msg); #endif return code; } + +int error_fmt(error *e, int code, const char *fmt, ...) { + int ret; + va_list ap; + va_start(ap, fmt); + ret = error_vfmt(e, code, fmt, ap); + va_end(ap); + return ret; +} diff --git a/src/util/error.h b/src/util/error.h index fc1ecb1663..0f7651fec0 100644 --- a/src/util/error.h +++ b/src/util/error.h @@ -18,6 +18,7 @@ int error_alloc(error **e); void error_free(error *e); int error_set(error *e, int code, const char *msg); int error_fmt(error *e, int code, const char *fmt, ...); +int error_vfmt(error *e, int code, const char *fmt, va_list ap); extern error *global_err; From 76fd38ca3b94299090ca335b31dd5d745aa28484 Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Sat, 26 Aug 2017 19:57:52 -0400 Subject: [PATCH 30/34] Deleted recently-removed properties. --- src/gpuarray_reduction.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/gpuarray_reduction.c b/src/gpuarray_reduction.c index 092b8f8509..88d5d9e5cf 100644 --- a/src/gpuarray_reduction.c +++ b/src/gpuarray_reduction.c @@ -151,7 +151,7 @@ typedef struct redux_ctx redux_ctx; struct GpuReductionAttr{ gpucontext* gpuCtx; unsigned numProcs; - size_t maxLg, maxL0, maxGg, maxG0, maxLM; + size_t maxL0, maxG0, maxLM; ga_reduce_op op; int maxSrcDims; @@ -491,9 +491,7 @@ GPUARRAY_PUBLIC int GpuReductionAttr_new (GpuReductionAttr** (*grAttr)->gpuCtx = gpuCtx; if (gpucontext_property(gpuCtx, GA_CTX_PROP_NUMPROCS, &(*grAttr)->numProcs) != GA_NO_ERROR || - gpucontext_property(gpuCtx, GA_CTX_PROP_MAXLSIZE, &(*grAttr)->maxLg) != GA_NO_ERROR || gpucontext_property(gpuCtx, GA_CTX_PROP_MAXLSIZE0, &(*grAttr)->maxL0) != GA_NO_ERROR || - gpucontext_property(gpuCtx, GA_CTX_PROP_MAXGSIZE, &(*grAttr)->maxGg) != GA_NO_ERROR || gpucontext_property(gpuCtx, GA_CTX_PROP_MAXGSIZE0, &(*grAttr)->maxG0) != GA_NO_ERROR || gpucontext_property(gpuCtx, GA_CTX_PROP_LMEMSIZE, &(*grAttr)->maxLM) != GA_NO_ERROR ){ free(*grAttr); @@ -1606,7 +1604,6 @@ static int reduxGenInferProperties (GpuReduction* gr){ static void reduxGenSetMaxBS (GpuReduction* gr){ gr->maxBS = gr->grAttr.maxLM/reduxGenGetReduxStateSize(gr); - gr->maxBS = gr->maxBS < gr->grAttr.maxLg ? gr->maxBS : gr->grAttr.maxLg; gr->maxBS = gr->maxBS < gr->grAttr.maxL0 ? 
gr->maxBS : gr->grAttr.maxL0; /** @@ -2856,7 +2853,6 @@ static int reduxGenSrcAxisIsSplit (GpuReduction* gr, static int reduxGenCompile (GpuReduction* gr){ int ret, flags = 0; - flags |= GA_USE_CLUDA; if (gr->TS0tc == GA_HALF || gr->TD0tc == GA_HALF){ flags |= GA_USE_HALF|GA_USE_SMALL; } @@ -3104,7 +3100,7 @@ static size_t reduxGenEstimateParallelism (const GpuReduction* gr){ */ size_t marginFactor = 16; - return marginFactor * gr->grAttr.numProcs * gr->grAttr.maxLg; + return marginFactor * gr->grAttr.numProcs * gr->grAttr.maxL0; } /** From 0832fa1bfb954dc23b607bdc2d5303bd417c8109 Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Sat, 26 Aug 2017 19:58:11 -0400 Subject: [PATCH 31/34] Added missing header --- src/util/error.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/util/error.h b/src/util/error.h index 0f7651fec0..7577b4cee9 100644 --- a/src/util/error.h +++ b/src/util/error.h @@ -3,6 +3,7 @@ #include #include +#include #include From c679474767aa71b87e86e6f040af4230c6ff91ab Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Sat, 26 Aug 2017 20:23:33 -0400 Subject: [PATCH 32/34] For test purposes, create buffer of ULONG rather than unsupported SIZE. --- tests/check_reduction.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/tests/check_reduction.c b/tests/check_reduction.c index 60411ead57..ca52946fa5 100644 --- a/tests/check_reduction.c +++ b/tests/check_reduction.c @@ -109,7 +109,7 @@ START_TEST(test_maxandargmax_reduction){ ga_assert_ok(GpuArray_empty(&gaS0, ctx, GA_FLOAT, 3, &dims[0], GA_C_ORDER)); ga_assert_ok(GpuArray_empty(&gaD0, ctx, GA_FLOAT, 1, &dims[1], GA_C_ORDER)); - ga_assert_ok(GpuArray_empty(&gaD1, ctx, GA_SIZE, 1, &dims[1], GA_C_ORDER)); + ga_assert_ok(GpuArray_empty(&gaD1, ctx, GA_ULONG, 1, &dims[1], GA_C_ORDER)); ga_assert_ok(GpuArray_write(&gaS0, pS0, sizeof(*pS0)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD0, -1)); /* 0xFFFFFFFF is a qNaN. */ @@ -220,7 +220,7 @@ START_TEST(test_maxandargmax_idxtranspose){ ga_assert_ok(GpuArray_empty(&gaS0, ctx, GA_FLOAT, 3, &dims[0], GA_C_ORDER)); ga_assert_ok(GpuArray_empty(&gaD0, ctx, GA_FLOAT, 1, &dims[1], GA_C_ORDER)); - ga_assert_ok(GpuArray_empty(&gaD1, ctx, GA_SIZE, 1, &dims[1], GA_C_ORDER)); + ga_assert_ok(GpuArray_empty(&gaD1, ctx, GA_ULONG, 1, &dims[1], GA_C_ORDER)); ga_assert_ok(GpuArray_write(&gaS0, pS0, sizeof(*pS0)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD0, -1)); /* 0xFFFFFFFF is a qNaN. */ @@ -327,7 +327,7 @@ START_TEST(test_maxandargmax_bigdestination){ ga_assert_ok(GpuArray_empty(&gaS0, ctx, GA_FLOAT, 2, &dims[0], GA_C_ORDER)); ga_assert_ok(GpuArray_empty(&gaD0, ctx, GA_FLOAT, 1, &dims[1], GA_C_ORDER)); - ga_assert_ok(GpuArray_empty(&gaD1, ctx, GA_SIZE, 1, &dims[1], GA_C_ORDER)); + ga_assert_ok(GpuArray_empty(&gaD1, ctx, GA_ULONG, 1, &dims[1], GA_C_ORDER)); ga_assert_ok(GpuArray_write(&gaS0, pS0, sizeof(*pS0)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD0, -1)); /* 0xFFFFFFFF is a qNaN. */ @@ -433,7 +433,7 @@ START_TEST(test_maxandargmax_veryhighrank){ ga_assert_ok(GpuArray_empty(&gaS0, ctx, GA_FLOAT, 8, dims, GA_C_ORDER)); ga_assert_ok(GpuArray_empty(&gaD0, ctx, GA_FLOAT, 4, rdxDims, GA_C_ORDER)); - ga_assert_ok(GpuArray_empty(&gaD1, ctx, GA_SIZE, 4, rdxDims, GA_C_ORDER)); + ga_assert_ok(GpuArray_empty(&gaD1, ctx, GA_ULONG, 4, rdxDims, GA_C_ORDER)); ga_assert_ok(GpuArray_write(&gaS0, pS0, sizeof(*pS0)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD0, -1)); /* 0xFFFFFFFF is a qNaN. 
*/ @@ -553,7 +553,7 @@ START_TEST(test_maxandargmax_alldimsreduced){ ga_assert_ok(GpuArray_empty(&gaS0, ctx, GA_FLOAT, 3, &dims[0], GA_C_ORDER)); ga_assert_ok(GpuArray_empty(&gaD0, ctx, GA_FLOAT, 0, NULL, GA_C_ORDER)); - ga_assert_ok(GpuArray_empty(&gaD1, ctx, GA_SIZE, 0, NULL, GA_C_ORDER)); + ga_assert_ok(GpuArray_empty(&gaD1, ctx, GA_ULONG, 0, NULL, GA_C_ORDER)); ga_assert_ok(GpuArray_write(&gaS0, pS0, sizeof(*pS0)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD0, -1)); /* 0xFFFFFFFF is a qNaN. */ @@ -659,7 +659,7 @@ START_TEST(test_minandargmin_reduction){ ga_assert_ok(GpuArray_empty(&gaS0, ctx, GA_FLOAT, 3, &dims[0], GA_C_ORDER)); ga_assert_ok(GpuArray_empty(&gaD0, ctx, GA_FLOAT, 1, &dims[1], GA_C_ORDER)); - ga_assert_ok(GpuArray_empty(&gaD1, ctx, GA_SIZE, 1, &dims[1], GA_C_ORDER)); + ga_assert_ok(GpuArray_empty(&gaD1, ctx, GA_ULONG, 1, &dims[1], GA_C_ORDER)); ga_assert_ok(GpuArray_write(&gaS0, pS0, sizeof(*pS0)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD0, -1)); /* 0xFFFFFFFF is a qNaN. */ @@ -767,7 +767,7 @@ START_TEST(test_minandargmin_veryhighrank){ ga_assert_ok(GpuArray_empty(&gaS0, ctx, GA_FLOAT, 8, dims, GA_C_ORDER)); ga_assert_ok(GpuArray_empty(&gaD0, ctx, GA_FLOAT, 4, rdxDims, GA_C_ORDER)); - ga_assert_ok(GpuArray_empty(&gaD1, ctx, GA_SIZE, 4, rdxDims, GA_C_ORDER)); + ga_assert_ok(GpuArray_empty(&gaD1, ctx, GA_ULONG, 4, rdxDims, GA_C_ORDER)); ga_assert_ok(GpuArray_write(&gaS0, pS0, sizeof(*pS0)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD0, -1)); /* 0xFFFFFFFF is a qNaN. */ @@ -887,7 +887,7 @@ START_TEST(test_minandargmin_alldimsreduced){ ga_assert_ok(GpuArray_empty(&gaS0, ctx, GA_FLOAT, 3, &dims[0], GA_C_ORDER)); ga_assert_ok(GpuArray_empty(&gaD0, ctx, GA_FLOAT, 0, NULL, GA_C_ORDER)); - ga_assert_ok(GpuArray_empty(&gaD1, ctx, GA_SIZE, 0, NULL, GA_C_ORDER)); + ga_assert_ok(GpuArray_empty(&gaD1, ctx, GA_ULONG, 0, NULL, GA_C_ORDER)); ga_assert_ok(GpuArray_write(&gaS0, pS0, sizeof(*pS0)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD0, -1)); /* 0xFFFFFFFF is a qNaN. 
*/ @@ -991,7 +991,7 @@ START_TEST(test_argmax_reduction){ */ ga_assert_ok(GpuArray_empty(&gaS0, ctx, GA_FLOAT, 3, &dims[0], GA_C_ORDER)); - ga_assert_ok(GpuArray_empty(&gaD1, ctx, GA_SIZE, 1, &dims[1], GA_C_ORDER)); + ga_assert_ok(GpuArray_empty(&gaD1, ctx, GA_ULONG, 1, &dims[1], GA_C_ORDER)); ga_assert_ok(GpuArray_write(&gaS0, pS0, sizeof(*pS0)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD1, -1)); @@ -1092,7 +1092,7 @@ START_TEST(test_argmax_veryhighrank){ */ ga_assert_ok(GpuArray_empty(&gaS0, ctx, GA_FLOAT, 8, dims, GA_C_ORDER)); - ga_assert_ok(GpuArray_empty(&gaD1, ctx, GA_SIZE, 4, rdxDims, GA_C_ORDER)); + ga_assert_ok(GpuArray_empty(&gaD1, ctx, GA_ULONG, 4, rdxDims, GA_C_ORDER)); ga_assert_ok(GpuArray_write(&gaS0, pS0, sizeof(*pS0)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD1, -1)); @@ -1206,7 +1206,7 @@ START_TEST(test_argmax_alldimsreduced){ */ ga_assert_ok(GpuArray_empty(&gaS0, ctx, GA_FLOAT, 3, &dims[0], GA_C_ORDER)); - ga_assert_ok(GpuArray_empty(&gaD1, ctx, GA_SIZE, 0, NULL, GA_C_ORDER)); + ga_assert_ok(GpuArray_empty(&gaD1, ctx, GA_ULONG, 0, NULL, GA_C_ORDER)); ga_assert_ok(GpuArray_write(&gaS0, pS0, sizeof(*pS0)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD1, -1)); @@ -1306,7 +1306,7 @@ START_TEST(test_argmin_reduction){ */ ga_assert_ok(GpuArray_empty(&gaS0, ctx, GA_FLOAT, 3, &dims[0], GA_C_ORDER)); - ga_assert_ok(GpuArray_empty(&gaD1, ctx, GA_SIZE, 1, &dims[1], GA_C_ORDER)); + ga_assert_ok(GpuArray_empty(&gaD1, ctx, GA_ULONG, 1, &dims[1], GA_C_ORDER)); ga_assert_ok(GpuArray_write(&gaS0, pS0, sizeof(*pS0)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD1, -1)); @@ -1407,7 +1407,7 @@ START_TEST(test_argmin_veryhighrank){ */ ga_assert_ok(GpuArray_empty(&gaS0, ctx, GA_FLOAT, 8, dims, GA_C_ORDER)); - ga_assert_ok(GpuArray_empty(&gaD1, ctx, GA_SIZE, 4, rdxDims, GA_C_ORDER)); + ga_assert_ok(GpuArray_empty(&gaD1, ctx, GA_ULONG, 4, rdxDims, GA_C_ORDER)); ga_assert_ok(GpuArray_write(&gaS0, pS0, sizeof(*pS0)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD1, -1)); @@ -1521,7 +1521,7 @@ START_TEST(test_argmin_alldimsreduced){ */ ga_assert_ok(GpuArray_empty(&gaS0, ctx, GA_FLOAT, 3, &dims[0], GA_C_ORDER)); - ga_assert_ok(GpuArray_empty(&gaD1, ctx, GA_SIZE, 0, NULL, GA_C_ORDER)); + ga_assert_ok(GpuArray_empty(&gaD1, ctx, GA_ULONG, 0, NULL, GA_C_ORDER)); ga_assert_ok(GpuArray_write(&gaS0, pS0, sizeof(*pS0)*prodDims)); ga_assert_ok(GpuArray_memset(&gaD1, -1)); From ecde75cbec916b493a9530a65696806ebcdb863e Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Sat, 26 Aug 2017 20:27:54 -0400 Subject: [PATCH 33/34] Bugfix in GpuReduction_new(). 
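
Build the reduction object in a local pointer and publish it to *gr only once
reduxGenInit() has succeeded; on failure, free the partially-constructed
object and set *gr to NULL, so callers never see (or leak) a half-initialized
reduction. The resulting caller-side contract looks roughly like this
(illustrative sketch only, not code from this patch):

    GpuReduction* gr = NULL;

    if (GpuReduction_new(&gr, grAttr) != GA_NO_ERROR){
        /* gr is guaranteed to be NULL here; there is nothing to free. */
        /* ... handle the error ... */
    }else{
        /* ... use gr ..., then release it with GpuReduction_free(gr). */
    }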
--- src/gpuarray_reduction.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/gpuarray_reduction.c b/src/gpuarray_reduction.c index 88d5d9e5cf..9763d18b40 100644 --- a/src/gpuarray_reduction.c +++ b/src/gpuarray_reduction.c @@ -648,6 +648,7 @@ GPUARRAY_PUBLIC void GpuReductionAttr_free (GpuReductionAttr* } GPUARRAY_PUBLIC int GpuReduction_new (GpuReduction** gr, const GpuReductionAttr* grAttr){ + int ret; GpuReduction* grOut = NULL; if (!gr){ @@ -665,8 +666,16 @@ GPUARRAY_PUBLIC int GpuReduction_new (GpuReduction** grOut->ndd = (int)grAttr->maxDstDims; grOut->ndr = (int)(grAttr->maxSrcDims - grAttr->maxDstDims); - return reduxGenInit(grOut); + ret = reduxGenInit(grOut); + if(ret == GA_NO_ERROR){ + *gr = grOut; + }else{ + GpuReduction_free(grOut); + *gr = NULL; + } + return ret; }else{ + *gr = NULL; return GA_MEMORY_ERROR; } } From 79d3649f53f352e363c67085686db8fe98052d08 Mon Sep 17 00:00:00 2001 From: Olexa Bilaniuk Date: Sat, 26 Aug 2017 20:40:58 -0400 Subject: [PATCH 34/34] Bugfixes in check_reduction.c --- tests/check_reduction.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/check_reduction.c b/tests/check_reduction.c index ca52946fa5..973a348299 100644 --- a/tests/check_reduction.c +++ b/tests/check_reduction.c @@ -7,6 +7,7 @@ #include #include #include +#include extern void *ctx; @@ -18,7 +19,9 @@ void teardown(void); /* Defines */ #define MAXERRPRINT 16 #define ga_assert_ok(e) ck_assert_int_eq(e, GA_NO_ERROR) - +#ifndef ck_assert_ptr_nonnull +#define ck_assert_ptr_nonnull(p) ck_assert_msg((p), "Null Pointer!") +#endif
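
The ck_assert_ptr_nonnull() fallback defined above only kicks in when the
installed Check library does not already provide that macro, and is expressed
in terms of the existing ck_assert_msg(). A hypothetical use in the tests
(the buffer name and size below are illustrative, following the existing test
style) would be:

    float* pS0 = calloc(1, sizeof(*pS0) * prodDims);

    ck_assert_ptr_nonnull(pS0); /* fails the test cleanly if the allocation failed */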