Skip to content

Commit

Permalink
Merge pull request #21 from galapaegos/master
Browse files Browse the repository at this point in the history
Performance improvements for dalitz, pipipi, zachFit
  • Loading branch information
liang-sun authored Dec 17, 2016
2 parents 6e9f07b + c91b6f7 commit 062199f
Show file tree
Hide file tree
Showing 24 changed files with 348 additions and 232 deletions.
10 changes: 10 additions & 0 deletions GlobalCudaDefines.hh
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,11 @@ enum gooError {gooSuccess = 0, gooErrorMemoryAllocation};
#define DEVICE_VECTOR thrust::device_vector
#define MEMCPY(target, source, count, direction) cudaMemcpy(target, source, count, direction)
#define MEMCPY_TO_SYMBOL(target, source, count, offset, direction) cudaMemcpyToSymbol(target, source, count, offset, direction)
#ifdef TARGET_SM35
#define RO_CACHE(x) __ldg(&x)
#else
#define RO_CACHE(x)
#endif
#define GET_FUNCTION_ADDR(fname) cudaMemcpyFromSymbol((void**) &host_fcn_ptr, fname, sizeof(void*))
#define MEMCPY_FROM_SYMBOL(target, source, count, offset, direction) cudaMemcpyFromSymbol(target, source, count, offset, direction)
// For CUDA case, just use existing errors, renamed
Expand Down Expand Up @@ -84,6 +89,11 @@ typedef double fptype;
#define MODF modf
#define SIN sin
#define SQRT sqrt
#ifdef TARGET_SM35
#define RSQRT rsqrt
#else
#define RSQRT 1.0/SQRT
#endif
#define FLOOR floor
#define POW pow
#else
Expand Down
3 changes: 2 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@ endif

ifeq ($(TARGET_OMP),)
# nvcc (CUDA)
CXXFLAGS += -arch=sm_20
CXXFLAGS += -DTARGET_SM35 -arch=sm_35
#CXXFLAGS += -arch=sm_20
else
# OpenMP common flags
DEFINEFLAGS += -fno-inline -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_BACKEND_OMP
Expand Down
22 changes: 11 additions & 11 deletions PDFs/AddPdf.cu
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
#include "AddPdf.hh"

EXEC_TARGET fptype device_AddPdfs (fptype* evt, fptype* p, unsigned int* indices) {
int numParameters = indices[0];
int numParameters = RO_CACHE(indices[0]);
fptype ret = 0;
fptype totalWeight = 0;
for (int i = 1; i < numParameters-3; i += 3) {
totalWeight += p[indices[i+2]];
fptype curr = callFunction(evt, indices[i], indices[i+1]);
fptype weight = p[indices[i+2]];
ret += weight * curr * normalisationFactors[indices[i+1]];
totalWeight += RO_CACHE(p[RO_CACHE(indices[i+2])]);
fptype curr = callFunction(evt, RO_CACHE(indices[i]), RO_CACHE(indices[i+1]));
fptype weight = RO_CACHE(p[RO_CACHE(indices[i+2])]);
ret += weight * curr * normalisationFactors[RO_CACHE(indices[i+1])];

//if ((gpuDebug & 1) && (0 == THREADIDX) && (0 == BLOCKIDX))
//if ((1 > (int) floor(0.5 + evt[8])) && (gpuDebug & 1) && (paramIndices + debugParamIndex == indices))
Expand All @@ -19,8 +19,8 @@ EXEC_TARGET fptype device_AddPdfs (fptype* evt, fptype* p, unsigned int* indices
// nP | F P w | F P
// in which nP = 5. Therefore the parameter index for the last function pointer is nP, and the function index is nP-1.
//fptype last = (*(reinterpret_cast<device_function_ptr>(device_function_table[indices[numParameters-1]])))(evt, p, paramIndices + indices[numParameters]);
fptype last = callFunction(evt, indices[numParameters - 1], indices[numParameters]);
ret += (1 - totalWeight) * last * normalisationFactors[indices[numParameters]];
fptype last = callFunction(evt, RO_CACHE(indices[numParameters - 1]), RO_CACHE(indices[numParameters]));
ret += (1 - totalWeight) * last * normalisationFactors[RO_CACHE(indices[numParameters])];

//if ((THREADIDX < 50) && (isnan(ret))) printf("NaN final component %f %f\n", last, totalWeight);

Expand All @@ -36,14 +36,14 @@ EXEC_TARGET fptype device_AddPdfsExt (fptype* evt, fptype* p, unsigned int* indi
// nP | F P w | F P w
// in which nP = 6.

int numParameters = indices[0];
int numParameters = RO_CACHE(indices[0]);
fptype ret = 0;
fptype totalWeight = 0;
for (int i = 1; i < numParameters; i += 3) {
//fptype curr = (*(reinterpret_cast<device_function_ptr>(device_function_table[indices[i]])))(evt, p, paramIndices + indices[i+1]);
fptype curr = callFunction(evt, indices[i], indices[i+1]);
fptype weight = p[indices[i+2]];
ret += weight * curr * normalisationFactors[indices[i+1]];
fptype curr = callFunction(evt, RO_CACHE(indices[i]), RO_CACHE(indices[i+1]));
fptype weight = RO_CACHE(p[RO_CACHE(indices[i+2])]);
ret += weight * curr * normalisationFactors[RO_CACHE(indices[i+1])];

totalWeight += weight;
//if ((gpuDebug & 1) && (THREADIDX == 0) && (0 == BLOCKIDX))
Expand Down
8 changes: 4 additions & 4 deletions PDFs/CompositePdf.cu
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
#include "CompositePdf.hh"

EXEC_TARGET fptype device_Composite (fptype* evt, fptype* p, unsigned int* indices) {
unsigned int coreFcnIndex = indices[1];
unsigned int coreParIndex = indices[2];
unsigned int shellFcnIndex = indices[3];
unsigned int shellParIndex = indices[4];
unsigned int coreFcnIndex = RO_CACHE(indices[1]);
unsigned int coreParIndex = RO_CACHE(indices[2]);
unsigned int shellFcnIndex = RO_CACHE(indices[3]);
unsigned int shellParIndex = RO_CACHE(indices[4]);

// NB, not normalising core function, it is not being used as a PDF.
//fptype coreValue = (*(reinterpret_cast<device_function_ptr>(device_function_table[coreFcnIndex])))(evt, cudaArray, paramIndices+coreParIndex);
Expand Down
16 changes: 9 additions & 7 deletions PDFs/ConvolutionPdf.cu
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,9 @@ MEM_CONSTANT int modelOffset[100];

EXEC_TARGET fptype device_ConvolvePdfs (fptype* evt, fptype* p, unsigned int* indices) {
fptype ret = 0;
fptype loBound = functorConstants[indices[5]+0];
fptype hiBound = functorConstants[indices[5]+1];
fptype step = functorConstants[indices[5]+2];
fptype loBound = RO_CACHE(functorConstants[RO_CACHE(indices[5])+0]);
fptype hiBound = RO_CACHE(functorConstants[RO_CACHE(indices[5])+1]);
fptype step = RO_CACHE(functorConstants[RO_CACHE(indices[5])+2]);
fptype x0 = evt[indices[2 + indices[0]]];
int workSpaceIndex = indices[6];

Expand All @@ -30,14 +30,16 @@ EXEC_TARGET fptype device_ConvolvePdfs (fptype* evt, fptype* p, unsigned int* in
int offsetInBins = (int) FLOOR(x0 / step - lowerBoundOffset);

// Brute-force calculate integral M(x) * R(x - x0) dx
int offset = RO_CACHE(modelOffset[workSpaceIndex]);
for (int i = 0; i < numbins; ++i) {
fptype model = dev_modWorkSpace[workSpaceIndex][i];
fptype resol = dev_resWorkSpace[workSpaceIndex][i + modelOffset[workSpaceIndex] - offsetInBins];
fptype model = RO_CACHE(dev_modWorkSpace[workSpaceIndex][i]);
fptype resol = RO_CACHE(dev_resWorkSpace[workSpaceIndex][i + offset - offsetInBins]);

ret += model*resol;
}

ret *= normalisationFactors[indices[2]];
ret *= normalisationFactors[indices[4]];
ret *= normalisationFactors[RO_CACHE(indices[2])];
ret *= normalisationFactors[RO_CACHE(indices[4])];

return ret;
}
Expand Down
2 changes: 1 addition & 1 deletion PDFs/DalitzPlotHelpers.hh
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

#include "ResonancePdf.hh"

EXEC_TARGET bool inDalitz (fptype m12, fptype m13, fptype bigM, fptype dm1, fptype dm2, fptype dm3);
EXEC_TARGET bool inDalitz (const fptype &m12, const fptype &m13, const fptype &bigM, const fptype &dm1, const fptype &dm2, const fptype &dm3);
EXEC_TARGET devcomplex<fptype> getResonanceAmplitude (fptype m12, fptype m13, fptype m23,
unsigned int functionIdx, unsigned int pIndex);

Expand Down
88 changes: 56 additions & 32 deletions PDFs/DalitzPlotPdf.cu
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@ const int resonanceOffset_DP = 4; // Offset of the first resonance into the para
// waves are recalculated when the corresponding resonance mass or width
// changes. Note that in a multithread environment each thread needs its
// own cache, hence the '10'. Ten threads should be enough for anyone!
MEM_DEVICE devcomplex<fptype>* cResonances[10];

//NOTE: This is does not support ten instances (ten threads) of resoncances now, only one set of resonances.
MEM_DEVICE devcomplex<fptype>* cResonances[16];

EXEC_TARGET inline int parIndexFromResIndex_DP (int resIndex) {
return resonanceOffset_DP + resIndex*resonanceSize;
Expand All @@ -25,58 +27,68 @@ EXEC_TARGET devcomplex<fptype> device_DalitzPlot_calcIntegrals (fptype m12, fpty
// observed points, that's why it doesn't use
// cResonances. No need to cache the values at individual
// grid points - we only care about totals.
fptype motherMass = functorConstants[indices[1] + 0];
fptype daug1Mass = functorConstants[indices[1] + 1];
fptype daug2Mass = functorConstants[indices[1] + 2];
fptype daug3Mass = functorConstants[indices[1] + 3];
fptype motherMass = RO_CACHE(functorConstants[RO_CACHE(indices[1]) + 0]);
fptype daug1Mass = RO_CACHE(functorConstants[RO_CACHE(indices[1]) + 1]);
fptype daug2Mass = RO_CACHE(functorConstants[RO_CACHE(indices[1]) + 2]);
fptype daug3Mass = RO_CACHE(functorConstants[RO_CACHE(indices[1]) + 3]);

devcomplex<fptype> ret;
if (!inDalitz(m12, m13, motherMass, daug1Mass, daug2Mass, daug3Mass)) return ret;
fptype m23 = motherMass*motherMass + daug1Mass*daug1Mass + daug2Mass*daug2Mass + daug3Mass*daug3Mass - m12 - m13;

int parameter_i = parIndexFromResIndex_DP(res_i);
unsigned int functn_i = indices[parameter_i+2];
unsigned int params_i = indices[parameter_i+3];
unsigned int functn_i = RO_CACHE(indices[parameter_i+2]);
unsigned int params_i = RO_CACHE(indices[parameter_i+3]);
ret = getResonanceAmplitude(m12, m13, m23, functn_i, params_i);

int parameter_j = parIndexFromResIndex_DP(res_j);
unsigned int functn_j = indices[parameter_j+2];
unsigned int params_j = indices[parameter_j+3];
unsigned int functn_j = RO_CACHE(indices[parameter_j+2]);
unsigned int params_j = RO_CACHE(indices[parameter_j+3]);
ret *= conj(getResonanceAmplitude(m12, m13, m23, functn_j, params_j));

return ret;
}

EXEC_TARGET fptype device_DalitzPlot (fptype* evt, fptype* p, unsigned int* indices) {
fptype motherMass = functorConstants[indices[1] + 0];
fptype daug1Mass = functorConstants[indices[1] + 1];
fptype daug2Mass = functorConstants[indices[1] + 2];
fptype daug3Mass = functorConstants[indices[1] + 3];
fptype motherMass = RO_CACHE(functorConstants[RO_CACHE(indices[1]) + 0]);
fptype daug1Mass = RO_CACHE(functorConstants[RO_CACHE(indices[1]) + 1]);
fptype daug2Mass = RO_CACHE(functorConstants[RO_CACHE(indices[1]) + 2]);
fptype daug3Mass = RO_CACHE(functorConstants[RO_CACHE(indices[1]) + 3]);

fptype m12 = evt[indices[2 + indices[0]]];
fptype m13 = evt[indices[3 + indices[0]]];
fptype m12 = RO_CACHE(evt[RO_CACHE(indices[2 + RO_CACHE(indices[0])])]);
fptype m13 = RO_CACHE(evt[RO_CACHE(indices[3 + RO_CACHE(indices[0])])]);

if (!inDalitz(m12, m13, motherMass, daug1Mass, daug2Mass, daug3Mass)) return 0;
int evtNum = (int) FLOOR(0.5 + evt[indices[4 + indices[0]]]);

fptype evtIndex = RO_CACHE(evt[RO_CACHE(indices[4 + RO_CACHE(indices[0])])]);

int evtNum = (int) FLOOR(0.5 + evtIndex);

devcomplex<fptype> totalAmp(0, 0);
unsigned int numResonances = indices[2];
unsigned int cacheToUse = indices[3];
unsigned int numResonances = RO_CACHE(indices[2]);
unsigned int cacheToUse = RO_CACHE(indices[3]);

for (int i = 0; i < numResonances; ++i) {
int paramIndex = parIndexFromResIndex_DP(i);
fptype amp_real = p[indices[paramIndex+0]];
fptype amp_imag = p[indices[paramIndex+1]];

devcomplex<fptype> matrixelement((cResonances[cacheToUse][evtNum*numResonances + i]).real,
(cResonances[cacheToUse][evtNum*numResonances + i]).imag);
fptype amp_real = RO_CACHE(p[RO_CACHE(indices[paramIndex+0])]);
fptype amp_imag = RO_CACHE(p[RO_CACHE(indices[paramIndex+1])]);

//potential performance improvement by
//double2 me = RO_CACHE(reinterpret_cast<double2*> (cResonances[i][evtNum]));
fptype me_real = RO_CACHE(cResonances[i][evtNum].real);
fptype me_imag = RO_CACHE(cResonances[i][evtNum].imag);

//devcomplex<fptype> matrixelement((cResonances[cacheToUse][evtNum*numResonances + i]).real,
// (cResonances[cacheToUse][evtNum*numResonances + i]).imag);
//devcomplex<fptype> matrixelement (me[0], me[1]);
devcomplex<fptype> matrixelement (me_real, me_imag);
matrixelement.multiply(amp_real, amp_imag);
totalAmp += matrixelement;
}

fptype ret = norm2(totalAmp);
int effFunctionIdx = parIndexFromResIndex_DP(numResonances);
fptype eff = callFunction(evt, indices[effFunctionIdx], indices[effFunctionIdx + 1]);
fptype eff = callFunction(evt, RO_CACHE(indices[effFunctionIdx]), RO_CACHE(indices[effFunctionIdx + 1]));
ret *= eff;

//printf("DalitzPlot evt %i zero: %i %i %f (%f, %f).\n", evtNum, numResonances, effFunctionIdx, eff, totalAmp.real, totalAmp.imag);
Expand All @@ -97,7 +109,7 @@ __host__ DalitzPlotPdf::DalitzPlotPdf (std::string n,
, _m12(m12)
, _m13(m13)
, dalitzNormRange(0)
, cachedWaves(0)
//, cachedWaves(0)
, integrals(0)
, forceRedoIntegrals(true)
, totalEventSize(3) // Default 3 = m12, m13, evtNum
Expand All @@ -110,6 +122,9 @@ __host__ DalitzPlotPdf::DalitzPlotPdf (std::string n,
registerObservable(eventNumber);

fptype decayConstants[5];

for (int i = 0; i < 16; i++)
cachedWaves[i] = 0;

std::vector<unsigned int> pindices;
pindices.push_back(registerConstants(5));
Expand Down Expand Up @@ -170,12 +185,21 @@ __host__ void DalitzPlotPdf::setDataSize (unsigned int dataSize, unsigned int ev
totalEventSize = evtSize;
assert(totalEventSize >= 3);

if (cachedWaves) delete cachedWaves;
//if (cachedWaves) delete cachedWaves;
if (cachedWaves[0])
{
for (int i = 0; i < 16; i++)
delete cachedWaves[i];
}

numEntries = dataSize;
cachedWaves = new DEVICE_VECTOR<devcomplex<fptype> >(dataSize*decayInfo->resonances.size());
void* dummy = thrust::raw_pointer_cast(cachedWaves->data());
MEMCPY_TO_SYMBOL(cResonances, &dummy, sizeof(devcomplex<fptype>*), cacheToUse*sizeof(devcomplex<fptype>*), cudaMemcpyHostToDevice);

for (int i = 0; i < 16; i++)
{
cachedWaves[i] = new DEVICE_VECTOR<devcomplex<fptype> >(dataSize);
void* dummy = thrust::raw_pointer_cast(cachedWaves[i]->data());
MEMCPY_TO_SYMBOL(cResonances, &dummy, sizeof(devcomplex<fptype>*), i*sizeof(devcomplex<fptype>*), cudaMemcpyHostToDevice);
}
setForceIntegrals();
}

Expand Down Expand Up @@ -224,9 +248,9 @@ __host__ fptype DalitzPlotPdf::normalise () const {
if (redoIntegral[i]) {
thrust::transform(thrust::make_zip_iterator(thrust::make_tuple(eventIndex, dataArray, eventSize)),
thrust::make_zip_iterator(thrust::make_tuple(eventIndex + numEntries, arrayAddress, eventSize)),
strided_range<DEVICE_VECTOR<devcomplex<fptype> >::iterator>(cachedWaves->begin() + i,
cachedWaves->end(),
decayInfo->resonances.size()).begin(),
strided_range<DEVICE_VECTOR<devcomplex<fptype> >::iterator>(cachedWaves[i]->begin(),
cachedWaves[i]->end(),
1).begin(),
*(calculators[i]));
}

Expand Down
2 changes: 1 addition & 1 deletion PDFs/DalitzPlotPdf.hh
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ private:

// Following variables are useful if masses and widths, involved in difficult BW calculation,
// change infrequently while amplitudes, only used in adding BW results together, change rapidly.
DEVICE_VECTOR<devcomplex<fptype> >* cachedWaves; // Caches the BW values for each event.
DEVICE_VECTOR<devcomplex<fptype> >* cachedWaves[16]; // Caches the BW values for each event.
devcomplex<fptype>*** integrals; // Caches the integrals of the BW waves for each combination of resonances.

bool* redoIntegral;
Expand Down
18 changes: 9 additions & 9 deletions PDFs/DalitzVetoPdf.cu
Original file line number Diff line number Diff line change
Expand Up @@ -2,23 +2,23 @@
#include "DalitzPlotHelpers.hh"

EXEC_TARGET fptype device_DalitzVeto (fptype* evt, fptype* p, unsigned int* indices) {
fptype x = evt[indices[2 + indices[0] + 0]];
fptype y = evt[indices[2 + indices[0] + 1]];
fptype x = evt[RO_CACHE(indices[2 + RO_CACHE(indices[0]) + 0])];
fptype y = evt[RO_CACHE(indices[2 + RO_CACHE(indices[0]) + 1])];

fptype motherM = p[indices[1]];
fptype d1m = p[indices[2]];
fptype d2m = p[indices[3]];
fptype d3m = p[indices[4]];
fptype motherM = RO_CACHE(p[RO_CACHE(indices[1])]);
fptype d1m = RO_CACHE(p[RO_CACHE(indices[2])]);
fptype d2m = RO_CACHE(p[RO_CACHE(indices[3])]);
fptype d3m = RO_CACHE(p[RO_CACHE(indices[4])]);

fptype massSum = motherM*motherM + d1m*d1m + d2m*d2m + d3m*d3m;
fptype z = massSum - x - y;

fptype ret = inDalitz(x, y, motherM, d1m, d2m, d3m) ? 1.0 : 0.0;
unsigned int numVetos = indices[5];
unsigned int numVetos = RO_CACHE(indices[5]);
for (int i = 0; i < numVetos; ++i) {
unsigned int varIndex = indices[6 + i*3 + 0];
fptype minimum = p[indices[6 + i*3 + 1]];
fptype maximum = p[indices[6 + i*3 + 2]];
fptype minimum = RO_CACHE(p[RO_CACHE(indices[6 + i*3 + 1])]);
fptype maximum = RO_CACHE(p[RO_CACHE(indices[6 + i*3 + 2])]);
fptype currDalitzVar = (PAIR_12 == varIndex ? x : PAIR_13 == varIndex ? y : z);

ret *= ((currDalitzVar < maximum) && (currDalitzVar > minimum)) ? 0.0 : 1.0;
Expand Down
20 changes: 10 additions & 10 deletions PDFs/EventWeightedAddPdf.cu
Original file line number Diff line number Diff line change
@@ -1,22 +1,22 @@
#include "EventWeightedAddPdf.hh"

EXEC_TARGET fptype device_EventWeightedAddPdfs (fptype* evt, fptype* p, unsigned int* indices) {
int numParameters = indices[0];
int numParameters = RO_CACHE(indices[0]);
fptype ret = 0;
fptype totalWeight = 0;

for (int i = 0; i < numParameters/2 - 1; ++i) {
fptype weight = evt[indices[2 + numParameters + i]];
fptype weight = RO_CACHE(evt[RO_CACHE(indices[2 + numParameters + i])]);
totalWeight += weight;
fptype curr = callFunction(evt, indices[2*i + 1], indices[2*(i+1)]);
ret += weight * curr * normalisationFactors[indices[2*(i+1)]];
fptype curr = callFunction(evt, RO_CACHE(indices[2*i + 1]), RO_CACHE(indices[2*(i+1)]));
ret += weight * curr * normalisationFactors[RO_CACHE(indices[2*(i+1)])];
}
// numParameters does not count itself. So the array structure for two functions is
// nP | F P | F P | nO | o1
// in which nP = 4. and nO = 1. Therefore the parameter index for the last function pointer is nP, and the function index is nP-1.
//fptype last = (*(reinterpret_cast<device_function_ptr>(device_function_table[indices[numParameters-1]])))(evt, p, paramIndices + indices[numParameters]);
fptype last = callFunction(evt, indices[numParameters-1], indices[numParameters]);
ret += (1 - totalWeight) * last * normalisationFactors[indices[numParameters]];
fptype last = callFunction(evt, RO_CACHE(indices[numParameters-1]), RO_CACHE(indices[numParameters]));
ret += (1 - totalWeight) * last * normalisationFactors[RO_CACHE(indices[numParameters])];

return ret;
}
Expand All @@ -26,14 +26,14 @@ EXEC_TARGET fptype device_EventWeightedAddPdfsExt (fptype* evt, fptype* p, unsig
// nP | F P | F P | nO | o1 o2
// in which nP = 4, nO = 2.

int numParameters = indices[0];
int numParameters = RO_CACHE(indices[0]);
fptype ret = 0;
fptype totalWeight = 0;
for (int i = 0; i < numParameters/2; ++i) {
fptype curr = callFunction(evt, indices[2*i + 1], indices[2*(i+1)]);
fptype curr = callFunction(evt, RO_CACHE(indices[2*i + 1]), RO_CACHE(indices[2*(i+1)]));
//if ((0 == BLOCKIDX) && (THREADIDX < 5) && (isnan(curr))) printf("NaN component %i %i\n", i, THREADIDX);
fptype weight = evt[indices[2 + numParameters + i]];
ret += weight * curr * normalisationFactors[indices[2*(i+1)]];
fptype weight = RO_CACHE(evt[RO_CACHE(indices[2 + numParameters + i])]);
ret += weight * curr * normalisationFactors[RO_CACHE(indices[2*(i+1)])];
totalWeight += weight;

//if ((gpuDebug & 1) && (0 == THREADIDX))
Expand Down
Loading

0 comments on commit 062199f

Please sign in to comment.