Merge pull request #21 from galapaegos/master

Performance improvements for dalitz, pipipi, zachFit
GooFit · Dec 17, 2016 · 062199f · 062199f
2 parents 6e9f07b + c91b6f7
commit 062199f
Show file tree

Hide file tree

Showing 24 changed files with 348 additions and 232 deletions.
diff --git a/GlobalCudaDefines.hh b/GlobalCudaDefines.hh
@@ -47,6 +47,11 @@ enum gooError {gooSuccess = 0, gooErrorMemoryAllocation};
 #define DEVICE_VECTOR thrust::device_vector
 #define MEMCPY(target, source, count, direction) cudaMemcpy(target, source, count, direction) 
 #define MEMCPY_TO_SYMBOL(target, source, count, offset, direction) cudaMemcpyToSymbol(target, source, count, offset, direction)
+#ifdef TARGET_SM35
+#define RO_CACHE(x) __ldg(&x)
+#else
+#define RO_CACHE(x)
+#endif
 #define GET_FUNCTION_ADDR(fname) cudaMemcpyFromSymbol((void**) &host_fcn_ptr, fname, sizeof(void*))
 #define MEMCPY_FROM_SYMBOL(target, source, count, offset, direction) cudaMemcpyFromSymbol(target, source, count, offset, direction)
 // For CUDA case, just use existing errors, renamed
@@ -84,6 +89,11 @@ typedef double fptype;
 #define MODF modf
 #define SIN sin
 #define SQRT sqrt
+#ifdef TARGET_SM35
+#define RSQRT rsqrt
+#else
+#define RSQRT 1.0/SQRT
+#endif
 #define FLOOR floor
 #define POW pow
 #else 

diff --git a/Makefile b/Makefile
@@ -31,7 +31,8 @@ endif
 
 ifeq ($(TARGET_OMP),)
 # nvcc (CUDA)
-CXXFLAGS += -arch=sm_20
+CXXFLAGS += -DTARGET_SM35 -arch=sm_35
+#CXXFLAGS += -arch=sm_20
 else
 # OpenMP common flags
 DEFINEFLAGS += -fno-inline -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_BACKEND_OMP

diff --git a/PDFs/AddPdf.cu b/PDFs/AddPdf.cu
@@ -1,14 +1,14 @@
 #include "AddPdf.hh"
 
 EXEC_TARGET fptype device_AddPdfs (fptype* evt, fptype* p, unsigned int* indices) { 
-  int numParameters = indices[0]; 
+  int numParameters = RO_CACHE(indices[0]); 
   fptype ret = 0;
   fptype totalWeight = 0; 
   for (int i = 1; i < numParameters-3; i += 3) {
-    totalWeight += p[indices[i+2]];
-    fptype curr = callFunction(evt, indices[i], indices[i+1]); 
-    fptype weight = p[indices[i+2]];
-    ret += weight * curr * normalisationFactors[indices[i+1]]; 
+    totalWeight += RO_CACHE(p[RO_CACHE(indices[i+2])]);
+    fptype curr = callFunction(evt, RO_CACHE(indices[i]), RO_CACHE(indices[i+1])); 
+    fptype weight = RO_CACHE(p[RO_CACHE(indices[i+2])]);
+    ret += weight * curr * normalisationFactors[RO_CACHE(indices[i+1])]; 
 
     //if ((gpuDebug & 1) && (0 == THREADIDX) && (0 == BLOCKIDX)) 
     //if ((1 > (int) floor(0.5 + evt[8])) && (gpuDebug & 1) && (paramIndices + debugParamIndex == indices))
@@ -19,8 +19,8 @@ EXEC_TARGET fptype device_AddPdfs (fptype* evt, fptype* p, unsigned int* indices
   // nP | F P w | F P
   // in which nP = 5. Therefore the parameter index for the last function pointer is nP, and the function index is nP-1. 
   //fptype last = (*(reinterpret_cast<device_function_ptr>(device_function_table[indices[numParameters-1]])))(evt, p, paramIndices + indices[numParameters]);
-  fptype last = callFunction(evt, indices[numParameters - 1], indices[numParameters]);
-  ret += (1 - totalWeight) * last * normalisationFactors[indices[numParameters]]; 
+  fptype last = callFunction(evt, RO_CACHE(indices[numParameters - 1]), RO_CACHE(indices[numParameters]));
+  ret += (1 - totalWeight) * last * normalisationFactors[RO_CACHE(indices[numParameters])]; 
 
   //if ((THREADIDX < 50) && (isnan(ret))) printf("NaN final component %f %f\n", last, totalWeight); 
 
@@ -36,14 +36,14 @@ EXEC_TARGET fptype device_AddPdfsExt (fptype* evt, fptype* p, unsigned int* indi
   // nP | F P w | F P w
   // in which nP = 6. 
 
-  int numParameters = indices[0]; 
+  int numParameters = RO_CACHE(indices[0]); 
   fptype ret = 0;
   fptype totalWeight = 0; 
   for (int i = 1; i < numParameters; i += 3) {    
     //fptype curr = (*(reinterpret_cast<device_function_ptr>(device_function_table[indices[i]])))(evt, p, paramIndices + indices[i+1]);
-    fptype curr = callFunction(evt, indices[i], indices[i+1]); 
-    fptype weight = p[indices[i+2]];
-    ret += weight * curr * normalisationFactors[indices[i+1]]; 
+    fptype curr = callFunction(evt, RO_CACHE(indices[i]), RO_CACHE(indices[i+1])); 
+    fptype weight = RO_CACHE(p[RO_CACHE(indices[i+2])]);
+    ret += weight * curr * normalisationFactors[RO_CACHE(indices[i+1])]; 
 
     totalWeight += weight; 
     //if ((gpuDebug & 1) && (THREADIDX == 0) && (0 == BLOCKIDX)) 

diff --git a/PDFs/CompositePdf.cu b/PDFs/CompositePdf.cu
@@ -1,10 +1,10 @@
 #include "CompositePdf.hh"
 
 EXEC_TARGET fptype device_Composite (fptype* evt, fptype* p, unsigned int* indices) {
-  unsigned int coreFcnIndex  = indices[1];
-  unsigned int coreParIndex  = indices[2];
-  unsigned int shellFcnIndex = indices[3];
-  unsigned int shellParIndex = indices[4];
+  unsigned int coreFcnIndex  = RO_CACHE(indices[1]);
+  unsigned int coreParIndex  = RO_CACHE(indices[2]);
+  unsigned int shellFcnIndex = RO_CACHE(indices[3]);
+  unsigned int shellParIndex = RO_CACHE(indices[4]);
 
   // NB, not normalising core function, it is not being used as a PDF. 
   //fptype coreValue = (*(reinterpret_cast<device_function_ptr>(device_function_table[coreFcnIndex])))(evt, cudaArray, paramIndices+coreParIndex);

diff --git a/PDFs/ConvolutionPdf.cu b/PDFs/ConvolutionPdf.cu
@@ -17,9 +17,9 @@ MEM_CONSTANT int modelOffset[100];
 
 EXEC_TARGET fptype device_ConvolvePdfs (fptype* evt, fptype* p, unsigned int* indices) { 
   fptype ret     = 0; 
-  fptype loBound = functorConstants[indices[5]+0];
-  fptype hiBound = functorConstants[indices[5]+1];
-  fptype step    = functorConstants[indices[5]+2];
+  fptype loBound = RO_CACHE(functorConstants[RO_CACHE(indices[5])+0]);
+  fptype hiBound = RO_CACHE(functorConstants[RO_CACHE(indices[5])+1]);
+  fptype step    = RO_CACHE(functorConstants[RO_CACHE(indices[5])+2]);
   fptype x0      = evt[indices[2 + indices[0]]]; 
   int workSpaceIndex = indices[6]; 
 
@@ -30,14 +30,16 @@ EXEC_TARGET fptype device_ConvolvePdfs (fptype* evt, fptype* p, unsigned int* in
   int offsetInBins = (int) FLOOR(x0 / step - lowerBoundOffset); 
 
   // Brute-force calculate integral M(x) * R(x - x0) dx
+  int offset = RO_CACHE(modelOffset[workSpaceIndex]);
   for (int i = 0; i < numbins; ++i) {
-    fptype model = dev_modWorkSpace[workSpaceIndex][i]; 
-    fptype resol = dev_resWorkSpace[workSpaceIndex][i + modelOffset[workSpaceIndex] - offsetInBins]; 
+    fptype model = RO_CACHE(dev_modWorkSpace[workSpaceIndex][i]); 
+    fptype resol = RO_CACHE(dev_resWorkSpace[workSpaceIndex][i + offset - offsetInBins]); 
+
     ret += model*resol;
   }
 
-  ret *= normalisationFactors[indices[2]]; 
-  ret *= normalisationFactors[indices[4]]; 
+  ret *= normalisationFactors[RO_CACHE(indices[2])]; 
+  ret *= normalisationFactors[RO_CACHE(indices[4])]; 
 
   return ret; 
 }

diff --git a/PDFs/DalitzPlotHelpers.hh b/PDFs/DalitzPlotHelpers.hh
@@ -3,7 +3,7 @@
 
 #include "ResonancePdf.hh"
 
-EXEC_TARGET bool inDalitz (fptype m12, fptype m13, fptype bigM, fptype dm1, fptype dm2, fptype dm3); 
+EXEC_TARGET bool inDalitz (const fptype &m12, const fptype &m13, const fptype &bigM, const fptype &dm1, const fptype &dm2, const fptype &dm3); 
 EXEC_TARGET devcomplex<fptype> getResonanceAmplitude (fptype m12, fptype m13, fptype m23, 
 						      unsigned int functionIdx, unsigned int pIndex); 
 

diff --git a/PDFs/DalitzPlotPdf.cu b/PDFs/DalitzPlotPdf.cu
@@ -11,7 +11,9 @@ const int resonanceOffset_DP = 4; // Offset of the first resonance into the para
 // waves are recalculated when the corresponding resonance mass or width 
 // changes. Note that in a multithread environment each thread needs its
 // own cache, hence the '10'. Ten threads should be enough for anyone! 
-MEM_DEVICE devcomplex<fptype>* cResonances[10]; 
+
+//NOTE: This is does not support ten instances (ten threads) of resoncances now, only one set of resonances.
+MEM_DEVICE devcomplex<fptype>* cResonances[16]; 
 
 EXEC_TARGET inline int parIndexFromResIndex_DP (int resIndex) {
   return resonanceOffset_DP + resIndex*resonanceSize; 
@@ -25,58 +27,68 @@ EXEC_TARGET devcomplex<fptype> device_DalitzPlot_calcIntegrals (fptype m12, fpty
   // observed points, that's why it doesn't use 
   // cResonances. No need to cache the values at individual
   // grid points - we only care about totals. 
-  fptype motherMass = functorConstants[indices[1] + 0]; 
-  fptype daug1Mass  = functorConstants[indices[1] + 1]; 
-  fptype daug2Mass  = functorConstants[indices[1] + 2]; 
-  fptype daug3Mass  = functorConstants[indices[1] + 3];  
+  fptype motherMass = RO_CACHE(functorConstants[RO_CACHE(indices[1]) + 0]); 
+  fptype daug1Mass  = RO_CACHE(functorConstants[RO_CACHE(indices[1]) + 1]); 
+  fptype daug2Mass  = RO_CACHE(functorConstants[RO_CACHE(indices[1]) + 2]); 
+  fptype daug3Mass  = RO_CACHE(functorConstants[RO_CACHE(indices[1]) + 3]);  
 
   devcomplex<fptype> ret; 
   if (!inDalitz(m12, m13, motherMass, daug1Mass, daug2Mass, daug3Mass)) return ret;
   fptype m23 = motherMass*motherMass + daug1Mass*daug1Mass + daug2Mass*daug2Mass + daug3Mass*daug3Mass - m12 - m13; 
 
   int parameter_i = parIndexFromResIndex_DP(res_i);
-  unsigned int functn_i = indices[parameter_i+2];
-  unsigned int params_i = indices[parameter_i+3];
+  unsigned int functn_i = RO_CACHE(indices[parameter_i+2]);
+  unsigned int params_i = RO_CACHE(indices[parameter_i+3]);
   ret = getResonanceAmplitude(m12, m13, m23, functn_i, params_i);
 
   int parameter_j = parIndexFromResIndex_DP(res_j);
-  unsigned int functn_j = indices[parameter_j+2];
-  unsigned int params_j = indices[parameter_j+3];
+  unsigned int functn_j = RO_CACHE(indices[parameter_j+2]);
+  unsigned int params_j = RO_CACHE(indices[parameter_j+3]);
   ret *= conj(getResonanceAmplitude(m12, m13, m23, functn_j, params_j));
 
   return ret; 
 }
 
 EXEC_TARGET fptype device_DalitzPlot (fptype* evt, fptype* p, unsigned int* indices) {
-  fptype motherMass = functorConstants[indices[1] + 0]; 
-  fptype daug1Mass  = functorConstants[indices[1] + 1]; 
-  fptype daug2Mass  = functorConstants[indices[1] + 2]; 
-  fptype daug3Mass  = functorConstants[indices[1] + 3]; 
+  fptype motherMass = RO_CACHE(functorConstants[RO_CACHE(indices[1]) + 0]); 
+  fptype daug1Mass  = RO_CACHE(functorConstants[RO_CACHE(indices[1]) + 1]); 
+  fptype daug2Mass  = RO_CACHE(functorConstants[RO_CACHE(indices[1]) + 2]); 
+  fptype daug3Mass  = RO_CACHE(functorConstants[RO_CACHE(indices[1]) + 3]); 
 
-  fptype m12 = evt[indices[2 + indices[0]]]; 
-  fptype m13 = evt[indices[3 + indices[0]]];
+  fptype m12 = RO_CACHE(evt[RO_CACHE(indices[2 + RO_CACHE(indices[0])])]);
+  fptype m13 = RO_CACHE(evt[RO_CACHE(indices[3 + RO_CACHE(indices[0])])]);
 
   if (!inDalitz(m12, m13, motherMass, daug1Mass, daug2Mass, daug3Mass)) return 0; 
-  int evtNum = (int) FLOOR(0.5 + evt[indices[4 + indices[0]]]); 
+
+  fptype evtIndex = RO_CACHE(evt[RO_CACHE(indices[4 + RO_CACHE(indices[0])])]);
+
+  int evtNum = (int) FLOOR(0.5 + evtIndex); 
 
   devcomplex<fptype> totalAmp(0, 0);
-  unsigned int numResonances = indices[2]; 
-  unsigned int cacheToUse    = indices[3]; 
+  unsigned int numResonances = RO_CACHE(indices[2]); 
+  unsigned int cacheToUse    = RO_CACHE(indices[3]); 
 
   for (int i = 0; i < numResonances; ++i) {
     int paramIndex  = parIndexFromResIndex_DP(i);
-    fptype amp_real = p[indices[paramIndex+0]];
-    fptype amp_imag = p[indices[paramIndex+1]];
-
-    devcomplex<fptype> matrixelement((cResonances[cacheToUse][evtNum*numResonances + i]).real,
-				     (cResonances[cacheToUse][evtNum*numResonances + i]).imag); 
+    fptype amp_real = RO_CACHE(p[RO_CACHE(indices[paramIndex+0])]);
+    fptype amp_imag = RO_CACHE(p[RO_CACHE(indices[paramIndex+1])]);
+
+    //potential performance improvement by 
+    //double2 me = RO_CACHE(reinterpret_cast<double2*> (cResonances[i][evtNum]));
+    fptype me_real = RO_CACHE(cResonances[i][evtNum].real);
+    fptype me_imag = RO_CACHE(cResonances[i][evtNum].imag);
+
+    //devcomplex<fptype> matrixelement((cResonances[cacheToUse][evtNum*numResonances + i]).real,
+    //				     (cResonances[cacheToUse][evtNum*numResonances + i]).imag); 
+    //devcomplex<fptype> matrixelement (me[0], me[1]);
+    devcomplex<fptype> matrixelement (me_real, me_imag);
     matrixelement.multiply(amp_real, amp_imag); 
     totalAmp += matrixelement; 
   } 
 
   fptype ret = norm2(totalAmp); 
   int effFunctionIdx = parIndexFromResIndex_DP(numResonances); 
-  fptype eff = callFunction(evt, indices[effFunctionIdx], indices[effFunctionIdx + 1]); 
+  fptype eff = callFunction(evt, RO_CACHE(indices[effFunctionIdx]), RO_CACHE(indices[effFunctionIdx + 1])); 
   ret *= eff;
 
   //printf("DalitzPlot evt %i zero: %i %i %f (%f, %f).\n", evtNum, numResonances, effFunctionIdx, eff, totalAmp.real, totalAmp.imag); 
@@ -97,7 +109,7 @@ __host__ DalitzPlotPdf::DalitzPlotPdf (std::string n,
   , _m12(m12)
   , _m13(m13)
   , dalitzNormRange(0)
-  , cachedWaves(0) 
+  //, cachedWaves(0) 
   , integrals(0)
   , forceRedoIntegrals(true)
   , totalEventSize(3) // Default 3 = m12, m13, evtNum 
@@ -110,6 +122,9 @@ __host__ DalitzPlotPdf::DalitzPlotPdf (std::string n,
   registerObservable(eventNumber); 
 
   fptype decayConstants[5];
+
+  for (int i = 0; i < 16; i++)
+    cachedWaves[i] = 0;
 
   std::vector<unsigned int> pindices;
   pindices.push_back(registerConstants(5)); 
@@ -170,12 +185,21 @@ __host__ void DalitzPlotPdf::setDataSize (unsigned int dataSize, unsigned int ev
   totalEventSize = evtSize;
   assert(totalEventSize >= 3); 
 
-  if (cachedWaves) delete cachedWaves;
+  //if (cachedWaves) delete cachedWaves;
+  if (cachedWaves[0])
+  {
+    for (int i = 0; i < 16; i++)
+	  delete cachedWaves[i];
+  }
 
   numEntries = dataSize; 
-  cachedWaves = new DEVICE_VECTOR<devcomplex<fptype> >(dataSize*decayInfo->resonances.size());
-  void* dummy = thrust::raw_pointer_cast(cachedWaves->data()); 
-  MEMCPY_TO_SYMBOL(cResonances, &dummy, sizeof(devcomplex<fptype>*), cacheToUse*sizeof(devcomplex<fptype>*), cudaMemcpyHostToDevice); 
+
+  for (int i = 0; i < 16; i++)
+  {
+    cachedWaves[i] = new DEVICE_VECTOR<devcomplex<fptype> >(dataSize);
+    void* dummy = thrust::raw_pointer_cast(cachedWaves[i]->data()); 
+    MEMCPY_TO_SYMBOL(cResonances, &dummy, sizeof(devcomplex<fptype>*), i*sizeof(devcomplex<fptype>*), cudaMemcpyHostToDevice); 
+  }
   setForceIntegrals(); 
 }
 
@@ -224,9 +248,9 @@ __host__ fptype DalitzPlotPdf::normalise () const {
     if (redoIntegral[i]) {
       thrust::transform(thrust::make_zip_iterator(thrust::make_tuple(eventIndex, dataArray, eventSize)),
 			thrust::make_zip_iterator(thrust::make_tuple(eventIndex + numEntries, arrayAddress, eventSize)),
-			strided_range<DEVICE_VECTOR<devcomplex<fptype> >::iterator>(cachedWaves->begin() + i, 
-										    cachedWaves->end(), 
-										    decayInfo->resonances.size()).begin(), 
+			strided_range<DEVICE_VECTOR<devcomplex<fptype> >::iterator>(cachedWaves[i]->begin(), 
+										    cachedWaves[i]->end(), 
+										    1).begin(), 
 			*(calculators[i]));
     }
 

diff --git a/PDFs/DalitzPlotPdf.hh b/PDFs/DalitzPlotPdf.hh
@@ -29,7 +29,7 @@ private:
 
   // Following variables are useful if masses and widths, involved in difficult BW calculation, 
   // change infrequently while amplitudes, only used in adding BW results together, change rapidly.
-  DEVICE_VECTOR<devcomplex<fptype> >* cachedWaves; // Caches the BW values for each event.
+  DEVICE_VECTOR<devcomplex<fptype> >* cachedWaves[16]; // Caches the BW values for each event.
   devcomplex<fptype>*** integrals; // Caches the integrals of the BW waves for each combination of resonances. 
 
   bool* redoIntegral;

diff --git a/PDFs/DalitzVetoPdf.cu b/PDFs/DalitzVetoPdf.cu
@@ -2,23 +2,23 @@
 #include "DalitzPlotHelpers.hh" 
 
 EXEC_TARGET fptype device_DalitzVeto (fptype* evt, fptype* p, unsigned int* indices) {
-  fptype x         = evt[indices[2 + indices[0] + 0]]; 
-  fptype y         = evt[indices[2 + indices[0] + 1]]; 
+  fptype x         = evt[RO_CACHE(indices[2 + RO_CACHE(indices[0]) + 0])]; 
+  fptype y         = evt[RO_CACHE(indices[2 + RO_CACHE(indices[0]) + 1])]; 
 
-  fptype motherM   = p[indices[1]];
-  fptype d1m       = p[indices[2]];
-  fptype d2m       = p[indices[3]];
-  fptype d3m       = p[indices[4]];
+  fptype motherM   = RO_CACHE(p[RO_CACHE(indices[1])]);
+  fptype d1m       = RO_CACHE(p[RO_CACHE(indices[2])]);
+  fptype d2m       = RO_CACHE(p[RO_CACHE(indices[3])]);
+  fptype d3m       = RO_CACHE(p[RO_CACHE(indices[4])]);
 
   fptype massSum   = motherM*motherM + d1m*d1m + d2m*d2m + d3m*d3m;
   fptype z         = massSum - x - y;
 
   fptype ret = inDalitz(x, y, motherM, d1m, d2m, d3m) ? 1.0 : 0.0; 
-  unsigned int numVetos = indices[5];
+  unsigned int numVetos = RO_CACHE(indices[5]);
   for (int i = 0; i < numVetos; ++i) {
     unsigned int varIndex =   indices[6 + i*3 + 0];
-    fptype minimum        = p[indices[6 + i*3 + 1]];
-    fptype maximum        = p[indices[6 + i*3 + 2]];
+    fptype minimum        = RO_CACHE(p[RO_CACHE(indices[6 + i*3 + 1])]);
+    fptype maximum        = RO_CACHE(p[RO_CACHE(indices[6 + i*3 + 2])]);
     fptype currDalitzVar = (PAIR_12 == varIndex ? x : PAIR_13 == varIndex ? y : z);
 
     ret *= ((currDalitzVar < maximum) && (currDalitzVar > minimum)) ? 0.0 : 1.0;

diff --git a/PDFs/EventWeightedAddPdf.cu b/PDFs/EventWeightedAddPdf.cu
@@ -1,22 +1,22 @@
 #include "EventWeightedAddPdf.hh"
 
 EXEC_TARGET fptype device_EventWeightedAddPdfs (fptype* evt, fptype* p, unsigned int* indices) { 
-  int numParameters = indices[0]; 
+  int numParameters = RO_CACHE(indices[0]); 
   fptype ret = 0;
   fptype totalWeight = 0; 
 
   for (int i = 0; i < numParameters/2 - 1; ++i) {
-    fptype weight = evt[indices[2 + numParameters + i]];
+    fptype weight = RO_CACHE(evt[RO_CACHE(indices[2 + numParameters + i])]);
     totalWeight += weight;
-    fptype curr = callFunction(evt, indices[2*i + 1], indices[2*(i+1)]); 
-    ret += weight * curr * normalisationFactors[indices[2*(i+1)]]; 
+    fptype curr = callFunction(evt, RO_CACHE(indices[2*i + 1]), RO_CACHE(indices[2*(i+1)])); 
+    ret += weight * curr * normalisationFactors[RO_CACHE(indices[2*(i+1)])]; 
   }
   // numParameters does not count itself. So the array structure for two functions is
   // nP | F P | F P | nO | o1 
   // in which nP = 4. and nO = 1. Therefore the parameter index for the last function pointer is nP, and the function index is nP-1. 
   //fptype last = (*(reinterpret_cast<device_function_ptr>(device_function_table[indices[numParameters-1]])))(evt, p, paramIndices + indices[numParameters]);
-  fptype last = callFunction(evt, indices[numParameters-1], indices[numParameters]); 
-  ret += (1 - totalWeight) * last * normalisationFactors[indices[numParameters]]; 
+  fptype last = callFunction(evt, RO_CACHE(indices[numParameters-1]), RO_CACHE(indices[numParameters])); 
+  ret += (1 - totalWeight) * last * normalisationFactors[RO_CACHE(indices[numParameters])]; 
 
   return ret; 
 }
@@ -26,14 +26,14 @@ EXEC_TARGET fptype device_EventWeightedAddPdfsExt (fptype* evt, fptype* p, unsig
   // nP | F P | F P | nO | o1 o2
   // in which nP = 4, nO = 2. 
 
-  int numParameters = indices[0]; 
+  int numParameters = RO_CACHE(indices[0]); 
   fptype ret = 0;
   fptype totalWeight = 0; 
   for (int i = 0; i < numParameters/2; ++i) {
-    fptype curr = callFunction(evt, indices[2*i + 1], indices[2*(i+1)]); 
+    fptype curr = callFunction(evt, RO_CACHE(indices[2*i + 1]), RO_CACHE(indices[2*(i+1)])); 
     //if ((0 == BLOCKIDX) && (THREADIDX < 5) && (isnan(curr))) printf("NaN component %i %i\n", i, THREADIDX); 
-    fptype weight = evt[indices[2 + numParameters + i]];
-    ret += weight * curr * normalisationFactors[indices[2*(i+1)]]; 
+    fptype weight = RO_CACHE(evt[RO_CACHE(indices[2 + numParameters + i])]);
+    ret += weight * curr * normalisationFactors[RO_CACHE(indices[2*(i+1)])]; 
     totalWeight += weight; 
 
     //if ((gpuDebug & 1) && (0 == THREADIDX))