diff --git a/src/Ewald.cpp b/src/Ewald.cpp index da70d8832..6b15f0928 100644 --- a/src/Ewald.cpp +++ b/src/Ewald.cpp @@ -745,88 +745,140 @@ double Ewald::MolExchangeReciprocal(const std::vector &newMol, const std::vector &molIndexOld, bool first_call) { double energyRecipNew = 0.0; - double energyRecipOld = 0.0; - // Change in reciprocal happens in the same box. + //Change in reciprocal happens in the same box. uint box = newMol[0].GetBox(); if (box < BOXES_WITH_U_NB) { GOMC_EVENT_START(1, GomcProfileEvent::RECIP_MEMC_ENERGY); -// Because MolExchangeReciprocal does not have a matching GPU function, this is -// a stub function to copy the GPU sumRref and sumIref vectors to the CPU in -// order to calcuate the new sums. If this function is ported to the GPU, this -// call should be removed. + #ifdef GOMC_CUDA - CallMolExchangeReciprocalStartGPU(ff.particles->getCUDAVars(), imageSizeRef[box], - sumRref[box], sumIref[box], box); -#endif + //Build a vector of only the charged particles in the new and old molecules + std::vector chargeBoxNew; + MoleculeKind const& thisKindNew = newMol[0].GetKind(); + uint lengthNew = thisKindNew.NumAtoms() * newMol.size(); + XYZArray newMolCoords = XYZArray(lengthNew); + + int newChargedParticles = 0; + for (uint m = 0; m < newMol.size(); ++m) { + uint newMoleculeIndex = molIndexNew[m]; + double lambdaCoef = GetLambdaCoef(newMoleculeIndex, box); + + XYZArray currNewMolCoords = newMol[m].GetCoords(); + for (uint p = 0; p < lengthNew; ++p) { + unsigned long currentAtom = mols.MolStart(newMoleculeIndex) + p; + if(!particleHasNoCharge[currentAtom]) { + newMolCoords.Set(newChargedParticles, + currNewMolCoords.x[p], + currNewMolCoords.y[p], + currNewMolCoords.z[p]); + newChargedParticles++; + chargeBoxNew.push_back(thisKindNew.AtomCharge(p) * lambdaCoef); + } + } + } + lengthNew = newChargedParticles; + + std::vector chargeBoxOld; + MoleculeKind const& thisKindOld = oldMol[0].GetKind(); + uint lengthOld = thisKindOld.NumAtoms() * oldMol.size(); + XYZArray oldMolCoords = XYZArray(lengthOld); + + int oldChargedParticles = 0; + for (uint m = 0; m < oldMol.size(); ++m) { + uint oldMoleculeIndex = molIndexOld[m]; + double lambdaCoef = GetLambdaCoef(oldMoleculeIndex, box); + XYZArray currOldMolCoords = oldMol[m].GetCoords(); + for (uint p = 0; p < lengthOld; ++p) { + unsigned long currentAtom = mols.MolStart(oldMoleculeIndex) + p; + if(!particleHasNoCharge[currentAtom]) { + oldMolCoords.Set(oldChargedParticles, + currOldMolCoords.x[p], + currOldMolCoords.y[p], + currOldMolCoords.z[p]); + oldChargedParticles++; + chargeBoxOld.push_back(thisKindOld.AtomCharge(p) * lambdaCoef); + } + } + } + lengthOld = oldChargedParticles; - uint lengthNew, lengthOld; - MoleculeKind const &thisKindNew = newMol[0].GetKind(); - MoleculeKind const &thisKindOld = oldMol[0].GetKind(); - lengthNew = thisKindNew.NumAtoms(); - lengthOld = thisKindOld.NumAtoms(); + // Depending on the move, we could call this function twice. If so, we don't want to + // double count the existing (reference) sums, so we copy them only for the first call + // and then add to them inside the function based on the delta values for the move. + if(first_call) { + CopyRefToNewCUDA(ff.particles->getCUDAVars(), box, imageSizeRef[box]); + } + CallMolExchangeReciprocalGPU(ff.particles->getCUDAVars(), + imageSizeRef[box], + sumRnew[box], sumInew[box], box, + chargeBoxNew, chargeBoxOld, + lengthNew, lengthOld, + energyRecipNew, + newMolCoords, + oldMolCoords); +#else + MoleculeKind const& thisKindNew = newMol[0].GetKind(); + MoleculeKind const& thisKindOld = oldMol[0].GetKind(); + uint lengthNew = thisKindNew.NumAtoms(); + uint lengthOld = thisKindOld.NumAtoms(); #ifdef _OPENMP -#pragma omp parallel for default(none) shared(box, first_call, lengthNew, lengthOld, \ + #pragma omp parallel for default(none) shared(box, first_call, lengthNew, lengthOld, \ newMol, oldMol, thisKindNew, thisKindOld, molIndexNew, molIndexOld) \ -reduction(+:energyRecipNew) + reduction(+:energyRecipNew) #endif - for (int i = 0; i < (int)imageSizeRef[box]; i++) { + for (int i = 0; i < (int) imageSizeRef[box]; i++) { double sumRealNew = 0.0; double sumImaginaryNew = 0.0; - // Add dot sum of the new molecule for (uint m = 0; m < newMol.size(); m++) { uint newMoleculeIndex = molIndexNew[m]; double lambdaCoef = GetLambdaCoef(newMoleculeIndex, box); for (uint p = 0; p < lengthNew; ++p) { unsigned long currentAtom = mols.MolStart(newMoleculeIndex) + p; - if (particleHasNoCharge[currentAtom]) { + if(particleHasNoCharge[currentAtom]) { continue; } double dotProductNew = Dot(p, kxRef[box][i], kyRef[box][i], kzRef[box][i], newMol[m].GetCoords()); - // TODO: Using GNU extension we could improve this part of the code // by using sincos() function and merge sin() and cos() calculation // However, this will not work with Visual studio // Intel should automatically optimize this section by using // internal functions like __svml_sincosf8..() - sumRealNew += - (thisKindNew.AtomCharge(p) * lambdaCoef * cos(dotProductNew)); - sumImaginaryNew += - (thisKindNew.AtomCharge(p) * lambdaCoef * sin(dotProductNew)); + sumRealNew += (thisKindNew.AtomCharge(p) * lambdaCoef * + cos(dotProductNew)); + sumImaginaryNew += (thisKindNew.AtomCharge(p) * lambdaCoef * + sin(dotProductNew)); } } - // Subtract the sum of the old molecule for (uint m = 0; m < oldMol.size(); m++) { uint oldMoleculeIndex = molIndexOld[m]; double lambdaCoef = GetLambdaCoef(oldMoleculeIndex, box); for (uint p = 0; p < lengthOld; ++p) { unsigned long currentAtom = mols.MolStart(oldMoleculeIndex) + p; - if (particleHasNoCharge[currentAtom]) { + if(particleHasNoCharge[currentAtom]) { continue; } double dotProductOld = Dot(p, kxRef[box][i], kyRef[box][i], kzRef[box][i], oldMol[m].GetCoords()); - // TODO: Using GNU extension we could improve this part of the code // by using sincos() function and merge sin() and cos() calculation // However, this will not work with Visual studio // Intel should automatically optimize this section by using // internal functions like __svml_sincosf8..() - sumRealNew -= - thisKindOld.AtomCharge(p) * lambdaCoef * cos(dotProductOld); - sumImaginaryNew -= - thisKindOld.AtomCharge(p) * lambdaCoef * sin(dotProductOld); + sumRealNew -= thisKindOld.AtomCharge(p) * lambdaCoef * + cos(dotProductOld); + sumImaginaryNew -= thisKindOld.AtomCharge(p) * lambdaCoef * + sin(dotProductOld); } } // Update the new sum value based on the difference and previous sum - // If this is the first call to this function within the same pair of - // molecules, then use the ref variable to update new However, if this is - // the second time calling it then use the previous result as reference + // If this is the first call to this function within the same pair of molecules, + // then use the ref variable to update new + // However, if this is the second time calling it then use the previous result as reference if (first_call) { sumRnew[box][i] = sumRref[box][i] + sumRealNew; sumInew[box][i] = sumIref[box][i] + sumImaginaryNew; @@ -835,30 +887,15 @@ reduction(+:energyRecipNew) sumInew[box][i] += sumImaginaryNew; } - // Calculate new energy recip based on the new sum real and imaginary - // values - energyRecipNew += (sumRnew[box][i] * sumRnew[box][i] + - sumInew[box][i] * sumInew[box][i]) * - prefactRef[box][i]; + // Calculate new energy recip based on the new sum real and imaginary values + energyRecipNew += (sumRnew[box][i] * sumRnew[box][i] + sumInew[box][i] + * sumInew[box][i]) * prefactRef[box][i]; } - - // Keep hold of the old recip value - energyRecipOld = sysPotRef.boxEnergy[box].recip; +#endif GOMC_EVENT_STOP(1, GomcProfileEvent::RECIP_MEMC_ENERGY); } - -// Because MolExchangeReciprocal does not have a matching GPU function, this is -// a stub function to copy the CPU sumRnew and sumInew vectors to the GPU in -// case the move is accepted. If this function is ported to the GPU, this call -// should be moved to the beginning of the MolExchangeReciprocal function and -// called instead of running the CPU code. -#ifdef GOMC_CUDA - CallMolExchangeReciprocalGPU(ff.particles->getCUDAVars(), imageSizeRef[box], - sumRnew[box], sumInew[box], box); -#endif - - // Return the difference between old and new reciprocal energies - return energyRecipNew - energyRecipOld; + // Return the change in reciprocal energy + return energyRecipNew - sysPotRef.boxEnergy[box].recip; } // restore cosMol and sinMol diff --git a/src/GPU/CalculateEwaldCUDAKernel.cu b/src/GPU/CalculateEwaldCUDAKernel.cu index a68be9ce9..609a82149 100644 --- a/src/GPU/CalculateEwaldCUDAKernel.cu +++ b/src/GPU/CalculateEwaldCUDAKernel.cu @@ -373,10 +373,6 @@ void CallSwapReciprocalGPU(VariablesCUDA *vars, XYZArray const &coords, cudaDeviceSynchronize(); checkLastErrorCUDA(__FILE__, __LINE__); #endif - // cudaMemcpy(sumRnew, vars->gpu_sumRnew[box], imageSize * sizeof(double), - // cudaMemcpyDeviceToHost); - // cudaMemcpy(sumInew, vars->gpu_sumInew[box], imageSize * sizeof(double), - // cudaMemcpyDeviceToHost); // ReduceSum DeviceReduce::Sum(vars->cub_reduce_storage, vars->cub_reduce_storage_size, vars->gpu_recipEnergies, @@ -387,22 +383,131 @@ void CallSwapReciprocalGPU(VariablesCUDA *vars, XYZArray const &coords, CUFREE(gpu_particleCharge); } -void CallMolExchangeReciprocalGPU(VariablesCUDA *vars, uint imageSize, - double *sumRnew, double *sumInew, uint box) { - cudaMemcpy(vars->gpu_sumRnew[box], sumRnew, imageSize * sizeof(double), - cudaMemcpyHostToDevice); - cudaMemcpy(vars->gpu_sumInew[box], sumInew, imageSize * sizeof(double), - cudaMemcpyHostToDevice); -} +void CallMolExchangeReciprocalGPU(VariablesCUDA *vars, + uint imageSize, + double *sumRnew, + double *sumInew, + uint box, + std::vector chargeBoxNew, + std::vector chargeBoxOld, + uint lengthNew, uint lengthOld, + double &energyRecipNew, + XYZArray newMolCoords, + XYZArray oldMolCoords) +{ + uint atomsPerImage = lengthNew +lengthOld; + uint totalAtoms = atomsPerImage * imageSize; + + double *gpu_chargeBoxNew; + double *gpu_chargeBoxOld; + double *gpu_newMolX; + double *gpu_newMolY; + double *gpu_newMolZ; + double *gpu_oldMolX; + double *gpu_oldMolY; + double *gpu_oldMolZ; + + cudaMemset(vars->gpu_recipEnergies, 0, imageSize * sizeof(double)); + + if(lengthNew > 0){ + CUMALLOC((void**) &gpu_chargeBoxNew, + chargeBoxNew.size() * sizeof(double)); + cudaMemcpy(gpu_chargeBoxNew, &chargeBoxNew[0], + chargeBoxNew.size() * sizeof(double), + cudaMemcpyHostToDevice); + + } + if(lengthOld > 0){ + CUMALLOC((void**) &gpu_chargeBoxOld, + chargeBoxOld.size() * sizeof(double)); + cudaMemcpy(gpu_chargeBoxOld, &chargeBoxOld[0], + chargeBoxOld.size() * sizeof(double), + cudaMemcpyHostToDevice); + } + CUMALLOC((void**) &gpu_newMolX, newMolCoords.Count() * sizeof(double)); + CUMALLOC((void**) &gpu_newMolY, newMolCoords.Count() * sizeof(double)); + CUMALLOC((void**) &gpu_newMolZ, newMolCoords.Count() * sizeof(double)); + CUMALLOC((void**) &gpu_oldMolX, oldMolCoords.Count() * sizeof(double)); + CUMALLOC((void**) &gpu_oldMolY, oldMolCoords.Count() * sizeof(double)); + CUMALLOC((void**) &gpu_oldMolZ, oldMolCoords.Count() * sizeof(double)); + cudaMemcpy(gpu_newMolX, &newMolCoords.x[0], + newMolCoords.Count() * sizeof(double), + cudaMemcpyHostToDevice); + cudaMemcpy(gpu_newMolY, &newMolCoords.y[0], + newMolCoords.Count() * sizeof(double), + cudaMemcpyHostToDevice); + cudaMemcpy(gpu_newMolZ, &newMolCoords.z[0], + newMolCoords.Count() * sizeof(double), + cudaMemcpyHostToDevice); + cudaMemcpy(gpu_oldMolX, &oldMolCoords.x[0], + oldMolCoords.Count() * sizeof(double), + cudaMemcpyHostToDevice); + cudaMemcpy(gpu_oldMolY, &oldMolCoords.y[0], + oldMolCoords.Count() * sizeof(double), + cudaMemcpyHostToDevice); + cudaMemcpy(gpu_oldMolZ, &oldMolCoords.z[0], + oldMolCoords.Count() * sizeof(double), + cudaMemcpyHostToDevice); + + int threadsPerBlock = 256; + int blocksPerGrid = (int)(totalAtoms / threadsPerBlock) + 1; + int dynamicSharedMemorySize = 4 * sizeof(double) * (lengthNew + lengthOld); + MolExchangeReciprocalGPU<<< blocksPerGrid, threadsPerBlock, dynamicSharedMemorySize>>>( + imageSize, + vars->gpu_kxRef[box], + vars->gpu_kyRef[box], + vars->gpu_kzRef[box], + vars->gpu_sumRnew[box], + vars->gpu_sumInew[box], + gpu_chargeBoxNew, + gpu_chargeBoxOld, + lengthNew, + lengthOld, + gpu_newMolX, + gpu_newMolY, + gpu_newMolZ, + gpu_oldMolX, + gpu_oldMolY, + gpu_oldMolZ); +#ifndef NDEBUG + cudaDeviceSynchronize(); + checkLastErrorCUDA(__FILE__, __LINE__); +#endif -void CallMolExchangeReciprocalStartGPU(VariablesCUDA *vars, uint imageSize, - double *sumRref, double *sumIref, uint box) { - cudaMemcpy(sumRref, vars->gpu_sumRref[box], imageSize * sizeof(double), - cudaMemcpyDeviceToHost); - cudaMemcpy(sumIref, vars->gpu_sumIref[box], imageSize * sizeof(double), - cudaMemcpyDeviceToHost); + blocksPerGrid = (int)(imageSize / threadsPerBlock) + 1; + BoxReciprocalGPU <<< blocksPerGrid, threadsPerBlock>>>( + vars->gpu_prefactRef[box], + vars->gpu_sumRnew[box], + vars->gpu_sumInew[box], + vars->gpu_recipEnergies, + imageSize); + #ifndef NDEBUG + cudaDeviceSynchronize(); + checkLastErrorCUDA(__FILE__, __LINE__); +#endif + + // ReduceSum + + DeviceReduce::Sum(vars->cub_reduce_storage, vars->cub_reduce_storage_size, vars->gpu_recipEnergies, + vars->gpu_finalVal, imageSize); + cudaMemcpy(&energyRecipNew, vars->gpu_finalVal, + sizeof(double), cudaMemcpyDeviceToHost); + + if(lengthNew > 0){ + CUFREE(gpu_chargeBoxNew); + } + if(lengthOld > 0){ + CUFREE(gpu_chargeBoxOld); + } + CUFREE(gpu_newMolX); + CUFREE(gpu_newMolY); + CUFREE(gpu_newMolZ); + CUFREE(gpu_oldMolX); + CUFREE(gpu_oldMolY); + CUFREE(gpu_oldMolZ); } + void CallBoxForceReciprocalGPU( VariablesCUDA *vars, XYZArray &atomForceRec, XYZArray &molForceRec, const std::vector &particleCharge, @@ -689,6 +794,73 @@ __global__ void SwapReciprocalGPU(double *gpu_x, double *gpu_y, double *gpu_z, gpu_prefactRef[threadID]); } + +__global__ void MolExchangeReciprocalGPU( + int imageSize, + double *gpu_kx, + double *gpu_ky, + double *gpu_kz, + double *gpu_sumRnew, + double *gpu_sumInew, + double *gpu_chargeBoxNew, + double *gpu_chargeBoxOld, + uint lengthNew, + uint lengthOld, + double *gpu_newMolX, + double *gpu_newMolY, + double *gpu_newMolZ, + double *gpu_oldMolX, + double *gpu_oldMolY, + double *gpu_oldMolZ) +{ + int threadID = blockIdx.x * blockDim.x + threadIdx.x; + int chargeBoxLength = lengthNew + lengthOld; + + extern __shared__ double shared_arr[]; + double* shared_chargeBox = (double *) shared_arr; + double* shared_Mol = (double *) &shared_chargeBox[chargeBoxLength]; + + if(threadIdx.x < lengthNew) { + shared_Mol[threadIdx.x * 3] = gpu_newMolX[threadIdx.x]; + shared_Mol[threadIdx.x * 3 + 1] = gpu_newMolY[threadIdx.x]; + shared_Mol[threadIdx.x * 3 + 2] = gpu_newMolZ[threadIdx.x]; + shared_chargeBox[threadIdx.x] = gpu_chargeBoxNew[threadIdx.x]; + } + else if (threadIdx.x < chargeBoxLength) { + int gpu_oldMolIndex = threadIdx.x - lengthNew; + shared_Mol[threadIdx.x * 3] = gpu_oldMolX[gpu_oldMolIndex]; + shared_Mol[threadIdx.x * 3 + 1] = gpu_oldMolY[gpu_oldMolIndex]; + shared_Mol[threadIdx.x * 3 + 2] = gpu_oldMolZ[gpu_oldMolIndex]; + shared_chargeBox[threadIdx.x] = -gpu_chargeBoxOld[gpu_oldMolIndex]; + } + __syncthreads(); + + //wait until the shared array is loaded before deciding that we don't need these threads + if (threadID >= imageSize * chargeBoxLength) + return; + + // for each new & old atom index, loop thru each image + int p = threadID / imageSize; + int imageID = threadID % imageSize; + + double dotProduct = DotProductGPU(gpu_kx[imageID], + gpu_ky[imageID], + gpu_kz[imageID], + shared_Mol[p * 3], + shared_Mol[p * 3 + 1], + shared_Mol[p * 3 + 2]); + + double dotsin, dotcos; + sincos(dotProduct, &dotsin, &dotcos); + + double sumRealNew = shared_chargeBox[p] * dotcos; + double sumImaginaryNew = shared_chargeBox[p] * dotsin; + + atomicAdd(&gpu_sumRnew[imageID], sumRealNew); + atomicAdd(&gpu_sumInew[imageID], sumImaginaryNew); +} + + __global__ void MolReciprocalGPU(double *gpu_cx, double *gpu_cy, double *gpu_cz, double *gpu_nx, double *gpu_ny, double *gpu_nz, double *gpu_kx, double *gpu_ky, double *gpu_kz, diff --git a/src/GPU/CalculateEwaldCUDAKernel.cuh b/src/GPU/CalculateEwaldCUDAKernel.cuh index a4fb130f2..a6b03ea1a 100644 --- a/src/GPU/CalculateEwaldCUDAKernel.cuh +++ b/src/GPU/CalculateEwaldCUDAKernel.cuh @@ -91,13 +91,14 @@ void CallMolExchangeReciprocalGPU(VariablesCUDA *vars, uint imageSize, double *sumRnew, double *sumInew, - uint box); + uint box, + std::vector chargeBoxNew, + std::vector chargeBoxOld, + uint lengthNew, uint lengthOld, + double &energyRecipNew, + XYZArray newMolCoords, + XYZArray oldMolCoords); -void CallMolExchangeReciprocalStartGPU(VariablesCUDA *vars, - uint imageSize, - double *sumRref, - double *sumIref, - uint box); __global__ void BoxForceReciprocalGPU(double *gpu_aForceRecx, double *gpu_aForceRecy, @@ -191,6 +192,24 @@ __global__ void SwapReciprocalGPU(double *gpu_x, double *gpu_y, double *gpu_z, double *gpu_RecipEnergies, int imageSize); +__global__ void MolExchangeReciprocalGPU( + int imageSize, + double *gpu_kx, + double *gpu_ky, + double *gpu_kz, + double *gpu_sumRnew, + double *gpu_sumInew, + double *gpu_chargeBoxNew, + double *gpu_chargeBoxOld, + uint lengthNew, + uint lengthOld, + double *gpu_newMolX, + double *gpu_newMolY, + double *gpu_newMolZ, + double *gpu_oldMolX, + double *gpu_oldMolY, + double *gpu_oldMolZ); + __global__ void NewSwapReciprocalGPU(VariablesCUDA *vars, int atomNumber, uint box,