diff --git a/src/Ewald.cpp b/src/Ewald.cpp
index da70d8832..6b15f0928 100644
--- a/src/Ewald.cpp
+++ b/src/Ewald.cpp
@@ -745,88 +745,140 @@ double Ewald::MolExchangeReciprocal(const std::vector<cbmc::TrialMol> &newMol,
                                     const std::vector<uint> &molIndexOld,
                                     bool first_call) {
   double energyRecipNew = 0.0;
-  double energyRecipOld = 0.0;
-  // Change in reciprocal happens in the same box.
+  //Change in reciprocal happens in the same box.
   uint box = newMol[0].GetBox();
 
   if (box < BOXES_WITH_U_NB) {
     GOMC_EVENT_START(1, GomcProfileEvent::RECIP_MEMC_ENERGY);
-// Because MolExchangeReciprocal does not have a matching GPU function, this is
-// a stub function to copy the GPU sumRref and sumIref vectors to the CPU in
-// order to calcuate the new sums. If this function is ported to the GPU, this
-// call should be removed.
+    
 #ifdef GOMC_CUDA
-  CallMolExchangeReciprocalStartGPU(ff.particles->getCUDAVars(), imageSizeRef[box],
-                               sumRref[box], sumIref[box], box);
-#endif
+    //Build a vector of only the charged particles in the new and old molecules
+    std::vector<double> chargeBoxNew;
+    MoleculeKind const& thisKindNew = newMol[0].GetKind();
+    uint lengthNew = thisKindNew.NumAtoms() * newMol.size();
+    XYZArray newMolCoords = XYZArray(lengthNew); 
+
+    int newChargedParticles = 0; 
+    for (uint m = 0; m < newMol.size(); ++m) {
+      uint newMoleculeIndex = molIndexNew[m];
+      double lambdaCoef = GetLambdaCoef(newMoleculeIndex, box);
+
+      XYZArray currNewMolCoords = newMol[m].GetCoords(); 
+      for (uint p = 0; p < lengthNew; ++p) {
+        unsigned long currentAtom = mols.MolStart(newMoleculeIndex) + p;
+        if(!particleHasNoCharge[currentAtom]) {
+          newMolCoords.Set(newChargedParticles, 
+                           currNewMolCoords.x[p],
+                           currNewMolCoords.y[p],
+                           currNewMolCoords.z[p]); 
+          newChargedParticles++;
+          chargeBoxNew.push_back(thisKindNew.AtomCharge(p) * lambdaCoef);
+        }
+      }      
+    }
+    lengthNew = newChargedParticles;
+
+    std::vector<double> chargeBoxOld;
+    MoleculeKind const& thisKindOld = oldMol[0].GetKind();
+    uint lengthOld = thisKindOld.NumAtoms() * oldMol.size();
+    XYZArray oldMolCoords = XYZArray(lengthOld); 
+
+    int oldChargedParticles = 0; 
+    for (uint m = 0; m < oldMol.size(); ++m) {
+      uint oldMoleculeIndex = molIndexOld[m];
+      double lambdaCoef = GetLambdaCoef(oldMoleculeIndex, box);
+      XYZArray currOldMolCoords = oldMol[m].GetCoords(); 
+      for (uint p = 0; p < lengthOld; ++p) {
+        unsigned long currentAtom = mols.MolStart(oldMoleculeIndex) + p;
+        if(!particleHasNoCharge[currentAtom]) {
+          oldMolCoords.Set(oldChargedParticles, 
+                           currOldMolCoords.x[p],
+                           currOldMolCoords.y[p],
+                           currOldMolCoords.z[p]); 
+          oldChargedParticles++;
+          chargeBoxOld.push_back(thisKindOld.AtomCharge(p) * lambdaCoef);
+        }
+      }
+    }
+    lengthOld = oldChargedParticles;
 
-    uint lengthNew, lengthOld;
-    MoleculeKind const &thisKindNew = newMol[0].GetKind();
-    MoleculeKind const &thisKindOld = oldMol[0].GetKind();
-    lengthNew = thisKindNew.NumAtoms();
-    lengthOld = thisKindOld.NumAtoms();
+    // Depending on the move, we could call this function twice. If so, we don't want to
+    // double count the existing (reference) sums, so we copy them only for the first call
+    // and then add to them inside the function based on the delta values for the move.
+    if(first_call) {
+      CopyRefToNewCUDA(ff.particles->getCUDAVars(), box, imageSizeRef[box]);
+    }
 
+    CallMolExchangeReciprocalGPU(ff.particles->getCUDAVars(), 
+                                 imageSizeRef[box],
+                                 sumRnew[box], sumInew[box], box, 
+                                 chargeBoxNew, chargeBoxOld,
+                                 lengthNew, lengthOld,
+                                 energyRecipNew,
+                                 newMolCoords,
+                                 oldMolCoords);
+#else 
+    MoleculeKind const& thisKindNew = newMol[0].GetKind();
+    MoleculeKind const& thisKindOld = oldMol[0].GetKind();
+    uint lengthNew = thisKindNew.NumAtoms();
+    uint lengthOld = thisKindOld.NumAtoms();
 #ifdef _OPENMP
-#pragma omp parallel for default(none) shared(box, first_call, lengthNew, lengthOld, \
+    #pragma omp parallel for default(none) shared(box, first_call, lengthNew, lengthOld, \
     newMol, oldMol, thisKindNew, thisKindOld, molIndexNew, molIndexOld) \
-reduction(+:energyRecipNew)
+    reduction(+:energyRecipNew)
 #endif
-    for (int i = 0; i < (int)imageSizeRef[box]; i++) {
+    for (int i = 0; i < (int) imageSizeRef[box]; i++) {
       double sumRealNew = 0.0;
       double sumImaginaryNew = 0.0;
 
-      // Add dot sum of the new molecule
       for (uint m = 0; m < newMol.size(); m++) {
         uint newMoleculeIndex = molIndexNew[m];
         double lambdaCoef = GetLambdaCoef(newMoleculeIndex, box);
         for (uint p = 0; p < lengthNew; ++p) {
           unsigned long currentAtom = mols.MolStart(newMoleculeIndex) + p;
-          if (particleHasNoCharge[currentAtom]) {
+          if(particleHasNoCharge[currentAtom]) {
             continue;
           }
           double dotProductNew = Dot(p, kxRef[box][i], kyRef[box][i],
                                      kzRef[box][i], newMol[m].GetCoords());
-
           // TODO: Using GNU extension we could improve this part of the code
           // by using sincos() function and merge sin() and cos() calculation
           // However, this will not work with Visual studio
           // Intel should automatically optimize this section by using
           // internal functions like __svml_sincosf8..()
-          sumRealNew +=
-              (thisKindNew.AtomCharge(p) * lambdaCoef * cos(dotProductNew));
-          sumImaginaryNew +=
-              (thisKindNew.AtomCharge(p) * lambdaCoef * sin(dotProductNew));
+          sumRealNew += (thisKindNew.AtomCharge(p) * lambdaCoef *
+                         cos(dotProductNew));
+          sumImaginaryNew += (thisKindNew.AtomCharge(p) * lambdaCoef *
+                              sin(dotProductNew));
         }
       }
 
-      // Subtract the sum of the old molecule
       for (uint m = 0; m < oldMol.size(); m++) {
         uint oldMoleculeIndex = molIndexOld[m];
         double lambdaCoef = GetLambdaCoef(oldMoleculeIndex, box);
         for (uint p = 0; p < lengthOld; ++p) {
           unsigned long currentAtom = mols.MolStart(oldMoleculeIndex) + p;
-          if (particleHasNoCharge[currentAtom]) {
+          if(particleHasNoCharge[currentAtom]) {
             continue;
           }
           double dotProductOld = Dot(p, kxRef[box][i], kyRef[box][i],
                                      kzRef[box][i], oldMol[m].GetCoords());
-
           // TODO: Using GNU extension we could improve this part of the code
           // by using sincos() function and merge sin() and cos() calculation
           // However, this will not work with Visual studio
           // Intel should automatically optimize this section by using
           // internal functions like __svml_sincosf8..()
-          sumRealNew -=
-              thisKindOld.AtomCharge(p) * lambdaCoef * cos(dotProductOld);
-          sumImaginaryNew -=
-              thisKindOld.AtomCharge(p) * lambdaCoef * sin(dotProductOld);
+          sumRealNew -= thisKindOld.AtomCharge(p) * lambdaCoef *
+                        cos(dotProductOld);
+          sumImaginaryNew -= thisKindOld.AtomCharge(p) * lambdaCoef *
+                             sin(dotProductOld);
         }
       }
 
       // Update the new sum value based on the difference and previous sum
-      // If this is the first call to this function within the same pair of
-      // molecules, then use the ref variable to update new However, if this is
-      // the second time calling it then use the previous result as reference
+      // If this is the first call to this function within the same pair of molecules,
+      // then use the ref variable to update new
+      // However, if this is the second time calling it then use the previous result as reference
       if (first_call) {
         sumRnew[box][i] = sumRref[box][i] + sumRealNew;
         sumInew[box][i] = sumIref[box][i] + sumImaginaryNew;
@@ -835,30 +887,15 @@ reduction(+:energyRecipNew)
         sumInew[box][i] += sumImaginaryNew;
       }
 
-      // Calculate new energy recip based on the new sum real and imaginary
-      // values
-      energyRecipNew += (sumRnew[box][i] * sumRnew[box][i] +
-                         sumInew[box][i] * sumInew[box][i]) *
-                        prefactRef[box][i];
+      // Calculate new energy recip based on the new sum real and imaginary values
+      energyRecipNew += (sumRnew[box][i] * sumRnew[box][i] + sumInew[box][i]
+                         * sumInew[box][i]) * prefactRef[box][i];
     }
-
-    // Keep hold of the old recip value
-    energyRecipOld = sysPotRef.boxEnergy[box].recip;
+#endif 
     GOMC_EVENT_STOP(1, GomcProfileEvent::RECIP_MEMC_ENERGY);
   }
-
-// Because MolExchangeReciprocal does not have a matching GPU function, this is
-// a stub function to copy the CPU sumRnew and sumInew vectors to the GPU in
-// case the move is accepted. If this function is ported to the GPU, this call
-// should be moved to the beginning of the MolExchangeReciprocal function and
-// called instead of running the CPU code.
-#ifdef GOMC_CUDA
-  CallMolExchangeReciprocalGPU(ff.particles->getCUDAVars(), imageSizeRef[box],
-                               sumRnew[box], sumInew[box], box);
-#endif
-
-  // Return the difference between old and new reciprocal energies
-  return energyRecipNew - energyRecipOld;
+  // Return the change in reciprocal energy
+  return energyRecipNew - sysPotRef.boxEnergy[box].recip;
 }
 
 // restore cosMol and sinMol
diff --git a/src/GPU/CalculateEwaldCUDAKernel.cu b/src/GPU/CalculateEwaldCUDAKernel.cu
index a68be9ce9..609a82149 100644
--- a/src/GPU/CalculateEwaldCUDAKernel.cu
+++ b/src/GPU/CalculateEwaldCUDAKernel.cu
@@ -373,10 +373,6 @@ void CallSwapReciprocalGPU(VariablesCUDA *vars, XYZArray const &coords,
   cudaDeviceSynchronize();
   checkLastErrorCUDA(__FILE__, __LINE__);
 #endif
-  // cudaMemcpy(sumRnew, vars->gpu_sumRnew[box], imageSize * sizeof(double),
-             // cudaMemcpyDeviceToHost);
-  // cudaMemcpy(sumInew, vars->gpu_sumInew[box], imageSize * sizeof(double),
-             // cudaMemcpyDeviceToHost);
 
   // ReduceSum
   DeviceReduce::Sum(vars->cub_reduce_storage, vars->cub_reduce_storage_size, vars->gpu_recipEnergies,
@@ -387,22 +383,131 @@ void CallSwapReciprocalGPU(VariablesCUDA *vars, XYZArray const &coords,
   CUFREE(gpu_particleCharge);
 }
 
-void CallMolExchangeReciprocalGPU(VariablesCUDA *vars, uint imageSize,
-                                  double *sumRnew, double *sumInew, uint box) {
-  cudaMemcpy(vars->gpu_sumRnew[box], sumRnew, imageSize * sizeof(double),
-             cudaMemcpyHostToDevice);
-  cudaMemcpy(vars->gpu_sumInew[box], sumInew, imageSize * sizeof(double),
-             cudaMemcpyHostToDevice);
-}
+void CallMolExchangeReciprocalGPU(VariablesCUDA *vars,
+                                  uint imageSize,
+                                  double *sumRnew,
+                                  double *sumInew,
+                                  uint box, 
+                                  std::vector<double> chargeBoxNew,
+                                  std::vector<double> chargeBoxOld,
+                                  uint lengthNew, uint lengthOld,
+                                  double &energyRecipNew,
+                                  XYZArray newMolCoords,
+                                  XYZArray oldMolCoords)
+{
+  uint atomsPerImage = lengthNew +lengthOld;
+  uint totalAtoms = atomsPerImage * imageSize;
+  
+  double *gpu_chargeBoxNew;
+  double *gpu_chargeBoxOld;
+  double *gpu_newMolX;
+  double *gpu_newMolY;
+  double *gpu_newMolZ;
+  double *gpu_oldMolX;
+  double *gpu_oldMolY;
+  double *gpu_oldMolZ;
+
+  cudaMemset(vars->gpu_recipEnergies, 0, imageSize * sizeof(double));
+
+  if(lengthNew > 0){
+    CUMALLOC((void**) &gpu_chargeBoxNew,
+              chargeBoxNew.size() * sizeof(double));
+    cudaMemcpy(gpu_chargeBoxNew, &chargeBoxNew[0], 
+               chargeBoxNew.size() * sizeof(double), 
+               cudaMemcpyHostToDevice);
+        
+  }
+  if(lengthOld > 0){
+    CUMALLOC((void**) &gpu_chargeBoxOld,
+              chargeBoxOld.size() * sizeof(double));
+    cudaMemcpy(gpu_chargeBoxOld, &chargeBoxOld[0], 
+               chargeBoxOld.size() * sizeof(double), 
+               cudaMemcpyHostToDevice);
+  }
+  CUMALLOC((void**) &gpu_newMolX, newMolCoords.Count() * sizeof(double));
+  CUMALLOC((void**) &gpu_newMolY, newMolCoords.Count() * sizeof(double));
+  CUMALLOC((void**) &gpu_newMolZ, newMolCoords.Count() * sizeof(double));
+  CUMALLOC((void**) &gpu_oldMolX, oldMolCoords.Count() * sizeof(double));
+  CUMALLOC((void**) &gpu_oldMolY, oldMolCoords.Count() * sizeof(double)); 
+  CUMALLOC((void**) &gpu_oldMolZ, oldMolCoords.Count() * sizeof(double));          
+  cudaMemcpy(gpu_newMolX, &newMolCoords.x[0], 
+              newMolCoords.Count() * sizeof(double), 
+              cudaMemcpyHostToDevice);
+  cudaMemcpy(gpu_newMolY, &newMolCoords.y[0], 
+              newMolCoords.Count() * sizeof(double), 
+              cudaMemcpyHostToDevice);
+  cudaMemcpy(gpu_newMolZ, &newMolCoords.z[0], 
+              newMolCoords.Count() * sizeof(double), 
+              cudaMemcpyHostToDevice);
+  cudaMemcpy(gpu_oldMolX, &oldMolCoords.x[0], 
+              oldMolCoords.Count() * sizeof(double), 
+              cudaMemcpyHostToDevice);
+  cudaMemcpy(gpu_oldMolY, &oldMolCoords.y[0], 
+              oldMolCoords.Count() * sizeof(double), 
+              cudaMemcpyHostToDevice);
+  cudaMemcpy(gpu_oldMolZ, &oldMolCoords.z[0], 
+              oldMolCoords.Count() * sizeof(double), 
+              cudaMemcpyHostToDevice);
+
+  int threadsPerBlock = 256;
+  int blocksPerGrid = (int)(totalAtoms / threadsPerBlock) + 1;
+  int dynamicSharedMemorySize = 4 * sizeof(double) * (lengthNew + lengthOld);
+  MolExchangeReciprocalGPU<<< blocksPerGrid, threadsPerBlock, dynamicSharedMemorySize>>>(
+        imageSize,
+        vars->gpu_kxRef[box],
+        vars->gpu_kyRef[box],
+        vars->gpu_kzRef[box],
+        vars->gpu_sumRnew[box],
+        vars->gpu_sumInew[box],
+        gpu_chargeBoxNew,
+        gpu_chargeBoxOld,
+        lengthNew,
+        lengthOld,
+        gpu_newMolX,
+        gpu_newMolY,
+        gpu_newMolZ,
+        gpu_oldMolX,
+        gpu_oldMolY,
+        gpu_oldMolZ);
+#ifndef NDEBUG
+  cudaDeviceSynchronize();
+  checkLastErrorCUDA(__FILE__, __LINE__);
+#endif
 
-void CallMolExchangeReciprocalStartGPU(VariablesCUDA *vars, uint imageSize,
-                                  double *sumRref, double *sumIref, uint box) {
-  cudaMemcpy(sumRref, vars->gpu_sumRref[box], imageSize * sizeof(double),
-             cudaMemcpyDeviceToHost);
-  cudaMemcpy(sumIref, vars->gpu_sumIref[box], imageSize * sizeof(double),
-             cudaMemcpyDeviceToHost);
+  blocksPerGrid = (int)(imageSize / threadsPerBlock) + 1;
+  BoxReciprocalGPU <<< blocksPerGrid, threadsPerBlock>>>(
+        vars->gpu_prefactRef[box],
+        vars->gpu_sumRnew[box],
+        vars->gpu_sumInew[box],
+        vars->gpu_recipEnergies,
+		imageSize);
+ #ifndef NDEBUG
+  cudaDeviceSynchronize();
+  checkLastErrorCUDA(__FILE__, __LINE__);
+#endif
+
+  // ReduceSum
+
+   DeviceReduce::Sum(vars->cub_reduce_storage, vars->cub_reduce_storage_size, vars->gpu_recipEnergies,
+                    vars->gpu_finalVal, imageSize);
+  cudaMemcpy(&energyRecipNew, vars->gpu_finalVal,
+             sizeof(double), cudaMemcpyDeviceToHost);
+
+  if(lengthNew > 0){
+    CUFREE(gpu_chargeBoxNew);
+  }
+  if(lengthOld > 0){
+    CUFREE(gpu_chargeBoxOld);
+  }
+  CUFREE(gpu_newMolX); 
+  CUFREE(gpu_newMolY);
+  CUFREE(gpu_newMolZ);
+  CUFREE(gpu_oldMolX);
+  CUFREE(gpu_oldMolY);
+  CUFREE(gpu_oldMolZ);
 }
 
+
 void CallBoxForceReciprocalGPU(
     VariablesCUDA *vars, XYZArray &atomForceRec, XYZArray &molForceRec,
     const std::vector<double> &particleCharge,
@@ -689,6 +794,73 @@ __global__ void SwapReciprocalGPU(double *gpu_x, double *gpu_y, double *gpu_z,
        gpu_prefactRef[threadID]);
 }
 
+
+__global__ void MolExchangeReciprocalGPU(
+                                  int imageSize, 
+                                  double *gpu_kx, 
+                                  double *gpu_ky, 
+                                  double *gpu_kz,
+                                  double *gpu_sumRnew,
+                                  double *gpu_sumInew,
+                                  double *gpu_chargeBoxNew,
+                                  double *gpu_chargeBoxOld, 
+                                  uint lengthNew, 
+                                  uint lengthOld,
+                                  double *gpu_newMolX,
+                                  double *gpu_newMolY,
+                                  double *gpu_newMolZ,
+                                  double *gpu_oldMolX,
+                                  double *gpu_oldMolY,
+                                  double *gpu_oldMolZ)
+{
+  int threadID = blockIdx.x * blockDim.x + threadIdx.x;
+  int chargeBoxLength = lengthNew + lengthOld;
+
+  extern __shared__ double shared_arr[]; 
+  double* shared_chargeBox = (double *) shared_arr; 
+  double* shared_Mol = (double *) &shared_chargeBox[chargeBoxLength]; 
+
+  if(threadIdx.x < lengthNew) { 
+    shared_Mol[threadIdx.x * 3] = gpu_newMolX[threadIdx.x];
+    shared_Mol[threadIdx.x * 3 + 1] = gpu_newMolY[threadIdx.x];
+    shared_Mol[threadIdx.x * 3 + 2] = gpu_newMolZ[threadIdx.x];
+    shared_chargeBox[threadIdx.x] = gpu_chargeBoxNew[threadIdx.x];
+  }
+  else if (threadIdx.x < chargeBoxLength) {
+    int gpu_oldMolIndex = threadIdx.x - lengthNew;
+    shared_Mol[threadIdx.x * 3] = gpu_oldMolX[gpu_oldMolIndex];
+    shared_Mol[threadIdx.x * 3 + 1] = gpu_oldMolY[gpu_oldMolIndex];
+    shared_Mol[threadIdx.x * 3 + 2] = gpu_oldMolZ[gpu_oldMolIndex];
+    shared_chargeBox[threadIdx.x] = -gpu_chargeBoxOld[gpu_oldMolIndex];
+  }
+  __syncthreads();
+
+  //wait until the shared array is loaded before deciding that we don't need these threads
+  if (threadID >= imageSize * chargeBoxLength)
+    return;
+
+  // for each new & old atom index, loop thru each image
+  int p = threadID / imageSize;
+  int imageID = threadID % imageSize;
+
+  double dotProduct = DotProductGPU(gpu_kx[imageID],
+                                    gpu_ky[imageID],
+                                    gpu_kz[imageID],
+                                    shared_Mol[p * 3],
+                                    shared_Mol[p * 3 + 1],
+                                    shared_Mol[p * 3 + 2]);
+									
+  double dotsin, dotcos;
+  sincos(dotProduct, &dotsin, &dotcos);
+
+  double sumRealNew = shared_chargeBox[p] * dotcos;
+  double sumImaginaryNew = shared_chargeBox[p] * dotsin;
+
+  atomicAdd(&gpu_sumRnew[imageID], sumRealNew);
+  atomicAdd(&gpu_sumInew[imageID], sumImaginaryNew);
+}
+
+
 __global__ void MolReciprocalGPU(double *gpu_cx, double *gpu_cy, double *gpu_cz,
                                  double *gpu_nx, double *gpu_ny, double *gpu_nz,
                                  double *gpu_kx, double *gpu_ky, double *gpu_kz,
diff --git a/src/GPU/CalculateEwaldCUDAKernel.cuh b/src/GPU/CalculateEwaldCUDAKernel.cuh
index a4fb130f2..a6b03ea1a 100644
--- a/src/GPU/CalculateEwaldCUDAKernel.cuh
+++ b/src/GPU/CalculateEwaldCUDAKernel.cuh
@@ -91,13 +91,14 @@ void CallMolExchangeReciprocalGPU(VariablesCUDA *vars,
                                   uint imageSize,
                                   double *sumRnew,
                                   double *sumInew,
-                                  uint box);
+                                  uint box, 
+                                  std::vector<double> chargeBoxNew,
+                                  std::vector<double> chargeBoxOld,
+                                  uint lengthNew, uint lengthOld,
+                                  double &energyRecipNew,
+                                  XYZArray newMolCoords,
+                                  XYZArray oldMolCoords);
 
-void CallMolExchangeReciprocalStartGPU(VariablesCUDA *vars,
-                                  uint imageSize,
-                                  double *sumRref,
-                                  double *sumIref,
-                                  uint box);
 
 __global__ void BoxForceReciprocalGPU(double *gpu_aForceRecx,
                                       double *gpu_aForceRecy,
@@ -191,6 +192,24 @@ __global__ void SwapReciprocalGPU(double *gpu_x, double *gpu_y, double *gpu_z,
                                   double *gpu_RecipEnergies,
                                   int imageSize);
 
+__global__ void MolExchangeReciprocalGPU(
+                                  int imageSize,
+                                  double *gpu_kx,
+                                  double *gpu_ky,
+                                  double *gpu_kz,
+                                  double *gpu_sumRnew,
+                                  double *gpu_sumInew,
+                                  double *gpu_chargeBoxNew,
+                                  double *gpu_chargeBoxOld,
+                                  uint lengthNew,
+                                  uint lengthOld,
+                                  double *gpu_newMolX,
+                                  double *gpu_newMolY,
+                                  double *gpu_newMolZ,
+                                  double *gpu_oldMolX,
+                                  double *gpu_oldMolY,
+                                  double *gpu_oldMolZ);
+
 __global__ void NewSwapReciprocalGPU(VariablesCUDA *vars,
                                   int atomNumber,
                                   uint box,