diff --git a/source/lib/include/prod_force.h b/source/lib/include/prod_force.h index 3c9bd89549..661e998d12 100644 --- a/source/lib/include/prod_force.h +++ b/source/lib/include/prod_force.h @@ -2,6 +2,19 @@ namespace deepmd { +/** + * @brief Produce force from net_deriv and in_deriv. + * + * @tparam FPTYPE float or double + * @param[out] force Atomic forces. + * @param[in] net_deriv Net derivative. + * @param[in] in_deriv Environmental derivative. + * @param[in] nlist Neighbor list. + * @param[in] nloc The number of local atoms. + * @param[in] nall The number of all atoms, including ghost atoms. + * @param[in] nnei The number of neighbors. + * @param[in] nframes The number of frames. + */ template void prod_force_a_cpu(FPTYPE* force, const FPTYPE* net_deriv, @@ -10,7 +23,38 @@ void prod_force_a_cpu(FPTYPE* force, const int nloc, const int nall, const int nnei, - const int start_index = 0); + const int nframes); + +/** + * @brief Produce force from net_deriv and in_deriv. + * @details This function is used for multi-threading. Only part of atoms + * are computed in this thread. They will be comptued in parallel. + * + * @tparam FPTYPE float or double + * @param[out] force Atomic forces. + * @param[in] net_deriv Net derivative. + * @param[in] in_deriv Environmental derivative. + * @param[in] nlist Neighbor list. + * @param[in] nloc The number of local atoms. + * @param[in] nall The number of all atoms, including ghost atoms. + * @param[in] nnei The number of neighbors. + * @param[in] nframes The number of frames. + * @param[in] thread_nloc The number of local atoms to be computed in this + * thread. + * @param[in] thread_start_index The start index of local atoms to be computed + * in this thread. The index should be in [0, nloc). + */ +template +void prod_force_a_cpu(FPTYPE* force, + const FPTYPE* net_deriv, + const FPTYPE* in_deriv, + const int* nlist, + const int nloc, + const int nall, + const int nnei, + const int nframes, + const int thread_nloc, + const int thread_start_index); template void prod_force_r_cpu(FPTYPE* force, @@ -19,7 +63,8 @@ void prod_force_r_cpu(FPTYPE* force, const int* nlist, const int nloc, const int nall, - const int nnei); + const int nnei, + const int nframes); #if GOOGLE_CUDA template @@ -29,7 +74,8 @@ void prod_force_a_gpu_cuda(FPTYPE* force, const int* nlist, const int nloc, const int nall, - const int nnei); + const int nnei, + const int nframes); template void prod_force_r_gpu_cuda(FPTYPE* force, @@ -38,7 +84,8 @@ void prod_force_r_gpu_cuda(FPTYPE* force, const int* nlist, const int nloc, const int nall, - const int nnei); + const int nnei, + const int nframes); #endif // GOOGLE_CUDA #if TENSORFLOW_USE_ROCM @@ -49,7 +96,8 @@ void prod_force_a_gpu_rocm(FPTYPE* force, const int* nlist, const int nloc, const int nall, - const int nnei); + const int nnei, + const int nframes); template void prod_force_r_gpu_rocm(FPTYPE* force, @@ -58,7 +106,8 @@ void prod_force_r_gpu_rocm(FPTYPE* force, const int* nlist, const int nloc, const int nall, - const int nnei); + const int nnei, + const int nframes); #endif // TENSORFLOW_USE_ROCM } // namespace deepmd diff --git a/source/lib/src/cuda/prod_force.cu b/source/lib/src/cuda/prod_force.cu index db3683eab7..80f4eaed50 100644 --- a/source/lib/src/cuda/prod_force.cu +++ b/source/lib/src/cuda/prod_force.cu @@ -5,7 +5,9 @@ template __global__ void force_deriv_wrt_center_atom(FPTYPE* force, const FPTYPE* net_deriv, const FPTYPE* in_deriv, - const int ndescrpt) { + const int ndescrpt, + const int nloc, + const int nall) { __shared__ FPTYPE data[THREADS_PER_BLOCK * 3]; int_64 bid = blockIdx.x; unsigned int tid = threadIdx.x; @@ -31,10 +33,13 @@ __global__ void force_deriv_wrt_center_atom(FPTYPE* force, __syncthreads(); } // write result for this block to global memory + const int_64 kk = bid / nloc; // frame index + const int_64 ll = bid % nloc; // atom index + const int_64 i_idx_nall = kk * nall + ll; if (tid == 0) { - force[bid * 3 + 0] -= data[THREADS_PER_BLOCK * 0]; - force[bid * 3 + 1] -= data[THREADS_PER_BLOCK * 1]; - force[bid * 3 + 2] -= data[THREADS_PER_BLOCK * 2]; + force[i_idx_nall * 3 + 0] -= data[THREADS_PER_BLOCK * 0]; + force[i_idx_nall * 3 + 1] -= data[THREADS_PER_BLOCK * 1]; + force[i_idx_nall * 3 + 2] -= data[THREADS_PER_BLOCK * 2]; } } @@ -44,6 +49,7 @@ __global__ void force_deriv_wrt_neighbors_a(FPTYPE* force, const FPTYPE* in_deriv, const int* nlist, const int nloc, + const int nall, const int nnei) { // idy -> nnei const int_64 idx = blockIdx.x; @@ -63,7 +69,8 @@ __global__ void force_deriv_wrt_neighbors_a(FPTYPE* force, force_tmp += net_deriv[idx * ndescrpt + idy * 4 + idw] * in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz]; } - atomicAdd(force + j_idx * 3 + idz, force_tmp); + const int_64 kk = idx / nloc; // frame index + atomicAdd(force + kk * nall * 3 + j_idx * 3 + idz, force_tmp); } template @@ -72,6 +79,7 @@ __global__ void force_deriv_wrt_neighbors_r(FPTYPE* force, const FPTYPE* in_deriv, const int* nlist, const int nloc, + const int nall, const int nnei) { // idy -> nnei const int_64 idx = blockIdx.x; @@ -86,7 +94,8 @@ __global__ void force_deriv_wrt_neighbors_r(FPTYPE* force, if (j_idx < 0) { return; } - atomicAdd(force + j_idx * 3 + idz, + const int_64 kk = idx / nloc; // frame index + atomicAdd(force + kk * nall * 3 + j_idx * 3 + idz, net_deriv[idx * ndescrpt + idy] * in_deriv[idx * ndescrpt * 3 + idy * 3 + idz]); } @@ -99,21 +108,22 @@ void prod_force_a_gpu_cuda(FPTYPE* force, const int* nlist, const int nloc, const int nall, - const int nnei) { + const int nnei, + const int nframes) { const int ndescrpt = nnei * 4; - DPErrcheck(cudaMemset(force, 0, sizeof(FPTYPE) * nall * 3)); + DPErrcheck(cudaMemset(force, 0, sizeof(FPTYPE) * nframes * nall * 3)); - force_deriv_wrt_center_atom - <<>>(force, net_deriv, in_deriv, ndescrpt); + force_deriv_wrt_center_atom<<>>( + force, net_deriv, in_deriv, ndescrpt, nloc, nall); DPErrcheck(cudaGetLastError()); DPErrcheck(cudaDeviceSynchronize()); const int LEN = 64; const int nblock = (nnei + LEN - 1) / LEN; - dim3 block_grid(nloc, nblock); + dim3 block_grid(nframes * nloc, nblock); dim3 thread_grid(LEN, 3); force_deriv_wrt_neighbors_a<<>>( - force, net_deriv, in_deriv, nlist, nloc, nnei); + force, net_deriv, in_deriv, nlist, nloc, nall, nnei); DPErrcheck(cudaGetLastError()); DPErrcheck(cudaDeviceSynchronize()); } @@ -125,21 +135,22 @@ void prod_force_r_gpu_cuda(FPTYPE* force, const int* nlist, const int nloc, const int nall, - const int nnei) { + const int nnei, + const int nframes) { const int ndescrpt = nnei * 1; - DPErrcheck(cudaMemset(force, 0, sizeof(FPTYPE) * nall * 3)); + DPErrcheck(cudaMemset(force, 0, sizeof(FPTYPE) * nframes * nall * 3)); - force_deriv_wrt_center_atom - <<>>(force, net_deriv, in_deriv, ndescrpt); + force_deriv_wrt_center_atom<<>>( + force, net_deriv, in_deriv, ndescrpt, nloc, nall); DPErrcheck(cudaGetLastError()); DPErrcheck(cudaDeviceSynchronize()); const int LEN = 64; const int nblock = (nnei + LEN - 1) / LEN; - dim3 block_grid(nloc, nblock); + dim3 block_grid(nframes * nloc, nblock); dim3 thread_grid(LEN, 3); force_deriv_wrt_neighbors_r<<>>( - force, net_deriv, in_deriv, nlist, nloc, nnei); + force, net_deriv, in_deriv, nlist, nloc, nall, nnei); DPErrcheck(cudaGetLastError()); DPErrcheck(cudaDeviceSynchronize()); } @@ -150,26 +161,30 @@ template void prod_force_a_gpu_cuda(float* force, const int* nlist, const int nloc, const int nall, - const int nnei); + const int nnei, + const int nframes); template void prod_force_a_gpu_cuda(double* force, const double* net_deriv, const double* in_deriv, const int* nlist, const int nloc, const int nall, - const int nnei); + const int nnei, + const int nframes); template void prod_force_r_gpu_cuda(float* force, const float* net_deriv, const float* in_deriv, const int* nlist, const int nloc, const int nall, - const int nnei); + const int nnei, + const int nframes); template void prod_force_r_gpu_cuda(double* force, const double* net_deriv, const double* in_deriv, const int* nlist, const int nloc, const int nall, - const int nnei); + const int nnei, + const int nframes); } // namespace deepmd diff --git a/source/lib/src/prod_force.cc b/source/lib/src/prod_force.cc index b258ff1d3f..8cd67d8c7a 100644 --- a/source/lib/src/prod_force.cc +++ b/source/lib/src/prod_force.cc @@ -27,20 +27,26 @@ void deepmd::prod_force_a_cpu(FPTYPE* force, const int nloc, const int nall, const int nnei, - const int start_index) { + const int nframes, + const int thread_nloc, + const int thread_start_index) { const int ndescrpt = 4 * nnei; - memset(force, 0, sizeof(FPTYPE) * nall * 3); + memset(force, 0, sizeof(FPTYPE) * nframes * nall * 3); // compute force of a frame - for (int i_idx = start_index; i_idx < start_index + nloc; ++i_idx) { + for (int i_idx = nframes * thread_start_index; + i_idx < nframes * (thread_start_index + thread_nloc); ++i_idx) { + int kk = i_idx / nloc; // frame index + int ll = i_idx % nloc; // atom index + int i_idx_nall = kk * nall + ll; // deriv wrt center atom for (int aa = 0; aa < ndescrpt; ++aa) { - force[i_idx * 3 + 0] -= net_deriv[i_idx * ndescrpt + aa] * - env_deriv[i_idx * ndescrpt * 3 + aa * 3 + 0]; - force[i_idx * 3 + 1] -= net_deriv[i_idx * ndescrpt + aa] * - env_deriv[i_idx * ndescrpt * 3 + aa * 3 + 1]; - force[i_idx * 3 + 2] -= net_deriv[i_idx * ndescrpt + aa] * - env_deriv[i_idx * ndescrpt * 3 + aa * 3 + 2]; + force[i_idx_nall * 3 + 0] -= net_deriv[i_idx * ndescrpt + aa] * + env_deriv[i_idx * ndescrpt * 3 + aa * 3 + 0]; + force[i_idx_nall * 3 + 1] -= net_deriv[i_idx * ndescrpt + aa] * + env_deriv[i_idx * ndescrpt * 3 + aa * 3 + 1]; + force[i_idx_nall * 3 + 2] -= net_deriv[i_idx * ndescrpt + aa] * + env_deriv[i_idx * ndescrpt * 3 + aa * 3 + 2]; } // deriv wrt neighbors for (int jj = 0; jj < nnei; ++jj) { @@ -49,17 +55,56 @@ void deepmd::prod_force_a_cpu(FPTYPE* force, int aa_start, aa_end; make_index_range(aa_start, aa_end, jj, nnei); for (int aa = aa_start; aa < aa_end; ++aa) { - force[j_idx * 3 + 0] += net_deriv[i_idx * ndescrpt + aa] * - env_deriv[i_idx * ndescrpt * 3 + aa * 3 + 0]; - force[j_idx * 3 + 1] += net_deriv[i_idx * ndescrpt + aa] * - env_deriv[i_idx * ndescrpt * 3 + aa * 3 + 1]; - force[j_idx * 3 + 2] += net_deriv[i_idx * ndescrpt + aa] * - env_deriv[i_idx * ndescrpt * 3 + aa * 3 + 2]; + force[kk * nall * 3 + j_idx * 3 + 0] += + net_deriv[i_idx * ndescrpt + aa] * + env_deriv[i_idx * ndescrpt * 3 + aa * 3 + 0]; + force[kk * nall * 3 + j_idx * 3 + 1] += + net_deriv[i_idx * ndescrpt + aa] * + env_deriv[i_idx * ndescrpt * 3 + aa * 3 + 1]; + force[kk * nall * 3 + j_idx * 3 + 2] += + net_deriv[i_idx * ndescrpt + aa] * + env_deriv[i_idx * ndescrpt * 3 + aa * 3 + 2]; } } } } +// overload to provide default values +template +void deepmd::prod_force_a_cpu(FPTYPE* force, + const FPTYPE* net_deriv, + const FPTYPE* env_deriv, + const int* nlist, + const int nloc, + const int nall, + const int nnei, + const int nframes) { + deepmd::prod_force_a_cpu(force, net_deriv, env_deriv, nlist, nloc, nall, nnei, + nframes, nloc, 0); +}; + +template void deepmd::prod_force_a_cpu(double* force, + const double* net_deriv, + const double* env_deriv, + const int* nlist, + const int nloc, + const int nall, + const int nnei, + const int nframes, + const int thread_nloc, + const int thread_start_index); + +template void deepmd::prod_force_a_cpu(float* force, + const float* net_deriv, + const float* env_deriv, + const int* nlist, + const int nloc, + const int nall, + const int nnei, + const int nframes, + const int thread_nloc, + const int thread_start_index); + template void deepmd::prod_force_a_cpu(double* force, const double* net_deriv, const double* env_deriv, @@ -67,7 +112,7 @@ template void deepmd::prod_force_a_cpu(double* force, const int nloc, const int nall, const int nnei, - const int start_index); + const int nframes); template void deepmd::prod_force_a_cpu(float* force, const float* net_deriv, @@ -76,7 +121,7 @@ template void deepmd::prod_force_a_cpu(float* force, const int nloc, const int nall, const int nnei, - const int start_index); + const int nframes); template void deepmd::prod_force_r_cpu(FPTYPE* force, @@ -85,10 +130,11 @@ void deepmd::prod_force_r_cpu(FPTYPE* force, const int* nlist, const int nloc, const int nall, - const int nnei) { + const int nnei, + const int nframes) { const int ndescrpt = 1 * nnei; - for (int ii = 0; ii < nall; ++ii) { + for (int ii = 0; ii < nframes * nall; ++ii) { int i_idx = ii; force[i_idx * 3 + 0] = (FPTYPE)0.; force[i_idx * 3 + 1] = (FPTYPE)0.; @@ -96,28 +142,34 @@ void deepmd::prod_force_r_cpu(FPTYPE* force, } // compute force of a frame - for (int ii = 0; ii < nloc; ++ii) { + for (int ii = 0; ii < nframes * nloc; ++ii) { + int kk = ii / nloc; // frame index + int ll = ii % nloc; // atom index + int i_idx_nall = kk * nall + ll; int i_idx = ii; // deriv wrt center atom for (int aa = 0; aa < ndescrpt; ++aa) { - force[i_idx * 3 + 0] -= net_deriv[i_idx * ndescrpt + aa] * - env_deriv[i_idx * ndescrpt * 3 + aa * 3 + 0]; - force[i_idx * 3 + 1] -= net_deriv[i_idx * ndescrpt + aa] * - env_deriv[i_idx * ndescrpt * 3 + aa * 3 + 1]; - force[i_idx * 3 + 2] -= net_deriv[i_idx * ndescrpt + aa] * - env_deriv[i_idx * ndescrpt * 3 + aa * 3 + 2]; + force[i_idx_nall * 3 + 0] -= net_deriv[i_idx * ndescrpt + aa] * + env_deriv[i_idx * ndescrpt * 3 + aa * 3 + 0]; + force[i_idx_nall * 3 + 1] -= net_deriv[i_idx * ndescrpt + aa] * + env_deriv[i_idx * ndescrpt * 3 + aa * 3 + 1]; + force[i_idx_nall * 3 + 2] -= net_deriv[i_idx * ndescrpt + aa] * + env_deriv[i_idx * ndescrpt * 3 + aa * 3 + 2]; } // deriv wrt neighbors for (int jj = 0; jj < nnei; ++jj) { int j_idx = nlist[i_idx * nnei + jj]; // if (j_idx > nloc) j_idx = j_idx % nloc; if (j_idx < 0) continue; - force[j_idx * 3 + 0] += net_deriv[i_idx * ndescrpt + jj] * - env_deriv[i_idx * ndescrpt * 3 + jj * 3 + 0]; - force[j_idx * 3 + 1] += net_deriv[i_idx * ndescrpt + jj] * - env_deriv[i_idx * ndescrpt * 3 + jj * 3 + 1]; - force[j_idx * 3 + 2] += net_deriv[i_idx * ndescrpt + jj] * - env_deriv[i_idx * ndescrpt * 3 + jj * 3 + 2]; + force[kk * nall * 3 + j_idx * 3 + 0] += + net_deriv[i_idx * ndescrpt + jj] * + env_deriv[i_idx * ndescrpt * 3 + jj * 3 + 0]; + force[kk * nall * 3 + j_idx * 3 + 1] += + net_deriv[i_idx * ndescrpt + jj] * + env_deriv[i_idx * ndescrpt * 3 + jj * 3 + 1]; + force[kk * nall * 3 + j_idx * 3 + 2] += + net_deriv[i_idx * ndescrpt + jj] * + env_deriv[i_idx * ndescrpt * 3 + jj * 3 + 2]; } } } @@ -128,7 +180,8 @@ template void deepmd::prod_force_r_cpu(double* force, const int* nlist, const int nloc, const int nall, - const int nnei); + const int nnei, + const int nframes); template void deepmd::prod_force_r_cpu(float* force, const float* net_deriv, @@ -136,4 +189,5 @@ template void deepmd::prod_force_r_cpu(float* force, const int* nlist, const int nloc, const int nall, - const int nnei); + const int nnei, + const int nframes); diff --git a/source/lib/src/rocm/prod_force.hip.cu b/source/lib/src/rocm/prod_force.hip.cu index cf2df5d4a8..bc4fa15078 100644 --- a/source/lib/src/rocm/prod_force.hip.cu +++ b/source/lib/src/rocm/prod_force.hip.cu @@ -5,7 +5,9 @@ template __global__ void force_deriv_wrt_center_atom(FPTYPE* force, const FPTYPE* net_deriv, const FPTYPE* in_deriv, - const int ndescrpt) { + const int ndescrpt, + const int nloc, + const int nall) { __shared__ FPTYPE data[THREADS_PER_BLOCK * 3]; int_64 bid = blockIdx.x; unsigned int tid = threadIdx.x; @@ -31,10 +33,13 @@ __global__ void force_deriv_wrt_center_atom(FPTYPE* force, __syncthreads(); } // write result for this block to global memory + const int_64 kk = bid / nloc; // frame index + const int_64 ll = bid % nloc; // atom index + const int_64 i_idx_nall = kk * nall + ll; if (tid == 0) { - force[bid * 3 + 0] -= data[THREADS_PER_BLOCK * 0]; - force[bid * 3 + 1] -= data[THREADS_PER_BLOCK * 1]; - force[bid * 3 + 2] -= data[THREADS_PER_BLOCK * 2]; + force[i_idx_nall * 3 + 0] -= data[THREADS_PER_BLOCK * 0]; + force[i_idx_nall * 3 + 1] -= data[THREADS_PER_BLOCK * 1]; + force[i_idx_nall * 3 + 2] -= data[THREADS_PER_BLOCK * 2]; } } @@ -44,6 +49,7 @@ __global__ void force_deriv_wrt_neighbors_a(FPTYPE* force, const FPTYPE* in_deriv, const int* nlist, const int nloc, + const int nall, const int nnei) { // idy -> nnei const int_64 idx = blockIdx.x; @@ -63,7 +69,8 @@ __global__ void force_deriv_wrt_neighbors_a(FPTYPE* force, force_tmp += net_deriv[idx * ndescrpt + idy * 4 + idw] * in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz]; } - atomicAdd(force + j_idx * 3 + idz, force_tmp); + const int_64 kk = idx / nloc; // frame index + atomicAdd(force + kk * nall * 3 + j_idx * 3 + idz, force_tmp); } template @@ -72,6 +79,7 @@ __global__ void force_deriv_wrt_neighbors_r(FPTYPE* force, const FPTYPE* in_deriv, const int* nlist, const int nloc, + const int nall, const int nnei) { // idy -> nnei const int_64 idx = blockIdx.x; @@ -86,7 +94,8 @@ __global__ void force_deriv_wrt_neighbors_r(FPTYPE* force, if (j_idx < 0) { return; } - atomicAdd(force + j_idx * 3 + idz, + const int_64 kk = idx / nloc; // frame index + atomicAdd(force + kk * nall * 3 + j_idx * 3 + idz, net_deriv[idx * ndescrpt + idy] * in_deriv[idx * ndescrpt * 3 + idy * 3 + idz]); } @@ -99,21 +108,23 @@ void prod_force_a_gpu_rocm(FPTYPE* force, const int* nlist, const int nloc, const int nall, - const int nnei) { + const int nnei, + const int nframes) { const int ndescrpt = nnei * 4; - DPErrcheck(hipMemset(force, 0, sizeof(FPTYPE) * nall * 3)); + DPErrcheck(hipMemset(force, 0, sizeof(FPTYPE) * nframes * nall * 3)); hipLaunchKernelGGL(HIP_KERNEL_NAME(force_deriv_wrt_center_atom), - nloc, TPB, 0, 0, force, net_deriv, in_deriv, ndescrpt); + nframes * nloc, TPB, 0, 0, force, net_deriv, in_deriv, + ndescrpt, nloc, nall); DPErrcheck(hipGetLastError()); DPErrcheck(hipDeviceSynchronize()); const int LEN = 64; const int nblock = (nnei + LEN - 1) / LEN; - dim3 block_grid(nloc, nblock); + dim3 block_grid(nframes * nloc, nblock); dim3 thread_grid(LEN, 3); hipLaunchKernelGGL(force_deriv_wrt_neighbors_a, block_grid, thread_grid, 0, 0, - force, net_deriv, in_deriv, nlist, nloc, nnei); + force, net_deriv, in_deriv, nlist, nloc, nall, nnei); DPErrcheck(hipGetLastError()); DPErrcheck(hipDeviceSynchronize()); } @@ -125,21 +136,23 @@ void prod_force_r_gpu_rocm(FPTYPE* force, const int* nlist, const int nloc, const int nall, - const int nnei) { + const int nnei, + const int nframes) { const int ndescrpt = nnei * 1; - DPErrcheck(hipMemset(force, 0, sizeof(FPTYPE) * nall * 3)); + DPErrcheck(hipMemset(force, 0, sizeof(FPTYPE) * nframes * nall * 3)); hipLaunchKernelGGL(HIP_KERNEL_NAME(force_deriv_wrt_center_atom), - nloc, TPB, 0, 0, force, net_deriv, in_deriv, ndescrpt); + nframes * nloc, TPB, 0, 0, force, net_deriv, in_deriv, + ndescrpt, nloc, nall); DPErrcheck(hipGetLastError()); DPErrcheck(hipDeviceSynchronize()); const int LEN = 64; const int nblock = (nnei + LEN - 1) / LEN; - dim3 block_grid(nloc, nblock); + dim3 block_grid(nframes * nloc, nblock); dim3 thread_grid(LEN, 3); hipLaunchKernelGGL(force_deriv_wrt_neighbors_r, block_grid, thread_grid, 0, 0, - force, net_deriv, in_deriv, nlist, nloc, nnei); + force, net_deriv, in_deriv, nlist, nloc, nall, nnei); DPErrcheck(hipGetLastError()); DPErrcheck(hipDeviceSynchronize()); } @@ -150,27 +163,31 @@ template void prod_force_a_gpu_rocm(float* force, const int* nlist, const int nloc, const int nall, - const int nnei); + const int nnei, + const int nframes); template void prod_force_a_gpu_rocm(double* force, const double* net_deriv, const double* in_deriv, const int* nlist, const int nloc, const int nall, - const int nnei); + const int nnei, + const int nframes); template void prod_force_r_gpu_rocm(float* force, const float* net_deriv, const float* in_deriv, const int* nlist, const int nloc, const int nall, - const int nnei); + const int nnei, + const int nframes); template void prod_force_r_gpu_rocm(double* force, const double* net_deriv, const double* in_deriv, const int* nlist, const int nloc, const int nall, - const int nnei); + const int nnei, + const int nframes); } // namespace deepmd diff --git a/source/lib/tests/test_prod_force_a.cc b/source/lib/tests/test_prod_force_a.cc index 03ef65fa3f..7fbebf1b57 100644 --- a/source/lib/tests/test_prod_force_a.cc +++ b/source/lib/tests/test_prod_force_a.cc @@ -8,6 +8,11 @@ #include "neighbor_list.h" #include "prod_force.h" +template +inline void double_vec(std::vector& v) { + v.insert(std::end(v), std::begin(v), std::end(v)); +} + class TestProdForceA : public ::testing::Test { protected: std::vector posi = {12.83, 2.56, 2.18, 12.09, 2.87, 2.74, @@ -16,6 +21,7 @@ class TestProdForceA : public ::testing::Test { std::vector atype = {0, 1, 1, 0, 1, 1}; std::vector posi_cpy; std::vector atype_cpy; + int nframes = 2; int ntypes = 2; int nloc, nall, nnei, ndescrpt; double rc = 6; @@ -102,16 +108,20 @@ class TestProdForceA : public ::testing::Test { for (int ii = 0; ii < nloc * ndescrpt; ++ii) { net_deriv[ii] = 10 - ii * 0.01; } + double_vec(nlist); + double_vec(net_deriv); + double_vec(env_deriv); + double_vec(expected_force); } void TearDown() override {} }; TEST_F(TestProdForceA, cpu) { - std::vector force(nall * 3); + std::vector force(nframes * nall * 3); int n_a_sel = nnei; deepmd::prod_force_a_cpu(&force[0], &net_deriv[0], &env_deriv[0], - &nlist[0], nloc, nall, nnei); - EXPECT_EQ(force.size(), nall * 3); + &nlist[0], nloc, nall, nnei, nframes); + EXPECT_EQ(force.size(), nframes * nall * 3); EXPECT_EQ(force.size(), expected_force.size()); for (int jj = 0; jj < force.size(); ++jj) { EXPECT_LT(fabs(force[jj] - expected_force[jj]), 1e-5); @@ -124,7 +134,7 @@ TEST_F(TestProdForceA, cpu) { #if GOOGLE_CUDA TEST_F(TestProdForceA, gpu_cuda) { - std::vector force(nall * 3, 0.0); + std::vector force(nframes * nall * 3, 0.0); int n_a_sel = nnei; int* nlist_dev = NULL; @@ -136,7 +146,7 @@ TEST_F(TestProdForceA, gpu_cuda) { deepmd::malloc_device_memory_sync(env_deriv_dev, env_deriv); deepmd::prod_force_a_gpu_cuda(force_dev, net_deriv_dev, env_deriv_dev, - nlist_dev, nloc, nall, nnei); + nlist_dev, nloc, nall, nnei, nframes); deepmd::memcpy_device_to_host(force_dev, force); deepmd::delete_device_memory(nlist_dev); @@ -144,7 +154,7 @@ TEST_F(TestProdForceA, gpu_cuda) { deepmd::delete_device_memory(net_deriv_dev); deepmd::delete_device_memory(env_deriv_dev); - EXPECT_EQ(force.size(), nall * 3); + EXPECT_EQ(force.size(), nframes * nall * 3); EXPECT_EQ(force.size(), expected_force.size()); for (int jj = 0; jj < force.size(); ++jj) { EXPECT_LT(fabs(force[jj] - expected_force[jj]), 1e-5); @@ -154,7 +164,7 @@ TEST_F(TestProdForceA, gpu_cuda) { #if TENSORFLOW_USE_ROCM TEST_F(TestProdForceA, gpu_rocm) { - std::vector force(nall * 3, 0.0); + std::vector force(nframes * nall * 3, 0.0); int n_a_sel = nnei; int* nlist_dev = NULL; @@ -166,7 +176,7 @@ TEST_F(TestProdForceA, gpu_rocm) { deepmd::malloc_device_memory_sync(env_deriv_dev, env_deriv); deepmd::prod_force_a_gpu_rocm(force_dev, net_deriv_dev, env_deriv_dev, - nlist_dev, nloc, nall, nnei); + nlist_dev, nloc, nall, nnei, nframes); deepmd::memcpy_device_to_host(force_dev, force); deepmd::delete_device_memory(nlist_dev); @@ -174,7 +184,7 @@ TEST_F(TestProdForceA, gpu_rocm) { deepmd::delete_device_memory(net_deriv_dev); deepmd::delete_device_memory(env_deriv_dev); - EXPECT_EQ(force.size(), nall * 3); + EXPECT_EQ(force.size(), nframes * nall * 3); EXPECT_EQ(force.size(), expected_force.size()); for (int jj = 0; jj < force.size(); ++jj) { EXPECT_LT(fabs(force[jj] - expected_force[jj]), 1e-5); diff --git a/source/lib/tests/test_prod_force_r.cc b/source/lib/tests/test_prod_force_r.cc index a1b5392355..c2a2359e0b 100644 --- a/source/lib/tests/test_prod_force_r.cc +++ b/source/lib/tests/test_prod_force_r.cc @@ -8,6 +8,11 @@ #include "neighbor_list.h" #include "prod_force.h" +template +inline void double_vec(std::vector& v) { + v.insert(std::end(v), std::begin(v), std::end(v)); +} + class TestProdForceR : public ::testing::Test { protected: std::vector posi = {12.83, 2.56, 2.18, 12.09, 2.87, 2.74, @@ -16,6 +21,7 @@ class TestProdForceR : public ::testing::Test { std::vector atype = {0, 1, 1, 0, 1, 1}; std::vector posi_cpy; std::vector atype_cpy; + int nframes = 2; int ntypes = 2; int nloc, nall, nnei, ndescrpt; double rc = 6; @@ -99,16 +105,20 @@ class TestProdForceR : public ::testing::Test { for (int ii = 0; ii < nloc * ndescrpt; ++ii) { net_deriv[ii] = 10 - ii * 0.01; } + double_vec(nlist); + double_vec(net_deriv); + double_vec(env_deriv); + double_vec(expected_force); } void TearDown() override {} }; TEST_F(TestProdForceR, cpu) { - std::vector force(nall * 3); + std::vector force(nframes * nall * 3); int n_a_sel = nnei; deepmd::prod_force_r_cpu(&force[0], &net_deriv[0], &env_deriv[0], - &nlist[0], nloc, nall, nnei); - EXPECT_EQ(force.size(), nall * 3); + &nlist[0], nloc, nall, nnei, nframes); + EXPECT_EQ(force.size(), nframes * nall * 3); EXPECT_EQ(force.size(), expected_force.size()); for (int jj = 0; jj < force.size(); ++jj) { EXPECT_LT(fabs(force[jj] - expected_force[jj]), 1e-5); @@ -121,7 +131,7 @@ TEST_F(TestProdForceR, cpu) { #if GOOGLE_CUDA TEST_F(TestProdForceR, gpu_cuda) { - std::vector force(nall * 3, 0.0); + std::vector force(nframes * nall * 3, 0.0); int n_a_sel = nnei; int* nlist_dev = NULL; @@ -133,7 +143,7 @@ TEST_F(TestProdForceR, gpu_cuda) { deepmd::malloc_device_memory_sync(env_deriv_dev, env_deriv); deepmd::prod_force_r_gpu_cuda(force_dev, net_deriv_dev, env_deriv_dev, - nlist_dev, nloc, nall, nnei); + nlist_dev, nloc, nall, nnei, nframes); deepmd::memcpy_device_to_host(force_dev, force); deepmd::delete_device_memory(nlist_dev); @@ -141,7 +151,7 @@ TEST_F(TestProdForceR, gpu_cuda) { deepmd::delete_device_memory(net_deriv_dev); deepmd::delete_device_memory(env_deriv_dev); - EXPECT_EQ(force.size(), nall * 3); + EXPECT_EQ(force.size(), nframes * nall * 3); EXPECT_EQ(force.size(), expected_force.size()); for (int jj = 0; jj < force.size(); ++jj) { EXPECT_LT(fabs(force[jj] - expected_force[jj]), 1e-5); @@ -151,7 +161,7 @@ TEST_F(TestProdForceR, gpu_cuda) { #if TENSORFLOW_USE_ROCM TEST_F(TestProdForceR, gpu_rocm) { - std::vector force(nall * 3, 0.0); + std::vector force(nframes * nall * 3, 0.0); int n_a_sel = nnei; int* nlist_dev = NULL; @@ -163,7 +173,7 @@ TEST_F(TestProdForceR, gpu_rocm) { deepmd::malloc_device_memory_sync(env_deriv_dev, env_deriv); deepmd::prod_force_r_gpu_rocm(force_dev, net_deriv_dev, env_deriv_dev, - nlist_dev, nloc, nall, nnei); + nlist_dev, nloc, nall, nnei, nframes); deepmd::memcpy_device_to_host(force_dev, force); deepmd::delete_device_memory(nlist_dev); @@ -171,7 +181,7 @@ TEST_F(TestProdForceR, gpu_rocm) { deepmd::delete_device_memory(net_deriv_dev); deepmd::delete_device_memory(env_deriv_dev); - EXPECT_EQ(force.size(), nall * 3); + EXPECT_EQ(force.size(), nframes * nall * 3); EXPECT_EQ(force.size(), expected_force.size()); for (int jj = 0; jj < force.size(); ++jj) { EXPECT_LT(fabs(force[jj] - expected_force[jj]), 1e-5); diff --git a/source/op/prod_force_multi_device.cc b/source/op/prod_force_multi_device.cc index 73914292db..8ba62882cf 100644 --- a/source/op/prod_force_multi_device.cc +++ b/source/op/prod_force_multi_device.cc @@ -136,25 +136,20 @@ class ProdForceSeAOp : public OpKernel { nloc_loc = end_index - start_index; } - for (int_64 kk = 0; kk < nframes; ++kk) { - FPTYPE* force = p_force + kk * nall * 3; - const FPTYPE* net_deriv = p_net_deriv + kk * nloc * ndescrpt; - const FPTYPE* in_deriv = p_in_deriv + kk * nloc * ndescrpt * 3; - const int* nlist = p_nlist + kk * nloc * nnei; - if (device == "GPU") { + if (device == "GPU") { #if GOOGLE_CUDA - deepmd::prod_force_a_gpu_cuda(force, net_deriv, in_deriv, nlist, nloc, - nall, nnei); + deepmd::prod_force_a_gpu_cuda(p_force, p_net_deriv, p_in_deriv, p_nlist, + nloc, nall, nnei, nframes); #endif // GOOGLE_CUDA #if TENSORFLOW_USE_ROCM - deepmd::prod_force_a_gpu_rocm(force, net_deriv, in_deriv, nlist, nloc, - nall, nnei); + deepmd::prod_force_a_gpu_rocm(p_force, p_net_deriv, p_in_deriv, p_nlist, + nloc, nall, nnei, nframes); #endif // TENSORFLOW_USE_ROCM - } else if (device == "CPU") { - deepmd::prod_force_a_cpu(force, net_deriv, in_deriv, nlist, nloc_loc, - nall, nnei, start_index = start_index); - } + } else if (device == "CPU") { + deepmd::prod_force_a_cpu(p_force, p_net_deriv, p_in_deriv, p_nlist, nloc, + nall, nnei, nframes, nloc_loc, + start_index = start_index); } } @@ -227,25 +222,19 @@ class ProdForceSeROp : public OpKernel { const FPTYPE* p_in_deriv = in_deriv_tensor.flat().data(); const int* p_nlist = nlist_tensor.flat().data(); - for (int_64 kk = 0; kk < nframes; ++kk) { - FPTYPE* force = p_force + kk * nall * 3; - const FPTYPE* net_deriv = p_net_deriv + kk * nloc * ndescrpt; - const FPTYPE* in_deriv = p_in_deriv + kk * nloc * ndescrpt * 3; - const int* nlist = p_nlist + kk * nloc * nnei; - if (device == "GPU") { + if (device == "GPU") { #if GOOGLE_CUDA - deepmd::prod_force_r_gpu_cuda(force, net_deriv, in_deriv, nlist, nloc, - nall, nnei); + deepmd::prod_force_r_gpu_cuda(p_force, p_net_deriv, p_in_deriv, p_nlist, + nloc, nall, nnei, nframes); #endif // GOOGLE_CUDA #if TENSORFLOW_USE_ROCM - deepmd::prod_force_r_gpu_rocm(force, net_deriv, in_deriv, nlist, nloc, - nall, nnei); + deepmd::prod_force_r_gpu_rocm(p_force, p_net_deriv, p_in_deriv, p_nlist, + nloc, nall, nnei, nframes); #endif // TENSORFLOW_USE_ROCM - } else if (device == "CPU") { - deepmd::prod_force_r_cpu(force, net_deriv, in_deriv, nlist, nloc, nall, - nnei); - } + } else if (device == "CPU") { + deepmd::prod_force_r_cpu(p_force, p_net_deriv, p_in_deriv, p_nlist, nloc, + nall, nnei, nframes); } }