prod_force_grad: support multiple frames in parallel (#2601)
Similar to #2600.
njzjz authored Jun 12, 2023
1 parent 4b822b8 commit 046a5a4
Showing 7 changed files with 160 additions and 113 deletions.
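The change threads an nframes argument through prod_force_grad so that one call (and one kernel launch or OpenMP loop) covers a whole batch of frames: grad_net is treated as nframes x nloc x ndescrpt, grad as nframes x nloc x 3, and nlist/env_deriv are indexed the same way, with the combined loop index ii running over nframes * nloc and the frame index recovered as ii / nloc. A minimal sketch of that frame-major indexing convention (the helper names below are illustrative, not part of the patch):

// Frame-major layout assumed by the patched kernels (illustrative helpers).
#include <cstdio>

int frame_of(int ii, int nloc) { return ii / nloc; }  // "kk" in the patch
int grad_offset(int kk, int j_idx, int nloc) {        // grad is nframes x nloc x 3
  return kk * nloc * 3 + j_idx * 3;
}

int main() {
  const int nloc = 4;
  const int ii = 5;  // combined index: frame 1, local atom 1
  std::printf("frame %d, grad offset %d\n", frame_of(ii, nloc),
              grad_offset(frame_of(ii, nloc), ii % nloc, nloc));
  return 0;
}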
18 changes: 12 additions & 6 deletions source/lib/include/prod_force_grad.h
@@ -8,15 +8,17 @@ void prod_force_grad_a_cpu(FPTYPE* grad_net,
const FPTYPE* env_deriv,
const int* nlist,
const int nloc,
const int nnei);
const int nnei,
const int nframes);

template <typename FPTYPE>
void prod_force_grad_r_cpu(FPTYPE* grad_net,
const FPTYPE* grad,
const FPTYPE* env_deriv,
const int* nlist,
const int nloc,
const int nnei);
const int nnei,
const int nframes);

#if GOOGLE_CUDA
template <typename FPTYPE>
@@ -25,15 +27,17 @@ void prod_force_grad_a_gpu_cuda(FPTYPE* grad_net,
const FPTYPE* env_deriv,
const int* nlist,
const int nloc,
const int nnei);
const int nnei,
const int nframes);

template <typename FPTYPE>
void prod_force_grad_r_gpu_cuda(FPTYPE* grad_net,
const FPTYPE* grad,
const FPTYPE* env_deriv,
const int* nlist,
const int nloc,
const int nnei);
const int nnei,
const int nframes);
#endif // GOOGLE_CUDA

#if TENSORFLOW_USE_ROCM
@@ -43,14 +47,16 @@ void prod_force_grad_a_gpu_rocm(FPTYPE* grad_net,
const FPTYPE* env_deriv,
const int* nlist,
const int nloc,
const int nnei);
const int nnei,
const int nframes);

template <typename FPTYPE>
void prod_force_grad_r_gpu_rocm(FPTYPE* grad_net,
const FPTYPE* grad,
const FPTYPE* env_deriv,
const int* nlist,
const int nloc,
const int nnei);
const int nnei,
const int nframes);
#endif // TENSORFLOW_USE_ROCM
} // namespace deepmd
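For callers, the header change means buffers are sized per frame and nframes is passed explicitly. A minimal sketch of invoking the updated CPU routine under that layout (the wrapper and placeholder sizes are assumptions for illustration, not part of the commit):

// Hypothetical call site for the updated signature.
#include <cstddef>
#include <vector>
#include "prod_force_grad.h"  // source/lib/include/prod_force_grad.h

void run_force_grad_a(int nframes, int nloc, int nnei,
                      const std::vector<double>& grad,       // nframes * nloc * 3
                      const std::vector<double>& env_deriv,  // nframes * nloc * nnei * 4 * 3
                      const std::vector<int>& nlist) {       // nframes * nloc * nnei
  // grad_net: nframes * nloc * ndescrpt, where ndescrpt = nnei * 4
  std::vector<double> grad_net(static_cast<std::size_t>(nframes) * nloc * nnei * 4);
  deepmd::prod_force_grad_a_cpu(grad_net.data(), grad.data(), env_deriv.data(),
                                nlist.data(), nloc, nnei, nframes);
}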
57 changes: 35 additions & 22 deletions source/lib/src/cuda/prod_force_grad.cu
@@ -31,21 +31,24 @@ __global__ void force_grad_wrt_neighbors_a(FPTYPE* grad_net,
const FPTYPE* env_deriv,
const int* nlist,
const int nloc,
const int nnei) {
const int nnei,
const int nframes) {
// idy -> nnei
const int_64 idx = blockIdx.x * blockDim.x + threadIdx.x;
const unsigned int idy = blockIdx.y;
const unsigned int idw = threadIdx.y;
if (idx >= nloc) {
if (idx >= nframes * nloc) {
return;
}
int j_idx = nlist[idx * nnei + idy];
if (j_idx < 0) {
return;
}
if (j_idx >= nloc) j_idx = j_idx % nloc;
grad_net[idx * nnei * 4 + idy * 4 + idw] += dev_dot(
grad + j_idx * 3, env_deriv + idx * nnei * 4 * 3 + idy * 4 * 3 + idw * 3);
const int kk = idx / nloc; // frame index
grad_net[idx * nnei * 4 + idy * 4 + idw] +=
dev_dot(grad + kk * nloc * 3 + j_idx * 3,
env_deriv + idx * nnei * 4 * 3 + idy * 4 * 3 + idw * 3);
}

template <typename FPTYPE>
@@ -54,20 +57,22 @@ __global__ void force_grad_wrt_neighbors_r(FPTYPE* grad_net,
const FPTYPE* env_deriv,
const int* nlist,
const int nloc,
const int nnei) {
const int nnei,
const int nframes) {
// idy -> nnei
const int_64 idx = blockIdx.x * blockDim.x + threadIdx.x;
const unsigned int idy = blockIdx.y;
if (idx >= nloc) {
if (idx >= nframes * nloc) {
return;
}
int j_idx = nlist[idx * nnei + idy];
if (j_idx < 0) {
return;
}
if (j_idx >= nloc) j_idx = j_idx % nloc;
grad_net[idx * nnei + idy] +=
dev_dot(grad + j_idx * 3, env_deriv + idx * nnei * 3 + idy * 3);
const int kk = idx / nloc; // frame index
grad_net[idx * nnei + idy] += dev_dot(grad + kk * nloc * 3 + j_idx * 3,
env_deriv + idx * nnei * 3 + idy * 3);
}

namespace deepmd {
@@ -77,23 +82,25 @@ void prod_force_grad_a_gpu_cuda(FPTYPE* grad_net,
const FPTYPE* env_deriv,
const int* nlist,
const int nloc,
const int nnei) {
const int nnei,
const int nframes) {
const int ndescrpt = nnei * 4;
DPErrcheck(cudaMemset(grad_net, 0, sizeof(FPTYPE) * nloc * ndescrpt));
DPErrcheck(
cudaMemset(grad_net, 0, sizeof(FPTYPE) * nframes * nloc * ndescrpt));
const int nblock = (ndescrpt + TPB - 1) / TPB;
dim3 block_grid(nloc, nblock);
dim3 block_grid(nframes * nloc, nblock);
dim3 thread_grid(TPB, 1);
force_grad_wrt_center_atom<<<block_grid, thread_grid>>>(grad_net, grad,
env_deriv, ndescrpt);
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());

const int LEN = 128;
const int nblock_ = (nloc + LEN - 1) / LEN;
const int nblock_ = (nframes * nloc + LEN - 1) / LEN;
dim3 block_grid_(nblock_, nnei);
dim3 thread_grid_(LEN, 4);
force_grad_wrt_neighbors_a<<<block_grid_, thread_grid_>>>(
grad_net, grad, env_deriv, nlist, nloc, nnei);
grad_net, grad, env_deriv, nlist, nloc, nnei, nframes);
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
}
@@ -104,23 +111,25 @@ void prod_force_grad_r_gpu_cuda(FPTYPE* grad_net,
const FPTYPE* env_deriv,
const int* nlist,
const int nloc,
const int nnei) {
const int nnei,
const int nframes) {
const int ndescrpt = nnei * 1;
DPErrcheck(cudaMemset(grad_net, 0, sizeof(FPTYPE) * nloc * ndescrpt));
DPErrcheck(
cudaMemset(grad_net, 0, sizeof(FPTYPE) * nframes * nloc * ndescrpt));
const int nblock = (ndescrpt + TPB - 1) / TPB;
dim3 block_grid(nloc, nblock);
dim3 block_grid(nframes * nloc, nblock);
dim3 thread_grid(TPB, 1);
force_grad_wrt_center_atom<<<block_grid, thread_grid>>>(grad_net, grad,
env_deriv, ndescrpt);
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());

const int LEN = 128;
const int nblock_ = (nloc + LEN - 1) / LEN;
const int nblock_ = (nframes * nloc + LEN - 1) / LEN;
dim3 block_grid_(nblock_, nnei);
dim3 thread_grid_(LEN, 1);
force_grad_wrt_neighbors_r<<<block_grid_, thread_grid_>>>(
grad_net, grad, env_deriv, nlist, nloc, nnei);
grad_net, grad, env_deriv, nlist, nloc, nnei, nframes);
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
}
@@ -130,23 +139,27 @@ template void prod_force_grad_a_gpu_cuda<float>(float* grad_net,
const float* env_deriv,
const int* nlist,
const int nloc,
const int nnei);
const int nnei,
const int nframes);
template void prod_force_grad_a_gpu_cuda<double>(double* grad_net,
const double* grad,
const double* env_deriv,
const int* nlist,
const int nloc,
const int nnei);
const int nnei,
const int nframes);
template void prod_force_grad_r_gpu_cuda<float>(float* grad_net,
const float* grad,
const float* env_deriv,
const int* nlist,
const int nloc,
const int nnei);
const int nnei,
const int nframes);
template void prod_force_grad_r_gpu_cuda<double>(double* grad_net,
const double* grad,
const double* env_deriv,
const int* nlist,
const int nloc,
const int nnei);
const int nnei,
const int nframes);
} // namespace deepmd
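The GPU path follows the same pattern: grad_net is zeroed over nframes * nloc * ndescrpt elements, the center-atom kernel's grid gains a factor of nframes in its leading dimension, and the neighbor kernels bound-check against nframes * nloc before recovering the frame index for the grad read. A small host-side sketch of the launch-size arithmetic for the angular neighbor kernel (LEN and the grid shapes follow the patch; the concrete nframes/nloc/nnei values are placeholders):

// Grid dimensions for force_grad_wrt_neighbors_a after the patch (sketch).
#include <cstdio>

int main() {
  const int nframes = 2, nloc = 192, nnei = 138, LEN = 128;
  const int nblock = (nframes * nloc + LEN - 1) / LEN;  // x-dimension of the grid
  // Matches dim3 block_grid_(nblock_, nnei) and dim3 thread_grid_(LEN, 4).
  std::printf("block_grid_ = (%d, %d), thread_grid_ = (%d, 4)\n", nblock, nnei, LEN);
  return 0;
}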
32 changes: 20 additions & 12 deletions source/lib/src/prod_force_grad.cc
@@ -24,19 +24,20 @@ void deepmd::prod_force_grad_a_cpu(FPTYPE* grad_net,
const FPTYPE* env_deriv,
const int* nlist,
const int nloc,
const int nnei) {
const int nnei,
const int nframes) {
const int ndescrpt = nnei * 4;

// reset the frame to 0
for (int ii = 0; ii < nloc; ++ii) {
for (int ii = 0; ii < nframes * nloc; ++ii) {
for (int aa = 0; aa < ndescrpt; ++aa) {
grad_net[ii * ndescrpt + aa] = (FPTYPE)0.;
}
}

// compute grad of one frame
#pragma omp parallel for
for (int ii = 0; ii < nloc; ++ii) {
for (int ii = 0; ii < nframes * nloc; ++ii) {
int i_idx = ii;

// deriv wrt center atom
@@ -55,10 +56,11 @@ void deepmd::prod_force_grad_a_cpu(FPTYPE* grad_net,
if (j_idx < 0) continue;
int aa_start, aa_end;
make_index_range(aa_start, aa_end, jj, nnei);
const int kk = i_idx / nloc; // frame index
for (int aa = aa_start; aa < aa_end; ++aa) {
for (int dd = 0; dd < 3; ++dd) {
grad_net[i_idx * ndescrpt + aa] +=
grad[j_idx * 3 + dd] *
grad[kk * nloc * 3 + j_idx * 3 + dd] *
env_deriv[i_idx * ndescrpt * 3 + aa * 3 + dd];
}
}
@@ -71,22 +73,25 @@ template void deepmd::prod_force_grad_a_cpu<double>(double* grad_net,
const double* env_deriv,
const int* nlist,
const int nloc,
const int nnei);
const int nnei,
const int nframes);

template void deepmd::prod_force_grad_a_cpu<float>(float* grad_net,
const float* grad,
const float* env_deriv,
const int* nlist,
const int nloc,
const int nnei);
const int nnei,
const int nframes);

template <typename FPTYPE>
void deepmd::prod_force_grad_r_cpu(FPTYPE* grad_net,
const FPTYPE* grad,
const FPTYPE* env_deriv,
const int* nlist,
const int nloc,
const int nnei)
const int nnei,
const int nframes)
//
// grad_net: nloc x ndescrpt
// grad: nloc x 3
@@ -97,15 +102,15 @@ void deepmd::prod_force_grad_r_cpu(FPTYPE* grad_net,
const int ndescrpt = nnei * 1;

// reset the frame to 0
for (int ii = 0; ii < nloc; ++ii) {
for (int ii = 0; ii < nframes * nloc; ++ii) {
for (int aa = 0; aa < ndescrpt; ++aa) {
grad_net[ii * ndescrpt + aa] = (FPTYPE)0.;
}
}

// compute grad of one frame
#pragma omp parallel for
for (int ii = 0; ii < nloc; ++ii) {
for (int ii = 0; ii < nframes * nloc; ++ii) {
int i_idx = ii;

// deriv wrt center atom
@@ -122,9 +127,10 @@ void deepmd::prod_force_grad_r_cpu(FPTYPE* grad_net,
int j_idx = nlist[i_idx * nnei + jj];
if (j_idx >= nloc) j_idx = j_idx % nloc;
if (j_idx < 0) continue;
int kk = i_idx / nloc; // frame index
for (int dd = 0; dd < 3; ++dd) {
grad_net[i_idx * ndescrpt + jj] +=
grad[j_idx * 3 + dd] *
grad[kk * nloc * 3 + j_idx * 3 + dd] *
env_deriv[i_idx * ndescrpt * 3 + jj * 3 + dd];
}
}
@@ -136,11 +142,13 @@ template void deepmd::prod_force_grad_r_cpu<double>(double* grad_net,
const double* env_deriv,
const int* nlist,
const int nloc,
const int nnei);
const int nnei,
const int nframes);

template void deepmd::prod_force_grad_r_cpu<float>(float* grad_net,
const float* grad,
const float* env_deriv,
const int* nlist,
const int nloc,
const int nnei);
const int nnei,
const int nframes);
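Conceptually, the multi-frame CPU routines behave as if the single-frame routine were called once per frame on the corresponding slices of the frame-major buffers; batching them lets the single #pragma omp parallel for span all nframes * nloc atoms at once. A sketch of that equivalence under the layout above (the wrapper below is illustrative, not part of the patch):

// Per-frame decomposition equivalent to one multi-frame call (sketch).
#include <cstddef>
#include "prod_force_grad.h"  // source/lib/include/prod_force_grad.h

void per_frame_equivalent(double* grad_net, const double* grad,
                          const double* env_deriv, const int* nlist,
                          int nloc, int nnei, int nframes) {
  const std::size_t ndescrpt = static_cast<std::size_t>(nnei) * 4;
  for (int kk = 0; kk < nframes; ++kk) {
    deepmd::prod_force_grad_a_cpu(grad_net + kk * nloc * ndescrpt,
                                  grad + kk * nloc * 3,
                                  env_deriv + kk * nloc * ndescrpt * 3,
                                  nlist + kk * nloc * nnei,
                                  nloc, nnei, /*nframes=*/1);
  }
}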