diff --git a/source/lib/include/prod_force.h b/source/lib/include/prod_force.h
index 3c9bd89549..661e998d12 100644
--- a/source/lib/include/prod_force.h
+++ b/source/lib/include/prod_force.h
@@ -2,6 +2,19 @@
 
 namespace deepmd {
 
+/**
+ * @brief Produce force from net_deriv and in_deriv.
+ *
+ * @tparam FPTYPE float or double
+ * @param[out] force Atomic forces.
+ * @param[in] net_deriv Net derivative.
+ * @param[in] in_deriv Environmental derivative.
+ * @param[in] nlist Neighbor list.
+ * @param[in] nloc The number of local atoms.
+ * @param[in] nall The number of all atoms, including ghost atoms.
+ * @param[in] nnei The number of neighbors.
+ * @param[in] nframes The number of frames.
+ */
 template <typename FPTYPE>
 void prod_force_a_cpu(FPTYPE* force,
                       const FPTYPE* net_deriv,
@@ -10,7 +23,38 @@ void prod_force_a_cpu(FPTYPE* force,
                       const int nloc,
                       const int nall,
                       const int nnei,
-                      const int start_index = 0);
+                      const int nframes);
+
+/**
+ * @brief Produce force from net_deriv and in_deriv.
+ * @details This function is used for multi-threading. Only part of atoms
+ *         are computed in this thread. They will be comptued in parallel.
+ *
+ * @tparam FPTYPE float or double
+ * @param[out] force Atomic forces.
+ * @param[in] net_deriv Net derivative.
+ * @param[in] in_deriv Environmental derivative.
+ * @param[in] nlist Neighbor list.
+ * @param[in] nloc The number of local atoms.
+ * @param[in] nall The number of all atoms, including ghost atoms.
+ * @param[in] nnei The number of neighbors.
+ * @param[in] nframes The number of frames.
+ * @param[in] thread_nloc The number of local atoms to be computed in this
+ * thread.
+ * @param[in] thread_start_index The start index of local atoms to be computed
+ * in this thread. The index should be in [0, nloc).
+ */
+template <typename FPTYPE>
+void prod_force_a_cpu(FPTYPE* force,
+                      const FPTYPE* net_deriv,
+                      const FPTYPE* in_deriv,
+                      const int* nlist,
+                      const int nloc,
+                      const int nall,
+                      const int nnei,
+                      const int nframes,
+                      const int thread_nloc,
+                      const int thread_start_index);
 
 template <typename FPTYPE>
 void prod_force_r_cpu(FPTYPE* force,
@@ -19,7 +63,8 @@ void prod_force_r_cpu(FPTYPE* force,
                       const int* nlist,
                       const int nloc,
                       const int nall,
-                      const int nnei);
+                      const int nnei,
+                      const int nframes);
 
 #if GOOGLE_CUDA
 template <typename FPTYPE>
@@ -29,7 +74,8 @@ void prod_force_a_gpu_cuda(FPTYPE* force,
                            const int* nlist,
                            const int nloc,
                            const int nall,
-                           const int nnei);
+                           const int nnei,
+                           const int nframes);
 
 template <typename FPTYPE>
 void prod_force_r_gpu_cuda(FPTYPE* force,
@@ -38,7 +84,8 @@ void prod_force_r_gpu_cuda(FPTYPE* force,
                            const int* nlist,
                            const int nloc,
                            const int nall,
-                           const int nnei);
+                           const int nnei,
+                           const int nframes);
 #endif  // GOOGLE_CUDA
 
 #if TENSORFLOW_USE_ROCM
@@ -49,7 +96,8 @@ void prod_force_a_gpu_rocm(FPTYPE* force,
                            const int* nlist,
                            const int nloc,
                            const int nall,
-                           const int nnei);
+                           const int nnei,
+                           const int nframes);
 
 template <typename FPTYPE>
 void prod_force_r_gpu_rocm(FPTYPE* force,
@@ -58,7 +106,8 @@ void prod_force_r_gpu_rocm(FPTYPE* force,
                            const int* nlist,
                            const int nloc,
                            const int nall,
-                           const int nnei);
+                           const int nnei,
+                           const int nframes);
 #endif  // TENSORFLOW_USE_ROCM
 
 }  // namespace deepmd
diff --git a/source/lib/src/cuda/prod_force.cu b/source/lib/src/cuda/prod_force.cu
index db3683eab7..80f4eaed50 100644
--- a/source/lib/src/cuda/prod_force.cu
+++ b/source/lib/src/cuda/prod_force.cu
@@ -5,7 +5,9 @@ template <typename FPTYPE, int THREADS_PER_BLOCK>
 __global__ void force_deriv_wrt_center_atom(FPTYPE* force,
                                             const FPTYPE* net_deriv,
                                             const FPTYPE* in_deriv,
-                                            const int ndescrpt) {
+                                            const int ndescrpt,
+                                            const int nloc,
+                                            const int nall) {
   __shared__ FPTYPE data[THREADS_PER_BLOCK * 3];
   int_64 bid = blockIdx.x;
   unsigned int tid = threadIdx.x;
@@ -31,10 +33,13 @@ __global__ void force_deriv_wrt_center_atom(FPTYPE* force,
     __syncthreads();
   }
   // write result for this block to global memory
+  const int_64 kk = bid / nloc;  // frame index
+  const int_64 ll = bid % nloc;  // atom index
+  const int_64 i_idx_nall = kk * nall + ll;
   if (tid == 0) {
-    force[bid * 3 + 0] -= data[THREADS_PER_BLOCK * 0];
-    force[bid * 3 + 1] -= data[THREADS_PER_BLOCK * 1];
-    force[bid * 3 + 2] -= data[THREADS_PER_BLOCK * 2];
+    force[i_idx_nall * 3 + 0] -= data[THREADS_PER_BLOCK * 0];
+    force[i_idx_nall * 3 + 1] -= data[THREADS_PER_BLOCK * 1];
+    force[i_idx_nall * 3 + 2] -= data[THREADS_PER_BLOCK * 2];
   }
 }
 
@@ -44,6 +49,7 @@ __global__ void force_deriv_wrt_neighbors_a(FPTYPE* force,
                                             const FPTYPE* in_deriv,
                                             const int* nlist,
                                             const int nloc,
+                                            const int nall,
                                             const int nnei) {
   // idy -> nnei
   const int_64 idx = blockIdx.x;
@@ -63,7 +69,8 @@ __global__ void force_deriv_wrt_neighbors_a(FPTYPE* force,
     force_tmp += net_deriv[idx * ndescrpt + idy * 4 + idw] *
                  in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz];
   }
-  atomicAdd(force + j_idx * 3 + idz, force_tmp);
+  const int_64 kk = idx / nloc;  // frame index
+  atomicAdd(force + kk * nall * 3 + j_idx * 3 + idz, force_tmp);
 }
 
 template <typename FPTYPE>
@@ -72,6 +79,7 @@ __global__ void force_deriv_wrt_neighbors_r(FPTYPE* force,
                                             const FPTYPE* in_deriv,
                                             const int* nlist,
                                             const int nloc,
+                                            const int nall,
                                             const int nnei) {
   // idy -> nnei
   const int_64 idx = blockIdx.x;
@@ -86,7 +94,8 @@ __global__ void force_deriv_wrt_neighbors_r(FPTYPE* force,
   if (j_idx < 0) {
     return;
   }
-  atomicAdd(force + j_idx * 3 + idz,
+  const int_64 kk = idx / nloc;  // frame index
+  atomicAdd(force + kk * nall * 3 + j_idx * 3 + idz,
             net_deriv[idx * ndescrpt + idy] *
                 in_deriv[idx * ndescrpt * 3 + idy * 3 + idz]);
 }
@@ -99,21 +108,22 @@ void prod_force_a_gpu_cuda(FPTYPE* force,
                            const int* nlist,
                            const int nloc,
                            const int nall,
-                           const int nnei) {
+                           const int nnei,
+                           const int nframes) {
   const int ndescrpt = nnei * 4;
-  DPErrcheck(cudaMemset(force, 0, sizeof(FPTYPE) * nall * 3));
+  DPErrcheck(cudaMemset(force, 0, sizeof(FPTYPE) * nframes * nall * 3));
 
-  force_deriv_wrt_center_atom<FPTYPE, TPB>
-      <<<nloc, TPB>>>(force, net_deriv, in_deriv, ndescrpt);
+  force_deriv_wrt_center_atom<FPTYPE, TPB><<<nframes * nloc, TPB>>>(
+      force, net_deriv, in_deriv, ndescrpt, nloc, nall);
   DPErrcheck(cudaGetLastError());
   DPErrcheck(cudaDeviceSynchronize());
 
   const int LEN = 64;
   const int nblock = (nnei + LEN - 1) / LEN;
-  dim3 block_grid(nloc, nblock);
+  dim3 block_grid(nframes * nloc, nblock);
   dim3 thread_grid(LEN, 3);
   force_deriv_wrt_neighbors_a<<<block_grid, thread_grid>>>(
-      force, net_deriv, in_deriv, nlist, nloc, nnei);
+      force, net_deriv, in_deriv, nlist, nloc, nall, nnei);
   DPErrcheck(cudaGetLastError());
   DPErrcheck(cudaDeviceSynchronize());
 }
@@ -125,21 +135,22 @@ void prod_force_r_gpu_cuda(FPTYPE* force,
                            const int* nlist,
                            const int nloc,
                            const int nall,
-                           const int nnei) {
+                           const int nnei,
+                           const int nframes) {
   const int ndescrpt = nnei * 1;
-  DPErrcheck(cudaMemset(force, 0, sizeof(FPTYPE) * nall * 3));
+  DPErrcheck(cudaMemset(force, 0, sizeof(FPTYPE) * nframes * nall * 3));
 
-  force_deriv_wrt_center_atom<FPTYPE, TPB>
-      <<<nloc, TPB>>>(force, net_deriv, in_deriv, ndescrpt);
+  force_deriv_wrt_center_atom<FPTYPE, TPB><<<nframes * nloc, TPB>>>(
+      force, net_deriv, in_deriv, ndescrpt, nloc, nall);
   DPErrcheck(cudaGetLastError());
   DPErrcheck(cudaDeviceSynchronize());
 
   const int LEN = 64;
   const int nblock = (nnei + LEN - 1) / LEN;
-  dim3 block_grid(nloc, nblock);
+  dim3 block_grid(nframes * nloc, nblock);
   dim3 thread_grid(LEN, 3);
   force_deriv_wrt_neighbors_r<<<block_grid, thread_grid>>>(
-      force, net_deriv, in_deriv, nlist, nloc, nnei);
+      force, net_deriv, in_deriv, nlist, nloc, nall, nnei);
   DPErrcheck(cudaGetLastError());
   DPErrcheck(cudaDeviceSynchronize());
 }
@@ -150,26 +161,30 @@ template void prod_force_a_gpu_cuda<float>(float* force,
                                            const int* nlist,
                                            const int nloc,
                                            const int nall,
-                                           const int nnei);
+                                           const int nnei,
+                                           const int nframes);
 template void prod_force_a_gpu_cuda<double>(double* force,
                                             const double* net_deriv,
                                             const double* in_deriv,
                                             const int* nlist,
                                             const int nloc,
                                             const int nall,
-                                            const int nnei);
+                                            const int nnei,
+                                            const int nframes);
 template void prod_force_r_gpu_cuda<float>(float* force,
                                            const float* net_deriv,
                                            const float* in_deriv,
                                            const int* nlist,
                                            const int nloc,
                                            const int nall,
-                                           const int nnei);
+                                           const int nnei,
+                                           const int nframes);
 template void prod_force_r_gpu_cuda<double>(double* force,
                                             const double* net_deriv,
                                             const double* in_deriv,
                                             const int* nlist,
                                             const int nloc,
                                             const int nall,
-                                            const int nnei);
+                                            const int nnei,
+                                            const int nframes);
 }  // namespace deepmd
diff --git a/source/lib/src/prod_force.cc b/source/lib/src/prod_force.cc
index b258ff1d3f..8cd67d8c7a 100644
--- a/source/lib/src/prod_force.cc
+++ b/source/lib/src/prod_force.cc
@@ -27,20 +27,26 @@ void deepmd::prod_force_a_cpu(FPTYPE* force,
                               const int nloc,
                               const int nall,
                               const int nnei,
-                              const int start_index) {
+                              const int nframes,
+                              const int thread_nloc,
+                              const int thread_start_index) {
   const int ndescrpt = 4 * nnei;
 
-  memset(force, 0, sizeof(FPTYPE) * nall * 3);
+  memset(force, 0, sizeof(FPTYPE) * nframes * nall * 3);
   // compute force of a frame
-  for (int i_idx = start_index; i_idx < start_index + nloc; ++i_idx) {
+  for (int i_idx = nframes * thread_start_index;
+       i_idx < nframes * (thread_start_index + thread_nloc); ++i_idx) {
+    int kk = i_idx / nloc;  // frame index
+    int ll = i_idx % nloc;  // atom index
+    int i_idx_nall = kk * nall + ll;
     // deriv wrt center atom
     for (int aa = 0; aa < ndescrpt; ++aa) {
-      force[i_idx * 3 + 0] -= net_deriv[i_idx * ndescrpt + aa] *
-                              env_deriv[i_idx * ndescrpt * 3 + aa * 3 + 0];
-      force[i_idx * 3 + 1] -= net_deriv[i_idx * ndescrpt + aa] *
-                              env_deriv[i_idx * ndescrpt * 3 + aa * 3 + 1];
-      force[i_idx * 3 + 2] -= net_deriv[i_idx * ndescrpt + aa] *
-                              env_deriv[i_idx * ndescrpt * 3 + aa * 3 + 2];
+      force[i_idx_nall * 3 + 0] -= net_deriv[i_idx * ndescrpt + aa] *
+                                   env_deriv[i_idx * ndescrpt * 3 + aa * 3 + 0];
+      force[i_idx_nall * 3 + 1] -= net_deriv[i_idx * ndescrpt + aa] *
+                                   env_deriv[i_idx * ndescrpt * 3 + aa * 3 + 1];
+      force[i_idx_nall * 3 + 2] -= net_deriv[i_idx * ndescrpt + aa] *
+                                   env_deriv[i_idx * ndescrpt * 3 + aa * 3 + 2];
     }
     // deriv wrt neighbors
     for (int jj = 0; jj < nnei; ++jj) {
@@ -49,17 +55,56 @@ void deepmd::prod_force_a_cpu(FPTYPE* force,
       int aa_start, aa_end;
       make_index_range(aa_start, aa_end, jj, nnei);
       for (int aa = aa_start; aa < aa_end; ++aa) {
-        force[j_idx * 3 + 0] += net_deriv[i_idx * ndescrpt + aa] *
-                                env_deriv[i_idx * ndescrpt * 3 + aa * 3 + 0];
-        force[j_idx * 3 + 1] += net_deriv[i_idx * ndescrpt + aa] *
-                                env_deriv[i_idx * ndescrpt * 3 + aa * 3 + 1];
-        force[j_idx * 3 + 2] += net_deriv[i_idx * ndescrpt + aa] *
-                                env_deriv[i_idx * ndescrpt * 3 + aa * 3 + 2];
+        force[kk * nall * 3 + j_idx * 3 + 0] +=
+            net_deriv[i_idx * ndescrpt + aa] *
+            env_deriv[i_idx * ndescrpt * 3 + aa * 3 + 0];
+        force[kk * nall * 3 + j_idx * 3 + 1] +=
+            net_deriv[i_idx * ndescrpt + aa] *
+            env_deriv[i_idx * ndescrpt * 3 + aa * 3 + 1];
+        force[kk * nall * 3 + j_idx * 3 + 2] +=
+            net_deriv[i_idx * ndescrpt + aa] *
+            env_deriv[i_idx * ndescrpt * 3 + aa * 3 + 2];
       }
     }
   }
 }
 
+// overload to provide default values
+template <typename FPTYPE>
+void deepmd::prod_force_a_cpu(FPTYPE* force,
+                              const FPTYPE* net_deriv,
+                              const FPTYPE* env_deriv,
+                              const int* nlist,
+                              const int nloc,
+                              const int nall,
+                              const int nnei,
+                              const int nframes) {
+  deepmd::prod_force_a_cpu(force, net_deriv, env_deriv, nlist, nloc, nall, nnei,
+                           nframes, nloc, 0);
+};
+
+template void deepmd::prod_force_a_cpu<double>(double* force,
+                                               const double* net_deriv,
+                                               const double* env_deriv,
+                                               const int* nlist,
+                                               const int nloc,
+                                               const int nall,
+                                               const int nnei,
+                                               const int nframes,
+                                               const int thread_nloc,
+                                               const int thread_start_index);
+
+template void deepmd::prod_force_a_cpu<float>(float* force,
+                                              const float* net_deriv,
+                                              const float* env_deriv,
+                                              const int* nlist,
+                                              const int nloc,
+                                              const int nall,
+                                              const int nnei,
+                                              const int nframes,
+                                              const int thread_nloc,
+                                              const int thread_start_index);
+
 template void deepmd::prod_force_a_cpu<double>(double* force,
                                                const double* net_deriv,
                                                const double* env_deriv,
@@ -67,7 +112,7 @@ template void deepmd::prod_force_a_cpu<double>(double* force,
                                                const int nloc,
                                                const int nall,
                                                const int nnei,
-                                               const int start_index);
+                                               const int nframes);
 
 template void deepmd::prod_force_a_cpu<float>(float* force,
                                               const float* net_deriv,
@@ -76,7 +121,7 @@ template void deepmd::prod_force_a_cpu<float>(float* force,
                                               const int nloc,
                                               const int nall,
                                               const int nnei,
-                                              const int start_index);
+                                              const int nframes);
 
 template <typename FPTYPE>
 void deepmd::prod_force_r_cpu(FPTYPE* force,
@@ -85,10 +130,11 @@ void deepmd::prod_force_r_cpu(FPTYPE* force,
                               const int* nlist,
                               const int nloc,
                               const int nall,
-                              const int nnei) {
+                              const int nnei,
+                              const int nframes) {
   const int ndescrpt = 1 * nnei;
 
-  for (int ii = 0; ii < nall; ++ii) {
+  for (int ii = 0; ii < nframes * nall; ++ii) {
     int i_idx = ii;
     force[i_idx * 3 + 0] = (FPTYPE)0.;
     force[i_idx * 3 + 1] = (FPTYPE)0.;
@@ -96,28 +142,34 @@ void deepmd::prod_force_r_cpu(FPTYPE* force,
   }
 
   // compute force of a frame
-  for (int ii = 0; ii < nloc; ++ii) {
+  for (int ii = 0; ii < nframes * nloc; ++ii) {
+    int kk = ii / nloc;  // frame index
+    int ll = ii % nloc;  // atom index
+    int i_idx_nall = kk * nall + ll;
     int i_idx = ii;
     // deriv wrt center atom
     for (int aa = 0; aa < ndescrpt; ++aa) {
-      force[i_idx * 3 + 0] -= net_deriv[i_idx * ndescrpt + aa] *
-                              env_deriv[i_idx * ndescrpt * 3 + aa * 3 + 0];
-      force[i_idx * 3 + 1] -= net_deriv[i_idx * ndescrpt + aa] *
-                              env_deriv[i_idx * ndescrpt * 3 + aa * 3 + 1];
-      force[i_idx * 3 + 2] -= net_deriv[i_idx * ndescrpt + aa] *
-                              env_deriv[i_idx * ndescrpt * 3 + aa * 3 + 2];
+      force[i_idx_nall * 3 + 0] -= net_deriv[i_idx * ndescrpt + aa] *
+                                   env_deriv[i_idx * ndescrpt * 3 + aa * 3 + 0];
+      force[i_idx_nall * 3 + 1] -= net_deriv[i_idx * ndescrpt + aa] *
+                                   env_deriv[i_idx * ndescrpt * 3 + aa * 3 + 1];
+      force[i_idx_nall * 3 + 2] -= net_deriv[i_idx * ndescrpt + aa] *
+                                   env_deriv[i_idx * ndescrpt * 3 + aa * 3 + 2];
     }
     // deriv wrt neighbors
     for (int jj = 0; jj < nnei; ++jj) {
       int j_idx = nlist[i_idx * nnei + jj];
       // if (j_idx > nloc) j_idx = j_idx % nloc;
       if (j_idx < 0) continue;
-      force[j_idx * 3 + 0] += net_deriv[i_idx * ndescrpt + jj] *
-                              env_deriv[i_idx * ndescrpt * 3 + jj * 3 + 0];
-      force[j_idx * 3 + 1] += net_deriv[i_idx * ndescrpt + jj] *
-                              env_deriv[i_idx * ndescrpt * 3 + jj * 3 + 1];
-      force[j_idx * 3 + 2] += net_deriv[i_idx * ndescrpt + jj] *
-                              env_deriv[i_idx * ndescrpt * 3 + jj * 3 + 2];
+      force[kk * nall * 3 + j_idx * 3 + 0] +=
+          net_deriv[i_idx * ndescrpt + jj] *
+          env_deriv[i_idx * ndescrpt * 3 + jj * 3 + 0];
+      force[kk * nall * 3 + j_idx * 3 + 1] +=
+          net_deriv[i_idx * ndescrpt + jj] *
+          env_deriv[i_idx * ndescrpt * 3 + jj * 3 + 1];
+      force[kk * nall * 3 + j_idx * 3 + 2] +=
+          net_deriv[i_idx * ndescrpt + jj] *
+          env_deriv[i_idx * ndescrpt * 3 + jj * 3 + 2];
     }
   }
 }
@@ -128,7 +180,8 @@ template void deepmd::prod_force_r_cpu<double>(double* force,
                                                const int* nlist,
                                                const int nloc,
                                                const int nall,
-                                               const int nnei);
+                                               const int nnei,
+                                               const int nframes);
 
 template void deepmd::prod_force_r_cpu<float>(float* force,
                                               const float* net_deriv,
@@ -136,4 +189,5 @@ template void deepmd::prod_force_r_cpu<float>(float* force,
                                               const int* nlist,
                                               const int nloc,
                                               const int nall,
-                                              const int nnei);
+                                              const int nnei,
+                                              const int nframes);
diff --git a/source/lib/src/rocm/prod_force.hip.cu b/source/lib/src/rocm/prod_force.hip.cu
index cf2df5d4a8..bc4fa15078 100644
--- a/source/lib/src/rocm/prod_force.hip.cu
+++ b/source/lib/src/rocm/prod_force.hip.cu
@@ -5,7 +5,9 @@ template <typename FPTYPE, int THREADS_PER_BLOCK>
 __global__ void force_deriv_wrt_center_atom(FPTYPE* force,
                                             const FPTYPE* net_deriv,
                                             const FPTYPE* in_deriv,
-                                            const int ndescrpt) {
+                                            const int ndescrpt,
+                                            const int nloc,
+                                            const int nall) {
   __shared__ FPTYPE data[THREADS_PER_BLOCK * 3];
   int_64 bid = blockIdx.x;
   unsigned int tid = threadIdx.x;
@@ -31,10 +33,13 @@ __global__ void force_deriv_wrt_center_atom(FPTYPE* force,
     __syncthreads();
   }
   // write result for this block to global memory
+  const int_64 kk = bid / nloc;  // frame index
+  const int_64 ll = bid % nloc;  // atom index
+  const int_64 i_idx_nall = kk * nall + ll;
   if (tid == 0) {
-    force[bid * 3 + 0] -= data[THREADS_PER_BLOCK * 0];
-    force[bid * 3 + 1] -= data[THREADS_PER_BLOCK * 1];
-    force[bid * 3 + 2] -= data[THREADS_PER_BLOCK * 2];
+    force[i_idx_nall * 3 + 0] -= data[THREADS_PER_BLOCK * 0];
+    force[i_idx_nall * 3 + 1] -= data[THREADS_PER_BLOCK * 1];
+    force[i_idx_nall * 3 + 2] -= data[THREADS_PER_BLOCK * 2];
   }
 }
 
@@ -44,6 +49,7 @@ __global__ void force_deriv_wrt_neighbors_a(FPTYPE* force,
                                             const FPTYPE* in_deriv,
                                             const int* nlist,
                                             const int nloc,
+                                            const int nall,
                                             const int nnei) {
   // idy -> nnei
   const int_64 idx = blockIdx.x;
@@ -63,7 +69,8 @@ __global__ void force_deriv_wrt_neighbors_a(FPTYPE* force,
     force_tmp += net_deriv[idx * ndescrpt + idy * 4 + idw] *
                  in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz];
   }
-  atomicAdd(force + j_idx * 3 + idz, force_tmp);
+  const int_64 kk = idx / nloc;  // frame index
+  atomicAdd(force + kk * nall * 3 + j_idx * 3 + idz, force_tmp);
 }
 
 template <typename FPTYPE>
@@ -72,6 +79,7 @@ __global__ void force_deriv_wrt_neighbors_r(FPTYPE* force,
                                             const FPTYPE* in_deriv,
                                             const int* nlist,
                                             const int nloc,
+                                            const int nall,
                                             const int nnei) {
   // idy -> nnei
   const int_64 idx = blockIdx.x;
@@ -86,7 +94,8 @@ __global__ void force_deriv_wrt_neighbors_r(FPTYPE* force,
   if (j_idx < 0) {
     return;
   }
-  atomicAdd(force + j_idx * 3 + idz,
+  const int_64 kk = idx / nloc;  // frame index
+  atomicAdd(force + kk * nall * 3 + j_idx * 3 + idz,
             net_deriv[idx * ndescrpt + idy] *
                 in_deriv[idx * ndescrpt * 3 + idy * 3 + idz]);
 }
@@ -99,21 +108,23 @@ void prod_force_a_gpu_rocm(FPTYPE* force,
                            const int* nlist,
                            const int nloc,
                            const int nall,
-                           const int nnei) {
+                           const int nnei,
+                           const int nframes) {
   const int ndescrpt = nnei * 4;
-  DPErrcheck(hipMemset(force, 0, sizeof(FPTYPE) * nall * 3));
+  DPErrcheck(hipMemset(force, 0, sizeof(FPTYPE) * nframes * nall * 3));
 
   hipLaunchKernelGGL(HIP_KERNEL_NAME(force_deriv_wrt_center_atom<FPTYPE, TPB>),
-                     nloc, TPB, 0, 0, force, net_deriv, in_deriv, ndescrpt);
+                     nframes * nloc, TPB, 0, 0, force, net_deriv, in_deriv,
+                     ndescrpt, nloc, nall);
   DPErrcheck(hipGetLastError());
   DPErrcheck(hipDeviceSynchronize());
 
   const int LEN = 64;
   const int nblock = (nnei + LEN - 1) / LEN;
-  dim3 block_grid(nloc, nblock);
+  dim3 block_grid(nframes * nloc, nblock);
   dim3 thread_grid(LEN, 3);
   hipLaunchKernelGGL(force_deriv_wrt_neighbors_a, block_grid, thread_grid, 0, 0,
-                     force, net_deriv, in_deriv, nlist, nloc, nnei);
+                     force, net_deriv, in_deriv, nlist, nloc, nall, nnei);
   DPErrcheck(hipGetLastError());
   DPErrcheck(hipDeviceSynchronize());
 }
@@ -125,21 +136,23 @@ void prod_force_r_gpu_rocm(FPTYPE* force,
                            const int* nlist,
                            const int nloc,
                            const int nall,
-                           const int nnei) {
+                           const int nnei,
+                           const int nframes) {
   const int ndescrpt = nnei * 1;
-  DPErrcheck(hipMemset(force, 0, sizeof(FPTYPE) * nall * 3));
+  DPErrcheck(hipMemset(force, 0, sizeof(FPTYPE) * nframes * nall * 3));
 
   hipLaunchKernelGGL(HIP_KERNEL_NAME(force_deriv_wrt_center_atom<FPTYPE, TPB>),
-                     nloc, TPB, 0, 0, force, net_deriv, in_deriv, ndescrpt);
+                     nframes * nloc, TPB, 0, 0, force, net_deriv, in_deriv,
+                     ndescrpt, nloc, nall);
   DPErrcheck(hipGetLastError());
   DPErrcheck(hipDeviceSynchronize());
 
   const int LEN = 64;
   const int nblock = (nnei + LEN - 1) / LEN;
-  dim3 block_grid(nloc, nblock);
+  dim3 block_grid(nframes * nloc, nblock);
   dim3 thread_grid(LEN, 3);
   hipLaunchKernelGGL(force_deriv_wrt_neighbors_r, block_grid, thread_grid, 0, 0,
-                     force, net_deriv, in_deriv, nlist, nloc, nnei);
+                     force, net_deriv, in_deriv, nlist, nloc, nall, nnei);
   DPErrcheck(hipGetLastError());
   DPErrcheck(hipDeviceSynchronize());
 }
@@ -150,27 +163,31 @@ template void prod_force_a_gpu_rocm<float>(float* force,
                                            const int* nlist,
                                            const int nloc,
                                            const int nall,
-                                           const int nnei);
+                                           const int nnei,
+                                           const int nframes);
 template void prod_force_a_gpu_rocm<double>(double* force,
                                             const double* net_deriv,
                                             const double* in_deriv,
                                             const int* nlist,
                                             const int nloc,
                                             const int nall,
-                                            const int nnei);
+                                            const int nnei,
+                                            const int nframes);
 template void prod_force_r_gpu_rocm<float>(float* force,
                                            const float* net_deriv,
                                            const float* in_deriv,
                                            const int* nlist,
                                            const int nloc,
                                            const int nall,
-                                           const int nnei);
+                                           const int nnei,
+                                           const int nframes);
 template void prod_force_r_gpu_rocm<double>(double* force,
                                             const double* net_deriv,
                                             const double* in_deriv,
                                             const int* nlist,
                                             const int nloc,
                                             const int nall,
-                                            const int nnei);
+                                            const int nnei,
+                                            const int nframes);
 
 }  // namespace deepmd
diff --git a/source/lib/tests/test_prod_force_a.cc b/source/lib/tests/test_prod_force_a.cc
index 03ef65fa3f..7fbebf1b57 100644
--- a/source/lib/tests/test_prod_force_a.cc
+++ b/source/lib/tests/test_prod_force_a.cc
@@ -8,6 +8,11 @@
 #include "neighbor_list.h"
 #include "prod_force.h"
 
+template <typename T>
+inline void double_vec(std::vector<T>& v) {
+  v.insert(std::end(v), std::begin(v), std::end(v));
+}
+
 class TestProdForceA : public ::testing::Test {
  protected:
   std::vector<double> posi = {12.83, 2.56, 2.18, 12.09, 2.87, 2.74,
@@ -16,6 +21,7 @@ class TestProdForceA : public ::testing::Test {
   std::vector<int> atype = {0, 1, 1, 0, 1, 1};
   std::vector<double> posi_cpy;
   std::vector<int> atype_cpy;
+  int nframes = 2;
   int ntypes = 2;
   int nloc, nall, nnei, ndescrpt;
   double rc = 6;
@@ -102,16 +108,20 @@ class TestProdForceA : public ::testing::Test {
     for (int ii = 0; ii < nloc * ndescrpt; ++ii) {
       net_deriv[ii] = 10 - ii * 0.01;
     }
+    double_vec(nlist);
+    double_vec(net_deriv);
+    double_vec(env_deriv);
+    double_vec(expected_force);
   }
   void TearDown() override {}
 };
 
 TEST_F(TestProdForceA, cpu) {
-  std::vector<double> force(nall * 3);
+  std::vector<double> force(nframes * nall * 3);
   int n_a_sel = nnei;
   deepmd::prod_force_a_cpu<double>(&force[0], &net_deriv[0], &env_deriv[0],
-                                   &nlist[0], nloc, nall, nnei);
-  EXPECT_EQ(force.size(), nall * 3);
+                                   &nlist[0], nloc, nall, nnei, nframes);
+  EXPECT_EQ(force.size(), nframes * nall * 3);
   EXPECT_EQ(force.size(), expected_force.size());
   for (int jj = 0; jj < force.size(); ++jj) {
     EXPECT_LT(fabs(force[jj] - expected_force[jj]), 1e-5);
@@ -124,7 +134,7 @@ TEST_F(TestProdForceA, cpu) {
 
 #if GOOGLE_CUDA
 TEST_F(TestProdForceA, gpu_cuda) {
-  std::vector<double> force(nall * 3, 0.0);
+  std::vector<double> force(nframes * nall * 3, 0.0);
   int n_a_sel = nnei;
 
   int* nlist_dev = NULL;
@@ -136,7 +146,7 @@ TEST_F(TestProdForceA, gpu_cuda) {
   deepmd::malloc_device_memory_sync(env_deriv_dev, env_deriv);
 
   deepmd::prod_force_a_gpu_cuda<double>(force_dev, net_deriv_dev, env_deriv_dev,
-                                        nlist_dev, nloc, nall, nnei);
+                                        nlist_dev, nloc, nall, nnei, nframes);
 
   deepmd::memcpy_device_to_host(force_dev, force);
   deepmd::delete_device_memory(nlist_dev);
@@ -144,7 +154,7 @@ TEST_F(TestProdForceA, gpu_cuda) {
   deepmd::delete_device_memory(net_deriv_dev);
   deepmd::delete_device_memory(env_deriv_dev);
 
-  EXPECT_EQ(force.size(), nall * 3);
+  EXPECT_EQ(force.size(), nframes * nall * 3);
   EXPECT_EQ(force.size(), expected_force.size());
   for (int jj = 0; jj < force.size(); ++jj) {
     EXPECT_LT(fabs(force[jj] - expected_force[jj]), 1e-5);
@@ -154,7 +164,7 @@ TEST_F(TestProdForceA, gpu_cuda) {
 
 #if TENSORFLOW_USE_ROCM
 TEST_F(TestProdForceA, gpu_rocm) {
-  std::vector<double> force(nall * 3, 0.0);
+  std::vector<double> force(nframes * nall * 3, 0.0);
   int n_a_sel = nnei;
 
   int* nlist_dev = NULL;
@@ -166,7 +176,7 @@ TEST_F(TestProdForceA, gpu_rocm) {
   deepmd::malloc_device_memory_sync(env_deriv_dev, env_deriv);
 
   deepmd::prod_force_a_gpu_rocm<double>(force_dev, net_deriv_dev, env_deriv_dev,
-                                        nlist_dev, nloc, nall, nnei);
+                                        nlist_dev, nloc, nall, nnei, nframes);
 
   deepmd::memcpy_device_to_host(force_dev, force);
   deepmd::delete_device_memory(nlist_dev);
@@ -174,7 +184,7 @@ TEST_F(TestProdForceA, gpu_rocm) {
   deepmd::delete_device_memory(net_deriv_dev);
   deepmd::delete_device_memory(env_deriv_dev);
 
-  EXPECT_EQ(force.size(), nall * 3);
+  EXPECT_EQ(force.size(), nframes * nall * 3);
   EXPECT_EQ(force.size(), expected_force.size());
   for (int jj = 0; jj < force.size(); ++jj) {
     EXPECT_LT(fabs(force[jj] - expected_force[jj]), 1e-5);
diff --git a/source/lib/tests/test_prod_force_r.cc b/source/lib/tests/test_prod_force_r.cc
index a1b5392355..c2a2359e0b 100644
--- a/source/lib/tests/test_prod_force_r.cc
+++ b/source/lib/tests/test_prod_force_r.cc
@@ -8,6 +8,11 @@
 #include "neighbor_list.h"
 #include "prod_force.h"
 
+template <typename T>
+inline void double_vec(std::vector<T>& v) {
+  v.insert(std::end(v), std::begin(v), std::end(v));
+}
+
 class TestProdForceR : public ::testing::Test {
  protected:
   std::vector<double> posi = {12.83, 2.56, 2.18, 12.09, 2.87, 2.74,
@@ -16,6 +21,7 @@ class TestProdForceR : public ::testing::Test {
   std::vector<int> atype = {0, 1, 1, 0, 1, 1};
   std::vector<double> posi_cpy;
   std::vector<int> atype_cpy;
+  int nframes = 2;
   int ntypes = 2;
   int nloc, nall, nnei, ndescrpt;
   double rc = 6;
@@ -99,16 +105,20 @@ class TestProdForceR : public ::testing::Test {
     for (int ii = 0; ii < nloc * ndescrpt; ++ii) {
       net_deriv[ii] = 10 - ii * 0.01;
     }
+    double_vec(nlist);
+    double_vec(net_deriv);
+    double_vec(env_deriv);
+    double_vec(expected_force);
   }
   void TearDown() override {}
 };
 
 TEST_F(TestProdForceR, cpu) {
-  std::vector<double> force(nall * 3);
+  std::vector<double> force(nframes * nall * 3);
   int n_a_sel = nnei;
   deepmd::prod_force_r_cpu<double>(&force[0], &net_deriv[0], &env_deriv[0],
-                                   &nlist[0], nloc, nall, nnei);
-  EXPECT_EQ(force.size(), nall * 3);
+                                   &nlist[0], nloc, nall, nnei, nframes);
+  EXPECT_EQ(force.size(), nframes * nall * 3);
   EXPECT_EQ(force.size(), expected_force.size());
   for (int jj = 0; jj < force.size(); ++jj) {
     EXPECT_LT(fabs(force[jj] - expected_force[jj]), 1e-5);
@@ -121,7 +131,7 @@ TEST_F(TestProdForceR, cpu) {
 
 #if GOOGLE_CUDA
 TEST_F(TestProdForceR, gpu_cuda) {
-  std::vector<double> force(nall * 3, 0.0);
+  std::vector<double> force(nframes * nall * 3, 0.0);
   int n_a_sel = nnei;
 
   int* nlist_dev = NULL;
@@ -133,7 +143,7 @@ TEST_F(TestProdForceR, gpu_cuda) {
   deepmd::malloc_device_memory_sync(env_deriv_dev, env_deriv);
 
   deepmd::prod_force_r_gpu_cuda<double>(force_dev, net_deriv_dev, env_deriv_dev,
-                                        nlist_dev, nloc, nall, nnei);
+                                        nlist_dev, nloc, nall, nnei, nframes);
 
   deepmd::memcpy_device_to_host(force_dev, force);
   deepmd::delete_device_memory(nlist_dev);
@@ -141,7 +151,7 @@ TEST_F(TestProdForceR, gpu_cuda) {
   deepmd::delete_device_memory(net_deriv_dev);
   deepmd::delete_device_memory(env_deriv_dev);
 
-  EXPECT_EQ(force.size(), nall * 3);
+  EXPECT_EQ(force.size(), nframes * nall * 3);
   EXPECT_EQ(force.size(), expected_force.size());
   for (int jj = 0; jj < force.size(); ++jj) {
     EXPECT_LT(fabs(force[jj] - expected_force[jj]), 1e-5);
@@ -151,7 +161,7 @@ TEST_F(TestProdForceR, gpu_cuda) {
 
 #if TENSORFLOW_USE_ROCM
 TEST_F(TestProdForceR, gpu_rocm) {
-  std::vector<double> force(nall * 3, 0.0);
+  std::vector<double> force(nframes * nall * 3, 0.0);
   int n_a_sel = nnei;
 
   int* nlist_dev = NULL;
@@ -163,7 +173,7 @@ TEST_F(TestProdForceR, gpu_rocm) {
   deepmd::malloc_device_memory_sync(env_deriv_dev, env_deriv);
 
   deepmd::prod_force_r_gpu_rocm<double>(force_dev, net_deriv_dev, env_deriv_dev,
-                                        nlist_dev, nloc, nall, nnei);
+                                        nlist_dev, nloc, nall, nnei, nframes);
 
   deepmd::memcpy_device_to_host(force_dev, force);
   deepmd::delete_device_memory(nlist_dev);
@@ -171,7 +181,7 @@ TEST_F(TestProdForceR, gpu_rocm) {
   deepmd::delete_device_memory(net_deriv_dev);
   deepmd::delete_device_memory(env_deriv_dev);
 
-  EXPECT_EQ(force.size(), nall * 3);
+  EXPECT_EQ(force.size(), nframes * nall * 3);
   EXPECT_EQ(force.size(), expected_force.size());
   for (int jj = 0; jj < force.size(); ++jj) {
     EXPECT_LT(fabs(force[jj] - expected_force[jj]), 1e-5);
diff --git a/source/op/prod_force_multi_device.cc b/source/op/prod_force_multi_device.cc
index 73914292db..8ba62882cf 100644
--- a/source/op/prod_force_multi_device.cc
+++ b/source/op/prod_force_multi_device.cc
@@ -136,25 +136,20 @@ class ProdForceSeAOp : public OpKernel {
       nloc_loc = end_index - start_index;
     }
 
-    for (int_64 kk = 0; kk < nframes; ++kk) {
-      FPTYPE* force = p_force + kk * nall * 3;
-      const FPTYPE* net_deriv = p_net_deriv + kk * nloc * ndescrpt;
-      const FPTYPE* in_deriv = p_in_deriv + kk * nloc * ndescrpt * 3;
-      const int* nlist = p_nlist + kk * nloc * nnei;
-      if (device == "GPU") {
+    if (device == "GPU") {
 #if GOOGLE_CUDA
-        deepmd::prod_force_a_gpu_cuda(force, net_deriv, in_deriv, nlist, nloc,
-                                      nall, nnei);
+      deepmd::prod_force_a_gpu_cuda(p_force, p_net_deriv, p_in_deriv, p_nlist,
+                                    nloc, nall, nnei, nframes);
 #endif  // GOOGLE_CUDA
 
 #if TENSORFLOW_USE_ROCM
-        deepmd::prod_force_a_gpu_rocm(force, net_deriv, in_deriv, nlist, nloc,
-                                      nall, nnei);
+      deepmd::prod_force_a_gpu_rocm(p_force, p_net_deriv, p_in_deriv, p_nlist,
+                                    nloc, nall, nnei, nframes);
 #endif  // TENSORFLOW_USE_ROCM
-      } else if (device == "CPU") {
-        deepmd::prod_force_a_cpu(force, net_deriv, in_deriv, nlist, nloc_loc,
-                                 nall, nnei, start_index = start_index);
-      }
+    } else if (device == "CPU") {
+      deepmd::prod_force_a_cpu(p_force, p_net_deriv, p_in_deriv, p_nlist, nloc,
+                               nall, nnei, nframes, nloc_loc,
+                               start_index = start_index);
     }
   }
 
@@ -227,25 +222,19 @@ class ProdForceSeROp : public OpKernel {
     const FPTYPE* p_in_deriv = in_deriv_tensor.flat<FPTYPE>().data();
     const int* p_nlist = nlist_tensor.flat<int>().data();
 
-    for (int_64 kk = 0; kk < nframes; ++kk) {
-      FPTYPE* force = p_force + kk * nall * 3;
-      const FPTYPE* net_deriv = p_net_deriv + kk * nloc * ndescrpt;
-      const FPTYPE* in_deriv = p_in_deriv + kk * nloc * ndescrpt * 3;
-      const int* nlist = p_nlist + kk * nloc * nnei;
-      if (device == "GPU") {
+    if (device == "GPU") {
 #if GOOGLE_CUDA
-        deepmd::prod_force_r_gpu_cuda(force, net_deriv, in_deriv, nlist, nloc,
-                                      nall, nnei);
+      deepmd::prod_force_r_gpu_cuda(p_force, p_net_deriv, p_in_deriv, p_nlist,
+                                    nloc, nall, nnei, nframes);
 #endif  // GOOGLE_CUDA
 
 #if TENSORFLOW_USE_ROCM
-        deepmd::prod_force_r_gpu_rocm(force, net_deriv, in_deriv, nlist, nloc,
-                                      nall, nnei);
+      deepmd::prod_force_r_gpu_rocm(p_force, p_net_deriv, p_in_deriv, p_nlist,
+                                    nloc, nall, nnei, nframes);
 #endif  // TENSORFLOW_USE_ROCM
-      } else if (device == "CPU") {
-        deepmd::prod_force_r_cpu(force, net_deriv, in_deriv, nlist, nloc, nall,
-                                 nnei);
-      }
+    } else if (device == "CPU") {
+      deepmd::prod_force_r_cpu(p_force, p_net_deriv, p_in_deriv, p_nlist, nloc,
+                               nall, nnei, nframes);
     }
   }