diff --git a/.github/workflows/test_cuda.yml b/.github/workflows/test_cuda.yml
index db6b6bbbaa..3cb75ecf1a 100644
--- a/.github/workflows/test_cuda.yml
+++ b/.github/workflows/test_cuda.yml
@@ -76,7 +76,7 @@ jobs:
     - run: |
         export LD_LIBRARY_PATH=$GITHUB_WORKSPACE/dp_test/lib:$GITHUB_WORKSPACE/libtorch/lib:$CUDA_PATH/lib64:$LD_LIBRARY_PATH
         export PATH=$GITHUB_WORKSPACE/dp_test/bin:$PATH
-        python -m pytest source/lmp/tests
+        python -m pytest -s source/lmp/tests || (cat log.lammps && exit 1)
         python -m pytest source/ipi/tests
       env:
         OMP_NUM_THREADS: 1
diff --git a/deepmd/pt/entrypoints/main.py b/deepmd/pt/entrypoints/main.py
index 63463ceeef..eafce67e84 100644
--- a/deepmd/pt/entrypoints/main.py
+++ b/deepmd/pt/entrypoints/main.py
@@ -286,10 +286,14 @@ def train(FLAGS):
 
 def freeze(FLAGS):
     model = torch.jit.script(inference.Tester(FLAGS.model, head=FLAGS.head).model)
+    if '"type": "dpa2"' in model.model_def_script:
+        extra_files = {"type": "dpa2"}
+    else:
+        extra_files = {"type": "else"}
     torch.jit.save(
         model,
         FLAGS.output,
-        {},
+        extra_files,
     )
 
 
diff --git a/deepmd/pt/model/atomic_model/base_atomic_model.py b/deepmd/pt/model/atomic_model/base_atomic_model.py
index 10fa2a7bd9..3be052919d 100644
--- a/deepmd/pt/model/atomic_model/base_atomic_model.py
+++ b/deepmd/pt/model/atomic_model/base_atomic_model.py
@@ -186,6 +186,7 @@ def forward_common_atomic(
         mapping: Optional[torch.Tensor] = None,
         fparam: Optional[torch.Tensor] = None,
         aparam: Optional[torch.Tensor] = None,
+        comm_dict: Optional[Dict[str, torch.Tensor]] = None,
     ) -> Dict[str, torch.Tensor]:
         """Common interface for atomic inference.
 
@@ -207,6 +208,8 @@ def forward_common_atomic(
             frame parameters, shape: nf x dim_fparam
         aparam
             atomic parameter, shape: nf x nloc x dim_aparam
+        comm_dict
+            The data needed for communication for parallel inference.
 
         Returns
         -------
@@ -234,6 +237,7 @@ def forward_common_atomic(
             mapping=mapping,
             fparam=fparam,
             aparam=aparam,
+            comm_dict=comm_dict,
         )
         ret_dict = self.apply_out_stat(ret_dict, atype)
 
diff --git a/deepmd/pt/model/atomic_model/dp_atomic_model.py b/deepmd/pt/model/atomic_model/dp_atomic_model.py
index 182196aca5..3d9a57bf70 100644
--- a/deepmd/pt/model/atomic_model/dp_atomic_model.py
+++ b/deepmd/pt/model/atomic_model/dp_atomic_model.py
@@ -130,6 +130,7 @@ def forward_atomic(
         mapping: Optional[torch.Tensor] = None,
         fparam: Optional[torch.Tensor] = None,
         aparam: Optional[torch.Tensor] = None,
+        comm_dict: Optional[Dict[str, torch.Tensor]] = None,
     ) -> Dict[str, torch.Tensor]:
         """Return atomic prediction.
 
@@ -163,6 +164,7 @@ def forward_atomic(
             extended_atype,
             nlist,
             mapping=mapping,
+            comm_dict=comm_dict,
         )
         assert descriptor is not None
         # energy, force
diff --git a/deepmd/pt/model/atomic_model/linear_atomic_model.py b/deepmd/pt/model/atomic_model/linear_atomic_model.py
index b58594d3ce..bf03a68f31 100644
--- a/deepmd/pt/model/atomic_model/linear_atomic_model.py
+++ b/deepmd/pt/model/atomic_model/linear_atomic_model.py
@@ -144,6 +144,7 @@ def forward_atomic(
         mapping: Optional[torch.Tensor] = None,
         fparam: Optional[torch.Tensor] = None,
         aparam: Optional[torch.Tensor] = None,
+        comm_dict: Optional[Dict[str, torch.Tensor]] = None,
     ) -> Dict[str, torch.Tensor]:
         """Return atomic prediction.
 
diff --git a/deepmd/pt/model/atomic_model/pairtab_atomic_model.py b/deepmd/pt/model/atomic_model/pairtab_atomic_model.py
index 4f8bce78e1..3a0700be4f 100644
--- a/deepmd/pt/model/atomic_model/pairtab_atomic_model.py
+++ b/deepmd/pt/model/atomic_model/pairtab_atomic_model.py
@@ -228,6 +228,7 @@ def forward_atomic(
         fparam: Optional[torch.Tensor] = None,
         aparam: Optional[torch.Tensor] = None,
         do_atomic_virial: bool = False,
+        comm_dict: Optional[Dict[str, torch.Tensor]] = None,
     ) -> Dict[str, torch.Tensor]:
         nframes, nloc, nnei = nlist.shape
         extended_coord = extended_coord.view(nframes, -1, 3)
diff --git a/deepmd/pt/model/descriptor/dpa1.py b/deepmd/pt/model/descriptor/dpa1.py
index 852e08403c..71c8a13f46 100644
--- a/deepmd/pt/model/descriptor/dpa1.py
+++ b/deepmd/pt/model/descriptor/dpa1.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 from typing import (
     Callable,
+    Dict,
     List,
     Optional,
     Tuple,
@@ -453,6 +454,7 @@ def forward(
         extended_atype: torch.Tensor,
         nlist: torch.Tensor,
         mapping: Optional[torch.Tensor] = None,
+        comm_dict: Optional[Dict[str, torch.Tensor]] = None,
     ):
         """Compute the descriptor.
 
@@ -466,6 +468,8 @@ def forward(
             The neighbor list. shape: nf x nloc x nnei
         mapping
             The index mapping, not required by this descriptor.
+        comm_dict
+            The data needed for communication for parallel inference.
 
         Returns
         -------
diff --git a/deepmd/pt/model/descriptor/dpa2.py b/deepmd/pt/model/descriptor/dpa2.py
index fb792a51e2..2bf4d193f3 100644
--- a/deepmd/pt/model/descriptor/dpa2.py
+++ b/deepmd/pt/model/descriptor/dpa2.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 from typing import (
     Callable,
+    Dict,
     List,
     Optional,
     Tuple,
@@ -395,6 +396,7 @@ def forward(
         extended_atype: torch.Tensor,
         nlist: torch.Tensor,
         mapping: Optional[torch.Tensor] = None,
+        comm_dict: Optional[Dict[str, torch.Tensor]] = None,
     ):
         """Compute the descriptor.
 
@@ -408,6 +410,8 @@ def forward(
             The neighbor list. shape: nf x nloc x nnei
         mapping
             The index mapping, mapps extended region index to local region.
+        comm_dict
+            The data needed for communication for parallel inference.
 
         Returns
         -------
@@ -450,11 +454,13 @@ def forward(
         # linear to change shape
         g1 = self.g1_shape_tranform(g1)
         # mapping g1
-        assert mapping is not None
-        mapping_ext = (
-            mapping.view(nframes, nall).unsqueeze(-1).expand(-1, -1, g1.shape[-1])
-        )
-        g1_ext = torch.gather(g1, 1, mapping_ext)
+        if comm_dict is None:
+            assert mapping is not None
+            mapping_ext = (
+                mapping.view(nframes, nall).unsqueeze(-1).expand(-1, -1, g1.shape[-1])
+            )
+            g1_ext = torch.gather(g1, 1, mapping_ext)
+            g1 = g1_ext
         # repformer
         g1, g2, h2, rot_mat, sw = self.repformers(
             nlist_dict[
@@ -464,8 +470,9 @@ def forward(
             ],
             extended_coord,
             extended_atype,
-            g1_ext,
+            g1,
             mapping,
+            comm_dict,
         )
         if self.concat_output_tebd:
             g1 = torch.cat([g1, g1_inp], dim=-1)
diff --git a/deepmd/pt/model/descriptor/hybrid.py b/deepmd/pt/model/descriptor/hybrid.py
index 204ca7589d..731971f056 100644
--- a/deepmd/pt/model/descriptor/hybrid.py
+++ b/deepmd/pt/model/descriptor/hybrid.py
@@ -168,6 +168,7 @@ def forward(
         atype_ext: torch.Tensor,
         nlist: torch.Tensor,
         mapping: Optional[torch.Tensor] = None,
+        comm_dict: Optional[Dict[str, torch.Tensor]] = None,
     ):
         """Compute the descriptor.
 
@@ -181,6 +182,8 @@ def forward(
             The neighbor list. shape: nf x nloc x nnei
         mapping
             The index mapping, not required by this descriptor.
+        comm_dict
+            The data needed for communication for parallel inference.
 
         Returns
         -------
@@ -443,6 +446,7 @@ def forward(
         extended_atype: torch.Tensor,
         extended_atype_embd: Optional[torch.Tensor] = None,
         mapping: Optional[torch.Tensor] = None,
+        comm_dict: Optional[Dict[str, torch.Tensor]] = None,
     ):
         """Calculate decoded embedding for each atom.
 
diff --git a/deepmd/pt/model/descriptor/repformers.py b/deepmd/pt/model/descriptor/repformers.py
index 16a38052b1..c91ca8056b 100644
--- a/deepmd/pt/model/descriptor/repformers.py
+++ b/deepmd/pt/model/descriptor/repformers.py
@@ -54,6 +54,27 @@ def torch_linear(*args, **kwargs):
 mylinear = simple_linear
 
 
+if not hasattr(torch.ops.deepmd, "border_op"):
+
+    def border_op(
+        argument0,
+        argument1,
+        argument2,
+        argument3,
+        argument4,
+        argument5,
+        argument6,
+        argument7,
+        argument8,
+    ) -> torch.Tensor:
+        raise NotImplementedError(
+            "border_op is not available since customized PyTorch OP library is not built when freezing the model."
+        )
+
+    # Note: this hack cannot actually save a model that can be runned using LAMMPS.
+    torch.ops.deepmd.border_op = border_op
+
+
 @DescriptorBlock.register("se_repformer")
 @DescriptorBlock.register("se_uni")
 class DescrptBlockRepformers(DescriptorBlock):
@@ -234,9 +255,11 @@ def forward(
         extended_atype: torch.Tensor,
         extended_atype_embd: Optional[torch.Tensor] = None,
         mapping: Optional[torch.Tensor] = None,
+        comm_dict: Optional[Dict[str, torch.Tensor]] = None,
     ):
-        assert mapping is not None
-        assert extended_atype_embd is not None
+        if comm_dict is None:
+            assert mapping is not None
+            assert extended_atype_embd is not None
         nframes, nloc, nnei = nlist.shape
         nall = extended_coord.view(nframes, -1).shape[1] // 3
         atype = extended_atype[:, :nloc]
@@ -257,9 +280,13 @@ def forward(
         sw = sw.masked_fill(~nlist_mask, 0.0)
 
         # [nframes, nloc, tebd_dim]
-        atype_embd = extended_atype_embd[:, :nloc, :]
-        assert list(atype_embd.shape) == [nframes, nloc, self.g1_dim]
-
+        if comm_dict is None:
+            assert isinstance(extended_atype_embd, torch.Tensor)  # for jit
+            atype_embd = extended_atype_embd[:, :nloc, :]
+            assert list(atype_embd.shape) == [nframes, nloc, self.g1_dim]
+        else:
+            atype_embd = extended_atype_embd
+        assert isinstance(atype_embd, torch.Tensor)  # for jit
         g1 = self.act(atype_embd)
         # nb x nloc x nnei x 1,  nb x nloc x nnei x 3
         if not self.direct_dist:
@@ -275,11 +302,40 @@ def forward(
         # if the a neighbor is real or not is indicated by nlist_mask
         nlist[nlist == -1] = 0
         # nb x nall x ng1
-        mapping = mapping.view(nframes, nall).unsqueeze(-1).expand(-1, -1, self.g1_dim)
+        if comm_dict is None:
+            assert mapping is not None
+            mapping = (
+                mapping.view(nframes, nall).unsqueeze(-1).expand(-1, -1, self.g1_dim)
+            )
         for idx, ll in enumerate(self.layers):
             # g1:     nb x nloc x ng1
             # g1_ext: nb x nall x ng1
-            g1_ext = torch.gather(g1, 1, mapping)
+            if comm_dict is None:
+                assert mapping is not None
+                g1_ext = torch.gather(g1, 1, mapping)
+            else:
+                n_padding = nall - nloc
+                g1 = torch.nn.functional.pad(
+                    g1.squeeze(0), (0, 0, 0, n_padding), value=0.0
+                )
+                assert "send_list" in comm_dict
+                assert "send_proc" in comm_dict
+                assert "recv_proc" in comm_dict
+                assert "send_num" in comm_dict
+                assert "recv_num" in comm_dict
+                assert "communicator" in comm_dict
+                ret = torch.ops.deepmd.border_op(
+                    comm_dict["send_list"],
+                    comm_dict["send_proc"],
+                    comm_dict["recv_proc"],
+                    comm_dict["send_num"],
+                    comm_dict["recv_num"],
+                    g1,
+                    comm_dict["communicator"],
+                    torch.tensor(nloc),
+                    torch.tensor(nall - nloc),
+                )
+                g1_ext = ret[0].unsqueeze(0)
             g1, g2, h2 = ll.forward(
                 g1_ext,
                 g2,
diff --git a/deepmd/pt/model/descriptor/se_a.py b/deepmd/pt/model/descriptor/se_a.py
index 8b83f0d27b..3316ed5de7 100644
--- a/deepmd/pt/model/descriptor/se_a.py
+++ b/deepmd/pt/model/descriptor/se_a.py
@@ -191,6 +191,7 @@ def forward(
         atype_ext: torch.Tensor,
         nlist: torch.Tensor,
         mapping: Optional[torch.Tensor] = None,
+        comm_dict: Optional[Dict[str, torch.Tensor]] = None,
     ):
         """Compute the descriptor.
 
@@ -204,6 +205,8 @@ def forward(
             The neighbor list. shape: nf x nloc x nnei
         mapping
             The index mapping, not required by this descriptor.
+        comm_dict
+            The data needed for communication for parallel inference.
 
         Returns
         -------
diff --git a/deepmd/pt/model/model/ener_model.py b/deepmd/pt/model/model/ener_model.py
index 4a0eb49945..5256e42bb3 100644
--- a/deepmd/pt/model/model/ener_model.py
+++ b/deepmd/pt/model/model/ener_model.py
@@ -83,6 +83,7 @@ def forward_lower(
         fparam: Optional[torch.Tensor] = None,
         aparam: Optional[torch.Tensor] = None,
         do_atomic_virial: bool = False,
+        comm_dict: Optional[Dict[str, torch.Tensor]] = None,
     ):
         model_ret = self.forward_common_lower(
             extended_coord,
@@ -92,6 +93,7 @@ def forward_lower(
             fparam=fparam,
             aparam=aparam,
             do_atomic_virial=do_atomic_virial,
+            comm_dict=comm_dict,
         )
         if self.get_fitting_net() is not None:
             model_predict = {}
diff --git a/deepmd/pt/model/model/make_model.py b/deepmd/pt/model/model/make_model.py
index 386b5e14f9..989789c201 100644
--- a/deepmd/pt/model/model/make_model.py
+++ b/deepmd/pt/model/model/make_model.py
@@ -211,6 +211,7 @@ def forward_common_lower(
             fparam: Optional[torch.Tensor] = None,
             aparam: Optional[torch.Tensor] = None,
             do_atomic_virial: bool = False,
+            comm_dict: Optional[Dict[str, torch.Tensor]] = None,
         ):
             """Return model prediction. Lower interface that takes
             extended atomic coordinates and types, nlist, and mapping
@@ -233,6 +234,8 @@ def forward_common_lower(
                 atomic parameter. nf x nloc x nda
             do_atomic_virial
                 whether calculate atomic virial.
+            comm_dict
+                The data needed for communication for parallel inference.
 
             Returns
             -------
@@ -254,6 +257,7 @@ def forward_common_lower(
                 mapping=mapping,
                 fparam=fp,
                 aparam=ap,
+                comm_dict=comm_dict,
             )
             model_predict = fit_output_to_model_output(
                 atomic_ret,
diff --git a/source/api_c/include/c_api.h b/source/api_c/include/c_api.h
index cac6de377a..3ba5b5e107 100644
--- a/source/api_c/include/c_api.h
+++ b/source/api_c/include/c_api.h
@@ -24,12 +24,49 @@ extern DP_Nlist* DP_NewNlist(int inum_,
                              int* ilist_,
                              int* numneigh_,
                              int** firstneigh_);
+/*
+ * @brief Create a new neighbor list with communication capabilities.
+ * @details This function extends DP_NewNlist by adding support for parallel
+ * communication, allowing the neighbor list to be used in distributed
+ * environments.
+ * @param[in] inum_ Number of core region atoms.
+ * @param[in] ilist_ Array storing the core region atom's index.
+ * @param[in] numneigh_ Array storing the core region atom's neighbor atom
+ * number.
+ * @param[in] firstneigh_ Array storing the core region atom's neighbor index.
+ * @param[in] nswap Number of swaps to be performed in communication.
+ * @param[in] sendnum Array storing the number of atoms to send for each swap.
+ * @param[in] recvnum Array storing the number of atoms to receive for each
+ * swap.
+ * @param[in] firstrecv Index of the first receive operation for each swap.
+ * @param[in] sendlist List of atoms to be sent for each swap.
+ * @param[in] sendproc Array of processor IDs to send atoms to for each swap.
+ * @param[in] recvproc Array of processor IDs from which atoms are received for
+ * each swap.
+ * @param[in] world Pointer to the MPI communicator or similar communication
+ * world used for the operation.
+ * @returns A pointer to the initialized neighbor list with communication
+ * capabilities.
+ */
+extern DP_Nlist* DP_NewNlist_comm(int inum_,
+                                  int* ilist_,
+                                  int* numneigh_,
+                                  int** firstneigh_,
+                                  int nswap,
+                                  int* sendnum,
+                                  int* recvnum,
+                                  int* firstrecv,
+                                  int** sendlist,
+                                  int* sendproc,
+                                  int* recvproc,
+                                  void* world);
 
 /**
  * @brief Delete a neighbor list.
  *
  * @param nl Neighbor list to delete.
- */
+ *
+ **/
 extern void DP_DeleteNlist(DP_Nlist* nl);
 
 /**
diff --git a/source/api_c/include/deepmd.hpp b/source/api_c/include/deepmd.hpp
index 059612f7af..9d0310d99a 100644
--- a/source/api_c/include/deepmd.hpp
+++ b/source/api_c/include/deepmd.hpp
@@ -572,6 +572,34 @@ struct InputNlist {
         nl(DP_NewNlist(inum_, ilist_, numneigh_, firstneigh_)) {
     DP_CHECK_OK(DP_NlistCheckOK, nl);
   };
+  InputNlist(int inum_,
+             int *ilist_,
+             int *numneigh_,
+             int **firstneigh_,
+             int nswap,
+             int *sendnum,
+             int *recvnum,
+             int *firstrecv,
+             int **sendlist,
+             int *sendproc,
+             int *recvproc,
+             void *world)
+      : inum(inum_),
+        ilist(ilist_),
+        numneigh(numneigh_),
+        firstneigh(firstneigh_),
+        nl(DP_NewNlist_comm(inum_,
+                            ilist_,
+                            numneigh_,
+                            firstneigh_,
+                            nswap,
+                            sendnum,
+                            recvnum,
+                            firstrecv,
+                            sendlist,
+                            sendproc,
+                            recvproc,
+                            world)) {};
   ~InputNlist() { DP_DeleteNlist(nl); };
   /// @brief C API neighbor list.
   DP_Nlist *nl;
diff --git a/source/api_c/src/c_api.cc b/source/api_c/src/c_api.cc
index e21cd48ffa..77b74a58d1 100644
--- a/source/api_c/src/c_api.cc
+++ b/source/api_c/src/c_api.cc
@@ -24,7 +24,24 @@ DP_Nlist* DP_NewNlist(int inum_,
             deepmd::InputNlist nl(inum_, ilist_, numneigh_, firstneigh_);
             DP_Nlist* new_nl = new DP_Nlist(nl); return new_nl;)
 }
-
+DP_Nlist* DP_NewNlist_comm(int inum_,
+                           int* ilist_,
+                           int* numneigh_,
+                           int** firstneigh_,
+                           int nswap,
+                           int* sendnum,
+                           int* recvnum,
+                           int* firstrecv,
+                           int** sendlist,
+                           int* sendproc,
+                           int* recvproc,
+                           void* world) {
+  deepmd::InputNlist nl(inum_, ilist_, numneigh_, firstneigh_, nswap, sendnum,
+                        recvnum, firstrecv, sendlist, sendproc, recvproc,
+                        world);
+  DP_Nlist* new_nl = new DP_Nlist(nl);
+  return new_nl;
+}
 void DP_DeleteNlist(DP_Nlist* nl) { delete nl; }
 
 DP_DeepPot::DP_DeepPot() {}
diff --git a/source/api_cc/include/DeepPotPT.h b/source/api_cc/include/DeepPotPT.h
index a7fc910b46..dade7129e1 100644
--- a/source/api_cc/include/DeepPotPT.h
+++ b/source/api_cc/include/DeepPotPT.h
@@ -325,8 +325,10 @@ class DeepPotPT : public DeepPotBase {
   NeighborListData nlist_data;
   int max_num_neighbors;
   int gpu_id;
+  int do_message_passing;  // 1:dpa2 model 0:others
   bool gpu_enabled;
   at::Tensor firstneigh_tensor;
+  torch::Dict<std::string, torch::Tensor> comm_dict;
 };
 
 }  // namespace deepmd
diff --git a/source/api_cc/src/DeepPotPT.cc b/source/api_cc/src/DeepPotPT.cc
index b4631b5e46..af8a0c0252 100644
--- a/source/api_cc/src/DeepPotPT.cc
+++ b/source/api_cc/src/DeepPotPT.cc
@@ -53,8 +53,15 @@ void DeepPotPT::init(const std::string& model,
     std::cout << "load model from: " << model << " to gpu " << gpu_id
               << std::endl;
   }
-  module = torch::jit::load(model, device);
-
+  std::unordered_map<std::string, std::string> metadata = {{"type", ""}};
+  module = torch::jit::load(model, device, metadata);
+  // TODO: This should be fixed after implement api to decide whether need to
+  // message passing and rename this metadata
+  if (metadata["type"] == "dpa2") {
+    do_message_passing = 1;
+  } else {
+    do_message_passing = 0;
+  }
   torch::jit::FusionStrategy strategy;
   strategy = {{torch::jit::FusionBehavior::DYNAMIC, 10}};
   torch::jit::setFusionStrategy(strategy);
@@ -111,8 +118,10 @@ void DeepPotPT::compute(ENERGYVTYPE& ener,
     options = torch::TensorOptions().dtype(torch::kFloat32);
     floatType = torch::kFloat32;
   }
-  auto int_options = torch::TensorOptions().dtype(torch::kInt64);
-  auto int32_options = torch::TensorOptions().dtype(torch::kInt32);
+  auto int32_option =
+      torch::TensorOptions().device(torch::kCPU).dtype(torch::kInt32);
+  auto int_option =
+      torch::TensorOptions().device(torch::kCPU).dtype(torch::kInt64);
   // select real atoms
   std::vector<VALUETYPE> dcoord, dforce, aparam_, datom_energy, datom_virial;
   std::vector<int> datype, fwd_map, bkw_map;
@@ -123,6 +132,8 @@ void DeepPotPT::compute(ENERGYVTYPE& ener,
                           nghost, ntypes, 1, daparam, nall, aparam_nall);
   int nloc = nall_real - nghost_real;
   int nframes = 1;
+  // TODO: dpa2 model may need a fake communication op to deal with nloc == 0.
+  // this should be fixed after wrapping comm op as a pure c++ implementation.
   if (nloc == 0) {
     // no backward map needed
     ener.resize(nframes);
@@ -146,11 +157,39 @@ void DeepPotPT::compute(ENERGYVTYPE& ener,
           .to(device);
   std::vector<int64_t> atype_64(datype.begin(), datype.end());
   at::Tensor atype_Tensor =
-      torch::from_blob(atype_64.data(), {1, nall_real}, int_options).to(device);
+      torch::from_blob(atype_64.data(), {1, nall_real}, int_option).to(device);
   if (ago == 0) {
     nlist_data.copy_from_nlist(lmp_list);
     nlist_data.shuffle_exclude_empty(fwd_map);
     nlist_data.padding();
+    if (do_message_passing == 1) {
+      int nswap = lmp_list.nswap;
+      torch::Tensor sendproc_tensor =
+          torch::from_blob(lmp_list.sendproc, {nswap}, int32_option);
+      torch::Tensor recvproc_tensor =
+          torch::from_blob(lmp_list.recvproc, {nswap}, int32_option);
+      torch::Tensor firstrecv_tensor =
+          torch::from_blob(lmp_list.firstrecv, {nswap}, int32_option);
+      torch::Tensor recvnum_tensor =
+          torch::from_blob(lmp_list.recvnum, {nswap}, int32_option);
+      torch::Tensor sendnum_tensor =
+          torch::from_blob(lmp_list.sendnum, {nswap}, int32_option);
+      torch::Tensor communicator_tensor = torch::from_blob(
+          const_cast<void*>(lmp_list.world), {1}, torch::kInt64);
+      // torch::Tensor communicator_tensor =
+      //     torch::tensor(lmp_list.world, int32_option);
+      torch::Tensor nswap_tensor = torch::tensor(nswap, int32_option);
+      int total_send =
+          std::accumulate(lmp_list.sendnum, lmp_list.sendnum + nswap, 0);
+      torch::Tensor sendlist_tensor =
+          torch::from_blob(lmp_list.sendlist, {total_send}, int32_option);
+      comm_dict.insert("send_list", sendlist_tensor);
+      comm_dict.insert("send_proc", sendproc_tensor);
+      comm_dict.insert("recv_proc", recvproc_tensor);
+      comm_dict.insert("send_num", sendnum_tensor);
+      comm_dict.insert("recv_num", recvnum_tensor);
+      comm_dict.insert("communicator", communicator_tensor);
+    }
   }
   at::Tensor firstneigh = createNlistTensor(nlist_data.jlist);
   firstneigh_tensor = firstneigh.to(torch::kInt64).to(device);
@@ -173,11 +212,17 @@ void DeepPotPT::compute(ENERGYVTYPE& ener,
                         .to(device);
   }
   c10::Dict<c10::IValue, c10::IValue> outputs =
-      module
-          .run_method("forward_lower", coord_wrapped_Tensor, atype_Tensor,
-                      firstneigh_tensor, optional_tensor, fparam_tensor,
-                      aparam_tensor, do_atom_virial_tensor)
-          .toGenericDict();
+      (do_message_passing == 1)
+          ? module
+                .run_method("forward_lower", coord_wrapped_Tensor, atype_Tensor,
+                            firstneigh_tensor, optional_tensor, fparam_tensor,
+                            aparam_tensor, do_atom_virial_tensor, comm_dict)
+                .toGenericDict()
+          : module
+                .run_method("forward_lower", coord_wrapped_Tensor, atype_Tensor,
+                            firstneigh_tensor, optional_tensor, fparam_tensor,
+                            aparam_tensor, do_atom_virial_tensor)
+                .toGenericDict();
   c10::IValue energy_ = outputs.at("energy");
   c10::IValue force_ = outputs.at("extended_force");
   c10::IValue virial_ = outputs.at("virial");
diff --git a/source/api_cc/tests/test_deeppot_dpa_pt.cc b/source/api_cc/tests/test_deeppot_dpa_pt.cc
new file mode 100644
index 0000000000..416802cd20
--- /dev/null
+++ b/source/api_cc/tests/test_deeppot_dpa_pt.cc
@@ -0,0 +1,144 @@
+// SPDX-License-Identifier: LGPL-3.0-or-later
+#include <fcntl.h>
+#include <gtest/gtest.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+#include <algorithm>
+#include <cmath>
+#include <fstream>
+#include <vector>
+
+#include "DeepPot.h"
+#include "neighbor_list.h"
+#include "test_utils.h"
+
+// 1e-10 cannot pass; unclear bug or not
+#undef EPSILON
+#define EPSILON (std::is_same<VALUETYPE, double>::value ? 1e-7 : 1e-1)
+
+template <class VALUETYPE>
+class TestInferDeepPotDpaPt : public ::testing::Test {
+ protected:
+  std::vector<VALUETYPE> coord = {12.83, 2.56, 2.18, 12.09, 2.87, 2.74,
+                                  00.25, 3.32, 1.68, 3.36,  3.00, 1.81,
+                                  3.51,  2.51, 2.60, 4.27,  3.22, 1.56};
+  std::vector<int> atype = {0, 1, 1, 0, 1, 1};
+  std::vector<VALUETYPE> box = {13., 0., 0., 0., 13., 0., 0., 0., 13.};
+  std::vector<VALUETYPE> expected_e = {-93.295296030283,  -186.548183879333,
+                                       -186.988827037855, -93.295307298571,
+                                       -186.799369383945, -186.507754447584};
+  std::vector<VALUETYPE> expected_f = {
+      4.964133039248,  -0.542378158452, -0.381267990914, -0.563388054735,
+      0.340320322541,  0.473406268590,  0.159774831398,  0.684651816874,
+      -0.377008867620, -4.718603033927, -0.012604322920, -0.425121993870,
+      -0.500302936762, -0.637586419292, 0.930351899011,  0.658386154778,
+      0.167596761250,  -0.220359315197};
+  std::vector<VALUETYPE> expected_v = {
+      -5.055176133632, -0.743392222876, 0.330846378467,  -0.031111229868,
+      0.018004461517,  0.170047655301,  -0.063087726831, -0.004361215202,
+      -0.042920299661, 3.624188578021,  -0.252818122305, -0.026516806138,
+      -0.014510755893, 0.103726553937,  0.181001311123,  -0.508673535094,
+      0.142101134395,  0.135339636607,  -0.460067993361, 0.120541583338,
+      -0.206396390140, -0.630991740522, 0.397670086144,  -0.427022150075,
+      0.656463775044,  -0.209989614377, 0.288974239790,  -7.603428707029,
+      -0.912313971544, 0.882084544041,  -0.807760666057, -0.070519570327,
+      0.022164414763,  0.569448616709,  0.028522950109,  0.051641619288,
+      -1.452133900157, 0.037653156584,  -0.144421326931, -0.308825789350,
+      0.302020522568,  -0.446073217801, 0.313539058423,  -0.461052923736,
+      0.678235442273,  1.429780276456,  0.080472825760,  -0.103424652500,
+      0.123343430648,  0.011879908277,  -0.018897229721, -0.235518441452,
+      -0.013999547600, 0.027007016662};
+  int natoms;
+  double expected_tot_e;
+  std::vector<VALUETYPE> expected_tot_v;
+
+  deepmd::DeepPot dp;
+
+  void SetUp() override {
+    dp.init("../../tests/infer/deeppot_dpa.pth");
+
+    natoms = expected_e.size();
+    EXPECT_EQ(natoms * 3, expected_f.size());
+    EXPECT_EQ(natoms * 9, expected_v.size());
+    expected_tot_e = 0.;
+    expected_tot_v.resize(9);
+    std::fill(expected_tot_v.begin(), expected_tot_v.end(), 0.);
+    for (int ii = 0; ii < natoms; ++ii) {
+      expected_tot_e += expected_e[ii];
+    }
+    for (int ii = 0; ii < natoms; ++ii) {
+      for (int dd = 0; dd < 9; ++dd) {
+        expected_tot_v[dd] += expected_v[ii * 9 + dd];
+      }
+    }
+  };
+
+  void TearDown() override { remove("deeppot.pb"); };
+};
+
+TYPED_TEST_SUITE(TestInferDeepPotDpaPt, ValueTypes);
+
+TYPED_TEST(TestInferDeepPotDpaPt, cpu_build_nlist) {
+  using VALUETYPE = TypeParam;
+  std::vector<VALUETYPE>& coord = this->coord;
+  std::vector<int>& atype = this->atype;
+  std::vector<VALUETYPE>& box = this->box;
+  std::vector<VALUETYPE>& expected_e = this->expected_e;
+  std::vector<VALUETYPE>& expected_f = this->expected_f;
+  std::vector<VALUETYPE>& expected_v = this->expected_v;
+  int& natoms = this->natoms;
+  double& expected_tot_e = this->expected_tot_e;
+  std::vector<VALUETYPE>& expected_tot_v = this->expected_tot_v;
+  deepmd::DeepPot& dp = this->dp;
+  double ener;
+  std::vector<VALUETYPE> force, virial;
+  dp.compute(ener, force, virial, coord, atype, box);
+
+  EXPECT_EQ(force.size(), natoms * 3);
+  EXPECT_EQ(virial.size(), 9);
+
+  EXPECT_LT(fabs(ener - expected_tot_e), EPSILON);
+  for (int ii = 0; ii < natoms * 3; ++ii) {
+    EXPECT_LT(fabs(force[ii] - expected_f[ii]), EPSILON);
+  }
+  for (int ii = 0; ii < 3 * 3; ++ii) {
+    EXPECT_LT(fabs(virial[ii] - expected_tot_v[ii]), EPSILON);
+  }
+}
+
+TYPED_TEST(TestInferDeepPotDpaPt, cpu_build_nlist_atomic) {
+  using VALUETYPE = TypeParam;
+  std::vector<VALUETYPE>& coord = this->coord;
+  std::vector<int>& atype = this->atype;
+  std::vector<VALUETYPE>& box = this->box;
+  std::vector<VALUETYPE>& expected_e = this->expected_e;
+  std::vector<VALUETYPE>& expected_f = this->expected_f;
+  std::vector<VALUETYPE>& expected_v = this->expected_v;
+  int& natoms = this->natoms;
+  double& expected_tot_e = this->expected_tot_e;
+  std::vector<VALUETYPE>& expected_tot_v = this->expected_tot_v;
+  deepmd::DeepPot& dp = this->dp;
+  double ener;
+  std::vector<VALUETYPE> force, virial, atom_ener, atom_vir;
+  dp.compute(ener, force, virial, atom_ener, atom_vir, coord, atype, box);
+
+  EXPECT_EQ(force.size(), natoms * 3);
+  EXPECT_EQ(virial.size(), 9);
+  EXPECT_EQ(atom_ener.size(), natoms);
+  EXPECT_EQ(atom_vir.size(), natoms * 9);
+
+  EXPECT_LT(fabs(ener - expected_tot_e), EPSILON);
+  for (int ii = 0; ii < natoms * 3; ++ii) {
+    EXPECT_LT(fabs(force[ii] - expected_f[ii]), EPSILON);
+  }
+  for (int ii = 0; ii < 3 * 3; ++ii) {
+    EXPECT_LT(fabs(virial[ii] - expected_tot_v[ii]), EPSILON);
+  }
+  for (int ii = 0; ii < natoms; ++ii) {
+    EXPECT_LT(fabs(atom_ener[ii] - expected_e[ii]), EPSILON);
+  }
+  for (int ii = 0; ii < natoms * 9; ++ii) {
+    EXPECT_LT(fabs(atom_vir[ii] - expected_v[ii]), EPSILON);
+  }
+}
diff --git a/source/lib/include/neighbor_list.h b/source/lib/include/neighbor_list.h
index 39c2064b56..b99827b552 100644
--- a/source/lib/include/neighbor_list.h
+++ b/source/lib/include/neighbor_list.h
@@ -26,12 +26,72 @@ struct InputNlist {
   int* numneigh;
   /// Array stores the core region atom's neighbor index
   int** firstneigh;
-  InputNlist() : inum(0), ilist(NULL), numneigh(NULL), firstneigh(NULL) {};
+  ///  # of swaps to perform = sum of maxneed
+  int nswap;
+  /// # of atoms to send in each swap
+  int* sendnum;
+  /// # of atoms to recv in each swap
+  int* recvnum;
+  /// where to put 1st recv atom in each swap
+  int* firstrecv;
+  /// list of atoms to send in each swap
+  int** sendlist;
+  /// proc to send to at each swap
+  int* sendproc;
+  /// proc to recv from at each swap
+  int* recvproc;
+  /// MPI_comm data in lmp
+  void* world;
+  InputNlist()
+      : inum(0),
+        ilist(NULL),
+        numneigh(NULL),
+        firstneigh(NULL),
+        nswap(0),
+        sendnum(nullptr),
+        recvnum(nullptr),
+        firstrecv(nullptr),
+        sendlist(nullptr),
+        sendproc(nullptr),
+        recvproc(nullptr),
+        world(0) {};
   InputNlist(int inum_, int* ilist_, int* numneigh_, int** firstneigh_)
       : inum(inum_),
         ilist(ilist_),
         numneigh(numneigh_),
-        firstneigh(firstneigh_) {};
+        firstneigh(firstneigh_),
+        nswap(0),
+        sendnum(nullptr),
+        recvnum(nullptr),
+        firstrecv(nullptr),
+        sendlist(nullptr),
+        sendproc(nullptr),
+        recvproc(nullptr),
+        world(0) {};
+  InputNlist(int inum_,
+             int* ilist_,
+             int* numneigh_,
+             int** firstneigh_,
+             int nswap,
+             int* sendnum,
+             int* recvnum,
+             int* firstrecv,
+             int** sendlist,
+             int* sendproc,
+             int* recvproc,
+             void* world)
+      : inum(inum_),
+        ilist(ilist_),
+        numneigh(numneigh_),
+        firstneigh(firstneigh_),
+        nswap(nswap),
+        sendnum(sendnum),
+        recvnum(recvnum),
+        firstrecv(firstrecv),
+        sendlist(sendlist),
+        sendproc(sendproc),
+        recvproc(recvproc),
+        world(world) {};
   ~InputNlist() {};
 };
 
diff --git a/source/lmp/pair_deepmd.cpp b/source/lmp/pair_deepmd.cpp
index 90aa453143..c5dc8ecb48 100644
--- a/source/lmp/pair_deepmd.cpp
+++ b/source/lmp/pair_deepmd.cpp
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: LGPL-3.0-or-later
 #include <string.h>
 
+#include <cassert>
 #include <iomanip>
 #include <iostream>
 #include <limits>
@@ -459,7 +460,9 @@ void PairDeepMD::compute(int eflag, int vflag) {
                "centroid/stress/atom command for 9-element atomic virial.");
   }
   bool do_ghost = true;
-
+  assert(sizeof(MPI_Comm) == sizeof(int));
+  //  dpa2 communication
+  commdata_ = (CommBrickDeepMD *)comm;
   double **x = atom->x;
   double **f = atom->f;
   int *type = atom->type;
@@ -550,8 +553,11 @@ void PairDeepMD::compute(int eflag, int vflag) {
   multi_models_mod_devi =
       (numb_models > 1 && (out_freq > 0 && update->ntimestep % out_freq == 0));
   if (do_ghost) {
-    deepmd_compat::InputNlist lmp_list(list->inum, list->ilist, list->numneigh,
-                                       list->firstneigh);
+    deepmd_compat::InputNlist lmp_list(
+        list->inum, list->ilist, list->numneigh, list->firstneigh,
+        commdata_->nswap, commdata_->sendnum, commdata_->recvnum,
+        commdata_->firstrecv, commdata_->sendlist, commdata_->sendproc,
+        commdata_->recvproc, &world);
     deepmd_compat::InputNlist extend_lmp_list;
     if (atom->sp_flag) {
       extend(extend_inum, extend_ilist, extend_numneigh, extend_neigh,
diff --git a/source/lmp/pair_deepmd.h b/source/lmp/pair_deepmd.h
index cd72dc7b2a..a3f6717a3b 100644
--- a/source/lmp/pair_deepmd.h
+++ b/source/lmp/pair_deepmd.h
@@ -32,10 +32,13 @@ namespace deepmd_compat = deepmd::hpp;
 #include <iostream>
 #include <map>
 
+#include "comm_brick.h"
 #define FLOAT_PREC double
 
 namespace LAMMPS_NS {
-
+class CommBrickDeepMD : public CommBrick {
+  friend class PairDeepMD;
+};
 class PairDeepMD : public Pair {
  public:
   PairDeepMD(class LAMMPS *);
@@ -137,6 +140,8 @@ class PairDeepMD : public Pair {
   tagint *tagsend, *tagrecv;
   double *stdfsend, *stdfrecv;
   std::vector<int> type_idx_map;
+
+  CommBrickDeepMD *commdata_;
 };
 
 }  // namespace LAMMPS_NS
diff --git a/source/lmp/tests/test_lammps_dpa_pt.py b/source/lmp/tests/test_lammps_dpa_pt.py
new file mode 100644
index 0000000000..a4e2f93014
--- /dev/null
+++ b/source/lmp/tests/test_lammps_dpa_pt.py
@@ -0,0 +1,721 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+import importlib
+import os
+import shutil
+import subprocess as sp
+import sys
+import tempfile
+from pathlib import (
+    Path,
+)
+
+import constants
+import numpy as np
+import pytest
+from lammps import (
+    PyLammps,
+)
+from write_lmp_data import (
+    write_lmp_data,
+)
+
+pbtxt_file2 = (
+    Path(__file__).parent.parent.parent / "tests" / "infer" / "deeppot-1.pbtxt"
+)
+pb_file = Path(__file__).parent.parent.parent / "tests" / "infer" / "deeppot_dpa.pth"
+pb_file2 = Path(__file__).parent / "graph2.pb"
+system_file = Path(__file__).parent.parent.parent / "tests"
+data_file = Path(__file__).parent / "data.lmp"
+data_file_si = Path(__file__).parent / "data.si"
+data_type_map_file = Path(__file__).parent / "data_type_map.lmp"
+md_file = Path(__file__).parent / "md.out"
+
+# this is as the same as python and c++ tests, test_deeppot_a.py
+expected_ae = np.array(
+    [
+        -93.295296030283,
+        -186.548183879333,
+        -186.988827037855,
+        -93.295307298571,
+        -186.799369383945,
+        -186.507754447584,
+    ]
+)
+expected_e = np.sum(expected_ae)
+expected_f = np.array(
+    [
+        4.964133039248,
+        -0.542378158452,
+        -0.381267990914,
+        -0.563388054735,
+        0.340320322541,
+        0.473406268590,
+        0.159774831398,
+        0.684651816874,
+        -0.377008867620,
+        -4.718603033927,
+        -0.012604322920,
+        -0.425121993870,
+        -0.500302936762,
+        -0.637586419292,
+        0.930351899011,
+        0.658386154778,
+        0.167596761250,
+        -0.220359315197,
+    ]
+).reshape(6, 3)
+
+expected_f2 = np.array(
+    [
+        [-0.6454949, 1.72457783, 0.18897958],
+        [1.68936514, -0.36995299, -1.36044464],
+        [-1.09902692, -1.35487928, 1.17416702],
+        [1.68426111, -0.50835585, 0.98340415],
+        [0.05771758, 1.12515818, -1.77561531],
+        [-1.686822, -0.61654789, 0.78950921],
+    ]
+)
+
+expected_v = -np.array(
+    [
+        -5.055176133632,
+        -0.743392222876,
+        0.330846378467,
+        -0.031111229868,
+        0.018004461517,
+        0.170047655301,
+        -0.063087726831,
+        -0.004361215202,
+        -0.042920299661,
+        3.624188578021,
+        -0.252818122305,
+        -0.026516806138,
+        -0.014510755893,
+        0.103726553937,
+        0.181001311123,
+        -0.508673535094,
+        0.142101134395,
+        0.135339636607,
+        -0.460067993361,
+        0.120541583338,
+        -0.206396390140,
+        -0.630991740522,
+        0.397670086144,
+        -0.427022150075,
+        0.656463775044,
+        -0.209989614377,
+        0.288974239790,
+        -7.603428707029,
+        -0.912313971544,
+        0.882084544041,
+        -0.807760666057,
+        -0.070519570327,
+        0.022164414763,
+        0.569448616709,
+        0.028522950109,
+        0.051641619288,
+        -1.452133900157,
+        0.037653156584,
+        -0.144421326931,
+        -0.308825789350,
+        0.302020522568,
+        -0.446073217801,
+        0.313539058423,
+        -0.461052923736,
+        0.678235442273,
+        1.429780276456,
+        0.080472825760,
+        -0.103424652500,
+        0.123343430648,
+        0.011879908277,
+        -0.018897229721,
+        -0.235518441452,
+        -0.013999547600,
+        0.027007016662,
+    ]
+).reshape(6, 9)
+expected_v2 = -np.array(
+    [
+        [
+            -0.70008436,
+            -0.06399891,
+            0.63678391,
+            -0.07642171,
+            -0.70580035,
+            0.20506145,
+            0.64098364,
+            0.20305781,
+            -0.57906794,
+        ],
+        [
+            -0.6372635,
+            0.14315552,
+            0.51952246,
+            0.04604049,
+            -0.06003681,
+            -0.02688702,
+            0.54489318,
+            -0.10951559,
+            -0.43730539,
+        ],
+        [
+            -0.25090748,
+            -0.37466262,
+            0.34085833,
+            -0.26690852,
+            -0.37676917,
+            0.29080825,
+            0.31600481,
+            0.37558276,
+            -0.33251064,
+        ],
+        [
+            -0.80195614,
+            -0.10273138,
+            0.06935364,
+            -0.10429256,
+            -0.29693811,
+            0.45643496,
+            0.07247872,
+            0.45604679,
+            -0.71048816,
+        ],
+        [
+            -0.03840668,
+            -0.07680205,
+            0.10940472,
+            -0.02374189,
+            -0.27610266,
+            0.4336071,
+            0.02465248,
+            0.4290638,
+            -0.67496763,
+        ],
+        [
+            -0.61475065,
+            -0.21163135,
+            0.26652929,
+            -0.26134659,
+            -0.11560267,
+            0.15415902,
+            0.34343952,
+            0.1589482,
+            -0.21370642,
+        ],
+    ]
+).reshape(6, 9)
+
+box = np.array([0, 13, 0, 13, 0, 13, 0, 0, 0])
+coord = np.array(
+    [
+        [12.83, 2.56, 2.18],
+        [12.09, 2.87, 2.74],
+        [0.25, 3.32, 1.68],
+        [3.36, 3.00, 1.81],
+        [3.51, 2.51, 2.60],
+        [4.27, 3.22, 1.56],
+    ]
+)
+type_OH = np.array([1, 2, 2, 1, 2, 2])
+type_HO = np.array([2, 1, 1, 2, 1, 1])
+
+
+sp.check_output(
+    f"{sys.executable} -m deepmd convert-from pbtxt -i {pbtxt_file2.resolve()} -o {pb_file2.resolve()}".split()
+)
+
+
+def setup_module():
+    write_lmp_data(box, coord, type_OH, data_file)
+    write_lmp_data(box, coord, type_HO, data_type_map_file)
+    write_lmp_data(
+        box * constants.dist_metal2si,
+        coord * constants.dist_metal2si,
+        type_OH,
+        data_file_si,
+    )
+
+
+def teardown_module():
+    os.remove(data_file)
+    os.remove(data_type_map_file)
+
+
+def _lammps(data_file, units="metal") -> PyLammps:
+    lammps = PyLammps()
+    lammps.units(units)
+    lammps.boundary("p p p")
+    lammps.atom_style("atomic")
+    if units == "metal" or units == "real":
+        lammps.neighbor("2.0 bin")
+    elif units == "si":
+        lammps.neighbor("2.0e-10 bin")
+    else:
+        raise ValueError("units should be metal, real, or si")
+    lammps.neigh_modify("every 10 delay 0 check no")
+    lammps.read_data(data_file.resolve())
+    if units == "metal" or units == "real":
+        lammps.mass("1 16")
+        lammps.mass("2 2")
+    elif units == "si":
+        lammps.mass("1 %.10e" % (16 * constants.mass_metal2si))
+        lammps.mass("2 %.10e" % (2 * constants.mass_metal2si))
+    else:
+        raise ValueError("units should be metal, real, or si")
+    if units == "metal":
+        lammps.timestep(0.0005)
+    elif units == "real":
+        lammps.timestep(0.5)
+    elif units == "si":
+        lammps.timestep(5e-16)
+    else:
+        raise ValueError("units should be metal, real, or si")
+    lammps.fix("1 all nve")
+    return lammps
+
+
+@pytest.fixture
+def lammps():
+    lmp = _lammps(data_file=data_file)
+    yield lmp
+    lmp.close()
+
+
+@pytest.fixture
+def lammps_type_map():
+    lmp = _lammps(data_file=data_type_map_file)
+    yield lmp
+    lmp.close()
+
+
+@pytest.fixture
+def lammps_real():
+    lmp = _lammps(data_file=data_file, units="real")
+    yield lmp
+    lmp.close()
+
+
+@pytest.fixture
+def lammps_si():
+    lmp = _lammps(data_file=data_file_si, units="si")
+    yield lmp
+    lmp.close()
+
+
+def test_pair_deepmd(lammps):
+    lammps.pair_style(f"deepmd {pb_file.resolve()}")
+    lammps.pair_coeff("* *")
+    lammps.run(0)
+    assert lammps.eval("pe") == pytest.approx(expected_e)
+    for ii in range(6):
+        assert lammps.atoms[ii].force == pytest.approx(
+            expected_f[lammps.atoms[ii].id - 1]
+        )
+    lammps.run(1)
+
+
+def test_pair_deepmd_virial(lammps):
+    lammps.pair_style(f"deepmd {pb_file.resolve()}")
+    lammps.pair_coeff("* *")
+    lammps.compute("virial all centroid/stress/atom NULL pair")
+    for ii in range(9):
+        jj = [0, 4, 8, 3, 6, 7, 1, 2, 5][ii]
+        lammps.variable(f"virial{jj} atom c_virial[{ii+1}]")
+    lammps.dump(
+        "1 all custom 1 dump id " + " ".join([f"v_virial{ii}" for ii in range(9)])
+    )
+    lammps.run(0)
+    assert lammps.eval("pe") == pytest.approx(expected_e)
+    for ii in range(6):
+        assert lammps.atoms[ii].force == pytest.approx(
+            expected_f[lammps.atoms[ii].id - 1]
+        )
+    idx_map = lammps.lmp.numpy.extract_atom("id") - 1
+    for ii in range(9):
+        assert np.array(
+            lammps.variables[f"virial{ii}"].value
+        ) / constants.nktv2p == pytest.approx(expected_v[idx_map, ii])
+
+
+def test_pair_deepmd_model_devi(lammps):
+    lammps.pair_style(
+        f"deepmd {pb_file.resolve()} {pb_file2.resolve()} out_file {md_file.resolve()} out_freq 1 atomic"
+    )
+    lammps.pair_coeff("* *")
+    lammps.run(0)
+    assert lammps.eval("pe") == pytest.approx(expected_e)
+    for ii in range(6):
+        assert lammps.atoms[ii].force == pytest.approx(
+            expected_f[lammps.atoms[ii].id - 1]
+        )
+    # load model devi
+    md = np.loadtxt(md_file.resolve())
+    expected_md_f = np.linalg.norm(np.std([expected_f, expected_f2], axis=0), axis=1)
+    assert md[7:] == pytest.approx(expected_md_f)
+    assert md[4] == pytest.approx(np.max(expected_md_f))
+    assert md[5] == pytest.approx(np.min(expected_md_f))
+    assert md[6] == pytest.approx(np.mean(expected_md_f))
+    expected_md_v = (
+        np.std([np.sum(expected_v, axis=0), np.sum(expected_v2, axis=0)], axis=0) / 6
+    )
+    assert md[1] == pytest.approx(np.max(expected_md_v))
+    assert md[2] == pytest.approx(np.min(expected_md_v))
+    assert md[3] == pytest.approx(np.sqrt(np.mean(np.square(expected_md_v))))
+
+
+def test_pair_deepmd_model_devi_virial(lammps):
+    lammps.pair_style(
+        f"deepmd {pb_file.resolve()} {pb_file2.resolve()} out_file {md_file.resolve()} out_freq 1 atomic"
+    )
+    lammps.pair_coeff("* *")
+    lammps.compute("virial all centroid/stress/atom NULL pair")
+    for ii in range(9):
+        jj = [0, 4, 8, 3, 6, 7, 1, 2, 5][ii]
+        lammps.variable(f"virial{jj} atom c_virial[{ii+1}]")
+    lammps.dump(
+        "1 all custom 1 dump id " + " ".join([f"v_virial{ii}" for ii in range(9)])
+    )
+    lammps.run(0)
+    assert lammps.eval("pe") == pytest.approx(expected_e)
+    for ii in range(6):
+        assert lammps.atoms[ii].force == pytest.approx(
+            expected_f[lammps.atoms[ii].id - 1]
+        )
+    idx_map = lammps.lmp.numpy.extract_atom("id") - 1
+    for ii in range(9):
+        assert np.array(
+            lammps.variables[f"virial{ii}"].value
+        ) / constants.nktv2p == pytest.approx(expected_v[idx_map, ii])
+    # load model devi
+    md = np.loadtxt(md_file.resolve())
+    expected_md_f = np.linalg.norm(np.std([expected_f, expected_f2], axis=0), axis=1)
+    assert md[7:] == pytest.approx(expected_md_f)
+    assert md[4] == pytest.approx(np.max(expected_md_f))
+    assert md[5] == pytest.approx(np.min(expected_md_f))
+    assert md[6] == pytest.approx(np.mean(expected_md_f))
+    expected_md_v = (
+        np.std([np.sum(expected_v, axis=0), np.sum(expected_v2, axis=0)], axis=0) / 6
+    )
+    assert md[1] == pytest.approx(np.max(expected_md_v))
+    assert md[2] == pytest.approx(np.min(expected_md_v))
+    assert md[3] == pytest.approx(np.sqrt(np.mean(np.square(expected_md_v))))
+
+
+def test_pair_deepmd_model_devi_atomic_relative(lammps):
+    relative = 1.0
+    lammps.pair_style(
+        f"deepmd {pb_file.resolve()} {pb_file2.resolve()} out_file {md_file.resolve()} out_freq 1 atomic relative {relative}"
+    )
+    lammps.pair_coeff("* *")
+    lammps.run(0)
+    assert lammps.eval("pe") == pytest.approx(expected_e)
+    for ii in range(6):
+        assert lammps.atoms[ii].force == pytest.approx(
+            expected_f[lammps.atoms[ii].id - 1]
+        )
+    # load model devi
+    md = np.loadtxt(md_file.resolve())
+    norm = np.linalg.norm(np.mean([expected_f, expected_f2], axis=0), axis=1)
+    expected_md_f = np.linalg.norm(np.std([expected_f, expected_f2], axis=0), axis=1)
+    expected_md_f /= norm + relative
+    assert md[7:] == pytest.approx(expected_md_f)
+    assert md[4] == pytest.approx(np.max(expected_md_f))
+    assert md[5] == pytest.approx(np.min(expected_md_f))
+    assert md[6] == pytest.approx(np.mean(expected_md_f))
+    expected_md_v = (
+        np.std([np.sum(expected_v, axis=0), np.sum(expected_v2, axis=0)], axis=0) / 6
+    )
+    assert md[1] == pytest.approx(np.max(expected_md_v))
+    assert md[2] == pytest.approx(np.min(expected_md_v))
+    assert md[3] == pytest.approx(np.sqrt(np.mean(np.square(expected_md_v))))
+
+
+def test_pair_deepmd_model_devi_atomic_relative_v(lammps):
+    relative = 1.0
+    lammps.pair_style(
+        f"deepmd {pb_file.resolve()} {pb_file2.resolve()} out_file {md_file.resolve()} out_freq 1 atomic relative_v {relative}"
+    )
+    lammps.pair_coeff("* *")
+    lammps.run(0)
+    assert lammps.eval("pe") == pytest.approx(expected_e)
+    for ii in range(6):
+        assert lammps.atoms[ii].force == pytest.approx(
+            expected_f[lammps.atoms[ii].id - 1]
+        )
+    md = np.loadtxt(md_file.resolve())
+    expected_md_f = np.linalg.norm(np.std([expected_f, expected_f2], axis=0), axis=1)
+    assert md[7:] == pytest.approx(expected_md_f)
+    assert md[4] == pytest.approx(np.max(expected_md_f))
+    assert md[5] == pytest.approx(np.min(expected_md_f))
+    assert md[6] == pytest.approx(np.mean(expected_md_f))
+    expected_md_v = (
+        np.std([np.sum(expected_v, axis=0), np.sum(expected_v2, axis=0)], axis=0) / 6
+    )
+    norm = (
+        np.abs(
+            np.mean([np.sum(expected_v, axis=0), np.sum(expected_v2, axis=0)], axis=0)
+        )
+        / 6
+    )
+    expected_md_v /= norm + relative
+    assert md[1] == pytest.approx(np.max(expected_md_v))
+    assert md[2] == pytest.approx(np.min(expected_md_v))
+    assert md[3] == pytest.approx(np.sqrt(np.mean(np.square(expected_md_v))))
+
+
+def test_pair_deepmd_type_map(lammps_type_map):
+    lammps_type_map.pair_style(f"deepmd {pb_file.resolve()}")
+    lammps_type_map.pair_coeff("* * H O")
+    lammps_type_map.run(0)
+    assert lammps_type_map.eval("pe") == pytest.approx(expected_e)
+    for ii in range(6):
+        assert lammps_type_map.atoms[ii].force == pytest.approx(
+            expected_f[lammps_type_map.atoms[ii].id - 1]
+        )
+    lammps_type_map.run(1)
+
+
+def test_pair_deepmd_real(lammps_real):
+    lammps_real.pair_style(f"deepmd {pb_file.resolve()}")
+    lammps_real.pair_coeff("* *")
+    lammps_real.run(0)
+    assert lammps_real.eval("pe") == pytest.approx(
+        expected_e * constants.ener_metal2real
+    )
+    for ii in range(6):
+        assert lammps_real.atoms[ii].force == pytest.approx(
+            expected_f[lammps_real.atoms[ii].id - 1] * constants.force_metal2real
+        )
+    lammps_real.run(1)
+
+
+def test_pair_deepmd_virial_real(lammps_real):
+    lammps_real.pair_style(f"deepmd {pb_file.resolve()}")
+    lammps_real.pair_coeff("* *")
+    lammps_real.compute("virial all centroid/stress/atom NULL pair")
+    for ii in range(9):
+        jj = [0, 4, 8, 3, 6, 7, 1, 2, 5][ii]
+        lammps_real.variable(f"virial{jj} atom c_virial[{ii+1}]")
+    lammps_real.dump(
+        "1 all custom 1 dump id " + " ".join([f"v_virial{ii}" for ii in range(9)])
+    )
+    lammps_real.run(0)
+    assert lammps_real.eval("pe") == pytest.approx(
+        expected_e * constants.ener_metal2real
+    )
+    for ii in range(6):
+        assert lammps_real.atoms[ii].force == pytest.approx(
+            expected_f[lammps_real.atoms[ii].id - 1] * constants.force_metal2real
+        )
+    idx_map = lammps_real.lmp.numpy.extract_atom("id") - 1
+    for ii in range(9):
+        assert np.array(
+            lammps_real.variables[f"virial{ii}"].value
+        ) / constants.nktv2p_real == pytest.approx(
+            expected_v[idx_map, ii] * constants.ener_metal2real
+        )
+
+
+def test_pair_deepmd_model_devi_real(lammps_real):
+    lammps_real.pair_style(
+        f"deepmd {pb_file.resolve()} {pb_file2.resolve()} out_file {md_file.resolve()} out_freq 1 atomic"
+    )
+    lammps_real.pair_coeff("* *")
+    lammps_real.run(0)
+    assert lammps_real.eval("pe") == pytest.approx(
+        expected_e * constants.ener_metal2real
+    )
+    for ii in range(6):
+        assert lammps_real.atoms[ii].force == pytest.approx(
+            expected_f[lammps_real.atoms[ii].id - 1] * constants.force_metal2real
+        )
+    # load model devi
+    md = np.loadtxt(md_file.resolve())
+    expected_md_f = np.linalg.norm(np.std([expected_f, expected_f2], axis=0), axis=1)
+    assert md[7:] == pytest.approx(expected_md_f * constants.force_metal2real)
+    assert md[4] == pytest.approx(np.max(expected_md_f) * constants.force_metal2real)
+    assert md[5] == pytest.approx(np.min(expected_md_f) * constants.force_metal2real)
+    assert md[6] == pytest.approx(np.mean(expected_md_f) * constants.force_metal2real)
+    expected_md_v = (
+        np.std([np.sum(expected_v, axis=0), np.sum(expected_v2, axis=0)], axis=0) / 6
+    )
+    assert md[1] == pytest.approx(np.max(expected_md_v) * constants.ener_metal2real)
+    assert md[2] == pytest.approx(np.min(expected_md_v) * constants.ener_metal2real)
+    assert md[3] == pytest.approx(
+        np.sqrt(np.mean(np.square(expected_md_v))) * constants.ener_metal2real
+    )
+
+
+def test_pair_deepmd_model_devi_virial_real(lammps_real):
+    lammps_real.pair_style(
+        f"deepmd {pb_file.resolve()} {pb_file2.resolve()} out_file {md_file.resolve()} out_freq 1 atomic"
+    )
+    lammps_real.pair_coeff("* *")
+    lammps_real.compute("virial all centroid/stress/atom NULL pair")
+    for ii in range(9):
+        jj = [0, 4, 8, 3, 6, 7, 1, 2, 5][ii]
+        lammps_real.variable(f"virial{jj} atom c_virial[{ii+1}]")
+    lammps_real.dump(
+        "1 all custom 1 dump id " + " ".join([f"v_virial{ii}" for ii in range(9)])
+    )
+    lammps_real.run(0)
+    assert lammps_real.eval("pe") == pytest.approx(
+        expected_e * constants.ener_metal2real
+    )
+    for ii in range(6):
+        assert lammps_real.atoms[ii].force == pytest.approx(
+            expected_f[lammps_real.atoms[ii].id - 1] * constants.force_metal2real
+        )
+    idx_map = lammps_real.lmp.numpy.extract_atom("id") - 1
+    for ii in range(9):
+        assert np.array(
+            lammps_real.variables[f"virial{ii}"].value
+        ) / constants.nktv2p_real == pytest.approx(
+            expected_v[idx_map, ii] * constants.ener_metal2real
+        )
+    # load model devi
+    md = np.loadtxt(md_file.resolve())
+    expected_md_f = np.linalg.norm(np.std([expected_f, expected_f2], axis=0), axis=1)
+    assert md[7:] == pytest.approx(expected_md_f * constants.force_metal2real)
+    assert md[4] == pytest.approx(np.max(expected_md_f) * constants.force_metal2real)
+    assert md[5] == pytest.approx(np.min(expected_md_f) * constants.force_metal2real)
+    assert md[6] == pytest.approx(np.mean(expected_md_f) * constants.force_metal2real)
+    expected_md_v = (
+        np.std([np.sum(expected_v, axis=0), np.sum(expected_v2, axis=0)], axis=0) / 6
+    )
+    assert md[1] == pytest.approx(np.max(expected_md_v) * constants.ener_metal2real)
+    assert md[2] == pytest.approx(np.min(expected_md_v) * constants.ener_metal2real)
+    assert md[3] == pytest.approx(
+        np.sqrt(np.mean(np.square(expected_md_v))) * constants.ener_metal2real
+    )
+
+
+def test_pair_deepmd_model_devi_atomic_relative_real(lammps_real):
+    relative = 1.0
+    lammps_real.pair_style(
+        f"deepmd {pb_file.resolve()} {pb_file2.resolve()} out_file {md_file.resolve()} out_freq 1 atomic relative {relative * constants.force_metal2real}"
+    )
+    lammps_real.pair_coeff("* *")
+    lammps_real.run(0)
+    assert lammps_real.eval("pe") == pytest.approx(
+        expected_e * constants.ener_metal2real
+    )
+    for ii in range(6):
+        assert lammps_real.atoms[ii].force == pytest.approx(
+            expected_f[lammps_real.atoms[ii].id - 1] * constants.force_metal2real
+        )
+    # load model devi
+    md = np.loadtxt(md_file.resolve())
+    norm = np.linalg.norm(np.mean([expected_f, expected_f2], axis=0), axis=1)
+    expected_md_f = np.linalg.norm(np.std([expected_f, expected_f2], axis=0), axis=1)
+    expected_md_f /= norm + relative
+    assert md[7:] == pytest.approx(expected_md_f * constants.force_metal2real)
+    assert md[4] == pytest.approx(np.max(expected_md_f) * constants.force_metal2real)
+    assert md[5] == pytest.approx(np.min(expected_md_f) * constants.force_metal2real)
+    assert md[6] == pytest.approx(np.mean(expected_md_f) * constants.force_metal2real)
+    expected_md_v = (
+        np.std([np.sum(expected_v, axis=0), np.sum(expected_v2, axis=0)], axis=0) / 6
+    )
+    assert md[1] == pytest.approx(np.max(expected_md_v) * constants.ener_metal2real)
+    assert md[2] == pytest.approx(np.min(expected_md_v) * constants.ener_metal2real)
+    assert md[3] == pytest.approx(
+        np.sqrt(np.mean(np.square(expected_md_v))) * constants.ener_metal2real
+    )
+
+
+def test_pair_deepmd_model_devi_atomic_relative_v_real(lammps_real):
+    relative = 1.0
+    lammps_real.pair_style(
+        f"deepmd {pb_file.resolve()} {pb_file2.resolve()} out_file {md_file.resolve()} out_freq 1 atomic relative_v {relative * constants.ener_metal2real}"
+    )
+    lammps_real.pair_coeff("* *")
+    lammps_real.run(0)
+    assert lammps_real.eval("pe") == pytest.approx(
+        expected_e * constants.ener_metal2real
+    )
+    for ii in range(6):
+        assert lammps_real.atoms[ii].force == pytest.approx(
+            expected_f[lammps_real.atoms[ii].id - 1] * constants.force_metal2real
+        )
+    md = np.loadtxt(md_file.resolve())
+    expected_md_f = np.linalg.norm(np.std([expected_f, expected_f2], axis=0), axis=1)
+    assert md[7:] == pytest.approx(expected_md_f * constants.force_metal2real)
+    assert md[4] == pytest.approx(np.max(expected_md_f) * constants.force_metal2real)
+    assert md[5] == pytest.approx(np.min(expected_md_f) * constants.force_metal2real)
+    assert md[6] == pytest.approx(np.mean(expected_md_f) * constants.force_metal2real)
+    expected_md_v = (
+        np.std([np.sum(expected_v, axis=0), np.sum(expected_v2, axis=0)], axis=0) / 6
+    )
+    norm = (
+        np.abs(
+            np.mean([np.sum(expected_v, axis=0), np.sum(expected_v2, axis=0)], axis=0)
+        )
+        / 6
+    )
+    expected_md_v /= norm + relative
+    assert md[1] == pytest.approx(np.max(expected_md_v) * constants.ener_metal2real)
+    assert md[2] == pytest.approx(np.min(expected_md_v) * constants.ener_metal2real)
+    assert md[3] == pytest.approx(
+        np.sqrt(np.mean(np.square(expected_md_v))) * constants.ener_metal2real
+    )
+
+
+def test_pair_deepmd_si(lammps_si):
+    lammps_si.pair_style(f"deepmd {pb_file.resolve()}")
+    lammps_si.pair_coeff("* *")
+    lammps_si.run(0)
+    assert lammps_si.eval("pe") == pytest.approx(expected_e * constants.ener_metal2si)
+    for ii in range(6):
+        assert lammps_si.atoms[ii].force == pytest.approx(
+            expected_f[lammps_si.atoms[ii].id - 1] * constants.force_metal2si
+        )
+    lammps_si.run(1)
+
+
+@pytest.mark.skipif(
+    shutil.which("mpirun") is None, reason="MPI is not installed on this system"
+)
+@pytest.mark.skipif(
+    importlib.util.find_spec("mpi4py") is None, reason="mpi4py is not installed"
+)
+@pytest.mark.parametrize(
+    ("balance_args",),
+    [(["--balance"],)],
+)
+def test_pair_deepmd_mpi(balance_args: list):
+    with tempfile.NamedTemporaryFile() as f:
+        sp.check_call(
+            [
+                "mpirun",
+                "-n",
+                "2",
+                sys.executable,
+                Path(__file__).parent / "run_mpi_pair_deepmd.py",
+                data_file,
+                pb_file,
+                pb_file2,
+                md_file,
+                f.name,
+                *balance_args,
+            ]
+        )
+        arr = np.loadtxt(f.name, ndmin=1)
+    pe = arr[0]
+
+    relative = 1.0
+    assert pe == pytest.approx(expected_e)
+    # load model devi
+    md = np.loadtxt(md_file.resolve())
+    norm = np.linalg.norm(np.mean([expected_f, expected_f2], axis=0), axis=1)
+    expected_md_f = np.linalg.norm(np.std([expected_f, expected_f2], axis=0), axis=1)
+    expected_md_f /= norm + relative
+    assert md[7:] == pytest.approx(expected_md_f)
+    assert md[4] == pytest.approx(np.max(expected_md_f))
+    assert md[5] == pytest.approx(np.min(expected_md_f))
+    assert md[6] == pytest.approx(np.mean(expected_md_f))
+    expected_md_v = (
+        np.std([np.sum(expected_v, axis=0), np.sum(expected_v2, axis=0)], axis=0) / 6
+    )
+    assert md[1] == pytest.approx(np.max(expected_md_v))
+    assert md[2] == pytest.approx(np.min(expected_md_v))
+    assert md[3] == pytest.approx(np.sqrt(np.mean(np.square(expected_md_v))))
diff --git a/source/op/pt/CMakeLists.txt b/source/op/pt/CMakeLists.txt
index 362a0fd89d..3254e5e852 100644
--- a/source/op/pt/CMakeLists.txt
+++ b/source/op/pt/CMakeLists.txt
@@ -1,4 +1,4 @@
-file(GLOB OP_SRC print_summary.cc)
+file(GLOB OP_SRC print_summary.cc comm.cc)
 
 add_library(deepmd_op_pt MODULE ${OP_SRC})
 # link: libdeepmd libtorch
@@ -14,7 +14,6 @@ if(MPI_FOUND)
   target_link_libraries(deepmd_op_pt PRIVATE MPI::MPI_CXX)
   target_compile_definitions(deepmd_op_pt PRIVATE USE_MPI)
 endif()
-
 if(CMAKE_TESTING_ENABLED)
   target_link_libraries(deepmd_op_pt PRIVATE coverage_config)
 endif()
diff --git a/source/op/pt/comm.cc b/source/op/pt/comm.cc
new file mode 100644
index 0000000000..11047ad1d6
--- /dev/null
+++ b/source/op/pt/comm.cc
@@ -0,0 +1,362 @@
+// SPDX-License-Identifier: LGPL-3.0-or-later
+#if defined(GOOGLE_CUDA) || defined(TENSORFLOW_USE_ROCM)
+#include "device.h"
+#endif
+#include <torch/torch.h>
+#ifdef USE_MPI
+#include <mpi.h>
+#ifdef OMPI_MPI_H
+#include <mpi-ext.h>
+#endif
+template <typename T>
+static MPI_Datatype get_mpi_type();
+
+template <>
+MPI_Datatype get_mpi_type<float>() {
+  return MPI_FLOAT;
+}
+
+template <>
+MPI_Datatype get_mpi_type<double>() {
+  return MPI_DOUBLE;
+}
+#endif
+class Border : public torch::autograd::Function<Border> {
+ public:
+  static torch::autograd::variable_list forward(
+      torch::autograd::AutogradContext* ctx,
+      const torch::Tensor& sendlist_tensor,
+      const torch::Tensor& sendproc_tensor,
+      const torch::Tensor& recvproc_tensor,
+      const torch::Tensor& sendnum_tensor,
+      const torch::Tensor& recvnum_tensor,
+      const torch::Tensor& g1,
+      const torch::Tensor& communicator_tensor,
+      const torch::Tensor& nlocal_tensor,
+      const torch::Tensor& nghost_tensor) {
+    bool type_flag = (g1.dtype() == torch::kDouble) ? true : false;
+    if (type_flag) {
+      return forward_t<double>(ctx, sendlist_tensor, sendproc_tensor,
+                               recvproc_tensor, sendnum_tensor, recvnum_tensor,
+                               g1, communicator_tensor, nlocal_tensor,
+                               nghost_tensor);
+    } else {
+      return forward_t<float>(ctx, sendlist_tensor, sendproc_tensor,
+                              recvproc_tensor, sendnum_tensor, recvnum_tensor,
+                              g1, communicator_tensor, nlocal_tensor,
+                              nghost_tensor);
+    }
+  }
+  template <typename FPTYPE>
+  static torch::autograd::variable_list forward_t(
+      torch::autograd::AutogradContext* ctx,
+      const torch::Tensor& sendlist_tensor,
+      const torch::Tensor& sendproc_tensor,
+      const torch::Tensor& recvproc_tensor,
+      const torch::Tensor& sendnum_tensor,
+      const torch::Tensor& recvnum_tensor,
+      const torch::Tensor& g1,
+      const torch::Tensor& communicator_tensor,
+      const torch::Tensor& nlocal_tensor,
+      const torch::Tensor& nghost_tensor) {
+    ctx->save_for_backward({sendlist_tensor, sendproc_tensor, recvproc_tensor,
+                            sendnum_tensor, recvnum_tensor, communicator_tensor,
+                            nlocal_tensor, nghost_tensor});
+    int** sendlist = reinterpret_cast<int**>(sendlist_tensor.data_ptr());
+    int* sendproc = sendproc_tensor.data_ptr<int>();
+    int* recvproc = recvproc_tensor.data_ptr<int>();
+    int* sendnum = sendnum_tensor.data_ptr<int>();
+    int* recvnum = recvnum_tensor.data_ptr<int>();
+    int tensor_size = g1.size(1);
+    int nswap = sendproc_tensor.size(0);
+
+    int nlocal = nlocal_tensor.item<int>();
+    int nghost = nghost_tensor.item<int>();
+    int ntotal = nlocal + nghost;
+    torch::Tensor recv_g1_tensor = g1;
+
+#ifdef USE_MPI
+    int mpi_init = 0;
+    MPI_Initialized(&mpi_init);
+    int cuda_aware = 1;
+    int me;
+    MPI_Comm world;
+    int world_size = 0;
+    unpack_communicator(communicator_tensor, world);
+    MPI_Comm_rank(world, &me);
+    MPI_Comm_size(world, &world_size);
+    MPI_Datatype mpi_type = get_mpi_type<FPTYPE>();
+    MPI_Request request;
+#if defined(GOOGLE_CUDA) || defined(TENSORFLOW_USE_ROCM)
+    if (world_size != 1) {
+      int version, subversion;
+      MPI_Get_version(&version, &subversion);
+      if (version >= 4) {
+        cuda_aware = MPIX_Query_cuda_support();
+      } else {
+        cuda_aware = 0;
+      }
+      if (cuda_aware == 0) {
+        recv_g1_tensor = torch::empty_like(g1).to(torch::kCPU);
+        recv_g1_tensor.copy_(g1);
+      }
+    }
+#endif
+#endif
+    FPTYPE* recv_g1 = recv_g1_tensor.data_ptr<FPTYPE>() + nlocal * tensor_size;
+    auto int32_options = torch::TensorOptions().dtype(torch::kInt32);
+    for (int iswap = 0; iswap < nswap; ++iswap) {
+      int nrecv = recvnum[iswap];
+      int nsend = sendnum[iswap];
+      torch::Tensor isendlist =
+          torch::from_blob(sendlist[iswap], {nsend}, int32_options)
+              .to(recv_g1_tensor.device());
+      torch::Tensor send_g1_tensor = recv_g1_tensor.index_select(0, isendlist);
+      FPTYPE* send_g1 = send_g1_tensor.data_ptr<FPTYPE>();
+#ifdef USE_MPI
+      if (sendproc[iswap] != me) {
+        if (nrecv) {
+          MPI_Irecv(recv_g1, nrecv * tensor_size, mpi_type, recvproc[iswap], 0,
+                    world, &request);
+        }
+        if (nsend) {
+          MPI_Send(send_g1, nsend * tensor_size, mpi_type, sendproc[iswap], 0,
+                   world);
+        }
+        if (nrecv) {
+          MPI_Wait(&request, MPI_STATUS_IGNORE);
+        }
+      } else {
+#endif
+#if defined(GOOGLE_CUDA) || defined(TENSORFLOW_USE_ROCM)
+#ifdef USE_MPI
+        if (cuda_aware == 0) {
+          memcpy(recv_g1, send_g1,
+                 (unsigned long)nsend * tensor_size * sizeof(FPTYPE));
+        } else {
+          gpuMemcpy(recv_g1, send_g1,
+                    (unsigned long)nsend * tensor_size * sizeof(FPTYPE),
+                    gpuMemcpyDeviceToDevice);
+        }
+#else
+        gpuMemcpy(recv_g1, send_g1,
+                  (unsigned long)nsend * tensor_size * sizeof(FPTYPE),
+                  gpuMemcpyDeviceToDevice);
+#endif
+#else
+      memcpy(recv_g1, send_g1,
+             (unsigned long)nsend * tensor_size * sizeof(FPTYPE));
+#endif
+#ifdef USE_MPI
+      }
+#endif
+      recv_g1 += nrecv * tensor_size;
+    }
+#ifdef USE_MPI
+#if defined(GOOGLE_CUDA) || defined(TENSORFLOW_USE_ROCM)
+    if (cuda_aware == 0) {
+      g1.copy_(recv_g1_tensor);
+    }
+#endif
+#endif
+    return {g1};
+  }
+  static torch::autograd::variable_list backward(
+      torch::autograd::AutogradContext* ctx,
+      torch::autograd::variable_list grad_output) {
+    bool type_flag = (grad_output[0].dtype() == torch::kDouble) ? true : false;
+    if (type_flag) {
+      return backward_t<double>(ctx, grad_output);
+    } else {
+      return backward_t<float>(ctx, grad_output);
+    }
+  }
+  template <typename FPTYPE>
+  static torch::autograd::variable_list backward_t(
+      torch::autograd::AutogradContext* ctx,
+      torch::autograd::variable_list grad_output) {
+#if defined(GOOGLE_CUDA) || defined(TENSORFLOW_USE_ROCM)
+    gpuDeviceSynchronize();
+#endif
+
+    torch::autograd::variable_list saved_variables = ctx->get_saved_variables();
+    torch::Tensor sendlist_tensor = saved_variables[0];
+    torch::Tensor sendproc_tensor = saved_variables[1];
+    torch::Tensor recvproc_tensor = saved_variables[2];
+    torch::Tensor sendnum_tensor = saved_variables[3];
+    torch::Tensor recvnum_tensor = saved_variables[4];
+    torch::Tensor communicator_tensor = saved_variables[5];
+    torch::Tensor nlocal_tensor = saved_variables[6];
+    torch::Tensor nghost_tensor = saved_variables[7];
+
+    torch::Tensor d_local_g1_tensor = grad_output[0];
+#ifdef USE_MPI
+    int mpi_init = 0;
+    MPI_Initialized(&mpi_init);
+    int world_size = 0;
+    int cuda_aware = 1;
+    MPI_Comm world;
+    unpack_communicator(communicator_tensor, world);
+    int me;
+    MPI_Comm_rank(world, &me);
+    MPI_Comm_size(world, &world_size);
+    MPI_Datatype mpi_type = get_mpi_type<FPTYPE>();
+    MPI_Request request;
+#if defined(GOOGLE_CUDA) || defined(TENSORFLOW_USE_ROCM)
+    if (world_size != 1) {
+      int version, subversion;
+      MPI_Get_version(&version, &subversion);
+      if (version >= 4) {
+        cuda_aware = MPIX_Query_cuda_support();
+      } else {
+        cuda_aware = 0;
+      }
+      if (cuda_aware == 0) {
+        d_local_g1_tensor = torch::empty_like(grad_output[0]).to(torch::kCPU);
+        d_local_g1_tensor.copy_(grad_output[0]);
+      }
+    }
+#endif
+#endif
+    int** recvlist = reinterpret_cast<int**>(sendlist_tensor.data_ptr());
+    // swap send and recv here
+    int* recvproc = sendproc_tensor.data_ptr<int>();
+    int* sendproc = recvproc_tensor.data_ptr<int>();
+    int* recvnum = sendnum_tensor.data_ptr<int>();
+    int* sendnum = recvnum_tensor.data_ptr<int>();
+
+    FPTYPE* local_g1 = d_local_g1_tensor.data_ptr<FPTYPE>();
+    int tensor_size = d_local_g1_tensor.size(1);
+    int nswap = sendproc_tensor.size(0);
+
+    int nlocal = nlocal_tensor.item<int>();
+    int nghost = nghost_tensor.item<int>();
+    int ntotal = nlocal + nghost;
+
+    torch::Tensor send_g1_tensor = d_local_g1_tensor;
+
+    int max_recvnum = sendnum_tensor.max().item<int>();
+    auto options = torch::TensorOptions()
+                       .dtype(d_local_g1_tensor.dtype())
+                       .device(d_local_g1_tensor.device());
+    torch::Tensor recv_g1_tensor =
+        torch::empty({max_recvnum, tensor_size}, options);
+    FPTYPE* recv_g1 = recv_g1_tensor.data_ptr<FPTYPE>();
+    FPTYPE* send_g1 = send_g1_tensor.data_ptr<FPTYPE>() + ntotal * tensor_size;
+
+    int end = ntotal;
+    auto int32_options = torch::TensorOptions().dtype(torch::kInt32);
+    for (int iswap = nswap - 1; iswap >= 0; --iswap) {
+      int nrecv = recvnum[iswap];
+      int nsend = sendnum[iswap];
+
+      torch::Tensor irecvlist;
+      if (nrecv) {
+        irecvlist = torch::from_blob(recvlist[iswap], {nrecv}, int32_options)
+                        .to(d_local_g1_tensor.device());
+      }
+      if (nsend) {
+        send_g1 -= nsend * tensor_size;
+      }
+#ifdef USE_MPI
+      if (sendproc[iswap] != me) {
+        if (nrecv) {
+          MPI_Irecv(recv_g1, nrecv * tensor_size, mpi_type, recvproc[iswap], 0,
+                    world, &request);
+        }
+        if (nsend) {
+          MPI_Send(send_g1, nsend * tensor_size, mpi_type, sendproc[iswap], 0,
+                   world);
+        }
+        if (nrecv) {
+          MPI_Wait(&request, MPI_STATUS_IGNORE);
+        }
+      } else {
+#endif
+        if (nrecv) {
+#if defined(GOOGLE_CUDA) || defined(TENSORFLOW_USE_ROCM)
+#ifdef USE_MPI
+          if (cuda_aware == 0) {
+            memcpy(recv_g1, send_g1,
+                   (unsigned long)nrecv * tensor_size * sizeof(FPTYPE));
+          } else {
+            gpuMemcpy(recv_g1, send_g1,
+                      (unsigned long)nrecv * tensor_size * sizeof(FPTYPE),
+                      gpuMemcpyDeviceToDevice);
+          }
+#else
+          gpuMemcpy(recv_g1, send_g1,
+                    (unsigned long)nrecv * tensor_size * sizeof(FPTYPE),
+                    gpuMemcpyDeviceToDevice);
+#endif
+#else
+        memcpy(recv_g1, send_g1,
+               (unsigned long)nrecv * tensor_size * sizeof(FPTYPE));
+#endif
+        }
+#ifdef USE_MPI
+      }
+#endif
+      if (nrecv) {
+        d_local_g1_tensor.index_add_(0, irecvlist,
+                                     recv_g1_tensor.slice(0, 0, nrecv));
+      }
+    }
+#if defined(GOOGLE_CUDA) || defined(TENSORFLOW_USE_ROCM)
+    gpuDeviceSynchronize();
+#endif
+#ifdef USE_MPI
+#if defined(GOOGLE_CUDA) || defined(TENSORFLOW_USE_ROCM)
+    if (cuda_aware == 0) {
+      grad_output[0].copy_(d_local_g1_tensor);
+    }
+#endif
+#endif
+
+    return {torch::Tensor(), torch::Tensor(), torch::Tensor(), torch::Tensor(),
+            torch::Tensor(), grad_output[0],  torch::Tensor(), torch::Tensor(),
+            torch::Tensor(), torch::Tensor()};
+  }
+#ifdef USE_MPI
+  static void unpack_communicator(const torch::Tensor& communicator_tensor,
+                                  MPI_Comm& mpi_comm) {
+#ifdef OMPI_MPI_H
+    long int* communicator = communicator_tensor.data_ptr<long int>();
+#else
+    long int* ptr = communicator_tensor.data_ptr<long int>();
+    int* communicator = reinterpret_cast<int*>(ptr);
+#endif
+    mpi_comm = reinterpret_cast<MPI_Comm>(*communicator);
+  }
+#endif
+};
+std::vector<torch::Tensor> border_op(const torch::Tensor& sendlist_tensor,
+                                     const torch::Tensor& sendproc_tensor,
+                                     const torch::Tensor& recvproc_tensor,
+                                     const torch::Tensor& sendnum_tensor,
+                                     const torch::Tensor& recvnum_tensor,
+                                     const torch::Tensor& g1_tensor,
+                                     const torch::Tensor& communicator_tensor,
+                                     const torch::Tensor& nlocal_tensor,
+                                     const torch::Tensor& nghost_tensor)
+
+/**
+ * @brief communicate the latest g1 info to other lmp proc
+ * @param[out] recv_g1_tensor g1 after communication
+ * @param[in]  sendlist_tensor list of atoms to send in each swap
+ * @param[in]  sendproc_tensor proc to send to at each swap
+ * @param[in]  recvproc_tensor proc to recv from at each swap
+ * @param[in]  sendnum_tensor # of atoms to send in each swap
+ * @param[in]  recvnum_tensor # of atoms to recv in each swap
+ * @param[in]  g1_tensor tensor to store g1 info
+ * @param[in]  communicator_tensor MPI_comm data in lmp
+ * @param[in]  nlocal_tensor # of local atoms
+ * @param[in]  nghost_tensor # of nghost atoms
+ **/
+{
+  return Border::apply(sendlist_tensor, sendproc_tensor, recvproc_tensor,
+                       sendnum_tensor, recvnum_tensor, g1_tensor,
+                       communicator_tensor, nlocal_tensor, nghost_tensor);
+}
+
+TORCH_LIBRARY_FRAGMENT(deepmd, m) { m.def("border_op", border_op); }
diff --git a/source/tests/infer/deeppot_dpa.pth b/source/tests/infer/deeppot_dpa.pth
new file mode 100644
index 0000000000..d54a1c1779
Binary files /dev/null and b/source/tests/infer/deeppot_dpa.pth differ