diff --git a/.github/workflows/test_cuda.yml b/.github/workflows/test_cuda.yml
index 4dbdc5acb9..db0dfb6c61 100644
--- a/.github/workflows/test_cuda.yml
+++ b/.github/workflows/test_cuda.yml
@@ -51,6 +51,7 @@ jobs:
       - run: |
           export PYTORCH_ROOT=$(python -c 'import torch;print(torch.__path__[0])')
           export TENSORFLOW_ROOT=$(python -c 'import importlib,pathlib;print(pathlib.Path(importlib.util.find_spec("tensorflow").origin).parent)')
+          source/install/uv_with_retry.sh pip install --system --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu123/
           source/install/uv_with_retry.sh pip install --system -v -e .[gpu,test,lmp,cu12,torch,jax] mpi4py
         env:
           DP_VARIANT: cuda
diff --git a/.github/workflows/test_python.yml b/.github/workflows/test_python.yml
index e30a19c8b1..e153164232 100644
--- a/.github/workflows/test_python.yml
+++ b/.github/workflows/test_python.yml
@@ -31,6 +31,7 @@ jobs:
           export PYTORCH_ROOT=$(python -c 'import torch;print(torch.__path__[0])')
           source/install/uv_with_retry.sh pip install --system -e .[test,jax] mpi4py
           source/install/uv_with_retry.sh pip install --system horovod --no-build-isolation
+          source/install/uv_with_retry.sh pip install --system --pre "paddlepaddle" -i https://www.paddlepaddle.org.cn/packages/nightly/cpu/
         env:
           # Please note that uv has some issues with finding
           # existing TensorFlow package. Currently, it uses
diff --git a/backend/find_paddle.py b/backend/find_paddle.py
new file mode 100644
index 0000000000..bc54cdcaa5
--- /dev/null
+++ b/backend/find_paddle.py
@@ -0,0 +1,133 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+import importlib
+import os
+import site
+from functools import (
+    lru_cache,
+)
+from importlib.machinery import (
+    FileFinder,
+)
+from importlib.util import (
+    find_spec,
+)
+from pathlib import (
+    Path,
+)
+from sysconfig import (
+    get_path,
+)
+from typing import (
+    Optional,
+    Union,
+)
+
+
+@lru_cache
+def find_paddle() -> tuple[Optional[str], list[str]]:
+    """Find the PaddlePaddle library.
+
+    Tries to find PaddlePaddle in the order of:
+
+    1. The environment variable `PADDLE_ROOT` if set.
+    2. The current Python environment.
+    3. The user site-packages directory if enabled.
+    4. The system site-packages directory (purelib).
+
+    Because the default PaddlePaddle package still uses the old CXX11 ABI,
+    we cannot install it automatically.
+
+    Returns
+    -------
+    str, optional
+        PaddlePaddle library path if found.
+    list of str
+        Paddle requirement if not found. Empty if found.
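+
+    Examples
+    --------
+    Illustrative usage only (the search is active only when
+    ``DP_ENABLE_PADDLE=1`` is set in the environment):
+
+    >>> pd_root, requires = find_paddle()  # doctest: +SKIP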
+ """ + if os.environ.get("DP_ENABLE_PADDLE", "0") == "0": + return None, [] + requires = [] + pd_spec = None + + if (pd_spec is None or not pd_spec) and os.environ.get("PADDLE_ROOT") is not None: + site_packages = Path(os.environ.get("PADDLE_ROOT")).parent.absolute() + pd_spec = FileFinder(str(site_packages)).find_spec("paddle") + + # get paddle spec + # note: isolated build will not work for backend + if pd_spec is None or not pd_spec: + pd_spec = find_spec("paddle") + + if not pd_spec and site.ENABLE_USER_SITE: + # first search TF from user site-packages before global site-packages + site_packages = site.getusersitepackages() + if site_packages: + pd_spec = FileFinder(site_packages).find_spec("paddle") + + if not pd_spec: + # purelib gets site-packages path + site_packages = get_path("purelib") + if site_packages: + pd_spec = FileFinder(site_packages).find_spec("paddle") + + # get install dir from spec + try: + pd_install_dir = pd_spec.submodule_search_locations[0] # type: ignore + # AttributeError if ft_spec is None + # TypeError if submodule_search_locations are None + # IndexError if submodule_search_locations is an empty list + except (AttributeError, TypeError, IndexError): + pd_install_dir = None + requires.extend(get_pd_requirement()["paddle"]) + return pd_install_dir, requires + + +@lru_cache +def get_pd_requirement(pd_version: str = "") -> dict: + """Get PaddlePadle requirement when Paddle is not installed. + + If pd_version is not given and the environment variable `PADDLE_VERSION` is set, use it as the requirement. + + Parameters + ---------- + pd_version : str, optional + Paddle version + + Returns + ------- + dict + PaddlePadle requirement. + """ + if pd_version is None: + return {"paddle": []} + if pd_version == "": + pd_version = os.environ.get("PADDLE_VERSION", "") + + return { + "paddle": [ + "paddlepaddle>=3.0.0b1" if pd_version != "" else "paddlepaddle>=3.0.0b1", + ], + } + + +@lru_cache +def get_pd_version(pd_path: Optional[Union[str, Path]]) -> str: + """Get Paddle version from a Paddle Python library path. + + Parameters + ---------- + pd_path : str or Path + Paddle Python library path, e.g. 
"/python3.10/site-packages/paddle/" + + Returns + ------- + str + version + """ + if pd_path is None or pd_path == "": + return "" + version_file = Path(pd_path) / "version" / "__init__.py" + spec = importlib.util.spec_from_file_location("paddle.version", version_file) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + return module.full_version diff --git a/deepmd/backend/paddle.py b/deepmd/backend/paddle.py new file mode 100644 index 0000000000..b1f664e76a --- /dev/null +++ b/deepmd/backend/paddle.py @@ -0,0 +1,124 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +from importlib.util import ( + find_spec, +) +from typing import ( + TYPE_CHECKING, + Callable, + ClassVar, +) + +from deepmd.backend.backend import ( + Backend, +) + +if TYPE_CHECKING: + from argparse import ( + Namespace, + ) + + from deepmd.infer.deep_eval import ( + DeepEvalBackend, + ) + from deepmd.utils.neighbor_stat import ( + NeighborStat, + ) + + +@Backend.register("pd") +@Backend.register("paddle") +class PaddleBackend(Backend): + """Paddle backend.""" + + name = "Paddle" + """The formal name of the backend.""" + features: ClassVar[Backend.Feature] = ( + Backend.Feature.ENTRY_POINT + | Backend.Feature.DEEP_EVAL + | Backend.Feature.NEIGHBOR_STAT + | Backend.Feature.IO + ) + """The features of the backend.""" + suffixes: ClassVar[list[str]] = [".json", ".pd"] + """The suffixes of the backend.""" + + def is_available(self) -> bool: + """Check if the backend is available. + + Returns + ------- + bool + Whether the backend is available. + """ + return find_spec("paddle") is not None + + @property + def entry_point_hook(self) -> Callable[["Namespace"], None]: + """The entry point hook of the backend. + + Returns + ------- + Callable[[Namespace], None] + The entry point hook of the backend. + """ + from deepmd.pd.entrypoints.main import main as deepmd_main + + return deepmd_main + + @property + def deep_eval(self) -> type["DeepEvalBackend"]: + """The Deep Eval backend of the backend. + + Returns + ------- + type[DeepEvalBackend] + The Deep Eval backend of the backend. + """ + from deepmd.pd.infer.deep_eval import DeepEval as DeepEvalPD + + return DeepEvalPD + + @property + def neighbor_stat(self) -> type["NeighborStat"]: + """The neighbor statistics of the backend. + + Returns + ------- + type[NeighborStat] + The neighbor statistics of the backend. + """ + from deepmd.pd.utils.neighbor_stat import ( + NeighborStat, + ) + + return NeighborStat + + @property + def serialize_hook(self) -> Callable[[str], dict]: + """The serialize hook to convert the model file to a dictionary. + + Returns + ------- + Callable[[str], dict] + The serialize hook of the backend. + """ + from deepmd.pd.utils.serialization import ( + serialize_from_file, + ) + + return serialize_from_file + + @property + def deserialize_hook(self) -> Callable[[str, dict], None]: + """The deserialize hook to convert the dictionary to a model file. + + Returns + ------- + Callable[[str, dict], None] + The deserialize hook of the backend. + """ + from deepmd.pd.utils.serialization import ( + deserialize_to_file, + ) + + return deserialize_to_file diff --git a/deepmd/dpmodel/model/make_model.py b/deepmd/dpmodel/model/make_model.py index fbf2c6e21f..70ddbe09b8 100644 --- a/deepmd/dpmodel/model/make_model.py +++ b/deepmd/dpmodel/model/make_model.py @@ -457,7 +457,7 @@ def format_nlist( Returns ------- - formated_nlist + formatted_nlist the formatted nlist. 
""" diff --git a/deepmd/main.py b/deepmd/main.py index b3daf75963..097588ca0a 100644 --- a/deepmd/main.py +++ b/deepmd/main.py @@ -99,9 +99,10 @@ def main_parser() -> argparse.ArgumentParser: formatter_class=RawTextArgumentDefaultsHelpFormatter, epilog=textwrap.dedent( """\ - Use --tf or --pt to choose the backend: + Use --tf, --pt or --pd to choose the backend: dp --tf train input.json dp --pt train input.json + dp --pd train input.json """ ), ) diff --git a/deepmd/pd/__init__.py b/deepmd/pd/__init__.py new file mode 100644 index 0000000000..c3b2e96ef2 --- /dev/null +++ b/deepmd/pd/__init__.py @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later + +# import customized OPs globally + +from deepmd.utils.entry_point import ( + load_entry_point, +) + +load_entry_point("deepmd.pd") + +__all__ = [] diff --git a/deepmd/pd/entrypoints/__init__.py b/deepmd/pd/entrypoints/__init__.py new file mode 100644 index 0000000000..6ceb116d85 --- /dev/null +++ b/deepmd/pd/entrypoints/__init__.py @@ -0,0 +1 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later diff --git a/deepmd/pd/entrypoints/main.py b/deepmd/pd/entrypoints/main.py new file mode 100644 index 0000000000..19653d6ea7 --- /dev/null +++ b/deepmd/pd/entrypoints/main.py @@ -0,0 +1,543 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +import argparse +import copy +import json +import logging +from pathlib import ( + Path, +) +from typing import ( + Optional, + Union, +) + +import h5py +import paddle +import paddle.distributed as dist +import paddle.distributed.fleet as fleet +import paddle.version + +from deepmd import ( + __version__, +) +from deepmd.common import ( + expand_sys_str, +) +from deepmd.loggers.loggers import ( + set_log_handles, +) +from deepmd.main import ( + parse_args, +) +from deepmd.pd.infer import ( + inference, +) +from deepmd.pd.model.model import ( + BaseModel, +) +from deepmd.pd.train import ( + training, +) +from deepmd.pd.train.wrapper import ( + ModelWrapper, +) +from deepmd.pd.utils.dataloader import ( + DpLoaderSet, +) +from deepmd.pd.utils.env import ( + DEVICE, +) +from deepmd.pd.utils.finetune import ( + get_finetune_rules, +) +from deepmd.pd.utils.multi_task import ( + preprocess_shared_params, +) +from deepmd.pd.utils.stat import ( + make_stat_input, +) +from deepmd.pd.utils.utils import ( + to_numpy_array, +) +from deepmd.utils.argcheck import ( + normalize, +) +from deepmd.utils.compat import ( + update_deepmd_input, +) +from deepmd.utils.data_system import ( + get_data, + process_systems, +) +from deepmd.utils.path import ( + DPPath, +) +from deepmd.utils.summary import SummaryPrinter as BaseSummaryPrinter + +log = logging.getLogger(__name__) + + +def get_trainer( + config, + init_model=None, + restart_model=None, + finetune_model=None, + force_load=False, + init_frz_model=None, + shared_links=None, + finetune_links=None, +): + multi_task = "model_dict" in config.get("model", {}) + + # Initialize DDP + world_size = dist.get_world_size() + if world_size > 1: + assert paddle.version.nccl() != "0" + fleet.init(is_collective=True) + + def prepare_trainer_input_single( + model_params_single, data_dict_single, rank=0, seed=None + ): + training_dataset_params = data_dict_single["training_data"] + validation_dataset_params = data_dict_single.get("validation_data", None) + validation_systems = ( + validation_dataset_params["systems"] if validation_dataset_params else None + ) + training_systems = training_dataset_params["systems"] + training_systems = process_systems(training_systems) + if validation_systems is not 
None: + validation_systems = process_systems(validation_systems) + + # stat files + stat_file_path_single = data_dict_single.get("stat_file", None) + if rank != 0: + stat_file_path_single = None + elif stat_file_path_single is not None: + if not Path(stat_file_path_single).exists(): + if stat_file_path_single.endswith((".h5", ".hdf5")): + with h5py.File(stat_file_path_single, "w") as f: + pass + else: + Path(stat_file_path_single).mkdir() + stat_file_path_single = DPPath(stat_file_path_single, "a") + + # validation and training data + # avoid the same batch sequence among devices + rank_seed = (seed + rank) % (2**32) if seed is not None else None + validation_data_single = ( + DpLoaderSet( + validation_systems, + validation_dataset_params["batch_size"], + model_params_single["type_map"], + seed=rank_seed, + ) + if validation_systems + else None + ) + train_data_single = DpLoaderSet( + training_systems, + training_dataset_params["batch_size"], + model_params_single["type_map"], + seed=rank_seed, + ) + return ( + train_data_single, + validation_data_single, + stat_file_path_single, + ) + + rank = dist.get_rank() if dist.is_available() and dist.is_initialized() else 0 + data_seed = config["training"].get("seed", None) + if not multi_task: + ( + train_data, + validation_data, + stat_file_path, + ) = prepare_trainer_input_single( + config["model"], + config["training"], + rank=rank, + seed=data_seed, + ) + else: + train_data, validation_data, stat_file_path = {}, {}, {} + for model_key in config["model"]["model_dict"]: + ( + train_data[model_key], + validation_data[model_key], + stat_file_path[model_key], + ) = prepare_trainer_input_single( + config["model"]["model_dict"][model_key], + config["training"]["data_dict"][model_key], + rank=rank, + seed=data_seed, + ) + + trainer = training.Trainer( + config, + train_data, + stat_file_path=stat_file_path, + validation_data=validation_data, + init_model=init_model, + restart_model=restart_model, + finetune_model=finetune_model, + force_load=force_load, + shared_links=shared_links, + finetune_links=finetune_links, + init_frz_model=init_frz_model, + ) + return trainer + + +class SummaryPrinter(BaseSummaryPrinter): + """Summary printer for Paddle.""" + + def is_built_with_cuda(self) -> bool: + """Check if the backend is built with CUDA.""" + return paddle.device.is_compiled_with_cuda() + + def is_built_with_rocm(self) -> bool: + """Check if the backend is built with ROCm.""" + return paddle.device.is_compiled_with_rocm() + + def get_compute_device(self) -> str: + """Get Compute device.""" + return str(DEVICE) + + def get_ngpus(self) -> int: + """Get the number of GPUs.""" + return paddle.device.cuda.device_count() + + def get_backend_info(self) -> dict: + """Get backend information.""" + op_info = {} + return { + "Backend": "Paddle", + "PD ver": f"v{paddle.__version__}-g{paddle.version.commit[:11]}", + "Enable custom OP": False, + **op_info, + } + + +def train( + input_file: str, + init_model: Optional[str], + restart: Optional[str], + finetune: Optional[str], + init_frz_model: Optional[str], + model_branch: str, + skip_neighbor_stat: bool = False, + use_pretrain_script: bool = False, + force_load: bool = False, + output: str = "out.json", +): + log.info("Configuration path: %s", input_file) + SummaryPrinter()() + with open(input_file) as fin: + config = json.load(fin) + # ensure suffix, as in the command line help, we say "path prefix of checkpoint files" + if init_model is not None and not init_model.endswith(".pd"): + init_model += ".pd" + if restart 
is not None and not restart.endswith(".pd"):
+        restart += ".pd"
+
+    # update multitask config
+    multi_task = "model_dict" in config["model"]
+    shared_links = None
+    if multi_task:
+        config["model"], shared_links = preprocess_shared_params(config["model"])
+        # handle the special key
+        assert (
+            "RANDOM" not in config["model"]["model_dict"]
+        ), "Model name cannot be 'RANDOM' in multi-task mode!"
+
+    # update fine-tuning config
+    finetune_links = None
+    if finetune is not None:
+        config["model"], finetune_links = get_finetune_rules(
+            finetune,
+            config["model"],
+            model_branch=model_branch,
+            change_model_params=use_pretrain_script,
+        )
+    # update init_model or init_frz_model config if necessary
+    if (init_model is not None or init_frz_model is not None) and use_pretrain_script:
+        if init_model is not None:
+            init_state_dict = paddle.load(init_model)
+            if "model" in init_state_dict:
+                init_state_dict = init_state_dict["model"]
+            config["model"] = init_state_dict["_extra_state"]["model_params"]
+        else:
+            raise NotImplementedError("init_frz_model is not supported yet")
+
+    # argcheck
+    config = update_deepmd_input(config, warning=True, dump="input_v2_compat.json")
+    config = normalize(config, multi_task=multi_task)
+
+    # do neighbor stat
+    min_nbor_dist = None
+    if not skip_neighbor_stat:
+        log.info(
+            "Calculate neighbor statistics... (add --skip-neighbor-stat to skip this step)"
+        )
+
+        if not multi_task:
+            type_map = config["model"].get("type_map")
+            train_data = get_data(
+                config["training"]["training_data"], 0, type_map, None
+            )
+            config["model"], min_nbor_dist = BaseModel.update_sel(
+                train_data, type_map, config["model"]
+            )
+        else:
+            min_nbor_dist = {}
+            for model_item in config["model"]["model_dict"]:
+                type_map = config["model"]["model_dict"][model_item].get("type_map")
+                train_data = get_data(
+                    config["training"]["data_dict"][model_item]["training_data"],
+                    0,
+                    type_map,
+                    None,
+                )
+                config["model"]["model_dict"][model_item], min_nbor_dist[model_item] = (
+                    BaseModel.update_sel(
+                        train_data, type_map, config["model"]["model_dict"][model_item]
+                    )
+                )
+
+    with open(output, "w") as fp:
+        json.dump(config, fp, indent=4)
+
+    trainer = get_trainer(
+        config,
+        init_model,
+        restart,
+        finetune,
+        force_load,
+        init_frz_model,
+        shared_links=shared_links,
+        finetune_links=finetune_links,
+    )
+    # save min_nbor_dist
+    if min_nbor_dist is not None:
+        if not multi_task:
+            trainer.model.min_nbor_dist = min_nbor_dist
+        else:
+            for model_item in min_nbor_dist:
+                trainer.model[model_item].min_nbor_dist = min_nbor_dist[model_item]
+    trainer.run()
+
+
+def freeze(
+    model: str,
+    output: str = "frozen_model.json",
+    head: Optional[str] = None,
+):
+    paddle.set_flags(
+        {
+            "FLAGS_save_cf_stack_op": 1,
+            "FLAGS_prim_enable_dynamic": 1,
+            "FLAGS_enable_pir_api": 1,
+        }
+    )
+    model = inference.Tester(model, head=head).model
+    model.eval()
+    from paddle.static import (
+        InputSpec,
+    )
+
+    """
+    ** coord [None, natoms, 3] paddle.float64
+    ** atype [None, natoms] paddle.int64
+    ** nlist [None, natoms, nnei] paddle.int32
+    """
+    # NOTE: 'FLAGS_save_cf_stack_op', 'FLAGS_prim_enable_dynamic' and
+    # 'FLAGS_enable_pir_api' should be enabled when freezing the model.
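+    # In the InputSpec list below, each -1 marks a dynamic dimension, so the
+    # exported static graph accepts arbitrary frame, atom and neighbor counts
+    # at inference time.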
+    jit_model = paddle.jit.to_static(
+        model.forward_lower,
+        full_graph=True,
+        input_spec=[
+            InputSpec([-1, -1, 3], dtype="float64", name="coord"),
+            InputSpec([-1, -1], dtype="int32", name="atype"),
+            InputSpec([-1, -1, -1], dtype="int32", name="nlist"),
+        ],
+    )
+    if output.endswith(".json"):
+        output = output[:-5]
+    paddle.jit.save(
+        jit_model,
+        path=output,
+        skip_prune_program=True,
+    )
+    log.info(
+        f"Paddle inference model has been exported to: {output}.json and {output}.pdiparams"
+    )
+
+
+def change_bias(
+    input_file: str,
+    mode: str = "change",
+    bias_value: Optional[list] = None,
+    datafile: Optional[str] = None,
+    system: str = ".",
+    numb_batch: int = 0,
+    model_branch: Optional[str] = None,
+    output: Optional[str] = None,
+):
+    if input_file.endswith(".pd"):
+        old_state_dict = paddle.load(input_file)
+        model_state_dict = copy.deepcopy(old_state_dict.get("model", old_state_dict))
+        model_params = model_state_dict["_extra_state"]["model_params"]
+    else:
+        raise RuntimeError(
+            "Paddle does not support changing the bias directly from a frozen model file. "
+            "Please provide a checkpoint file with a .pd extension."
+        )
+    multi_task = "model_dict" in model_params
+    bias_adjust_mode = "change-by-statistic" if mode == "change" else "set-by-statistic"
+    if multi_task:
+        assert (
+            model_branch is not None
+        ), "For multitask model, the model branch must be set!"
+        assert model_branch in model_params["model_dict"], (
+            f"For multitask model, the model branch must be in the 'model_dict'! "
+            f"Available options are: {list(model_params['model_dict'].keys())}."
+        )
+        log.info(f"Changing out bias for model {model_branch}.")
+    model = training.get_model_for_wrapper(model_params)
+    type_map = (
+        model_params["type_map"]
+        if not multi_task
+        else model_params["model_dict"][model_branch]["type_map"]
+    )
+    model_to_change = model if not multi_task else model[model_branch]
+    if input_file.endswith(".pd"):
+        wrapper = ModelWrapper(model)
+        wrapper.set_state_dict(old_state_dict["model"])
+    else:
+        raise NotImplementedError("Only .pd files are supported")
+
+    if bias_value is not None:
+        # use user-defined bias
+        assert model_to_change.model_type in [
+            "ener"
+        ], "User-defined bias is only available for energy model!"
+        assert (
+            len(bias_value) == len(type_map)
+        ), f"The number of elements in the bias should be the same as that in the type_map: {type_map}."
+        old_bias = model_to_change.get_out_bias()
+        bias_to_set = paddle.to_tensor(
+            bias_value, dtype=old_bias.dtype, place=old_bias.place
+        ).reshape(old_bias.shape)
+        model_to_change.set_out_bias(bias_to_set)
+        log.info(
+            f"Change output bias of {type_map!s} "
+            f"from {to_numpy_array(old_bias).reshape(-1)!s} "
+            f"to {to_numpy_array(bias_to_set).reshape(-1)!s}."
+ ) + updated_model = model_to_change + else: + # calculate bias on given systems + if datafile is not None: + with open(datafile) as datalist: + all_sys = datalist.read().splitlines() + else: + all_sys = expand_sys_str(system) + data_systems = process_systems(all_sys) + data_single = DpLoaderSet( + data_systems, + 1, + type_map, + ) + mock_loss = training.get_loss( + {"inference": True}, 1.0, len(type_map), model_to_change + ) + data_requirement = mock_loss.label_requirement + data_requirement += training.get_additional_data_requirement(model_to_change) + data_single.add_data_requirement(data_requirement) + nbatches = numb_batch if numb_batch != 0 else float("inf") + sampled_data = make_stat_input( + data_single.systems, + data_single.dataloaders, + nbatches, + ) + updated_model = training.model_change_out_bias( + model_to_change, sampled_data, _bias_adjust_mode=bias_adjust_mode + ) + + if not multi_task: + model = updated_model + else: + model[model_branch] = updated_model + + if input_file.endswith(".pd"): + output_path = ( + output if output is not None else input_file.replace(".pd", "_updated.pd") + ) + wrapper = ModelWrapper(model) + if "model" in old_state_dict: + old_state_dict["model"] = wrapper.state_dict() + old_state_dict["model"]["_extra_state"] = model_state_dict["_extra_state"] + else: + old_state_dict = wrapper.state_dict() + old_state_dict["_extra_state"] = model_state_dict["_extra_state"] + paddle.save(old_state_dict, output_path) + else: + raise NotImplementedError("Only support .pd file now") + + log.info(f"Saved model to {output_path}") + + +def main(args: Optional[Union[list[str], argparse.Namespace]] = None): + if not isinstance(args, argparse.Namespace): + FLAGS = parse_args(args=args) + else: + FLAGS = args + + set_log_handles( + FLAGS.log_level, + Path(FLAGS.log_path) if FLAGS.log_path else None, + mpi_log=None, + ) + log.debug("Log handles were successfully set") + log.info("DeePMD version: %s", __version__) + + if FLAGS.command == "train": + train( + input_file=FLAGS.INPUT, + init_model=FLAGS.init_model, + restart=FLAGS.restart, + finetune=FLAGS.finetune, + init_frz_model=FLAGS.init_frz_model, + model_branch=FLAGS.model_branch, + skip_neighbor_stat=FLAGS.skip_neighbor_stat, + use_pretrain_script=FLAGS.use_pretrain_script, + force_load=FLAGS.force_load, + output=FLAGS.output, + ) + elif FLAGS.command == "freeze": + if Path(FLAGS.checkpoint_folder).is_dir(): + checkpoint_path = Path(FLAGS.checkpoint_folder) + latest_ckpt_file = (checkpoint_path / "checkpoint").read_text() + FLAGS.model = str(checkpoint_path.joinpath(latest_ckpt_file)) + else: + FLAGS.model = FLAGS.checkpoint_folder + FLAGS.output = str(Path(FLAGS.output).with_suffix(".json")) + freeze(model=FLAGS.model, output=FLAGS.output, head=FLAGS.head) + elif FLAGS.command == "change-bias": + change_bias( + input_file=FLAGS.INPUT, + mode=FLAGS.mode, + bias_value=FLAGS.bias_value, + datafile=FLAGS.datafile, + system=FLAGS.system, + numb_batch=FLAGS.numb_batch, + model_branch=FLAGS.model_branch, + output=FLAGS.output, + ) + else: + raise RuntimeError(f"Invalid command {FLAGS.command}!") + + +if __name__ == "__main__": + main() diff --git a/deepmd/pd/infer/__init__.py b/deepmd/pd/infer/__init__.py new file mode 100644 index 0000000000..6ceb116d85 --- /dev/null +++ b/deepmd/pd/infer/__init__.py @@ -0,0 +1 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later diff --git a/deepmd/pd/infer/deep_eval.py b/deepmd/pd/infer/deep_eval.py new file mode 100644 index 0000000000..a2f8510f28 --- /dev/null +++ 
b/deepmd/pd/infer/deep_eval.py
@@ -0,0 +1,537 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Callable,
+    Optional,
+    Union,
+)
+
+import numpy as np
+import paddle
+
+from deepmd.dpmodel.common import PRECISION_DICT as NP_PRECISION_DICT
+from deepmd.dpmodel.output_def import (
+    ModelOutputDef,
+    OutputVariableCategory,
+    OutputVariableDef,
+)
+from deepmd.infer.deep_eval import DeepEval as DeepEvalWrapper
+from deepmd.infer.deep_eval import (
+    DeepEvalBackend,
+)
+from deepmd.infer.deep_pot import (
+    DeepPot,
+)
+from deepmd.pd.model.model import (
+    get_model,
+)
+from deepmd.pd.train.wrapper import (
+    ModelWrapper,
+)
+from deepmd.pd.utils.auto_batch_size import (
+    AutoBatchSize,
+)
+from deepmd.pd.utils.env import (
+    DEVICE,
+    GLOBAL_PD_FLOAT_PRECISION,
+    RESERVED_PRECISON_DICT,
+    enable_prim,
+)
+from deepmd.pd.utils.utils import (
+    to_numpy_array,
+    to_paddle_tensor,
+)
+
+if TYPE_CHECKING:
+    import ase.neighborlist
+
+
+class DeepEval(DeepEvalBackend):
+    """Paddle backend implementation of DeepEval.
+
+    Parameters
+    ----------
+    model_file : Path
+        The name of the frozen model file.
+    output_def : ModelOutputDef
+        The output definition of the model.
+    *args : list
+        Positional arguments.
+    auto_batch_size : bool or int or AutoBatchSize, default: True
+        If True, automatic batch size will be used. If int, it will be used
+        as the initial batch size.
+    neighbor_list : ase.neighborlist.NewPrimitiveNeighborList, optional
+        The ASE neighbor list class to produce the neighbor list. If None, the
+        neighbor list will be built natively in the model.
+    **kwargs : dict
+        Keyword arguments.
+    """
+
+    def __init__(
+        self,
+        model_file: str,
+        output_def: ModelOutputDef,
+        *args: Any,
+        auto_batch_size: Union[bool, int, AutoBatchSize] = True,
+        neighbor_list: Optional["ase.neighborlist.NewPrimitiveNeighborList"] = None,
+        head: Optional[Union[str, int]] = None,
+        **kwargs: Any,
+    ):
+        enable_prim(True)
+        self.output_def = output_def
+        self.model_path = model_file
+        if str(self.model_path).endswith(".pd"):
+            state_dict = paddle.load(model_file)
+            if "model" in state_dict:
+                state_dict = state_dict["model"]
+            self.input_param = state_dict["_extra_state"]["model_params"]
+            self.model_def_script = self.input_param
+            self.multi_task = "model_dict" in self.input_param
+            if self.multi_task:
+                model_keys = list(self.input_param["model_dict"].keys())
+                if isinstance(head, int):
+                    head = model_keys[0]
+                assert (
+                    head is not None
+                ), f"Head must be set for multitask model! Available heads are: {model_keys}"
+                assert (
+                    head in model_keys
+                ), f"No head named {head} in model! Available heads are: {model_keys}"
+                self.input_param = self.input_param["model_dict"][head]
+                state_dict_head = {"_extra_state": state_dict["_extra_state"]}
+                for item in state_dict:
+                    if f"model.{head}." in item:
+                        state_dict_head[
+                            item.replace(f"model.{head}.", "model.Default.")
+                        ] = state_dict[item].clone()
+                state_dict = state_dict_head
+            model = get_model(self.input_param).to(DEVICE)
+            # model = paddle.jit.to_static(model)
+            self.dp = ModelWrapper(model)
+            self.dp.set_state_dict(state_dict)
+        else:
+            # self.dp = paddle.jit.load(self.model_path.split(".json")[0])
+            raise ValueError(f"Unknown model file format: {self.model_path}!")
+        self.rcut = self.dp.model["Default"].get_rcut()
+        self.type_map = self.dp.model["Default"].get_type_map()
+        if isinstance(auto_batch_size, bool):
+            if auto_batch_size:
+                self.auto_batch_size = AutoBatchSize()
+            else:
+                self.auto_batch_size = None
+        elif isinstance(auto_batch_size, int):
+            self.auto_batch_size = AutoBatchSize(auto_batch_size)
+        elif isinstance(auto_batch_size, AutoBatchSize):
+            self.auto_batch_size = auto_batch_size
+        else:
+            raise TypeError("auto_batch_size should be bool, int, or AutoBatchSize")
+        self._has_spin = getattr(self.dp.model["Default"], "has_spin", False)
+        if callable(self._has_spin):
+            self._has_spin = self._has_spin()
+
+    def get_rcut(self) -> float:
+        """Get the cutoff radius of this model."""
+        return self.rcut
+
+    def get_ntypes(self) -> int:
+        """Get the number of atom types of this model."""
+        return len(self.type_map)
+
+    def get_type_map(self) -> list[str]:
+        """Get the type map (element name of the atom types) of this model."""
+        return self.type_map
+
+    def get_dim_fparam(self) -> int:
+        """Get the number (dimension) of frame parameters of this DP."""
+        return self.dp.model["Default"].get_dim_fparam()
+
+    def get_dim_aparam(self) -> int:
+        """Get the number (dimension) of atomic parameters of this DP."""
+        return self.dp.model["Default"].get_dim_aparam()
+
+    def get_intensive(self) -> bool:
+        return self.dp.model["Default"].get_intensive()
+
+    @property
+    def model_type(self) -> type["DeepEvalWrapper"]:
+        """The evaluator of the model type."""
+        model_output_type = self.dp.model["Default"].model_output_type()
+        if "energy" in model_output_type:
+            return DeepPot
+        else:
+            raise RuntimeError("Unknown model type")
+
+    def get_sel_type(self) -> list[int]:
+        """Get the selected atom types of this model.
+
+        Only atoms with selected atom types have atomic contribution
+        to the result of the model.
+        If returning an empty list, all atom types are selected.
+        """
+        return self.dp.model["Default"].get_sel_type()
+
+    def get_numb_dos(self) -> int:
+        """Get the number of DOS."""
+        return self.dp.model["Default"].get_numb_dos()
+
+    def get_task_dim(self) -> int:
+        """Get the output dimension."""
+        return self.dp.model["Default"].get_task_dim()
+
+    def get_has_efield(self):
+        """Check if the model has efield."""
+        return False
+
+    def get_ntypes_spin(self):
+        """Get the number of spin atom types of this model. Only used in the old implementation."""
+        return 0
+
+    def get_has_spin(self):
+        """Check if the model has spin atom types."""
+        return self._has_spin
+
+    def eval(
+        self,
+        coords: np.ndarray,
+        cells: Optional[np.ndarray],
+        atom_types: np.ndarray,
+        atomic: bool = False,
+        fparam: Optional[np.ndarray] = None,
+        aparam: Optional[np.ndarray] = None,
+        **kwargs: Any,
+    ) -> dict[str, np.ndarray]:
+        """Evaluate the energy, force and virial by using this DP.
+
+        Parameters
+        ----------
+        coords
+            The coordinates of atoms.
+            The array should be of size nframes x natoms x 3
+        cells
+            The cell of the region.
+            If None then non-PBC is assumed, otherwise using PBC.
+ The array should be of size nframes x 9 + atom_types + The atom types + The list should contain natoms ints + atomic + Calculate the atomic energy and virial + fparam + The frame parameter. + The array can be of size : + - nframes x dim_fparam. + - dim_fparam. Then all frames are assumed to be provided with the same fparam. + aparam + The atomic parameter + The array can be of size : + - nframes x natoms x dim_aparam. + - natoms x dim_aparam. Then all frames are assumed to be provided with the same aparam. + - dim_aparam. Then all frames and atoms are provided with the same aparam. + **kwargs + Other parameters + + Returns + ------- + output_dict : dict + The output of the evaluation. The keys are the names of the output + variables, and the values are the corresponding output arrays. + """ + # convert all of the input to numpy array + atom_types = np.array(atom_types, dtype=np.int32) + coords = np.array(coords) + if cells is not None: + cells = np.array(cells) + natoms, numb_test = self._get_natoms_and_nframes( + coords, atom_types, len(atom_types.shape) > 1 + ) + request_defs = self._get_request_defs(atomic) + if "spin" not in kwargs or kwargs["spin"] is None: + out = self._eval_func(self._eval_model, numb_test, natoms)( + coords, cells, atom_types, fparam, aparam, request_defs + ) + else: + out = self._eval_func(self._eval_model_spin, numb_test, natoms)( + coords, + cells, + atom_types, + np.array(kwargs["spin"]), + fparam, + aparam, + request_defs, + ) + return dict( + zip( + [x.name for x in request_defs], + out, + ) + ) + + def _get_request_defs(self, atomic: bool) -> list[OutputVariableDef]: + """Get the requested output definitions. + + When atomic is True, all output_def are requested. + When atomic is False, only energy (tensor), force, and virial + are requested. + + Parameters + ---------- + atomic : bool + Whether to request the atomic output. + + Returns + ------- + list[OutputVariableDef] + The requested output definitions. + """ + if atomic: + return list(self.output_def.var_defs.values()) + else: + return [ + x + for x in self.output_def.var_defs.values() + if x.category + in ( + OutputVariableCategory.OUT, + OutputVariableCategory.REDU, + OutputVariableCategory.DERV_R, + OutputVariableCategory.DERV_C_REDU, + ) + ] + + def _eval_func(self, inner_func: Callable, numb_test: int, natoms: int) -> Callable: + """Wrapper method with auto batch size. 
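+
+        If an automatic batch size is configured, the call is dispatched
+        through ``AutoBatchSize.execute_all``, which evaluates the frames in
+        batches and is expected to shrink the batch size when it hits
+        out-of-memory errors.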
+ + Parameters + ---------- + inner_func : Callable + the method to be wrapped + numb_test : int + number of tests + natoms : int + number of atoms + + Returns + ------- + Callable + the wrapper + """ + if self.auto_batch_size is not None: + + def eval_func(*args, **kwargs): + return self.auto_batch_size.execute_all( + inner_func, numb_test, natoms, *args, **kwargs + ) + + else: + eval_func = inner_func + return eval_func + + def _get_natoms_and_nframes( + self, + coords: np.ndarray, + atom_types: np.ndarray, + mixed_type: bool = False, + ) -> tuple[int, int]: + if mixed_type: + natoms = len(atom_types[0]) + else: + natoms = len(atom_types) + if natoms == 0: + assert coords.size == 0 + else: + coords = np.reshape(np.array(coords), [-1, natoms * 3]) + nframes = coords.shape[0] + return natoms, nframes + + def _eval_model( + self, + coords: np.ndarray, + cells: Optional[np.ndarray], + atom_types: np.ndarray, + fparam: Optional[np.ndarray], + aparam: Optional[np.ndarray], + request_defs: list[OutputVariableDef], + ): + model = self.dp.to(DEVICE) + prec = NP_PRECISION_DICT[RESERVED_PRECISON_DICT[GLOBAL_PD_FLOAT_PRECISION]] + + nframes = coords.shape[0] + if len(atom_types.shape) == 1: + natoms = len(atom_types) + atom_types = np.tile(atom_types, nframes).reshape([nframes, -1]) + else: + natoms = len(atom_types[0]) + + coord_input = paddle.to_tensor( + coords.reshape([nframes, natoms, 3]).astype(prec), + dtype=GLOBAL_PD_FLOAT_PRECISION, + place=DEVICE, + ) + type_input = paddle.to_tensor( + atom_types.astype(NP_PRECISION_DICT[RESERVED_PRECISON_DICT[paddle.int64]]), + dtype=paddle.int64, + place=DEVICE, + ) + if cells is not None: + box_input = paddle.to_tensor( + cells.reshape([nframes, 3, 3]), + dtype=GLOBAL_PD_FLOAT_PRECISION, + place=DEVICE, + ) + else: + box_input = None + if fparam is not None: + fparam_input = to_paddle_tensor( + fparam.reshape([nframes, self.get_dim_fparam()]) + ) + else: + fparam_input = None + if aparam is not None: + aparam_input = to_paddle_tensor( + aparam.reshape([nframes, natoms, self.get_dim_aparam()]) + ) + else: + aparam_input = None + do_atomic_virial = any( + x.category == OutputVariableCategory.DERV_C for x in request_defs + ) + batch_output = model( + coord_input, + type_input, + box=box_input, + do_atomic_virial=do_atomic_virial, + fparam=fparam_input, + aparam=aparam_input, + ) + if isinstance(batch_output, tuple): + batch_output = batch_output[0] + + results = [] + for odef in request_defs: + pd_name = self._OUTDEF_DP2BACKEND[odef.name] + if pd_name in batch_output: + shape = self._get_output_shape(odef, nframes, natoms) + out = batch_output[pd_name].reshape(shape).numpy() + results.append(out) + else: + shape = self._get_output_shape(odef, nframes, natoms) + results.append( + np.full(np.abs(shape), np.nan, dtype=prec) + ) # this is kinda hacky + return tuple(results) + + def _eval_model_spin( + self, + coords: np.ndarray, + cells: Optional[np.ndarray], + atom_types: np.ndarray, + spins: np.ndarray, + fparam: Optional[np.ndarray], + aparam: Optional[np.ndarray], + request_defs: list[OutputVariableDef], + ): + raise NotImplementedError("_eval_model_spin is not supported yet.") + + def _get_output_shape(self, odef, nframes, natoms): + if odef.category == OutputVariableCategory.DERV_C_REDU: + # virial + return [nframes, *odef.shape[:-1], 9] + elif odef.category == OutputVariableCategory.REDU: + # energy + return [nframes, *odef.shape, 1] + elif odef.category == OutputVariableCategory.DERV_C: + # atom_virial + return [nframes, *odef.shape[:-1], natoms, 9] + 
elif odef.category == OutputVariableCategory.DERV_R:
+            # force
+            return [nframes, *odef.shape[:-1], natoms, 3]
+        elif odef.category == OutputVariableCategory.OUT:
+            # atom_energy, atom_tensor
+            # Something wrong here?
+            # return [nframes, *shape, natoms, 1]
+            return [nframes, natoms, *odef.shape, 1]
+        else:
+            raise RuntimeError("unknown category")
+
+    def eval_typeebd(self) -> np.ndarray:
+        """Evaluate output of type embedding network by using this model.
+
+        Returns
+        -------
+        np.ndarray
+            The output of type embedding network. The shape is [ntypes, o_size] or [ntypes + 1, o_size],
+            where ntypes is the number of types, and o_size is the number of nodes
+            in the output layer. If there are multiple type embedding networks,
+            these outputs will be concatenated along the second axis.
+
+        Raises
+        ------
+        KeyError
+            If the model does not enable type embedding.
+
+        See Also
+        --------
+        deepmd.pd.model.network.network.TypeEmbedNetConsistent :
+            The type embedding network.
+        """
+        raise NotImplementedError("eval_typeebd is not supported yet.")
+
+    def get_model_def_script(self) -> str:
+        """Get model definition script."""
+        return self.model_def_script
+
+    def eval_descriptor(
+        self,
+        coords: np.ndarray,
+        cells: Optional[np.ndarray],
+        atom_types: np.ndarray,
+        fparam: Optional[np.ndarray] = None,
+        aparam: Optional[np.ndarray] = None,
+        **kwargs: Any,
+    ) -> np.ndarray:
+        """Evaluate descriptors by using this DP.
+
+        Parameters
+        ----------
+        coords
+            The coordinates of atoms.
+            The array should be of size nframes x natoms x 3
+        cells
+            The cell of the region.
+            If None then non-PBC is assumed, otherwise using PBC.
+            The array should be of size nframes x 9
+        atom_types
+            The atom types
+            The list should contain natoms ints
+        fparam
+            The frame parameter.
+            The array can be of size :
+            - nframes x dim_fparam.
+            - dim_fparam. Then all frames are assumed to be provided with the same fparam.
+        aparam
+            The atomic parameter
+            The array can be of size :
+            - nframes x natoms x dim_aparam.
+            - natoms x dim_aparam. Then all frames are assumed to be provided with the same aparam.
+            - dim_aparam. Then all frames and atoms are provided with the same aparam.
+
+        Returns
+        -------
+        descriptor
+            Descriptors.
+        """
+        model = self.dp.model["Default"]
+        model.set_eval_descriptor_hook(True)
+        self.eval(
+            coords,
+            cells,
+            atom_types,
+            atomic=False,
+            fparam=fparam,
+            aparam=aparam,
+            **kwargs,
+        )
+        descriptor = model.eval_descriptor()
+        model.set_eval_descriptor_hook(False)
+        return to_numpy_array(descriptor)
diff --git a/deepmd/pd/infer/inference.py b/deepmd/pd/infer/inference.py
new file mode 100644
index 0000000000..ae1b8e8516
--- /dev/null
+++ b/deepmd/pd/infer/inference.py
@@ -0,0 +1,64 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+import logging
+from copy import (
+    deepcopy,
+)
+
+import paddle
+
+from deepmd.pd.model.model import (
+    get_model,
+)
+from deepmd.pd.train.wrapper import (
+    ModelWrapper,
+)
+from deepmd.pd.utils.env import (
+    DEVICE,
+    JIT,
+)
+
+log = logging.getLogger(__name__)
+
+
+class Tester:
+    def __init__(
+        self,
+        model_ckpt,
+        head=None,
+    ):
+        """Construct a DeePMD tester.
+
+        Args:
+        - model_ckpt: Path of the model checkpoint (.pd file) to load.
+        - head: Name of the model branch to use for multitask checkpoints.
+        """
+        # Model
+        state_dict = paddle.load(model_ckpt)
+        if "model" in state_dict:
+            state_dict = state_dict["model"]
+        model_params = state_dict["_extra_state"]["model_params"]
+        self.multi_task = "model_dict" in model_params
+        if self.multi_task:
+            assert head is not None, "Head must be specified in multitask mode!"
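+            # Keep only the parameters of the selected head and remap them to
+            # the single-task "Default" branch so ModelWrapper can load them.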
+ self.head = head + assert head in model_params["model_dict"], ( + f"Specified head {head} not found in model {model_ckpt}! " + f"Available ones are {list(model_params['model_dict'].keys())}." + ) + model_params = model_params["model_dict"][head] + state_dict_head = {"_extra_state": state_dict["_extra_state"]} + for item in state_dict: + if f"model.{head}." in item: + state_dict_head[ + item.replace(f"model.{head}.", "model.Default.") + ] = state_dict[item].clone() + state_dict = state_dict_head + + self.model_params = deepcopy(model_params) + self.model = get_model(model_params).to(DEVICE) + + # Model Wrapper + self.wrapper = ModelWrapper(self.model) # inference only + if JIT: + raise NotImplementedError + # self.wrapper = paddle.jit.to_static(self.wrapper) + self.wrapper.set_state_dict(state_dict) diff --git a/deepmd/pd/loss/__init__.py b/deepmd/pd/loss/__init__.py new file mode 100644 index 0000000000..0e978b95c2 --- /dev/null +++ b/deepmd/pd/loss/__init__.py @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +from .ener import ( + EnergyStdLoss, +) +from .loss import ( + TaskLoss, +) + +__all__ = [ + "EnergyStdLoss", + "TaskLoss", +] diff --git a/deepmd/pd/loss/ener.py b/deepmd/pd/loss/ener.py new file mode 100644 index 0000000000..7c5d848b45 --- /dev/null +++ b/deepmd/pd/loss/ener.py @@ -0,0 +1,428 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +from typing import ( + Optional, +) + +import paddle +import paddle.nn.functional as F + +from deepmd.pd.loss.loss import ( + TaskLoss, +) +from deepmd.pd.utils import ( + decomp, + env, +) +from deepmd.pd.utils.env import ( + GLOBAL_PD_FLOAT_PRECISION, +) +from deepmd.utils.data import ( + DataRequirementItem, +) + + +class EnergyStdLoss(TaskLoss): + def __init__( + self, + starter_learning_rate=1.0, + start_pref_e=0.0, + limit_pref_e=0.0, + start_pref_f=0.0, + limit_pref_f=0.0, + start_pref_v=0.0, + limit_pref_v=0.0, + start_pref_ae: float = 0.0, + limit_pref_ae: float = 0.0, + start_pref_pf: float = 0.0, + limit_pref_pf: float = 0.0, + relative_f: Optional[float] = None, + enable_atom_ener_coeff: bool = False, + start_pref_gf: float = 0.0, + limit_pref_gf: float = 0.0, + numb_generalized_coord: int = 0, + use_l1_all: bool = False, + inference=False, + **kwargs, + ): + r"""Construct a layer to compute loss on energy, force and virial. + + Parameters + ---------- + starter_learning_rate : float + The learning rate at the start of the training. + start_pref_e : float + The prefactor of energy loss at the start of the training. + limit_pref_e : float + The prefactor of energy loss at the end of the training. + start_pref_f : float + The prefactor of force loss at the start of the training. + limit_pref_f : float + The prefactor of force loss at the end of the training. + start_pref_v : float + The prefactor of virial loss at the start of the training. + limit_pref_v : float + The prefactor of virial loss at the end of the training. + start_pref_ae : float + The prefactor of atomic energy loss at the start of the training. + limit_pref_ae : float + The prefactor of atomic energy loss at the end of the training. + start_pref_pf : float + The prefactor of atomic prefactor force loss at the start of the training. + limit_pref_pf : float + The prefactor of atomic prefactor force loss at the end of the training. + relative_f : float + If provided, relative force error will be used in the loss. 
The difference + of force will be normalized by the magnitude of the force in the label with + a shift given by relative_f + enable_atom_ener_coeff : bool + if true, the energy will be computed as \sum_i c_i E_i + start_pref_gf : float + The prefactor of generalized force loss at the start of the training. + limit_pref_gf : float + The prefactor of generalized force loss at the end of the training. + numb_generalized_coord : int + The dimension of generalized coordinates. + use_l1_all : bool + Whether to use L1 loss, if False (default), it will use L2 loss. + inference : bool + If true, it will output all losses found in output, ignoring the pre-factors. + **kwargs + Other keyword arguments. + """ + super().__init__() + self.starter_learning_rate = starter_learning_rate + self.has_e = (start_pref_e != 0.0 and limit_pref_e != 0.0) or inference + self.has_f = (start_pref_f != 0.0 and limit_pref_f != 0.0) or inference + self.has_v = (start_pref_v != 0.0 and limit_pref_v != 0.0) or inference + self.has_ae = (start_pref_ae != 0.0 and limit_pref_ae != 0.0) or inference + self.has_pf = (start_pref_pf != 0.0 and limit_pref_pf != 0.0) or inference + self.has_gf = start_pref_gf != 0.0 and limit_pref_gf != 0.0 + + self.start_pref_e = start_pref_e + self.limit_pref_e = limit_pref_e + self.start_pref_f = start_pref_f + self.limit_pref_f = limit_pref_f + self.start_pref_v = start_pref_v + self.limit_pref_v = limit_pref_v + self.start_pref_ae = start_pref_ae + self.limit_pref_ae = limit_pref_ae + self.start_pref_pf = start_pref_pf + self.limit_pref_pf = limit_pref_pf + self.start_pref_gf = start_pref_gf + self.limit_pref_gf = limit_pref_gf + self.relative_f = relative_f + self.enable_atom_ener_coeff = enable_atom_ener_coeff + self.numb_generalized_coord = numb_generalized_coord + if self.has_gf and self.numb_generalized_coord < 1: + raise RuntimeError( + "When generalized force loss is used, the dimension of generalized coordinates should be larger than 0" + ) + self.use_l1_all = use_l1_all + self.inference = inference + + def forward(self, input_dict, model, label, natoms, learning_rate, mae=False): + """Return loss on energy and force. + + Parameters + ---------- + input_dict : dict[str, paddle.Tensor] + Model inputs. + model : paddle.nn.Layer + Model to be used to output the predictions. + label : dict[str, paddle.Tensor] + Labels. + natoms : int + The local atom number. + + Returns + ------- + model_pred: dict[str, paddle.Tensor] + Model predictions. + loss: paddle.Tensor + Loss for model to minimize. + more_loss: dict[str, paddle.Tensor] + Other losses for display. 
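+
+        Notes
+        -----
+        Each loss prefactor is interpolated linearly in the learning rate:
+        ``pref = limit_pref + (start_pref - limit_pref) * lr / start_lr``,
+        so a term's weight moves from its start value toward its limit value
+        as the learning rate decays.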
+ """ + model_pred = model(**input_dict) + coef = learning_rate / self.starter_learning_rate + pref_e = self.limit_pref_e + (self.start_pref_e - self.limit_pref_e) * coef + pref_f = self.limit_pref_f + (self.start_pref_f - self.limit_pref_f) * coef + pref_v = self.limit_pref_v + (self.start_pref_v - self.limit_pref_v) * coef + pref_ae = self.limit_pref_ae + (self.start_pref_ae - self.limit_pref_ae) * coef + pref_pf = self.limit_pref_pf + (self.start_pref_pf - self.limit_pref_pf) * coef + pref_gf = self.limit_pref_gf + (self.start_pref_gf - self.limit_pref_gf) * coef + + loss = paddle.zeros([1], dtype=env.GLOBAL_PD_FLOAT_PRECISION).to(env.DEVICE)[0] + more_loss = {} + # more_loss['log_keys'] = [] # showed when validation on the fly + # more_loss['test_keys'] = [] # showed when doing dp test + atom_norm = 1.0 / natoms + if self.has_e and "energy" in model_pred and "energy" in label: + energy_pred = model_pred["energy"] + energy_label = label["energy"] + if self.enable_atom_ener_coeff and "atom_energy" in model_pred: + atom_ener_pred = model_pred["atom_energy"] + # when ener_coeff (\nu) is defined, the energy is defined as + # E = \sum_i \nu_i E_i + # instead of the sum of atomic energies. + # + # A case is that we want to train reaction energy + # A + B -> C + D + # E = - E(A) - E(B) + E(C) + E(D) + # A, B, C, D could be put far away from each other + atom_ener_coeff = label["atom_ener_coeff"] + atom_ener_coeff = atom_ener_coeff.reshape(atom_ener_pred.shape) + energy_pred = paddle.sum(atom_ener_coeff * atom_ener_pred, axis=1) + find_energy = label.get("find_energy", 0.0) + pref_e = pref_e * find_energy + if not self.use_l1_all: + l2_ener_loss = paddle.mean(paddle.square(energy_pred - energy_label)) + if not self.inference: + more_loss["l2_ener_loss"] = self.display_if_exist( + l2_ener_loss.detach(), find_energy + ) + loss += atom_norm * (pref_e * l2_ener_loss) + rmse_e = l2_ener_loss.sqrt() * atom_norm + more_loss["rmse_e"] = self.display_if_exist( + rmse_e.detach(), find_energy + ) + # more_loss['log_keys'].append('rmse_e') + else: # use l1 and for all atoms + l1_ener_loss = F.l1_loss( + energy_pred.reshape([-1]), + energy_label.reshape([-1]), + reduction="sum", + ) + loss += pref_e * l1_ener_loss + more_loss["mae_e"] = self.display_if_exist( + F.l1_loss( + energy_pred.reshape([-1]), + energy_label.reshape([-1]), + reduction="mean", + ).detach(), + find_energy, + ) + # more_loss['log_keys'].append('rmse_e') + if mae: + mae_e = paddle.mean(paddle.abs(energy_pred - energy_label)) * atom_norm + more_loss["mae_e"] = self.display_if_exist(mae_e.detach(), find_energy) + mae_e_all = paddle.mean(paddle.abs(energy_pred - energy_label)) + more_loss["mae_e_all"] = self.display_if_exist( + mae_e_all.detach(), find_energy + ) + + if ( + (self.has_f or self.has_pf or self.relative_f or self.has_gf) + and "force" in model_pred + and "force" in label + ): + find_force = label.get("find_force", 0.0) + pref_f = pref_f * find_force + force_pred = model_pred["force"] + force_label = label["force"] + diff_f = (force_label - force_pred).reshape([-1]) + + if self.relative_f is not None: + force_label_3 = force_label.reshape([-1, 3]) + # norm_f = force_label_3.norm(axis=1, keepdim=True) + self.relative_f + norm_f = ( + decomp.norm(force_label_3, axis=1, keepdim=True) + self.relative_f + ) + diff_f_3 = diff_f.reshape([-1, 3]) + diff_f_3 = diff_f_3 / norm_f + diff_f = diff_f_3.reshape([-1]) + + if self.has_f: + if not self.use_l1_all: + l2_force_loss = paddle.mean(paddle.square(diff_f)) + if not self.inference: + 
more_loss["l2_force_loss"] = self.display_if_exist( + l2_force_loss.detach(), find_force + ) + loss += (pref_f * l2_force_loss).to(GLOBAL_PD_FLOAT_PRECISION) + rmse_f = l2_force_loss.sqrt() + more_loss["rmse_f"] = self.display_if_exist( + rmse_f.detach(), find_force + ) + else: + l1_force_loss = F.l1_loss(force_label, force_pred, reduction="none") + more_loss["mae_f"] = self.display_if_exist( + l1_force_loss.mean().detach(), find_force + ) + l1_force_loss = l1_force_loss.sum(-1).mean(-1).sum() + loss += (pref_f * l1_force_loss).to(GLOBAL_PD_FLOAT_PRECISION) + if mae: + mae_f = paddle.mean(paddle.abs(diff_f)) + more_loss["mae_f"] = self.display_if_exist( + mae_f.detach(), find_force + ) + + if self.has_pf and "atom_pref" in label: + atom_pref = label["atom_pref"] + find_atom_pref = label.get("find_atom_pref", 0.0) + pref_pf = pref_pf * find_atom_pref + atom_pref_reshape = atom_pref.reshape([-1]) + l2_pref_force_loss = (paddle.square(diff_f) * atom_pref_reshape).mean() + if not self.inference: + more_loss["l2_pref_force_loss"] = self.display_if_exist( + l2_pref_force_loss.detach(), find_atom_pref + ) + loss += (pref_pf * l2_pref_force_loss).to(GLOBAL_PD_FLOAT_PRECISION) + rmse_pf = l2_pref_force_loss.sqrt() + more_loss["rmse_pf"] = self.display_if_exist( + rmse_pf.detach(), find_atom_pref + ) + + if self.has_gf and "drdq" in label: + drdq = label["drdq"] + find_drdq = label.get("find_drdq", 0.0) + pref_gf = pref_gf * find_drdq + force_reshape_nframes = force_pred.reshape([-1, natoms * 3]) + force_label_reshape_nframes = force_label.reshape([-1, natoms * 3]) + drdq_reshape = drdq.reshape( + [-1, natoms * 3, self.numb_generalized_coord] + ) + + # gen_force_label = paddle.einsum( + # "bij,bi->bj", drdq_reshape, force_label_reshape_nframes + # ) + gen_force_label = ( + drdq_reshape * force_label_reshape_nframes.unsqueeze(-1) + ).sum([-2]) + + # gen_force = paddle.einsum( + # "bij,bi->bj", drdq_reshape, force_reshape_nframes + # ) + gen_force = (drdq_reshape * force_reshape_nframes.unsqueeze(-1)).sum( + [-2] + ) + + diff_gen_force = gen_force_label - gen_force + l2_gen_force_loss = paddle.square(diff_gen_force).mean() + if not self.inference: + more_loss["l2_gen_force_loss"] = self.display_if_exist( + l2_gen_force_loss.detach(), find_drdq + ) + loss += (pref_gf * l2_gen_force_loss).to(GLOBAL_PD_FLOAT_PRECISION) + rmse_gf = l2_gen_force_loss.sqrt() + more_loss["rmse_gf"] = self.display_if_exist( + rmse_gf.detach(), find_drdq + ) + + if self.has_v and "virial" in model_pred and "virial" in label: + find_virial = label.get("find_virial", 0.0) + pref_v = pref_v * find_virial + diff_v = label["virial"] - model_pred["virial"].reshape([-1, 9]) + l2_virial_loss = paddle.mean(paddle.square(diff_v)) + if not self.inference: + more_loss["l2_virial_loss"] = self.display_if_exist( + l2_virial_loss.detach(), find_virial + ) + loss += atom_norm * (pref_v * l2_virial_loss) + rmse_v = l2_virial_loss.sqrt() * atom_norm + more_loss["rmse_v"] = self.display_if_exist(rmse_v.detach(), find_virial) + if mae: + mae_v = paddle.mean(paddle.abs(diff_v)) * atom_norm + more_loss["mae_v"] = self.display_if_exist(mae_v.detach(), find_virial) + + if self.has_ae and "atom_energy" in model_pred and "atom_ener" in label: + atom_ener = model_pred["atom_energy"] + atom_ener_label = label["atom_ener"] + find_atom_ener = label.get("find_atom_ener", 0.0) + pref_ae = pref_ae * find_atom_ener + atom_ener_reshape = atom_ener.reshape([-1]) + atom_ener_label_reshape = atom_ener_label.reshape([-1]) + l2_atom_ener_loss = paddle.square( + 
atom_ener_label_reshape - atom_ener_reshape
+            ).mean()
+            if not self.inference:
+                more_loss["l2_atom_ener_loss"] = self.display_if_exist(
+                    l2_atom_ener_loss.detach(), find_atom_ener
+                )
+            loss += (pref_ae * l2_atom_ener_loss).to(GLOBAL_PD_FLOAT_PRECISION)
+            rmse_ae = l2_atom_ener_loss.sqrt()
+            more_loss["rmse_ae"] = self.display_if_exist(
+                rmse_ae.detach(), find_atom_ener
+            )
+
+        if not self.inference:
+            more_loss["rmse"] = paddle.sqrt(loss.detach())
+        return model_pred, loss, more_loss
+
+    @property
+    def label_requirement(self) -> list[DataRequirementItem]:
+        """Return data label requirements needed for this loss calculation."""
+        label_requirement = []
+        if self.has_e:
+            label_requirement.append(
+                DataRequirementItem(
+                    "energy",
+                    ndof=1,
+                    atomic=False,
+                    must=False,
+                    high_prec=True,
+                )
+            )
+        if self.has_f:
+            label_requirement.append(
+                DataRequirementItem(
+                    "force",
+                    ndof=3,
+                    atomic=True,
+                    must=False,
+                    high_prec=False,
+                )
+            )
+        if self.has_v:
+            label_requirement.append(
+                DataRequirementItem(
+                    "virial",
+                    ndof=9,
+                    atomic=False,
+                    must=False,
+                    high_prec=False,
+                )
+            )
+        if self.has_ae:
+            label_requirement.append(
+                DataRequirementItem(
+                    "atom_ener",
+                    ndof=1,
+                    atomic=True,
+                    must=False,
+                    high_prec=False,
+                )
+            )
+        if self.has_pf:
+            label_requirement.append(
+                DataRequirementItem(
+                    "atom_pref",
+                    ndof=1,
+                    atomic=True,
+                    must=False,
+                    high_prec=False,
+                    repeat=3,
+                )
+            )
+        if self.has_gf:
+            label_requirement.append(
+                DataRequirementItem(
+                    "drdq",
+                    ndof=self.numb_generalized_coord * 3,
+                    atomic=True,
+                    must=False,
+                    high_prec=False,
+                )
+            )
+        if self.enable_atom_ener_coeff:
+            label_requirement.append(
+                DataRequirementItem(
+                    "atom_ener_coeff",
+                    ndof=1,
+                    atomic=True,
+                    must=False,
+                    high_prec=False,
+                    default=1.0,
+                )
+            )
+        return label_requirement
diff --git a/deepmd/pd/loss/loss.py b/deepmd/pd/loss/loss.py
new file mode 100644
index 0000000000..f825f9ff61
--- /dev/null
+++ b/deepmd/pd/loss/loss.py
@@ -0,0 +1,63 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+from abc import (
+    ABC,
+    abstractmethod,
+)
+
+import paddle
+
+from deepmd.utils.data import (
+    DataRequirementItem,
+)
+from deepmd.utils.plugin import (
+    make_plugin_registry,
+)
+
+
+class TaskLoss(paddle.nn.Layer, ABC, make_plugin_registry("loss")):
+    def __init__(self, **kwargs):
+        """Construct loss."""
+        super().__init__()
+
+    def forward(self, input_dict, model, label, natoms, learning_rate):
+        """Return the loss."""
+        raise NotImplementedError
+
+    @property
+    @abstractmethod
+    def label_requirement(self) -> list[DataRequirementItem]:
+        """Return data label requirements needed for this loss calculation."""
+        pass
+
+    @staticmethod
+    def display_if_exist(loss: paddle.Tensor, find_property: float) -> paddle.Tensor:
+        """Display NaN if labeled property is not found.
+
+        Parameters
+        ----------
+        loss : paddle.Tensor
+            the loss tensor
+        find_property : float
+            whether the property is found
+        """
+        return loss if bool(find_property) else paddle.to_tensor(float("nan"))
+
+    @classmethod
+    def get_loss(cls, loss_params: dict) -> "TaskLoss":
+        """Get the loss module by the parameters.
+
+        By default, all the parameters are directly passed to the constructor.
+        If not, override this method.
+
+        Parameters
+        ----------
+        loss_params : dict
+            The loss parameters
+
+        Returns
+        -------
+        TaskLoss
+            The loss module
+        """
+        loss = cls(**loss_params)
+        return loss
diff --git a/deepmd/pd/model/__init__.py b/deepmd/pd/model/__init__.py
new file mode 100644
index 0000000000..6ceb116d85
--- /dev/null
+++ b/deepmd/pd/model/__init__.py
@@ -0,0 +1 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
diff --git a/deepmd/pd/model/atomic_model/__init__.py b/deepmd/pd/model/atomic_model/__init__.py
new file mode 100644
index 0000000000..68a7cc8f79
--- /dev/null
+++ b/deepmd/pd/model/atomic_model/__init__.py
@@ -0,0 +1,31 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""The atomic model provides the prediction of some property on each
+atom. Atomic models are not supposed to be accessed directly by
+users; they provide a convenient interface for the implementation of
+models.
+
+Taking the energy models as an example, a developer only needs to
+implement the atomic energy prediction via an atomic model, and the
+model can be automatically made by the `deepmd.dpmodel.make_model`
+method. The `DPModel` is made by
+```
+DPModel = make_model(DPAtomicModel)
+```
+
+"""
+
+from .base_atomic_model import (
+    BaseAtomicModel,
+)
+from .dp_atomic_model import (
+    DPAtomicModel,
+)
+from .energy_atomic_model import (
+    DPEnergyAtomicModel,
+)
+
+__all__ = [
+    "BaseAtomicModel",
+    "DPAtomicModel",
+    "DPEnergyAtomicModel",
+]
diff --git a/deepmd/pd/model/atomic_model/base_atomic_model.py b/deepmd/pd/model/atomic_model/base_atomic_model.py
new file mode 100644
index 0000000000..1100813fb4
--- /dev/null
+++ b/deepmd/pd/model/atomic_model/base_atomic_model.py
@@ -0,0 +1,579 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+
+import copy
+import logging
+from typing import (
+    Callable,
+    Optional,
+    Union,
+)
+
+import numpy as np
+import paddle
+
+from deepmd.dpmodel.atomic_model import (
+    make_base_atomic_model,
+)
+from deepmd.dpmodel.output_def import (
+    FittingOutputDef,
+    OutputVariableDef,
+)
+from deepmd.pd.utils import (
+    AtomExcludeMask,
+    PairExcludeMask,
+    env,
+)
+from deepmd.pd.utils.nlist import (
+    extend_input_and_build_neighbor_list,
+)
+from deepmd.pd.utils.stat import (
+    compute_output_stats,
+)
+from deepmd.pd.utils.utils import (
+    to_numpy_array,
+    to_paddle_tensor,
+)
+from deepmd.utils.finetune import (
+    get_index_between_two_maps,
+    map_atom_exclude_types,
+    map_pair_exclude_types,
+)
+from deepmd.utils.path import (
+    DPPath,
+)
+
+log = logging.getLogger(__name__)
+dtype = env.GLOBAL_PD_FLOAT_PRECISION
+device = env.DEVICE
+
+BaseAtomicModel_ = make_base_atomic_model(paddle.Tensor)
+
+
+class BaseAtomicModel(paddle.nn.Layer, BaseAtomicModel_):
+    """The base of atomic model.
+
+    Parameters
+    ----------
+    type_map
+        Mapping atom type to the name (str) of the type.
+        For example `type_map[1]` gives the name of the type 1.
+    atom_exclude_types
+        Exclude the atomic contribution of the given types.
+    pair_exclude_types
+        Exclude the pair of atoms of the given types from computing the output
+        of the atomic model. Implemented by removing the pairs from the nlist.
+    rcond : float, optional
+        The condition number for the regression of atomic energy.
+    preset_out_bias : dict[str, list[Optional[paddle.Tensor]]], optional
+        Specifying atomic energy contribution in vacuum. Given by key:value pairs.
+        The value is a list specifying the bias; the elements can be None or an
+        np.ndarray of the output shape.
+        For example: [None, [2.]] means type 0 is not set, type 1 is set to [2.].
+        The `set_davg_zero` key in the descriptor should be set.
+
+    """
+
+    def __init__(
+        self,
+        type_map: list[str],
+        atom_exclude_types: list[int] = [],
+        pair_exclude_types: list[tuple[int, int]] = [],
+        rcond: Optional[float] = None,
+        preset_out_bias: Optional[dict[str, np.ndarray]] = None,
+    ):
+        paddle.nn.Layer.__init__(self)
+        BaseAtomicModel_.__init__(self)
+        self.type_map = type_map
+        self.reinit_atom_exclude(atom_exclude_types)
+        self.reinit_pair_exclude(pair_exclude_types)
+        self.rcond = rcond
+        self.preset_out_bias = preset_out_bias
+
+    def init_out_stat(self):
+        """Initialize the output bias."""
+        ntypes = self.get_ntypes()
+        self.bias_keys: list[str] = list(self.fitting_output_def().keys())
+        self.max_out_size = max(
+            [self.atomic_output_def()[kk].size for kk in self.bias_keys]
+        )
+        self.n_out = len(self.bias_keys)
+        out_bias_data = self._default_bias()
+        out_std_data = self._default_std()
+        self.register_buffer("out_bias", out_bias_data)
+        self.register_buffer("out_std", out_std_data)
+
+    def set_out_bias(self, out_bias: paddle.Tensor) -> None:
+        self.out_bias = out_bias
+
+    def __setitem__(self, key, value):
+        if key in ["out_bias"]:
+            self.out_bias = value
+        elif key in ["out_std"]:
+            self.out_std = value
+        else:
+            raise KeyError(key)
+
+    def __getitem__(self, key):
+        if key in ["out_bias"]:
+            return self.out_bias
+        elif key in ["out_std"]:
+            return self.out_std
+        else:
+            raise KeyError(key)
+
+    def get_type_map(self) -> list[str]:
+        """Get the type map."""
+        return self.type_map
+
+    def reinit_atom_exclude(
+        self,
+        exclude_types: list[int] = [],
+    ):
+        self.atom_exclude_types = exclude_types
+        if exclude_types == []:
+            self.atom_excl = None
+        else:
+            self.atom_excl = AtomExcludeMask(self.get_ntypes(), self.atom_exclude_types)
+
+    def reinit_pair_exclude(
+        self,
+        exclude_types: list[tuple[int, int]] = [],
+    ):
+        self.pair_exclude_types = exclude_types
+        if exclude_types == []:
+            self.pair_excl = None
+        else:
+            self.pair_excl = PairExcludeMask(self.get_ntypes(), self.pair_exclude_types)
+
+    # to make jit happy...
+    def make_atom_mask(
+        self,
+        atype: paddle.Tensor,
+    ) -> paddle.Tensor:
+        """The atoms with type < 0 are treated as virtual atoms,
+        which serve as place-holders for multi-frame calculations
+        with different numbers of atoms in different frames.
+
+        Parameters
+        ----------
+        atype
+            Atom types; >= 0 for real atoms, < 0 for virtual atoms.
+
+        Returns
+        -------
+        mask
+            True for real atoms and False for virtual atoms.
+
+        """
+        # supposed to be supported by all backends
+        return atype >= 0
+
+    def atomic_output_def(self) -> FittingOutputDef:
+        old_def = self.fitting_output_def()
+        old_list = list(old_def.get_data().values())
+        return FittingOutputDef(
+            old_list  # noqa:RUF005
+            + [
+                OutputVariableDef(
+                    name="mask",
+                    shape=[1],
+                    reducible=False,
+                    r_differentiable=False,
+                    c_differentiable=False,
+                )
+            ]
+        )
+
+    def forward_common_atomic(
+        self,
+        extended_coord: paddle.Tensor,
+        extended_atype: paddle.Tensor,
+        nlist: paddle.Tensor,
+        mapping: Optional[paddle.Tensor] = None,
+        fparam: Optional[paddle.Tensor] = None,
+        aparam: Optional[paddle.Tensor] = None,
+        comm_dict: Optional[dict[str, paddle.Tensor]] = None,
+    ) -> dict[str, paddle.Tensor]:
+        """Common interface for atomic inference.
+
+        This method accepts extended coordinates, extended atom types, and a
+        neighbor list, and predicts the atomic contribution of the fitted property.
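+        Virtual atoms (type < 0) are handled here: their types are mapped to 0
+        before `forward_atomic` is called, and their contributions are zeroed
+        out afterwards through the returned ret_dict["mask"].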
+
+        Parameters
+        ----------
+        extended_coord
+            extended coordinates, shape: nf x (nall x 3)
+        extended_atype
+            extended atom types, shape: nf x nall.
+            A type < 0 indicates a virtual atom.
+        nlist
+            neighbor list, shape: nf x nloc x nsel
+        mapping
+            extended to local index mapping, shape: nf x nall
+        fparam
+            frame parameters, shape: nf x dim_fparam
+        aparam
+            atomic parameters, shape: nf x nloc x dim_aparam
+        comm_dict
+            The data needed for communication for parallel inference.
+
+        Returns
+        -------
+        ret_dict
+            dict of output atomic properties, following the definition of
+            `fitting_output_def`.
+            ret_dict["mask"] of shape nf x nloc will be provided.
+            ret_dict["mask"][ff,ii] == 1 indicates that the ii-th atom of the ff-th frame is real.
+            ret_dict["mask"][ff,ii] == 0 indicates that the ii-th atom of the ff-th frame is virtual.
+
+        """
+        _, nloc, _ = nlist.shape
+        atype = extended_atype[:, :nloc]
+
+        if self.pair_excl is not None:
+            pair_mask = self.pair_excl(nlist, extended_atype)
+            # exclude neighbors in the nlist
+            nlist = paddle.where(pair_mask == 1, nlist, -1)
+
+        ext_atom_mask = self.make_atom_mask(extended_atype)
+        ret_dict = self.forward_atomic(
+            extended_coord,
+            paddle.where(
+                ext_atom_mask, extended_atype, paddle.zeros_like(extended_atype)
+            ),
+            nlist,
+            mapping=mapping,
+            fparam=fparam,
+            aparam=aparam,
+            comm_dict=comm_dict,
+        )
+        ret_dict = self.apply_out_stat(ret_dict, atype)
+
+        # nf x nloc
+        atom_mask = ext_atom_mask[:, :nloc].astype(paddle.int32)
+        if self.atom_excl is not None:
+            atom_mask *= self.atom_excl(atype)
+
+        for kk in ret_dict.keys():
+            out_shape = ret_dict[kk].shape
+            out_shape2 = 1
+            for ss in out_shape[2:]:
+                out_shape2 *= ss
+            ret_dict[kk] = (
+                ret_dict[kk].reshape([out_shape[0], out_shape[1], out_shape2])
+                * atom_mask.unsqueeze(2).astype(ret_dict[kk].dtype)
+            ).reshape(out_shape)
+        ret_dict["mask"] = atom_mask
+
+        return ret_dict
+
+    def forward(
+        self,
+        extended_coord: paddle.Tensor,
+        extended_atype: paddle.Tensor,
+        nlist: paddle.Tensor,
+        mapping: Optional[paddle.Tensor] = None,
+        fparam: Optional[paddle.Tensor] = None,
+        aparam: Optional[paddle.Tensor] = None,
+        comm_dict: Optional[dict[str, paddle.Tensor]] = None,
+    ) -> dict[str, paddle.Tensor]:
+        return self.forward_common_atomic(
+            extended_coord,
+            extended_atype,
+            nlist,
+            mapping=mapping,
+            fparam=fparam,
+            aparam=aparam,
+            comm_dict=comm_dict,
+        )
+
+    def change_type_map(
+        self, type_map: list[str], model_with_new_type_stat=None
+    ) -> None:
+        """Change the type related params to new ones, according to `type_map` and the original one in the model.
+        If there are new types in `type_map`, statistics will be updated according to `model_with_new_type_stat` for these new types.
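+        For example, changing `type_map` from ["O", "H"] to ["H", "O"] only
+        permutes the per-type rows of `out_bias` and `out_std` through
+        `remap_index`; a genuinely new type first appends a zero bias row and
+        a unit std row before the permutation is applied.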
+ """ + remap_index, has_new_type = get_index_between_two_maps(self.type_map, type_map) + self.type_map = type_map + self.reinit_atom_exclude( + map_atom_exclude_types(self.atom_exclude_types, remap_index) + ) + self.reinit_pair_exclude( + map_pair_exclude_types(self.pair_exclude_types, remap_index) + ) + if has_new_type: + extend_shape = [ + self.out_bias.shape[0], + len(type_map), + *list(self.out_bias.shape[2:]), + ] + extend_bias = paddle.zeros(extend_shape, dtype=self.out_bias.dtype).to( + device=self.out_bias.place + ) + self.out_bias = paddle.concat([self.out_bias, extend_bias], axis=1) + extend_std = paddle.ones(extend_shape, dtype=self.out_std.dtype).to( + device=self.out_std.place + ) + self.out_std = paddle.concat([self.out_std, extend_std], axis=1) + self.out_bias = self.out_bias[:, remap_index, :] + self.out_std = self.out_std[:, remap_index, :] + + def serialize(self) -> dict: + return { + "type_map": self.type_map, + "atom_exclude_types": self.atom_exclude_types, + "pair_exclude_types": self.pair_exclude_types, + "rcond": self.rcond, + "preset_out_bias": self.preset_out_bias, + "@variables": { + "out_bias": to_numpy_array(self.out_bias), + "out_std": to_numpy_array(self.out_std), + }, + } + + @classmethod + def deserialize(cls, data: dict) -> "BaseAtomicModel": + data = copy.deepcopy(data) + variables = data.pop("@variables", None) + variables = ( + {"out_bias": None, "out_std": None} if variables is None else variables + ) + obj = cls(**data) + obj["out_bias"] = ( + to_paddle_tensor(variables["out_bias"]) + if variables["out_bias"] is not None + else obj._default_bias() + ) + obj["out_std"] = ( + to_paddle_tensor(variables["out_std"]) + if variables["out_std"] is not None + else obj._default_std() + ) + return obj + + def compute_or_load_stat( + self, + merged: Union[Callable[[], list[dict]], list[dict]], + stat_file_path: Optional[DPPath] = None, + ): + """ + Compute the output statistics (e.g. energy bias) for the fitting net from packed data. + + Parameters + ---------- + merged : Union[Callable[[], list[dict]], list[dict]] + - list[dict]: A list of data samples from various data systems. + Each element, `merged[i]`, is a data dictionary containing `keys`: `paddle.Tensor` + originating from the `i`-th data system. + - Callable[[], list[dict]]: A lazy function that returns data samples in the above format + only when needed. Since the sampling process can be slow and memory-intensive, + the lazy function helps by only sampling once. + stat_file_path : Optional[DPPath] + The path to the stat file. + + """ + raise NotImplementedError + + def compute_or_load_out_stat( + self, + merged: Union[Callable[[], list[dict]], list[dict]], + stat_file_path: Optional[DPPath] = None, + ): + """ + Compute the output statistics (e.g. energy bias) for the fitting net from packed data. + + Parameters + ---------- + merged : Union[Callable[[], list[dict]], list[dict]] + - list[dict]: A list of data samples from various data systems. + Each element, `merged[i]`, is a data dictionary containing `keys`: `paddle.Tensor` + originating from the `i`-th data system. + - Callable[[], list[dict]]: A lazy function that returns data samples in the above format + only when needed. Since the sampling process can be slow and memory-intensive, + the lazy function helps by only sampling once. + stat_file_path : Optional[DPPath] + The path to the stat file. 
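+        Unlike `compute_or_load_stat`, which concrete models must override,
+        this helper is already implemented here: it delegates to
+        `change_out_bias` with the "set-by-statistic" mode.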
+ + """ + self.change_out_bias( + merged, + stat_file_path=stat_file_path, + bias_adjust_mode="set-by-statistic", + ) + + def apply_out_stat( + self, + ret: dict[str, paddle.Tensor], + atype: paddle.Tensor, + ): + """Apply the stat to each atomic output. + The developer may override the method to define how the bias is applied + to the atomic output of the model. + + Parameters + ---------- + ret + The returned dict by the forward_atomic method + atype + The atom types. nf x nloc + + """ + out_bias, out_std = self._fetch_out_stat(self.bias_keys) + for kk in self.bias_keys: + # nf x nloc x odims, out_bias: ntypes x odims + ret[kk] = ret[kk] + out_bias[kk][atype] + return ret + + def change_out_bias( + self, + sample_merged, + stat_file_path: Optional[DPPath] = None, + bias_adjust_mode="change-by-statistic", + ) -> None: + """Change the output bias according to the input data and the pretrained model. + + Parameters + ---------- + sample_merged : Union[Callable[[], list[dict]], list[dict]] + - list[dict]: A list of data samples from various data systems. + Each element, `merged[i]`, is a data dictionary containing `keys`: `paddle.Tensor` + originating from the `i`-th data system. + - Callable[[], list[dict]]: A lazy function that returns data samples in the above format + only when needed. Since the sampling process can be slow and memory-intensive, + the lazy function helps by only sampling once. + bias_adjust_mode : str + The mode for changing output bias : ['change-by-statistic', 'set-by-statistic'] + 'change-by-statistic' : perform predictions on labels of target dataset, + and do least square on the errors to obtain the target shift as bias. + 'set-by-statistic' : directly use the statistic output bias in the target dataset. + stat_file_path : Optional[DPPath] + The path to the stat file. 
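+
+        For example, fine-tuning a pretrained model on a new dataset typically
+        uses "change-by-statistic", so that only the residual shift between
+        the current predictions and the new labels is added to the existing
+        bias, while "set-by-statistic" overwrites the bias with the dataset
+        statistic.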
+ """ + if bias_adjust_mode == "change-by-statistic": + delta_bias, out_std = compute_output_stats( + sample_merged, + self.get_ntypes(), + keys=list(self.atomic_output_def().keys()), + stat_file_path=stat_file_path, + model_forward=self._get_forward_wrapper_func(), + rcond=self.rcond, + preset_bias=self.preset_out_bias, + atomic_output=self.atomic_output_def(), + ) + self._store_out_stat(delta_bias, out_std, add=True) + elif bias_adjust_mode == "set-by-statistic": + bias_out, std_out = compute_output_stats( + sample_merged, + self.get_ntypes(), + keys=list(self.atomic_output_def().keys()), + stat_file_path=stat_file_path, + rcond=self.rcond, + preset_bias=self.preset_out_bias, + atomic_output=self.atomic_output_def(), + ) + self._store_out_stat(bias_out, std_out) + else: + raise RuntimeError("Unknown bias_adjust_mode mode: " + bias_adjust_mode) + + def _get_forward_wrapper_func(self) -> Callable[..., paddle.Tensor]: + """Get a forward wrapper of the atomic model for output bias calculation.""" + + def model_forward(coord, atype, box, fparam=None, aparam=None): + with ( + paddle.no_grad() + ): # it's essential for pure paddle forward function to use auto_batchsize + ( + extended_coord, + extended_atype, + mapping, + nlist, + ) = extend_input_and_build_neighbor_list( + coord, + atype, + self.get_rcut(), + self.get_sel(), + mixed_types=self.mixed_types(), + box=box, + ) + atomic_ret = self.forward_common_atomic( + extended_coord, + extended_atype, + nlist, + mapping=mapping, + fparam=fparam, + aparam=aparam, + ) + return {kk: vv.detach() for kk, vv in atomic_ret.items()} + + return model_forward + + def _default_bias(self): + ntypes = self.get_ntypes() + return paddle.zeros([self.n_out, ntypes, self.max_out_size], dtype=dtype).to( + device=device + ) + + def _default_std(self): + ntypes = self.get_ntypes() + return paddle.ones([self.n_out, ntypes, self.max_out_size], dtype=dtype).to( + device=device + ) + + def _varsize( + self, + shape: list[int], + ) -> int: + output_size = 1 + len_shape = len(shape) + for i in range(len_shape): + output_size *= shape[i] + return output_size + + def _get_bias_index( + self, + kk: str, + ) -> int: + res: list[int] = [] + for i, e in enumerate(self.bias_keys): + if e == kk: + res.append(i) + assert len(res) == 1 + return res[0] + + def _store_out_stat( + self, + out_bias: dict[str, paddle.Tensor], + out_std: dict[str, paddle.Tensor], + add: bool = False, + ): + ntypes = self.get_ntypes() + out_bias_data = paddle.clone(self.out_bias) + out_std_data = paddle.clone(self.out_std) + for kk in out_bias.keys(): + assert kk in out_std.keys() + idx = self._get_bias_index(kk) + size = self._varsize(self.atomic_output_def()[kk].shape) + if not add: + out_bias_data[idx, :, :size] = out_bias[kk].reshape([ntypes, size]) + else: + out_bias_data[idx, :, :size] += out_bias[kk].reshape([ntypes, size]) + out_std_data[idx, :, :size] = out_std[kk].reshape([ntypes, size]) + paddle.assign(out_bias_data, self.out_bias) + paddle.assign(out_std_data, self.out_std) + + def _fetch_out_stat( + self, + keys: list[str], + ) -> tuple[dict[str, paddle.Tensor], dict[str, paddle.Tensor]]: + ret_bias = {} + ret_std = {} + ntypes = self.get_ntypes() + for kk in keys: + idx = self._get_bias_index(kk) + isize = self._varsize(self.atomic_output_def()[kk].shape) + ret_bias[kk] = self.out_bias[idx, :, :isize].reshape( + [ntypes] + list(self.atomic_output_def()[kk].shape) # noqa: RUF005 + ) + ret_std[kk] = self.out_std[idx, :, :isize].reshape( + [ntypes] + list(self.atomic_output_def()[kk].shape) # 
noqa: RUF005 + ) + return ret_bias, ret_std diff --git a/deepmd/pd/model/atomic_model/dp_atomic_model.py b/deepmd/pd/model/atomic_model/dp_atomic_model.py new file mode 100644 index 0000000000..47b881e0cc --- /dev/null +++ b/deepmd/pd/model/atomic_model/dp_atomic_model.py @@ -0,0 +1,333 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +import copy +import functools +import logging +from typing import ( + Optional, +) + +import paddle + +from deepmd.dpmodel import ( + FittingOutputDef, +) +from deepmd.pd.model.descriptor.base_descriptor import ( + BaseDescriptor, +) +from deepmd.pd.model.task.base_fitting import ( + BaseFitting, +) +from deepmd.utils.path import ( + DPPath, +) +from deepmd.utils.version import ( + check_version_compatibility, +) + +from .base_atomic_model import ( + BaseAtomicModel, +) + +log = logging.getLogger(__name__) + + +@BaseAtomicModel.register("standard") +class DPAtomicModel(BaseAtomicModel): + """Model give atomic prediction of some physical property. + + Parameters + ---------- + descriptor + Descriptor + fitting_net + Fitting net + type_map + Mapping atom type to the name (str) of the type. + For example `type_map[1]` gives the name of the type 1. + """ + + def __init__( + self, + descriptor, + fitting, + type_map: list[str], + **kwargs, + ): + super().__init__(type_map, **kwargs) + ntypes = len(type_map) + self.type_map = type_map + self.ntypes = ntypes + self.descriptor = descriptor + self.rcut = self.descriptor.get_rcut() + self.sel = self.descriptor.get_sel() + self.fitting_net = fitting + super().init_out_stat() + self.enable_eval_descriptor_hook = False + self.eval_descriptor_list = [] + + # register 'type_map' as buffer + def _string_to_array(s: str) -> list[int]: + return [ord(c) for c in s] + + self.register_buffer( + "buffer_type_map", + paddle.to_tensor(_string_to_array(" ".join(self.type_map)), dtype="int32"), + ) + self.buffer_type_map.name = "buffer_type_map" + if hasattr(self.descriptor, "has_message_passing"): + # register 'has_message_passing' as buffer(cast to int32 as problems may meets with vector) + self.register_buffer( + "buffer_has_message_passing", + paddle.to_tensor(self.descriptor.has_message_passing(), dtype="int32"), + ) + self.buffer_has_message_passing.name = "buffer_has_message_passing" + # register 'ntypes' as buffer + self.register_buffer( + "buffer_ntypes", paddle.to_tensor(self.ntypes, dtype="int32") + ) + self.buffer_ntypes.name = "buffer_ntypes" + # register 'rcut' as buffer + self.register_buffer( + "buffer_rcut", paddle.to_tensor(self.rcut, dtype="float64") + ) + self.buffer_rcut.name = "buffer_rcut" + if hasattr(self.fitting_net, "get_dim_fparam"): + # register 'dfparam' as buffer + self.register_buffer( + "buffer_dfparam", + paddle.to_tensor(self.fitting_net.get_dim_fparam(), dtype="int32"), + ) + self.buffer_dfparam.name = "buffer_dfparam" + if hasattr(self.fitting_net, "get_dim_aparam"): + # register 'daparam' as buffer + self.register_buffer( + "buffer_daparam", + paddle.to_tensor(self.fitting_net.get_dim_aparam(), dtype="int32"), + ) + self.buffer_daparam.name = "buffer_daparam" + # register 'aparam_nall' as buffer + self.register_buffer( + "buffer_aparam_nall", + paddle.to_tensor(False, dtype="int32"), + ) + self.buffer_aparam_nall.name = "buffer_aparam_nall" + + eval_descriptor_list: list[paddle.Tensor] + + def set_eval_descriptor_hook(self, enable: bool) -> None: + """Set the hook for evaluating descriptor and clear the cache for descriptor list.""" + self.enable_eval_descriptor_hook = enable + 
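+        # resetting the cached list below keeps descriptors from a previous
+        # evaluation from leaking into the next eval_descriptor() call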
self.eval_descriptor_list = [] + + def eval_descriptor(self) -> paddle.Tensor: + """Evaluate the descriptor.""" + return paddle.concat(self.eval_descriptor_list) + + def fitting_output_def(self) -> FittingOutputDef: + """Get the output def of the fitting net.""" + return ( + self.fitting_net.output_def() + if self.fitting_net is not None + else self.coord_denoise_net.output_def() + ) + + def get_rcut(self) -> float: + """Get the cut-off radius.""" + return self.rcut + + def get_sel(self) -> list[int]: + """Get the neighbor selection.""" + return self.sel + + def mixed_types(self) -> bool: + """If true, the model + 1. assumes total number of atoms aligned across frames; + 2. uses a neighbor list that does not distinguish different atomic types. + + If false, the model + 1. assumes total number of atoms of each atom type aligned across frames; + 2. uses a neighbor list that distinguishes different atomic types. + + """ + return self.descriptor.mixed_types() + + def change_type_map( + self, type_map: list[str], model_with_new_type_stat=None + ) -> None: + """Change the type related params to new ones, according to `type_map` and the original one in the model. + If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types. + """ + super().change_type_map( + type_map=type_map, model_with_new_type_stat=model_with_new_type_stat + ) + self.type_map = type_map + self.ntypes = len(type_map) + self.descriptor.change_type_map( + type_map=type_map, + model_with_new_type_stat=model_with_new_type_stat.descriptor + if model_with_new_type_stat is not None + else None, + ) + self.fitting_net.change_type_map(type_map=type_map) + + def has_message_passing(self) -> bool: + """Returns whether the atomic model has message passing.""" + return self.descriptor.has_message_passing() + + def need_sorted_nlist_for_lower(self) -> bool: + """Returns whether the atomic model needs sorted nlist when using `forward_lower`.""" + return self.descriptor.need_sorted_nlist_for_lower() + + def serialize(self) -> dict: + dd = BaseAtomicModel.serialize(self) + dd.update( + { + "@class": "Model", + "@version": 2, + "type": "standard", + "type_map": self.type_map, + "descriptor": self.descriptor.serialize(), + "fitting": self.fitting_net.serialize(), + } + ) + return dd + + @classmethod + def deserialize(cls, data) -> "DPAtomicModel": + data = copy.deepcopy(data) + check_version_compatibility(data.pop("@version", 1), 2, 1) + data.pop("@class", None) + data.pop("type", None) + descriptor_obj = BaseDescriptor.deserialize(data.pop("descriptor")) + fitting_obj = BaseFitting.deserialize(data.pop("fitting")) + data["descriptor"] = descriptor_obj + data["fitting"] = fitting_obj + obj = super().deserialize(data) + return obj + + def forward_atomic( + self, + extended_coord, + extended_atype, + nlist, + mapping: Optional[paddle.Tensor] = None, + fparam: Optional[paddle.Tensor] = None, + aparam: Optional[paddle.Tensor] = None, + comm_dict: Optional[dict[str, paddle.Tensor]] = None, + ) -> dict[str, paddle.Tensor]: + """Return atomic prediction. + + Parameters + ---------- + extended_coord + coordinates in extended region + extended_atype + atomic type in extended region + nlist + neighbor list. nf x nloc x nsel + mapping + mapps the extended indices to local indices + fparam + frame parameter. nf x ndf + aparam + atomic parameter. nf x nloc x nda + + Returns + ------- + result_dict + the result dict, defined by the `FittingOutputDef`. 
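+            For an energy model this would, e.g., contain an "energy" entry of
+            shape nf x nloc x 1 (illustrative; the actual keys come from the
+            fitting net's output definition).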
+ + """ + nframes, nloc, nnei = nlist.shape + atype = extended_atype[:, :nloc] + if self.do_grad_r() or self.do_grad_c(): + extended_coord.stop_gradient = False + descriptor, rot_mat, g2, h2, sw = self.descriptor( + extended_coord, + extended_atype, + nlist, + mapping=mapping, + comm_dict=comm_dict, + ) + assert descriptor is not None + if self.enable_eval_descriptor_hook: + self.eval_descriptor_list.append(descriptor) + # energy, force + fit_ret = self.fitting_net( + descriptor, + atype, + gr=rot_mat, + g2=g2, + h2=h2, + fparam=fparam, + aparam=aparam, + ) + return fit_ret + + def get_out_bias(self) -> paddle.Tensor: + return self.out_bias + + def compute_or_load_stat( + self, + sampled_func, + stat_file_path: Optional[DPPath] = None, + ): + """ + Compute or load the statistics parameters of the model, + such as mean and standard deviation of descriptors or the energy bias of the fitting net. + When `sampled` is provided, all the statistics parameters will be calculated (or re-calculated for update), + and saved in the `stat_file_path`(s). + When `sampled` is not provided, it will check the existence of `stat_file_path`(s) + and load the calculated statistics parameters. + + Parameters + ---------- + sampled_func + The lazy sampled function to get data frames from different data systems. + stat_file_path + The dictionary of paths to the statistics files. + """ + if stat_file_path is not None and self.type_map is not None: + # descriptors and fitting net with different type_map + # should not share the same parameters + stat_file_path /= " ".join(self.type_map) + + @functools.lru_cache + def wrapped_sampler(): + sampled = sampled_func() + if self.pair_excl is not None: + pair_exclude_types = self.pair_excl.get_exclude_types() + for sample in sampled: + sample["pair_exclude_types"] = list(pair_exclude_types) + if self.atom_excl is not None: + atom_exclude_types = self.atom_excl.get_exclude_types() + for sample in sampled: + sample["atom_exclude_types"] = list(atom_exclude_types) + return sampled + + self.descriptor.compute_input_stats(wrapped_sampler, stat_file_path) + self.compute_or_load_out_stat(wrapped_sampler, stat_file_path) + + def get_dim_fparam(self) -> int: + """Get the number (dimension) of frame parameters of this atomic model.""" + return self.fitting_net.get_dim_fparam() + + def get_dim_aparam(self) -> int: + """Get the number (dimension) of atomic parameters of this atomic model.""" + return self.fitting_net.get_dim_aparam() + + def get_sel_type(self) -> list[int]: + """Get the selected atom types of this model. + + Only atoms with selected atom types have atomic contribution + to the result of the model. + If returning an empty list, all atom types are selected. + """ + return self.fitting_net.get_sel_type() + + def is_aparam_nall(self) -> bool: + """Check whether the shape of atomic parameters is (nframes, nall, ndim). + + If False, the shape is (nframes, nloc, ndim). 
+ """ + return False diff --git a/deepmd/pd/model/atomic_model/energy_atomic_model.py b/deepmd/pd/model/atomic_model/energy_atomic_model.py new file mode 100644 index 0000000000..708ec9db7f --- /dev/null +++ b/deepmd/pd/model/atomic_model/energy_atomic_model.py @@ -0,0 +1,17 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +from deepmd.pd.model.task.ener import ( + EnergyFittingNet, + InvarFitting, +) + +from .dp_atomic_model import ( + DPAtomicModel, +) + + +class DPEnergyAtomicModel(DPAtomicModel): + def __init__(self, descriptor, fitting, type_map, **kwargs): + assert isinstance(fitting, EnergyFittingNet) or isinstance( + fitting, InvarFitting + ) + super().__init__(descriptor, fitting, type_map, **kwargs) diff --git a/deepmd/pd/model/descriptor/__init__.py b/deepmd/pd/model/descriptor/__init__.py new file mode 100644 index 0000000000..654643959b --- /dev/null +++ b/deepmd/pd/model/descriptor/__init__.py @@ -0,0 +1,22 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +from .base_descriptor import ( + BaseDescriptor, +) +from .descriptor import ( + DescriptorBlock, +) +from .env_mat import ( + prod_env_mat, +) +from .se_a import ( + DescrptBlockSeA, + DescrptSeA, +) + +__all__ = [ + "BaseDescriptor", + "DescriptorBlock", + "DescrptBlockSeA", + "DescrptSeA", + "prod_env_mat", +] diff --git a/deepmd/pd/model/descriptor/base_descriptor.py b/deepmd/pd/model/descriptor/base_descriptor.py new file mode 100644 index 0000000000..8f0b799f87 --- /dev/null +++ b/deepmd/pd/model/descriptor/base_descriptor.py @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +import paddle + +from deepmd.dpmodel.descriptor import ( + make_base_descriptor, +) + +BaseDescriptor = make_base_descriptor(paddle.Tensor, "forward") diff --git a/deepmd/pd/model/descriptor/descriptor.py b/deepmd/pd/model/descriptor/descriptor.py new file mode 100644 index 0000000000..36de5b1948 --- /dev/null +++ b/deepmd/pd/model/descriptor/descriptor.py @@ -0,0 +1,219 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +import logging +from abc import ( + ABC, + abstractmethod, +) +from typing import ( + Callable, + Optional, + Union, +) + +import paddle + +from deepmd.pd.utils import ( + env, +) +from deepmd.pd.utils.env_mat_stat import ( + EnvMatStatSe, +) +from deepmd.utils.env_mat_stat import ( + StatItem, +) +from deepmd.utils.path import ( + DPPath, +) +from deepmd.utils.plugin import ( + make_plugin_registry, +) + +log = logging.getLogger(__name__) + + +class DescriptorBlock(paddle.nn.Layer, ABC, make_plugin_registry("DescriptorBlock")): + """The building block of descriptor. + Given the input descriptor, provide with the atomic coordinates, + atomic types and neighbor list, calculate the new descriptor. 
+ """ + + local_cluster = False + + def __new__(cls, *args, **kwargs): + if cls is DescriptorBlock: + try: + descrpt_type = kwargs["type"] + except KeyError as e: + raise KeyError( + "the type of DescriptorBlock should be set by `type`" + ) from e + cls = cls.get_class_by_type(descrpt_type) + return super().__new__(cls) + + @abstractmethod + def get_rcut(self) -> float: + """Returns the cut-off radius.""" + pass + + @abstractmethod + def get_rcut_smth(self) -> float: + """Returns the radius where the neighbor information starts to smoothly decay to 0.""" + pass + + @abstractmethod + def get_nsel(self) -> int: + """Returns the number of selected atoms in the cut-off radius.""" + pass + + @abstractmethod + def get_sel(self) -> list[int]: + """Returns the number of selected atoms for each type.""" + pass + + @abstractmethod + def get_ntypes(self) -> int: + """Returns the number of element types.""" + pass + + @abstractmethod + def get_dim_out(self) -> int: + """Returns the output dimension.""" + pass + + @abstractmethod + def get_dim_in(self) -> int: + """Returns the input dimension.""" + pass + + @abstractmethod + def get_dim_emb(self) -> int: + """Returns the embedding dimension.""" + pass + + @abstractmethod + def get_env_protection(self) -> float: + """Returns the protection of building environment matrix.""" + pass + + def compute_input_stats( + self, + merged: Union[Callable[[], list[dict]], list[dict]], + path: Optional[DPPath] = None, + ): + """ + Compute the input statistics (e.g. mean and stddev) for the descriptors from packed data. + + Parameters + ---------- + merged : Union[Callable[[], list[dict]], list[dict]] + - list[dict]: A list of data samples from various data systems. + Each element, `merged[i]`, is a data dictionary containing `keys`: `paddle.Tensor` + originating from the `i`-th data system. + - Callable[[], list[dict]]: A lazy function that returns data samples in the above format + only when needed. Since the sampling process can be slow and memory-intensive, + the lazy function helps by only sampling once. + path : Optional[DPPath] + The path to the stat file. + + """ + raise NotImplementedError + + def get_stats(self) -> dict[str, StatItem]: + """Get the statistics of the descriptor.""" + raise NotImplementedError + + def share_params(self, base_class, shared_level, resume=False): + """ + Share the parameters of self to the base_class with shared_level during multitask training. + If not start from checkpoint (resume is False), + some separated parameters (e.g. mean and stddev) will be re-calculated across different classes. + """ + assert ( + self.__class__ == base_class.__class__ + ), "Only descriptors of the same type can share params!" 
+ if shared_level == 0: + # link buffers + if hasattr(self, "mean"): + if not resume: + # in case of change params during resume + base_env = EnvMatStatSe(base_class) + base_env.stats = base_class.stats + for kk in base_class.get_stats(): + base_env.stats[kk] += self.get_stats()[kk] + mean, stddev = base_env() + if not base_class.set_davg_zero: + paddle.assign( + paddle.to_tensor(mean).to(device=env.DEVICE), + base_class.mean, + ) + paddle.assign( + paddle.to_tensor(stddev).to(device=env.DEVICE), + base_class.stddev, + ) + # must share, even if not do stat + self.mean = base_class.mean + self.stddev = base_class.stddev + # self.set_state_dict(base_class.state_dict()) # this does not work, because it only inits the model + # the following will successfully link all the params except buffers + for item in self._sub_layers: + self._sub_layers[item] = base_class._sub_layers[item] + else: + raise NotImplementedError + + @abstractmethod + def forward( + self, + nlist: paddle.Tensor, + extended_coord: paddle.Tensor, + extended_atype: paddle.Tensor, + extended_atype_embd: Optional[paddle.Tensor] = None, + mapping: Optional[paddle.Tensor] = None, + ): + """Calculate DescriptorBlock.""" + pass + + @abstractmethod + def has_message_passing(self) -> bool: + """Returns whether the descriptor block has message passing.""" + + @abstractmethod + def need_sorted_nlist_for_lower(self) -> bool: + """Returns whether the descriptor block needs sorted nlist when using `forward_lower`.""" + + +def extend_descrpt_stat(des, type_map, des_with_stat=None): + r""" + Extend the statistics of a descriptor block with types from newly provided `type_map`. + + After extending, the type related dimension of the extended statistics will have a length of + `len(old_type_map) + len(type_map)`, where `old_type_map` represents the type map in `des`. + The `get_index_between_two_maps()` function can then be used to correctly select statistics for types + from `old_type_map` or `type_map`. + Positive indices from 0 to `len(old_type_map) - 1` will select old statistics of types in `old_type_map`, + while negative indices from `-len(type_map)` to -1 will select new statistics of types in `type_map`. + + Parameters + ---------- + des : DescriptorBlock + The descriptor block to be extended. + type_map : list[str] + The name of each type of atoms to be extended. + des_with_stat : DescriptorBlock, Optional + The descriptor block has additional statistics of types from newly provided `type_map`. + If None, the default statistics will be used. + Otherwise, the statistics provided in this DescriptorBlock will be used. 
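+
+    For example, with `old_type_map = ["O", "H"]` and `type_map = ["C"]`,
+    index 0 selects the old statistics of "O", while index -1 selects the
+    newly appended statistics of "C".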
+ + """ + if des_with_stat is not None: + extend_davg = des_with_stat["davg"] + extend_dstd = des_with_stat["dstd"] + else: + extend_shape = [len(type_map), *list(des["davg"].shape[1:])] + extend_davg = paddle.zeros(extend_shape, dtype=des["davg"].dtype).to( + device=des["davg"].place + ) + extend_dstd = paddle.ones(extend_shape, dtype=des["dstd"].dtype).to( + device=des["dstd"].place + ) + des["davg"] = paddle.concat([des["davg"], extend_davg], axis=0) + des["dstd"] = paddle.concat([des["dstd"], extend_dstd], axis=0) diff --git a/deepmd/pd/model/descriptor/env_mat.py b/deepmd/pd/model/descriptor/env_mat.py new file mode 100644 index 0000000000..3a9daec1e8 --- /dev/null +++ b/deepmd/pd/model/descriptor/env_mat.py @@ -0,0 +1,87 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later + +import paddle + +from deepmd.pd.utils import ( + decomp, +) +from deepmd.pd.utils.preprocess import ( + compute_smooth_weight, +) + + +def _make_env_mat( + nlist, + coord, + rcut: float, + ruct_smth: float, + radial_only: bool = False, + protection: float = 0.0, +): + """Make smooth environment matrix.""" + bsz, natoms, nnei = nlist.shape + coord = coord.reshape([bsz, -1, 3]) + nall = coord.shape[1] + mask = nlist >= 0 + # nlist = nlist * mask ## this impl will contribute nans in Hessian calculation. + nlist = paddle.where(mask, nlist, nall - 1) + coord_l = coord[:, :natoms].reshape([bsz, -1, 1, 3]) + index = nlist.reshape([bsz, -1]).unsqueeze(-1).expand([-1, -1, 3]) + # coord_r = paddle.take_along_axis(coord, axis=1, indices=index) + coord_r = decomp.take_along_axis(coord, axis=1, indices=index) + coord_r = coord_r.reshape([bsz, natoms, nnei, 3]) + diff = coord_r - coord_l + # length = paddle.linalg.norm(diff, axis=-1, keepdim=True) + length = decomp.norm(diff, axis=-1, keepdim=True) + # for index 0 nloc atom + length = length + (~mask.unsqueeze(-1)).astype(length.dtype) + t0 = 1 / (length + protection) + t1 = diff / (length + protection) ** 2 + weight = compute_smooth_weight(length, ruct_smth, rcut) + weight = weight * mask.unsqueeze(-1).astype(weight.dtype) + if radial_only: + env_mat = t0 * weight + else: + env_mat = paddle.concat([t0.astype(t1.dtype), t1], axis=-1) * weight + return env_mat, diff * mask.unsqueeze(-1).astype(diff.dtype), weight + + +def prod_env_mat( + extended_coord, + nlist, + atype, + mean, + stddev, + rcut: float, + rcut_smth: float, + radial_only: bool = False, + protection: float = 0.0, +): + """Generate smooth environment matrix from atom coordinates and other context. + + Args: + - extended_coord: Copied atom coordinates with shape [nframes, nall*3]. + - atype: Atom types with shape [nframes, nloc]. + - mean: Average value of descriptor per element type with shape [len(sec), nnei, 4 or 1]. + - stddev: Standard deviation of descriptor per element type with shape [len(sec), nnei, 4 or 1]. + - rcut: Cut-off radius. + - rcut_smth: Smooth hyper-parameter for pair force & energy. + - radial_only: Whether to return a full description or a radial-only descriptor. + - protection: Protection parameter to prevent division by zero errors during calculations. + + Returns + ------- + - env_mat: Shape is [nframes, natoms[1]*nnei*4]. 
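+    - diff: Relative coordinates to each neighbor, zeroed for masked entries.
+    - switch: The smooth switch function of each neighbor, zeroed for masked
+      entries.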
+ """ + _env_mat_se_a, diff, switch = _make_env_mat( + nlist, + extended_coord, + rcut, + rcut_smth, + radial_only, + protection=protection, + ) # shape [n_atom, dim, 4 or 1] + t_avg = mean[atype] # [n_atom, dim, 4 or 1] + t_std = stddev[atype] # [n_atom, dim, 4 or 1] + env_mat_se_a = (_env_mat_se_a - t_avg) / t_std + return env_mat_se_a, diff, switch diff --git a/deepmd/pd/model/descriptor/se_a.py b/deepmd/pd/model/descriptor/se_a.py new file mode 100644 index 0000000000..180d6f0a3f --- /dev/null +++ b/deepmd/pd/model/descriptor/se_a.py @@ -0,0 +1,715 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +import itertools +from typing import ( + Callable, + ClassVar, + Optional, + Union, +) + +import numpy as np +import paddle + +from deepmd.dpmodel.utils.seed import ( + child_seed, +) +from deepmd.pd.model.descriptor import ( + DescriptorBlock, + prod_env_mat, +) +from deepmd.pd.utils import ( + env, +) +from deepmd.pd.utils.env import ( + PRECISION_DICT, + RESERVED_PRECISON_DICT, +) +from deepmd.pd.utils.env_mat_stat import ( + EnvMatStatSe, +) +from deepmd.pd.utils.update_sel import ( + UpdateSel, +) +from deepmd.utils.data_system import ( + DeepmdDataSystem, +) +from deepmd.utils.env_mat_stat import ( + StatItem, +) +from deepmd.utils.path import ( + DPPath, +) +from deepmd.utils.version import ( + check_version_compatibility, +) + +try: + from typing import ( + Final, + ) +except ImportError: + pass + +from deepmd.dpmodel.utils import EnvMat as DPEnvMat +from deepmd.pd.model.network.mlp import ( + EmbeddingNet, + NetworkCollection, +) +from deepmd.pd.utils.exclude_mask import ( + PairExcludeMask, +) + +from .base_descriptor import ( + BaseDescriptor, +) + + +@BaseDescriptor.register("se_e2_a") +@BaseDescriptor.register("se_a") +class DescrptSeA(BaseDescriptor, paddle.nn.Layer): + def __init__( + self, + rcut, + rcut_smth, + sel, + neuron=[25, 50, 100], + axis_neuron=16, + set_davg_zero: bool = False, + activation_function: str = "tanh", + precision: str = "float64", + resnet_dt: bool = False, + exclude_types: list[tuple[int, int]] = [], + env_protection: float = 0.0, + type_one_side: bool = True, + trainable: bool = True, + seed: Optional[Union[int, list[int]]] = None, + ntypes: Optional[int] = None, # to be compat with input + type_map: Optional[list[str]] = None, + # not implemented + spin=None, + ): + del ntypes + if spin is not None: + raise NotImplementedError("old implementation of spin is not supported.") + super().__init__() + self.type_map = type_map + self.compress = False + self.sea = DescrptBlockSeA( + rcut, + rcut_smth, + sel, + neuron=neuron, + axis_neuron=axis_neuron, + set_davg_zero=set_davg_zero, + activation_function=activation_function, + precision=precision, + resnet_dt=resnet_dt, + exclude_types=exclude_types, + env_protection=env_protection, + type_one_side=type_one_side, + trainable=trainable, + seed=seed, + ) + + def get_rcut(self) -> float: + """Returns the cut-off radius.""" + return self.sea.get_rcut() + + def get_rcut_smth(self) -> float: + """Returns the radius where the neighbor information starts to smoothly decay to 0.""" + return self.sea.get_rcut_smth() + + def get_nsel(self) -> int: + """Returns the number of selected atoms in the cut-off radius.""" + return self.sea.get_nsel() + + def get_sel(self) -> list[int]: + """Returns the number of selected atoms for each type.""" + return self.sea.get_sel() + + def get_ntypes(self) -> int: + """Returns the number of element types.""" + return self.sea.get_ntypes() + + def get_type_map(self) -> list[str]: + 
"""Get the name to each type of atoms.""" + return self.type_map + + def get_dim_out(self) -> int: + """Returns the output dimension.""" + return self.sea.get_dim_out() + + def get_dim_emb(self) -> int: + """Returns the output dimension.""" + return self.sea.get_dim_emb() + + def mixed_types(self): + """Returns if the descriptor requires a neighbor list that distinguish different + atomic types or not. + """ + return self.sea.mixed_types() + + def has_message_passing(self) -> bool: + """Returns whether the descriptor has message passing.""" + return self.sea.has_message_passing() + + def need_sorted_nlist_for_lower(self) -> bool: + """Returns whether the descriptor needs sorted nlist when using `forward_lower`.""" + return self.sea.need_sorted_nlist_for_lower() + + def get_env_protection(self) -> float: + """Returns the protection of building environment matrix.""" + return self.sea.get_env_protection() + + def share_params(self, base_class, shared_level, resume=False): + """ + Share the parameters of self to the base_class with shared_level during multitask training. + If not start from checkpoint (resume is False), + some separated parameters (e.g. mean and stddev) will be re-calculated across different classes. + """ + assert ( + self.__class__ == base_class.__class__ + ), "Only descriptors of the same type can share params!" + # For SeA descriptors, the user-defined share-level + # shared_level: 0 + # share all parameters in sea + if shared_level == 0: + self.sea.share_params(base_class.sea, 0, resume=resume) + # Other shared levels + else: + raise NotImplementedError + + @property + def dim_out(self): + """Returns the output dimension of this descriptor.""" + return self.sea.dim_out + + def change_type_map( + self, type_map: list[str], model_with_new_type_stat=None + ) -> None: + """Change the type related params to new ones, according to `type_map` and the original one in the model. + If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types. + """ + raise NotImplementedError( + "Descriptor se_e2_a does not support changing for type related params!" + "This feature is currently not implemented because it would require additional work to support the non-mixed-types case. " + "We may consider adding this support in the future if there is a clear demand for it." + ) + + def compute_input_stats( + self, + merged: Union[Callable[[], list[dict]], list[dict]], + path: Optional[DPPath] = None, + ): + """ + Compute the input statistics (e.g. mean and stddev) for the descriptors from packed data. + + Parameters + ---------- + merged : Union[Callable[[], list[dict]], list[dict]] + - list[dict]: A list of data samples from various data systems. + Each element, `merged[i]`, is a data dictionary containing `keys`: `paddle.Tensor` + originating from the `i`-th data system. + - Callable[[], list[dict]]: A lazy function that returns data samples in the above format + only when needed. Since the sampling process can be slow and memory-intensive, + the lazy function helps by only sampling once. + path : Optional[DPPath] + The path to the stat file. 
+
+        """
+        return self.sea.compute_input_stats(merged, path)
+
+    def reinit_exclude(
+        self,
+        exclude_types: list[tuple[int, int]] = [],
+    ):
+        """Update the type exclusions."""
+        self.sea.reinit_exclude(exclude_types)
+
+    def forward(
+        self,
+        coord_ext: paddle.Tensor,
+        atype_ext: paddle.Tensor,
+        nlist: paddle.Tensor,
+        mapping: Optional[paddle.Tensor] = None,
+        comm_dict: Optional[dict[str, paddle.Tensor]] = None,
+    ):
+        """Compute the descriptor.
+
+        Parameters
+        ----------
+        coord_ext
+            The extended coordinates of atoms. shape: nf x (nall x 3)
+        atype_ext
+            The extended atom types. shape: nf x nall
+        nlist
+            The neighbor list. shape: nf x nloc x nnei
+        mapping
+            The index mapping, not required by this descriptor.
+        comm_dict
+            The data needed for communication for parallel inference.
+
+        Returns
+        -------
+        descriptor
+            The descriptor. shape: nf x nloc x (ng x axis_neuron)
+        gr
+            The rotationally equivariant and permutationally invariant single particle
+            representation. shape: nf x nloc x ng x 3
+        g2
+            The rotationally invariant pair-particle representation.
+            This descriptor returns None.
+        h2
+            The rotationally equivariant pair-particle representation.
+            This descriptor returns None.
+        sw
+            The smooth switch function.
+
+        """
+        return self.sea.forward(nlist, coord_ext, atype_ext, None, mapping)
+
+    def set_stat_mean_and_stddev(
+        self,
+        mean: paddle.Tensor,
+        stddev: paddle.Tensor,
+    ) -> None:
+        """Update mean and stddev for descriptor."""
+        self.sea.mean = mean
+        self.sea.stddev = stddev
+
+    def get_stat_mean_and_stddev(self) -> tuple[paddle.Tensor, paddle.Tensor]:
+        """Get mean and stddev for descriptor."""
+        return self.sea.mean, self.sea.stddev
+
+    def serialize(self) -> dict:
+        obj = self.sea
+        return {
+            "@class": "Descriptor",
+            "type": "se_e2_a",
+            "@version": 2,
+            "rcut": obj.rcut,
+            "rcut_smth": obj.rcut_smth,
+            "sel": obj.sel,
+            "neuron": obj.neuron,
+            "axis_neuron": obj.axis_neuron,
+            "resnet_dt": obj.resnet_dt,
+            "set_davg_zero": obj.set_davg_zero,
+            "activation_function": obj.activation_function,
+            # make deterministic
+            "precision": RESERVED_PRECISON_DICT[obj.prec],
+            "embeddings": obj.filter_layers.serialize(),
+            "env_mat": DPEnvMat(obj.rcut, obj.rcut_smth).serialize(),
+            "exclude_types": obj.exclude_types,
+            "env_protection": obj.env_protection,
+            "@variables": {
+                "davg": obj["davg"].numpy(),
+                "dstd": obj["dstd"].numpy(),
+            },
+            "type_map": self.type_map,
+            ## to be updated when the options are supported.
+            "trainable": True,
+            "type_one_side": obj.type_one_side,
+            "spin": None,
+        }
+
+    @classmethod
+    def deserialize(cls, data: dict) -> "DescrptSeA":
+        data = data.copy()
+        check_version_compatibility(data.pop("@version", 1), 2, 1)
+        data.pop("@class", None)
+        data.pop("type", None)
+        variables = data.pop("@variables")
+        embeddings = data.pop("embeddings")
+        env_mat = data.pop("env_mat")
+        obj = cls(**data)
+
+        def t_cvt(xx):
+            return paddle.to_tensor(xx, dtype=obj.sea.prec).to(device=env.DEVICE)
+
+        obj.sea["davg"] = t_cvt(variables["davg"])
+        obj.sea["dstd"] = t_cvt(variables["dstd"])
+        obj.sea.filter_layers = NetworkCollection.deserialize(embeddings)
+        return obj
+
+    @classmethod
+    def update_sel(
+        cls,
+        train_data: DeepmdDataSystem,
+        type_map: Optional[list[str]],
+        local_jdata: dict,
+    ) -> tuple[dict, Optional[float]]:
+        """Update the selection and perform neighbor statistics.
+ + Parameters + ---------- + train_data : DeepmdDataSystem + data used to do neighbor statistics + type_map : list[str], optional + The name of each type of atoms + local_jdata : dict + The local data refer to the current class + + Returns + ------- + dict + The updated local data + float + The minimum distance between two atoms + """ + local_jdata_cpy = local_jdata.copy() + min_nbor_dist, local_jdata_cpy["sel"] = UpdateSel().update_one_sel( + train_data, type_map, local_jdata_cpy["rcut"], local_jdata_cpy["sel"], False + ) + return local_jdata_cpy, min_nbor_dist + + +@DescriptorBlock.register("se_e2_a") +class DescrptBlockSeA(DescriptorBlock): + ndescrpt: Final[int] + __constants__: ClassVar[list] = ["ndescrpt"] + lower: dict[str, int] + upper: dict[str, int] + table_data: dict[str, paddle.Tensor] + table_config: list[Union[int, float]] + + def __init__( + self, + rcut, + rcut_smth, + sel, + neuron=[25, 50, 100], + axis_neuron=16, + set_davg_zero: bool = False, + activation_function: str = "tanh", + precision: str = "float64", + resnet_dt: bool = False, + exclude_types: list[tuple[int, int]] = [], + env_protection: float = 0.0, + type_one_side: bool = True, + trainable: bool = True, + seed: Optional[Union[int, list[int]]] = None, + **kwargs, + ): + """Construct an embedding net of type `se_a`. + + Args: + - rcut: Cut-off radius. + - rcut_smth: Smooth hyper-parameter for pair force & energy. + - sel: For each element type, how many atoms is selected as neighbors. + - filter_neuron: Number of neurons in each hidden layers of the embedding net. + - axis_neuron: Number of columns of the sub-matrix of the embedding matrix. + """ + super().__init__() + self.rcut = float(rcut) + self.rcut_smth = float(rcut_smth) + self.neuron = neuron + self.filter_neuron = self.neuron + self.axis_neuron = axis_neuron + self.set_davg_zero = set_davg_zero + self.activation_function = activation_function + self.precision = precision + self.prec = PRECISION_DICT[self.precision] + self.resnet_dt = resnet_dt + self.env_protection = env_protection + self.ntypes = len(sel) + self.type_one_side = type_one_side + self.seed = seed + # order matters, placed after the assignment of self.ntypes + self.reinit_exclude(exclude_types) + + self.sel = sel + # should be on CPU to avoid D2H, as it is used as slice index + self.sec = [0, *np.cumsum(self.sel).tolist()] + self.split_sel = self.sel + self.nnei = sum(sel) + self.ndescrpt = self.nnei * 4 + + wanted_shape = (self.ntypes, self.nnei, 4) + mean = paddle.zeros(wanted_shape, dtype=self.prec).to(device=env.DEVICE) + stddev = paddle.ones(wanted_shape, dtype=self.prec).to(device=env.DEVICE) + self.register_buffer("mean", mean) + self.register_buffer("stddev", stddev) + + # add for compression + self.compress = False + self.lower = {} + self.upper = {} + self.table_data = {} + self.table_config = [] + + ndim = 1 if self.type_one_side else 2 + filter_layers = NetworkCollection( + ndim=ndim, ntypes=len(sel), network_type="embedding_network" + ) + for ii, embedding_idx in enumerate( + itertools.product(range(self.ntypes), repeat=ndim) + ): + filter_layers[embedding_idx] = EmbeddingNet( + 1, + self.filter_neuron, + activation_function=self.activation_function, + precision=self.precision, + resnet_dt=self.resnet_dt, + seed=child_seed(self.seed, ii), + ) + self.filter_layers = filter_layers + self.stats = None + # set trainable + self.trainable = trainable + for param in self.parameters(): + param.stop_gradient = not trainable + + def get_rcut(self) -> float: + """Returns the cut-off 
radius.""" + return self.rcut + + def get_rcut_smth(self) -> float: + """Returns the radius where the neighbor information starts to smoothly decay to 0.""" + return self.rcut_smth + + def get_nsel(self) -> int: + """Returns the number of selected atoms in the cut-off radius.""" + return sum(self.sel) + + def get_sel(self) -> list[int]: + """Returns the number of selected atoms for each type.""" + return self.sel + + def get_ntypes(self) -> int: + """Returns the number of element types.""" + return self.ntypes + + def get_dim_out(self) -> int: + """Returns the output dimension.""" + return self.dim_out + + def get_dim_rot_mat_1(self) -> int: + """Returns the first dimension of the rotation matrix. The rotation is of shape dim_1 x 3.""" + return self.filter_neuron[-1] + + def get_dim_emb(self) -> int: + """Returns the output dimension.""" + return self.neuron[-1] + + def get_dim_in(self) -> int: + """Returns the input dimension.""" + return self.dim_in + + def mixed_types(self) -> bool: + """If true, the descriptor + 1. assumes total number of atoms aligned across frames; + 2. requires a neighbor list that does not distinguish different atomic types. + + If false, the descriptor + 1. assumes total number of atoms of each atom type aligned across frames; + 2. requires a neighbor list that distinguishes different atomic types. + + """ + return False + + def get_env_protection(self) -> float: + """Returns the protection of building environment matrix.""" + return self.env_protection + + @property + def dim_out(self): + """Returns the output dimension of this descriptor.""" + return self.filter_neuron[-1] * self.axis_neuron + + @property + def dim_in(self): + """Returns the atomic input dimension of this descriptor.""" + return 0 + + def __setitem__(self, key, value): + if key in ("avg", "data_avg", "davg"): + self.mean = value + elif key in ("std", "data_std", "dstd"): + self.stddev = value + else: + raise KeyError(key) + + def __getitem__(self, key): + if key in ("avg", "data_avg", "davg"): + return self.mean + elif key in ("std", "data_std", "dstd"): + return self.stddev + else: + raise KeyError(key) + + def compute_input_stats( + self, + merged: Union[Callable[[], list[dict]], list[dict]], + path: Optional[DPPath] = None, + ): + """ + Compute the input statistics (e.g. mean and stddev) for the descriptors from packed data. + + Parameters + ---------- + merged : Union[Callable[[], list[dict]], list[dict]] + - list[dict]: A list of data samples from various data systems. + Each element, `merged[i]`, is a data dictionary containing `keys`: `paddle.Tensor` + originating from the `i`-th data system. + - Callable[[], list[dict]]: A lazy function that returns data samples in the above format + only when needed. Since the sampling process can be slow and memory-intensive, + the lazy function helps by only sampling once. + path : Optional[DPPath] + The path to the stat file. 
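+
+        A minimal sketch of the lazy form (`make_samples` is a hypothetical
+        data provider, not part of this API)::
+
+            block.compute_input_stats(lambda: make_samples(), path=None)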
+ + """ + env_mat_stat = EnvMatStatSe(self) + if path is not None: + path = path / env_mat_stat.get_hash() + if path is None or not path.is_dir(): + if callable(merged): + # only get data for once + sampled = merged() + else: + sampled = merged + else: + sampled = [] + env_mat_stat.load_or_compute_stats(sampled, path) + self.stats = env_mat_stat.stats + mean, stddev = env_mat_stat() + if not self.set_davg_zero: + paddle.assign(paddle.to_tensor(mean).to(device=env.DEVICE), self.mean) # pylint: disable=no-explicit-dtype + paddle.assign(paddle.to_tensor(stddev).to(device=env.DEVICE), self.stddev) # pylint: disable=no-explicit-dtype + + def get_stats(self) -> dict[str, StatItem]: + """Get the statistics of the descriptor.""" + if self.stats is None: + raise RuntimeError( + "The statistics of the descriptor has not been computed." + ) + return self.stats + + def reinit_exclude( + self, + exclude_types: list[tuple[int, int]] = [], + ): + self.exclude_types = exclude_types + self.emask = PairExcludeMask(self.ntypes, exclude_types=exclude_types) + + def enable_compression( + self, + table_data, + table_config, + lower, + upper, + ) -> None: + self.compress = True + self.table_data = table_data + self.table_config = table_config + self.lower = lower + self.upper = upper + + def forward( + self, + nlist: paddle.Tensor, + extended_coord: paddle.Tensor, + extended_atype: paddle.Tensor, + extended_atype_embd: Optional[paddle.Tensor] = None, + mapping: Optional[paddle.Tensor] = None, + ): + """Calculate decoded embedding for each atom. + + Args: + - coord: Tell atom coordinates with shape [nframes, natoms[1]*3]. + - atype: Tell atom types with shape [nframes, natoms[1]]. + - natoms: Tell atom count and element count. Its shape is [2+self.ntypes]. + - box: Tell simulation box with shape [nframes, 9]. + + Returns + ------- + - `paddle.Tensor`: descriptor matrix with shape [nframes, natoms[0]*self.filter_neuron[-1]*self.axis_neuron]. + """ + del extended_atype_embd, mapping + nf = nlist.shape[0] + nloc = nlist.shape[1] + atype: paddle.Tensor = extended_atype[:, :nloc] + dmatrix, diff, sw = prod_env_mat( + extended_coord, + nlist, + atype, + self.mean, + self.stddev, + self.rcut, + self.rcut_smth, + protection=self.env_protection, + ) + + dmatrix = dmatrix.reshape([-1, self.nnei, 4]) + dmatrix = dmatrix.astype(self.prec) + nfnl = dmatrix.shape[0] + # pre-allocate a shape to pass jit + xyz_scatter = paddle.zeros( + [nfnl, 4, self.filter_neuron[-1]], + dtype=self.prec, + ).to(extended_coord.place) + # nfnl x nnei + exclude_mask = self.emask(nlist, extended_atype).reshape([nfnl, self.nnei]) + for embedding_idx, ll in enumerate(self.filter_layers.networks): + if self.type_one_side: + ii = embedding_idx + ti = -1 + # paddle.jit is not happy with slice(None) + # ti_mask = paddle.ones(nfnl, dtype=paddle.bool, device=dmatrix.place) + # applying a mask seems to cause performance degradation + ti_mask = None + else: + # ti: center atom type, ii: neighbor type... + ii = embedding_idx // self.ntypes + ti = embedding_idx % self.ntypes + ti_mask = atype.flatten() == ti + # nfnl x nt + if ti_mask is not None: + mm = exclude_mask[ti_mask, self.sec[ii] : self.sec[ii + 1]] + else: + mm = exclude_mask[:, self.sec[ii] : self.sec[ii + 1]] + # nfnl x nt x 4 + if ti_mask is not None: + rr = dmatrix[ti_mask, self.sec[ii] : self.sec[ii + 1], :] + else: + rr = dmatrix[:, self.sec[ii] : self.sec[ii + 1], :] + if self.compress: + raise NotImplementedError( + "Compressed environment is not implemented yet." 
+ ) + else: + if rr.numel() > 0: + rr = rr * mm.unsqueeze(2).astype(rr.dtype) + ss = rr[:, :, :1] + # nfnl x nt x ng + gg = ll.forward(ss) + # nfnl x 4 x ng + gr = paddle.matmul(rr.transpose([0, 2, 1]), gg) + if ti_mask is not None: + xyz_scatter[ti_mask] += gr + else: + xyz_scatter += gr + + xyz_scatter /= self.nnei + xyz_scatter_1 = xyz_scatter.transpose([0, 2, 1]) + rot_mat: paddle.Tensor = xyz_scatter_1[:, :, 1:4] + xyz_scatter_2 = xyz_scatter[:, :, 0 : self.axis_neuron] + result = paddle.matmul( + xyz_scatter_1, xyz_scatter_2 + ) # shape is [nframes*nall, self.filter_neuron[-1], self.axis_neuron] + result = result.reshape([nf, nloc, self.filter_neuron[-1] * self.axis_neuron]) + rot_mat = rot_mat.reshape([nf, nloc] + list(rot_mat.shape[1:])) # noqa:RUF005 + return ( + result.astype(env.GLOBAL_PD_FLOAT_PRECISION), + rot_mat.astype(env.GLOBAL_PD_FLOAT_PRECISION), + None, + None, + sw, + ) + + def has_message_passing(self) -> bool: + """Returns whether the descriptor block has message passing.""" + return False + + def need_sorted_nlist_for_lower(self) -> bool: + """Returns whether the descriptor block needs sorted nlist when using `forward_lower`.""" + return False diff --git a/deepmd/pd/model/model/__init__.py b/deepmd/pd/model/model/__init__.py new file mode 100644 index 0000000000..990ee51348 --- /dev/null +++ b/deepmd/pd/model/model/__init__.py @@ -0,0 +1,144 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +"""The model that takes the coordinates, cell and atom types as input +and predicts some property. The models are automatically generated from +atomic models by the `deepmd.dpmodel.make_model` method. + +The `make_model` method does the reduction, auto-differentiation and +communication of the atomic properties according to output variable +definition `deepmd.dpmodel.OutputVariableDef`. + +All models should be inherited from :class:`deepmd.pd.model.model.model.BaseModel`. +Models generated by `make_model` have already done it. +""" + +import copy +import json + +import numpy as np + +from deepmd.pd.model.descriptor.base_descriptor import ( + BaseDescriptor, +) +from deepmd.pd.model.task import ( + BaseFitting, +) + +from .dp_model import ( + DPModelCommon, +) +from .ener_model import ( + EnergyModel, +) +from .frozen import ( + FrozenModel, +) +from .make_model import ( + make_model, +) +from .model import ( + BaseModel, +) + + +def _get_standard_model_components(model_params, ntypes): + # descriptor + model_params["descriptor"]["ntypes"] = ntypes + model_params["descriptor"]["type_map"] = copy.deepcopy(model_params["type_map"]) + descriptor = BaseDescriptor(**model_params["descriptor"]) + # fitting + fitting_net = model_params.get("fitting_net", {}) + fitting_net["type"] = fitting_net.get("type", "ener") + fitting_net["ntypes"] = descriptor.get_ntypes() + fitting_net["type_map"] = copy.deepcopy(model_params["type_map"]) + fitting_net["mixed_types"] = descriptor.mixed_types() + if fitting_net["type"] in ["dipole", "polar"]: + fitting_net["embedding_width"] = descriptor.get_dim_emb() + fitting_net["dim_descrpt"] = descriptor.get_dim_out() + grad_force = "direct" not in fitting_net["type"] + if not grad_force: + fitting_net["out_dim"] = descriptor.get_dim_emb() + if "ener" in fitting_net["type"]: + fitting_net["return_energy"] = True + fitting = BaseFitting(**fitting_net) + return descriptor, fitting, fitting_net["type"] + + +def _can_be_converted_to_float(value): + try: + float(value) + return True + except (TypeError, ValueError): + # return false for any failure... 
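+        # e.g. float(None) raises TypeError and float("abc") raises ValueError;
+        # both are treated uniformly as "not convertible"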
+ return False + + +def _convert_preset_out_bias_to_array(preset_out_bias, type_map): + if preset_out_bias is not None: + for kk in preset_out_bias: + if len(preset_out_bias[kk]) != len(type_map): + raise ValueError( + "length of the preset_out_bias should be the same as the type_map" + ) + for jj in range(len(preset_out_bias[kk])): + if preset_out_bias[kk][jj] is not None: + if isinstance(preset_out_bias[kk][jj], list): + bb = preset_out_bias[kk][jj] + elif _can_be_converted_to_float(preset_out_bias[kk][jj]): + bb = [float(preset_out_bias[kk][jj])] + else: + raise ValueError( + f"unsupported type/value of the {jj}th element of " + f"preset_out_bias['{kk}'] " + f"{type(preset_out_bias[kk][jj])}" + ) + preset_out_bias[kk][jj] = np.array(bb) + return preset_out_bias + + +def get_standard_model(model_params): + model_params_old = model_params + model_params = copy.deepcopy(model_params) + ntypes = len(model_params["type_map"]) + descriptor, fitting, fitting_net_type = _get_standard_model_components( + model_params, ntypes + ) + atom_exclude_types = model_params.get("atom_exclude_types", []) + pair_exclude_types = model_params.get("pair_exclude_types", []) + preset_out_bias = model_params.get("preset_out_bias") + preset_out_bias = _convert_preset_out_bias_to_array( + preset_out_bias, model_params["type_map"] + ) + + if fitting_net_type in ["ener", "direct_force_ener"]: + modelcls = EnergyModel + else: + raise RuntimeError(f"Unknown fitting type: {fitting_net_type}") + + model = modelcls( + descriptor=descriptor, + fitting=fitting, + type_map=model_params["type_map"], + atom_exclude_types=atom_exclude_types, + pair_exclude_types=pair_exclude_types, + preset_out_bias=preset_out_bias, + ) + model.model_def_script = json.dumps(model_params_old) + return model + + +def get_model(model_params): + model_type = model_params.get("type", "standard") + if model_type == "standard": + return get_standard_model(model_params) + else: + return BaseModel.get_class_by_type(model_type).get_model(model_params) + + +__all__ = [ + "BaseModel", + "get_model", + "DPModelCommon", + "EnergyModel", + "FrozenModel", + "make_model", +] diff --git a/deepmd/pd/model/model/dp_model.py b/deepmd/pd/model/model/dp_model.py new file mode 100644 index 0000000000..e014be5b68 --- /dev/null +++ b/deepmd/pd/model/model/dp_model.py @@ -0,0 +1,64 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +from typing import ( + Optional, +) + +import paddle + +from deepmd.pd.model.descriptor.base_descriptor import ( + BaseDescriptor, +) +from deepmd.utils.data_system import ( + DeepmdDataSystem, +) + + +class DPModelCommon: + """A base class to implement common methods for all the Models.""" + + @classmethod + def update_sel( + cls, + train_data: DeepmdDataSystem, + type_map: Optional[list[str]], + local_jdata: dict, + ) -> tuple[dict, Optional[float]]: + """Update the selection and perform neighbor statistics. 
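+
+        This fills automatic selection settings (e.g. ``sel: "auto"``) in the
+        descriptor section with concrete values derived from neighbor
+        statistics of the training data.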
+ + Parameters + ---------- + train_data : DeepmdDataSystem + data used to do neighbor statistics + type_map : list[str], optional + The name of each type of atoms + local_jdata : dict + The local data refer to the current class + + Returns + ------- + dict + The updated local data + float + The minimum distance between two atoms + """ + local_jdata_cpy = local_jdata.copy() + local_jdata_cpy["descriptor"], min_nbor_dist = BaseDescriptor.update_sel( + train_data, type_map, local_jdata["descriptor"] + ) + return local_jdata_cpy, min_nbor_dist + + def get_fitting_net(self): + """Get the fitting network.""" + return self.atomic_model.fitting_net + + def get_descriptor(self): + """Get the descriptor.""" + return self.atomic_model.descriptor + + def set_eval_descriptor_hook(self, enable: bool) -> None: + """Set the hook for evaluating descriptor and clear the cache for descriptor list.""" + self.atomic_model.set_eval_descriptor_hook(enable) + + def eval_descriptor(self) -> paddle.Tensor: + """Evaluate the descriptor.""" + return self.atomic_model.eval_descriptor() diff --git a/deepmd/pd/model/model/ener_model.py b/deepmd/pd/model/model/ener_model.py new file mode 100644 index 0000000000..3f3db4a527 --- /dev/null +++ b/deepmd/pd/model/model/ener_model.py @@ -0,0 +1,135 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +from copy import ( + deepcopy, +) +from typing import ( + Optional, +) + +import paddle + +from deepmd.pd.model.atomic_model import ( + DPEnergyAtomicModel, +) +from deepmd.pd.model.model.model import ( + BaseModel, +) + +from .dp_model import ( + DPModelCommon, +) +from .make_model import ( + make_model, +) + +DPEnergyModel_ = make_model(DPEnergyAtomicModel) + + +@BaseModel.register("ener") +class EnergyModel(DPModelCommon, DPEnergyModel_): + model_type = "ener" + + def __init__( + self, + *args, + **kwargs, + ): + DPModelCommon.__init__(self) + DPEnergyModel_.__init__(self, *args, **kwargs) + + def translated_output_def(self): + out_def_data = self.model_output_def().get_data() + output_def = { + "atom_energy": deepcopy(out_def_data["energy"]), + "energy": deepcopy(out_def_data["energy_redu"]), + } + if self.do_grad_r("energy"): + output_def["force"] = deepcopy(out_def_data["energy_derv_r"]) + output_def["force"].squeeze(-2) + if self.do_grad_c("energy"): + output_def["virial"] = deepcopy(out_def_data["energy_derv_c_redu"]) + output_def["virial"].squeeze(-2) + output_def["atom_virial"] = deepcopy(out_def_data["energy_derv_c"]) + output_def["atom_virial"].squeeze(-3) + if "mask" in out_def_data: + output_def["mask"] = deepcopy(out_def_data["mask"]) + return output_def + + def forward( + self, + coord, + atype, + box: Optional[paddle.Tensor] = None, + fparam: Optional[paddle.Tensor] = None, + aparam: Optional[paddle.Tensor] = None, + do_atomic_virial: bool = False, + ) -> dict[str, paddle.Tensor]: + model_ret = self.forward_common( + coord, + atype, + box, + fparam=fparam, + aparam=aparam, + do_atomic_virial=do_atomic_virial, + ) + if self.get_fitting_net() is not None: + model_predict = {} + model_predict["atom_energy"] = model_ret["energy"] + model_predict["energy"] = model_ret["energy_redu"] + if self.do_grad_r("energy"): + model_predict["force"] = model_ret["energy_derv_r"].squeeze(-2) + if self.do_grad_c("energy"): + model_predict["virial"] = model_ret["energy_derv_c_redu"].squeeze(-2) + if do_atomic_virial: + model_predict["atom_virial"] = model_ret["energy_derv_c"].squeeze( + -3 + ) + else: + model_predict["force"] = model_ret["dforce"] + if "mask" in model_ret: + 
model_predict["mask"] = model_ret["mask"] + else: + model_predict = model_ret + model_predict["updated_coord"] += coord + return model_predict + + def forward_lower( + self, + extended_coord, + extended_atype, + nlist, + mapping: Optional[paddle.Tensor] = None, + fparam: Optional[paddle.Tensor] = None, + aparam: Optional[paddle.Tensor] = None, + do_atomic_virial: bool = False, + comm_dict: Optional[dict[str, paddle.Tensor]] = None, + ): + model_ret = self.forward_common_lower( + extended_coord, + extended_atype, + nlist, + mapping, + fparam=fparam, + aparam=aparam, + do_atomic_virial=do_atomic_virial, + comm_dict=comm_dict, + extra_nlist_sort=self.need_sorted_nlist_for_lower(), + ) + if self.get_fitting_net() is not None: + model_predict = {} + model_predict["atom_energy"] = model_ret["energy"] + model_predict["energy"] = model_ret["energy_redu"] + if self.do_grad_r("energy"): + model_predict["extended_force"] = model_ret["energy_derv_r"].squeeze(-2) + if self.do_grad_c("energy"): + model_predict["virial"] = model_ret["energy_derv_c_redu"].squeeze(-2) + if do_atomic_virial: + model_predict["extended_virial"] = model_ret[ + "energy_derv_c" + ].squeeze(-3) + else: + assert model_ret["dforce"] is not None + model_predict["dforce"] = model_ret["dforce"] + else: + model_predict = model_ret + return model_predict diff --git a/deepmd/pd/model/model/frozen.py b/deepmd/pd/model/model/frozen.py new file mode 100644 index 0000000000..e8128c6bd1 --- /dev/null +++ b/deepmd/pd/model/model/frozen.py @@ -0,0 +1,182 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +import json +from typing import ( + Optional, +) + +import paddle + +from deepmd.dpmodel.output_def import ( + FittingOutputDef, +) +from deepmd.pd.model.model.model import ( + BaseModel, +) +from deepmd.utils.data_system import ( + DeepmdDataSystem, +) + + +@BaseModel.register("frozen") +class FrozenModel(BaseModel): + """Load model from a frozen model, which cannot be trained. + + Parameters + ---------- + model_file : str + The path to the frozen model + """ + + def __init__(self, model_file: str, **kwargs): + super().__init__(**kwargs) + self.model_file = model_file + if model_file.endswith(".json"): + self.model = paddle.jit.load(model_file.split(".json")[0]) + else: + raise NotImplementedError( + f"Only support .json file, but received {model_file}" + ) + + def fitting_output_def(self) -> FittingOutputDef: + """Get the output def of developer implemented atomic models.""" + return self.model.fitting_output_def() + + def get_rcut(self) -> float: + """Get the cut-off radius.""" + return self.model.get_rcut() + + def get_type_map(self) -> list[str]: + """Get the type map.""" + return self.model.get_type_map() + + def get_sel(self) -> list[int]: + """Returns the number of selected atoms for each type.""" + return self.model.get_sel() + + def get_dim_fparam(self) -> int: + """Get the number (dimension) of frame parameters of this atomic model.""" + return self.model.get_dim_fparam() + + def get_dim_aparam(self) -> int: + """Get the number (dimension) of atomic parameters of this atomic model.""" + return self.model.get_dim_aparam() + + def get_sel_type(self) -> list[int]: + """Get the selected atom types of this model. + + Only atoms with selected atom types have atomic contribution + to the result of the model. + If returning an empty list, all atom types are selected. + """ + return self.model.get_sel_type() + + def is_aparam_nall(self) -> bool: + """Check whether the shape of atomic parameters is (nframes, nall, ndim). 
+ + If False, the shape is (nframes, nloc, ndim). + """ + return self.model.is_aparam_nall() + + def mixed_types(self) -> bool: + """If true, the model + 1. assumes total number of atoms aligned across frames; + 2. uses a neighbor list that does not distinguish different atomic types. + + If false, the model + 1. assumes total number of atoms of each atom type aligned across frames; + 2. uses a neighbor list that distinguishes different atomic types. + + """ + return self.model.mixed_types() + + def has_message_passing(self) -> bool: + """Returns whether the descriptor has message passing.""" + return self.model.has_message_passing() + + def need_sorted_nlist_for_lower(self) -> bool: + """Returns whether the model needs sorted nlist when using `forward_lower`.""" + return self.model.need_sorted_nlist_for_lower() + + def forward( + self, + coord, + atype, + box: Optional[paddle.Tensor] = None, + fparam: Optional[paddle.Tensor] = None, + aparam: Optional[paddle.Tensor] = None, + do_atomic_virial: bool = False, + ) -> dict[str, paddle.Tensor]: + return self.model.forward( + coord, + atype, + box=box, + fparam=fparam, + aparam=aparam, + do_atomic_virial=do_atomic_virial, + ) + + def get_model_def_script(self) -> str: + """Get the model definition script.""" + # try to use the original script instead of "frozen model" + # Note: this cannot change the script of the parent model + # it may still try to load hard-coded filename, which might + # be a problem + return self.model.get_model_def_script() + + def get_min_nbor_dist(self) -> Optional[float]: + """Get the minimum neighbor distance.""" + return self.model.get_min_nbor_dist() + + def serialize(self) -> dict: + from deepmd.pd.model.model import ( + get_model, + ) + + # try to recover the original model + model_def_script = json.loads(self.get_model_def_script()) + model = get_model(model_def_script) + model.set_state_dict(self.model.state_dict()) + return model.serialize() + + @classmethod + def deserialize(cls, data: dict): + raise RuntimeError("Should not touch here.") + + def get_nnei(self) -> int: + """Returns the total number of selected neighboring atoms in the cut-off radius.""" + return self.model.get_nnei() + + def get_nsel(self) -> int: + """Returns the total number of selected neighboring atoms in the cut-off radius.""" + return self.model.get_nsel() + + @classmethod + def update_sel( + cls, + train_data: DeepmdDataSystem, + type_map: Optional[list[str]], + local_jdata: dict, + ) -> tuple[dict, Optional[float]]: + """Update the selection and perform neighbor statistics. 
+
+        Parameters
+        ----------
+        train_data : DeepmdDataSystem
+            data used to do neighbor statistics
+        type_map : list[str], optional
+            The name of each type of atoms
+        local_jdata : dict
+            The local data referring to the current class
+
+        Returns
+        -------
+        dict
+            The updated local data
+        float
+            The minimum distance between two atoms
+        """
+        return local_jdata, None
+
+    def model_output_type(self) -> str:
+        """Get the output type for the model."""
+        return self.model.model_output_type()
diff --git a/deepmd/pd/model/model/make_model.py b/deepmd/pd/model/model/make_model.py
new file mode 100644
index 0000000000..67b46d4d87
--- /dev/null
+++ b/deepmd/pd/model/model/make_model.py
@@ -0,0 +1,614 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+from typing import (
+    Optional,
+)
+
+import paddle
+
+from deepmd.dpmodel import (
+    ModelOutputDef,
+)
+from deepmd.dpmodel.output_def import (
+    FittingOutputDef,
+    OutputVariableCategory,
+    OutputVariableOperation,
+    check_operation_applied,
+)
+from deepmd.pd.model.atomic_model.base_atomic_model import (
+    BaseAtomicModel,
+)
+from deepmd.pd.model.model.model import (
+    BaseModel,
+)
+from deepmd.pd.model.model.transform_output import (
+    communicate_extended_output,
+    fit_output_to_model_output,
+)
+from deepmd.pd.utils import (
+    decomp,
+)
+from deepmd.pd.utils.env import (
+    GLOBAL_PD_ENER_FLOAT_PRECISION,
+    GLOBAL_PD_FLOAT_PRECISION,
+    PRECISION_DICT,
+    RESERVED_PRECISON_DICT,
+)
+from deepmd.pd.utils.nlist import (
+    extend_input_and_build_neighbor_list,
+    nlist_distinguish_types,
+)
+from deepmd.utils.path import (
+    DPPath,
+)
+
+
+def make_model(T_AtomicModel: type[BaseAtomicModel]):
+    """Make a model as a derived class of an atomic model.
+
+    The model provides two interfaces.
+
+    1. the `forward_common_lower`, which takes extended coordinates, atypes and a neighbor list,
+    and outputs the atomic property and its derivatives (if required) on the extended region.
+
+    2. the `forward_common`, which takes coordinates, atypes and cell and predicts
+    the atomic and reduced property, and derivatives (if required) on the local region.
+
+    Parameters
+    ----------
+    T_AtomicModel
+        The atomic model.
+
+    Returns
+    -------
+    CM
+        The model.
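+
+    Examples
+    --------
+    A hypothetical use, mirroring how the energy model is assembled in
+    `ener_model.py` (the keyword arguments shown are illustrative):
+
+    >>> DPEnergyModel_ = make_model(DPEnergyAtomicModel)
+    >>> model = DPEnergyModel_(descriptor=descrpt, fitting=fitting, type_map=["O", "H"])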
+
+    """
+
+    class CM(BaseModel):
+        def __init__(
+            self,
+            *args,
+            # underscore to prevent conflict with normal inputs
+            atomic_model_: Optional[T_AtomicModel] = None,
+            **kwargs,
+        ):
+            super().__init__(*args, **kwargs)
+            if atomic_model_ is not None:
+                self.atomic_model: T_AtomicModel = atomic_model_
+            else:
+                self.atomic_model: T_AtomicModel = T_AtomicModel(*args, **kwargs)
+            self.precision_dict = PRECISION_DICT
+            self.reverse_precision_dict = RESERVED_PRECISON_DICT
+            self.global_pd_float_precision = GLOBAL_PD_FLOAT_PRECISION
+            self.global_pd_ener_float_precision = GLOBAL_PD_ENER_FLOAT_PRECISION
+
+        def model_output_def(self):
+            """Get the output def for the model."""
+            return ModelOutputDef(self.atomic_output_def())
+
+        def model_output_type(self) -> list[str]:
+            """Get the output type for the model."""
+            output_def = self.model_output_def()
+            var_defs = output_def.var_defs
+            # jit: Comprehension ifs are not supported yet
+            # type hint is critical for JIT
+            vars: list[str] = []
+            for kk, vv in var_defs.items():
+                # .value is critical for JIT
+                if vv.category == OutputVariableCategory.OUT.value:
+                    vars.append(kk)
+            return vars
+
+        def enable_compression(
+            self,
+            table_extrapolate: float = 5,
+            table_stride_1: float = 0.01,
+            table_stride_2: float = 0.1,
+            check_frequency: int = -1,
+        ) -> None:
+            """Call the atomic model's enable_compression().
+
+            Parameters
+            ----------
+            table_extrapolate
+                The scale of model extrapolation
+            table_stride_1
+                The uniform stride of the first table
+            table_stride_2
+                The uniform stride of the second table
+            check_frequency
+                The overflow check frequency
+            """
+            self.atomic_model.enable_compression(
+                self.get_min_nbor_dist(),
+                table_extrapolate,
+                table_stride_1,
+                table_stride_2,
+                check_frequency,
+            )
+
+        def forward_common(
+            self,
+            coord,
+            atype,
+            box: Optional[paddle.Tensor] = None,
+            fparam: Optional[paddle.Tensor] = None,
+            aparam: Optional[paddle.Tensor] = None,
+            do_atomic_virial: bool = False,
+        ) -> dict[str, paddle.Tensor]:
+            """Return model prediction.
+
+            Parameters
+            ----------
+            coord
+                The coordinates of the atoms.
+                shape: nf x (nloc x 3)
+            atype
+                The type of atoms. shape: nf x nloc
+            box
+                The simulation box. shape: nf x 9
+            fparam
+                frame parameter. nf x ndf
+            aparam
+                atomic parameter. nf x nloc x nda
+            do_atomic_virial
+                Whether to calculate the atomic virial.
+
+            Returns
+            -------
+            ret_dict
+                The result dict of type dict[str,paddle.Tensor].
+                The keys are defined by the `ModelOutputDef`.
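+
+            Examples
+            --------
+            A hypothetical call for an energy model (key names follow the
+            energy output definition; shapes as documented above):
+
+            >>> ret = model.forward_common(coord, atype, box=box)
+            >>> energy = ret["energy_redu"]  # nf x 1
+            >>> force = ret["energy_derv_r"]  # nf x nloc x 1 x 3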
+
+            """
+            cc, bb, fp, ap, input_prec = self.input_type_cast(
+                coord, box=box, fparam=fparam, aparam=aparam
+            )
+            del coord, box, fparam, aparam
+            (
+                extended_coord,
+                extended_atype,
+                mapping,
+                nlist,
+            ) = extend_input_and_build_neighbor_list(
+                cc,
+                atype,
+                self.get_rcut(),
+                self.get_sel(),
+                mixed_types=self.mixed_types(),
+                box=bb,
+            )
+            model_predict_lower = self.forward_common_lower(
+                extended_coord,
+                extended_atype,
+                nlist,
+                mapping,
+                do_atomic_virial=do_atomic_virial,
+                fparam=fp,
+                aparam=ap,
+            )
+            model_predict = communicate_extended_output(
+                model_predict_lower,
+                self.model_output_def(),
+                mapping,
+                do_atomic_virial=do_atomic_virial,
+            )
+            model_predict = self.output_type_cast(model_predict, input_prec)
+            return model_predict
+
+        def get_out_bias(self) -> paddle.Tensor:
+            return self.atomic_model.get_out_bias()
+
+        def set_out_bias(self, out_bias: paddle.Tensor) -> None:
+            self.atomic_model.set_out_bias(out_bias)
+
+        def change_out_bias(
+            self,
+            merged,
+            bias_adjust_mode="change-by-statistic",
+        ) -> None:
+            """Change the output bias of atomic model according to the input data and the pretrained model.
+
+            Parameters
+            ----------
+            merged : Union[Callable[[], list[dict]], list[dict]]
+                - list[dict]: A list of data samples from various data systems.
+                    Each element, `merged[i]`, is a data dictionary containing `keys`: `paddle.Tensor`
+                    originating from the `i`-th data system.
+                - Callable[[], list[dict]]: A lazy function that returns data samples in the above format
+                    only when needed. Since the sampling process can be slow and memory-intensive,
+                    the lazy function helps by only sampling once.
+            bias_adjust_mode : str
+                The mode for changing output bias : ['change-by-statistic', 'set-by-statistic']
+                'change-by-statistic' : perform predictions on labels of target dataset,
+                        and do least square on the errors to obtain the target shift as bias.
+                'set-by-statistic' : directly use the statistic output bias in the target dataset.
+            """
+            self.atomic_model.change_out_bias(
+                merged,
+                bias_adjust_mode=bias_adjust_mode,
+            )
+
+        def forward_common_lower(
+            self,
+            extended_coord,
+            extended_atype,
+            nlist,
+            mapping: Optional[paddle.Tensor] = None,
+            fparam: Optional[paddle.Tensor] = None,
+            aparam: Optional[paddle.Tensor] = None,
+            do_atomic_virial: bool = False,
+            comm_dict: Optional[dict[str, paddle.Tensor]] = None,
+            extra_nlist_sort: bool = False,
+        ):
+            """Return model prediction. Lower interface that takes
+            extended atomic coordinates and types, nlist, and mapping
+            as input, and returns the predictions on the extended region.
+            The predictions are not reduced.
+
+            Parameters
+            ----------
+            extended_coord
+                coordinates in extended region. nf x (nall x 3)
+            extended_atype
+                atomic type in extended region. nf x nall
+            nlist
+                neighbor list. nf x nloc x nsel.
+            mapping
+                maps the extended indices to local indices. nf x nall.
+            fparam
+                frame parameter. nf x ndf
+            aparam
+                atomic parameter. nf x nloc x nda
+            do_atomic_virial
+                whether to calculate the atomic virial.
+            comm_dict
+                The data needed for communication for parallel inference.
+            extra_nlist_sort
+                whether to forcibly sort the nlist.
+
+            Returns
+            -------
+            result_dict
+                the result dict, defined by the `FittingOutputDef`.
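+
+            Examples
+            --------
+            A hypothetical lower-level call; the extended tensors and the
+            nlist are assumed to be built by the caller:
+
+            >>> ret = model.forward_common_lower(ext_coord, ext_atype, nlist)
+            >>> force = ret["energy_derv_r"]  # defined on the extended region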
+
+            """
+            nframes, nall = extended_atype.shape[:2]
+            extended_coord = extended_coord.reshape([nframes, -1, 3])
+            nlist = self.format_nlist(
+                extended_coord, extended_atype, nlist, extra_nlist_sort=extra_nlist_sort
+            )
+            cc_ext, _, fp, ap, input_prec = self.input_type_cast(
+                extended_coord, fparam=fparam, aparam=aparam
+            )
+            del extended_coord, fparam, aparam
+            atomic_ret = self.atomic_model.forward_common_atomic(
+                cc_ext,
+                extended_atype,
+                nlist,
+                mapping=mapping,
+                fparam=fp,
+                aparam=ap,
+                comm_dict=comm_dict,
+            )
+            model_predict = fit_output_to_model_output(
+                atomic_ret,
+                self.atomic_output_def(),
+                cc_ext,
+                do_atomic_virial=do_atomic_virial,
+                create_graph=self.training,
+            )
+            model_predict = self.output_type_cast(model_predict, input_prec)
+            return model_predict
+
+        def input_type_cast(
+            self,
+            coord: paddle.Tensor,
+            box: Optional[paddle.Tensor] = None,
+            fparam: Optional[paddle.Tensor] = None,
+            aparam: Optional[paddle.Tensor] = None,
+        ) -> tuple[
+            paddle.Tensor,
+            Optional[paddle.Tensor],
+            Optional[paddle.Tensor],
+            Optional[paddle.Tensor],
+            str,
+        ]:
+            """Cast the input data to global float type."""
+            input_prec = self.reverse_precision_dict[coord.dtype]
+            ###
+            ### type checking would not pass jit, convert to coord prec anyway
+            ###
+            # for vv, kk in zip([fparam, aparam], ["frame", "atomic"]):
+            #     if vv is not None and self.reverse_precision_dict[vv.dtype] != input_prec:
+            #         log.warning(
+            #             f"type of {kk} parameter {self.reverse_precision_dict[vv.dtype]}"
+            #             " does not match"
+            #             f" that of the coordinate {input_prec}"
+            #         )
+            _lst: list[Optional[paddle.Tensor]] = [
+                vv.astype(coord.dtype) if vv is not None else None
+                for vv in [box, fparam, aparam]
+            ]
+            box, fparam, aparam = _lst
+            if (
+                input_prec
+                == self.reverse_precision_dict[self.global_pd_float_precision]
+            ):
+                return coord, box, fparam, aparam, input_prec
+            else:
+                pp = self.global_pd_float_precision
+                return (
+                    coord.to(pp),
+                    box.to(pp) if box is not None else None,
+                    fparam.to(pp) if fparam is not None else None,
+                    aparam.to(pp) if aparam is not None else None,
+                    input_prec,
+                )
+
+        def output_type_cast(
+            self,
+            model_ret: dict[str, paddle.Tensor],
+            input_prec: str,
+        ) -> dict[str, paddle.Tensor]:
+            """Convert the model output to the input prec."""
+            do_cast = (
+                input_prec
+                != self.reverse_precision_dict[self.global_pd_float_precision]
+            )
+            pp = self.precision_dict[input_prec]
+            odef = self.model_output_def()
+            for kk in odef.keys():
+                if kk not in model_ret.keys():
+                    # do not return energy_derv_c if not do_atomic_virial
+                    continue
+                if check_operation_applied(odef[kk], OutputVariableOperation.REDU):
+                    model_ret[kk] = (
+                        model_ret[kk].to(self.global_pd_ener_float_precision)
+                        if model_ret[kk] is not None
+                        else None
+                    )
+                elif do_cast:
+                    model_ret[kk] = (
+                        model_ret[kk].to(pp) if model_ret[kk] is not None else None
+                    )
+            return model_ret
+
+        def format_nlist(
+            self,
+            extended_coord: paddle.Tensor,
+            extended_atype: paddle.Tensor,
+            nlist: paddle.Tensor,
+            extra_nlist_sort: bool = False,
+        ):
+            """Format the neighbor list.
+
+            1. If the number of neighbors in the `nlist` is equal to sum(self.sel),
+            it does nothing
+
+            2. If the number of neighbors in the `nlist` is smaller than sum(self.sel),
+            the `nlist` is padded with -1.
+
+            3. If the number of neighbors in the `nlist` is larger than sum(self.sel),
+            the nearest sum(sel) neighbors will be preserved.
+
+            Known limitations:
+
+            In the case of not self.mixed_types, the nlist is always formatted.
+            This may have a side effect on the efficiency.
+
+            Parameters
+            ----------
+            extended_coord
+                coordinates in extended region. nf x nall x 3
+            extended_atype
+                atomic type in extended region. nf x nall
+            nlist
+                neighbor list. nf x nloc x nsel
+            extra_nlist_sort
+                whether to forcibly sort the nlist.
+
+            Returns
+            -------
+            formatted_nlist
+                the formatted nlist.
+
+            """
+            mixed_types = self.mixed_types()
+            nlist = self._format_nlist(
+                extended_coord,
+                nlist,
+                sum(self.get_sel()),
+                extra_nlist_sort=extra_nlist_sort,
+            )
+            if not mixed_types:
+                nlist = nlist_distinguish_types(nlist, extended_atype, self.get_sel())
+            return nlist
+
+        def _format_nlist(
+            self,
+            extended_coord: paddle.Tensor,
+            nlist: paddle.Tensor,
+            nnei: int,
+            extra_nlist_sort: bool = False,
+        ):
+            n_nf, n_nloc, n_nnei = nlist.shape
+            # nf x nall x 3
+            extended_coord = extended_coord.reshape([n_nf, -1, 3])
+            rcut = self.get_rcut()
+
+            if n_nnei < nnei:
+                # pad the nlist with -1 (i.e. "no neighbor") up to nnei columns
+                nlist = paddle.concat(
+                    [
+                        nlist,
+                        -1
+                        * paddle.ones(
+                            [n_nf, n_nloc, nnei - n_nnei],
+                            dtype=nlist.dtype,
+                        ).to(nlist.place),
+                    ],
+                    axis=-1,
+                )
+
+            if n_nnei > nnei or extra_nlist_sort:
+                n_nf, n_nloc, n_nnei = nlist.shape
+                m_real_nei = nlist >= 0
+                nlist = paddle.where(m_real_nei, nlist, paddle.zeros_like(nlist))
+                # nf x nloc x 3
+                coord0 = extended_coord[:, :n_nloc, :]
+                # nf x (nloc x nnei) x 3
+                index = nlist.reshape([n_nf, n_nloc * n_nnei, 1]).expand([-1, -1, 3])
+                coord1 = decomp.take_along_axis(extended_coord, axis=1, indices=index)
+                # nf x nloc x nnei x 3
+                coord1 = coord1.reshape([n_nf, n_nloc, n_nnei, 3])
+                # nf x nloc x nnei
+                # rr = paddle.linalg.norm(coord0[:, :, None, :] - coord1, axis=-1)
+                rr = decomp.norm(coord0[:, :, None, :] - coord1, axis=-1)
+                rr = paddle.where(m_real_nei, rr, float("inf"))
+                # sort neighbors by distance and reorder the nlist accordingly
+                rr, nlist_mapping = (
+                    paddle.sort(rr, axis=-1),
+                    paddle.argsort(rr, axis=-1),
+                )
+                nlist = decomp.take_along_axis(nlist, axis=2, indices=nlist_mapping)
+                nlist = paddle.where(rr > rcut, paddle.full_like(nlist, -1), nlist)
+                # keep only the nnei nearest neighbors
+                nlist = nlist[..., :nnei]
+            else:  # not extra_nlist_sort and n_nnei <= nnei:
+                pass  # great!
+            assert nlist.shape[-1] == nnei
+            return nlist
+
+        def do_grad_r(
+            self,
+            var_name: Optional[str] = None,
+        ) -> bool:
+            """Tell if the output variable `var_name` is r_differentiable.
+            If var_name is None, returns whether any of the variables is r_differentiable.
+            """
+            return self.atomic_model.do_grad_r(var_name)
+
+        def do_grad_c(
+            self,
+            var_name: Optional[str] = None,
+        ) -> bool:
+            """Tell if the output variable `var_name` is c_differentiable.
+            If var_name is None, returns whether any of the variables is c_differentiable.
+            """
+            return self.atomic_model.do_grad_c(var_name)
+
+        def change_type_map(
+            self, type_map: list[str], model_with_new_type_stat=None
+        ) -> None:
+            """Change the type related params to new ones, according to `type_map` and the original one in the model.
+            If there are new types in `type_map`, statistics will be updated accordingly, using `model_with_new_type_stat` for these new types.
+ """ + self.atomic_model.change_type_map( + type_map=type_map, + model_with_new_type_stat=model_with_new_type_stat.atomic_model + if model_with_new_type_stat is not None + else None, + ) + + def serialize(self) -> dict: + return self.atomic_model.serialize() + + @classmethod + def deserialize(cls, data) -> "CM": + return cls(atomic_model_=T_AtomicModel.deserialize(data)) + + def get_dim_fparam(self) -> int: + """Get the number (dimension) of frame parameters of this atomic model.""" + return self.atomic_model.get_dim_fparam() + + def get_dim_aparam(self) -> int: + """Get the number (dimension) of atomic parameters of this atomic model.""" + return self.atomic_model.get_dim_aparam() + + def get_sel_type(self) -> list[int]: + """Get the selected atom types of this model. + + Only atoms with selected atom types have atomic contribution + to the result of the model. + If returning an empty list, all atom types are selected. + """ + return self.atomic_model.get_sel_type() + + def is_aparam_nall(self) -> bool: + """Check whether the shape of atomic parameters is (nframes, nall, ndim). + + If False, the shape is (nframes, nloc, ndim). + """ + return self.atomic_model.is_aparam_nall() + + def get_rcut(self) -> float: + """Get the cut-off radius.""" + return self.atomic_model.get_rcut() + + def get_type_map(self) -> list[str]: + """Get the type map.""" + return self.atomic_model.get_type_map() + + def get_nsel(self) -> int: + """Returns the total number of selected neighboring atoms in the cut-off radius.""" + return self.atomic_model.get_nsel() + + def get_nnei(self) -> int: + """Returns the total number of selected neighboring atoms in the cut-off radius.""" + return self.atomic_model.get_nnei() + + def atomic_output_def(self) -> FittingOutputDef: + """Get the output def of the atomic model.""" + return self.atomic_model.atomic_output_def() + + def compute_or_load_stat( + self, + sampled_func, + stat_file_path: Optional[DPPath] = None, + ): + """Compute or load the statistics.""" + return self.atomic_model.compute_or_load_stat(sampled_func, stat_file_path) + + def get_sel(self) -> list[int]: + """Returns the number of selected atoms for each type.""" + return self.atomic_model.get_sel() + + def mixed_types(self) -> bool: + """If true, the model + 1. assumes total number of atoms aligned across frames; + 2. uses a neighbor list that does not distinguish different atomic types. + + If false, the model + 1. assumes total number of atoms of each atom type aligned across frames; + 2. uses a neighbor list that distinguishes different atomic types. 
+ + """ + return self.atomic_model.mixed_types() + + def has_message_passing(self) -> bool: + """Returns whether the model has message passing.""" + return self.atomic_model.has_message_passing() + + def need_sorted_nlist_for_lower(self) -> bool: + """Returns whether the model needs sorted nlist when using `forward_lower`.""" + return self.atomic_model.need_sorted_nlist_for_lower() + + def forward( + self, + coord, + atype, + box: Optional[paddle.Tensor] = None, + fparam: Optional[paddle.Tensor] = None, + aparam: Optional[paddle.Tensor] = None, + do_atomic_virial: bool = False, + ) -> dict[str, paddle.Tensor]: + # directly call the forward_common method when no specific transform rule + return self.forward_common( + coord, + atype, + box, + fparam=fparam, + aparam=aparam, + do_atomic_virial=do_atomic_virial, + ) + + return CM diff --git a/deepmd/pd/model/model/model.py b/deepmd/pd/model/model/model.py new file mode 100644 index 0000000000..06a2c6910f --- /dev/null +++ b/deepmd/pd/model/model/model.py @@ -0,0 +1,55 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +from typing import ( + Optional, +) + +import paddle + +from deepmd.dpmodel.model.base_model import ( + make_base_model, +) +from deepmd.utils.path import ( + DPPath, +) + + +class BaseModel(paddle.nn.Layer, make_base_model()): + def __init__(self, *args, **kwargs): + """Construct a basic model for different tasks.""" + paddle.nn.Layer.__init__(self) + self.model_def_script = "" + self.min_nbor_dist = None + + def compute_or_load_stat( + self, + sampled_func, + stat_file_path: Optional[DPPath] = None, + ): + """ + Compute or load the statistics parameters of the model, + such as mean and standard deviation of descriptors or the energy bias of the fitting net. + When `sampled` is provided, all the statistics parameters will be calculated (or re-calculated for update), + and saved in the `stat_file_path`(s). + When `sampled` is not provided, it will check the existence of `stat_file_path`(s) + and load the calculated statistics parameters. + + Parameters + ---------- + sampled_func + The sampled data frames from different data systems. + stat_file_path + The path to the statistics files. + """ + raise NotImplementedError + + def get_model_def_script(self) -> str: + """Get the model definition script.""" + return self.model_def_script + + def get_min_nbor_dist(self) -> Optional[float]: + """Get the minimum distance between two atoms.""" + return self.min_nbor_dist + + def get_ntypes(self): + """Returns the number of element types.""" + return len(self.get_type_map()) diff --git a/deepmd/pd/model/model/transform_output.py b/deepmd/pd/model/model/transform_output.py new file mode 100644 index 0000000000..469bfd3168 --- /dev/null +++ b/deepmd/pd/model/model/transform_output.py @@ -0,0 +1,262 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later + +import paddle + +from deepmd.dpmodel import ( + FittingOutputDef, + ModelOutputDef, + OutputVariableDef, + get_deriv_name, + get_reduce_name, +) +from deepmd.pd.utils import ( + decomp, + env, +) + + +def atomic_virial_corr( + extended_coord: paddle.Tensor, + atom_energy: paddle.Tensor, +): + nall = extended_coord.shape[1] + nloc = atom_energy.shape[1] + coord, _ = paddle.split(extended_coord, [nloc, nall - nloc], axis=1) + # no derivative with respect to the loc coord. 
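+    # detaching means the gradient below flows only through atom_energy,
+    # not through the local coordinates themselves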
+ coord = coord.detach() + ce = coord * atom_energy + sumce0, sumce1, sumce2 = paddle.split(paddle.sum(ce, axis=1), [1, 1, 1], axis=-1) + # faked_grad = paddle.ones_like(sumce0) + extended_virial_corr0 = paddle.autograd.grad( + [sumce0], + [extended_coord], + # grad_outputs=lst, + create_graph=False, + retain_graph=True, + )[0] + assert extended_virial_corr0 is not None + extended_virial_corr1 = paddle.autograd.grad( + [sumce1], + [extended_coord], + # grad_outputs=lst, + create_graph=False, + retain_graph=True, + )[0] + assert extended_virial_corr1 is not None + extended_virial_corr2 = paddle.autograd.grad( + [sumce2], + [extended_coord], + # grad_outputs=lst, + create_graph=False, + retain_graph=True, + )[0] + assert extended_virial_corr2 is not None + extended_virial_corr = paddle.concat( + [ + extended_virial_corr0.unsqueeze(-1), + extended_virial_corr1.unsqueeze(-1), + extended_virial_corr2.unsqueeze(-1), + ], + axis=-1, + ) + return extended_virial_corr + + +def task_deriv_one( + atom_energy: paddle.Tensor, + energy: paddle.Tensor, + extended_coord: paddle.Tensor, + do_virial: bool = True, + do_atomic_virial: bool = False, + create_graph: bool = True, +): + # faked_grad = paddle.ones_like(energy) + # lst = paddle.jit.annotate(List[Optional[paddle.Tensor]], [faked_grad]) + extended_force = paddle.autograd.grad( + [energy], + [extended_coord], + # grad_outputs=lst, + create_graph=create_graph, + retain_graph=True, + )[0] + assert extended_force is not None + extended_force = -extended_force + if do_virial: + extended_virial = extended_force.unsqueeze(-1) @ extended_coord.unsqueeze(-2) + # the correction sums to zero, which does not contribute to global virial + if do_atomic_virial: + extended_virial_corr = atomic_virial_corr(extended_coord, atom_energy) + extended_virial = extended_virial + extended_virial_corr + # to [...,3,3] -> [...,9] + extended_virial = extended_virial.reshape( + [*list(extended_virial.shape[:-2]), 9] + ) + else: + extended_virial = None + return extended_force, extended_virial + + +def get_leading_dims( + vv: paddle.Tensor, + vdef: OutputVariableDef, +): + """Get the dimensions of nf x nloc.""" + vshape = vv.shape + return list(vshape[: (len(vshape) - len(vdef.shape))]) + + +def take_deriv( + vv: paddle.Tensor, + svv: paddle.Tensor, + vdef: OutputVariableDef, + coord_ext: paddle.Tensor, + do_virial: bool = False, + do_atomic_virial: bool = False, + create_graph: bool = True, +): + size = 1 + for ii in vdef.shape: + size *= ii + vv1 = vv.reshape(list(get_leading_dims(vv, vdef)) + [size]) # noqa: RUF005 + svv1 = svv.reshape(list(get_leading_dims(svv, vdef)) + [size]) # noqa: RUF005 + split_vv1 = paddle.split(vv1, [1] * size, axis=-1) + split_svv1 = paddle.split(svv1, [1] * size, axis=-1) + split_ff, split_avir = [], [] + for vvi, svvi in zip(split_vv1, split_svv1): + # nf x nloc x 3, nf x nloc x 9 + ffi, aviri = task_deriv_one( + vvi, + svvi, + coord_ext, + do_virial=do_virial, + do_atomic_virial=do_atomic_virial, + create_graph=create_graph, + ) + # nf x nloc x 1 x 3, nf x nloc x 1 x 9 + ffi = ffi.unsqueeze(-2) + split_ff.append(ffi) + if do_virial: + assert aviri is not None + aviri = aviri.unsqueeze(-2) + split_avir.append(aviri) + # nf x nall x v_dim x 3, nf x nall x v_dim x 9 + out_lead_shape = list(coord_ext.shape[:-1]) + vdef.shape + ff = paddle.concat(split_ff, axis=-2).reshape(out_lead_shape + [3]) # noqa: RUF005 + if do_virial: + avir = paddle.concat(split_avir, axis=-2).reshape(out_lead_shape + [9]) # noqa: RUF005 + else: + avir = None + return ff, avir 
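+
+
+# Illustrative sketch (not exercised by this module): for a scalar energy
+# output, `take_deriv` sends each component through `task_deriv_one`, which
+# returns the force F = -dE/dx on the extended atoms and, if requested, the
+# per-atom virial outer product F x^T flattened to 9 components, conceptually:
+#
+#   ff, avir = take_deriv(
+#       atom_e,        # nf x nloc x 1
+#       atom_e_redu,   # nf x 1, the reduced output
+#       vdef,          # OutputVariableDef with shape [1]
+#       coord_ext,     # nf x nall x 3, requires grad
+#       do_virial=True,
+#   )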
+ + +def fit_output_to_model_output( + fit_ret: dict[str, paddle.Tensor], + fit_output_def: FittingOutputDef, + coord_ext: paddle.Tensor, + do_atomic_virial: bool = False, + create_graph: bool = True, +) -> dict[str, paddle.Tensor]: + """Transform the output of the fitting network to + the model output. + + """ + redu_prec = env.GLOBAL_PD_ENER_FLOAT_PRECISION + model_ret = dict(fit_ret.items()) + for kk, vv in fit_ret.items(): + vdef = fit_output_def[kk] + shap = vdef.shape + atom_axis = -(len(shap) + 1) + if vdef.reducible: + kk_redu = get_reduce_name(kk) + if vdef.intensive: + model_ret[kk_redu] = paddle.mean(vv.astype(redu_prec), axis=atom_axis) + else: + model_ret[kk_redu] = paddle.sum(vv.astype(redu_prec), axis=atom_axis) + if vdef.r_differentiable: + kk_derv_r, kk_derv_c = get_deriv_name(kk) + dr, dc = take_deriv( + vv, + model_ret[kk_redu], + vdef, + coord_ext, + do_virial=vdef.c_differentiable, + do_atomic_virial=do_atomic_virial, + create_graph=create_graph, + ) + model_ret[kk_derv_r] = dr + if vdef.c_differentiable: + assert dc is not None + model_ret[kk_derv_c] = dc + model_ret[kk_derv_c + "_redu"] = paddle.sum( + model_ret[kk_derv_c].astype(redu_prec), axis=1 + ) + return model_ret + + +def communicate_extended_output( + model_ret: dict[str, paddle.Tensor], + model_output_def: ModelOutputDef, + mapping: paddle.Tensor, # nf x nloc + do_atomic_virial: bool = False, +) -> dict[str, paddle.Tensor]: + """Transform the output of the model network defined on + local and ghost (extended) atoms to local atoms. + + """ + redu_prec = env.GLOBAL_PD_ENER_FLOAT_PRECISION + new_ret = {} + for kk in model_output_def.keys_outp(): + vv = model_ret[kk] + vdef = model_output_def[kk] + new_ret[kk] = vv + if vdef.reducible: + kk_redu = get_reduce_name(kk) + new_ret[kk_redu] = model_ret[kk_redu] + # nf x nloc + vldims = get_leading_dims(vv, vdef) + # nf x nall + mldims = list(mapping.shape) + kk_derv_r, kk_derv_c = get_deriv_name(kk) + if vdef.r_differentiable: + # vdim x 3 + derv_r_ext_dims = list(vdef.shape) + [3] # noqa:RUF005 + mapping = mapping.reshape(mldims + [1] * len(derv_r_ext_dims)).expand( + [-1] * len(mldims) + derv_r_ext_dims + ) + force = paddle.zeros(vldims + derv_r_ext_dims, dtype=vv.dtype).to( + device=vv.place + ) + # nf x nloc x nvar x 3 + new_ret[kk_derv_r] = decomp.scatter_reduce( + force, + 1, + index=mapping, + src=model_ret[kk_derv_r], + reduce="sum", + ) + if vdef.c_differentiable: + assert vdef.r_differentiable + derv_c_ext_dims = list(vdef.shape) + [9] # noqa:RUF005 + # nf x nloc x nvar x 3 -> nf x nloc x nvar x 9 + mapping = paddle.tile( + mapping, + [1] * (len(mldims) + len(vdef.shape)) + [3], + ) + virial = paddle.zeros(vldims + derv_c_ext_dims, dtype=vv.dtype).to( + device=vv.place + ) + # nf x nloc x nvar x 9 + new_ret[kk_derv_c] = decomp.scatter_reduce( + virial, + 1, + index=mapping, + src=model_ret[kk_derv_c], + reduce="sum", + ) + new_ret[kk_derv_c + "_redu"] = paddle.sum( + new_ret[kk_derv_c].to(redu_prec), axis=1 + ) + if not do_atomic_virial: + # pop atomic virial, because it is not correctly calculated. 
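+                # (the correction term in `task_deriv_one` was skipped)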
+ new_ret.pop(kk_derv_c) + return new_ret diff --git a/deepmd/pd/model/network/__init__.py b/deepmd/pd/model/network/__init__.py new file mode 100644 index 0000000000..6ceb116d85 --- /dev/null +++ b/deepmd/pd/model/network/__init__.py @@ -0,0 +1 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later diff --git a/deepmd/pd/model/network/init.py b/deepmd/pd/model/network/init.py new file mode 100644 index 0000000000..dbdad56794 --- /dev/null +++ b/deepmd/pd/model/network/init.py @@ -0,0 +1,458 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later + +# Copyright (c) 2024 The PyTorch Authors. All rights reserved. +# +# This file includes source code from PyTorch of version v2.3.0, which is released under the BSD-3-Clause license. +# For more information about PyTorch, visit https://pytorch.org/. + + +# These no_grad_* functions are necessary as wrappers around the parts of these +# functions that use `with paddle.no_grad()`. The JIT doesn't support context +# managers, so these need to be implemented as builtins. Using these wrappers +# lets us keep those builtins small and re-usable. + +from __future__ import ( + annotations, +) + +import math +import warnings + +import paddle +from paddle import ( + Tensor, +) + +PaddleGenerator = paddle.base.libpaddle.Generator + + +def _no_grad_uniform_(tensor: paddle.Tensor, a, b, generator=None): + with paddle.no_grad(): + return tensor.uniform_(a, b) + + +def _no_grad_normal_(tensor: paddle.Tensor, mean, std, generator=None): + with paddle.no_grad(): + return tensor.normal_(mean, std) + + +def _no_grad_trunc_normal_(tensor: paddle.Tensor, mean, std, a, b, generator=None): + # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf + def norm_cdf(x): + # Computes standard normal cumulative distribution function + return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0 + + if (mean < a - 2 * std) or (mean > b + 2 * std): + warnings.warn( + "mean is more than 2 std from [a, b] in nn.init.trunc_normal_. " + "The distribution of values may be incorrect.", + stacklevel=2, + ) + + with paddle.no_grad(): + # Values are generated by using a truncated uniform distribution and + # then using the inverse CDF for the normal distribution. + # Get upper and lower cdf values + l = norm_cdf((a - mean) / std) + u = norm_cdf((b - mean) / std) + + # Uniformly fill tensor with values from [l, u], then translate to + # [2l-1, 2u-1]. + tensor.uniform_(2 * l - 1, 2 * u - 1) + + # Use inverse cdf transform for normal distribution to get truncated + # standard normal + tensor.erfinv_() + + # Transform to proper mean, std + tensor.multiply_(std * math.sqrt(2.0)) + tensor.add_(mean) + + # Clamp to ensure it's in the proper range + tensor.clip_(min=a, max=b) + return tensor + + +def _no_grad_zero_(tensor: paddle.Tensor): + with paddle.no_grad(): + return tensor.zero_() + + +def _no_grad_fill_(tensor: paddle.Tensor, val): + with paddle.no_grad(): + return tensor.fill_(val) + + +def calculate_gain(nonlinearity, param=None): + r"""Return the recommended gain value for the given nonlinearity function. 
+ + The values are as follows: + + ================= ==================================================== + nonlinearity gain + ================= ==================================================== + Linear / Identity :math:`1` + Conv{1,2,3}D :math:`1` + Sigmoid :math:`1` + Tanh :math:`\frac{5}{3}` + ReLU :math:`\sqrt{2}` + Leaky Relu :math:`\sqrt{\frac{2}{1 + \text{negative\_slope}^2}}` + SELU :math:`\frac{3}{4}` + ================= ==================================================== + + .. warning:: + In order to implement `Self-Normalizing Neural Networks`_ , + you should use ``nonlinearity='linear'`` instead of ``nonlinearity='selu'``. + This gives the initial weights a variance of ``1 / N``, + which is necessary to induce a stable fixed point in the forward pass. + In contrast, the default gain for ``SELU`` sacrifices the normalization + effect for more stable gradient flow in rectangular layers. + + Args: + nonlinearity: the non-linear function (`nn.functional` name) + param: optional parameter for the non-linear function + + Examples + -------- + >>> gain = nn.init.calculate_gain( + ... "leaky_relu", 0.2 + ... ) # leaky_relu with negative_slope=0.2 + + .. _Self-Normalizing Neural Networks: https://papers.nips.cc/paper/2017/hash/5d44ee6f2c3f71b73125876103c8f6c4-Abstract.html + """ + linear_fns = [ + "linear", + "conv1d", + "conv2d", + "conv3d", + "conv_transpose1d", + "conv_transpose2d", + "conv_transpose3d", + ] + if nonlinearity in linear_fns or nonlinearity == "sigmoid": + return 1 + elif nonlinearity == "tanh": + return 5.0 / 3 + elif nonlinearity == "relu": + return math.sqrt(2.0) + elif nonlinearity == "leaky_relu": + if param is None: + negative_slope = 0.01 + elif ( + not isinstance(param, bool) + and isinstance(param, int) + or isinstance(param, float) + ): + # True/False are instances of int, hence check above + negative_slope = param + else: + raise ValueError(f"negative_slope {param} not a valid number") + return math.sqrt(2.0 / (1 + negative_slope**2)) + elif nonlinearity == "selu": + return ( + 3.0 / 4 + ) # Value found empirically (https://github.com/pytorch/pytorch/pull/50664) + else: + raise ValueError(f"Unsupported nonlinearity {nonlinearity}") + + +def _calculate_fan_in_and_fan_out(tensor, reverse=False): + dimensions = tensor.ndim + if dimensions < 2: + raise ValueError( + "Fan in and fan out can not be computed for tensor with fewer than 2 dimensions" + ) + + if reverse: + num_input_fmaps, num_output_fmaps = tensor.shape[0], tensor.shape[1] + else: + num_input_fmaps, num_output_fmaps = tensor.shape[1], tensor.shape[0] + + receptive_field_size = 1 + if tensor.ndim > 2: + for s in tensor.shape[2:]: + receptive_field_size *= s + fan_in = num_input_fmaps * receptive_field_size + fan_out = num_output_fmaps * receptive_field_size + + return fan_in, fan_out + + +def _calculate_correct_fan(tensor, mode, reverse=False): + mode = mode.lower() + valid_modes = ["fan_in", "fan_out"] + if mode not in valid_modes: + raise ValueError(f"Mode {mode} not supported, please use one of {valid_modes}") + + fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse) + return fan_in if mode == "fan_in" else fan_out + + +def zeros_(tensor: Tensor) -> Tensor: + r"""Fill the input Tensor with the scalar value `0`. + + Args: + tensor: an n-dimensional `paddle.Tensor` + + Examples + -------- + >>> w = paddle.empty(3, 5) + >>> nn.init.zeros_(w) + """ + return _no_grad_zero_(tensor) + + +def ones_(tensor: Tensor) -> Tensor: + r"""Fill the input Tensor with the scalar value `1`. 
+ + Args: + tensor: an n-dimensional `paddle.Tensor` + + Examples + -------- + >>> w = paddle.empty(3, 5) + >>> nn.init.ones_(w) + """ + return _no_grad_fill_(tensor, 1.0) + + +def constant_(tensor: Tensor, val: float) -> Tensor: + r"""Fill the input Tensor with the value :math:`\text{val}`. + + Args: + tensor: an n-dimensional `paddle.Tensor` + val: the value to fill the tensor with + + Examples + -------- + >>> w = paddle.empty(3, 5) + >>> nn.init.constant_(w, 0.3) + """ + return _no_grad_fill_(tensor, val) + + +def normal_( + tensor: Tensor, + mean: float = 0.0, + std: float = 1.0, + generator: PaddleGenerator | None = None, +) -> Tensor: + r"""Fill the input Tensor with values drawn from the normal distribution. + + :math:`\mathcal{N}(\text{mean}, \text{std}^2)`. + + Args: + tensor: an n-dimensional `paddle.Tensor` + mean: the mean of the normal distribution + std: the standard deviation of the normal distribution + generator: the paddle Generator to sample from (default: None) + + Examples + -------- + >>> w = paddle.empty(3, 5) + >>> nn.init.normal_(w) + """ + return _no_grad_normal_(tensor, mean, std, generator) + + +def trunc_normal_( + tensor: Tensor, + mean: float = 0.0, + std: float = 1.0, + a: float = -2.0, + b: float = 2.0, + generator: PaddleGenerator | None = None, +) -> Tensor: + r"""Fill the input Tensor with values drawn from a truncated normal distribution. + + The values are effectively drawn from the + normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)` + with values outside :math:`[a, b]` redrawn until they are within + the bounds. The method used for generating the random values works + best when :math:`a \leq \text{mean} \leq b`. + + Args: + tensor: an n-dimensional `paddle.Tensor` + mean: the mean of the normal distribution + std: the standard deviation of the normal distribution + a: the minimum cutoff value + b: the maximum cutoff value + generator: the paddle Generator to sample from (default: None) + + Examples + -------- + >>> w = paddle.empty(3, 5) + >>> nn.init.trunc_normal_(w) + """ + return _no_grad_trunc_normal_(tensor, mean, std, a, b) + + +def kaiming_uniform_( + tensor: Tensor, + a: float = 0, + mode: str = "fan_in", + nonlinearity: str = "leaky_relu", + generator: PaddleGenerator | None = None, + reverse: bool = False, +): + r"""Fill the input `Tensor` with values using a Kaiming uniform distribution. + + The method is described in `Delving deep into rectifiers: Surpassing + human-level performance on ImageNet classification` - He, K. et al. (2015). + The resulting tensor will have values sampled from + :math:`\mathcal{U}(-\text{bound}, \text{bound})` where + + .. math:: + \text{bound} = \text{gain} \times \sqrt{\frac{3}{\text{fan\_mode}}} + + Also known as He initialization. + + Args: + tensor: an n-dimensional `paddle.Tensor` + a: the negative slope of the rectifier used after this layer (only + used with ``'leaky_relu'``) + mode: either ``'fan_in'`` (default) or ``'fan_out'``. Choosing ``'fan_in'`` + preserves the magnitude of the variance of the weights in the + forward pass. Choosing ``'fan_out'`` preserves the magnitudes in the + backwards pass. + nonlinearity: the non-linear function (`nn.functional` name), + recommended to use only with ``'relu'`` or ``'leaky_relu'`` (default). + generator: the paddle Generator to sample from (default: None) + reverse (bool, optional): Tensor data format order, False by default as + [fout, fin, ...].. Defaults to False. 
+ + Examples + -------- + >>> w = paddle.empty(3, 5) + >>> nn.init.kaiming_uniform_(w, mode="fan_in", nonlinearity="relu") + """ + if 0 in tensor.shape: + warnings.warn("Initializing zero-element tensors is a no-op") + return tensor + fan = _calculate_correct_fan(tensor, mode, reverse) + gain = calculate_gain(nonlinearity, a) + std = gain / math.sqrt(fan) + bound = math.sqrt(3.0) * std # Calculate uniform bounds from standard deviation + with paddle.no_grad(): + return tensor.uniform_(-bound, bound) + + +def kaiming_normal_( + tensor: Tensor, + a: float = 0, + mode: str = "fan_in", + nonlinearity: str = "leaky_relu", + generator: PaddleGenerator | None = None, + reverse: bool = False, +): + r"""Fill the input `Tensor` with values using a Kaiming normal distribution. + + The method is described in `Delving deep into rectifiers: Surpassing + human-level performance on ImageNet classification` - He, K. et al. (2015). + The resulting tensor will have values sampled from + :math:`\mathcal{N}(0, \text{std}^2)` where + + .. math:: + \text{std} = \frac{\text{gain}}{\sqrt{\text{fan\_mode}}} + + Also known as He initialization. + + Args: + tensor: an n-dimensional `paddle.Tensor` + a: the negative slope of the rectifier used after this layer (only + used with ``'leaky_relu'``) + mode: either ``'fan_in'`` (default) or ``'fan_out'``. Choosing ``'fan_in'`` + preserves the magnitude of the variance of the weights in the + forward pass. Choosing ``'fan_out'`` preserves the magnitudes in the + backwards pass. + nonlinearity: the non-linear function (`nn.functional` name), + recommended to use only with ``'relu'`` or ``'leaky_relu'`` (default). + generator: the paddle Generator to sample from (default: None) + reverse (bool, optional): Tensor data format order, False by default as + [fout, fin, ...].. Defaults to False. + + Examples + -------- + >>> w = paddle.empty(3, 5) + >>> nn.init.kaiming_normal_(w, mode="fan_out", nonlinearity="relu") + """ + if 0 in tensor.shape: + warnings.warn("Initializing zero-element tensors is a no-op") + return tensor + fan = _calculate_correct_fan(tensor, mode, reverse) + gain = calculate_gain(nonlinearity, a) + std = gain / math.sqrt(fan) + with paddle.no_grad(): + return tensor.normal_(0, std) + + +def xavier_uniform_( + tensor: Tensor, + gain: float = 1.0, + generator: PaddleGenerator | None = None, + reverse: bool = False, +) -> Tensor: + r"""Fill the input `Tensor` with values using a Xavier uniform distribution. + + The method is described in `Understanding the difficulty of training + deep feedforward neural networks` - Glorot, X. & Bengio, Y. (2010). + The resulting tensor will have values sampled from + :math:`\mathcal{U}(-a, a)` where + + .. math:: + a = \text{gain} \times \sqrt{\frac{6}{\text{fan\_in} + \text{fan\_out}}} + + Also known as Glorot initialization. + + Args: + tensor: an n-dimensional `paddle.Tensor` + gain: an optional scaling factor + generator: the paddle Generator to sample from (default: None) + reverse (bool, optional): Tensor data format order, False by default as + [fout, fin, ...].. Defaults to False. 
+ + Examples + -------- + >>> w = paddle.empty(3, 5) + >>> nn.init.xavier_uniform_(w, gain=nn.init.calculate_gain("relu")) + """ + fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse=reverse) + std = gain * math.sqrt(2.0 / float(fan_in + fan_out)) + a = math.sqrt(3.0) * std # Calculate uniform bounds from standard deviation + + return _no_grad_uniform_(tensor, -a, a, generator) + + +def xavier_normal_( + tensor: Tensor, + gain: float = 1.0, + generator: PaddleGenerator | None = None, + reverse: bool = False, +) -> Tensor: + r"""Fill the input `Tensor` with values using a Xavier normal distribution. + + The method is described in `Understanding the difficulty of training deep feedforward + neural networks` - Glorot, X. & Bengio, Y. (2010). The resulting tensor + will have values sampled from :math:`\mathcal{N}(0, \text{std}^2)` where + + .. math:: + \text{std} = \text{gain} \times \sqrt{\frac{2}{\text{fan\_in} + \text{fan\_out}}} + + Also known as Glorot initialization. + + Args: + tensor: an n-dimensional `paddle.Tensor` + gain: an optional scaling factor + generator: the paddle Generator to sample from (default: None) + reverse (bool, optional): Tensor data format order, False by + default as [fout, fin, ...]. Defaults to False. + + Examples + -------- + >>> w = paddle.empty(3, 5) + >>> nn.init.xavier_normal_(w) + """ + fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse=reverse) + std = gain * math.sqrt(2.0 / float(fan_in + fan_out)) + + return _no_grad_normal_(tensor, 0.0, std, generator) diff --git a/deepmd/pd/model/network/mlp.py b/deepmd/pd/model/network/mlp.py new file mode 100644 index 0000000000..370b0fa8fa --- /dev/null +++ b/deepmd/pd/model/network/mlp.py @@ -0,0 +1,328 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +from __future__ import ( + annotations, +) + +from typing import ( + ClassVar, +) + +import numpy as np +import paddle +import paddle.nn as nn + +from deepmd.pd.utils import ( + env, +) + +device = env.DEVICE + +from deepmd.dpmodel.utils import ( + NativeLayer, +) +from deepmd.dpmodel.utils import NetworkCollection as DPNetworkCollection +from deepmd.dpmodel.utils import ( + make_embedding_network, + make_fitting_network, + make_multilayer_network, +) +from deepmd.pd.model.network.init import ( + PaddleGenerator, + kaiming_normal_, + normal_, + trunc_normal_, + xavier_uniform_, +) +from deepmd.pd.utils.env import ( + DEFAULT_PRECISION, + PRECISION_DICT, +) +from deepmd.pd.utils.utils import ( + ActivationFn, + get_generator, + to_numpy_array, + to_paddle_tensor, +) + + +def empty_t(shape, precision): + return paddle.empty(shape, dtype=precision).to(device=device) + + +class Identity(nn.Layer): + def __init__(self): + super().__init__() + + def forward( + self, + xx: paddle.Tensor, + ) -> paddle.Tensor: + """The Identity operation layer.""" + return xx + + def serialize(self) -> dict: + return { + "@class": "Identity", + "@version": 1, + } + + @classmethod + def deserialize(cls, data: dict) -> Identity: + return Identity() + + +class MLPLayer(nn.Layer): + def __init__( + self, + num_in, + num_out, + bias: bool = True, + use_timestep: bool = False, + activation_function: str | None = None, + resnet: bool = False, + bavg: float = 0.0, + stddev: float = 1.0, + precision: str = DEFAULT_PRECISION, + init: str = "default", + seed: int | list[int] | None = None, + ): + super().__init__() + # only use_timestep when skip connection is established. 
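+        # i.e. only when the output width equals the input width or twice it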
+        self.use_timestep = use_timestep and (
+            num_out == num_in or num_out == num_in * 2
+        )
+        self.num_in = num_in
+        self.num_out = num_out
+        self.activate_name = activation_function
+        self.activate = ActivationFn(self.activate_name)
+        self.precision = precision
+        self.prec = PRECISION_DICT[self.precision]
+        self.matrix = self.create_parameter(
+            (num_in, num_out),
+            dtype=self.prec,
+            default_initializer=nn.initializer.Assign(
+                empty_t((num_in, num_out), self.prec)
+            ),
+        )
+        random_generator = get_generator(seed)
+        if bias:
+            self.bias = self.create_parameter(
+                [num_out],
+                dtype=self.prec,
+                default_initializer=nn.initializer.Assign(
+                    empty_t([num_out], self.prec)
+                ),
+            )
+        else:
+            self.bias = None
+        if self.use_timestep:
+            self.idt = self.create_parameter(
+                [num_out],
+                dtype=self.prec,
+                default_initializer=nn.initializer.Assign(
+                    empty_t([num_out], self.prec)
+                ),
+            )
+        else:
+            self.idt = None
+        self.resnet = resnet
+        if init == "default":
+            self._default_normal_init(
+                bavg=bavg, stddev=stddev, generator=random_generator
+            )
+        elif init == "trunc_normal":
+            self._trunc_normal_init(1.0, generator=random_generator)
+        elif init == "relu":
+            self._trunc_normal_init(2.0, generator=random_generator)
+        elif init == "glorot":
+            self._glorot_uniform_init(generator=random_generator)
+        elif init == "gating":
+            # `self.use_bias` is never defined on this class; pass the `bias`
+            # argument instead to avoid an AttributeError at runtime
+            self._zero_init(bias)
+        elif init == "kaiming_normal":
+            self._normal_init(generator=random_generator)
+        elif init == "final":
+            self._zero_init(False)
+        else:
+            raise ValueError(f"Unknown initialization method: {init}")
+
+    def check_type_consistency(self):
+        precision = self.precision
+
+        def check_var(var):
+            if var is not None:
+                # assertion "float64" == "double" would fail
+                assert PRECISION_DICT[var.dtype.name] is PRECISION_DICT[precision]
+
+        check_var(self.matrix)
+        check_var(self.bias)
+        check_var(self.idt)
+
+    def dim_in(self) -> int:
+        return self.matrix.shape[0]
+
+    def dim_out(self) -> int:
+        return self.matrix.shape[1]
+
+    def _default_normal_init(
+        self,
+        bavg: float = 0.0,
+        stddev: float = 1.0,
+        generator: PaddleGenerator | None = None,
+    ):
+        normal_(
+            self.matrix.data,
+            std=stddev / np.sqrt(self.num_out + self.num_in),
+            generator=generator,
+        )
+        if self.bias is not None:
+            normal_(self.bias.data, mean=bavg, std=stddev, generator=generator)
+        if self.idt is not None:
+            normal_(self.idt.data, mean=0.1, std=0.001, generator=generator)
+
+    def _trunc_normal_init(self, scale=1.0, generator: PaddleGenerator | None = None):
+        # Constant from scipy.stats.truncnorm.std(a=-2, b=2, loc=0., scale=1.)
+        TRUNCATED_NORMAL_STDDEV_FACTOR = 0.87962566103423978
+        _, fan_in = self.matrix.shape
+        scale = scale / max(1, fan_in)
+        std = (scale**0.5) / TRUNCATED_NORMAL_STDDEV_FACTOR
+        trunc_normal_(self.matrix, mean=0.0, std=std, generator=generator)
+
+    def _glorot_uniform_init(self, generator: PaddleGenerator | None = None):
+        xavier_uniform_(self.matrix, gain=1, generator=generator)
+
+    def _zero_init(self, use_bias=True):
+        with paddle.no_grad():
+            self.matrix.fill_(0.0)
+            if use_bias and self.bias is not None:
+                with paddle.no_grad():
+                    self.bias.fill_(1.0)
+
+    def _normal_init(self, generator: PaddleGenerator | None = None):
+        kaiming_normal_(self.matrix, nonlinearity="linear", generator=generator)
+
+    def forward(
+        self,
+        xx: paddle.Tensor,
+    ) -> paddle.Tensor:
+        """One MLP layer used by DP model.
+
+        Parameters
+        ----------
+        xx : paddle.Tensor
+            The input.
+
+        Returns
+        -------
+        yy: paddle.Tensor
+            The output.
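+
+        Notes
+        -----
+        The input is cast to the layer precision before the affine transform
+        and the result is cast back to the original dtype on return.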
+ """ + ori_prec = xx.dtype + xx = xx.astype(self.prec) + yy = ( + paddle.matmul(xx, self.matrix) + self.bias + if self.bias is not None + else paddle.matmul(xx, self.matrix) + ) + yy = self.activate(yy).clone() + yy = yy * self.idt if self.idt is not None else yy + if self.resnet: + if xx.shape[-1] == yy.shape[-1]: + yy += xx + elif 2 * xx.shape[-1] == yy.shape[-1]: + yy += paddle.concat([xx, xx], axis=-1) + # else: + # yy = yy + yy = yy.astype(ori_prec) + return yy + + def serialize(self) -> dict: + """Serialize the layer to a dict. + + Returns + ------- + dict + The serialized layer. + """ + nl = NativeLayer( + self.matrix.shape[0], + self.matrix.shape[1], + bias=self.bias is not None, + use_timestep=self.idt is not None, + activation_function=self.activate_name, + resnet=self.resnet, + precision=self.precision, + ) + nl.w, nl.b, nl.idt = ( + to_numpy_array(self.matrix), + to_numpy_array(self.bias), + to_numpy_array(self.idt), + ) + return nl.serialize() + + @classmethod + def deserialize(cls, data: dict) -> MLPLayer: + """Deserialize the layer from a dict. + + Parameters + ---------- + data : dict + The dict to deserialize from. + """ + nl = NativeLayer.deserialize(data) + obj = cls( + nl["matrix"].shape[0], + nl["matrix"].shape[1], + bias=nl["bias"] is not None, + use_timestep=nl["idt"] is not None, + activation_function=nl["activation_function"], + resnet=nl["resnet"], + precision=nl["precision"], + ) + prec = PRECISION_DICT[obj.precision] + + def check_load_param(ss): + if nl[ss] is not None: + tensor = to_paddle_tensor(nl[ss]) + return paddle.create_parameter( + tensor.shape, + dtype=tensor.dtype, + default_initializer=nn.initializer.Assign(tensor), + ) + return None + + obj.matrix = check_load_param("matrix") + obj.bias = check_load_param("bias") + obj.idt = check_load_param("idt") + return obj + + +MLP_ = make_multilayer_network(MLPLayer, nn.Layer) + + +class MLP(MLP_): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.layers = paddle.nn.LayerList(self.layers) + + forward = MLP_.call + + +EmbeddingNet = make_embedding_network(MLP, MLPLayer) + +FittingNet = make_fitting_network(EmbeddingNet, MLP, MLPLayer) + + +class NetworkCollection(DPNetworkCollection, nn.Layer): + """Paddle implementation of NetworkCollection.""" + + NETWORK_TYPE_MAP: ClassVar[dict[str, type]] = { + "network": MLP, + "embedding_network": EmbeddingNet, + "fitting_network": FittingNet, + } + + def __init__(self, *args, **kwargs): + # init both two base classes + DPNetworkCollection.__init__(self, *args, **kwargs) + nn.Layer.__init__(self) + self.networks = self._networks = paddle.nn.LayerList(self._networks) diff --git a/deepmd/pd/model/network/network.py b/deepmd/pd/model/network/network.py new file mode 100644 index 0000000000..f118c234ab --- /dev/null +++ b/deepmd/pd/model/network/network.py @@ -0,0 +1,325 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +from typing import ( + Optional, + Union, +) + +import paddle +import paddle.nn as nn + +from deepmd.dpmodel.utils.type_embed import ( + get_econf_tebd, +) +from deepmd.pd.model.network.mlp import ( + EmbeddingNet, +) +from deepmd.pd.utils import ( + env, +) +from deepmd.pd.utils.utils import ( + to_paddle_tensor, +) +from deepmd.utils.finetune import ( + get_index_between_two_maps, +) +from deepmd.utils.version import ( + check_version_compatibility, +) + + +def Tensor(*shape): + return paddle.empty(shape, dtype=env.GLOBAL_PD_FLOAT_PRECISION).to( + device=env.DEVICE + ) + + +class TypeEmbedNet(nn.Layer): + def __init__( + 
self,
+        type_nums,
+        embed_dim,
+        bavg=0.0,
+        stddev=1.0,
+        precision="default",
+        seed: Optional[Union[int, list[int]]] = None,
+        use_econf_tebd=False,
+        use_tebd_bias: bool = False,
+        type_map=None,
+    ):
+        """Construct a type embedding net."""
+        super().__init__()
+        self.type_nums = type_nums
+        self.embed_dim = embed_dim
+        self.bavg = bavg
+        self.stddev = stddev
+        self.use_econf_tebd = use_econf_tebd
+        self.use_tebd_bias = use_tebd_bias
+        self.type_map = type_map
+        self.embedding = TypeEmbedNetConsistent(
+            ntypes=self.type_nums,
+            neuron=[self.embed_dim],
+            padding=True,
+            activation_function="Linear",
+            use_econf_tebd=use_econf_tebd,
+            use_tebd_bias=use_tebd_bias,
+            type_map=type_map,
+            precision=precision,
+            seed=seed,
+        )
+        # init.normal_(self.embedding.weight[:-1], mean=bavg, std=stddev)
+
+    def forward(self, atype):
+        """
+        Args:
+            atype: Type of each input, [nframes, nloc] or [nframes, nloc, nnei].
+
+        Returns
+        -------
+        type_embedding:
+            Type embedding for each atom, with shape `atype.shape + [embed_dim]`.
+
+        """
+        return self.embedding(atype.place)[atype]
+
+    def share_params(self, base_class, shared_level, resume=False):
+        """
+        Share the parameters of self to the base_class with shared_level during multitask training.
+        If not starting from a checkpoint (resume is False),
+        some separated parameters (e.g. mean and stddev) will be re-calculated across different classes.
+        """
+        assert (
+            self.__class__ == base_class.__class__
+        ), "Only TypeEmbedNet of the same type can share params!"
+        if shared_level == 0:
+            # the following will successfully link all the params except buffers, which need to be linked manually.
+            for item in self._sub_layers:
+                self._sub_layers[item] = base_class._sub_layers[item]
+        else:
+            raise NotImplementedError
+
+    def change_type_map(
+        self, type_map: list[str], model_with_new_type_stat=None
+    ) -> None:
+        """Change the type related params to new ones, according to `type_map` and the original one in the model.
+        If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types.
+        """
+        self.embedding.change_type_map(type_map=type_map)
+
+
+class TypeEmbedNetConsistent(nn.Layer):
+    r"""Type embedding network that is consistent with other backends.
+
+    Parameters
+    ----------
+    ntypes : int
+        Number of atom types
+    neuron : list[int]
+        Number of neurons in each hidden layers of the embedding net
+    resnet_dt
+        Time-step `dt` in the resnet construction: y = x + dt * \phi (Wx + b)
+    activation_function
+        The activation function in the embedding net. Supported options are |ACTIVATION_FN|
+    precision
+        The precision of the embedding net parameters. Supported options are |PRECISION|
+    trainable
+        If the weights of embedding net are trainable.
+    seed
+        Random seed for initializing the network parameters.
+    padding
+        Concat the zero padding to the output, as the default embedding of empty type.
+    use_econf_tebd : bool, optional
+        Whether to use electronic configuration type embedding.
+    use_tebd_bias : bool, optional
+        Whether to use bias in the type embedding layer.
+    type_map : list[str], optional
+        A list of strings. Give the name to each type of atoms.
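+
+    Examples
+    --------
+    A minimal sketch; the sizes and the device string are illustrative only:
+
+    >>> tebd = TypeEmbedNetConsistent(ntypes=2, neuron=[8])
+    >>> emb = tebd("cpu")  # one embedding row per type: shape [2, 8]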
+    """
+
+    def __init__(
+        self,
+        *,
+        ntypes: int,
+        neuron: list[int],
+        resnet_dt: bool = False,
+        activation_function: str = "tanh",
+        precision: str = "default",
+        trainable: bool = True,
+        seed: Optional[Union[int, list[int]]] = None,
+        padding: bool = False,
+        use_econf_tebd: bool = False,
+        use_tebd_bias: bool = False,
+        type_map: Optional[list[str]] = None,
+    ):
+        """Construct a type embedding net."""
+        super().__init__()
+        self.ntypes = ntypes
+        self.neuron = neuron
+        self.seed = seed
+        self.resnet_dt = resnet_dt
+        self.precision = precision
+        self.prec = env.PRECISION_DICT[self.precision]
+        self.activation_function = str(activation_function)
+        self.trainable = trainable
+        self.padding = padding
+        self.use_econf_tebd = use_econf_tebd
+        self.use_tebd_bias = use_tebd_bias
+        self.type_map = type_map
+        self.econf_tebd = None
+        embed_input_dim = ntypes
+        if self.use_econf_tebd:
+            econf_tebd, embed_input_dim = get_econf_tebd(
+                self.type_map, precision=self.precision
+            )
+            self.econf_tebd = to_paddle_tensor(econf_tebd)
+        self.embedding_net = EmbeddingNet(
+            embed_input_dim,
+            self.neuron,
+            self.activation_function,
+            self.resnet_dt,
+            self.precision,
+            self.seed,
+            bias=self.use_tebd_bias,
+        )
+        for param in self.parameters():
+            param.stop_gradient = not trainable
+
+    def forward(self, device: str):
+        """Calculate the type embedding network.
+
+        Returns
+        -------
+        type_embedding: paddle.Tensor
+            The type embedding matrix, of shape [ntypes + 1, neuron[-1]] when
+            padding is enabled, else [ntypes, neuron[-1]].
+        """
+        if not self.use_econf_tebd:
+            embed = self.embedding_net(
+                paddle.eye(self.ntypes, dtype=self.prec).to(device=device)
+            )
+        else:
+            assert self.econf_tebd is not None
+            embed = self.embedding_net(self.econf_tebd.to(device))
+        if self.padding:
+            embed = paddle.concat(
+                [
+                    embed,
+                    paddle.zeros([1, embed.shape[1]], dtype=self.prec).to(
+                        device=device
+                    ),
+                ]
+            )
+        return embed
+
+    def change_type_map(
+        self, type_map: list[str], model_with_new_type_stat=None
+    ) -> None:
+        """Change the type related params to new ones, according to `type_map` and the original one in the model.
+        If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types.
+        """
+        assert (
+            self.type_map is not None
+        ), "'type_map' must be defined when performing type changing!"
+        remap_index, has_new_type = get_index_between_two_maps(self.type_map, type_map)
+        if not self.use_econf_tebd:
+            do_resnet = self.neuron[0] in [
+                self.ntypes,
+                self.ntypes * 2,
+                len(type_map),
+                len(type_map) * 2,
+            ]
+            assert (
+                not do_resnet or self.activation_function == "Linear"
+            ), "'activation_function' must be 'Linear' when performing type changing on resnet structure!"
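+            # with the resnet shortcut the effective first-layer weight is W + I,
+            # so fold the identity in before remapping the type rows and
+            # subtract the (resized) identity again afterwards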
+ first_layer_matrix = self.embedding_net.layers[0].matrix + eye_vector = paddle.eye(self.ntypes, dtype=self.prec).to( + device=first_layer_matrix.place + ) + # preprocess for resnet connection + if self.neuron[0] == self.ntypes: + first_layer_matrix += eye_vector + elif self.neuron[0] == self.ntypes * 2: + first_layer_matrix += paddle.concat([eye_vector, eye_vector], axis=-1) + + # randomly initialize params for the unseen types + if has_new_type: + extend_type_params = paddle.rand( + [len(type_map), first_layer_matrix.shape[-1]], + dtype=first_layer_matrix.dtype, + ).to(device=first_layer_matrix.place) + first_layer_matrix = paddle.concat( + [first_layer_matrix, extend_type_params], axis=0 + ) + + first_layer_matrix = first_layer_matrix[remap_index] + new_ntypes = len(type_map) + eye_vector = paddle.eye(new_ntypes, dtype=self.prec).to( + device=first_layer_matrix.place + ) + + if self.neuron[0] == new_ntypes: + first_layer_matrix -= eye_vector + elif self.neuron[0] == new_ntypes * 2: + first_layer_matrix -= paddle.concat([eye_vector, eye_vector], axis=-1) + + self.embedding_net.layers[0].num_in = new_ntypes + self.embedding_net.layers[0].matrix = self.create_parameter( + first_layer_matrix.shape, + dtype=first_layer_matrix.dtype, + default_initializer=nn.initializer.Assign(first_layer_matrix), + ) + else: + econf_tebd, embed_input_dim = get_econf_tebd( + type_map, precision=self.precision + ) + self.econf_tebd = to_paddle_tensor(econf_tebd) + self.type_map = type_map + self.ntypes = len(type_map) + + @classmethod + def deserialize(cls, data: dict): + """Deserialize the model. + + Parameters + ---------- + data : dict + The serialized data + + Returns + ------- + TypeEmbedNetConsistent + The deserialized model + """ + data = data.copy() + check_version_compatibility(data.pop("@version", 1), 2, 1) + data_cls = data.pop("@class") + assert data_cls == "TypeEmbedNet", f"Invalid class {data_cls}" + + embedding_net = EmbeddingNet.deserialize(data.pop("embedding")) + # compat with version 1 + if "use_tebd_bias" not in data: + data["use_tebd_bias"] = True + type_embedding_net = cls(**data) + type_embedding_net.embedding_net = embedding_net + return type_embedding_net + + def serialize(self) -> dict: + """Serialize the model. 
+ + Returns + ------- + dict + The serialized data + """ + return { + "@class": "TypeEmbedNet", + "@version": 2, + "ntypes": self.ntypes, + "neuron": self.neuron, + "resnet_dt": self.resnet_dt, + "precision": self.precision, + "activation_function": self.activation_function, + "trainable": self.trainable, + "padding": self.padding, + "use_econf_tebd": self.use_econf_tebd, + "use_tebd_bias": self.use_tebd_bias, + "type_map": self.type_map, + "embedding": self.embedding_net.serialize(), + } diff --git a/deepmd/pd/model/task/__init__.py b/deepmd/pd/model/task/__init__.py new file mode 100644 index 0000000000..ad616156c7 --- /dev/null +++ b/deepmd/pd/model/task/__init__.py @@ -0,0 +1,16 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +from .base_fitting import ( + BaseFitting, +) +from .ener import ( + EnergyFittingNet, +) +from .fitting import ( + Fitting, +) + +__all__ = [ + "EnergyFittingNet", + "Fitting", + "BaseFitting", +] diff --git a/deepmd/pd/model/task/base_fitting.py b/deepmd/pd/model/task/base_fitting.py new file mode 100644 index 0000000000..9ad3b801cd --- /dev/null +++ b/deepmd/pd/model/task/base_fitting.py @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +import paddle + +from deepmd.dpmodel.fitting import ( + make_base_fitting, +) + +BaseFitting = make_base_fitting(paddle.Tensor, fwd_method_name="forward") diff --git a/deepmd/pd/model/task/ener.py b/deepmd/pd/model/task/ener.py new file mode 100644 index 0000000000..ed0cfac69d --- /dev/null +++ b/deepmd/pd/model/task/ener.py @@ -0,0 +1,85 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +import copy +import logging +from typing import ( + Optional, + Union, +) + +import paddle + +from deepmd.pd.model.task.fitting import ( + Fitting, + GeneralFitting, +) +from deepmd.pd.model.task.invar_fitting import ( + InvarFitting, +) +from deepmd.pd.utils import ( + env, +) +from deepmd.pd.utils.env import ( + DEFAULT_PRECISION, +) +from deepmd.utils.version import ( + check_version_compatibility, +) + +dtype = env.GLOBAL_PD_FLOAT_PRECISION +device = env.DEVICE + +log = logging.getLogger(__name__) + + +@Fitting.register("ener") +class EnergyFittingNet(InvarFitting): + def __init__( + self, + ntypes: int, + dim_descrpt: int, + neuron: list[int] = [128, 128, 128], + bias_atom_e: Optional[paddle.Tensor] = None, + resnet_dt: bool = True, + numb_fparam: int = 0, + numb_aparam: int = 0, + activation_function: str = "tanh", + precision: str = DEFAULT_PRECISION, + mixed_types: bool = True, + seed: Optional[Union[int, list[int]]] = None, + type_map: Optional[list[str]] = None, + **kwargs, + ): + super().__init__( + "energy", + ntypes, + dim_descrpt, + 1, + neuron=neuron, + bias_atom_e=bias_atom_e, + resnet_dt=resnet_dt, + numb_fparam=numb_fparam, + numb_aparam=numb_aparam, + activation_function=activation_function, + precision=precision, + mixed_types=mixed_types, + seed=seed, + type_map=type_map, + **kwargs, + ) + + @classmethod + def deserialize(cls, data: dict) -> "GeneralFitting": + data = copy.deepcopy(data) + check_version_compatibility(data.pop("@version", 1), 2, 1) + data.pop("var_name") + data.pop("dim_out") + return super().deserialize(data) + + def serialize(self) -> dict: + """Serialize the fitting to dict.""" + return { + **super().serialize(), + "type": "ener", + } + + exclude_types: list[int] diff --git a/deepmd/pd/model/task/fitting.py b/deepmd/pd/model/task/fitting.py new file mode 100644 index 0000000000..9008ef8af3 --- /dev/null +++ b/deepmd/pd/model/task/fitting.py @@ -0,0 +1,506 @@ +# SPDX-License-Identifier: 
LGPL-3.0-or-later
+import copy
+import logging
+from abc import (
+    abstractmethod,
+)
+from typing import (
+    Optional,
+    Union,
+)
+
+import numpy as np
+import paddle
+
+from deepmd.dpmodel.utils.seed import (
+    child_seed,
+)
+from deepmd.pd.model.network.mlp import (
+    FittingNet,
+    NetworkCollection,
+)
+from deepmd.pd.model.task.base_fitting import (
+    BaseFitting,
+)
+from deepmd.pd.utils import (
+    env,
+)
+from deepmd.pd.utils.env import (
+    DEFAULT_PRECISION,
+    PRECISION_DICT,
+)
+from deepmd.pd.utils.exclude_mask import (
+    AtomExcludeMask,
+)
+from deepmd.pd.utils.utils import (
+    to_numpy_array,
+    to_paddle_tensor,
+)
+from deepmd.utils.finetune import (
+    get_index_between_two_maps,
+    map_atom_exclude_types,
+)
+
+dtype = env.GLOBAL_PD_FLOAT_PRECISION
+device = env.DEVICE
+
+log = logging.getLogger(__name__)
+
+
+class Fitting(paddle.nn.Layer, BaseFitting):
+    # plugin moved to BaseFitting
+
+    def __new__(cls, *args, **kwargs):
+        if cls is Fitting:
+            return BaseFitting.__new__(BaseFitting, *args, **kwargs)
+        return super().__new__(cls)
+
+    def share_params(self, base_class, shared_level, resume=False):
+        """
+        Share the parameters of self to the base_class with shared_level during multitask training.
+        If not starting from a checkpoint (resume is False),
+        some separated parameters (e.g. mean and stddev) will be re-calculated across different classes.
+        """
+        assert (
+            self.__class__ == base_class.__class__
+        ), "Only fitting nets of the same type can share params!"
+        if shared_level == 0:
+            # link buffers
+            if hasattr(self, "bias_atom_e"):
+                self.bias_atom_e = base_class.bias_atom_e
+            # the following will successfully link all the params except buffers, which need to be linked manually.
+            for item in self._sub_layers:
+                self._sub_layers[item] = base_class._sub_layers[item]
+        elif shared_level == 1:
+            # share everything except bias_atom_e
+            # the following will successfully link all the params except buffers, which need to be linked manually.
+            for item in self._sub_layers:
+                self._sub_layers[item] = base_class._sub_layers[item]
+        else:
+            raise NotImplementedError
+
+
+class GeneralFitting(Fitting):
+    """Construct a general fitting net.
+
+    Parameters
+    ----------
+    var_name : str
+        The atomic property to fit, 'energy', 'dipole', and 'polar'.
+    ntypes : int
+        Element count.
+    dim_descrpt : int
+        Embedding width per atom.
+    dim_out : int
+        The output dimension of the fitting net.
+    neuron : list[int]
+        Number of neurons in each hidden layers of the fitting net.
+    bias_atom_e : paddle.Tensor, optional
+        Average energy per atom for each element.
+    resnet_dt : bool
+        Using time-step in the ResNet construction.
+    numb_fparam : int
+        Number of frame parameters.
+    numb_aparam : int
+        Number of atomic parameters.
+    activation_function : str
+        Activation function.
+    precision : str
+        Numerical precision.
+    mixed_types : bool
+        If true, use a uniform fitting net for all atom types, otherwise use
+        different fitting nets for different atom types.
+    rcond : float, optional
+        The condition number for the regression of atomic energy.
+    seed : int, optional
+        Random seed.
+    exclude_types : list[int]
+        Atomic contributions of the excluded atom types are set zero.
+    trainable : Union[list[bool], bool]
+        If the parameters in the fitting net are trainable.
+        Currently this only supports setting all parameters in the fitting net
+        to the same state; when given as list[bool], trainable is True only if
+        all the boolean entries are True.
+    remove_vaccum_contribution : list[bool], optional
+        Remove the vacuum contribution before the bias is added. For
+        `mixed_types` provide `[True]`; otherwise it should be a list of the
+        same length as `ntypes`, indicating whether to remove the vacuum
+        contribution for each atom type.
+    type_map : list[str], optional
+        A list of strings. Give the name to each type of atoms.
+    use_aparam_as_mask : bool
+        If True, the aparam will not be used in fitting net for embedding.
+    """
+
+    def __init__(
+        self,
+        var_name: str,
+        ntypes: int,
+        dim_descrpt: int,
+        neuron: list[int] = [128, 128, 128],
+        bias_atom_e: Optional[paddle.Tensor] = None,
+        resnet_dt: bool = True,
+        numb_fparam: int = 0,
+        numb_aparam: int = 0,
+        activation_function: str = "tanh",
+        precision: str = DEFAULT_PRECISION,
+        mixed_types: bool = True,
+        rcond: Optional[float] = None,
+        seed: Optional[Union[int, list[int]]] = None,
+        exclude_types: list[int] = [],
+        trainable: Union[bool, list[bool]] = True,
+        remove_vaccum_contribution: Optional[list[bool]] = None,
+        type_map: Optional[list[str]] = None,
+        use_aparam_as_mask: bool = False,
+        **kwargs,
+    ):
+        super().__init__()
+        self.var_name = var_name
+        self.ntypes = ntypes
+        self.dim_descrpt = dim_descrpt
+        self.neuron = neuron
+        self.mixed_types = mixed_types
+        self.resnet_dt = resnet_dt
+        self.numb_fparam = numb_fparam
+        self.numb_aparam = numb_aparam
+        self.activation_function = activation_function
+        self.precision = precision
+        self.prec = PRECISION_DICT[self.precision]
+        self.rcond = rcond
+        self.seed = seed
+        self.type_map = type_map
+        self.use_aparam_as_mask = use_aparam_as_mask
+        # order matters, should be placed after the assignment of ntypes
+        self.reinit_exclude(exclude_types)
+        self.trainable = trainable
+        # need support for each layer settings
+        self.trainable = (
+            all(self.trainable) if isinstance(self.trainable, list) else self.trainable
+        )
+        self.remove_vaccum_contribution = remove_vaccum_contribution
+
+        net_dim_out = self._net_out_dim()
+        # init constants
+        if bias_atom_e is None:
+            bias_atom_e = np.zeros([self.ntypes, net_dim_out], dtype=np.float64)
+        bias_atom_e = paddle.to_tensor(bias_atom_e, dtype=self.prec).to(device=device)
+        bias_atom_e = bias_atom_e.reshape([self.ntypes, net_dim_out])
+        if not self.mixed_types:
+            assert self.ntypes == bias_atom_e.shape[0], "Element count mismatches!"
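+        # registered as buffers: stored in the state_dict and checkpoints,
+        # but never updated by the optimizer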
+ self.register_buffer("bias_atom_e", bias_atom_e) + + if self.numb_fparam > 0: + self.register_buffer( + "fparam_avg", + paddle.zeros([self.numb_fparam], dtype=self.prec).to(device=device), + ) + self.register_buffer( + "fparam_inv_std", + paddle.ones([self.numb_fparam], dtype=self.prec).to(device=device), + ) + else: + self.fparam_avg, self.fparam_inv_std = None, None + if self.numb_aparam > 0: + self.register_buffer( + "aparam_avg", + paddle.zeros([self.numb_aparam], dtype=self.prec).to(device=device), + ) + self.register_buffer( + "aparam_inv_std", + paddle.ones([self.numb_aparam], dtype=self.prec).to(device=device), + ) + else: + self.aparam_avg, self.aparam_inv_std = None, None + + in_dim = ( + self.dim_descrpt + + self.numb_fparam + + (0 if self.use_aparam_as_mask else self.numb_aparam) + ) + + self.filter_layers = NetworkCollection( + 1 if not self.mixed_types else 0, + self.ntypes, + network_type="fitting_network", + networks=[ + FittingNet( + in_dim, + net_dim_out, + self.neuron, + self.activation_function, + self.resnet_dt, + self.precision, + bias_out=True, + seed=child_seed(self.seed, ii), + ) + for ii in range(self.ntypes if not self.mixed_types else 1) + ], + ) + # set trainable + for param in self.parameters(): + param.stop_gradient = not self.trainable + + def reinit_exclude( + self, + exclude_types: list[int] = [], + ): + self.exclude_types = exclude_types + self.emask = AtomExcludeMask(self.ntypes, self.exclude_types) + + def change_type_map( + self, type_map: list[str], model_with_new_type_stat=None + ) -> None: + """Change the type related params to new ones, according to `type_map` and the original one in the model. + If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types. + """ + assert ( + self.type_map is not None + ), "'type_map' must be defined when performing type changing!" + assert self.mixed_types, "Only models in mixed types can perform type changing!" 
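+        # remap_index maps every type in the new type_map to its row in the
+        # old tables; has_new_type flags types absent from the old map, whose
+        # bias rows are zero-initialized below before the remap is applied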
+        remap_index, has_new_type = get_index_between_two_maps(self.type_map, type_map)
+        self.type_map = type_map
+        self.ntypes = len(type_map)
+        self.reinit_exclude(map_atom_exclude_types(self.exclude_types, remap_index))
+        if has_new_type:
+            extend_shape = [len(type_map), *list(self.bias_atom_e.shape[1:])]
+            extend_bias_atom_e = paddle.zeros(
+                extend_shape,
+                dtype=self.bias_atom_e.dtype,
+            ).to(device=self.bias_atom_e.place)
+            self.bias_atom_e = paddle.concat(
+                [self.bias_atom_e, extend_bias_atom_e], axis=0
+            )
+        self.bias_atom_e = self.bias_atom_e[remap_index]
+
+    def serialize(self) -> dict:
+        """Serialize the fitting to dict."""
+        return {
+            "@class": "Fitting",
+            "@version": 2,
+            "var_name": self.var_name,
+            "ntypes": self.ntypes,
+            "dim_descrpt": self.dim_descrpt,
+            "neuron": self.neuron,
+            "resnet_dt": self.resnet_dt,
+            "numb_fparam": self.numb_fparam,
+            "numb_aparam": self.numb_aparam,
+            "activation_function": self.activation_function,
+            "precision": self.precision,
+            "mixed_types": self.mixed_types,
+            "nets": self.filter_layers.serialize(),
+            "rcond": self.rcond,
+            "exclude_types": self.exclude_types,
+            "@variables": {
+                "bias_atom_e": to_numpy_array(self.bias_atom_e),
+                "fparam_avg": to_numpy_array(self.fparam_avg),
+                "fparam_inv_std": to_numpy_array(self.fparam_inv_std),
+                "aparam_avg": to_numpy_array(self.aparam_avg),
+                "aparam_inv_std": to_numpy_array(self.aparam_inv_std),
+            },
+            "type_map": self.type_map,
+            # "tot_ener_zero": self.tot_ener_zero,
+            # "trainable": self.trainable,
+            # "atom_ener": self.atom_ener,
+            # "layer_name": self.layer_name,
+            # "spin": self.spin,
+            ## NOTICE: not supported so far
+            "tot_ener_zero": False,
+            "trainable": [self.trainable] * (len(self.neuron) + 1),
+            "layer_name": None,
+            "use_aparam_as_mask": self.use_aparam_as_mask,
+            "spin": None,
+        }
+
+    @classmethod
+    def deserialize(cls, data: dict) -> "GeneralFitting":
+        data = copy.deepcopy(data)
+        variables = data.pop("@variables")
+        nets = data.pop("nets")
+        obj = cls(**data)
+        for kk in variables.keys():
+            obj[kk] = to_paddle_tensor(variables[kk])
+        obj.filter_layers = NetworkCollection.deserialize(nets)
+        return obj
+
+    def get_dim_fparam(self) -> int:
+        """Get the number (dimension) of frame parameters of this atomic model."""
+        return self.numb_fparam
+
+    def get_dim_aparam(self) -> int:
+        """Get the number (dimension) of atomic parameters of this atomic model."""
+        return self.numb_aparam
+
+    # make jit happy
+    exclude_types: list[int]
+
+    def get_sel_type(self) -> list[int]:
+        """Get the selected atom types of this model.
+
+        Only atoms with selected atom types have atomic contribution
+        to the result of the model.
+        If returning an empty list, all atom types are selected.
+        """
+        # make jit happy
+        sel_type: list[int] = []
+        for ii in range(self.ntypes):
+            if ii not in self.exclude_types:
+                sel_type.append(ii)
+        return sel_type
+
+    def get_type_map(self) -> list[str]:
+        """Get the name to each type of atoms."""
+        return self.type_map
+
+    def __setitem__(self, key, value):
+        if key in ["bias_atom_e"]:
+            value = value.reshape([self.ntypes, self._net_out_dim()])
+            self.bias_atom_e = value
+        elif key in ["fparam_avg"]:
+            self.fparam_avg = value
+        elif key in ["fparam_inv_std"]:
+            self.fparam_inv_std = value
+        elif key in ["aparam_avg"]:
+            self.aparam_avg = value
+        elif key in ["aparam_inv_std"]:
+            self.aparam_inv_std = value
+        elif key in ["scale"]:
+            self.scale = value
+        else:
+            raise KeyError(key)
+
+    def __getitem__(self, key):
+        if key in ["bias_atom_e"]:
+            return self.bias_atom_e
+        elif key in ["fparam_avg"]:
+            return self.fparam_avg
+        elif key in ["fparam_inv_std"]:
+            return self.fparam_inv_std
+        elif key in ["aparam_avg"]:
+            return self.aparam_avg
+        elif key in ["aparam_inv_std"]:
+            return self.aparam_inv_std
+        elif key in ["scale"]:
+            return self.scale
+        else:
+            raise KeyError(key)
+
+    @abstractmethod
+    def _net_out_dim(self):
+        """Set the FittingNet output dim."""
+        pass
+
+    def _extend_f_avg_std(self, xx: paddle.Tensor, nb: int) -> paddle.Tensor:
+        return paddle.tile(xx.reshape([1, self.numb_fparam]), [nb, 1])
+
+    def _extend_a_avg_std(self, xx: paddle.Tensor, nb: int, nloc: int) -> paddle.Tensor:
+        return paddle.tile(xx.reshape([1, 1, self.numb_aparam]), [nb, nloc, 1])
+
+    def _forward_common(
+        self,
+        descriptor: paddle.Tensor,
+        atype: paddle.Tensor,
+        gr: Optional[paddle.Tensor] = None,
+        g2: Optional[paddle.Tensor] = None,
+        h2: Optional[paddle.Tensor] = None,
+        fparam: Optional[paddle.Tensor] = None,
+        aparam: Optional[paddle.Tensor] = None,
+    ):
+        xx = descriptor
+        if self.remove_vaccum_contribution is not None:
+            # TODO: compute the input for vacuum when remove_vaccum_contribution is set
+            # Ideally, the input for vacuum should be computed;
+            # we consider it as always zero for convenience.
+            # Needs a compute_input_stats for vacuum passed from the
+            # descriptor.
+            xx_zeros = paddle.zeros_like(xx)
+        else:
+            xx_zeros = None
+        nf, nloc, nd = xx.shape
+        net_dim_out = self._net_out_dim()
+
+        if nd != self.dim_descrpt:
+            raise ValueError(
+                f"get an input descriptor of dim {nd}, "
+                f"which is not consistent with {self.dim_descrpt}."
+            )
+        # check fparam dim, concatenate to input descriptor
+        if self.numb_fparam > 0:
+            assert fparam is not None, "fparam should not be None"
+            assert self.fparam_avg is not None
+            assert self.fparam_inv_std is not None
+            if fparam.shape[-1] != self.numb_fparam:
+                raise ValueError(
+                    f"get an input fparam of dim {fparam.shape[-1]}, "
+                    f"which is not consistent with {self.numb_fparam}."
+                )
+            fparam = fparam.reshape([nf, self.numb_fparam])
+            nb, _ = fparam.shape
+            t_fparam_avg = self._extend_f_avg_std(self.fparam_avg, nb)
+            t_fparam_inv_std = self._extend_f_avg_std(self.fparam_inv_std, nb)
+            fparam = (fparam - t_fparam_avg) * t_fparam_inv_std
+            fparam = paddle.tile(fparam.reshape([nf, 1, -1]), [1, nloc, 1])
+            xx = paddle.concat(
+                [xx, fparam],
+                axis=-1,
+            )
+            if xx_zeros is not None:
+                xx_zeros = paddle.concat(
+                    [xx_zeros, fparam],
+                    axis=-1,
+                )
+        # check aparam dim, concatenate to input descriptor
+        if self.numb_aparam > 0 and not self.use_aparam_as_mask:
+            assert aparam is not None, "aparam should not be None"
+            assert self.aparam_avg is not None
+            assert self.aparam_inv_std is not None
+            if aparam.shape[-1] != self.numb_aparam:
+                raise ValueError(
+                    f"get an input aparam of dim {aparam.shape[-1]}, "
+                    f"which is not consistent with {self.numb_aparam}."
+                )
+            aparam = aparam.reshape([nf, -1, self.numb_aparam])
+            nb, nloc, _ = aparam.shape
+            t_aparam_avg = self._extend_a_avg_std(self.aparam_avg, nb, nloc)
+            t_aparam_inv_std = self._extend_a_avg_std(self.aparam_inv_std, nb, nloc)
+            aparam = (aparam - t_aparam_avg) * t_aparam_inv_std
+            xx = paddle.concat(
+                [xx, aparam],
+                axis=-1,
+            )
+            if xx_zeros is not None:
+                xx_zeros = paddle.concat(
+                    [xx_zeros, aparam],
+                    axis=-1,
+                )
+
+        outs = paddle.zeros(
+            (nf, nloc, net_dim_out),
+            dtype=env.GLOBAL_PD_FLOAT_PRECISION,
+        ).to(device=descriptor.place)  # jit assertion
+        if self.mixed_types:
+            atom_property = self.filter_layers.networks[0](xx) + self.bias_atom_e[atype]
+            if xx_zeros is not None:
+                atom_property -= self.filter_layers.networks[0](xx_zeros)
+            outs = outs + atom_property  # Shape is [nframes, natoms[0], net_dim_out]
+        else:
+            for type_i, ll in enumerate(self.filter_layers.networks):
+                mask = (atype == type_i).unsqueeze(-1)
+                mask.stop_gradient = True
+                mask = paddle.tile(mask, (1, 1, net_dim_out))
+                atom_property = ll(xx)
+                if xx_zeros is not None:
+                    # must assert, otherwise jit is not happy
+                    assert self.remove_vaccum_contribution is not None
+                    if not (
+                        len(self.remove_vaccum_contribution) > type_i
+                        and not self.remove_vaccum_contribution[type_i]
+                    ):
+                        atom_property -= ll(xx_zeros)
+                atom_property = atom_property + self.bias_atom_e[type_i]
+                atom_property = atom_property * mask.astype(atom_property.dtype)
+                outs = (
+                    outs + atom_property
+                )  # Shape is [nframes, natoms[0], net_dim_out]
+        # nf x nloc
+        mask = self.emask(atype)
+        # nf x nloc x nod
+        outs = outs * mask[:, :, None].astype(outs.dtype)
+        return {self.var_name: outs.astype(env.GLOBAL_PD_FLOAT_PRECISION)}
diff --git a/deepmd/pd/model/task/invar_fitting.py b/deepmd/pd/model/task/invar_fitting.py
new file mode 100644
index 0000000000..b366fc1d2e
--- /dev/null
+++ b/deepmd/pd/model/task/invar_fitting.py
@@ -0,0 +1,183 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+import copy
+import logging
+from typing import (
+    Optional,
+    Union,
+)
+
+import paddle
+
+from deepmd.dpmodel import (
+    FittingOutputDef,
+    OutputVariableDef,
+    fitting_check_output,
+)
+from deepmd.pd.model.task.fitting import (
+    GeneralFitting,
+)
+from deepmd.pd.utils import (
+    env,
+)
+from
deepmd.pd.utils.env import (
+    DEFAULT_PRECISION,
+)
+from deepmd.utils.version import (
+    check_version_compatibility,
+)
+
+dtype = env.GLOBAL_PD_FLOAT_PRECISION
+device = env.DEVICE
+
+log = logging.getLogger(__name__)
+
+
+@GeneralFitting.register("invar")
+@fitting_check_output
+class InvarFitting(GeneralFitting):
+    """Construct a fitting net for energy.
+
+    Parameters
+    ----------
+    var_name : str
+        The atomic property to fit, 'energy', 'dipole', and 'polar'.
+    ntypes : int
+        Element count.
+    dim_descrpt : int
+        Embedding width per atom.
+    dim_out : int
+        The output dimension of the fitting net.
+    neuron : list[int]
+        Number of neurons in each hidden layers of the fitting net.
+    bias_atom_e : paddle.Tensor, optional
+        Average energy per atom for each element.
+    resnet_dt : bool
+        Using time-step in the ResNet construction.
+    numb_fparam : int
+        Number of frame parameters.
+    numb_aparam : int
+        Number of atomic parameters.
+    activation_function : str
+        Activation function.
+    precision : str
+        Numerical precision.
+    mixed_types : bool
+        If true, use a uniform fitting net for all atom types, otherwise use
+        different fitting nets for different atom types.
+    rcond : float, optional
+        The condition number for the regression of atomic energy.
+    seed : int, optional
+        Random seed.
+    exclude_types : list[int]
+        Atomic contributions of the excluded atom types are set zero.
+    atom_ener : list[Optional[paddle.Tensor]], optional
+        Specifying atomic energy contribution in vacuum.
+        The value is a list specifying the bias; the elements can be None or
+        np.array of output shape.
+        For example: [None, [2.]] means type 0 is not set, type 1 is set to [2.].
+        The `set_davg_zero` key in the descriptor should be set.
+    type_map : list[str], optional
+        A list of strings. Give the name to each type of atoms.
+    use_aparam_as_mask : bool
+        If True, the aparam will not be used in fitting net for embedding.
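+
+    Examples
+    --------
+    A minimal sketch; all sizes below are illustrative only:
+
+    >>> net = InvarFitting("energy", ntypes=2, dim_descrpt=8, dim_out=1, neuron=[4, 4])
+    >>> descriptor = paddle.randn([1, 5, 8], dtype="float64")
+    >>> atype = paddle.zeros([1, 5], dtype="int64")
+    >>> out = net(descriptor, atype)["energy"]  # shape [1, 5, 1]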
+    """
+
+    def __init__(
+        self,
+        var_name: str,
+        ntypes: int,
+        dim_descrpt: int,
+        dim_out: int,
+        neuron: list[int] = [128, 128, 128],
+        bias_atom_e: Optional[paddle.Tensor] = None,
+        resnet_dt: bool = True,
+        numb_fparam: int = 0,
+        numb_aparam: int = 0,
+        activation_function: str = "tanh",
+        precision: str = DEFAULT_PRECISION,
+        mixed_types: bool = True,
+        rcond: Optional[float] = None,
+        seed: Optional[Union[int, list[int]]] = None,
+        exclude_types: list[int] = [],
+        atom_ener: Optional[list[Optional[paddle.Tensor]]] = None,
+        type_map: Optional[list[str]] = None,
+        use_aparam_as_mask: bool = False,
+        **kwargs,
+    ):
+        self.dim_out = dim_out
+        self.atom_ener = atom_ener
+        super().__init__(
+            var_name=var_name,
+            ntypes=ntypes,
+            dim_descrpt=dim_descrpt,
+            neuron=neuron,
+            bias_atom_e=bias_atom_e,
+            resnet_dt=resnet_dt,
+            numb_fparam=numb_fparam,
+            numb_aparam=numb_aparam,
+            activation_function=activation_function,
+            precision=precision,
+            mixed_types=mixed_types,
+            rcond=rcond,
+            seed=seed,
+            exclude_types=exclude_types,
+            remove_vaccum_contribution=None
+            if atom_ener is None or len([x for x in atom_ener if x is not None]) == 0
+            else [x is not None for x in atom_ener],
+            type_map=type_map,
+            use_aparam_as_mask=use_aparam_as_mask,
+            **kwargs,
+        )
+
+    def _net_out_dim(self):
+        """Set the FittingNet output dim."""
+        return self.dim_out
+
+    def serialize(self) -> dict:
+        data = super().serialize()
+        data["type"] = "invar"
+        data["dim_out"] = self.dim_out
+        data["atom_ener"] = self.atom_ener
+        return data
+
+    @classmethod
+    def deserialize(cls, data: dict) -> "GeneralFitting":
+        data = copy.deepcopy(data)
+        check_version_compatibility(data.pop("@version", 1), 2, 1)
+        return super().deserialize(data)
+
+    def output_def(self) -> FittingOutputDef:
+        return FittingOutputDef(
+            [
+                OutputVariableDef(
+                    self.var_name,
+                    [self.dim_out],
+                    reducible=True,
+                    r_differentiable=True,
+                    c_differentiable=True,
+                ),
+            ]
+        )
+
+    def forward(
+        self,
+        descriptor: paddle.Tensor,
+        atype: paddle.Tensor,
+        gr: Optional[paddle.Tensor] = None,
+        g2: Optional[paddle.Tensor] = None,
+        h2: Optional[paddle.Tensor] = None,
+        fparam: Optional[paddle.Tensor] = None,
+        aparam: Optional[paddle.Tensor] = None,
+    ):
+        """Based on the embedding net output, calculate the total energy.
+
+        Args:
+            - inputs: Embedding matrix. Its shape is [nframes, natoms[0], self.dim_descrpt].
+            - natoms: Tell atom count and element count. Its shape is [2+self.ntypes].
+
+        Returns
+        -------
+        - `paddle.Tensor`: Total energy with shape [nframes, natoms[0]].
+ """ + return self._forward_common(descriptor, atype, gr, g2, h2, fparam, aparam) + + exclude_types: list[int] diff --git a/deepmd/pd/model/task/task.py b/deepmd/pd/model/task/task.py new file mode 100644 index 0000000000..6ceb116d85 --- /dev/null +++ b/deepmd/pd/model/task/task.py @@ -0,0 +1 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later diff --git a/deepmd/pd/train/__init__.py b/deepmd/pd/train/__init__.py new file mode 100644 index 0000000000..6ceb116d85 --- /dev/null +++ b/deepmd/pd/train/__init__.py @@ -0,0 +1 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later diff --git a/deepmd/pd/train/training.py b/deepmd/pd/train/training.py new file mode 100644 index 0000000000..09cf86ecdd --- /dev/null +++ b/deepmd/pd/train/training.py @@ -0,0 +1,1240 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +import datetime +import functools +import logging +import time +from contextlib import ( + contextmanager, +) +from copy import ( + deepcopy, +) +from pathlib import ( + Path, +) +from typing import ( + Any, +) + +import numpy as np +import paddle +import paddle.distributed as dist +from paddle.distributed import ( + fleet, +) +from paddle.framework import ( + core, +) +from paddle.io import ( + DataLoader, +) + +from deepmd.common import ( + symlink_prefix_files, +) +from deepmd.dpmodel.utils.learning_rate import ( + LearningRateExp, +) +from deepmd.loggers.training import ( + format_training_message_per_task, +) +from deepmd.pd.loss import ( + EnergyStdLoss, + TaskLoss, +) +from deepmd.pd.model.model import ( + get_model, +) +from deepmd.pd.train.wrapper import ( + ModelWrapper, +) +from deepmd.pd.utils import ( + dp_random, +) +from deepmd.pd.utils.dataloader import ( + BufferedIterator, + get_weighted_sampler, +) +from deepmd.pd.utils.env import ( + DEVICE, + JIT, + NUM_WORKERS, + SAMPLER_RECORD, + enable_prim, +) +from deepmd.pd.utils.stat import ( + make_stat_input, +) +from deepmd.pd.utils.utils import ( + to_numpy_array, +) +from deepmd.utils.data import ( + DataRequirementItem, +) +from deepmd.utils.path import ( + DPH5Path, +) + +log = logging.getLogger(__name__) + +from typing import ( + Optional, +) + + +def format_training_message( + batch: int, + wall_time: float, + eta: Optional[int] = None, +): + msg = f"batch {batch:7d}: " f"total wall time = {wall_time:.2f} s" + if isinstance(eta, int): + msg += f", eta = {datetime.timedelta(seconds=int(eta))!s}" + return msg + + +class Trainer: + def __init__( + self, + config: dict[str, Any], + training_data, + stat_file_path=None, + validation_data=None, + init_model=None, + restart_model=None, + finetune_model=None, + force_load=False, + shared_links=None, + finetune_links=None, + init_frz_model=None, + ): + """Construct a DeePMD trainer. + + Args: + - config: The Dict-like configuration with training options. 
+        """
+        enable_prim(True)
+        if init_model is not None:
+            resume_model = init_model
+        elif restart_model is not None:
+            resume_model = restart_model
+        elif finetune_model is not None:
+            resume_model = finetune_model
+        else:
+            resume_model = None
+        resuming = resume_model is not None
+        self.restart_training = restart_model is not None
+        model_params = config["model"]
+        training_params = config["training"]
+        self.multi_task = "model_dict" in model_params
+        self.finetune_links = finetune_links
+        self.finetune_update_stat = False
+        self.model_keys = (
+            list(model_params["model_dict"]) if self.multi_task else ["Default"]
+        )
+        self.rank = (
+            dist.get_rank() if dist.is_available() and dist.is_initialized() else 0
+        )
+        self.world_size = (
+            dist.get_world_size()
+            if dist.is_available() and dist.is_initialized()
+            else 1
+        )
+        self.num_model = len(self.model_keys)
+
+        # Iteration config
+        self.num_steps = training_params["numb_steps"]
+        self.disp_file = training_params.get("disp_file", "lcurve.out")
+        self.disp_freq = training_params.get("disp_freq", 1000)
+        self.save_ckpt = training_params.get("save_ckpt", "model.ckpt")
+        self.save_freq = training_params.get("save_freq", 1000)
+        self.max_ckpt_keep = training_params.get("max_ckpt_keep", 5)
+        self.display_in_training = training_params.get("disp_training", True)
+        self.timing_in_training = training_params.get("time_training", True)
+        self.change_bias_after_training = training_params.get(
+            "change_bias_after_training", False
+        )
+        self.lcurve_should_print_header = True
+
+        def get_opt_param(params):
+            opt_type = params.get("opt_type", "Adam")
+            opt_param = {
+                "kf_blocksize": params.get("kf_blocksize", 5120),
+                "kf_start_pref_e": params.get("kf_start_pref_e", 1),
+                "kf_limit_pref_e": params.get("kf_limit_pref_e", 1),
+                "kf_start_pref_f": params.get("kf_start_pref_f", 1),
+                "kf_limit_pref_f": params.get("kf_limit_pref_f", 1),
+            }
+            return opt_type, opt_param
+
+        def get_data_loader(_training_data, _validation_data, _training_params):
+            def get_dataloader_and_buffer(_data, _params):
+                if "auto_prob" in _training_params["training_data"]:
+                    _sampler = get_weighted_sampler(
+                        _data, _params["training_data"]["auto_prob"]
+                    )
+                elif "sys_probs" in _training_params["training_data"]:
+                    _sampler = get_weighted_sampler(
+                        _data,
+                        _params["training_data"]["sys_probs"],
+                        sys_prob=True,
+                    )
+                else:
+                    _sampler = get_weighted_sampler(_data, "prob_sys_size")
+
+                if _sampler is None:
+                    log.warning(
+                        "Sampler not specified!"
+                    )  # a None sampler leads to a premature StopIteration; the sampler should draw with replacement to produce the expected number of items per iteration
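+                # the raw DataLoader is wrapped in a BufferedIterator below so
+                # that upcoming batches can be prefetched ahead of consumption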
+ _dataloader = DataLoader( + _data, + batch_sampler=paddle.io.BatchSampler( + sampler=_sampler, + drop_last=False, + ), + num_workers=NUM_WORKERS + if dist.is_available() + else 0, # setting to 0 diverges the behavior of its iterator; should be >=1 + collate_fn=lambda batch: batch[0], # prevent extra conversion + # pin_memory=True, + ) + _data_buffered = BufferedIterator(iter(_dataloader)) + return _dataloader, _data_buffered + + training_dataloader, training_data_buffered = get_dataloader_and_buffer( + _training_data, _training_params + ) + + if _validation_data is not None: + ( + validation_dataloader, + validation_data_buffered, + ) = get_dataloader_and_buffer(_validation_data, _training_params) + valid_numb_batch = _training_params["validation_data"].get( + "numb_btch", 1 + ) + else: + validation_dataloader = None + validation_data_buffered = None + valid_numb_batch = 1 + return ( + training_dataloader, + training_data_buffered, + validation_dataloader, + validation_data_buffered, + valid_numb_batch, + ) + + def single_model_stat( + _model, + _data_stat_nbatch, + _training_data, + _validation_data, + _stat_file_path, + _data_requirement, + finetune_has_new_type=False, + ): + _data_requirement += get_additional_data_requirement(_model) + _training_data.add_data_requirement(_data_requirement) + if _validation_data is not None: + _validation_data.add_data_requirement(_data_requirement) + + @functools.lru_cache + def get_sample(): + sampled = make_stat_input( + _training_data.systems, + _training_data.dataloaders, + _data_stat_nbatch, + ) + return sampled + + if (not resuming or finetune_has_new_type) and self.rank == 0: + _model.compute_or_load_stat( + sampled_func=get_sample, + stat_file_path=_stat_file_path, + ) + if isinstance(_stat_file_path, DPH5Path): + _stat_file_path.root.close() + return get_sample + + def get_lr(lr_params): + assert ( + lr_params.get("type", "exp") == "exp" + ), "Only learning rate `exp` is supported!" + lr_params["stop_steps"] = self.num_steps - self.warmup_steps + lr_exp = LearningRateExp(**lr_params) + return lr_exp + + # Optimizer + if self.multi_task and training_params.get("optim_dict", None) is not None: + self.optim_dict = training_params.get("optim_dict") + missing_keys = [ + key for key in self.model_keys if key not in self.optim_dict + ] + assert ( + not missing_keys + ), f"These keys are not in optim_dict: {missing_keys}!" 
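+            # one (opt_type, opt_param) pair per model branch when an
+            # optim_dict is provided for multi-task training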
+ self.opt_type = {} + self.opt_param = {} + for model_key in self.model_keys: + self.opt_type[model_key], self.opt_param[model_key] = get_opt_param( + self.optim_dict[model_key] + ) + else: + self.opt_type, self.opt_param = get_opt_param(training_params) + + # Model + self.model = get_model_for_wrapper(model_params) + + # Loss + if not self.multi_task: + self.loss = get_loss( + config["loss"], + config["learning_rate"]["start_lr"], + len(model_params["type_map"]), + self.model, + ) + else: + self.loss = {} + for model_key in self.model_keys: + loss_param = config["loss_dict"][model_key] + if config.get("learning_rate_dict", None) is not None: + lr_param = config["learning_rate_dict"][model_key]["start_lr"] + else: + lr_param = config["learning_rate"]["start_lr"] + ntypes = len(model_params["model_dict"][model_key]["type_map"]) + self.loss[model_key] = get_loss( + loss_param, lr_param, ntypes, self.model[model_key] + ) + + # Data + if not self.multi_task: + self.get_sample_func = single_model_stat( + self.model, + model_params.get("data_stat_nbatch", 10), + training_data, + validation_data, + stat_file_path, + self.loss.label_requirement, + finetune_has_new_type=self.finetune_links["Default"].get_has_new_type() + if self.finetune_links is not None + else False, + ) + ( + self.training_dataloader, + self.training_data, + self.validation_dataloader, + self.validation_data, + self.valid_numb_batch, + ) = get_data_loader(training_data, validation_data, training_params) + training_data.print_summary( + "training", + to_numpy_array(self.training_dataloader.batch_sampler.sampler.weights), + ) + if validation_data is not None: + validation_data.print_summary( + "validation", + to_numpy_array( + self.validation_dataloader.batch_sampler.sampler.weights + ), + ) + else: + ( + self.training_dataloader, + self.training_data, + self.validation_dataloader, + self.validation_data, + self.valid_numb_batch, + self.get_sample_func, + ) = {}, {}, {}, {}, {}, {} + for model_key in self.model_keys: + self.get_sample_func[model_key] = single_model_stat( + self.model[model_key], + model_params["model_dict"][model_key].get("data_stat_nbatch", 10), + training_data[model_key], + validation_data[model_key], + stat_file_path[model_key], + self.loss[model_key].label_requirement, + finetune_has_new_type=self.finetune_links[ + model_key + ].get_has_new_type() + if self.finetune_links is not None + else False, + ) + ( + self.training_dataloader[model_key], + self.training_data[model_key], + self.validation_dataloader[model_key], + self.validation_data[model_key], + self.valid_numb_batch[model_key], + ) = get_data_loader( + training_data[model_key], + validation_data[model_key], + training_params["data_dict"][model_key], + ) + + training_data[model_key].print_summary( + f"training in {model_key}", + to_numpy_array( + self.training_dataloader[ + model_key + ].batch_sampler.sampler.weights + ), + ) + if ( + validation_data is not None + and validation_data[model_key] is not None + ): + validation_data[model_key].print_summary( + f"validation in {model_key}", + to_numpy_array( + self.validation_dataloader[ + model_key + ].batch_sampler.sampler.weights + ), + ) + + # Learning rate + self.warmup_steps = training_params.get("warmup_steps", 0) + self.gradient_max_norm = training_params.get("gradient_max_norm", 0.0) + assert ( + self.num_steps - self.warmup_steps > 0 or self.warmup_steps == 0 + ), "Warm up steps must be less than total training steps!" 
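+        # one exponential-decay schedule per model branch when a
+        # learning_rate_dict is given; otherwise a single shared schedule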
+ if self.multi_task and config.get("learning_rate_dict", None) is not None: + self.lr_exp = {} + for model_key in self.model_keys: + self.lr_exp[model_key] = get_lr(config["learning_rate_dict"][model_key]) + else: + self.lr_exp = get_lr(config["learning_rate"]) + + # JIT + if JIT: + raise NotImplementedError( + "JIT is not supported yet when training with Paddle" + ) + self.model = paddle.jit.to_static(self.model) + + # Model Wrapper + self.wrapper = ModelWrapper(self.model, self.loss, model_params=model_params) + self.start_step = 0 + + # resuming and finetune + optimizer_state_dict = None + if resuming: + log.info(f"Resuming from {resume_model}.") + state_dict = paddle.load(resume_model) + if "model" in state_dict: + optimizer_state_dict = ( + state_dict["optimizer"] if finetune_model is None else None + ) + state_dict = state_dict["model"] + self.start_step = ( + state_dict["_extra_state"]["train_infos"]["step"] + if self.restart_training + else 0 + ) + if self.rank == 0: + if force_load: + input_keys = list(state_dict.keys()) + target_keys = list(self.wrapper.state_dict().keys()) + missing_keys = [ + item for item in target_keys if item not in input_keys + ] + if missing_keys: + target_state_dict = self.wrapper.state_dict() + slim_keys = [] + for item in missing_keys: + state_dict[item] = target_state_dict[item].clone().detach() + new_key = True + for slim_key in slim_keys: + if slim_key in item: + new_key = False + break + if new_key: + tmp_keys = ".".join(item.split(".")[:3]) + slim_keys.append(tmp_keys) + slim_keys = [i + ".*" for i in slim_keys] + log.warning( + f"Force load mode allowed! These keys are not in ckpt and will re-init: {slim_keys}" + ) + # update model params in the pretrained model + if finetune_model is not None: + new_state_dict = {} + target_state_dict = self.wrapper.state_dict() + # pretrained_model + pretrained_model = get_model_for_wrapper( + state_dict["_extra_state"]["model_params"] + ) + pretrained_model_wrapper = ModelWrapper(pretrained_model) + pretrained_model_wrapper.set_state_dict(state_dict) + # update type related params + for model_key in self.model_keys: + finetune_rule_single = self.finetune_links[model_key] + _model_key_from = finetune_rule_single.get_model_branch() + # skip if updated + if ( + finetune_rule_single.get_finetune_tmap() + != pretrained_model_wrapper.model[ + _model_key_from + ].get_type_map() + ): + model_with_new_type_stat = None + if finetune_rule_single.get_has_new_type(): + self.finetune_update_stat = True + model_with_new_type_stat = self.wrapper.model[model_key] + pretrained_model_wrapper.model[ + _model_key_from + ].change_type_map( + finetune_rule_single.get_finetune_tmap(), + model_with_new_type_stat=model_with_new_type_stat, + ) + state_dict = pretrained_model_wrapper.state_dict() + + def collect_single_finetune_params( + _model_key, + _finetune_rule_single, + _new_state_dict, + _origin_state_dict, + _random_state_dict, + ): + _new_fitting = _finetune_rule_single.get_random_fitting() + _model_key_from = _finetune_rule_single.get_model_branch() + target_keys = [ + i + for i in _random_state_dict.keys() + if i != "_extra_state" and f".{_model_key}." in i + ] + for item_key in target_keys: + if _new_fitting and (".descriptor." not in item_key): + # print(f'Keep {item_key} in old model!') + _new_state_dict[item_key] = ( + _random_state_dict[item_key].clone().detach() + ) + else: + new_key = item_key.replace( + f".{_model_key}.", f".{_model_key_from}." 
+ ) + # print(f'Replace {item_key} with {new_key} in pretrained_model!') + _new_state_dict[item_key] = ( + _origin_state_dict[new_key].clone().detach() + ) + + # collect model params from the pretrained model + for model_key in self.model_keys: + finetune_rule_single = self.finetune_links[model_key] + collect_single_finetune_params( + model_key, + finetune_rule_single, + new_state_dict, + state_dict, + target_state_dict, + ) + state_dict = new_state_dict + state_dict["_extra_state"] = self.wrapper.state_dict()[ + "_extra_state" + ] + + self.wrapper.set_state_dict(state_dict) + + # change bias for fine-tuning + if finetune_model is not None: + + def single_model_finetune( + _model, + _finetune_rule_single, + _sample_func, + ): + _model = model_change_out_bias( + _model, + _sample_func, + _bias_adjust_mode="change-by-statistic" + if not _finetune_rule_single.get_random_fitting() + else "set-by-statistic", + ) + return _model + + if not self.multi_task: + finetune_rule_single = self.finetune_links["Default"] + self.model = single_model_finetune( + self.model, finetune_rule_single, self.get_sample_func + ) + else: + for model_key in self.model_keys: + finetune_rule_single = self.finetune_links[model_key] + if not finetune_rule_single.get_resuming(): + log.info( + f"Model branch {model_key} will be fine-tuned. This may take a long time..." + ) + self.model[model_key] = single_model_finetune( + self.model[model_key], + finetune_rule_single, + self.get_sample_func[model_key], + ) + else: + log.info( + f"Model branch {model_key} will resume training." + ) + + if init_frz_model is not None: + frz_model = paddle.jit.load(init_frz_model) + self.model.set_state_dict(frz_model.state_dict()) + + # Multi-task share params + if shared_links is not None: + self.wrapper.share_params( + shared_links, + resume=(resuming and not self.finetune_update_stat) or self.rank != 0, + ) + + # TODO add lr warmups for multitask + # author: iProzd + def warm_up_linear(step, warmup_steps): + if step < warmup_steps: + return step / warmup_steps + else: + return self.lr_exp.value(step - warmup_steps) / self.lr_exp.start_lr + + # TODO add optimizers for multitask + # author: iProzd + if self.opt_type == "Adam": + self.scheduler = paddle.optimizer.lr.LambdaDecay( + learning_rate=self.lr_exp.start_lr, + lr_lambda=lambda step: warm_up_linear( + step + self.start_step, self.warmup_steps + ), + ) + self.optimizer = paddle.optimizer.Adam( + learning_rate=self.scheduler, parameters=self.wrapper.parameters() + ) + if optimizer_state_dict is not None and self.restart_training: + self.optimizer.set_state_dict(optimizer_state_dict) + else: + raise ValueError(f"Not supported optimizer type '{self.opt_type}'") + + if dist.is_available() and dist.is_initialized(): + # DDP will guarantee the model parameters are identical across all processes + self.wrapper = fleet.distributed_model( + self.wrapper, + # find_unused_parameters=True, + ) + self.optimizer = fleet.distributed_optimizer(self.optimizer) + + # Get model prob for multi-task + if self.multi_task: + self.model_prob = np.array([0.0 for key in self.model_keys]) + if training_params.get("model_prob", None) is not None: + model_prob = training_params["model_prob"] + for ii, model_key in enumerate(self.model_keys): + if model_key in model_prob: + self.model_prob[ii] += float(model_prob[model_key]) + else: + for ii, model_key in enumerate(self.model_keys): + self.model_prob[ii] += float(len(self.training_data[model_key])) + sum_prob = np.sum(self.model_prob) + assert sum_prob > 0.0, "Sum 
of model prob must be larger than 0!" + self.model_prob = self.model_prob / sum_prob + + # Tensorboard + self.enable_tensorboard = training_params.get("tensorboard", False) + self.tensorboard_log_dir = training_params.get("tensorboard_log_dir", "log") + self.tensorboard_freq = training_params.get("tensorboard_freq", 1) + self.enable_profiler = training_params.get("enable_profiler", False) + self.profiling = training_params.get("profiling", False) + self.profiling_file = training_params.get("profiling_file", "timeline.json") + + def run(self): + fout = ( + open( + self.disp_file, + mode="w" if not self.restart_training else "a", + buffering=1, + ) + if self.rank == 0 + else None + ) # line buffered + if SAMPLER_RECORD: + record_file = f"Sample_rank_{self.rank}.txt" + fout1 = open(record_file, mode="w", buffering=1) + log.info("Start to train %d steps.", self.num_steps) + if dist.is_available() and dist.is_initialized(): + log.info(f"Rank: {dist.get_rank()}/{dist.get_world_size()}") + if self.enable_tensorboard: + from tensorboardX import ( + SummaryWriter, + ) + + writer = SummaryWriter(log_dir=self.tensorboard_log_dir) + enable_profiling = self.enable_profiler or self.profiling + if enable_profiling: + core.nvprof_start() + core.nvprof_enable_record_event() + + def step(_step_id, task_key="Default"): + # Paddle Profiler + if enable_profiling: + core.nvprof_nvtx_push(f"Training step {_step_id}") + self.wrapper.train() + if isinstance(self.lr_exp, dict): + _lr = self.lr_exp[task_key] + else: + _lr = self.lr_exp + cur_lr = _lr.value(_step_id) + pref_lr = cur_lr + self.optimizer.clear_grad(set_to_zero=False) + input_dict, label_dict, log_dict = self.get_data( + is_train=True, task_key=task_key + ) + if SAMPLER_RECORD: + print_str = f"Step {_step_id}: sample system{log_dict['sid']} frame{log_dict['fid']}\n" + fout1.write(print_str) + fout1.flush() + if self.opt_type == "Adam": + cur_lr = self.scheduler.get_lr() + if _step_id < self.warmup_steps: + pref_lr = _lr.start_lr + else: + pref_lr = cur_lr + with nvprof_context(enable_profiling, "Forward pass"): + model_pred, loss, more_loss = self.wrapper( + **input_dict, + cur_lr=pref_lr, + label=label_dict, + task_key=task_key, + ) + + with nvprof_context(enable_profiling, "Backward pass"): + loss.backward() + + if self.gradient_max_norm > 0.0: + with nvprof_context(enable_profiling, "Gradient clip"): + grad_norm = paddle.nn.utils.clip_grad_norm_( + self.wrapper.parameters(), self.gradient_max_norm + ) + if not paddle.isfinite(grad_norm).all(): + # check local gradnorm single GPU case, trigger NanDetector + raise FloatingPointError("gradients are Nan/Inf") + + with nvprof_context(enable_profiling, "Adam update"): + self.optimizer.step() + + self.scheduler.step() + + if enable_profiling: + core.nvprof_nvtx_pop() + else: + raise ValueError(f"Not supported optimizer type '{self.opt_type}'") + + # Log and persist + display_step_id = _step_id + 1 + if self.display_in_training and ( + display_step_id % self.disp_freq == 0 or display_step_id == 1 + ): + self.wrapper.eval() + + def log_loss_train(_loss, _more_loss, _task_key="Default"): + results = {} + rmse_val = { + item: _more_loss[item] + for item in _more_loss + if "l2_" not in item + } + for item in sorted(rmse_val.keys()): + results[item] = rmse_val[item] + return results + + def log_loss_valid(_task_key="Default"): + single_results = {} + sum_natoms = 0 + if not self.multi_task: + valid_numb_batch = self.valid_numb_batch + else: + valid_numb_batch = self.valid_numb_batch[_task_key] + for ii in 
range(valid_numb_batch): + self.optimizer.clear_grad() + input_dict, label_dict, _ = self.get_data( + is_train=False, task_key=_task_key + ) + if input_dict == {}: + # no validation data + return {} + _, loss, more_loss = self.wrapper( + **input_dict, + cur_lr=pref_lr, + label=label_dict, + task_key=_task_key, + ) + # more_loss.update({"rmse": math.sqrt(loss)}) + natoms = int(input_dict["atype"].shape[-1]) + sum_natoms += natoms + for k, v in more_loss.items(): + if "l2_" not in k: + single_results[k] = ( + single_results.get(k, 0.0) + v * natoms + ) + results = {k: v / sum_natoms for k, v in single_results.items()} + return results + + if not self.multi_task: + train_results = log_loss_train(loss, more_loss) + valid_results = log_loss_valid() + if self.rank == 0: + log.info( + format_training_message_per_task( + batch=display_step_id, + task_name="trn", + rmse=train_results, + learning_rate=cur_lr, + ) + ) + if valid_results: + log.info( + format_training_message_per_task( + batch=display_step_id, + task_name="val", + rmse=valid_results, + learning_rate=None, + ) + ) + else: + train_results = {_key: {} for _key in self.model_keys} + valid_results = {_key: {} for _key in self.model_keys} + train_results[task_key] = log_loss_train( + loss, more_loss, _task_key=task_key + ) + for _key in self.model_keys: + if _key != task_key: + self.optimizer.clear_grad() + input_dict, label_dict, _ = self.get_data( + is_train=True, task_key=_key + ) + _, loss, more_loss = self.wrapper( + **input_dict, + cur_lr=pref_lr, + label=label_dict, + task_key=_key, + ) + train_results[_key] = log_loss_train( + loss, more_loss, _task_key=_key + ) + valid_results[_key] = log_loss_valid(_task_key=_key) + if self.rank == 0: + log.info( + format_training_message_per_task( + batch=display_step_id, + task_name=_key + "_trn", + rmse=train_results[_key], + learning_rate=cur_lr, + ) + ) + if valid_results[_key]: + log.info( + format_training_message_per_task( + batch=display_step_id, + task_name=_key + "_val", + rmse=valid_results[_key], + learning_rate=None, + ) + ) + + current_time = time.time() + train_time = current_time - self.t0 + self.t0 = current_time + if self.rank == 0 and self.timing_in_training: + eta = int( + (self.num_steps - _step_id - 1) / self.disp_freq * train_time + ) + log.info( + format_training_message( + batch=display_step_id, + wall_time=train_time, + eta=eta, + ) + ) + # the first training time is not accurate + if ( + (_step_id + 1 - self.start_step) > self.disp_freq + or self.num_steps - self.start_step < 2 * self.disp_freq + ): + self.total_train_time += train_time + + if fout: + if self.lcurve_should_print_header: + self.print_header(fout, train_results, valid_results) + self.lcurve_should_print_header = False + self.print_on_training( + fout, display_step_id, cur_lr, train_results, valid_results + ) + + if ( + ((_step_id + 1) % self.save_freq == 0 and _step_id != self.start_step) + or (_step_id + 1) == self.num_steps + ) and (self.rank == 0 or dist.get_rank() == 0): + # Handle the case if rank 0 aborted and re-assigned + self.latest_model = Path(self.save_ckpt + f"-{_step_id + 1}.pd") + + module = ( + self.wrapper.module + if dist.is_available() and dist.is_initialized() + else self.wrapper + ) + self.save_model(self.latest_model, lr=cur_lr, step=_step_id) + log.info(f"Saved model to {self.latest_model}") + symlink_prefix_files(self.latest_model.stem, self.save_ckpt) + with open("checkpoint", "w") as f: + f.write(str(self.latest_model)) + + # tensorboard + if self.enable_tensorboard and ( + 
display_step_id % self.tensorboard_freq == 0 or display_step_id == 1
+            ):
+                writer.add_scalar(f"{task_key}/lr", cur_lr, display_step_id)
+                writer.add_scalar(f"{task_key}/loss", loss, display_step_id)
+                for item in more_loss:
+                    writer.add_scalar(
+                        f"{task_key}/{item}", more_loss[item].item(), _step_id
+                    )
+
+        self.t0 = time.time()
+        self.total_train_time = 0.0
+        for step_id in range(self.num_steps):
+            if step_id < self.start_step:
+                continue
+            if self.multi_task:
+                chosen_index_list = dp_random.choice(
+                    np.arange(
+                        self.num_model, dtype=np.int32
+                    ),  # int32 should be enough for # models...
+                    p=np.array(self.model_prob),
+                    size=self.world_size,
+                    replace=True,
+                )
+                assert chosen_index_list.size == self.world_size
+                model_index = chosen_index_list[self.rank]
+                model_key = self.model_keys[model_index]
+            else:
+                model_key = "Default"
+            step(step_id, model_key)
+            if JIT:
+                break
+
+        if self.change_bias_after_training and (self.rank == 0 or dist.get_rank() == 0):
+            if not self.multi_task:
+                self.model = model_change_out_bias(
+                    self.model,
+                    self.get_sample_func,
+                    _bias_adjust_mode="change-by-statistic",
+                )
+            else:
+                for model_key in self.model_keys:
+                    self.model[model_key] = model_change_out_bias(
+                        self.model[model_key],
+                        self.get_sample_func[model_key],
+                        _bias_adjust_mode="change-by-statistic",
+                    )
+            self.latest_model = Path(self.save_ckpt + f"-{self.num_steps}.pd")
+            cur_lr = self.lr_exp.value(self.num_steps - 1)
+            self.save_model(self.latest_model, lr=cur_lr, step=self.num_steps - 1)
+            log.info(f"Saved model to {self.latest_model}")
+            symlink_prefix_files(self.latest_model.stem, self.save_ckpt)
+            with open("checkpoint", "w") as f:
+                f.write(str(self.latest_model))
+
+        if (
+            self.rank == 0 or dist.get_rank() == 0
+        ):  # Handle the case if rank 0 aborted and re-assigned
+            if self.num_steps == 0:
+                # when num_steps is 0, the checkpoint is never saved
+                self.latest_model = Path(self.save_ckpt + "-0.pd")
+                self.save_model(self.latest_model, lr=0, step=0)
+                log.info(f"Saved model to {self.latest_model}")
+                symlink_prefix_files(self.latest_model.stem, self.save_ckpt)
+                with open("checkpoint", "w") as f:
+                    f.write(str(self.latest_model))
+
+        elapsed_batch = self.num_steps - self.start_step
+        if self.timing_in_training and elapsed_batch // self.disp_freq > 0:
+            if self.start_step >= 2 * self.disp_freq:
+                log.info(
+                    "average training time: %.4f s/batch (excluding first %d batches)",
+                    self.total_train_time
+                    / (
+                        elapsed_batch // self.disp_freq * self.disp_freq
+                        - self.disp_freq
+                    ),
+                    self.disp_freq,
+                )
+            else:
+                log.info(
+                    "average training time: %.4f s/batch",
+                    self.total_train_time
+                    / (elapsed_batch // self.disp_freq * self.disp_freq),
+                )
+
+        if JIT:
+            raise NotImplementedError(
+                "Paddle JIT saving during training is not supported yet."
+            )
+        log.info(f"Trained model has been saved to: {self.save_ckpt}")
+
+        if fout:
+            fout.close()
+        if SAMPLER_RECORD:
+            fout1.close()
+        if self.enable_tensorboard:
+            writer.close()
+        if enable_profiling:
+            core.nvprof_stop()
+            log.info(
+                "The nsys profiling traces have been saved to *.nsys-rep and *.sqlite "
+                "files, which can be viewed in the NVIDIA Nsight Systems software"
+            )
+
+    def save_model(self, save_path, lr=0.0, step=0):
+        module = (
+            self.wrapper.module
+            if dist.is_available() and dist.is_initialized()
+            else self.wrapper
+        )
+        module.train_infos["lr"] = float(lr)
+        module.train_infos["step"] = step
+        paddle.save(
+            {"model": module.state_dict(), "optimizer": self.optimizer.state_dict()},
+            str(save_path),
+        )
+        checkpoint_dir = save_path.parent
+        checkpoint_files = [
+            f
+            for f in checkpoint_dir.glob("*.pd")
+            if not f.is_symlink() and f.name.startswith(self.save_ckpt)
+        ]
+        if len(checkpoint_files) > self.max_ckpt_keep:
+            checkpoint_files.sort(key=lambda x: x.stat().st_mtime)
+            checkpoint_files[0].unlink()
+
+    def get_data(self, is_train=True, task_key="Default"):
+        if not self.multi_task:
+            if is_train:
+                try:
+                    batch_data = next(iter(self.training_data))
+                except StopIteration:
+                    # Refresh the status of the dataloader to start from a new epoch
+                    self.training_data = BufferedIterator(
+                        iter(self.training_dataloader)
+                    )
+                    batch_data = next(iter(self.training_data))
+            else:
+                if self.validation_data is None:
+                    return {}, {}, {}
+                try:
+                    batch_data = next(iter(self.validation_data))
+                except StopIteration:
+                    self.validation_data = BufferedIterator(
+                        iter(self.validation_dataloader)
+                    )
+                    batch_data = next(iter(self.validation_data))
+        else:
+            if is_train:
+                try:
+                    batch_data = next(iter(self.training_data[task_key]))
+                except StopIteration:
+                    # Refresh the status of the dataloader to start from a new epoch
+                    self.training_data[task_key] = BufferedIterator(
+                        iter(self.training_dataloader[task_key])
+                    )
+                    batch_data = next(iter(self.training_data[task_key]))
+            else:
+                if self.validation_data[task_key] is None:
+                    return {}, {}, {}
+                try:
+                    batch_data = next(iter(self.validation_data[task_key]))
+                except StopIteration:
+                    self.validation_data[task_key] = BufferedIterator(
+                        iter(self.validation_dataloader[task_key])
+                    )
+                    batch_data = next(iter(self.validation_data[task_key]))
+
+        for key in batch_data.keys():
+            if key == "sid" or key == "fid" or key == "box" or "find_" in key:
+                continue
+            elif not isinstance(batch_data[key], list):
+                if batch_data[key] is not None:
+                    batch_data[key] = batch_data[key].to(DEVICE)
+            else:
+                batch_data[key] = [item.to(DEVICE) for item in batch_data[key]]
+        # we may need a better way to classify which are inputs and which are labels
+        # now wrapper only supports the following inputs:
+        input_keys = [
+            "coord",
+            "atype",
+            "spin",
+            "box",
+            "fparam",
+            "aparam",
+        ]
+        input_dict = {item_key: None for item_key in input_keys}
+        label_dict = {}
+        for item_key in batch_data:
+            if item_key in input_keys:
+                input_dict[item_key] = batch_data[item_key]
+            else:
+                if item_key not in ["sid", "fid"]:
+                    label_dict[item_key] = batch_data[item_key]
+        log_dict = {}
+        if "fid" in batch_data:
+            log_dict["fid"] = batch_data["fid"]
+        log_dict["sid"] = batch_data["sid"]
+        return input_dict, label_dict, log_dict
+
+    def print_header(self, fout, train_results, valid_results):
+        train_keys = sorted(train_results.keys())
+        print_str = ""
+        print_str += "# %5s" % "step"
+        if not self.multi_task:
+            if valid_results:
+                prop_fmt = " %11s %11s"
+                for k in train_keys:
print_str += prop_fmt % (k + "_val", k + "_trn") + else: + prop_fmt = " %11s" + for k in train_keys: + print_str += prop_fmt % (k + "_trn") + else: + for model_key in self.model_keys: + if valid_results[model_key]: + prop_fmt = " %11s %11s" + for k in sorted(train_results[model_key].keys()): + print_str += prop_fmt % ( + k + f"_val_{model_key}", + k + f"_trn_{model_key}", + ) + else: + prop_fmt = " %11s" + for k in sorted(train_results[model_key].keys()): + print_str += prop_fmt % (k + f"_trn_{model_key}") + print_str += " %8s\n" % "lr" + print_str += "# If there is no available reference data, rmse_*_{val,trn} will print nan\n" + fout.write(print_str) + fout.flush() + + def print_on_training(self, fout, step_id, cur_lr, train_results, valid_results): + train_keys = sorted(train_results.keys()) + print_str = "" + print_str += "%7d" % step_id + if not self.multi_task: + if valid_results: + prop_fmt = " %11.2e %11.2e" + for k in train_keys: + print_str += prop_fmt % (valid_results[k], train_results[k]) + else: + prop_fmt = " %11.2e" + for k in train_keys: + print_str += prop_fmt % (train_results[k]) + else: + for model_key in self.model_keys: + if valid_results[model_key]: + prop_fmt = " %11.2e %11.2e" + for k in sorted(valid_results[model_key].keys()): + print_str += prop_fmt % ( + valid_results[model_key][k], + train_results[model_key][k], + ) + else: + prop_fmt = " %11.2e" + for k in sorted(train_results[model_key].keys()): + print_str += prop_fmt % (train_results[model_key][k]) + print_str += f" {cur_lr:8.1e}\n" + fout.write(print_str) + fout.flush() + + +def get_additional_data_requirement(_model): + additional_data_requirement = [] + if _model.get_dim_fparam() > 0: + fparam_requirement_items = [ + DataRequirementItem( + "fparam", _model.get_dim_fparam(), atomic=False, must=True + ) + ] + additional_data_requirement += fparam_requirement_items + if _model.get_dim_aparam() > 0: + aparam_requirement_items = [ + DataRequirementItem( + "aparam", _model.get_dim_aparam(), atomic=True, must=True + ) + ] + additional_data_requirement += aparam_requirement_items + has_spin = getattr(_model, "has_spin", False) + if callable(has_spin): + has_spin = has_spin() + if has_spin: + spin_requirement_items = [ + DataRequirementItem("spin", ndof=3, atomic=True, must=True) + ] + additional_data_requirement += spin_requirement_items + return additional_data_requirement + + +def get_loss(loss_params, start_lr, _ntypes, _model): + loss_type = loss_params.get("type", "ener") + if loss_type == "ener": + loss_params["starter_learning_rate"] = start_lr + return EnergyStdLoss(**loss_params) + else: + loss_params["starter_learning_rate"] = start_lr + return TaskLoss.get_class_by_type(loss_type).get_loss(loss_params) + + +def get_single_model( + _model_params, +): + model = get_model(deepcopy(_model_params)).to(DEVICE) + return model + + +def get_model_for_wrapper(_model_params): + if "model_dict" not in _model_params: + _model = get_single_model( + _model_params, + ) + else: + _model = {} + model_keys = list(_model_params["model_dict"]) + for _model_key in model_keys: + _model[_model_key] = get_single_model( + _model_params["model_dict"][_model_key], + ) + return _model + + +def model_change_out_bias( + _model, + _sample_func, + _bias_adjust_mode="change-by-statistic", +): + old_bias = deepcopy(_model.get_out_bias()) + _model.change_out_bias( + _sample_func, + bias_adjust_mode=_bias_adjust_mode, + ) + new_bias = deepcopy(_model.get_out_bias()) + + model_type_map = _model.get_type_map() + log.info( + f"Change output 
bias of {model_type_map!s} "
+        f"from {to_numpy_array(old_bias).reshape(-1)!s} "
+        f"to {to_numpy_array(new_bias).reshape(-1)!s}."
+    )
+    return _model
+
+
+@contextmanager
+def nvprof_context(enable_profiler: bool, name: str):
+    if enable_profiler:
+        core.nvprof_nvtx_push(name)
+
+    try:
+        yield
+
+    finally:
+        if enable_profiler:
+            core.nvprof_nvtx_pop()
diff --git a/deepmd/pd/train/wrapper.py b/deepmd/pd/train/wrapper.py
new file mode 100644
index 0000000000..c3643f8372
--- /dev/null
+++ b/deepmd/pd/train/wrapper.py
@@ -0,0 +1,213 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+from __future__ import (
+    annotations,
+)
+
+import logging
+from collections import (
+    OrderedDict,
+)
+from typing import (
+    Union,
+)
+
+import paddle
+
+_StateDict = Union[dict[str, paddle.Tensor], OrderedDict[str, paddle.Tensor]]
+
+
+log = logging.getLogger(__name__)
+
+
+class ModelWrapper(paddle.nn.Layer):
+    def __init__(
+        self,
+        model: paddle.nn.Layer | dict,
+        loss: paddle.nn.Layer | dict = None,
+        model_params=None,
+        shared_links=None,
+    ):
+        """Construct a DeePMD model wrapper.
+
+        Args:
+        - model: The model, or a dict of models keyed by task for multi-task training.
+        - loss: The loss module, or a dict of loss modules for multi-task training.
+        - model_params: The dict-like model configuration, stored as extra state.
+        - shared_links: Parameter-sharing rules for multi-task training.
+        """
+        super().__init__()
+        self.model_params = model_params if model_params is not None else {}
+        self.train_infos = {
+            "lr": 0,
+            "step": 0,
+        }
+        self.multi_task = False
+        self.model = paddle.nn.LayerDict()
+        # Model
+        if isinstance(model, paddle.nn.Layer):
+            self.model["Default"] = model
+        elif isinstance(model, dict):
+            self.multi_task = True
+            for task_key in model:
+                assert isinstance(
+                    model[task_key], paddle.nn.Layer
+                ), f"{task_key} in model_dict is not a paddle.nn.Layer!"
+                self.model[task_key] = model[task_key]
+        # Loss
+        self.loss = None
+        if loss is not None:
+            self.loss = paddle.nn.LayerDict()
+            if isinstance(loss, paddle.nn.Layer):
+                self.loss["Default"] = loss
+            elif isinstance(loss, dict):
+                for task_key in loss:
+                    assert isinstance(
+                        loss[task_key], paddle.nn.Layer
+                    ), f"{task_key} in loss_dict is not a paddle.nn.Layer!"
+                    self.loss[task_key] = loss[task_key]
+        self.inference_only = self.loss is None
+
+    def share_params(self, shared_links, resume=False):
+        """
+        Share the parameters of classes following rules defined in shared_links during multitask training.
+        If not starting from a checkpoint (resume is False),
+        some separated parameters (e.g. mean and stddev) will be re-calculated across different classes.
+        """
+        supported_types = ["descriptor", "fitting_net"]
+        for shared_item in shared_links:
+            class_name = shared_links[shared_item]["type"]
+            shared_base = shared_links[shared_item]["links"][0]
+            class_type_base = shared_base["shared_type"]
+            model_key_base = shared_base["model_key"]
+            shared_level_base = shared_base["shared_level"]
+            if "descriptor" in class_type_base:
+                if class_type_base == "descriptor":
+                    base_class = self.model[model_key_base].get_descriptor()
+                elif "hybrid" in class_type_base:
+                    raise NotImplementedError(
+                        "Hybrid descriptor is not implemented yet"
+                    )
+                else:
+                    raise RuntimeError(f"Unknown class_type {class_type_base}!")
+                for link_item in shared_links[shared_item]["links"][1:]:
+                    class_type_link = link_item["shared_type"]
+                    model_key_link = link_item["model_key"]
+                    shared_level_link = int(link_item["shared_level"])
+                    assert (
+                        shared_level_link >= shared_level_base
+                    ), "The shared_links must be sorted by shared_level!"
+                    assert (
+                        "descriptor" in class_type_link
+                    ), f"Class type mismatched: {class_type_base} vs {class_type_link}!"
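+                    # Editorial note (comment added for clarity, not in the original
+                    # patch): each linked branch's descriptor is resolved below and
+                    # its parameters are tied to the base branch's descriptor via
+                    # link_class.share_params, so all linked multi-task branches
+                    # train one shared descriptor.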
+ if class_type_link == "descriptor": + link_class = self.model[model_key_link].get_descriptor() + elif "hybrid" in class_type_link: + raise NotImplementedError( + "Hybrid descriptor is not implemented yet" + ) + else: + raise RuntimeError(f"Unknown class_type {class_type_link}!") + link_class.share_params( + base_class, shared_level_link, resume=resume + ) + log.warning( + f"Shared params of {model_key_base}.{class_type_base} and {model_key_link}.{class_type_link}!" + ) + else: + if hasattr(self.model[model_key_base], class_type_base): + base_class = self.model[model_key_base].__getattr__(class_type_base) + for link_item in shared_links[shared_item]["links"][1:]: + class_type_link = link_item["shared_type"] + model_key_link = link_item["model_key"] + shared_level_link = int(link_item["shared_level"]) + assert ( + shared_level_link >= shared_level_base + ), "The shared_links must be sorted by shared_level!" + assert ( + class_type_base == class_type_link + ), f"Class type mismatched: {class_type_base} vs {class_type_link}!" + link_class = self.model[model_key_link].__getattr__( + class_type_link + ) + link_class.share_params( + base_class, shared_level_link, resume=resume + ) + log.warning( + f"Shared params of {model_key_base}.{class_type_base} and {model_key_link}.{class_type_link}!" + ) + + def forward( + self, + coord, + atype, + spin: paddle.Tensor | None = None, + box: paddle.Tensor | None = None, + cur_lr: paddle.Tensor | None = None, + label: paddle.Tensor | None = None, + task_key: paddle.Tensor | None = None, + inference_only=False, + do_atomic_virial=False, + fparam: paddle.Tensor | None = None, + aparam: paddle.Tensor | None = None, + ): + if not self.multi_task: + task_key = "Default" + else: + assert ( + task_key is not None + ), f"Multitask model must specify the inference task! Supported tasks are {list(self.model.keys())}." 
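+        # Usage sketch (comment added for illustration; shapes and values below are
+        # hypothetical, not from the original patch). For a single-task wrapper,
+        # task_key may be omitted and the "Default" branch is used:
+        #   model_pred, loss, more_loss = wrapper(
+        #       coord=paddle.rand([1, 192, 3]),
+        #       atype=paddle.zeros([1, 192], dtype="int64"),
+        #       box=None, cur_lr=1e-3, label=label_dict,
+        #   )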
+ input_dict = { + "coord": coord, + "atype": atype, + "box": box, + "do_atomic_virial": do_atomic_virial, + "fparam": fparam, + "aparam": aparam, + } + has_spin = getattr(self.model[task_key], "has_spin", False) + if callable(has_spin): + has_spin = has_spin() + if has_spin: + input_dict["spin"] = spin + + if self.inference_only or inference_only: + model_pred = self.model[task_key](**input_dict) + return model_pred, None, None + else: + natoms = atype.shape[-1] + model_pred, loss, more_loss = self.loss[task_key]( + input_dict, + self.model[task_key], + label, + natoms=natoms, + learning_rate=cur_lr, + ) + return model_pred, loss, more_loss + + def load_state_dict( + self, + state_dict: _StateDict, + ) -> tuple[list[str], list[str]]: + self.set_extra_state(state_dict.pop("_extra_state")) + return super().set_state_dict(state_dict) + + def set_state_dict( + self, + state_dict: _StateDict, + ) -> tuple[list[str], list[str]]: + return self.load_state_dict(state_dict) + + def state_dict(self): + state_dict = super().state_dict() + extra_state = self.get_extra_state() + state_dict.update({"_extra_state": extra_state}) + return state_dict + + def set_extra_state(self, extra_state: dict): + self.model_params = extra_state["model_params"] + self.train_infos = extra_state["train_infos"] + return None + + def get_extra_state(self) -> dict: + extra_state = { + "model_params": self.model_params, + "train_infos": self.train_infos, + } + return extra_state diff --git a/deepmd/pd/utils/__init__.py b/deepmd/pd/utils/__init__.py new file mode 100644 index 0000000000..7e1043eda4 --- /dev/null +++ b/deepmd/pd/utils/__init__.py @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later + +from .exclude_mask import ( + AtomExcludeMask, + PairExcludeMask, +) + +__all__ = [ + "PairExcludeMask", + "AtomExcludeMask", +] diff --git a/deepmd/pd/utils/auto_batch_size.py b/deepmd/pd/utils/auto_batch_size.py new file mode 100644 index 0000000000..8cdb5ddea2 --- /dev/null +++ b/deepmd/pd/utils/auto_batch_size.py @@ -0,0 +1,60 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later + +import paddle + +from deepmd.utils.batch_size import AutoBatchSize as AutoBatchSizeBase + + +class AutoBatchSize(AutoBatchSizeBase): + """Auto batch size. + + Parameters + ---------- + initial_batch_size : int, default: 1024 + initial batch size (number of total atoms) when DP_INFER_BATCH_SIZE + is not set + factor : float, default: 2. + increased factor + + """ + + def __init__( + self, + initial_batch_size: int = 1024, + factor: float = 2.0, + ): + super().__init__( + initial_batch_size=initial_batch_size, + factor=factor, + ) + + def is_gpu_available(self) -> bool: + """Check if GPU is available. + + Returns + ------- + bool + True if GPU is available + """ + return paddle.device.cuda.device_count() > 0 + + def is_oom_error(self, e: Exception) -> bool: + """Check if the exception is an OOM error. + + Parameters + ---------- + e : Exception + Exception + """ + # several sources think CUSOLVER_STATUS_INTERNAL_ERROR is another out-of-memory error, + # such as https://github.com/JuliaGPU/CUDA.jl/issues/1924 + # (the meaningless error message should be considered as a bug in cusolver) + if isinstance(e, RuntimeError) and ( + "CUDA out of memory." 
in e.args[0] + or "CUDA driver error: out of memory" in e.args[0] + or "cusolver error: CUSOLVER_STATUS_INTERNAL_ERROR" in e.args[0] + ): + # Release all unoccupied cached memory + # paddle.device.cuda.empty_cache() + return True + return False diff --git a/deepmd/pd/utils/dataloader.py b/deepmd/pd/utils/dataloader.py new file mode 100644 index 0000000000..7a2bf4fe9c --- /dev/null +++ b/deepmd/pd/utils/dataloader.py @@ -0,0 +1,339 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +import logging +import os +import queue +import time +from collections.abc import ( + Iterator, +) +from multiprocessing.dummy import ( + Pool, +) +from threading import ( + Thread, +) + +import h5py +import numpy as np +import paddle +import paddle.distributed as dist + +# import paddle.multiprocessing +from paddle.io import ( + BatchSampler, + DataLoader, + Dataset, + DistributedBatchSampler, + WeightedRandomSampler, +) +from paddle.io.dataloader.collate import ( + default_collate_fn, +) + +from deepmd.pd.utils import ( + env, +) +from deepmd.pd.utils.dataset import ( + DeepmdDataSetForLoader, +) +from deepmd.utils.data import ( + DataRequirementItem, +) +from deepmd.utils.data_system import ( + print_summary, + prob_sys_size_ext, + process_sys_probs, +) + +log = logging.getLogger(__name__) +# paddle.multiprocessing.set_sharing_strategy("file_system") + + +def setup_seed(seed): + paddle.seed(seed) + os.environ["FLAGS_cudnn_deterministic"] = "True" + + +class DpLoaderSet(Dataset): + """A dataset for storing DataLoaders to multiple Systems. + + Parameters + ---------- + sys_path + Path to the data system + batch_size + Max frame count in a batch. + type_map + Gives the name of different atom types + seed + Random seed for dataloader + shuffle + If the data are shuffled (Only effective in serial mode. 
Always shuffle in distributed data parallelism) + """ + + def __init__( + self, + systems, + batch_size, + type_map, + seed=None, + shuffle=True, + ): + if seed is not None: + setup_seed(seed) + if isinstance(systems, str): + with h5py.File(systems) as file: + systems = [os.path.join(systems, item) for item in file.keys()] + + self.systems: list[DeepmdDataSetForLoader] = [] + if len(systems) >= 100: + log.info(f"Constructing DataLoaders from {len(systems)} systems") + + def construct_dataset(system): + return DeepmdDataSetForLoader( + system=system, + type_map=type_map, + ) + + MAX_PROCESSES_NUM = 4 + processes = min( + os.cpu_count() + // ( + dist.get_world_size() + if dist.is_available() and dist.is_initialized() + else 1 + ), + MAX_PROCESSES_NUM, + ) + with Pool(processes) as pool: + self.systems = pool.map(construct_dataset, systems) + + self.sampler_list: list[DistributedBatchSampler] = [] + self.index = [] + self.total_batch = 0 + + self.dataloaders = [] + self.batch_sizes = [] + if isinstance(batch_size, str): + if batch_size == "auto": + rule = 32 + elif batch_size.startswith("auto:"): + rule = int(batch_size.split(":")[1]) + else: + rule = None + log.error("Unsupported batch size type") + for ii in self.systems: + ni = ii._natoms + bsi = rule // ni + if bsi * ni < rule: + bsi += 1 + self.batch_sizes.append(bsi) + elif isinstance(batch_size, list): + self.batch_sizes = batch_size + else: + self.batch_sizes = batch_size * np.ones(len(systems), dtype=int) + assert len(self.systems) == len(self.batch_sizes) + for system, batch_size in zip(self.systems, self.batch_sizes): + if dist.is_available() and dist.is_initialized(): + system_batch_sampler = DistributedBatchSampler( + system, + shuffle=( + (not (dist.is_available() and dist.is_initialized())) + and shuffle + ), + batch_size=int(batch_size), + ) + self.sampler_list.append(system_batch_sampler) + else: + system_batch_sampler = BatchSampler( + system, + shuffle=( + (not (dist.is_available() and dist.is_initialized())) + and shuffle + ), + batch_size=int(batch_size), + ) + self.sampler_list.append(system_batch_sampler) + system_dataloader = DataLoader( + dataset=system, + num_workers=0, # Should be 0 to avoid too many threads forked + batch_sampler=system_batch_sampler, + collate_fn=collate_batch, + use_buffer_reader=False, + places=["cpu"], + ) + self.dataloaders.append(system_dataloader) + self.index.append(len(system_dataloader)) + self.total_batch += len(system_dataloader) + + class LazyIter: + """Lazy iterator to prevent fetching data when iter(item).""" + + def __init__(self, item): + self.item = item + + def __iter__(self): + # directly return + return self + + def __next__(self): + if not isinstance(self.item, Iterator): + # make iterator here lazily + self.item = iter(self.item) + return next(self.item) + + self.iters = [] + for item in self.dataloaders: + self.iters.append(LazyIter(item)) + + def set_noise(self, noise_settings): + # noise_settings['noise_type'] # "trunc_normal", "normal", "uniform" + # noise_settings['noise'] # float, default 1.0 + # noise_settings['noise_mode'] # "prob", "fix_num" + # noise_settings['mask_num'] # if "fix_num", int + # noise_settings['mask_prob'] # if "prob", float + # noise_settings['same_mask'] # coord and type same mask? 
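+        # Hypothetical example of a noise_settings dict (comment added for
+        # illustration; the values are not from the original patch):
+        #   noise_settings = {
+        #       "noise_type": "normal", "noise": 1.0,
+        #       "noise_mode": "prob", "mask_prob": 0.15, "same_mask": False,
+        #   }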
+ for system in self.systems: + system.set_noise(noise_settings) + + def __len__(self): + return len(self.dataloaders) + + def __getitem__(self, idx): + # log.warning(str(paddle.distributed.get_rank())+" idx: "+str(idx)+" index: "+str(self.index[idx])) + try: + batch = next(self.iters[idx]) + except StopIteration: + self.iters[idx] = iter(self.dataloaders[idx]) + batch = next(self.iters[idx]) + batch["sid"] = idx + return batch + + def add_data_requirement(self, data_requirement: list[DataRequirementItem]): + """Add data requirement for each system in multiple systems.""" + for system in self.systems: + system.add_data_requirement(data_requirement) + + def print_summary( + self, + name: str, + prob: list[float], + ): + print_summary( + name, + len(self.systems), + [ss.system for ss in self.systems], + [ss._natoms for ss in self.systems], + self.batch_sizes, + [ + ss._data_system.get_sys_numb_batch(self.batch_sizes[ii]) + for ii, ss in enumerate(self.systems) + ], + prob, + [ss._data_system.pbc for ss in self.systems], + ) + + +_sentinel = object() +QUEUESIZE = 32 + + +class BackgroundConsumer(Thread): + def __init__(self, queue, source, max_len): + Thread.__init__(self) + self._queue = queue + self._source = source # Main DL iterator + self._max_len = max_len # + + def run(self): + for item in self._source: + self._queue.put(item) # Blocking if the queue is full + + # Signal the consumer we are done. + self._queue.put(_sentinel) + + +class BufferedIterator: + def __init__(self, iterable): + self._queue = queue.Queue(QUEUESIZE) + self._iterable = iterable + self._consumer = None + + self.start_time = time.time() + self.warning_time = None + self.total = len(iterable) + + def _create_consumer(self): + self._consumer = BackgroundConsumer(self._queue, self._iterable, self.total) + self._consumer.daemon = True + self._consumer.start() + + def __iter__(self): + return self + + def __len__(self): + return self.total + + def __next__(self): + # Create consumer if not created yet + if self._consumer is None: + self._create_consumer() + # Notify the user if there is a data loading bottleneck + if self._queue.qsize() < min(2, max(1, self._queue.maxsize // 2)): + if time.time() - self.start_time > 5 * 60: + if ( + self.warning_time is None + or time.time() - self.warning_time > 15 * 60 + ): + log.warning( + "Data loading buffer is empty or nearly empty. This may " + "indicate a data loading bottleneck, and increasing the " + "number of workers (--num-workers) may help." 
+                    )
+                    self.warning_time = time.time()
+
+        # Get next example
+        item = self._queue.get()
+        if isinstance(item, Exception):
+            raise item
+        if item is _sentinel:
+            raise StopIteration
+        return item
+
+
+def collate_batch(batch):
+    example = batch[0]
+    result = {}
+    for key in example.keys():
+        if "find_" in key:
+            result[key] = batch[0][key]
+        else:
+            if batch[0][key] is None:
+                result[key] = None
+            elif key == "fid":
+                result[key] = [d[key] for d in batch]
+            elif key == "type":
+                continue
+            else:
+                result[key] = default_collate_fn([d[key] for d in batch])
+    return result
+
+
+def get_weighted_sampler(training_data, prob_style, sys_prob=False):
+    if sys_prob is False:
+        if prob_style == "prob_uniform":
+            prob_v = 1.0 / float(training_data.__len__())
+            probs = [prob_v for ii in range(training_data.__len__())]
+        else:  # prob_sys_size;A:B:p1;C:D:p2 or prob_sys_size = prob_sys_size;0:nsys:1.0
+            if prob_style == "prob_sys_size":
+                style = f"prob_sys_size;0:{len(training_data)}:1.0"
+            else:
+                style = prob_style
+            probs = prob_sys_size_ext(style, len(training_data), training_data.index)
+    else:
+        probs = process_sys_probs(prob_style, training_data.index)
+    log.debug("Generated weighted sampler with prob array: " + str(probs))
+    # training_data.total_batch is the size of one epoch; increase it to avoid
+    # rebuilding the iterators too often
+    len_sampler = training_data.total_batch * max(env.NUM_WORKERS, 1)
+    sampler = WeightedRandomSampler(probs, len_sampler, replacement=True)
+    return sampler
diff --git a/deepmd/pd/utils/dataset.py b/deepmd/pd/utils/dataset.py
new file mode 100644
index 0000000000..1f0533d8fc
--- /dev/null
+++ b/deepmd/pd/utils/dataset.py
@@ -0,0 +1,57 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+
+
+from typing import (
+    Optional,
+)
+
+from paddle.io import (
+    Dataset,
+)
+
+from deepmd.utils.data import (
+    DataRequirementItem,
+    DeepmdData,
+)
+
+
+class DeepmdDataSetForLoader(Dataset):
+    def __init__(self, system: str, type_map: Optional[list[str]] = None):
+        """Construct a DeePMD-style dataset containing frames across different systems.
+
+        Args:
+        - system: Path to the data system.
+        - type_map: Atom types.
+ """ + self.system = system + self._type_map = type_map + self._data_system = DeepmdData(sys_path=system, type_map=self._type_map) + self.mixed_type = self._data_system.mixed_type + self._ntypes = self._data_system.get_ntypes() + self._natoms = self._data_system.get_natoms() + self._natoms_vec = self._data_system.get_natoms_vec(self._ntypes) + + def __len__(self): + return self._data_system.nframes + + def __getitem__(self, index): + """Get a frame from the selected system.""" + b_data = self._data_system.get_item_paddle(index) + b_data["natoms"] = self._natoms_vec + return b_data + + def add_data_requirement(self, data_requirement: list[DataRequirementItem]): + """Add data requirement for this data system.""" + for data_item in data_requirement: + self._data_system.add( + data_item["key"], + data_item["ndof"], + atomic=data_item["atomic"], + must=data_item["must"], + high_prec=data_item["high_prec"], + type_sel=data_item["type_sel"], + repeat=data_item["repeat"], + default=data_item["default"], + dtype=data_item["dtype"], + output_natoms_for_type_sel=data_item["output_natoms_for_type_sel"], + ) diff --git a/deepmd/pd/utils/decomp.py b/deepmd/pd/utils/decomp.py new file mode 100644 index 0000000000..434301441a --- /dev/null +++ b/deepmd/pd/utils/decomp.py @@ -0,0 +1,247 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later + +# This file is used to implement some paddle functions with composite API, +# so as to support high-order differentation when double-backward is needed. +# For example: [norm] --decomposition--> [multiply, power, sum] +# This file will be removed when implmented functions are decomposed into primitive +# function in Paddle framework in the future. + +from __future__ import ( + annotations, +) + +import paddle + +__all__ = [ + "softmax", + "norm", + "take_along_axis", + "scatter_reduce", + "sec", + "masked_add_", +] + + +# decomposition for forward function +def softmax_decomp(x: paddle.Tensor, axis: int = -1) -> paddle.Tensor: + """Forward decompsition function of softmax. + + Parameters + ---------- + x : paddle.Tensor + Input. + axis : int, defaults: -1. + A dimension along which softmax will be computed. + + Returns + ------- + paddle.Tensor + Computed output. + """ + x_max = paddle.max(x, axis=axis, keepdim=True) + x = x - x_max + return paddle.exp(x) / paddle.sum(paddle.exp(x), axis=axis, keepdim=True) + + +def norm_decomp( + x: paddle.Tensor, p: float = 2, axis: bool = -1, keepdim: bool = False +) -> paddle.Tensor: + """Forward decompsition function of norm. + + Parameters + ---------- + x : paddle.Tensor + Input + p : float, default: 2 + Order of norm + axis : bool, default: -1 + Dimensions over which to compute the vector or matrix norm + keepdim : bool, default: False + If set to True, the reduced dimensions are retained in the result as dimensions + with size one + + Returns + ------- + paddle.Tensor + A real-valued tensor, even when A is complex. + """ + if p == 2 or p == 2.0: + # clip for negative indexing, or 1/(0^(k-1)) will cause inf in backward + return (x * x).sum(axis=axis, keepdim=keepdim) ** 0.5 + return (x.abs() ** p).sum(axis=axis, keepdim=keepdim) ** (1 / p) + + +def take_along_axis_decomp( + x: paddle.Tensor, indices: paddle.Tensor, axis: int, broadcast: bool = True +) -> paddle.Tensor: + """Forward decompsition function of take_along_axis. + + Parameters + ---------- + x : paddle.Tensor + The input tensor. + indices : paddle.Tensor + Indices to take along each 1d slice of array. + axis : int + The axis to take 1d slices along. 
+    broadcast : bool, default: True
+        Whether to broadcast the indices.
+
+    Returns
+    -------
+    paddle.Tensor
+        Computed output.
+    """
+    # manually construct indices for gather_nd (ind_gather_nd.ndim == indices.ndim + 1;
+    # the last 1 represents the number of dimension(s) of indices)
+    ind_gather_nd = paddle.stack(
+        paddle.meshgrid(*[paddle.arange(v) for v in indices.shape], indexing="ij"),
+        axis=-1,
+    )
+    ind_gather_nd[..., axis] = indices
+    # compute output using constructed indices via gather_nd
+    out = paddle.gather_nd(x, ind_gather_nd)
+    return out
+
+
+def scatter_reduce_decomp(
+    input: paddle.Tensor,
+    axis: int,
+    index: paddle.Tensor,
+    src: paddle.Tensor,
+    reduce: str,
+) -> paddle.Tensor:
+    """Forward decomposition function of scatter_reduce.
+
+    Parameters
+    ----------
+    input : paddle.Tensor
+        Input tensor.
+    axis : int
+        The axis along which to index.
+    index : paddle.Tensor
+        The indices of elements to scatter and reduce.
+    src : paddle.Tensor
+        The source elements to scatter and reduce.
+    reduce : str
+        The reduction operation to apply for non-unique indices.
+        Supported modes: ("sum", "prod", "mean", "amax", "amin").
+
+    Returns
+    -------
+    paddle.Tensor
+        Computed output.
+    """
+    # reduce: "sum", "prod", "mean", "amax", "amin"
+    if reduce == "sum":
+        output = input.put_along_axis(
+            indices=index, values=src, axis=axis, reduce="add"
+        )
+    elif reduce == "mean":
+        output = input.put_along_axis(
+            indices=index, values=src, axis=axis, reduce="mean"
+        )
+    elif reduce == "prod":
+        output = input.put_along_axis(
+            indices=index, values=src, axis=axis, reduce="mul"
+        )
+    else:
+        raise NotImplementedError("only support mode in ['sum', 'prod', 'mean']!")
+    return output
+
+
+def sec(length: int, size: int) -> list[int]:
+    """Auxiliary function for decomposed functions.
+
+    If length is not divisible by size, the last chunk will be smaller.
+
+    Parameters
+    ----------
+    length : int
+        Length to be chunked.
+    size : int
+        Chunk size.
+
+    Returns
+    -------
+    list[int]
+        Chunked output list.
+    """
+    assert length > 0
+    assert size > 0
+    if length % size == 0:
+        return [size] * (length // size)
+    return [size] * (length // size) + [length % size]
+
+
+def masked_add__decomp(
+    x: paddle.Tensor, mask: paddle.Tensor, v: paddle.Tensor
+) -> paddle.Tensor:
+    """Forward decomposition function of masked_add_ (an in-place operator).
+
+    Parameters
+    ----------
+    x : paddle.Tensor
+        Input tensor.
+    mask : paddle.Tensor
+        Mask tensor.
+    v : paddle.Tensor
+        Value to add.
+
+    Returns
+    -------
+    paddle.Tensor
+        Computed output.
+    """
+    assert mask.dtype == paddle.bool, f"mask must be bool type, but got {mask.dtype}"
+    # indices is bool mask
+    mask_coord = paddle.concat(
+        paddle.nonzero(mask, as_tuple=True),
+        axis=1,
+    )  # [nz, dim]
+    if not paddle.is_tensor(v):
+        v = paddle.full([mask_coord.shape[0]], v, dtype=x.dtype)
+    t = paddle.scatter_nd_add(
+        x,
+        mask_coord,
+        v,
+    )
+    paddle.assign(t, x)  # inplace update
+    return x
+
+
+def normalize_decomp(
+    x: paddle.Tensor,
+    p: float = 2,
+    axis: int = 1,
+    epsilon: float = 1e-12,
+) -> paddle.Tensor:
+    """Forward decomposition function of normalize.
+
+    Parameters
+    ----------
+    x : paddle.Tensor
+        Input tensor.
+    p : float, optional
+        Order of the norm, default: 2
+    axis : int, optional
+        Axis on which to perform normalization, default: 1
+    epsilon : float, optional
+        Epsilon value, default: 1e-12
+
+    Returns
+    -------
+    paddle.Tensor
+        Computed output.
+ """ + return paddle.nn.functional.normalize(x, p, axis, epsilon) + # return x / norm(x, p=p, axis=axis, keepdim=True) + + +# alias for decomposed functions for convinience +normalize = normalize_decomp +masked_add_ = masked_add__decomp +scatter_reduce = scatter_reduce_decomp +take_along_axis = take_along_axis_decomp +norm = norm_decomp +softmax = softmax_decomp diff --git a/deepmd/pd/utils/dp_random.py b/deepmd/pd/utils/dp_random.py new file mode 100644 index 0000000000..e81488c506 --- /dev/null +++ b/deepmd/pd/utils/dp_random.py @@ -0,0 +1,14 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +from deepmd.utils.random import ( + choice, + random, + seed, + shuffle, +) + +__all__ = [ + "choice", + "random", + "seed", + "shuffle", +] diff --git a/deepmd/pd/utils/env.py b/deepmd/pd/utils/env.py new file mode 100644 index 0000000000..4c104db374 --- /dev/null +++ b/deepmd/pd/utils/env.py @@ -0,0 +1,107 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +import logging +import os + +import numpy as np +import paddle + +from deepmd.common import ( + VALID_PRECISION, +) +from deepmd.env import ( + GLOBAL_ENER_FLOAT_PRECISION, + GLOBAL_NP_FLOAT_PRECISION, + get_default_nthreads, + set_default_nthreads, +) + +SAMPLER_RECORD = os.environ.get("SAMPLER_RECORD", False) +try: + # only linux + ncpus = len(os.sched_getaffinity(0)) +except AttributeError: + ncpus = os.cpu_count() +NUM_WORKERS = int(os.environ.get("NUM_WORKERS", min(0, ncpus))) +# Make sure DDP uses correct device if applicable +LOCAL_RANK = paddle.distributed.get_rank() + +if os.environ.get("DEVICE") == "cpu" or paddle.device.cuda.device_count() <= 0: + DEVICE = "cpu" +else: + DEVICE = f"gpu:{LOCAL_RANK}" + +paddle.device.set_device(DEVICE) + +JIT = False +CACHE_PER_SYS = 5 # keep at most so many sets per sys in memory +ENERGY_BIAS_TRAINABLE = True + +PRECISION_DICT = { + "float16": paddle.float16, + "float32": paddle.float32, + "float64": paddle.float64, + "half": paddle.float16, + "single": paddle.float32, + "double": paddle.float64, + "int32": paddle.int32, + "int64": paddle.int64, + "bfloat16": paddle.bfloat16, + "bool": paddle.bool, +} +GLOBAL_PD_FLOAT_PRECISION = PRECISION_DICT[np.dtype(GLOBAL_NP_FLOAT_PRECISION).name] +GLOBAL_PD_ENER_FLOAT_PRECISION = PRECISION_DICT[ + np.dtype(GLOBAL_ENER_FLOAT_PRECISION).name +] +PRECISION_DICT["default"] = GLOBAL_PD_FLOAT_PRECISION +assert VALID_PRECISION.issubset(PRECISION_DICT.keys()) +# cannot automatically generated +RESERVED_PRECISON_DICT = { + paddle.float16: "float16", + paddle.float32: "float32", + paddle.float64: "float64", + paddle.int32: "int32", + paddle.int64: "int64", + paddle.bfloat16: "bfloat16", + paddle.bool: "bool", +} +assert set(PRECISION_DICT.values()) == set(RESERVED_PRECISON_DICT.keys()) +DEFAULT_PRECISION = "float64" + +# throw warnings if threads not set +set_default_nthreads() +inter_nthreads, intra_nthreads = get_default_nthreads() +# if inter_nthreads > 0: # the behavior of 0 is not documented +# os.environ['OMP_NUM_THREADS'] = str(inter_nthreads) +# if intra_nthreads > 0: +# os.environ['CPU_NUM'] = str(intra_nthreads) + + +def enable_prim(enable: bool = True): + """Enable running program in primitive C++ API in eager/static mode.""" + from paddle.framework import ( + core, + ) + + core.set_prim_eager_enabled(enable) + core._set_prim_all_enabled(enable) + log = logging.getLogger(__name__) + log.info(f"{'Enable' if enable else 'Disable'} prim in eager and static mode.") + + +__all__ = [ + "GLOBAL_ENER_FLOAT_PRECISION", + "GLOBAL_NP_FLOAT_PRECISION", + 
"GLOBAL_PD_FLOAT_PRECISION", + "GLOBAL_PD_ENER_FLOAT_PRECISION", + "DEFAULT_PRECISION", + "PRECISION_DICT", + "RESERVED_PRECISON_DICT", + "SAMPLER_RECORD", + "NUM_WORKERS", + "DEVICE", + "JIT", + "CACHE_PER_SYS", + "ENERGY_BIAS_TRAINABLE", + "LOCAL_RANK", + "enable_prim", +] diff --git a/deepmd/pd/utils/env_mat_stat.py b/deepmd/pd/utils/env_mat_stat.py new file mode 100644 index 0000000000..a37a9672f9 --- /dev/null +++ b/deepmd/pd/utils/env_mat_stat.py @@ -0,0 +1,235 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +from collections.abc import ( + Iterator, +) +from typing import ( + TYPE_CHECKING, + Union, +) + +import numpy as np +import paddle + +from deepmd.common import ( + get_hash, +) +from deepmd.pd.model.descriptor.env_mat import ( + prod_env_mat, +) +from deepmd.pd.utils import ( + env, +) +from deepmd.pd.utils.exclude_mask import ( + PairExcludeMask, +) +from deepmd.pd.utils.nlist import ( + extend_input_and_build_neighbor_list, +) +from deepmd.utils.env_mat_stat import EnvMatStat as BaseEnvMatStat +from deepmd.utils.env_mat_stat import ( + StatItem, +) + +if TYPE_CHECKING: + from deepmd.pd.model.descriptor import ( + DescriptorBlock, + ) + + +class EnvMatStat(BaseEnvMatStat): + def compute_stat(self, env_mat: dict[str, paddle.Tensor]) -> dict[str, StatItem]: + """Compute the statistics of the environment matrix for a single system. + + Parameters + ---------- + env_mat : paddle.Tensor + The environment matrix. + + Returns + ------- + dict[str, StatItem] + The statistics of the environment matrix. + """ + stats = {} + for kk, vv in env_mat.items(): + stats[kk] = StatItem( + number=vv.numel().item(), + sum=vv.sum().item() if vv.numel().item() != 0 else paddle.zeros([]), + squared_sum=paddle.square(vv).sum().item() + if vv.numel().item() != 0 + else paddle.zeros([]), + ) + return stats + + +class EnvMatStatSe(EnvMatStat): + """Environmental matrix statistics for the se_a/se_r environmental matrix. + + Parameters + ---------- + descriptor : DescriptorBlock + The descriptor of the model. + """ + + def __init__(self, descriptor: "DescriptorBlock"): + super().__init__() + self.descriptor = descriptor + self.last_dim = ( + self.descriptor.ndescrpt // self.descriptor.nnei + ) # se_r=1, se_a=4 + + def iter( + self, data: list[dict[str, Union[paddle.Tensor, list[tuple[int, int]]]]] + ) -> Iterator[dict[str, StatItem]]: + """Get the iterator of the environment matrix. + + Parameters + ---------- + data : list[dict[str, Union[paddle.Tensor, list[tuple[int, int]]]]] + The data. + + Yields + ------ + dict[str, StatItem] + The statistics of the environment matrix. + """ + zero_mean = paddle.zeros( + [self.descriptor.get_ntypes(), self.descriptor.get_nsel(), self.last_dim], + dtype=env.GLOBAL_PD_FLOAT_PRECISION, + ).to(env.DEVICE) + one_stddev = paddle.ones( + [self.descriptor.get_ntypes(), self.descriptor.get_nsel(), self.last_dim], + dtype=env.GLOBAL_PD_FLOAT_PRECISION, + ).to(env.DEVICE) + if self.last_dim == 4: + radial_only = False + elif self.last_dim == 1: + radial_only = True + else: + raise ValueError( + "last_dim should be 1 for raial-only or 4 for full descriptor." 
+ ) + for system in data: + coord, atype, box, natoms = ( + system["coord"], + system["atype"], + system["box"], + system["natoms"], + ) + ( + extended_coord, + extended_atype, + mapping, + nlist, + ) = extend_input_and_build_neighbor_list( + coord, + atype, + self.descriptor.get_rcut(), + self.descriptor.get_sel(), + mixed_types=self.descriptor.mixed_types(), + box=box, + ) + env_mat, _, _ = prod_env_mat( + extended_coord, + nlist, + atype, + zero_mean, + one_stddev, + self.descriptor.get_rcut(), + self.descriptor.get_rcut_smth(), + radial_only, + protection=self.descriptor.get_env_protection(), + ) + # apply excluded_types + exclude_mask = self.descriptor.emask(nlist, extended_atype) + env_mat *= exclude_mask.unsqueeze(-1).astype(env_mat.dtype) + # reshape to nframes * nloc at the atom level, + # so nframes/mixed_type do not matter + env_mat = env_mat.reshape( + [ + coord.shape[0] * coord.shape[1], + self.descriptor.get_nsel(), + self.last_dim, + ] + ) + atype = atype.reshape([coord.shape[0] * coord.shape[1]]) + # (1, nloc) eq (ntypes, 1), so broadcast is possible + # shape: (ntypes, nloc) + type_idx = paddle.equal( + atype.reshape([1, -1]), + paddle.arange(self.descriptor.get_ntypes(), dtype=atype.dtype) + .to(device=env.DEVICE) + .reshape([-1, 1]), + ) + if "pair_exclude_types" in system: + # shape: (1, nloc, nnei) + exclude_mask = PairExcludeMask( + self.descriptor.get_ntypes(), system["pair_exclude_types"] + )(nlist, extended_atype).reshape( + [1, coord.shape[0] * coord.shape[1], -1] + ) + # shape: (ntypes, nloc, nnei) + type_idx = paddle.logical_and(type_idx.unsqueeze(-1), exclude_mask) + for type_i in range(self.descriptor.get_ntypes()): + dd = env_mat[type_idx[type_i]] + dd = dd.reshape([-1, self.last_dim]) # typen_atoms * unmasked_nnei, 4 + env_mats = {} + env_mats[f"r_{type_i}"] = dd[:, :1] + if self.last_dim == 4: + env_mats[f"a_{type_i}"] = dd[:, 1:] + yield self.compute_stat(env_mats) + + def get_hash(self) -> str: + """Get the hash of the environment matrix. + + Returns + ------- + str + The hash of the environment matrix. 
+ """ + dscpt_type = "se_a" if self.last_dim == 4 else "se_r" + return get_hash( + { + "type": dscpt_type, + "ntypes": self.descriptor.get_ntypes(), + "rcut": round(self.descriptor.get_rcut(), 2), + "rcut_smth": round(self.descriptor.rcut_smth, 2), + "nsel": self.descriptor.get_nsel(), + "sel": self.descriptor.get_sel(), + "mixed_types": self.descriptor.mixed_types(), + } + ) + + def __call__(self): + avgs = self.get_avg() + stds = self.get_std() + + all_davg = [] + all_dstd = [] + + for type_i in range(self.descriptor.get_ntypes()): + if self.last_dim == 4: + davgunit = [[avgs[f"r_{type_i}"], 0, 0, 0]] + dstdunit = [ + [ + stds[f"r_{type_i}"], + stds[f"a_{type_i}"], + stds[f"a_{type_i}"], + stds[f"a_{type_i}"], + ] + ] + elif self.last_dim == 1: + davgunit = [[avgs[f"r_{type_i}"]]] + dstdunit = [ + [ + stds[f"r_{type_i}"], + ] + ] + davg = np.tile(davgunit, [self.descriptor.get_nsel(), 1]) + dstd = np.tile(dstdunit, [self.descriptor.get_nsel(), 1]) + all_davg.append(davg) + all_dstd.append(dstd) + + mean = np.stack(all_davg) + stddev = np.stack(all_dstd) + return mean, stddev diff --git a/deepmd/pd/utils/exclude_mask.py b/deepmd/pd/utils/exclude_mask.py new file mode 100644 index 0000000000..088ac186a8 --- /dev/null +++ b/deepmd/pd/utils/exclude_mask.py @@ -0,0 +1,164 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later + +import numpy as np +import paddle + +from deepmd.pd.utils import ( + decomp, +) +from deepmd.pd.utils.utils import ( + to_paddle_tensor, +) + + +class AtomExcludeMask(paddle.nn.Layer): + """Computes the type exclusion mask for atoms.""" + + def __init__( + self, + ntypes: int, + exclude_types: list[int] = [], + ): + super().__init__() + self.reinit(ntypes, exclude_types) + + def reinit( + self, + ntypes: int, + exclude_types: list[int] = [], + ): + self.ntypes = ntypes + self.exclude_types = exclude_types + self.type_mask = np.array( + [1 if tt_i not in self.exclude_types else 0 for tt_i in range(ntypes)], + dtype=np.int32, + ) + self.type_mask = to_paddle_tensor(self.type_mask).reshape([-1]) + + def get_exclude_types(self): + return self.exclude_types + + def get_type_mask(self): + return self.type_mask + + def forward( + self, + atype: paddle.Tensor, + ) -> paddle.Tensor: + """Compute type exclusion mask for atoms. + + Parameters + ---------- + atype + The extended atom types. shape: nf x natom + + Returns + ------- + mask + The type exclusion mask for atoms. shape: nf x natom + Element [ff,ii] being 0 if type(ii) is excluded, + otherwise being 1. 
+
+        """
+        nf, natom = atype.shape
+        return self.type_mask[atype].reshape([nf, natom]).to(atype.place)
+
+
+class PairExcludeMask(paddle.nn.Layer):
+    """Computes the type exclusion mask for atom pairs."""
+
+    def __init__(
+        self,
+        ntypes: int,
+        exclude_types: list[tuple[int, int]] = [],
+    ):
+        super().__init__()
+        self.reinit(ntypes, exclude_types)
+
+    def reinit(
+        self,
+        ntypes: int,
+        exclude_types: list[tuple[int, int]] = [],
+    ):
+        self.ntypes = ntypes
+        self._exclude_types: set[tuple[int, int]] = set()
+        for tt in exclude_types:
+            assert len(tt) == 2
+            self._exclude_types.add((tt[0], tt[1]))
+            self._exclude_types.add((tt[1], tt[0]))
+        # ntypes + 1 for nlist masks
+        self.type_mask = np.array(
+            [
+                [
+                    1 if (tt_i, tt_j) not in self._exclude_types else 0
+                    for tt_i in range(ntypes + 1)
+                ]
+                for tt_j in range(ntypes + 1)
+            ],
+            dtype=np.int32,
+        )
+        # (ntypes+1 x ntypes+1)
+        self.type_mask = to_paddle_tensor(self.type_mask).reshape([-1])
+        self.no_exclusion = len(self._exclude_types) == 0
+
+    def get_exclude_types(self):
+        return self._exclude_types
+
+    # may have a better place for this method...
+    def forward(
+        self,
+        nlist: paddle.Tensor,
+        atype_ext: paddle.Tensor,
+    ) -> paddle.Tensor:
+        """Compute the type exclusion mask.
+
+        Parameters
+        ----------
+        nlist
+            The neighbor list. shape: nf x nloc x nnei
+        atype_ext
+            The extended atom types. shape: nf x nall
+
+        Returns
+        -------
+        mask
+            The type exclusion mask of shape: nf x nloc x nnei.
+            Element [ff,ii,jj] being 0 if type(ii), type(nlist[ff,ii,jj]) is excluded,
+            otherwise being 1.
+
+        """
+        if self.no_exclusion:
+            # safely return 1 if nothing is excluded.
+            return paddle.ones_like(nlist, dtype=paddle.int32).to(device=nlist.place)
+        nf, nloc, nnei = nlist.shape
+        nall = atype_ext.shape[1]
+        # add virtual atom of type ntypes.
nf x nall+1 + ae = paddle.concat( + [ + atype_ext, + self.ntypes + * paddle.ones([nf, 1], dtype=atype_ext.dtype).to( + device=atype_ext.place + ), + ], + axis=-1, + ) + type_i = atype_ext[:, :nloc].reshape([nf, nloc]) * (self.ntypes + 1) + # nf x nloc x nnei + index = paddle.where(nlist == -1, nall, nlist).reshape([nf, nloc * nnei]) + # type_j = paddle.take_along_axis(ae, axis=1, indices=index).reshape( + # [nf, nloc, nnei] + # ) + type_j = decomp.take_along_axis(ae, axis=1, indices=index).reshape( + [nf, nloc, nnei] + ) + type_ij = type_i[:, :, None] + type_j + # nf x (nloc x nnei) + type_ij = type_ij.reshape([nf, nloc * nnei]) + mask = ( + self.type_mask[type_ij] + .reshape([nf, nloc, nnei]) + .to(atype_ext.place) + .astype("bool") + ) + return mask diff --git a/deepmd/pd/utils/finetune.py b/deepmd/pd/utils/finetune.py new file mode 100644 index 0000000000..edac72d9c9 --- /dev/null +++ b/deepmd/pd/utils/finetune.py @@ -0,0 +1,200 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +import logging +from copy import ( + deepcopy, +) + +import paddle + +from deepmd.utils.finetune import ( + FinetuneRuleItem, +) + +log = logging.getLogger(__name__) + + +def get_finetune_rule_single( + _single_param_target, + _model_param_pretrained, + from_multitask=False, + model_branch="Default", + model_branch_from="", + change_model_params=False, +): + single_config = deepcopy(_single_param_target) + new_fitting = False + model_branch_chosen = "Default" + + if not from_multitask: + single_config_chosen = deepcopy(_model_param_pretrained) + if model_branch_from == "RANDOM": + # not ["", "RANDOM"], because single-from-single finetune uses pretrained fitting in default + new_fitting = True + else: + model_dict_params = _model_param_pretrained["model_dict"] + if model_branch_from in ["", "RANDOM"]: + model_branch_chosen = next(iter(model_dict_params.keys())) + new_fitting = True + log.warning( + "The fitting net will be re-init instead of using that in the pretrained model! " + "The bias_adjust_mode will be set-by-statistic!" + ) + else: + model_branch_chosen = model_branch_from + assert model_branch_chosen in model_dict_params, ( + f"No model branch named '{model_branch_chosen}'! " + f"Available ones are {list(model_dict_params.keys())}." + ) + single_config_chosen = deepcopy(model_dict_params[model_branch_chosen]) + old_type_map, new_type_map = ( + single_config_chosen["type_map"], + single_config["type_map"], + ) + finetune_rule = FinetuneRuleItem( + p_type_map=old_type_map, + type_map=new_type_map, + model_branch=model_branch_chosen, + random_fitting=new_fitting, + ) + if change_model_params: + trainable_param = { + "descriptor": single_config.get("descriptor", {}).get("trainable", True), + "fitting_net": single_config.get("fitting_net", {}).get("trainable", True), + } + single_config["descriptor"] = single_config_chosen["descriptor"] + if not new_fitting: + single_config["fitting_net"] = single_config_chosen["fitting_net"] + log.info( + f"Change the '{model_branch}' model configurations according to the model branch " + f"'{model_branch_chosen}' in the pretrained one..." 
+ ) + for net_type in trainable_param: + if net_type in single_config: + single_config[net_type]["trainable"] = trainable_param[net_type] + else: + single_config[net_type] = {"trainable": trainable_param[net_type]} + return single_config, finetune_rule + + +def get_finetune_rules( + finetune_model, model_config, model_branch="", change_model_params=True +): + """ + Get fine-tuning rules and (optionally) change the model_params according to the pretrained one. + + This function gets the fine-tuning rules and (optionally) changes input in different modes as follows: + 1. Single-task fine-tuning from a single-task pretrained model: + - The model will be fine-tuned based on the pretrained model. + - (Optional) Updates the model parameters based on the pretrained model. + 2. Single-task fine-tuning from a multi-task pretrained model: + - The model will be fine-tuned based on the selected branch in the pretrained model. + The chosen branch can be defined from the command-line or `finetune_head` input parameter. + If not defined, model parameters in the fitting network will be randomly initialized. + - (Optional) Updates the model parameters based on the selected branch in the pretrained model. + 3. Multi-task fine-tuning from a single-task pretrained model: + - The model in each branch will be fine-tuned or resumed based on the single branch ('Default') in the pretrained model. + The chosen branches can be defined from the `finetune_head` input parameter of each branch. + - If `finetune_head` is defined as 'Default', + it will be fine-tuned based on the single branch ('Default') in the pretrained model. + - If `finetune_head` is not defined and the model_key is 'Default', + it will resume from the single branch ('Default') in the pretrained model without fine-tuning. + - If `finetune_head` is not defined and the model_key is not 'Default', + it will be fine-tuned based on the single branch ('Default') in the pretrained model, + while model parameters in the fitting network of the branch will be randomly initialized. + - (Optional) Updates model parameters in each branch based on the single branch ('Default') in the pretrained model. + 4. Multi-task fine-tuning from a multi-task pretrained model: + - The model in each branch will be fine-tuned or resumed based on the chosen branches in the pretrained model. + The chosen branches can be defined from the `finetune_head` input parameter of each branch. + - If `finetune_head` is defined as one of the branches in the pretrained model, + it will be fine-tuned based on the chosen branch in the pretrained model. + - If `finetune_head` is not defined and the model_key is the same as one of those in the pretrained model, + it will resume from the model_key branch in the pretrained model without fine-tuning. + - If `finetune_head` is not defined and a new model_key is used, + it will be fine-tuned based on the chosen branch in the pretrained model, + while model parameters in the fitting network of the branch will be randomly initialized. + - (Optional) Updates model parameters in each branch based on the chosen branches in the pretrained model. + + Parameters + ---------- + finetune_model + The pretrained model. + model_config + The fine-tuning input parameters. + model_branch + The model branch chosen in command-line mode, only for single-task fine-tuning. + change_model_params + Whether to change the model parameters according to the pretrained one. + + Returns + ------- + model_config: + Updated model parameters. 
+    finetune_links:
+        Fine-tuning rules in a dict format, with `model_branch`: FinetuneRuleItem pairs.
+    """
+    multi_task = "model_dict" in model_config
+    state_dict = paddle.load(finetune_model)
+    if "model" in state_dict:
+        state_dict = state_dict["model"]
+    last_model_params = state_dict["_extra_state"]["model_params"]
+    finetune_from_multi_task = "model_dict" in last_model_params
+    finetune_links = {}
+    if not multi_task:
+        # use command-line first
+        if model_branch == "" and "finetune_head" in model_config:
+            model_branch = model_config["finetune_head"]
+        model_config, finetune_rule = get_finetune_rule_single(
+            model_config,
+            last_model_params,
+            from_multitask=finetune_from_multi_task,
+            model_branch="Default",
+            model_branch_from=model_branch,
+            change_model_params=change_model_params,
+        )
+        finetune_links["Default"] = finetune_rule
+    else:
+        assert model_branch == "", (
+            "Multi-task fine-tuning does not support choosing the branch from the command line! "
+            "Please define the 'finetune_head' in each model params!"
+        )
+        target_keys = model_config["model_dict"].keys()
+        if not finetune_from_multi_task:
+            pretrained_keys = ["Default"]
+        else:
+            pretrained_keys = last_model_params["model_dict"].keys()
+        for model_key in target_keys:
+            resuming = False
+            if (
+                "finetune_head" in model_config["model_dict"][model_key]
+                and model_config["model_dict"][model_key]["finetune_head"] != "RANDOM"
+            ):
+                pretrained_key = model_config["model_dict"][model_key]["finetune_head"]
+                assert pretrained_key in pretrained_keys, (
+                    f"The '{pretrained_key}' head chosen to finetune does not exist in the pretrained model! "
+                    f"Available heads are: {list(pretrained_keys)}"
+                )
+                model_branch_from = pretrained_key
+            elif (
+                "finetune_head" not in model_config["model_dict"][model_key]
+                and model_key in pretrained_keys
+            ):
+                # do nothing if "finetune_head" is not defined for heads that exist in the pretrained model
+                # this will just do resuming
+                model_branch_from = model_key
+                resuming = True
+            else:
+                # if "finetune_head" is not defined for new heads, or is set to "RANDOM", the fitting net will be randomly initialized
+                model_branch_from = "RANDOM"
+            model_config["model_dict"][model_key], finetune_rule = (
+                get_finetune_rule_single(
+                    model_config["model_dict"][model_key],
+                    last_model_params,
+                    from_multitask=finetune_from_multi_task,
+                    model_branch=model_key,
+                    model_branch_from=model_branch_from,
+                    change_model_params=change_model_params,
+                )
+            )
+            finetune_links[model_key] = finetune_rule
+            finetune_links[model_key].resuming = resuming
+    return model_config, finetune_links
diff --git a/deepmd/pd/utils/multi_task.py b/deepmd/pd/utils/multi_task.py
new file mode 100644
index 0000000000..680dc53c79
--- /dev/null
+++ b/deepmd/pd/utils/multi_task.py
@@ -0,0 +1,162 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+from copy import (
+    deepcopy,
+)
+
+from deepmd.pd.model.descriptor import (
+    BaseDescriptor,
+)
+from deepmd.pd.model.task import (
+    BaseFitting,
+)
+
+
+def preprocess_shared_params(model_config):
+    """Preprocess the model params for multitask model, and generate the links dict for further sharing.
+
+    Args:
+        model_config: Model params of multitask model.
+
+    Returns
+    -------
+    model_config: Preprocessed model params of multitask model.
+        Those string names are replaced with real params in `shared_dict` of model params.
+    shared_links: Dict of link infos for further sharing.
+        Each item, whose key must be in `shared_dict`, is a dict with following keys:
+        - "type": The real class type of this item.
+ - "links": List of shared settings, each sub-item is a dict with following keys: + - "model_key": Model key in the `model_dict` to share this item. + - "shared_type": Type of this shard item. + - "shared_level": Shared level (int) of this item in this model. + Lower for more params to share, 0 means to share all params in this item. + This list are sorted by "shared_level". + For example, if one has `model_config` like this: + "model": { + "shared_dict": { + "my_type_map": ["foo", "bar"], + "my_des1": { + "type": "se_e2_a", + "neuron": [10, 20, 40] + }, + }, + "model_dict": { + "model_1": { + "type_map": "my_type_map", + "descriptor": "my_des1", + "fitting_net": { + "neuron": [100, 100, 100] + } + }, + "model_2": { + "type_map": "my_type_map", + "descriptor": "my_des1", + "fitting_net": { + "neuron": [100, 100, 100] + } + } + "model_3": { + "type_map": "my_type_map", + "descriptor": "my_des1:1", + "fitting_net": { + "neuron": [100, 100, 100] + } + } + } + } + The above config will init three model branches named `model_1` and `model_2` and `model_3`, + in which: + - `model_2` and `model_3` will have the same `type_map` as that in `model_1`. + - `model_2` will share all the parameters of `descriptor` with `model_1`, + while `model_3` will share part of parameters of `descriptor` with `model_1` + on human-defined share-level `1` (default is `0`, meaning share all the parameters). + - `model_1`, `model_2` and `model_3` have three different `fitting_net`s. + The returned `model_config` will automatically fulfill the input `model_config` as if there's no sharing, + and the `shared_links` will keep all the sharing information with looking: + { + 'my_des1': { + 'type': 'DescrptSeA', + 'links': [ + {'model_key': 'model_1', + 'shared_type': 'descriptor', + 'shared_level': 0}, + {'model_key': 'model_2', + 'shared_type': 'descriptor', + 'shared_level': 0}, + {'model_key': 'model_3', + 'shared_type': 'descriptor', + 'shared_level': 1} + ] + } + } + + """ + assert "model_dict" in model_config, "only multi-task model can use this method!" + supported_types = ["type_map", "descriptor", "fitting_net"] + shared_dict = model_config.get("shared_dict", {}) + shared_links = {} + type_map_keys = [] + + def replace_one_item(params_dict, key_type, key_in_dict, suffix="", index=None): + shared_type = key_type + shared_key = key_in_dict + shared_level = 0 + if ":" in key_in_dict: + shared_key = key_in_dict.split(":")[0] + shared_level = int(key_in_dict.split(":")[1]) + assert ( + shared_key in shared_dict + ), f"Appointed {shared_type} {shared_key} are not in the shared_dict! Please check the input params." 
+        if index is None:
+            params_dict[shared_type] = deepcopy(shared_dict[shared_key])
+        else:
+            params_dict[index] = deepcopy(shared_dict[shared_key])
+        if shared_type == "type_map":
+            if key_in_dict not in type_map_keys:
+                type_map_keys.append(key_in_dict)
+        else:
+            if shared_key not in shared_links:
+                class_name = get_class_name(shared_type, shared_dict[shared_key])
+                shared_links[shared_key] = {"type": class_name, "links": []}
+            link_item = {
+                "model_key": model_key,
+                "shared_type": shared_type + suffix,
+                "shared_level": shared_level,
+            }
+            shared_links[shared_key]["links"].append(link_item)
+
+    for model_key in model_config["model_dict"]:
+        model_params_item = model_config["model_dict"][model_key]
+        for item_key in model_params_item:
+            if item_key in supported_types:
+                item_params = model_params_item[item_key]
+                if isinstance(item_params, str):
+                    replace_one_item(model_params_item, item_key, item_params)
+                elif item_params.get("type", "") == "hybrid":
+                    for ii, hybrid_item in enumerate(item_params["list"]):
+                        if isinstance(hybrid_item, str):
+                            replace_one_item(
+                                model_params_item[item_key]["list"],
+                                item_key,
+                                hybrid_item,
+                                suffix=f"_hybrid_{ii}",
+                                index=ii,
+                            )
+    for shared_key in shared_links:
+        shared_links[shared_key]["links"] = sorted(
+            shared_links[shared_key]["links"],
+            key=lambda x: x["shared_level"]
+            - ("spin" in model_config["model_dict"][x["model_key"]]) * 100,
+        )
+        # a little trick to put spin models in front as the base models,
+        # because their type embeddings are more general.
+    assert len(type_map_keys) == 1, "Multitask model must have only one type_map!"
+    return model_config, shared_links
+
+
+def get_class_name(item_key, item_params):
+    if item_key == "descriptor":
+        return BaseDescriptor.get_class_by_type(item_params.get("type", "se_e2_a"))
+    elif item_key == "fitting_net":
+        return BaseFitting.get_class_by_type(item_params.get("type", "ener"))
+    else:
+        raise RuntimeError(f"Unknown class_name type {item_key}")
diff --git a/deepmd/pd/utils/neighbor_stat.py b/deepmd/pd/utils/neighbor_stat.py
new file mode 100644
index 0000000000..af39161e98
--- /dev/null
+++ b/deepmd/pd/utils/neighbor_stat.py
@@ -0,0 +1,197 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+from collections.abc import (
+    Iterator,
+)
+from typing import (
+    Optional,
+)
+
+import numpy as np
+import paddle
+
+from deepmd.pd.utils.auto_batch_size import (
+    AutoBatchSize,
+)
+from deepmd.pd.utils.env import (
+    DEVICE,
+)
+from deepmd.pd.utils.nlist import (
+    extend_coord_with_ghosts,
+)
+from deepmd.utils.data_system import (
+    DeepmdDataSystem,
+)
+from deepmd.utils.neighbor_stat import NeighborStat as BaseNeighborStat
+
+
+class NeighborStatOP(paddle.nn.Layer):
+    """Class for getting neighbor statistics data information.
+
+    Parameters
+    ----------
+    ntypes
+        The num of atom types
+    rcut
+        The cut-off radius
+    mixed_types : bool, optional
+        If True, treat neighbors of all types as a single type.
+    """
+
+    def __init__(
+        self,
+        ntypes: int,
+        rcut: float,
+        mixed_types: bool,
+    ) -> None:
+        super().__init__()
+        self.rcut = float(rcut)
+        self.ntypes = ntypes
+        self.mixed_types = mixed_types
+
+    def forward(
+        self,
+        coord: paddle.Tensor,
+        atype: paddle.Tensor,
+        cell: Optional[paddle.Tensor],
+    ) -> tuple[paddle.Tensor, paddle.Tensor]:
+        """Calculate the nearest neighbor distance between atoms, the maximum
+        neighbor size of atoms and the output data range of the environment matrix.
+
+        Parameters
+        ----------
+        coord
+            The coordinates of atoms.
+        atype
+            The atom types.
+        cell
+            The cell.
+
+        Returns
+        -------
+        paddle.Tensor
+            The minimal squared distance between two atoms, in the shape of (nframes,)
+        paddle.Tensor
+            The maximal number of neighbors
+        """
+        nframes = coord.shape[0]
+        coord = coord.reshape([nframes, -1, 3])
+        nloc = coord.shape[1]
+        coord = coord.reshape([nframes, nloc * 3])
+        extend_coord, extend_atype, _ = extend_coord_with_ghosts(
+            coord, atype, cell, self.rcut
+        )
+
+        coord1 = extend_coord.reshape([nframes, -1])
+        nall = coord1.shape[1] // 3
+        coord0 = coord1[:, : nloc * 3]
+        diff: paddle.Tensor = coord1.reshape([nframes, -1, 3]).unsqueeze(
+            1
+        ) - coord0.reshape([nframes, -1, 3]).unsqueeze(2)
+        assert list(diff.shape) == [nframes, nloc, nall, 3]
+        # remove the diagonal elements
+        mask = paddle.eye(nloc, nall).to(dtype=paddle.bool, device=diff.place)
+        # diff[:, mask] = float("inf")
+        # diff.masked_fill_(
+        #     paddle.broadcast_to(mask.unsqueeze([0, -1]), diff.shape),
+        #     paddle.to_tensor(float("inf")),
+        # )
+        diff[paddle.broadcast_to(mask.unsqueeze([0, -1]), diff.shape)] = float("inf")
+        rr2 = paddle.sum(paddle.square(diff), axis=-1)
+        min_rr2 = paddle.min(rr2, axis=-1)
+        # count the number of neighbors
+        if not self.mixed_types:
+            mask = rr2 < self.rcut**2
+            nnei = paddle.zeros((nframes, nloc, self.ntypes), dtype=paddle.int64)
+            for ii in range(self.ntypes):
+                nnei[:, :, ii] = paddle.sum(
+                    mask & ((extend_atype == ii)[:, None, :]), axis=-1
+                )
+        else:
+            mask = rr2 < self.rcut**2
+            # virtual types (<0) are not counted
+            nnei = paddle.sum(
+                mask & ((extend_atype >= 0).unsqueeze(1)), axis=-1
+            ).reshape([nframes, nloc, 1])
+        max_nnei = paddle.max(nnei, axis=1)
+        return min_rr2, max_nnei
+
+
+class NeighborStat(BaseNeighborStat):
+    """Neighbor statistics using the Paddle backend.
+
+    Parameters
+    ----------
+    ntypes : int
+        The num of atom types
+    rcut : float
+        The cut-off radius
+    mixed_type : bool, optional, default=False
+        Treat all types as a single type.
+    """
+
+    def __init__(
+        self,
+        ntypes: int,
+        rcut: float,
+        mixed_type: bool = False,
+    ) -> None:
+        super().__init__(ntypes, rcut, mixed_type)
+        op = NeighborStatOP(ntypes, rcut, mixed_type)
+        # self.op = paddle.jit.to_static(op)
+        self.op = op
+        self.auto_batch_size = AutoBatchSize()
+
+    def iterator(
+        self, data: DeepmdDataSystem
+    ) -> Iterator[tuple[np.ndarray, float, str]]:
+        """Abstract method for producing data.
+
+        Yields
+        ------
+        np.ndarray
+            The maximal number of neighbors
+        float
+            The squared minimal distance between two atoms
+        str
+            The directory of the data system
+        """
+        for ii in range(len(data.system_dirs)):
+            for jj in data.data_systems[ii].dirs:
+                data_set = data.data_systems[ii]
+                data_set_data = data_set._load_set(jj)
+                minrr2, max_nnei = self.auto_batch_size.execute_all(
+                    self._execute,
+                    data_set_data["coord"].shape[0],
+                    data_set.get_natoms(),
+                    data_set_data["coord"],
+                    data_set_data["type"],
+                    data_set_data["box"] if data_set.pbc else None,
+                )
+                yield np.max(max_nnei, axis=0), np.min(minrr2), jj
+
+    def _execute(
+        self,
+        coord: np.ndarray,
+        atype: np.ndarray,
+        cell: Optional[np.ndarray],
+    ):
+        """Execute the operation.
+
+        Parameters
+        ----------
+        coord
+            The coordinates of atoms.
+        atype
+            The atom types.
+        cell
+            The cell.
+ """ + with paddle.no_grad(): + minrr2, max_nnei = self.op( + paddle.to_tensor(coord, place=DEVICE), + paddle.to_tensor(atype, place=DEVICE), + paddle.to_tensor(cell, place=DEVICE) if cell is not None else None, + ) + minrr2 = minrr2.numpy() + max_nnei = max_nnei.numpy() + return minrr2, max_nnei diff --git a/deepmd/pd/utils/nlist.py b/deepmd/pd/utils/nlist.py new file mode 100644 index 0000000000..44924ce07d --- /dev/null +++ b/deepmd/pd/utils/nlist.py @@ -0,0 +1,535 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +from typing import ( + Optional, + Union, +) + +import paddle + +from deepmd.pd.utils import ( + decomp, + env, +) +from deepmd.pd.utils.region import ( + normalize_coord, + to_face_distance, +) + + +def extend_input_and_build_neighbor_list( + coord, + atype, + rcut: float, + sel: list[int], + mixed_types: bool = False, + box: Optional[paddle.Tensor] = None, +): + nframes, nloc = atype.shape[:2] + if box is not None: + box_gpu = box.to(coord.place) + coord_normalized = normalize_coord( + coord.reshape([nframes, nloc, 3]), + box_gpu.reshape([nframes, 3, 3]), + ) + else: + box_gpu = None + coord_normalized = coord.clone() + extended_coord, extended_atype, mapping = extend_coord_with_ghosts( + coord_normalized, atype, box_gpu, rcut, box + ) + nlist = build_neighbor_list( + extended_coord, + extended_atype, + nloc, + rcut, + sel, + distinguish_types=(not mixed_types), + ) + extended_coord = extended_coord.reshape([nframes, -1, 3]) + return extended_coord, extended_atype, mapping, nlist + + +def build_neighbor_list( + coord: paddle.Tensor, + atype: paddle.Tensor, + nloc: int, + rcut: float, + sel: Union[int, list[int]], + distinguish_types: bool = True, +) -> paddle.Tensor: + """Build neighbor list for a single frame. keeps nsel neighbors. + + Parameters + ---------- + coord : paddle.Tensor + exptended coordinates of shape [batch_size, nall x 3] + atype : paddle.Tensor + extended atomic types of shape [batch_size, nall] + if type < 0 the atom is treat as virtual atoms. + nloc : int + number of local atoms. + rcut : float + cut-off radius + sel : int or list[int] + maximal number of neighbors (of each type). + if distinguish_types==True, nsel should be list and + the length of nsel should be equal to number of + types. + distinguish_types : bool + distinguish different types. + + Returns + ------- + neighbor_list : paddle.Tensor + Neighbor list of shape [batch_size, nloc, nsel], the neighbors + are stored in an ascending order. If the number of + neighbors is less than nsel, the positions are masked + with -1. The neighbor list of an atom looks like + |------ nsel ------| + xx xx xx xx -1 -1 -1 + if distinguish_types==True and we have two types + |---- nsel[0] -----| |---- nsel[1] -----| + xx xx xx xx -1 -1 -1 xx xx xx -1 -1 -1 -1 + For virtual atoms all neighboring positions are filled with -1. + + """ + batch_size = coord.shape[0] + coord = coord.reshape([batch_size, -1]) + nall = coord.shape[1] // 3 + # fill virtual atoms with large coords so they are not neighbors of any + # real atom. 
+    if coord.numel() > 0:
+        xmax = paddle.max(coord) + 2.0 * rcut
+    else:
+        xmax = paddle.zeros([], dtype=coord.dtype).to(device=coord.place) + 2.0 * rcut
+    # nf x nall
+    is_vir = atype < 0
+    coord1 = paddle.where(
+        is_vir[:, :, None], xmax, coord.reshape([batch_size, nall, 3])
+    ).reshape([batch_size, nall * 3])
+    if isinstance(sel, int):
+        sel = [sel]
+    # nloc x 3
+    coord0 = coord1[:, : nloc * 3]
+    # nloc x nall x 3
+    diff = coord1.reshape([batch_size, -1, 3]).unsqueeze(1) - coord0.reshape(
+        [batch_size, -1, 3]
+    ).unsqueeze(2)
+    if paddle.in_dynamic_mode():
+        assert list(diff.shape) == [batch_size, nloc, nall, 3]
+    # nloc x nall
+    # rr = paddle.linalg.norm(diff, axis=-1)
+    rr = decomp.norm(diff, axis=-1)
+    # if central atom has two zero distances, sorting sometimes can not exclude itself
+    rr = rr - paddle.eye(nloc, nall, dtype=rr.dtype).to(device=rr.place).unsqueeze(0)
+    rr, nlist = paddle.sort(rr, axis=-1), paddle.argsort(rr, axis=-1)
+    # nloc x (nall-1)
+    rr = rr[:, :, 1:]
+    nlist = nlist[:, :, 1:]
+
+    return _trim_mask_distinguish_nlist(
+        is_vir, atype, rr, nlist, rcut, sel, distinguish_types
+    )
+
+
+def _trim_mask_distinguish_nlist(
+    is_vir_cntl: paddle.Tensor,
+    atype_neig: paddle.Tensor,
+    rr: paddle.Tensor,
+    nlist: paddle.Tensor,
+    rcut: float,
+    sel: list[int],
+    distinguish_types: bool,
+) -> paddle.Tensor:
+    """Trim the size of nlist, mask if any central atom is virtual, distinguish types if necessary."""
+    nsel = sum(sel)
+    # nloc x nsel
+    batch_size, nloc, nnei = rr.shape
+    if paddle.in_dynamic_mode():
+        assert batch_size == is_vir_cntl.shape[0]
+    if nsel <= nnei:
+        rr = rr[:, :, :nsel]
+        nlist = nlist[:, :, :nsel]
+    else:
+        rr = paddle.concat(
+            [
+                rr,
+                paddle.ones([batch_size, nloc, nsel - nnei]).to(
+                    device=rr.place, dtype=rr.dtype
+                )
+                + rcut,
+            ],
+            axis=-1,
+        )
+        nlist = paddle.concat(
+            [
+                nlist,
+                paddle.ones([batch_size, nloc, nsel - nnei], dtype=nlist.dtype).to(
+                    device=rr.place
+                ),
+            ],
+            axis=-1,
+        )
+    if paddle.in_dynamic_mode():
+        assert list(nlist.shape) == [batch_size, nloc, nsel]
+    nlist = paddle.where(
+        paddle.logical_or((rr > rcut), is_vir_cntl[:, :nloc, None]), -1, nlist
+    )
+    if distinguish_types:
+        return nlist_distinguish_types(nlist, atype_neig, sel)
+    else:
+        return nlist
+
+
+def build_directional_neighbor_list(
+    coord_cntl: paddle.Tensor,
+    atype_cntl: paddle.Tensor,
+    coord_neig: paddle.Tensor,
+    atype_neig: paddle.Tensor,
+    rcut: float,
+    sel: Union[int, list[int]],
+    distinguish_types: bool = True,
+) -> paddle.Tensor:
+    """Build directional neighbor list.
+
+    With each central atom, all the neighbor atoms in the cut-off radius will
+    be recorded in the neighbor list. The maximum number of neighbors is nsel. If the real
+    number of neighbors is larger than nsel, the neighbors will be sorted with the
+    distance and the first nsel neighbors are kept.
+
+    Important: the central and neighboring atoms are assumed to be different atoms.
+
+    Parameters
+    ----------
+    coord_cntl : paddle.Tensor
+        coordinates of central atoms. assumed to be local atoms.
+        shape [batch_size, nloc_central x 3]
+    atype_cntl : paddle.Tensor
+        atomic types of central atoms. shape [batch_size, nloc_central]
+        if type < 0 the atom is treated as virtual atoms.
+    coord_neig : paddle.Tensor
+        extended coordinates of neighbor atoms. shape [batch_size, nall_neighbor x 3]
+    atype_neig : paddle.Tensor
+        extended atomic types of neighbor atoms. shape [batch_size, nall_neighbor]
+        if type < 0 the atom is treated as virtual atoms.
+ rcut : float + cut-off radius + sel : int or list[int] + maximal number of neighbors (of each type). + if distinguish_types==True, nsel should be list and + the length of nsel should be equal to number of + types. + distinguish_types : bool + distinguish different types. + + Returns + ------- + neighbor_list : paddle.Tensor + Neighbor list of shape [batch_size, nloc_central, nsel], the neighbors + are stored in an ascending order. If the number of neighbors is less than nsel, + the positions are masked with -1. The neighbor list of an atom looks like + |------ nsel ------| + xx xx xx xx -1 -1 -1 + if distinguish_types==True and we have two types + |---- nsel[0] -----| |---- nsel[1] -----| + xx xx xx xx -1 -1 -1 xx xx xx -1 -1 -1 -1 + For virtual atoms all neighboring positions are filled with -1. + """ + batch_size = coord_cntl.shape[0] + coord_cntl = coord_cntl.reshape([batch_size, -1]) + nloc_cntl = coord_cntl.shape[1] // 3 + coord_neig = coord_neig.reshape([batch_size, -1]) + nall_neig = coord_neig.shape[1] // 3 + # fill virtual atoms with large coords so they are not neighbors of any + # real atom. + if coord_neig.numel() > 0: + xmax = paddle.max(coord_cntl) + 2.0 * rcut + else: + xmax = ( + paddle.zeros([1], dtype=coord_neig.dtype, device=coord_neig.place) + + 2.0 * rcut + ) + # nf x nloc + is_vir_cntl = atype_cntl < 0 + # nf x nall + is_vir_neig = atype_neig < 0 + # nf x nloc x 3 + coord_cntl = coord_cntl.reshape([batch_size, nloc_cntl, 3]) + # nf x nall x 3 + coord_neig = paddle.where( + is_vir_neig[:, :, None], xmax, coord_neig.reshape([batch_size, nall_neig, 3]) + ).reshape([batch_size, nall_neig, 3]) + # nsel + if isinstance(sel, int): + sel = [sel] + # nloc x nall x 3 + diff = coord_neig[:, None, :, :] - coord_cntl[:, :, None, :] + if paddle.in_dynamic_mode(): + assert list(diff.shape) == [batch_size, nloc_cntl, nall_neig, 3] + # nloc x nall + # rr = paddle.linalg.norm(diff, axis=-1) + rr = decomp.norm(diff, axis=-1) + rr, nlist = paddle.sort(rr, axis=-1), paddle.argsort(rr, axis=-1) + + # We assume that the central and neighbor atoms are diffferent, + # thus we do not need to exclude self-neighbors. + # # if central atom has two zero distances, sorting sometimes can not exclude itself + # rr -= paddle.eye(nloc_cntl, nall_neig, dtype=rr.dtype, device=rr.place).unsqueeze(0) + # rr, nlist = paddle.sort(rr, axis=-1) + # # nloc x (nall-1) + # rr = rr[:, :, 1:] + # nlist = nlist[:, :, 1:] + + return _trim_mask_distinguish_nlist( + is_vir_cntl, atype_neig, rr, nlist, rcut, sel, distinguish_types + ) + + +def nlist_distinguish_types( + nlist: paddle.Tensor, + atype: paddle.Tensor, + sel: list[int], +): + """Given a nlist that does not distinguish atom types, return a nlist that + distinguish atom types. 
+ + """ + nf, nloc, nnei = nlist.shape + ret_nlist = [] + # nloc x nall + tmp_atype = paddle.tile(atype.unsqueeze(1), [1, nloc, 1]) + mask = nlist == -1 + # nloc x s(nsel) + # tnlist = paddle.take_along_axis( + # tmp_atype, + # axis=2, + # indices=nlist.masked_fill(mask, 0), + # ) + tnlist = decomp.take_along_axis( + tmp_atype, + axis=2, + indices=nlist.masked_fill(mask, 0), + ) + tnlist = tnlist.masked_fill(mask, -1) + snsel = tnlist.shape[2] + for ii, ss in enumerate(sel): + # nloc x s(nsel) + # to int because bool cannot be sort on GPU + pick_mask = (tnlist == ii).to(paddle.int64) + # nloc x s(nsel), stable sort, nearer neighbors first + pick_mask, imap = ( + paddle.sort(pick_mask, axis=-1, descending=True, stable=True), + paddle.argsort(pick_mask, axis=-1, descending=True, stable=True), + ) + # nloc x s(nsel) + # inlist = paddle.take_along_axis(nlist, axis=2, indices=imap) + inlist = decomp.take_along_axis(nlist, axis=2, indices=imap) + inlist = inlist.masked_fill(~(pick_mask.to(paddle.bool)), -1) + # nloc x nsel[ii] + ret_nlist.append(paddle.split(inlist, [ss, snsel - ss], axis=-1)[0]) + return paddle.concat(ret_nlist, axis=-1) + + +# build_neighbor_list = paddle.vmap( +# build_neighbor_list_lower, +# in_dims=(0,0,None,None,None), +# out_dims=(0), +# ) + + +def get_multiple_nlist_key( + rcut: float, + nsel: int, +) -> str: + return str(rcut) + "_" + str(nsel) + + +def build_multiple_neighbor_list( + coord: paddle.Tensor, + nlist: paddle.Tensor, + rcuts: list[float], + nsels: list[int], +) -> dict[str, paddle.Tensor]: + """Input one neighbor list, and produce multiple neighbor lists with + different cutoff radius and numbers of selection out of it. The + required rcuts and nsels should be smaller or equal to the input nlist. + + Parameters + ---------- + coord : paddle.Tensor + exptended coordinates of shape [batch_size, nall x 3] + nlist : paddle.Tensor + Neighbor list of shape [batch_size, nloc, nsel], the neighbors + should be stored in an ascending order. + rcuts : list[float] + list of cut-off radius in ascending order. + nsels : list[int] + maximal number of neighbors in ascending order. + + Returns + ------- + nlist_dict : dict[str, paddle.Tensor] + A dict of nlists, key given by get_multiple_nlist_key(rc, nsel) + value being the corresponding nlist. 
+ + """ + if paddle.in_dynamic_mode(): + assert len(rcuts) == len(nsels) + if len(rcuts) == 0: + return {} + nb, nloc, nsel = nlist.shape + if nsel < nsels[-1]: + pad = -paddle.ones( + [nb, nloc, nsels[-1] - nsel], + dtype=nlist.dtype, + ).to(device=nlist.place) + # nb x nloc x nsel + nlist = paddle.concat([nlist, pad], axis=-1) + if paddle.is_tensor(nsel): + nsel = paddle.to_tensor(nsels[-1], dtype=nsel.dtype) + else: + nsel = nsels[-1] + + # nb x nall x 3 + coord1 = coord.reshape([nb, -1, 3]) + nall = coord1.shape[1] + # nb x nloc x 3 + coord0 = coord1[:, :nloc, :] + nlist_mask = nlist == -1 + # nb x (nloc x nsel) x 3 + index = ( + nlist.masked_fill(nlist_mask, 0) + .reshape([nb, nloc * nsel]) + .unsqueeze(-1) + .expand([-1, -1, 3]) + ) + # nb x nloc x nsel x 3 + # coord2 = paddle.take_along_axis(coord1, axis=1, index=index).reshape( + # [nb, nloc, nsel, 3] + # ) + coord2 = decomp.take_along_axis(coord1, axis=1, indices=index).reshape( + [nb, nloc, nsel, 3] + ) + # nb x nloc x nsel x 3 + diff = coord2 - coord0[:, :, None, :] + # nb x nloc x nsel + # rr = paddle.linalg.norm(diff, axis=-1) + rr = decomp.norm(diff, axis=-1) + rr.masked_fill(nlist_mask, float("inf")) + nlist0 = nlist + ret = {} + for rc, ns in zip(rcuts[::-1], nsels[::-1]): + nlist0 = nlist0[:, :, :ns].masked_fill(rr[:, :, :ns] > rc, -1) + ret[get_multiple_nlist_key(rc, ns)] = nlist0 + return ret + + +def extend_coord_with_ghosts( + coord: paddle.Tensor, + atype: paddle.Tensor, + cell: Optional[paddle.Tensor], + rcut: float, + cell_cpu: Optional[paddle.Tensor] = None, +) -> tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]: + """Extend the coordinates of the atoms by appending peridoc images. + The number of images is large enough to ensure all the neighbors + within rcut are appended. + + Parameters + ---------- + coord : paddle.Tensor + original coordinates of shape [-1, nloc*3]. + atype : paddle.Tensor + atom type of shape [-1, nloc]. + cell : paddle.Tensor + simulation cell tensor of shape [-1, 9]. + rcut : float + the cutoff radius + cell_cpu : paddle.Tensor + cell on cpu for performance + + Returns + ------- + extended_coord: paddle.Tensor + extended coordinates of shape [-1, nall*3]. + extended_atype: paddle.Tensor + extended atom type of shape [-1, nall]. 
+ index_mapping: paddle.Tensor + maping extended index to the local index + + """ + device = coord.place + nf, nloc = atype.shape[:2] + # int64 for index + aidx = paddle.tile(paddle.arange(nloc).to(device=device).unsqueeze(0), [nf, 1]) # pylint: disable=no-explicit-dtype + if cell is None: + nall = nloc + extend_coord = coord.clone() + extend_atype = atype.clone() + extend_aidx = aidx.clone() + else: + coord = coord.reshape([nf, nloc, 3]) + cell = cell.reshape([nf, 3, 3]) + cell_cpu = cell_cpu.reshape([nf, 3, 3]) if cell_cpu is not None else cell + # nf x 3 + to_face = to_face_distance(cell_cpu) + # nf x 3 + # *2: ghost copies on + and - directions + # +1: central cell + nbuff = paddle.ceil(rcut / to_face) + INT64_MIN = -9223372036854775808 + nbuff = paddle.where( + paddle.isinf(nbuff), + paddle.full_like(nbuff, INT64_MIN, dtype=paddle.int64), + nbuff.astype(paddle.int64), + ) + # 3 + nbuff = paddle.amax(nbuff, axis=0) # faster than paddle.max + nbuff_cpu = nbuff.cpu() + xi = ( + paddle.arange(-nbuff_cpu[0], nbuff_cpu[0] + 1, 1).to( + dtype=env.GLOBAL_PD_FLOAT_PRECISION + ) + # .cpu() + ) # pylint: disable=no-explicit-dtype + yi = ( + paddle.arange(-nbuff_cpu[1], nbuff_cpu[1] + 1, 1).to( + dtype=env.GLOBAL_PD_FLOAT_PRECISION + ) + # .cpu() + ) # pylint: disable=no-explicit-dtype + zi = ( + paddle.arange(-nbuff_cpu[2], nbuff_cpu[2] + 1, 1).to( + dtype=env.GLOBAL_PD_FLOAT_PRECISION + ) + # .cpu() + ) # pylint: disable=no-explicit-dtype + eye_3 = ( + paddle.eye(3, dtype=env.GLOBAL_PD_FLOAT_PRECISION).to( + dtype=env.GLOBAL_PD_FLOAT_PRECISION + ) + # .cpu() + ) + xyz = xi.reshape([-1, 1, 1, 1]) * eye_3[0] + xyz = xyz + yi.reshape([1, -1, 1, 1]) * eye_3[1] + xyz = xyz + zi.reshape([1, 1, -1, 1]) * eye_3[2] + xyz = xyz.reshape([-1, 3]) + # xyz = xyz.to(device=device) + # ns x 3 + # shift_idx = xyz[paddle.argsort(paddle.norm(xyz, axis=1))] + shift_idx = xyz[paddle.argsort(decomp.norm(xyz, axis=1))] + ns, _ = shift_idx.shape + nall = ns * nloc + # nf x ns x 3 + shift_vec = paddle.einsum("sd,fdk->fsk", shift_idx, cell) + # nf x ns x nloc x 3 + extend_coord = coord[:, None, :, :] + shift_vec[:, :, None, :] + # nf x ns x nloc + extend_atype = paddle.tile(atype.unsqueeze(-2), [1, ns, 1]) + # nf x ns x nloc + extend_aidx = paddle.tile(aidx.unsqueeze(-2), [1, ns, 1]) + return ( + extend_coord.reshape([nf, nall * 3]).to(device), + extend_atype.reshape([nf, nall]).to(device), + extend_aidx.reshape([nf, nall]).to(device), + ) diff --git a/deepmd/pd/utils/preprocess.py b/deepmd/pd/utils/preprocess.py new file mode 100644 index 0000000000..3e047c1b8b --- /dev/null +++ b/deepmd/pd/utils/preprocess.py @@ -0,0 +1,18 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +import logging + +import paddle + +log = logging.getLogger(__name__) + + +def compute_smooth_weight(distance, rmin: float, rmax: float): + """Compute smooth weight for descriptor elements.""" + if rmin >= rmax: + raise ValueError("rmin should be less than rmax.") + min_mask = distance <= rmin + max_mask = distance >= rmax + mid_mask = paddle.logical_not(paddle.logical_or(min_mask, max_mask)) + uu = (distance - rmin) / (rmax - rmin) + vv = uu * uu * uu * (-6 * uu * uu + 15 * uu - 10) + 1 + return vv * mid_mask.astype(vv.dtype) + min_mask.astype(vv.dtype) diff --git a/deepmd/pd/utils/region.py b/deepmd/pd/utils/region.py new file mode 100644 index 0000000000..21927e3619 --- /dev/null +++ b/deepmd/pd/utils/region.py @@ -0,0 +1,119 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +import paddle + +from deepmd.pd.utils import ( + decomp, +) + + +def 
phys2inter( + coord: paddle.Tensor, + cell: paddle.Tensor, +) -> paddle.Tensor: + """Convert physical coordinates to internal(direct) coordinates. + + Parameters + ---------- + coord : paddle.Tensor + physical coordinates of shape [*, na, 3]. + cell : paddle.Tensor + simulation cell tensor of shape [*, 3, 3]. + + Returns + ------- + inter_coord: paddle.Tensor + the internal coordinates + + """ + if paddle.in_dynamic_mode(): + try: + rec_cell = paddle.linalg.inv(cell) + except Exception as e: + rec_cell = paddle.full_like(cell, float("nan")) + rec_cell.stop_gradient = cell.stop_gradient + else: + rec_cell = paddle.linalg.inv(cell) + return paddle.matmul(coord, rec_cell) + + +def inter2phys( + coord: paddle.Tensor, + cell: paddle.Tensor, +) -> paddle.Tensor: + """Convert internal(direct) coordinates to physical coordinates. + + Parameters + ---------- + coord : paddle.Tensor + internal coordinates of shape [*, na, 3]. + cell : paddle.Tensor + simulation cell tensor of shape [*, 3, 3]. + + Returns + ------- + phys_coord: paddle.Tensor + the physical coordinates + + """ + return paddle.matmul(coord, cell) + + +def to_face_distance( + cell: paddle.Tensor, +) -> paddle.Tensor: + """Compute the to-face-distance of the simulation cell. + + Parameters + ---------- + cell : paddle.Tensor + simulation cell tensor of shape [*, 3, 3]. + + Returns + ------- + dist: paddle.Tensor + the to face distances of shape [*, 3] + + """ + cshape = cell.shape + dist = b_to_face_distance(cell.reshape([-1, 3, 3])) + return dist.reshape(list(cshape[:-2]) + [3]) # noqa:RUF005 + + +def b_to_face_distance(cell): + volume = paddle.linalg.det(cell) + c_yz = paddle.cross(cell[:, 1], cell[:, 2], axis=-1) + # _h2yz = volume / paddle.linalg.norm(c_yz, axis=-1) + _h2yz = volume / decomp.norm(c_yz, axis=-1) + c_zx = paddle.cross(cell[:, 2], cell[:, 0], axis=-1) + # _h2zx = volume / paddle.linalg.norm(c_zx, axis=-1) + _h2zx = volume / decomp.norm(c_zx, axis=-1) + c_xy = paddle.cross(cell[:, 0], cell[:, 1], axis=-1) + # _h2xy = volume / paddle.linalg.norm(c_xy, axis=-1) + _h2xy = volume / decomp.norm(c_xy, axis=-1) + return paddle.stack([_h2yz, _h2zx, _h2xy], axis=1) + + +# b_to_face_distance = paddle.vmap( +# _to_face_distance, in_dims=(0), out_dims=(0)) + + +def normalize_coord( + coord: paddle.Tensor, + cell: paddle.Tensor, +) -> paddle.Tensor: + """Apply PBC according to the atomic coordinates. + + Parameters + ---------- + coord : paddle.Tensor + original coordinates of shape [*, na, 3]. + + Returns + ------- + wrapped_coord: paddle.Tensor + wrapped coordinates of shape [*, na, 3]. + + """ + icoord = phys2inter(coord, cell) + icoord = paddle.remainder(icoord, paddle.full([], 1.0)) + return inter2phys(icoord, cell) diff --git a/deepmd/pd/utils/serialization.py b/deepmd/pd/utils/serialization.py new file mode 100644 index 0000000000..0274608424 --- /dev/null +++ b/deepmd/pd/utils/serialization.py @@ -0,0 +1,55 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +import json + +import paddle + +from deepmd.pd.model.model.model import ( + BaseModel, +) + + +def serialize_from_file(model_file: str) -> dict: + """Serialize the model file to a dictionary. + + Parameters + ---------- + model_file : str + The model file to be serialized. + + Returns + ------- + dict + The serialized model data. + """ + raise NotImplementedError("Paddle do not support jit.export yet.") + + +def deserialize_to_file(model_file: str, data: dict) -> None: + """Deserialize the dictionary to a model file. 
+
+    Parameters
+    ----------
+    model_file : str
+        The model file to be saved.
+    data : dict
+        The dictionary to be deserialized.
+    """
+    if not model_file.endswith(".json"):
+        raise ValueError("Paddle backend only supports converting .json files")
+    model = BaseModel.deserialize(data["model"])
+    # JIT will be happy in this way...
+    model.model_def_script = json.dumps(data["model_def_script"])
+    if "min_nbor_dist" in data.get("@variables", {}):
+        model.min_nbor_dist = float(data["@variables"]["min_nbor_dist"])
+    # model = paddle.jit.to_static(model)
+    paddle.set_flags(
+        {
+            "FLAGS_save_cf_stack_op": 1,
+            "FLAGS_prim_enable_dynamic": 1,
+            "FLAGS_enable_pir_api": 1,
+        }
+    )
+    paddle.jit.save(
+        model,
+        model_file.split(".json")[0],
+    )
diff --git a/deepmd/pd/utils/stat.py b/deepmd/pd/utils/stat.py
new file mode 100644
index 0000000000..a8bdbd6415
--- /dev/null
+++ b/deepmd/pd/utils/stat.py
@@ -0,0 +1,604 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+import logging
+from collections import (
+    defaultdict,
+)
+from typing import (
+    Callable,
+    Optional,
+    Union,
+)
+
+import numpy as np
+import paddle
+
+from deepmd.dpmodel.output_def import (
+    FittingOutputDef,
+)
+from deepmd.pd.utils import (
+    AtomExcludeMask,
+)
+from deepmd.pd.utils.auto_batch_size import (
+    AutoBatchSize,
+)
+from deepmd.pd.utils.utils import (
+    dict_to_device,
+    to_numpy_array,
+    to_paddle_tensor,
+)
+from deepmd.utils.out_stat import (
+    compute_stats_from_atomic,
+    compute_stats_from_redu,
+)
+from deepmd.utils.path import (
+    DPPath,
+)
+
+log = logging.getLogger(__name__)
+
+
+def make_stat_input(datasets, dataloaders, nbatches):
+    """Pack data for statistics.
+
+    Args:
+    - datasets: A list of datasets to analyze.
+    - dataloaders: The dataloaders corresponding to the datasets.
+    - nbatches: Batch count for collecting stats.
+
+    Returns
+    -------
+    - a list of dicts, each of which contains data from a system
+    """
+    lst = []
+    log.info(f"Packing data for statistics from {len(datasets)} systems")
+    for i in range(len(datasets)):
+        sys_stat = {}
+
+        iterator = iter(dataloaders[i])
+        numb_batches = min(nbatches, len(dataloaders[i]))
+        for _ in range(numb_batches):
+            try:
+                stat_data = next(iterator)
+            except StopIteration:
+                iterator = iter(dataloaders[i])
+                stat_data = next(iterator)
+            for dd in stat_data:
+                if stat_data[dd] is None:
+                    sys_stat[dd] = None
+                elif isinstance(stat_data[dd], paddle.Tensor):
+                    if dd not in sys_stat:
+                        sys_stat[dd] = []
+                    sys_stat[dd].append(stat_data[dd])
+                elif isinstance(stat_data[dd], np.float32):
+                    sys_stat[dd] = stat_data[dd]
+                else:
+                    pass
+
+        for key in sys_stat:
+            if isinstance(sys_stat[key], np.float32):
+                pass
+            elif sys_stat[key] is None or sys_stat[key][0] is None:
+                sys_stat[key] = None
+            elif isinstance(sys_stat[key][0], paddle.Tensor):
+                sys_stat[key] = paddle.concat(sys_stat[key], axis=0)
+        dict_to_device(sys_stat)
+        lst.append(sys_stat)
+    return lst
+
+
+def _restore_from_file(
+    stat_file_path: DPPath,
+    keys: list[str] = ["energy"],
+) -> tuple[Optional[dict], Optional[dict]]:
+    if stat_file_path is None:
+        return None, None
+    stat_files = [stat_file_path / f"bias_atom_{kk}" for kk in keys]
+    if all(not (ii.is_file()) for ii in stat_files):
+        return None, None
+    stat_files = [stat_file_path / f"std_atom_{kk}" for kk in keys]
+    if all(not (ii.is_file()) for ii in stat_files):
+        return None, None
+
+    ret_bias = {}
+    ret_std = {}
+    for kk in keys:
+        fp = stat_file_path / f"bias_atom_{kk}"
+        # only read the key that exists
+        if fp.is_file():
+            ret_bias[kk] = fp.load_numpy()
+    for kk in keys:
+        fp = stat_file_path / f"std_atom_{kk}"
+        # only read the key that
exists + if fp.is_file(): + ret_std[kk] = fp.load_numpy() + return ret_bias, ret_std + + +def _save_to_file( + stat_file_path: DPPath, + bias_out: dict, + std_out: dict, +): + assert stat_file_path is not None + stat_file_path.mkdir(exist_ok=True, parents=True) + for kk, vv in bias_out.items(): + fp = stat_file_path / f"bias_atom_{kk}" + fp.save_numpy(vv) + for kk, vv in std_out.items(): + fp = stat_file_path / f"std_atom_{kk}" + fp.save_numpy(vv) + + +def _post_process_stat( + out_bias, + out_std, +): + """Post process the statistics. + + For global statistics, we do not have the std for each type of atoms, + thus fake the output std by ones for all the types. + + """ + new_std = {} + for kk, vv in out_bias.items(): + new_std[kk] = np.ones_like(vv) + return out_bias, new_std + + +def _compute_model_predict( + sampled: Union[Callable[[], list[dict]], list[dict]], + keys: list[str], + model_forward: Callable[..., paddle.Tensor], +): + auto_batch_size = AutoBatchSize() + model_predict = {kk: [] for kk in keys} + for system in sampled: + nframes = system["coord"].shape[0] + coord, atype, box, natoms = ( + system["coord"], + system["atype"], + system["box"], + system["natoms"], + ) + fparam = system.get("fparam", None) + aparam = system.get("aparam", None) + + def model_forward_auto_batch_size(*args, **kwargs): + return auto_batch_size.execute_all( + model_forward, + nframes, + system["atype"].shape[-1], + *args, + **kwargs, + ) + + sample_predict = model_forward_auto_batch_size( + coord, atype, box, fparam=fparam, aparam=aparam + ) + for kk in keys: + model_predict[kk].append( + to_numpy_array( + sample_predict[kk] # nf x nloc x odims + ) + ) + return model_predict + + +def _make_preset_out_bias( + ntypes: int, + ibias: list[Optional[np.ndarray]], +) -> Optional[np.ndarray]: + """Make preset out bias. + + output: + a np array of shape [ntypes, *(odim0, odim1, ...)] is any item is not None + None if all items are None. + """ + if len(ibias) != ntypes: + raise ValueError("the length of preset bias list should be ntypes") + if all(ii is None for ii in ibias): + return None + for refb in ibias: + if refb is not None: + break + refb = np.array(refb) + nbias = [ + np.full_like(refb, np.nan, dtype=np.float64) if ii is None else ii + for ii in ibias + ] + return np.array(nbias) + + +def _fill_stat_with_global( + atomic_stat: Union[np.ndarray, None], + global_stat: np.ndarray, +): + """This function is used to fill atomic stat with global stat. + + Parameters + ---------- + atomic_stat : Union[np.ndarray, None] + The atomic stat. + global_stat : np.ndarray + The global stat. + if the atomic stat is None, use global stat. + if the atomic stat is not None, but has nan values (missing atypes), fill with global stat. + """ + if atomic_stat is None: + return global_stat + else: + atomic_stat = atomic_stat.reshape(global_stat.shape) + return np.nan_to_num( + np.where( + np.isnan(atomic_stat) & ~np.isnan(global_stat), global_stat, atomic_stat + ) + ) + + +def compute_output_stats( + merged: Union[Callable[[], list[dict]], list[dict]], + ntypes: int, + keys: Union[str, list[str]] = ["energy"], + stat_file_path: Optional[DPPath] = None, + rcond: Optional[float] = None, + preset_bias: Optional[dict[str, list[Optional[np.ndarray]]]] = None, + model_forward: Optional[Callable[..., paddle.Tensor]] = None, + atomic_output: Optional[FittingOutputDef] = None, +): + """ + Compute the output statistics (e.g. energy bias) for the fitting net from packed data. 
+ + Parameters + ---------- + merged : Union[Callable[[], list[dict]], list[dict]] + - list[dict]: A list of data samples from various data systems. + Each element, `merged[i]`, is a data dictionary containing `keys`: `paddle.Tensor` + originating from the `i`-th data system. + - Callable[[], list[dict]]: A lazy function that returns data samples in the above format + only when needed. Since the sampling process can be slow and memory-intensive, + the lazy function helps by only sampling once. + ntypes : int + The number of atom types. + stat_file_path : DPPath, optional + The path to the stat file. + rcond : float, optional + The condition number for the regression of atomic energy. + preset_bias : dict[str, list[Optional[paddle.Tensor]]], optional + Specifying atomic energy contribution in vacuum. Given by key:value pairs. + The value is a list specifying the bias. the elements can be None or np.ndarray of output shape. + For example: [None, [2.]] means type 0 is not set, type 1 is set to [2.] + The `set_davg_zero` key in the descriptor should be set. + model_forward : Callable[..., paddle.Tensor], optional + The wrapped forward function of atomic model. + If not None, the model will be utilized to generate the original energy prediction, + which will be subtracted from the energy label of the data. + The difference will then be used to calculate the delta complement energy bias for each type. + atomic_output : FittingOutputDef, optional + The output of atomic model. + """ + # try to restore the bias from stat file + bias_atom_e, std_atom_e = _restore_from_file(stat_file_path, keys) + + # failed to restore the bias from stat file. compute + if bias_atom_e is None: + # only get data once, sampled is a list of dict[str, paddle.Tensor] + sampled = merged() if callable(merged) else merged + if model_forward is not None: + model_pred = _compute_model_predict(sampled, keys, model_forward) + else: + model_pred = None + + # remove the keys that are not in the sample + keys = [keys] if isinstance(keys, str) else keys + assert isinstance(keys, list) + new_keys = [ + ii + for ii in keys + if (ii in sampled[0].keys()) or ("atom_" + ii in sampled[0].keys()) + ] + del keys + keys = new_keys + # split system based on label + atomic_sampled_idx = defaultdict(list) + global_sampled_idx = defaultdict(list) + + for kk in keys: + for idx, system in enumerate(sampled): + if (("find_atom_" + kk) in system) and ( + system["find_atom_" + kk] > 0.0 + ): + atomic_sampled_idx[kk].append(idx) + elif (("find_" + kk) in system) and (system["find_" + kk] > 0.0): + global_sampled_idx[kk].append(idx) + + else: + continue + + # use index to gather model predictions for the corresponding systems. 
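+        # for keys with global labels the per-atom model predictions are summed
+        # over the atom dimension first (axis=1), so that they are comparable
+        # with the reduced (frame-wise) labels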
+ + model_pred_g = ( + { + kk: [ + np.sum(vv[idx], axis=1) for idx in global_sampled_idx[kk] + ] # sum atomic dim + for kk, vv in model_pred.items() + } + if model_pred + else None + ) + model_pred_a = ( + { + kk: [vv[idx] for idx in atomic_sampled_idx[kk]] + for kk, vv in model_pred.items() + } + if model_pred + else None + ) + + # concat all frames within those systems + model_pred_g = ( + { + kk: np.concatenate(model_pred_g[kk]) + for kk in model_pred_g.keys() + if len(model_pred_g[kk]) > 0 + } + if model_pred + else None + ) + model_pred_a = ( + { + kk: np.concatenate(model_pred_a[kk]) + for kk in model_pred_a.keys() + if len(model_pred_a[kk]) > 0 + } + if model_pred + else None + ) + + # compute stat + bias_atom_g, std_atom_g = compute_output_stats_global( + sampled, + ntypes, + keys, + rcond, + preset_bias, + model_pred_g, + atomic_output, + ) + bias_atom_a, std_atom_a = compute_output_stats_atomic( + sampled, + ntypes, + keys, + model_pred_a, + ) + + # merge global/atomic bias + bias_atom_e, std_atom_e = {}, {} + for kk in keys: + # use atomic bias whenever available + if kk in bias_atom_a: + bias_atom_e[kk] = bias_atom_a[kk] + std_atom_e[kk] = std_atom_a[kk] + else: + bias_atom_e[kk] = None + std_atom_e[kk] = None + # use global bias to fill missing atomic bias + if kk in bias_atom_g: + bias_atom_e[kk] = _fill_stat_with_global( + bias_atom_e[kk], bias_atom_g[kk] + ) + std_atom_e[kk] = _fill_stat_with_global(std_atom_e[kk], std_atom_g[kk]) + if (bias_atom_e[kk] is None) or (std_atom_e[kk] is None): + raise RuntimeError("Fail to compute stat.") + + if stat_file_path is not None: + _save_to_file(stat_file_path, bias_atom_e, std_atom_e) + + bias_atom_e = {kk: to_paddle_tensor(vv) for kk, vv in bias_atom_e.items()} + std_atom_e = {kk: to_paddle_tensor(vv) for kk, vv in std_atom_e.items()} + return bias_atom_e, std_atom_e + + +def compute_output_stats_global( + sampled: list[dict], + ntypes: int, + keys: list[str], + rcond: Optional[float] = None, + preset_bias: Optional[dict[str, list[Optional[paddle.Tensor]]]] = None, + model_pred: Optional[dict[str, np.ndarray]] = None, + atomic_output: Optional[FittingOutputDef] = None, +): + """This function only handle stat computation from reduced global labels.""" + # return directly if model predict is empty for global + if model_pred == {}: + return {}, {} + + # get label dict from sample; for each key, only picking the system with global labels. 
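+    # a system provides a usable global label for key kk only when its
+    # "find_{kk}" flag is positive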
+ outputs = { + kk: [ + system[kk] + for system in sampled + if kk in system and system.get(f"find_{kk}", 0) > 0 + ] + for kk in keys + } + + data_mixed_type = "real_natoms_vec" in sampled[0] + natoms_key = "natoms" if not data_mixed_type else "real_natoms_vec" + for system in sampled: + if "atom_exclude_types" in system: + type_mask = AtomExcludeMask( + ntypes, system["atom_exclude_types"] + ).get_type_mask() + system[natoms_key][:, 2:] *= type_mask.unsqueeze(0) + + input_natoms = { + kk: [ + item[natoms_key] + for item in sampled + if kk in item and item.get(f"find_{kk}", 0) > 0 + ] + for kk in keys + } + # shape: (nframes, ndim) + merged_output = { + kk: to_numpy_array(paddle.concat(outputs[kk])) + for kk in keys + if len(outputs[kk]) > 0 + } + # shape: (nframes, ntypes) + + merged_natoms = { + kk: to_numpy_array(paddle.concat(input_natoms[kk])[:, 2:]) + for kk in keys + if len(input_natoms[kk]) > 0 + } + nf = {kk: merged_natoms[kk].shape[0] for kk in keys if kk in merged_natoms} + if preset_bias is not None: + assigned_atom_ener = { + kk: _make_preset_out_bias(ntypes, preset_bias[kk]) + if kk in preset_bias.keys() + else None + for kk in keys + } + else: + assigned_atom_ener = {kk: None for kk in keys} + + if model_pred is None: + stats_input = merged_output + else: + # subtract the model bias and output the delta bias + + stats_input = { + kk: merged_output[kk] - model_pred[kk] for kk in keys if kk in merged_output + } + + bias_atom_e = {} + std_atom_e = {} + for kk in keys: + if kk in stats_input: + if atomic_output is not None and atomic_output.get_data()[kk].intensive: + task_dim = stats_input[kk].shape[1] + assert merged_natoms[kk].shape == (nf[kk], ntypes) + stats_input[kk] = ( + merged_natoms[kk].sum(axis=1).reshape([-1, 1]) * stats_input[kk] + ) + assert stats_input[kk].shape == (nf[kk], task_dim) + bias_atom_e[kk], std_atom_e[kk] = compute_stats_from_redu( + stats_input[kk], + merged_natoms[kk], + assigned_bias=assigned_atom_ener[kk], + rcond=rcond, + ) + else: + # this key does not have global labels, skip it. + continue + bias_atom_e, std_atom_e = _post_process_stat(bias_atom_e, std_atom_e) + + # unbias_e is only used for print rmse + + if model_pred is None: + unbias_e = { + kk: merged_natoms[kk] @ bias_atom_e[kk].reshape([ntypes, -1]) + for kk in bias_atom_e.keys() + } + else: + unbias_e = { + kk: model_pred[kk].reshape([nf[kk], -1]) + + merged_natoms[kk] @ bias_atom_e[kk].reshape([ntypes, -1]) + for kk in bias_atom_e.keys() + } + atom_numbs = {kk: merged_natoms[kk].sum(-1) for kk in bias_atom_e.keys()} + + def rmse(x): + return np.sqrt(np.mean(np.square(x))) + + for kk in bias_atom_e.keys(): + rmse_ae = rmse( + ( + unbias_e[kk].reshape([nf[kk], -1]).astype(merged_output[kk].dtype) + - merged_output[kk].reshape([nf[kk], -1]) + ) + / atom_numbs[kk][:, None].astype(merged_output[kk].dtype) + ) + log.info( + f"RMSE of {kk} per atom after linear regression is: {rmse_ae} in the unit of {kk}." + ) + return bias_atom_e, std_atom_e + + +def compute_output_stats_atomic( + sampled: list[dict], + ntypes: int, + keys: list[str], + model_pred: Optional[dict[str, np.ndarray]] = None, +): + # get label dict from sample; for each key, only picking the system with atomic labels. 
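+    # atomic labels are stored under the "atom_" prefix and flagged by
+    # "find_atom_{kk}"; "atype" is collected alongside to resolve per-type stats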
+    outputs = {
+        kk: [
+            system["atom_" + kk]
+            for system in sampled
+            if ("atom_" + kk) in system and system.get(f"find_atom_{kk}", 0) > 0
+        ]
+        for kk in keys
+    }
+    natoms = {
+        kk: [
+            system["atype"]
+            for system in sampled
+            if ("atom_" + kk) in system and system.get(f"find_atom_{kk}", 0) > 0
+        ]
+        for kk in keys
+    }
+    # shape: (nframes, nloc, ndim)
+    merged_output = {
+        kk: to_numpy_array(paddle.concat(outputs[kk]))
+        for kk in keys
+        if len(outputs[kk]) > 0
+    }
+    merged_natoms = {
+        kk: to_numpy_array(paddle.concat(natoms[kk]))
+        for kk in keys
+        if len(natoms[kk]) > 0
+    }
+    # reshape merged data to [nf, nloc, ndim]
+    merged_output = {
+        kk: merged_output[kk].reshape((*merged_natoms[kk].shape, -1))
+        for kk in merged_output
+    }
+
+    if model_pred is None:
+        stats_input = merged_output
+    else:
+        # subtract the model bias and output the delta bias
+        stats_input = {
+            kk: merged_output[kk] - model_pred[kk].reshape(merged_output[kk].shape)
+            for kk in keys
+            if kk in merged_output
+        }
+
+    bias_atom_e = {}
+    std_atom_e = {}
+
+    for kk in keys:
+        if kk in stats_input:
+            bias_atom_e[kk], std_atom_e[kk] = compute_stats_from_atomic(
+                stats_input[kk],
+                merged_natoms[kk],
+            )
+            # correction for missing types
+            missing_types = ntypes - merged_natoms[kk].max() - 1
+            if missing_types > 0:
+                assert (
+                    bias_atom_e[kk].dtype is std_atom_e[kk].dtype
+                ), "bias and std should be of the same dtype"
+                nan_padding = np.empty(
+                    (missing_types, bias_atom_e[kk].shape[1]),
+                    dtype=bias_atom_e[kk].dtype,
+                )
+                nan_padding.fill(np.nan)
+                bias_atom_e[kk] = np.concatenate([bias_atom_e[kk], nan_padding], axis=0)
+                std_atom_e[kk] = np.concatenate([std_atom_e[kk], nan_padding], axis=0)
+        else:
+            # this key does not have atomic labels; skip it.
+            continue
+    return bias_atom_e, std_atom_e
diff --git a/deepmd/pd/utils/update_sel.py b/deepmd/pd/utils/update_sel.py
new file mode 100644
index 0000000000..32b8d66c73
--- /dev/null
+++ b/deepmd/pd/utils/update_sel.py
@@ -0,0 +1,14 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+
+from deepmd.pd.utils.neighbor_stat import (
+    NeighborStat,
+)
+from deepmd.utils.update_sel import (
+    BaseUpdateSel,
+)
+
+
+class UpdateSel(BaseUpdateSel):
+    @property
+    def neighbor_stat(self) -> type[NeighborStat]:
+        return NeighborStat
diff --git a/deepmd/pd/utils/utils.py b/deepmd/pd/utils/utils.py
new file mode 100644
index 0000000000..48732ff84e
--- /dev/null
+++ b/deepmd/pd/utils/utils.py
@@ -0,0 +1,179 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+from __future__ import (
+    annotations,
+)
+
+from typing import (
+    TYPE_CHECKING,
+    overload,
+)
+
+import ml_dtypes
+import numpy as np
+import paddle
+import paddle.nn.functional as F
+
+from deepmd.dpmodel.common import PRECISION_DICT as NP_PRECISION_DICT
+
+from .env import (
+    DEVICE,
+)
+from .env import PRECISION_DICT as PD_PRECISION_DICT
+
+if TYPE_CHECKING:
+    from deepmd.pd.model.network.init import (
+        PaddleGenerator,
+    )
+
+
+class ActivationFn(paddle.nn.Layer):
+    def __init__(self, activation: str | None):
+        super().__init__()
+        self.activation: str = activation if activation is not None else "linear"
+
+    def forward(self, x: paddle.Tensor) -> paddle.Tensor:
+        """Returns the tensor after applying activation function corresponding to `activation`."""
+        if self.activation.lower() == "relu":
+            return F.relu(x)
+        elif self.activation.lower() == "gelu" or self.activation.lower() == "gelu_tf":
+            return F.gelu(x, approximate=True)
+        elif self.activation.lower() == "tanh":
+            return paddle.tanh(x)
+        elif self.activation.lower() == "relu6":
+            return F.relu6(x)
+        elif self.activation.lower() == "softplus":
+            return F.softplus(x)
+        elif self.activation.lower() == "sigmoid":
+            return F.sigmoid(x)
+        elif self.activation.lower() == "linear" or self.activation.lower() == "none":
+            return x
+        else:
+            raise RuntimeError(f"activation function {self.activation} not supported")
+
+
+@overload
+def to_numpy_array(xx: paddle.Tensor) -> np.ndarray: ...
+
+
+@overload
+def to_numpy_array(xx: None) -> None: ...
+
+
+def to_numpy_array(
+    xx,
+):
+    if xx is None:
+        return None
+    assert xx is not None
+    # Create a reverse mapping of PD_PRECISION_DICT
+    reverse_precision_dict = {v: k for k, v in PD_PRECISION_DICT.items()}
+    # Use the reverse mapping to find keys with the desired value
+    prec = reverse_precision_dict.get(xx.dtype, None)
+    prec = NP_PRECISION_DICT.get(prec, None)
+    if prec is None:
+        raise ValueError(f"unknown precision {xx.dtype}")
+    if isinstance(xx, np.ndarray):
+        return xx.astype(prec)
+    if xx.dtype == paddle.bfloat16:
+        xx = xx.astype(paddle.get_default_dtype())
+    return xx.numpy().astype(prec)
+
+
+@overload
+def to_paddle_tensor(xx: np.ndarray) -> paddle.Tensor: ...
+
+
+@overload
+def to_paddle_tensor(xx: None) -> None: ...
+
+
+def to_paddle_tensor(
+    xx,
+):
+    if xx is None:
+        return None
+    assert xx is not None
+    if not isinstance(xx, np.ndarray):
+        return xx
+    # Create a reverse mapping of NP_PRECISION_DICT
+    reverse_precision_dict = {v: k for k, v in NP_PRECISION_DICT.items()}
+    # Use the reverse mapping to find keys with the desired value
+    prec = reverse_precision_dict.get(xx.dtype.type, None)
+    prec = PD_PRECISION_DICT.get(prec, None)
+    if prec is None:
+        raise ValueError(f"unknown precision {xx.dtype}")
+    if xx.dtype == ml_dtypes.bfloat16:
+        xx = xx.astype(np.float32)
+    return paddle.to_tensor(xx, dtype=prec, place=DEVICE)
+
+
+def dict_to_device(sample_dict):
+    for key in sample_dict:
+        if isinstance(sample_dict[key], list):
+            sample_dict[key] = [item.to(DEVICE) for item in sample_dict[key]]
+        elif isinstance(sample_dict[key], np.float32):
+            sample_dict[key] = (
+                paddle.ones(1, dtype=paddle.float32).to(device=DEVICE)
+                * sample_dict[key]
+            )
+        else:
+            if sample_dict[key] is not None:
+                sample_dict[key] = sample_dict[key].to(DEVICE)
+
+
+# https://github.com/numpy/numpy/blob/a4cddb60489f821a1a4dffc16cd5c69755d43bdb/numpy/random/bit_generator.pyx#L58-L63
+INIT_A = 0x43B0D7E5
+MULT_A = 0x931E8875
+MIX_MULT_L = 0xCA01F9DD
+MIX_MULT_R = 0x4973F715
+XSHIFT = 16
+
+
+def hashmix(value: int, hash_const: list[int]):
+    value ^= INIT_A
+    hash_const[0] *= MULT_A
+    value *= INIT_A
+    # prevent overflow
+    hash_const[0] &= 0xFFFF_FFFF_FFFF_FFFF
+    value &= 0xFFFF_FFFF_FFFF_FFFF
+    value ^= value >> XSHIFT
+    return value
+
+
+def mix(x: int, y: int):
+    result = MIX_MULT_L * x - MIX_MULT_R * y
+    # prevent overflow
+    result &= 0xFFFF_FFFF_FFFF_FFFF
+    result ^= result >> XSHIFT
+    return result
+
+
+def mix_entropy(entropy_array: list[int]) -> int:
+    # https://github.com/numpy/numpy/blob/a4cddb60489f821a1a4dffc16cd5c69755d43bdb/numpy/random/bit_generator.pyx#L341-L374
+    hash_const = [INIT_A]
+    mixer = hashmix(entropy_array[0], hash_const)
+    for i_src in range(1, len(entropy_array)):
+        mixer = mix(mixer, hashmix(entropy_array[i_src], hash_const))
+    return mixer
+
+
+def get_generator(
+    seed: int | list[int] | None = None,
+) -> PaddleGenerator | None:
+    if seed is not None:
+        if isinstance(seed, list):
+            seed = mix_entropy(seed)
+        if DEVICE == "cpu":
+            generator = paddle.framework.core.default_cpu_generator()
+        elif DEVICE == "gpu":
+            generator = paddle.framework.core.default_cuda_generator(0)
+        elif DEVICE.startswith("gpu:"):
+            generator = paddle.framework.core.default_cuda_generator(
+                int(DEVICE.split("gpu:")[1])
+            )
+        else:
+            raise ValueError("DEVICE should be cpu or gpu or gpu:x")
+        generator.manual_seed(seed)
+        return generator
+    else:
+        return None
diff --git a/deepmd/pt/model/network/network.py b/deepmd/pt/model/network/network.py
index 353ed0c063..5b4d741a3b 100644
--- a/deepmd/pt/model/network/network.py
+++ b/deepmd/pt/model/network/network.py
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 from typing import (
+    Final,
     Optional,
     Union,
 )
@@ -8,29 +9,17 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
+import torch.utils.checkpoint
 
+from deepmd.dpmodel.utils.type_embed import (
+    get_econf_tebd,
+)
 from deepmd.pt.model.network.mlp import (
     EmbeddingNet,
 )
 from deepmd.pt.utils import (
     env,
 )
-from deepmd.utils.version import (
-    check_version_compatibility,
-)
-
-try:
-    from typing import (
-        Final,
-    )
-except ImportError:
-    from torch.jit import Final
-
-import torch.utils.checkpoint
-
-from deepmd.dpmodel.utils.type_embed import (
-    get_econf_tebd,
-)
 from deepmd.pt.utils.utils import (
     ActivationFn,
     to_torch_tensor,
@@ -38,6 +27,9 @@
 from deepmd.utils.finetune import (
     get_index_between_two_maps,
 )
+from deepmd.utils.version import (
+    check_version_compatibility,
+)
 
 
 def Tensor(*shape):
diff --git a/deepmd/utils/batch_size.py b/deepmd/utils/batch_size.py
index 7cd125c97b..c1cbea4cda 100644
--- a/deepmd/utils/batch_size.py
+++ b/deepmd/utils/batch_size.py
@@ -175,7 +175,11 @@ def execute_with_batch_size(
             *[
                 (
                     vv[start_index:end_index, ...]
-                    if array_api_compat.is_array_api_obj(vv) and vv.ndim > 1
+                    if (
+                        array_api_compat.is_array_api_obj(vv)
+                        and vv.ndim > 1
+                        or str(vv.__class__) == "<class 'paddle.Tensor'>"
+                    )
                     else vv
                 )
                 for vv in args
@@ -183,7 +187,11 @@ def execute_with_batch_size(
             **{
                 kk: (
                     vv[start_index:end_index, ...]
-                    if array_api_compat.is_array_api_obj(vv) and vv.ndim > 1
+                    if (
+                        array_api_compat.is_array_api_obj(vv)
+                        and vv.ndim > 1
+                        or str(vv.__class__) == "<class 'paddle.Tensor'>"
+                    )
                     else vv
                 )
                 for kk, vv in kwargs.items()
@@ -222,6 +230,14 @@ def concate_result(r):
         if array_api_compat.is_array_api_obj(r[0]):
             xp = array_api_compat.array_namespace(r[0])
             ret = xp.concat(r, axis=0)
+        elif str(r[0].__class__) == "<class 'paddle.Tensor'>":
+            try:
+                import paddle
+            except ModuleNotFoundError as e:
+                raise ModuleNotFoundError(
+                    "The 'paddlepaddle' package is required but not installed."
+                ) from e
+            ret = paddle.concat(r, axis=0)
         else:
             raise RuntimeError(f"Unexpected result type {type(r[0])}")
         return ret
diff --git a/deepmd/utils/data.py b/deepmd/utils/data.py
index 493a9d8d54..b93356bdbf 100644
--- a/deepmd/utils/data.py
+++ b/deepmd/utils/data.py
@@ -247,6 +247,21 @@ def get_item_torch(self, index: int) -> dict:
         frame["fid"] = index
         return frame
 
+    def get_item_paddle(self, index: int) -> dict:
+        """Get a single frame of data. The frame is picked from the data system by index. The index is coded across all the sets.
+
+        Parameters
+        ----------
+        index
+            index of the frame
+        """
+        i = bisect.bisect_right(self.prefix_sum, index)
+        frames = self._load_set(self.dirs[i])
+        frame = self._get_subdata(frames, index - self.prefix_sum[i])
+        frame = self.reformat_data_torch(frame)
+        frame["fid"] = index
+        return frame
+
     def get_batch(self, batch_size: int) -> dict:
         """Get a batch of data with `batch_size` frames. The frames are
         randomly picked from the data system.
diff --git a/pyproject.toml b/pyproject.toml index b9d8503b18..fd0c76839b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -224,12 +224,12 @@ replacement = '\1="https://github.com/deepmodeling/deepmd-kit/raw/master/\g<2>"' [tool.cibuildwheel] test-command = [ "python -m deepmd -h", - """python -c "import deepmd.tf;import deepmd.pt" """, + """python -c "import deepmd.tf;import deepmd.pt;import deepmd.pd" """, "dp -h", "dp_ipi", "pytest {project}/source/tests/tf/test_lammps.py" ] -test-extras = ["cpu", "test", "lmp", "ipi", "torch"] +test-extras = ["cpu", "test", "lmp", "ipi", "torch", "paddle"] build = ["cp311-*"] skip = ["*-win32", "*-manylinux_i686", "*-musllinux*"] # TODO: uncomment to use the latest image when CUDA 11 is deprecated @@ -248,6 +248,7 @@ PIP_PREFER_BINARY = "1" DP_LAMMPS_VERSION = "stable_29Aug2024_update1" DP_ENABLE_IPI = "1" DP_ENABLE_PYTORCH = "1" +DP_ENABLE_PADDLE = "1" # for unclear reason, when enabling PyTorch, OpenMP is found accidentally CMAKE_ARGS = "-DCMAKE_DISABLE_FIND_PACKAGE_OpenMP=1" @@ -284,6 +285,7 @@ PIP_PREFER_BINARY = "1" DP_LAMMPS_VERSION = "stable_29Aug2024_update1" DP_ENABLE_IPI = "1" DP_ENABLE_PYTORCH = "1" +DP_ENABLE_PADDLE = "1" MPI_HOME = "/usr/lib64/mpich" PATH = "/usr/lib64/mpich/bin:$PATH" # use CPU version of torch for building, which should also work for GPU @@ -294,7 +296,7 @@ UV_EXTRA_INDEX_URL = "https://download.pytorch.org/whl/cpu" CMAKE_PREFIX_PATH="/opt/python/cp311-cp311/" [tool.cibuildwheel.windows] -test-extras = ["cpu", "torch"] +test-extras = ["cpu", "torch", "paddle"] test-command = [ "python -m deepmd -h", "dp -h", @@ -302,6 +304,7 @@ test-command = [ [tool.cibuildwheel.windows.environment] PIP_PREFER_BINARY = "1" DP_ENABLE_PYTORCH = "1" +DP_ENABLE_PADDLE = "1" # One can run `tox` or `tox -e gpu` # to run pytest in an isolated environment @@ -407,10 +410,12 @@ convention = "numpy" banned-module-level-imports = [ "deepmd.tf", "deepmd.pt", + "deepmd.pd", "deepmd.jax", "tensorflow", "torch", "jax", + "paddle", ] [tool.ruff.lint.flake8-tidy-imports.banned-api] @@ -424,10 +429,13 @@ runtime-evaluated-base-classes = ["torch.nn.Module"] "deepmd/tf/**" = ["TID253"] "deepmd/pt/**" = ["TID253"] "deepmd/jax/**" = ["TID253"] +"deepmd/pd/**" = ["TID253"] "source/tests/tf/**" = ["TID253"] "source/tests/pt/**" = ["TID253"] "source/tests/jax/**" = ["TID253"] +"source/tests/pd/**" = ["TID253"] "source/tests/universal/pt/**" = ["TID253"] +"source/tests/universal/pd/**" = ["TID253"] "source/jax2tf_tests/**" = ["TID253"] "source/ipi/tests/**" = ["TID253"] "source/lmp/tests/**" = ["TID253"] diff --git a/source/tests/consistent/common.py b/source/tests/consistent/common.py index 358ac8d542..cb4dbed391 100644 --- a/source/tests/consistent/common.py +++ b/source/tests/consistent/common.py @@ -42,10 +42,11 @@ INSTALLED_TF = Backend.get_backend("tensorflow")().is_available() INSTALLED_PT = Backend.get_backend("pytorch")().is_available() INSTALLED_JAX = Backend.get_backend("jax")().is_available() +INSTALLED_PD = Backend.get_backend("paddle")().is_available() INSTALLED_ARRAY_API_STRICT = find_spec("array_api_strict") is not None -if os.environ.get("CI") and not (INSTALLED_TF and INSTALLED_PT): - raise ImportError("TensorFlow or PyTorch should be tested in the CI") +if os.environ.get("CI") and not (INSTALLED_TF and INSTALLED_PT and INSTALLED_PD): + raise ImportError("TensorFlow, PyTorch or Paddle should be tested in the CI") if INSTALLED_TF: @@ -66,6 +67,7 @@ "INSTALLED_TF", "INSTALLED_PT", "INSTALLED_JAX", + "INSTALLED_PD", 
"INSTALLED_ARRAY_API_STRICT", ] @@ -85,6 +87,8 @@ class CommonTest(ABC): """PyTorch model class.""" jax_class: ClassVar[Optional[type]] """JAX model class.""" + pd_class: ClassVar[Optional[type]] + """Paddle model class.""" array_api_strict_class: ClassVar[Optional[type]] args: ClassVar[Optional[Union[Argument, list[Argument]]]] """Arguments that maps to the `data`.""" @@ -97,6 +101,8 @@ class CommonTest(ABC): # we may usually skip jax before jax is fully supported skip_jax: ClassVar[bool] = True """Whether to skip the JAX model.""" + skip_pd: ClassVar[bool] = not INSTALLED_PD + """Whether to skip the Paddle model.""" skip_array_api_strict: ClassVar[bool] = True """Whether to skip the array_api_strict model.""" rtol = 1e-10 @@ -179,6 +185,16 @@ def eval_jax(self, jax_obj: Any) -> Any: """ raise NotImplementedError("Not implemented") + @abstractmethod + def eval_pd(self, pd_obj: Any) -> Any: + """Evaluate the return value of PD. + + Parameters + ---------- + pd_obj : Any + The object of PD + """ + def eval_array_api_strict(self, array_api_strict_obj: Any) -> Any: """Evaluate the return value of array_api_strict. @@ -195,6 +211,7 @@ class RefBackend(Enum): TF = 1 DP = 2 PT = 3 + PD = 4 JAX = 5 ARRAY_API_STRICT = 6 @@ -262,6 +279,11 @@ def get_jax_ret_serialization_from_cls(self, obj): data = obj.serialize() return ret, data + def get_pd_ret_serialization_from_cls(self, obj): + ret = self.eval_pd(obj) + data = obj.serialize() + return ret, data + def get_array_api_strict_ret_serialization_from_cls(self, obj): ret = self.eval_array_api_strict(obj) data = obj.serialize() @@ -280,6 +302,8 @@ def get_reference_backend(self): return self.RefBackend.PT if not self.skip_jax: return self.RefBackend.JAX + if not self.skip_pd: + return self.RefBackend.PD if not self.skip_array_api_strict: return self.RefBackend.ARRAY_API_STRICT raise ValueError("No available reference") @@ -298,6 +322,9 @@ def get_reference_ret_serialization(self, ref: RefBackend): if ref == self.RefBackend.JAX: obj = self.init_backend_cls(self.jax_class) return self.get_jax_ret_serialization_from_cls(obj) + if ref == self.RefBackend.PD: + obj = self.init_backend_cls(self.pd_class) + return self.get_pd_ret_serialization_from_cls(obj) if ref == self.RefBackend.ARRAY_API_STRICT: obj = self.init_backend_cls(self.array_api_strict_class) return self.get_array_api_strict_ret_serialization_from_cls(obj) @@ -459,6 +486,45 @@ def test_jax_self_consistent(self) -> None: else: self.assertEqual(rr1, rr2) + def test_pd_consistent_with_ref(self): + """Test whether PD and reference are consistent.""" + if self.skip_pd: + self.skipTest("Unsupported backend") + ref_backend = self.get_reference_backend() + if ref_backend == self.RefBackend.PD: + self.skipTest("Reference is self") + ret1, data1 = self.get_reference_ret_serialization(ref_backend) + ret1 = self.extract_ret(ret1, ref_backend) + obj = self.pd_class.deserialize(data1) + ret2 = self.eval_pd(obj) + ret2 = self.extract_ret(ret2, self.RefBackend.PD) + data2 = obj.serialize() + if obj.__class__.__name__.startswith(("Polar", "Dipole", "DOS")): + # tf, pd serialization mismatch + common_keys = set(data1.keys()) & set(data2.keys()) + data1 = {k: data1[k] for k in common_keys} + data2 = {k: data2[k] for k in common_keys} + np.testing.assert_equal(data1, data2) + for rr1, rr2 in zip(ret1, ret2): + np.testing.assert_allclose(rr1, rr2, rtol=self.rtol, atol=self.atol) + assert rr1.dtype == rr2.dtype, f"{rr1.dtype} != {rr2.dtype}" + + def test_pd_self_consistent(self): + """Test whether PD is self 
consistent.""" + if self.skip_pd: + self.skipTest("Unsupported backend") + obj1 = self.init_backend_cls(self.pd_class) + ret1, data1 = self.get_pd_ret_serialization_from_cls(obj1) + obj2 = self.pd_class.deserialize(data1) + ret2, data2 = self.get_pd_ret_serialization_from_cls(obj2) + np.testing.assert_equal(data1, data2) + for rr1, rr2 in zip(ret1, ret2): + if isinstance(rr1, np.ndarray) and isinstance(rr2, np.ndarray): + np.testing.assert_allclose(rr1, rr2, rtol=self.rtol, atol=self.atol) + assert rr1.dtype == rr2.dtype, f"{rr1.dtype} != {rr2.dtype}" + else: + self.assertEqual(rr1, rr2) + @unittest.skipIf(TEST_DEVICE != "cpu" and CI, "Only test on CPU.") def test_array_api_strict_consistent_with_ref(self) -> None: """Test whether array_api_strict and reference are consistent.""" diff --git a/source/tests/consistent/descriptor/common.py b/source/tests/consistent/descriptor/common.py index a469a22348..baa6e97d04 100644 --- a/source/tests/consistent/descriptor/common.py +++ b/source/tests/consistent/descriptor/common.py @@ -19,6 +19,7 @@ from ..common import ( INSTALLED_ARRAY_API_STRICT, INSTALLED_JAX, + INSTALLED_PD, INSTALLED_PT, INSTALLED_TF, ) @@ -43,6 +44,15 @@ if INSTALLED_ARRAY_API_STRICT: import array_api_strict +if INSTALLED_PD: + import paddle + + from deepmd.pd.utils.env import DEVICE as PD_DEVICE + from deepmd.pd.utils.nlist import build_neighbor_list as build_neighbor_list_pd + from deepmd.pd.utils.nlist import ( + extend_coord_with_ghosts as extend_coord_with_ghosts_pd, + ) + class DescriptorTest: """Useful utilities for descriptor tests.""" @@ -135,6 +145,28 @@ def eval_jax_descriptor( for x in jax_obj(ext_coords, ext_atype, nlist=nlist, mapping=mapping) ] + def eval_pd_descriptor( + self, pd_obj: Any, natoms, coords, atype, box, mixed_types: bool = False + ) -> Any: + ext_coords, ext_atype, mapping = extend_coord_with_ghosts_pd( + paddle.to_tensor(coords).to(PD_DEVICE).reshape([1, -1, 3]), + paddle.to_tensor(atype).to(PD_DEVICE).reshape([1, -1]), + paddle.to_tensor(box).to(PD_DEVICE).reshape([1, 3, 3]), + pd_obj.get_rcut(), + ) + nlist = build_neighbor_list_pd( + ext_coords, + ext_atype, + natoms[0], + pd_obj.get_rcut(), + pd_obj.get_sel(), + distinguish_types=(not mixed_types), + ) + return [ + x.detach().cpu().numpy() if paddle.is_tensor(x) else x + for x in pd_obj(ext_coords, ext_atype, nlist=nlist, mapping=mapping) + ] + def eval_array_api_strict_descriptor( self, array_api_strict_obj: Any, diff --git a/source/tests/consistent/descriptor/test_se_e2_a.py b/source/tests/consistent/descriptor/test_se_e2_a.py index a3ed19e8f3..a463960fb7 100644 --- a/source/tests/consistent/descriptor/test_se_e2_a.py +++ b/source/tests/consistent/descriptor/test_se_e2_a.py @@ -14,6 +14,7 @@ from ..common import ( INSTALLED_ARRAY_API_STRICT, INSTALLED_JAX, + INSTALLED_PD, INSTALLED_PT, INSTALLED_TF, CommonTest, @@ -31,6 +32,10 @@ from deepmd.tf.descriptor.se_a import DescrptSeA as DescrptSeATF else: DescrptSeATF = None +if INSTALLED_PD: + from deepmd.pd.model.descriptor.se_a import DescrptSeA as DescrptSeAPD +else: + DescrptSeAPD = None from deepmd.utils.argcheck import ( descrpt_se_a_args, ) @@ -122,6 +127,17 @@ def skip_jax(self) -> bool: ) = self.param return not type_one_side or not INSTALLED_JAX + @property + def skip_pd(self) -> bool: + ( + resnet_dt, + type_one_side, + excluded_types, + precision, + env_protection, + ) = self.param + return CommonTest.skip_pd + @property def skip_array_api_strict(self) -> bool: ( @@ -137,6 +153,7 @@ def skip_array_api_strict(self) -> bool: dp_class = 
DescrptSeADP
     pt_class = DescrptSeAPT
     jax_class = DescrptSeAJAX
+    pd_class = DescrptSeAPD
     array_api_strict_class = DescrptSeAArrayAPIStrict
     args = descrpt_se_a_args()
@@ -223,6 +240,15 @@ def eval_jax(self, jax_obj: Any) -> Any:
             self.box,
         )
 
+    def eval_pd(self, pd_obj: Any) -> Any:
+        return self.eval_pd_descriptor(
+            pd_obj,
+            self.natoms,
+            self.coords,
+            self.atype,
+            self.box,
+        )
+
     def eval_array_api_strict(self, array_api_strict_obj: Any) -> Any:
         return self.eval_array_api_strict_descriptor(
             array_api_strict_obj,
diff --git a/source/tests/consistent/fitting/common.py b/source/tests/consistent/fitting/common.py
index 95557d9ab8..1f6c2da565 100644
--- a/source/tests/consistent/fitting/common.py
+++ b/source/tests/consistent/fitting/common.py
@@ -2,6 +2,7 @@
 
 
 from ..common import (
+    INSTALLED_PD,
     INSTALLED_PT,
     INSTALLED_TF,
 )
@@ -13,6 +14,8 @@
     GLOBAL_TF_FLOAT_PRECISION,
     tf,
 )
+if INSTALLED_PD:
+    pass
 
 
 class FittingTest:
diff --git a/source/tests/consistent/fitting/test_ener.py b/source/tests/consistent/fitting/test_ener.py
index 1ef846dbcc..12fafa7ba8 100644
--- a/source/tests/consistent/fitting/test_ener.py
+++ b/source/tests/consistent/fitting/test_ener.py
@@ -17,6 +17,7 @@
 from ..common import (
     INSTALLED_ARRAY_API_STRICT,
     INSTALLED_JAX,
+    INSTALLED_PD,
     INSTALLED_PT,
     INSTALLED_TF,
     CommonTest,
@@ -37,6 +38,13 @@
     from deepmd.tf.fit.ener import EnerFitting as EnerFittingTF
 else:
     EnerFittingTF = object
+if INSTALLED_PD:
+    import paddle
+
+    from deepmd.pd.model.task.ener import EnergyFittingNet as EnerFittingPD
+    from deepmd.pd.utils.env import DEVICE as PD_DEVICE
+else:
+    EnerFittingPD = object
 from deepmd.utils.argcheck import (
     fitting_ener,
 )
@@ -115,10 +123,25 @@ def skip_array_api_strict(self) -> bool:
         # TypeError: The array_api_strict namespace does not support the dtype 'bfloat16'
         return not INSTALLED_ARRAY_API_STRICT or precision == "bfloat16"
 
+    @property
+    def skip_pd(self) -> bool:
+        (
+            resnet_dt,
+            precision,
+            mixed_types,
+            numb_fparam,
+            (numb_aparam, use_aparam_as_mask),
+            atom_ener,
+        ) = self.param
+        # Paddle does not support "bfloat16" in some kernels,
+        # so skip this in the CI test
+        return CommonTest.skip_pd or precision == "bfloat16"
+
     tf_class = EnerFittingTF
     dp_class = EnerFittingDP
     pt_class = EnerFittingPT
     jax_class = EnerFittingJAX
+    pd_class = EnerFittingPD
     array_api_strict_class = EnerFittingStrict
     args = fitting_ener()
@@ -252,6 +275,35 @@ def eval_array_api_strict(self, array_api_strict_obj: Any) -> Any:
             )["energy"]
         )
 
+    def eval_pd(self, pd_obj: Any) -> Any:
+        (
+            resnet_dt,
+            precision,
+            mixed_types,
+            numb_fparam,
+            (numb_aparam, use_aparam_as_mask),
+            atom_ener,
+        ) = self.param
+        return (
+            pd_obj(
+                paddle.to_tensor(self.inputs).to(device=PD_DEVICE),
+                paddle.to_tensor(self.atype.reshape([1, -1])).to(device=PD_DEVICE),
+                fparam=(
+                    paddle.to_tensor(self.fparam).to(device=PD_DEVICE)
+                    if numb_fparam
+                    else None
+                ),
+                aparam=(
+                    paddle.to_tensor(self.aparam).to(device=PD_DEVICE)
+                    if numb_aparam
+                    else None
+                ),
+            )["energy"]
+            .detach()
+            .cpu()
+            .numpy()
+        )
+
     def extract_ret(self, ret: Any, backend) -> tuple[np.ndarray, ...]:
         if backend == self.RefBackend.TF:
             # shape is not same
diff --git a/source/tests/consistent/model/common.py b/source/tests/consistent/model/common.py
index bb38abc5b6..7cf71000db 100644
--- a/source/tests/consistent/model/common.py
+++ b/source/tests/consistent/model/common.py
@@ -12,6 +12,7 @@
 
 from ..common import (
     INSTALLED_JAX,
+    INSTALLED_PD,
     INSTALLED_PT,
     INSTALLED_TF,
 )
@@ -29,6 +30,9 @@
 from deepmd.jax.env import (
     jnp,
 )
+if INSTALLED_PD:
+    from deepmd.pd.utils.utils import to_numpy_array as paddle_to_numpy
+    from deepmd.pd.utils.utils import to_paddle_tensor as numpy_to_paddle
 
 
 class ModelTest:
@@ -114,3 +118,14 @@ def assert_jax_array(arr):
                 do_atomic_virial=True,
             ).items()
         }
+
+    def eval_pd_model(self, pd_obj: Any, natoms, coords, atype, box) -> Any:
+        return {
+            kk: paddle_to_numpy(vv)
+            for kk, vv in pd_obj(
+                numpy_to_paddle(coords),
+                numpy_to_paddle(atype),
+                box=numpy_to_paddle(box),
+                do_atomic_virial=True,
+            ).items()
+        }
diff --git a/source/tests/consistent/model/test_ener.py b/source/tests/consistent/model/test_ener.py
index 4c50c08bef..d56b9a257b 100644
--- a/source/tests/consistent/model/test_ener.py
+++ b/source/tests/consistent/model/test_ener.py
@@ -24,6 +24,7 @@
 
 from ..common import (
     INSTALLED_JAX,
+    INSTALLED_PD,
     INSTALLED_PT,
     INSTALLED_TF,
     SKIP_FLAG,
@@ -45,6 +46,13 @@
     from deepmd.tf.model.ener import EnerModel as EnergyModelTF
 else:
     EnergyModelTF = None
+if INSTALLED_PD:
+    from deepmd.pd.model.model import get_model as get_model_pd
+    from deepmd.pd.model.model.ener_model import EnergyModel as EnergyModelPD
+    from deepmd.pd.utils.utils import to_numpy_array as paddle_to_numpy
+    from deepmd.pd.utils.utils import to_paddle_tensor as numpy_to_paddle
+else:
+    EnergyModelPD = None
 from deepmd.utils.argcheck import (
     model_args,
 )
@@ -106,7 +114,8 @@ def data(self) -> dict:
     tf_class = EnergyModelTF
     dp_class = EnergyModelDP
     pt_class = EnergyModelPT
     jax_class = EnergyModelJAX
+    pd_class = EnergyModelPD
     args = model_args()
 
     def get_reference_backend(self):
@@ -120,6 +130,8 @@
             return self.RefBackend.TF
         if not self.skip_jax:
             return self.RefBackend.JAX
+        if not self.skip_pd:
+            return self.RefBackend.PD
         if not self.skip_dp:
             return self.RefBackend.DP
         raise ValueError("No available reference")
@@ -146,6 +158,8 @@ def pass_data_to_cls(self, cls, data) -> Any:
             return model
         elif cls is EnergyModelJAX:
             return get_model_jax(data)
+        elif cls is EnergyModelPD:
+            return get_model_pd(data)
         return cls(**data, **self.additional_data)
 
     def setUp(self) -> None:
@@ -224,6 +238,15 @@ def eval_jax(self, jax_obj: Any) -> Any:
             self.box,
         )
 
+    def eval_pd(self, pd_obj: Any) -> Any:
+        return self.eval_pd_model(
+            pd_obj,
+            self.natoms,
+            self.coords,
+            self.atype,
+            self.box,
+        )
+
     def extract_ret(self, ret: Any, backend) -> tuple[np.ndarray, ...]:
         # shape not matched. ravel...
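+        # map each backend's output dict onto a common tuple layout so the raveled arrays can be compared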
if backend is self.RefBackend.DP: @@ -258,6 +281,14 @@ def extract_ret(self, ret: Any, backend) -> tuple[np.ndarray, ...]: ret["energy_derv_c_redu"].ravel(), ret["energy_derv_c"].ravel(), ) + elif backend is self.RefBackend.PD: + return ( + ret["energy"].flatten(), + ret["atom_energy"].flatten(), + ret["force"].flatten(), + ret["virial"].flatten(), + ret["atom_virial"].flatten(), + ) raise ValueError(f"Unknown backend: {backend}") @@ -309,6 +340,7 @@ def data(self) -> dict: dp_class = EnergyModelDP pt_class = EnergyModelPT jax_class = EnergyModelJAX + pd_class = EnergyModelPD args = model_args() def get_reference_backend(self): @@ -322,6 +354,8 @@ def get_reference_backend(self): return self.RefBackend.JAX if not self.skip_dp: return self.RefBackend.DP + if not self.skip_pd: + return self.RefBackend.PD raise ValueError("No available reference") @property @@ -342,6 +376,8 @@ def pass_data_to_cls(self, cls, data) -> Any: return get_model_pt(data) elif cls is EnergyModelJAX: return get_model_jax(data) + elif cls is EnergyModelPD: + return get_model_pd(data) return cls(**data, **self.additional_data) def setUp(self) -> None: @@ -436,6 +472,18 @@ def eval_jax(self, jax_obj: Any) -> Any: ).items() } + def eval_pd(self, pd_obj: Any) -> Any: + return { + kk: paddle_to_numpy(vv) + for kk, vv in pd_obj.forward_lower( + numpy_to_paddle(self.extended_coord), + numpy_to_paddle(self.extended_atype), + numpy_to_paddle(self.nlist), + numpy_to_paddle(self.mapping), + do_atomic_virial=True, + ).items() + } + def extract_ret(self, ret: Any, backend) -> tuple[np.ndarray, ...]: # shape not matched. ravel... if backend is self.RefBackend.DP: @@ -462,4 +510,12 @@ def extract_ret(self, ret: Any, backend) -> tuple[np.ndarray, ...]: ret["energy_derv_c_redu"].ravel(), ret["energy_derv_c"].ravel(), ) + elif backend is self.RefBackend.PD: + return ( + ret["energy"].flatten(), + ret["atom_energy"].flatten(), + ret["extended_force"].flatten(), + ret["virial"].flatten(), + ret["extended_virial"].flatten(), + ) raise ValueError(f"Unknown backend: {backend}") diff --git a/source/tests/consistent/test_activation.py b/source/tests/consistent/test_activation.py index 2368b6c473..31351d4a9d 100644 --- a/source/tests/consistent/test_activation.py +++ b/source/tests/consistent/test_activation.py @@ -17,6 +17,7 @@ ) from .common import ( INSTALLED_JAX, + INSTALLED_PD, INSTALLED_PT, INSTALLED_TF, parameterized, @@ -37,6 +38,12 @@ from deepmd.jax.env import ( jnp, ) +if INSTALLED_PD: + from deepmd.pd.utils.utils import ActivationFn as ActivationFn_pd + from deepmd.pd.utils.utils import to_numpy_array as paddle_to_numpy + from deepmd.pd.utils.utils import ( + to_paddle_tensor, + ) @parameterized( @@ -83,3 +90,11 @@ def test_jax_consistent_with_ref(self) -> None: test = get_activation_fn_dp(self.activation)(input) self.assertTrue(isinstance(test, jnp.ndarray)) np.testing.assert_allclose(self.ref, np.from_dlpack(test), atol=1e-10) + + @unittest.skipUnless(INSTALLED_PD, "Paddle is not installed") + def test_pd_consistent_with_ref(self): + if INSTALLED_PD: + test = paddle_to_numpy( + ActivationFn_pd(self.activation)(to_paddle_tensor(self.random_input)) + ) + np.testing.assert_allclose(self.ref, test, atol=1e-10) diff --git a/source/tests/consistent/test_neighbor_stat.py b/source/tests/consistent/test_neighbor_stat.py index 573e367267..9c9f97045b 100644 --- a/source/tests/consistent/test_neighbor_stat.py +++ b/source/tests/consistent/test_neighbor_stat.py @@ -14,6 +14,7 @@ ) from .common import ( INSTALLED_JAX, + INSTALLED_PD, 
INSTALLED_PT, INSTALLED_TF, ) @@ -87,3 +88,7 @@ def test_neighbor_stat_dp(self) -> None: @unittest.skipUnless(INSTALLED_JAX, "jax is not installed") def test_neighbor_stat_jax(self) -> None: self.run_neighbor_stat("jax") + + @unittest.skipUnless(INSTALLED_PD, "paddle is not installed") + def test_neighbor_stat_pd(self): + self.run_neighbor_stat("paddle") diff --git a/source/tests/consistent/test_type_embedding.py b/source/tests/consistent/test_type_embedding.py index 1c56abea0c..9c1de0e8c5 100644 --- a/source/tests/consistent/test_type_embedding.py +++ b/source/tests/consistent/test_type_embedding.py @@ -17,6 +17,7 @@ from .common import ( INSTALLED_ARRAY_API_STRICT, INSTALLED_JAX, + INSTALLED_PD, INSTALLED_PT, INSTALLED_TF, CommonTest, @@ -45,6 +46,13 @@ from ..array_api_strict.utils.type_embed import TypeEmbedNet as TypeEmbedNetStrict else: TypeEmbedNetStrict = None +if INSTALLED_PD: + import paddle + + from deepmd.pd.model.network.network import TypeEmbedNetConsistent as TypeEmbedNetPD + from deepmd.pd.utils.env import DEVICE as PD_DEVICE +else: + TypeEmbedNetPD = object @parameterized( @@ -79,6 +87,7 @@ def data(self) -> dict: dp_class = TypeEmbedNetDP pt_class = TypeEmbedNetPT jax_class = TypeEmbedNetJAX + pd_class = TypeEmbedNetPD array_api_strict_class = TypeEmbedNetStrict args = type_embedding_args() skip_jax = not INSTALLED_JAX @@ -130,6 +139,12 @@ def eval_jax(self, jax_obj: Any) -> Any: raise ValueError("Output is numpy array") return [np.array(x) if isinstance(x, jnp.ndarray) else x for x in (out,)] + def eval_pd(self, pd_obj: Any) -> Any: + return [ + x.detach().cpu().numpy() if paddle.is_tensor(x) else x + for x in (pd_obj(device=PD_DEVICE),) + ] + def eval_array_api_strict(self, array_api_strict_obj: Any) -> Any: out = array_api_strict_obj() return [ diff --git a/source/tests/pd/__init__.py b/source/tests/pd/__init__.py new file mode 100644 index 0000000000..8d1616afb2 --- /dev/null +++ b/source/tests/pd/__init__.py @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +from deepmd.pd.utils import ( + env, +) + +env.enable_prim(True) diff --git a/source/tests/pd/common.py b/source/tests/pd/common.py new file mode 100644 index 0000000000..59a9672330 --- /dev/null +++ b/source/tests/pd/common.py @@ -0,0 +1,263 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +from typing import ( + Optional, + Union, +) + +import numpy as np +import paddle + +from deepmd.main import ( + main, +) +from deepmd.pd.utils.env import ( + DEVICE, + GLOBAL_PD_FLOAT_PRECISION, +) + + +def run_dp(cmd: str) -> int: + """Run DP directly from the entry point instead of the subprocess. + + It is quite slow to start DeePMD-kit with subprocess. + + Parameters + ---------- + cmd : str + The command to run. + + Returns + ------- + int + Always returns 0. 
+ """ + cmds = cmd.split() + if cmds[0] == "dp": + cmds = cmds[1:] + else: + raise RuntimeError("The command is not dp") + + main(cmds) + return 0 + + +def eval_model( + model, + coords: Union[np.ndarray, paddle.Tensor], + cells: Optional[Union[np.ndarray, paddle.Tensor]], + atom_types: Union[np.ndarray, paddle.Tensor, list[int]], + spins: Optional[Union[np.ndarray, paddle.Tensor]] = None, + atomic: bool = False, + infer_batch_size: int = 2, + denoise: bool = False, +): + model = model.to(DEVICE) + energy_out = [] + atomic_energy_out = [] + force_out = [] + force_mag_out = [] + virial_out = [] + atomic_virial_out = [] + updated_coord_out = [] + logits_out = [] + err_msg = ( + f"All inputs should be the same format, " + f"but found {type(coords)}, {type(cells)}, {type(atom_types)} instead! " + ) + return_tensor = True + if isinstance(coords, paddle.Tensor): + if cells is not None: + assert isinstance(cells, paddle.Tensor), err_msg + if spins is not None: + assert isinstance(spins, paddle.Tensor), err_msg + assert isinstance(atom_types, paddle.Tensor) or isinstance(atom_types, list) + atom_types = paddle.to_tensor(atom_types, dtype=paddle.int32, place=DEVICE) + elif isinstance(coords, np.ndarray): + if cells is not None: + assert isinstance(cells, np.ndarray), err_msg + if spins is not None: + assert isinstance(spins, np.ndarray), err_msg + assert isinstance(atom_types, np.ndarray) or isinstance(atom_types, list) + atom_types = np.array(atom_types, dtype=np.int32) + return_tensor = False + + nframes = coords.shape[0] + if len(atom_types.shape) == 1: + natoms = len(atom_types) + if isinstance(atom_types, paddle.Tensor): + atom_types = paddle.tile(atom_types.unsqueeze(0), [nframes, 1]).reshape( + [nframes, -1] + ) + else: + atom_types = np.tile(atom_types, nframes).reshape(nframes, -1) + else: + natoms = len(atom_types[0]) + + coord_input = paddle.to_tensor( + coords.reshape([-1, natoms, 3]), dtype=GLOBAL_PD_FLOAT_PRECISION, place=DEVICE + ) + spin_input = None + if spins is not None: + spin_input = paddle.to_tensor( + spins.reshape([-1, natoms, 3]), + dtype=GLOBAL_PD_FLOAT_PRECISION, + place=DEVICE, + ) + has_spin = getattr(model, "has_spin", False) + if callable(has_spin): + has_spin = has_spin() + type_input = paddle.to_tensor(atom_types, dtype=paddle.int64, place=DEVICE) + box_input = None + if cells is None: + pbc = False + else: + pbc = True + box_input = paddle.to_tensor( + cells.reshape([-1, 3, 3]), dtype=GLOBAL_PD_FLOAT_PRECISION, place=DEVICE + ) + num_iter = int((nframes + infer_batch_size - 1) / infer_batch_size) + + for ii in range(num_iter): + batch_coord = coord_input[ii * infer_batch_size : (ii + 1) * infer_batch_size] + batch_atype = type_input[ii * infer_batch_size : (ii + 1) * infer_batch_size] + batch_box = None + batch_spin = None + if spin_input is not None: + batch_spin = spin_input[ii * infer_batch_size : (ii + 1) * infer_batch_size] + if pbc: + batch_box = box_input[ii * infer_batch_size : (ii + 1) * infer_batch_size] + input_dict = { + "coord": batch_coord, + "atype": batch_atype, + "box": batch_box, + "do_atomic_virial": atomic, + } + if has_spin: + input_dict["spin"] = batch_spin + batch_output = model(**input_dict) + if isinstance(batch_output, tuple): + batch_output = batch_output[0] + if not return_tensor: + if "energy" in batch_output: + energy_out.append(batch_output["energy"].numpy()) + if "atom_energy" in batch_output: + atomic_energy_out.append(batch_output["atom_energy"].numpy()) + if "force" in batch_output: + 
force_out.append(batch_output["force"].numpy()) + if "force_mag" in batch_output: + force_mag_out.append(batch_output["force_mag"].numpy()) + if "virial" in batch_output: + virial_out.append(batch_output["virial"].numpy()) + if "atom_virial" in batch_output: + atomic_virial_out.append(batch_output["atom_virial"].numpy()) + if "updated_coord" in batch_output: + updated_coord_out.append(batch_output["updated_coord"].numpy()) + if "logits" in batch_output: + logits_out.append(batch_output["logits"].numpy()) + else: + if "energy" in batch_output: + energy_out.append(batch_output["energy"]) + if "atom_energy" in batch_output: + atomic_energy_out.append(batch_output["atom_energy"]) + if "force" in batch_output: + force_out.append(batch_output["force"]) + if "force_mag" in batch_output: + force_mag_out.append(batch_output["force_mag"]) + if "virial" in batch_output: + virial_out.append(batch_output["virial"]) + if "atom_virial" in batch_output: + atomic_virial_out.append(batch_output["atom_virial"]) + if "updated_coord" in batch_output: + updated_coord_out.append(batch_output["updated_coord"]) + if "logits" in batch_output: + logits_out.append(batch_output["logits"]) + if not return_tensor: + energy_out = ( + np.concatenate(energy_out) if energy_out else np.zeros([nframes, 1]) # pylint: disable=no-explicit-dtype + ) + atomic_energy_out = ( + np.concatenate(atomic_energy_out) + if atomic_energy_out + else np.zeros([nframes, natoms, 1]) # pylint: disable=no-explicit-dtype + ) + force_out = ( + np.concatenate(force_out) if force_out else np.zeros([nframes, natoms, 3]) # pylint: disable=no-explicit-dtype + ) + force_mag_out = ( + np.concatenate(force_mag_out) + if force_mag_out + else np.zeros([nframes, natoms, 3]) # pylint: disable=no-explicit-dtype + ) + virial_out = ( + np.concatenate(virial_out) if virial_out else np.zeros([nframes, 3, 3]) # pylint: disable=no-explicit-dtype + ) + atomic_virial_out = ( + np.concatenate(atomic_virial_out) + if atomic_virial_out + else np.zeros([nframes, natoms, 3, 3]) # pylint: disable=no-explicit-dtype + ) + updated_coord_out = ( + np.concatenate(updated_coord_out) if updated_coord_out else None + ) + logits_out = np.concatenate(logits_out) if logits_out else None + else: + energy_out = ( + paddle.concat(energy_out) + if energy_out + else paddle.zeros([nframes, 1], dtype=GLOBAL_PD_FLOAT_PRECISION).to( + device=DEVICE + ) + ) + atomic_energy_out = ( + paddle.concat(atomic_energy_out) + if atomic_energy_out + else paddle.zeros([nframes, natoms, 1], dtype=GLOBAL_PD_FLOAT_PRECISION).to( + device=DEVICE + ) + ) + force_out = ( + paddle.concat(force_out) + if force_out + else paddle.zeros([nframes, natoms, 3], dtype=GLOBAL_PD_FLOAT_PRECISION).to( + device=DEVICE + ) + ) + force_mag_out = ( + paddle.concat(force_mag_out) + if force_mag_out + else paddle.zeros([nframes, natoms, 3], dtype=GLOBAL_PD_FLOAT_PRECISION).to( + device=DEVICE + ) + ) + virial_out = ( + paddle.concat(virial_out) + if virial_out + else paddle.zeros([nframes, 3, 3], dtype=GLOBAL_PD_FLOAT_PRECISION).to( + device=DEVICE + ) + ) + atomic_virial_out = ( + paddle.concat(atomic_virial_out) + if atomic_virial_out + else paddle.zeros( + [nframes, natoms, 3, 3], dtype=GLOBAL_PD_FLOAT_PRECISION + ).to(device=DEVICE) + ) + updated_coord_out = ( + paddle.concat(updated_coord_out) if updated_coord_out else None + ) + logits_out = paddle.concat(logits_out) if logits_out else None + if denoise: + return updated_coord_out, logits_out + else: + results_dict = { + "energy": energy_out, + "force": force_out, + 
"virial": virial_out, + } + if has_spin: + results_dict["force_mag"] = force_mag_out + if atomic: + results_dict["atom_energy"] = atomic_energy_out + results_dict["atom_virial"] = atomic_virial_out + return results_dict diff --git a/source/tests/pd/conftest.py b/source/tests/pd/conftest.py new file mode 100644 index 0000000000..530cb18907 --- /dev/null +++ b/source/tests/pd/conftest.py @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +import paddle +import pytest + + +@pytest.fixture(scope="package", autouse=True) +def clear_cuda_memory(request): + yield + paddle.device.cuda.empty_cache() diff --git a/source/tests/pd/model/__init__.py b/source/tests/pd/model/__init__.py new file mode 100644 index 0000000000..6ceb116d85 --- /dev/null +++ b/source/tests/pd/model/__init__.py @@ -0,0 +1 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later diff --git a/source/tests/pd/model/test_autodiff.py b/source/tests/pd/model/test_autodiff.py new file mode 100644 index 0000000000..a056491fb3 --- /dev/null +++ b/source/tests/pd/model/test_autodiff.py @@ -0,0 +1,263 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +import copy +import unittest + +import numpy as np +import paddle + +from deepmd.pd.model.model import ( + get_model, +) +from deepmd.pd.utils import ( + env, +) +from deepmd.pd.utils.utils import ( + to_numpy_array, +) + +from ...seed import ( + GLOBAL_SEED, +) + +dtype = paddle.float64 + +from ..common import ( + eval_model, +) +from .test_permutation import ( + model_dpa1, + model_dpa2, + model_hybrid, + model_se_e2_a, + model_spin, + model_zbl, +) + + +# from deepmd-kit repo +def finite_difference(f, x, delta=1e-6): + in_shape = x.shape + y0 = f(x) + out_shape = y0.shape + res = np.empty(out_shape + in_shape) + for idx in np.ndindex(*in_shape): + diff = np.zeros(in_shape) + diff[idx] += delta + y1p = f(x + diff) + y1n = f(x - diff) + res[(Ellipsis, *idx)] = (y1p - y1n) / (2 * delta) + return res + + +def stretch_box(old_coord, old_box, new_box): + ocoord = old_coord.reshape(-1, 3) + obox = old_box.reshape(3, 3) + nbox = new_box.reshape(3, 3) + ncoord = ocoord @ np.linalg.inv(obox) @ nbox + return ncoord.reshape(old_coord.shape) + + +class ForceTest: + def test( + self, + ): + env.enable_prim(True) + places = 5 + delta = 1e-5 + natoms = 5 + generator = paddle.seed(GLOBAL_SEED) + cell = paddle.rand([3, 3], dtype=dtype).to(device="cpu") + cell = (cell + cell.T) + 5.0 * paddle.eye(3).to(device="cpu") + coord = paddle.rand([natoms, 3], dtype=dtype).to(device="cpu") + coord = paddle.matmul(coord, cell) + spin = paddle.rand([natoms, 3], dtype=dtype).to(device="cpu") + atype = paddle.to_tensor([0, 0, 0, 1, 1]) + # assumes input to be numpy tensor + coord = coord.numpy() + spin = spin.numpy() + test_spin = getattr(self, "test_spin", False) + if not test_spin: + test_keys = ["energy", "force", "virial"] + else: + test_keys = ["energy", "force", "force_mag", "virial"] + + def np_infer_coord( + coord, + ): + result = eval_model( + self.model, + paddle.to_tensor(coord).to(device=env.DEVICE).unsqueeze(0), + cell.unsqueeze(0), + atype, + spins=paddle.to_tensor(spin).to(device=env.DEVICE).unsqueeze(0), + ) + # detach + ret = {key: to_numpy_array(result[key].squeeze(0)) for key in test_keys} + return ret + + def np_infer_spin( + spin, + ): + result = eval_model( + self.model, + paddle.to_tensor(coord).to(device=env.DEVICE).unsqueeze(0), + cell.unsqueeze(0), + atype, + spins=paddle.to_tensor(spin).to(device=env.DEVICE).unsqueeze(0), + ) + # detach + ret = {key: to_numpy_array(result[key].squeeze(0)) 
for key in test_keys}
+            return ret
+
+        def ff_coord(_coord):
+            return np_infer_coord(_coord)["energy"]
+
+        def ff_spin(_spin):
+            return np_infer_spin(_spin)["energy"]
+
+        if not test_spin:
+            fdf = -finite_difference(ff_coord, coord, delta=delta).squeeze()
+            rff = np_infer_coord(coord)["force"]
+            np.testing.assert_almost_equal(fdf, rff, decimal=places)
+        else:
+            # real force
+            fdf = -finite_difference(ff_coord, coord, delta=delta).squeeze()
+            rff = np_infer_coord(coord)["force"]
+            np.testing.assert_almost_equal(fdf, rff, decimal=places)
+            # magnetic force
+            fdf = -finite_difference(ff_spin, spin, delta=delta).squeeze()
+            rff = np_infer_spin(spin)["force_mag"]
+            np.testing.assert_almost_equal(fdf, rff, decimal=places)
+
+
+class VirialTest:
+    def test(
+        self,
+    ):
+        places = 5
+        delta = 1e-4
+        natoms = 5
+        generator = paddle.seed(GLOBAL_SEED)
+        cell = paddle.rand([3, 3], dtype=dtype).to(device="cpu")
+        cell = (cell) + 5.0 * paddle.eye(3).to(device="cpu")
+        coord = paddle.rand([natoms, 3], dtype=dtype).to(device="cpu")
+        coord = paddle.matmul(coord, cell)
+        atype = paddle.to_tensor([0, 0, 0, 1, 1])
+        # assumes input to be numpy tensor
+        coord = coord.numpy()
+        cell = cell.numpy()
+        test_keys = ["energy", "force", "virial"]
+
+        def np_infer(
+            new_cell,
+        ):
+            result = eval_model(
+                self.model,
+                paddle.to_tensor(stretch_box(coord, cell, new_cell))
+                .to(device="cpu")
+                .unsqueeze(0),
+                paddle.to_tensor(new_cell).to(device="cpu").unsqueeze(0),
+                atype,
+            )
+            # detach
+            ret = {key: to_numpy_array(result[key].squeeze(0)) for key in test_keys}
+            return ret
+
+        def ff(bb):
+            return np_infer(bb)["energy"]
+
+        fdv = (
+            -(finite_difference(ff, cell, delta=delta).transpose([0, 2, 1]) @ cell)
+            .squeeze()
+            .reshape([9])
+        )
+        rfv = np_infer(cell)["virial"]
+        np.testing.assert_almost_equal(fdv, rfv, decimal=places)
+
+
+class TestEnergyModelSeAForce(unittest.TestCase, ForceTest):
+    def setUp(self):
+        model_params = copy.deepcopy(model_se_e2_a)
+        self.type_split = False
+        self.model = get_model(model_params).to(env.DEVICE)
+
+
+class TestEnergyModelSeAVirial(unittest.TestCase, VirialTest):
+    def setUp(self):
+        model_params = copy.deepcopy(model_se_e2_a)
+        self.type_split = False
+        self.model = get_model(model_params).to(env.DEVICE)
+
+
+@unittest.skip("Skip for not implemented yet")
+class TestEnergyModelDPA1Force(unittest.TestCase, ForceTest):
+    def setUp(self):
+        model_params = copy.deepcopy(model_dpa1)
+        self.type_split = True
+        self.model = get_model(model_params).to(env.DEVICE)
+
+
+@unittest.skip("Skip for not implemented yet")
+class TestEnergyModelDPA1Virial(unittest.TestCase, VirialTest):
+    def setUp(self):
+        model_params = copy.deepcopy(model_dpa1)
+        self.type_split = True
+        self.model = get_model(model_params).to(env.DEVICE)
+
+
+@unittest.skip("Skip for not implemented yet")
+class TestEnergyModelDPA2Force(unittest.TestCase, ForceTest):
+    def setUp(self):
+        model_params = copy.deepcopy(model_dpa2)
+        self.type_split = True
+        self.model = get_model(model_params).to(env.DEVICE)
+
+
+@unittest.skip("Skip for not implemented yet")
+class TestEnergyModelDPA2Virial(unittest.TestCase, VirialTest):
+    def setUp(self):
+        model_params = copy.deepcopy(model_dpa2)
+        self.type_split = True
+        self.model = get_model(model_params).to(env.DEVICE)
+
+
+@unittest.skip("Skip for not implemented yet")
+class TestEnergyModelHybridForce(unittest.TestCase, ForceTest):
+    def setUp(self):
+        model_params = copy.deepcopy(model_hybrid)
+        self.type_split = True
+        self.model =
get_model(model_params).to(env.DEVICE) + + +@unittest.skip("Skip for not implemented yet") +class TestEnergyModelHybridVirial(unittest.TestCase, VirialTest): + def setUp(self): + model_params = copy.deepcopy(model_hybrid) + self.type_split = True + self.model = get_model(model_params).to(env.DEVICE) + + +@unittest.skip("Skip for not implemented yet") +class TestEnergyModelZBLForce(unittest.TestCase, ForceTest): + def setUp(self): + model_params = copy.deepcopy(model_zbl) + self.type_split = False + self.model = get_model(model_params).to(env.DEVICE) + + +@unittest.skip("Skip for not implemented yet") +class TestEnergyModelZBLVirial(unittest.TestCase, VirialTest): + def setUp(self): + model_params = copy.deepcopy(model_zbl) + self.type_split = False + self.model = get_model(model_params).to(env.DEVICE) + + +@unittest.skip("Skip for not implemented yet") +class TestEnergyModelSpinSeAForce(unittest.TestCase, ForceTest): + def setUp(self): + model_params = copy.deepcopy(model_spin) + self.type_split = False + self.test_spin = True + self.model = get_model(model_params).to(env.DEVICE) diff --git a/source/tests/pd/model/test_descriptor.py b/source/tests/pd/model/test_descriptor.py new file mode 100644 index 0000000000..10f2fd271b --- /dev/null +++ b/source/tests/pd/model/test_descriptor.py @@ -0,0 +1,195 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +import os +import unittest + +import numpy as np +import paddle +import tensorflow.compat.v1 as tf + +tf.disable_eager_execution() + +import json +from pathlib import ( + Path, +) + +from deepmd.pd.model.descriptor import ( + prod_env_mat, +) +from deepmd.pd.utils import ( + decomp, + dp_random, + env, +) +from deepmd.pd.utils.dataset import ( + DeepmdDataSetForLoader, +) +from deepmd.pd.utils.env import ( + DEVICE, + GLOBAL_NP_FLOAT_PRECISION, + GLOBAL_PD_FLOAT_PRECISION, +) +from deepmd.pd.utils.nlist import ( + extend_input_and_build_neighbor_list, +) +from deepmd.tf.common import ( + expand_sys_str, +) +from deepmd.tf.env import ( + op_module, +) + +from ..test_finetune import ( + energy_data_requirement, +) +from .test_embedding_net import ( + get_single_batch, +) + +CUR_DIR = os.path.dirname(__file__) + + +def base_se_a(rcut, rcut_smth, sel, batch, mean, stddev): + g = tf.Graph() + with g.as_default(): + coord = tf.placeholder(GLOBAL_NP_FLOAT_PRECISION, [None, None]) + box = tf.placeholder(GLOBAL_NP_FLOAT_PRECISION, [None, None]) + atype = tf.placeholder(tf.int32, [None, None]) + natoms_vec = tf.placeholder(tf.int32, [None]) + default_mesh = tf.placeholder(tf.int32, [None]) + stat_descrpt, descrpt_deriv, rij, nlist = op_module.prod_env_mat_a( + coord, + atype, + natoms_vec, + box, + default_mesh, + tf.constant(mean), + tf.constant(stddev), + rcut_a=-1.0, + rcut_r=rcut, + rcut_r_smth=rcut_smth, + sel_a=sel, + sel_r=[0 for i in sel], + ) + + net_deriv_reshape = tf.ones_like(stat_descrpt) + force = op_module.prod_force_se_a( + net_deriv_reshape, + descrpt_deriv, + nlist, + natoms_vec, + n_a_sel=sum(sel), + n_r_sel=0, + ) + + with tf.Session(graph=g) as sess: + y = sess.run( + [stat_descrpt, force, nlist], + feed_dict={ + coord: batch["coord"], + box: batch["box"], + natoms_vec: batch["natoms"], + atype: batch["atype"], + default_mesh: np.array([0, 0, 0, 2, 2, 2]), + }, + ) + tf.reset_default_graph() + return y + + +class TestSeA(unittest.TestCase): + def setUp(self): + dp_random.seed(20) + with open(str(Path(__file__).parent / "water/se_e2_a.json")) as fin: + content = fin.read() + config = json.loads(content) + data_file = 
[str(Path(__file__).parent / "water/data/data_0")] + config["training"]["training_data"]["systems"] = data_file + config["training"]["validation_data"]["systems"] = data_file + model_config = config["model"] + self.rcut = model_config["descriptor"]["rcut"] + self.rcut_smth = model_config["descriptor"]["rcut_smth"] + self.sel = model_config["descriptor"]["sel"] + self.bsz = config["training"]["training_data"]["batch_size"] + self.systems = config["training"]["validation_data"]["systems"] + if isinstance(self.systems, str): + self.systems = expand_sys_str(self.systems) + ds = DeepmdDataSetForLoader( + self.systems[0], + model_config["type_map"], + ) + ds.add_data_requirement(energy_data_requirement) + self.np_batch, self.pd_batch = get_single_batch(ds) + self.sec = np.cumsum(self.sel) + self.ntypes = len(self.sel) + self.nnei = sum(self.sel) + + def test_consistency(self): + avg_zero = paddle.zeros( + [self.ntypes, self.nnei * 4], + dtype=GLOBAL_PD_FLOAT_PRECISION, + ).to(device=env.DEVICE) + std_ones = paddle.ones( + [self.ntypes, self.nnei * 4], + dtype=GLOBAL_PD_FLOAT_PRECISION, + ).to(device=env.DEVICE) + base_d, base_force, base_nlist = base_se_a( + rcut=self.rcut, + rcut_smth=self.rcut_smth, + sel=self.sel, + batch=self.np_batch, + mean=avg_zero.detach().cpu(), + stddev=std_ones.detach().cpu(), + ) + + pd_coord = self.pd_batch["coord"].to(env.DEVICE) + atype = self.pd_batch["atype"].to(env.DEVICE) + pd_coord.stop_gradient = False + ( + extended_coord, + extended_atype, + mapping, + nlist, + ) = extend_input_and_build_neighbor_list( + pd_coord, + self.pd_batch["atype"].to(env.DEVICE), + self.rcut, + self.sel, + mixed_types=False, + box=self.pd_batch["box"].to(env.DEVICE), + ) + my_d, _, _ = prod_env_mat( + extended_coord, + nlist, + atype, + avg_zero.reshape([-1, self.nnei, 4]).to(DEVICE), + std_ones.reshape([-1, self.nnei, 4]).to(DEVICE), + self.rcut, + self.rcut_smth, + ) + my_d.sum().backward() + bsz = pd_coord.shape[0] + my_force = pd_coord.grad.reshape([bsz, -1, 3]).cpu().detach().numpy() + base_force = base_force.reshape(bsz, -1, 3) + base_d = base_d.reshape(bsz, -1, self.nnei, 4) + my_d = my_d.reshape([bsz, -1, self.nnei, 4]).cpu().detach().numpy() + base_nlist = base_nlist.reshape(bsz, -1, self.nnei) + + mapping = mapping.cpu() + my_nlist = nlist.reshape([bsz, -1]).cpu() + mask = my_nlist == -1 + my_nlist = my_nlist * (~mask).astype(my_nlist.dtype) + my_nlist = decomp.take_along_axis(mapping, axis=-1, indices=my_nlist) + my_nlist = my_nlist * (~mask).astype(my_nlist.dtype) - mask.astype( + my_nlist.dtype + ) + my_nlist = my_nlist.cpu().reshape([bsz, -1, self.nnei]).numpy() + self.assertTrue(np.allclose(base_nlist, my_nlist)) + self.assertTrue(np.allclose(np.mean(base_d, axis=2), np.mean(my_d, axis=2))) + self.assertTrue(np.allclose(np.std(base_d, axis=2), np.std(my_d, axis=2))) + # descriptors may be different when there are multiple neighbors in the same distance + self.assertTrue(np.allclose(base_force, -my_force)) + + +if __name__ == "__main__": + unittest.main() diff --git a/source/tests/pd/model/test_dp_atomic_model.py b/source/tests/pd/model/test_dp_atomic_model.py new file mode 100644 index 0000000000..785bfa1076 --- /dev/null +++ b/source/tests/pd/model/test_dp_atomic_model.py @@ -0,0 +1,235 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +import itertools +import unittest + +import numpy as np +import paddle + +from deepmd.dpmodel.atomic_model import DPAtomicModel as DPDPAtomicModel +from deepmd.dpmodel.descriptor import DescrptSeA as DPDescrptSeA +from 
deepmd.dpmodel.fitting import InvarFitting as DPInvarFitting +from deepmd.pd.model.atomic_model import ( + DPAtomicModel, +) +from deepmd.pd.model.descriptor.se_a import ( + DescrptSeA, +) +from deepmd.pd.model.task.ener import ( + InvarFitting, +) +from deepmd.pd.utils import ( + env, +) +from deepmd.pd.utils.utils import ( + to_numpy_array, + to_paddle_tensor, +) + +from .test_env_mat import ( + TestCaseSingleFrameWithNlist, + TestCaseSingleFrameWithNlistWithVirtual, +) + +dtype = env.GLOBAL_PD_FLOAT_PRECISION + + +class TestDPAtomicModel(unittest.TestCase, TestCaseSingleFrameWithNlist): + def setUp(self): + TestCaseSingleFrameWithNlist.setUp(self) + + def test_self_consistency(self): + nf, nloc, nnei = self.nlist.shape + ds = DescrptSeA( + self.rcut, + self.rcut_smth, + self.sel, + ).to(env.DEVICE) + ft = InvarFitting( + "energy", + self.nt, + ds.get_dim_out(), + 1, + mixed_types=ds.mixed_types(), + ).to(env.DEVICE) + type_map = ["foo", "bar"] + + # test the case of exclusion + for atom_excl, pair_excl in itertools.product([[], [1]], [[], [[0, 1]]]): + md0 = DPAtomicModel( + ds, + ft, + type_map=type_map, + ).to(env.DEVICE) + md0.reinit_atom_exclude(atom_excl) + md0.reinit_pair_exclude(pair_excl) + md1 = DPAtomicModel.deserialize(md0.serialize()).to(env.DEVICE) + args = [ + to_paddle_tensor(ii) + for ii in [self.coord_ext, self.atype_ext, self.nlist] + ] + ret0 = md0.forward_common_atomic(*args) + ret1 = md1.forward_common_atomic(*args) + np.testing.assert_allclose( + to_numpy_array(ret0["energy"]), + to_numpy_array(ret1["energy"]), + ) + + def test_dp_consistency(self): + nf, nloc, nnei = self.nlist.shape + ds = DPDescrptSeA( + self.rcut, + self.rcut_smth, + self.sel, + ) + ft = DPInvarFitting( + "energy", + self.nt, + ds.get_dim_out(), + 1, + mixed_types=ds.mixed_types(), + ) + type_map = ["foo", "bar"] + md0 = DPDPAtomicModel(ds, ft, type_map=type_map) + md1 = DPAtomicModel.deserialize(md0.serialize()).to(env.DEVICE) + args0 = [self.coord_ext, self.atype_ext, self.nlist] + args1 = [ + to_paddle_tensor(ii) for ii in [self.coord_ext, self.atype_ext, self.nlist] + ] + ret0 = md0.forward_common_atomic(*args0) + ret1 = md1.forward_common_atomic(*args1) + np.testing.assert_allclose( + ret0["energy"], + to_numpy_array(ret1["energy"]), + ) + + def test_jit(self): + nf, nloc, nnei = self.nlist.shape + ds = DescrptSeA( + self.rcut, + self.rcut_smth, + self.sel, + ).to(env.DEVICE) + ft = InvarFitting( + "energy", + self.nt, + ds.get_dim_out(), + 1, + mixed_types=ds.mixed_types(), + ).to(env.DEVICE) + type_map = ["foo", "bar"] + md0 = DPAtomicModel(ds, ft, type_map=type_map).to(env.DEVICE) + md0 = paddle.jit.to_static(md0) + self.assertEqual(md0.get_rcut(), self.rcut) + self.assertEqual(md0.get_type_map(), type_map) + + def test_excl_consistency(self): + type_map = ["foo", "bar"] + + # test the case of exclusion + for atom_excl, pair_excl in itertools.product([[], [1]], [[], [[0, 1]]]): + ds = DescrptSeA( + self.rcut, + self.rcut_smth, + self.sel, + ).to(env.DEVICE) + ft = InvarFitting( + "energy", + self.nt, + ds.get_dim_out(), + 1, + mixed_types=ds.mixed_types(), + ).to(env.DEVICE) + md0 = DPAtomicModel( + ds, + ft, + type_map=type_map, + ).to(env.DEVICE) + md1 = DPAtomicModel.deserialize(md0.serialize()).to(env.DEVICE) + + md0.reinit_atom_exclude(atom_excl) + md0.reinit_pair_exclude(pair_excl) + # hacking! 
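+            # (reinit the exclusions directly on the deserialized model's submodules)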
+ md1.descriptor.reinit_exclude(pair_excl) + md1.fitting_net.reinit_exclude(atom_excl) + + # check energy consistency + args = [ + to_paddle_tensor(ii) + for ii in [self.coord_ext, self.atype_ext, self.nlist] + ] + ret0 = md0.forward_common_atomic(*args) + ret1 = md1.forward_common_atomic(*args) + np.testing.assert_allclose( + to_numpy_array(ret0["energy"]), + to_numpy_array(ret1["energy"]), + ) + + # check output def + out_names = [vv.name for vv in md0.atomic_output_def().get_data().values()] + self.assertEqual(out_names, ["energy", "mask"]) + if atom_excl != []: + for ii in md0.atomic_output_def().get_data().values(): + if ii.name == "mask": + self.assertEqual(ii.shape, [1]) + self.assertFalse(ii.reducible) + self.assertFalse(ii.r_differentiable) + self.assertFalse(ii.c_differentiable) + + # check mask + if atom_excl == []: + pass + elif atom_excl == [1]: + self.assertIn("mask", ret0.keys()) + expected = np.array([1, 1, 0], dtype="int64") + expected = np.concatenate( + [expected, expected[self.perm[: self.nloc]]] + ).reshape(2, 3) + np.testing.assert_array_equal(to_numpy_array(ret0["mask"]), expected) + else: + raise ValueError(f"not expected atom_excl {atom_excl}") + + +class TestDPAtomicModelVirtualConsistency(unittest.TestCase): + def setUp(self): + self.case0 = TestCaseSingleFrameWithNlist() + self.case1 = TestCaseSingleFrameWithNlistWithVirtual() + self.case0.setUp() + self.case1.setUp() + + def test_virtual_consistency(self): + nf, _, _ = self.case0.nlist.shape + ds = DescrptSeA( + self.case0.rcut, + self.case0.rcut_smth, + self.case0.sel, + ) + ft = InvarFitting( + "energy", + self.case0.nt, + ds.get_dim_out(), + 1, + mixed_types=ds.mixed_types(), + ) + type_map = ["foo", "bar"] + md1 = DPAtomicModel(ds, ft, type_map=type_map).to(env.DEVICE) + + args0 = [self.case0.coord_ext, self.case0.atype_ext, self.case0.nlist] + args0 = [to_paddle_tensor(ii) for ii in args0] + args1 = [self.case1.coord_ext, self.case1.atype_ext, self.case1.nlist] + args1 = [to_paddle_tensor(ii) for ii in args1] + + ret0 = md1.forward_common_atomic(*args0) + ret1 = md1.forward_common_atomic(*args1) + + for dd in range(self.case0.nf): + np.testing.assert_allclose( + to_numpy_array(ret0["energy"])[dd], + to_numpy_array(ret1["energy"])[dd, self.case1.get_real_mapping[dd], :], + ) + expected_mask = np.array( + [ + [1, 0, 1, 1], + [1, 1, 0, 1], + ] + ) + np.testing.assert_equal(to_numpy_array(ret1["mask"]), expected_mask) diff --git a/source/tests/pd/model/test_dp_model.py b/source/tests/pd/model/test_dp_model.py new file mode 100644 index 0000000000..a281851f14 --- /dev/null +++ b/source/tests/pd/model/test_dp_model.py @@ -0,0 +1,633 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +import unittest + +import numpy as np +import paddle + +from deepmd.dpmodel.descriptor import DescrptSeA as DPDescrptSeA +from deepmd.dpmodel.fitting import EnergyFittingNet as DPEnergyFittingNet +from deepmd.dpmodel.model.ener_model import EnergyModel as DPEnergyModel +from deepmd.pd.model.descriptor.se_a import ( + DescrptSeA, +) +from deepmd.pd.model.model import ( + EnergyModel, +) +from deepmd.pd.model.task.ener import ( + EnergyFittingNet, +) +from deepmd.pd.utils import ( + env, +) +from deepmd.pd.utils.nlist import ( + build_neighbor_list, + extend_coord_with_ghosts, + extend_input_and_build_neighbor_list, +) +from deepmd.pd.utils.utils import ( + to_numpy_array, + to_paddle_tensor, +) + +from ...seed import ( + GLOBAL_SEED, +) +from .test_env_mat import ( + TestCaseSingleFrameWithNlist, + TestCaseSingleFrameWithoutNlist, +) + 
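+# module-level default dtype for these tests, taken from the Paddle backend's global precision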
+dtype = env.GLOBAL_PD_FLOAT_PRECISION + + +class TestDPModel(unittest.TestCase, TestCaseSingleFrameWithoutNlist): + def setUp(self): + TestCaseSingleFrameWithoutNlist.setUp(self) + + def test_self_consistency(self): + nf, nloc = self.atype.shape + ds = DescrptSeA( + self.rcut, + self.rcut_smth, + self.sel, + ).to(env.DEVICE) + ft = EnergyFittingNet( + self.nt, + ds.get_dim_out(), + mixed_types=ds.mixed_types(), + ).to(env.DEVICE) + type_map = ["foo", "bar"] + md0 = EnergyModel(ds, ft, type_map=type_map).to(env.DEVICE) + md1 = EnergyModel.deserialize(md0.serialize()).to(env.DEVICE) + args = [to_paddle_tensor(ii) for ii in [self.coord, self.atype, self.cell]] + ret0 = md0.forward_common(*args) + ret1 = md1.forward_common(*args) + np.testing.assert_allclose( + to_numpy_array(ret0["energy"]), + to_numpy_array(ret1["energy"]), + atol=self.atol, + ) + np.testing.assert_allclose( + to_numpy_array(ret0["energy_redu"]), + to_numpy_array(ret1["energy_redu"]), + atol=self.atol, + ) + np.testing.assert_allclose( + to_numpy_array(ret0["energy_derv_r"]), + to_numpy_array(ret1["energy_derv_r"]), + atol=self.atol, + ) + np.testing.assert_allclose( + to_numpy_array(ret0["energy_derv_c_redu"]), + to_numpy_array(ret1["energy_derv_c_redu"]), + atol=self.atol, + ) + ret0 = md0.forward_common(*args, do_atomic_virial=True) + ret1 = md1.forward_common(*args, do_atomic_virial=True) + np.testing.assert_allclose( + to_numpy_array(ret0["energy_derv_c"]), + to_numpy_array(ret1["energy_derv_c"]), + atol=self.atol, + ) + + coord_ext, atype_ext, mapping = extend_coord_with_ghosts( + to_paddle_tensor(self.coord), + to_paddle_tensor(self.atype), + to_paddle_tensor(self.cell), + self.rcut, + ) + nlist = build_neighbor_list( + coord_ext, + atype_ext, + self.nloc, + self.rcut, + self.sel, + distinguish_types=(not md0.mixed_types()), + ) + args = [coord_ext, atype_ext, nlist] + ret2 = md0.forward_common_lower(*args, do_atomic_virial=True) + # check the consistency between the reduced virial from + # forward_common and forward_common_lower + np.testing.assert_allclose( + to_numpy_array(ret0["energy_derv_c_redu"]), + to_numpy_array(ret2["energy_derv_c_redu"]), + atol=self.atol, + ) + + def test_dp_consistency(self): + nf, nloc = self.atype.shape + nfp, nap = 2, 3 + ds = DPDescrptSeA( + self.rcut, + self.rcut_smth, + self.sel, + ) + ft = DPEnergyFittingNet( + self.nt, + ds.get_dim_out(), + mixed_types=ds.mixed_types(), + numb_fparam=nfp, + numb_aparam=nap, + ) + type_map = ["foo", "bar"] + md0 = DPEnergyModel(ds, ft, type_map=type_map) + md1 = EnergyModel.deserialize(md0.serialize()).to(env.DEVICE) + + rng = np.random.default_rng(GLOBAL_SEED) + fparam = rng.normal(size=[self.nf, nfp]) + aparam = rng.normal(size=[self.nf, nloc, nap]) + args0 = [self.coord, self.atype, self.cell] + args1 = [to_paddle_tensor(ii) for ii in [self.coord, self.atype, self.cell]] + kwargs0 = {"fparam": fparam, "aparam": aparam} + kwargs1 = {kk: to_paddle_tensor(vv) for kk, vv in kwargs0.items()} + ret0 = md0.call(*args0, **kwargs0) + ret1 = md1.forward_common(*args1, **kwargs1) + np.testing.assert_allclose( + ret0["energy"], + to_numpy_array(ret1["energy"]), + atol=self.atol, + ) + np.testing.assert_allclose( + ret0["energy_redu"], + to_numpy_array(ret1["energy_redu"]), + atol=self.atol, + ) + + def test_dp_consistency_nopbc(self): + nf, nloc = self.atype.shape + nfp, nap = 2, 3 + ds = DPDescrptSeA( + self.rcut, + self.rcut_smth, + self.sel, + ) + ft = DPEnergyFittingNet( + self.nt, + ds.get_dim_out(), + mixed_types=ds.mixed_types(), + numb_fparam=nfp, 
+ numb_aparam=nap, + ) + type_map = ["foo", "bar"] + md0 = DPEnergyModel(ds, ft, type_map=type_map) + md1 = EnergyModel.deserialize(md0.serialize()).to(env.DEVICE) + + rng = np.random.default_rng(GLOBAL_SEED) + fparam = rng.normal(size=[self.nf, nfp]) + aparam = rng.normal(size=[self.nf, self.nloc, nap]) + args0 = [self.coord, self.atype] + args1 = [to_paddle_tensor(ii) for ii in args0] + kwargs0 = {"fparam": fparam, "aparam": aparam} + kwargs1 = {kk: to_paddle_tensor(vv) for kk, vv in kwargs0.items()} + ret0 = md0.call(*args0, **kwargs0) + ret1 = md1.forward_common(*args1, **kwargs1) + np.testing.assert_allclose( + ret0["energy"], + to_numpy_array(ret1["energy"]), + atol=self.atol, + ) + np.testing.assert_allclose( + ret0["energy_redu"], + to_numpy_array(ret1["energy_redu"]), + atol=self.atol, + ) + + def test_prec_consistency(self): + rng = np.random.default_rng(GLOBAL_SEED) + nf, nloc = self.atype.shape + ds = DPDescrptSeA( + self.rcut, + self.rcut_smth, + self.sel, + ) + ft = DPEnergyFittingNet( + self.nt, + ds.get_dim_out(), + mixed_types=ds.mixed_types(), + ) + nfp, nap = 2, 3 + type_map = ["foo", "bar"] + fparam = rng.normal(size=[self.nf, nfp]) + aparam = rng.normal(size=[self.nf, nloc, nap]) + + md0 = DPEnergyModel(ds, ft, type_map=type_map) + md1 = EnergyModel.deserialize(md0.serialize()).to(env.DEVICE) + + args64 = [to_paddle_tensor(ii) for ii in [self.coord, self.atype, self.cell]] + args64[0] = args64[0].to(paddle.float64) + args64[2] = args64[2].to(paddle.float64) + args32 = [to_paddle_tensor(ii) for ii in [self.coord, self.atype, self.cell]] + args32[0] = args32[0].to(paddle.float32) + args32[2] = args32[2].to(paddle.float32) + # fparam, aparam are converted to coordinate precision by model + fparam = to_paddle_tensor(fparam) + aparam = to_paddle_tensor(aparam) + + model_l_ret_64 = md1.forward_common(*args64, fparam=fparam, aparam=aparam) + model_l_ret_32 = md1.forward_common(*args32, fparam=fparam, aparam=aparam) + + for ii in model_l_ret_32.keys(): + if ii[-4:] == "redu": + self.assertEqual(model_l_ret_32[ii].dtype, paddle.float64) + else: + self.assertEqual(model_l_ret_32[ii].dtype, paddle.float32) + if ii != "mask": + self.assertEqual(model_l_ret_64[ii].dtype, paddle.float64) + else: + self.assertEqual(model_l_ret_64[ii].dtype, paddle.int32) + np.testing.assert_allclose( + to_numpy_array(model_l_ret_32[ii]), + to_numpy_array(model_l_ret_64[ii]), + atol=self.atol, + ) + + +class TestDPModelLower(unittest.TestCase, TestCaseSingleFrameWithNlist): + def setUp(self): + TestCaseSingleFrameWithNlist.setUp(self) + + def test_self_consistency(self): + nf, nloc, nnei = self.nlist.shape + ds = DescrptSeA( + self.rcut, + self.rcut_smth, + self.sel, + ).to(env.DEVICE) + ft = EnergyFittingNet( + self.nt, + ds.get_dim_out(), + mixed_types=ds.mixed_types(), + ).to(env.DEVICE) + type_map = ["foo", "bar"] + md0 = EnergyModel(ds, ft, type_map=type_map).to(env.DEVICE) + md1 = EnergyModel.deserialize(md0.serialize()).to(env.DEVICE) + args = [ + to_paddle_tensor(ii) for ii in [self.coord_ext, self.atype_ext, self.nlist] + ] + ret0 = md0.forward_common_lower(*args) + ret1 = md1.forward_common_lower(*args) + np.testing.assert_allclose( + to_numpy_array(ret0["energy"]), + to_numpy_array(ret1["energy"]), + atol=self.atol, + ) + np.testing.assert_allclose( + to_numpy_array(ret0["energy_redu"]), + to_numpy_array(ret1["energy_redu"]), + atol=self.atol, + ) + np.testing.assert_allclose( + to_numpy_array(ret0["energy_derv_r"]), + to_numpy_array(ret1["energy_derv_r"]), + atol=self.atol, + ) + 
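+        # the reduced virial (energy_derv_c_redu) must also survive the
+        # serialize()/deserialize() round trip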
np.testing.assert_allclose( + to_numpy_array(ret0["energy_derv_c_redu"]), + to_numpy_array(ret1["energy_derv_c_redu"]), + atol=self.atol, + ) + ret0 = md0.forward_common_lower(*args, do_atomic_virial=True) + ret1 = md1.forward_common_lower(*args, do_atomic_virial=True) + np.testing.assert_allclose( + to_numpy_array(ret0["energy_derv_c"]), + to_numpy_array(ret1["energy_derv_c"]), + atol=self.atol, + ) + + def test_dp_consistency(self): + nf, nloc, nnei = self.nlist.shape + ds = DPDescrptSeA( + self.rcut, + self.rcut_smth, + self.sel, + ) + ft = DPEnergyFittingNet( + self.nt, + ds.get_dim_out(), + mixed_types=ds.mixed_types(), + ) + type_map = ["foo", "bar"] + md0 = DPEnergyModel(ds, ft, type_map=type_map) + md1 = EnergyModel.deserialize(md0.serialize()).to(env.DEVICE) + args0 = [self.coord_ext, self.atype_ext, self.nlist] + args1 = [ + to_paddle_tensor(ii) for ii in [self.coord_ext, self.atype_ext, self.nlist] + ] + ret0 = md0.call_lower(*args0) + ret1 = md1.forward_common_lower(*args1) + np.testing.assert_allclose( + ret0["energy"], + to_numpy_array(ret1["energy"]), + atol=self.atol, + ) + np.testing.assert_allclose( + ret0["energy_redu"], + to_numpy_array(ret1["energy_redu"]), + atol=self.atol, + ) + + def test_prec_consistency(self): + rng = np.random.default_rng(GLOBAL_SEED) + nf, nloc, nnei = self.nlist.shape + ds = DPDescrptSeA( + self.rcut, + self.rcut_smth, + self.sel, + ) + ft = DPEnergyFittingNet( + self.nt, + ds.get_dim_out(), + mixed_types=ds.mixed_types(), + ) + nfp, nap = 2, 3 + type_map = ["foo", "bar"] + fparam = rng.normal(size=[self.nf, nfp]) + aparam = rng.normal(size=[self.nf, nloc, nap]) + + md0 = DPEnergyModel(ds, ft, type_map=type_map) + md1 = EnergyModel.deserialize(md0.serialize()).to(env.DEVICE) + + args64 = [ + to_paddle_tensor(ii) for ii in [self.coord_ext, self.atype_ext, self.nlist] + ] + args64[0] = args64[0].to(paddle.float64) + args32 = [ + to_paddle_tensor(ii) for ii in [self.coord_ext, self.atype_ext, self.nlist] + ] + args32[0] = args32[0].to(paddle.float32) + # fparam, aparam are converted to coordinate precision by model + fparam = to_paddle_tensor(fparam) + aparam = to_paddle_tensor(aparam) + + model_l_ret_64 = md1.forward_common_lower(*args64, fparam=fparam, aparam=aparam) + model_l_ret_32 = md1.forward_common_lower(*args32, fparam=fparam, aparam=aparam) + + for ii in model_l_ret_32.keys(): + if ii[-4:] == "redu": + self.assertEqual(model_l_ret_32[ii].dtype, paddle.float64) + else: + self.assertEqual(model_l_ret_32[ii].dtype, paddle.float32) + if ii != "mask": + self.assertEqual(model_l_ret_64[ii].dtype, paddle.float64) + else: + self.assertEqual(model_l_ret_64[ii].dtype, paddle.int32) + np.testing.assert_allclose( + to_numpy_array(model_l_ret_32[ii]), + to_numpy_array(model_l_ret_64[ii]), + atol=self.atol, + ) + + def test_jit(self): + nf, nloc, nnei = self.nlist.shape + ds = DescrptSeA( + self.rcut, + self.rcut_smth, + self.sel, + ).to(env.DEVICE) + ft = EnergyFittingNet( + self.nt, + ds.get_dim_out(), + mixed_types=ds.mixed_types(), + ).to(env.DEVICE) + type_map = ["foo", "bar"] + md0 = EnergyModel(ds, ft, type_map=type_map).to(env.DEVICE) + md0 = paddle.jit.to_static(md0) + md0.get_rcut() + md0.get_type_map() + + +class TestDPModelFormatNlist(unittest.TestCase): + def setUp(self): + # nloc == 3, nall == 4 + self.nloc = 3 + self.nall = 5 + self.nf, self.nt = 1, 2 + self.coord_ext = np.array( + [ + [0, 0, 0], + [0, 1, 0], + [0, 0, 1], + [0, -2, 0], + [2.3, 0, 0], + ], + dtype=np.float64, + ).reshape([1, self.nall * 3]) + # sel = [5, 2] + self.sel = 
[5, 2] + self.expected_nlist = np.array( + [ + [1, 3, -1, -1, -1, 2, -1], + [0, -1, -1, -1, -1, 2, -1], + [0, 1, -1, -1, -1, -1, -1], + ], + dtype="int64", + ).reshape([1, self.nloc, sum(self.sel)]) + self.atype_ext = np.array([0, 0, 1, 0, 1], dtype="int64").reshape( + [1, self.nall] + ) + self.rcut_smth = 0.4 + self.rcut = 2.0 + + nf, nloc, nnei = self.expected_nlist.shape + ds = DescrptSeA( + self.rcut, + self.rcut_smth, + self.sel, + ).to(env.DEVICE) + ft = EnergyFittingNet( + self.nt, + ds.get_dim_out(), + mixed_types=ds.mixed_types(), + ).to(env.DEVICE) + type_map = ["foo", "bar"] + self.md = EnergyModel(ds, ft, type_map=type_map).to(env.DEVICE) + + def test_nlist_eq(self): + # n_nnei == nnei + nlist = np.array( + [ + [1, 3, -1, -1, -1, 2, -1], + [0, -1, -1, -1, -1, 2, -1], + [0, 1, -1, -1, -1, -1, -1], + ], + dtype=np.int64, + ).reshape([1, self.nloc, -1]) + nlist1 = self.md.format_nlist( + to_paddle_tensor(self.coord_ext), + to_paddle_tensor(self.atype_ext), + to_paddle_tensor(nlist), + ) + np.testing.assert_equal(self.expected_nlist, to_numpy_array(nlist1)) + + def test_nlist_st(self): + # n_nnei < nnei + nlist = np.array( + [ + [1, 3, -1, 2], + [0, -1, -1, 2], + [0, 1, -1, -1], + ], + dtype=np.int64, + ).reshape([1, self.nloc, -1]) + nlist1 = self.md.format_nlist( + to_paddle_tensor(self.coord_ext), + to_paddle_tensor(self.atype_ext), + to_paddle_tensor(nlist), + ) + np.testing.assert_equal(self.expected_nlist, to_numpy_array(nlist1)) + + def test_nlist_lt(self): + # n_nnei > nnei + nlist = np.array( + [ + [1, 3, -1, -1, -1, 2, -1, -1, 4], + [0, -1, 4, -1, -1, 2, -1, 3, -1], + [0, 1, -1, -1, -1, 4, -1, -1, 3], + ], + dtype=np.int64, + ).reshape([1, self.nloc, -1]) + nlist1 = self.md.format_nlist( + to_paddle_tensor(self.coord_ext), + to_paddle_tensor(self.atype_ext), + to_paddle_tensor(nlist), + ) + np.testing.assert_equal(self.expected_nlist, to_numpy_array(nlist1)) + + +class TestEnergyModel(unittest.TestCase, TestCaseSingleFrameWithoutNlist): + def setUp(self): + TestCaseSingleFrameWithoutNlist.setUp(self) + + def test_self_consistency(self): + nf, nloc = self.atype.shape + ds = DescrptSeA( + self.rcut, + self.rcut_smth, + self.sel, + ).to(env.DEVICE) + ft = EnergyFittingNet( + self.nt, + ds.get_dim_out(), + mixed_types=ds.mixed_types(), + ).to(env.DEVICE) + type_map = ["foo", "bar"] + md0 = EnergyModel(ds, ft, type_map=type_map).to(env.DEVICE) + md1 = EnergyModel.deserialize(md0.serialize()).to(env.DEVICE) + args = [to_paddle_tensor(ii) for ii in [self.coord, self.atype, self.cell]] + ret0 = md0.forward(*args) + ret1 = md1.forward(*args) + np.testing.assert_allclose( + to_numpy_array(ret0["atom_energy"]), + to_numpy_array(ret1["atom_energy"]), + atol=self.atol, + ) + np.testing.assert_allclose( + to_numpy_array(ret0["energy"]), + to_numpy_array(ret1["energy"]), + atol=self.atol, + ) + np.testing.assert_allclose( + to_numpy_array(ret0["force"]), + to_numpy_array(ret1["force"]), + atol=self.atol, + ) + np.testing.assert_allclose( + to_numpy_array(ret0["virial"]), + to_numpy_array(ret1["virial"]), + atol=self.atol, + ) + ret0 = md0.forward(*args, do_atomic_virial=True) + ret1 = md1.forward(*args, do_atomic_virial=True) + np.testing.assert_allclose( + to_numpy_array(ret0["atom_virial"]), + to_numpy_array(ret1["atom_virial"]), + atol=self.atol, + ) + coord_ext, atype_ext, mapping, nlist = extend_input_and_build_neighbor_list( + to_paddle_tensor(self.coord), + to_paddle_tensor(self.atype), + self.rcut, + self.sel, + mixed_types=md0.mixed_types(), + box=to_paddle_tensor(self.cell), + 
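+            # mixed_types() of the model decides whether the built neighbor
+            # list is grouped per neighbor type or kept mixed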
) + args = [coord_ext, atype_ext, nlist] + ret2 = md0.forward_lower(*args, do_atomic_virial=True) + # check the consistency between the reduced virial from + # forward and forward_lower + np.testing.assert_allclose( + to_numpy_array(ret0["virial"]), + to_numpy_array(ret2["virial"]), + atol=self.atol, + ) + + +class TestEnergyModelLower(unittest.TestCase, TestCaseSingleFrameWithNlist): + def setUp(self): + TestCaseSingleFrameWithNlist.setUp(self) + + def test_self_consistency(self): + nf, nloc, nnei = self.nlist.shape + ds = DescrptSeA( + self.rcut, + self.rcut_smth, + self.sel, + ).to(env.DEVICE) + ft = EnergyFittingNet( + self.nt, + ds.get_dim_out(), + mixed_types=ds.mixed_types(), + ).to(env.DEVICE) + type_map = ["foo", "bar"] + md0 = EnergyModel(ds, ft, type_map=type_map).to(env.DEVICE) + md1 = EnergyModel.deserialize(md0.serialize()).to(env.DEVICE) + args = [ + to_paddle_tensor(ii) for ii in [self.coord_ext, self.atype_ext, self.nlist] + ] + ret0 = md0.forward_lower(*args) + ret1 = md1.forward_lower(*args) + np.testing.assert_allclose( + to_numpy_array(ret0["atom_energy"]), + to_numpy_array(ret1["atom_energy"]), + atol=self.atol, + ) + np.testing.assert_allclose( + to_numpy_array(ret0["energy"]), + to_numpy_array(ret1["energy"]), + atol=self.atol, + ) + np.testing.assert_allclose( + to_numpy_array(ret0["extended_force"]), + to_numpy_array(ret1["extended_force"]), + atol=self.atol, + ) + np.testing.assert_allclose( + to_numpy_array(ret0["virial"]), + to_numpy_array(ret1["virial"]), + atol=self.atol, + ) + ret0 = md0.forward_lower(*args, do_atomic_virial=True) + ret1 = md1.forward_lower(*args, do_atomic_virial=True) + np.testing.assert_allclose( + to_numpy_array(ret0["extended_virial"]), + to_numpy_array(ret1["extended_virial"]), + atol=self.atol, + ) + + def test_jit(self): + nf, nloc, nnei = self.nlist.shape + ds = DescrptSeA( + self.rcut, + self.rcut_smth, + self.sel, + ).to(env.DEVICE) + ft = EnergyFittingNet( + self.nt, + ds.get_dim_out(), + mixed_types=ds.mixed_types(), + ).to(env.DEVICE) + type_map = ["foo", "bar"] + md0 = EnergyModel(ds, ft, type_map=type_map).to(env.DEVICE) + md0 = paddle.jit.to_static(md0) + self.assertEqual(md0.get_rcut(), self.rcut) + self.assertEqual(md0.get_type_map(), type_map) diff --git a/source/tests/pd/model/test_embedding_net.py b/source/tests/pd/model/test_embedding_net.py new file mode 100644 index 0000000000..2dcc9f821b --- /dev/null +++ b/source/tests/pd/model/test_embedding_net.py @@ -0,0 +1,217 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +import json +import os +import re +import unittest + +import numpy as np +import paddle +import tensorflow.compat.v1 as tf + +from deepmd.pd.utils import ( + env, +) + +tf.disable_eager_execution() + +from pathlib import ( + Path, +) + +from deepmd.pd.model.descriptor import ( + DescrptSeA, +) +from deepmd.pd.utils import ( + dp_random, +) +from deepmd.pd.utils.dataset import ( + DeepmdDataSetForLoader, +) +from deepmd.pd.utils.env import ( + DEVICE, + GLOBAL_NP_FLOAT_PRECISION, +) +from deepmd.pd.utils.nlist import ( + extend_input_and_build_neighbor_list, +) +from deepmd.tf.common import ( + expand_sys_str, +) +from deepmd.tf.descriptor import DescrptSeA as DescrptSeA_tf + +from ..test_finetune import ( + energy_data_requirement, +) + +CUR_DIR = os.path.dirname(__file__) + + +def gen_key(worb, depth, elemid): + return (worb, depth, elemid) + + +def get_single_batch(dataset, index=None): + if index is None: + index = dp_random.choice(np.arange(len(dataset))) + np_batch = dataset[index] + pd_batch = {} + + 
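+    # promote the single frame to a batch of size 1 and mirror it onto the
+    # Paddle device; a minimal sketch for one key (names as in the loop below):
+    #   np_batch["coord"] = np.expand_dims(np_batch["coord"], axis=0)
+    #   pd_batch["coord"] = paddle.to_tensor(np_batch["coord"]).to(device=env.DEVICE)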
for key in [ + "coord", + "box", + "force", + "force_mag", + "energy", + "virial", + "atype", + "natoms", + ]: + if key in np_batch.keys(): + np_batch[key] = np.expand_dims(np_batch[key], axis=0) + pd_batch[key] = paddle.to_tensor(np_batch[key]).to(device=env.DEVICE) + if key in ["coord", "force", "force_mag"]: + np_batch[key] = np_batch[key].reshape(1, -1) + np_batch["natoms"] = np_batch["natoms"][0] + return np_batch, pd_batch + + +def base_se_a(descriptor, coord, atype, natoms, box): + g = tf.Graph() + with g.as_default(): + name_pfx = "d_sea_" + t_coord = tf.placeholder( + GLOBAL_NP_FLOAT_PRECISION, [None, None], name=name_pfx + "t_coord" + ) + t_atype = tf.placeholder(tf.int32, [None, None], name=name_pfx + "t_type") + t_natoms = tf.placeholder( + tf.int32, [descriptor.ntypes + 2], name=name_pfx + "t_natoms" + ) + t_box = tf.placeholder( + GLOBAL_NP_FLOAT_PRECISION, [None, None], name=name_pfx + "t_box" + ) + t_default_mesh = tf.placeholder(tf.int32, [None], name=name_pfx + "t_mesh") + t_embedding = descriptor.build( + t_coord, t_atype, t_natoms, t_box, t_default_mesh, input_dict={} + ) + fake_energy = tf.reduce_sum(t_embedding) + t_force = descriptor.prod_force_virial(fake_energy, t_natoms)[0] + t_vars = {} + for var in tf.global_variables(): + ms = re.findall(r"([a-z]+)_(\d)_(\d)", var.name) + if len(ms) == 1: + m = ms[0] + key = gen_key(worb=m[0], depth=int(m[1]), elemid=int(m[2])) + t_vars[key] = var + init_op = tf.global_variables_initializer() + + with tf.Session(graph=g) as sess: + sess.run(init_op) + embedding, force, values = sess.run( + [t_embedding, t_force, t_vars], + feed_dict={ + t_coord: coord, + t_atype: atype, + t_natoms: natoms, + t_box: box, + t_default_mesh: np.array([0, 0, 0, 2, 2, 2]), + }, + ) + tf.reset_default_graph() + return embedding, force, values + + +class TestSeA(unittest.TestCase): + def setUp(self): + dp_random.seed(0) + with open(str(Path(__file__).parent / "water/se_e2_a.json")) as fin: + content = fin.read() + config = json.loads(content) + data_file = [str(Path(__file__).parent / "water/data/data_0")] + config["training"]["training_data"]["systems"] = data_file + config["training"]["validation_data"]["systems"] = data_file + model_config = config["model"] + self.rcut = model_config["descriptor"]["rcut"] + self.rcut_smth = model_config["descriptor"]["rcut_smth"] + self.sel = model_config["descriptor"]["sel"] + self.bsz = config["training"]["training_data"]["batch_size"] + self.systems = config["training"]["validation_data"]["systems"] + if isinstance(self.systems, str): + self.systems = expand_sys_str(self.systems) + ds = DeepmdDataSetForLoader( + self.systems[0], + model_config["type_map"], + ) + ds.add_data_requirement(energy_data_requirement) + self.filter_neuron = model_config["descriptor"]["neuron"] + self.axis_neuron = model_config["descriptor"]["axis_neuron"] + self.np_batch, self.paddle_batch = get_single_batch(ds) + + def test_consistency(self): + dp_d = DescrptSeA_tf( + rcut=self.rcut, + rcut_smth=self.rcut_smth, + sel=self.sel, + neuron=self.filter_neuron, + axis_neuron=self.axis_neuron, + seed=1, + ) + dp_embedding, dp_force, dp_vars = base_se_a( + descriptor=dp_d, + coord=self.np_batch["coord"], + atype=self.np_batch["atype"], + natoms=self.np_batch["natoms"], + box=self.np_batch["box"], + ) + + # Reproduced + descriptor = DescrptSeA( + self.rcut, + self.rcut_smth, + self.sel, + neuron=self.filter_neuron, + axis_neuron=self.axis_neuron, + ).to(DEVICE) + for name, param in descriptor.named_parameters(): + ms = 
re.findall(r"(\d)\.layers\.(\d)\.([a-z]+)", name) + if len(ms) == 1: + m = ms[0] + key = gen_key(worb=m[2], depth=int(m[1]) + 1, elemid=int(m[0])) + var = dp_vars[key] + with paddle.no_grad(): + # Keep parameter value consistency between 2 implentations + paddle.assign(var, param) + + pd_coord = self.paddle_batch["coord"].to(env.DEVICE) + pd_coord.stop_gradient = False + + ( + extended_coord, + extended_atype, + mapping, + nlist, + ) = extend_input_and_build_neighbor_list( + pd_coord, + self.paddle_batch["atype"].to(env.DEVICE), + self.rcut, + self.sel, + mixed_types=False, + box=self.paddle_batch["box"].to(env.DEVICE), + ) + descriptor_out, _, _, _, _ = descriptor( + extended_coord, + extended_atype, + nlist, + ) + my_embedding = descriptor_out.cpu().detach().numpy() + fake_energy = paddle.sum(descriptor_out) + fake_energy.backward() + my_force = -pd_coord.grad.cpu().numpy() + + # Check + np.testing.assert_allclose(dp_embedding, my_embedding) + dp_force = dp_force.reshape(*my_force.shape) + np.testing.assert_allclose(dp_force, my_force) + + +if __name__ == "__main__": + unittest.main() diff --git a/source/tests/pd/model/test_ener_fitting.py b/source/tests/pd/model/test_ener_fitting.py new file mode 100644 index 0000000000..dd13f139dc --- /dev/null +++ b/source/tests/pd/model/test_ener_fitting.py @@ -0,0 +1,150 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +import itertools +import unittest + +import numpy as np +import paddle + +from deepmd.dpmodel.fitting import InvarFitting as DPInvarFitting +from deepmd.pd.model.descriptor.se_a import ( + DescrptSeA, +) +from deepmd.pd.model.task.ener import ( + InvarFitting, +) +from deepmd.pd.utils import ( + env, +) +from deepmd.pd.utils.utils import ( + to_numpy_array, +) + +from ...seed import ( + GLOBAL_SEED, +) +from .test_env_mat import ( + TestCaseSingleFrameWithNlist, +) + +dtype = env.GLOBAL_PD_FLOAT_PRECISION + + +class TestInvarFitting(unittest.TestCase, TestCaseSingleFrameWithNlist): + def setUp(self): + TestCaseSingleFrameWithNlist.setUp(self) + + def test_consistency( + self, + ): + rng = np.random.default_rng(GLOBAL_SEED) + nf, nloc, nnei = self.nlist.shape + dd0 = DescrptSeA(self.rcut, self.rcut_smth, self.sel).to(env.DEVICE) + rd0, _, _, _, _ = dd0( + paddle.to_tensor(self.coord_ext, dtype=dtype).to(device=env.DEVICE), + paddle.to_tensor(self.atype_ext, dtype="int64").to(device=env.DEVICE), + paddle.to_tensor(self.nlist, dtype="int64").to(device=env.DEVICE), + ) + atype = paddle.to_tensor(self.atype_ext[:, :nloc], dtype="int64").to( + device=env.DEVICE + ) + + for od, mixed_types, nfp, nap, et, nn in itertools.product( + [1, 3], + [True, False], + [0, 3], + [0, 4], + [[], [0], [1]], + [[4, 4, 4], []], + ): + ft0 = InvarFitting( + "foo", + self.nt, + dd0.dim_out, + od, + numb_fparam=nfp, + numb_aparam=nap, + mixed_types=mixed_types, + exclude_types=et, + neuron=nn, + seed=GLOBAL_SEED, + ).to(env.DEVICE) + ft1 = DPInvarFitting.deserialize(ft0.serialize()) + ft2 = InvarFitting.deserialize(ft0.serialize()) + + if nfp > 0: + ifp = paddle.to_tensor( + rng.normal(size=(self.nf, nfp)), dtype=dtype, place=env.DEVICE + ) + else: + ifp = None + if nap > 0: + iap = paddle.to_tensor( + rng.normal(size=(self.nf, self.nloc, nap)), + dtype=dtype, + place=env.DEVICE, + ) + else: + iap = None + + ret0 = ft0(rd0, atype, fparam=ifp, aparam=iap) + ret1 = ft1( + rd0.detach().cpu().numpy(), + atype.detach().cpu().numpy(), + fparam=to_numpy_array(ifp), + aparam=to_numpy_array(iap), + ) + ret2 = ft2(rd0, atype, fparam=ifp, aparam=iap) + 
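+            # ret0: Paddle fitting, ret1: NumPy (dpmodel) reference, ret2: Paddle
+            # fitting rebuilt from its own serialization; all three must match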
np.testing.assert_allclose( + to_numpy_array(ret0["foo"]), + ret1["foo"], + ) + np.testing.assert_allclose( + to_numpy_array(ret0["foo"]), + to_numpy_array(ret2["foo"]), + ) + self.assertEqual(ft0.get_sel_type(), ft1.get_sel_type()) + + def test_jit( + self, + ): + for od, mixed_types, nfp, nap, et in itertools.product( + [1, 3], + [True, False], + [0, 3], + [0, 4], + [[], [0]], + ): + ft0 = InvarFitting( + "foo", + self.nt, + 9, + od, + numb_fparam=nfp, + numb_aparam=nap, + mixed_types=mixed_types, + exclude_types=et, + seed=GLOBAL_SEED, + ).to(env.DEVICE) + paddle.jit.to_static(ft0) + + def test_get_set(self): + ifn0 = InvarFitting( + "energy", + self.nt, + 3, + 1, + seed=GLOBAL_SEED, + ) + rng = np.random.default_rng(GLOBAL_SEED) + foo = rng.normal([3, 4]) + for ii in [ + "bias_atom_e", + "fparam_avg", + "fparam_inv_std", + "aparam_avg", + "aparam_inv_std", + ]: + ifn0[ii] = paddle.to_tensor(foo, dtype=dtype).to(device=env.DEVICE) + np.testing.assert_allclose( + foo, np.reshape(ifn0[ii].detach().cpu().numpy(), foo.shape) + ) diff --git a/source/tests/pd/model/test_env_mat.py b/source/tests/pd/model/test_env_mat.py new file mode 100644 index 0000000000..7cbc698264 --- /dev/null +++ b/source/tests/pd/model/test_env_mat.py @@ -0,0 +1,187 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +import unittest + +import numpy as np +import paddle + +from deepmd.dpmodel.utils import ( + EnvMat, +) +from deepmd.pd.model.descriptor.env_mat import ( + prod_env_mat, +) +from deepmd.pd.utils import ( + env, +) + +from ...seed import ( + GLOBAL_SEED, +) + +dtype = env.GLOBAL_PD_FLOAT_PRECISION + + +class TestCaseSingleFrameWithNlist: + def setUp(self): + # nloc == 3, nall == 4 + self.nloc = 3 + self.nall = 4 + self.nf, self.nt = 2, 2 + self.coord_ext = np.array( + [ + [0, 0, 0], + [0, 1, 0], + [0, 0, 1], + [0, -2, 0], + ], + dtype=np.float64, + ).reshape([1, self.nall, 3]) + self.atype_ext = np.array([0, 0, 1, 0], dtype="int64").reshape([1, self.nall]) + self.mapping = np.array([0, 1, 2, 0], dtype="int64").reshape([1, self.nall]) + # sel = [5, 2] + self.sel = [5, 2] + self.sel_mix = [7] + self.natoms = [3, 3, 2, 1] + self.nlist = np.array( + [ + [1, 3, -1, -1, -1, 2, -1], + [0, -1, -1, -1, -1, 2, -1], + [0, 1, -1, -1, -1, -1, -1], + ], + dtype="int64", + ).reshape([1, self.nloc, sum(self.sel)]) + self.rcut = 2.2 + self.rcut_smth = 0.4 + # permutations + self.perm = np.array([2, 0, 1, 3], dtype=np.int32) + inv_perm = np.array([1, 2, 0, 3], dtype=np.int32) + # permute the coord and atype + self.coord_ext = np.concatenate( + [self.coord_ext, self.coord_ext[:, self.perm, :]], axis=0 + ).reshape(self.nf, self.nall * 3) + self.atype_ext = np.concatenate( + [self.atype_ext, self.atype_ext[:, self.perm]], axis=0 + ) + self.mapping = np.concatenate( + [self.mapping, self.mapping[:, self.perm]], axis=0 + ) + + # permute the nlist + nlist1 = self.nlist[:, self.perm[: self.nloc], :] + mask = nlist1 == -1 + nlist1 = inv_perm[nlist1] + nlist1 = np.where(mask, -1, nlist1) + self.nlist = np.concatenate([self.nlist, nlist1], axis=0) + self.atol = 1e-12 + + +class TestCaseSingleFrameWithNlistWithVirtual: + def setUp(self): + # nloc == 3, nall == 4 + self.nloc = 4 + self.nall = 5 + self.nf, self.nt = 2, 2 + self.coord_ext = np.array( + [ + [0, 0, 0], + [0, 0, 0], + [0, 1, 0], + [0, 0, 1], + [0, -2, 0], + ], + dtype=np.float64, + ).reshape([1, self.nall, 3]) + self.atype_ext = np.array([0, -1, 0, 1, 0], dtype="int64").reshape( + [1, self.nall] + ) + # sel = [5, 2] + self.sel = [5, 2] + self.sel_mix = [7] + self.natoms = [3, 
3, 2, 1] + self.nlist = np.array( + [ + [2, 4, -1, -1, -1, 3, -1], + [-1, -1, -1, -1, -1, -1, -1], + [0, -1, -1, -1, -1, 3, -1], + [0, 2, -1, -1, -1, -1, -1], + ], + dtype="int64", + ).reshape([1, self.nloc, sum(self.sel)]) + self.rcut = 2.2 + self.rcut_smth = 0.4 + # permutations + self.perm = np.array([3, 0, 1, 2, 4], dtype=np.int32) + inv_perm = np.argsort(self.perm) + # permute the coord and atype + self.coord_ext = np.concatenate( + [self.coord_ext, self.coord_ext[:, self.perm, :]], axis=0 + ).reshape(self.nf, self.nall * 3) + self.atype_ext = np.concatenate( + [self.atype_ext, self.atype_ext[:, self.perm]], axis=0 + ) + # permute the nlist + nlist1 = self.nlist[:, self.perm[: self.nloc], :] + mask = nlist1 == -1 + nlist1 = inv_perm[nlist1] + nlist1 = np.where(mask, -1, nlist1) + self.nlist = np.concatenate([self.nlist, nlist1], axis=0) + self.get_real_mapping = np.array([[0, 2, 3], [0, 1, 3]], dtype=np.int32) + self.atol = 1e-12 + + +class TestCaseSingleFrameWithoutNlist: + def setUp(self): + # nloc == 3, nall == 4 + self.nloc = 3 + self.nf, self.nt = 1, 2 + self.coord = np.array( + [ + [0, 0, 0], + [0, 1, 0], + [0, 0, 1], + ], + dtype=np.float64, + ).reshape([1, self.nloc * 3]) + self.atype = np.array([0, 0, 1], dtype="int64").reshape([1, self.nloc]) + self.cell = 2.0 * np.eye(3).reshape([1, 9]) + # sel = [5, 2] + self.sel = [16, 8] + self.sel_mix = [24] + self.natoms = [3, 3, 2, 1] + self.rcut = 2.2 + self.rcut_smth = 0.4 + self.atol = 1e-12 + + +# to be merged with the tf test case +class TestEnvMat(unittest.TestCase, TestCaseSingleFrameWithNlist): + def setUp(self): + TestCaseSingleFrameWithNlist.setUp(self) + + def test_consistency( + self, + ): + rng = np.random.default_rng(GLOBAL_SEED) + nf, nloc, nnei = self.nlist.shape + davg = rng.normal(size=(self.nt, nnei, 4)) + dstd = rng.normal(size=(self.nt, nnei, 4)) + dstd = 0.1 + np.abs(dstd) + em0 = EnvMat(self.rcut, self.rcut_smth) + mm0, diff0, ww0 = em0.call( + self.coord_ext, self.atype_ext, self.nlist, davg, dstd + ) + mm1, diff1, ww1 = prod_env_mat( + paddle.to_tensor(self.coord_ext, dtype=dtype).to(device=env.DEVICE), + paddle.to_tensor(self.nlist, dtype="int64").to(device=env.DEVICE), + paddle.to_tensor(self.atype_ext[:, :nloc], dtype="int64").to( + device=env.DEVICE + ), + paddle.to_tensor(davg).to(device=env.DEVICE), + paddle.to_tensor(dstd).to(device=env.DEVICE), + self.rcut, + self.rcut_smth, + ) + np.testing.assert_allclose(mm0, mm1.detach().cpu().numpy()) + np.testing.assert_allclose(diff0, diff1.detach().cpu().numpy()) + np.testing.assert_allclose(ww0, ww1.detach().cpu().numpy()) + np.testing.assert_allclose(mm0[0][self.perm[: self.nloc]], mm0[1]) diff --git a/source/tests/pd/model/test_exclusion_mask.py b/source/tests/pd/model/test_exclusion_mask.py new file mode 100644 index 0000000000..ff479ee7db --- /dev/null +++ b/source/tests/pd/model/test_exclusion_mask.py @@ -0,0 +1,70 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +import unittest + +import numpy as np + +from deepmd.pd.utils import ( + env, +) +from deepmd.pd.utils.exclude_mask import ( + AtomExcludeMask, + PairExcludeMask, +) +from deepmd.pd.utils.utils import ( + to_numpy_array, + to_paddle_tensor, +) + +from .test_env_mat import ( + TestCaseSingleFrameWithNlist, +) + +dtype = env.GLOBAL_PD_FLOAT_PRECISION + + +class TestAtomExcludeMask(unittest.TestCase): + def test_build_type_exclude_mask(self): + nf = 2 + nt = 3 + exclude_types = [0, 2] + atype = np.array( + [ + [0, 2, 1, 2, 0, 1, 0], + [1, 2, 0, 0, 2, 2, 1], + ], + dtype=np.int32, + ).reshape([nf, 
-1]) + expected_mask = np.array( + [ + [0, 0, 1, 0, 0, 1, 0], + [1, 0, 0, 0, 0, 0, 1], + ] + ).reshape([nf, -1]) + des = AtomExcludeMask(nt, exclude_types=exclude_types) + mask = des(to_paddle_tensor(atype)) + np.testing.assert_equal(to_numpy_array(mask), expected_mask) + + +# to be merged with the tf test case +class TestPairExcludeMask(unittest.TestCase, TestCaseSingleFrameWithNlist): + def setUp(self): + TestCaseSingleFrameWithNlist.setUp(self) + + def test_build_type_exclude_mask(self): + exclude_types = [[0, 1]] + expected_mask = np.array( + [ + [1, 1, 1, 1, 1, 0, 1], + [1, 1, 1, 1, 1, 0, 1], + [0, 0, 1, 1, 1, 1, 1], + [0, 0, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 0, 1], + [1, 1, 1, 1, 1, 0, 1], + ] + ).reshape(self.nf, self.nloc, sum(self.sel)) + des = PairExcludeMask(self.nt, exclude_types=exclude_types).to(env.DEVICE) + mask = des( + to_paddle_tensor(self.nlist), + to_paddle_tensor(self.atype_ext), + ) + np.testing.assert_equal(to_numpy_array(mask), expected_mask) diff --git a/source/tests/pd/model/test_fitting_net.py b/source/tests/pd/model/test_fitting_net.py new file mode 100644 index 0000000000..9a4d4d128f --- /dev/null +++ b/source/tests/pd/model/test_fitting_net.py @@ -0,0 +1,148 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +import re +import unittest + +import numpy as np +import paddle +import tensorflow.compat.v1 as tf + +tf.disable_eager_execution() + +from deepmd.pd.model.task import ( + EnergyFittingNet, +) +from deepmd.pd.utils import ( + env, +) +from deepmd.pd.utils.env import ( + GLOBAL_NP_FLOAT_PRECISION, +) +from deepmd.tf.fit.ener import ( + EnerFitting, +) + +from ...seed import ( + GLOBAL_SEED, +) + + +class FakeDescriptor: + def __init__(self, ntypes, embedding_width): + self._ntypes = ntypes + self._dim_out = embedding_width + + def get_ntypes(self): + return self._ntypes + + def get_dim_out(self): + return self._dim_out + + +def gen_key(type_id, layer_id, w_or_b): + return (type_id, layer_id, w_or_b) + + +def base_fitting_net(dp_fn, embedding, natoms, atype): + g = tf.Graph() + with g.as_default(): + t_embedding = tf.placeholder(GLOBAL_NP_FLOAT_PRECISION, [None, None]) + t_natoms = tf.placeholder(tf.int32, [None]) + t_atype = tf.placeholder(tf.int32, [None, None]) + t_energy = dp_fn.build(t_embedding, t_natoms, {"atype": t_atype}) + init_op = tf.global_variables_initializer() + t_vars = {} + for var in tf.global_variables(): + key = None + matched = re.match(r"layer_(\d)_type_(\d)/([a-z]+)", var.name) + if matched: + key = gen_key( + type_id=matched.group(2), + layer_id=matched.group(1), + w_or_b=matched.group(3), + ) + else: + matched = re.match(r"final_layer_type_(\d)/([a-z]+)", var.name) + if matched: + key = gen_key( + type_id=matched.group(1), layer_id=-1, w_or_b=matched.group(2) + ) + if key is not None: + t_vars[key] = var + + with tf.Session(graph=g) as sess: + sess.run(init_op) + energy, values = sess.run( + [t_energy, t_vars], + feed_dict={ + t_embedding: embedding, + t_natoms: natoms, + t_atype: atype, + }, + ) + tf.reset_default_graph() + return energy, values + + +class TestFittingNet(unittest.TestCase): + def setUp(self): + nloc = 7 + self.embedding_width = 30 + self.natoms = np.array([nloc, nloc, 2, 5], dtype=np.int32) + rng = np.random.default_rng(GLOBAL_SEED) + self.embedding = rng.uniform(size=[4, nloc * self.embedding_width]) + self.ntypes = self.natoms.size - 2 + self.n_neuron = [32, 32, 32] + self.atype = np.zeros([4, nloc], dtype=np.int32) + cnt = 0 + for i in range(self.ntypes): + self.atype[:, cnt : cnt + self.natoms[i + 2]] = i + cnt 
+= self.natoms[i + 2] + + fake_d = FakeDescriptor(2, 30) + self.dp_fn = EnerFitting( + fake_d.get_ntypes(), fake_d.get_dim_out(), self.n_neuron + ) + self.dp_fn.bias_atom_e = rng.uniform(size=[self.ntypes]) + + def test_consistency(self): + dp_energy, values = base_fitting_net( + self.dp_fn, self.embedding, self.natoms, self.atype + ) + my_fn = EnergyFittingNet( + self.ntypes, + self.embedding_width, + neuron=self.n_neuron, + bias_atom_e=self.dp_fn.bias_atom_e, + mixed_types=False, + ).to(env.DEVICE) + for name, param in my_fn.named_parameters(): + matched = re.match( + r"filter_layers\.networks\.(\d).layers\.(\d)\.([a-z]+)", name + ) + key = None + if matched: + if int(matched.group(2)) == len(self.n_neuron): + layer_id = -1 + else: + layer_id = matched.group(2) + key = gen_key( + type_id=matched.group(1), + layer_id=layer_id, + w_or_b=matched.group(3), + ) + assert key is not None + var = values[key] + with paddle.no_grad(): + # Keep parameter value consistency between 2 implentations + paddle.assign(var, param) + embedding = paddle.to_tensor(self.embedding) + embedding = embedding.reshape([4, -1, self.embedding_width]) + atype = paddle.to_tensor(self.atype) + ret = my_fn(embedding.to(env.DEVICE), atype.to(env.DEVICE)) + my_energy = ret["energy"] + my_energy = my_energy.detach().cpu() + np.testing.assert_allclose(dp_energy, my_energy.numpy().reshape([-1])) + + +if __name__ == "__main__": + unittest.main() diff --git a/source/tests/pd/model/test_force_grad.py b/source/tests/pd/model/test_force_grad.py new file mode 100644 index 0000000000..d7b569ef38 --- /dev/null +++ b/source/tests/pd/model/test_force_grad.py @@ -0,0 +1,111 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +import copy +import json +import unittest +from pathlib import ( + Path, +) +from typing import ( + Optional, +) + +import numpy as np +import paddle + +from deepmd.pd.model.model import ( + get_model, +) +from deepmd.pd.utils import ( + env, +) +from deepmd.utils.data import ( + DeepmdData, +) + +from ...seed import ( + GLOBAL_SEED, +) + + +class CheckSymmetry(DeepmdData): + def __init__( + self, + sys_path: str, + type_map: Optional[list[str]] = None, + ): + super().__init__(sys_path=sys_path, type_map=type_map) + self.add("energy", 1, atomic=False, must=False, high_prec=True) + self.add("force", 3, atomic=True, must=False, high_prec=False) + self.add("virial", 9, atomic=False, must=False, high_prec=False) + + def get_disturb(self, index, atom_index, axis_index, delta): + for i in range( + 0, len(self.dirs) + 1 + ): # note: if different sets can be merged, prefix sum is unused to calculate + if index < self.prefix_sum[i]: + break + frames = self._load_set(self.dirs[i - 1]) + tmp = copy.deepcopy(frames["coord"].reshape(self.nframes, -1, 3)) + tmp[:, atom_index, axis_index] += delta + frames["coord"] = tmp + frame = self._get_subdata(frames, index - self.prefix_sum[i - 1]) + frame = self.reformat_data_torch(frame) + return frame + + +def get_data(batch): + inputs = {} + for key in ["coord", "atype", "box"]: + inputs[key] = batch[key].unsqueeze(0).to(env.DEVICE) + return inputs + + +class TestForceGrad(unittest.TestCase): + def setUp(self): + with open(str(Path(__file__).parent / "water/se_e2_a.json")) as fin: + self.config = json.load(fin) + data_file = [str(Path(__file__).parent / "water/data/data_0")] + self.config["training"]["training_data"]["systems"] = data_file + self.config["training"]["validation_data"]["systems"] = data_file + self.system_index = 0 + self.batch_index = 0 + self.get_dataset(self.system_index, 
self.batch_index) + self.get_model() + + def get_model(self): + self.model = get_model(self.config["model"]).to(env.DEVICE) + + def get_dataset(self, system_index=0, batch_index=0): + systems = self.config["training"]["training_data"]["systems"] + rcut = self.config["model"]["descriptor"]["rcut"] + sel = self.config["model"]["descriptor"]["sel"] + sec = paddle.cumsum(paddle.to_tensor(sel), axis=0) + type_map = self.config["model"]["type_map"] + self.dpdatasystem = CheckSymmetry( + sys_path=systems[system_index], type_map=type_map + ) + self.origin_batch = self.dpdatasystem.get_item_paddle(batch_index) + + @unittest.skip("it can be replaced by autodiff") + def test_force_grad(self, threshold=1e-2, delta0=1e-6, seed=20): + rng = np.random.default_rng(GLOBAL_SEED) + result0 = self.model(**get_data(self.origin_batch)) + np.random.default_rng(seed) + errors = np.zeros((self.dpdatasystem.natoms, 3)) + for atom_index in range(self.dpdatasystem.natoms): + for axis_index in range(3): + delta = rng.random() * delta0 + disturb_batch = self.dpdatasystem.get_disturb( + self.batch_index, atom_index, axis_index, delta + ) + disturb_result = self.model(**get_data(disturb_batch)) + disturb_force = -(disturb_result["energy"] - result0["energy"]) / delta + disturb_error = ( + result0["force"][0, atom_index, axis_index] - disturb_force + ) + errors[atom_index, axis_index] = disturb_error.detach().cpu().numpy() + self.assertTrue(np.abs(errors).max() < threshold, msg=str(np.abs(errors).max())) + + +if __name__ == "__main__": + unittest.main() diff --git a/source/tests/pd/model/test_forward_lower.py b/source/tests/pd/model/test_forward_lower.py new file mode 100644 index 0000000000..ac8d0f54fc --- /dev/null +++ b/source/tests/pd/model/test_forward_lower.py @@ -0,0 +1,208 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +import copy +import unittest + +import numpy as np +import paddle + +from deepmd.pd.model.model import ( + get_model, +) +from deepmd.pd.utils import ( + decomp, + env, +) +from deepmd.pd.utils.nlist import ( + extend_input_and_build_neighbor_list, +) + +from ...seed import ( + GLOBAL_SEED, +) +from ..common import ( + eval_model, +) +from .test_permutation import ( # model_dpau, + model_dpa1, + model_dpa2, + model_se_e2_a, + model_spin, + model_zbl, +) + +dtype = paddle.float64 + + +def reduce_tensor(extended_tensor, mapping, nloc: int): + nframes, nall = extended_tensor.shape[:2] + ext_dims = extended_tensor.shape[2:] + reduced_tensor = paddle.zeros( + [nframes, nloc, *ext_dims], + dtype=extended_tensor.dtype, + ).to(device=extended_tensor.place) + mldims = list(mapping.shape) + mapping = mapping.reshape(mldims + [1] * len(ext_dims)).expand( + [-1] * len(mldims) + list(ext_dims) + ) + # nf x nloc x (*ext_dims) + reduced_tensor = decomp.scatter_reduce( + reduced_tensor, + 1, + index=mapping, + src=extended_tensor, + reduce="sum", + ) + return reduced_tensor + + +class ForwardLowerTest: + def test( + self, + ): + prec = self.prec + natoms = 5 + cell = 4.0 * paddle.eye(3, dtype=dtype).to(device=env.DEVICE) + generator = paddle.seed(GLOBAL_SEED) + coord = 3.0 * paddle.rand([natoms, 3], dtype=dtype).to(device=env.DEVICE) + spin = 0.5 * paddle.rand([natoms, 3], dtype=dtype).to(device=env.DEVICE) + atype = paddle.to_tensor([0, 0, 0, 1, 1], dtype=paddle.int64).to( + device=env.DEVICE + ) + test_spin = getattr(self, "test_spin", False) + if not test_spin: + test_keys = ["energy", "force", "virial"] + else: + test_keys = ["energy", "force", "force_mag"] + + result_forward = eval_model( + self.model, + 
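+            # the high-level forward; compared below against forward_lower
+            # run on an explicitly built neighbor list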
coord.unsqueeze(0), + cell.unsqueeze(0), + atype, + spins=spin.unsqueeze(0), + ) + ( + extended_coord, + extended_atype, + mapping, + nlist, + ) = extend_input_and_build_neighbor_list( + coord.unsqueeze(0), + atype.unsqueeze(0), + self.model.get_rcut() + 1.0 + if test_spin + else self.model.get_rcut(), # buffer region for spin nlist + self.model.get_sel(), + mixed_types=self.model.mixed_types(), + box=cell.unsqueeze(0), + ) + extended_spin = decomp.take_along_axis( + spin.unsqueeze(0), indices=mapping.unsqueeze(-1).tile((1, 1, 3)), axis=1 + ) + input_dict = { + "extended_coord": extended_coord, + "extended_atype": extended_atype, + "nlist": nlist, + "mapping": mapping, + "do_atomic_virial": False, + } + if test_spin: + input_dict["extended_spin"] = extended_spin + result_forward_lower = self.model.forward_lower(**input_dict) + for key in test_keys: + if key in ["energy"]: + np.testing.assert_allclose( + result_forward_lower[key].numpy(), + result_forward[key].numpy(), + rtol=prec, + atol=prec, + ) + elif key in ["force", "force_mag"]: + reduced_vv = reduce_tensor( + result_forward_lower[f"extended_{key}"], mapping, natoms + ) + np.testing.assert_allclose( + reduced_vv.numpy(), + result_forward[key].numpy(), + rtol=prec, + atol=prec, + ) + elif key == "virial": + if not hasattr(self, "test_virial") or self.test_virial: + np.testing.assert_allclose( + result_forward_lower[key].numpy(), + result_forward[key].numpy(), + rtol=prec, + atol=prec, + ) + else: + raise RuntimeError(f"Unexpected test key {key}") + + +class TestEnergyModelSeA(unittest.TestCase, ForwardLowerTest): + def setUp(self): + self.prec = 1e-10 + model_params = copy.deepcopy(model_se_e2_a) + self.model = get_model(model_params).to(env.DEVICE) + + +@unittest.skip("Skip for not implemented yet") +class TestEnergyModelDPA1(unittest.TestCase, ForwardLowerTest): + def setUp(self): + self.prec = 1e-10 + model_params = copy.deepcopy(model_dpa1) + self.model = get_model(model_params).to(env.DEVICE) + + +@unittest.skip("Skip for not implemented yet") +class TestEnergyModelDPA2(unittest.TestCase, ForwardLowerTest): + def setUp(self): + self.prec = 1e-10 + model_params = copy.deepcopy(model_dpa2) + self.model = get_model(model_params).to(env.DEVICE) + + +@unittest.skip("Skip for not implemented yet") +class TestEnergyModelZBL(unittest.TestCase, ForwardLowerTest): + def setUp(self): + self.prec = 1e-10 + model_params = copy.deepcopy(model_zbl) + self.model = get_model(model_params).to(env.DEVICE) + + +@unittest.skip("Skip for not implemented yet") +class TestEnergyModelSpinSeA(unittest.TestCase, ForwardLowerTest): + def setUp(self): + self.prec = 1e-10 + model_params = copy.deepcopy(model_spin) + self.test_spin = True + self.model = get_model(model_params).to(env.DEVICE) + + +@unittest.skip("Skip for not implemented yet") +class TestEnergyModelSpinDPA1(unittest.TestCase, ForwardLowerTest): + def setUp(self): + self.prec = 1e-10 + model_params = copy.deepcopy(model_spin) + model_params["descriptor"] = copy.deepcopy(model_dpa1)["descriptor"] + # double sel for virtual atoms to avoid large error + model_params["descriptor"]["sel"] *= 2 + self.test_spin = True + self.model = get_model(model_params).to(env.DEVICE) + + +@unittest.skip("Skip for not implemented yet") +class TestEnergyModelSpinDPA2(unittest.TestCase, ForwardLowerTest): + def setUp(self): + self.prec = 1e-10 + model_params = copy.deepcopy(model_spin) + model_params["descriptor"] = copy.deepcopy(model_dpa2)["descriptor"] + # double sel for virtual atoms to avoid large error + 
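+        # repinit and repformer hold separate neighbor selections, so both are doubled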
model_params["descriptor"]["repinit"]["nsel"] *= 2 + model_params["descriptor"]["repformer"]["nsel"] *= 2 + self.test_spin = True + self.model = get_model(model_params).to(env.DEVICE) + + +if __name__ == "__main__": + unittest.main() diff --git a/source/tests/pd/model/test_get_model.py b/source/tests/pd/model/test_get_model.py new file mode 100644 index 0000000000..7ace7c4e43 --- /dev/null +++ b/source/tests/pd/model/test_get_model.py @@ -0,0 +1,113 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +import copy +import unittest + +import numpy as np +import paddle + +from deepmd.pd.model.model import ( + get_model, +) +from deepmd.pd.utils import ( + env, +) + +dtype = paddle.float64 + +model_se_e2_a = { + "type_map": ["O", "H", "B"], + "descriptor": { + "type": "se_e2_a", + "sel": [46, 92, 4], + "rcut_smth": 0.50, + "rcut": 4.00, + "neuron": [25, 50, 100], + "resnet_dt": False, + "axis_neuron": 16, + "seed": 1, + }, + "fitting_net": { + "neuron": [24, 24, 24], + "resnet_dt": True, + "seed": 1, + }, + "data_stat_nbatch": 20, + "atom_exclude_types": [1], + "pair_exclude_types": [[1, 2]], + "preset_out_bias": { + "energy": [ + None, + [1.0], + [3.0], + ] + }, +} + + +class TestGetModel(unittest.TestCase): + def test_model_attr(self): + model_params = copy.deepcopy(model_se_e2_a) + self.model = get_model(model_params).to(env.DEVICE) + atomic_model = self.model.atomic_model + self.assertEqual(atomic_model.type_map, ["O", "H", "B"]) + self.assertEqual( + atomic_model.preset_out_bias, + { + "energy": [ + None, + np.array([1.0]), + np.array([3.0]), + ] + }, + ) + self.assertEqual(atomic_model.atom_exclude_types, [1]) + self.assertEqual(atomic_model.pair_exclude_types, [[1, 2]]) + + def test_model_attr_energy_float(self): + model_params = copy.deepcopy(model_se_e2_a) + model_params["preset_out_bias"] = {"energy": ["1.", 3, None]} + self.model = get_model(model_params).to(env.DEVICE) + atomic_model = self.model.atomic_model + self.assertEqual(atomic_model.type_map, ["O", "H", "B"]) + self.assertEqual( + atomic_model.preset_out_bias, + { + "energy": [ + np.array([1.0]), + np.array([3.0]), + None, + ] + }, + ) + self.assertEqual(atomic_model.atom_exclude_types, [1]) + self.assertEqual(atomic_model.pair_exclude_types, [[1, 2]]) + + def test_model_attr_energy_unsupported_type(self): + model_params = copy.deepcopy(model_se_e2_a) + model_params["preset_out_bias"] = {"energy": [1.0 + 2.0j, 3, None]} + with self.assertRaises(ValueError): + self.model = get_model(model_params).to(env.DEVICE) + + def test_model_attr_energy_unsupported_value(self): + model_params = copy.deepcopy(model_se_e2_a) + model_params["preset_out_bias"] = {"energy": ["1.0 + 2.0j", 3, None]} + with self.assertRaises(ValueError): + self.model = get_model(model_params).to(env.DEVICE) + + def test_notset_model_attr(self): + model_params = copy.deepcopy(model_se_e2_a) + model_params.pop("atom_exclude_types") + model_params.pop("pair_exclude_types") + model_params.pop("preset_out_bias") + self.model = get_model(model_params).to(env.DEVICE) + atomic_model = self.model.atomic_model + self.assertEqual(atomic_model.type_map, ["O", "H", "B"]) + self.assertEqual(atomic_model.preset_out_bias, None) + self.assertEqual(atomic_model.atom_exclude_types, []) + self.assertEqual(atomic_model.pair_exclude_types, []) + + def test_preset_wrong_len(self): + model_params = copy.deepcopy(model_se_e2_a) + model_params["preset_out_bias"] = {"energy": [None]} + with self.assertRaises(ValueError): + self.model = get_model(model_params).to(env.DEVICE) diff --git 
a/source/tests/pd/model/test_jit.py b/source/tests/pd/model/test_jit.py new file mode 100644 index 0000000000..28ab499bf1 --- /dev/null +++ b/source/tests/pd/model/test_jit.py @@ -0,0 +1,83 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +import json +import os +import shutil +import unittest +from copy import ( + deepcopy, +) +from pathlib import ( + Path, +) + +import paddle +from paddle.static import ( + InputSpec, +) + +from deepmd.pd.entrypoints.main import ( + get_trainer, +) +from deepmd.pd.infer import ( + inference, +) + +from .test_permutation import ( + model_se_e2_a, +) + + +class JITTest: + def test_jit(self): + trainer = get_trainer(deepcopy(self.config)) + trainer.run() + paddle.set_flags( + { + "FLAGS_save_cf_stack_op": 1, + "FLAGS_prim_enable_dynamic": 1, + "FLAGS_enable_pir_api": 1, + } + ) + model = paddle.jit.to_static( + inference.Tester("./model.pd").model, full_graph=True + ) + paddle.jit.save( + model, + "./frozen_model", + input_spec=[ + InputSpec([-1, -1, 3], dtype="float64"), + InputSpec([-1, -1], dtype="int32"), + InputSpec([-1, -1, -1], dtype="int32"), + ], + ) + + def tearDown(self): + for f in os.listdir("."): + if f.startswith("model") and f.endswith("pd"): + os.remove(f) + if f in ["lcurve.out", "frozen_model.json", "frozen_model.pdiparams"]: + os.remove(f) + if f in ["stat_files"]: + shutil.rmtree(f) + if f in ["checkpoint"]: + os.remove(f) + + +class TestEnergyModelSeA(unittest.TestCase, JITTest): + def setUp(self): + input_json = str(Path(__file__).parent / "water/se_atten.json") + with open(input_json) as f: + self.config = json.load(f) + data_file = [str(Path(__file__).parent / "water/data/data_0")] + self.config["training"]["training_data"]["systems"] = data_file + self.config["training"]["validation_data"]["systems"] = data_file + self.config["model"] = deepcopy(model_se_e2_a) + self.config["training"]["numb_steps"] = 10 + self.config["training"]["save_freq"] = 10 + + def tearDown(self): + JITTest.tearDown(self) + + +if __name__ == "__main__": + unittest.main() diff --git a/source/tests/pd/model/test_mlp.py b/source/tests/pd/model/test_mlp.py new file mode 100644 index 0000000000..90653644d3 --- /dev/null +++ b/source/tests/pd/model/test_mlp.py @@ -0,0 +1,283 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +import itertools +import unittest + +import numpy as np +import paddle + +from deepmd.dpmodel.utils import EmbeddingNet as DPEmbeddingNet +from deepmd.dpmodel.utils import FittingNet as DPFittingNet +from deepmd.dpmodel.utils import ( + NativeLayer, + NativeNet, +) +from deepmd.pd.model.network.mlp import ( + MLP, + EmbeddingNet, + FittingNet, + MLPLayer, +) +from deepmd.pd.utils import ( + env, +) +from deepmd.pd.utils.env import ( + PRECISION_DICT, +) + + +def get_tols(prec): + if prec in ["single", "float32"]: + rtol, atol = 0.0, 1e-4 + elif prec in ["double", "float64"]: + rtol, atol = 0.0, 1e-12 + # elif prec in ["half", "float16"]: + # rtol, atol=1e-2, 0 + else: + raise ValueError(f"unknown prec {prec}") + return rtol, atol + + +class TestMLPLayer(unittest.TestCase): + def setUp(self): + self.test_cases = itertools.product( + [(5, 5), (5, 10), (5, 8), (8, 5)], # inp, out + [True, False], # bias + [True, False], # use time step + ["tanh", "none"], # activation + [True, False], # resnet + [None, [4], [3, 2]], # prefix shapes + ["float32", "double"], # precision + ) + + def test_match_native_layer( + self, + ): + for (ninp, nout), bias, ut, ac, resnet, ashp, prec in self.test_cases: + # input + inp_shap = [ninp] + if ashp is not None: + 
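+                # prepend the broadcast (prefix) dimensions to the per-sample input shape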
inp_shap = ashp + inp_shap + rtol, atol = get_tols(prec) + dtype = PRECISION_DICT[prec] + xx = ( + paddle.arange(np.prod(inp_shap), dtype=dtype) + .to(device=env.DEVICE) + .reshape(inp_shap) + ) + # def mlp layer + ml = MLPLayer(ninp, nout, bias, ut, ac, resnet, precision=prec).to( + env.DEVICE + ) + # check consistency + nl = NativeLayer.deserialize(ml.serialize()) + np.testing.assert_allclose( + ml.forward(xx).detach().cpu().numpy(), + nl.call(xx.detach().cpu().numpy()), + rtol=rtol, + atol=atol, + err_msg=f"(i={ninp}, o={nout}) bias={bias} use_dt={ut} act={ac} resnet={resnet} prec={prec}", + ) + # check self-consistency + ml1 = MLPLayer.deserialize(ml.serialize()).to(env.DEVICE) + np.testing.assert_allclose( + ml.forward(xx).detach().cpu().numpy(), + ml1.forward(xx).detach().cpu().numpy(), + rtol=rtol, + atol=atol, + err_msg=f"(i={ninp}, o={nout}) bias={bias} use_dt={ut} act={ac} resnet={resnet} prec={prec}", + ) + + def test_jit(self): + for (ninp, nout), bias, ut, ac, resnet, _, prec in self.test_cases: + ml = MLPLayer(ninp, nout, bias, ut, ac, resnet, precision=prec) + model = paddle.jit.to_static(ml) + ml1 = MLPLayer.deserialize(ml.serialize()) + model = paddle.jit.to_static(ml1) + + +class TestMLP(unittest.TestCase): + def setUp(self): + self.test_cases = itertools.product( + [[2, 2, 4, 8], [1, 3, 3]], # inp and hiddens + [True, False], # bias + [True, False], # use time step + ["tanh", "none"], # activation + [True, False], # resnet + [None, [4], [3, 2]], # prefix shapes + ["float32", "double"], # precision + ) + + def test_match_native_net( + self, + ): + for ndims, bias, ut, ac, resnet, ashp, prec in self.test_cases: + # input + inp_shap = [ndims[0]] + if ashp is not None: + inp_shap = ashp + inp_shap + rtol, atol = get_tols(prec) + dtype = PRECISION_DICT[prec] + xx = ( + paddle.arange(np.prod(inp_shap), dtype=dtype) + .to(device=env.DEVICE) + .reshape(inp_shap) + ) + # def MLP + layers = [] + for ii in range(1, len(ndims)): + layers.append( + MLPLayer( + ndims[ii - 1], ndims[ii], bias, ut, ac, resnet, precision=prec + ).serialize() + ) + ml = MLP(layers).to(env.DEVICE) + # check consistency + nl = NativeNet.deserialize(ml.serialize()) + np.testing.assert_allclose( + ml.forward(xx).detach().cpu().numpy(), + nl.call(xx.detach().cpu().numpy()), + rtol=rtol, + atol=atol, + err_msg=f"net={ndims} bias={bias} use_dt={ut} act={ac} resnet={resnet} prec={prec}", + ) + # check self-consistency + ml1 = MLP.deserialize(ml.serialize()).to(env.DEVICE) + np.testing.assert_allclose( + ml.forward(xx).detach().cpu().numpy(), + ml1.forward(xx).detach().cpu().numpy(), + rtol=rtol, + atol=atol, + err_msg=f"net={ndims} bias={bias} use_dt={ut} act={ac} resnet={resnet} prec={prec}", + ) + + def test_jit(self): + for ndims, bias, ut, ac, resnet, _, prec in self.test_cases: + layers = [] + for ii in range(1, len(ndims)): + ml = layers.append( + MLPLayer( + ndims[ii - 1], ndims[ii], bias, ut, ac, resnet, precision=prec + ).serialize() + ) + ml = MLP(ml) + model = paddle.jit.to_static(ml) + ml1 = MLP.deserialize(ml.serialize()) + model = paddle.jit.to_static(ml1) + + +class TestEmbeddingNet(unittest.TestCase): + def setUp(self): + self.test_cases = itertools.product( + [1, 3], # inp + [[24, 48, 96], [24, 36]], # and hiddens + ["tanh", "none"], # activation + [True, False], # resnet_dt + ["float32", "double"], # precision + ) + + def test_match_embedding_net( + self, + ): + for idim, nn, act, idt, prec in self.test_cases: + # input + rtol, atol = get_tols(prec) + dtype = PRECISION_DICT[prec] + xx = 
paddle.arange(idim, dtype=dtype).to(device=env.DEVICE) + # def MLP + ml = EmbeddingNet(idim, nn, act, idt, prec).to(env.DEVICE) + # check consistency + nl = DPEmbeddingNet.deserialize(ml.serialize()) + np.testing.assert_allclose( + ml.forward(xx).detach().cpu().numpy(), + nl.call(xx.detach().cpu().numpy()), + rtol=rtol, + atol=atol, + err_msg=f"idim={idim} nn={nn} use_dt={idt} act={act} prec={prec}", + ) + # check self-consistency + ml1 = EmbeddingNet.deserialize(ml.serialize()).to(env.DEVICE) + np.testing.assert_allclose( + ml.forward(xx).detach().cpu().numpy(), + ml1.forward(xx).detach().cpu().numpy(), + rtol=rtol, + atol=atol, + err_msg=f"idim={idim} nn={nn} use_dt={idt} act={act} prec={prec}", + ) + + def test_jit( + self, + ): + for idim, nn, act, idt, prec in self.test_cases: + # def MLP + ml = EmbeddingNet(idim, nn, act, idt, prec).to(env.DEVICE) + ml1 = EmbeddingNet.deserialize(ml.serialize()).to(env.DEVICE) + model = paddle.jit.to_static(ml) + model = paddle.jit.to_static(ml1) + + +class TestFittingNet(unittest.TestCase): + def setUp(self): + self.test_cases = itertools.product( + [1, 3], # inp + [1, 5], # out + [[24, 48, 96], [24, 36]], # and hiddens + ["tanh", "none"], # activation + [True, False], # resnet_dt + ["float32", "double"], # precision + [True, False], # bias_out + ) + + def test_match_fitting_net( + self, + ): + for idim, odim, nn, act, idt, prec, ob in self.test_cases: + # input + rtol, atol = get_tols(prec) + dtype = PRECISION_DICT[prec] + xx = paddle.arange(idim, dtype=dtype).to(device=env.DEVICE) + # def MLP + ml = FittingNet( + idim, + odim, + neuron=nn, + activation_function=act, + resnet_dt=idt, + precision=prec, + bias_out=ob, + ).to(env.DEVICE) + # check consistency + nl = DPFittingNet.deserialize(ml.serialize()) + np.testing.assert_allclose( + ml.forward(xx).detach().cpu().numpy(), + nl.call(xx.detach().cpu().numpy()), + rtol=rtol, + atol=atol, + err_msg=f"idim={idim} nn={nn} use_dt={idt} act={act} prec={prec}", + ) + # check self-consistency + ml1 = FittingNet.deserialize(ml.serialize()).to(env.DEVICE) + np.testing.assert_allclose( + ml.forward(xx).detach().cpu().numpy(), + ml1.forward(xx).detach().cpu().numpy(), + rtol=rtol, + atol=atol, + err_msg=f"idim={idim} nn={nn} use_dt={idt} act={act} prec={prec}", + ) + + def test_jit( + self, + ): + for idim, odim, nn, act, idt, prec, ob in self.test_cases: + # def MLP + ml = FittingNet( + idim, + odim, + neuron=nn, + activation_function=act, + resnet_dt=idt, + precision=prec, + bias_out=ob, + ).to(env.DEVICE) + ml1 = FittingNet.deserialize(ml.serialize()).to(env.DEVICE) + model = paddle.jit.to_static(ml) + model = paddle.jit.to_static(ml1) diff --git a/source/tests/pd/model/test_model.py b/source/tests/pd/model/test_model.py new file mode 100644 index 0000000000..2566a9ce41 --- /dev/null +++ b/source/tests/pd/model/test_model.py @@ -0,0 +1,433 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +import json +import unittest +from typing import ( + NamedTuple, +) + +import numpy as np +import paddle +import tensorflow.compat.v1 as tf + +from deepmd.pd.utils import ( + env, +) + +tf.disable_eager_execution() + +from pathlib import ( + Path, +) + +from deepmd.dpmodel.utils.learning_rate import LearningRateExp as MyLRExp +from deepmd.pd.loss import ( + EnergyStdLoss, +) +from deepmd.pd.model.model import ( + get_model, +) +from deepmd.pd.utils.dataloader import ( + DpLoaderSet, +) +from deepmd.pd.utils.env import ( + DEVICE, +) +from deepmd.tf.common import ( + expand_sys_str, +) +from deepmd.tf.descriptor import 
DescrptSeA as DescrptSeA_tf +from deepmd.tf.fit import ( + EnerFitting, +) +from deepmd.tf.loss import ( + EnerStdLoss, +) +from deepmd.tf.model import ( + EnerModel, +) +from deepmd.tf.utils.data_system import ( + DeepmdDataSystem, +) +from deepmd.tf.utils.learning_rate import ( + LearningRateExp, +) + +from ..test_finetune import ( + energy_data_requirement, +) + + +class VariableState(NamedTuple): + value: np.ndarray + gradient: np.ndarray + + +def paddle2tf(paddle_name, last_layer_id=None): + fields = paddle_name.split(".") + offset = int(fields[3] == "networks") + 1 + element_id = int(fields[2 + offset]) + if fields[1] == "descriptor": + if fields[2].startswith("compress_"): + return None + layer_id = int(fields[4 + offset]) + 1 + weight_type = fields[5 + offset] + ret = "filter_type_all/%s_%d_%d:0" % (weight_type, layer_id, element_id) + elif fields[1] == "fitting_net": + layer_id = int(fields[4 + offset]) + weight_type = fields[5 + offset] + if layer_id != last_layer_id: + ret = "layer_%d_type_%d/%s:0" % (layer_id, element_id, weight_type) + else: + ret = "final_layer_type_%d/%s:0" % (element_id, weight_type) + else: + raise RuntimeError(f"Unexpected parameter name: {paddle_name}") + return ret + + +class DpTrainer: + def __init__(self) -> None: + with open(str(Path(__file__).parent / "water/se_e2_a.json")) as fin: + content = fin.read() + config = json.loads(content) + data_file = [str(Path(__file__).parent / "water/data/data_0")] + config["training"]["training_data"]["systems"] = data_file + config["training"]["validation_data"]["systems"] = data_file + model_config = config["model"] + self.rcut = model_config["descriptor"]["rcut"] + self.rcut_smth = model_config["descriptor"]["rcut_smth"] + self.sel = model_config["descriptor"]["sel"] + self.systems = config["training"]["validation_data"]["systems"] + if isinstance(self.systems, str): + self.systems = expand_sys_str(self.systems) + self.batch_size = config["training"]["training_data"]["batch_size"] + self.type_map = model_config["type_map"] + self.filter_neuron = model_config["descriptor"]["neuron"] + self.axis_neuron = model_config["descriptor"]["axis_neuron"] + self.n_neuron = model_config["fitting_net"]["neuron"] + self.data_stat_nbatch = 3 + self.start_lr = 0.001 + self.stop_lr = 3.51e-8 + self.decay_steps = 500 + self.stop_steps = 1600 + self.start_pref_e = 1.0 + self.limit_pref_e = 2.0 + self.start_pref_f = 2.0 + self.limit_pref_f = 1.0 + self.ntypes = len(self.type_map) + + def get_intermediate_state(self, num_steps=1): + dp_model = self._get_dp_model() + dp_loss = self._get_dp_loss() + dp_lr = self._get_dp_lr() + dp_ds = self._get_dp_dataset() + dp_ds.add_data_requirements(dp_model.input_requirement) + dp_ds.add_data_requirements(dp_loss.label_requirement) + dp_model.data_stat(dp_ds) + + # Build graph + g = tf.Graph() + with g.as_default(): + place_holders = self._get_dp_placeholders(dp_ds) + model_pred = dp_model.build( + coord_=place_holders["coord"], + atype_=place_holders["type"], + natoms=place_holders["natoms_vec"], + box=place_holders["box"], + mesh=place_holders["default_mesh"], + input_dict=place_holders, + ) + global_step = tf.train.get_or_create_global_step() + learning_rate = dp_lr.build(global_step, self.stop_steps) + l2_l, _ = dp_loss.build( + learning_rate=learning_rate, + natoms=place_holders["natoms_vec"], + model_dict=model_pred, + label_dict=place_holders, + suffix="test", + ) + t_vars = tf.trainable_variables() + optimizer = tf.train.AdamOptimizer(learning_rate) + t_grad_and_vars = 
optimizer.compute_gradients(l2_l, t_vars) + train_op = optimizer.apply_gradients(t_grad_and_vars, global_step) + init_op = tf.global_variables_initializer() + t_heads = { + "loss": l2_l, + "energy": model_pred["energy"], + "force": model_pred["force"], + "virial": model_pred["virial"], + "atom_virial": model_pred["atom_virial"], + } + + # Get statistics of each component + stat_dict = { + "descriptor.mean": dp_model.descrpt.davg, + "descriptor.stddev": dp_model.descrpt.dstd, + "fitting_net.bias_atom_e": dp_model.fitting.bias_atom_e, + } + + # Get variables and their gradients + with tf.Session(graph=g) as sess: + sess.run(init_op) + for _ in range(num_steps): + batch = dp_ds.get_batch() + feeds = self._get_feed_dict(batch, place_holders) + sess.run(train_op, feed_dict=feeds) + + batch = dp_ds.get_batch() + feeds = self._get_feed_dict(batch, place_holders) + grads_and_vars, head_dict = sess.run( + [t_grad_and_vars, t_heads], feed_dict=feeds + ) + vs_dict = {} + for idx, one in enumerate(t_vars): + grad, var = grads_and_vars[idx] + vs_dict[one.name] = VariableState(var, grad) + + tf.reset_default_graph() + # Used for reproducing + return batch, head_dict, stat_dict, vs_dict + + def _get_dp_dataset(self): + data = DeepmdDataSystem( + systems=self.systems, + batch_size=self.batch_size, + test_size=1, + rcut=self.rcut, + type_map=self.type_map, + trn_all_set=True, + ) + return data + + def _get_dp_model(self): + dp_descrpt = DescrptSeA_tf( + rcut=self.rcut, + rcut_smth=self.rcut_smth, + sel=self.sel, + neuron=self.filter_neuron, + axis_neuron=self.axis_neuron, + ) + dp_fitting = EnerFitting( + dp_descrpt.get_ntypes(), dp_descrpt.get_dim_out(), neuron=self.n_neuron + ) + return EnerModel( + dp_descrpt, + dp_fitting, + type_map=self.type_map, + data_stat_nbatch=self.data_stat_nbatch, + ) + + def _get_dp_loss(self): + return EnerStdLoss( + starter_learning_rate=self.start_lr, + start_pref_e=self.start_pref_e, + limit_pref_e=self.limit_pref_e, + start_pref_f=self.start_pref_f, + limit_pref_f=self.limit_pref_f, + ) + + def _get_dp_lr(self): + return LearningRateExp( + start_lr=self.start_lr, stop_lr=self.stop_lr, decay_steps=self.decay_steps + ) + + def _get_dp_placeholders(self, dataset): + place_holders = {} + data_dict = dataset.get_data_dict() + for kk in data_dict.keys(): + if kk == "type": + continue + prec = tf.float64 + place_holders[kk] = tf.placeholder(prec, [None], name="t_" + kk) + place_holders["find_" + kk] = tf.placeholder( + tf.float32, name="t_find_" + kk + ) + place_holders["type"] = tf.placeholder(tf.int32, [None], name="t_type") + place_holders["natoms_vec"] = tf.placeholder( + tf.int32, [self.ntypes + 2], name="t_natoms" + ) + place_holders["default_mesh"] = tf.placeholder(tf.int32, [None], name="t_mesh") + place_holders["is_training"] = tf.placeholder(tf.bool) + return place_holders + + def _get_feed_dict(self, batch, place_holders): + feed_dict = {} + for kk in batch.keys(): + if kk == "find_type" or kk == "type": + continue + if "find_" in kk: + feed_dict[place_holders[kk]] = batch[kk] + else: + feed_dict[place_holders[kk]] = np.reshape(batch[kk], [-1]) + for ii in ["type"]: + feed_dict[place_holders[ii]] = np.reshape(batch[ii], [-1]) + for ii in ["natoms_vec", "default_mesh"]: + feed_dict[place_holders[ii]] = batch[ii] + feed_dict[place_holders["is_training"]] = True + return feed_dict + + +class TestEnergy(unittest.TestCase): + def setUp(self) -> None: + self.dp_trainer = DpTrainer() + self.wanted_step = 0 + for key in dir(self.dp_trainer): + if not key.startswith("_") or 
key == "get_intermediate_state": + value = getattr(self.dp_trainer, key) + setattr(self, key, value) + + def test_consistency(self) -> None: + batch, head_dict, stat_dict, vs_dict = self.dp_trainer.get_intermediate_state( + self.wanted_step + ) + # Build DeePMD graph + my_ds = DpLoaderSet(self.systems, self.batch_size, self.type_map) + my_ds.add_data_requirement(energy_data_requirement) + my_model = get_model( + model_params={ + "descriptor": { + "type": "se_e2_a", + "sel": self.sel, + "rcut_smth": self.rcut_smth, + "rcut": self.rcut, + "neuron": self.filter_neuron, + "axis_neuron": self.axis_neuron, + }, + "fitting_net": {"neuron": self.n_neuron, "mixed_types": False}, + "data_stat_nbatch": self.data_stat_nbatch, + "type_map": self.type_map, + }, + ) + my_model.to(DEVICE) + my_lr = MyLRExp(self.start_lr, self.stop_lr, self.decay_steps, self.stop_steps) + my_loss = EnergyStdLoss( + starter_learning_rate=self.start_lr, + start_pref_e=self.start_pref_e, + limit_pref_e=self.limit_pref_e, + start_pref_f=self.start_pref_f, + limit_pref_f=self.limit_pref_f, + ) + + # Keep statistics consistency between 2 implementations + my_em = my_model.get_descriptor() + mean = stat_dict["descriptor.mean"].reshape([self.ntypes, my_em.get_nsel(), 4]) + stddev = stat_dict["descriptor.stddev"].reshape( + [self.ntypes, my_em.get_nsel(), 4] + ) + my_em.set_stat_mean_and_stddev( + paddle.to_tensor(mean).to(device=DEVICE), + paddle.to_tensor(stddev).to(device=DEVICE), + ) + my_model.get_fitting_net().bias_atom_e = paddle.to_tensor( + stat_dict["fitting_net.bias_atom_e"], place=DEVICE + ) + + # Keep parameter value consistency between 2 implementations + for name, param in my_model.named_parameters(): + name = name.replace("sea.", "") + var_name = paddle2tf(name, last_layer_id=len(self.n_neuron)) + if var_name is None: + continue + var = vs_dict[var_name].value + with paddle.no_grad(): + src = paddle.to_tensor(var) + dst = param + # print(name) + # print(src.mean(), src.std()) + # print(dst.mean(), dst.std()) + paddle.assign(src, dst) + # Start forward computing + tmp = np.copy(batch["natoms_vec"]) + batch = my_ds.systems[0]._data_system._get_subdata(batch, 0) + batch = my_ds.systems[0]._data_system.reformat_data_torch(batch) + for key in ["coord", "atype", "box", "energy", "force"]: + batch[key] = paddle.to_tensor(batch[key]).to(device=env.DEVICE) + batch[key] = batch[key].unsqueeze(0) + batch["coord"].stop_gradient = False + batch["natoms_vec"] = tmp + batch["natoms"] = paddle.to_tensor( + batch["natoms_vec"], place=batch["coord"].place + ).unsqueeze(0) + model_input = { + "coord": batch["coord"].to(env.DEVICE), + "atype": batch["atype"].to(env.DEVICE), + "box": batch["box"].to(env.DEVICE), + "do_atomic_virial": True, + } + model_input_1 = { + "coord": batch["coord"].to(env.DEVICE), + "atype": batch["atype"].to(env.DEVICE), + "box": batch["box"].to(env.DEVICE), + "do_atomic_virial": False, + } + label = { + "energy": batch["energy"].to(env.DEVICE), + "find_energy": 1.0, + "force": batch["force"].to(env.DEVICE), + "find_force": 1.0, + } + cur_lr = my_lr.value(self.wanted_step) + model_predict, loss, _ = my_loss( + model_input, my_model, label, int(batch["natoms"][0, 0]), cur_lr + ) + model_predict_1 = my_model(**model_input_1) + p_energy, p_force, p_virial, p_atomic_virial = ( + model_predict["energy"], + model_predict["force"], + model_predict["virial"], + model_predict["atom_virial"], + ) + np.testing.assert_allclose( + head_dict["energy"], p_energy.reshape([-1]).cpu().detach().numpy() + ) + 
+        np.testing.assert_allclose(
+            head_dict["force"],
+            p_force.reshape(head_dict["force"].shape).cpu().detach().numpy(),
+        )
+        rtol = 1e-5
+        atol = 1e-8
+        np.testing.assert_allclose(
+            head_dict["loss"], loss.cpu().detach().numpy(), rtol=rtol, atol=atol
+        )
+        np.testing.assert_allclose(
+            head_dict["virial"],
+            p_virial.reshape(head_dict["virial"].shape).cpu().detach().numpy(),
+        )
+        np.testing.assert_allclose(
+            head_dict["virial"],
+            model_predict_1["virial"]
+            .reshape([*head_dict["virial"].shape])
+            .cpu()
+            .detach()
+            .numpy(),
+        )
+        self.assertIsNone(model_predict_1.get("atom_virial", None))
+        np.testing.assert_allclose(
+            head_dict["atom_virial"],
+            p_atomic_virial.reshape(head_dict["atom_virial"].shape)
+            .cpu()
+            .detach()
+            .numpy(),
+        )
+        # the Adam optimizer is only used to zero the gradients; no update
+        # step is taken before the backward pass is checked below
+        optimizer = paddle.optimizer.Adam(cur_lr, parameters=my_model.parameters())
+        optimizer.clear_grad()
+
+        # Compare gradients for consistency
+        loss.backward()
+
+        for name, param in my_model.named_parameters():
+            name = name.replace("sea.", "")
+            var_name = paddle2tf(name, last_layer_id=len(self.n_neuron))
+            if var_name is None:
+                continue
+            var_grad = vs_dict[var_name].gradient
+            param_grad = param.grad.cpu()
+            var_grad = paddle.to_tensor(var_grad).to(device="cpu")
+            assert np.allclose(var_grad, param_grad, rtol=rtol, atol=atol)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/source/tests/pd/model/test_nlist.py b/source/tests/pd/model/test_nlist.py
new file mode 100644
index 0000000000..0947355ac0
--- /dev/null
+++ b/source/tests/pd/model/test_nlist.py
@@ -0,0 +1,304 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+import unittest
+
+import numpy as np
+import paddle
+
+from deepmd.pd.utils import (
+    env,
+)
+from deepmd.pd.utils.nlist import (
+    build_directional_neighbor_list,
+    build_multiple_neighbor_list,
+    build_neighbor_list,
+    extend_coord_with_ghosts,
+    get_multiple_nlist_key,
+)
+from deepmd.pd.utils.region import (
+    inter2phys,
+)
+
+dtype = paddle.float64
+
+
+class TestNeighList(unittest.TestCase):
+    def setUp(self):
+        self.nf = 3
+        self.nloc = 3
+        self.ns = 5 * 5 * 3
+        self.nall = self.ns * self.nloc
+        self.cell = paddle.to_tensor(
+            [[1, 0, 0], [0.4, 0.8, 0], [0.1, 0.3, 2.1]], dtype=dtype, place=env.DEVICE
+        )
+        self.icoord = paddle.to_tensor(
+            [[0, 0, 0], [0, 0, 0], [0.5, 0.5, 0.1]], dtype=dtype, place=env.DEVICE
+        )
+        self.atype = paddle.to_tensor([-1, 0, 1], dtype=paddle.int64).to(
+            device=env.DEVICE
+        )
+        [self.cell, self.icoord, self.atype] = [
+            ii.unsqueeze(0) for ii in [self.cell, self.icoord, self.atype]
+        ]
+        self.coord = inter2phys(self.icoord, self.cell).reshape([-1, self.nloc * 3])
+        self.cell = self.cell.reshape([-1, 9])
+        [self.cell, self.coord, self.atype] = [
+            paddle.tile(ii, [self.nf, 1]) for ii in [self.cell, self.coord, self.atype]
+        ]
+        self.rcut = 1.01
+        self.prec = 1e-10
+        self.nsel = [10, 10]
+        # generated by preprocess.build_neighbor_list
+        # ref_nlist, _, _ = legacy_build_neighbor_list(
+        #     2, ecoord[0], eatype[0],
+        #     self.rcut,
+        #     paddle.to_tensor([10,20], dtype=paddle.int64),
+        #     mapping[0], type_split=True, )
+        self.ref_nlist = paddle.to_tensor(
+            [
+                [-1] * sum(self.nsel),
+                [1, 1, 1, 1, 1, 1, -1, -1, -1, -1, 2, 2, 2, 2, -1, -1, -1, -1, -1, -1],
+                [1, 1, 1, 1, -1, -1, -1, -1, -1, -1, 2, 2, 2, 2, 2, 2, -1, -1, -1, -1],
+            ],
+            place=env.DEVICE,
+        )
+
+    def test_build_notype(self):
+        ecoord, eatype, mapping = extend_coord_with_ghosts(
+            self.coord, self.atype, self.cell,
self.rcut + ) + # test normal sel + nlist = build_neighbor_list( + ecoord, + eatype, + self.nloc, + self.rcut, + sum(self.nsel), + distinguish_types=False, + ) + nlist_mask = nlist[0] == -1 + nlist_loc = mapping[0][nlist[0]] + nlist_loc[nlist_mask] = -1 + np.testing.assert_allclose( + paddle.sort(nlist_loc, axis=-1).numpy(), + paddle.sort(self.ref_nlist, axis=-1).numpy(), + ) + # test a very large sel + nlist = build_neighbor_list( + ecoord, + eatype, + self.nloc, + self.rcut, + sum(self.nsel) + 300, # +300, real nnei==224 + distinguish_types=False, + ) + nlist_mask = nlist[0] == -1 + nlist_loc = mapping[0][nlist[0]] + nlist_loc[nlist_mask] = -1 + np.testing.assert_allclose( + paddle.sort(nlist_loc, descending=True, axis=-1)[ + :, : sum(self.nsel) + ].numpy(), + paddle.sort(self.ref_nlist, descending=True, axis=-1).numpy(), + ) + + def test_build_type(self): + ecoord, eatype, mapping = extend_coord_with_ghosts( + self.coord, self.atype, self.cell, self.rcut + ) + nlist = build_neighbor_list( + ecoord, + eatype, + self.nloc, + self.rcut, + self.nsel, + distinguish_types=True, + ) + np.testing.assert_allclose(nlist[0].numpy(), nlist[1].numpy()) + nlist_mask = nlist[0] == -1 + nlist_loc = mapping[0][nlist[0]] + nlist_loc[nlist_mask] = -1 + for ii in range(2): + np.testing.assert_allclose( + paddle.sort( + paddle.split(nlist_loc, (self.nsel), axis=-1)[ii], axis=-1 + ).numpy(), + paddle.sort( + paddle.split(self.ref_nlist, (self.nsel), axis=-1)[ii], axis=-1 + ).numpy(), + ) + + def test_build_multiple_nlist(self): + rcuts = [1.01, 2.01] + nsels = [20, 80] + ecoord, eatype, mapping = extend_coord_with_ghosts( + self.coord, self.atype, self.cell, max(rcuts) + ) + nlist1 = build_neighbor_list( + ecoord, + eatype, + self.nloc, + rcuts[1], + nsels[1] - 1, + distinguish_types=False, + ) + pad = -1 * paddle.ones([self.nf, self.nloc, 1], dtype=nlist1.dtype).to( + device=nlist1.place + ) + nlist2 = paddle.concat([nlist1, pad], axis=-1) + nlist0 = build_neighbor_list( + ecoord, + eatype, + self.nloc, + rcuts[0], + nsels[0], + distinguish_types=False, + ) + nlists = build_multiple_neighbor_list(ecoord, nlist1, rcuts, nsels) + for dd in range(2): + self.assertEqual( + nlists[get_multiple_nlist_key(rcuts[dd], nsels[dd])].shape[-1], + nsels[dd], + ) + np.testing.assert_allclose( + nlists[get_multiple_nlist_key(rcuts[0], nsels[0])].numpy(), + nlist0.numpy(), + ) + np.testing.assert_allclose( + nlists[get_multiple_nlist_key(rcuts[1], nsels[1])].numpy(), + nlist2.numpy(), + ) + + def test_extend_coord(self): + ecoord, eatype, mapping = extend_coord_with_ghosts( + self.coord, self.atype, self.cell, self.rcut + ) + # expected ncopy x nloc + self.assertEqual(list(ecoord.shape), [self.nf, self.nall * 3]) + self.assertEqual(list(eatype.shape), [self.nf, self.nall]) + self.assertEqual(list(mapping.shape), [self.nf, self.nall]) + # check the nloc part is identical with original coord + np.testing.assert_allclose( + ecoord[:, : self.nloc * 3].numpy(), + self.coord.numpy(), + rtol=self.prec, + atol=self.prec, + ) + # check the shift vectors are aligned with grid + shift_vec = ( + ecoord.reshape([-1, self.ns, self.nloc, 3]) + - self.coord.reshape([-1, self.nloc, 3])[:, None, :, :] + ) + shift_vec = shift_vec.reshape([-1, self.nall, 3]) + # hack!!! 
assumes identical cell across frames + shift_vec = paddle.matmul( + shift_vec, paddle.linalg.inv(self.cell.reshape([self.nf, 3, 3])[0]) + ) + # nf x nall x 3 + shift_vec = paddle.round(shift_vec) + # check: identical shift vecs + np.testing.assert_allclose( + shift_vec[0].numpy(), shift_vec[1].numpy(), rtol=self.prec, atol=self.prec + ) + # check: shift idx aligned with grid + mm, cc = paddle.unique(shift_vec[0][:, 0], axis=-1, return_counts=True) + np.testing.assert_allclose( + mm.numpy(), + paddle.to_tensor([-2, -1, 0, 1, 2], dtype=dtype) + .to(device=env.DEVICE) + .numpy(), + rtol=self.prec, + atol=self.prec, + ) + np.testing.assert_allclose( + cc.numpy(), + paddle.to_tensor( + [self.ns * self.nloc // 5] * 5, dtype=paddle.int64, place=env.DEVICE + ).numpy(), + rtol=self.prec, + atol=self.prec, + ) + mm, cc = paddle.unique(shift_vec[1][:, 1], axis=-1, return_counts=True) + np.testing.assert_allclose( + mm.numpy(), + paddle.to_tensor([-2, -1, 0, 1, 2], dtype=dtype).to(device=env.DEVICE), + rtol=self.prec, + atol=self.prec, + ) + np.testing.assert_allclose( + cc.numpy(), + paddle.to_tensor( + [self.ns * self.nloc // 5] * 5, dtype=paddle.int64, place=env.DEVICE + ), + rtol=self.prec, + atol=self.prec, + ) + mm, cc = paddle.unique(shift_vec[1][:, 2], axis=-1, return_counts=True) + np.testing.assert_allclose( + mm.numpy(), + paddle.to_tensor([-1, 0, 1], dtype=dtype).to(device=env.DEVICE).numpy(), + rtol=self.prec, + atol=self.prec, + ) + np.testing.assert_allclose( + cc.numpy(), + paddle.to_tensor( + [self.ns * self.nloc // 3] * 3, dtype=paddle.int64, place=env.DEVICE + ).numpy(), + rtol=self.prec, + atol=self.prec, + ) + + def test_build_directional_nlist(self): + """Directional nlist is tested against the standard nlist implementation.""" + ecoord, eatype, mapping = extend_coord_with_ghosts( + self.coord, self.atype, self.cell, self.rcut + ) + for distinguish_types, mysel in zip([True, False], [sum(self.nsel), 300]): + # full neighbor list + nlist_full = build_neighbor_list( + ecoord, + eatype, + self.nloc, + self.rcut, + sum(self.nsel), + distinguish_types=distinguish_types, + ) + # central as part of the system + nlist = build_directional_neighbor_list( + ecoord[:, 3:6], + eatype[:, 1:2], + paddle.concat( + [ + ecoord[:, 0:3], + paddle.zeros( + [self.nf, 3], + dtype=dtype, + ).to(device=env.DEVICE), # placeholder + ecoord[:, 6:], + ], + axis=1, + ), + paddle.concat( + [ + eatype[:, 0:1], + -1 + * paddle.ones( + [self.nf, 1], + dtype="int64", + ).to(device=env.DEVICE), # placeholder + eatype[:, 2:], + ], + axis=1, + ), + self.rcut, + mysel, + distinguish_types=distinguish_types, + ) + np.testing.assert_allclose(nlist[0].numpy(), nlist[1].numpy()) + np.testing.assert_allclose(nlist[0].numpy(), nlist[2].numpy()) + np.testing.assert_allclose( + paddle.sort(nlist[0], descending=True, axis=-1)[ + :, : sum(self.nsel) + ].numpy(), + paddle.sort(nlist_full[0][1:2], descending=True, axis=-1).numpy(), + ) diff --git a/source/tests/pd/model/test_null_input.py b/source/tests/pd/model/test_null_input.py new file mode 100644 index 0000000000..9bf0860265 --- /dev/null +++ b/source/tests/pd/model/test_null_input.py @@ -0,0 +1,94 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +import copy +import unittest + +import numpy as np +import paddle + +from deepmd.pd.model.model import ( + get_model, +) +from deepmd.pd.utils import ( + env, +) +from deepmd.pd.utils.utils import ( + to_numpy_array, +) + +from ...seed import ( + GLOBAL_SEED, +) +from ..common import ( + eval_model, +) +from .test_permutation 
import ( + model_se_e2_a, +) + +dtype = paddle.float64 + + +class NullTest: + def test_nloc_1( + self, + ): + natoms = 1 + generator = paddle.seed(GLOBAL_SEED) + # paddle.seed(1000) + cell = paddle.rand([3, 3], dtype=dtype).to(device=env.DEVICE) + # large box to exclude images + cell = (cell + cell.T) + 100.0 * paddle.eye(3).to(device=env.DEVICE) + coord = paddle.rand([natoms, 3], dtype=dtype).to(device=env.DEVICE) + atype = paddle.to_tensor([0], dtype=paddle.int32).to(device=env.DEVICE) + test_keys = ["energy", "force", "virial"] + result = eval_model(self.model, coord.unsqueeze(0), cell.unsqueeze(0), atype) + ret0 = {key: result[key].squeeze(0) for key in test_keys} + prec = 1e-10 + expect_e_shape = [1] + expect_f = paddle.zeros([natoms, 3], dtype=dtype).to(device=env.DEVICE) + expect_v = paddle.zeros([9], dtype=dtype).to(device=env.DEVICE) + self.assertEqual(list(ret0["energy"].shape), expect_e_shape) + self.assertFalse(np.isnan(to_numpy_array(ret0["energy"])[0])) + np.testing.assert_allclose( + ret0["force"].numpy(), expect_f.numpy(), rtol=prec, atol=prec + ) + if not hasattr(self, "test_virial") or self.test_virial: + np.testing.assert_allclose( + ret0["virial"].numpy(), expect_v.numpy(), rtol=prec, atol=prec + ) + + def test_nloc_2_far( + self, + ): + natoms = 2 + generator = paddle.seed(GLOBAL_SEED) + cell = paddle.rand([3, 3], dtype=dtype).to(device=env.DEVICE) + # large box to exclude images + cell = (cell + cell.T) + 3000.0 * paddle.eye(3).to(device=env.DEVICE) + coord = paddle.rand([1, 3], dtype=dtype).to(device=env.DEVICE) + # 2 far-away atoms + coord = paddle.concat([coord, coord + 100.0], axis=0) + atype = paddle.to_tensor([0, 2], dtype=paddle.int32).to(device=env.DEVICE) + test_keys = ["energy", "force", "virial"] + result = eval_model(self.model, coord.unsqueeze(0), cell.unsqueeze(0), atype) + ret0 = {key: result[key].squeeze(0) for key in test_keys} + prec = 1e-10 + expect_e_shape = [1] + expect_f = paddle.zeros([natoms, 3], dtype=dtype).to(device=env.DEVICE) + expect_v = paddle.zeros([9], dtype=dtype).to(device=env.DEVICE) + self.assertEqual(list(ret0["energy"].shape), expect_e_shape) + self.assertFalse(np.isnan(to_numpy_array(ret0["energy"])[0])) + np.testing.assert_allclose( + ret0["force"].numpy(), expect_f.numpy(), rtol=prec, atol=prec + ) + if not hasattr(self, "test_virial") or self.test_virial: + np.testing.assert_allclose( + ret0["virial"].numpy(), expect_v.numpy(), rtol=prec, atol=prec + ) + + +class TestEnergyModelSeA(unittest.TestCase, NullTest): + def setUp(self): + model_params = copy.deepcopy(model_se_e2_a) + self.type_split = False + self.model = get_model(model_params).to(env.DEVICE) diff --git a/source/tests/pd/model/test_permutation.py b/source/tests/pd/model/test_permutation.py new file mode 100644 index 0000000000..8482ca7ffe --- /dev/null +++ b/source/tests/pd/model/test_permutation.py @@ -0,0 +1,489 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +import copy +import os +import unittest + +import paddle + +from deepmd.pd.model.model import ( + get_model, +) +from deepmd.pd.utils import ( + env, +) + +from ...seed import ( + GLOBAL_SEED, +) +from ..common import ( + eval_model, +) + +CUR_DIR = os.path.dirname(__file__) + +dtype = paddle.float64 +import numpy as np + +model_se_e2_a = { + "type_map": ["O", "H", "B"], + "descriptor": { + "type": "se_e2_a", + "sel": [46, 92, 4], + "rcut_smth": 0.50, + "rcut": 4.00, + "neuron": [25, 50, 100], + "resnet_dt": False, + "axis_neuron": 16, + "seed": 1, + }, + "fitting_net": { + "neuron": [24, 24, 24], + 
"resnet_dt": True, + "seed": 1, + }, + "data_stat_nbatch": 20, +} + +model_dos = { + "type_map": ["O", "H", "B"], + "descriptor": { + "type": "se_e2_a", + "sel": [46, 92, 4], + "rcut_smth": 0.50, + "rcut": 4.00, + "neuron": [25, 50, 100], + "resnet_dt": False, + "axis_neuron": 16, + "seed": 1, + }, + "fitting_net": { + "neuron": [24, 24, 24], + "resnet_dt": True, + "seed": 1, + "type": "dos", + "numb_dos": 250, + }, + "data_stat_nbatch": 20, +} + +model_zbl = { + "type_map": ["O", "H", "B"], + "use_srtab": f"{CUR_DIR}/water/data/zbl_tab_potential/H2O_tab_potential.txt", + "smin_alpha": 0.1, + "sw_rmin": 0.2, + "sw_rmax": 4.0, + "descriptor": { + "type": "se_atten", + "sel": 40, + "rcut_smth": 0.5, + "rcut": 4.0, + "neuron": [25, 50, 100], + "axis_neuron": 16, + "attn": 64, + "attn_layer": 2, + "attn_dotr": True, + "attn_mask": False, + "activation_function": "tanh", + "scaling_factor": 1.0, + "normalize": False, + "temperature": 1.0, + "set_davg_zero": True, + "type_one_side": True, + "seed": 1, + }, + "fitting_net": { + "neuron": [24, 24, 24], + "resnet_dt": True, + "seed": 1, + }, + "data_stat_nbatch": 20, +} + + +model_spin = { + "type_map": ["O", "H", "B"], + "descriptor": { + "type": "se_e2_a", + "sel": [46, 92, 4], + "rcut_smth": 0.50, + "rcut": 4.00, + "neuron": [25, 50, 100], + "resnet_dt": False, + "axis_neuron": 16, + "seed": 1, + }, + "fitting_net": { + "neuron": [24, 24, 24], + "resnet_dt": True, + "seed": 1, + }, + "data_stat_nbatch": 20, + "spin": { + "use_spin": [True, False, False], + "virtual_scale": [0.3140], + "_comment": " that's all", + }, +} + +model_dpa2 = { + "type_map": ["O", "H", "B"], + "descriptor": { + "type": "dpa2", + "repinit": { + "rcut": 6.0, + "rcut_smth": 2.0, + "nsel": 100, + "neuron": [2, 4, 8], + "axis_neuron": 4, + "activation_function": "tanh", + }, + "repformer": { + "rcut": 4.0, + "rcut_smth": 0.5, + "nsel": 40, + "nlayers": 12, + "g1_dim": 8, + "g2_dim": 5, + "attn2_hidden": 3, + "attn2_nhead": 1, + "attn1_hidden": 5, + "attn1_nhead": 1, + "axis_neuron": 4, + "update_h2": False, + "update_g1_has_conv": True, + "update_g1_has_grrg": True, + "update_g1_has_drrd": True, + "update_g1_has_attn": True, + "update_g2_has_g1g1": True, + "update_g2_has_attn": True, + "attn2_has_gate": True, + }, + "seed": 1, + "add_tebd_to_repinit_out": False, + }, + "fitting_net": { + "neuron": [24, 24], + "resnet_dt": True, + "seed": 1, + }, +} + +model_dpa2tebd = { + "type_map": ["O", "H", "B"], + "descriptor": { + "type": "dpa2", + "repinit": { + "rcut": 6.0, + "rcut_smth": 0.5, + "nsel": 100, + "neuron": [2, 4, 8], + "axis_neuron": 4, + "activation_function": "tanh", + "three_body_sel": 40, + "three_body_rcut": 4.0, + "three_body_rcut_smth": 3.5, + "use_three_body": True, + }, + "repformer": { + "rcut": 4.0, + "rcut_smth": 0.5, + "nsel": 40, + "nlayers": 6, + "g1_dim": 8, + "g2_dim": 5, + "attn2_hidden": 3, + "attn2_nhead": 1, + "attn1_hidden": 5, + "attn1_nhead": 1, + "axis_neuron": 4, + "update_h2": False, + "update_g1_has_conv": True, + "update_g1_has_grrg": True, + "update_g1_has_drrd": True, + "update_g1_has_attn": False, + "update_g2_has_g1g1": False, + "update_g2_has_attn": True, + "update_style": "res_residual", + "update_residual": 0.01, + "update_residual_init": "norm", + "attn2_has_gate": True, + "use_sqrt_nnei": True, + "g1_out_conv": True, + "g1_out_mlp": True, + }, + "seed": 1, + "add_tebd_to_repinit_out": False, + }, + "fitting_net": { + "neuron": [24, 24], + "resnet_dt": True, + "seed": 1, + }, +} + +model_dpa1 = { + "type_map": ["O", "H", "B"], + 
"descriptor": { + "type": "se_atten", + "sel": 40, + "rcut_smth": 0.5, + "rcut": 4.0, + "neuron": [25, 50, 100], + "axis_neuron": 16, + "attn": 64, + "attn_layer": 2, + "attn_dotr": True, + "attn_mask": False, + "activation_function": "tanh", + "scaling_factor": 1.0, + "normalize": False, + "temperature": 1.0, + "set_davg_zero": True, + "type_one_side": True, + "seed": 1, + }, + "fitting_net": { + "neuron": [24, 24, 24], + "resnet_dt": True, + "seed": 1, + }, +} + + +model_hybrid = { + "type_map": ["O", "H", "B"], + "descriptor": { + "type": "hybrid", + "list": [ + { + "type": "se_atten", + "sel": 120, + "rcut_smth": 0.5, + "rcut": 6.0, + "neuron": [25, 50, 100], + "axis_neuron": 16, + "attn": 128, + "attn_layer": 0, + "attn_dotr": True, + "attn_mask": False, + "activation_function": "tanh", + "scaling_factor": 1.0, + "normalize": True, + "temperature": 1.0, + "seed": 1, + }, + { + "type": "dpa2", + "repinit": { + "rcut": 6.0, + "rcut_smth": 2.0, + "nsel": 30, + "neuron": [2, 4, 8], + "axis_neuron": 4, + "activation_function": "tanh", + }, + "repformer": { + "rcut": 4.0, + "rcut_smth": 0.5, + "nsel": 10, + "nlayers": 12, + "g1_dim": 8, + "g2_dim": 5, + "attn2_hidden": 3, + "attn2_nhead": 1, + "attn1_hidden": 5, + "attn1_nhead": 1, + "axis_neuron": 4, + "update_h2": False, + "update_g1_has_conv": True, + "update_g1_has_grrg": True, + "update_g1_has_drrd": True, + "update_g1_has_attn": True, + "update_g2_has_g1g1": True, + "update_g2_has_attn": True, + "attn2_has_gate": True, + }, + "seed": 1, + "add_tebd_to_repinit_out": False, + }, + ], + }, + "fitting_net": { + "neuron": [240, 240, 240], + "resnet_dt": True, + "seed": 1, + "_comment": " that's all", + }, + "_comment": " that's all", +} + +model_property = { + "type_map": ["H", "C", "N", "O"], + "descriptor": { + "type": "se_e2_a", + "sel": [3, 3, 3, 3], + "rcut_smth": 0.50, + "rcut": 4.00, + "neuron": [25, 50, 100], + "resnet_dt": False, + "axis_neuron": 16, + "seed": 1, + }, + "fitting_net": { + "type": "property", + "task_dim": 3, + "neuron": [24, 24, 24], + "resnet_dt": True, + "bias_method": "normal", + "intensive": True, + "seed": 1, + }, +} + + +class PermutationTest: + def test( + self, + ): + natoms = 5 + generator = paddle.seed(GLOBAL_SEED) + cell = paddle.rand([3, 3], dtype=dtype) + cell = (cell + cell.T) + 5.0 * paddle.eye(3) + coord = paddle.rand([natoms, 3], dtype=dtype) + spin = paddle.rand([natoms, 3], dtype=dtype) + coord = paddle.matmul(coord, cell) + atype = paddle.to_tensor([0, 0, 0, 1, 1], dtype=paddle.int32) + idx_perm = [1, 0, 4, 3, 2] + test_spin = getattr(self, "test_spin", False) + if not test_spin: + test_keys = ["energy", "force", "virial"] + else: + test_keys = ["energy", "force", "force_mag", "virial"] + result_0 = eval_model( + self.model, + coord.unsqueeze(0), + cell.unsqueeze(0), + atype, + spins=spin.unsqueeze(0), + ) + ret0 = {key: result_0[key].squeeze(0) for key in test_keys} + result_1 = eval_model( + self.model, + coord[idx_perm].unsqueeze(0), + cell.unsqueeze(0), + atype[idx_perm], + spins=spin[idx_perm].unsqueeze(0), + ) + ret1 = {key: result_1[key].squeeze(0) for key in test_keys} + prec = 1e-10 + for key in test_keys: + if key in ["energy"]: + np.testing.assert_allclose( + ret0[key].numpy(), ret1[key].numpy(), rtol=prec, atol=prec + ) + elif key in ["force", "force_mag"]: + np.testing.assert_allclose( + ret0[key][idx_perm].numpy(), ret1[key].numpy(), rtol=prec, atol=prec + ) + elif key == "virial": + if not hasattr(self, "test_virial") or self.test_virial: + np.testing.assert_allclose( + 
ret0[key], ret1[key], rtol=prec, atol=prec + ) + else: + raise RuntimeError(f"Unexpected test key {key}") + + +class TestEnergyModelSeA(unittest.TestCase, PermutationTest): + def setUp(self): + model_params = copy.deepcopy(model_se_e2_a) + self.type_split = False + self.model = get_model(model_params).to(env.DEVICE) + + +@unittest.skip("Skip for not implemented yet") +class TestDOSModelSeA(unittest.TestCase, PermutationTest): + def setUp(self): + model_params = copy.deepcopy(model_dos) + self.type_split = False + self.model = get_model(model_params).to(env.DEVICE) + + +@unittest.skip("Skip for not implemented yet") +class TestEnergyModelDPA1(unittest.TestCase, PermutationTest): + def setUp(self): + model_params = copy.deepcopy(model_dpa1) + self.type_split = True + self.model = get_model(model_params).to(env.DEVICE) + + +@unittest.skip("Skip for not implemented yet") +class TestEnergyModelDPA2(unittest.TestCase, PermutationTest): + def setUp(self): + model_params = copy.deepcopy(model_dpa2) + self.type_split = True + self.model = get_model(model_params).to(env.DEVICE) + + +@unittest.skip("Skip for not implemented yet") +class TestForceModelDPA2(unittest.TestCase, PermutationTest): + def setUp(self): + model_params = copy.deepcopy(model_dpa2) + model_params["fitting_net"]["type"] = "direct_force_ener" + self.type_split = True + self.test_virial = False + self.model = get_model(model_params).to(env.DEVICE) + + +@unittest.skip("Skip for not implemented yet") +class TestEnergyModelHybrid(unittest.TestCase, PermutationTest): + def setUp(self): + model_params = copy.deepcopy(model_hybrid) + self.type_split = True + self.model = get_model(model_params).to(env.DEVICE) + + +@unittest.skip("Skip for not implemented yet") +class TestForceModelHybrid(unittest.TestCase, PermutationTest): + def setUp(self): + model_params = copy.deepcopy(model_hybrid) + model_params["fitting_net"]["type"] = "direct_force_ener" + self.type_split = True + self.test_virial = False + self.model = get_model(model_params).to(env.DEVICE) + + +@unittest.skip("Skip for not implemented yet") +class TestEnergyModelZBL(unittest.TestCase, PermutationTest): + def setUp(self): + model_params = copy.deepcopy(model_zbl) + self.type_split = False + self.model = get_model(model_params).to(env.DEVICE) + + +@unittest.skip("Skip for not implemented yet") +class TestEnergyModelSpinSeA(unittest.TestCase, PermutationTest): + def setUp(self): + model_params = copy.deepcopy(model_spin) + self.type_split = False + self.test_spin = True + self.model = get_model(model_params).to(env.DEVICE) + + +# class TestEnergyFoo(unittest.TestCase): +# def test(self): +# model_params = model_dpau +# self.model = EnergyModelDPAUni(model_params).to(env.DEVICE) + +# natoms = 5 +# cell = paddle.rand([3, 3], dtype=dtype) +# cell = (cell + cell.T) + 5. 
* paddle.eye(3) +# coord = paddle.rand([natoms, 3], dtype=dtype) +# coord = paddle.matmul(coord, cell) +# atype = paddle.to_tensor([0, 0, 0, 1, 1]) +# idx_perm = [1, 0, 4, 3, 2] +# ret0 = infer_model(self.model, coord, cell, atype, type_split=True) + + +if __name__ == "__main__": + unittest.main() diff --git a/source/tests/pd/model/test_region.py b/source/tests/pd/model/test_region.py new file mode 100644 index 0000000000..93fa82d8a5 --- /dev/null +++ b/source/tests/pd/model/test_region.py @@ -0,0 +1,62 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +import unittest + +import numpy as np +import paddle + +from deepmd.pd.utils.region import ( + inter2phys, + to_face_distance, +) + +from ...seed import ( + GLOBAL_SEED, +) + +dtype = paddle.float64 + + +class TestRegion(unittest.TestCase): + def setUp(self): + self.cell = paddle.to_tensor( + [[1, 0, 0], [0.4, 0.8, 0], [0.1, 0.3, 2.1]], dtype=dtype, place="cpu" + ) + self.cell = self.cell.unsqueeze(0).unsqueeze(0) + self.cell = paddle.tile(self.cell, [4, 5, 1, 1]) + self.prec = 9e-8 + + def test_inter_to_phys(self): + generator = paddle.seed(GLOBAL_SEED) + inter = paddle.rand([4, 5, 3, 3], dtype=dtype).to(device="cpu") + phys = inter2phys(inter, self.cell) + for ii in range(4): + for jj in range(5): + expected_phys = paddle.matmul(inter[ii, jj], self.cell[ii, jj]) + np.testing.assert_allclose( + phys[ii, jj].numpy(), + expected_phys.numpy(), + rtol=self.prec, + atol=self.prec, + ) + + def test_to_face_dist(self): + cell0 = self.cell[0][0].numpy() + vol = np.linalg.det(cell0) + # area of surfaces xy, xz, yz + sxy = np.linalg.norm(np.cross(cell0[0], cell0[1])) + sxz = np.linalg.norm(np.cross(cell0[0], cell0[2])) + syz = np.linalg.norm(np.cross(cell0[1], cell0[2])) + # vol / area gives distance + dz = vol / sxy + dy = vol / sxz + dx = vol / syz + expected = paddle.to_tensor([dx, dy, dz], place="cpu") + dists = to_face_distance(self.cell) + for ii in range(4): + for jj in range(5): + np.testing.assert_allclose( + dists[ii][jj].numpy(), + expected.numpy(), + rtol=self.prec, + atol=self.prec, + ) diff --git a/source/tests/pd/model/test_rot.py b/source/tests/pd/model/test_rot.py new file mode 100644 index 0000000000..4d59117560 --- /dev/null +++ b/source/tests/pd/model/test_rot.py @@ -0,0 +1,234 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +import copy +import unittest + +import numpy as np +import paddle + +from deepmd.pd.model.model import ( + get_model, +) +from deepmd.pd.utils import ( + env, +) + +from ...seed import ( + GLOBAL_SEED, +) +from ..common import ( + eval_model, +) +from .test_permutation import ( # model_dpau, + model_dos, + model_dpa1, + model_dpa2, + model_hybrid, + model_se_e2_a, + model_spin, + model_zbl, +) + +dtype = paddle.float64 + + +class RotTest: + def test( + self, + ): + generator = paddle.seed(GLOBAL_SEED) + prec = 1e-9 + natoms = 5 + cell = 10.0 * paddle.eye(3, dtype=dtype).to(device=env.DEVICE) + coord = 2 * paddle.rand([natoms, 3], dtype=dtype).to(device=env.DEVICE) + spin = 2 * paddle.rand([natoms, 3], dtype=dtype).to(device=env.DEVICE) + shift = paddle.to_tensor([4, 4, 4], dtype=dtype).to(device=env.DEVICE) + atype = paddle.to_tensor([0, 0, 0, 1, 1], dtype=paddle.int32).to( + device=env.DEVICE + ) + from scipy.stats import ( + special_ortho_group, + ) + + test_spin = getattr(self, "test_spin", False) + if not test_spin: + test_keys = ["energy", "force", "virial"] + else: + test_keys = ["energy", "force", "force_mag"] + rmat = paddle.to_tensor(special_ortho_group.rvs(3), dtype=dtype).to( + device=env.DEVICE + 
) + + # rotate only coord and shift to the center of cell + coord_rot = paddle.matmul(coord, rmat) + spin_rot = paddle.matmul(spin, rmat) + result_0 = eval_model( + self.model, + (coord + shift).unsqueeze(0), + cell.unsqueeze(0), + atype, + spins=spin.unsqueeze(0), + ) + ret0 = {key: result_0[key].squeeze(0) for key in test_keys} + result_1 = eval_model( + self.model, + (coord_rot + shift).unsqueeze(0), + cell.unsqueeze(0), + atype, + spins=spin_rot.unsqueeze(0), + ) + ret1 = {key: result_1[key].squeeze(0) for key in test_keys} + for key in test_keys: + if key in ["energy"]: + np.testing.assert_allclose( + ret0[key].numpy(), ret1[key].numpy(), rtol=prec, atol=prec + ) + elif key in ["force", "force_mag"]: + np.testing.assert_allclose( + paddle.matmul(ret0[key], rmat).numpy(), + ret1[key].numpy(), + rtol=prec, + atol=prec, + ) + elif key == "virial": + if not hasattr(self, "test_virial") or self.test_virial: + np.testing.assert_allclose( + paddle.matmul( + rmat.T, paddle.matmul(ret0[key].reshape([3, 3]), rmat) + ).numpy(), + ret1[key].reshape([3, 3]).numpy(), + rtol=prec, + atol=prec, + ) + else: + raise RuntimeError(f"Unexpected test key {key}") + # rotate coord and cell + paddle.seed(0) + cell = paddle.rand([3, 3], dtype=dtype).to(device=env.DEVICE) + cell = (cell + cell.T) + 5.0 * paddle.eye(3).to(device=env.DEVICE) + coord = paddle.rand([natoms, 3], dtype=dtype).to(device=env.DEVICE) + coord = paddle.matmul(coord, cell) + spin = paddle.rand([natoms, 3], dtype=dtype).to(device=env.DEVICE) + atype = paddle.to_tensor([0, 0, 0, 1, 1], dtype=paddle.int32).to( + device=env.DEVICE + ) + coord_rot = paddle.matmul(coord, rmat) + spin_rot = paddle.matmul(spin, rmat) + cell_rot = paddle.matmul(cell, rmat) + result_0 = eval_model( + self.model, + coord.unsqueeze(0), + cell.unsqueeze(0), + atype, + spins=spin.unsqueeze(0), + ) + ret0 = {key: result_0[key].squeeze(0) for key in test_keys} + result_1 = eval_model( + self.model, + coord_rot.unsqueeze(0), + cell_rot.unsqueeze(0), + atype, + spins=spin_rot.unsqueeze(0), + ) + ret1 = {key: result_1[key].squeeze(0) for key in test_keys} + for key in test_keys: + if key in ["energy"]: + np.testing.assert_allclose( + ret0[key].numpy(), ret1[key].numpy(), rtol=prec, atol=prec + ) + elif key in ["force", "force_mag"]: + np.testing.assert_allclose( + paddle.matmul(ret0[key], rmat).numpy(), + ret1[key].numpy(), + rtol=prec, + atol=prec, + ) + elif key == "virial": + if not hasattr(self, "test_virial") or self.test_virial: + np.testing.assert_allclose( + paddle.matmul( + rmat.T, paddle.matmul(ret0[key].reshape([3, 3]), rmat) + ).numpy(), + ret1[key].reshape([3, 3]).numpy(), + rtol=prec, + atol=prec, + ) + + +class TestEnergyModelSeA(unittest.TestCase, RotTest): + def setUp(self): + model_params = copy.deepcopy(model_se_e2_a) + self.type_split = False + self.model = get_model(model_params).to(env.DEVICE) + + +@unittest.skip("Skip for not implemented yet") +class TestDOSModelSeA(unittest.TestCase, RotTest): + def setUp(self): + model_params = copy.deepcopy(model_dos) + self.type_split = False + self.model = get_model(model_params).to(env.DEVICE) + + +@unittest.skip("Skip for not implemented yet") +class TestEnergyModelDPA1(unittest.TestCase, RotTest): + def setUp(self): + model_params = copy.deepcopy(model_dpa1) + self.type_split = True + self.model = get_model(model_params).to(env.DEVICE) + + +@unittest.skip("Skip for not implemented yet") +class TestEnergyModelDPA2(unittest.TestCase, RotTest): + def setUp(self): + model_params = copy.deepcopy(model_dpa2) + 
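+        # DPA-2 is not yet implemented in the Paddle backend, hence the
+        # class-level skip; the setup is kept in place so this test can be
+        # enabled once the descriptor lands.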
self.type_split = True + self.model = get_model(model_params).to(env.DEVICE) + + +@unittest.skip("Skip for not implemented yet") +class TestForceModelDPA2(unittest.TestCase, RotTest): + def setUp(self): + model_params = copy.deepcopy(model_dpa2) + model_params["fitting_net"]["type"] = "direct_force_ener" + self.type_split = True + self.test_virial = False + self.model = get_model(model_params).to(env.DEVICE) + + +@unittest.skip("Skip for not implemented yet") +class TestEnergyModelHybrid(unittest.TestCase, RotTest): + def setUp(self): + model_params = copy.deepcopy(model_hybrid) + self.type_split = True + self.model = get_model(model_params).to(env.DEVICE) + + +@unittest.skip("Skip for not implemented yet") +class TestForceModelHybrid(unittest.TestCase, RotTest): + def setUp(self): + model_params = copy.deepcopy(model_hybrid) + model_params["fitting_net"]["type"] = "direct_force_ener" + self.type_split = True + self.test_virial = False + self.model = get_model(model_params).to(env.DEVICE) + + +@unittest.skip("Skip for not implemented yet") +class TestEnergyModelZBL(unittest.TestCase, RotTest): + def setUp(self): + model_params = copy.deepcopy(model_zbl) + self.type_split = False + self.model = get_model(model_params).to(env.DEVICE) + + +@unittest.skip("Skip for not implemented yet") +class TestEnergyModelSpinSeA(unittest.TestCase, RotTest): + def setUp(self): + model_params = copy.deepcopy(model_spin) + self.type_split = False + self.test_spin = True + self.model = get_model(model_params).to(env.DEVICE) + + +if __name__ == "__main__": + unittest.main() diff --git a/source/tests/pd/model/test_rotation.py b/source/tests/pd/model/test_rotation.py new file mode 100644 index 0000000000..94e3442631 --- /dev/null +++ b/source/tests/pd/model/test_rotation.py @@ -0,0 +1,113 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +import json +import unittest +from pathlib import ( + Path, +) +from typing import ( + Optional, +) + +import numpy as np +import paddle +from scipy.stats import ( + special_ortho_group, +) + +from deepmd.pd.model.model import ( + get_model, +) +from deepmd.pd.utils import ( + env, +) +from deepmd.utils.data import ( + DeepmdData, +) + + +class CheckSymmetry(DeepmdData): + def __init__( + self, + sys_path: str, + type_map: Optional[list[str]] = None, + ): + super().__init__(sys_path=sys_path, type_map=type_map) + self.add("energy", 1, atomic=False, must=False, high_prec=True) + self.add("force", 3, atomic=True, must=False, high_prec=False) + self.add("virial", 9, atomic=False, must=False, high_prec=False) + + def get_rotation(self, index, rotation_matrix): + for i in range( + 0, len(self.dirs) + 1 + ): # note: if different sets can be merged, prefix sum is unused to calculate + if index < self.prefix_sum[i]: + break + frames = self._load_set(self.dirs[i - 1]) + frames["coord"] = np.dot( + rotation_matrix, frames["coord"].reshape(-1, 3).T + ).T.reshape(self.nframes, -1) + frames["box"] = np.dot( + rotation_matrix, frames["box"].reshape(-1, 3).T + ).T.reshape(self.nframes, -1) + frames["force"] = np.dot( + rotation_matrix, frames["force"].reshape(-1, 3).T + ).T.reshape(self.nframes, -1) + frame = self._get_subdata(frames, index - self.prefix_sum[i - 1]) + frame = self.reformat_data_torch(frame) + return frame + + +def get_data(batch): + inputs = {} + for key in ["coord", "atype", "box"]: + inputs[key] = paddle.to_tensor(batch[key]).to(device=env.DEVICE) + inputs[key] = inputs[key].unsqueeze(0).to(env.DEVICE) + return inputs + + +class TestRotation(unittest.TestCase): + def 
setUp(self): + with open(str(Path(__file__).parent / "water/se_e2_a.json")) as fin: + self.config = json.load(fin) + data_file = [str(Path(__file__).parent / "water/data/data_0")] + self.config["training"]["training_data"]["systems"] = data_file + self.config["training"]["validation_data"]["systems"] = data_file + self.rotation = special_ortho_group.rvs(3) + device = paddle.get_device() + paddle.set_device("cpu") + self.get_dataset(0) + paddle.set_device(device) + self.get_model() + + def get_model(self): + self.model = get_model(self.config["model"]).to(env.DEVICE) + + def get_dataset(self, system_index=0, batch_index=0): + systems = self.config["training"]["training_data"]["systems"] + type_map = self.config["model"]["type_map"] + dpdatasystem = CheckSymmetry(sys_path=systems[system_index], type_map=type_map) + self.origin_batch = dpdatasystem.get_item_paddle(batch_index) + self.rotated_batch = dpdatasystem.get_rotation(batch_index, self.rotation) + + def test_rotation(self): + result1 = self.model(**get_data(self.origin_batch)) + result2 = self.model(**get_data(self.rotated_batch)) + rotation = paddle.to_tensor(self.rotation).to(env.DEVICE) + np.testing.assert_allclose(result1["energy"].numpy(), result2["energy"].numpy()) + if "force" in result1: + np.testing.assert_allclose( + result2["force"][0].numpy(), + paddle.matmul(rotation, result1["force"][0].T).T.numpy(), + ) + if "virial" in result1: + np.testing.assert_allclose( + result2["virial"][0].view([3, 3]).numpy(), + paddle.matmul( + paddle.matmul(rotation, result1["virial"][0].view([3, 3]).T), + rotation.T, + ).numpy(), + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/source/tests/pd/model/test_saveload_se_e2_a.py b/source/tests/pd/model/test_saveload_se_e2_a.py new file mode 100644 index 0000000000..c1c2ba2cdd --- /dev/null +++ b/source/tests/pd/model/test_saveload_se_e2_a.py @@ -0,0 +1,138 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +import copy +import json +import os +import unittest +from pathlib import ( + Path, +) + +import numpy as np +import paddle +from paddle.io import ( + DataLoader, +) + +from deepmd.pd.loss import ( + EnergyStdLoss, +) +from deepmd.pd.model.model import ( + get_model, +) +from deepmd.pd.train.wrapper import ( + ModelWrapper, +) +from deepmd.pd.utils import ( + env, +) +from deepmd.pd.utils.dataloader import ( + BufferedIterator, + DpLoaderSet, +) +from deepmd.pd.utils.stat import ( + make_stat_input, +) +from deepmd.tf.common import ( + expand_sys_str, +) + + +def get_dataset(config): + model_config = config["model"] + rcut = model_config["descriptor"]["rcut"] + sel = model_config["descriptor"]["sel"] + systems = config["training"]["validation_data"]["systems"] + if isinstance(systems, str): + systems = expand_sys_str(systems) + batch_size = config["training"]["training_data"]["batch_size"] + type_map = model_config["type_map"] + + dataset = DpLoaderSet(systems, batch_size, type_map) + data_stat_nbatch = model_config.get("data_stat_nbatch", 10) + sampled = make_stat_input(dataset.systems, dataset.dataloaders, data_stat_nbatch) + return dataset, sampled + + +class TestSaveLoadSeA(unittest.TestCase): + def setUp(self): + input_json = str(Path(__file__).parent / "water/se_e2_a.json") + with open(input_json) as fin: + self.config = json.load(fin) + data_file = [str(Path(__file__).parent / "water/data/data_0")] + self.config["training"]["training_data"]["systems"] = data_file + self.config["training"]["validation_data"]["systems"] = data_file + 
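+        # the "loss" section of the input JSON does not carry the starter
+        # learning rate itself, so it is copied over from the
+        # "learning_rate" section here before EnergyStdLoss is constructed
+        # below.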
self.config["loss"]["starter_learning_rate"] = self.config["learning_rate"][ + "start_lr" + ] + self.dataset, self.sampled = get_dataset(self.config) + self.training_dataloader = DataLoader( + self.dataset, + batch_sampler=paddle.io.BatchSampler( + sampler=paddle.io.RandomSampler(self.dataset), + drop_last=False, + ), + num_workers=0, # setting to 0 diverges the behavior of its iterator; should be >=1 + collate_fn=lambda batch: batch[0], + ) + device = paddle.get_device() + paddle.set_device("cpu") + self.training_data = BufferedIterator(iter(self.training_dataloader)) + paddle.set_device(device) + self.loss = EnergyStdLoss(**self.config["loss"]) + self.cur_lr = 1 + self.task_key = "Default" + self.input_dict, self.label_dict = self.get_data() + self.start_lr = self.config["learning_rate"]["start_lr"] + + def get_model_result(self, read=False, model_file="tmp_model.pd"): + wrapper = self.create_wrapper() + optimizer = paddle.optimizer.Adam( + learning_rate=self.start_lr, parameters=wrapper.parameters() + ) + optimizer.clear_grad() + if read: + wrapper.set_state_dict(paddle.load(model_file)) + os.remove(model_file) + else: + paddle.save(wrapper.state_dict(), model_file) + result = wrapper( + **self.input_dict, + cur_lr=self.cur_lr, + label=self.label_dict, + task_key=self.task_key, + )[0] + return result + + def create_wrapper(self): + model_config = copy.deepcopy(self.config["model"]) + model = get_model(model_config).to(env.DEVICE) + return ModelWrapper(model, self.loss) + + def get_data(self): + try: + batch_data = next(iter(self.training_data)) + except StopIteration: + # Refresh the status of the dataloader to start from a new epoch + self.training_data = BufferedIterator(iter(self.training_dataloader)) + batch_data = next(iter(self.training_data)) + input_dict = {} + for item in ["coord", "atype", "box"]: + if item in batch_data: + input_dict[item] = batch_data[item].to(env.DEVICE) + else: + input_dict[item] = None + label_dict = {} + for item in ["energy", "force", "virial"]: + if item in batch_data: + label_dict[item] = batch_data[item].to(env.DEVICE) + return input_dict, label_dict + + def test_saveload(self): + result1 = self.get_model_result() + result2 = self.get_model_result(read=True) + for item in result1: + np.testing.assert_allclose(result1[item].numpy(), result2[item].numpy()) + + +if __name__ == "__main__": + unittest.main() diff --git a/source/tests/pd/model/test_se_e2_a.py b/source/tests/pd/model/test_se_e2_a.py new file mode 100644 index 0000000000..b1e6abe5ae --- /dev/null +++ b/source/tests/pd/model/test_se_e2_a.py @@ -0,0 +1,137 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +import itertools +import unittest + +import numpy as np +import paddle + +from deepmd.dpmodel.descriptor import DescrptSeA as DPDescrptSeA +from deepmd.pd.model.descriptor.se_a import ( + DescrptSeA, +) +from deepmd.pd.utils import ( + env, +) +from deepmd.pd.utils.env import ( + PRECISION_DICT, +) + +from ...seed import ( + GLOBAL_SEED, +) +from .test_env_mat import ( + TestCaseSingleFrameWithNlist, +) +from .test_mlp import ( + get_tols, +) + +dtype = env.GLOBAL_PD_FLOAT_PRECISION + + +# to be merged with the tf test case +class TestDescrptSeA(unittest.TestCase, TestCaseSingleFrameWithNlist): + def setUp(self): + TestCaseSingleFrameWithNlist.setUp(self) + + def test_consistency( + self, + ): + rng = np.random.default_rng(GLOBAL_SEED) + nf, nloc, nnei = self.nlist.shape + davg = rng.normal(size=(self.nt, nnei, 4)) + dstd = rng.normal(size=(self.nt, nnei, 4)) + dstd = 0.1 + np.abs(dstd) + + 
for idt, prec, em in itertools.product( + [False, True], + ["float64", "float32"], + [[], [[0, 1]], [[1, 1]]], + ): + dtype = PRECISION_DICT[prec] + rtol, atol = get_tols(prec) + err_msg = f"idt={idt} prec={prec}" + # sea new impl + dd0 = DescrptSeA( + self.rcut, + self.rcut_smth, + self.sel, + precision=prec, + resnet_dt=idt, + exclude_types=em, + seed=GLOBAL_SEED, + ).to(env.DEVICE) + dd0.sea.mean = paddle.to_tensor(davg, dtype=dtype).to(device=env.DEVICE) + dd0.sea.dstd = paddle.to_tensor(dstd, dtype=dtype).to(device=env.DEVICE) + rd0, _, _, _, _ = dd0( + paddle.to_tensor(self.coord_ext, dtype=dtype).to(device=env.DEVICE), + paddle.to_tensor(self.atype_ext, dtype="int64").to(device=env.DEVICE), + paddle.to_tensor(self.nlist, dtype="int64").to(device=env.DEVICE), + ) + # serialization + dd1 = DescrptSeA.deserialize(dd0.serialize()) + rd1, gr1, _, _, sw1 = dd1( + paddle.to_tensor(self.coord_ext, dtype=dtype).to(device=env.DEVICE), + paddle.to_tensor(self.atype_ext, dtype="int64").to(device=env.DEVICE), + paddle.to_tensor(self.nlist, dtype="int64").to(device=env.DEVICE), + ) + np.testing.assert_allclose( + rd0.detach().cpu().numpy(), + rd1.detach().cpu().numpy(), + rtol=rtol, + atol=atol, + err_msg=err_msg, + ) + np.testing.assert_allclose( + rd0.detach().cpu().numpy()[0][self.perm[: self.nloc]], + rd0.detach().cpu().numpy()[1], + rtol=rtol, + atol=atol, + err_msg=err_msg, + ) + # dp impl + dd2 = DPDescrptSeA.deserialize(dd0.serialize()) + rd2, gr2, _, _, sw2 = dd2.call( + self.coord_ext, + self.atype_ext, + self.nlist, + ) + for aa, bb in zip([rd1, gr1, sw1], [rd2, gr2, sw2]): + np.testing.assert_allclose( + aa.detach().cpu().numpy(), + bb, + rtol=rtol, + atol=atol, + err_msg=err_msg, + ) + + def test_jit( + self, + ): + rng = np.random.default_rng(GLOBAL_SEED) + nf, nloc, nnei = self.nlist.shape + davg = rng.normal(size=(self.nt, nnei, 4)) + dstd = rng.normal(size=(self.nt, nnei, 4)) + dstd = 0.1 + np.abs(dstd) + + for idt, prec in itertools.product( + [False, True], + ["float64", "float32"], + ): + dtype = PRECISION_DICT[prec] + rtol, atol = get_tols(prec) + err_msg = f"idt={idt} prec={prec}" + # sea new impl + dd0 = DescrptSeA( + self.rcut, + self.rcut_smth, + self.sel, + precision=prec, + resnet_dt=idt, + seed=GLOBAL_SEED, + ) + dd0.sea.mean = paddle.to_tensor(davg, dtype=dtype).to(device=env.DEVICE) + dd0.sea.dstd = paddle.to_tensor(dstd, dtype=dtype).to(device=env.DEVICE) + dd1 = DescrptSeA.deserialize(dd0.serialize()) + model = paddle.jit.to_static(dd0) + model = paddle.jit.to_static(dd1) diff --git a/source/tests/pd/model/test_smooth.py b/source/tests/pd/model/test_smooth.py new file mode 100644 index 0000000000..7f77a6f188 --- /dev/null +++ b/source/tests/pd/model/test_smooth.py @@ -0,0 +1,172 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +import copy +import unittest + +import numpy as np +import paddle + +from deepmd.pd.model.model import ( + get_model, +) +from deepmd.pd.utils import ( + env, +) + +from ...seed import ( + GLOBAL_SEED, +) +from ..common import ( + eval_model, +) +from .test_permutation import ( # model_dpau, + model_se_e2_a, +) + +dtype = paddle.float64 + + +class SmoothTest: + def test( + self, + ): + generator = paddle.seed(GLOBAL_SEED) + # displacement of atoms + epsilon = 1e-5 if self.epsilon is None else self.epsilon + # required prec. relative prec is not checked. 
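+        # Only the absolute error is checked (rprec = 0.0 below), since
+        # force and virial components can pass through zero at these
+        # geometries, where a relative tolerance would be meaningless. The
+        # two displaced atoms sit at 4.0 - 0.5*epsilon along x and y, i.e.
+        # just inside rcut = 4.0 of model_se_e2_a, so every epsilon shift
+        # below straddles the cutoff and probes the smoothness of the model.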
+ rprec = 0.0 + aprec = 1e-5 if self.aprec is None else self.aprec + + natoms = 10 + cell = 8.6 * paddle.eye(3, dtype=dtype).to(device=env.DEVICE) + atype0 = paddle.arange(3, dtype=dtype).to(device=env.DEVICE) + atype1 = paddle.randint(0, 3, [natoms - 3]).to( + device=env.DEVICE, dtype=atype0.dtype + ) + atype = paddle.concat([atype0, atype1]).reshape([natoms]) + coord0 = ( + paddle.to_tensor( + [ + 0.0, + 0.0, + 0.0, + 4.0 - 0.5 * epsilon, + 0.0, + 0.0, + 0.0, + 4.0 - 0.5 * epsilon, + 0.0, + ], + dtype=dtype, + ) + .reshape([-1, 3]) + .to(device=env.DEVICE) + ) + coord1 = paddle.rand( + [natoms - coord0.shape[0], 3], + dtype=dtype, + ).to(device=env.DEVICE) + coord1 = paddle.matmul(coord1, cell) + coord = paddle.concat([coord0, coord1], axis=0) + spin = paddle.rand([natoms, 3], dtype=dtype).to(device=env.DEVICE) + coord0 = paddle.clone(coord) + coord1 = paddle.clone(coord) + coord1[1][0] += epsilon + coord2 = paddle.clone(coord) + coord2[2][1] += epsilon + coord3 = paddle.clone(coord) + coord3[1][0] += epsilon + coord3[2][1] += epsilon + test_spin = getattr(self, "test_spin", False) + if not test_spin: + test_keys = ["energy", "force", "virial"] + else: + test_keys = ["energy", "force", "force_mag", "virial"] + + result_0 = eval_model( + self.model, + coord0.unsqueeze(0), + cell.unsqueeze(0), + atype, + spins=spin.unsqueeze(0), + ) + ret0 = {key: result_0[key].squeeze(0) for key in test_keys} + result_1 = eval_model( + self.model, + coord1.unsqueeze(0), + cell.unsqueeze(0), + atype, + spins=spin.unsqueeze(0), + ) + ret1 = {key: result_1[key].squeeze(0) for key in test_keys} + result_2 = eval_model( + self.model, + coord2.unsqueeze(0), + cell.unsqueeze(0), + atype, + spins=spin.unsqueeze(0), + ) + ret2 = {key: result_2[key].squeeze(0) for key in test_keys} + result_3 = eval_model( + self.model, + coord3.unsqueeze(0), + cell.unsqueeze(0), + atype, + spins=spin.unsqueeze(0), + ) + ret3 = {key: result_3[key].squeeze(0) for key in test_keys} + + def compare(ret0, ret1): + for key in test_keys: + if key in ["energy"]: + np.testing.assert_allclose( + ret0[key].numpy(), ret1[key].numpy(), rtol=rprec, atol=aprec + ) + elif key in ["force", "force_mag"]: + # plus 1. to avoid the divided-by-zero issue + np.testing.assert_allclose( + (1.0 + ret0[key]).numpy(), + (1.0 + ret1[key]).numpy(), + rtol=rprec, + atol=aprec, + ) + elif key == "virial": + if not hasattr(self, "test_virial") or self.test_virial: + np.testing.assert_allclose( + (1.0 + ret0[key]).numpy(), + (1.0 + ret1[key]).numpy(), + rtol=rprec, + atol=aprec, + ) + else: + raise RuntimeError(f"Unexpected test key {key}") + + compare(ret0, ret1) + compare(ret1, ret2) + compare(ret0, ret3) + + +class TestEnergyModelSeA(unittest.TestCase, SmoothTest): + def setUp(self): + model_params = copy.deepcopy(model_se_e2_a) + self.type_split = False + self.model = get_model(model_params).to(env.DEVICE) + self.epsilon, self.aprec = None, None + + +# class TestEnergyFoo(unittest.TestCase): +# def test(self): +# model_params = model_dpau +# self.model = EnergyModelDPAUni(model_params).to(env.DEVICE) + +# natoms = 5 +# cell = paddle.rand([3, 3], dtype=dtype) +# cell = (cell + cell.T) + 5. 
* paddle.eye(3) +# coord = paddle.rand([natoms, 3], dtype=dtype) +# coord = paddle.matmul(coord, cell) +# atype = paddle.to_tensor([0, 0, 0, 1, 1]) +# idx_perm = [1, 0, 4, 3, 2] +# ret0 = infer_model(self.model, coord, cell, atype, type_split=True) + + +if __name__ == "__main__": + unittest.main() diff --git a/source/tests/pd/model/test_trans.py b/source/tests/pd/model/test_trans.py new file mode 100644 index 0000000000..f69d2f5b83 --- /dev/null +++ b/source/tests/pd/model/test_trans.py @@ -0,0 +1,168 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +import copy +import unittest + +import numpy as np +import paddle + +from deepmd.pd.model.model import ( + get_model, +) +from deepmd.pd.utils import ( + env, +) + +from ...seed import ( + GLOBAL_SEED, +) +from ..common import ( + eval_model, +) +from .test_permutation import ( # model_dpau, + model_dos, + model_dpa1, + model_dpa2, + model_hybrid, + model_se_e2_a, + model_spin, + model_zbl, +) + +dtype = paddle.float64 + + +class TransTest: + def test( + self, + ): + natoms = 5 + generator = paddle.seed(GLOBAL_SEED) + cell = paddle.rand([3, 3], dtype=dtype).to(device=env.DEVICE) + cell = (cell + cell.T) + 5.0 * paddle.eye(3).to(device=env.DEVICE) + coord = paddle.rand([natoms, 3], dtype=dtype).to(device=env.DEVICE) + coord = paddle.matmul(coord, cell) + spin = paddle.rand([natoms, 3], dtype=dtype).to(device=env.DEVICE) + atype = paddle.to_tensor([0, 0, 0, 1, 1], dtype=paddle.int32).to( + device=env.DEVICE + ) + shift = (paddle.rand([3], dtype=dtype).to(device=env.DEVICE) - 0.5) * 2.0 + coord_s = paddle.matmul( + paddle.remainder( + paddle.matmul(coord + shift, paddle.linalg.inv(cell)), paddle.ones([]) + ), + cell, + ) + test_spin = getattr(self, "test_spin", False) + if not test_spin: + test_keys = ["energy", "force", "virial"] + else: + test_keys = ["energy", "force", "force_mag", "virial"] + result_0 = eval_model( + self.model, + coord.unsqueeze(0), + cell.unsqueeze(0), + atype, + spins=spin.unsqueeze(0), + ) + ret0 = {key: result_0[key].squeeze(0) for key in test_keys} + result_1 = eval_model( + self.model, + coord_s.unsqueeze(0), + cell.unsqueeze(0), + atype, + spins=spin.unsqueeze(0), + ) + ret1 = {key: result_1[key].squeeze(0) for key in test_keys} + prec = 1e-7 + for key in test_keys: + if key in ["energy", "force", "force_mag"]: + np.testing.assert_allclose( + ret0[key].numpy(), ret1[key].numpy(), rtol=prec, atol=prec + ) + elif key == "virial": + if not hasattr(self, "test_virial") or self.test_virial: + np.testing.assert_allclose( + ret0[key].numpy(), ret1[key].numpy(), rtol=prec, atol=prec + ) + else: + raise RuntimeError(f"Unexpected test key {key}") + + +class TestEnergyModelSeA(unittest.TestCase, TransTest): + def setUp(self): + model_params = copy.deepcopy(model_se_e2_a) + self.type_split = False + self.model = get_model(model_params).to(env.DEVICE) + + +@unittest.skip("Skip for not implemented yet") +class TestDOSModelSeA(unittest.TestCase, TransTest): + def setUp(self): + model_params = copy.deepcopy(model_dos) + self.type_split = False + self.model = get_model(model_params).to(env.DEVICE) + + +@unittest.skip("Skip for not implemented yet") +class TestEnergyModelDPA1(unittest.TestCase, TransTest): + def setUp(self): + model_params = copy.deepcopy(model_dpa1) + self.type_split = True + self.model = get_model(model_params).to(env.DEVICE) + + +@unittest.skip("Skip for not implemented yet") +class TestEnergyModelDPA2(unittest.TestCase, TransTest): + def setUp(self): + model_params = copy.deepcopy(model_dpa2) + self.type_split = 
True + self.model = get_model(model_params).to(env.DEVICE) + + +@unittest.skip("Skip for not implemented yet") +class TestForceModelDPA2(unittest.TestCase, TransTest): + def setUp(self): + model_params = copy.deepcopy(model_dpa2) + model_params["fitting_net"]["type"] = "direct_force_ener" + self.type_split = True + self.test_virial = False + self.model = get_model(model_params).to(env.DEVICE) + + +@unittest.skip("Skip for not implemented yet") +class TestEnergyModelHybrid(unittest.TestCase, TransTest): + def setUp(self): + model_params = copy.deepcopy(model_hybrid) + self.type_split = True + self.model = get_model(model_params).to(env.DEVICE) + + +@unittest.skip("Skip for not implemented yet") +class TestForceModelHybrid(unittest.TestCase, TransTest): + def setUp(self): + model_params = copy.deepcopy(model_hybrid) + model_params["fitting_net"]["type"] = "direct_force_ener" + self.type_split = True + self.test_virial = False + self.model = get_model(model_params).to(env.DEVICE) + + +@unittest.skip("Skip for not implemented yet") +class TestEnergyModelZBL(unittest.TestCase, TransTest): + def setUp(self): + model_params = copy.deepcopy(model_zbl) + self.type_split = False + self.model = get_model(model_params).to(env.DEVICE) + + +@unittest.skip("Skip for not implemented yet") +class TestEnergyModelSpinSeA(unittest.TestCase, TransTest): + def setUp(self): + model_params = copy.deepcopy(model_spin) + self.type_split = False + self.test_spin = True + self.model = get_model(model_params).to(env.DEVICE) + + +if __name__ == "__main__": + unittest.main() diff --git a/source/tests/pd/model/water/data/data_0/set.000/box.npy b/source/tests/pd/model/water/data/data_0/set.000/box.npy new file mode 100644 index 0000000000..6ad2de625b Binary files /dev/null and b/source/tests/pd/model/water/data/data_0/set.000/box.npy differ diff --git a/source/tests/pd/model/water/data/data_0/set.000/coord.npy b/source/tests/pd/model/water/data/data_0/set.000/coord.npy new file mode 100644 index 0000000000..8bd448b125 Binary files /dev/null and b/source/tests/pd/model/water/data/data_0/set.000/coord.npy differ diff --git a/source/tests/pd/model/water/data/data_0/set.000/energy.npy b/source/tests/pd/model/water/data/data_0/set.000/energy.npy new file mode 100644 index 0000000000..d03db103f5 Binary files /dev/null and b/source/tests/pd/model/water/data/data_0/set.000/energy.npy differ diff --git a/source/tests/pd/model/water/data/data_0/set.000/force.npy b/source/tests/pd/model/water/data/data_0/set.000/force.npy new file mode 100644 index 0000000000..10b2ab83a2 Binary files /dev/null and b/source/tests/pd/model/water/data/data_0/set.000/force.npy differ diff --git a/source/tests/pd/model/water/data/data_0/type.raw b/source/tests/pd/model/water/data/data_0/type.raw new file mode 100644 index 0000000000..97e8fdfcf8 --- /dev/null +++ b/source/tests/pd/model/water/data/data_0/type.raw @@ -0,0 +1,192 @@ +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 diff --git 
a/source/tests/pd/model/water/data/data_0/type_map.raw b/source/tests/pd/model/water/data/data_0/type_map.raw new file mode 100644 index 0000000000..e900768b1d --- /dev/null +++ b/source/tests/pd/model/water/data/data_0/type_map.raw @@ -0,0 +1,2 @@ +O +H diff --git a/source/tests/pd/model/water/data/single/set.000/box.npy b/source/tests/pd/model/water/data/single/set.000/box.npy new file mode 100644 index 0000000000..65897e0f9c Binary files /dev/null and b/source/tests/pd/model/water/data/single/set.000/box.npy differ diff --git a/source/tests/pd/model/water/data/single/set.000/coord.npy b/source/tests/pd/model/water/data/single/set.000/coord.npy new file mode 100644 index 0000000000..6e0594a803 Binary files /dev/null and b/source/tests/pd/model/water/data/single/set.000/coord.npy differ diff --git a/source/tests/pd/model/water/data/single/set.000/energy.npy b/source/tests/pd/model/water/data/single/set.000/energy.npy new file mode 100644 index 0000000000..a0a88fb78a Binary files /dev/null and b/source/tests/pd/model/water/data/single/set.000/energy.npy differ diff --git a/source/tests/pd/model/water/data/single/set.000/force.npy b/source/tests/pd/model/water/data/single/set.000/force.npy new file mode 100644 index 0000000000..d5b847a86e Binary files /dev/null and b/source/tests/pd/model/water/data/single/set.000/force.npy differ diff --git a/source/tests/pd/model/water/data/single/type.raw b/source/tests/pd/model/water/data/single/type.raw new file mode 100644 index 0000000000..97e8fdfcf8 --- /dev/null +++ b/source/tests/pd/model/water/data/single/type.raw @@ -0,0 +1,192 @@ +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 diff --git a/source/tests/pd/model/water/data/single/type_map.raw b/source/tests/pd/model/water/data/single/type_map.raw new file mode 100644 index 0000000000..e900768b1d --- /dev/null +++ b/source/tests/pd/model/water/data/single/type_map.raw @@ -0,0 +1,2 @@ +O +H diff --git a/source/tests/pd/model/water/multitask.json b/source/tests/pd/model/water/multitask.json new file mode 100644 index 0000000000..83524a8b77 --- /dev/null +++ b/source/tests/pd/model/water/multitask.json @@ -0,0 +1,140 @@ +{ + "model": { + "shared_dict": { + "my_type_map": [ + "O", + "H", + "B" + ], + "my_descriptor": { + "type": "se_e2_a", + "sel": [ + 46, + 92 + ], + "rcut_smth": 0.50, + "rcut": 6.00, + "neuron": [ + 25, + 50, + 100 + ], + "resnet_dt": false, + "axis_neuron": 16, + "seed": 1, + "_comment": " that's all" + }, + "_comment": "that's all" + }, + "model_dict": { + "model_1": { + "type_map": "my_type_map", + "descriptor": "my_descriptor", + "fitting_net": { + "neuron": [ + 240, + 240, + 240 + ], + "resnet_dt": true, + "seed": 1, + "_comment": " that's all" + }, + "data_stat_nbatch": 1 + }, + "model_2": { + "type_map": "my_type_map", + "descriptor": "my_descriptor", + "fitting_net": { + "neuron": [ + 240, + 240, + 240 + ], + "resnet_dt": true, + "seed": 1, + "_comment": " that's all" + }, + "data_stat_nbatch": 1 + } + } + }, + "learning_rate": { + "type": "exp", + 
"decay_steps": 5000, + "start_lr": 0.0002, + "decay_rate": 0.98, + "stop_lr": 3.51e-08, + "_comment": "that's all" + }, + "loss_dict": { + "model_1": { + "type": "ener", + "start_pref_e": 0.02, + "limit_pref_e": 1, + "start_pref_f": 1000, + "limit_pref_f": 1, + "start_pref_v": 0, + "limit_pref_v": 0 + }, + "model_2": { + "type": "ener", + "start_pref_e": 0.02, + "limit_pref_e": 1, + "start_pref_f": 1000, + "limit_pref_f": 1, + "start_pref_v": 0, + "limit_pref_v": 0 + } + }, + "training": { + "model_prob": { + "model_1": 0.5, + "model_2": 0.5 + }, + "data_dict": { + "model_1": { + "stat_file": "./stat_files/model_1.hdf5", + "training_data": { + "systems": [ + "pd/water/data/data_0" + ], + "batch_size": 1, + "_comment": "that's all" + }, + "validation_data": { + "systems": [ + "pd/water/data/data_0" + ], + "batch_size": 1, + "_comment": "that's all" + } + }, + "model_2": { + "stat_file": "./stat_files/model_2.hdf5", + "training_data": { + "systems": [ + "pd/water/data/data_0" + ], + "batch_size": 1, + "_comment": "that's all" + }, + "validation_data": { + "systems": [ + "pd/water/data/data_0" + ], + "batch_size": 1, + "_comment": "that's all" + } + } + }, + "numb_steps": 100000, + "warmup_steps": 0, + "gradient_max_norm": 5.0, + "seed": 10, + "disp_file": "lcurve.out", + "disp_freq": 100, + "save_freq": 100, + "_comment": "that's all" + } +} diff --git a/source/tests/pd/model/water/se_atten.json b/source/tests/pd/model/water/se_atten.json new file mode 100644 index 0000000000..70abf6759c --- /dev/null +++ b/source/tests/pd/model/water/se_atten.json @@ -0,0 +1,83 @@ +{ + "_comment": "that's all", + "model": { + "type_map": [ + "O", + "H" + ], + "descriptor": { + "type": "se_atten", + "sel": 40, + "rcut_smth": 0.5, + "rcut": 4.0, + "neuron": [ + 25, + 50, + 100 + ], + "axis_neuron": 16, + "type_one_side": true, + "attn": 64, + "attn_layer": 2, + "attn_dotr": true, + "attn_mask": false, + "activation_function": "tanh", + "scaling_factor": 1.0, + "normalize": false, + "temperature": 1.0, + "seed": 1 + }, + "fitting_net": { + "neuron": [ + 240, + 240, + 240 + ], + "resnet_dt": true, + "seed": 1, + "_comment": " that's all" + }, + "_comment": " that's all" + }, + "learning_rate": { + "type": "exp", + "decay_steps": 5000, + "start_lr": 0.001, + "stop_lr": 3.51e-08, + "_comment": "that's all" + }, + "loss": { + "type": "ener", + "start_pref_e": 0.02, + "limit_pref_e": 1, + "start_pref_f": 1000, + "limit_pref_f": 1, + "start_pref_v": 0, + "limit_pref_v": 0, + "_comment": " that's all" + }, + "training": { + "training_data": { + "systems": [ + "pd/water/data/data_0" + ], + "batch_size": 1, + "_comment": "that's all" + }, + "validation_data": { + "systems": [ + "pd/water/data/data_0" + ], + "batch_size": 1, + "numb_btch": 1, + "_comment": "that's all" + }, + "numb_steps": 1000000, + "seed": 10, + "disp_file": "lcurve.out", + "disp_freq": 100, + "save_freq": 1000, + "save_ckpt": "model", + "_comment": "that's all" + } +} diff --git a/source/tests/pd/model/water/se_e2_a.json b/source/tests/pd/model/water/se_e2_a.json new file mode 100644 index 0000000000..96f51ba5aa --- /dev/null +++ b/source/tests/pd/model/water/se_e2_a.json @@ -0,0 +1,77 @@ +{ + "model": { + "type_map": [ + "O", + "H" + ], + "descriptor": { + "type": "se_e2_a", + "sel": [ + 46, + 92 + ], + "rcut_smth": 0.50, + "rcut": 6.00, + "neuron": [ + 25, + 50, + 100 + ], + "resnet_dt": false, + "axis_neuron": 16, + "seed": 1, + "_comment": " that's all" + }, + "fitting_net": { + "neuron": [ + 240, + 240, + 240 + ], + "resnet_dt": true, + "seed": 
1, + "_comment": " that's all" + }, + "data_stat_nbatch": 20, + "_comment": " that's all" + }, + "learning_rate": { + "type": "exp", + "decay_steps": 5000, + "start_lr": 0.001, + "stop_lr": 3.51e-8, + "_comment": "that's all" + }, + "loss": { + "type": "ener", + "start_pref_e": 0.02, + "limit_pref_e": 1, + "start_pref_f": 1000, + "limit_pref_f": 1, + "_comment": " that's all" + }, + "training": { + "training_data": { + "systems": [ + "pd/water/data/data_0" + ], + "batch_size": 1, + "_comment": "that's all" + }, + "validation_data": { + "systems": [ + "pd/water/data/data_0" + ], + "batch_size": 1, + "numb_btch": 3, + "_comment": "that's all" + }, + "numb_steps": 100000, + "seed": 10, + "disp_file": "lcurve.out", + "disp_freq": 100, + "save_freq": 10000, + "_comment": "that's all" + }, + "_comment": "that's all" +} diff --git a/source/tests/pd/test_auto_batch_size.py b/source/tests/pd/test_auto_batch_size.py new file mode 100644 index 0000000000..966333f47c --- /dev/null +++ b/source/tests/pd/test_auto_batch_size.py @@ -0,0 +1,38 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +import unittest + +import numpy as np +import paddle + +from deepmd.pd.utils.auto_batch_size import ( + AutoBatchSize, +) + + +class TestAutoBatchSize(unittest.TestCase): + def test_execute_all(self): + dd0 = paddle.zeros((10000, 2, 1, 3, 4)) + dd1 = paddle.ones((10000, 2, 1, 3, 4)) + auto_batch_size = AutoBatchSize(256, 2.0) + + def func(dd1): + return paddle.zeros_like(dd1), paddle.ones_like(dd1) + + dd2 = auto_batch_size.execute_all(func, 10000, 2, dd1) + np.testing.assert_equal(dd0.numpy(), dd2[0].numpy()) + np.testing.assert_equal(dd1.numpy(), dd2[1].numpy()) + + def test_execute_all_dict(self): + dd0 = paddle.zeros((10000, 2, 1, 3, 4)) + dd1 = paddle.ones((10000, 2, 1, 3, 4)) + auto_batch_size = AutoBatchSize(256, 2.0) + + def func(dd1): + return { + "foo": paddle.zeros_like(dd1), + "bar": paddle.ones_like(dd1), + } + + dd2 = auto_batch_size.execute_all(func, 10000, 2, dd1) + np.testing.assert_equal(dd0.numpy(), dd2["foo"].numpy()) + np.testing.assert_equal(dd1.numpy(), dd2["bar"].numpy()) diff --git a/source/tests/pd/test_change_bias.py b/source/tests/pd/test_change_bias.py new file mode 100644 index 0000000000..2d87b739ff --- /dev/null +++ b/source/tests/pd/test_change_bias.py @@ -0,0 +1,150 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +import json +import os +import shutil +import tempfile +import unittest +from copy import ( + deepcopy, +) +from pathlib import ( + Path, +) + +import numpy as np +import paddle + +from deepmd.pd.entrypoints.main import ( + get_trainer, +) +from deepmd.pd.train.training import ( + get_model_for_wrapper, + model_change_out_bias, +) +from deepmd.pd.train.wrapper import ( + ModelWrapper, +) +from deepmd.pd.utils.dataloader import ( + DpLoaderSet, +) +from deepmd.pd.utils.stat import ( + make_stat_input, +) +from deepmd.pd.utils.utils import ( + to_paddle_tensor, +) + +from .common import ( + run_dp, +) +from .model.test_permutation import ( + model_se_e2_a, +) +from .test_finetune import ( + energy_data_requirement, +) + +current_path = os.getcwd() + + +class TestChangeBias(unittest.TestCase): + def setUp(self): + input_json = str(Path(__file__).parent / "water/se_atten.json") + with open(input_json) as f: + self.config = json.load(f) + model_name = "change-bias-model.ckpt" + self.data_file = [str(Path(__file__).parent / "water/data/single")] + self.config["training"]["training_data"]["systems"] = self.data_file + self.config["training"]["validation_data"]["systems"] = 
self.data_file + self.config["model"] = deepcopy(model_se_e2_a) + self.config["training"]["numb_steps"] = 1 + self.config["training"]["save_freq"] = 1 + self.config["training"]["save_ckpt"] = model_name + self.trainer = get_trainer(deepcopy(self.config)) + self.trainer.run() + self.state_dict_trained = self.trainer.wrapper.model.state_dict() + data = DpLoaderSet( + self.data_file, + batch_size=1, + type_map=self.config["model"]["type_map"], + ) + data.add_data_requirement(energy_data_requirement) + self.sampled = make_stat_input( + data.systems, + data.dataloaders, + nbatches=1, + ) + self.model_path = Path(current_path) / (model_name + ".pd") + self.model_path_data_bias = Path(current_path) / ( + model_name + "data_bias" + ".pd" + ) + self.model_path_data_file_bias = Path(current_path) / ( + model_name + "data_file_bias" + ".pd" + ) + self.model_path_user_bias = Path(current_path) / ( + model_name + "user_bias" + ".pd" + ) + + def test_change_bias_with_data(self): + run_dp( + f"dp --pd change-bias {self.model_path!s} -s {self.data_file[0]} -o {self.model_path_data_bias!s}" + ) + state_dict = paddle.load(str(self.model_path_data_bias)) + model_params = state_dict["model"]["_extra_state"]["model_params"] + model_for_wrapper = get_model_for_wrapper(model_params) + wrapper = ModelWrapper(model_for_wrapper) + wrapper.set_state_dict(state_dict["model"]) + updated_bias = wrapper.model["Default"].get_out_bias() + expected_model = model_change_out_bias( + self.trainer.wrapper.model["Default"], + self.sampled, + _bias_adjust_mode="change-by-statistic", + ) + expected_bias = expected_model.get_out_bias() + np.testing.assert_allclose(updated_bias.numpy(), expected_bias.numpy()) + + def test_change_bias_with_data_sys_file(self): + tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".txt") + with open(tmp_file.name, "w") as f: + f.writelines([sys + "\n" for sys in self.data_file]) + run_dp( + f"dp --pd change-bias {self.model_path!s} -f {tmp_file.name} -o {self.model_path_data_file_bias!s}" + ) + state_dict = paddle.load(str(self.model_path_data_file_bias)) + model_params = state_dict["model"]["_extra_state"]["model_params"] + model_for_wrapper = get_model_for_wrapper(model_params) + wrapper = ModelWrapper(model_for_wrapper) + wrapper.set_state_dict(state_dict["model"]) + updated_bias = wrapper.model["Default"].get_out_bias() + expected_model = model_change_out_bias( + self.trainer.wrapper.model["Default"], + self.sampled, + _bias_adjust_mode="change-by-statistic", + ) + expected_bias = expected_model.get_out_bias() + np.testing.assert_allclose(updated_bias.numpy(), expected_bias.numpy()) + + def test_change_bias_with_user_defined(self): + user_bias = [0.1, 3.2, -0.5] + run_dp( + f"dp --pd change-bias {self.model_path!s} -b {' '.join([str(_) for _ in user_bias])} -o {self.model_path_user_bias!s}" + ) + state_dict = paddle.load(str(self.model_path_user_bias)) + model_params = state_dict["model"]["_extra_state"]["model_params"] + model_for_wrapper = get_model_for_wrapper(model_params) + wrapper = ModelWrapper(model_for_wrapper) + wrapper.set_state_dict(state_dict["model"]) + updated_bias = wrapper.model["Default"].get_out_bias() + expected_bias = to_paddle_tensor(np.array(user_bias)).reshape( + updated_bias.shape + ) + np.testing.assert_allclose(updated_bias.numpy(), expected_bias.numpy()) + + def tearDown(self): + for f in os.listdir("."): + if f.startswith("change-bias-model") and f.endswith(".pd"): + os.remove(f) + if f in ["lcurve.out"]: + os.remove(f) + if f in ["stat_files"]: + 
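+ # "stat_files" is a directory of cached statistics, so it is removed as a tree rather than unlinked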
shutil.rmtree(f) diff --git a/source/tests/pd/test_decomp.py b/source/tests/pd/test_decomp.py new file mode 100644 index 0000000000..d8439ad994 --- /dev/null +++ b/source/tests/pd/test_decomp.py @@ -0,0 +1,131 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +import unittest + +import numpy as np +import paddle + +from deepmd.pd.utils import ( + decomp, +) + +from ..seed import ( + GLOBAL_SEED, +) + + +class TestDecomp(unittest.TestCase): + def setUp(self): + paddle.seed(GLOBAL_SEED) + + def test_softmax_decomp(self): + raw_api = paddle.nn.functional.softmax + decomp_api = decomp.softmax + + raw_input = paddle.randn([100, 100], "float32") + raw_output = raw_api(raw_input) + decomp_output = decomp_api(raw_input) + + np.testing.assert_allclose( + raw_output.numpy(), + decomp_output.numpy(), + 1e-6, + 1e-8, + ) + + def test_norm_decomp(self): + raw_api = paddle.linalg.norm + decomp_api = decomp.norm + + raw_input = paddle.randn([100, 100], "float32") + raw_output = raw_api(raw_input, p=2, axis=-1) + decomp_output = decomp_api(raw_input, p=2, axis=-1) + + np.testing.assert_allclose( + raw_output.numpy(), + decomp_output.numpy(), + 1e-5, + 1e-8, + ) + + def test_take_along_axis_decomp(self): + raw_api = paddle.take_along_axis + decomp_api = decomp.take_along_axis + + raw_input = paddle.randn([100, 100], "float32") + raw_indices = paddle.randint(0, 100, [100, 2]) + raw_output = raw_api(raw_input, raw_indices, axis=-1) + decomp_output = decomp_api(raw_input, raw_indices, axis=-1) + + np.testing.assert_equal( + raw_output.numpy(), + decomp_output.numpy(), + ) + + def test_scatter_reduce_decomp(self): + raw_api = paddle.put_along_axis + decomp_api = decomp.scatter_reduce + raw_input = paddle.randn([100, 100], "float32") + axis = 0 + raw_index = paddle.randint(0, 100, [100, 100], "int64") + raw_values = paddle.randn([100, 100], "float32") + raw_output = raw_api(raw_input, raw_index, raw_values, axis=axis, reduce="add") + decomp_output = decomp_api( + raw_input, axis, raw_index, src=raw_values, reduce="sum" + ) + + np.testing.assert_allclose( + raw_output.numpy(), + decomp_output.numpy(), + 2e-5, + 1e-7, + ) + + def test_sec(self): + shape = [10, 3] + length = shape[0] + size = 3 + + split_sections = decomp.sec(length, size) + assert split_sections == [3, 3, 3, 1] + + def test_masked_add_(self): + decomp_api = decomp.masked_add_ + + raw_input = paddle.randn([10, 10], "float32") + raw_mask = paddle.randint(0, 2, [10, 10]).astype("bool") + add_values = paddle.randn([10, 10], "float32") + raw_output = raw_input.clone() + + for i in range(raw_input.shape[0]): + for j in range(raw_input.shape[1]): + if raw_mask[i][j]: + raw_output[i][j] += add_values[i][j] + + decomp_output = decomp_api(raw_input, raw_mask, add_values[raw_mask]) + + np.testing.assert_equal( + raw_output.numpy(), + decomp_output.numpy(), # inplace + ) + + np.testing.assert_equal( + raw_output.numpy(), + raw_input.numpy(), # inplace + ) + + def test_normalize_decomp(self): + raw_api = paddle.nn.functional.normalize + decomp_api = decomp.normalize_decomp + + raw_input = paddle.randn([100, 100], "float32") + axis = -1 + + raw_output = raw_api(raw_input, p=2, axis=axis) + decomp_output = decomp_api(raw_input, p=2, axis=axis) + + np.testing.assert_allclose( + raw_output.numpy(), + decomp_output.numpy(), # inplace + 1e-5, + 1e-8, + ) diff --git a/source/tests/pd/test_dp_show.py b/source/tests/pd/test_dp_show.py new file mode 100644 index 0000000000..c1c20ff3a1 --- /dev/null +++ b/source/tests/pd/test_dp_show.py @@ -0,0 +1,219 @@ +# 
SPDX-License-Identifier: LGPL-3.0-or-later +import io +import json +import os +import shutil +import unittest +from contextlib import ( + redirect_stderr, +) +from copy import ( + deepcopy, +) +from pathlib import ( + Path, +) + +from deepmd.pd.entrypoints.main import ( + get_trainer, +) +from deepmd.pd.utils.multi_task import ( + preprocess_shared_params, +) + +from .common import ( + run_dp, +) +from .model.test_permutation import ( + model_se_e2_a, +) + + +class TestSingleTaskModel(unittest.TestCase): + def setUp(self): + input_json = str(Path(__file__).parent / "water/se_atten.json") + with open(input_json) as f: + self.config = json.load(f) + os.environ["FLAGS_prim_enable_dynamic"] = "1" + os.environ["FLAGS_enable_pir_api"] = "1" + self.config["training"]["numb_steps"] = 1 + self.config["training"]["save_freq"] = 1 + data_file = [str(Path(__file__).parent / "water/data/single")] + self.config["training"]["training_data"]["systems"] = data_file + self.config["training"]["validation_data"]["systems"] = data_file + self.config["model"] = deepcopy(model_se_e2_a) + self.config["model"]["type_map"] = ["O", "H", "Au"] + trainer = get_trainer(deepcopy(self.config)) + trainer.run() + run_dp("dp --pd freeze") + + def test_checkpoint(self): + INPUT = "model.pd" + ATTRIBUTES = "type-map descriptor fitting-net" + with redirect_stderr(io.StringIO()) as f: + run_dp(f"dp --pd show {INPUT} {ATTRIBUTES}") + results = f.getvalue().split("\n")[:-1] + assert "This is a singletask model" in results[-4] + assert "The type_map is ['O', 'H', 'Au']" in results[-3] + # check every fragment; chaining string literals with "and" only tests the last one + for item in ("{'type': 'se_e2_a'", "'sel': [46, 92, 4]", "'rcut': 4.0"): + assert item in results[-2] + assert ( + "The fitting_net parameter is {'neuron': [24, 24, 24], 'resnet_dt': True, 'seed': 1}" + in results[-1] + ) + + @unittest.skip( + "Paddle does not support 'dp --pd show' for frozen models (.json and .pdiparams files); " + "this will be supported in the future."
+ ) + def test_frozen_model(self): + INPUT = "frozen_model.json" + ATTRIBUTES = "type-map descriptor fitting-net" + with redirect_stderr(io.StringIO()) as f: + run_dp(f"dp --pd show {INPUT} {ATTRIBUTES}") + results = f.getvalue().split("\n")[:-1] + assert "This is a singletask model" in results[-4] + assert "The type_map is ['O', 'H', 'Au']" in results[-3] + for item in ("{'type': 'se_e2_a'", "'sel': [46, 92, 4]", "'rcut': 4.0"): + assert item in results[-2] + assert ( + "The fitting_net parameter is {'neuron': [24, 24, 24], 'resnet_dt': True, 'seed': 1}" + in results[-1] + ) + + def test_checkpoint_error(self): + INPUT = "model.pd" + ATTRIBUTES = "model-branch type-map descriptor fitting-net" + with self.assertRaisesRegex( + RuntimeError, "The 'model-branch' option requires a multitask model" + ): + run_dp(f"dp --pd show {INPUT} {ATTRIBUTES}") + + def tearDown(self): + for f in os.listdir("."): + if f.startswith("model") and f.endswith("pd"): + os.remove(f) + if f in ["lcurve.out", "frozen_model.pd", "output.txt", "checkpoint"]: + os.remove(f) + if f in ["stat_files"]: + shutil.rmtree(f) + + +class TestMultiTaskModel(unittest.TestCase): + def setUp(self): + input_json = str(Path(__file__).parent / "water/multitask.json") + with open(input_json) as f: + self.config = json.load(f) + self.config["model"]["shared_dict"]["my_descriptor"] = model_se_e2_a[ + "descriptor" + ] + data_file = [str(Path(__file__).parent / "water/data/data_0")] + self.stat_files = "se_e2_a" + os.makedirs(self.stat_files, exist_ok=True) + self.config["training"]["data_dict"]["model_1"]["training_data"]["systems"] = ( + data_file + ) + self.config["training"]["data_dict"]["model_1"]["validation_data"][ + "systems" + ] = data_file + self.config["training"]["data_dict"]["model_1"]["stat_file"] = ( + f"{self.stat_files}/model_1" + ) + self.config["training"]["data_dict"]["model_2"]["training_data"]["systems"] = ( + data_file + ) + self.config["training"]["data_dict"]["model_2"]["validation_data"][ + "systems" + ] = data_file + self.config["training"]["data_dict"]["model_2"]["stat_file"] = ( + f"{self.stat_files}/model_2" + ) + self.config["model"]["model_dict"]["model_1"]["fitting_net"] = { + "neuron": [1, 2, 3], + "seed": 678, + } + self.config["model"]["model_dict"]["model_2"]["fitting_net"] = { + "neuron": [9, 8, 7], + "seed": 1111, + } + self.config["training"]["numb_steps"] = 1 + self.config["training"]["save_freq"] = 1 + self.origin_config = deepcopy(self.config) + self.config["model"], self.shared_links = preprocess_shared_params( + self.config["model"] + ) + trainer = get_trainer(deepcopy(self.config), shared_links=self.shared_links) + trainer.run() + run_dp("dp --pd freeze --head model_1") + + def test_checkpoint(self): + INPUT = "model.ckpt.pd" + ATTRIBUTES = "model-branch type-map descriptor fitting-net" + with redirect_stderr(io.StringIO()) as f: + run_dp(f"dp --pd show {INPUT} {ATTRIBUTES}") + results = f.getvalue().split("\n")[:-1] + assert "This is a multitask model" in results[-8] + assert ( + "Available model branches are ['model_1', 'model_2', 'RANDOM'], " + "where 'RANDOM' means using a randomly initialized fitting net."
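+ # "dp --pd show" writes its report to stderr; the captured lines are checked from the end of the output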
+ in results[-7] + ) + assert "The type_map of branch model_1 is ['O', 'H', 'B']" in results[-6] + assert "The type_map of branch model_2 is ['O', 'H', 'B']" in results[-5] + for item in ( + "model_1", + "'type': 'se_e2_a'", + "'sel': [46, 92, 4]", + "'rcut_smth': 0.5", + ): + assert item in results[-4] + for item in ( + "model_2", + "'type': 'se_e2_a'", + "'sel': [46, 92, 4]", + "'rcut_smth': 0.5", + ): + assert item in results[-3] + assert ( + "The fitting_net parameter of branch model_1 is {'neuron': [1, 2, 3], 'seed': 678}" + in results[-2] + ) + assert ( + "The fitting_net parameter of branch model_2 is {'neuron': [9, 8, 7], 'seed': 1111}" + in results[-1] + ) + + @unittest.skip( + "Paddle does not support 'dp --pd show' for frozen models (.json and .pdiparams files); " + "this will be supported in the future." + ) + def test_frozen_model(self): + INPUT = "frozen_model.json" + ATTRIBUTES = "type-map descriptor fitting-net" + with redirect_stderr(io.StringIO()) as f: + run_dp(f"dp --pd show {INPUT} {ATTRIBUTES}") + results = f.getvalue().split("\n")[:-1] + assert "This is a singletask model" in results[-4] + assert "The type_map is ['O', 'H', 'B']" in results[-3] + for item in ("'type': 'se_e2_a'", "'sel': [46, 92, 4]", "'rcut_smth': 0.5"): + assert item in results[-2] + assert ( + "The fitting_net parameter is {'neuron': [1, 2, 3], 'seed': 678}" + in results[-1] + ) + + def tearDown(self): + for f in os.listdir("."): + if f.startswith("model") and f.endswith("pd"): + os.remove(f) + if f in [ + "lcurve.out", + "frozen_model.json", + "frozen_model.pdiparams", + "checkpoint", + "output.txt", + ]: + os.remove(f) + if f in ["stat_files", self.stat_files]: + shutil.rmtree(f) diff --git a/source/tests/pd/test_finetune.py b/source/tests/pd/test_finetune.py new file mode 100644 index 0000000000..2c6cca83aa --- /dev/null +++ b/source/tests/pd/test_finetune.py @@ -0,0 +1,379 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +import json +import os +import shutil +import tempfile +import unittest +from copy import ( + deepcopy, +) +from pathlib import ( + Path, +) + +import numpy as np +import paddle + +from deepmd.infer.deep_eval import ( + DeepEval, +) +from deepmd.pd.entrypoints.main import ( + get_trainer, +) +from deepmd.pd.model.model import ( + get_model, +) +from deepmd.pd.utils import ( + env, +) +from deepmd.pd.utils.dataloader import ( + DpLoaderSet, +) +from deepmd.pd.utils.finetune import ( + get_finetune_rules, +) +from deepmd.pd.utils.stat import ( + make_stat_input, +) +from deepmd.pd.utils.utils import ( + to_numpy_array, + to_paddle_tensor, +) +from deepmd.utils.data import ( + DataRequirementItem, +) + +from .model.test_permutation import ( + model_dos, + model_dpa1, + model_dpa2, + model_se_e2_a, + model_zbl, +) + +energy_data_requirement = [ + DataRequirementItem( + "energy", + ndof=1, + atomic=False, + must=False, + high_prec=True, + ), + DataRequirementItem( + "force", + ndof=3, + atomic=True, + must=False, + high_prec=False, + ), + DataRequirementItem( + "virial", + ndof=9, + atomic=False, + must=False, + high_prec=False, + ), + DataRequirementItem( + "dos", + ndof=250, + atomic=False, + must=False, + high_prec=True, + ), + DataRequirementItem( + "atom_ener", + ndof=1, + atomic=True, + must=False, + high_prec=False, + ), + DataRequirementItem( + "atom_pref", + ndof=1, + atomic=True, + must=False, + high_prec=False, + repeat=3, + ), +] + + +class FinetuneTest: + @unittest.skip( + "Paddle does not support fine-tuning with frozen models (.json and .pdiparams files); " + "this will be supported in the future."
+ ) + def test_finetune_change_out_bias(self): + self.testkey = "energy" if self.testkey is None else self.testkey + # get data + data = DpLoaderSet( + self.data_file, + batch_size=1, + type_map=self.config["model"]["type_map"], + ) + data.add_data_requirement(energy_data_requirement) + sampled = make_stat_input( + data.systems, + data.dataloaders, + nbatches=1, + ) + # make sampled of multiple frames with different atom numbs + numb_atom = sampled[0]["atype"].shape[1] + small_numb_atom = numb_atom // 2 + small_atom_data = deepcopy(sampled[0]) + atomic_key = ["coord", "atype"] + for kk in atomic_key: + small_atom_data[kk] = small_atom_data[kk][:, :small_numb_atom] + scale_pref = float(small_numb_atom / numb_atom) + small_atom_data[self.testkey] *= scale_pref + small_atom_data["natoms"][:, :2] = small_numb_atom + small_atom_data["natoms"][:, 2:] = paddle.bincount( + small_atom_data["atype"][0], + minlength=small_atom_data["natoms"].shape[1] - 2, + ) + sampled = [sampled[0], small_atom_data] + + # get model + model = get_model(self.config["model"]).to(env.DEVICE) + atomic_model = model.atomic_model + atomic_model["out_bias"] = paddle.randn(atomic_model["out_bias"].shape) + energy_bias_before = to_numpy_array(atomic_model["out_bias"])[0] + + # prepare original model for test + dp = paddle.jit.to_static(model) + tmp_model = tempfile.NamedTemporaryFile(delete=False, suffix=".pd") + paddle.jit.save(dp, tmp_model.name) + dp = DeepEval(tmp_model.name) + origin_type_map = ["O", "H"] + full_type_map = ["O", "H", "B"] + + # change energy bias + model.atomic_model.change_out_bias( + sampled, + bias_adjust_mode="change-by-statistic", + ) + energy_bias_after = to_numpy_array(atomic_model["out_bias"])[0] + + # get ground-truth energy bias change + sorter = np.argsort(full_type_map) + idx_type_map = sorter[ + np.searchsorted(full_type_map, origin_type_map, sorter=sorter) + ] + ntest = 1 + atom_nums = np.tile( + np.bincount(to_numpy_array(sampled[0]["atype"][0]))[idx_type_map], + (ntest, 1), + ) + atom_nums_small = np.tile( + np.bincount(to_numpy_array(sampled[1]["atype"][0]))[idx_type_map], + (ntest, 1), + ) + atom_nums = np.concatenate([atom_nums, atom_nums_small], axis=0) + + energy = dp.eval( + to_numpy_array(sampled[0]["coord"][:ntest]), + to_numpy_array(sampled[0]["box"][:ntest]), + to_numpy_array(sampled[0]["atype"][0]), + )[0] + energy_small = dp.eval( + to_numpy_array(sampled[1]["coord"][:ntest]), + to_numpy_array(sampled[1]["box"][:ntest]), + to_numpy_array(sampled[1]["atype"][0]), + )[0] + energy_diff = to_numpy_array(sampled[0][self.testkey][:ntest]) - energy + energy_diff_small = ( + to_numpy_array(sampled[1][self.testkey][:ntest]) - energy_small + ) + energy_diff = np.concatenate([energy_diff, energy_diff_small], axis=0) + finetune_shift = ( + energy_bias_after[idx_type_map] - energy_bias_before[idx_type_map] + ).ravel() + ground_truth_shift = np.linalg.lstsq(atom_nums, energy_diff, rcond=None)[ + 0 + ].reshape(-1) + + # check values + np.testing.assert_almost_equal(finetune_shift, ground_truth_shift, decimal=10) + + self.tearDown() + + def test_finetune_change_type(self): + if not self.mixed_types: + # skip when not mixed_types + return + # get data + data = DpLoaderSet( + self.data_file, + batch_size=1, + type_map=self.config["model"]["type_map"], + ) + data.add_data_requirement(energy_data_requirement) + sampled = make_stat_input( + data.systems, + data.dataloaders, + nbatches=1, + ) + data_type_map = self.config["model"]["type_map"] + for [old_type_map, new_type_map] in [ + [["H", "X1", 
"X2", "O", "B"], ["O", "H", "B"]], + [["O", "H", "B"], ["H", "X1", "X2", "O", "B"]], + ]: + old_type_map_index = np.array( + [old_type_map.index(i) for i in data_type_map], dtype=np.int32 + ) + new_type_map_index = np.array( + [new_type_map.index(i) for i in data_type_map], dtype=np.int32 + ) + + # get pretrained model with old type map + config_old_type_map = deepcopy(self.config) + config_old_type_map["model"]["type_map"] = old_type_map + trainer = get_trainer(config_old_type_map) + trainer.run() + finetune_model = ( + config_old_type_map["training"].get("save_ckpt", "model.ckpt") + ".pd" + ) + + # finetune load the same type_map + config_old_type_map_finetune = deepcopy(self.config) + config_old_type_map_finetune["model"]["type_map"] = old_type_map + config_old_type_map_finetune["model"], finetune_links = get_finetune_rules( + finetune_model, + config_old_type_map_finetune["model"], + ) + trainer_finetune_old = get_trainer( + config_old_type_map_finetune, + finetune_model=finetune_model, + finetune_links=finetune_links, + ) + + # finetune load the slim type_map + config_new_type_map_finetune = deepcopy(self.config) + config_new_type_map_finetune["model"]["type_map"] = new_type_map + config_new_type_map_finetune["model"], finetune_links = get_finetune_rules( + finetune_model, + config_new_type_map_finetune["model"], + ) + trainer_finetune_new = get_trainer( + config_new_type_map_finetune, + finetune_model=finetune_model, + finetune_links=finetune_links, + ) + + # test consistency + ntest = 1 + prec = 1e-10 + model_old_result = trainer_finetune_old.model( + sampled[0]["coord"][:ntest], + to_paddle_tensor(old_type_map_index)[sampled[0]["atype"][:ntest]], + box=sampled[0]["box"][:ntest], + ) + model_new_result = trainer_finetune_new.model( + sampled[0]["coord"][:ntest], + to_paddle_tensor(new_type_map_index)[sampled[0]["atype"][:ntest]], + box=sampled[0]["box"][:ntest], + ) + test_keys = ["energy", "force", "virial"] + for key in test_keys: + np.testing.assert_allclose( + model_old_result[key].numpy(), + model_new_result[key].numpy(), + rtol=prec, + atol=prec, + ) + + self.tearDown() + + def tearDown(self): + for f in os.listdir("."): + if f.startswith("model") and f.endswith(".pd"): + os.remove(f) + if f in ["lcurve.out"]: + os.remove(f) + if f in ["stat_files"]: + shutil.rmtree(f) + + +class TestEnergyModelSeA(FinetuneTest, unittest.TestCase): + def setUp(self): + input_json = str(Path(__file__).parent / "water/se_atten.json") + with open(input_json) as f: + self.config = json.load(f) + self.data_file = [str(Path(__file__).parent / "water/data/single")] + self.config["training"]["training_data"]["systems"] = self.data_file + self.config["training"]["validation_data"]["systems"] = self.data_file + self.config["model"] = deepcopy(model_se_e2_a) + self.config["training"]["numb_steps"] = 1 + self.config["training"]["save_freq"] = 1 + self.mixed_types = False + self.testkey = None + + +@unittest.skip("Skip for not implemented yet") +class TestEnergyZBLModelSeA(FinetuneTest, unittest.TestCase): + def setUp(self): + input_json = str(Path(__file__).parent / "water/se_atten.json") + with open(input_json) as f: + self.config = json.load(f) + self.data_file = [str(Path(__file__).parent / "water/data/single")] + self.config["training"]["training_data"]["systems"] = self.data_file + self.config["training"]["validation_data"]["systems"] = self.data_file + self.config["model"] = deepcopy(model_zbl) + self.config["training"]["numb_steps"] = 1 + self.config["training"]["save_freq"] = 1 + self.mixed_types 
= False + self.testkey = None + + +@unittest.skip("Skip for not implemented yet") +class TestEnergyDOSModelSeA(FinetuneTest, unittest.TestCase): + def setUp(self): + input_json = str(Path(__file__).parent / "dos/input.json") + with open(input_json) as f: + self.config = json.load(f) + self.data_file = [str(Path(__file__).parent / "dos/data/global_system")] + self.config["training"]["training_data"]["systems"] = self.data_file + self.config["training"]["validation_data"]["systems"] = self.data_file + self.config["model"] = deepcopy(model_dos) + self.config["training"]["numb_steps"] = 1 + self.config["training"]["save_freq"] = 1 + self.mixed_types = False + self.testkey = "dos" + + +@unittest.skip("Skip for not implemented yet") +class TestEnergyModelDPA1(FinetuneTest, unittest.TestCase): + def setUp(self): + input_json = str(Path(__file__).parent / "water/se_atten.json") + with open(input_json) as f: + self.config = json.load(f) + self.data_file = [str(Path(__file__).parent / "water/data/single")] + self.config["training"]["training_data"]["systems"] = self.data_file + self.config["training"]["validation_data"]["systems"] = self.data_file + self.config["model"] = deepcopy(model_dpa1) + self.config["training"]["numb_steps"] = 1 + self.config["training"]["save_freq"] = 1 + self.mixed_types = True + self.testkey = None + + +@unittest.skip("Skip for not implemented yet") +class TestEnergyModelDPA2(FinetuneTest, unittest.TestCase): + def setUp(self): + input_json = str(Path(__file__).parent / "water/se_atten.json") + with open(input_json) as f: + self.config = json.load(f) + self.data_file = [str(Path(__file__).parent / "water/data/single")] + self.config["training"]["training_data"]["systems"] = self.data_file + self.config["training"]["validation_data"]["systems"] = self.data_file + self.config["model"] = deepcopy(model_dpa2) + self.config["model"]["descriptor"]["repformer"]["nlayers"] = 2 + + self.config["training"]["numb_steps"] = 1 + self.config["training"]["save_freq"] = 1 + self.mixed_types = True + self.testkey = None + + +if __name__ == "__main__": + unittest.main() diff --git a/source/tests/pd/test_loss.py b/source/tests/pd/test_loss.py new file mode 100644 index 0000000000..a7b8109e10 --- /dev/null +++ b/source/tests/pd/test_loss.py @@ -0,0 +1,585 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +import os +import unittest + +import numpy as np +import paddle +import tensorflow.compat.v1 as tf + +tf.disable_eager_execution() +from pathlib import ( + Path, +) + +from deepmd.pd.loss import ( + EnergyStdLoss, +) +from deepmd.pd.utils.dataset import ( + DeepmdDataSetForLoader, +) +from deepmd.tf.loss.ener import ( + EnerStdLoss, +) +from deepmd.utils.data import ( + DataRequirementItem, +) + +from ..seed import ( + GLOBAL_SEED, +) +from .model.test_embedding_net import ( + get_single_batch, +) +from .test_finetune import ( + energy_data_requirement, +) + +CUR_DIR = os.path.dirname(__file__) + + +def get_batch(system, type_map, data_requirement): + dataset = DeepmdDataSetForLoader(system, type_map) + dataset.add_data_requirement(data_requirement) + np_batch, pd_batch = get_single_batch(dataset) + return np_batch, pd_batch + + +class LossCommonTest(unittest.TestCase): + def setUp(self): + self.cur_lr = 1.2 + if not self.spin: + self.system = str(Path(__file__).parent / "water/data/data_0") + self.type_map = ["H", "O"] + else: + self.system = str(Path(__file__).parent / "NiO/data/data_0") + self.type_map = ["Ni", "O"] + energy_data_requirement.append( + DataRequirementItem( + "force_mag", + 
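+ # labeled magnetic forces have three Cartesian components per atom, hence ndof=3 with atomic=True below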
ndof=3, + atomic=True, + must=False, + high_prec=False, + ) + ) + # data + np_batch, pd_batch = get_batch( + self.system, self.type_map, energy_data_requirement + ) + natoms = np_batch["natoms"] + self.nloc = natoms[0] + nframes = np_batch["energy"].shape[0] + rng = np.random.default_rng(GLOBAL_SEED) + + if not self.spin: + l_energy, l_force, l_virial = ( + np_batch["energy"], + np_batch["force"], + np_batch["virial"], + ) + p_energy, p_force, p_virial = ( + np.ones_like(l_energy), + np.ones_like(l_force), + np.ones_like(l_virial), + ) + nloc = natoms[0] + batch_size = pd_batch["coord"].shape[0] + p_atom_energy = rng.random(size=[batch_size, nloc]) + l_atom_energy = rng.random(size=[batch_size, nloc]) + atom_pref = rng.random(size=[batch_size, nloc * 3]) + drdq = rng.random(size=[batch_size, nloc * 2 * 3]) + atom_ener_coeff = rng.random(size=[batch_size, nloc]) + # placeholders + l_force_real = l_force + l_force_mag = l_force + p_force_real = p_force + p_force_mag = p_force + else: + # data + np_batch, pd_batch = get_batch( + self.system, self.type_map, energy_data_requirement + ) + natoms = np_batch["natoms"] + self.nloc = natoms[0] + l_energy, l_force_real, l_force_mag, l_virial = ( + np_batch["energy"], + np_batch["force"], + np_batch["force_mag"], + np_batch["virial"], + ) + # merged force for tf old implement + l_force_merge_tf = np.concatenate( + [ + l_force_real.reshape([nframes, self.nloc, 3]), + l_force_mag.reshape([nframes, self.nloc, 3])[ + np_batch["atype"] == 0 + ].reshape([nframes, -1, 3]), + ], + axis=1, + ).reshape([nframes, -1]) + p_energy, p_force_real, p_force_mag, p_force_merge_tf, p_virial = ( + np.ones_like(l_energy), + np.ones_like(l_force_real), + np.ones_like(l_force_mag), + np.ones_like(l_force_merge_tf), + np.ones_like(l_virial), + ) + virt_nloc = (np_batch["atype"] == 0).sum(-1) + natoms_tf = np.concatenate([natoms, virt_nloc], axis=0) + natoms_tf[:2] += virt_nloc + nloc = natoms_tf[0] + batch_size = pd_batch["coord"].shape[0] + p_atom_energy = rng.random(size=[batch_size, nloc]) + l_atom_energy = rng.random(size=[batch_size, nloc]) + atom_pref = rng.random(size=[batch_size, nloc * 3]) + drdq = rng.random(size=[batch_size, nloc * 2 * 3]) + atom_ener_coeff = rng.random(size=[batch_size, nloc]) + self.nloc_tf = nloc + natoms = natoms_tf + l_force = l_force_merge_tf + p_force = p_force_merge_tf + + # tf + self.g = tf.Graph() + with self.g.as_default(): + t_cur_lr = tf.placeholder(shape=[], dtype=tf.float64) + t_natoms = tf.placeholder(shape=[None], dtype=tf.int32) + t_penergy = tf.placeholder(shape=[None, 1], dtype=tf.float64) + t_pforce = tf.placeholder(shape=[None, None], dtype=tf.float64) + t_pvirial = tf.placeholder(shape=[None, 9], dtype=tf.float64) + t_patom_energy = tf.placeholder(shape=[None, None], dtype=tf.float64) + t_lenergy = tf.placeholder(shape=[None, 1], dtype=tf.float64) + t_lforce = tf.placeholder(shape=[None, None], dtype=tf.float64) + t_lvirial = tf.placeholder(shape=[None, 9], dtype=tf.float64) + t_latom_energy = tf.placeholder(shape=[None, None], dtype=tf.float64) + t_atom_pref = tf.placeholder(shape=[None, None], dtype=tf.float64) + t_atom_ener_coeff = tf.placeholder(shape=[None, None], dtype=tf.float64) + t_drdq = tf.placeholder(shape=[None, None], dtype=tf.float64) + find_energy = tf.constant(1.0, dtype=tf.float64) + find_force = tf.constant(1.0, dtype=tf.float64) + find_virial = tf.constant(1.0 if not self.spin else 0.0, dtype=tf.float64) + find_atom_energy = tf.constant(1.0, dtype=tf.float64) + find_atom_pref = tf.constant(1.0, 
dtype=tf.float64) + find_drdq = tf.constant(1.0, dtype=tf.float64) + find_atom_ener_coeff = tf.constant(1.0, dtype=tf.float64) + model_dict = { + "energy": t_penergy, + "force": t_pforce, + "virial": t_pvirial, + "atom_ener": t_patom_energy, + } + label_dict = { + "energy": t_lenergy, + "force": t_lforce, + "virial": t_lvirial, + "atom_ener": t_latom_energy, + "atom_pref": t_atom_pref, + "drdq": t_drdq, + "atom_ener_coeff": t_atom_ener_coeff, + "find_energy": find_energy, + "find_force": find_force, + "find_virial": find_virial, + "find_atom_ener": find_atom_energy, + "find_atom_pref": find_atom_pref, + "find_drdq": find_drdq, + "find_atom_ener_coeff": find_atom_ener_coeff, + } + self.tf_loss_sess = self.tf_loss.build( + t_cur_lr, t_natoms, model_dict, label_dict, "" + ) + + self.feed_dict = { + t_cur_lr: self.cur_lr, + t_natoms: natoms, + t_penergy: p_energy, + t_pforce: p_force, + t_pvirial: p_virial.reshape([-1, 9]), + t_patom_energy: p_atom_energy, + t_lenergy: l_energy, + t_lforce: l_force, + t_lvirial: l_virial.reshape([-1, 9]), + t_latom_energy: l_atom_energy, + t_atom_pref: atom_pref, + t_drdq: drdq, + t_atom_ener_coeff: atom_ener_coeff, + } + # pd + if not self.spin: + self.model_pred = { + "energy": paddle.to_tensor(p_energy), + "force": paddle.to_tensor(p_force), + "virial": paddle.to_tensor(p_virial), + "atom_energy": paddle.to_tensor(p_atom_energy), + } + self.label = { + "energy": paddle.to_tensor(l_energy), + "find_energy": 1.0, + "force": paddle.to_tensor(l_force), + "find_force": 1.0, + "virial": paddle.to_tensor(l_virial), + "find_virial": 1.0, + "atom_ener": paddle.to_tensor(l_atom_energy), + "find_atom_ener": 1.0, + "atom_pref": paddle.to_tensor(atom_pref), + "find_atom_pref": 1.0, + "drdq": paddle.to_tensor(drdq), + "find_drdq": 1.0, + "atom_ener_coeff": paddle.to_tensor(atom_ener_coeff), + "find_atom_ener_coeff": 1.0, + } + self.label_absent = { + "energy": paddle.to_tensor(l_energy), + "force": paddle.to_tensor(l_force), + "virial": paddle.to_tensor(l_virial), + "atom_ener": paddle.to_tensor(l_atom_energy), + "atom_pref": paddle.to_tensor(atom_pref), + "drdq": paddle.to_tensor(drdq), + "atom_ener_coeff": paddle.to_tensor(atom_ener_coeff), + } + else: + self.model_pred = { + "energy": paddle.to_tensor(p_energy), + "force": paddle.to_tensor(p_force_real).reshape( + [nframes, self.nloc, 3] + ), + "force_mag": paddle.to_tensor(p_force_mag).reshape( + [nframes, self.nloc, 3] + ), + "mask_mag": paddle.to_tensor(np_batch["atype"] == 0).reshape( + [nframes, self.nloc, 1] + ), + "atom_energy": paddle.to_tensor(p_atom_energy), + } + self.label = { + "energy": paddle.to_tensor(l_energy), + "find_energy": 1.0, + "force": paddle.to_tensor(l_force_real).reshape( + [nframes, self.nloc, 3] + ), + "find_force": 1.0, + "force_mag": paddle.to_tensor(l_force_mag).reshape( + [nframes, self.nloc, 3] + ), + "find_force_mag": 1.0, + "atom_ener": paddle.to_tensor(l_atom_energy), + "find_atom_ener": 1.0, + "atom_ener_coeff": paddle.to_tensor(atom_ener_coeff), + "find_atom_ener_coeff": 1.0, + } + self.label_absent = { + "energy": paddle.to_tensor(l_energy), + "force": paddle.to_tensor(l_force_real).reshape( + [nframes, self.nloc, 3] + ), + "force_mag": paddle.to_tensor(l_force_mag).reshape( + [nframes, self.nloc, 3] + ), + "atom_ener": paddle.to_tensor(l_atom_energy), + "atom_ener_coeff": paddle.to_tensor(atom_ener_coeff), + } + self.natoms = pd_batch["natoms"] + + def tearDown(self) -> None: + tf.reset_default_graph() + return super().tearDown() + + +class TestEnerStdLoss(LossCommonTest): + 
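+ # Consistency test: the same energy/force/virial loss is built with the TF + # implementation (EnerStdLoss) and the Paddle one (EnergyStdLoss), and + # test_consistency checks that the total and per-component l2 losses match.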
def setUp(self): + self.start_lr = 1.1 + self.start_pref_e = 0.02 + self.limit_pref_e = 1.0 + self.start_pref_f = 1000.0 + self.limit_pref_f = 1.0 + self.start_pref_v = 0.02 + self.limit_pref_v = 1.0 + # tf + self.tf_loss = EnerStdLoss( + self.start_lr, + self.start_pref_e, + self.limit_pref_e, + self.start_pref_f, + self.limit_pref_f, + self.start_pref_v, + self.limit_pref_v, + ) + # pd + self.pd_loss = EnergyStdLoss( + self.start_lr, + self.start_pref_e, + self.limit_pref_e, + self.start_pref_f, + self.limit_pref_f, + self.start_pref_v, + self.limit_pref_v, + ) + self.spin = False + super().setUp() + + def test_consistency(self): + with tf.Session(graph=self.g) as sess: + tf_loss, tf_more_loss = sess.run( + self.tf_loss_sess, feed_dict=self.feed_dict + ) + + def fake_model(): + return self.model_pred + + _, pd_loss, pd_more_loss = self.pd_loss( + {}, + fake_model, + self.label, + self.nloc, + self.cur_lr, + ) + _, pd_loss_absent, pd_more_loss_absent = self.pd_loss( + {}, + fake_model, + self.label_absent, + self.nloc, + self.cur_lr, + ) + pd_loss = pd_loss.detach().cpu() + pd_loss_absent = pd_loss_absent.detach().cpu() + self.assertTrue(np.allclose(tf_loss, pd_loss.numpy())) + self.assertTrue(np.allclose(0.0, pd_loss_absent.numpy())) + for key in ["ener", "force", "virial"]: + self.assertTrue( + np.allclose( + tf_more_loss[f"l2_{key}_loss"], pd_more_loss[f"l2_{key}_loss"] + ) + ) + self.assertTrue(np.isnan(pd_more_loss_absent[f"l2_{key}_loss"].numpy())) + + +class TestEnerStdLossAePfGf(LossCommonTest): + def setUp(self): + self.start_lr = 1.1 + self.start_pref_e = 0.02 + self.limit_pref_e = 1.0 + self.start_pref_f = 1000.0 + self.limit_pref_f = 1.0 + self.start_pref_v = 0.02 + self.limit_pref_v = 1.0 + self.start_pref_ae = 0.02 + self.limit_pref_ae = 1.0 + self.start_pref_pf = 0.02 + self.limit_pref_pf = 1.0 + self.start_pref_gf = 0.02 + self.limit_pref_gf = 1.0 + self.numb_generalized_coord = 2 + # tf + self.tf_loss = EnerStdLoss( + self.start_lr, + self.start_pref_e, + self.limit_pref_e, + self.start_pref_f, + self.limit_pref_f, + self.start_pref_v, + self.limit_pref_v, + self.start_pref_ae, + self.limit_pref_ae, + self.start_pref_pf, + self.limit_pref_pf, + start_pref_gf=self.start_pref_gf, + limit_pref_gf=self.limit_pref_gf, + numb_generalized_coord=self.numb_generalized_coord, + ) + # pd + self.pd_loss = EnergyStdLoss( + self.start_lr, + self.start_pref_e, + self.limit_pref_e, + self.start_pref_f, + self.limit_pref_f, + self.start_pref_v, + self.limit_pref_v, + self.start_pref_ae, + self.limit_pref_ae, + self.start_pref_pf, + self.limit_pref_pf, + start_pref_gf=self.start_pref_gf, + limit_pref_gf=self.limit_pref_gf, + numb_generalized_coord=self.numb_generalized_coord, + ) + self.spin = False + super().setUp() + + def test_consistency(self): + with tf.Session(graph=self.g) as sess: + tf_loss, tf_more_loss = sess.run( + self.tf_loss_sess, feed_dict=self.feed_dict + ) + + def fake_model(): + return self.model_pred + + _, pd_loss, pd_more_loss = self.pd_loss( + {}, + fake_model, + self.label, + self.nloc, + self.cur_lr, + ) + _, pd_loss_absent, pd_more_loss_absent = self.pd_loss( + {}, + fake_model, + self.label_absent, + self.nloc, + self.cur_lr, + ) + pd_loss = pd_loss.detach().cpu() + pd_loss_absent = pd_loss_absent.detach().cpu() + self.assertTrue(np.allclose(tf_loss, pd_loss.numpy())) + self.assertTrue(np.allclose(0.0, pd_loss_absent.numpy())) + for key in ["ener", "force", "virial", "atom_ener", "pref_force", "gen_force"]: + self.assertTrue( + np.allclose( + 
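+ # the per-key l2 losses reported by the TF and Paddle implementations must agree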
tf_more_loss[f"l2_{key}_loss"], pd_more_loss[f"l2_{key}_loss"] + ) + ) + self.assertTrue(np.isnan(pd_more_loss_absent[f"l2_{key}_loss"].numpy())) + + +class TestEnerStdLossAecoeff(LossCommonTest): + def setUp(self): + self.start_lr = 1.1 + self.start_pref_e = 0.02 + self.limit_pref_e = 1.0 + self.start_pref_f = 1000.0 + self.limit_pref_f = 1.0 + self.start_pref_v = 0.02 + self.limit_pref_v = 1.0 + # tf + self.tf_loss = EnerStdLoss( + self.start_lr, + self.start_pref_e, + self.limit_pref_e, + self.start_pref_f, + self.limit_pref_f, + self.start_pref_v, + self.limit_pref_v, + enable_atom_ener_coeff=True, + ) + # pd + self.pd_loss = EnergyStdLoss( + self.start_lr, + self.start_pref_e, + self.limit_pref_e, + self.start_pref_f, + self.limit_pref_f, + self.start_pref_v, + self.limit_pref_v, + enable_atom_ener_coeff=True, + ) + self.spin = False + super().setUp() + + def test_consistency(self): + with tf.Session(graph=self.g) as sess: + tf_loss, tf_more_loss = sess.run( + self.tf_loss_sess, feed_dict=self.feed_dict + ) + + def fake_model(): + return self.model_pred + + _, pd_loss, pd_more_loss = self.pd_loss( + {}, + fake_model, + self.label, + self.nloc, + self.cur_lr, + ) + _, pd_loss_absent, pd_more_loss_absent = self.pd_loss( + {}, + fake_model, + self.label_absent, + self.nloc, + self.cur_lr, + ) + pd_loss = pd_loss.detach().cpu() + pd_loss_absent = pd_loss_absent.detach().cpu() + self.assertTrue(np.allclose(tf_loss, pd_loss.numpy())) + self.assertTrue(np.allclose(0.0, pd_loss_absent.numpy())) + for key in ["ener", "force", "virial"]: + self.assertTrue( + np.allclose( + tf_more_loss[f"l2_{key}_loss"], pd_more_loss[f"l2_{key}_loss"] + ) + ) + self.assertTrue(np.isnan(pd_more_loss_absent[f"l2_{key}_loss"].numpy())) + + +class TestEnerStdLossRelativeF(LossCommonTest): + def setUp(self): + self.start_lr = 1.1 + self.start_pref_e = 0.02 + self.limit_pref_e = 1.0 + self.start_pref_f = 1000.0 + self.limit_pref_f = 1.0 + self.start_pref_v = 0.02 + self.limit_pref_v = 1.0 + # tf + self.tf_loss = EnerStdLoss( + self.start_lr, + self.start_pref_e, + self.limit_pref_e, + self.start_pref_f, + self.limit_pref_f, + self.start_pref_v, + self.limit_pref_v, + relative_f=0.1, + ) + # pd + self.pd_loss = EnergyStdLoss( + self.start_lr, + self.start_pref_e, + self.limit_pref_e, + self.start_pref_f, + self.limit_pref_f, + self.start_pref_v, + self.limit_pref_v, + relative_f=0.1, + ) + self.spin = False + super().setUp() + + def test_consistency(self): + with tf.Session(graph=self.g) as sess: + tf_loss, tf_more_loss = sess.run( + self.tf_loss_sess, feed_dict=self.feed_dict + ) + + def fake_model(): + return self.model_pred + + _, pd_loss, pd_more_loss = self.pd_loss( + {}, + fake_model, + self.label, + self.nloc, + self.cur_lr, + ) + _, pd_loss_absent, pd_more_loss_absent = self.pd_loss( + {}, + fake_model, + self.label_absent, + self.nloc, + self.cur_lr, + ) + pd_loss = pd_loss.detach().cpu() + pd_loss_absent = pd_loss_absent.detach().cpu() + self.assertTrue(np.allclose(tf_loss, pd_loss.numpy())) + self.assertTrue(np.allclose(0.0, pd_loss_absent.numpy())) + for key in ["ener", "force", "virial"]: + self.assertTrue( + np.allclose( + tf_more_loss[f"l2_{key}_loss"], pd_more_loss[f"l2_{key}_loss"] + ) + ) + self.assertTrue(np.isnan(pd_more_loss_absent[f"l2_{key}_loss"].numpy())) + + +if __name__ == "__main__": + unittest.main() diff --git a/source/tests/pd/test_lr.py b/source/tests/pd/test_lr.py new file mode 100644 index 0000000000..9607f982fd --- /dev/null +++ b/source/tests/pd/test_lr.py @@ -0,0 +1,106 @@ +# 
SPDX-License-Identifier: LGPL-3.0-or-later +import unittest + +import numpy as np +import tensorflow.compat.v1 as tf + +tf.disable_eager_execution() + +from deepmd.dpmodel.utils.learning_rate import ( + LearningRateExp, +) +from deepmd.tf.utils import ( + learning_rate, +) + + +class TestLearningRate(unittest.TestCase): + def setUp(self): + self.start_lr = 0.001 + self.stop_lr = 3.51e-8 + self.decay_steps = np.arange(400, 601, 100) + self.stop_steps = np.arange(500, 1600, 500) + + def test_consistency(self): + for decay_step in self.decay_steps: + for stop_step in self.stop_steps: + self.decay_step = decay_step + self.stop_step = stop_step + self.judge_it() + self.decay_rate_pd() + + def judge_it(self): + base_lr = learning_rate.LearningRateExp( + self.start_lr, self.stop_lr, self.decay_step + ) + g = tf.Graph() + with g.as_default(): + global_step = tf.placeholder(shape=[], dtype=tf.int32) + t_lr = base_lr.build(global_step, self.stop_step) + + my_lr = LearningRateExp( + self.start_lr, self.stop_lr, self.decay_step, self.stop_step + ) + with tf.Session(graph=g) as sess: + base_vals = [ + sess.run(t_lr, feed_dict={global_step: step_id}) + for step_id in range(self.stop_step) + if step_id % self.decay_step != 0 + ] + my_vals = [ + my_lr.value(step_id) + for step_id in range(self.stop_step) + if step_id % self.decay_step != 0 + ] + self.assertTrue(np.allclose(base_vals, my_vals)) + tf.reset_default_graph() + + def decay_rate_pd(self): + my_lr = LearningRateExp( + self.start_lr, self.stop_lr, self.decay_step, self.stop_step + ) + + default_ds = 100 if self.stop_step // 10 > 100 else self.stop_step // 100 + 1 + if self.decay_step >= self.stop_step: + self.decay_step = default_ds + decay_rate = np.exp( + np.log(self.stop_lr / self.start_lr) / (self.stop_step / self.decay_step) + ) + my_lr_decay = LearningRateExp( + self.start_lr, + 1e-10, + self.decay_step, + self.stop_step, + decay_rate=decay_rate, + ) + min_lr = 1e-5 + my_lr_decay_trunc = LearningRateExp( + self.start_lr, + min_lr, + self.decay_step, + self.stop_step, + decay_rate=decay_rate, + ) + my_vals = [ + my_lr.value(step_id) + for step_id in range(self.stop_step) + if step_id % self.decay_step != 0 + ] + my_vals_decay = [ + my_lr_decay.value(step_id) + for step_id in range(self.stop_step) + if step_id % self.decay_step != 0 + ] + my_vals_decay_trunc = [ + my_lr_decay_trunc.value(step_id) + for step_id in range(self.stop_step) + if step_id % self.decay_step != 0 + ] + self.assertTrue(np.allclose(my_vals_decay, my_vals)) + self.assertTrue( + np.allclose(my_vals_decay_trunc, np.clip(my_vals, a_min=min_lr, a_max=None)) + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/source/tests/pd/test_multitask.py b/source/tests/pd/test_multitask.py new file mode 100644 index 0000000000..e3d4cfa7de --- /dev/null +++ b/source/tests/pd/test_multitask.py @@ -0,0 +1,224 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +import json +import os +import shutil +import unittest +from copy import ( + deepcopy, +) +from pathlib import ( + Path, +) + +import numpy as np + +from deepmd.pd.entrypoints.main import ( + get_trainer, +) +from deepmd.pd.utils.finetune import ( + get_finetune_rules, +) +from deepmd.pd.utils.multi_task import ( + preprocess_shared_params, +) +from deepmd.utils.argcheck import ( + normalize, +) +from deepmd.utils.compat import ( + update_deepmd_input, +) + +from .model.test_permutation import ( + model_se_e2_a, +) + + +def setUpModule(): + global multitask_template + multitask_template_json = str(Path(__file__).parent / 
"water/multitask.json") + with open(multitask_template_json) as f: + multitask_template = json.load(f) + + +class MultiTaskTrainTest: + def test_multitask_train(self): + # test multitask training + self.config = update_deepmd_input(self.config, warning=True) + self.config = normalize(self.config, multi_task=True) + trainer = get_trainer(deepcopy(self.config), shared_links=self.shared_links) + trainer.run() + # check model keys + self.assertEqual(len(trainer.wrapper.model), 2) + self.assertIn("model_1", trainer.wrapper.model) + self.assertIn("model_2", trainer.wrapper.model) + + # check shared parameters + multi_state_dict = trainer.wrapper.model.state_dict() + for state_key in multi_state_dict: + if "model_1" in state_key: + self.assertIn(state_key.replace("model_1", "model_2"), multi_state_dict) + if "model_2" in state_key: + self.assertIn(state_key.replace("model_2", "model_1"), multi_state_dict) + if "model_1.descriptor" in state_key: + np.testing.assert_allclose( + multi_state_dict[state_key].numpy(), + multi_state_dict[state_key.replace("model_1", "model_2")].numpy(), + ) + + # test multitask fine-tuning + # add model_3 + self.origin_config["model"]["model_dict"]["model_3"] = deepcopy( + self.origin_config["model"]["model_dict"]["model_2"] + ) + self.origin_config["loss_dict"]["model_3"] = deepcopy( + self.origin_config["loss_dict"]["model_2"] + ) + self.origin_config["training"]["model_prob"]["model_3"] = deepcopy( + self.origin_config["training"]["model_prob"]["model_2"] + ) + self.origin_config["training"]["data_dict"]["model_3"] = deepcopy( + self.origin_config["training"]["data_dict"]["model_2"] + ) + self.origin_config["training"]["data_dict"]["model_3"]["stat_file"] = ( + self.origin_config[ + "training" + ]["data_dict"]["model_3"]["stat_file"].replace("model_2", "model_3") + ) + + # add model_4 + self.origin_config["model"]["model_dict"]["model_4"] = deepcopy( + self.origin_config["model"]["model_dict"]["model_2"] + ) + self.origin_config["loss_dict"]["model_4"] = deepcopy( + self.origin_config["loss_dict"]["model_2"] + ) + self.origin_config["training"]["model_prob"]["model_4"] = deepcopy( + self.origin_config["training"]["model_prob"]["model_2"] + ) + self.origin_config["training"]["data_dict"]["model_4"] = deepcopy( + self.origin_config["training"]["data_dict"]["model_2"] + ) + self.origin_config["training"]["data_dict"]["model_4"]["stat_file"] = ( + self.origin_config[ + "training" + ]["data_dict"]["model_4"]["stat_file"].replace("model_2", "model_4") + ) + + # set finetune rules + # model_1 resuming from model_1 + # pass + + # model_2 fine-tuning from model_2 + self.origin_config["model"]["model_dict"]["model_2"]["finetune_head"] = ( + "model_2" + ) + + # new model_3 fine-tuning from model_2 + self.origin_config["model"]["model_dict"]["model_3"]["finetune_head"] = ( + "model_2" + ) + + # new model_4 fine-tuning with randomly initialized fitting net + # pass + + self.origin_config["model"], shared_links_finetune = preprocess_shared_params( + self.origin_config["model"] + ) + + finetune_model = self.config["training"].get("save_ckpt", "model.ckpt") + ".pd" + self.origin_config["model"], finetune_links = get_finetune_rules( + finetune_model, + self.origin_config["model"], + ) + self.origin_config = update_deepmd_input(self.origin_config, warning=True) + self.origin_config = normalize(self.origin_config, multi_task=True) + trainer_finetune = get_trainer( + deepcopy(self.origin_config), + finetune_model=finetune_model, + shared_links=shared_links_finetune, + 
finetune_links=finetune_links, + ) + + # check parameters + multi_state_dict_finetuned = trainer_finetune.wrapper.model.state_dict() + for state_key in multi_state_dict_finetuned: + if "model_1" in state_key: + np.testing.assert_allclose( + multi_state_dict[state_key].numpy(), + multi_state_dict_finetuned[state_key].numpy(), + ) + elif "model_2" in state_key and "out_bias" not in state_key: + np.testing.assert_allclose( + multi_state_dict[state_key].numpy(), + multi_state_dict_finetuned[state_key].numpy(), + ) + elif "model_3" in state_key and "out_bias" not in state_key: + np.testing.assert_allclose( + multi_state_dict[state_key.replace("model_3", "model_2")].numpy(), + multi_state_dict_finetuned[state_key].numpy(), + ) + elif ( + "model_4" in state_key + and "fitting_net" not in state_key + and "out_bias" not in state_key + ): + np.testing.assert_allclose( + multi_state_dict[state_key.replace("model_4", "model_2")].numpy(), + multi_state_dict_finetuned[state_key].numpy(), + ) + + # check running + trainer_finetune.run() + self.tearDown() + + def tearDown(self): + for f in os.listdir("."): + if f.startswith("model") and f.endswith(".pd"): + os.remove(f) + if f in ["lcurve.out"]: + os.remove(f) + if f in [self.stat_files]: + shutil.rmtree(f) + + +class TestMultiTaskSeA(unittest.TestCase, MultiTaskTrainTest): + def setUp(self): + multitask_se_e2_a = deepcopy(multitask_template) + multitask_se_e2_a["model"]["shared_dict"]["my_descriptor"] = model_se_e2_a[ + "descriptor" + ] + data_file = [str(Path(__file__).parent / "water/data/data_0")] + self.stat_files = "se_e2_a" + os.makedirs(self.stat_files, exist_ok=True) + self.config = multitask_se_e2_a + self.config["training"]["data_dict"]["model_1"]["training_data"]["systems"] = ( + data_file + ) + self.config["training"]["data_dict"]["model_1"]["validation_data"][ + "systems" + ] = data_file + self.config["training"]["data_dict"]["model_1"]["stat_file"] = ( + f"{self.stat_files}/model_1" + ) + self.config["training"]["data_dict"]["model_2"]["training_data"]["systems"] = ( + data_file + ) + self.config["training"]["data_dict"]["model_2"]["validation_data"][ + "systems" + ] = data_file + self.config["training"]["data_dict"]["model_2"]["stat_file"] = ( + f"{self.stat_files}/model_2" + ) + self.config["training"]["numb_steps"] = 1 + self.config["training"]["save_freq"] = 1 + self.origin_config = deepcopy(self.config) + self.config["model"], self.shared_links = preprocess_shared_params( + self.config["model"] + ) + + def tearDown(self) -> None: + MultiTaskTrainTest.tearDown(self) + + +if __name__ == "__main__": + unittest.main() diff --git a/source/tests/pd/test_neighbor_stat.py b/source/tests/pd/test_neighbor_stat.py new file mode 100644 index 0000000000..613150b7fc --- /dev/null +++ b/source/tests/pd/test_neighbor_stat.py @@ -0,0 +1,69 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +import shutil +import unittest + +import dpdata +import numpy as np + +from deepmd.entrypoints.neighbor_stat import ( + neighbor_stat, +) + +from ..seed import ( + GLOBAL_SEED, +) + + +def gen_sys(nframes): + rng = np.random.default_rng(GLOBAL_SEED) + natoms = 1000 + data = {} + X, Y, Z = np.mgrid[0:2:3j, 0:2:3j, 0:2:3j] + positions = np.vstack([X.ravel(), Y.ravel(), Z.ravel()]).T # + 0.1 + data["coords"] = np.repeat(positions[np.newaxis, :, :], nframes, axis=0) + data["forces"] = rng.random([nframes, natoms, 3]) + data["cells"] = np.array([3.0, 0.0, 0.0, 0.0, 3.0, 0.0, 0.0, 0.0, 3.0]).reshape( + 1, 3, 3 + ) + data["energies"] = rng.random([nframes, 1]) + 
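+ # np.mgrid[0:2:3j, ...] above places the 27 atoms at coordinates 0, 1, 2
+ # along each axis of the 3.0 x 3.0 x 3.0 periodic cell, so the nearest
+ # neighbor distance is exactly 1.0, the value asserted in TestNeighborStat.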
data["atom_names"] = ["TYPE"] + data["atom_numbs"] = [27] + data["atom_types"] = np.repeat(0, 27) + return data + + +class TestNeighborStat(unittest.TestCase): + def setUp(self): + data0 = gen_sys(1) + sys0 = dpdata.LabeledSystem() + sys0.data = data0 + sys0.to_deepmd_npy("system_0", set_size=1) + + def tearDown(self): + shutil.rmtree("system_0") + + def test_neighbor_stat(self): + for rcut in (0.0, 1.0, 2.0, 4.0): + for mixed_type in (True, False): + with self.subTest(rcut=rcut, mixed_type=mixed_type): + rcut += 1e-3 # prevent numerical errors + min_nbor_dist, max_nbor_size = neighbor_stat( + system="system_0", + rcut=rcut, + type_map=["TYPE", "NO_THIS_TYPE"], + mixed_type=mixed_type, + backend="paddle", + ) + upper = np.ceil(rcut) + 1 + X, Y, Z = np.mgrid[-upper:upper, -upper:upper, -upper:upper] + positions = np.vstack([X.ravel(), Y.ravel(), Z.ravel()]).T + # distance to (0,0,0) + distance = np.linalg.norm(positions, axis=1) + expected_neighbors = np.count_nonzero( + np.logical_and(distance > 0, distance <= rcut) + ) + self.assertAlmostEqual(min_nbor_dist, 1.0, 6) + ret = [expected_neighbors] + if not mixed_type: + ret.append(0) + np.testing.assert_array_equal(max_nbor_size, ret) diff --git a/source/tests/pd/test_sampler.py b/source/tests/pd/test_sampler.py new file mode 100644 index 0000000000..2af5a9c05c --- /dev/null +++ b/source/tests/pd/test_sampler.py @@ -0,0 +1,114 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +import json +import os +import unittest +from pathlib import ( + Path, +) + +import numpy as np +import paddle +from paddle.io import ( + BatchSampler, + DataLoader, +) + +from deepmd.pd.utils.dataloader import ( + DpLoaderSet, + get_weighted_sampler, +) +from deepmd.tf.common import ( + expand_sys_str, +) +from deepmd.tf.utils import random as tf_random +from deepmd.tf.utils.data_system import ( + DeepmdDataSystem, +) + +CUR_DIR = os.path.dirname(__file__) + + +class TestSampler(unittest.TestCase): + def setUp(self): + with open(str(Path(__file__).parent / "water/se_e2_a.json")) as fin: + content = fin.read() + config = json.loads(content) + data_file = [str(Path(__file__).parent / "water/data/data_0")] + config["training"]["training_data"]["systems"] = data_file + config["training"]["validation_data"]["systems"] = data_file + model_config = config["model"] + self.rcut = model_config["descriptor"]["rcut"] + self.rcut_smth = model_config["descriptor"]["rcut_smth"] + self.sel = model_config["descriptor"]["sel"] + self.batch_size = config["training"]["training_data"]["batch_size"] + self.systems = config["training"]["validation_data"]["systems"] + if isinstance(self.systems, str): + self.systems = expand_sys_str(self.systems) + self.my_dataset = DpLoaderSet( + self.systems, + self.batch_size, + model_config["type_map"], + seed=10, + shuffle=False, + ) + + tf_random.seed(10) + self.dp_dataset = DeepmdDataSystem(self.systems, self.batch_size, 1, self.rcut) + + def test_sampler_debug_info(self): + dataloader = DataLoader( + self.my_dataset, + batch_sampler=BatchSampler( + get_weighted_sampler(self.my_dataset, prob_style="prob_sys_size"), + drop_last=False, + ), + num_workers=0, # setting to 0 diverges the behavior of its iterator; should be >=1 + # pin_memory=True, + ) + device = paddle.get_device() + paddle.set_device("cpu") + batch_data = next(iter(dataloader)) + paddle.set_device(device) + sid = batch_data["sid"] + fid = batch_data["fid"][0] + coord = batch_data["coord"].squeeze(0) + frame = self.my_dataset.systems[sid].__getitem__(fid) + 
self.assertTrue(np.allclose(coord, frame["coord"])) + + def test_auto_prob_uniform(self): + auto_prob_style = "prob_uniform" + sampler = get_weighted_sampler(self.my_dataset, prob_style=auto_prob_style) + my_probs = np.array(sampler.weights) + self.dp_dataset.set_sys_probs(auto_prob_style=auto_prob_style) + dp_probs = np.array(self.dp_dataset.sys_probs) + self.assertTrue(np.allclose(my_probs, dp_probs)) + + def test_auto_prob_sys_size(self): + auto_prob_style = "prob_sys_size" + sampler = get_weighted_sampler(self.my_dataset, prob_style=auto_prob_style) + my_probs = np.array(sampler.weights) + self.dp_dataset.set_sys_probs(auto_prob_style=auto_prob_style) + dp_probs = np.array(self.dp_dataset.sys_probs) + self.assertTrue(np.allclose(my_probs, dp_probs)) + + def test_auto_prob_sys_size_ext(self): + auto_prob_style = "prob_sys_size;0:1:0.2;1:3:0.8" + sampler = get_weighted_sampler(self.my_dataset, prob_style=auto_prob_style) + my_probs = np.array(sampler.weights) + self.dp_dataset.set_sys_probs(auto_prob_style=auto_prob_style) + dp_probs = np.array(self.dp_dataset.sys_probs) + self.assertTrue(np.allclose(my_probs, dp_probs)) + + def test_sys_probs(self): + sys_probs = [0.1, 0.4, 0.5] + sampler = get_weighted_sampler( + self.my_dataset, prob_style=sys_probs, sys_prob=True + ) + my_probs = np.array(sampler.weights) + self.dp_dataset.set_sys_probs(sys_probs=sys_probs) + dp_probs = np.array(self.dp_dataset.sys_probs) + self.assertTrue(np.allclose(my_probs, dp_probs)) + + +if __name__ == "__main__": + unittest.main() diff --git a/source/tests/pd/test_training.py b/source/tests/pd/test_training.py new file mode 100644 index 0000000000..d4e7309a65 --- /dev/null +++ b/source/tests/pd/test_training.py @@ -0,0 +1,176 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +import json +import os +import shutil +import unittest +from copy import ( + deepcopy, +) +from pathlib import ( + Path, +) + +import numpy as np + +from deepmd.pd.entrypoints.main import ( + get_trainer, +) +from deepmd.pd.utils.finetune import ( + get_finetune_rules, +) + +from .model.test_permutation import ( + model_se_e2_a, +) + + +class DPTrainTest: + def test_dp_train(self): + # test training from scratch + trainer = get_trainer(deepcopy(self.config)) + trainer.run() + state_dict_trained = trainer.wrapper.model.state_dict() + + # test fine-tuning using same input + finetune_model = self.config["training"].get("save_ckpt", "model.ckpt") + ".pd" + self.config["model"], finetune_links = get_finetune_rules( + finetune_model, + self.config["model"], + ) + trainer_finetune = get_trainer( + deepcopy(self.config), + finetune_model=finetune_model, + finetune_links=finetune_links, + ) + + # test fine-tuning using empty input + self.config_empty = deepcopy(self.config) + if "descriptor" in self.config_empty["model"]: + self.config_empty["model"]["descriptor"] = {} + if "fitting_net" in self.config_empty["model"]: + self.config_empty["model"]["fitting_net"] = {} + self.config_empty["model"], finetune_links = get_finetune_rules( + finetune_model, + self.config_empty["model"], + change_model_params=True, + ) + trainer_finetune_empty = get_trainer( + deepcopy(self.config_empty), + finetune_model=finetune_model, + finetune_links=finetune_links, + ) + + # test fine-tuning using random fitting + self.config["model"], finetune_links = get_finetune_rules( + finetune_model, self.config["model"], model_branch="RANDOM" + ) + trainer_finetune_random = get_trainer( + deepcopy(self.config_empty), + finetune_model=finetune_model, + 
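+ # with model_branch="RANDOM" the fitting net is expected to be
+ # re-initialized rather than inherited, so the parameter checks below
+ # skip "fitting_net" keys for this trainer.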
finetune_links=finetune_links, + ) + + # check parameters + state_dict_finetuned = trainer_finetune.wrapper.model.state_dict() + state_dict_finetuned_empty = trainer_finetune_empty.wrapper.model.state_dict() + state_dict_finetuned_random = trainer_finetune_random.wrapper.model.state_dict() + for state_key in state_dict_finetuned: + if "out_bias" not in state_key and "out_std" not in state_key: + np.testing.assert_allclose( + state_dict_trained[state_key].numpy(), + state_dict_finetuned[state_key].numpy(), + ) + np.testing.assert_allclose( + state_dict_trained[state_key].numpy(), + state_dict_finetuned_empty[state_key].numpy(), + ) + if "fitting_net" not in state_key: + np.testing.assert_allclose( + state_dict_trained[state_key].numpy(), + state_dict_finetuned_random[state_key].numpy(), + ) + + # check running + trainer_finetune.run() + trainer_finetune_empty.run() + trainer_finetune_random.run() + + def test_trainable(self): + fix_params = deepcopy(self.config) + fix_params["model"]["descriptor"]["trainable"] = False + fix_params["model"]["fitting_net"]["trainable"] = False + free_descriptor = hasattr(self, "not_all_grad") and self.not_all_grad + if free_descriptor: + # we cannot freeze every parameter: the input coord carries no gradient, + # so the loss would have no gradient path if all parameters were frozen; + # keep the descriptor trainable and only check that the fitting net stays fixed + fix_params["model"]["descriptor"]["trainable"] = True + trainer_fix = get_trainer(fix_params) + model_dict_before_training = deepcopy( + trainer_fix.model.get_fitting_net().state_dict() + ) + trainer_fix.run() + model_dict_after_training = deepcopy( + trainer_fix.model.get_fitting_net().state_dict() + ) + else: + trainer_fix = get_trainer(fix_params) + model_dict_before_training = deepcopy(trainer_fix.model.state_dict()) + trainer_fix.run() + model_dict_after_training = deepcopy(trainer_fix.model.state_dict()) + for key in model_dict_before_training: + np.testing.assert_allclose( + model_dict_before_training[key].numpy(), + model_dict_after_training[key].numpy(), + ) + + def tearDown(self): + for f in os.listdir("."): + if f.startswith("model") and f.endswith(".pd"): + os.remove(f) + if f in ["lcurve.out"]: + os.remove(f) + if f in ["stat_files"]: + shutil.rmtree(f) + + +class TestEnergyModelSeA(unittest.TestCase, DPTrainTest): + def setUp(self): + input_json = str(Path(__file__).parent / "water/se_atten.json") + with open(input_json) as f: + self.config = json.load(f) + data_file = [str(Path(__file__).parent / "water/data/data_0")] + self.config["training"]["training_data"]["systems"] = data_file + self.config["training"]["validation_data"]["systems"] = data_file + self.config["model"] = deepcopy(model_se_e2_a) + self.config["training"]["numb_steps"] = 1 + self.config["training"]["save_freq"] = 1 + + def tearDown(self) -> None: + DPTrainTest.tearDown(self) + + +class TestFparam(unittest.TestCase, DPTrainTest): + """Test if `fparam` can be loaded correctly.""" + + def setUp(self): + input_json = str(Path(__file__).parent / "water/se_atten.json") + with open(input_json) as f: + self.config = json.load(f) + data_file = [str(Path(__file__).parent / "water/data/data_0")] + self.config["training"]["training_data"]["systems"] = data_file + self.config["training"]["validation_data"]["systems"] = data_file + self.config["model"] = deepcopy(model_se_e2_a) + self.config["model"]["fitting_net"]["numb_fparam"] = 1 + self.config["training"]["numb_steps"] = 1 + self.config["training"]["save_freq"] = 1 + self.set_path = Path(__file__).parent / "water/data/data_0" / 
"set.000" + shutil.copyfile(self.set_path / "energy.npy", self.set_path / "fparam.npy") + + def tearDown(self) -> None: + (self.set_path / "fparam.npy").unlink(missing_ok=True) + DPTrainTest.tearDown(self) + + +if __name__ == "__main__": + unittest.main() diff --git a/source/tests/pd/test_update_sel.py b/source/tests/pd/test_update_sel.py new file mode 100644 index 0000000000..e7b1acf6ff --- /dev/null +++ b/source/tests/pd/test_update_sel.py @@ -0,0 +1,194 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +import random +import unittest +from unittest.mock import ( + patch, +) + +from deepmd.pd.model.model.model import ( + BaseModel, +) +from deepmd.pd.utils.update_sel import ( + UpdateSel, +) + +from ..seed import ( + GLOBAL_SEED, +) + + +def update_sel(jdata): + type_map = jdata["model"].get("type_map") + train_data = None + jdata["model"], _ = BaseModel.update_sel(train_data, type_map, jdata["model"]) + return jdata + + +class TestTrain(unittest.TestCase): + def setUp(self) -> None: + self.update_sel = UpdateSel() + self.mock_min_nbor_dist = random.Random(GLOBAL_SEED).random() + return super().setUp() + + @patch("deepmd.pd.utils.update_sel.UpdateSel.get_nbor_stat") + def test_update_one_sel(self, sel_mock): + sel_mock.return_value = self.mock_min_nbor_dist, [10, 20] + + min_nbor_dist, sel = self.update_sel.update_one_sel(None, None, 6, "auto") + # self.assertEqual(descriptor['sel'], [11,22]) + self.assertEqual(sel, [12, 24]) + self.assertAlmostEqual(min_nbor_dist, self.mock_min_nbor_dist) + min_nbor_dist, sel = self.update_sel.update_one_sel(None, None, 6, "auto:1.5") + # self.assertEqual(descriptor['sel'], [15,30]) + self.assertEqual(sel, [16, 32]) + self.assertAlmostEqual(min_nbor_dist, self.mock_min_nbor_dist) + + @unittest.skip("Skip for not implemented yet") + @patch("deepmd.pd.utils.update_sel.UpdateSel.get_nbor_stat") + def test_update_sel_hybrid(self, sel_mock): + sel_mock.return_value = self.mock_min_nbor_dist, [10, 20] + + jdata = { + "model": { + "descriptor": { + "type": "hybrid", + "list": [ + {"type": "se_e2_a", "rcut": 6, "sel": "auto"}, + {"type": "se_e2_a", "rcut": 6, "sel": "auto:1.5"}, + ], + } + }, + "training": {"training_data": {}}, + } + expected_out = { + "model": { + "descriptor": { + "type": "hybrid", + "list": [ + {"type": "se_e2_a", "rcut": 6, "sel": [12, 24]}, + {"type": "se_e2_a", "rcut": 6, "sel": [16, 32]}, + ], + } + }, + "training": {"training_data": {}}, + } + jdata = update_sel(jdata) + self.assertEqual(jdata, expected_out) + + @patch("deepmd.pd.utils.update_sel.UpdateSel.get_nbor_stat") + def test_update_sel(self, sel_mock): + sel_mock.return_value = self.mock_min_nbor_dist, [10, 20] + + jdata = { + "model": {"descriptor": {"type": "se_e2_a", "rcut": 6, "sel": "auto"}}, + "training": {"training_data": {}}, + } + expected_out = { + "model": {"descriptor": {"type": "se_e2_a", "rcut": 6, "sel": [12, 24]}}, + "training": {"training_data": {}}, + } + jdata = update_sel(jdata) + self.assertEqual(jdata, expected_out) + + @unittest.skip("Skip for not implemented yet") + @patch("deepmd.pd.utils.update_sel.UpdateSel.get_nbor_stat") + def test_update_sel_atten_auto(self, sel_mock): + sel_mock.return_value = self.mock_min_nbor_dist, [25] + + jdata = { + "model": { + "descriptor": { + "type": "se_atten", + "sel": "auto", + "rcut": 6, + } + }, + "training": {"training_data": {}}, + } + expected_out = { + "model": { + "descriptor": { + "type": "se_atten", + "sel": 28, + "rcut": 6, + } + }, + "training": {"training_data": {}}, + } + jdata = update_sel(jdata) + 
self.assertEqual(jdata, expected_out) + + @unittest.skip("Skip for not implemented yet") + @patch("deepmd.pd.utils.update_sel.UpdateSel.get_nbor_stat") + def test_update_sel_atten_int(self, sel_mock): + sel_mock.return_value = self.mock_min_nbor_dist, [25] + + jdata = { + "model": { + "descriptor": { + "type": "se_atten", + "sel": 30, + "rcut": 6, + } + }, + "training": {"training_data": {}}, + } + expected_out = { + "model": { + "descriptor": { + "type": "se_atten", + "sel": 30, + "rcut": 6, + } + }, + "training": {"training_data": {}}, + } + jdata = update_sel(jdata) + self.assertEqual(jdata, expected_out) + + @unittest.skip("Skip for not implemented yet") + @patch("deepmd.pd.utils.update_sel.UpdateSel.get_nbor_stat") + def test_update_sel_atten_list(self, sel_mock): + sel_mock.return_value = self.mock_min_nbor_dist, [25] + + jdata = { + "model": { + "descriptor": { + "type": "se_atten", + "sel": 30, + "rcut": 6, + } + }, + "training": {"training_data": {}}, + } + expected_out = { + "model": { + "descriptor": { + "type": "se_atten", + "sel": 30, + "rcut": 6, + } + }, + "training": {"training_data": {}}, + } + jdata = update_sel(jdata) + self.assertEqual(jdata, expected_out) + + def test_skip_frozen(self): + jdata = { + "model": { + "type": "frozen", + }, + "training": {"training_data": {}}, + } + expected_out = jdata.copy() + jdata = update_sel(jdata) + self.assertEqual(jdata, expected_out) + + def test_wrap_up_4(self): + self.assertEqual(self.update_sel.wrap_up_4(12), 3 * 4) + self.assertEqual(self.update_sel.wrap_up_4(13), 4 * 4) + self.assertEqual(self.update_sel.wrap_up_4(14), 4 * 4) + self.assertEqual(self.update_sel.wrap_up_4(15), 4 * 4) + self.assertEqual(self.update_sel.wrap_up_4(16), 4 * 4) + self.assertEqual(self.update_sel.wrap_up_4(17), 5 * 4) diff --git a/source/tests/pd/test_utils.py b/source/tests/pd/test_utils.py new file mode 100644 index 0000000000..8d25cff964 --- /dev/null +++ b/source/tests/pd/test_utils.py @@ -0,0 +1,35 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +import unittest + +import numpy as np +import paddle + +from deepmd.pd.utils.utils import ( + to_numpy_array, + to_paddle_tensor, +) + +from ..seed import ( + GLOBAL_SEED, +) + + +class TestCvt(unittest.TestCase): + def test_to_numpy(self): + rng = np.random.default_rng(GLOBAL_SEED) + foo = rng.normal([3, 4]) + for ptp, npp in zip( + [paddle.float16, paddle.float32, paddle.float64], + [np.float16, np.float32, np.float64], + ): + foo = foo.astype(npp) + bar = to_paddle_tensor(foo) + self.assertEqual(bar.dtype, ptp) + onk = to_numpy_array(bar) + self.assertEqual(onk.dtype, npp) + with self.assertRaises(ValueError) as ee: + foo = foo.astype(np.int8) + bar = to_paddle_tensor(foo) + with self.assertRaises(ValueError) as ee: + bar = to_paddle_tensor(foo) + bar = to_numpy_array(bar.int()) diff --git a/source/tests/pd/water b/source/tests/pd/water new file mode 120000 index 0000000000..9e74b75a82 --- /dev/null +++ b/source/tests/pd/water @@ -0,0 +1 @@ +model/water/ \ No newline at end of file
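For reference, the sel arithmetic exercised by test_update_one_sel and test_wrap_up_4 above can be reproduced with a minimal sketch; wrap_up_4 and auto_sel below are hypothetical stand-ins inferred from the expected values in the tests, not the deepmd.pd implementation:

import math


def wrap_up_4(x: int) -> int:
    # round up to the next multiple of 4: 12 -> 12, 13 -> 16, 17 -> 20
    return 4 * ((int(x) + 3) // 4)


def auto_sel(max_nbor_size: list[int], sel: str = "auto") -> list[int]:
    # scale each observed neighbor count by a ratio (1.1 by default,
    # "auto:1.5" overrides it), then round up to a multiple of 4
    ratio = float(sel.split(":")[1]) if ":" in sel else 1.1
    return [wrap_up_4(math.ceil(n * ratio)) for n in max_nbor_size]


assert wrap_up_4(12) == 12 and wrap_up_4(13) == 16 and wrap_up_4(17) == 20
assert auto_sel([10, 20], "auto") == [12, 24]  # mirrors test_update_one_sel
assert auto_sel([10, 20], "auto:1.5") == [16, 32]
assert auto_sel([25], "auto") == [28]  # mirrors the skipped se_atten case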