Merge branch 'devel' into spin_lmp
Signed-off-by: Duo <50307526+iProzd@users.noreply.github.com>
iProzd authored Oct 26, 2024
2 parents 114898f + fa61d69 commit fef13f5
Showing 107 changed files with 1,179 additions and 341 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/build_wheel.yml
@@ -148,6 +148,7 @@ jobs:
- uses: actions/download-artifact@v4
with:
path: source/install/docker/dist
pattern: cibw-*-manylinux_x86_64-cu${{ matrix.cuda_version }}*
merge-multiple: true
- name: Log in to the Container registry
uses: docker/login-action@v3
@@ -180,6 +181,7 @@ jobs:
- uses: actions/download-artifact@v4
with:
path: dist/packages
pattern: cibw-*
merge-multiple: true
- uses: actions/setup-python@v5
name: Install Python
2 changes: 1 addition & 1 deletion .github/workflows/package_c.yml
@@ -18,7 +18,7 @@ jobs:
strategy:
matrix:
include:
- tensorflow_build_version: "2.15"
- tensorflow_build_version: "2.18"
tensorflow_version: ""
filename: libdeepmd_c.tar.gz
- tensorflow_build_version: "2.14"
4 changes: 2 additions & 2 deletions .github/workflows/test_cuda.yml
@@ -47,7 +47,7 @@ jobs:
&& sudo apt-get -y install cuda-12-3 libcudnn8=8.9.5.*-1+cuda12.3
if: false # skip as we use nvidia image
- run: python -m pip install -U uv
- - run: source/install/uv_with_retry.sh pip install --system "tensorflow>=2.15.0rc0" "torch==2.3.1.*"
+ - run: source/install/uv_with_retry.sh pip install --system "tensorflow~=2.18.0rc2" "torch~=2.5.0"
- run: |
export PYTORCH_ROOT=$(python -c 'import torch;print(torch.__path__[0])')
export TENSORFLOW_ROOT=$(python -c 'import importlib,pathlib;print(pathlib.Path(importlib.util.find_spec("tensorflow").origin).parent)')
@@ -63,7 +63,7 @@ jobs:
CUDA_VISIBLE_DEVICES: 0
- name: Download libtorch
run: |
- wget https://download.pytorch.org/libtorch/cu121/libtorch-cxx11-abi-shared-with-deps-2.2.1%2Bcu121.zip -O libtorch.zip
+ wget https://download.pytorch.org/libtorch/cu124/libtorch-cxx11-abi-shared-with-deps-2.5.0%2Bcu124.zip -O libtorch.zip
unzip libtorch.zip
- run: |
export CMAKE_PREFIX_PATH=$GITHUB_WORKSPACE/libtorch
2 changes: 1 addition & 1 deletion .github/workflows/test_python.yml
@@ -26,7 +26,7 @@ jobs:
- run: python -m pip install -U uv
- run: |
source/install/uv_with_retry.sh pip install --system mpich
- source/install/uv_with_retry.sh pip install --system "torch==2.3.0+cpu.cxx11.abi" -i https://download.pytorch.org/whl/
+ source/install/uv_with_retry.sh pip install --system torch -i https://download.pytorch.org/whl/cpu
export PYTORCH_ROOT=$(python -c 'import torch;print(torch.__path__[0])')
source/install/uv_with_retry.sh pip install --system --only-binary=horovod -e .[cpu,test,jax] horovod[tensorflow-cpu] mpi4py
env:
14 changes: 11 additions & 3 deletions .pre-commit-config.yaml
@@ -29,7 +29,7 @@ repos:
exclude: ^source/3rdparty
- repo: https://github.com/astral-sh/ruff-pre-commit
# Ruff version.
- rev: v0.6.9
+ rev: v0.7.0
hooks:
- id: ruff
args: ["--fix"]
@@ -38,6 +38,14 @@ repos:
- id: ruff-format
exclude: ^source/3rdparty
types_or: [python, pyi, jupyter]
- repo: https://github.com/pycqa/flake8
# flake8 cannot autofix
rev: "7.1.1"
hooks:
- id: flake8
additional_dependencies:
- torchfix==0.6.0
- flake8-pyproject==1.2.3
# numpydoc
- repo: https://github.com/Carreau/velin
rev: 0.0.12
@@ -52,7 +60,7 @@ repos:
- id: blacken-docs
# C++
- repo: https://github.com/pre-commit/mirrors-clang-format
- rev: v19.1.1
+ rev: v19.1.2
hooks:
- id: clang-format
exclude: ^(source/3rdparty|source/lib/src/gpu/cudart/.+\.inc|.+\.ipynb$)
@@ -66,7 +74,7 @@ repos:
exclude: ^(source/3rdparty|\.github/workflows|\.clang-format)
# Shell
- repo: https://github.com/scop/pre-commit-shfmt
- rev: v3.9.0-1
+ rev: v3.10.0-1
hooks:
- id: shfmt
# CMake
18 changes: 18 additions & 0 deletions backend/find_pytorch.py
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: LGPL-3.0-or-later
import importlib
import os
import platform
import site
from functools import (
lru_cache,
@@ -22,6 +23,9 @@
Union,
)

from packaging.specifiers import (
SpecifierSet,
)
from packaging.version import (
Version,
)
@@ -104,6 +108,20 @@ def get_pt_requirement(pt_version: str = "") -> dict:
"""
if pt_version is None:
return {"torch": []}
if (
os.environ.get("CIBUILDWHEEL", "0") == "1"
and platform.system() == "Linux"
and platform.machine() == "x86_64"
):
cuda_version = os.environ.get("CUDA_VERSION", "12.2")
if cuda_version == "" or cuda_version in SpecifierSet(">=12,<13"):
# CUDA 12.2, cudnn 9
pt_version = "2.5.0"
elif cuda_version in SpecifierSet(">=11,<12"):
# CUDA 11.8, cudnn 8
pt_version = "2.3.1"
else:
raise RuntimeError("Unsupported CUDA version") from None
if pt_version == "":
pt_version = os.environ.get("PYTORCH_VERSION", "")

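For context, a minimal sketch of how the CUDA-version branching added above behaves. It assumes only that packaging is installed; the helper name pick_pt_version is illustrative, while the pins ("2.5.0", "2.3.1") mirror the diff.

# Hedged sketch: reproduce the CUDA_VERSION -> PyTorch pin selection above.
from packaging.specifiers import SpecifierSet


def pick_pt_version(cuda_version: str) -> str:
    # empty string falls back to the CUDA 12 pin, matching the diff
    if cuda_version == "" or cuda_version in SpecifierSet(">=12,<13"):
        return "2.5.0"  # CUDA 12.x wheels, cuDNN 9
    if cuda_version in SpecifierSet(">=11,<12"):
        return "2.3.1"  # CUDA 11.x wheels, cuDNN 8
    raise RuntimeError("Unsupported CUDA version")


assert pick_pt_version("12.2") == "2.5.0"
assert pick_pt_version("11.8") == "2.3.1"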
6 changes: 3 additions & 3 deletions backend/find_tensorflow.py
@@ -85,14 +85,14 @@ def find_tensorflow() -> tuple[Optional[str], list[str]]:
if os.environ.get("CIBUILDWHEEL", "0") == "1":
cuda_version = os.environ.get("CUDA_VERSION", "12.2")
if cuda_version == "" or cuda_version in SpecifierSet(">=12,<13"):
- # CUDA 12.2
+ # CUDA 12.2, cudnn 9
requires.extend(
[
"tensorflow-cpu>=2.15.0rc0; platform_machine=='x86_64' and platform_system == 'Linux'",
"tensorflow-cpu>=2.18.0rc0; platform_machine=='x86_64' and platform_system == 'Linux'",
]
)
elif cuda_version in SpecifierSet(">=11,<12"):
- # CUDA 11.8
+ # CUDA 11.8, cudnn 8
requires.extend(
[
"tensorflow-cpu>=2.5.0rc0,<2.15; platform_machine=='x86_64' and platform_system == 'Linux'",
17 changes: 12 additions & 5 deletions deepmd/backend/jax.py
@@ -32,14 +32,13 @@ class JAXBackend(Backend):
name = "JAX"
"""The formal name of the backend."""
features: ClassVar[Backend.Feature] = (
- Backend.Feature(0)
+ Backend.Feature.IO
# Backend.Feature.ENTRY_POINT
# | Backend.Feature.DEEP_EVAL
# | Backend.Feature.NEIGHBOR_STAT
- # | Backend.Feature.IO
)
"""The features of the backend."""
- suffixes: ClassVar[list[str]] = []
+ suffixes: ClassVar[list[str]] = [".jax"]
"""The suffixes of the backend."""

def is_available(self) -> bool:
@@ -94,7 +93,11 @@ def serialize_hook(self) -> Callable[[str], dict]:
Callable[[str], dict]
The serialize hook of the backend.
"""
- raise NotImplementedError
+ from deepmd.jax.utils.serialization import (
+ serialize_from_file,
+ )
+
+ return serialize_from_file

@property
def deserialize_hook(self) -> Callable[[str, dict], None]:
@@ -105,4 +108,8 @@ def deserialize_hook(self) -> Callable[[str, dict], None]:
Callable[[str, dict], None]
The deserialize hook of the backend.
"""
- raise NotImplementedError
+ from deepmd.jax.utils.serialization import (
+ deserialize_to_file,
+ )
+
+ return deserialize_to_file
2 changes: 1 addition & 1 deletion deepmd/dpmodel/array_api.py
@@ -61,7 +61,7 @@ def xp_take_along_axis(arr, indices, axis):
else:
indices = xp.reshape(indices, (0, 0))

- offset = (xp.arange(indices.shape[0]) * m)[:, xp.newaxis]
+ offset = (xp.arange(indices.shape[0], dtype=indices.dtype) * m)[:, xp.newaxis]
indices = xp.reshape(offset + indices, (-1,))

out = xp.take(arr, indices)
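As background, a small NumPy-only sketch of the flatten-offset-take pattern that xp_take_along_axis uses; the values are made up, and passing dtype=indices.dtype (as in the change above) presumably keeps offset + indices in one integer dtype across array-API backends.

# Illustrative only; the real helper is array-API generic.
import numpy as np

arr = np.array([[10.0, 20.0, 30.0], [40.0, 50.0, 60.0]])
indices = np.array([[2, 0], [1, 1]], dtype=np.int32)

m = arr.shape[-1]
# offset shares the dtype of `indices`, mirroring the diff
offset = (np.arange(indices.shape[0], dtype=indices.dtype) * m)[:, np.newaxis]
flat = np.reshape(offset + indices, (-1,))
out = np.take(np.reshape(arr, (-1,)), flat).reshape(indices.shape)

assert np.array_equal(out, np.take_along_axis(arr, indices, axis=-1))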
31 changes: 21 additions & 10 deletions deepmd/dpmodel/atomic_model/base_atomic_model.py
@@ -1,13 +1,15 @@
# SPDX-License-Identifier: LGPL-3.0-or-later
import copy
import math
from typing import (
Optional,
)

import array_api_compat
import numpy as np

from deepmd.dpmodel.common import (
NativeOP,
to_numpy_array,
)
from deepmd.dpmodel.output_def import (
FittingOutputDef,
@@ -17,6 +19,9 @@
AtomExcludeMask,
PairExcludeMask,
)
from deepmd.env import (
GLOBAL_NP_FLOAT_PRECISION,
)
from deepmd.utils.finetune import (
get_index_between_two_maps,
map_atom_exclude_types,
@@ -54,8 +59,12 @@ def init_out_stat(self):
[self.atomic_output_def()[kk].size for kk in self.bias_keys]
)
self.n_out = len(self.bias_keys)
- out_bias_data = np.zeros([self.n_out, ntypes, self.max_out_size]) # pylint: disable=no-explicit-dtype
- out_std_data = np.ones([self.n_out, ntypes, self.max_out_size]) # pylint: disable=no-explicit-dtype
+ out_bias_data = np.zeros(
+ [self.n_out, ntypes, self.max_out_size], dtype=GLOBAL_NP_FLOAT_PRECISION
+ )
+ out_std_data = np.ones(
+ [self.n_out, ntypes, self.max_out_size], dtype=GLOBAL_NP_FLOAT_PRECISION
+ )
self.out_bias = out_bias_data
self.out_std = out_std_data

@@ -172,17 +181,18 @@ def forward_common_atomic(
ret_dict["mask"][ff,ii] == 0 indicating the ii-th atom of the ff-th frame is virtual.
"""
xp = array_api_compat.array_namespace(extended_coord, extended_atype, nlist)
_, nloc, _ = nlist.shape
atype = extended_atype[:, :nloc]
if self.pair_excl is not None:
pair_mask = self.pair_excl.build_type_exclude_mask(nlist, extended_atype)
# exclude neighbors in the nlist
- nlist = np.where(pair_mask == 1, nlist, -1)
+ nlist = xp.where(pair_mask == 1, nlist, -1)

ext_atom_mask = self.make_atom_mask(extended_atype)
ret_dict = self.forward_atomic(
extended_coord,
- np.where(ext_atom_mask, extended_atype, 0),
+ xp.where(ext_atom_mask, extended_atype, 0),
nlist,
mapping=mapping,
fparam=fparam,
Expand All @@ -191,13 +201,13 @@ def forward_common_atomic(
ret_dict = self.apply_out_stat(ret_dict, atype)

# nf x nloc
- atom_mask = ext_atom_mask[:, :nloc].astype(np.int32)
+ atom_mask = ext_atom_mask[:, :nloc].astype(xp.int32)
if self.atom_excl is not None:
atom_mask *= self.atom_excl.build_type_exclude_mask(atype)

for kk in ret_dict.keys():
out_shape = ret_dict[kk].shape
- out_shape2 = np.prod(out_shape[2:])
+ out_shape2 = math.prod(out_shape[2:])
ret_dict[kk] = (
ret_dict[kk].reshape([out_shape[0], out_shape[1], out_shape2])
* atom_mask[:, :, None]
@@ -232,14 +242,15 @@ def serialize(self) -> dict:
"rcond": self.rcond,
"preset_out_bias": self.preset_out_bias,
"@variables": {
"out_bias": self.out_bias,
"out_std": self.out_std,
"out_bias": to_numpy_array(self.out_bias),
"out_std": to_numpy_array(self.out_std),
},
}

@classmethod
def deserialize(cls, data: dict) -> "BaseAtomicModel":
- data = copy.deepcopy(data)
+ # do not deep copy Descriptor and Fitting class
+ data = data.copy()
variables = data.pop("@variables")
obj = cls(**data)
for kk in variables.keys():
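As background for the np.where -> xp.where changes above, a short hedged sketch of the dispatch pattern: array_api_compat.array_namespace returns the namespace matching its inputs, so the same masking call works for NumPy, JAX, or any other array-API-compatible arrays. The toy values below are illustrative.

# Illustrative only: same masking call as forward_common_atomic, on toy inputs.
import array_api_compat
import numpy as np

nlist = np.array([[0, 1, -1], [2, -1, -1]])
pair_mask = np.array([[1, 0, 1], [1, 1, 0]])

xp = array_api_compat.array_namespace(nlist, pair_mask)  # numpy namespace here
masked_nlist = xp.where(pair_mask == 1, nlist, -1)
# -> [[0, -1, -1], [2, -1, -1]]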
10 changes: 8 additions & 2 deletions deepmd/dpmodel/atomic_model/dp_atomic_model.py
@@ -169,14 +169,20 @@ def serialize(self) -> dict:
)
return dd

# for subclasses to override
base_descriptor_cls = BaseDescriptor
"""The base descriptor class."""
base_fitting_cls = BaseFitting
"""The base fitting class."""

@classmethod
def deserialize(cls, data) -> "DPAtomicModel":
data = copy.deepcopy(data)
check_version_compatibility(data.pop("@version", 1), 2, 2)
data.pop("@class")
data.pop("type")
- descriptor_obj = BaseDescriptor.deserialize(data.pop("descriptor"))
- fitting_obj = BaseFitting.deserialize(data.pop("fitting"))
+ descriptor_obj = cls.base_descriptor_cls.deserialize(data.pop("descriptor"))
+ fitting_obj = cls.base_fitting_cls.deserialize(data.pop("fitting"))
data["descriptor"] = descriptor_obj
data["fitting"] = fitting_obj
obj = super().deserialize(data)
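The base_descriptor_cls / base_fitting_cls attributes let a subclass redirect deserialization without rewriting deserialize(). A self-contained sketch of that pattern, using hypothetical class names rather than actual deepmd classes:

# Hypothetical names; only the pattern mirrors the diff above.
class BasePart:
    @classmethod
    def deserialize(cls, data: dict) -> "BasePart":
        return cls()


class JAXPart(BasePart):
    pass


class Model:
    base_part_cls = BasePart  # subclasses may override this

    @classmethod
    def deserialize(cls, data: dict) -> "Model":
        obj = cls()
        # resolved through `cls`, so subclasses pick up their own base class
        obj.part = cls.base_part_cls.deserialize(data.get("part", {}))
        return obj


class JAXModel(Model):
    base_part_cls = JAXPart  # the only line the subclass needs


assert isinstance(JAXModel.deserialize({}).part, JAXPart)
assert isinstance(Model.deserialize({}).part, BasePart)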
9 changes: 8 additions & 1 deletion deepmd/dpmodel/atomic_model/linear_atomic_model.py
@@ -12,6 +12,9 @@
get_multiple_nlist_key,
nlist_distinguish_types,
)
from deepmd.env import (
GLOBAL_NP_FLOAT_PRECISION,
)
from deepmd.utils.version import (
check_version_compatibility,
)
@@ -286,7 +289,11 @@ def _compute_weight(
"""This should be a list of user defined weights that matches the number of models to be combined."""
nmodels = len(self.models)
nframes, nloc, _ = nlists_[0].shape
- return [np.ones((nframes, nloc, 1)) / nmodels for _ in range(nmodels)] # pylint: disable=no-explicit-dtype
+ # the dtype of weights is the interface data type.
+ return [
+ np.ones((nframes, nloc, 1), dtype=GLOBAL_NP_FLOAT_PRECISION) / nmodels
+ for _ in range(nmodels)
+ ]

def get_dim_fparam(self) -> int:
"""Get the number (dimension) of frame parameters of this atomic model."""
8 changes: 5 additions & 3 deletions deepmd/dpmodel/atomic_model/pairtab_atomic_model.py
@@ -204,9 +204,10 @@ def forward_atomic(
self.tab.ntypes, self.tab.ntypes, self.tab.nspline, 4
)

- # (nframes, nloc, nnei)
+ # (nframes, nloc, nnei), index type is int64.
j_type = extended_atype[
- np.arange(extended_atype.shape[0])[:, None, None], masked_nlist # pylint: disable=no-explicit-dtype
+ np.arange(extended_atype.shape[0], dtype=np.int64)[:, None, None],
+ masked_nlist,
]

raw_atomic_energy = self._pair_tabulated_inter(
@@ -303,7 +304,8 @@ def _get_pairwise_dist(coords: np.ndarray, nlist: np.ndarray) -> np.ndarray:
np.ndarray
The pairwise distance between the atoms (nframes, nloc, nnei).
"""
- batch_indices = np.arange(nlist.shape[0])[:, None, None] # pylint: disable=no-explicit-dtype
+ # index type is int64
+ batch_indices = np.arange(nlist.shape[0], dtype=np.int64)[:, None, None]
neighbor_atoms = coords[batch_indices, nlist]
loc_atoms = coords[:, : nlist.shape[1], :]
pairwise_dr = loc_atoms[:, :, None, :] - neighbor_atoms
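The batch_indices and j_type lookups above rely on NumPy broadcasting a per-frame index against the neighbor list. A tiny illustration with made-up shapes and values:

# Illustrative values; a (nframes, 1, 1) frame index broadcasts against an
# (nframes, nloc, nnei) neighbor list to gather per-frame entries in one step.
import numpy as np

extended_atype = np.array([[0, 1, 1, 0], [1, 0, 0, 1]])        # (nframes=2, nall=4)
masked_nlist = np.array([[[1, 2], [3, 0]], [[0, 3], [2, 1]]])  # (2, nloc=2, nnei=2)

batch_indices = np.arange(extended_atype.shape[0], dtype=np.int64)[:, None, None]
j_type = extended_atype[batch_indices, masked_nlist]           # (2, 2, 2)

assert j_type.shape == masked_nlist.shape
assert j_type[1, 0, 1] == extended_atype[1, masked_nlist[1, 0, 1]]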
(The remaining changed files are not shown.)
