From 8f858060ccb77a1e53bdd1024cec72164f500914 Mon Sep 17 00:00:00 2001
From: Sharp Londe <93334987+SharpLonde@users.noreply.github.com>
Date: Fri, 1 Dec 2023 10:33:11 +0800
Subject: [PATCH] Added ABACUSInMemoryDataset in data module (#11)

* Prototype code for loading Hamiltonian

* add 'ABACUSDataset' in data module

* modified "basis.dat" storage & can load overlap

* recover some original dataset settings

* add ABACUSDataset in init

* Add the in memory version of ABACUSDataset

* add ABACUSInMemoryDataset in data package
---
 dptb/data/__init__.py                    |   2 +
 dptb/data/dataset/__init__.py            |   3 +-
 dptb/data/dataset/_abacus_dataset_mem.py | 109 +++++++++++++++++++++++
 dptb/data/interfaces/abacus.py           |  15 ++--
 4 files changed, 122 insertions(+), 7 deletions(-)
 create mode 100644 dptb/data/dataset/_abacus_dataset_mem.py

diff --git a/dptb/data/__init__.py b/dptb/data/__init__.py
index c6d35e07..2318e9b3 100644
--- a/dptb/data/__init__.py
+++ b/dptb/data/__init__.py
@@ -16,6 +16,7 @@
     ASEDataset,
     HDF5Dataset,
     ABACUSDataset,
+    ABACUSInMemoryDataset,
 )
 from .dataloader import DataLoader, Collater, PartialSampler
 from .build import dataset_from_config
@@ -33,6 +34,7 @@
     ASEDataset,
     HDF5Dataset,
     ABACUSDataset,
+    ABACUSInMemoryDataset,
     DataLoader,
     Collater,
     PartialSampler,
diff --git a/dptb/data/dataset/__init__.py b/dptb/data/dataset/__init__.py
index cf149093..b3bf94ba 100644
--- a/dptb/data/dataset/__init__.py
+++ b/dptb/data/dataset/__init__.py
@@ -3,5 +3,6 @@
 from ._npz_dataset import NpzDataset
 from ._hdf5_dataset import HDF5Dataset
 from ._abacus_dataset import ABACUSDataset
+from ._abacus_dataset_mem import ABACUSInMemoryDataset
 
-__all__ = [ABACUSDataset, ASEDataset, AtomicDataset, AtomicInMemoryDataset, NpzDataset, HDF5Dataset]
+__all__ = [ABACUSInMemoryDataset, ABACUSDataset, ASEDataset, AtomicDataset, AtomicInMemoryDataset, NpzDataset, HDF5Dataset]
diff --git a/dptb/data/dataset/_abacus_dataset_mem.py b/dptb/data/dataset/_abacus_dataset_mem.py
new file mode 100644
index 00000000..f28da760
--- /dev/null
+++ b/dptb/data/dataset/_abacus_dataset_mem.py
@@ -0,0 +1,109 @@
+from typing import Dict, Any, List, Callable, Union, Optional
+import os
+
+import numpy as np
+import h5py
+
+import torch
+
+from .. import (
+    AtomicData,
+    AtomicDataDict,
+)
+from ..transforms import TypeMapper, OrbitalMapper
+from ._base_datasets import AtomicInMemoryDataset
+from dptb.nn.hamiltonian import E3Hamiltonian
+from dptb.data.interfaces.ham_to_feature import ham_block_to_feature
+from dptb.data.interfaces.abacus import recursive_parse
+
+orbitalLId = {0:"s", 1:"p", 2:"d", 3:"f"}
+
+def _abacus_h5_reader(h5file_path, AtomicData_options):
+    data = h5py.File(h5file_path, "r")
+    atomic_data = AtomicData.from_points(
+        pos = data["pos"][:],
+        cell = data["cell"][:],
+        atomic_numbers = data["atomic_numbers"][:],
+        **AtomicData_options,
+    )
+    if data["hamiltonian_blocks"]:
+        basis = {}
+        for key, value in data["basis"].items():
+            basis[key] = [(f"{i+1}" + orbitalLId[l]) for i, l in enumerate(value)]
+        idp = OrbitalMapper(basis)
+        e3 = E3Hamiltonian(idp=idp, decompose=True)
+        ham_block_to_feature(atomic_data, idp, data.get("hamiltonian_blocks", False), data.get("overlap_blocks", False))
+        with torch.no_grad():
+            atomic_data = e3(atomic_data.to_dict())
+        atomic_data = AtomicData.from_dict(atomic_data)
+
+    if data.get("eigenvalue") and data.get("kpoint"):
+        atomic_data[AtomicDataDict.KPOINT_KEY] = torch.as_tensor(data["kpoint"][:], dtype=torch.get_default_dtype())
+        atomic_data[AtomicDataDict.ENERGY_EIGENVALUE_KEY] = torch.as_tensor(data["eigenvalue"][:], dtype=torch.get_default_dtype())
+    return atomic_data
+
+
+class ABACUSInMemoryDataset(AtomicInMemoryDataset):
+
+    def __init__(
+        self,
+        root: str,
+        abacus_args: Dict[str, Union[str,bool]] = {
+            "input_dir": None,
+            "preprocess_dir": None,
+            "only_overlap": False,
+            "get_Ham": False,
+            "add_overlap": False,
+            "get_eigenvalues": False,
+        },
+        file_name: Optional[str] = None,
+        url: Optional[str] = None,
+        AtomicData_options: Dict[str, Any] = {},
+        include_frames: Optional[List[int]] = None,
+        type_mapper: TypeMapper = None,
+        key_mapping: Dict[str, str] = {
+            "pos": AtomicDataDict.POSITIONS_KEY,
+            "energy": AtomicDataDict.TOTAL_ENERGY_KEY,
+            "atomic_numbers": AtomicDataDict.ATOMIC_NUMBERS_KEY,
+            "kpoints": AtomicDataDict.KPOINT_KEY,
+            "eigenvalues": AtomicDataDict.ENERGY_EIGENVALUE_KEY,
+        },
+    ):
+        if file_name is not None:
+            self.file_name = file_name
+        else:
+            self.abacus_args = abacus_args
+            assert self.abacus_args.get("input_dir") is not None, "ABACUS calculation results MUST be provided."
+            if self.abacus_args.get("preprocess_dir") is None:
+                print("Creating new preprocess directory...")
+                os.mkdir(os.path.join(root, "preprocess"))
+                self.abacus_args["preprocess_dir"] = os.path.join(root, "preprocess")
+            self.key_mapping = key_mapping
+
+            print("Begin parsing ABACUS output...")
+            h5_filenames = recursive_parse(**self.abacus_args)
+            self.file_name = h5_filenames
+            print("Finished parsing ABACUS output.")
+
+        super().__init__(
+            file_name=self.file_name,
+            url=url,
+            root=root,
+            AtomicData_options=AtomicData_options,
+            include_frames=include_frames,
+            type_mapper=type_mapper,
+        )
+
+    def get_data(self):
+        data = []
+        for h5_file in self.file_name:
+            data.append(_abacus_h5_reader(h5_file, self.AtomicData_options))
+        return data
+
+    @property
+    def raw_file_names(self):
+        return "AtomicData.h5"
+
+    @property
+    def raw_dir(self):
+        return self.abacus_args.get("input_dir")
\ No newline at end of file
diff --git a/dptb/data/interfaces/abacus.py b/dptb/data/interfaces/abacus.py
index 1332983d..46ff9475 100644
--- a/dptb/data/interfaces/abacus.py
+++ b/dptb/data/interfaces/abacus.py
@@ -51,10 +51,11 @@ def transform(self, mat, l_lefts, l_rights):
         block_rights = block_diag(*[self.get_U(l_right) for l_right in l_rights])
         return block_lefts @ mat @ block_rights.T
 
-def recursive_parse(input_dir, output_dir, data_name, only_S=False, get_Ham=False, add_overlap=False, get_eigenvalues=False):
+def recursive_parse(input_dir, preprocess_dir, data_name="OUT.ABACUS", only_overlap=False, get_Ham=False, add_overlap=False, get_eigenvalues=False):
     input_dir = os.path.abspath(input_dir)
-    output_dir = os.path.abspath(output_dir)
-    os.makedirs(output_dir, exist_ok=True)
+    preprocess_dir = os.path.abspath(preprocess_dir)
+    os.makedirs(preprocess_dir, exist_ok=True)
+    h5file_names = []
     for file in os.listdir(input_dir):
         if os.path.isdir(os.path.join(input_dir, file)):
             datafiles = os.listdir(os.path.join(input_dir, file))
@@ -62,13 +63,15 @@ def recursive_parse(input_dir, output_dir, data_name, only_S=Fals
                 if os.path.exists(os.path.join(input_dir, file, data_name, "hscsr.tgz")):
                     os.system("cd "+os.path.join(input_dir, file, data_name) + " && tar -zxvf hscsr.tgz && mv OUT.ABACUS/* ./")
                 try:
-                    abacus_parse(os.path.join(input_dir, file), os.path.join(output_dir, file), data_name, only_S=only_S, get_Ham=get_Ham,
+                    _abacus_parse(os.path.join(input_dir, file), os.path.join(preprocess_dir, file), data_name, only_S=only_overlap, get_Ham=get_Ham,
                                  add_overlap=add_overlap, get_eigenvalues=get_eigenvalues)
+                    h5file_names.append(os.path.join(preprocess_dir, file, "AtomicData.h5"))
                 except Exception as e:
                     print(f"Error in {data_name}: {e}")
                     continue
+    return h5file_names
 
-def abacus_parse(input_path,
+def _abacus_parse(input_path,
                  output_path,
                  data_name,
                  only_S=False,
@@ -328,4 +331,4 @@ def parse_matrix(matrix_path, factor, spinful=False):
         f["eigenvalue"] = band
     # else:
    #     f["kpoint"] = False
-    #     f["eigenvalue"] = False
\ No newline at end of file
+    #     f["eigenvalue"] = False
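
For context, a minimal usage sketch of the in-memory dataset added by this patch is given below. It is not part of the patch itself: the directory paths, the "r_max" value passed through AtomicData_options, and the indexing behaviour inherited from AtomicInMemoryDataset are assumptions for illustration only.

# Hypothetical usage sketch; paths and option values are assumed, not taken from the patch.
from dptb.data import ABACUSInMemoryDataset

dataset = ABACUSInMemoryDataset(
    root="./my_dataset",                  # assumed existing working directory; "preprocess" is created inside it
    abacus_args={
        "input_dir": "./abacus_runs",     # assumed layout: one sub-directory per ABACUS calculation
        "preprocess_dir": None,           # None -> <root>/preprocess is created and filled with AtomicData.h5 files
        "only_overlap": False,
        "get_Ham": True,                  # parse Hamiltonian blocks via recursive_parse
        "add_overlap": False,
        "get_eigenvalues": False,
    },
    AtomicData_options={"r_max": 5.0},    # assumed neighbor-list options forwarded to AtomicData.from_points
)

print(len(dataset))                       # number of parsed frames (one per AtomicData.h5)
frame = dataset[0]                        # an AtomicData graph built by _abacus_h5_reader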