Added ABACUSInMemoryDataset in data module (deepmodeling#11)
* Prototype code for loading the Hamiltonian

* Add 'ABACUSDataset' to the data module

* Modify "basis.dat" storage; the overlap matrix can now be loaded

* Restore some original dataset settings

* Add ABACUSDataset to __init__

* Add the in-memory version of ABACUSDataset

* Add ABACUSInMemoryDataset to the data package
SharpLonde authored Dec 1, 2023
1 parent 2db61f2 commit 8f85806
Showing 4 changed files with 122 additions and 7 deletions.
2 changes: 2 additions & 0 deletions dptb/data/__init__.py
@@ -16,6 +16,7 @@
    ASEDataset,
    HDF5Dataset,
    ABACUSDataset,
    ABACUSInMemoryDataset,
)
from .dataloader import DataLoader, Collater, PartialSampler
from .build import dataset_from_config
@@ -33,6 +34,7 @@
    ASEDataset,
    HDF5Dataset,
    ABACUSDataset,
    ABACUSInMemoryDataset,
    DataLoader,
    Collater,
    PartialSampler,
3 changes: 2 additions & 1 deletion dptb/data/dataset/__init__.py
@@ -3,5 +3,6 @@
from ._npz_dataset import NpzDataset
from ._hdf5_dataset import HDF5Dataset
from ._abacus_dataset import ABACUSDataset
from ._abacus_dataset_mem import ABACUSInMemoryDataset

__all__ = [ABACUSDataset, ASEDataset, AtomicDataset, AtomicInMemoryDataset, NpzDataset, HDF5Dataset]
__all__ = [ABACUSInMemoryDataset, ABACUSDataset, ASEDataset, AtomicDataset, AtomicInMemoryDataset, NpzDataset, HDF5Dataset]
109 changes: 109 additions & 0 deletions dptb/data/dataset/_abacus_dataset_mem.py
@@ -0,0 +1,109 @@
from typing import Dict, Any, List, Callable, Union, Optional
import os

import numpy as np
import h5py

import torch

from .. import (
AtomicData,
AtomicDataDict,
)
from ..transforms import TypeMapper, OrbitalMapper
from ._base_datasets import AtomicInMemoryDataset
from dptb.nn.hamiltonian import E3Hamiltonian
from dptb.data.interfaces.ham_to_feature import ham_block_to_feature
from dptb.data.interfaces.abacus import recursive_parse

orbitalLId = {0:"s", 1:"p", 2:"d", 3:"f"}

def _abacus_h5_reader(h5file_path, AtomicData_options):
    data = h5py.File(h5file_path, "r")
    atomic_data = AtomicData.from_points(
        pos = data["pos"][:],
        cell = data["cell"][:],
        atomic_numbers = data["atomic_numbers"][:],
        **AtomicData_options,
    )
    if data["hamiltonian_blocks"]:
        basis = {}
        for key, value in data["basis"].items():
            basis[key] = [(f"{i+1}" + orbitalLId[l]) for i, l in enumerate(value)]
        idp = OrbitalMapper(basis)
        e3 = E3Hamiltonian(idp=idp, decompose=True)
        ham_block_to_feature(atomic_data, idp, data.get("hamiltonian_blocks", False), data.get("overlap_blocks", False))
        with torch.no_grad():
            atomic_data = e3(atomic_data.to_dict())
        atomic_data = AtomicData.from_dict(atomic_data)

    if data.get("eigenvalue") and data.get("kpoint"):
        atomic_data[AtomicDataDict.KPOINT_KEY] = torch.as_tensor(data["kpoint"][:], dtype=torch.get_default_dtype())
        atomic_data[AtomicDataDict.ENERGY_EIGENVALUE_KEY] = torch.as_tensor(data["eigenvalue"][:], dtype=torch.get_default_dtype())
    return atomic_data


class ABACUSInMemoryDataset(AtomicInMemoryDataset):

    def __init__(
        self,
        root: str,
        abacus_args: Dict[str, Union[str, bool]] = {
            "input_dir": None,
            "preprocess_dir": None,
            "only_overlap": False,
            "get_Ham": False,
            "add_overlap": False,
            "get_eigenvalues": False,
        },
        file_name: Optional[str] = None,
        url: Optional[str] = None,
        AtomicData_options: Dict[str, Any] = {},
        include_frames: Optional[List[int]] = None,
        type_mapper: TypeMapper = None,
        key_mapping: Dict[str, str] = {
            "pos": AtomicDataDict.POSITIONS_KEY,
            "energy": AtomicDataDict.TOTAL_ENERGY_KEY,
            "atomic_numbers": AtomicDataDict.ATOMIC_NUMBERS_KEY,
            "kpoints": AtomicDataDict.KPOINT_KEY,
            "eigenvalues": AtomicDataDict.ENERGY_EIGENVALUE_KEY,
        },
    ):
        if file_name is not None:
            self.file_name = file_name
        else:
            self.abacus_args = abacus_args
            assert self.abacus_args.get("input_dir") is not None, "ABACUS calculation results MUST be provided."
            if self.abacus_args.get("preprocess_dir") is None:
                print("Creating new preprocess directory...")
                os.mkdir(os.path.join(root, "preprocess"))
                self.abacus_args["preprocess_dir"] = os.path.join(root, "preprocess")
            self.key_mapping = key_mapping

            print("Begin parsing ABACUS output...")
            h5_filenames = recursive_parse(**self.abacus_args)
            self.file_name = h5_filenames
            print("Finished parsing ABACUS output.")

        super().__init__(
            file_name=self.file_name,
            url=url,
            root=root,
            AtomicData_options=AtomicData_options,
            include_frames=include_frames,
            type_mapper=type_mapper,
        )

    def get_data(self):
        data = []
        for h5_file in self.file_name:
            data.append(_abacus_h5_reader(h5_file, self.AtomicData_options))
        return data

    @property
    def raw_file_names(self):
        return "AtomicData.h5"

    @property
    def raw_dir(self):
        return self.abacus_args.get("input_dir")
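
For orientation, a minimal usage sketch of the new class (not part of this commit; the directory layout and the r_max cutoff are illustrative assumptions):

from dptb.data import ABACUSInMemoryDataset

# root is assumed to already exist; a "preprocess" sub-directory is created
# inside it when abacus_args["preprocess_dir"] is left as None.
dataset = ABACUSInMemoryDataset(
    root="./dataset",
    abacus_args={
        "input_dir": "./abacus_runs",   # one sub-folder per structure, each holding an OUT.ABACUS
        "preprocess_dir": None,
        "only_overlap": False,
        "get_Ham": True,
        "add_overlap": True,
        "get_eigenvalues": False,
    },
    AtomicData_options={"r_max": 5.0},  # assumed neighbour-list cutoff forwarded to AtomicData.from_points
)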
15 changes: 9 additions & 6 deletions dptb/data/interfaces/abacus.py
Original file line number Diff line number Diff line change
@@ -51,24 +51,27 @@ def transform(self, mat, l_lefts, l_rights):
        block_rights = block_diag(*[self.get_U(l_right) for l_right in l_rights])
        return block_lefts @ mat @ block_rights.T

def recursive_parse(input_dir, output_dir, data_name, only_S=False, get_Ham=False, add_overlap=False, get_eigenvalues=False):
def recursive_parse(input_dir, preprocess_dir, data_name="OUT.ABACUS", only_overlap=False, get_Ham=False, add_overlap=False, get_eigenvalues=False):
    input_dir = os.path.abspath(input_dir)
    output_dir = os.path.abspath(output_dir)
    os.makedirs(output_dir, exist_ok=True)
    preprocess_dir = os.path.abspath(preprocess_dir)
    os.makedirs(preprocess_dir, exist_ok=True)
    h5file_names = []
    for file in os.listdir(input_dir):
        if os.path.isdir(os.path.join(input_dir, file)):
            datafiles = os.listdir(os.path.join(input_dir, file))
            if data_name in datafiles:
                if os.path.exists(os.path.join(input_dir, file, data_name, "hscsr.tgz")):
                    os.system("cd "+os.path.join(input_dir, file, data_name) + " && tar -zxvf hscsr.tgz && mv OUT.ABACUS/* ./")
                try:
                    abacus_parse(os.path.join(input_dir, file), os.path.join(output_dir, file), data_name, only_S=only_S, get_Ham=get_Ham,
                    _abacus_parse(os.path.join(input_dir, file), os.path.join(preprocess_dir, file), data_name, only_S=only_overlap, get_Ham=get_Ham,
                                  add_overlap=add_overlap, get_eigenvalues=get_eigenvalues)
                    h5file_names.append(os.path.join(preprocess_dir, file, "AtomicData.h5"))
                except Exception as e:
                    print(f"Error in {data_name}: {e}")
                    continue
    return h5file_names

def abacus_parse(input_path,
def _abacus_parse(input_path,
                  output_path,
                  data_name,
                  only_S=False,
@@ -328,4 +331,4 @@ def parse_matrix(matrix_path, factor, spinful=False):
f["eigenvalue"] = band
# else:
# f["kpoint"] = False
# f["eigenvalue"] = False
# f["eigenvalue"] = False
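
For comparison, the renamed parser can also be driven directly; a minimal sketch with placeholder paths (none of these values come from this commit):

from dptb.data.interfaces.abacus import recursive_parse

h5_files = recursive_parse(
    input_dir="./abacus_runs",       # one sub-folder per ABACUS calculation
    preprocess_dir="./preprocess",   # created if missing; one AtomicData.h5 is written per structure
    data_name="OUT.ABACUS",
    only_overlap=False,
    get_Ham=True,
    add_overlap=False,
    get_eigenvalues=False,
)
# h5_files lists the AtomicData.h5 paths that ABACUSInMemoryDataset later reads via _abacus_h5_reader.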
