From 0307f92266f7b10476c95dd83ca7aa6e4510437e Mon Sep 17 00:00:00 2001
From: zhanghao <zhouyinzhanghao@gmail.com>
Date: Wed, 24 Apr 2024 17:10:44 +0800
Subject: [PATCH 1/8] update abacus parse md

---
 dptb/data/interfaces/abacus.py | 305 ++++++++++++++++++++++++++++++++-
 1 file changed, 297 insertions(+), 8 deletions(-)

diff --git a/dptb/data/interfaces/abacus.py b/dptb/data/interfaces/abacus.py
index 74b2bc1d..aa129a65 100644
--- a/dptb/data/interfaces/abacus.py
+++ b/dptb/data/interfaces/abacus.py
@@ -92,14 +92,24 @@ def recursive_parse(input_path,
                 if os.path.exists(os.path.join(folder, data_name, "hscsr.tgz")):
                     os.system("cd "+os.path.join(folder, data_name) + " && tar -zxvf hscsr.tgz && mv OUT.ABACUS/* ./")
                 try:
-                    _abacus_parse(folder, 
-                                  os.path.join(preprocess_dir, f"{prefix}.{index}"), 
-                                  data_name,
-                                  get_Ham=parse_Hamiltonian,
-                                  get_DM=parse_DM,
-                                  get_overlap=parse_overlap, 
-                                  get_eigenvalues=parse_eigenvalues)
+                    if os.path.exists(os.path.join(folder, data_name, "running_get_S.log")) or \
+                        os.path.exists(os.path.join(folder, data_name, "running_scf.log")):
+                        _abacus_parse(folder, 
+                                    os.path.join(preprocess_dir, f"{prefix}.{index}"), 
+                                    data_name,
+                                    get_Ham=parse_Hamiltonian,
+                                    get_DM=parse_DM,
+                                    get_overlap=parse_overlap, 
+                                    get_eigenvalues=parse_eigenvalues)
                     #h5file_names.append(os.path.join(file, "AtomicData.h5"))
+                    elif os.path.exists(os.path.join(folder, data_name, "running_md.log")):
+                        _abacus_parse_md(folder, 
+                                    os.path.join(preprocess_dir, f"{prefix}.{index}"), 
+                                    data_name,
+                                    get_Ham=parse_Hamiltonian,
+                                    get_DM=parse_DM,
+                                    get_overlap=parse_overlap, 
+                                    get_eigenvalues=parse_eigenvalues)
                     pbar.update(1)
                 except Exception as e:
                     print(f"Error in {folder}/{data_name}: {e}")
@@ -212,9 +222,10 @@ def find_target_line(f, target):
         lattice = np.zeros((3, 3))
         for index_lat in range(3):
             lattice[index_lat, :] = np.array(f.readline().split())
+        lattice = lattice * lattice_constant
         if coords_type == "cartesian":
             frac_coords = frac_coords @ np.matrix(lattice).I  # get frac_coords anyway
-        lattice = lattice * lattice_constant
+        
 
         if get_Ham is False and get_overlap is True:
             spinful = False
@@ -394,3 +405,281 @@ def parse_matrix(matrix_path, factor, spinful=False):
     #    # else:
     #    #     f["kpoint"] = False
     #    #     f["eigenvalue"] = False
+
+def _abacus_parse_md(input_path, 
+                  output_path, 
+                  data_name, 
+                  get_Ham=False,
+                  get_DM=False,
+                  get_overlap=False, 
+                  get_eigenvalues=False):
+    
+    input_path = os.path.abspath(input_path)
+    output_path = os.path.abspath(output_path)
+    os.makedirs(output_path, exist_ok=True)
+
+    def find_target_line(f, target):
+        line = f.readline()
+        while line:
+            if target in line:
+                return line
+            line = f.readline()
+        return None
+    log_file_name = "running_md.log"
+
+    with open(os.path.join(input_path, data_name, log_file_name), 'r') as f_chk:
+        lines = f_chk.readlines()
+        if not lines or " Total  Time  :" not in lines[-1]:
+            raise ValueError(f"Job is not normal ending!")
+
+    with open(os.path.join(input_path, data_name, log_file_name), 'r') as f:
+        f.readline()
+        line = f.readline()
+        # assert "WELCOME TO ABACUS" in line
+        assert find_target_line(f, "READING UNITCELL INFORMATION") is not None, 'Cannot find "READING UNITCELL INFORMATION" in log file'
+        num_atom_type = int(f.readline().split()[-1])
+
+        assert find_target_line(f, "lattice constant (Bohr)") is not None
+        lattice_constant = float(f.readline().split()[-1]) # unit is Angstrom, didn't read (Bohr) here.
+
+        site_norbits_dict = {}
+        orbital_types_dict = {}
+        for index_type in range(num_atom_type):
+            tmp = find_target_line(f, "READING ATOM TYPE")
+            assert tmp is not None, 'Cannot find "ATOM TYPE" in log file'
+            assert tmp.split()[-1] == str(index_type + 1)
+            if tmp is None:
+                raise Exception(f"Cannot find ATOM {index_type} in {log_file_name}")
+
+            line = f.readline()
+            assert "atom label =" in line
+            atom_label = line.split()[-1]
+            atom_label = ''.join(re.findall(r'[A-Za-z]', atom_label))
+            assert atom_label in ase.data.atomic_numbers, "Atom label should be in periodic table"
+            atom_type = ase.data.atomic_numbers[atom_label]
+
+            current_site_norbits = 0
+            current_orbital_types = []
+            while True:
+                line = f.readline()
+                if "number of zeta" in line: # parse the basis
+                    tmp = line.split()
+                    L = int(tmp[0][2:-1])
+                    num_L = int(tmp[-1])
+                    current_site_norbits += (2 * L + 1) * num_L
+                    current_orbital_types.extend([L] * num_L)
+                else:
+                    break
+            site_norbits_dict[atom_type] = current_site_norbits
+            orbital_types_dict[atom_type] = current_orbital_types
+
+        #print(orbital_types_dict)
+
+        line = find_target_line(f, "TOTAL ATOM NUMBER")
+        assert line is not None, 'Cannot find "TOTAL ATOM NUMBER" in log file'
+        nsites = int(line.split()[-1])
+        nframes = 0
+        with open(os.path.join(input_path, data_name, "MD_dump"), 'r') as f_dump:
+            line = 0
+            frac_coords_list = []
+            lattice_constant_list = []
+            lattice_list = []
+            line = find_target_line(f_dump, "MDSTEP:")
+            while line is not None:
+                step = line[9:]
+                nframes += 1
+                lattice_constant = float(find_target_line(f_dump, "LATTICE_CONSTANT").split()[1])
+                # get lattice vector   
+                assert find_target_line(f_dump, "LATTICE_VECTORS") is not None
+                lattice = np.zeros((3, 3))
+                for index_lat in range(3):
+                    lattice[index_lat, :] = np.array(f_dump.readline().split())
+                lattice = lattice * lattice_constant
+                lattice_list.append(lattice)
+                lattice_constant_list.append(lattice_constant)
+
+                line = find_target_line(f_dump, "INDEX    LABEL    POSITION (Angstrom)    FORCE (eV/Angstrom)    VELOCITY (Angstrom/fs)")
+                # get coordinates
+                frac_coords = np.zeros((nsites, 3))
+                site_norbits = np.zeros(nsites, dtype=int)
+                element = np.zeros(nsites, dtype=int)
+                for index_site in range(nsites):
+                    line = f_dump.readline()
+                    tmp = line.split()
+                    atom_label = ''.join(re.findall(r'[A-Za-z]', tmp[1]))
+                    assert atom_label in ase.data.atomic_numbers, "Atom label should be in periodic table"
+                    element[index_site] = ase.data.atomic_numbers[atom_label]
+                    site_norbits[index_site] = site_norbits_dict[element[index_site]]
+                    frac_coords[index_site, :] = np.array(tmp[2:5])
+                norbits = int(np.sum(site_norbits))
+                frac_coords_list.append(frac_coords @ np.matrix(lattice).I)
+                site_norbits_cumsum = np.cumsum(site_norbits)
+                
+
+                line = find_target_line(f_dump, "MDSTEP:")
+
+        frac_coords_list = np.concatenate(frac_coords_list, axis=0)
+        lattice_list = np.concatenate(lattice_list, axis=0)
+    
+        if get_Ham is False and get_overlap is True:
+            spinful = False
+        else:
+            line = find_target_line(f, "nspin")
+            if line is None:
+                line = find_target_line(f, "NSPIN")
+            assert line is not None, 'Cannot find "NSPIN" in log file'
+            if "NSPIN == 1" or "npin = 1" in line:
+                spinful = False
+            elif "NSPIN == 4" or "nspin = 4" in line:
+                spinful = True
+            else:
+                raise ValueError(f'{line} is not supported')
+            
+        
+    np.savetxt(os.path.join(output_path, "cell.dat"), lattice_list)
+    np.savetxt(os.path.join(output_path, "rcell.dat"), np.linalg.inv(lattice.reshape(-1, 3, 3)).reshape(-1, 3) * 2 * np.pi)
+    cart_coords_list = frac_coords_list @ lattice
+    np.savetxt(os.path.join(output_path, "positions.dat").format(output_path), cart_coords_list)
+    np.savetxt(os.path.join(output_path, "atomic_numbers.dat"), element, fmt='%d')
+    #info = {'nsites' : nsites, 'isorthogonal': False, 'isspinful': spinful, 'norbits': norbits}
+    #with open('{}/info.json'.format(output_path), 'w') as info_f:
+    #    json.dump(info, info_f)
+    with open(os.path.join(output_path, "basis.dat"), 'w') as f:
+        for atomic_number in element:
+            counter = Counter(orbital_types_dict[atomic_number])
+            num_orbs = [counter[i] for i in range(6)] # s, p, d, f, g, h
+            for index_l, l in enumerate(num_orbs):
+                if l == 0:  # no this orbit
+                    continue
+                if index_l == 0:
+                    f.write(f"{l}{orbitalId[index_l]}")
+                else:
+                    f.write(f"{l}{orbitalId[index_l]}")
+            f.write('\n')
+    atomic_basis = {}
+    for atomic_number, orbitals in orbital_types_dict.items():
+        atomic_basis[ase.data.chemical_symbols[atomic_number]] = orbitals
+
+    U_orbital = OrbAbacus2DeepTB()
+    def parse_matrix(matrix_path, factor, spinful=False):
+        matrix_dict = dict()
+        with open(matrix_path, 'r') as f:
+            line = f.readline() # read "Matrix Dimension of ..."
+            if not "Matrix Dimension of" in line:
+                line = f.readline() # ABACUS >= 3.0
+                assert "Matrix Dimension of" in line
+            f.readline() # read "Matrix number of ..."
+            norbits = int(line.split()[-1])
+            for line in f:
+                line1 = line.split()
+                if len(line1) == 0:
+                    break
+                num_element = int(line1[3])
+                if num_element != 0:
+                    R_cur = np.array(line1[:3]).astype(int)
+                    line2 = f.readline().split()
+                    line3 = f.readline().split()
+                    line4 = f.readline().split()
+                    if not spinful:
+                        hamiltonian_cur = csr_matrix((np.array(line2).astype(float), np.array(line3).astype(int),
+                                                        np.array(line4).astype(int)), shape=(norbits, norbits)).toarray()
+                    else:
+                        line2 = np.char.replace(line2, '(', '')
+                        line2 = np.char.replace(line2, ')', 'j')
+                        line2 = np.char.replace(line2, ',', '+')
+                        line2 = np.char.replace(line2, '+-', '-')
+                        hamiltonian_cur = csr_matrix((np.array(line2).astype(np.complex128), np.array(line3).astype(int),
+                                                    np.array(line4).astype(int)), shape=(norbits, norbits)).toarray()
+                    for index_site_i in range(nsites):
+                        for index_site_j in range(nsites):
+                            key_str = f"{index_site_i + 1}_{index_site_j + 1}_{R_cur[0]}_{R_cur[1]}_{R_cur[2]}"
+                            mat = hamiltonian_cur[(site_norbits_cumsum[index_site_i]
+                                                    - site_norbits[index_site_i]) * (1 + spinful):
+                                                    site_norbits_cumsum[index_site_i] * (1 + spinful),
+                                    (site_norbits_cumsum[index_site_j] - site_norbits[index_site_j]) * (1 + spinful):
+                                    site_norbits_cumsum[index_site_j] * (1 + spinful)]
+                            if abs(mat).max() < 1e-10:
+                                continue
+                            if not spinful:
+                                mat = U_orbital.transform(mat, orbital_types_dict[element[index_site_i]],
+                                                            orbital_types_dict[element[index_site_j]])
+                            else:
+                                mat = mat.reshape((site_norbits[index_site_i], 2, site_norbits[index_site_j], 2))
+                                mat = mat.transpose((1, 0, 3, 2)).reshape((2 * site_norbits[index_site_i],
+                                                                        2 * site_norbits[index_site_j]))
+                                mat = U_orbital.transform(mat, orbital_types_dict[element[index_site_i]] * 2,
+                                                        orbital_types_dict[element[index_site_j]] * 2)
+                            matrix_dict[key_str] = mat * factor
+        return matrix_dict, norbits
+
+    if get_Ham:
+        with h5py.File(os.path.join(output_path, "hamiltonians.h5"), 'w') as fid:
+            for i in range(nframes):
+                hamiltonian_dict, tmp = parse_matrix(
+                    os.path.join(input_path, data_name, "matrix/"+str(i)+"_data-HR-sparse_SPIN0.csr"), 13.605698, # Ryd2eV
+                    spinful=spinful)
+                assert tmp == norbits * (1 + spinful)
+                # creating a default group here adapting to the format used in DefaultDataset.
+                # by the way DefaultDataset loading h5 file, the index should be "1" here.
+                default_group = fid.create_group(str(i+1))
+                for key_str, value in hamiltonian_dict.items():
+                    default_group[key_str] = value
+
+    if get_overlap:
+        with h5py.File(os.path.join(output_path, "overlaps.h5"), 'w') as fid:
+            for i in range(nframes):
+                overlap_dict, tmp = parse_matrix(os.path.join(input_path, data_name, "matrix/"+str(i)+"_data-SR-sparse_SPIN0.csr"), 1,
+                                                    spinful=spinful)
+                assert tmp == norbits * (1 + spinful)
+                if spinful:
+                    overlap_dict_spinless = {}
+                    for k, v in overlap_dict.items():
+                        overlap_dict_spinless[k] = v[:v.shape[0] // 2, :v.shape[1] // 2].real
+                    overlap_dict_spinless, overlap_dict = overlap_dict, overlap_dict_spinless
+
+                default_group = fid.create_group(str(i+1))
+                for key_str, value in overlap_dict.items():
+                    default_group[key_str] = value
+
+    if get_DM:
+        with h5py.File(os.path.join(output_path, "DM.h5"), 'w') as fid:
+            for i in range(nframes):
+                DM_dict, tmp = parse_matrix(
+                    os.path.join(input_path, data_name, "matrix/"+str(i)+"_data-DMR-sparse_SPIN0.csr"), 1, spinful=spinful)
+                assert tmp == norbits * (1 + spinful)
+            # if spinful:
+            #     DM_dict_spinless = {}
+            #     for k, v in overlap_dict.items():
+            #         overlap_dict_spinless[k] = v[:v.shape[0] // 2, :v.shape[1] // 2].real
+            #     overlap_dict_spinless, overlap_dict = overlap_dict, overlap_dict_spinless
+                default_group = fid.create_group(str(i+1))
+                for key_str, value in DM_dict.items():
+                    default_group[key_str] = value
+
+    if get_eigenvalues:
+        raise ValueError("Currently not support MD eigenvalues parsing.")
+        kpts = []
+        with open(os.path.join(input_path, data_name, "kpoints"), "r") as f:
+            nkstot = f.readline().strip().split()[-1]
+            f.readline()
+            for _ in range(int(nkstot)):
+                line = f.readline()
+                kpt = []
+                line = line.strip().split()
+                kpt.extend([float(line[1]), float(line[2]), float(line[3])])
+                kpts.append(kpt)
+        kpts = np.array(kpts)
+
+        with open(os.path.join(input_path, data_name, "BANDS_1.dat"), "r") as file:
+            band_lines = file.readlines()
+        band = []
+        for line in band_lines:
+            values = line.strip().split()
+            eigs = [float(value) for value in values[1:]]
+            band.append(eigs)
+        band = np.array(band)
+
+        assert len(band) == len(kpts)
+        np.save(os.path.join(output_path, "kpoints.npy"), kpts)
+        np.save(os.path.join(output_path, "eigenvalues.npy"), band)
+

From 86087dd59f8ec83293721af097053c3d8032a002 Mon Sep 17 00:00:00 2001
From: zhanghao <zhouyinzhanghao@gmail.com>
Date: Thu, 25 Apr 2024 10:49:22 +0800
Subject: [PATCH 2/8] fix(default_dataset): The natom and nframe default
 setting

---
 dptb/data/dataset/_default_dataset.py | 28 +++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/dptb/data/dataset/_default_dataset.py b/dptb/data/dataset/_default_dataset.py
index 52ca442f..c6d50ae4 100644
--- a/dptb/data/dataset/_default_dataset.py
+++ b/dptb/data/dataset/_default_dataset.py
@@ -65,22 +65,11 @@ def __init__(self,
         else:
             raise ValueError("Wrong cell dimensions.")
         
-        # load atomic numbers
-        atomic_numbers = np.loadtxt(os.path.join(root, "atomic_numbers.dat"))
-        natoms = self.info["natoms"]
-        if natoms < 0:
-            natoms = atomic_numbers.shape[-1]
-        if atomic_numbers.shape[0] == self.info["natoms"]:
-            # same atomic_numbers, copy it to all frames.
-            atomic_numbers = np.expand_dims(atomic_numbers, axis=0)
-            self.data["atomic_numbers"] = np.broadcast_to(atomic_numbers, (self.info["nframes"], natoms))
-        elif atomic_numbers.shape[0] == natoms * self.info["nframes"]:
-            self.data["atomic_numbers"] = atomic_numbers.reshape(self.info["nframes"],natoms)
-        else:
-            raise ValueError("Wrong atomic_number dimensions.")
-        
         # load positions, stored as cartesion no matter what provided.
         pos = np.loadtxt(os.path.join(root, "positions.dat"))
+        natoms = self.info["natoms"]
+        if natoms < 0:
+            natoms = int(pos.shape[0] / self.info["nframes"])
         assert pos.shape[0] == self.info["nframes"] * natoms
         pos = pos.reshape(self.info["nframes"], natoms, 3)
         # ase use cartesian by default.
@@ -91,6 +80,17 @@ def __init__(self,
         else:
             raise NameError("Position type must be cart / frac.")
         
+        # load atomic numbers
+        atomic_numbers = np.loadtxt(os.path.join(root, "atomic_numbers.dat"))
+        if atomic_numbers.shape[0] == natoms:
+            # same atomic_numbers, copy it to all frames.
+            atomic_numbers = np.expand_dims(atomic_numbers, axis=0)
+            self.data["atomic_numbers"] = np.broadcast_to(atomic_numbers, (self.info["nframes"], natoms))
+        elif atomic_numbers.shape[0] == natoms * self.info["nframes"]:
+            self.data["atomic_numbers"] = atomic_numbers.reshape(self.info["nframes"],natoms)
+        else:
+            raise ValueError("Wrong atomic_number dimensions.")
+        
         # load optional data files
         if get_eigenvalues == True:
             if os.path.exists(os.path.join(self.root, "eigenvalues.npy")):

From 2c8fbe79cd18eb62456ba9d789b71c58e4b806ae Mon Sep 17 00:00:00 2001
From: zhanghao <zhouyinzhanghao@gmail.com>
Date: Sat, 27 Apr 2024 08:36:42 +0800
Subject: [PATCH 3/8] =?UTF-8?q?shift=20the=20'pos=E2=80=98=20position=20in?=
 =?UTF-8?q?=20test=20default=20dataset?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 dptb/tests/test_default_dataset.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dptb/tests/test_default_dataset.py b/dptb/tests/test_default_dataset.py
index 202edf7a..e8b42e82 100644
--- a/dptb/tests/test_default_dataset.py
+++ b/dptb/tests/test_default_dataset.py
@@ -59,7 +59,7 @@ def test_raw_data(self):
         assert self.dataset.raw_data[0].AtomicData_options == {'r_max': 5.0, 'er_max': 5.0, 'oer_max': 2.5, 'pbc': True}
         assert self.dataset.raw_data[0].info == self.info_files['kpath_spk.0']
         assert "bandinfo" in self.dataset.raw_data[0].info
-        assert  list(self.dataset.raw_data[0].data.keys()) == (['cell', 'atomic_numbers', 'pos', 'kpoints', 'eigenvalues'])
+        assert  list(self.dataset.raw_data[0].data.keys()) == (['cell', 'pos', 'atomic_numbers', 'kpoints', 'eigenvalues'])
         assert (np.abs(self.dataset.raw_data[0].data['cell'] - self.strase[0].cell) < 1e-6).all()
         assert (np.abs(self.dataset.raw_data[0].data['atomic_numbers'] - np.array([[14., 14.]])) < 1e-6).all()
         assert (np.abs(self.dataset.raw_data[0].data['pos'] - self.strase[0].positions) < 1e-6).all()

From af509570cf129454104f3eb771f35af49aa5d90e Mon Sep 17 00:00:00 2001
From: zhanghao <zhouyinzhanghao@gmail.com>
Date: Mon, 29 Apr 2024 16:08:52 +0800
Subject: [PATCH 4/8] fix lattice constant transform

---
 dptb/data/interfaces/abacus.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/dptb/data/interfaces/abacus.py b/dptb/data/interfaces/abacus.py
index aa129a65..25fb7def 100644
--- a/dptb/data/interfaces/abacus.py
+++ b/dptb/data/interfaces/abacus.py
@@ -222,9 +222,9 @@ def find_target_line(f, target):
         lattice = np.zeros((3, 3))
         for index_lat in range(3):
             lattice[index_lat, :] = np.array(f.readline().split())
-        lattice = lattice * lattice_constant
         if coords_type == "cartesian":
             frac_coords = frac_coords @ np.matrix(lattice).I  # get frac_coords anyway
+        lattice = lattice * lattice_constant
         
 
         if get_Ham is False and get_overlap is True:
@@ -439,8 +439,9 @@ def find_target_line(f, target):
         assert find_target_line(f, "READING UNITCELL INFORMATION") is not None, 'Cannot find "READING UNITCELL INFORMATION" in log file'
         num_atom_type = int(f.readline().split()[-1])
 
-        assert find_target_line(f, "lattice constant (Bohr)") is not None
-        lattice_constant = float(f.readline().split()[-1]) # unit is Angstrom, didn't read (Bohr) here.
+        line = find_target_line(f, "lattice constant (Angstrom)")
+        assert line is not None
+        lattice_constant = float(line.split()[-1]) # unit is Angstrom, didn't read (Bohr) here.
 
         site_norbits_dict = {}
         orbital_types_dict = {}

From 1ed4a5aacba6bb1b46b789c2316dacc9b10a60b4 Mon Sep 17 00:00:00 2001
From: zhanghao <zhouyinzhanghao@gmail.com>
Date: Mon, 29 Apr 2024 16:16:43 +0800
Subject: [PATCH 5/8] update parse_abacus_md

---
 dptb/data/interfaces/abacus.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/dptb/data/interfaces/abacus.py b/dptb/data/interfaces/abacus.py
index 25fb7def..b994c65e 100644
--- a/dptb/data/interfaces/abacus.py
+++ b/dptb/data/interfaces/abacus.py
@@ -482,7 +482,7 @@ def find_target_line(f, target):
         nframes = 0
         with open(os.path.join(input_path, data_name, "MD_dump"), 'r') as f_dump:
             line = 0
-            frac_coords_list = []
+            coords_list = []
             lattice_constant_list = []
             lattice_list = []
             line = find_target_line(f_dump, "MDSTEP:")
@@ -513,13 +513,13 @@ def find_target_line(f, target):
                     site_norbits[index_site] = site_norbits_dict[element[index_site]]
                     frac_coords[index_site, :] = np.array(tmp[2:5])
                 norbits = int(np.sum(site_norbits))
-                frac_coords_list.append(frac_coords @ np.matrix(lattice).I)
+                coords_list.append(frac_coords)
                 site_norbits_cumsum = np.cumsum(site_norbits)
                 
 
                 line = find_target_line(f_dump, "MDSTEP:")
 
-        frac_coords_list = np.concatenate(frac_coords_list, axis=0)
+        coords_list = np.concatenate(coords_list, axis=0)
         lattice_list = np.concatenate(lattice_list, axis=0)
     
         if get_Ham is False and get_overlap is True:
@@ -539,8 +539,7 @@ def find_target_line(f, target):
         
     np.savetxt(os.path.join(output_path, "cell.dat"), lattice_list)
     np.savetxt(os.path.join(output_path, "rcell.dat"), np.linalg.inv(lattice.reshape(-1, 3, 3)).reshape(-1, 3) * 2 * np.pi)
-    cart_coords_list = frac_coords_list @ lattice
-    np.savetxt(os.path.join(output_path, "positions.dat").format(output_path), cart_coords_list)
+    np.savetxt(os.path.join(output_path, "positions.dat").format(output_path), coords_list)
     np.savetxt(os.path.join(output_path, "atomic_numbers.dat"), element, fmt='%d')
     #info = {'nsites' : nsites, 'isorthogonal': False, 'isspinful': spinful, 'norbits': norbits}
     #with open('{}/info.json'.format(output_path), 'w') as info_f:

From d2706e575c0791ac0427dea473d66d08c9aeb583 Mon Sep 17 00:00:00 2001
From: qqgu <guqq_phy@qq.com>
Date: Mon, 29 Apr 2024 17:11:08 +0800
Subject: [PATCH 6/8] update abacus.py

---
 dptb/data/interfaces/abacus.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/dptb/data/interfaces/abacus.py b/dptb/data/interfaces/abacus.py
index b994c65e..a18cde90 100644
--- a/dptb/data/interfaces/abacus.py
+++ b/dptb/data/interfaces/abacus.py
@@ -92,8 +92,10 @@ def recursive_parse(input_path,
                 if os.path.exists(os.path.join(folder, data_name, "hscsr.tgz")):
                     os.system("cd "+os.path.join(folder, data_name) + " && tar -zxvf hscsr.tgz && mv OUT.ABACUS/* ./")
                 try:
+                    tasktype = "" 
                     if os.path.exists(os.path.join(folder, data_name, "running_get_S.log")) or \
                         os.path.exists(os.path.join(folder, data_name, "running_scf.log")):
+                        tasktype = tasktype + "single_point"
                         _abacus_parse(folder, 
                                     os.path.join(preprocess_dir, f"{prefix}.{index}"), 
                                     data_name,
@@ -102,7 +104,8 @@ def recursive_parse(input_path,
                                     get_overlap=parse_overlap, 
                                     get_eigenvalues=parse_eigenvalues)
                     #h5file_names.append(os.path.join(file, "AtomicData.h5"))
-                    elif os.path.exists(os.path.join(folder, data_name, "running_md.log")):
+                    if os.path.exists(os.path.join(folder, data_name, "running_md.log")):
+                        tasktype = tasktype + "molecular_dynamics"
                         _abacus_parse_md(folder, 
                                     os.path.join(preprocess_dir, f"{prefix}.{index}"), 
                                     data_name,
@@ -110,6 +113,11 @@ def recursive_parse(input_path,
                                     get_DM=parse_DM,
                                     get_overlap=parse_overlap, 
                                     get_eigenvalues=parse_eigenvalues)
+                    if tasktype == "":
+                        raise ValueError(f"Cannot find any log file in {folder}")
+                    elif not tasktype in ["single_point", "molecular_dynamics"]:
+                        raise ValueError(f"Unknown task type in {folder}")
+                    
                     pbar.update(1)
                 except Exception as e:
                     print(f"Error in {folder}/{data_name}: {e}")

From 6cafead3d3e2b0822ecff82ba78f5c6ba5f0bda3 Mon Sep 17 00:00:00 2001
From: qqgu <guqq_phy@qq.com>
Date: Mon, 29 Apr 2024 17:13:19 +0800
Subject: [PATCH 7/8] update abacus.py

---
 dptb/data/interfaces/abacus.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/dptb/data/interfaces/abacus.py b/dptb/data/interfaces/abacus.py
index a18cde90..5b9e69e2 100644
--- a/dptb/data/interfaces/abacus.py
+++ b/dptb/data/interfaces/abacus.py
@@ -232,6 +232,8 @@ def find_target_line(f, target):
             lattice[index_lat, :] = np.array(f.readline().split())
         if coords_type == "cartesian":
             frac_coords = frac_coords @ np.matrix(lattice).I  # get frac_coords anyway
+        # this is because the cartesian coordinates in abacus log files are in unit of a_0. Thus the lattice constant is not needed.
+        # now we need to convert the lattice constant to Angstrom.
         lattice = lattice * lattice_constant
         
 

From a76b0e3746e7c2095d2677f0ccfd1e69686316c3 Mon Sep 17 00:00:00 2001
From: zhanghao <zhouyinzhanghao@gmail.com>
Date: Mon, 29 Apr 2024 17:17:54 +0800
Subject: [PATCH 8/8] update parse abacus scf lattice constant

---
 dptb/data/interfaces/abacus.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/dptb/data/interfaces/abacus.py b/dptb/data/interfaces/abacus.py
index b994c65e..998e1898 100644
--- a/dptb/data/interfaces/abacus.py
+++ b/dptb/data/interfaces/abacus.py
@@ -153,8 +153,9 @@ def find_target_line(f, target):
         assert find_target_line(f, "READING UNITCELL INFORMATION") is not None, 'Cannot find "READING UNITCELL INFORMATION" in log file'
         num_atom_type = int(f.readline().split()[-1])
 
-        assert find_target_line(f, "lattice constant (Bohr)") is not None
-        lattice_constant = float(f.readline().split()[-1]) # unit is Angstrom, didn't read (Bohr) here.
+        line = find_target_line(f, "lattice constant (Angstrom)")
+        assert line is not None
+        lattice_constant = float(line.split()[-1]) # unit is Angstrom, didn't read (Bohr) here.
 
         site_norbits_dict = {}
         orbital_types_dict = {}