fix bug in reading abacus stru (#1714)

pxlxingliang · pre-commit-ci[bot] · web-flow · commit 674fbb04658c · 2025-02-21T10:13:39.000+08:00
Fix the bug induced by the update of dpdata. Now the reading and writing of ABACUS STRU is by a more general dpdata/abacus interface, and remove some redundant codes. This commit is based on dpdata commit deepmodeling/dpdata#793, and it is valid after that commit is merged. related issue: #1711  ## Summary by CodeRabbit - **Refactor** - Streamlined the workflow for generating atomic structure data by removing redundant parameters and centralizing functionality. - Improved handling of atomic data structures for more efficient processing and organization. - **Tests** - Enhanced the precision of validations by refining comparisons to focus on key attributes, ensuring more robust result accuracy. - Corrected the method name for improved clarity in the test suite. - Added a new key to the test data structure for enhanced functionality in subsequent tests.  --------- Co-authored-by: root <pxlxingliang> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
diff --git a/dpgen/auto_test/lib/abacus.py b/dpgen/auto_test/lib/abacus.py
@@ -4,7 +4,7 @@
 
 import dpdata
 import numpy as np
-from dpdata.abacus.scf import make_unlabeled_stru
+from dpdata.abacus.stru import make_unlabeled_stru
 from dpdata.utils import uniq_atom_names
 from dpdata.vasp import poscar as dpdata_poscar
 
diff --git a/dpgen/data/gen.py b/dpgen/data/gen.py
@@ -145,6 +145,7 @@ def stru_ele(supercell_stru, stru_out, eles, natoms, jdata, path_work):
         dpks_descriptor_name = os.path.basename(jdata["dpks_descriptor"])
     supercell_stru["atom_masses"] = jdata["atom_masses"]
     supercell_stru["atom_names"] = eles
+    supercell_stru["atom_types"] = np.array(supercell_stru["types"])
     stru_text = make_abacus_scf_stru(
         supercell_stru,
         pp_file_names,
diff --git a/dpgen/data/tools/create_random_disturb.py b/dpgen/data/tools/create_random_disturb.py
@@ -192,6 +192,7 @@ def create_disturbs_abacus_dev(
     stru = get_abacus_STRU(fin)
     natoms = sum(stru["atom_numbs"])
     cell0 = stru["cells"]
+    stru["masses"] = stru["atom_masses"]  # in dpdata, it is masses that should be used.
 
     # creat nfile ofmt files.
     for fid in range(1, nfile + 1):
diff --git a/dpgen/generator/lib/abacus_scf.py b/dpgen/generator/lib/abacus_scf.py
@@ -3,7 +3,7 @@
 import re
 
 import numpy as np
-from dpdata.abacus.scf import get_cell, get_coords, get_nele_from_stru
+from dpdata.abacus.stru import get_frame_from_stru, make_unlabeled_stru
 
 from dpgen.auto_test.lib import vasp
 
@@ -208,90 +208,37 @@ def make_abacus_scf_stru(
     fp_pp_files,
     fp_orb_files=None,
     fp_dpks_descriptor=None,
-    fp_params=None,
     type_map=None,
     pporb="",  # pull all pp orb dpks files to pporb folder
 ):
-    atom_names = sys_data["atom_names"]
-    atom_numbs = sys_data["atom_numbs"]
-    if type_map is None:
-        type_map = atom_names
-
-    assert len(atom_names) == len(atom_numbs), "Please check the name of atoms. "
-    cell = sys_data["cells"].reshape([3, 3])
-    coord = sys_data["coords"].reshape([sum(atom_numbs), 3])
-    # volume = np.linalg.det(cell)
-    # lattice_const = np.power(volume, 1/3)
-
-    ret = "ATOMIC_SPECIES\n"
-    for iatom in range(len(atom_names)):
-        assert atom_names[iatom] in type_map, (
-            f"element {atom_names[iatom]} is not defined in type_map"
-        )
-        idx = type_map.index(atom_names[iatom])
-        if "atom_masses" not in sys_data:
-            ret += (
-                atom_names[iatom]
-                + " 1.00 "
-                + os.path.join(pporb, fp_pp_files[idx])
-                + "\n"
-            )
-        else:
-            ret += (
-                atom_names[iatom]
-                + " {:.3f} ".format(sys_data["atom_masses"][iatom])
-                + os.path.join(pporb, fp_pp_files[idx])
-                + "\n"
-            )
-
-    if fp_params is not None and "lattice_constant" in fp_params:
-        ret += "\nLATTICE_CONSTANT\n"
-        ret += (
-            str(fp_params["lattice_constant"]) + "\n\n"
-        )  # in Bohr, in this way coord and cell are in Angstrom
-    else:
-        ret += "\nLATTICE_CONSTANT\n"
-        ret += str(1 / bohr2ang) + "\n\n"
-
-    ret += "LATTICE_VECTORS\n"
-    for ix in range(3):
-        for iy in range(3):
-            ret += str(cell[ix][iy]) + " "
-        ret += "\n"
-    ret += "\n"
-
-    ret += "ATOMIC_POSITIONS\n"
-    ret += "Cartesian    # Cartesian(Unit is LATTICE_CONSTANT)\n"
-    # ret += "\n"
-    natom_tot = 0
-    for iele in range(len(atom_names)):
-        ret += atom_names[iele] + "\n"
-        ret += "0.0\n"
-        ret += str(atom_numbs[iele]) + "\n"
-        for iatom in range(atom_numbs[iele]):
-            ret += "%.12f %.12f %.12f %d %d %d\n" % (  # noqa: UP031
-                coord[natom_tot, 0],
-                coord[natom_tot, 1],
-                coord[natom_tot, 2],
-                1,
-                1,
-                1,
-            )
-            natom_tot += 1
-    assert natom_tot == sum(atom_numbs)
-
+    sys_data_copy = copy.deepcopy(sys_data)
+    # re-construct the path of files by pporb + file name
+    fp_pp_files = [os.path.join(pporb, i) for i in fp_pp_files]
     if fp_orb_files is not None:
-        ret += "\nNUMERICAL_ORBITAL\n"
-        assert len(fp_orb_files) == len(type_map)
-        for iatom in range(len(atom_names)):
-            idx = type_map.index(atom_names[iatom])
-            ret += os.path.join(pporb, fp_orb_files[idx]) + "\n"
-
+        fp_orb_files = [os.path.join(pporb, i) for i in fp_orb_files]
     if fp_dpks_descriptor is not None:
-        ret += "\nNUMERICAL_DESCRIPTOR\n"
-        ret += os.path.join(pporb, fp_dpks_descriptor) + "\n"
+        fp_dpks_descriptor = os.path.join(pporb, fp_dpks_descriptor)
+
+    # we need to make sure that the shape of cells and coords are the same
+    # and if they are 2D, we need to convert them to 3D
+    cells = np.array(sys_data["cells"])
+    coords = np.array(sys_data["coords"])
+    assert len(cells.shape) == len(coords.shape), (
+        "cells and coords should have the same shape."
+    )
 
-    return ret
+    if len(cells.shape) == 2:
+        sys_data_copy["cells"] = np.array([cells])
+        sys_data_copy["coords"] = np.array([coords])
+    c = make_unlabeled_stru(
+        sys_data_copy,
+        0,
+        pp_file=fp_pp_files,
+        numerical_orbital=fp_orb_files,
+        numerical_descriptor=fp_dpks_descriptor,
+    )
+
+    return c
 
 
 def get_abacus_input_parameters(INPUT):
@@ -306,106 +253,37 @@ def get_abacus_input_parameters(INPUT):
     return input_parameters
 
 
-def get_mass_from_STRU(geometry_inlines, atom_names):
-    nele = get_nele_from_stru(geometry_inlines)
-    assert nele > 0
-    mass_list = [0 for i in atom_names]
-    pp_file_list = [i for i in atom_names]
-    for iline, line in enumerate(geometry_inlines):
-        if line.split() == []:
-            continue
-        if "ATOMIC_SPECIES" == line.split()[0]:
-            for iele1 in range(1, 1 + nele):
-                for iele2 in range(nele):
-                    if geometry_inlines[iline + iele1].split()[0] == atom_names[iele2]:
-                        mass_list[iele2] = float(
-                            geometry_inlines[iline + iele1].split()[1]
-                        )
-                        pp_file_list[iele2] = geometry_inlines[iline + iele1].split()[2]
-    for iele in range(len(mass_list)):
-        assert mass_list[iele] > 0
-    return mass_list, pp_file_list
-
-
-def get_natoms_from_stru(geometry_inlines):
-    key_words_list = [
-        "ATOMIC_SPECIES",
-        "NUMERICAL_ORBITAL",
-        "LATTICE_CONSTANT",
-        "LATTICE_VECTORS",
-        "ATOMIC_POSITIONS",
-        "NUMERICAL_DESCRIPTOR",
-    ]
-    keyword_sequence = []
-    keyword_line_index = []
-    atom_names = []
-    atom_numbs = []
-    tmp_line = []
-    for i in geometry_inlines:
-        if i.strip() != "":
-            tmp_line.append(i)
-    for iline, line in enumerate(tmp_line):
-        if line.split() == []:
-            continue
-        have_key_word = False
-        for keyword in key_words_list:
-            if keyword in line and keyword == line.split()[0]:
-                keyword_sequence.append(keyword)
-                keyword_line_index.append(iline)
-    assert len(keyword_line_index) == len(keyword_sequence)
-    assert len(keyword_sequence) > 0
-    keyword_line_index.append(len(tmp_line))
-    for idx, keyword in enumerate(keyword_sequence):
-        if keyword == "ATOMIC_POSITIONS":
-            iline = keyword_line_index[idx] + 2
-            while iline < keyword_line_index[idx + 1] - 1:
-                atom_names.append(tmp_line[iline].split()[0])
-                atom_numbs.append(int(tmp_line[iline + 2].split()[0]))
-                iline += 3 + atom_numbs[-1]
-    return atom_names, atom_numbs
-
-
-def get_additional_from_STRU(geometry_inlines, nele):
-    dpks_descriptor_kw = "NUMERICAL_DESCRIPTOR"
-    orb_file_kw = "NUMERICAL_ORBITAL"
-    dpks_descriptor = None
-    orb_file = None
-    for iline in range(len(geometry_inlines)):
-        if len(geometry_inlines[iline]) > 0:
-            if orb_file_kw == geometry_inlines[iline].split()[0]:
-                orb_file = []
-                for iele in range(nele):
-                    orb_file.append(geometry_inlines[iline + iele + 1].strip())
-            if dpks_descriptor_kw == geometry_inlines[iline].split()[0]:
-                dpks_descriptor = geometry_inlines[iline + 1].strip()
-    return orb_file, dpks_descriptor
-
-
-def get_abacus_STRU(STRU, INPUT=None, n_ele=None):
-    # read in geometry from STRU file. n_ele is the number of elements.
-    # Either n_ele or INPUT should be provided.
-    with open(STRU) as fp:
-        geometry_inlines = fp.read().split("\n")
-    for iline, line in enumerate(geometry_inlines):
-        if line.split() == [] or len(line) == 0:
-            del geometry_inlines[iline]
-    geometry_inlines.append("")
-    celldm, cell = get_cell(geometry_inlines)
-    atom_names, natoms, types, coords = get_coords(celldm, cell, geometry_inlines)
-    masses, pp_files = get_mass_from_STRU(geometry_inlines, atom_names)
-    orb_files, dpks_descriptor = get_additional_from_STRU(geometry_inlines, len(masses))
-    data = {}
-    data["atom_names"] = atom_names
-    data["atom_numbs"] = natoms
-    data["atom_types"] = types
-    data["cells"] = cell
-    data["coords"] = coords
-    data["atom_masses"] = (
-        masses  # Notice that this key is not defined in dpdata system.
-    )
-    data["pp_files"] = pp_files
-    data["orb_files"] = orb_files
-    data["dpks_descriptor"] = dpks_descriptor
+def get_abacus_STRU(STRU):
+    """Read STRU file and return a dictionary containing the structure information.
+
+    Args:
+        STRU (str): The path of STRU file.
+
+    Returns
+    -------
+    dict: A dictionary containing the structure information.
+    {
+        "atom_names": list of str,
+        "atom_numbs": list of int,
+        "atom_masses": list of float,
+        "coords": np.ndarray,
+        "cells": np.ndarray,
+        "pp_files": list of str,
+        "orb_files": list of str,
+        "dpks_descriptor": str,
+    }
+    """
+    data = get_frame_from_stru(STRU)
+    data["atom_masses"] = data.pop("masses")
+    data["cells"] = data.pop("cells")[0]
+    data["coords"] = data.pop("coords")[0]
+    assert "pp_files" in data, "pp_files should be provided in STRU file."
+    if None in data["pp_files"]:
+        data["pp_files"] = None
+    if "orb_files" not in data:
+        data["orb_files"] = None
+    if "dpks_descriptor" not in data:
+        data["dpks_descriptor"] = None
     return data
 
 
@@ -419,7 +297,21 @@ def make_supercell_abacus(from_struct, super_cell):
         # )
         for idx_atm in from_struct["atom_types"]:
             new_types += [idx_atm] * super_cell[0] * super_cell[1] * super_cell[2]
-        to_struct["atom_types"] = new_types
+        to_struct["atom_types"] = np.array(new_types)
+
+    # expand move, spins
+    for ikey in ["move", "spins"]:
+        if ikey in from_struct:
+            new_list = []
+            for ia in range(sum(from_struct["atom_numbs"])):
+                new_list += (
+                    [from_struct[ikey][0][ia]]
+                    * super_cell[0]
+                    * super_cell[1]
+                    * super_cell[2]
+                )
+            to_struct[ikey] = np.array([new_list])
+
     to_atom_num = (
         sum(from_struct["atom_numbs"]) * super_cell[0] * super_cell[1] * super_cell[2]
     )
diff --git a/dpgen/generator/run.py b/dpgen/generator/run.py
@@ -3569,7 +3569,6 @@ def make_fp_abacus_scf(iter_index, jdata):
             fp_pp_files,
             fp_orb_files,
             fp_dpks_descriptor,
-            fp_params,
             type_map=jdata["type_map"],
             pporb=pporb_path,
         )
diff --git a/tests/auto_test/test_abacus.py b/tests/auto_test/test_abacus.py
@@ -216,7 +216,7 @@ def test_make_input_file_kspacing_three_value(self):
             kpt = f1.read().strip().split("\n")[-1].split()
         self.assertEqual(kpt, ["9", "5", "3", "0", "0", "0"])
 
-    def test_compuate(self):
+    def test_compute(self):
         ret = self.ABACUS.compute(os.path.join(self.equi_path))
         self.assertIsNone(ret)
         shutil.copy(
@@ -247,7 +247,21 @@ def compare_dict(dict1, dict2):
                     else:
                         self.assertTrue(dict1[key] == dict2[key])
 
-        compare_dict(ret, ret_ref.as_dict())
+        compare_keys = [
+            "atom_numbs",
+            "atom_names",
+            "atom_types",
+            "cells",
+            "coords",
+            "energies",
+            "forces",
+            "virials",
+            "stress",
+        ]
+        compare_dict(
+            {k: ret["data"][k] for k in compare_keys},
+            {k: ret_ref.data[k] for k in compare_keys},
+        )
 
 
 class TestABACUSDeepKS(unittest.TestCase):
diff --git a/tests/data/test_gen_bulk_abacus.py b/tests/data/test_gen_bulk_abacus.py
@@ -76,6 +76,7 @@ def test(self):
     def testSTRU(self):
         jdata = self.jdata
         jdata["from_poscar_path"] = "./Cu.STRU"
+        jdata["potcars"] = ["abacus.in/Cu_ONCV_PBE-1.0.upf"]
         make_super_cell_STRU(jdata)
         make_abacus_relax(jdata, {"fp_resources": {}})
         make_scale_ABACUS(jdata)

Original file line number	Diff line number	Diff line change
`@@ -3569,7 +3569,6 @@ def make_fp_abacus_scf(iter_index, jdata):`
`3569`	`3569`	`fp_pp_files,`
`3570`	`3570`	`fp_orb_files,`
`3571`	`3571`	`fp_dpks_descriptor,`
`3572`		`- fp_params,`
`3573`	`3572`	`type_map=jdata["type_map"],`
`3574`	`3573`	`pporb=pporb_path,`
`3575`	`3574`	`)`