Change md/geom parsing to frame function for easier use. (#176)

- Fix issue with reading all frames of MD/Geom files.
- Add a function to parse a single frame of an MD calculation from a block, enabling generator approaches.
- Make `Block.__bool__` verify that it contains non-whitespace data.
oerc0122 · Jan 10, 2025 · 7a7dad7 · 7a7dad7
1 parent 9fe6e0c
commit 7a7dad7
Show file tree

Hide file tree

Showing 6 changed files with 560 additions and 51 deletions.
diff --git a/castep_outputs/parsers/md_geom_file_parser.py b/castep_outputs/parsers/md_geom_file_parser.py
@@ -7,6 +7,7 @@
 from ..utilities.castep_res import ATOMIC_DATA_TAG, TAG_RE, get_numbers
 from ..utilities.constants import FST_D, TAG_ALIASES
 from ..utilities.datatypes import AtomIndex, ThreeByThreeMatrix, ThreeVector
+from ..utilities.filewrapper import Block
 from ..utilities.utility import add_aliases, atreg_to_index, to_type
 
 
@@ -67,47 +68,64 @@ class MDGeomTimestepInfo(TypedDict, total=False):
     S: ThreeByThreeMatrix
 
 
-def parse_md_geom_file(md_geom_file: TextIO) -> list[MDGeomTimestepInfo]:
+def parse_md_geom_frame(block: Block) -> MDGeomTimestepInfo:
     """
-    Parse standard .md and .geom files.
+    Parse a single frame of a .md/.geom file.
 
     Parameters
     ----------
-    md_geom_file
-        Open handle to file to parse.
+    block
+        Block containing the lines of one frame; it is iterated line by line.
 
     Returns
     -------
-    list[MDGeomTimestepInfo]
-        Step-by-step Parsed info.
+    MDGeomTimestepInfo
+        Parsed frame: "time", per-ion data under "ions", other tagged rows, with aliases applied.
     """
-    while "END header" not in md_geom_file.readline():
-        pass
-
-    steps = []
     curr: MDGeomTimestepInfo = defaultdict(list)
     curr["ions"] = {}
-    for line in md_geom_file:
-        if not line.strip():  # Next step
-            if curr and curr["ions"]:
-                add_aliases(curr, TAG_ALIASES)
-                for ion in curr["ions"].values():
-                    add_aliases(ion, TAG_ALIASES)
-                steps.append(curr)
-            curr = defaultdict(list)
-            curr["ions"] = {}
+
+    for line in block:
+        if not line.strip():  # Whitespace-only lines carry no data
+            pass
         elif not TAG_RE.search(line):  # Timestep
             curr["time"] = to_type(get_numbers(line)[0], float)
 
         elif match := ATOMIC_DATA_TAG.match(line):
             ion = atreg_to_index(match)
-            if ion not in curr["ions"]:
-                curr["ions"][ion] = {}
+            curr["ions"].setdefault(ion, {})  # Create this ion's entry if it is not there yet
             curr["ions"][ion][match.group("tag")] = to_type([match.group(d) for d in FST_D], float)
 
         elif match := TAG_RE.search(line):
             curr[match.group("tag")].append([*to_type(get_numbers(line), float)])
 
+    add_aliases(curr, TAG_ALIASES)  # Apply TAG_ALIASES to the frame, then to each ion's data
+    for ion in curr["ions"].values():
+        add_aliases(ion, TAG_ALIASES)
+
+    return curr
+
+def parse_md_geom_file(md_geom_file: TextIO) -> list[MDGeomTimestepInfo]:
+    """
+    Parse standard .md and .geom files.
+
+    Parameters
+    ----------
+    md_geom_file
+        Open handle to file to parse.
+
+    Returns
+    -------
+    list[MDGeomTimestepInfo]
+        Parsed frames in file order; empty if no frame is found.
+    """
+    while (line := md_geom_file.readline()) and "END header" not in line:
+        pass  # Skip the header; readline() gives "" at EOF, so a missing "END header" cannot hang
+    md_geom_file.readline()  # Discard the line after the header (presumably blank - confirm)
+    steps = []
+    while block := Block.from_re("", md_geom_file, "", "^$", eof_possible=True):
+        steps.append(parse_md_geom_frame(block))  # One frame per block; a falsy block ends the loop
+
     return steps