Merge pull request #658 from marrink-lab/system-meta

move header writing in gmx files to a system level meta attribute
marrink-lab · Feb 12, 2025 · ac93f62 · ac93f62
2 parents df2a554 + b5b9638
commit ac93f62
Show file tree

Hide file tree

Showing 6 changed files with 120 additions and 39 deletions.
diff --git a/bin/martinize2 b/bin/martinize2
@@ -174,7 +174,7 @@ def martinize(system, mappings, to_ff, delete_unknown=False, idrs=False, disorde
         mappings=mappings,
         to_ff=to_ff,
         delete_unknown=delete_unknown,
-        attribute_keep=("cgsecstruct", "chain"),
+        attribute_keep=("cgsecstruct", "chain", "secstruct"),
         attribute_must=("resname",),
         attribute_stash=("resid",),
     ).run_system(system)
@@ -911,7 +911,10 @@ def entry():
         write_repair=args.write_repair,
         write_canon=args.write_canon,
     )
-
+    system.meta["header"].extend(("This file was generated using the following command:",
+                                  " ".join(sys.argv),
+                                  VERSION,
+                                  ))
     LOGGER.info("Read input.", type="step")
     for molecule in system.molecules:
         LOGGER.debug("Read molecule {}.", molecule, type="step")
@@ -975,16 +978,6 @@ def entry():
                            'for this force field',
                            type="missing-feature")
 
-    ss_sequence = list(
-        itertools.chain(
-            *(
-                dssp.sequence_from_residues(molecule, "secstruct")
-                for molecule in system.molecules
-                if selectors.is_protein(molecule)
-            )
-        )
-    )
-
     if args.cystein_bridge == "none":
         vermouth.RemoveCysteinBridgeEdges().run_system(system)
     elif args.cystein_bridge != "auto":
@@ -1147,33 +1140,13 @@ def entry():
                     LOGGER.log(loglevel, entry, **fmt_arg, type="model")
 
     # Write the topology if requested
-    # grompp has a limit in the number of character it can read per line
-    # (due to the size limit of a buffer somewhere in its implementation).
-    # The command line can be longer than this limit and therefore
-    # prevent grompp from reading the topology.
-    gromacs_char_limit = 4000  # the limit is actually 4095, but I play safe
-    command = " ".join(sys.argv)
-    if len(command) > gromacs_char_limit:
-        command = command[:gromacs_char_limit] + " ..."
-    header = [
-        "This file was generated using the following command:",
-        command,
-        VERSION,
-    ]
-    if None not in ss_sequence:
-        header += [
-            "The following sequence of secondary structure ",
-            "was used for the full system:",
-            "".join(ss_sequence),
-        ]
 
     if args.top_path is not None:
         write_gmx_topology(system,
                            args.top_path,
                            itp_paths=itp_paths,
                            C6C12=False,
-                           defines=defines,
-                           header=header)
+                           defines=defines)
 
     # Write a PDB file.
     vermouth.pdb.write_pdb(system, str(args.outpath), omit_charges=True)

diff --git a/vermouth/dssp/dssp.py b/vermouth/dssp/dssp.py
@@ -23,6 +23,7 @@
 import subprocess
 import tempfile
 import re
+import itertools
 
 from ..file_writer import deferred_open
 from ..pdb import pdb
@@ -541,6 +542,24 @@ def convert_dssp_annotation_to_martini(
         raise ValueError('Not all residues have a DSSP assignation.')
 
 
+def gmx_system_header(system):
+
+    ss_sequence = list(
+        itertools.chain(
+            *(
+                sequence_from_residues(molecule, "secstruct")
+                for molecule in system.molecules
+                if is_protein(molecule)
+            )
+        )
+    )
+
+    if None not in ss_sequence and ss_sequence:
+        system.meta["header"].extend(("The following sequence of secondary structure ",
+                                      "was used for the full system:",
+                                      "".join(ss_sequence),
+                                     ))
+
 class AnnotateDSSP(Processor):
     name = 'AnnotateDSSP'
 
@@ -570,6 +589,10 @@ def run_molecule(molecule):
         convert_dssp_annotation_to_martini(molecule)
         return molecule
 
+    def run_system(self, system):
+        gmx_system_header(system)
+        super().run_system(system)
+
 
 class AnnotateResidues(Processor):
     """

diff --git a/vermouth/gmx/topology.py b/vermouth/gmx/topology.py
@@ -10,12 +10,14 @@
 from ..log_helpers import StyleAdapter, get_logger
 from .itp import _interaction_sorting_key
 from ..data import COMMON_CITATIONS
-
+from ..dssp.dssp import sequence_from_residues
+from ..selectors import is_protein
 LOGGER = StyleAdapter(get_logger(__name__))
 
 Atomtype = namedtuple('Atomtype', 'molecule node sigma epsilon meta')
 NonbondParam = namedtuple('NonbondParam', 'atoms sigma epsilon meta')
 
+
 def _group_by_conditionals(interactions):
     interactions_group_sorted = sorted(interactions,
         key=_interaction_sorting_key
@@ -116,8 +118,7 @@ def write_gmx_topology(system,
                        itp_paths={"nonbond_params": "extra_nbparams.itp",
                                   "atomtypes": "extra_atomtypes.itp"},
                        C6C12=False,
-                       defines=(),
-                       header=()):
+                       defines=()):
     """
     Writes a Gromacs .top file for the specified system. Gromacs topology
     files are defined by directives for example `[ atomtypes ]`. However,
@@ -142,8 +143,6 @@ def write_gmx_topology(system,
         C6C12 form
     defines: tuple(str)
         define statments to include in the topology
-    header: tuple(str)
-        any comment lines to include at the beginning
     """
     if not system.molecules:
         raise ValueError("No molecule in the system. Nothing to write.")
@@ -178,6 +177,20 @@ def write_gmx_topology(system,
     molecule_groups = itertools.groupby(
         system.molecules, key=lambda x: x.meta["moltype"]
     )
+
+    # grompp has a limit in the number of character it can read per line
+    # (due to the size limit of a buffer somewhere in its implementation).
+    # The command line can be longer than this limit and therefore
+    # prevent grompp from reading the topology.
+    gromacs_char_limit = 4000  # the limit is actually 4095, but I play safe
+    _header = system.meta.get('header', [])
+    header = []
+    for line in _header:
+        if len(line) > gromacs_char_limit:
+            header.append(line[:gromacs_char_limit] + " ...")
+        else:
+            header.append(line)
+
     for moltype, molecules in molecule_groups:
         molecule = next(molecules)
         if molecule.force_field is not None:

diff --git a/vermouth/system.py b/vermouth/system.py
@@ -33,6 +33,7 @@ def __init__(self, force_field=None):
         self.force_field = force_field
         self.gmx_topology_params = defaultdict(list)
         self.go_params = defaultdict(list)
+        self.meta = defaultdict(list)
 
     @property
     def force_field(self):

diff --git a/vermouth/tests/gmx/test_topology.py b/vermouth/tests/gmx/test_topology.py
@@ -19,6 +19,8 @@
 import textwrap
 import pytest
 import vermouth
+from itertools import product
+from string import ascii_letters
 from vermouth.file_writer import DeferredFileWriter
 from vermouth.gmx.topology import (sigma_epsilon_to_C6_C12,
                                    write_atomtypes,
@@ -178,12 +180,12 @@ def test_toplevel_topology(tmp_path, dummy_molecule):
                                                                      sigma=0.43,
                                                                      epsilon=2.3,
                                                                      meta={}))
+    system.meta["header"] = ['first header comment', 'second header comment']
     outpath = tmp_path / 'out.itp'
     atompath = tmp_path / 'atomtypes.itp'
     nbpath = tmp_path / 'nonbond_params.itp'
     write_gmx_topology(system,
                        outpath,
-                       header=['first header comment', 'second header comment'],
                        defines=('random', ),
                        itp_paths={"atomtypes": atompath, "nonbond_params": nbpath},
                        # at this level C6C12 doesn't matter; it gets
@@ -205,3 +207,31 @@ def test_toplevel_topology(tmp_path, dummy_molecule):
     with open(str(outpath)) as infile:
         for line, ref_line in zip(infile, ref_lines):
             assert line.strip() == ref_line
+
+@pytest.mark.parametrize('command, expected',
+    (
+        [ascii_letters, {"length": len(ascii_letters), "string": ascii_letters}],
+        [(ascii_letters*100), {"length": 4004, "string": (ascii_letters*100)[:4000]+" ..."}]
+
+    ))
+def test_gromacs_cmd_len(dummy_molecule, tmp_path, command, expected):
+    os.chdir(tmp_path)
+    system = vermouth.System()
+    system.add_molecule(dummy_molecule)
+    dummy_molecule.meta['moltype'] = "molecule_0"
+    system.meta['header'].extend([command])
+
+    outpath = tmp_path / 'out.itp'
+
+    write_gmx_topology(system,
+                       outpath,
+                       defines=('random', ),
+                       C6C12=False)
+    DeferredFileWriter().write()
+
+    with open(str(tmp_path / 'molecule_0.itp')) as infile:
+        expected_line = infile.readlines()[0].strip()[2:]
+
+    assert len(expected_line) == expected["length"]
+
+    assert expected_line == expected["string"]
diff --git a/vermouth/tests/test_dssp.py b/vermouth/tests/test_dssp.py
@@ -37,6 +37,7 @@
     PDB_ALA5_CG,
     PDB_ALA5,
 )
+from vermouth.tests.helper_functions import test_molecule, create_sys_all_attrs
 
 DSSP_EXECUTABLE = os.environ.get("VERMOUTH_TEST_DSSP", "dssp")
 SECSTRUCT_1BTA = list(
@@ -705,3 +706,43 @@ def test_cterm_atomnames():
 def test_convert_dssp_to_martini(sequence, expected):
     found = dssp.convert_dssp_to_martini(sequence)
     assert expected == found
+
+@pytest.mark.parametrize('resnames, ss_string, secstruc',
+     (      # protein resnames with secstruc
+             ({0: "ALA", 1: "ALA", 2: "ALA",
+               3: "GLY", 4: "GLY",
+               5: "MET",
+               6: "ARG", 7: "ARG", 8: "ARG"},
+              'HHHH',
+              {1: "H", 2: "H", 3: "H", 4: "H"}
+              ),
+             # not protein resnames, no secstruc annotated or gets written
+             ({0: "A", 1: "A", 2: "A",
+               3: "B", 4: "B",
+               5: "C",
+               6: "D", 7: "D", 8: "D"},
+              "",
+              {1: "", 2: "", 3: "", 4: ""}
+              )
+     ))
+def test_gmx_system_header(test_molecule, resnames, ss_string, secstruc):
+
+    atypes = {0: "P1", 1: "SN4a", 2: "SN4a",
+              3: "SP1", 4: "C1",
+              5: "TP1",
+              6: "P1", 7: "SN3a", 8: "SP4"}
+
+    system = create_sys_all_attrs(test_molecule,
+                                  moltype="molecule_0",
+                                  secstruc=secstruc,
+                                  defaults={"chain": "A"},
+                                  attrs={"resname": resnames,
+                                         "atype": atypes})
+
+    # annotate the actual 'secstruct' attribute because create_sys_all_attrs actually annotates cgsecstruct
+    dssp.AnnotateResidues(attribute="secstruct",
+                          sequence="HHHH").run_system(system)
+
+    dssp.AnnotateMartiniSecondaryStructures().run_system(system)
+
+    assert ss_string in system.meta.get('header', [''])