Optimize memory utilization of get_structure().

This enhancement avoids creating a `[N_struct_conn, N_atom]` matrix when reading the "struct_conn" field, preventing excessive memory usage when dealing with CIF files containing a large number of atoms and numerous inter-residue bonds. Co-authored-by: Jincai Yang <[email protected]>
biotite-dev · Mar 4, 2025 · 0024041 · 0024041
1 parent 877efed
commit 0024041
Showing 1 changed file with 54 additions and 1 deletion.
diff --git a/src/biotite/structure/io/pdbx/convert.py b/src/biotite/structure/io/pdbx/convert.py
@@ -644,7 +644,7 @@ def _parse_inter_residue_bonds(atom_site, struct_conn):
     )
 
 
-def _find_matches(query_arrays, reference_arrays):
+def _find_matches_by_dense_array(query_arrays, reference_arrays):
     """
     For each index in the `query_arrays` find the indices in the
     `reference_arrays` where all query values match the reference counterpart.
@@ -677,6 +677,59 @@ def _find_matches(query_arrays, reference_arrays):
     return match_indices
 
 
+def _find_matches_by_dict(query_arrays, reference_arrays):
+    """
+    For each index in the `query_arrays` find the index in the
+    `reference_arrays` where all query values match the reference counterpart.
+    If no match is found for a query, the corresponding index is -1.
+
+    The lookup goes through a dictionary keyed by the tuple of reference values,
+    so no ``(n_query, n_reference)`` matrix is allocated.
+
+    Raises ``InvalidFileError`` if a query matches more than one reference index.
+    """
+    # Map each reference key to its first index; collect keys seen repeatedly
+    reference_dict = {}
+    ambiguous_keys = set()
+    for idx, col in enumerate(np.stack(reference_arrays, axis=-1)):
+        ref_key = tuple(col)
+        if ref_key in reference_dict:
+            ambiguous_keys.add(ref_key)
+        else:
+            reference_dict[ref_key] = idx
+
+    match_indices = []
+    for query_idx, query_col in enumerate(np.stack(query_arrays, axis=-1)):
+        query_key = tuple(query_col)
+        if query_key in ambiguous_keys:
+            # The query cannot be uniquely matched to an atom in the reference
+            raise InvalidFileError(
+                f"The covalent bond in the 'struct_conn' category at index "
+                f"{query_idx} cannot be unambiguously assigned to atoms in "
+                f"the 'atom_site' category"
+            )
+        # -1 indicates that no match was found in the reference
+        match_indices.append(reference_dict.get(query_key, -1))
+    # Explicit dtype: an empty list would otherwise give a float array
+    return np.array(match_indices, dtype=int)
+
+
+def _find_matches(query_arrays, reference_arrays):
+    """
+    For each index in the `query_arrays` find the indices in the
+    `reference_arrays` where all query values match the reference counterpart.
+    If no match is found for a query, the corresponding index is -1.
+    """
+    # The product is the number of query/reference pairs. It was observed
+    # that when it exceeds 2**13 (8192) the dict strategy becomes
+    # significantly faster than the dense array and does not cause
+    # excessive memory usage.
+    # Use `shape[0]`: `ndarray.size` is an int and cannot be indexed
+    if query_arrays[0].shape[0] * reference_arrays[0].shape[0] <= 8192:
+        return _find_matches_by_dense_array(query_arrays, reference_arrays)
+    return _find_matches_by_dict(query_arrays, reference_arrays)
+
+
 def _get_struct_conn_col_name(col_name, partner):
     """
     For a column name in ``atom_site`` get the corresponding column name