From 0024041a7121505a4dae2386ee1ac24cebc4c457 Mon Sep 17 00:00:00 2001
From: "mawenzhi.5537" <mawenzhi.5537@bytedance.com>
Date: Tue, 4 Mar 2025 14:34:07 +0800
Subject: [PATCH] Optimize memory utilization of get_structure().

This enhancement avoids creating a `[N_struct_conn, N_atom]` matrix when
reading the "struct_conn" field, preventing excessive memory usage
when dealing with CIF files containing a large number of atoms
and numerous inter-residue bonds.

Co-authored-by: Jincai Yang <yangjincai@bytedance.com>
---
 src/biotite/structure/io/pdbx/convert.py | 55 +++++++++++++++++++++++-
 1 file changed, 54 insertions(+), 1 deletion(-)

diff --git a/src/biotite/structure/io/pdbx/convert.py b/src/biotite/structure/io/pdbx/convert.py
index 048bfdc9b..cbcda51dc 100644
--- a/src/biotite/structure/io/pdbx/convert.py
+++ b/src/biotite/structure/io/pdbx/convert.py
@@ -644,7 +644,7 @@ def _parse_inter_residue_bonds(atom_site, struct_conn):
     )
 
 
-def _find_matches(query_arrays, reference_arrays):
+def _find_matches_by_dense_array(query_arrays, reference_arrays):
     """
     For each index in the `query_arrays` find the indices in the
     `reference_arrays` where all query values match the reference counterpart.
@@ -677,6 +677,59 @@ def _find_matches(query_arrays, reference_arrays):
     return match_indices
 
 
+def _find_matches_by_dict(query_arrays, reference_arrays):
+    """
+    For each index in the `query_arrays` find the indices in the
+    `reference_arrays` where all query values match the reference counterpart.
+    If no match is found for a query, the corresponding index is -1.
+    A query that matches more than one reference row raises an
+    `InvalidFileError`.
+    """
+    # Map each reference row to its first index for O(1) lookups;
+    # rows that occur more than once are remembered as ambiguous
+    reference_dict = {}
+    ambiguous_keys = set()
+    for idx, row in enumerate(np.stack(reference_arrays, axis=-1)):
+        ref_key = tuple(row)
+        if ref_key in reference_dict:
+            ambiguous_keys.add(ref_key)
+        else:
+            reference_dict[ref_key] = idx

+    query_rows = np.stack(query_arrays, axis=-1)
+    # Explicit integer dtype: an empty query must still yield index-like output
+    match_indices = np.full(len(query_rows), -1, dtype=int)
+    for query_idx, query_row in enumerate(query_rows):
+        query_key = tuple(query_row)
+        if query_key in ambiguous_keys:
+            # The query cannot be uniquely matched to an atom in the reference
+            raise InvalidFileError(
+                f"The covalent bond in the 'struct_conn' category at index "
+                f"{query_idx} cannot be unambiguously assigned to atoms in "
+                f"the 'atom_site' category"
+            )
+        # Keep the pre-filled -1 if the query has no match in the reference
+        match_indices[query_idx] = reference_dict.get(query_key, -1)

+    return match_indices
+
+
+def _find_matches(query_arrays, reference_arrays):
+    """
+    For each index in the `query_arrays` find the indices in the
+    `reference_arrays` where all query values match the reference counterpart.
+    If no match is found for a query, the corresponding index is -1.
+    """
+    # The dense strategy allocates a (n_query, n_reference) matrix; beyond
+    # 2**13 (8192) cells the dict strategy was observed to be significantly
+    # faster while avoiding excessive memory usage
+    # `ndarray.size` is a plain int, so the lengths are taken via `shape[0]`
+    if query_arrays[0].shape[0] * reference_arrays[0].shape[0] <= 8192:
+        return _find_matches_by_dense_array(query_arrays, reference_arrays)
+    else:
+        return _find_matches_by_dict(query_arrays, reference_arrays)
+
+
 def _get_struct_conn_col_name(col_name, partner):
     """
     For a column name in ``atom_site`` get the corresponding column name