Skip to content

Commit

Permalink
Optimize memory utilization of get_structure().
Browse files Browse the repository at this point in the history
This enhancement avoids creating a `[N_struct_conn, N_atom]` matrix when
reading the "struct_conn" field, preventing excessive memory usage
when dealing with CIF files containing a large number of atoms
and numerous inter-residue bonds.

Co-authored-by: Jincai Yang <[email protected]>
  • Loading branch information
cloverzizi and 0ut0fcontrol committed Mar 4, 2025
1 parent 877efed commit 0024041
Showing 1 changed file with 54 additions and 1 deletion.
55 changes: 54 additions & 1 deletion src/biotite/structure/io/pdbx/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -644,7 +644,7 @@ def _parse_inter_residue_bonds(atom_site, struct_conn):
)


def _find_matches(query_arrays, reference_arrays):
def _find_matches_by_dense_array(query_arrays, reference_arrays):
"""
For each index in the `query_arrays` find the indices in the
`reference_arrays` where all query values match the reference counterpart.
Expand Down Expand Up @@ -677,6 +677,59 @@ def _find_matches(query_arrays, reference_arrays):
return match_indices


def _find_matches_by_dict(query_arrays, reference_arrays):
    """
    For each index in the `query_arrays` find the indices in the
    `reference_arrays` where all query values match the reference counterpart.
    If no match is found for a query, the corresponding index is -1.

    Parameters
    ----------
    query_arrays : list of ndarray
        The arrays to look up. They are stacked along the last axis,
        so each position across all arrays forms one query key.
    reference_arrays : list of ndarray
        The arrays to search in, stacked the same way as `query_arrays`.

    Returns
    -------
    match_indices : ndarray, dtype=int
        For each query position the index of the matching reference
        position, or -1 if the query key does not occur in the reference.

    Raises
    ------
    InvalidFileError
        If a query key occurs more than once in the reference.
    """
    # Map each reference key to the index of its first occurrence and
    # remember which keys appear more than once
    reference_dict = {}
    ambiguous_keys = set()
    for ref_idx, ref_row in enumerate(np.stack(reference_arrays, axis=-1)):
        ref_key = tuple(ref_row)
        if ref_key in reference_dict:
            ambiguous_keys.add(ref_key)
        else:
            reference_dict[ref_key] = ref_idx

    match_indices = []
    for query_idx, query_row in enumerate(np.stack(query_arrays, axis=-1)):
        query_key = tuple(query_row)
        # Ambiguous keys are always present in `reference_dict`,
        # so this check only triggers for queries that have a match
        if query_key in ambiguous_keys:
            # The query cannot be uniquely matched to an atom in the reference
            raise InvalidFileError(
                f"The covalent bond in the 'struct_conn' category at index "
                f"{query_idx} cannot be unambiguously assigned to atoms in "
                f"the 'atom_site' category"
            )
        # -1 indicates that no match was found in the reference
        match_indices.append(reference_dict.get(query_key, -1))

    # Explicit dtype: an empty list would otherwise become a float array,
    # which cannot be used for indexing
    return np.array(match_indices, dtype=int)


def _find_matches(query_arrays, reference_arrays):
    """
    For each index in the `query_arrays` find the indices in the
    `reference_arrays` where all query values match the reference counterpart.
    If no match is found for a query, the corresponding index is -1.

    The actual search is delegated to one of two strategies, chosen by
    the number of query/reference pairs that would need to be compared.

    Parameters
    ----------
    query_arrays : list of ndarray
        The arrays to look up. The length of the first array is taken
        as the number of queries.
    reference_arrays : list of ndarray
        The arrays to search in. The length of the first array is taken
        as the number of reference entries.

    Returns
    -------
    match_indices : ndarray
        The result of the selected matching strategy.
    """
    # It was observed that when the size exceeds 2**13 (8192)
    # the dict strategy becomes significantly faster than the dense array
    # and does not cause excessive memory usage.
    # The lengths are read from `shape`: `ndarray.size` is a plain int
    # and cannot be subscripted.
    n_comparisons = query_arrays[0].shape[0] * reference_arrays[0].shape[0]
    if n_comparisons <= 8192:
        return _find_matches_by_dense_array(query_arrays, reference_arrays)
    return _find_matches_by_dict(query_arrays, reference_arrays)


def _get_struct_conn_col_name(col_name, partner):
"""
For a column name in ``atom_site`` get the corresponding column name
Expand Down

0 comments on commit 0024041

Please sign in to comment.