# Source: commit 0024041a7121505a4dae2386ee1ac24cebc4c457,
# "Optimize memory utilization of get_structure()",
# file src/biotite/structure/io/pdbx/convert.py.
#
# The change avoids creating a ``[N_struct_conn, N_atom]`` matrix when reading
# the "struct_conn" field, preventing excessive memory usage when dealing with
# CIF files containing a large number of atoms and numerous inter-residue
# bonds.
#
# In the same change the former ``_find_matches()`` implementation is renamed
# to ``_find_matches_by_dense_array()``; it is defined elsewhere in that file
# and is only referenced here.

# Largest number of (query, reference) pairs for which the dense comparison
# matrix is still used.
# It was observed that above 2**13 (8192) pairs the dict strategy becomes
# significantly faster than the dense array and does not cause excessive
# memory usage.
_FIND_MATCHES_SWITCH_THRESHOLD = 2**13


def _find_matches_by_dict(query_arrays, reference_arrays):
    """
    For each index in the `query_arrays` find the indices in the
    `reference_arrays` where all query values match the reference counterpart.
    If no match is found for a query, the corresponding index is -1.

    The lookup uses a dictionary built from the reference rows instead of
    a pairwise comparison matrix.

    Parameters
    ----------
    query_arrays : list of ndarray
        Equally long 1-D arrays, one per compared column.
        The values at the same position of all arrays form one query.
    reference_arrays : list of ndarray
        Equally long 1-D arrays, one per compared column, in the same
        column order as `query_arrays`.

    Returns
    -------
    match_indices : ndarray, dtype=int
        For each query the index of the matching reference row,
        or -1 if there is none.

    Raises
    ------
    InvalidFileError
        If a query matches a combination of values that appears more than
        once in the reference.
    """
    # Map each distinct reference row to the index of its first occurrence
    # and remember the rows that appear more than once
    reference_dict = {}
    ambiguous_keys = set()
    for ref_idx, ref_row in enumerate(np.stack(reference_arrays, axis=-1)):
        ref_key = tuple(ref_row)
        if ref_key in reference_dict:
            ambiguous_keys.add(ref_key)
        else:
            reference_dict[ref_key] = ref_idx

    # -1 indicates that no match was found in the reference
    # An explicit integer dtype keeps the result usable as index array,
    # even if there are no queries at all
    match_indices = np.full(len(query_arrays[0]), -1, dtype=int)
    for query_idx, query_row in enumerate(np.stack(query_arrays, axis=-1)):
        query_key = tuple(query_row)
        ref_idx = reference_dict.get(query_key)
        if ref_idx is None:
            continue
        if query_key in ambiguous_keys:
            # The query cannot be uniquely matched to an atom in the reference
            raise InvalidFileError(
                f"The covalent bond in the 'struct_conn' category at index "
                f"{query_idx} cannot be unambiguously assigned to atoms in "
                f"the 'atom_site' category"
            )
        match_indices[query_idx] = ref_idx

    return match_indices


def _find_matches(query_arrays, reference_arrays):
    """
    For each index in the `query_arrays` find the indices in the
    `reference_arrays` where all query values match the reference counterpart.
    If no match is found for a query, the corresponding index is -1.

    Small problems are delegated to :func:`_find_matches_by_dense_array()`,
    larger ones to :func:`_find_matches_by_dict()`.

    Parameters
    ----------
    query_arrays : list of ndarray
        Equally long 1-D arrays, one per compared column.
    reference_arrays : list of ndarray
        Equally long 1-D arrays, one per compared column, in the same
        column order as `query_arrays`.

    Returns
    -------
    match_indices : ndarray
        For each query the index of the matching reference row,
        or -1 if there is none.
    """
    # The number of element pairs the dense strategy would have to compare
    # ('ndarray.size' is a plain integer and cannot be subscripted,
    # hence the length of the first column is used)
    n_pairs = len(query_arrays[0]) * len(reference_arrays[0])
    if n_pairs <= _FIND_MATCHES_SWITCH_THRESHOLD:
        return _find_matches_by_dense_array(query_arrays, reference_arrays)
    return _find_matches_by_dict(query_arrays, reference_arrays)