From fa91b73cf96009aba35263f4989b2699f273464d Mon Sep 17 00:00:00 2001 From: Gustavo Jeuken Date: Thu, 28 Nov 2024 13:34:33 +0100 Subject: [PATCH 1/2] memory efficient mtx loading --- src/scanpy/datasets/_ebi_expression_atlas.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/scanpy/datasets/_ebi_expression_atlas.py b/src/scanpy/datasets/_ebi_expression_atlas.py index b7e1886e71..aac6657199 100644 --- a/src/scanpy/datasets/_ebi_expression_atlas.py +++ b/src/scanpy/datasets/_ebi_expression_atlas.py @@ -67,13 +67,17 @@ def read_mtx_from_stream(stream: BinaryIO) -> sparse.csr_matrix: max_int32 = np.iinfo(np.int32).max coord_dtype = np.int64 if n > max_int32 or m > max_int32 else np.int32 - data = pd.read_csv( + chunks = pd.read_csv( stream, sep=r"\s+", header=None, dtype={0: coord_dtype, 1: coord_dtype, 2: np.float32}, + chunksize=1E7 ) - mtx = sparse.csr_matrix((data[2], (data[1] - 1, data[0] - 1)), shape=(m, n)) + mtx = sparse.csr_matrix(([0],([0],[0])), shape=(m, n)) + for data in chunks: + mtx_chunk = sparse.csr_matrix((data[2], (data[1] - 1, data[0] - 1)), shape=(m, n)) + mtx = mtx + mtx_chunk return mtx From 792b5e2b8073f7d949c8a499563ed5752fb654c2 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 27 Jan 2025 07:29:45 +0000 Subject: [PATCH 2/2] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/scanpy/datasets/_ebi_expression_atlas.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/scanpy/datasets/_ebi_expression_atlas.py b/src/scanpy/datasets/_ebi_expression_atlas.py index aac6657199..4d3766c539 100644 --- a/src/scanpy/datasets/_ebi_expression_atlas.py +++ b/src/scanpy/datasets/_ebi_expression_atlas.py @@ -72,11 +72,13 @@ def read_mtx_from_stream(stream: BinaryIO) -> sparse.csr_matrix: sep=r"\s+", header=None, dtype={0: coord_dtype, 1: coord_dtype, 2: np.float32}, - chunksize=1E7 + chunksize=1e7, ) - mtx = sparse.csr_matrix(([0],([0],[0])), shape=(m, n)) + mtx = sparse.csr_matrix(([0], ([0], [0])), shape=(m, n)) for data in chunks: - mtx_chunk = sparse.csr_matrix((data[2], (data[1] - 1, data[0] - 1)), shape=(m, n)) + mtx_chunk = sparse.csr_matrix( + (data[2], (data[1] - 1, data[0] - 1)), shape=(m, n) + ) mtx = mtx + mtx_chunk return mtx