diff --git a/src/arcade_collection/output/parse_cells_file.py b/src/arcade_collection/output/parse_cells_file.py index 8b81e8f..c5f7a5b 100644 --- a/src/arcade_collection/output/parse_cells_file.py +++ b/src/arcade_collection/output/parse_cells_file.py @@ -1,9 +1,13 @@ +from __future__ import annotations + import json -import tarfile -from typing import Union +from typing import TYPE_CHECKING import pandas as pd +if TYPE_CHECKING: + import tarfile + CELLS_COLUMNS = [ "ID", "TICK", @@ -15,30 +19,95 @@ "PHASE", "NUM_VOXELS", ] +"""Column names for cells data parsed into tidy data format.""" def parse_cells_file(tar: tarfile.TarFile, regions: list[str]) -> pd.DataFrame: - all_cells: list[list[Union[str, int]]] = [] + """ + Parse simulation cells data into tidy data format. + + Parameters + ---------- + tar + Tar archive containing locations data. + regions + List of regions. + + Returns + ------- + : + Parsed cells data. + """ + + all_cells: list[list[str | int]] = [] for member in tar.getmembers(): - timepoint = int(member.name.replace(".CELLS.json", "").split("_")[-1]) + tick = int(member.name.replace(".CELLS.json", "").split("_")[-1]) extracted_member = tar.extractfile(member) - assert extracted_member is not None cells_json = json.loads(extracted_member.read().decode("utf-8")) - cells = [parse_cell_timepoint(timepoint, cell, regions) for cell in cells_json] + cells = [parse_cell_tick(tick, cell, regions) for cell in cells_json] all_cells = all_cells + cells columns = CELLS_COLUMNS + [f"NUM_VOXELS.{region}" for region in regions] - cells_df = pd.DataFrame(all_cells, columns=columns) + return pd.DataFrame(all_cells, columns=columns) + + +def parse_cell_tick(tick: int, cell: dict, regions: list[str]) -> list: + """ + Parse cell data for a single simulation tick. + + Original data is formatted as: + + .. code-block:: python + + { + "id": cell_id, + "parent": parent_id, + "pop": population, + "age": age, + "divisions": divisions, + "state": state, + "phase": phase, + "voxels": voxels, + "criticals": [critical_volume, critical_height], + "regions": [ + { + "region": region_name, + "voxels": region_voxels, + "criticals": [critical_region_volume, critical_region_height] + }, + ... + ] + } + + Parsed data is formatted as: + + .. code-block:: python + + [ cell_id, tick, parent_id, population, age, divisions, state, phase, voxels ] + + When regions are specified, each list also contains the number of voxels for + the corresponding regions. - return cells_df + Parameters + ---------- + tick + Simulation tick. + cell + Original cell data. + regions + List of regions. + Returns + ------- + : + Parsed cell data. + """ -def parse_cell_timepoint(timepoint: int, cell: dict, regions: list[str]) -> list: features = ["parent", "pop", "age", "divisions", "state", "phase", "voxels"] - parsed = [cell["id"], timepoint] + [cell[feature] for feature in features] + parsed = [cell["id"], tick] + [cell[feature] for feature in features] if regions and "regions" in cell: region_voxels = [ diff --git a/tests/arcade_collection/output/test_parse_cells_file.py b/tests/arcade_collection/output/test_parse_cells_file.py new file mode 100644 index 0000000..f262621 --- /dev/null +++ b/tests/arcade_collection/output/test_parse_cells_file.py @@ -0,0 +1,225 @@ +import json +import tarfile +import unittest +from unittest import mock +import pandas as pd + +from arcade_collection.output.parse_cells_file import parse_cells_file, parse_cell_tick + + +class TestParseCellsFile(unittest.TestCase): + def test_parse_cells_file_without_regions(self): + tar_mock = mock.Mock(spec=tarfile.TarFile) + first_member_mock = mock.Mock(spec=tarfile.ExFileObject) + second_member_mock = mock.Mock(spec=tarfile.ExFileObject) + + first_member_mock.name = "key_000005.CELLS.json" + second_member_mock.name = "key_000006.CELLS.json" + + contents = { + first_member_mock.name: first_member_mock, + second_member_mock.name: second_member_mock, + } + + tar_mock.getmembers.return_value = contents.values() + tar_mock.extractfile.side_effect = lambda member: contents.get(member.name, None) + + first_member_contents = [ + { + "id": 1, + "parent": 2, + "pop": 3, + "age": 4, + "divisions": 5, + "state": "STATE_A", + "phase": "PHASE_A", + "voxels": 6, + "criticals": [7, 8], + }, + { + "id": 15, + "parent": 16, + "pop": 17, + "age": 18, + "divisions": 19, + "state": "STATE_B", + "phase": "PHASE_B", + "voxels": 20, + "criticals": [21, 22], + }, + ] + second_member_contents = [ + { + "id": 29, + "parent": 30, + "pop": 31, + "age": 32, + "divisions": 33, + "state": "STATE_C", + "phase": "PHASE_C", + "voxels": 34, + "criticals": [35, 36], + } + ] + + first_member_mock.read.return_value = json.dumps(first_member_contents).encode("utf-8") + second_member_mock.read.return_value = json.dumps(second_member_contents).encode("utf-8") + + regions = [] + + expected_data = { + "ID": [1, 15, 29], + "TICK": [5, 5, 6], + "PARENT": [2, 16, 30], + "POPULATION": [3, 17, 31], + "AGE": [4, 18, 32], + "DIVISIONS": [5, 19, 33], + "STATE": ["STATE_A", "STATE_B", "STATE_C"], + "PHASE": ["PHASE_A", "PHASE_B", "PHASE_C"], + "NUM_VOXELS": [6, 20, 34], + } + + data = parse_cells_file(tar_mock, regions) + + self.assertTrue(pd.DataFrame(expected_data).equals(data)) + + def test_parse_cells_file_with_regions(self): + tar_mock = mock.Mock(spec=tarfile.TarFile) + first_member_mock = mock.Mock(spec=tarfile.ExFileObject) + second_member_mock = mock.Mock(spec=tarfile.ExFileObject) + + first_member_mock.name = "key_000005.CELLS.json" + second_member_mock.name = "key_000006.CELLS.json" + + contents = { + first_member_mock.name: first_member_mock, + second_member_mock.name: second_member_mock, + } + + tar_mock.getmembers.return_value = contents.values() + tar_mock.extractfile.side_effect = lambda member: contents.get(member.name, None) + + first_member_contents = [ + { + "id": 1, + "parent": 2, + "pop": 3, + "age": 4, + "divisions": 5, + "state": "STATE_A", + "phase": "PHASE_A", + "voxels": 6, + "criticals": [7, 8], + "regions": [ + {"region": "REGION_A", "voxels": 9, "criticals": [10, 11]}, + {"region": "REGION_B", "voxels": 12, "criticals": [13, 14]}, + ], + }, + { + "id": 15, + "parent": 16, + "pop": 17, + "age": 18, + "divisions": 19, + "state": "STATE_B", + "phase": "PHASE_B", + "voxels": 20, + "criticals": [21, 22], + "regions": [ + {"region": "REGION_A", "voxels": 23, "criticals": [24, 25]}, + {"region": "REGION_B", "voxels": 26, "criticals": [27, 28]}, + ], + }, + ] + second_member_contents = [ + { + "id": 29, + "parent": 30, + "pop": 31, + "age": 32, + "divisions": 33, + "state": "STATE_C", + "phase": "PHASE_C", + "voxels": 34, + "criticals": [35, 36], + "regions": [ + {"region": "REGION_A", "voxels": 37, "criticals": [38, 39]}, + {"region": "REGION_B", "voxels": 40, "criticals": [41, 42]}, + ], + } + ] + + first_member_mock.read.return_value = json.dumps(first_member_contents).encode("utf-8") + second_member_mock.read.return_value = json.dumps(second_member_contents).encode("utf-8") + + regions = ["REGION_A", "REGION_B"] + + expected_data = { + "ID": [1, 15, 29], + "TICK": [5, 5, 6], + "PARENT": [2, 16, 30], + "POPULATION": [3, 17, 31], + "AGE": [4, 18, 32], + "DIVISIONS": [5, 19, 33], + "STATE": ["STATE_A", "STATE_B", "STATE_C"], + "PHASE": ["PHASE_A", "PHASE_B", "PHASE_C"], + "NUM_VOXELS": [6, 20, 34], + "NUM_VOXELS.REGION_A": [9, 23, 37], + "NUM_VOXELS.REGION_B": [12, 26, 40], + } + + data = parse_cells_file(tar_mock, regions) + + self.assertTrue(pd.DataFrame(expected_data).equals(data)) + + def test_parse_cell_tick_without_regions(self): + tick = 15 + regions = [] + cell = { + "id": 1, + "parent": 2, + "pop": 3, + "age": 4, + "divisions": 5, + "state": "STATE", + "phase": "PHASE", + "voxels": 6, + "criticals": [7, 8], + } + + expected = [1, tick, 2, 3, 4, 5, "STATE", "PHASE", 6] + + parsed = parse_cell_tick(tick, cell, regions) + + self.assertListEqual(expected, parsed) + + def test_parse_cell_tick_with_regions(self): + tick = 15 + regions = ["REGION_A", "REGION_B"] + cell = { + "id": 1, + "parent": 2, + "pop": 3, + "age": 4, + "divisions": 5, + "state": "STATE", + "phase": "PHASE", + "voxels": 6, + "criticals": [7, 8], + "regions": [ + {"region": "REGION_A", "voxels": 9, "criticals": [10, 11]}, + {"region": "REGION_B", "voxels": 12, "criticals": [13, 14]}, + ], + } + + expected = [1, tick, 2, 3, 4, 5, "STATE", "PHASE", 6] + expected = expected + [9] # REGION_A + expected = expected + [12] # REGION_B + + parsed = parse_cell_tick(tick, cell, regions) + + self.assertListEqual(expected, parsed) + + +if __name__ == "__main__": + unittest.main()