Skip to content

Commit

Permalink
Add docstrings and unit tests for parse cells file task
Browse files Browse the repository at this point in the history
  • Loading branch information
jessicasyu committed Sep 4, 2024
1 parent 90c2645 commit 9be2368
Show file tree
Hide file tree
Showing 2 changed files with 304 additions and 10 deletions.
89 changes: 79 additions & 10 deletions src/arcade_collection/output/parse_cells_file.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
from __future__ import annotations

import json
import tarfile
from typing import Union
from typing import TYPE_CHECKING

import pandas as pd

if TYPE_CHECKING:
import tarfile

CELLS_COLUMNS = [
"ID",
"TICK",
Expand All @@ -15,30 +19,95 @@
"PHASE",
"NUM_VOXELS",
]
"""Column names for cells data parsed into tidy data format."""


def parse_cells_file(tar: tarfile.TarFile, regions: list[str]) -> pd.DataFrame:
    """
    Parse simulation cells data into tidy data format.

    Every member of the archive is decoded as UTF-8 JSON and iterated as a
    collection of cells. The tick for a member is the integer value of the
    last underscore-separated token of its name once the ``.CELLS.json``
    suffix has been removed.

    Parameters
    ----------
    tar
        Tar archive containing cells data.
    regions
        List of regions. One ``NUM_VOXELS.<region>`` column is added to the
        result for each entry.

    Returns
    -------
    :
        Parsed cells data.
    """

    all_cells: list[list[str | int]] = []

    for member in tar.getmembers():
        tick = int(member.name.replace(".CELLS.json", "").split("_")[-1])

        extracted_member = tar.extractfile(member)
        assert extracted_member is not None
        cells_json = json.loads(extracted_member.read().decode("utf-8"))

        # Extend in place; rebuilding the accumulated list with ``+`` for every
        # member would copy all previously parsed rows each time.
        all_cells.extend(parse_cell_tick(tick, cell, regions) for cell in cells_json)

    columns = CELLS_COLUMNS + [f"NUM_VOXELS.{region}" for region in regions]
    return pd.DataFrame(all_cells, columns=columns)


def parse_cell_tick(tick: int, cell: dict, regions: list[str]) -> list:
"""
Parse cell data for a single simulation tick.
Original data is formatted as:
.. code-block:: python
{
"id": cell_id,
"parent": parent_id,
"pop": population,
"age": age,
"divisions": divisions,
"state": state,
"phase": phase,
"voxels": voxels,
"criticals": [critical_volume, critical_height],
"regions": [
{
"region": region_name,
"voxels": region_voxels,
"criticals": [critical_region_volume, critical_region_height]
},
...
]
}
Parsed data is formatted as:
.. code-block:: python
[ cell_id, tick, parent_id, population, age, divisions, state, phase, voxels ]
When regions are specified, each list also contains the number of voxels for
the corresponding regions.
return cells_df
Parameters
----------
tick
Simulation tick.
cell
Original cell data.
regions
List of regions.
Returns
-------
:
Parsed cell data.
"""

def parse_cell_timepoint(timepoint: int, cell: dict, regions: list[str]) -> list:
features = ["parent", "pop", "age", "divisions", "state", "phase", "voxels"]
parsed = [cell["id"], timepoint] + [cell[feature] for feature in features]
parsed = [cell["id"], tick] + [cell[feature] for feature in features]

if regions and "regions" in cell:
region_voxels = [
Expand Down
225 changes: 225 additions & 0 deletions tests/arcade_collection/output/test_parse_cells_file.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,225 @@
import json
import tarfile
import unittest
from unittest import mock
import pandas as pd

from arcade_collection.output.parse_cells_file import parse_cells_file, parse_cell_tick


class TestParseCellsFile(unittest.TestCase):
    """Unit tests for ``parse_cells_file`` and ``parse_cell_tick``."""

    @staticmethod
    def _make_tar_mock(contents_by_name):
        """
        Build a mock tar archive from a mapping of member name to cell list.

        Each member mock doubles as its own extracted file object: it carries
        the ``name`` attribute read from archive members, and its ``read``
        method returns the UTF-8 encoded JSON of the cells for that member.
        """
        members = {}
        for name, cells in contents_by_name.items():
            member_mock = mock.Mock(spec=tarfile.ExFileObject)
            member_mock.name = name
            member_mock.read.return_value = json.dumps(cells).encode("utf-8")
            members[name] = member_mock

        tar_mock = mock.Mock(spec=tarfile.TarFile)
        tar_mock.getmembers.return_value = list(members.values())
        tar_mock.extractfile.side_effect = lambda member: members.get(member.name)
        return tar_mock

    def test_parse_cells_file_without_regions(self):
        """Cells from all members are combined with ticks taken from names."""
        first_member_contents = [
            {
                "id": 1,
                "parent": 2,
                "pop": 3,
                "age": 4,
                "divisions": 5,
                "state": "STATE_A",
                "phase": "PHASE_A",
                "voxels": 6,
                "criticals": [7, 8],
            },
            {
                "id": 15,
                "parent": 16,
                "pop": 17,
                "age": 18,
                "divisions": 19,
                "state": "STATE_B",
                "phase": "PHASE_B",
                "voxels": 20,
                "criticals": [21, 22],
            },
        ]
        second_member_contents = [
            {
                "id": 29,
                "parent": 30,
                "pop": 31,
                "age": 32,
                "divisions": 33,
                "state": "STATE_C",
                "phase": "PHASE_C",
                "voxels": 34,
                "criticals": [35, 36],
            }
        ]

        tar_mock = self._make_tar_mock(
            {
                "key_000005.CELLS.json": first_member_contents,
                "key_000006.CELLS.json": second_member_contents,
            }
        )

        regions = []

        expected_data = {
            "ID": [1, 15, 29],
            "TICK": [5, 5, 6],
            "PARENT": [2, 16, 30],
            "POPULATION": [3, 17, 31],
            "AGE": [4, 18, 32],
            "DIVISIONS": [5, 19, 33],
            "STATE": ["STATE_A", "STATE_B", "STATE_C"],
            "PHASE": ["PHASE_A", "PHASE_B", "PHASE_C"],
            "NUM_VOXELS": [6, 20, 34],
        }

        data = parse_cells_file(tar_mock, regions)

        # assert_frame_equal reports which column/values differ on failure.
        pd.testing.assert_frame_equal(pd.DataFrame(expected_data), data)

    def test_parse_cells_file_with_regions(self):
        """Region voxel counts are appended as ``NUM_VOXELS.<region>`` columns."""
        first_member_contents = [
            {
                "id": 1,
                "parent": 2,
                "pop": 3,
                "age": 4,
                "divisions": 5,
                "state": "STATE_A",
                "phase": "PHASE_A",
                "voxels": 6,
                "criticals": [7, 8],
                "regions": [
                    {"region": "REGION_A", "voxels": 9, "criticals": [10, 11]},
                    {"region": "REGION_B", "voxels": 12, "criticals": [13, 14]},
                ],
            },
            {
                "id": 15,
                "parent": 16,
                "pop": 17,
                "age": 18,
                "divisions": 19,
                "state": "STATE_B",
                "phase": "PHASE_B",
                "voxels": 20,
                "criticals": [21, 22],
                "regions": [
                    {"region": "REGION_A", "voxels": 23, "criticals": [24, 25]},
                    {"region": "REGION_B", "voxels": 26, "criticals": [27, 28]},
                ],
            },
        ]
        second_member_contents = [
            {
                "id": 29,
                "parent": 30,
                "pop": 31,
                "age": 32,
                "divisions": 33,
                "state": "STATE_C",
                "phase": "PHASE_C",
                "voxels": 34,
                "criticals": [35, 36],
                "regions": [
                    {"region": "REGION_A", "voxels": 37, "criticals": [38, 39]},
                    {"region": "REGION_B", "voxels": 40, "criticals": [41, 42]},
                ],
            }
        ]

        tar_mock = self._make_tar_mock(
            {
                "key_000005.CELLS.json": first_member_contents,
                "key_000006.CELLS.json": second_member_contents,
            }
        )

        regions = ["REGION_A", "REGION_B"]

        expected_data = {
            "ID": [1, 15, 29],
            "TICK": [5, 5, 6],
            "PARENT": [2, 16, 30],
            "POPULATION": [3, 17, 31],
            "AGE": [4, 18, 32],
            "DIVISIONS": [5, 19, 33],
            "STATE": ["STATE_A", "STATE_B", "STATE_C"],
            "PHASE": ["PHASE_A", "PHASE_B", "PHASE_C"],
            "NUM_VOXELS": [6, 20, 34],
            "NUM_VOXELS.REGION_A": [9, 23, 37],
            "NUM_VOXELS.REGION_B": [12, 26, 40],
        }

        data = parse_cells_file(tar_mock, regions)

        # assert_frame_equal reports which column/values differ on failure.
        pd.testing.assert_frame_equal(pd.DataFrame(expected_data), data)

    def test_parse_cell_tick_without_regions(self):
        """A cell without regions is flattened to the base feature list."""
        tick = 15
        regions = []
        cell = {
            "id": 1,
            "parent": 2,
            "pop": 3,
            "age": 4,
            "divisions": 5,
            "state": "STATE",
            "phase": "PHASE",
            "voxels": 6,
            "criticals": [7, 8],
        }

        expected = [1, tick, 2, 3, 4, 5, "STATE", "PHASE", 6]

        parsed = parse_cell_tick(tick, cell, regions)

        self.assertListEqual(expected, parsed)

    def test_parse_cell_tick_with_regions(self):
        """Region voxel counts follow the base features in region order."""
        tick = 15
        regions = ["REGION_A", "REGION_B"]
        cell = {
            "id": 1,
            "parent": 2,
            "pop": 3,
            "age": 4,
            "divisions": 5,
            "state": "STATE",
            "phase": "PHASE",
            "voxels": 6,
            "criticals": [7, 8],
            "regions": [
                {"region": "REGION_A", "voxels": 9, "criticals": [10, 11]},
                {"region": "REGION_B", "voxels": 12, "criticals": [13, 14]},
            ],
        }

        expected = [
            1,
            tick,
            2,
            3,
            4,
            5,
            "STATE",
            "PHASE",
            6,
            9,  # REGION_A
            12,  # REGION_B
        ]

        parsed = parse_cell_tick(tick, cell, regions)

        self.assertListEqual(expected, parsed)


# Run the test suite when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()

0 comments on commit 9be2368

Please sign in to comment.