Skip to content

Commit

Permalink
Add docstrings and unit tests for merge region samples task
Browse files Browse the repository at this point in the history
  • Loading branch information
jessicasyu committed Sep 3, 2024
1 parent 971b7bf commit 0e9d1f7
Show file tree
Hide file tree
Showing 2 changed files with 268 additions and 3 deletions.
56 changes: 53 additions & 3 deletions src/arcade_collection/input/merge_region_samples.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,62 @@
def merge_region_samples(
samples: dict[str, pd.DataFrame], margins: tuple[int, int, int]
) -> pd.DataFrame:
"""
Merge different region samples into single valid samples dataframe.
The input samples are formatted as:
.. code-block:: python
{
"DEFAULT": (dataframe with columns = id, x, y, z),
"<REGION>": (dataframe with columns = id, x, y, z),
"<REGION>": (dataframe with columns = id, x, y, z),
...
}
The DEFAULT region is used as the superset of (x, y, z) samples; any sample
found only in a non-DEFAULT region is ignored. For a given id, there must
be at least one sample in each region.
The output samples are formatted as:
.. code-block:: markdown
┍━━━━━━┯━━━━━━━━━━┯━━━━━━━━━━┯━━━━━━━━━━┯━━━━━━━━━━┑
│ id │ x │ y │ z │ region │
┝━━━━━━┿━━━━━━━━━━┿━━━━━━━━━━┿━━━━━━━━━━┿━━━━━━━━━━┥
│ <id> │ <x + dx> │ <y + dy> │ <z + dz> │ DEFAULT │
│ <id> │ <x + dx> │ <y + dy> │ <z + dz> │ <REGION> │
│ ... │ ... │ ... │ ... │ ... │
│ <id> │ <x + dx> │ <y + dy> │ <z + dz> │ <REGION> │
┕━━━━━━┷━━━━━━━━━━┷━━━━━━━━━━┷━━━━━━━━━━┷━━━━━━━━━━┙
Samples that are found in the DEFAULT region, but not in any non-DEFAULT
region are marked as DEFAULT. Otherwise, the sample is marked with the
corresponding region. Region samples should be mutually exclusive.
Parameters
----------
samples
Map of region names to region samples.
margins
Margin in the x, y, and z directions applied to sample locations.
Returns
-------
:
Dataframe of merged samples with applied margins.
"""

default_samples = samples["DEFAULT"]
all_samples = tranform_sample_coordinates(default_samples, margins)
all_samples = transform_sample_coordinates(default_samples, margins)

regions = [key for key in samples.keys() if key != "DEFAULT"]
all_region_samples = []

for region in regions:
region_samples = tranform_sample_coordinates(samples[region], margins, default_samples)
region_samples = transform_sample_coordinates(samples[region], margins, default_samples)
region_samples["region"] = region
all_region_samples.append(region_samples)

Expand All @@ -29,7 +77,7 @@ def merge_region_samples(
return valid_samples


def tranform_sample_coordinates(
def transform_sample_coordinates(
samples: pd.DataFrame,
margins: tuple[int, int, int],
reference: Optional[pd.DataFrame] = None,
Expand All @@ -51,6 +99,7 @@ def tranform_sample_coordinates(
:
Transformed sample cell ids and coordinates.
"""

if reference is None:
reference = samples

Expand Down Expand Up @@ -84,6 +133,7 @@ def filter_valid_samples(samples: pd.DataFrame) -> pd.DataFrame:
:
Valid sample cell ids and coordinates.
"""

if "region" in samples.columns:
num_regions = len(samples.region.unique())
samples = samples.groupby("id").filter(lambda x: len(x.region.unique()) == num_regions)
Expand Down
215 changes: 215 additions & 0 deletions tests/arcade_collection/input/test_merge_region_samples.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,215 @@
import unittest

import pandas as pd

from arcade_collection.input.merge_region_samples import (
filter_valid_samples,
merge_region_samples,
transform_sample_coordinates,
)


class TestMergeRegionSamples(unittest.TestCase):
    """Tests for merging, transforming, and filtering region sample tables."""

    @staticmethod
    def _raw_columns():
        """Return fresh column data for the five-row input table used by most tests."""
        return {
            "id": [1, 1, 1, 2, 2],
            "x": [0, 1, 1, 2, 2],
            "y": [3, 3, 4, 5, 5],
            "z": [6, 6, 7, 7, 8],
        }

    @staticmethod
    def _offset_columns():
        """Return fresh column data for the output table expected by the tests
        that use margins of (10, 20, 30) without a separate reference table."""
        return {
            "id": [1, 1, 1, 2, 2],
            "x": [11, 12, 12, 13, 13],
            "y": [21, 21, 22, 23, 23],
            "z": [31, 31, 32, 32, 33],
        }

    @staticmethod
    def _region_a_frame():
        """Return the REGION_A input table shared by the region merge tests."""
        return pd.DataFrame({"id": [1, 1, 2], "x": [0, 1, 2], "y": [3, 4, 5], "z": [6, 7, 7]})

    def _check_frames_match(self, expected, actual):
        """Assert that two dataframes are equal according to ``DataFrame.equals``."""
        self.assertTrue(expected.equals(actual))

    def test_merge_region_samples_no_regions(self):
        region_samples = {"DEFAULT": pd.DataFrame(self._raw_columns())}
        offsets = (10, 20, 30)

        # With only DEFAULT present, the expected output has no region column.
        expected = pd.DataFrame(self._offset_columns())

        result = merge_region_samples(region_samples, offsets)

        self._check_frames_match(expected, result)

    def test_merge_region_samples_with_regions_no_fill(self):
        region_samples = {
            "DEFAULT": pd.DataFrame(self._raw_columns()),
            "REGION_A": self._region_a_frame(),
            "REGION_B": pd.DataFrame({"id": [1, 2], "x": [1, 2], "y": [3, 5], "z": [6, 8]}),
        }
        offsets = (10, 20, 30)

        # Every DEFAULT sample is claimed by exactly one of the two regions.
        labels = ["REGION_A", "REGION_B", "REGION_A", "REGION_A", "REGION_B"]
        expected = pd.DataFrame({**self._offset_columns(), "region": labels})

        result = merge_region_samples(region_samples, offsets)

        self._check_frames_match(expected, result)

    def test_merge_region_samples_with_regions_with_fill(self):
        region_samples = {
            "DEFAULT": pd.DataFrame(self._raw_columns()),
            "REGION_A": self._region_a_frame(),
        }
        offsets = (10, 20, 30)

        # Samples not claimed by REGION_A are expected to be labelled DEFAULT.
        labels = ["REGION_A", "DEFAULT", "REGION_A", "REGION_A", "DEFAULT"]
        expected = pd.DataFrame({**self._offset_columns(), "region": labels})

        result = merge_region_samples(region_samples, offsets)

        self._check_frames_match(expected, result)

    def test_transform_sample_coordinates_no_reference(self):
        table = pd.DataFrame(self._raw_columns())
        offsets = (10, 20, 30)
        no_reference = None

        expected = pd.DataFrame(self._offset_columns())

        # The reference is passed explicitly as None rather than omitted.
        result = transform_sample_coordinates(table, offsets, no_reference)

        self._check_frames_match(expected, result)

    def test_transform_sample_coordinates_with_reference(self):
        table = pd.DataFrame(self._raw_columns())
        offsets = (10, 20, 30)
        single_point_reference = pd.DataFrame({"x": [0], "y": [1], "z": [2]})

        expected = pd.DataFrame(
            {
                "id": [1, 1, 1, 2, 2],
                "x": [11, 12, 12, 13, 13],
                "y": [23, 23, 24, 25, 25],
                "z": [35, 35, 36, 36, 37],
            }
        )

        result = transform_sample_coordinates(table, offsets, single_point_reference)

        self._check_frames_match(expected, result)

    def test_filter_valid_samples_no_region_all_valid(self):
        table = pd.DataFrame(
            {
                "x": [0, 1, 1, 2, 2],
                "y": [3, 3, 4, 5, 5],
                "z": [6, 6, 7, 7, 8],
            }
        )

        # Snapshot the input before the call; the output is expected to equal it.
        expected = table.copy()

        result = filter_valid_samples(table)

        self._check_frames_match(expected, result)

    def test_filter_valid_samples_with_region_all_valid(self):
        table = pd.DataFrame({**self._raw_columns(), "region": ["A", "B", "B", "A", "B"]})

        # Snapshot the input before the call; the output is expected to equal it.
        expected = table.copy()

        result = filter_valid_samples(table)

        self._check_frames_match(expected, result)

    def test_filter_valid_samples_sample_outside_region(self):
        # In this input, id 2 has rows only in region A and none in region B.
        table = pd.DataFrame({**self._raw_columns(), "region": ["A", "B", "B", "A", "A"]})

        expected = pd.DataFrame(
            {
                "id": [1, 1, 1],
                "x": [0, 1, 1],
                "y": [3, 3, 4],
                "z": [6, 6, 7],
                "region": ["A", "B", "B"],
            }
        )

        result = filter_valid_samples(table)

        self._check_frames_match(expected, result)


# Run the unittest runner when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()

0 comments on commit 0e9d1f7

Please sign in to comment.