Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add image filename columns to CellProfiler presets #252

Merged
merged 3 commits into from
Nov 11, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions cytotable/presets.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
"CONFIG_JOINS": """
SELECT
image.Metadata_ImageNumber,
COLUMNS('Image_FileName_.*'),
cytoplasm.* EXCLUDE (Metadata_ImageNumber),
cells.* EXCLUDE (Metadata_ImageNumber, Metadata_ObjectNumber),
nuclei.* EXCLUDE (Metadata_ImageNumber, Metadata_ObjectNumber)
Expand Down Expand Up @@ -92,6 +93,7 @@
per_image.Metadata_ImageNumber,
per_image.Image_Metadata_Well,
per_image.Image_Metadata_Plate,
COLUMNS('Image_FileName_.*'),
per_cytoplasm.* EXCLUDE (Metadata_ImageNumber),
per_cells.* EXCLUDE (Metadata_ImageNumber),
per_nuclei.* EXCLUDE (Metadata_ImageNumber)
Expand Down Expand Up @@ -148,6 +150,7 @@
image.Metadata_Well,
image.Image_Metadata_Site,
image.Image_Metadata_Row,
COLUMNS('Image_FileName_.*'),
cytoplasm.* EXCLUDE (Metadata_ImageNumber),
cells.* EXCLUDE (Metadata_ImageNumber),
nuclei.* EXCLUDE (Metadata_ImageNumber)
Expand Down Expand Up @@ -206,6 +209,7 @@
per_image.Metadata_ImageNumber,
per_image.Image_Metadata_Well,
per_image.Image_Metadata_Plate,
COLUMNS('Image_FileName_.*'),
per_cytoplasm.* EXCLUDE (Metadata_ImageNumber),
per_cells.* EXCLUDE (Metadata_ImageNumber),
per_nuclei.* EXCLUDE (Metadata_ImageNumber)
Expand Down Expand Up @@ -265,6 +269,7 @@
image.Metadata_ImageNumber,
image.Image_Metadata_Well,
image.Image_Metadata_Plate,
COLUMNS('Image_FileName_.*'),
cytoplasm.* EXCLUDE (Metadata_TableNumber, Metadata_ImageNumber),
cells.* EXCLUDE (Metadata_TableNumber, Metadata_ImageNumber),
nuclei.* EXCLUDE (Metadata_TableNumber, Metadata_ImageNumber)
Expand Down
38 changes: 1 addition & 37 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
import pathlib
import shutil
import sqlite3
import subprocess
import tempfile
from typing import Any, Dict, Generator, List, Tuple

Expand Down Expand Up @@ -138,42 +137,6 @@ def fixture_data_dir_in_carta() -> List[str]:
return [f"{pathlib.Path(__file__).parent}/data/in-carta/colas-lab"]


# This fixture depends on the cytominer-database CLI, which has known issues
# on ubuntu 22.04. Prefer the fixture cytominerdatabase_sqlite_static instead.
# note: pytest marks (such as pytest.mark.skip) have no effect when applied to
# fixtures, so skipping is performed from inside the fixture body.
@pytest.fixture(name="cytominerdatabase_sqlite", scope="function")
def fixture_cytominerdatabase_sqlite(
    tmp_path: str,
    data_dirs_cytominerdatabase: List[str],
) -> List[str]:
    """
    Processed cytominer-database test data as sqlite data

    Runs ``cytominer-database ingest`` once per data directory, writing a
    sqlite database named after the directory into that same directory.

    Args:
        tmp_path: pytest temporary path (requested but not used by the body).
        data_dirs_cytominerdatabase: source directories to ingest; each is
            expected to contain a ``config_SQLite.ini`` file.

    Returns:
        A list of ``sqlite:///`` URLs, one per ingested data directory.

    Raises:
        subprocess.CalledProcessError: if the CLI exits with a non-zero
            status, so a failed ingest is not mistaken for usable data.
    """

    # skip dependent tests (instead of erroring) when the CLI is unavailable
    if shutil.which("cytominer-database") is None:
        pytest.skip("cytominer-database CLI is not available on PATH")

    output_paths = []
    for data_dir in data_dirs_cytominerdatabase:
        # example command for reference as subprocess below
        # cytominer-database ingest source_directory sqlite:///backend.sqlite -c ingest_config.ini
        output_path = f"sqlite:///{data_dir}/{pathlib.Path(data_dir).name}.sqlite"

        # run cytominer-database as command-line call; check=True surfaces
        # a failed ingest rather than silently returning a bad output path
        subprocess.run(
            args=[
                "cytominer-database",
                "ingest",
                data_dir,
                output_path,
                "-c",
                f"{data_dir}/config_SQLite.ini",
            ],
            check=True,
        )

        # store the sqlite output file within list to be returned
        output_paths.append(output_path)

    return output_paths


@pytest.fixture(name="cytominerdatabase_sqlite_static", scope="function")
def fixture_cytominerdatabase_sqlite_static():
"""
Expand Down Expand Up @@ -590,6 +553,7 @@ def fixture_cellprofiler_merged_nf1data(
image.ImageNumber,
image.Image_Metadata_Well,
image.Image_Metadata_Plate,
COLUMNS('Image_FileName_.*'),
cytoplasm.*,
cells.*,
nuclei.*
Expand Down
53 changes: 48 additions & 5 deletions tests/test_convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -799,6 +799,24 @@ def test_convert_cytominerdatabase_csv(
source_datatype="csv",
join=True,
drop_null=False,
# cytominer-database test datasets don't include image FileName columns
# so we use a custom join SQL here to avoid errors on querying for
# columns which aren't present.
joins="""
SELECT
image.Metadata_ImageNumber,
cytoplasm.* EXCLUDE (Metadata_ImageNumber),
cells.* EXCLUDE (Metadata_ImageNumber, Metadata_ObjectNumber),
nuclei.* EXCLUDE (Metadata_ImageNumber, Metadata_ObjectNumber)
FROM
read_parquet('cytoplasm.parquet') AS cytoplasm
LEFT JOIN read_parquet('cells.parquet') AS cells USING (Metadata_ImageNumber)
LEFT JOIN read_parquet('nuclei.parquet') AS nuclei USING (Metadata_ImageNumber)
LEFT JOIN read_parquet('image.parquet') AS image USING (Metadata_ImageNumber)
WHERE
cells.Metadata_ObjectNumber = cytoplasm.Metadata_Cytoplasm_Parent_Cells
AND nuclei.Metadata_ObjectNumber = cytoplasm.Metadata_Cytoplasm_Parent_Nuclei
""",
),
schema=control_table.schema,
)
Expand Down Expand Up @@ -922,6 +940,14 @@ def test_convert_cellprofiler_csv(
source_datatype="csv",
preset="cellprofiler_csv",
)
# drop image filenames which won't be present in the comparison dataset
).drop(
[
"Image_FileName_DNA",
"Image_FileName_OrigOverlay",
"Image_FileName_PH3",
"Image_FileName_cellbody",
]
)

# sort all values by the same columns
Expand Down Expand Up @@ -1091,11 +1117,12 @@ def test_convert_cellprofiler_sqlite_pycytominer_merge(
chunk_size=100,
preset="cellprofiler_sqlite_pycytominer",
)
)
# drop image columns which won't be present in Pycytominer output.
).drop(["Image_FileName_GFP", "Image_FileName_DAPI", "Image_FileName_RFP"])

# find the difference in column names and display it as part of an assertion
# find the symmetric difference in column names and display it as part of an assertion
column_diff = list(
set(pycytominer_table.schema.names) - set(cytotable_table.schema.names)
set(pycytominer_table.schema.names) ^ set(cytotable_table.schema.names)
)
# if there are no differences in column names, we should pass the assertion
# (empty collections evaluate to false)
Expand Down Expand Up @@ -1217,7 +1244,7 @@ def test_cell_health_cellprofiler_to_cytominer_database_legacy(
)

# check that we have the expected shape
assert test_result.shape == (12, 1790)
assert test_result.shape == (12, 1802)
# check that the tablenumber data arrived properly
assert set(test_result["Metadata_TableNumber"].to_pylist()) == {
"88ac13033d9baf49fda78c3458bef89e",
Expand Down Expand Up @@ -1247,7 +1274,23 @@ def test_cell_health_cellprofiler_to_cytominer_database_legacy(
)["Nuclei_Correlation_Costes_AGP_DNA"].to_pylist()
)
)

# drop image filenames which won't be present in fixture output
test_result = test_result.drop(
[
"Image_FileName_CellOutlines",
"Image_FileName_IllumAGP",
"Image_FileName_IllumDNA",
"Image_FileName_IllumER",
"Image_FileName_IllumMito",
"Image_FileName_IllumRNA",
"Image_FileName_NucleiOutlines",
"Image_FileName_OrigAGP",
"Image_FileName_OrigDNA",
"Image_FileName_OrigER",
"Image_FileName_OrigMito",
"Image_FileName_OrigRNA",
]
)
# assert that a manually configured table is equal to the cytotable result
# note: we sort values by all column names ascendingly for equality comparisons
assert test_result.sort_by(
Expand Down
11 changes: 10 additions & 1 deletion tests/test_convert_threaded.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,14 @@ def test_convert_tpe_cellprofiler_csv(
source_datatype="csv",
preset="cellprofiler_csv",
)
# drop image FileName columns which won't be present in the comparison dataset
).drop(
[
"Image_FileName_DNA",
"Image_FileName_OrigOverlay",
"Image_FileName_PH3",
"Image_FileName_cellbody",
]
)

# sort all values by the same columns
Expand Down Expand Up @@ -73,7 +81,8 @@ def test_convert_s3_path_csv(
parquet_file_meta = parquet.ParquetFile(s3_result).metadata

# check the shape of the data
assert (parquet_file_meta.num_rows, parquet_file_meta.num_columns) == (109, 5794)
# note: includes filename columns
assert (parquet_file_meta.num_rows, parquet_file_meta.num_columns) == (109, 5812)


@pytest.mark.large_data_tests
Expand Down
Loading