Commit 73161f3

Classes for fract and tp datasets (#172)

Simplify data model, simplify function signatures.

dweindl authored Feb 19, 2025
1 parent ec9cbaa · commit 73161f3
Showing 5 changed files with 311 additions and 300 deletions.
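
The FractDataset and TotalProtDataset classes referenced throughout this diff are defined in src/ccompass/core.py (one of the five changed files, not shown here). As a reading aid, here is a minimal sketch of what they might look like, inferred solely from the attribute access in FDP.py and TPP.py below (df, id_col, table); the actual definitions in core.py may differ.

# Hypothetical sketch only; attribute names are inferred from their usage
# in the diff below. The real definitions live in src/ccompass/core.py.
from dataclasses import dataclass, field

import pandas as pd


@dataclass
class FractDataset:
    """One fractionation input table plus its per-sample annotations."""

    df: pd.DataFrame  # raw input table, one column per sample
    id_col: str  # name of the identifier column within `df`
    # one annotation row per sample: [sample_name, condition, replicate, fraction]
    table: list[list[int | str]] = field(default_factory=list)


@dataclass
class TotalProtDataset:
    """One total-proteome input table plus its per-sample annotations."""

    df: pd.DataFrame  # raw input table, one column per sample
    id_col: str  # name of the identifier column within `df`
    # one annotation row per sample: [sample_name, condition]
    table: list[list[str]] = field(default_factory=list)
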
62 changes: 26 additions & 36 deletions src/ccompass/FDP.py
@@ -14,15 +14,13 @@
from sklearn.preprocessing import MinMaxScaler

from ._utils import unique_preserve_order
from .core import IDENTIFIER, KEEP
from .core import IDENTIFIER, KEEP, FractDataset

logger = logging.getLogger(__package__)


def create_dataset(
input_data: dict[str, pd.DataFrame],
input_tables: dict[str, list[list[int | str]]],
identifier_columns: dict[str, str],
fract_input: dict[str, FractDataset],
conditions: list[str],
window: sg.Window,
progress: float,
@@ -40,8 +38,7 @@ def create_dataset(
all_identifiers = list(
set(
chain.from_iterable(
input_data[path][identifier_columns[path]]
for path in input_tables
dset.df[dset.id_col] for dset in fract_input.values()
)
)
)
@@ -50,7 +47,7 @@ def create_dataset(
stepsize = 10.0 / len(conditions)
# dataset to be created
# condition_id -> replicate_id -> DataFrame
dataset: dict[str, dict[str, pd.DataFrame]] = {}
combined: dict[str, dict[str, pd.DataFrame]] = {}

for condition in conditions:
progress += stepsize
@@ -60,21 +57,19 @@ def create_dataset(
window.read(timeout=50)

data_new = pd.DataFrame(index=all_identifiers)
for path in input_tables:
for dset in fract_input.values():
for (
samplename,
sample_condition,
sample_replicate,
sample_fraction,
) in input_tables[path]:
) in dset.table:
if sample_condition != condition:
continue

data = pd.DataFrame()
data[samplename] = input_data[path][samplename]
data.set_index(
input_data[path][identifier_columns[path]], inplace=True
)
data[samplename] = dset.df[samplename]
data.set_index(dset.df[dset.id_col], inplace=True)
data_new = pd.merge(
data_new,
data,
@@ -127,18 +122,21 @@ def create_dataset(
right_index=True,
)
repdata[rep] = data
dataset[condition] = repdata
combined[condition] = repdata

data_keep = {}
if KEEP in dataset:
data_keep = dataset[KEEP]
del dataset[KEEP]
if KEEP in combined:
data_keep = combined[KEEP]
del combined[KEEP]

return dataset, data_keep, progress
return combined, data_keep, progress


def pre_post_scaling(
data, how: Literal["minmax", "area"], window: sg.Window, progress: int
data: dict[str, dict[str, pd.DataFrame]],
how: Literal["minmax", "area"],
window: sg.Window,
progress: int,
):
"""Scale data using MinMaxScaler or area normalization.
@@ -517,19 +515,17 @@ def create_fract_processing_window() -> sg.Window:


def start_fract_data_processing(
input_tables: dict[str, list[list[int | str]]],
fract_input: dict[str, FractDataset],
preparams: dict[str, dict],
identifiers: dict[str, str],
fract_indata: dict[str, pd.DataFrame],
window: sg.Window | None = None,
):
"""Start fractionation data processing."""
# collect conditions (including [KEEP])
conditions = unique_preserve_order(
sample[1]
for input_table in input_tables.values()
for sample in input_table
if sample[1] != IDENTIFIER
row[1]
for dataset in fract_input.values()
for row in dataset.table
if row[1] != IDENTIFIER
)

# ---------------------------------------------------------------------
@@ -540,9 +536,7 @@ def start_fract_data_processing(
window.read(timeout=50)

dataset, protein_info, progress = create_dataset(
fract_indata,
input_tables,
identifiers,
fract_input,
conditions,
window,
progress,
@@ -785,14 +779,12 @@ def sample_tables_are_valid(


def FDP_exec(
input_tables: dict[str, list[list[int | str]]],
fract_input: dict[str, FractDataset],
preparams: dict[str, dict],
identifiers: dict[str, str],
data_ways: dict[str, dict[str, pd.DataFrame]],
std_ways: dict[str, dict[str, pd.DataFrame]],
protein_info: dict[str, pd.DataFrame],
conditions_trans: list[str],
fract_indata: dict[str, pd.DataFrame],
):
"""Execute the Fractionation Data Processing."""
window = create_fract_processing_window()
@@ -808,7 +800,7 @@ def FDP_exec(
window["--cancel--"].update(disabled=True)

if not sample_tables_are_valid(
input_tables,
{k: dset.table for k, dset in fract_input.items()},
min_replicates=int(preparams["global"]["minrep"][0]),
):
break
@@ -819,10 +811,8 @@ def FDP_exec(
protein_info,
conditions_trans,
) = start_fract_data_processing(
input_tables,
fract_input,
preparams,
identifiers,
fract_indata,
window,
)
break
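
For callers of FDP_exec and start_fract_data_processing, the three parallel path-keyed dicts of the old signatures (fract_indata, input_tables, identifiers) collapse into the single fract_input dict of FractDataset objects. The following hypothetical migration helper, which is not part of the commit and assumes the FractDataset sketch above, shows how the old arguments map onto the new one:

# Hypothetical helper, not part of this commit: bundle the old path-keyed
# dicts into the new fract_input mapping expected by FDP_exec.
import pandas as pd

from ccompass.core import FractDataset  # or the sketch shown above


def to_fract_input(
    fract_indata: dict[str, pd.DataFrame],
    input_tables: dict[str, list[list[int | str]]],
    identifiers: dict[str, str],
) -> dict[str, FractDataset]:
    return {
        path: FractDataset(
            df=fract_indata[path],  # raw input table
            id_col=identifiers[path],  # identifier column name
            table=input_tables[path],  # per-sample annotation rows
        )
        for path in input_tables
    }
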
120 changes: 50 additions & 70 deletions src/ccompass/TPP.py
@@ -2,6 +2,7 @@

import logging
import math
from itertools import chain
from tkinter import messagebox
from typing import Any, Literal

@@ -11,39 +12,43 @@
from scipy.stats import pearsonr

from ._utils import unique_preserve_order
from .core import IDENTIFIER, KEEP
from .core import IDENTIFIER, KEEP, TotalProtDataset

logger = logging.getLogger(__package__)


def create_dataset(
tp_indata, tp_tables, tp_identifiers, tp_conditions, window
tp_input: dict[str, TotalProtDataset], window: sg.Window | None
):
tp_conditions.remove(IDENTIFIER)
tp_conditions = unique_preserve_order(
sample[1]
for dset in tp_input.values()
for sample in dset.table
if sample[1] != IDENTIFIER
)

idents = []
for path in tp_tables:
idents = list(
set(idents + list(tp_indata[path][tp_identifiers[path]]))
identifiers = list(
set(
chain.from_iterable(
dset.df[dset.id_col] for dset in tp_input.values()
)
)

dataset = {}
)
combined = {}

for condition in tp_conditions:
data_new = pd.DataFrame(index=idents)
for path in tp_tables:
data_new = pd.DataFrame(index=identifiers)
for dset in tp_input.values():
if window:
window["--status2--"].update(condition)
window.read(timeout=50)
replicate = 1
for sample in tp_tables[path]:
for sample in dset.table:
data = pd.DataFrame()
if sample[1] == condition:
samplename = sample[0]
data[samplename] = tp_indata[path][sample[0]]
data.set_index(
tp_indata[path][tp_identifiers[path]], inplace=True
)
data[samplename] = dset.df[samplename]
data.set_index(dset.df[dset.id_col], inplace=True)
data_new = pd.merge(
data_new,
data,
@@ -86,24 +91,15 @@ def create_dataset(
)
data_new = data_new.apply(pd.to_numeric, errors="coerce")

dataset[condition] = data_new
combined[condition] = data_new

if KEEP in dataset:
data_keep = dataset[KEEP]
del dataset[KEEP]
if KEEP in combined:
data_keep = combined[KEEP]
del combined[KEEP]
else:
data_keep = pd.DataFrame()

return dataset, data_keep, tp_conditions


def filter_missing(data, mincount, window):
for condition in data:
if window:
window["--status2--"].update(condition)
window.read(timeout=50)
data[condition].dropna(thresh=mincount, inplace=True)
return data
return combined, data_keep


def calculate_correlations(data):
@@ -244,31 +240,28 @@ def create_window() -> sg.Window:


def start_total_proteome_processing(
tp_data: dict[str, pd.DataFrame],
tp_tables: dict[str, list[tuple[str, str]]],
tp_input: dict[str, TotalProtDataset],
tp_preparams: dict[str, Any],
tp_identifiers: dict[str, str],
tp_data: dict[str, pd.DataFrame],
tp_info: pd.DataFrame,
tp_icorr: dict,
tp_indata: dict[str, pd.DataFrame],
tp_conditions: list,
window: sg.Window | None = None,
):
) -> tuple[dict[str, pd.DataFrame], pd.DataFrame, dict]:
# validate input
if not all(
any(IDENTIFIER == sample[1] for sample in table)
for table in tp_tables.values()
any(IDENTIFIER == sample[1] for sample in dset.table)
for dset in tp_input.values()
):
messagebox.showerror("Error", "At least one Identifier is missing.")
return tp_data, tp_info, tp_conditions, tp_icorr
return tp_data, tp_info, tp_icorr

if any(
sample[1] == "" for table in tp_tables.values() for sample in table
sample[1] == "" for dset in tp_input.values() for sample in dset.table
):
messagebox.showerror(
"Error", "At least one row does not have a condition assigned."
)
return tp_data, tp_info, tp_conditions, tp_icorr
return tp_data, tp_info, tp_icorr

if window:
# deactivate buttons
@@ -281,14 +274,8 @@ def start_total_proteome_processing(
window["--status1--"].update(value="creating dataset...")
window.read(timeout=50)

conditions = unique_preserve_order(
sample[1] for table in tp_tables.values() for sample in table
)
tp_data, tp_info, tp_conditions = create_dataset(
tp_indata,
tp_tables,
tp_identifiers,
conditions,
tp_data, tp_info = create_dataset(
tp_input,
window,
)

@@ -300,7 +287,8 @@ def start_total_proteome_processing(
window["--progress--"].update(progress)
window.read(timeout=50)

tp_data = filter_missing(tp_data, tp_preparams["minrep"], window)
for df in tp_data.values():
df.dropna(thresh=tp_preparams["minrep"], inplace=True)

# ---------------------------------------------------------------------
logger.info("transforming data...")
@@ -349,19 +337,16 @@ def start_total_proteome_processing(
window["--progress--"].update(progress)
window.read(timeout=50)

return tp_data, tp_info, tp_conditions, tp_icorr
return tp_data, tp_info, tp_icorr


def total_proteome_processing_dialog(
tp_data: dict[str, pd.DataFrame],
tp_tables: dict[str, list[tuple[str, str]]],
tp_input: dict[str, TotalProtDataset],
tp_preparams: dict[str, Any],
tp_identifiers: dict[str, str],
tp_data: dict[str, pd.DataFrame],
tp_info: pd.DataFrame,
tp_icorr: dict,
tp_indata: dict[str, pd.DataFrame],
tp_conditions: list,
):
) -> tuple[dict[str, pd.DataFrame], pd.DataFrame, dict]:
"""Show the total proteome processing dialog."""
window = create_window()

@@ -372,21 +357,16 @@ def total_proteome_processing_dialog(
break

if event == "--start--":
tp_data, tp_info, tp_conditions, tp_icorr = (
start_total_proteome_processing(
tp_data,
tp_tables,
tp_preparams,
tp_identifiers,
tp_info,
tp_icorr,
tp_indata,
tp_conditions,
window,
)
tp_data, tp_info, tp_icorr = start_total_proteome_processing(
tp_input,
tp_preparams,
tp_data,
tp_info,
tp_icorr,
window,
)
break

window.close()

return tp_data, tp_info, tp_conditions, tp_icorr
return tp_data, tp_info, tp_icorr
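
On the total-proteome side, the standalone filter_missing helper is removed and inlined as a per-condition dropna loop. DataFrame.dropna(thresh=n) keeps only rows with at least n non-NA values, i.e. rows quantified in at least minrep replicates, which matches what the old helper did. A toy illustration with made-up data (not from the repository):

import numpy as np
import pandas as pd

# Toy stand-in for tp_data: one condition, two replicate columns.
tp_data = {
    "Con1": pd.DataFrame(
        {"rep1": [1.0, np.nan, 3.0], "rep2": [np.nan, np.nan, 4.0]}
    )
}

minrep = 2  # corresponds to tp_preparams["minrep"]
for df in tp_data.values():
    # keep rows with at least `minrep` non-NA values
    df.dropna(thresh=minrep, inplace=True)

# Only the fully quantified row (3.0, 4.0) survives the filter.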