Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

REF: q2-fmt tutorial fixes #99

Merged
merged 8 commits into from
Sep 24, 2024
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 32 additions & 10 deletions q2_fmt/_peds.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,17 +144,18 @@ def sample_peds(table: pd.DataFrame, metadata: qiime2.Metadata,
drop_incomplete_timepoints)
table.filter(items=metadata.index)
# TODO: Make incomplete samples possible; move this to heatmap
num_timepoints = _check_for_time_column(metadata, time_column)
num_timepoints, time_col = _check_for_time_column(metadata, time_column)
_check_column_type(column_properties, "time",
time_column, "numeric")
metadata = metadata.filter(items=time_col.index, axis=0)
colinvwood marked this conversation as resolved.
Show resolved Hide resolved
reference_series = _check_reference_column(metadata, reference_column)
_check_column_type(column_properties, "reference",
reference_column, "categorical")
# return things that should be removed
metadata, used_references = \
_filter_associated_reference(reference_series, metadata, time_column,
filter_missing_references,
reference_column)
reference_column, ids_with_data)
subject_series = _check_subject_column(metadata, subject_column)
_check_column_type(column_properties, "subject",
subject_column, "categorical")
Expand Down Expand Up @@ -188,7 +189,7 @@ def feature_peds(table: pd.DataFrame, metadata: qiime2.Metadata,
column_properties = metadata.columns
metadata = metadata.to_dataframe()

_ = _check_for_time_column(metadata, time_column)
_, _ = _check_for_time_column(metadata, time_column)
colinvwood marked this conversation as resolved.
Show resolved Hide resolved
_check_column_type(column_properties, "time",
time_column, "numeric")
reference_series = _check_reference_column(metadata, reference_column)
Expand All @@ -197,7 +198,7 @@ def feature_peds(table: pd.DataFrame, metadata: qiime2.Metadata,
metadata, used_references = \
_filter_associated_reference(reference_series, metadata, time_column,
filter_missing_references,
reference_column)
reference_column, ids_with_data)
colinvwood marked this conversation as resolved.
Show resolved Hide resolved
_ = _check_subject_column(metadata, subject_column)
_check_column_type(column_properties, "subject",
subject_column, "categorical")
Expand Down Expand Up @@ -370,10 +371,11 @@ def _rename_features(level_delimiter, data: pd.DataFrame):
# Filtering methods
def _check_for_time_column(metadata, time_column):
try:
num_timepoints = metadata[time_column].dropna().unique().size
time_col = metadata[time_column].dropna()
num_timepoints = time_col.unique().size
except Exception as e:
_check_column_missing(metadata, time_column, "time", e)
return num_timepoints
return num_timepoints, time_col


def _check_reference_column(metadata, reference_column):
Expand All @@ -385,11 +387,11 @@ def _check_reference_column(metadata, reference_column):


def _filter_associated_reference(reference_series, metadata, time_column,
filter_missing_references, reference_column):
filter_missing_references, reference_column,
ids_with_data):
used_references = reference_series[~metadata[time_column].isna()]
if used_references.isna().any():
if filter_missing_references:
metadata = metadata.dropna(subset=[reference_column])
used_references = used_references.dropna()
else:
nan_references = used_references.index[used_references.isna()]
Expand All @@ -398,6 +400,21 @@ def _filter_associated_reference(reference_series, metadata, time_column,
' timepoint value have an associated reference.'
' IDs where missing references were found:'
' %s' % (tuple(nan_references),))
available_references = (used_references.isin(ids_with_data))
if not available_references.all():
if filter_missing_references:
used_references = used_references[available_references]
else:
raise KeyError('References included in the metadata are missing'
' from the feature table. Please make sure all'
' references included in the metadata are also'
' present in the table.'
' Missing references: %s'
% list(used_references[~available_references]
.unique()))

used_references = used_references[available_references]
metadata = metadata.filter(items=used_references.index, axis=0)
colinvwood marked this conversation as resolved.
Show resolved Hide resolved
return metadata, used_references


Expand Down Expand Up @@ -530,9 +547,10 @@ def sample_pprs(table: pd.DataFrame, metadata: qiime2.Metadata,
metadata = _drop_incomplete_timepoints(metadata, time_column,
drop_incomplete_timepoints)
table.filter(items=metadata.index)
num_timepoints = _check_for_time_column(metadata, time_column)
num_timepoints, time_col = _check_for_time_column(metadata, time_column)
_check_column_type(column_properties, 'time',
time_column, 'numeric')
metadata = metadata.filter(items=time_col.index, axis=0)

used_references =\
_get_to_baseline_ref(time_col=metadata[time_column],
Expand Down Expand Up @@ -574,6 +592,9 @@ def peds_simulation(table: pd.DataFrame, metadata: qiime2.Metadata,
drop_incomplete_timepoints: list = None,
num_iterations: int = 999) -> (pd.DataFrame, pd.DataFrame):

ids_with_data = table.index
metadata = metadata.filter_ids(ids_to_keep=ids_with_data)

metadata_df = metadata.to_dataframe()
reference_series = _check_reference_column(metadata_df,
reference_column)
Expand All @@ -582,7 +603,8 @@ def peds_simulation(table: pd.DataFrame, metadata: qiime2.Metadata,
used_references) = _filter_associated_reference(reference_series,
metadata_df, time_column,
filter_missing_references,
reference_column)
reference_column,
ids_with_data)

if len(used_references.unique()) == 1:
raise AssertionError("There is only one donated microbiome in your"
Expand Down
82 changes: 81 additions & 1 deletion q2_fmt/tests/test_engraftment.py
Original file line number Diff line number Diff line change
Expand Up @@ -880,7 +880,8 @@ def test_no_donors(self):
metadata=metadata_df,
time_column="group",
filter_missing_references=False,
reference_column="Ref")
reference_column="Ref",
ids_with_data=None)

def test_incomplete_timepoints(self):
metadata_df = pd.DataFrame({
Expand Down Expand Up @@ -1397,6 +1398,85 @@ def test_rename_features_with_blank_label(self):
self.assertEqual("1", Fs1)
self.assertEqual("2", Fs2)

def test_peds_nan_tp(self):
    """Samples lacking a timepoint value are absent from sample PEDS.

    ``sample3`` (and both donors) have NaN in the time column; the result
    is expected to list only ``sample1`` and ``sample2``.
    NOTE(review): ``sample4`` is presumably excluded because ``sub2`` has a
    single timepoint and ``drop_incomplete_subjects=True`` -- confirm.
    """
    ids = ['sample1', 'sample2', 'sample3', 'sample4',
           'donor1', 'donor2']
    metadata_df = pd.DataFrame({
        'id': ids,
        'Ref': ['donor1', 'donor1', 'donor1', 'donor2', np.nan,
                np.nan],
        'subject': ['sub1', 'sub1', 'sub1', 'sub2', np.nan,
                    np.nan],
        'group': [1, 2, np.nan, 2, np.nan,
                  np.nan]}).set_index('id')
    metadata = Metadata(metadata_df)
    table_df = pd.DataFrame({
        'id': ids,
        'Feature1': [0, 0, 1, 1, 1, 1],
        'Feature2': [0, 1, 1, 1, 1, 1],
        'Feature3': [0, 0, 1, 1, 1, 1]}).set_index('id')
    sample_peds_df = sample_peds(table=table_df, metadata=metadata,
                                 time_column="group",
                                 reference_column="Ref",
                                 subject_column="subject",
                                 drop_incomplete_subjects=True)
    obs_samples = sample_peds_df['id'].to_list()
    exp_sample = ['sample1', 'sample2']
    self.assertEqual(obs_samples, exp_sample)

def test_peds_no_donor_in_table(self):
    """A reference missing from the feature table raises ``KeyError``.

    The metadata names ``donor2`` as a reference, but the table built here
    has no ``donor2`` row; ``sample_peds`` is called without
    ``filter_missing_references`` and must raise.
    """
    recipients = ['sample1', 'sample2', 'sample3', 'sample4']
    md_frame = pd.DataFrame({
        'id': recipients + ['donor1', 'donor2'],
        'Ref': ['donor1', 'donor1', 'donor2', 'donor2', np.nan,
                np.nan],
        'subject': ['sub1', 'sub1', 'sub2', 'sub2', np.nan,
                    np.nan],
        'group': [1, 2, 1, 2, np.nan,
                  np.nan]})
    md = Metadata(md_frame.set_index('id'))
    # Feature table deliberately omits donor2.
    feature_frame = pd.DataFrame({
        'id': recipients + ['donor1'],
        'Feature1': [0, 0, 1, 1, 1],
        'Feature2': [0, 1, 1, 1, 1],
        'Feature3': [0, 0, 1, 1, 1]})
    feature_table = feature_frame.set_index('id')
    expected_message = ("References included in the"
                        " metadata are missing from the feature"
                        " table.*")
    with self.assertRaisesRegex(KeyError, expected_message):
        sample_peds(table=feature_table, metadata=md,
                    time_column="group",
                    reference_column="Ref",
                    subject_column="subject",
                    drop_incomplete_subjects=True)

def test_peds_no_donor_in_table_flag(self):
    """With ``filter_missing_references=True`` no error is raised.

    ``donor2`` is named in the metadata but has no row in the feature
    table; the result is expected to list only ``sample1`` and
    ``sample2``.
    """
    recipients = ['sample1', 'sample2', 'sample3', 'sample4']
    md_frame = pd.DataFrame({
        'id': recipients + ['donor1', 'donor2'],
        'Ref': ['donor1', 'donor1', 'donor2', 'donor2', np.nan,
                np.nan],
        'subject': ['sub1', 'sub1', 'sub2', 'sub2', np.nan,
                    np.nan],
        'group': [1, 2, 1, 2, np.nan,
                  np.nan]})
    md = Metadata(md_frame.set_index('id'))
    # Feature table deliberately omits donor2.
    feature_frame = pd.DataFrame({
        'id': recipients + ['donor1'],
        'Feature1': [0, 0, 1, 1, 1],
        'Feature2': [0, 1, 1, 1, 1],
        'Feature3': [0, 0, 1, 1, 1]})
    feature_table = feature_frame.set_index('id')
    peds_result = sample_peds(table=feature_table, metadata=md,
                              time_column="group",
                              reference_column="Ref",
                              subject_column="subject",
                              drop_incomplete_subjects=True,
                              filter_missing_references=True)
    observed_ids = peds_result['id'].to_list()
    self.assertEqual(observed_ids, ['sample1', 'sample2'])

def test_pprs(self):
metadata_df = pd.DataFrame({
'id': ['sample1', 'sample2', 'sample3', 'sample4',
Expand Down
Loading