# radial_profiles.py
# Import relevant libraries
import pandas as pd
import os
import utils
import time
import scipy.stats
# Define a function to determine the common suffix in a series
def common_suffix(ser):
strings = ser.tolist()
reversed_strings = [s[::-1] for s in strings]
reversed_lcs = os.path.commonprefix(reversed_strings)
suffix = reversed_lcs[::-1]
print(f'The common suffix is "{suffix}".')
return suffix
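# Sample call (illustrative values only):
# common_suffix(pd.Series(['well1_T=0.csv', 'well2_T=0.csv']))  # prints and returns '_T=0.csv'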
# Create a function to check for duplicate columns
def has_duplicate_columns(df, sample_size=1000):
print('NOTE: Only a sample of rows is checked, so if this returns True, consider increasing the sample_size parameter (or manually checking for duplicates) to rule out a false positive. If it returns False, there are no duplicate columns in the DataFrame and the sampling does not matter.')
df = utils.sample_df_without_replacement_by_number(df=df, n=sample_size)
df_transposed = df.T
df_deduplicated = df_transposed.drop_duplicates().T
return len(df.columns) != len(df_deduplicated.columns) # if they're the same length (has_duplicate_columns() returns False), you know there are no duplicate columns
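# Sample call (illustrative; assumes utils.sample_df_without_replacement_by_number can sample up to the full number of rows):
# has_duplicate_columns(pd.DataFrame({'a': [1, 2, 3], 'b': [1, 2, 3], 'c': [4, 5, 6]}), sample_size=3)  # 'a' and 'b' are duplicates, so this should return True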
# Define a function to appropriately transform the dataframe
def transform_dataframe(df1, index, columns, values, repeated_columns, verbose=False):
# Sample call
# df2.equals(transform_dataframe(df1, index='a', columns='c', values=['d', 'e'], repeated_columns=['b', 'f'], verbose=True))
# Pivot the specified values columns based on the specified index and columns
if verbose:
print(f'Pivoting the DataFrame based on the index "{index}", the columns "{columns}", and the values "{values}"...')
start_time = time.time()
pivot_df = df1.pivot_table(index=index, columns=columns, values=values, aggfunc='first', observed=False)
if verbose:
print(f'Pivoting the DataFrame took {time.time() - start_time:.2f} seconds.')
# Flatten the multi-index columns
pivot_df.columns = [f'{x[0]}{x[1]}' for x in pivot_df.columns]
# Reset the index to make the index column a column again
pivot_df.reset_index(inplace=True)
# Merge with the repeated_columns from the original DataFrame
if verbose:
print(f'Merging the DataFrame with the repeated columns "{repeated_columns}"...')
start_time = time.time()
df2 = pd.merge(pivot_df, df1[[index] + repeated_columns].drop_duplicates(), on=index)
if verbose:
print(f'Merging the DataFrame took {time.time() - start_time:.2f} seconds.')
# Return the transformed DataFrame
return df2
# Transform the dataframes image-by-image to save both memory and time
def transform_dataframes_in_chunks(df, image_col, new_index_col, distinct_old_row_identifier_col, cols_with_unique_rows, cols_with_repeated_rows, verbose=False):
for unique_image in df[image_col].unique():
df_image = df[df[image_col] == unique_image]
df_image2 = transform_dataframe(df_image, index=new_index_col, columns=distinct_old_row_identifier_col, values=cols_with_unique_rows, repeated_columns=cols_with_repeated_rows, verbose=verbose)
yield df_image2
# Efficiently turn a series of strings (in the format of the ImageJ "Label" column) into three series of the relevant parts
def process_label_column(ser_label):
# Regular expression to capture the three parts
pattern = r'^(.*?) - (T=.*?:c:.*/.*? t:.*/.*?) - (.*)$'
# Extract parts into separate columns
extracted_df = ser_label.str.extract(pattern)
# Return the desired parts
return extracted_df[0], extracted_df[1], extracted_df[2]
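# Sample call (illustrative Label string; the format is inferred from the checks in preprocess_dataset() below):
# tif_name, middle_data, last_name = process_label_column(pd.Series(['A1_CD4_REEC_T=0.tif - T=0:0001:c:2/3 t:1/5 - A1_CD4_REEC_T=0']))
# # tif_name -> 'A1_CD4_REEC_T=0.tif', middle_data -> 'T=0:0001:c:2/3 t:1/5', last_name -> 'A1_CD4_REEC_T=0'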
def preprocess_dataset(df, perc_thresh_rawintnorm_column_check=0.01, image_col='Slide ID', nuclear_channel=2, do_z_score_filter=True, z_score_filter_threshold=3, run_checks=False):
# Note: the input dataframe df should generally be the one loaded from the Open File page
# This function is essentially the same as the preprocess_radial_profile_data.ipynb Jupyter notebook
# Constant
run_repeated_rows_check = False
# Output the initial dataframe shape
print('Initial dataframe shape:', df.shape)
# Determine whether preprocessing has already been performed
if 'Label' not in df.columns:
print('It appears that the dataset has already been preprocessed because there is no "Label" column. If you would like to re-preprocess the dataset, please reload it from the Open File page.')
return
# Efficiently save some data from the "Label" column
tif_name, middle_data, last_name = process_label_column(df['Label'])
# Show that the basename of the image name is always the same as the last name
if run_checks:
assert (tif_name.apply(lambda x: os.path.splitext(x)[0]) == last_name).all(), 'The basename of the image name is not always the same as the last name.'
# Add the TIF image names to the dataframe
df['tif_name'] = tif_name.astype(str)
print('tif_name:', df['tif_name'])
# Show that the middle data always contains five fields when split on ":"
if run_checks:
assert list(middle_data.apply(lambda x: len(x.split(':'))).unique()) == [5], 'There are not always five fields of the middle data when split using ":".'
# Add the time ("T") field to the observations dataframe
df['T'] = middle_data.apply(lambda x: x.split(':')[0].removeprefix('T=')).astype(int)
print('T:', df['T'])
# Add the cell ID field to the observations dataframe
df['cell_id'] = middle_data.apply(lambda x: x.split(':')[1]).astype(str)
print('cell_id:', df['cell_id'])
# Check that the "c" field is the exact same as the "Ch" column
if run_checks:
assert (df['Ch'] == middle_data.apply(lambda x: x.split(':')[3].split('/')[0]).astype(int)).all(), 'The "c" field is not the exact same as the "Ch" column.'
# Check that the last field is exactly the same as one plus the "T" field
if run_checks:
assert (df['T'] == middle_data.apply(lambda x: x.split(':')[4].split('/')[0]).astype(int) - 1).all(), 'The last field is not exactly the same as one plus the "T" field.'
# Check that the .tif basename is completely contained in the actual input filename
df_small = df[['input_filename', 'tif_name']].drop_duplicates()
if run_checks:
assert df_small.apply(lambda x: os.path.splitext(x['tif_name'])[0].replace('REEEC', 'REEC') in x['input_filename'], axis='columns').all(), 'The .tif basename is not completely contained in the actual input filename.'
# Determine (and remove from the small df) the common suffix in the input_filename field
ser = df_small['input_filename']
suffix1 = common_suffix(ser)
df_small['input_filename'] = ser.str.removesuffix(suffix1)
print('df_small:', df_small)
# Ensure that the "T=X" part of the input filename is the same as the "T" field
if run_checks:
assert df['input_filename'].apply(lambda x: x.removesuffix(suffix1).split('=')[-1]).astype(int).equals(df['T']), 'The "T=X" part of the input filename is not the same as the "T" field.'
# Determine the next common suffix aside from the T value
ser = df_small['input_filename'].apply(lambda x: x.removesuffix(suffix1).split('=')[0])
suffix2 = common_suffix(ser)
df_small['input_filename'] = ser.str.removesuffix(suffix2)
print('df_small:', df_small)
# Get a series of the data to process from the "input_filename" field
ser_remaining_data = df['input_filename'].apply(lambda x: x.removesuffix(suffix1).split('=')[0].removesuffix(suffix2))
print('ser_remaining_data:', ser_remaining_data)
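# At this point each remaining entry is expected to look something like 'A1_CD4' or 'A1_CD4_REEC'
# (illustrative, i.e. '<well_id>_<cell_type>' optionally followed by '_REEC'), which the next steps parse.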
# Add a column identifying whether the cell was processed using the REEC
df['REEC'] = ser_remaining_data.str.endswith('_REEC').astype(bool)
print('REEC:', df['REEC'])
# Remove the just-processed suffix
ser_remaining_data = ser_remaining_data.str.removesuffix('_REEC')
print('ser_remaining_data:', ser_remaining_data)
# Add columns identifying the well and cell type
df['well_id'] = ser_remaining_data.apply(lambda x: x.split('_')[0]).astype(str)
df['cell_type'] = ser_remaining_data.apply(lambda x: x.split('_')[1]).astype(str)
print('well_id and cell_type:', df[['well_id', 'cell_type']])
# Normalize the total intensity using the cell areas
df['RawIntNorm'] = df['RawIntDen'] / df['Area']
print('RawIntNorm:', df['RawIntNorm'])
# Check that this yields the same result already included in the datafile
if run_checks:
min_true_val = df['Mean'].min()
thresh = min_true_val * perc_thresh_rawintnorm_column_check / 100
mad = (df['RawIntNorm'] - df['Mean']).abs().max()
if mad < thresh:
print(f'The new and existing columns ARE equal to within {perc_thresh_rawintnorm_column_check}% of the minimum existing value (MAD: {mad}).')
else:
print(f'The new and existing columns are NOT equal to within {perc_thresh_rawintnorm_column_check}% of the minimum existing value (MAD: {mad}).')
# Set a row ID as a combination of the input filename and the extracted cell ID
# Note that for each row ID there should be as many rows as there are channels, unless some cell IDs are duplicated, which is what the following code tests.
# We do this on a per-image basis to save memory.
unique_images = df[image_col].unique()
for image in unique_images:
image_loc = df[image_col] == image
df.loc[image_loc, 'row_id'] = df.loc[image_loc, 'input_filename'].astype(str) + ' - ' + df.loc[image_loc, 'cell_id']
# Display the rows corresponding to duplicated cell IDs
num_channels = df['Ch'].nunique()
vc = df['row_id'].value_counts()
df_dupes = df[df['row_id'].isin(vc[vc > num_channels].index)]
print('df_dupes:', df_dupes)
# Create a dataframe to aid in transforming the duplicated cell IDs to something unique
df_assist_deduping = df_dupes.groupby(by=['row_id', 'Ch'], observed=False)[' '].aggregate(sorted).to_frame().reset_index()
df_assist_deduping['index'] = df_assist_deduping[' '].apply(lambda x: list(range(len(x))))
df_assist_deduping_expanded = df_assist_deduping.explode(' ')
df_assist_deduping_expanded['index'] = df_assist_deduping.explode('index')['index'].apply(lambda x: f'{x:04d}')
print('df_assist_deduping_expanded:', df_assist_deduping_expanded)
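# Illustrative example (made-up values): if a duplicated row_id has blank-index values [17, 23] for a given channel,
# the exploded rows get 'index' values '0000' and '0001', which are appended to the corresponding cell IDs below
# (e.g., '0001:0000' and '0001:0001') to make them unique.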
# Check that the row ID in combination with the "blank index" uniquely identifies all rows (the duplicated cell IDs are not used here)
if run_checks:
assert (df['row_id'] + ' - ' + df[' '].astype(str)).nunique() == len(df), 'The row ID in combination with the "blank index" does not uniquely identify all rows.'
# Append string indices to the cell IDs in order to de-duplicate them
for _, ser in df_assist_deduping_expanded.iterrows():
row_id = ser['row_id']
blank_index = ser[' ']
index_to_append = ser['index']
loc = (df['row_id'] == row_id) & (df[' '] == blank_index)
assert loc.sum() == 1
df.loc[loc, 'cell_id'] = df.loc[loc, 'cell_id'] + ':' + index_to_append
# Output the de-duplicated versions of the previously duplicated rows (just `cell_id` is modified so far)
if run_checks:
print('df at the locations of the duplicates:', df.loc[df_dupes.index])
# Now confirm that there are no more duplicated cell IDs
for image in df[image_col].unique():
image_loc = df[image_col] == image
df.loc[image_loc, 'row_id'] = df.loc[image_loc, 'input_filename'].astype(str) + ' - ' + df.loc[image_loc, 'cell_id']
vc = df['row_id'].value_counts()
if run_checks:
df_dupes = df[df['row_id'].isin(vc[vc > num_channels].index)]
print('df_dupes:', df_dupes)
# Really confirm there are no duplicated rows anymore
if run_checks:
assert list(vc.unique()) == [num_channels], 'There are still duplicated rows.'
# Check for duplicate columns (ignoring MAWA-created columns)
if run_checks:
print('has_duplicate_columns:', has_duplicate_columns(df.drop(columns=['input_filename'])))
# Check whether suspected columns are equal
if run_checks:
assert (df['IntDen'] == df['RawIntDen']).all(), 'The "IntDen" and "RawIntDen" columns are not equal.'
# Drop the ostensibly more processed one
df.drop(columns=['IntDen'], inplace=True)
# Check again for duplicate columns
# This should return False when the data are loaded directly in a Jupyter notebook, but may return True when dataset_formats.py is used (as in Open File), because that module probably adds duplicate columns
if run_checks:
print('has_duplicate_columns:', has_duplicate_columns(df.drop(columns=['input_filename'])))
# Show that in general, the cells are not strictly in sorted order
if run_checks:
is_sorted = df['cell_id'].iloc[::2].reset_index(drop=True).equals(df['cell_id'].iloc[1::2].reset_index(drop=True))
print('is_sorted:', is_sorted)
# Sort them appropriately and then check again
df.sort_values(by=['well_id', 'cell_type', 'REEC', 'T', 'cell_id'], inplace=True)
if run_checks:
is_sorted = df['cell_id'].iloc[::2].reset_index(drop=True).equals(df['cell_id'].iloc[1::2].reset_index(drop=True))
print('is_sorted:', is_sorted)
# Sample usage of the transformation function
if run_checks:
df1 = pd.DataFrame(
{
'a': [1, 1, 2, 2, 3, 3],
'c': [1, 2, 1, 2, 1, 2],
'd': ['AA', 'BB', 'CC', 'DD', 'EE', 'FF'],
'e': ['AAA', 'BBB', 'CCC', 'DDD', 'EEE', 'FFF'],
'b': ['b1', 'b1', 'b2', 'b2', 'b3', 'b3'],
'f': ['f1', 'f1', 'f2', 'f2', 'f3', 'f3'],
}
)
df2 = pd.DataFrame(
{
'a': [1, 2, 3],
'd1': ['AA', 'CC', 'EE'],
'd2': ['BB', 'DD', 'FF'],
'e1': ['AAA', 'CCC', 'EEE'],
'e2': ['BBB', 'DDD', 'FFF'],
'b': ['b1', 'b2', 'b3'],
'f': ['f1', 'f2', 'f3'],
}
)
assert df2.equals(transform_dataframe(df1, 'a', 'c', ['d', 'e'], ['b', 'f'], verbose=True)), 'The transformation function does not work as expected.'
# Compress the dataframe prior to transforming it
df = utils.downcast_dataframe_dtypes(df)
print('Shape of dataframe prior to transforming it:', df.shape)
# Programmatically get the columns with duplicated data in sequential rows and manually confirm the result
distinct_old_row_identifier_col = 'Ch'
new_index_col = 'row_id'
cols_with_repeated_rows2 = []
cols_with_unique_rows2 = []
for column in df.columns:
ser = df[column]
if ser.iloc[::2].reset_index(drop=True).equals(ser.iloc[1::2].reset_index(drop=True)):
cols_with_repeated_rows2.append(column)
else:
cols_with_unique_rows2.append(column)
cols_with_repeated_rows2 = [col for col in cols_with_repeated_rows2 if col not in [distinct_old_row_identifier_col, new_index_col]]
cols_with_unique_rows2 = [col for col in cols_with_unique_rows2 if col not in [distinct_old_row_identifier_col, new_index_col]]
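# Note: the even/odd comparison above flags a column as "repeated" when each even-indexed value equals the odd-indexed
# value that follows it, i.e. when values are duplicated across a cell's channel rows given the sort above
# (this appears to assume exactly two channel rows per cell).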
# These two definitions are part of a manual check that, if passing consistently, can likely be removed in the future
if run_checks and run_repeated_rows_check:
# These column lists are expected for a unified dataframe but not for one processed by dataset_formats.py (which adds more columns), so in general we force this check to be skipped
cols_with_repeated_rows = ['Image ID_(standardized)', 'Centroid X (µm)_(standardized)', 'Centroid Y (µm)_(standardized)', 'Area', 'X', 'Y', 'Perim.', 'Circ.', 'AR', 'Round', 'Solidity', 'input_filename', 'tif_name', 'T', 'cell_id', 'REEC', 'well_id', 'cell_type']
cols_with_unique_rows = [' ', 'Label', 'Mean', 'StdDev', 'RawIntDen', 'RawIntNorm']
print('Repeated:', cols_with_repeated_rows2)
print('Unique:', cols_with_unique_rows2)
assert cols_with_repeated_rows2 == cols_with_repeated_rows, 'The columns with repeated rows are not as expected.'
assert cols_with_unique_rows2 == cols_with_unique_rows, 'The columns with unique rows are not as expected.'
# Create a generator that transforms the dataframes image-by-image
df_generator = transform_dataframes_in_chunks(df, image_col, new_index_col=new_index_col, distinct_old_row_identifier_col=distinct_old_row_identifier_col, cols_with_unique_rows=cols_with_unique_rows2, cols_with_repeated_rows=cols_with_repeated_rows2, verbose=False)
# Concatenate the DataFrames generated by the generator into a single DataFrame
df_transformed = pd.concat(df_generator, ignore_index=True)
# Print the memory usage of the new dataframe
if run_checks:
df_transformed.info(memory_usage='deep')
print('Shape of df_transformed:', df_transformed.shape)
# Get the signal intensity and signal Z score column names
signal_channels = set(df['Ch'].unique()) - {nuclear_channel}
signal_intensity_columns = [f'RawIntNorm{channel}' for channel in signal_channels]
signal_zscore_columns = [f'{col}_zscore' for col in signal_intensity_columns]
print('signal_intensity_columns:', signal_intensity_columns)
print('signal_zscore_columns:', signal_zscore_columns)
# Calculate the Z scores on each area-normalized signal channel for each image
for curr_image in unique_images:
for curr_column in signal_intensity_columns:
curr_image_loc = df_transformed[image_col] == curr_image
df_transformed.loc[curr_image_loc, curr_column + '_zscore'] = scipy.stats.zscore(df_transformed.loc[curr_image_loc, curr_column])
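# (Note: scipy.stats.zscore defaults to ddof=0, i.e. the population standard deviation, and propagates NaNs by default.)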
print('Columns of df_transformed that end with "_zscore":', df_transformed.loc[:, df_transformed.columns.str.endswith('_zscore')])
# Filter out cells for each image that are more than z_score_filter_threshold standard deviations (three by default) from the mean for any signal channel
if do_z_score_filter:
num_cells_before = df_transformed[image_col].value_counts()
df_transformed = df_transformed[(df_transformed[signal_zscore_columns].abs() < z_score_filter_threshold).all(axis='columns')]
num_cells_removed = num_cells_before - df_transformed[image_col].value_counts()
print('Number of cells removed by Z score threshold:')
print(num_cells_removed)
# Return the transformed dataframe
return df_transformed
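# Sample usage (illustrative; the input filename is hypothetical and the dataframe should generally be the one loaded from the Open File page):
# df = pd.read_csv('radial_profiles_export.csv')
# df_transformed = preprocess_dataset(df, image_col='Slide ID', nuclear_channel=2, do_z_score_filter=True)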