# isotopolouge_imputer.py
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Conv2D, Dropout, MaxPool2D, Flatten, Add, Dense, Activation, BatchNormalization, Lambda, ReLU, PReLU, LayerNormalization
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers.legacy import Adam, SGD, RMSprop
from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, LearningRateScheduler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
from IPython.display import display
import seaborn as sns
from scipy import stats
from visualization import *
from collections import Counter
from statsmodels.stats.multitest import multipletests
from sklearn.metrics import mean_squared_error # for calculating the cost function
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor # for building the model
from prettytable import PrettyTable
import os
import re
# Import data from csv
def get_data(file_name = "brain-glucose-KD-M1-isotopolouges.csv", dir = "/Users/bisramr/MATLAB/Projects/Isoscope_Matlab_V/generated-data", keep_coord = False):
'''
Convert file from csv to dataframe and remove unnecessary columns
Parameters:
- file_name: name of the file
    - dir: Absolute path to the directory containing the file (exclude trailing forward slash)
    - keep_coord: if True, retain the x/y pixel coordinate columns
Returns:
- data: dataframe of the data
'''
data_path = f'{dir}/{file_name}'
data = pd.read_csv(data_path)
if keep_coord:
data = data.drop(labels = ['Unnamed: 0'], axis = 1)
else:
data = data.drop(labels = ['x', 'y', 'Unnamed: 0'], axis = 1)
return data
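# Usage sketch (hypothetical file in the default generated-data directory):
# ion_counts = get_data(file_name = 'brain-glucose-KD-M1-ioncounts.csv')
# ion_counts.shape  # rows are pixels, columns are metabolites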
# Model
def multiple_regression_model(num_ion_counts, num_isotopolouges, lambda_val):
model = Sequential([
# Input Layer
Dense(128, input_dim = num_ion_counts, kernel_initializer='he_uniform', activation='relu',kernel_regularizer=l2(lambda_val)),
Dense(128, kernel_initializer='he_uniform', activation='relu',kernel_regularizer=l2(lambda_val)),
Dense(256, kernel_initializer='he_uniform', activation='relu',kernel_regularizer=l2(lambda_val)),
BatchNormalization(),
Dropout(0.25),
Dense(256, kernel_initializer='he_uniform', activation='relu',kernel_regularizer=l2(lambda_val)),
Dense(256, kernel_initializer='he_uniform', activation='relu',kernel_regularizer=l2(lambda_val)),
Dense(256, kernel_initializer='he_uniform', activation='relu',kernel_regularizer=l2(lambda_val)),
BatchNormalization(),
Dropout(0.25),
Dense(128, kernel_initializer='he_uniform', activation='relu',kernel_regularizer=l2(lambda_val)),
Dense(num_isotopolouges, kernel_initializer='he_uniform', activation = 'relu', kernel_regularizer=l2(lambda_val))
])
    model.compile(optimizer=tf.keras.optimizers.legacy.Adam(),
                  loss=tf.keras.losses.MeanSquaredError(),
                  metrics=['mse', 'mae'])
return model
def multiple_regression_model_2(num_ion_counts, num_isotopolouges, lambda_val):
    model = Sequential([
        # Input layer: layer-normalize the incoming features before the dense stack
        LayerNormalization(axis=1),
        Dense(128, input_dim = num_ion_counts, kernel_initializer='he_uniform', activation='relu', kernel_regularizer=l2(lambda_val)),
Dense(128, kernel_initializer='he_uniform', activation='relu',kernel_regularizer=l2(lambda_val)),
Dense(256, kernel_initializer='he_uniform', activation='relu',kernel_regularizer=l2(lambda_val)),
BatchNormalization(),
Dropout(0.25),
Dense(256, kernel_initializer='he_uniform', activation='relu',kernel_regularizer=l2(lambda_val)),
Dense(256, kernel_initializer='he_uniform', activation='relu',kernel_regularizer=l2(lambda_val)),
Dense(256, kernel_initializer='he_uniform', activation='relu',kernel_regularizer=l2(lambda_val)),
BatchNormalization(),
Dropout(0.25),
Dense(128, kernel_initializer='he_uniform', activation='relu',kernel_regularizer=l2(lambda_val)),
Dense(num_isotopolouges, kernel_initializer='he_uniform', kernel_regularizer=l2(lambda_val)),
LayerNormalization(axis=1),
])
    model.compile(optimizer=tf.keras.optimizers.legacy.Adam(),
                  loss=tf.keras.losses.MeanSquaredError(),
                  metrics=['mse', 'mae'])
return model
def FML_regression_model(num_ion_counts, num_isotopolouges, lambda_val):
model = Sequential([
# Original: 128, 128, 256, 256, 256, 256, 128
# Layer 1
Dense(128, input_dim = num_ion_counts, kernel_initializer='he_uniform', activation='relu',kernel_regularizer=l2(lambda_val)),
BatchNormalization(),
# Layer 2
Dense(128, kernel_initializer='he_uniform', activation='relu',kernel_regularizer=l2(lambda_val)),
BatchNormalization(),
# Layer 3
Dense(256, kernel_initializer='he_uniform', activation='relu',kernel_regularizer=l2(lambda_val)),
BatchNormalization(),
Dropout(0.25),
# Layer 4
Dense(256, kernel_initializer='he_uniform', activation='relu',kernel_regularizer=l2(lambda_val)),
BatchNormalization(),
# Layer 5
Dense(256, kernel_initializer='he_uniform', activation='relu',kernel_regularizer=l2(lambda_val)),
BatchNormalization(),
# Layer 6
Dense(256, kernel_initializer='he_uniform', activation='relu',kernel_regularizer=l2(lambda_val)),
BatchNormalization(),
Dropout(0.25),
# Layer 7
Dense(128, kernel_initializer='he_uniform', activation='relu',kernel_regularizer=l2(lambda_val)),
BatchNormalization(),
# Removed relu to allow negative
Dense(num_isotopolouges, kernel_initializer='he_uniform', kernel_regularizer=l2(lambda_val))
])
    model.compile(optimizer=tf.keras.optimizers.legacy.Adam(learning_rate = 3e-05),
                  loss=tf.keras.losses.MeanSquaredError(),
                  metrics=['mse', 'mae'])
return model
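# Minimal sketch of building and inspecting the network (the dimensions here are illustrative;
# 353 matches the full metabolite list mentioned below, 19 the partial list):
# model = FML_regression_model(num_ion_counts = 353, num_isotopolouges = 19, lambda_val = 0.01)
# model.summary()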
def create_large_data(all_data = True, data_path = '/Users/bisramr/MATLAB/Projects/Isoscope_Matlab_V/generated-data', primary_name = 'brain-glucose-KD-M1-isotopolouges.csv'):
'''
Creates feature and target dataframes consisting of different samples concatenated to each other. Assumes there are 6 total - 3 KD and 3 ND.
params:
    - all_data: Bool. If True, all 6 samples are combined into a single training set; if False, the last sample (ND-M3)
      is held out and returned as a separate pair of test feature/target dataframes for later use.
'''
# List containing the file names of the isotopolouge data
isotopolouges_paths = [f'{data_path}/brain-glucose-KD-M{i+1}-isotopolouges.csv' for i in range(3)]
isotopolouges_paths.extend([f'{data_path}/brain-glucose-ND-M{i+1}-isotopolouges.csv' for i in range(2)])
# List containing the file names of the ion count data
ion_counts_paths = [f'{data_path}/brain-glucose-KD-M{i+1}-ioncounts.csv' for i in range(3)]
ion_counts_paths.extend([f'{data_path}/brain-glucose-ND-M{i+1}-ioncounts.csv' for i in range(2)])
# If all_data flag activated, include the final brain sample, otherwise return a test feature and target
if all_data:
isotopolouges_paths.append(f'{data_path}/brain-glucose-ND-M3-isotopolouges.csv')
ion_counts_paths.append(f'{data_path}/brain-glucose-ND-M3-ioncounts.csv')
else:
test_features = get_data(file_name = 'brain-glucose-ND-M3-ioncounts.csv')
test_targets = get_data(file_name = 'brain-glucose-ND-M3-isotopolouges.csv')
# Load each dataframe from the list and concatenate them to each other for - isotopolouges
isotopolouges = pd.read_csv(isotopolouges_paths[0])
for i, path in enumerate(isotopolouges_paths):
if i == 0:
continue
data = pd.read_csv(path)
isotopolouges = pd.concat([isotopolouges, data], ignore_index=True, axis = 0)
# Load each dataframe from the list and concatenate them to each other for - ions
ion_counts = pd.read_csv(ion_counts_paths[0])
for i, path in enumerate(ion_counts_paths):
if i == 0:
continue
data = pd.read_csv(path)
ion_counts = pd.concat([ion_counts, data], ignore_index=True, axis = 0)
# Drop the unneeded columns from both features and targets
ion_counts = ion_counts.drop(labels = ['x', 'y', 'Unnamed: 0'], axis = 1)
isotopolouges = isotopolouges.drop(labels = ['x', 'y', 'Unnamed: 0'], axis = 1)
if all_data:
return ion_counts, isotopolouges
else:
return ion_counts, isotopolouges, test_features, test_targets
def create_large_data_ranked(all_data = True, data_path = '/Users/bisramr/MATLAB/Projects/Isoscope_Matlab_V/generated-data', FML = False):
'''
Creates feature and target dataframes consisting of different samples concatenated to each other. Assumes there are 6 total - 3 KD and 3 ND.
params:
    - all_data: Bool. If True, all 6 samples are combined into a single training set; if False, the last sample (ND-M3)
      is held out and returned as a separate pair of test feature/target dataframes for later use.
'''
iso_path = 'FML-isotopolouges-ranks' if FML else 'isotopolouges-ranks'
ion_path = 'FML-ioncounts-ranks' if FML else 'ioncounts-ranks'
# List containing the file names of the isotopolouge data
isotopolouges_paths = [f'{data_path}/BG-KD-M{i+1}-{iso_path}.csv' for i in range(3)]
isotopolouges_paths.extend([f'{data_path}/BG-ND-M{i+1}-{iso_path}.csv' for i in range(2)])
# List containing the file names of the ion count data
ion_counts_paths = [f'{data_path}/BG-KD-M{i+1}-{ion_path}.csv' for i in range(3)]
ion_counts_paths.extend([f'{data_path}/BG-ND-M{i+1}-{ion_path}.csv' for i in range(2)])
# If all_data flag activated, include the final brain sample, otherwise return a test feature and target
if all_data:
isotopolouges_paths.append(f'{data_path}/BG-ND-M3-{iso_path}.csv')
ion_counts_paths.append(f'{data_path}/BG-ND-M3-{ion_path}.csv')
else:
test_features = get_data(file_name = f'BG-ND-M3-{ion_path}.csv', dir = data_path)
test_targets = get_data(file_name = f'BG-ND-M3-{iso_path}.csv', dir = data_path)
# Load each dataframe from the list and concatenate them to each other for - isotopolouges
isotopolouges = pd.read_csv(isotopolouges_paths[0])
for i, path in enumerate(isotopolouges_paths):
if i == 0:
continue
data = pd.read_csv(path)
isotopolouges = pd.concat([isotopolouges, data], ignore_index=True, axis = 0)
# Load each dataframe from the list and concatenate them to each other for - ions
ion_counts = pd.read_csv(ion_counts_paths[0])
for i, path in enumerate(ion_counts_paths):
if i == 0:
continue
data = pd.read_csv(path)
ion_counts = pd.concat([ion_counts, data], ignore_index=True, axis = 0)
# Drop the unneeded columns from both features and targets
ion_counts = ion_counts.drop(labels = ['x', 'y', 'Unnamed: 0'], axis = 1)
isotopolouges = isotopolouges.drop(labels = ['x', 'y', 'Unnamed: 0'], axis = 1)
if all_data:
return ion_counts, isotopolouges
else:
return ion_counts, isotopolouges, test_features, test_targets
# ***************************************************** CHECKED FOR GENERALIZABILITY *****************************************************
# ============================================== LIST OF FILEPATHS =====================================================================
def generate_filepath_list(data_path = '/brain-m0-no-log', FML = True, tracer = 'BG'):
'''
Returns relative paths of data files as two lists. If sample includes both normal and ketogenic replicates, the ND replicates are first, and then KD.
- Example Filename: 'B3HB-KD-M1-FML-ioncounts-ranks.csv'
Parameters:
- data_path (string): relative path from main data directory to the directory containing all of the relevant data files. (Assumes you're already in the primary data directory)
- FML (bool): flag indicating whether to use the partial metabolite list (19 metabs) or full metabolite list
- tracer (string): prefix for the tracer whose data you want to generate [Glucose: BG, 3-Hydroxybutyrate: B3HB | B15NGln, B15NLeu, B15NNH4Cl]
        - The prefix 'B' stands for brain data, 'G' for glucose
Returns:
- ion_counts_paths (list): list of filenames with ion_count data
- isotopologues_paths (list): list of filenames with iso data
'''
iso_path = 'FML-isotopolouges-ranks' if FML else 'isotopolouges-ranks'
ion_path = 'FML-ioncounts-ranks' if FML else 'ioncounts-ranks'
# List containing the file names of the isotopolouge data - normal diet mice
isotopologues_paths = [f'{data_path}/{tracer}-ND-M{i+1}-{iso_path}.csv' for i in range(3)]
# List containing the file names of the ion count data - normal diet mice
ion_counts_paths = [f'{data_path}/{tracer}-ND-M{i+1}-{ion_path}.csv' for i in range(3)]
# These two tracers have Ketogenic mice as well, include them in the filepaths
if tracer == 'BG' or tracer == 'B3HB':
isotopologues_paths.extend([f'{data_path}/{tracer}-KD-M{i+1}-{iso_path}.csv' for i in range(3)])
ion_counts_paths.extend([f'{data_path}/{tracer}-KD-M{i+1}-{ion_path}.csv' for i in range(3)])
return ion_counts_paths, isotopologues_paths
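# Example (illustrative arguments; paths are relative to the primary data directory):
# ion_paths, iso_paths = generate_filepath_list(data_path = '/brain-m0-no-log', FML = True, tracer = 'B3HB')
# ion_paths[0]  # '/brain-m0-no-log/B3HB-ND-M1-FML-ioncounts-ranks.csv'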
# ============================================== IDENTIFYING ION + ISO INCONSISTENCIES ============================================================
def checking_data_consistency(data_path = '/brain-m0-no-log', FML = True, tracer = 'BG'):
'''
Identifies inconsistent metabolites for both ion count and isotopologue data for given tracer set of replicates.
params:
- data_path (string): relative path from main data directory to the directory containing all of the relevant data files. (Assumes you're already in the primary data directory)
- FML (bool): flag indicating whether to use the partial metabolite list (19 metabs) or full metabolite list
- tracer (string): prefix for the tracer whose data you want to generate [Glucose: BG, 3-Hydroxybutyrate: B3HB | B15NGln, B15NLeu, B15NNH4Cl]
        - The prefix 'B' stands for brain data, 'G' for glucose
returns:
- ion_inconsistencies (list): list containing the names of the metabolites that are not common in all ion_count files.
- iso_inconsistencies (list): list containing the name of the isotopologues that are not common in all iso files.
'''
# Generate lists containing the paths to the ion count and isotopologue data
ion_counts_paths, isotopolouges_paths = generate_filepath_list(data_path = data_path, FML = FML, tracer = tracer)
# List of ions that need to be removed from all files
ion_inconsistencies = identify_inconsistencies(ion_counts_paths, show_progress = False)
# List of isotopolouges that need to be removed from all files
iso_inconsistencies = identify_inconsistencies(isotopolouges_paths, show_progress = False)
return ion_inconsistencies, iso_inconsistencies
# ============================================== IDENTIFYING DATA INCONSISTENCIES ============================================================
def identify_inconsistencies(list_of_paths, show_progress = True):
'''
Helper Function - Goes through multiple datafiles and identifies metabolites that do not appear in all files.
These metabolites/isotopolouges would then be removed prior to training the model.
Parameters:
- list_of_paths (list): list containing the relative file paths for csvs that need to be compared
Returns:
- invalid_metabs_names (list): list containing the names (not indices) of the metabolites that are not common in all files.
'''
# Number of replicates
num_replicates = len(list_of_paths)
# Holds all metabolites of all files (including duplicates)
individual_replicate_metabs = []
# List of lists, where each sublist is the metabolites for a single file
all_metabs = []
for i, name in enumerate(list_of_paths):
# Load data
ion_count = get_data(file_name = name, dir = '/Users/bisramr/MATLAB/Projects/Isoscope_Matlab_V/generated-data')
metab_names = ion_count.columns
if show_progress:
print(i, name, len(metab_names))
individual_replicate_metabs.append(metab_names)
all_metabs.extend(metab_names)
# Flatten the list of lists into single lists
all_metabs.sort()
# Returns a dictionary where the keys are the iso indices and the values are the number of times they appear in the flattened list (a count)
metab_index_dict = Counter(all_metabs)
# Create a list the names of all metabolites that do not appear in all replicates
invalid_metabs_names = [index for index in list(metab_index_dict.keys()) if metab_index_dict[index] < num_replicates]
return invalid_metabs_names
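# Worked sketch: with 3 replicate files where a hypothetical metabolite 'Alanine' appears
# in only 2 of the 3 column sets, Counter yields a count of 2 < 3 replicates, so
# 'Alanine' is returned in invalid_metabs_names and later dropped from every file.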
# ============================================== REMOVING DATA INCONSISTENCIES ============================================================
def remove_data_inconsistencies(additional_ion_metabolite_to_remove = [], additional_metabs_to_remove = [], data_path = '/brain-m0-no-log', FML = True, tracer = 'BG'):
'''
Generates the final dataset to use for regression. Loads in the relevant input files, and then removes two different sets of metabolites from each replicate:
1). Metabolites/isotopologues that are not consistent across replicates (were not detected through mass spec for some replicates)
2). Metabolites/isotopologues that were deemed invalid by failing to surpass the Moran's I metric for the majority of replicates
    Parameters:
- additional_metabs_to_remove (list): list of isotopologue NAMES (not indices) that must be removed from all replicates (ie the names from Moran's)
- data_path (string): relative path from main data directory to the directory containing all of the relevant data files. (Assumes you're already in the primary data directory)
- FML (bool): flag indicating whether to use the partial metabolite list (19 metabs) or full metabolite list
- tracer (string): prefix for the tracer whose data you want to generate [Glucose: BG, 3-Hydroxybutyrate: B3HB | B15NGln, B15NLeu, B15NNH4Cl]
        - The prefix 'B' stands for brain data, 'G' for glucose
Returns:
- clean_ion_data (list): list of ion count dataframes that are n (number of pixels in this replicate - can be different for each) by m (num of metabolites - consistent across all)
- clean_iso_data (list): list of isotopologue dataframes that are n (number of pixels in this replicate - can be different for each) by m (num of isotopologues for prediction - consistent across all)
'''
# Lists of filepaths to ion_counts and isotopolouges
ion_counts_paths, isotopolouges_paths = generate_filepath_list(data_path = data_path, FML = FML, tracer = tracer)
# Lists of names of inconsistent metabolites and isotopolouges that need to be removed
ion_inconsistencies, iso_inconsistencies = checking_data_consistency(data_path = data_path, FML = FML, tracer = tracer)
print(f"Inconsistencies found: {len(ion_inconsistencies)} metabolites, {len(iso_inconsistencies)} isotopolouges")
# Append Moran's metabs list to iso list for removal
iso_inconsistencies += additional_metabs_to_remove
# Remove any duplicate names
iso_inconsistencies = [*set(iso_inconsistencies)]
print(f"Removing {len(iso_inconsistencies)} isotopolouges")
clean_ion_data = []
clean_iso_data = []
for i, data_path in enumerate(ion_counts_paths):
# Load in the data for single replicate
data = get_data(file_name = data_path, dir = '/Users/bisramr/MATLAB/Projects/Isoscope_Matlab_V/generated-data')
# Get list of metabolites for that replicate
metabolite_names = data.columns
        # Metabolites marked for removal that are actually present in this replicate
metab_to_drop = [metab for metab in ion_inconsistencies if metab in metabolite_names]
# Drop the unneeded metabolites
data = data.drop(labels = metab_to_drop, axis = 1)
new_metabolite_names = data.columns
print(f"File {i}: {data_path} || {len(metab_to_drop)} to drop || {len(metabolite_names) - len(new_metabolite_names)} dropped")
# Append to list of cleaned/filtered dataframes for iso data
clean_ion_data.append(data)
# Confirm that all the dataframes have the same columns in the same order!
    checks = [list(item.columns) == list(clean_ion_data[0].columns) for item in clean_ion_data[1:]]
    if all(checks):
        print("Ion-Data is all consistent! Time to train a model!")
    else:
        print("ERROR: dataframe columns are not all in the same order.")
# same thing for isolouges
for i, data_path in enumerate(isotopolouges_paths):
data = get_data(file_name = data_path, dir = '/Users/bisramr/MATLAB/Projects/Isoscope_Matlab_V/generated-data')
metabolite_names = data.columns
metab_to_drop = [metab for metab in iso_inconsistencies if metab in metabolite_names]
data = data.drop(labels = metab_to_drop, axis = 1)
new_metabolite_names = data.columns
print(f"File {i}: {data_path} || {len(metab_to_drop)} to drop || {len(metabolite_names) - len(new_metabolite_names)} dropped")
clean_iso_data.append(data)
# Confirm that all the dataframes have the same columns in the same order!
    checks = [list(item.columns) == list(clean_iso_data[0].columns) for item in clean_iso_data[1:]]
    if all(checks):
        print("Iso-Data is all consistent! Time to train a model!")
    else:
        print("ERROR: dataframe columns are not all in the same order.")
return clean_ion_data, clean_iso_data
# ============================================== Creating Dataset for Training ===========================================================
def create_full_dataset(ion_dfs, iso_dfs, holdout = True, holdout_index = 0):
'''
    Takes in the list of cleaned/consistent dataframes and returns training and test sets.
Parameters:
- ion_dfs (list): list of dataframes where each element is a replicate's ion_count data
- iso_dfs (list): list of dataframes where each element is a replicate's isotopologue data
- holdout (bool): flag indicating whether there is a replicate being held out for testing or all replicates should be used for training
- holdout_index (int): The index in the ion and iso lists of the replicate that should be held out as the testing set
Returns:
- ion_counts (df): # pixels x # metabolites df of ion_count data. Consists of multiple replicates for training set.
- isotopolouges (df): # pixels x # isotopologues df of isotopologue data. Consists of multiple replicates for training set.
    - test_ion_counts (df): the held-out replicate's # pixels x # metabolites df of ion_count data for the test set.
    - test_iso_counts (df): the held-out replicate's # pixels x # isotopologues df of isotopologue data for the test set.
'''
if holdout:
test_ion_counts = ion_dfs.pop(holdout_index)
test_iso_counts = iso_dfs.pop(holdout_index)
        # Concatenate all remaining replicates into the training set
        ion_counts = ion_dfs[0]
        for data in ion_dfs[1:]:
            ion_counts = pd.concat([ion_counts, data], ignore_index = True, axis = 0)
        isotopolouges = iso_dfs[0]
        for data in iso_dfs[1:]:
            isotopolouges = pd.concat([isotopolouges, data], ignore_index = True, axis = 0)
return ion_counts, isotopolouges, test_ion_counts, test_iso_counts
else:
ion_counts = ion_dfs[0]
for i, data in enumerate(ion_dfs[1:]):
ion_counts = pd.concat([ion_counts, data], ignore_index = True, axis = 0)
isotopolouges = iso_dfs[0]
for i, data in enumerate(iso_dfs[1:]):
isotopolouges = pd.concat([isotopolouges, data], ignore_index = True, axis = 0)
return ion_counts, isotopolouges
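# Usage sketch chaining the cleaning and dataset steps (argument values are illustrative):
# clean_ions, clean_isos = remove_data_inconsistencies(data_path = '/brain-m0-no-log', tracer = 'BG')
# train_ions, train_isos, test_ions, test_isos = create_full_dataset(clean_ions, clean_isos, holdout = True, holdout_index = 0)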
# ***************************************************** CHECKED FOR GENERALIZABILITY *****************************************************
# ============================================== PROCESSING MORANS I =====================================================================
def indices_to_metab_names(list_of_morans_metabs, metabs_to_consider = 'isos', data_path = '/brain-m0-no-log', tracer = 'BG', FML = True, cutoff = 4):
'''
This function iterates through the list of metabolite indices (identified by the Moran's I score), and returns a list
of the metabolite names that should be removed from consideration based on the number of replicates they are poor targets in.
Moran's I was run on each file individually, so the indices for each file may not correspond to each other as intended. Instead,
this function will convert each list of indices to the isotopolouge name of that specific file, and then the comparisons will
be done on the isotopolouge names rather than indices.
Add this list of metab names to the list of inconsistencies and remove them all from every file.
parameters:
- list_of_morans_metabs (list): list of lists, where every sublist contains the indices of metabolites that are poor targets in
a single replicate. Each sublist corresponds to a single replicate.
- metabs_to_consider (string, default = 'isos'): flag that determines whether the function should remove these metabolites from the
ion_count files or isotopologue files.
- data_path (string): file path to the directory containing the relevant isotopologue files.
- FML (bool): flag indicating whether to use the full metabolite list or partial metabolite list.
- cutoff (int, default = 4): Number corresponding to how many replicates should identify a metabolite as invalid before it is marked for removal.
returns:
- invalid_metabs (list): a list of metabolite names that were marked as poor targets for prediction in at least (cutoff) replicates.
'''
# Paths to files
ion_counts_paths, isotopolouges_paths = generate_filepath_list(data_path = data_path, FML = FML, tracer = tracer)
# Iterate through either ion files or iso files
paths_to_iterate = isotopolouges_paths if metabs_to_consider == 'isos' else ion_counts_paths
all_metabs = []
for i, data_tuple in enumerate(zip(paths_to_iterate, list_of_morans_metabs)):
data = get_data(file_name = data_tuple[0], dir = '/Users/bisramr/MATLAB/Projects/Isoscope_Matlab_V/generated-data')
metab_names = list(data.columns)
print(data_tuple[0], data_tuple[1][-1], len(metab_names))
morans_metabs_names = [metab_names[index] for index in data_tuple[1] if index < len(metab_names)]
all_metabs.append(morans_metabs_names)
# print(len(morans_metabs_names))
invalid_metabs = count_list_elements(all_metabs, cutoff=cutoff)
return invalid_metabs
# ============================================== Training ===========================================================
def preprocess_data(ion_counts, isotopolouges, testing_split = True):
    '''
    Take in the input dataframes and convert them to NumPy arrays for the model.
    Params:
    - ion_counts: dataframe containing the feature data
    - isotopolouges: dataframe containing the target data
    - testing_split: flag indicating whether to produce a testing set or only a train/val split
    Returns:
    - num_ion_counts: the number of features going into the model
    - num_isotopolouges: the number of targets being predicted
    - the train/val (and, when testing_split is True, test) splits as NumPy arrays
    '''
x = ion_counts.to_numpy()
y = isotopolouges.to_numpy()
# print(x.shape, y.shape)
num_ion_counts = x.shape[1]
num_isotopolouges = y.shape[1] if len(y.shape) != 1 else 1
if testing_split:
x_train, x_temp, y_train, y_temp = train_test_split(x, y, test_size=0.3)
x_val, x_test, y_val, y_test = train_test_split(x_temp, y_temp, test_size=0.5)
return num_ion_counts, num_isotopolouges, x_train, y_train, x_val, y_val, x_test, y_test
else:
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2)
return num_ion_counts, num_isotopolouges, x_train, y_train, x_val, y_val
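# Resulting split sizes: with testing_split = True the data is divided roughly 70/15/15
# (30% is held out, then split evenly into val/test); with testing_split = False it is 80/20 train/val.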
def training(feature_data, target_data, checkpoint_path = './saved-weights/KD-M1-unranked-dropout/checkpoint', train = True, TRAIN_ENTIRE_BRAIN = False, ranked = True, EPOCHS = 100, BATCH_SIZE = 32):
# Whether or not to treat the dataset as training data
if TRAIN_ENTIRE_BRAIN:
num_ion_counts, num_isotopolouges, x_train, y_train, x_val, y_val = preprocess_data(feature_data, target_data, testing_split = False)
else:
num_ion_counts, num_isotopolouges, x_train, y_train, x_val, y_val, x_test, y_test = preprocess_data(feature_data, target_data)
isotopolouge_names = list(target_data.columns)
# print(isotopolouge_names)
# define model
model = FML_regression_model(num_ion_counts, num_isotopolouges, 0.01)
# Checkpoints
# https://keras.io/api/callbacks/model_checkpoint/
checkpoint_filepath = checkpoint_path # './saved-weights/KD-M1-unranked-dropout/checkpoint'
    model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_filepath,
        verbose = 1,
        save_weights_only=True,
        monitor='loss',
        mode='min',
        save_best_only=False,
        save_freq= int(664 * 10))  # save every 6640 batches
# Learning rate scheduler
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, verbose = 1, min_lr=0)
# fit model
if train:
history = model.fit(x_train, y_train, batch_size = BATCH_SIZE, verbose=1, validation_data = (x_val, y_val), epochs=EPOCHS, callbacks=[model_checkpoint_callback])
    else:
        history = None
        model.load_weights(checkpoint_filepath)
    # Evaluate model on the test set (requires TRAIN_ENTIRE_BRAIN = False); returns [loss, mse, mae]
    eval_metrics = model.evaluate(x_test, y_test, verbose=1)
    prediction = model.predict(x_test)
#plt.scatter(y_test, prediction)
# plt.savefig(f'/Users/bisramr/MATLAB/Projects/Isoscope_Matlab_V/generated-data/brain-glucose-KD-M1-predicting.png')
#plt.show()
print(y_test.shape, prediction.shape)
# print(y_test)
# print(prediction)
plot_individual_isotopolouges_2(y_test, prediction, isotopolouge_names, ranked = ranked)
return history
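# Usage sketch (hypothetical checkpoint directory, continuing the variables from the dataset sketch above):
# history = training(train_ions, train_isos, checkpoint_path = './saved-weights/example-run/checkpoint', EPOCHS = 100, BATCH_SIZE = 32)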
def correlations():
brain_K1 = get_data(file_name="brain-glucose-KD-M1-isotopolouges.csv")
brain_K1_ranks = get_data(file_name="brain-glucose-KD-M1-isotopolouges-ranks.csv")
brain_K2 = get_data(file_name="brain-glucose-KD-M2-isotopolouges.csv")
brain_K2_ranks = get_data(file_name="brain-glucose-KD-M2-isotopolouges-ranks.csv")
brain_N1 = get_data(file_name="brain-glucose-ND-M1-isotopolouges.csv")
brain_N1_ranks = get_data(file_name="brain-glucose-ND-M1-isotopolouges-ranks.csv")
#corr_heatmap(brain_1)
#corr_heatmap(brain_1_ranks)
double_corr_heatmap(brain_K1, brain_K1_ranks, title = "KD Brain 1 Pairwise Corr Coeff")
double_corr_heatmap(brain_K2, brain_K2_ranks, title = "KD Brain 2 Pairwise Corr Coeff")
double_corr_heatmap(brain_N1, brain_N1_ranks, title = "N1 Brain 1 Pairwise Corr Coeff")
plt.show()
isotopolouge_names = list(brain_K1.columns)
corr_scatter(brain_K1, isotopolouge_names, 1, 2)
def test_whole_brain(feature_data, target_data, checkpoint_path = './saved-weights/KD-M1-unranked-dropout/checkpoint', ranked = False):
isotopolouge_names = list(target_data.columns)
num_ion_counts = feature_data.shape[1]
num_isotopolouges = target_data.shape[1]
features = feature_data.to_numpy()
targets = target_data.to_numpy()
# define model
model = FML_regression_model(num_ion_counts, num_isotopolouges, 0.01)
# Checkpoints
# https://keras.io/api/callbacks/model_checkpoint/
model.load_weights(checkpoint_path).expect_partial()
prediction = model.predict(features)
plt.scatter(targets, prediction)
    max_val = targets.max()
    min_val = targets.min()
    # Add the identity line
    X_plot = np.linspace(min_val, max_val, 100)
    Y_plot = X_plot
plt.plot(X_plot, Y_plot, color='r')
plt.xlabel("Actual Value")
plt.ylabel("Predicted Value")
# plt.savefig(f'/Users/bisramr/MATLAB/Projects/Isoscope_Matlab_V/generated-data/brain-glucose-KD-M1-predicting.png')
plt.show()
print(targets.shape, prediction.shape)
plot_individual_isotopolouges_2(targets, prediction, isotopolouge_names, grid_size = 5, ranked = ranked)
'''
# Making new csv
df1 = pd.read_csv("/Users/bisramr/MATLAB/Projects/Isoscope_Matlab_V/generated-data/brain-glucose-KD-M2-isotopolouges.csv")
df2 = pd.DataFrame(prediction, columns = isotopolouge_names)
df2['x'] = df1['x']
df2['y'] = df1['y']
# df2.to_csv('KDM2-predicted.csv')
# print(df2)
'''
def predict(feature_data, target_data, checkpoint_path = './saved-weights/KD-M1-unranked-dropout/checkpoint'):
'''
    Predicts the isotopolouge breakdowns for a given set of metabolites, using the weights saved at the checkpoint path.
Returns:
- prediction_df: a dataframe in which the columns are individual isotopolouges and the rows are observations
'''
isotopolouge_names = list(target_data.columns)
num_ion_counts = feature_data.shape[1]
num_isotopolouges = target_data.shape[1]
features = feature_data.to_numpy()
# targets = target_data.to_numpy()
# define model
model = FML_regression_model(num_ion_counts, num_isotopolouges, 0.01)
# Checkpoints
# https://keras.io/api/callbacks/model_checkpoint/
checkpoint_filepath = checkpoint_path # './saved-weights/KD-M1-unranked-dropout/checkpoint'
model.load_weights(checkpoint_filepath).expect_partial()
prediction = model.predict(features)
prediction_df = pd.DataFrame(prediction, columns = isotopolouge_names)
return prediction_df
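# Usage sketch (placeholder checkpoint path; target_data is only used for its column names and width):
# prediction_df = predict(test_ions, test_isos, checkpoint_path = './saved-weights/example-run/checkpoint')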
def spearman_rankings(actual, predicted, plot = True):
'''
    For each pair of actual/predicted isotopolouges, calculates Spearman's rank correlation coefficient. This tells us which isotopolouges are best predicted by this model.
    Parameters:
    - actual (df): the ground truth dataframe for the regression model's predictions to be compared to.
    - predicted (df): dataframe of model predictions, with the same columns as actual.
'''
spearmans = []
p_values = []
actual = actual.drop(labels = ['x', 'y'], axis = 1, errors = 'ignore')
predicted = predicted.drop(labels = ['x', 'y'], axis = 1, errors = 'ignore')
# Save the p values and then regulate them with this:
# https://www.statsmodels.org/dev/generated/statsmodels.stats.multitest.multipletests.html
for i in range(len(list(actual.columns))):
spearman_coeff = stats.spearmanr(actual.iloc[:, i], predicted.iloc[:, i])
spearmans.append(spearman_coeff.correlation)
p_values.append(spearman_coeff.pvalue)
# print(spearmans)
df_spearman = pd.DataFrame()
df_spearman["median_rho"] = spearmans
df_spearman["isotopologue"] = list(actual.columns)
# Calculate q values and plot as color
corrected_pvals = multipletests(p_values, method="fdr_bh", alpha = 0.1)
df_spearman['pvals'] = corrected_pvals[1]
color = ["purple" if pval <= 0.1 else "red" for pval in corrected_pvals[1]]
df_spearman["color"] = color
# Pull p-value
if plot:
median_rho_feature_plot(df_spearman)
    df_sorted = df_spearman.sort_values(by=["median_rho"], ascending=False)
    return df_sorted
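# Usage sketch: rank isotopolouges by how well the model recovers them; in the plot,
# points with a BH-adjusted p-value <= 0.1 are colored purple:
# spearman_df = spearman_rankings(test_isos, prediction_df, plot = False)
# spearman_df.head(10)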
def mean_std_var_isotopolouge(isotopolouges, plot = False, mean_cutoff = 0.2):
'''
    For given isotopolouge data, divides each individual isotopolouge by its total metabolite ion count (the sum of the isotopolouges
from the same metabolite). The mean and variance for each isotopolouge are then calculated and plotted. The isotopolouges are ranked
from highest to lowest mean, and a list of the isotopologue names with a mean higher than a specific cut off is returned.
Parameters:
    - isotopolouges: a dataframe in which the columns are isotopolouges
    - (follows the naming convention [metab_01 m+00 | metab_01 m+01 | metab_01 m+02 ... metab_n m+05])
    Returns: a [indices, names] pair for the isotopolouges whose mean fraction is above the cutoff, and the corresponding columns of the input dataframe
'''
pd.options.mode.chained_assignment = None # https://stackoverflow.com/questions/20625582/how-to-deal-with-settingwithcopywarning-in-pandas
# List of all isotopolouges in the dataset
isotopolouge_names = list(isotopolouges.keys())
isotoplouge_column_indices = []
single_metabolite_indices = []
reference_name = ""
num_isotopolouges = len(isotopolouge_names)
# Generates isotoplouge_column_indices -> contains sublists which each contain the indices of all the isotopolouges for a single metabolite
# Ex -> [[0,1,2], [3, 4, 5], [6, 7, 8, 9 , 10 , 11]]
# single_metabolite_indices -> the sublist containing all the indices for a single metabolite
for i, isotopolouge_name in enumerate(isotopolouge_names):
# Access the primary metabolite name, ignoring the isotopolouge m+0x
metabolite_name = isotopolouge_name.split()[0]
if reference_name == metabolite_name:
single_metabolite_indices.append(i)
else:
isotoplouge_column_indices.append(single_metabolite_indices)
single_metabolite_indices = []
single_metabolite_indices.append(i)
reference_name = metabolite_name
if i == num_isotopolouges - 1:
isotoplouge_column_indices.append(single_metabolite_indices)
# Remove the initial empty sublist that was generated
isotoplouge_column_indices.pop(0)
# print(isotoplouge_column_indices)
    # Initialize dataframe that will hold all isotopolouges divided by the sum of their respective metabolite
    sum_df = pd.DataFrame()
    # For each metabolite, create a buffer df with its isotopolouges, compute their sum, and divide each column by that sum.
for i, sublist in enumerate(isotoplouge_column_indices):
# Access the isotopolouges and take the sum
temp_metab_df = isotopolouges.iloc[:, isotoplouge_column_indices[i]]
temp_metab_df['sum'] = temp_metab_df.sum(axis = 'columns')
# Move the sum to the first column (easier to do the division later)
first_column = temp_metab_df.pop('sum')
temp_metab_df.insert(0, 'sum', first_column)
# DF where each isotopolouge is divided by the sum of all isotopolouges for that metabolite
temp_metab_df.iloc[:,1:] = temp_metab_df.iloc[:,1:].div(temp_metab_df['sum'], axis=0)
temp_metab_df.pop('sum')
# Concatenate the new isotopolouge values to the final df to return
sum_df = pd.concat([sum_df, temp_metab_df], axis = 1, ignore_index=True)
    # Relabel the df with the isotopolouge names
    sum_df.columns = isotopolouge_names
# display(sum_df)
# Calculate the means of each isotopolouge and sort from largest to smallest.
means = sum_df.mean().sort_values(ascending = False)
means_filtered = means[means >= mean_cutoff]
# List of isotopolouge names that may be able to be predicted well (are above the mean cutoff)
valid_isotopolouges = list(means_filtered.keys())
valid_isotopolouges_indices = [isotopolouge_names.index(isotopolouge) for isotopolouge in valid_isotopolouges]
valid_isotopolouges_indices.sort()
valid_isotopolouges_ordered = [isotopolouge_names[index] for index in valid_isotopolouges_indices]
stds = sum_df.std()
if plot:
means.plot.bar().axhline(y = mean_cutoff, color = "red")
plt.show()
stds.plot.bar()
plt.show()
# Extract the isotopolouge data that we want to use
# print(isotopolouges.iloc[:, valid_isotopolouges_indices])
return [valid_isotopolouges_indices, valid_isotopolouges_ordered], isotopolouges.iloc[:, valid_isotopolouges_indices]
def df_concat(df1, df2):
'''
Concatenates two dataframes with the same number of rows side by side, returns the resulting dataframe
- https://stackoverflow.com/questions/23891575/how-to-merge-two-dataframes-side-by-side
'''
return pd.concat([df1, df2], axis=1)
def morans_I_score():
'''
This function uses the Moran's I score to determine if an isotopolouge has a strong enough signal to be considered for prediction.
In statistics, Moran's I is a measure of spatial autocorrelation. Spatial autocorrelation is characterized by a correlation in a signal among
nearby locations in space.
'''
return 0
def count_list_elements(list_of_lists, cutoff = 3):
'''
This function was written to determine which isotopolouges are "relevant" across multiple replicates. It counts how many times each iso index appears and decides whether
we should keep it or not.
Parameters:
- list_of_lists: a list of lists, where each sublist contains the indices for a single replicate brain (the elements are the indices of the isotopolouges)
'''
# Flatten the list of lists into single lists
flat_list = [item for sublist in list_of_lists for item in sublist]
flat_list.sort()
# Returns a dictionary where the keys are the iso indices and the values are the number of times they appear in the flattened list (a count)
iso_index_dict = Counter(flat_list)
print(iso_index_dict)
# List of the isos that appear an acceptable amount of times (deemed by cutoff)
valid_isotopolouges = [index for index in list(iso_index_dict.keys()) if iso_index_dict[index] >= cutoff]
valid_isotopolouges.sort()
return valid_isotopolouges
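# Worked sketch: indices 3 and 7 appear in all three sublists, index 5 only once,
# so with the default cutoff of 3 only [3, 7] survive:
# count_list_elements([[3, 5, 7], [3, 7], [3, 7]])  # -> [3, 7]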
def predict_and_plot(sample = "KD-M1", valid_isos = [], checkpoint_path = './saved-weights/train5-test1-m0-ranked/checkpoint'):
ions_coord = get_data(file_name=f"/brain-m0-no-log/BG-{sample}-ioncounts-ranks.csv", keep_coord=True)
isotopolouges_coord = get_data(file_name=f"/brain-m0-no-log/BG-{sample}-isotopolouges-ranks.csv", keep_coord=True)
ions = get_data(file_name=f"/brain-m0-no-log/BG-{sample}-ioncounts-ranks.csv")
isotopolouges = get_data(file_name=f"/brain-m0-no-log/BG-{sample}-isotopolouges-ranks.csv")
isotopolouges_filtered = isotopolouges.iloc[:, valid_isos]
predicted = predict(ions, isotopolouges_filtered, checkpoint_path=checkpoint_path)
#predicted.index = isotopolouges.index
isotopolouges_filtered[['x', 'y']] = isotopolouges_coord[['x', 'y']]
predicted[['x', 'y']] = isotopolouges_coord[['x', 'y']]
plot_multiple_brains(isotopolouges_filtered)
plot_multiple_brains(predicted)
# ============================================== Train Uncertainty ===========================================================
def train_uncertainty(training_features, training_targets, testing_features, testing_targets, ITERATIONS = 10, EPOCHS = 100, BATCH_SIZE = 128, file_name = "10"):
    '''
    Train the model ITERATIONS times and collect each run's held-out predictions to measure uncertainty. Each iteration:
    - re-splits the training data into train/val sets,
    - trains a fresh model for up to EPOCHS epochs (with LR reduction on plateau and early stopping),
    - records the model's predictions on the testing features.
    The stacked predictions are saved to disk and returned.
    '''
    # Initialize np array to hold the predictions. Shape: (ITERATIONS, num test pixels, num isotopolouges)
    predictions = np.zeros((ITERATIONS, testing_targets.shape[0], testing_targets.shape[1]))
# Learning rate scheduler
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=10, verbose = 1, min_lr=0)
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=7)
for i in range(ITERATIONS):
print(f'We are in iteration {i}')
# if i % 10 == 0: print(i)
num_ion_counts, num_isotopolouges, x_train, y_train, x_val, y_val = preprocess_data(training_features, training_targets, testing_split = False)
isotopolouge_names = list(training_targets.columns)
# define model
model = FML_regression_model(num_ion_counts, num_isotopolouges, 0.01)
history = model.fit(x_train, y_train, batch_size = BATCH_SIZE, verbose=1, validation_data = (x_val, y_val), epochs=EPOCHS, callbacks=[reduce_lr, early_stopping])
x_test = testing_features.to_numpy()
prediction = model.predict(x_test)
predictions[i] = prediction
# np.save(f'uncertainty/{file_name}-data.npy', predictions)
np.savez_compressed(f'uncertainty/{file_name}-data.npz', predictions)
return predictions, isotopolouge_names
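# Usage sketch (continuing the dataset variables above; file_name is just an output label):
# preds, iso_names = train_uncertainty(train_ions, train_isos, test_ions, test_isos, ITERATIONS = 10, file_name = "10")
# preds.shape  # (ITERATIONS, number of test pixels, number of isotopolouges)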
def print_evaluation_metrics(ground_truth_df, predicted_df, num_rows = 200, create_df = False, print_python_table = False, latex_table = False):
    '''
    Reports regression evaluation metrics (Spearman rho, adjusted p-value, MSE, MAE, R2) for the deep learning model, one row per isotopologue.
    '''
spearman_sorted = spearman_rankings(ground_truth_df, predicted_df, plot = False)
top_predicted_metab_names = list(spearman_sorted['isotopologue'])
median_rhos = list(spearman_sorted['median_rho'])
pvals = list(spearman_sorted['pvals'])
metric_names = ['isotopologue', 'Median Rho', 'p-value', 'MSE', 'MAE', 'R2']
evaluation_metrics = []
metabolites_used = set()
mse_list = []
mae_list = []
r2_list = []
for i, isotopologue_name in enumerate(top_predicted_metab_names): #[0:num_rows]):
metabolite_name = isotopologue_name[0:-5] if not create_df else isotopologue_name
if metabolite_name not in metabolites_used:
ground_truth = ground_truth_df.loc[:, [isotopologue_name]]
predicted = predicted_df.loc[:, [isotopologue_name]]
mse = round(mean_squared_error(ground_truth, predicted), 4)
mse_list.append(mse)
mae = round(mean_absolute_error(ground_truth, predicted), 4)
mae_list.append(mae)
r_2 = round(r2_score(ground_truth, predicted), 4)
r2_list.append(r_2)
evaluation_metrics.append([isotopologue_name, round(median_rhos[i], 4), round(pvals[i], 4), mse, mae, r_2])
if latex_table:
print(f'{isotopologue_name} & {round(median_rhos[i], 4)} & {round(pvals[i], 4)} & {mse} & {mae} & {r_2} \\\\')
metabolites_used.add(metabolite_name)
if print_python_table:
myTable = PrettyTable(metric_names)
for i in range(len(evaluation_metrics)):
myTable.add_row(evaluation_metrics[i])
print(myTable)
#spearman_sorted['MSE'] = mse_list
#spearman_sorted['MAE'] = mae_list
#spearman_sorted['R2'] = r2_list
return pd.DataFrame(evaluation_metrics, columns = metric_names)
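# Usage sketch building on the prediction above:
# metrics_df = print_evaluation_metrics(test_isos, prediction_df, create_df = True, print_python_table = True)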
def relative_metabolite_success(TIC_metabolite_names = 0, morans_invalid_isos = 0, isotopologue_metrics= 0, all_isotopologues = 0, num_bars = 65):
'''
    parameters:
    - TIC_metabolite_names: a list of the metabolite names in the total ion counts matrix. The full metabolite list has 353.
    - morans_invalid_isos: list of names of isotopologues that were identified by Moran's I to be removed.
    - isotopologue_metrics: df containing the name of each isotopologue and its regression evaluation metrics
    - all_isotopologues: list of every isotopologue name in the dataset (evaluated or not)
    - num_bars: number of metabolites to include in the stacked bar plot
    '''
metabs_success_count = dict()
metabs_set = set()
isotopologue_metrics.sort_values(by=["isotopologue"], ascending=False, inplace = True)
successful_metabs = []
for index, row in isotopologue_metrics.iterrows():
metab_name = row['isotopologue'][0:-5]
if metab_name not in metabs_set:
metabs_set.add(metab_name)
metabs_success_count[metab_name] = [0,0,0]
if row['R2'] >= 0.3:
metabs_success_count[metab_name][0] += 1
else:
metabs_success_count[metab_name][1] += 1
for isotopologue in all_isotopologues:
metab_name = isotopologue[0:-5]
if metab_name not in metabs_set:
metabs_set.add(metab_name)
metabs_success_count[metab_name] = [0,0,0]
else:
metabs_success_count[metab_name][2] += 1
# print(metabs_success_count)
    stacked_bar_plot(metabs_success_count, num_bars = num_bars)
return isotopologue_metrics
# ======================================================== CROSS VALIDATION ========================================================
def cross_validation_testing(all_invalid_isos, data_path = '/brain-m0-no-log', FML = True, tracer = 'BG', checkpoints_dir_label = "glucose", checkpoints_path = "./saved-weights", cutoff = 4):
checkpoints_dir = f'{checkpoints_path}/cross-validation-{checkpoints_dir_label}'
# List of isotopologue names that should be removed from the dataset based on Moran's I score being low across majority of replicates