from builtins import zip
from builtins import range
import pandas
import numpy as np
import my.plot
import matplotlib.pyplot as plt
import os
import sklearn.linear_model
class ConvergenceError(Exception):
"""Raised when decoder fails to converge"""
pass
def to_indicator_df(ser, bins=None, propagate_nan=True):
"""Bin series and convert to DataFrame of indicators
ser : series of data
bins : how to bin the data
If None, assume the ser is already binned (or labeled)
propagate_nan : bool
If True, then wherever ser is null, that row will be all null
in return value
Returns : DataFrame
Rows corresponding to values in `ser` outside `bins`
will be all zero.
"""
# Cut
if bins is not None:
binned = pandas.cut(ser, bins, labels=False)
unique_bins = binned.dropna().unique().astype(np.int)
else:
binned = ser
unique_bins = binned.dropna().unique()
# Indicate
indicated_l = []
for bin in unique_bins:
indicated = (binned == bin).astype(np.int)
indicated_l.append(indicated)
indicator_df = pandas.DataFrame(np.transpose(indicated_l),
index=ser.index, columns=unique_bins).sort_index(axis=1)
# Propagate nan
if propagate_nan and ser.isnull().any():
indicator_df.loc[ser.isnull(), :] = np.nan
return indicator_df
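# Illustrative usage sketch (not part of the original module): the series and
# bin edges below are made up, purely to show the indicator layout.
def _example_to_indicator_df():
    """Bin a toy series and indicate which bin each row fell into."""
    ser = pandas.Series([0.1, 0.5, 0.9, np.nan])
    ind = to_indicator_df(ser, bins=[0, 0.25, 0.75, 1.0])
    # ind has one column per occupied bin (0, 1, and 2 here); the last row is
    # all-NaN because `ser` is null there and propagate_nan defaults to True
    return ind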
def indicate_series(series):
"""Different version of to_indicator_df"""
# Indicate each unique variable within `series`
indicated_l = []
indicated_keys_l = []
# Get unique
unique_variables = sorted(series.unique())
# Indicate
for unique_variable in unique_variables:
indicated_l.append((series == unique_variable).astype(np.int))
indicated_keys_l.append(unique_variable)
# DataFrame it
df = pandas.concat(indicated_l, keys=indicated_keys_l,
names=[series.name], axis=1, verify_integrity=True)
return df
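# Illustrative usage sketch (not part of the original module): the labeled
# series below is made up.
def _example_indicate_series():
    """Indicate each unique value of a toy labeled series."""
    series = pandas.Series(['left', 'right', 'right'], name='choice')
    df = indicate_series(series)
    # df has columns 'left' and 'right' (the column index is named 'choice'),
    # with a 1 marking each row's value and 0 elsewhere
    return df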
def intify_classes(session_classes, by=('rewside', 'choice')):
"""Return integer class label for each row in `session_classes`.
session_classes : DataFrame with columns rewside, choice, servo_pos
by : tuple or None
Acceptable values:
('shape',)
('rewside',)
('choice',)
('rewside', 'choice')
('rewside', 'choice', 'servo_pos')
None
Returns : Series
index: same as session_classes.index
values: int
if by is None, all values are zero
Otherwise, it will be the class id, which varies from
0-1 if by is ('shape',), ('rewside',), or ('choice',), or
0-3 if by is ('rewside', 'choice'), or
0-11 if by is ('rewside', 'choice', 'servo_pos')
"""
# Error check
assert by in [
None,
('shape',),
('rewside',),
('choice',),
('rewside', 'choice'),
('rewside', 'choice', 'servo_pos'),
]
# Initialize to zero
intified_session_classes = pandas.Series(
np.zeros(len(session_classes), dtype=np.int),
index=session_classes.index,
)
# If by is None we just return zeros
# Otherwise do this
if by is not None:
# Replace each column with integers
replacing_dict = {
'shape': {'concave': 0, 'convex': 1},
'rewside': {'left': 0, 'right': 1},
'choice': {'left': 0, 'right': 1},
'servo_pos': {1670: 0, 1760: 1, 1850: 2}
}
coded_session_classes = session_classes.replace(replacing_dict)
# Add factor * each column
if 'shape' in by:
# Also least significant bit
intified_session_classes += coded_session_classes['shape']
if 'rewside' in by:
# Least significant bit
intified_session_classes += coded_session_classes['rewside']
if 'choice' in by:
# Middle (or most significant)
intified_session_classes += 2 * coded_session_classes['choice']
if 'servo_pos' in by:
# Multiply by 4 because rewside and choice together take 4 values
# Slowest-changing bit
intified_session_classes += 4 * coded_session_classes['servo_pos']
return intified_session_classes
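# Illustrative usage sketch (not part of the original module): the trial
# classes below are made up.
def _example_intify_classes():
    """Intify a toy set of trial classes by rewside and choice."""
    session_classes = pandas.DataFrame({
        'rewside': ['left', 'right', 'left', 'right'],
        'choice': ['left', 'left', 'right', 'right'],
        'servo_pos': [1670, 1760, 1850, 1670],
    })
    class_ids = intify_classes(session_classes, by=('rewside', 'choice'))
    # rewside is the least significant bit and choice contributes 2,
    # so class_ids is [0, 1, 2, 3]
    return class_ids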
def normalize_features(session_features):
"""Normalize features
Returns: norm_session_features, normalizing_mu, normalizing_sigma
"""
norm_session_features = session_features.copy()
# Demean
normalizing_mu = norm_session_features.mean()
norm_session_features = norm_session_features - normalizing_mu
# Fill with zero here so that NaNs cannot impact the result
norm_session_features = norm_session_features.fillna(0)
# Scale, AFTER fillna, so that the scale is really 1
normalizing_sigma = norm_session_features.std()
norm_session_features = norm_session_features / normalizing_sigma
# Fillna again, in the case that every feature was the same
# in which case it all became inf after scaling
norm_session_features = norm_session_features.fillna(0)
norm_session_features = norm_session_features.replace(
{-np.inf: 0, np.inf: 0})
return norm_session_features, normalizing_mu, normalizing_sigma
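# Illustrative usage sketch (not part of the original module): the feature
# values below are made up.
def _example_normalize_features():
    """Normalize a toy feature matrix."""
    session_features = pandas.DataFrame(
        {'f0': [1.0, 2.0, 3.0], 'f1': [0.0, np.nan, 4.0]})
    norm, mu, sigma = normalize_features(session_features)
    # each column of norm is demeaned and scaled to unit standard deviation,
    # with nulls filled with 0 (the post-demeaning mean) before scaling;
    # a constant column would come out as all zeros rather than inf
    return norm, mu, sigma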
def stratify_and_calculate_sample_weights(strats):
"""Calculate the weighting of each sample
The samples are weighted in inverse proportion to the
frequency of the corresponding value in `strats`.
Returns: strat_id2weight, sample_weights
strat_id2weight : dict of strat value to weight
sample_weights : weight of each sample (mean 1)
"""
# Calculate weight of each strat_id
strat_id2weight = {}
for this_strat in np.unique(strats):
n_this_strat = np.sum(strats == this_strat)
weight_this_strat = len(strats) / float(n_this_strat)
strat_id2weight[this_strat] = weight_this_strat
# Sample weights
sample_weights = np.array([strat_id2weight[strat_id]
for strat_id in strats])
# Make them mean 1
sample_weights = sample_weights / sample_weights.mean()
# Return
return strat_id2weight, sample_weights
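# Illustrative usage sketch (not part of the original module): the strats
# below are made up.
def _example_stratify_and_calculate_sample_weights():
    """Weight a toy set of samples in inverse proportion to strat frequency."""
    strats = np.array([0, 0, 0, 1])
    strat_id2weight, sample_weights = (
        stratify_and_calculate_sample_weights(strats))
    # strat 0 occurs 3 times (raw weight 4/3) and strat 1 once (raw weight 4);
    # after rescaling to mean 1, sample_weights is [2/3, 2/3, 2/3, 2]
    return strat_id2weight, sample_weights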
def stratified_split_data(stratifications, n_splits=3,
shuffle=False, random_seed=None, n_tune_splits=1,
):
"""Stratify data into different groups, equalizing by class.
This defines `n_splits` "splits" of the data. Within each split, each
row is assigned to "train", "test", or (optionally) "tune" sets. Care
is taken to equalize the representation of each stratification in each
set of each split, as much as possible.
This is an example of the results applied to a small dataset.
split          0      1      2      3      4      5      6
strat trial
0     57   train  train  train  train   test   tune  train
      80   train  train  train  train  train   test   tune
      90    tune  train  train  train  train  train   test
3     61   train  train   test   tune  train  train  train
      64   train  train  train   test   tune  train  train
      98   train  train  train  train   test   tune  train
      109  train  train  train  train  train   test   tune
      117   tune  train  train  train  train  train   test
Because there are only 3 trials in stratification 0, some splits will
have no representatives of this stratification in the testing set.
If there are at least `n_splits` examples of each stratification, then
each split will have at least one example in the test set.
If all of the stratifications are smaller than `n_splits`, then it is
possible that one of the splits will have no testing or tuning set at all.
Everything is implemented with circular shifts to equalize representation
as much as possible.
Each row is in the test set on exactly one split, and in the tuning
set on exactly `n_tune_splits` splits. It is in the training set on
the remaining splits. `n_tune_splits` can be zero.
Each stratification is handled completely separately from the others.
To break symmetry, the first example in each stratification is assigned
to be in the test set on a random split, and from then on the shift
is circular. Without this randomness, split 0 would tend to have larger
test sets than the remaining splits.
stratifications : Series
index : however the data is indexed, e.g., session * trial
values : class of that row
If no stratification is desired, just provide the same value for each
row, e.g.: pandas.Series(np.ones(len(big_tm)), index=big_tm.index)
n_splits : int
Number of splits, and number of columns in the result.
n_tune_splits : int
Number of splits each row should be in the tuning set.
Must be in the range [0, n_splits - 2].
shuffle : bool
If True, randomize the order of the trials within each stratification
before applying the circular shift.
random_seed : int, or None
If not None, call np.random.seed(random_seed)
If None, do not change random seed.
group_names
Not a parameter; this list is built internally as
['test'] + n_tune_splits * ['tune'] + n_train_splits * ['train'].
For instance, if n_splits = 5 and n_tune_splits = 1, group_names is
['test', 'tune', 'train', 'train', 'train'], i.e., 1 part testing,
1 part tuning, and 3 parts training.
The allocation is done by circularly shifting this list, not by randomly
sampling, so the results are always nearly exactly partitioned equally.
Returns : DataFrame
index : same as stratifications.index
columns : range(n_splits)
values : 'test', 'tune', or 'train'
"""
## Initialize
# Set state
if random_seed is not None:
np.random.seed(random_seed)
# Set group_names
# This determines the relative prevalence of each set
# And the relationship between adjacent splits
n_train_splits = n_splits - n_tune_splits - 1
assert n_train_splits > 0
assert n_tune_splits >= 0
group_names = (
['test'] +
n_tune_splits * ['tune'] +
n_train_splits * ['train'])
# Identify the unique values of `stratifications`
unique_strats = np.sort(np.unique(stratifications))
## Define a random strat_offset for each stratification
# Choose a value in range(n_splits) for each value in unique_strats
# Use permutations to avoid repeating too frequently
# This many runs through range(n_splits)
n_perms = int(np.ceil(len(unique_strats) / n_splits))
# Concatenate each run
strat_offsets = np.concatenate([
np.random.permutation(range(n_splits))
for n_perm in range(n_perms)])
# Truncate to same length
strat_offsets = strat_offsets[:len(unique_strats)]
## Generate the group_shift of each entry in stratifications
# Consider each stratification separately
group_shift_of_each_index_l = []
group_shift_of_each_index_keys_l = []
for strat, strat_offset in zip(unique_strats, strat_offsets):
# Find the corresponding indices into stratifications
# Note: raw indices, not values in stratifications.index
indices = np.where(stratifications == strat)[0]
# Shuffle them
if shuffle:
np.random.shuffle(indices)
# Assign each value in `indices` a "shift"
# This is the split on which that row will be in the test set
# It can also be interpreted as the circular shift to apply
# to `group_names` to get the set for each split for this row.
# This works because 'test' is always first in 'group_names'.
#
# Also use strat_offset here to uniformly shift all rows
# Otherwise the first row in this stratification always gets assigned
# to the test set on split 0, which is a problem because split 0 will
# thus always have the largest test set.
group_shift_of_each_index = np.mod(
strat_offset + np.arange(len(indices), dtype=np.int),
n_splits)
# Convert to Series and store
group_shift_of_each_index_l.append(
pandas.Series(group_shift_of_each_index,
index=stratifications.iloc[indices].index))
group_shift_of_each_index_keys_l.append(strat)
# Concat
# Indexed now by strat * trial; no longer in order of `stratifications`
group_shift_ser = pandas.concat(
group_shift_of_each_index_l,
keys=group_shift_of_each_index_keys_l, names=['strat'],
).sort_index()
## Use group_shift to assign each row its groups
# Columns: splits
# Index: strat * trial
# Values: name of set (test, tune, train)
# Each row is just a circular shift of group_names
set_by_split_strat_and_trial = pandas.DataFrame(
[np.roll(group_names, shift) for shift in group_shift_ser.values],
index=group_shift_ser.index,
columns=pandas.Series(range(n_splits), name='split')
)
# Debugging: count set sizes
# For stratifications with fewer examples than `n_splits`,
# any given split may have no examples in one of the sets.
# In principle, if all the stratifications are smaller than `n_splits`,
# it's possible that some split may have no training or tuning examples
# for all stratifications
# Check: set_sizes.sum(level='split')
set_sizes = set_by_split_strat_and_trial.stack().groupby(
['strat', 'split']).value_counts().unstack().fillna(0).astype(np.int)
# Drop the 'strat' level and sort by trial to return
res = set_by_split_strat_and_trial.droplevel('strat').sort_index()
assert res.index.equals(stratifications.index)
return res
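# Illustrative usage sketch (not part of the original module): the
# stratifications below are made up.
def _example_stratified_split_data():
    """Split 12 toy trials from two stratifications over 4 splits."""
    stratifications = pandas.Series(
        [0] * 8 + [1] * 4,
        index=pandas.Index(range(12), name='trial'))
    folds = stratified_split_data(
        stratifications, n_splits=4, n_tune_splits=1, random_seed=0)
    # folds is indexed like `stratifications` with one column per split;
    # each row is 'test' on exactly one split, 'tune' on exactly one split,
    # and 'train' on the remaining two
    return folds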
def logregress2(
features, labels, train_indices, test_indices,
sample_weights, strats=None, regularization=10**5,
testing_set_name='test', max_iter=10000, non_convergence_action='error',
solver='liblinear', tol=1e-6, min_training_points=10,
balancing_method=None,
):
"""Run cross-validated logistic regression
sample_weights : the weight of each sample
This is now required, even if it's supposed to be 1 for
all of the rows.
strats : the class id of each sample
balancing_method : string
If 'class weighting' or None:
Does nothing special, uses the `sample_weights` provided,
which should already have been balanced if desired.
If 'subsampling', then subsamples the training set.
In this case strats must be provided
In either case, if a strat is totally missing from the training
set, this will probably have undesired results by ignoring
that strat completely.
testing_set_name : this is used to set the values in the 'set' column
of the per_row_df, and also in scores_df
If this is a tuning set, pass 'tune'
solver, max_iter, tol : passed to sklearn.linear_model.LogisticRegression
With solver == 'lbfgs', I had to use huge n_iter (1e6) and even
then sometimes got gradient errors (LNSRCH). Going back to 'liblinear'.
non_convergence_action : string
if 'error': raise error when doesn't converge
if 'pass': do nothing
This doesn't catch all ConvergenceWarning, like LNSRCH
For that, do this:
import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.simplefilter("error", ConvergenceWarning)
Returns: dict
'weights': logreg.coef_[0],
'intercept': logreg.intercept_[0],
'scores_df': scores_df,
'per_row_df': res_df,
"""
## Error check
if len(train_indices) < min_training_points:
raise ValueError(
"must provide at least {} training examples, I got {}".format(
min_training_points, len(train_indices)))
if len(np.unique(labels[train_indices])) <= 1:
raise ValueError("must provide more than one label type")
## Split out test and train sets
X_train = features[train_indices]
X_test = features[test_indices]
y_train = labels[train_indices]
y_test = labels[test_indices]
sample_weights_train = sample_weights[train_indices]
sample_weights_test = sample_weights[test_indices]
if strats is not None:
strats_train = strats[train_indices]
strats_test = strats[test_indices]
## Sub-sample the training set
## TODO: repeat this sub-sampling many times and average the results
if balancing_method == 'subsampling':
# Double-check that sample weights are uniform (all ones)
assert (sample_weights == 1).all()
# Double-check that strats were provided
assert strats is not None
# Size of the minimal class
rownum2strat = pandas.Series(strats_train)
strat2n_rows = rownum2strat.value_counts().sort_index()
new_groupsize = strat2n_rows.min()
# Choose new_groupsize rows from each strat
chosen_rows_l = []
for strat in rownum2strat.unique():
strat_rows = rownum2strat[rownum2strat == strat]
chosen_rows = np.random.choice(
strat_rows.index.values, new_groupsize, replace=False)
chosen_rows_l.append(chosen_rows)
# Select only chosen rows
all_chosen_rows = np.sort(np.concatenate(chosen_rows_l))
# Error check
assert (
rownum2strat.loc[all_chosen_rows].value_counts() == new_groupsize
).all()
# Apply
X_train = X_train[all_chosen_rows]
y_train = y_train[all_chosen_rows]
sample_weights_train = sample_weights_train[all_chosen_rows]
## Fit
# Initialize fitter
logreg = sklearn.linear_model.LogisticRegression(
C=(1.0 / regularization),
max_iter=max_iter, solver=solver, tol=tol,
)
# Fit, applying the sample weights
logreg.fit(
X_train, y_train, sample_weight=sample_weights_train,
)
# Deal with non-convergence
if logreg.n_iter_.item() >= logreg.max_iter:
if non_convergence_action == 'error':
raise ConvergenceError("non-convergence in decoder")
elif non_convergence_action == 'pass':
pass
else:
raise ValueError("unknown non_convergence_action: {}".format(
non_convergence_action))
## Extract results
# The per-row predictions and scores
predictions = logreg.predict(features)
res_df = pandas.DataFrame.from_dict({
'dec_function': logreg.decision_function(features),
'proba': logreg.predict_proba(features)[:, 1],
'prediction': predictions,
'pred_correct': (predictions == labels).astype(np.float),
'actual': labels,
'sample_weight': sample_weights,
})
res_df['weighted_correct'] = res_df['sample_weight'] * res_df['pred_correct']
# Assign train or test set to each row
res_df['set'] = ''
res_df['set'].values[test_indices] = testing_set_name
res_df['set'].values[train_indices] = 'train'
# The overall scores
scores_df = res_df.loc[
res_df['set'].isin([testing_set_name, 'train']),
['set', 'pred_correct', 'weighted_correct']
].groupby('set').mean()
## Return
output = {
'weights': logreg.coef_[0],
'intercept': logreg.intercept_[0],
'scores_df': scores_df,
'per_row_df': res_df,
}
return output
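# Illustrative usage sketch (not part of the original module): the synthetic
# features, labels, and split below are made up, purely to show the call
# signature and the shape of the returned dict.
def _example_logregress2():
    """Fit the cross-validated logistic regression on synthetic data."""
    np.random.seed(0)
    n_rows = 40
    features = np.random.randn(n_rows, 3)
    labels = (features[:, 0] > 0).astype(int)
    sample_weights = np.ones(n_rows)
    train_indices = np.arange(0, 30)
    test_indices = np.arange(30, n_rows)
    res = logregress2(
        features, labels, train_indices, test_indices,
        sample_weights=sample_weights, regularization=10,
    )
    # res['scores_df'] has the mean accuracy of the 'train' and 'test' sets;
    # res['per_row_df'] has per-row predictions, probabilities, and the
    # decision function; res['weights'] and res['intercept'] are the fit
    return res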
def tuned_logregress(folds, norm_session_features, intified_labels,
sample_weights, reg_l, intified_session_classes=None,
balancing_method=None):
"""Train over all splits and all regularizations, evaluate on the tuning set
balancing_method : string, or None
Simply passed to logregress2
intified_session_classes : Series, or None
Despite the default of None, this must be provided, because its values
are passed to logregress2 as `strats`.
If some stratifications are very small, they may be missing from the
test, tune, or train set on some splits. A stratification missing from
the test or tune set is not so bad, because in the end we combine data
across the splits on which each trial was in the test/tune set, so every
trial is included. It is worse when a stratification is missing from the
training set: it then cannot affect the coefficients, and the sample
weights are ill-posed with a missing stratification. High-fold
cross-validation helps: in the limit of leave-one-out, each stratification
is missing from the training set at most once.
"""
## Iterate over splits
tuning_results_l = []
tuning_scores_l = []
tuning_keys_l = []
for split in folds.columns:
## Extract data masks and indices
test_data_mask = folds.loc[:, split] == 'test'
tune_data_mask = folds.loc[:, split] == 'tune'
train_data_mask = folds.loc[:, split] == 'train'
test_indices = np.where(test_data_mask.values)[0]
tune_indices = np.where(tune_data_mask.values)[0]
train_indices = np.where(train_data_mask.values)[0]
## Iterate over regularizations
for reg in reg_l:
# Train on the training set, test on the tuning set
logreg_res = logregress2(
features=norm_session_features.values,
labels=intified_labels.values,
train_indices=train_indices,
test_indices=tune_indices,
sample_weights=sample_weights,
regularization=10 ** reg,
testing_set_name='tune',
strats=intified_session_classes.values,
balancing_method=balancing_method,
)
# Rename the sets, because the unused rows marked with
# '' were actually 'test'
logreg_res['per_row_df']['set'] = (
logreg_res['per_row_df']['set'].replace(
{'': 'test'})
)
# Add the index back
logreg_res['per_row_df'].index = norm_session_features.index
logreg_res['weights'] = pandas.Series(logreg_res['weights'],
index=norm_session_features.columns, name='weights')
# Error check
assert (logreg_res['per_row_df']['set'] == folds.loc[:, split]).all()
# Store
tuning_results_l.append(logreg_res)
tuning_keys_l.append((split, reg))
return tuning_keys_l, tuning_results_l
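# Illustrative usage sketch (not part of the original module): this wires
# together stratified_split_data, stratify_and_calculate_sample_weights, and
# tuned_logregress on synthetic data; every size and value below is made up.
def _example_tuned_logregress():
    """Tune the regularization of a toy decoder over three splits."""
    np.random.seed(0)
    n_rows = 60
    index = pandas.Index(range(n_rows), name='trial')
    norm_session_features = pandas.DataFrame(
        np.random.randn(n_rows, 4), index=index,
        columns=['f0', 'f1', 'f2', 'f3'])
    intified_labels = (norm_session_features['f0'] > 0).astype(int)
    intified_session_classes = intified_labels.copy()
    folds = stratified_split_data(
        intified_session_classes, n_splits=3, n_tune_splits=1, random_seed=0)
    strat_id2weight, sample_weights = stratify_and_calculate_sample_weights(
        intified_session_classes.values)
    tuning_keys, tuning_results = tuned_logregress(
        folds, norm_session_features, intified_labels, sample_weights,
        reg_l=[0, 1, 2],
        intified_session_classes=intified_session_classes)
    # tuning_keys is a list of (split, regularization exponent) tuples, and
    # each entry of tuning_results holds 'weights', 'intercept', 'scores_df'
    # (with a 'tune' row), and 'per_row_df' for that split and regularization
    return tuning_keys, tuning_results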
def recalculate_decfun_standard(features, mu, sigma, weights, intercepts):
"""Recalculate the decision function from features in the standard way.
In the standard approach, we first standardize the features (zero mean and
unit variance), multiply by the weights, and add the intercept.
features : DataFrame of shape (n_trials, n_features)
index: MultiIndex (session, trial)
columns: MultiIndex of features
These are the raw features. They can contain null values.
mu, sigma : DataFrame of shape (n_sessions, n_features)
index: session
columns: MultiIndex of features
These are the mean and standard deviation of the raw features.
When there is no data in features, mu will be null and the standard
deviation should be zero; the corresponding terms in the recalculation
are filled with zeros.
weights : DataFrame of shape (n_sessions * n_decode_labels, n_features)
index : MultiIndex (session, decode_label)
columns : MultiIndex of features
These are the coefficients from the model. They should not be null.
intercepts : Series of length (n_sessions * n_decode_labels)
index : MultiIndex (session, decode_label)
Returns: Series
index : MultiIndex (session, decode_label, trial)
This is the decision function for each trial.
"""
# Debugging: this was for a single session * label
#~ srecalc_decfun = sfeatures.sub(snmu).divide(snsig).mul(sweights).fillna(
#~ 0).sum(1) + sicpt
return features.sub(mu).divide(sigma).mul(weights).fillna(
0).sum(1).add(intercepts)
def recalculate_decfun_raw(features, mu, sigma, weights, intercepts):
"""Recalculate the decision function from features in the raw way.
This is a reordering of the standard formula, so that the weights
are directly interpretable as the effect of a single instance of
a feature (e.g., a single contact).
Because we want to use the raw non-standardized features, we have to
apply the feature scaling to the weights. These "scaled weights" are
the weights divided by the standard deviation of the features. We
also have to account for this change in the intercept.
decfun = weights * (features - mu) / sigma + intercept
decfun = (weights / sigma) * features + (-mu * weights / sigma) + intercept
decfun = scaled_weights * features + (-scaled_weights * mu + intercept)
decfun = scaled_weights * features + icpt_transformed
The inputs are the same as recalculate_decfun_standard.
Returns: scaled_weights, icpt_transformed, decfun
scaled_weights : DataFrame, same shape as weights
The scaled weights
icpt_transformed : Series, same shape as intercepts
The transformed intercepts
decfun : Series, same shape as the result of recalculate_decfun_standard
The recalculated decision function
"""
# Debugging: this was for a single session * label
#~ sweights_unscaled = sweights.divide(snsig).fillna(0)
#~ sicpt_transformed = -sweights_unscaled.mul(snmu).fillna(0).sum() + sicpt
#~ srecalc_decfun2 = (
#~ sfeatures.fillna(sfeatures.mean()).mul(sweights_unscaled).fillna(0).sum(1)
#~ + sicpt_transformed
#~ )
# Scale the weights
scaled_weights = weights.divide(sigma).fillna(0)
# Transform the intercepts
icpt_transformed = (-scaled_weights.mul(mu).fillna(0).sum(1)).add(
intercepts)
# Must pre-fill model_features with the session mean of each feature
filled_features = pandas.concat([
features.loc[session].fillna(features.loc[session].mean())
for session in features.index.levels[0]],
axis=0, keys=features.index.levels[0]
)
# Recalculate
decfun = filled_features.mul(scaled_weights).fillna(0).sum(1).add(
icpt_transformed)
return scaled_weights, icpt_transformed, decfun
def recalculate_decfun_partitioned(features, mu, sigma, weights, intercepts,
raw_mask):
"""Recalculate the decision function using some raw and some scaled.
This is a combination of the approaches in recalculate_decfun_standard
and recalculate_decfun_raw. Some "raw" features are handled in the raw
way and the rest are handled in the scaled way.
The inputs are the same as recalculate_decfun_standard, except for:
raw_mask : boolean Series, same shape as the feature columns
True for raw features
Returns: features_part, weights_part, icpt_transformed, decfun
features_part : DataFrame, same shape as features
The transformed features. Raw features are unchanged from `features`,
and the rest are scaled as in the standard way.
weights_part : DataFrame, same shape as weights
The transformed weights. The weights of raw features are scaled,
and the rest are the same as in `weights`.
icpt_transformed : Series, same shape as intercepts
The transformed intercepts, now including the effect of the
raw weights.
decfun : Series, same shape as the result of recalculate_decfun_standard
The recalculated decision function
"""
# Debugging
#~ sraw_mask = sweights.index.get_level_values('family') == 'summed_count'
#~ sfeatures_part = sfeatures.copy()
#~ sweights_part = sweights.copy()
#~ sfeatures_part = sfeatures_part.fillna(sfeatures_part.mean())
#~ sfeatures_part.loc[:, ~sraw_mask] = sfeatures_part.loc[:, ~sraw_mask].sub(
#~ snmu).divide(snsig)
#~ sweights_part.loc[raw_mask] = sweights_part.loc[raw_mask].divide(
#~ snsig.loc[raw_mask]).fillna(0)
#~ srecalc_decfun4 = (
#~ sfeatures_part.mul(sweights_part).fillna(0).sum(1) +
#~ raw_sicpt_transformed + sicpt
#~ )
#~ raw_sicpt_transformed = -sweights_part.loc[raw_mask].mul(snmu.loc[raw_mask]
#~ ).fillna(0).sum()
# Copy because some will be changed
features_part = features.copy()
weights_part = weights.copy()
# Fill all features with their session mean
features_part = pandas.concat([
features_part.loc[session].fillna(features_part.loc[session].mean())
for session in features_part.index.levels[0]],
axis=0, keys=features_part.index.levels[0]
)
# Normalize some of the features (the standard way) but leave the
# raw features alone
features_part.loc[:, ~raw_mask] = features_part.loc[:, ~raw_mask].sub(
mu.loc[:, ~raw_mask]).divide(
sigma.loc[:, ~raw_mask])
# Scale the raw weights
weights_part.loc[:, raw_mask] = weights_part.loc[:, raw_mask].divide(
sigma.loc[:, raw_mask]).fillna(0)
# Transform the intercept corresponding to raw weights
raw_icpt_component = -weights_part.loc[:, raw_mask].mul(
mu.loc[:, raw_mask]).fillna(0).sum(1)
icpt_transformed = raw_icpt_component + intercepts
# Compute
decfun = (
features_part.mul(weights_part).fillna(0).sum(1).add(
icpt_transformed)
)
return features_part, weights_part, icpt_transformed, decfun
def bin_features_into_analysis_bins(features_df, C2_whisk_cycles, BINS):
"""Bin locked_t by BINS and add analysis_bin as level
Drops rows that are not contained by a bin
"""
# Make a copy
features_df = features_df.copy()
# Get index as df
idxdf = features_df.index.to_frame().reset_index(drop=True)
# Join the peak frame
idxdf = idxdf.join(C2_whisk_cycles['locked_t'],
on=['session', 'trial', 'cycle'])
# Cut the peak frame by the frame bins
idxdf['analysis_bin'] = pandas.cut(
idxdf['locked_t'],
bins=BINS['bin_edges_t'], labels=False, right=False)
# Mark null bins with -1 (they will later be dropped)
idxdf['analysis_bin'] = idxdf['analysis_bin'].fillna(-1).astype(np.int)
# Reinsert this index
features_df.index = pandas.MultiIndex.from_frame(
idxdf[['session', 'trial', 'analysis_bin', 'cycle']])
# Drop frame_bin == -1, if any
features_df = features_df.drop(-1, level='analysis_bin')
# Sort
features_df = features_df.sort_index()
return features_df
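# Illustrative usage sketch (not part of the original module): the index
# layout of features_df and C2_whisk_cycles (session * trial * cycle) and the
# structure of BINS (a dict with a 'bin_edges_t' array) are assumed from how
# this function uses them; all values below are made up.
def _example_bin_features_into_analysis_bins():
    """Bin a toy feature table into analysis bins by whisk-cycle time."""
    idx = pandas.MultiIndex.from_tuples(
        [('s0', 0, 0), ('s0', 0, 1), ('s0', 1, 0)],
        names=['session', 'trial', 'cycle'])
    features_df = pandas.DataFrame({'feat': [1.0, 2.0, 3.0]}, index=idx)
    C2_whisk_cycles = pandas.DataFrame(
        {'locked_t': [-0.1, 0.05, 0.2]}, index=idx)
    BINS = {'bin_edges_t': np.array([0.0, 0.1, 0.3])}
    binned = bin_features_into_analysis_bins(features_df, C2_whisk_cycles, BINS)
    # the first row (locked_t == -0.1) falls outside the bin edges and is
    # dropped; the others are assigned analysis_bin 0 and 1, which is inserted
    # as a new index level between 'trial' and 'cycle'
    return binned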
def load_model_results(model_dir):
# Load results from this reduced model
preds = pandas.read_pickle(os.path.join(model_dir,
'finalized_predictions')).sort_index()
weights = pandas.read_pickle(os.path.join(model_dir,
'meaned_weights')).sort_index()
intercepts = pandas.read_pickle(os.path.join(model_dir,
'meaned_intercepts')).sort_index()
try:
normalizing_mu = pandas.read_pickle(os.path.join(model_dir,
'big_normalizing_mu')).sort_index()
normalizing_sigma = pandas.read_pickle(os.path.join(model_dir,
'big_normalizing_sigma')).sort_index()
except FileNotFoundError:
print("warning: obsolete filenames")
normalizing_mu = pandas.read_pickle(os.path.join(model_dir,
'normalizing_mu')).sort_index()
normalizing_sigma = pandas.read_pickle(os.path.join(model_dir,
'normalizing_sigma')).sort_index()
# Fix the way mu and sigma were stored
# They have a redundant choice/rewside level
normalizing_mu = normalizing_mu.xs('rewside', level=1)
normalizing_sigma = normalizing_sigma.xs('rewside', level=1)
# Get only session on the index, to match features
normalizing_mu = normalizing_mu.unstack('session').T
normalizing_sigma = normalizing_sigma.unstack('session').T
# Also put session on the index of weights
weights = weights.T
# This isn't necessary
#~ normalizing_mu = normalizing_mu.loc[:, weights.columns]
#~ normalizing_sigma = normalizing_sigma.loc[:, weights.columns]
assert weights.shape[1] == normalizing_mu.shape[1]
assert weights.shape[1] == normalizing_sigma.shape[1]
# Return
res = {
'preds': preds,
'weights': weights,
'intercepts': intercepts,
'normalizing_mu': normalizing_mu,
'normalizing_sigma': normalizing_sigma,
}
return res
def partition(model_features, model_results, raw_mask):
"""Partition the features and weights into raw and standard
model_features
model_results
raw_mask : array of bool
Indicates which columns (features) of the weights are raw
"""
# Check standard decfun
# Throughout, slightly different than the original decfun because we're using the
# mean weights instead of the fold weights
decfun_standard = (
recalculate_decfun_standard(
model_features,
model_results['normalizing_mu'],
model_results['normalizing_sigma'],
model_results['weights'],
model_results['intercepts']
)
)
# Check raw
scaled_weights, icpt_transformed, decfun_raw = (
recalculate_decfun_raw(
model_features,
model_results['normalizing_mu'],
model_results['normalizing_sigma'],
model_results['weights'],
model_results['intercepts']
)
)
# Calculate partitioned
features_part, weights_part, icpt_transformed_part, decfun_part = (
recalculate_decfun_partitioned(
model_features,
model_results['normalizing_mu'],
model_results['normalizing_sigma'],
model_results['weights'],
model_results['intercepts'],
raw_mask
)
)
# Error check
# TODO: get rid of the dropna(), there shouldn't be nulls here
assert np.allclose(decfun_raw.dropna().values, decfun_standard.dropna().values)
assert np.allclose(decfun_raw.dropna().values, decfun_part.dropna().values)
# Return
res = {
'raw_mask': raw_mask,
'features_part': features_part,
'weights_part': weights_part,
'icpt_transformed_part': icpt_transformed_part,
'decfun_part': decfun_part,
}
return res
def iterate_behavioral_classifiers_over_targets_and_sessions(
feature_set,
labels,
reg_l,
to_optimize,
n_splits,
stratify_by,
decode_targets=('rewside', 'choice'),
verbose=True,
min_class_size_warn_thresh=2,
random_seed=None,
balancing_method='sample weighting',
):
"""Runs behavioral classifier on all targets and sessions
Procedure
* For each target * session:
* Intifies the classes (using `intify_classes`)
* Stratifies (using `stratify_and_calculate_sample_weights` and
`stratified_split_data`)
* Normalizes features (using `normalize_features`)
Errors if any feature becomes >25 z-scores
* Checks for size of training and tuning data
* Runs classifier (using `tuned_logregress`)
* Extracts the scores on the tuning set and choose best reg
* Extracts predictions
* Concatenates over all targets and sessions
verbose : bool
If True, print each session name as it is analyzed. Also,
print warnings about min class size.
min_class_size_warn_thresh : int
If `verbose` and the size of any stratification is less than
this value, print a warning.
balancing_method : string
If 'sample weighting': Calculates sample_weight based on stratifications
If 'subsampling': Subsamples by stratification
Returns: dict
'best_reg_by_split'
'scores_by_reg_and_split'
'tuning_scores'
'finalized_predictions'
'best_per_row_results_by_split'
'best_weights_by_split'
'best_intercepts_by_split'
'meaned_weights'
'meaned_intercepts'
'big_norm_session_features'
'big_normalizing_mu'
'big_normalizing_sigma'
"""
# Enforce that each datapoint is used for tuning and testing exactly once
# Actually this is only approximate for tuning due to complexities
# of stratification
splits_group_names = ['train'] * (n_splits - 2) + ['tune']
## Iterate over targets
# Store results here
# Everything is jointly optimized over all targets and sessions
keys_l = []
norm_session_features_l = []
normalizing_mu_l = []
normalizing_sigma_l = []
tuning_keys_l = []
tuning_results_l = []
## Iterate over sessions
for session in list(feature_set.index.levels[0]):