@unpublished{achille_emergence_2018,
title = {Emergence of {{Invariance}} and {{Disentanglement}} in {{Deep Representations}}},
author = {Achille, Alessandro and Soatto, Stefano},
date = {2018-06-28},
eprint = {1706.01350},
eprinttype = {arxiv},
primaryclass = {cs, stat},
url = {http://arxiv.org/abs/1706.01350},
urldate = {2020-02-13},
abstract = {Using established principles from Statistics and Information Theory, we show that invariance to nuisance factors in a deep neural network is equivalent to information minimality of the learned representation, and that stacking layers and injecting noise during training naturally bias the network towards learning invariant representations. We then decompose the cross-entropy loss used during training and highlight the presence of an inherent overfitting term. We propose regularizing the loss by bounding such a term in two equivalent ways: One with a Kullback-Leibler term, which relates to a PAC-Bayes perspective; the other using the information in the weights as a measure of complexity of a learned model, yielding a novel Information Bottleneck for the weights. Finally, we show that invariance and independence of the components of the representation learned by the network are bounded above and below by the information in the weights, and therefore are implicitly optimized during training. The theory enables us to quantify and predict sharp phase transitions between underfitting and overfitting of random labels when using our regularized loss, which we verify in experiments, and sheds light on the relation between the geometry of the loss function, invariance properties of the learned representation, and generalization error.},
archiveprefix = {arXiv},
annotation = {citecount: 00000},
file = {/Users/fariedabuzaid/Zotero/storage/DDBPJIAS/Achille and Soatto - 2018 - Emergence of Invariance and Disentanglement in Dee.pdf}
}
@inproceedings{adi_turning_2018,
title = {Turning {{Your Weakness Into}} a {{Strength}}: {{Watermarking Deep Neural Networks}} by {{Backdooring}}},
shorttitle = {Turning {{Your Weakness Into}} a {{Strength}}},
author = {Adi, Yossi and Baum, Carsten and Cisse, Moustapha and Pinkas, Benny and Keshet, Joseph},
date = {2018},
eprint = {1802.04633},
eprinttype = {arxiv},
pages = {1615--1631},
url = {https://www.usenix.org/conference/usenixsecurity18/presentation/adi},
urldate = {2021-10-13},
abstract = {Deep Neural Networks have recently gained lots of success after enabling several breakthroughs in notoriously challenging problems. Training these networks is computationally expensive and requires vast amounts of training data. Selling such pre-trained models can, therefore, be a lucrative business model. Unfortunately, once the models are sold they can be easily copied and redistributed. To avoid this, a tracking mechanism to identify models as the intellectual property of a particular vendor is necessary. In this work, we present an approach for watermarking Deep Neural Networks in a black-box way. Our scheme works for general classification tasks and can easily be combined with current learning algorithms. We show experimentally that such a watermark has no noticeable impact on the primary task that the model is designed for and evaluate the robustness of our proposal against a multitude of practical attacks. Moreover, we provide a theoretical analysis, relating our approach to previous work on backdooring.},
archiveprefix = {arXiv},
eventtitle = {27th {{USENIX Security Symposium}} ({{USENIX Security}} 18)},
isbn = {978-1-939133-04-5},
langid = {english},
annotation = {video: https://youtu.be/Fj-4i7BwKGM},
file = {/Users/fariedabuzaid/Zotero/storage/JPP9NQMP/Adi et al. - 2018 - Turning Your Weakness Into a Strength Watermarkin.pdf;/Users/fariedabuzaid/Zotero/storage/ZW7M7Q4X/security18_slides_baum.pdf}
}
@book{aggarwal_outlier_2017,
title = {Outlier {{Analysis}}},
author = {Aggarwal, Charu C.},
date = {2017},
publisher = {{Springer International Publishing}},
location = {{Cham}},
doi = {10.1007/978-3-319-47578-3},
url = {http://link.springer.com/10.1007/978-3-319-47578-3},
urldate = {2021-08-09},
abstract = {Outliers are also referred to as abnormalities, discordants, deviants, or anomalies in the data mining and statistics literature. In most applications, the data is created by one or more generating processes, which could either reflect activity in the system or observations collected about entities. When the generating process behaves unusually, it results in the creation of outliers. Therefore, an outlier often contains useful information about abnormal characteristics of the systems and entities that impact the data generation process. The recognition of such unusual characteristics provides useful application-specific insights.},
isbn = {978-3-319-47577-6 978-3-319-47578-3},
langid = {english},
file = {/Users/fariedabuzaid/Zotero/storage/QN53AWRZ/Aggarwal - 2017 - Outlier Analysis.pdf}
}
@inproceedings{ahn_spanning_2022,
title = {Spanning {{Tree-based Graph Generation}} for {{Molecules}}},
author = {Ahn, Sungsoo and Chen, Binghong and Wang, Tianzhe and Song, Le},
date = {2022},
url = {https://openreview.net/forum?id=w60btE_8T2m},
urldate = {2022-04-28},
abstract = {In this paper, we explore the problem of generating molecules using deep neural networks, which has recently gained much interest in chemistry. To this end, we propose a spanning tree-based graph...},
eventtitle = {International {{Conference}} on {{Learning Representations}} ({{ICLR}} 2022)},
langid = {english},
file = {/Users/fariedabuzaid/Zotero/storage/86FJKTHZ/Ahn et al. - 2021 - Spanning Tree-based Graph Generation for Molecules.pdf}
}
@unpublished{al-aradi_solving_2018,
title = {Solving {{Nonlinear}} and {{High-Dimensional Partial Differential Equations}} via {{Deep Learning}}},
author = {Al-Aradi, Ali and Correia, Adolfo and Naiff, Danilo and Jardim, Gabriel and Saporito, Yuri},
date = {2018-11-21},
eprint = {1811.08782},
eprinttype = {arxiv},
primaryclass = {q-fin},
url = {http://arxiv.org/abs/1811.08782},
urldate = {2021-03-10},
abstract = {In this work we apply the Deep Galerkin Method (DGM) described in Sirignano and Spiliopoulos (2018) to solve a number of partial differential equations that arise in quantitative finance applications including option pricing, optimal execution, mean field games, etc. The main idea behind DGM is to represent the unknown function of interest using a deep neural network. A key feature of this approach is the fact that, unlike other commonly used numerical approaches such as finite difference methods, it is mesh-free. As such, it does not suffer (as much as other numerical methods) from the curse of dimensionality associated with high-dimensional PDEs and PDE systems. The main goals of this paper are to elucidate the features, capabilities and limitations of DGM by analyzing aspects of its implementation for a number of different PDEs and PDE systems. Additionally, we present: (1) a brief overview of PDEs in quantitative finance along with numerical methods for solving them; (2) a brief overview of deep learning and, in particular, the notion of neural networks; (3) a discussion of the theoretical foundations of DGM with a focus on the justification of why this method is expected to perform well.},
archiveprefix = {arXiv},
file = {/Users/fariedabuzaid/Zotero/storage/K93K88UX/Al-Aradi et al. - 2018 - Solving Nonlinear and High-Dimensional Partial Dif.pdf}
}
@inproceedings{alemi_deep_2017,
title = {Deep {{Variational Information Bottleneck}}},
booktitle = {Proceedings of {{ICLR}} 2017},
author = {Alemi, Alex and Fischer, Ian and Dillon, Josh and Murphy, Kevin},
date = {2017-04},
location = {{Toulon, France}},
url = {https://arxiv.org/abs/1612.00410},
urldate = {2021-01-22},
eventtitle = {5th {{International Conference}} on {{Learning Representations}}},
langid = {english},
file = {/Users/fariedabuzaid/Zotero/storage/R3SU67RC/Alemi et al. - 2017 - Deep Variational Information Bottleneck.pdf}
}
@article{alom_history_2018,
title = {The {{History Began}} from {{AlexNet}}: {{A Comprehensive Survey}} on {{Deep Learning Approaches}}},
author = {Alom, Zahangir and Taha, Tarek M and Yakopcic, Chris and Westberg, Stefan and Sidike, Paheding and Nasrin, Mst Shamima},
date = {2018-09-12},
pages = {39},
url = {https://arxiv.org/abs/1803.01164},
abstract = {In recent years, deep learning has garnered tremendous success in a variety of application domains. This new field of machine learning has been growing rapidly, and has been applied to most traditional application domains, as well as some new areas that present more opportunities. Different methods have been proposed based on different categories of learning, including supervised, semi-supervised, and un-supervised learning. Experimental results show state-of-the-art performance using deep learning when compared to traditional machine learning approaches in the fields of image processing, computer vision, speech recognition, machine translation, art, medical imaging, medical information processing, robotics and control, bio-informatics, natural language processing (NLP), cybersecurity, and many others.},
langid = {english},
file = {/Users/fariedabuzaid/Zotero/storage/WTEEVLDJ/Alom et al. - The History Began from AlexNet A Comprehensive Su.pdf}
}
@inproceedings{amayuelas_neural_2022,
title = {Neural {{Methods}} for {{Logical Reasoning}} over {{Knowledge Graphs}}},
author = {Amayuelas, Alfonso and Zhang, Shuai and Rao, Xi Susie and Zhang, Ce},
date = {2022},
url = {https://openreview.net/forum?id=tgcAoUVHRIB},
urldate = {2022-04-28},
abstract = {Reasoning is a fundamental problem for computers and deeply studied in Artificial Intelligence. In this paper, we specifically focus on answering multi-hop logical queries on Knowledge Graphs...},
eventtitle = {International {{Conference}} on {{Learning Representations}} ({{ICLR}} 2022)},
langid = {english},
file = {/Users/fariedabuzaid/Zotero/storage/UF6HJNMZ/Amayuelas et al. - 2022 - Neural Methods for Logical Reasoning over Knowledg.pdf}
}
@unpublished{andrychowicz_learning_2016,
title = {Learning to Learn by Gradient Descent by Gradient Descent},
author = {Andrychowicz, Marcin and Denil, Misha and Gomez, Sergio and Hoffman, Matthew W. and Pfau, David and Schaul, Tom and Shillingford, Brendan and de Freitas, Nando},
options = {useprefix=true},
date = {2016-06-14},
eprint = {1606.04474},
eprinttype = {arxiv},
primaryclass = {cs},
pages = {17},
url = {http://arxiv.org/abs/1606.04474},
urldate = {2017-04-18},
abstract = {The move from hand-designed features to learned features in machine learning has been wildly successful. In spite of this, optimization algorithms are still designed by hand. In this paper we show how the design of an optimization algorithm can be cast as a learning problem, allowing the algorithm to learn to exploit structure in the problems of interest in an automatic way. Our learned algorithms, implemented by LSTMs, outperform generic, hand-designed competitors on the tasks for which they are trained, and also generalize well to new tasks with similar structure. We demonstrate this on a number of tasks, including simple convex problems, training neural networks, and styling images with neural art.},
archiveprefix = {arXiv},
annotation = {citecount: 00690},
file = {/Users/fariedabuzaid/Zotero/storage/ABS58KT5/Andrychowicz et al. - 2016 - Learning to learn by gradient descent by gradient .pdf}
}
@inproceedings{bai_don_2021,
title = {Don't {{Just Blame Over-parametrization}} for {{Over-confidence}}: {{Theoretical Analysis}} of {{Calibration}} in {{Binary Classification}}},
shorttitle = {Don't {{Just Blame Over-parametrization}} for {{Over-confidence}}},
author = {Bai, Yu and Mei, Song and Wang, Huan and Xiong, Caiming},
date = {2021-07-19},
eprint = {2102.07856},
eprinttype = {arxiv},
primaryclass = {cs, math, stat},
location = {{Virtual event}},
url = {http://arxiv.org/abs/2102.07856},
urldate = {2021-09-04},
abstract = {Modern machine learning models with high accuracy are often miscalibrated -- the predicted top probability does not reflect the actual accuracy, and tends to be over-confident. It is commonly believed that such over-confidence is mainly due to over-parametrization, in particular when the model is large enough to memorize the training data and maximize the confidence. In this paper, we show theoretically that over-parametrization is not the only reason for over-confidence. We prove that logistic regression is inherently over-confident, in the realizable, under-parametrized setting where the data is generated from the logistic model, and the sample size is much larger than the number of parameters. Further, this over-confidence happens for general well-specified binary classification problems as long as the activation is symmetric and concave on the positive part. Perhaps surprisingly, we also show that over-confidence is not always the case -- there exists another activation function (and a suitable loss function) under which the learned classifier is under-confident at some probability values. Overall, our theory provides a precise characterization of calibration in realizable binary classification, which we verify on simulations and real data experiments.},
archiveprefix = {arXiv},
eventtitle = {Thirty-Eighth {{International Conference}} on {{Machine Learning}} ({{ICML}} 2021)},
langid = {english},
file = {/Users/fariedabuzaid/Zotero/storage/EWZCIJTX/Bai et al. - 2021 - Don't Just Blame Over-parametrization for Over-con.pdf}
}
@inproceedings{bai_recent_2021,
title = {Recent {{Advances}} in {{Adversarial Training}} for {{Adversarial Robustness}}},
author = {Bai, Tao and Luo, Jinqi and Zhao, Jun and Wen, Bihan and Wang, Qian},
date = {2021-04-20},
eprint = {2102.01356},
eprinttype = {arxiv},
primaryclass = {cs},
location = {{Montreal, Canada}},
url = {http://arxiv.org/abs/2102.01356},
urldate = {2021-12-26},
abstract = {Adversarial training is one of the most effective approaches defending against adversarial examples for deep learning models. Unlike other defense strategies, adversarial training aims to promote the robustness of models intrinsically. During the last few years, adversarial training has been studied and discussed from various aspects. A variety of improvements and developments of adversarial training are proposed, which were, however, neglected in existing surveys. For the first time in this survey, we systematically review the recent progress on adversarial training for adversarial robustness with a novel taxonomy. Then we discuss the generalization problems in adversarial training from three perspectives. Finally, we highlight the challenges which are not fully tackled and present potential future directions.},
archiveprefix = {arXiv},
eventtitle = {30th {{International Joint Conference}} on {{Artificial Intelligence}} ({{IJCAI-21}})},
file = {/Users/fariedabuzaid/Zotero/storage/FR6WDI36/Bai et al. - 2021 - Recent Advances in Adversarial Training for Advers.pdf}
}
@inproceedings{bakker_dadi_2019,
title = {{{DADI}}: {{Dynamic Discovery}} of {{Fair Information}} with {{Adversarial Reinforcement Learning}}},
shorttitle = {{{DADI}}},
booktitle = {{{ArXiv}}},
author = {Bakker, M. and Tu, Duy Patrick and Riverón Valdés, Humberto and Gummadi, K. and Varshney, Kush R. and Weller, Adrian and Pentland, A.},
date = {2019},
url = {https://arxiv.org/abs/1910.13983},
abstract = {We introduce a framework for dynamic adversarial discovery of information (DADI), motivated by a scenario where information (a feature set) is used by third parties with unknown objectives. We train a reinforcement learning agent to sequentially acquire a subset of the information while balancing accuracy and fairness of predictors downstream. Based on the set of already acquired features, the agent decides dynamically to either collect more information from the set of available features or to stop and predict using the information that is currently available. Building on previous work exploring adversarial representation learning, we attain group fairness (demographic parity) by rewarding the agent with the adversary's loss, computed over the final feature set. Importantly, however, the framework provides a more general starting point for fair or private dynamic information discovery. Finally, we demonstrate empirically, using two real-world datasets, that we can trade-off fairness and predictive performance},
eventtitle = {{{NeurIPS}} 2019. {{Human-Centric Machine Learning Workshop}}},
langid = {english},
file = {/Users/fariedabuzaid/Zotero/storage/Z5XJKLK6/Bakker et al. - 2019 - DADI Dynamic Discovery of Fair Information with A.pdf}
}
@inproceedings{bakker_fairness_2019,
title = {On {{Fairness}} in {{Budget-Constrained Decision Making}}},
author = {Bakker, Michiel A and Noriega-Campero, Alejandro and Tu, Duy Patrick and Sattigeri, Prasanna and Varshney, Kush R},
date = {2019-08},
pages = {8},
publisher = {{Association for Computing Machinery}},
location = {{Anchorage, Alaska}},
url = {https://krvarshney.github.io/pubs/BakkerNTSVP_kddxai2019.pdf},
abstract = {The machine learning community and society at large have become increasingly concerned with discrimination and bias in data-driven decision making systems. This has led to a dramatic increase in academic and popular interest in algorithmic fairness. In this work, we focus on fairness in budget-constrained decision making, where the goal is to acquire information (features) one-by-one for each individual to achieve maximum classification performance in a cost-effective way. We provide a framework for choosing a set of stopping criteria that ensures that a probabilistic classifier achieves a single error parity (e.g. equal opportunity) and calibration. Our framework scales efficiently to multiple protected attributes and is not susceptible to intra-group unfairness. Finally, using one synthetic and two public datasets, we confirm the effectiveness of our framework and investigate its limitations.},
eventtitle = {{{KDD}} ’19: {{Workshop}} on {{Explainable AI}}/{{ML}} ({{XAI}}) for {{Accountability}}, {{Fairness}}, and {{Transparency}}},
langid = {english},
annotation = {citecount: 00001},
file = {/Users/fariedabuzaid/Zotero/storage/3N93CC23/Bakker et al. - 2019 - On Fairness in Budget-Constrained Decision Making.pdf}
}
@unpublished{banino_pondernet_2021,
title = {{{PonderNet}}: {{Learning}} to {{Ponder}}},
shorttitle = {{{PonderNet}}},
author = {Banino, Andrea and Balaguer, Jan and Blundell, Charles},
date = {2021-09-02},
eprint = {2107.05407},
eprinttype = {arxiv},
primaryclass = {cs},
url = {https://openreview.net/forum?id=1EuxRTe0WN},
urldate = {2022-05-10},
abstract = {In standard neural networks the amount of computation used grows with the size of the inputs, but not with the complexity of the problem being learnt. To overcome this limitation we introduce PonderNet, a new algorithm that learns to adapt the amount of computation based on the complexity of the problem at hand. PonderNet learns end-to-end the number of computational steps to achieve an effective compromise between training prediction accuracy, computational cost and generalization. On a complex synthetic problem, PonderNet dramatically improves performance over previous adaptive computation methods and additionally succeeds at extrapolation tests where traditional neural networks fail. Also, our method matched the current state of the art results on a real world question and answering dataset, but using less compute. Finally, PonderNet reached state of the art results on a complex task designed to test the reasoning capabilities of neural networks.},
archiveprefix = {arXiv},
annotation = {notion: https://www.notion.so/appliedaiinitiative/PonderNet-Learning-to-Ponder-8a6e9f639ff54c19ad8b3c4ecfb6cd9f post: https://community.appliedai.de/topics/27304/topic\_feed\_posts/1216268},
file = {/Users/fariedabuzaid/Zotero/storage/W9X3W8LB/Banino et al. - 2021 - PonderNet Learning to Ponder.pdf}
}
@inproceedings{bao_analyticdpm_2022,
title = {Analytic-{{DPM}}: An {{Analytic Estimate}} of the {{Optimal Reverse Variance}} in {{Diffusion Probabilistic Models}}},
shorttitle = {Analytic-{{DPM}}},
author = {Bao, Fan and Li, Chongxuan and Zhu, Jun and Zhang, Bo},
date = {2022},
eprint = {2201.06503},
eprinttype = {arxiv},
url = {https://openreview.net/forum?id=0xiJLKH-ufZ},
urldate = {2022-04-26},
abstract = {Diffusion probabilistic models (DPMs) represent a class of powerful generative models. Despite their success, the inference of DPMs is expensive since it generally needs to iterate over thousands...},
archiveprefix = {arXiv},
eventtitle = {International {{Conference}} on {{Learning Representations}} ({{ICLR2022}})},
langid = {english},
annotation = {video:https://iclr.cc/virtual/2022/oral/7167},
file = {/Users/fariedabuzaid/Zotero/storage/NADZ3KVY/Bao et al. - 2021 - Analytic-DPM an Analytic Estimate of the Optimal .pdf}
}
@article{bar-sinai_learning_2019,
title = {Learning Data-Driven Discretizations for Partial Differential Equations},
author = {Bar-Sinai, Yohai and Hoyer, Stephan and Hickey, Jason and Brenner, Michael P.},
date = {2019-07-30},
journaltitle = {Proceedings of the National Academy of Sciences},
shortjournal = {PNAS},
volume = {116},
number = {31},
eprint = {1808.04930},
eprinttype = {arxiv},
pages = {15344--15349},
issn = {0027-8424, 1091-6490},
doi = {10.1073/pnas.1814058116},
url = {https://www.pnas.org/content/116/31/15344},
urldate = {2021-03-16},
abstract = {The numerical solution of partial differential equations (PDEs) is challenging because of the need to resolve spatiotemporal features over wide length- and timescales. Often, it is computationally intractable to resolve the finest features in the solution. The only recourse is to use approximate coarse-grained representations, which aim to accurately represent long-wavelength dynamics while properly accounting for unresolved small-scale physics. Deriving such coarse-grained equations is notoriously difficult and often ad hoc. Here we introduce data-driven discretization, a method for learning optimized approximations to PDEs based on actual solutions to the known underlying equations. Our approach uses neural networks to estimate spatial derivatives, which are optimized end to end to best satisfy the equations on a low-resolution grid. The resulting numerical methods are remarkably accurate, allowing us to integrate in time a collection of nonlinear equations in 1 spatial dimension at resolutions 4× to 8× coarser than is possible with standard finite-difference methods.},
archiveprefix = {arXiv},
langid = {english},
file = {/Users/fariedabuzaid/Zotero/storage/97YCYZST/Bar-Sinai et al. - 2019 - Learning data-driven discretizations for partial d.pdf}
}
@inproceedings{basu_influence_2020,
title = {Influence {{Functions}} in {{Deep Learning Are Fragile}}},
booktitle = {Proceedings of the 9th {{International Conference}} on {{Learning Representations}}},
author = {Basu, Samyadeep and Pope, Phil and Feizi, Soheil},
date = {2020-09-28},
url = {https://openreview.net/forum?id=xHKVVHGDOEk},
urldate = {2021-03-21},
abstract = {Influence functions approximate the effect of training samples in test-time predictions and have a wide variety of applications in machine learning interpretability and uncertainty estimation. A...},
eventtitle = {Ninth {{International Conference}} on {{Learning Representations}}},
langid = {english},
file = {/Users/fariedabuzaid/Zotero/storage/9TWFTUYJ/Basu et al. - 2020 - Influence Functions in Deep Learning Are Fragile.pdf;/Users/fariedabuzaid/Zotero/storage/MXIN9CRA/Influence Functions in Deep Learning Are Fragile - Appendix.pdf}
}
@inproceedings{basu_secondorder_2020,
title = {On {{Second-Order Group Influence Functions}} for {{Black-Box Predictions}}},
booktitle = {Proceedings of the 37th {{International Conference}} on {{Machine Learning}}},
author = {Basu, Samyadeep and You, Xuchen and Feizi, Soheil},
date = {2020-07-06},
volume = {119},
eprint = {1911.00418},
eprinttype = {arxiv},
pages = {715--724},
location = {{Vienna, Austria}},
url = {http://proceedings.mlr.press/v119/basu20b.html},
urldate = {2021-10-13},
abstract = {With the rapid adoption of machine learning systems in sensitive applications, there is an increasing need to make black-box models explainable. Often we want to identify an influential group of training samples in a particular test prediction for a given machine learning model. Existing influence functions tackle this problem by using first-order approximations of the effect of removing a sample from the training set on model parameters. To compute the influence of a group of training samples (rather than an individual point) in model predictions, the change in optimal model parameters after removing that group from the training set can be large. Thus, in such cases, the first-order approximation can be loose. In this paper, we address this issue and propose second-order influence functions for identifying influential groups in test-time predictions. For linear models, across different sizes and types of groups, we show that using the proposed second-order influence function improves the correlation between the computed influence values and the ground truth ones. We also show that second-order influence functions could be used with optimization techniques to improve the selection of the most influential group for a test-sample.},
archiveprefix = {arXiv},
eventtitle = {{{ICML}} 2020},
langid = {english},
file = {/Users/fariedabuzaid/Zotero/storage/4X97N7FJ/Basu et al. - 2020 - On Second-Order Group Influence Functions for Blac.pdf}
}
@unpublished{bates_crossvalidation_2021,
title = {Cross-Validation: What Does It Estimate and How Well Does It Do It?},
shorttitle = {Cross-Validation},
author = {Bates, Stephen and Hastie, Trevor and Tibshirani, Robert},
date = {2021-04-01},
eprint = {2104.00673v2},
eprinttype = {arxiv},
url = {https://arxiv.org/abs/2104.00673v2},
urldate = {2021-04-17},
abstract = {Cross-validation is a widely-used technique to estimate prediction error, but its behavior is complex and not fully understood. Ideally, one would like to think that cross-validation estimates the prediction error for the model at hand, fit to the training data. We prove that this is not the case for the linear model fit by ordinary least squares; rather it estimates the average prediction error of models fit on other unseen training sets drawn from the same population. We further show that this phenomenon occurs for most popular estimates of prediction error, including data splitting, bootstrapping, and Mallow's Cp. Next, the standard confidence intervals for prediction error derived from cross-validation may have coverage far below the desired level. Because each data point is used for both training and testing, there are correlations among the measured accuracies for each fold, and so the usual estimate of variance is too small. We introduce a nested cross-validation scheme to estimate this variance more accurately, and show empirically that this modification leads to intervals with approximately correct coverage in many examples where traditional cross-validation intervals fail. Lastly, our analysis also shows that when producing confidence intervals for prediction accuracy with simple data splitting, one should not re-fit the model on the combined data, since this invalidates the confidence intervals.},
archiveprefix = {arXiv},
langid = {english},
file = {/Users/fariedabuzaid/Zotero/storage/6DJT3HFI/KGX8234P.pdf}
}
@inproceedings{bengio_flow_2021,
title = {Flow {{Network}} Based {{Generative Models}} for {{Non-Iterative Diverse Candidate Generation}}},
booktitle = {Advances in {{Neural Information Processing Systems}}},
author = {Bengio, Emmanuel and Jain, Moksh and Korablyov, Maksym and Precup, Doina and Bengio, Yoshua},
date = {2021},
volume = {34},
pages = {27381--27394},
publisher = {{Curran Associates, Inc.}},
url = {https://papers.nips.cc/paper/2021/hash/e614f646836aaed9f89ce58e837e2310-Abstract.html},
urldate = {2022-07-13},
abstract = {This paper is about the problem of learning a stochastic policy for generating an object (like a molecular graph) from a sequence of actions, such that the probability of generating an object is proportional to a given positive reward for that object. Whereas standard return maximization tends to converge to a single return-maximizing sequence, there are cases where we would like to sample a diverse set of high-return solutions. These arise, for example, in black-box function optimization when few rounds are possible, each with large batches of queries, where the batches should be diverse, e.g., in the design of new molecules. One can also see this as a problem of approximately converting an energy function to a generative distribution. While MCMC methods can achieve that, they are expensive and generally only perform local exploration. Instead, training a generative policy amortizes the cost of search during training and yields to fast generation. Using insights from Temporal Difference learning, we propose GFlowNet, based on a view of the generative process as a flow network, making it possible to handle the tricky case where different trajectories can yield the same final state, e.g., there are many ways to sequentially add atoms to generate some molecular graph. We cast the set of trajectories as a flow and convert the flow consistency equations into a learning objective, akin to the casting of the Bellman equations into Temporal Difference methods. We prove that any global minimum of the proposed objectives yields a policy which samples from the desired distribution, and demonstrate the improved performance and diversity of GFlowNet on a simple domain where there are many modes to the reward function, and on a molecule synthesis task.},
file = {/Users/fariedabuzaid/Zotero/storage/89WVLQEJ/Bengio et al. - 2021 - Flow Network based Generative Models for Non-Itera.pdf}
}
@article{bengio_no_2004,
title = {No {{Unbiased Estimator}} of the {{Variance}} of {{K-Fold Cross-Validation}}},
author = {Bengio, Yoshua and Grandvalet, Yves},
date = {2004-12-01},
journaltitle = {The Journal of Machine Learning Research},
shortjournal = {J. Mach. Learn. Res.},
volume = {5},
pages = {1089--1105},
issn = {1532-4435},
doi = {10.5555/1005332.1044695},
url = {https://dl.acm.org/doi/abs/10.5555/1005332.1044695},
abstract = {Most machine learning researchers perform quantitative experiments to estimate generalization error and compare the performance of different algorithms (in particular, their proposed algorithm). In order to be able to draw statistically convincing conclusions, it is important to estimate the uncertainty of such estimates. This paper studies the very commonly used K-fold cross-validation estimator of generalization performance. The main theorem shows that there exists no universal (valid under all distributions) unbiased estimator of the variance of K-fold cross-validation. The analysis that accompanies this result is based on the eigen-decomposition of the covariance matrix of errors, which has only three different eigenvalues corresponding to three degrees of freedom of the matrix and three components of the total variance. This analysis helps to better understand the nature of the problem and how it can make naive estimators (that don't take into account the error correlations due to the overlap between training and test sets) grossly underestimate variance. This is confirmed by numerical experiments in which the three components of the variance are compared when the difficulty of the learning problem and the number of folds are varied.},
langid = {english},
file = {/Users/fariedabuzaid/Zotero/storage/QVXH8EAS/2SGZSFMI.pdf}
}
@inproceedings{benton_calibration_2019,
title = {Calibration for {{Anomaly Detection}}},
booktitle = {25th {{ACM SIGKDD International Conference}} on {{Knowledge Discovery}} \& {{Data Mining}} - {{KDD}} '19: {{Workshop}} on {{Anomaly Detection}} in {{Finance}}},
author = {Benton, Adrian},
date = {2019-08-05},
location = {{Anchorage, Alaska}},
url = {https://drive.google.com/drive/folders/1r_iJYFJru-jdDdgpB-KZ1N0Zathy2LD2},
abstract = {Recent work on model calibration found that a simple variant of Platt scaling, temperature scaling, is effective at calibrating modern neural networks across an array of classification tasks. However, when negative examples overwhelm the dataset, classifiers will often be biased to producing well-calibrated predictions for negative examples, but have trouble producing well-calibrated predictions for true anomalies. A well-calibrated model – one whose scores accurately reflect the true probability of anomaly likelihood – is an invaluable tool for decision makers.},
eventtitle = {25th {{ACM SIGKDD International Conference}} on {{Knowledge Discovery}} \& {{Data Mining}} - {{KDD}} '19: {{Workshop}} on {{Anomaly Detection}} in {{Finance}}},
langid = {english},
file = {/Users/fariedabuzaid/Zotero/storage/G67TNGSJ/Benton - Calibration for Anomaly Detection.pdf}
}
@article{berg_unified_2018,
title = {A Unified Deep Artificial Neural Network Approach to Partial Differential Equations in Complex Geometries},
author = {Berg, Jens and Nyström, Kaj},
date = {2018-11-23},
journaltitle = {Neurocomputing},
shortjournal = {Neurocomputing},
volume = {317},
pages = {28--41},
issn = {0925-2312},
doi = {10.1016/j.neucom.2018.06.056},
url = {https://www.sciencedirect.com/science/article/pii/S092523121830794X},
urldate = {2021-03-28},
abstract = {In this paper, we use deep feedforward artificial neural networks to approximate solutions to partial differential equations in complex geometries. We show how to modify the backpropagation algorithm to compute the partial derivatives of the network output with respect to the space variables which is needed to approximate the differential operator. The method is based on an ansatz for the solution which requires nothing but feedforward neural networks and an unconstrained gradient based optimization method such as gradient descent or a quasi-Newton method. We show an example where classical mesh based methods cannot be used and neural networks can be seen as an attractive alternative. Finally, we highlight the benefits of deep compared to shallow neural networks and devise some other convergence enhancing techniques.},
langid = {english},
file = {/Users/fariedabuzaid/Zotero/storage/U5R8284M/Berg and Nyström - 2018 - A unified deep artificial neural network approach .pdf}
}
@unpublished{betancourt_conceptual_2018,
title = {A {{Conceptual Introduction}} to {{Hamiltonian Monte Carlo}}},
author = {Betancourt, Michael},
date = {2018-07-15},
eprint = {1701.02434},
eprinttype = {arxiv},
primaryclass = {stat},
url = {http://arxiv.org/abs/1701.02434},
urldate = {2020-11-03},
abstract = {Hamiltonian Monte Carlo has proven a remarkable empirical success, but only recently have we begun to develop a rigorous understanding of why it performs so well on difficult problems and how it is best applied in practice. Unfortunately, that understanding is confined within the mathematics of differential geometry which has limited its dissemination, especially to the applied communities for which it is particularly important. In this review I provide a comprehensive conceptual account of these theoretical foundations, focusing on developing a principled intuition behind the method and its optimal implementations rather than any exhaustive rigor. Whether a practitioner or a statistician, the dedicated reader will acquire a solid grasp of how Hamiltonian Monte Carlo works, when it succeeds, and, perhaps most importantly, when it fails.},
archiveprefix = {arXiv},
file = {/Users/fariedabuzaid/Zotero/storage/FGUWKAYT/Betancourt - 2018 - A Conceptual Introduction to Hamiltonian Monte Car.pdf}
}
@inproceedings{bevilacqua_equivariant_2022,
title = {Equivariant {{Subgraph Aggregation Networks}}},
author = {Bevilacqua, Beatrice and Frasca, Fabrizio and Lim, Derek and Srinivasan, Balasubramaniam and Cai, Chen and Balamurugan, Gopinath and Bronstein, Michael M. and Maron, Haggai},
date = {2022},
url = {https://openreview.net/forum?id=dFbKQaRk15w},
urldate = {2022-04-28},
abstract = {Message-passing neural networks (MPNNs) are the leading architecture for deep learning on graph-structured data, in large part due to their simplicity and scalability. Unfortunately, it was shown...},
eventtitle = {International {{Conference}} on {{Learning Representations}} ({{ICLR}} 2022)},
langid = {english},
file = {/Users/fariedabuzaid/Zotero/storage/3WX9M4IW/Bevilacqua et al. - 2022 - Equivariant Subgraph Aggregation Networks.pdf}
}
@unpublished{bhatt_cal_2021,
title = {\$f\$-{{Cal}}: {{Calibrated}} Aleatoric Uncertainty Estimation from Neural Networks for Robot Perception},
shorttitle = {\$f\$-{{Cal}}},
author = {Bhatt, Dhaivat and Mani, Kaustubh and Bansal, Dishank and Murthy, Krishna and Lee, Hanju and Paull, Liam},
date = {2021-09-28},
eprint = {2109.13913},
eprinttype = {arxiv},
primaryclass = {cs},
url = {http://arxiv.org/abs/2109.13913},
urldate = {2021-10-06},
abstract = {While modern deep neural networks are performant perception modules, performance (accuracy) alone is insufficient, particularly for safety-critical robotic applications such as self-driving vehicles. Robot autonomy stacks also require these otherwise blackbox models to produce reliable and calibrated measures of confidence on their predictions. Existing approaches estimate uncertainty from these neural network perception stacks by modifying network architectures, inference procedure, or loss functions. However, in general, these methods lack calibration, meaning that the predictive uncertainties do not faithfully represent the true underlying uncertainties (process noise). Our key insight is that calibration is only achieved by imposing constraints across multiple examples, such as those in a mini-batch; as opposed to existing approaches which only impose constraints per-sample, often leading to overconfident (thus miscalibrated) uncertainty estimates. By enforcing the distribution of outputs of a neural network to resemble a target distribution by minimizing an \$f\$-divergence, we obtain significantly better-calibrated models compared to prior approaches. Our approach, \$f\$-Cal, outperforms existing uncertainty calibration approaches on robot perception tasks such as object detection and monocular depth estimation over multiple real-world benchmarks.},
archiveprefix = {arXiv},
file = {/Users/fariedabuzaid/Zotero/storage/5YLQ2MLA/Bhatt et al. - 2021 - $f$-Cal Calibrated aleatoric uncertainty estimati.pdf}
}
@inproceedings{bian_energybased_2022,
title = {Energy-{{Based Learning}} for {{Cooperative Games}}, with {{Applications}} to {{Valuation Problems}} in {{Machine Learning}}},
author = {Bian, Yatao and Rong, Yu and Xu, Tingyang and Wu, Jiaxiang and Krause, Andreas and Huang, Junzhou},
date = {2022},
url = {https://openreview.net/forum?id=xLfAgCroImw},
urldate = {2022-04-20},
abstract = {Valuation problems, such as feature interpretation, data valuation and model valuation for ensembles, become increasingly more important in many machine learning applications. Such problems are...},
eventtitle = {International {{Conference}} on {{Learning Representations}} ({{ICLR}} 2022)},
langid = {english},
annotation = {video: https://iclr.cc/virtual/2022/poster/6807},
file = {/Users/fariedabuzaid/Zotero/storage/YZUM4MQP/Bian et al. - 2021 - Energy-Based Learning for Cooperative Games, with .pdf}
}
@article{bishop_novelty_1994,
title = {Novelty Detection and Neural Network Validation},
author = {Bishop, C. M.},
date = {1994-08-01},
journaltitle = {IEE Proceedings - Vision, Image and Signal Processing},
volume = {141},
number = {4},
pages = {217--222},
publisher = {{IET Digital Library}},
issn = {1359-7108},
doi = {10.1049/ip-vis:19941330},
url = {https://digital-library.theiet.org/content/journals/10.1049/ip-vis_19941330},
urldate = {2022-04-11},
abstract = {One of the key factors which limits the use of neural networks in many industrial applications has been the difficulty of demonstrating that a trained network will continue to generate reliable outputs once it is in routine use. An important potential source of errors is novel input data; that is, input data which differ significantly from the data used to train the network. The author investigates the relationship between the degree of novelty of input data and the corresponding reliability of the outputs from the network. He describes a quantitative procedure for assessing novelty, and demonstrates its performance by using an application which involves monitoring oil flow in multiphase pipelines.},
langid = {english},
file = {/Users/fariedabuzaid/Zotero/storage/6XDXKHR3/ip-vis_19941330.html}
}
@book{bishop_pattern_2006,
title = {Pattern Recognition and Machine Learning},
author = {Bishop, Christopher M.},
date = {2006-08-17},
series = {Information Science and Statistics},
edition = {1},
publisher = {{Springer}},
url = {https://www.microsoft.com/en-us/research/people/cmbishop/prml-book/},
abstract = {This is the first textbook on pattern recognition to present the Bayesian viewpoint. The book presents approximate inference algorithms that permit fast approximate answers in situations where exact answers are not feasible. It uses graphical models to describe probability distributions when no other books apply graphical models to machine learning. No previous knowledge of pattern recognition or machine learning concepts is assumed. Familiarity with multivariate calculus and basic linear algebra is required, and some experience in the use of probabilities would be helpful though not essential as the book includes a self-contained introduction to basic probability theory.},
isbn = {978-0-387-31073-2},
langid = {english},
pagetotal = {738},
annotation = {citecount: 00184},
file = {/Users/fariedabuzaid/Zotero/storage/BTRM9WTQ/Bishop - 2006 - Pattern recognition and machine learning.pdf;/Users/fariedabuzaid/Zotero/storage/MLRXCPTQ/prml-web-sol-2009-09-08.pdf;/Users/fariedabuzaid/Zotero/storage/ZZELHRIX/prml-errata-3rd-20110921.pdf}
}
@unpublished{blechschmidt_three_2021,
title = {Three {{Ways}} to {{Solve Partial Differential Equations}} with {{Neural Networks}} -- {{A Review}}},
author = {Blechschmidt, Jan and Ernst, Oliver G.},
date = {2021-02-23},
eprint = {2102.11802},
eprinttype = {arxiv},
primaryclass = {cs, math},
url = {http://arxiv.org/abs/2102.11802},
urldate = {2021-03-11},
abstract = {Neural networks are increasingly used to construct numerical solution methods for partial differential equations. In this expository review, we introduce and contrast three important recent approaches attractive in their simplicity and their suitability for high-dimensional problems: physics-informed neural networks, methods based on the Feynman-Kac formula and the Deep BSDE solver. The article is accompanied by a suite of expository software in the form of Jupyter notebooks in which each basic methodology is explained step by step, allowing for a quick assimilation and experimentation. An extensive bibliography summarizes the state of the art.},
archiveprefix = {arXiv},
file = {/Users/fariedabuzaid/Zotero/storage/XJ64HHI8/Blechschmidt and Ernst - 2021 - Three Ways to Solve Partial Differential Equations.pdf}
}
@article{blei_variational_2017,
title = {Variational {{Inference}}: {{A Review}} for {{Statisticians}}},
shorttitle = {Variational {{Inference}}},
author = {Blei, David M. and Kucukelbir, Alp and McAuliffe, Jon D.},
date = {2017-04-03},
journaltitle = {Journal of the American Statistical Association},
volume = {112},
number = {518},
eprint = {1601.00670},
eprinttype = {arxiv},
pages = {859--877},
issn = {0162-1459, 1537-274X},
doi = {10.1080/01621459.2017.1285773},
url = {http://arxiv.org/abs/1601.00670},
urldate = {2018-03-11},
abstract = {One of the core problems of modern statistics is to approximate difficult-to-compute probability densities. This problem is especially important in Bayesian statistics, which frames all inference about unknown quantities as a calculation involving the posterior density. In this paper, we review variational inference (VI), a method from machine learning that approximates probability densities through optimization. VI has been used in many applications and tends to be faster than classical methods, such as Markov chain Monte Carlo sampling. The idea behind VI is to first posit a family of densities and then to find the member of that family which is close to the target. Closeness is measured by Kullback-Leibler divergence. We review the ideas behind mean-field variational inference, discuss the special case of VI applied to exponential family models, present a full example with a Bayesian mixture of Gaussians, and derive a variant that uses stochastic optimization to scale up to massive data. We discuss modern research in VI and highlight important open problems. VI is powerful, but it is not yet well understood. Our hope in writing this paper is to catalyze statistical research on this class of algorithms.},
archiveprefix = {arXiv},
langid = {english},
annotation = {citecount: 00205},
file = {/Users/fariedabuzaid/Zotero/storage/HLAN47XE/Blei et al. - 2017 - Variational Inference A Review for Statisticians.pdf}
}
@book{blum_foundations_2020,
title = {Foundations of {{Data Science}}},
author = {Blum, Avrim and Hopcroft, John and Kannan, Ravindran},
date = {2020},
publisher = {{Cambridge University Press}},
location = {{Cambridge}},
doi = {10.1017/9781108755528},
url = {https://www.cambridge.org/core/books/foundations-of-data-science/6A43CE830DE83BED6CC5171E62B0AA9E},
urldate = {2022-04-11},
abstract = {This book provides an introduction to the mathematical and algorithmic foundations of data science, including machine learning, high-dimensional geometry, and analysis of large networks. Topics include the counterintuitive nature of data in high dimensions, important linear algebraic techniques such as singular value decomposition, the theory of random walks and Markov chains, the fundamentals of and important algorithms for machine learning, algorithms and analysis for clustering, probabilistic models for large networks, representation learning including topic modelling and non-negative matrix factorization, wavelets and compressed sensing. Important probabilistic techniques are developed including the law of large numbers, tail inequalities, analysis of random projections, generalization guarantees in machine learning, and moment methods for analysis of phase transitions in large random graphs. Additionally, important structural and complexity measures are discussed such as matrix norms and VC-dimension. This book is suitable for both undergraduate and graduate courses in the design and analysis of algorithms for data.},
isbn = {978-1-108-48506-7},
file = {/Users/fariedabuzaid/Zotero/storage/WU5PNRLH/6A43CE830DE83BED6CC5171E62B0AA9E.html}
}
@article{bodria_benchmarking_2021,
title = {Benchmarking and {{Survey}} of {{Explanation Methods}} for {{Black Box Models}}},
author = {Bodria, Francesco and Giannotti, Fosca and Guidotti, Riccardo and Naretto, Francesca and Pedreschi, Dino and Rinzivillo, Salvatore},
date = {2021-02-25},
url = {https://arxiv.org/abs/2102.13076v1},
urldate = {2021-03-21},
abstract = {The widespread adoption of black-box models in Artificial Intelligence has enhanced the need for explanation methods to reveal how these obscure models reach specific decisions. Retrieving explanations is fundamental to unveil possible biases and to resolve practical or ethical issues. Nowadays, the literature is full of methods with different explanations. We provide a categorization of explanation methods based on the type of explanation returned. We present the most recent and widely used explainers, and we show a visual comparison among explanations and a quantitative benchmarking.},
langid = {english},
file = {/Users/fariedabuzaid/Zotero/storage/7N8S537A/Bodria et al. - 2021 - Benchmarking and Survey of Explanation Methods for.pdf}
}
@inproceedings{bohdal_metacalibration_2021,
title = {Meta-{{Calibration}}: {{Meta-Learning}} of {{Model Calibration Using Differentiable Expected Calibration Error}}},
shorttitle = {Meta-{{Calibration}}},
author = {Bohdal, Ondrej and Yang, Yongxin and Hospedales, Timothy},
date = {2021-06-17},
eprint = {2106.09613},
eprinttype = {arxiv},
primaryclass = {cs, stat},
url = {http://arxiv.org/abs/2106.09613},
urldate = {2021-10-05},
abstract = {Calibration of neural networks is a topical problem that is becoming increasingly important for real-world use of neural networks. The problem is especially noticeable when using modern neural networks, for which there is significant difference between the model confidence and the confidence it should have. Various strategies have been successfully proposed, yet there is more space for improvements. We propose a novel approach that introduces a differentiable metric for expected calibration error and successfully uses it as an objective for meta-learning, achieving competitive results with state-of-the-art approaches. Our approach presents a new direction of using meta-learning to directly optimize model calibration, which we believe will inspire further work in this promising and new direction.},
archiveprefix = {arXiv},
file = {/Users/fariedabuzaid/Zotero/storage/LL7G3KYD/Bohdal et al. - 2021 - Meta-Calibration Meta-Learning of Model Calibrati.pdf}
}
@article{bottou_counterfactual_2013,
title = {Counterfactual {{Reasoning}} and {{Learning Systems}}: {{The Example}} of {{Computational Advertising}}},
shorttitle = {Counterfactual {{Reasoning}} and {{Learning Systems}}},
author = {Bottou, Léon and Peters, Jonas and Quiñonero-Candela, Joaquin and Charles, Denis X. and Chickering, D. Max and Portugaly, Elon and Ray, Dipankar and Simard, Patrice and Snelson, Ed},
date = {2013},
journaltitle = {Journal of Machine Learning Research},
volume = {14},
number = {65},
pages = {3207--3260},
issn = {1533-7928},
url = {http://jmlr.org/papers/v14/bottou13a.html},
urldate = {2021-02-08},
abstract = {This work shows how to leverage causal inference to understand the behavior of complex learning systems interacting with their environment and predict the consequences of changes to the system. Such predictions allow both humans and algorithms to select the changes that would have improved the system performance. This work is illustrated by experiments on the ad placement system associated with the Bing search engine.},
file = {/Users/fariedabuzaid/Zotero/storage/XVTCMUT9/Bottou et al. - 2013 - Counterfactual Reasoning and Learning Systems The.pdf}
}
@unpublished{brach_single_2020,
title = {Single {{Shot MC Dropout Approximation}}},
author = {Brach, Kai and Sick, Beate and Dürr, Oliver},
date = {2020-07-07},
eprint = {2007.03293},
eprinttype = {arxiv},
primaryclass = {cs, stat},
publisher = {{arXiv}},
doi = {10.48550/ARXIV.2007.03293},
url = {http://arxiv.org/abs/2007.03293},
urldate = {2022-05-16},
abstract = {Deep neural networks (DNNs) are known for their high prediction performance, especially in perceptual tasks such as object recognition or autonomous driving. Still, DNNs are prone to yield unreliable predictions when encountering completely new situations without indicating their uncertainty. Bayesian variants of DNNs (BDNNs), such as MC dropout BDNNs, do provide uncertainty measures. However, BDNNs are slow during test time because they rely on a sampling approach. Here we present a single shot MC dropout approximation that preserves the advantages of BDNNs without being slower than a DNN. Our approach is to analytically approximate for each layer in a fully connected network the expected value and the variance of the MC dropout signal. We evaluate our approach on different benchmark datasets and a simulated toy example. We demonstrate that our single shot MC dropout approximation resembles the point estimate and the uncertainty estimate of the predictive distribution that is achieved with an MC approach, while being fast enough for real-time deployments of BDNNs.},
archiveprefix = {arXiv},
eventtitle = {{{ICML}} 2020 {{Workshop}} on {{Uncertainty}} and {{Robustness}} in {{Deep Learning}}},
file = {/Users/fariedabuzaid/Zotero/storage/SIITUGDG/Brach et al. - 2020 - Single Shot MC Dropout Approximation.pdf}
}
@inproceedings{brandstetter_message_2022,
title = {Message {{Passing Neural PDE Solvers}}},
author = {Brandstetter, Johannes and Worrall, Daniel E. and Welling, Max},
date = {2022},
url = {https://openreview.net/forum?id=vSix3HPYKSU},
urldate = {2022-04-28},
abstract = {The numerical solution of partial differential equations (PDEs) is difficult, having led to a century of research so far. Recently, there have been pushes to build neural--numerical hybrid solvers,...},
eventtitle = {International {{Conference}} on {{Learning Representations}} ({{ICLR}} 2022)},
langid = {english},
file = {/Users/fariedabuzaid/Zotero/storage/B2EKTEJN/Brandstetter et al. - 2022 - Message Passing Neural PDE Solvers.pdf}
}
@article{breiman_heuristics_1996,
title = {Heuristics of Instability and Stabilization in Model Selection},
author = {Breiman, Leo},
date = {1996-12},
journaltitle = {The Annals of Statistics},
volume = {24},
number = {6},
pages = {2350--2383},
publisher = {{Institute of Mathematical Statistics}},
issn = {0090-5364, 2168-8966},
doi = {10.1214/aos/1032181158},
url = {https://projecteuclid.org/journals/annals-of-statistics/volume-24/issue-6/Heuristics-of-instability-and-stabilization-in-model-selection/10.1214/aos/1032181158.full},
urldate = {2021-06-03},
abstract = {In model selection, usually a "best" predictor is chosen from a collection $\{\hat{\mu}(\cdot, s)\}$ of predictors where $\hat{\mu}(\cdot, s)$ is the minimum least-squares predictor in a collection $\mathsf{U}_s$ of predictors. Here $s$ is a complexity parameter; that is, the smaller $s$, the lower dimensional/smoother the models in $\mathsf{U}_s$. If $\mathsf{L}$ is the data used to derive the sequence $\{\hat{\mu}(\cdot, s)\}$, the procedure is called unstable if a small change in $\mathsf{L}$ can cause large changes in $\{\hat{\mu}(\cdot, s)\}$. With a crystal ball, one could pick the predictor in $\{\hat{\mu}(\cdot, s)\}$ having minimum prediction error. Without prescience, one uses test sets, cross-validation and so forth. The difference in prediction error between the crystal ball selection and the statistician's choice we call predictive loss. For an unstable procedure the predictive loss is large. This is shown by some analytics in a simple case and by simulation results in a more complex comparison of four different linear regression methods. Unstable procedures can be stabilized by perturbing the data, getting a new predictor sequence $\{\hat{\mu'}(\cdot, s)\}$ and then averaging over many such predictor sequences.},
file = {/Users/fariedabuzaid/Zotero/storage/XGA32PRG/Breiman - 1996 - Heuristics of instability and stabilization in mod.pdf}
}
@article{breunig_lof_2000,
title = {{{LOF}}: Identifying Density-Based Local Outliers},
shorttitle = {{{LOF}}},
author = {Breunig, Markus M. and Kriegel, Hans-Peter and Ng, Raymond T. and Sander, Jörg},
date = {2000-05-16},
journaltitle = {ACM SIGMOD Record},
shortjournal = {SIGMOD Rec.},
volume = {29},
number = {2},
pages = {93--104},
issn = {0163-5808},
doi = {10.1145/335191.335388},
url = {https://doi.org/10.1145/335191.335388},
urldate = {2022-04-21},
abstract = {For many KDD applications, such as detecting criminal activities in E-commerce, finding the rare instances or the outliers, can be more interesting than finding the common patterns. Existing work in outlier detection regards being an outlier as a binary property. In this paper, we contend that for many scenarios, it is more meaningful to assign to each object a degree of being an outlier. This degree is called the local outlier factor (LOF) of an object. It is local in that the degree depends on how isolated the object is with respect to the surrounding neighborhood. We give a detailed formal analysis showing that LOF enjoys many desirable properties. Using real-world datasets, we demonstrate that LOF can be used to find outliers which appear to be meaningful, but can otherwise not be identified with existing approaches. Finally, a careful performance evaluation of our algorithm confirms that our approach of finding local outliers can be practical.},
file = {/Users/fariedabuzaid/Zotero/storage/BN8DBWR6/Breunig et al. - 2000 - LOF identifying density-based local outliers.pdf}
}
@article{brocker_increasing_2007,
title = {Increasing the {{Reliability}} of {{Reliability Diagrams}}},
author = {Bröcker, Jochen and Smith, Leonard A.},
date = {2007-06-01},
journaltitle = {Weather and Forecasting},
volume = {22},
number = {3},
pages = {651--661},
issn = {1520-0434, 0882-8156},
doi = {10.1175/WAF993.1},
url = {https://journals.ametsoc.org/doi/10.1175/WAF993.1},
urldate = {2021-04-29},
abstract = {The reliability diagram is a common diagnostic graph used to summarize and evaluate probabilistic forecasts. Its strengths lie in the ease with which it is produced and the transparency of its definition. While visually appealing, major long-noted shortcomings lie in the difficulty of interpreting the graph visually; for the most part, ambiguities arise from variations in the distributions of forecast probabilities and from various binning procedures. A resampling method for assigning consistency bars to the observed frequencies is introduced that allows for immediate visual evaluation as to just how likely the observed relative frequencies are under the assumption that the predicted probabilities are reliable. Further, an alternative presentation of the same information on probability paper eases quantitative evaluation and comparison. Both presentations can easily be employed for any method of binning.},
langid = {english}
}
@unpublished{broderick_automatic_2021,
title = {An {{Automatic Finite-Sample Robustness Metric}}: {{When Can Dropping}} a {{Little Data Make}} a {{Big Difference}}?},
shorttitle = {An {{Automatic Finite-Sample Robustness Metric}}},
author = {Broderick, Tamara and Giordano, Ryan and Meager, Rachael},
date = {2021-11-03},
eprint = {2011.14999},
eprinttype = {arxiv},
url = {https://arxiv.org/abs/2011.14999},
abstract = {We propose a method to assess the sensitivity of econometric analyses to the removal of a small fraction of the data. Manually checking the influence of all possible small subsets is computationally infeasible, so we provide an approximation to find the most influential subset. Our metric, the "Approximate Maximum Influence Perturbation," is automatically computable for common methods including (but not limited to) OLS, IV, MLE, GMM, and variational Bayes. We provide finite-sample error bounds on approximation performance. At minimal extra cost, we provide an exact finite-sample lower bound on sensitivity. We find that sensitivity is driven by a signal-to-noise ratio in the inference problem, is not reflected in standard errors, does not disappear asymptotically, and is not due to misspecification. While some empirical applications are robust, results of several economics papers can be overturned by removing less than 1\% of the sample.},
archiveprefix = {arXiv},
langid = {english},
file = {/Users/fariedabuzaid/Zotero/storage/A4U5GT7U/Broderick et al. - 2021 - An Automatic Finite-Sample Robustness Metric When.pdf;/Users/fariedabuzaid/Zotero/storage/BM97YF6E/DAQAKJLT.pdf}
}
@inproceedings{brody_how_2022,
title = {How {{Attentive}} Are {{Graph Attention Networks}}?},
author = {Brody, Shaked and Alon, Uri and Yahav, Eran},
date = {2022},
url = {https://openreview.net/forum?id=F72ximsx7C1},
urldate = {2022-04-28},
abstract = {Graph Attention Networks (GATs) are one of the most popular GNN architectures and are considered as the state-of-the-art architecture for representation learning with graphs. In GAT, every node...},
eventtitle = {International {{Conference}} on {{Learning Representations}} ({{ICLR}} 2022)},
langid = {english},
file = {/Users/fariedabuzaid/Zotero/storage/7G3UL44H/Brody et al. - 2022 - How Attentive are Graph Attention Networks.pdf}
}
@article{brunton_discovering_2016,
title = {Discovering Governing Equations from Data by Sparse Identification of Nonlinear Dynamical Systems},
author = {Brunton, Steven L. and Proctor, Joshua L. and Kutz, J. Nathan},
date = {2016-04-12},
journaltitle = {Proceedings of the National Academy of Sciences},
shortjournal = {PNAS},
volume = {113},
number = {15},
eprint = {27035946},
eprinttype = {pmid},
pages = {3932--3937},
publisher = {{National Academy of Sciences}},
issn = {0027-8424, 1091-6490},
doi = {10.1073/pnas.1517384113},
url = {https://www.pnas.org/content/113/15/3932},
urldate = {2021-03-28},
abstract = {Extracting governing equations from data is a central challenge in many diverse areas of science and engineering. Data are abundant whereas models often remain elusive, as in climate science, neuroscience, ecology, finance, and epidemiology, to name only a few examples. In this work, we combine sparsity-promoting techniques and machine learning with nonlinear dynamical systems to discover governing equations from noisy measurement data. The only assumption about the structure of the model is that there are only a few important terms that govern the dynamics, so that the equations are sparse in the space of possible functions; this assumption holds for many physical systems in an appropriate basis. In particular, we use sparse regression to determine the fewest terms in the dynamic governing equations required to accurately represent the data. This results in parsimonious models that balance accuracy with model complexity to avoid overfitting. We demonstrate the algorithm on a wide range of problems, from simple canonical systems, including linear and nonlinear oscillators and the chaotic Lorenz system, to the fluid vortex shedding behind an obstacle. The fluid example illustrates the ability of this method to discover the underlying dynamics of a system that took experts in the community nearly 30 years to resolve. We also show that this method generalizes to parameterized systems and systems that are time-varying or have external forcing.},
langid = {english},
file = {/Users/fariedabuzaid/Zotero/storage/VUSE8FKL/Brunton et al. - 2016 - Discovering governing equations from data by spars.pdf}
}
@inproceedings{cao_relational_2022,
title = {Relational {{Multi-Task Learning}}: {{Modeling Relations}} between {{Data}} and {{Tasks}}},
shorttitle = {Relational {{Multi-Task Learning}}},
author = {Cao, Kaidi and You, Jiaxuan and Leskovec, Jure},
date = {2022},
url = {https://openreview.net/forum?id=8Py-W8lSUgy},
urldate = {2022-04-28},
abstract = {A key assumption in multi-task learning is that at the inference time the multi-task model only has access to a given data point but not to the data point’s labels from other tasks. This presents...},
eventtitle = {International {{Conference}} on {{Learning Representations}} ({{ICLR}} 2022)},
langid = {english},
file = {/Users/fariedabuzaid/Zotero/storage/H9DZMHW7/Cao et al. - 2022 - Relational Multi-Task Learning Modeling Relations.pdf}
}
@inproceedings{carmon_unlabeled_2019,
title = {Unlabeled {{Data Improves Adversarial Robustness}}},
booktitle = {Advances in {{Neural Information Processing Systems}}},
author = {Carmon, Yair and Raghunathan, Aditi and Schmidt, Ludwig and Duchi, John C and Liang, Percy S},
date = {2019},
volume = {32},
publisher = {{Curran Associates, Inc.}},
url = {https://papers.nips.cc/paper/2019/hash/32e0bd1497aa43e02a42f47d9d6515ad-Abstract.html},
urldate = {2021-11-13},
abstract = {We demonstrate, theoretically and empirically, that adversarial robustness can significantly benefit from semisupervised learning. Theoretically, we revisit the simple Gaussian model of Schmidt et al. that shows a sample complexity gap between standard and robust classification. We prove that unlabeled data bridges this gap: a simple semisupervised learning procedure (self-training) achieves high robust accuracy using the same number of labels required for achieving high standard accuracy. Empirically, we augment CIFAR-10 with 500K unlabeled images sourced from 80 Million Tiny Images and use robust self-training to outperform state-of-the-art robust accuracies by over 5 points in (i) $\ell_\infty$ robustness against several strong attacks via adversarial training and (ii) certified $\ell_2$ and $\ell_\infty$ robustness via randomized smoothing. On SVHN, adding the dataset's own extra training set with the labels removed provides gains of 4 to 10 points, within 1 point of the gain from using the extra labels.},
keywords = {notion},
file = {/Users/fariedabuzaid/Zotero/storage/UGBXMPBT/Carmon et al. - 2019 - Unlabeled Data Improves Adversarial Robustness.pdf}
}
@article{castillo_fitting_1997,
title = {Fitting the {{Generalized Pareto Distribution}} to {{Data}}},
author = {Castillo, Enrique and Hadi, Ali S.},
date = {1997-12-01},
journaltitle = {Journal of the American Statistical Association},
volume = {92},
number = {440},
pages = {1609--1620},
issn = {0162-1459},
doi = {10.1080/01621459.1997.10473683},
url = {https://doi.org/10.1080/01621459.1997.10473683},
urldate = {2021-09-02},
abstract = {The generalized Pareto distribution (GPD) was introduced by Pickands to model exceedances over a threshold. It has since been used by many authors to model data in several fields. The GPD has a scale parameter ($\sigma > 0$) and a shape parameter ($-\infty < k < \infty$). The estimation of these parameters is not generally an easy problem. When $k > 1$, the maximum likelihood estimates do not exist, and when $k$ is between 1/2 and 1, they may have problems. Furthermore, for $k \le -1/2$, second and higher moments do not exist, and hence both the method-of-moments (MOM) and the probability-weighted moments (PWM) estimates do not exist. Another and perhaps more serious problem with the MOM and PWM methods is that they can produce nonsensical estimates (i.e., estimates inconsistent with the observed data). In this article we propose a method for estimating the parameters and quantiles of the GPD. The estimators are well defined for all parameter values. They are also easy to compute. Some asymptotic results are provided. A simulation study is carried out to evaluate the performance of the proposed methods and to compare them with other methods suggested in the literature. The simulation results indicate that although no method is uniformly best for all the parameter values, the proposed method performs well compared to existing methods. The methods are applied to real-life data. Specific recommendations are also given.}
}
@article{cawley_overfitting_2010,
title = {On {{Over-fitting}} in {{Model Selection}} and {{Subsequent Selection Bias}} in {{Performance Evaluation}}},
author = {Cawley, Gavin C. and Talbot, Nicola L. C.},
date = {2010},
journaltitle = {Journal of Machine Learning Research},
volume = {11},
pages = {2079--2107},
issn = {1533-7928},
url = {http://www.jmlr.org/papers/v11/cawley10a.html},
urldate = {2018-01-19},
issue = {Jul},
annotation = {citecount: 00341},
file = {/Users/fariedabuzaid/Zotero/storage/DXN8ANRM/Cawley and Talbot - 2010 - On Over-fitting in Model Selection and Subsequent .pdf}
}
@unpublished{chalapathy_deep_2019,
title = {Deep {{Learning}} for {{Anomaly Detection}}: {{A Survey}}},
shorttitle = {Deep {{Learning}} for {{Anomaly Detection}}},
author = {Chalapathy, Raghavendra and Chawla, Sanjay},
date = {2019-01-23},
eprint = {1901.03407},
eprinttype = {arxiv},
primaryclass = {cs, stat},
url = {http://arxiv.org/abs/1901.03407},
urldate = {2021-08-09},
abstract = {Anomaly detection is an important problem that has been well-studied within diverse research areas and application domains. The aim of this survey is two-fold: firstly, we present a structured and comprehensive overview of research methods in deep learning-based anomaly detection. Furthermore, we review the adoption of these methods for anomaly detection across various application domains and assess their effectiveness. We have grouped state-of-the-art research techniques into different categories based on the underlying assumptions and approach adopted. Within each category we outline the basic anomaly detection technique, along with its variants and present key assumptions, to differentiate between normal and anomalous behavior. For each category, we also present the advantages and limitations and discuss the computational complexity of the techniques in real application domains. Finally, we outline open issues in research and challenges faced while adopting these techniques.},
archiveprefix = {arXiv},
file = {/Users/fariedabuzaid/Zotero/storage/297DFHH6/Chalapathy and Chawla - 2019 - Deep Learning for Anomaly Detection A Survey.pdf}
}
@article{chandola_anomaly_2009,
title = {Anomaly Detection: {{A}} Survey},
shorttitle = {Anomaly Detection},
author = {Chandola, Varun and Banerjee, Arindam and Kumar, Vipin},
date = {2009-07-30},
journaltitle = {ACM Computing Surveys},
shortjournal = {ACM Comput. Surv.},
volume = {41},
number = {3},
pages = {15:1--15:58},
issn = {0360-0300},
doi = {10.1145/1541880.1541882},
url = {https://doi.org/10.1145/1541880.1541882},
urldate = {2021-08-09},
abstract = {Anomaly detection is an important problem that has been researched within diverse research areas and application domains. Many anomaly detection techniques have been specifically developed for certain application domains, while others are more generic. This survey tries to provide a structured and comprehensive overview of the research on anomaly detection. We have grouped existing techniques into different categories based on the underlying approach adopted by each technique. For each category we have identified key assumptions, which are used by the techniques to differentiate between normal and anomalous behavior. When applying a given technique to a particular domain, these assumptions can be used as guidelines to assess the effectiveness of the technique in that domain. For each category, we provide a basic anomaly detection technique, and then show how the different existing techniques in that category are variants of the basic technique. This template provides an easier and more succinct understanding of the techniques belonging to each category. Further, for each category, we identify the advantages and disadvantages of the techniques in that category. We also provide a discussion on the computational complexity of the techniques since it is an important issue in real application domains. We hope that this survey will provide a better understanding of the different directions in which research has been done on this topic, and how techniques developed in one area can be applied in domains for which they were not intended to begin with.},
file = {/Users/fariedabuzaid/Zotero/storage/A3JTTUDX/Chandola et al. - 2009 - Anomaly detection A survey.pdf}
}
@inproceedings{charpentier_natural_2022,
title = {Natural {{Posterior Network}}: {{Deep Bayesian Predictive Uncertainty}} for {{Exponential Family Distributions}}},
shorttitle = {Natural {{Posterior Network}}},
author = {Charpentier, Bertrand and Borchert, Oliver and Zügner, Daniel and Geisler, Simon and Günnemann, Stephan},
date = {2022},
url = {https://openreview.net/forum?id=tV3N0DWMxCg},
urldate = {2022-04-28},
abstract = {Uncertainty awareness is crucial to develop reliable machine learning models. In this work, we propose the Natural Posterior Network (NatPN) for fast and high-quality uncertainty estimation for any...},
eventtitle = {International {{Conference}} on {{Learning Representations}} ({{ICLR}} 2022)},
langid = {english},
file = {/Users/fariedabuzaid/Zotero/storage/EJ3WJVXK/Charpentier et al. - 2022 - Natural Posterior Network Deep Bayesian Predictiv.pdf}
}
@article{chen_causal_2019,
title = {On Causal Discovery with an Equal-Variance Assumption},
author = {Chen, Wenyu and Drton, Mathias and Wang, Y. Samuel},
date = {2019-12-01},
journaltitle = {Biometrika},
shortjournal = {Biometrika},
volume = {106},
number = {4},
pages = {973--980},
publisher = {{Oxford Academic}},
issn = {0006-3444},
doi = {10.1093/biomet/asz049},
url = {https://academic.oup.com/biomet/article/106/4/973/5573229},
urldate = {2020-03-29},
abstract = {Prior work has shown that causal structure can be uniquely identified from observational data when these follow a structural equation model whose error terms have equal variances.},
langid = {english},
annotation = {citecount: 00002},
file = {/Users/fariedabuzaid/Zotero/storage/Z3GGMRSP/Chen et al. - 2019 - On causal discovery with an equal-variance assumpt.pdf}
}
@inproceedings{chen_does_2022,
title = {Does Your Graph Need a Confidence Boost? {{Convergent}} Boosted Smoothing on Graphs with Tabular Node Features},
shorttitle = {Does Your Graph Need a Confidence Boost?},
author = {Chen, Jiuhai and Mueller, Jonas and Ioannidis, Vassilis N. and Adeshina, Soji and Wang, Yangkun and Goldstein, Tom and Wipf, David},
date = {2022},
url = {https://openreview.net/forum?id=nHpzE7DqAnG},
urldate = {2022-04-28},
abstract = {Many practical modeling tasks require making predictions using tabular data composed of heterogeneous feature types (e.g., text-based, categorical, continuous, etc.). In this setting boosted...},
eventtitle = {International {{Conference}} on {{Learning Representations}} ({{ICLR}} 2022)},
langid = {english},
file = {/Users/fariedabuzaid/Zotero/storage/3UBQIY3X/Chen et al. - 2022 - Does your graph need a confidence boost Convergen.pdf}
}
@unpublished{chen_hydra_2021,
title = {{{HYDRA}}: {{Hypergradient Data Relevance Analysis}} for {{Interpreting Deep Neural Networks}}},
shorttitle = {{{HYDRA}}},
author = {Chen, Yuanyuan and Li, Boyang and Yu, Han and Wu, Pengcheng and Miao, Chunyan},
date = {2021-03-01},
eprint = {2102.02515},
eprinttype = {arxiv},
primaryclass = {cs},
url = {http://arxiv.org/abs/2102.02515},
urldate = {2021-03-21},
abstract = {The behaviors of deep neural networks (DNNs) are notoriously resistant to human interpretations. In this paper, we propose Hypergradient Data Relevance Analysis, or HYDRA, which interprets the predictions made by DNNs as effects of their training data. Existing approaches generally estimate data contributions around the final model parameters and ignore how the training data shape the optimization trajectory. By unrolling the hypergradient of test loss w.r.t. the weights of training data, HYDRA assesses the contribution of training data toward test data points throughout the training trajectory. In order to accelerate computation, we remove the Hessian from the calculation and prove that, under moderate conditions, the approximation error is bounded. Corroborating this theoretical claim, empirical results indicate the error is indeed small. In addition, we quantitatively demonstrate that HYDRA outperforms influence functions in accurately estimating data contribution and detecting noisy data labels. The source code is available at https://github.com/cyyever/aaai\_hydra\_8686.},
archiveprefix = {arXiv},
file = {/Users/fariedabuzaid/Zotero/storage/DRYFU363/Chen et al. - 2021 - HYDRA Hypergradient Data Relevance Analysis for I.pdf}
}
@inproceedings{chen_more_2020,
title = {More {{Data Can Expand The Generalization Gap Between Adversarially Robust}} and {{Standard Models}}},
booktitle = {Proceedings of the 37th {{International Conference}} on {{Machine Learning}}},
author = {Chen, Lin and Min, Yifei and Zhang, Mingrui and Karbasi, Amin},
date = {2020-11-21},
eprint = {2002.04725},
eprinttype = {arxiv},
pages = {1670--1680},
publisher = {{PMLR}},
issn = {2640-3498},
url = {https://proceedings.mlr.press/v119/chen20q.html},
urldate = {2022-02-12},
abstract = {Despite remarkable success in practice, modern machine learning models have been found to be susceptible to adversarial attacks that make human-imperceptible perturbations to the data, but result in serious and potentially dangerous prediction errors. To address this issue, practitioners often use adversarial training to learn models that are robust against such attacks at the cost of higher generalization error on unperturbed test sets. The conventional wisdom is that more training data should shrink the gap between the generalization error of adversarially-trained models and standard models. However, we study the training of robust classifiers for both Gaussian and Bernoulli models under $\ell_\infty$ attacks, and we prove that more data may actually increase this gap. Furthermore, our theoretical results identify if and when additional data will finally begin to shrink the gap. Lastly, we experimentally demonstrate that our results also hold for linear regression models, which may indicate that this phenomenon occurs more broadly.},
archiveprefix = {arXiv},
eventtitle = {International {{Conference}} on {{Machine Learning}}},
langid = {english},
file = {/Users/fariedabuzaid/Zotero/storage/MEC2LFNL/Chen et al. - 2020 - More Data Can Expand The Generalization Gap Betwee.pdf}
}
@inproceedings{chen_tamps2gcnets_2022,
title = {{{TAMP-S2GCNets}}: {{Coupling Time-Aware Multipersistence Knowledge Representation}} with {{Spatio-Supra Graph Convolutional Networks}} for {{Time-Series Forecasting}}},
shorttitle = {{{TAMP-S2GCNets}}},
author = {Chen, Yuzhou and Segovia-Dominguez, Ignacio and Coskunuzer, Baris and Gel, Yulia},
date = {2022},
url = {https://openreview.net/forum?id=wv6g8fWLX2q},
urldate = {2022-04-28},
abstract = {Graph Neural Networks (GNNs) are proven to be a powerful machinery for learning complex dependencies in multivariate spatio-temporal processes. However, most existing GNNs have inherently static...},
eventtitle = {International {{Conference}} on {{Learning Representations}} ({{ICLR}} 2022)},
langid = {english},
file = {/Users/fariedabuzaid/Zotero/storage/YCJP2YWT/Chen et al. - 2021 - TAMP-S2GCNets Coupling Time-Aware Multipersistenc.pdf;/Users/fariedabuzaid/Zotero/storage/RPGK7SH9/forum.html}
}
@inproceedings{chen_understanding_2022,
title = {Understanding and {{Improving Graph Injection Attack}} by {{Promoting Unnoticeability}}},
author = {Chen, Yongqiang and Yang, Han and Zhang, Yonggang and Kaili, Ma and Liu, Tongliang and Han, Bo and Cheng, James},
date = {2022},
url = {https://openreview.net/forum?id=wkMG8cdvh7-},
urldate = {2022-04-27},
abstract = {Recently Graph Injection Attack (GIA) emerges as a practical attack scenario on Graph Neural Networks (GNNs), where the adversary can merely inject few malicious nodes instead of modifying existing...},
eventtitle = {International {{Conference}} on {{Learning Representations}} ({{ICLR}} 2022)},
langid = {english},
file = {/Users/fariedabuzaid/Zotero/storage/MAU4LF2J/Chen et al. - 2021 - Understanding and Improving Graph Injection Attack.pdf;/Users/fariedabuzaid/Zotero/storage/46FUYQJK/forum.html}
}
@inproceedings{chen_unrestricted_2021,
title = {Unrestricted {{Adversarial Attacks}} on {{ImageNet Competition}}},
author = {Chen, Yuefeng and Mao, Xiaofeng and He, Yuan and Xue, Hui and Li, Chao and Dong, Yinpeng and Fu, Qi-An and Yang, Xiao and Xiang, Wenzhao and Pang, Tianyu and Su, Hang and Zhu, Jun and Liu, Fangcheng and Zhang, Chao and Zhang, Hongyang and Zhang, Yichi and Liu, Shilong and Liu, Chang and Xiang, Wenzhao and Wang, Yajie and Zhou, Huipeng and Lyu, Haoran and Xu, Yidan and Xu, Zixuan and Zhu, Taoyu and Li, Wenjun and Gao, Xianfeng and Wang, Guoqiu and Yan, Huanqian and Guo, Ying and Zhang, Chaoning and Fang, Zheng and Wang, Yang and Fu, Bingyang and Zheng, Yunfei and Wang, Yekui and Luo, Haorong and Yang, Zhen},
date = {2021-10-25},
eprint = {2110.09903},
eprinttype = {arxiv},
url = {http://arxiv.org/abs/2110.09903},
urldate = {2022-05-05},
abstract = {Many works have investigated adversarial attacks or defenses under settings where a bounded and imperceptible perturbation can be added to the input. However, in the real world the attacker does not need to comply with this restriction. In fact, more threats to the deep model come from unrestricted adversarial examples, that is, the attacker makes large and visible modifications on the image, which cause the model to classify mistakenly but do not affect normal observation from a human perspective. Unrestricted adversarial attack is a popular and practical direction but has not been studied thoroughly. We organize this competition with the purpose of exploring more effective unrestricted adversarial attack algorithms, so as to accelerate the academic research on model robustness under stronger unbounded attacks. The competition is held on the TianChi platform (https://tianchi.aliyun.com/competition/entrance/531853/introduction) as one of the series of the AI Security Challengers Program.},
archiveprefix = {arXiv},
eventtitle = {{{CVPR-2021}}},
langid = {english},
file = {/Users/fariedabuzaid/Zotero/storage/NJ84SV8L/Chen et al. - 2021 - Unrestricted Adversarial Attacks on ImageNet Compe.pdf}
}
@article{cheridito_secondorder_2007,
title = {Second-Order Backward Stochastic Differential Equations and Fully Nonlinear Parabolic {{PDEs}}},
author = {Cheridito, Patrick and Soner, H. Mete and Touzi, Nizar and Victoir, Nicolas},
date = {2007},
journaltitle = {Communications on Pure and Applied Mathematics},
volume = {60},
number = {7},
pages = {1081--1110},
issn = {1097-0312},
doi = {10.1002/cpa.20168},
url = {https://onlinelibrary.wiley.com/doi/abs/10.1002/cpa.20168},
urldate = {2021-03-17},
abstract = {For a d-dimensional diffusion of the form $dX_t = \mu(X_t)\,dt + \sigma(X_t)\,dW_t$ and continuous functions f and g, we study the existence and uniqueness of adapted processes Y, Z, $\Gamma$, and A solving the second-order backward stochastic differential equation (2BSDE) $$dY_t = f(t, X_t, Y_t, Z_t, \Gamma_t)\,dt + Z_t' \circ dX_t, \quad t \in [0,T),$$ $$dZ_t = A_t\,dt + \Gamma_t\,dX_t, \quad t \in [0,T),$$ $$Y_T = g(X_T).$$ If the associated PDE $$-v_t(t,x) + f(t, x, v(t,x), Dv(t,x), D^2v(t,x)) = 0,$$ $$(t,x) \in [0,T) \times \mathcal{R}^d, \quad v(T,x) = g(x),$$ has a sufficiently regular solution, then it follows directly from Itô's formula that the processes $$v(t,X_t),\; Dv(t,X_t),\; D^2v(t,X_t),\; \mathcal{L} Dv(t,X_t), \quad t \in [0,T],$$ solve the 2BSDE, where $\mathcal{L}$ is the Dynkin operator of X without the drift term. The main result of the paper shows that if f is Lipschitz in Y as well as decreasing in $\Gamma$ and the PDE satisfies a comparison principle as in the theory of viscosity solutions, then the existence of a solution (Y, Z, $\Gamma$, A) to the 2BSDE implies that the associated PDE has a unique continuous viscosity solution v and the process Y is of the form $Y_t = v(t, X_t)$, $t \in [0, T]$. In particular, the 2BSDE has at most one solution. This provides a stochastic representation for solutions of fully nonlinear parabolic PDEs. As a consequence, the numerical treatment of such PDEs can now be approached by Monte Carlo methods. © 2006 Wiley Periodicals, Inc.},
langid = {english},
annotation = {\_eprint: https://onlinelibrary.wiley.com/doi/pdf/10.1002/cpa.20168},
file = {/Users/fariedabuzaid/Zotero/storage/EPLE6XMZ/Cheridito et al. - 2007 - Second-order backward stochastic differential equa.pdf}
}
@article{chickering_optimal_2002,
title = {Optimal {{Structure Identification With Greedy Search}}},
author = {Chickering, David Maxwell},
date = {2002},
journaltitle = {Journal of Machine Learning Research},
volume = {3},
pages = {507--554},
issn = {1533-7928},
url = {https://www.jmlr.org/papers/v3/chickering02b.html},
urldate = {2021-02-08},
abstract = {In this paper we prove the so-called "Meek Conjecture". In particular, we show that if a DAG H is an independence map of another DAG G, then there exists a finite sequence of edge additions and covered edge reversals in G such that (1) after each edge modification H remains an independence map of G and (2) after all modifications G =H. As shown by Meek (1997), this result has an important consequence for Bayesian approaches to learning Bayesian networks from data: in the limit of large sample size, there exists a two-phase greedy search algorithm that---when applied to a particular sparsely-connected search space---provably identifies a perfect map of the generative distribution if that perfect map is a DAG. We provide a new implementation of the search space, using equivalence classes as states, for which all operators used in the greedy search can be scored efficiently using local functions of the nodes in the domain. Finally, using both synthetic and real-world datasets, we demonstrate that the two-phase greedy approach leads to good solutions when learning with finite sample sizes.},
issue = {Nov},
langid = {english},
file = {/Users/fariedabuzaid/Zotero/storage/EB59B2ZV/Chickering - Erratum Optimal Structure Identification With Gree.pdf;/Users/fariedabuzaid/Zotero/storage/XGHCMFM6/Chickering - 2002 - Optimal Structure Identification With Greedy Searc.pdf}
}
@inproceedings{chien_node_2022,
title = {Node {{Feature Extraction}} by {{Self-Supervised Multi-scale Neighborhood Prediction}}},
author = {Chien, Eli and Chang, Wei-Cheng and Hsieh, Cho-Jui and Yu, Hsiang-Fu and Zhang, Jiong and Milenkovic, Olgica and Dhillon, Inderjit S.},
date = {2022},
url = {https://openreview.net/forum?id=KJggliHbs8},
urldate = {2022-04-28},
abstract = {Learning on graphs has attracted significant attention in the learning community due to numerous real-world applications. In particular, graph neural networks (GNNs), which take \emph{numerical}...},
eventtitle = {International {{Conference}} on {{Learning Representations}} ({{ICLR}} 2022)},
langid = {english},
file = {/Users/fariedabuzaid/Zotero/storage/37PLN8NL/Chien et al. - 2022 - Node Feature Extraction by Self-Supervised Multi-s.pdf}
}
@inproceedings{chien_you_2022,
title = {You Are {{AllSet}}: {{A Multiset Function Framework}} for {{Hypergraph Neural Networks}}},
shorttitle = {You Are {{AllSet}}},
author = {Chien, Eli and Pan, Chao and Peng, Jianhao and Milenkovic, Olgica},
date = {2022},
url = {https://openreview.net/forum?id=hpBTIv2uy_E},
urldate = {2022-04-28},
abstract = {Hypergraphs are used to model higher-order interactions amongst agents and there exist many practically relevant instances of hypergraph datasets. To enable the efficient processing of hypergraph...},
eventtitle = {International {{Conference}} on {{Learning Representations}} ({{ICLR}} 2022)},
langid = {english},
file = {/Users/fariedabuzaid/Zotero/storage/PQQG5JAZ/Chien et al. - 2022 - You are AllSet A Multiset Function Framework for .pdf}
}
@article{clifton_novelty_2011,
title = {Novelty {{Detection}} with {{Multivariate Extreme Value Statistics}}},
author = {Clifton, David Andrew and Hugueny, Samuel and Tarassenko, Lionel},
date = {2011-12},
journaltitle = {Journal of Signal Processing Systems},
shortjournal = {J Sign Process Syst},
volume = {65},
number = {3},
pages = {371--389},
issn = {1939-8018, 1939-8115},
doi = {10.1007/s11265-010-0513-6},
url = {http://link.springer.com/10.1007/s11265-010-0513-6},
urldate = {2021-09-02},
abstract = {Novelty detection, or one-class classification, aims to determine if data are “normal” with respect to some model of normality constructed using examples of normal system behaviour. If that model is composed of generative probability distributions, the extent of “normality” in the data space can be described using Extreme Value Theory (EVT), a branch of statistics concerned with describing the tails of distributions. This paper demonstrates that existing approaches to the use of EVT for novelty detection are appropriate only for univariate, unimodal problems. We generalise the use of EVT for novelty detection to the analysis of data with multivariate, multimodal distributions, allowing a principled approach to the analysis of high-dimensional data to be taken. Examples are provided using vital-sign data obtained from a large clinical study of patients in a high-dependency hospital ward.},
langid = {english}
}
@inproceedings{coggins_wattle_1994,
title = {{{WATTLE}}: {{A Trainable Gain Analogue VLSI Neural Network}}},
shorttitle = {{{WATTLE}}},
booktitle = {Advances in {{Neural Information Processing Systems}}},
author = {Coggins, Richard and Jabri, Marwan},
editor = {Cowan, J. and Tesauro, G. and Alspector, J.},
date = {1994},
volume = {6},
publisher = {{Morgan-Kaufmann}},
url = {https://proceedings.neurips.cc/paper/1993/file/ccb0989662211f61edae2e26d58ea92f-Paper.pdf},
urldate = {2021-03-16}
}
@article{cohen_feature_2007,
title = {Feature {{Selection}} via {{Coalitional Game Theory}}},
author = {Cohen, Shay and Dror, Gideon and Ruppin, Eytan},
date = {2007-07},
journaltitle = {Neural computation},
volume = {19},
number = {7},
pages = {1939--1961},
doi = {10.1162/neco.2007.19.7.1939},
url = {https://www.mitpressjournals.org/doi/abs/10.1162/neco.2007.19.7.1939},
urldate = {2020-11-02},
abstract = {We present and study the contribution-selection algorithm (CSA), a novel algorithm for feature selection. The algorithm is based on the multiperturbation Shapley analysis (MSA), a framework that relies on game theory to estimate usefulness. The algorithm iteratively estimates the usefulness of features and selects them accordingly, using either forward selection or backward elimination. It can optimize various performance measures over unseen data such as accuracy, balanced error rate, and area under receiver-operator-characteristic curve. Empirical comparison with several other existing feature selection methods shows that the backward elimination variant of CSA leads to the most accurate classification results on an array of data sets.},
langid = {english},
file = {/Users/fariedabuzaid/Zotero/storage/YKELFDRT/Cohen et al. - 2007 - Feature Selection via Coalitional Game Theory.pdf}
}
@book{coles_introduction_2013,
title = {An {{Introduction}} to {{Statistical Modeling}} of {{Extreme Values}}.},
author = {Coles, Stuart},
date = {2013},
publisher = {{Springer London, Limited}},
location = {{London}},
url = {https://public.ebookcentral.proquest.com/choice/publicfullrecord.aspx?p=5587226},
urldate = {2021-09-06},
abstract = {Directly oriented towards real practical application, this book develops the basic theoretical framework of extreme value models and the statistical inference techniques for using these models in practice. Intended for statisticians and non-statisticians alike, the theoretical treatment is elementary, with heuristics often replacing detailed mathematical proof. Most aspects of extreme modeling techniques are covered, including historical techniques (still widely used) and contemporary techniques based on point process models. A wide range of worked examples, using genuine datasets, illustrate the various modeling procedures and a concluding chapter provides a brief introduction to a number of more advanced topics, including Bayesian inference and spatial extremes. All the computations are carried out using S-PLUS, and the corresponding datasets and functions are available via the internet for readers to recreate examples for themselves. An essential reference for students and researchers in statistics and disciplines such as engineering, finance and environmental science, this book will also appeal to practitioners looking for practical help in solving real problems. Stuart Coles is Reader in Statistics at the University of Bristol, U.K., having previously lectured at the universities of Nottingham and Lancaster. In 1992 he was the first recipient of the Royal Statistical Society's research prize. He has published widely in the statistical literature, principally in the area of extreme value modeling.},
isbn = {978-1-4471-3675-0},
langid = {english},
annotation = {OCLC: 1066184729},
file = {/Users/fariedabuzaid/Zotero/storage/EDRTCQYC/Coles - 2013 - An Introduction to Statistical Modeling of Extreme.pdf}
}
@article{colombo_learning_2012,
title = {Learning High-Dimensional Directed Acyclic Graphs with Latent and Selection Variables},
author = {Colombo, Diego and Maathuis, Marloes H. and Kalisch, Markus and Richardson, Thomas S.},
date = {2012-02},
journaltitle = {The Annals of Statistics},
volume = {40},
number = {1},
pages = {294--321},
publisher = {{Institute of Mathematical Statistics}},
issn = {0090-5364, 2168-8966},