-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathUAI2024.bib
2119 lines (1899 loc) · 292 KB
/
UAI2024.bib
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
@Proceedings{UAI-2024,
booktitle = {Proceedings of the Fortieth Conference on Uncertainty in Artificial Intelligence},
name = {Uncertainty in Artificial Intelligence},
shortname = {UAI},
year = {2024},
editor = {Kiyavash, Negar and Mooij, Joris M.},
volume = {244},
start = {2024-07-15},
end = {2024-07-19},
published = {2024-09-12},
address = {Universitat Pompeu Fabra, Barcelona, Spain},
conference_url = {https://www.auai.org/uai2024/},
conference_number = {40},
sections = {Preface|Papers},
}
@InProceedings{kiyavash24a,
title = {Proceedings of the Fortieth Conference on Uncertainty in Artificial Intelligence -- Preface},
author = {Kiyavash, Negar and Mooij, Joris M.},
pages = {i-xi},
abstract = {The Conference on Uncertainty in Artificial Intelligence (UAI) is one of the premier international conferences on research related to knowledge representation, learning, and reasoning in the presence of uncertainty. UAI is supported by the Association for Uncertainty in Artificial Intelligence (AUAI). The 40th edition of the conference (UAI 2024) took place from July 15 to 19, 2024. It was held at the Universitat Pompeu Fabra, Barcelona, Spain.},
section = {Preface},
}
@InProceedings{an24a,
title = {Convergence Behavior of an Adversarial Weak Supervision Method},
author = {An, Steven and Dasgupta, Sanjoy},
pages = {1-49},
abstract = {Labeling data via rules-of-thumb and minimal label supervision is central to Weak Supervision, a paradigm subsuming subareas of machine learning such as crowdsourced learning and semi-supervised ensemble learning. By using this labeled data to train modern machine learning methods, the cost of acquiring large amounts of hand labeled data can be ameliorated. Approaches to combining the rules-of-thumb falls into two camps, reflecting different ideologies of statistical estimation. The most common approach, exemplified by the Dawid-Skene model, is based on probabilistic modeling. The other, developed in the work of Balsubramani-Freund and others, is adversarial and game-theoretic. We provide a variety of statistical results for the adversarial approach under log-loss: we characterize the form of the solution, relate it to logistic regression, demonstrate consistency, and give rates of convergence. On the other hand, we find that probabilistic approaches for the same model class can fail to be consistent. Experimental results are provided to corroborate the theoretical results.},
openreview = {q9TqTSk9cy},
software = {https://github.com/stevenan5/balsubramani-freund-uai-2024},
section = {Papers},
}
@InProceedings{anderka24a,
title = {Iterated INLA for State and Parameter Estimation in Nonlinear Dynamical Systems},
author = {Anderka, Rafael and Deisenroth, Marc Peter and Takao, So},
pages = {50-76},
abstract = {Data assimilation (DA) methods use priors arising from differential equations to robustly interpolate and extrapolate data. Popular techniques such as ensemble methods that handle high-dimensional, nonlinear PDE priors focus mostly on state estimation, however can have difficulty learning the parameters accurately. On the other hand, machine learning based approaches can naturally learn the state and parameters, but their applicability can be limited, or produce uncertainties that are hard to interpret. Inspired by the Integrated Nested Laplace Approximation (INLA) method in spatial statistics, we propose an alternative approach to DA based on iteratively linearising the dynamical model. This produces a Gaussian Markov random field at each iteration, enabling one to use INLA to infer the state and parameters. Our approach can be used for arbitrary nonlinear systems, while retaining interpretability, and is furthermore demonstrated to outperform existing methods on the DA task. By providing a more nuanced approach to handling nonlinear PDE priors, our methodology offers improved accuracy and robustness in predictions, especially where data sparsity is prevalent.},
openreview = {dQUdw0lEfA},
software = {https://github.com/rafaelanderka/iter-inla},
section = {Papers},
}
@InProceedings{ao24a,
title = {CSS: Contrastive Semantic Similarities for Uncertainty Quantification of LLMs},
author = {Ao, Shuang and Rueger, Stefan and Siddharthan, Advaith},
pages = {77-87},
abstract = {Despite the impressive capability of large language models (LLMs), knowing when to trust their generations remains an open challenge. The recent literature on uncertainty quantification of natural language generation (NLG) utilizes a conventional natural language inference (NLI) classifier to measure the semantic dispersion of LLMs responses. These studies employ logits of NLI classifier for semantic clustering to estimate uncertainty. However, logits represent the probability of the predicted class and barely contain feature information for potential clustering. Alternatively, CLIP (Contrastive Language{\textendash}Image Pre-training) performs impressively in extracting image-text pair features and measuring their similarity. To extend its usability, we propose Contrastive Semantic Similarity, the CLIP-based feature extraction module to obtain similarity features for measuring uncertainty for text pairs. We apply this method to selective NLG, which detects and rejects unreliable generations for better trustworthiness of LLMs. We conduct extensive experiments with three LLMs on several benchmark question-answering datasets with comprehensive evaluation metrics. Results show that our proposed method performs better in estimating reliable responses of LLMs than comparable baselines.},
openreview = {k0Fy2SPwtc},
software = {https://github.com/AoShuang92/css_uq_llms},
section = {Papers},
}
@InProceedings{aouali24a,
title = {Unified PAC-Bayesian Study of Pessimism for Offline Policy Learning with Regularized Importance Sampling},
author = {Aouali, Imad and Brunel, Victor-Emmanuel and Rohde, David and Korba, Anna},
pages = {88-109},
abstract = {Off-policy learning (OPL) often involves minimizing a risk estimator based on importance weighting to correct bias from the logging policy used to collect data. However, this method can produce an estimator with a high variance. A common solution is to regularize the importance weights and learn the policy by minimizing an estimator with penalties derived from generalization bounds specific to the estimator. This approach, known as pessimism, has gained recent attention but lacks a unified framework for analysis. To address this gap, we introduce a comprehensive PAC-Bayesian framework to examine pessimism with regularized importance weighting. We derive a tractable PAC-Bayesian generalization bound that universally applies to common importance weight regularizations, enabling their comparison within a single framework. Our empirical results challenge common understanding, demonstrating the effectiveness of standard IW regularization techniques.},
openreview = {d7W4H0sTXU},
software = {https://github.com/imadaouali/unified-pessimism-opl},
section = {Papers},
}
@InProceedings{arnez24a,
title = {Latent Representation Entropy Density for Distribution Shift Detection},
author = {Arnez, Fabio and Montoya Vasquez, Daniel Alfonso and Radermacher, Ansgar and Terrier, Fran\c{c}ois},
pages = {110-137},
abstract = {Distribution shift detection is paramount in safety-critical tasks that rely on Deep Neural Networks (DNNs). The detection task entails deriving a confidence score to assert whether a new input sample aligns with the training data distribution of the DNN model. While DNN predictive uncertainty offers an intuitive confidence measure, exploring uncertainty-based distribution shift detection with simple sample-based techniques has been relatively overlooked in recent years due to computational overhead and lower performance than plain post-hoc methods. This paper proposes using simple sample-based techniques for estimating uncertainty and employing the entropy density from intermediate representations to detect distribution shifts. We demonstrate the effectiveness of our method using standard benchmark datasets for out-of-distribution detection and across different common perception tasks with convolutional neural network architectures. Our scope extends beyond classification, encompassing image-level distribution shift detection for object detection and semantic segmentation tasks. Our results show that our method's performance is comparable to existing <em>State-of-the-Art</em> methods while being computationally faster and lighter than other Bayesian approaches, affirming its practical utility.},
openreview = {1CKLfh3Ge7},
software = {https://github.com/CEA-LIST/LaREx},
section = {Papers},
}
@InProceedings{askin24a,
title = {FedAST: Federated Asynchronous Simultaneous Training},
author = {Askin, Baris and Sharma, Pranay and Joe-Wong, Carlee and Joshi, Gauri},
pages = {138-172},
abstract = {Federated Learning (FL) enables edge devices or clients to collaboratively train machine learning (ML) models without sharing their private data. Much of the existing work in FL focuses on efficiently learning a model for a single task. In this paper, we study simultaneous training of multiple FL models using a common set of clients. The few existing simultaneous training methods employ synchronous aggregation of client updates, which can cause significant delays because large models and/or slow clients can bottleneck the aggregation. On the other hand, a naive asynchronous aggregation is adversely affected by stale client updates. We propose FedAST, a buffered asynchronous federated simultaneous training algorithm that overcomes bottlenecks from slow models and adaptively allocates client resources across heterogeneous tasks. We provide theoretical convergence guarantees for FedAST for smooth non-convex objective functions. Extensive experiments over multiple real-world datasets demonstrate that our proposed method outperforms existing simultaneous FL approaches, achieving up to 46.0% reduction in time to train multiple tasks to completion.},
openreview = {yfQEtBP988},
software = {https://github.com/askinb/FedAST/tree/main},
section = {Papers},
}
@InProceedings{assaad24a,
title = {Identifiability of total effects from abstractions of time series causal graphs},
author = {Assaad, Charles K. and Devijver, Emilie and Gaussier, Eric and Goessler, Gregor and Meynaoui, Anouar},
pages = {173-185},
abstract = {We study the problem of identifiability of the total effect of an intervention from observational time series only given an abstraction of the causal graph of the system. Specifically, we consider two types of abstractions: the extended summary causal graph which conflates all lagged causal relations but distinguishes between lagged and instantaneous relations; and the summary causal graph which does not give any indication about the lag between causal relations. We show that the total effect is always identifiable in extended summary causal graphs and we provide necessary and sufficient graphical conditions for identifiability in summary causal graphs. Furthermore, we provide adjustment sets allowing to estimate the total effect whenever it is identifiable.},
openreview = {lL4EzE8bY8},
section = {Papers},
}
@InProceedings{auricchio24a,
title = {On the Capacitated Facility Location Problem with Scarce Resources},
author = {Auricchio, Gennaro and Clough, Harry J. and Zhang, Jie},
pages = {186-202},
abstract = {This paper investigates the Mechanism Design aspects of the $m$-Capacitated Facility Location Problem where the total facility capacity is lower than the number of agents. Following \cite{aziz2020capacity}, the Social Welfare of the facility location is determined through a First-Come-First-Served (FCFS) game where agents compete after the facility positions are established. When the number of facilities is $m>1$, the Nash Equilibrium (NE) of the FCFS game is not unique, thus the utility of the agents and the notion of truthfulness are not well-defined. To address these issues, we consider absolutely truthful mechanisms, i.e. mechanisms able to prevent agents from misreporting regardless of the strategies played during the FCFS game. We pair this more stringent truthfulness requirement with the notion of Equilibrium Stable (ES) mechanism, i.e. mechanisms whose Social Welfare does not depend on the NE of the FCFS game. We show that the class of percentile mechanisms is absolutely truthful and characterize under which conditions they are ES. We then show that the approximation ratio of each ES percentile mechanism is bounded and determine its value. Notably, when all the facilities have the same capacity and the number of agents is large enough, it is possible to achieve an approximation ratio smaller than $1+\frac{1}{2m-1}$. We enhance our findings by empirically evaluating the mechanisms' performances when agents' true positions follows a distribution.},
openreview = {3tH77YyFTs},
section = {Papers},
}
@InProceedings{azizmalayeri24a,
title = {Mitigating Overconfidence in Out-of-Distribution Detection by Capturing Extreme Activations},
author = {Azizmalayeri, Mohammad and Abu-Hanna, Ameen and Cin\`a, Giovanni},
pages = {203-224},
abstract = {Detecting out-of-distribution (OOD) instances is crucial for the reliable deployment of machine learning models in real-world scenarios.
OOD inputs are commonly expected to cause a more uncertain prediction in the primary task; however, there are OOD cases for which the model returns a highly confident prediction. This phenomenon, denoted as "overconfidence", presents a challenge to OOD detection.
Specifically, theoretical evidence indicates that overconfidence is an intrinsic property of certain neural network architectures, leading to poor OOD detection. In this work, we address this issue by measuring extreme activation values in the penultimate layer of neural networks and then leverage this proxy of overconfidence to improve on several OOD detection baselines. We test our method on a wide array of experiments spanning synthetic data and real-world data, tabular and image datasets, multiple architectures such as ResNet and Transformer, different training loss functions, and include the scenarios examined in previous theoretical work. Compared to the baselines, our method often grants substantial improvements, with double-digit increases in OOD detection AUC, and it does not damage performance in any scenario.},
openreview = {nwf2mKQVhP},
software = {https://github.com/mazizmalayeri/CEA},
section = {Papers},
}
@InProceedings{azzolini24a,
title = {Inference in Probabilistic Answer Set Programs with Imprecise Probabilities via Optimization},
author = {Azzolini, Damiano and Riguzzi, Fabrizio},
pages = {225-234},
abstract = {Probabilistic answer set programming has recently been extended to manage imprecise probabilities by means of credal probabilistic facts and credal annotated disjunctions.
This increases the expressivity of the language but, at the same time, the cost of inference.
In this paper, we cast inference in probabilistic answer set programs with credal probabilistic facts and credal annotated disjunctions as a constrained nonlinear optimization problem where the function to optimize is obtained via knowledge compilation.
Empirical results on different datasets with multiple configurations shows the effectiveness of our approach.},
openreview = {h5VFqO681Y},
software = {https://github.com/damianoazzolini/aspmc},
section = {Papers},
}
@InProceedings{bai24a,
title = {Differentially Private No-regret Exploration in Adversarial Markov Decision Processes},
author = {Bai, Shaojie and Zeng, Lanting and Zhao, Chengcheng and Duan, Xiaoming and Sadegh Talebi, Mohammad and Cheng, Peng and Chen, Jiming},
pages = {235-272},
abstract = {We study learning adversarial Markov decision process (MDP) in the episodic setting under the constraint of differential privacy (DP).
This is motivated by the widespread applications of reinforcement learning (RL) in non-stationary and even adversarial scenarios, where protecting users' sensitive information is vital.
We first propose two efficient frameworks for adversarial MDPs, spanning full-information and bandit settings.
Within each framework, we consider both Joint DP (JDP), where a central agent is trusted to protect the sensitive data, and Local DP (LDP), where the information is protected directly on the user side.
Then, we design novel privacy mechanisms to privatize the stochastic transition and adversarial losses.
By instantiating such privacy mechanisms to satisfy JDP and LDP requirements, we obtain near-optimal regret guarantees for both frameworks.
To our knowledge, these are the first algorithms to tackle the challenge of private learning in adversarial MDPs.},
openreview = {foNKGt20YE},
section = {Papers},
}
@InProceedings{bajgar24a,
title = {Walking the Values in Bayesian Inverse Reinforcement Learning},
author = {Bajgar, Ondrej and Abate, Alessandro and Gatsis, Konstantinos and Osborne, Michael},
pages = {273-287},
abstract = {The goal of Bayesian inverse reinforcement learning (IRL) is recovering a posterior distribution over reward functions using a set of demonstrations from an expert optimizing for a reward unknown to the learner. The resulting posterior over rewards can then be used to synthesize an apprentice policy that performs well on the same or a similar task.
A key challenge in Bayesian IRL is bridging the computational gap between the hypothesis space of possible rewards and the likelihood, often defined in terms of Q values: vanilla Bayesian IRL needs to solve the costly forward planning problem -- going from rewards to the Q values -- at every step of the algorithm, which may need to be done thousands of times. We propose to solve this by a simple change: instead of focusing on primarily sampling in the space of rewards, we can focus on primarily working in the space of Q-values, since the computation required to go from Q-values to reward is radically cheaper. Furthermore, this reversion of the computation makes it easy to compute the gradient allowing efficient sampling using Hamiltonian Monte Carlo. We propose ValueWalk -- a new Markov chain Monte Carlo method based on this insight -- and illustrate its advantages on several tasks.},
openreview = {48SI6DOqUH},
section = {Papers},
}
@InProceedings{balcan24a,
title = {Learning Accurate and Interpretable Decision Trees},
author = {Balcan, Maria-Florina and Sharma, Dravyansh},
pages = {288-307},
abstract = {Decision trees are a popular tool in machine learning and yield easy-to-understand models. Several techniques have been proposed in the literature for learning a decision tree classifier, with different techniques working well for data from different domains. In this work, we develop approaches to design decision tree learning algorithms given repeated access to data from the same domain. We propose novel parameterized classes of node splitting criteria in top-down algorithms, which interpolate between popularly used entropy and Gini impurity based criteria, and provide theoretical bounds on the number of samples needed to learn the splitting function appropriate for the data at hand. We also study the sample complexity of tuning prior parameters in Bayesian decision tree learning, and extend our results to decision tree regression. We further consider the problem of tuning hyperparameters in pruning the decision tree for classical pruning algorithms including min-cost complexity pruning. We also study the interpretability of the learned decision trees and introduce a data-driven approach for optimizing the explainability versus accuracy trade-off using decision trees. Finally, we demonstrate the significance of our approach on real world datasets by learning data-specific decision trees which are simultaneously more accurate and interpretable.},
openreview = {skdlnUYRzQ},
section = {Papers},
}
@InProceedings{bellot24a,
title = {Towards Bounding Causal Effects under Markov Equivalence},
author = {Bellot, Alexis},
pages = {308-332},
abstract = {Predicting the effect of unseen interventions is a fundamental research question across the data sciences. It is well established that in general such questions cannot be answered definitively from observational data. This realization has fuelled a growing literature introducing various identifying assumptions, for example in the form of a causal diagram among relevant variables. In practice, this paradigm is still too rigid for many practical applications as it is generally not possible to confidently delineate the true causal diagram. In this paper, we consider the derivation of bounds on causal effects given only observational data. We propose to take as input a less informative structure known as a Partial Ancestral Graph, which represents a Markov equivalence class of causal diagrams and is learnable from data. In this more ``data-driven'' setting, we provide a systematic algorithm to derive bounds on causal effects that exploit the invariant properties of the equivalence class, and that can be computed analytically. We demonstrate our method with synthetic and real data examples.},
openreview = {xY33abx46a},
section = {Papers},
}
@InProceedings{benavoli24a,
title = {Linearly Constrained Gaussian Processes are SkewGPs: application to Monotonic Preference Learning and Desirability},
author = {Benavoli, Alessio and Azzimonti, Dario},
pages = {333-348},
abstract = {We show that existing approaches to Linearly Constrained Gaussian Processes (LCGP) for regression, based on imposing constraints on a finite set of operational points, can be seen as Skew Gaussian Processes (SkewGPs). In particular, focusing on inequality constraints and building upon a recent unification of regression, classification, and preference learning through SkewGPs, we extend LCGP to handle monotonic preference learning and desirability, crucial for understanding and predicting human decision making. We demonstrate the efficacy of the proposed model on simulated and real data.},
openreview = {CJqN5503XZ},
section = {Papers},
}
@InProceedings{berke24a,
title = {MetaCOG: A Heirarchical Probabilistic Model for Learning Meta-Cognitive Visual Representations},
author = {Berke, Marlene and Azerbayev, Zhangir and Belledonne, Mario and Tavares, Zenna and Jara-Ettinger, Julian},
pages = {349-359},
abstract = {Humans have the capacity to question what we see and to recognize when our vision is unreliable (e.g., when we realize that we are experiencing a visual illusion). Inspired by this capacity, we present MetaCOG: a hierarchical probabilistic model that can be attached to a neural object detector to monitor its outputs and determine their reliability. MetaCOG achieves this by learning a probabilistic model of the object detector's performance via Bayesian inference{\textemdash}i.e., a meta-cognitive representation of the network's propensity to hallucinate or miss different object categories. Given a set of video frames processed by an object detector, MetaCOG performs joint inference over the underlying 3D scene and the detector's performance, grounding inference on a basic assumption of object permanence. Paired with three neural object detectors, we show that MetaCOG accurately recovers each detector's performance parameters and improves the overall system's accuracy. We additionally show that MetaCOG is robust to varying levels of error in object detector outputs, showing proof-of-concept for a novel approach to the problem of detecting and correcting errors in vision systems when ground-truth is not available.},
openreview = {hESlov4NHG},
software = {https://github.com/marleneberke/MetaCOG/tree/main},
video = {https://github.com/marleneberke/MetaCOG/tree/main/demos},
section = {Papers},
}
@InProceedings{berry24a,
title = {Shedding Light on Large Generative Networks: Estimating Epistemic Uncertainty in Diffusion Models},
author = {Berry, Lucas and Brando, Axel and Meger, David},
pages = {360-376},
abstract = {Generative diffusion models, notable for their large parameter count (exceeding 100 million) and operation within high-dimensional image spaces, pose significant challenges for traditional uncertainty estimation methods due to computational demands. In this work, we introduce an innovative framework, Diffusion Ensembles for Capturing Uncertainty (DECU), designed for estimating epistemic uncertainty for diffusion models. The DECU framework introduces a novel method that efficiently trains ensembles of conditional diffusion models by incorporating a static set of pre-trained parameters, drastically reducing the computational burden and the number of parameters that require training. Additionally, DECU employs Pairwise-Distance Estimators (PaiDEs) to accurately measure epistemic uncertainty by evaluating the mutual information between model outputs and weights in high-dimensional spaces. The effectiveness of this framework is demonstrated through experiments on the ImageNet dataset, highlighting its capability to capture epistemic uncertainty, specifically in under-sampled image classes.},
openreview = {512IkGDqA8},
software = {https://github.com/nwaftp23/DECU},
section = {Papers},
}
@InProceedings{betzer24a,
title = {Publishing Number of Walks and Katz Centrality under Local Differential Privacy},
author = {Betzer, Louis and Suppakitpaisarn, Vorapong and Hillebrand, Quentin},
pages = {377-393},
abstract = {In our study, we present an algorithm for publishing the count of walks and Katz centrality under local differential privacy (LDP), complemented by a comprehensive theoretical analysis. While previous research in LDP has predominantly focused on counting subgraphs with a maximum of five nodes, our work extends this to larger subgraphs. The primary challenge in such an extension lies in managing the exponentially increasing noise associated with LDP as the size of the subgraph grows. Our solution involves an algorithm for publishing the count of walks originating from each node in the graph, which subsequently enables us to publish the Katz centrality of all nodes. This algorithm incorporates multiple communication rounds and employs a clipping technique. Through both theoretical and empirical evaluation, we demonstrate that our algorithm achieves has a relatively small bias and variance, showing significant improvements over both the randomized response method and non-clipping algorithms. Additionally, our approach to estimating Katz centrality successfully identifies up to 90\% of the nodes with the highest centrality values.},
openreview = {76UkTmdmkB},
software = {https://github.com/louisbetzer/Katz-Centrality-Local-Differential-Privacy},
video = {https://www.youtube.com/watch?v=CglDgS4fRYA},
section = {Papers},
}
@InProceedings{bowyer24a,
title = {Using Autodiff to Estimate Posterior Moments, Marginals and Samples},
author = {Bowyer, Sam and Heap, Thomas and Aitchison, Laurence},
pages = {394-417},
abstract = {Importance sampling is a popular technique in Bayesian inference: by reweighting samples drawn from a proposal distribution we are able to obtain samples and moment estimates from a Bayesian posterior over latent variables. Recent work, however, indicates that importance sampling scales poorly --- in order to accurately approximate the true posterior, the required number of importance samples grows is exponential in the number of latent variables [Chatterjee and Diaconis, 2018]. Massively parallel importance sampling works around this issue by drawing $K$ samples for each of the $n$ latent variables and reasoning about all $K^n$ combinations of latent samples. In principle, we can reason efficiently over $K^n$ combinations of samples by exploiting conditional independencies in the generative model. However, in practice this requires complex algorithms that traverse backwards through the graphical model, and we need separate backward traversals for each computation (posterior expectations, marginals and samples). Our contribution is to exploit the source term trick from physics to entirely avoid the need to hand-write backward traversals. Instead, we demonstrate how to simply and easily compute all the required quantities --- posterior expectations, marginals and samples --- by differentiating through a slightly modified marginal likelihood estimator.},
openreview = {QUMZJgrjN0},
software = {https://github.com/sambowyer/MPIS},
section = {Papers},
}
@InProceedings{broadrick24a,
title = {Polynomial Semantics of Tractable Probabilistic Circuits},
author = {Broadrick, Oliver and Zhang, Honghua and Van den Broeck, Guy},
pages = {418-429},
abstract = {Probabilistic circuits compute multilinear polynomials that represent probability distributions. They are tractable models that support efficient marginal inference. However, various polynomial semantics have been considered in the literature (e.g., network polynomials, likelihood polynomials, generating functions, Fourier transforms, and characteristic polynomials). The relationships between these polynomial encodings of distributions is largely unknown. In this paper, we prove that for binary distributions, each of these probabilistic circuit models is equivalent in the sense that any circuit for one of them can be transformed into a circuit for any of the others with only a polynomial increase in size. They are therefore all tractable for marginal inference on the same class of distributions. Finally, we explore the natural extension of one such polynomial semantics, called probabilistic generating circuits, to categorical random variables, and establish that marginal inference becomes #P-hard.},
openreview = {LKgqKbzZKA},
section = {Papers},
}
@InProceedings{buchholz24a,
title = {Products, Abstractions and Inclusions of Causal Spaces},
author = {Buchholz, Simon and Park, Junhyung and Sch\"olkopf, Bernhard},
pages = {430-449},
abstract = {Causal spaces have recently been introduced as a measure-theoretic framework to encode the notion of causality. While it has some advantages over established frameworks, such as structural causal models, the theory is so far only developed for single causal spaces. In many mathematical theories, not least the theory of probability spaces of which causal spaces are a direct extension, combinations of objects and maps between objects form a central part. In this paper, taking inspiration from such objects in probability theory, we propose the definitions of products of causal spaces, as well as (stochastic) transformations between causal spaces. In the context of causality, these quantities can be given direct semantic interpretations as causally independent components, abstractions and extensions.},
openreview = {QURqCXNyjM},
section = {Papers},
}
@InProceedings{bui24a,
title = {Revisiting Kernel Attention with Correlated Gaussian Process Representation},
author = {Bui, Long Minh and Tran Huu, Tho and Dinh, Duy and Nguyen, Tan Minh and Hoang, Trong Nghia},
pages = {450-470},
abstract = {Transformers have increasingly become the de facto method to model sequential data with state-of-the-art performance. Due to its widespread use, being able to estimate and calibrate its modeling uncertainty is important to understand and design robust transformer models. To achieve this, previous works have used Gaussian processes (GPs) to perform uncertainty calibration for the attention units of transformers and attained notable successes. However, such approaches have to confine the transformers to the space of symmetric attention to ensure the necessary symmetric requirement of their GP's kernel specification, which reduces the representation capacity of the model. To mitigate this restriction, we propose the Correlated Gaussian Process Transformer (CGPT), a new class of transformers whose self-attention units are modeled as cross-covariance between two correlated GPs (CGPs). This allows asymmetries in attention and can enhance the representation capacity of GP-based transformers. We also derive a sparse approximation for CGP to make it scale better. Our empirical studies show that both CGP-based and sparse CGP-based transformers achieve better performance than state-of-the-art GP-based transformers on a variety of benchmark tasks.},
openreview = {xlIK0vu3MW},
section = {Papers},
}
@InProceedings{burroni24a,
title = {Sample Average Approximation for Black-Box Variational Inference},
author = {Burroni, Javier and Domke, Justin and Sheldon, Daniel},
pages = {471-498},
abstract = {Black-box variational inference (BBVI) is a general-purpose approximate inference approach that converts inference to a stochastic optimization problem.
However, the difficulty of solving the BBVI optimization problem reliably and robustly using stochastic gradient methods has limited its applicability.
We present a novel optimization approach for BBVI using the sample average approximation (SAA).
SAA converts stochastic problems to deterministic ones by optimizing over a fixed random sample, which enables optimization tools such as quasi-Newton methods and line search that bypass the difficulties faced by stochastic gradient methods.
We design an approach called "SAA for VI" that solves a sequence of SAA problems with increasing sample sizes to reliably and robustly solve BBVI problems without problem-specific tuning.
We focus on quasi-Newton methods, which are well suited to problems with up to hundreds of latent variables.
Our experiments show that SAA for VI simplifies the VI problem and achieves faster performance than existing methods.},
openreview = {1EV6XonWQx},
software = {https://github.com/jburroni/SAA-for-VI},
section = {Papers},
}
@InProceedings{cai24a,
title = {Privacy-Aware Randomized Quantization via Linear Programming},
author = {Cai, Zhongteng and Zhang, Xueru and Khalili, Mohammad Mahdi},
pages = {499-516},
abstract = {Differential privacy mechanisms such as the Gaussian or Laplace mechanism have been widely used in data analytics for preserving individual privacy. However, they are mostly designed for continuous outputs and are unsuitable for scenarios where discrete values are necessary. Although various quantization mechanisms were proposed recently to generate discrete outputs under differential privacy, the outcomes are either biased or have an inferior accuracy-privacy trade-off. In this paper, we propose a family of quantization mechanisms that is unbiased and differentially private. It has a high degree of freedom and we show that some existing mechanisms can be considered as special cases of ours. To find the optimal mechanism, we formulate a linear optimization that can be solved efficiently using linear programming tools. Experiments show that our proposed mechanism can attain a better privacy-accuracy trade-off compared to baselines.},
openreview = {vWsf4L7rHq},
section = {Papers},
}
@InProceedings{camilleri24a,
title = {Fair Active Learning in Low-Data Regimes},
author = {Camilleri, Romain and Wagenmaker, Andrew and Morgenstern, Jamie and Jain, Lalit and Jamieson, Kevin},
pages = {517-531},
abstract = {In critical machine learning applications, ensuring fairness is essential to avoid perpetuating social inequities. In this work, we address the challenges of reducing bias and improving accuracy in data-scarce environments, where the cost of collecting labeled data prohibits the use of large, labeled datasets. In such settings, active learning promises to maximize marginal accuracy gains of small amounts of labeled data. However, existing applications of active learning for fairness fail to deliver on this, typically requiring large labeled datasets, or failing to ensure the desired fairness tolerance is met on the population distribution.
To address such limitations, we introduce an innovative active learning framework that combines an exploration procedure inspired by posterior sampling with a fair classification subroutine. We demonstrate that this framework performs effectively in very data-scarce regimes, maximizing accuracy while satisfying fairness constraints with high probability. We evaluate our proposed approach using well-established real-world benchmark datasets and compare it against state-of-the-art methods, demonstrating its effectiveness in producing fair models, and improvement over existing methods.},
openreview = {goREZw6mdK},
section = {Papers},
}
@InProceedings{cao24a,
title = {Multi-Relational Structural Entropy},
author = {Cao, Yuwei and Peng, Hao and Li, Angsheng and You, Chenyu and Hao, Zhifeng and Yu, Philip S.},
pages = {532-546},
abstract = {Structural Entropy (SE) measures the structural information contained in a graph. Minimizing or maximizing SE helps to reveal or obscure the intrinsic structural patterns underlying graphs in an interpretable manner, finding applications in various tasks driven by networked data. However, SE ignores the heterogeneity inherent in the graph relations, which is ubiquitous in modern networks. In this work, we extend SE to consider heterogeneous relations and propose the first metric for multi-relational graph structural information, namely, Multi-relational Structural Entropy (MrSE). To this end, we first cast SE through the novel lens of the stationary distribution from random surfing, which readily extends to multi-relational networks by considering the choices of both nodes and relation types simultaneously at each step. The resulting MrSE is then optimized by a new greedy algorithm to reveal the essential structures within a multi-relational network. Experimental results highlight that the proposed MrSE offers a more insightful interpretation of the structure of multi-relational graphs compared to SE. Additionally, it enhances the performance of two tasks that involve real-world multi-relational graphs, including node clustering and social event detection.},
openreview = {9sAha7AoNA},
software = {https://github.com/YuweiCao-UIC/MrSE},
section = {Papers},
}
@InProceedings{cattelan24a,
title = {How to Fix a Broken Confidence Estimator: Evaluating Post-hoc Methods for Selective Classification with Deep Neural Networks},
author = {Cattelan, Lu{\'\i}s Felipe and Silva, Danilo},
pages = {547-584},
abstract = {This paper addresses the problem of selective classification for deep neural networks, where a model is allowed to abstain from low-confidence predictions to avoid potential errors. We focus on so-called post-hoc methods, which replace the confidence estimator of a given classifier without modifying or retraining it, thus being practically appealing. Considering neural networks with softmax outputs, our goal is to identify the best confidence estimator that can be computed directly from the unnormalized logits. This problem is motivated by the intriguing observation in recent work that many classifiers appear to have a ``broken'' confidence estimator, in the sense that their selective classification performance is much worse than what could be expected by their corresponding accuracies. We perform an extensive experimental study of many existing and proposed confidence estimators applied to 84 pretrained ImageNet classifiers available from popular repositories. Our results show that a simple $p$-norm normalization of the logits, followed by taking the maximum logit as the confidence estimator, can lead to considerable gains in selective classification performance, completely fixing the pathological behavior observed in many classifiers. As a consequence, the selective classification performance of any classifier becomes almost entirely determined by its corresponding accuracy. Moreover, these results are shown to be consistent under distribution shift.},
openreview = {IJBWLRCvYX},
software = {https://github.com/lfpc/FixSelectiveClassification},
section = {Papers},
}
@InProceedings{challa24a,
title = {QuantProb: Generalizing Probabilities along with Predictions for a Pre-trained Classifier},
author = {Challa, Aditya and Dhavala, Soma and Saha, Snehanshu},
pages = {585-602},
abstract = {Quantification of Uncertainty in predictions is a challenging problem. In the classification settings, although deep learning based models generalize well, class probabilities often lack reliability. Calibration errors are used to quantify uncertainty, and several methods exist to minimize calibration error. We argue that between the choice of having a minimum calibration error on original distribution which increases across distortions or having a (possibly slightly higher) calibration error which is constant across distortions, we prefer the latter
We hypothesize that the reason for unreliability of deep networks is - The way neural networks are currently trained, the probabilities do not generalize across small distortions. We observe that quantile based approaches can potentially solve this problem. We propose an innovative approach to decouple the construction of quantile representations from the loss function allowing us to compute quantile based probabilities without disturbing the original network. We achieve this by establishing a novel duality property between quantiles and probabilities, and an ability to obtain quantile probabilities from any pre-trained classifier.
While post-hoc calibration techniques successfully minimize calibration errors, they do not preserve robustness to distortions. We show that, Quantile probabilities (QuantProb), obtained from Quantile representations, preserve the calibration errors across distortions, since quantile probabilities generalize better than the naive Softmax probabilities.},
openreview = {FBQlYI4LVM},
section = {Papers},
}
@InProceedings{chauhan24a,
title = {Generalization and Learnability in Multiple Instance Regression},
author = {Chauhan, Kushal and Saket, Rishi and Applebaum, Lorne and Badanidiyuru, Ashwinkumar and Giri, Chandan and Raghuveer, Aravindan},
pages = {603-618},
abstract = {Multiple instance regression (MIR) was introduced by Ray and Page (2001) as an analogue of multiple instance learning (MIL) in which we are given bags of feature-vectors (instances) and for each bag there is a bag-label which matches the label of one (unknown) primary instance from that bag. The goal is to compute a hypothesis regressor consistent with the underlying instance-labels. A natural approach is to find the best primary instance assignment and regressor optimizing the mse loss on the bags though no formal generalization guarantees were known. Our work is the first to prove generalization error bounds for MIR when the bags are drawn i.i.d. at random. Essentially, with high probability any MIR regressor with low error on sampled bags also has low error on the underlying instance-label distribution.
We next study the complexity of linear regression on MIR bags, shown to be NP-hard in general by Ray and Page (2001), who however left open the possibility of arbitrarily good approximations. Significantly strengthening previous work, we prove a strong inapproximability bound: even if there exists zero bag-loss MIR linear regressor on a collection of $2$-sized bags with labels in $[-1,1]$, it is NP-hard to find an MIR linear regressor with bag-loss $< C$ for some absolute constant $C > 0$.
Our work also proposes a model training method for MIR based on a novel weighted assignment loss, geared towards handling overlapping bags which have not received much attention previously. We conduct empirical evaluations on synthetic and real-world datasets showing that our method outperforms the baseline MIR methods.},
openreview = {Pn6gcfciMy},
software = {https://github.com/google-research/google-research/tree/master/mir_uai24},
section = {Papers},
}
@InProceedings{chen24a,
title = {Gradient descent in matrix factorization: Understanding large initialization},
author = {Chen, Hengchao and Chen, Xin and Elmasri, Mohamad and Sun, Qiang},
pages = {619-647},
abstract = {Gradient Descent (GD) has been proven effective in solving various matrix factorization problems. However, its optimization behavior with large initial values remains less understood. To address this gap, this paper presents a novel theoretical framework for examining the convergence trajectory of GD with a large initialization. The framework is grounded in signal-to-noise ratio concepts and inductive arguments. The results uncover an implicit incremental learning phenomenon in GD and offer a deeper understanding of its performance in large initialization scenarios.},
openreview = {zHplexWZSr},
section = {Papers},
}
@InProceedings{chen24b,
title = {Conditional Bayesian Quadrature},
author = {Chen, Zonghao and Naslidnyk, Masha and Gretton, Arthur and Briol, Francois-Xavier},
pages = {648-684},
abstract = {We propose a novel approach for estimating conditional or parametric expectations in the setting where obtaining samples or evaluating integrands is costly. Through the framework of probabilistic numerical methods (such as Bayesian quadrature), our novel approach allows to incorporates prior information about the integrands especially the prior smoothness knowledge about the integrands and the conditional expectation. As a result, our approach provides a way of quantifying uncertainty and leads to a fast convergence rate, which is confirmed both theoretically and empirically on challenging tasks in Bayesian sensitivity analysis, computational finance and decision making under uncertainty.},
openreview = {x1YfB33Hj6},
software = {https://github.com/hudsonchen/CBQ},
section = {Papers},
}
@InProceedings{chen24c,
title = {Adaptive Time-Stepping Schedules for Diffusion Models},
author = {Chen, Yuzhu and He, Fengxiang and Fu, Shi and Tian, Xinmei and Tao, Dacheng},
pages = {685-697},
abstract = {This paper studies how to tune the stepping schedule in diffusion models, which is mostly fixed in current practice, lacking theoretical foundations and assurance of optimal performance at the chosen discretization points. In this paper, we advocate the use of adaptive time-stepping schedules and design two algorithms with an optimized sampling error bound $EB$: (1) for continuous diffusion, we treat $EB$ as the loss function to discretization points and run gradient descent to adjust them; and (2) for discrete diffusion, we propose a greedy algorithm that adjusts only one discretization point to its best position in each iteration. We conducted extensive experiments that show (1) improved generation ability in well-trained models, and (2) premature though usable generation ability in under-trained models. The code is submitted and will be released publicly.},
openreview = {lZzriJH2DC},
section = {Papers},
}
@InProceedings{cheng24a,
title = {SMuCo: Reinforcement Learning for Visual Control via Sequential Multi-view Total Correlation},
author = {Cheng, Tong and Dong, Hang and Wang, Lu and Qiao, Bo and Lin, Qingwei and Rajmohan, Saravan and Moscibroda, Thomas},
pages = {698-717},
abstract = {The advent of abundant image data has catalyzed the advancement of visual control in reinforcement learning (RL) systems, leveraging multiple view- points to capture the same physical states, which could enhance control performance theoretically. However, integrating multi-view data into representation learning remains challenging. In this paper, we introduce SMuCo, an innovative multi-view reinforcement learning algorithm that constructs robust latent representations by optimizing multi- view sequential total correlation. This technique effectively captures task-relevant information and temporal dynamics while filtering out irrelevant data. Our method supports an unlimited number of views and demonstrates superior performance over leading model-free and model-based RL algorithms. Empirical results from the DeepMind Control Suite and the Sapien Basic Manipulation Task confirm SMuCo's enhanced efficacy, significantly improving task performance across diverse scenarios and views.},
openreview = {Eg5AmHPQKe},
section = {Papers},
}
@InProceedings{cheng24b,
title = {Inference for Optimal Linear Treatment Regimes in Personalized Decision-making},
author = {Cheng, Yuwen and Yang, Shu},
pages = {718-735},
abstract = {Personalized decision-making, tailored to individual characteristics, is gaining significant attention. The optimal treatment regime aims to provide the best-expected outcome in the entire population, known as the value function. One approach to determine this optimal regime is by maximizing the Augmented Inverse Probability Weighting (AIPW) estimator of the value function. However, the derived treatment regime can be intricate and nonlinear, limiting their use. For clarity and interoperability, we emphasize linear regimes and determine the optimal linear regime by optimizing the AIPW estimator within set constraints.
While the AIPW estimator offers a viable path to estimating the optimal regime, current methodologies predominantly focus on its asymptotic distribution, leaving a gap in studying the linear regime itself. However, there are many benefits to understanding the regime, as pinpointing significant covariates can enhance treatment effects and provide future clinical guidance. In this paper, we explore the asymptotic distribution of the estimated linear regime. Our results show that the parameter associated with the linear regime follows a cube-root convergence to a non-normal limiting distribution characterized by the maximizer of a centered Gaussian process with a quadratic drift. When making inferences for the estimated linear regimes with cube-root convergence in practical scenarios, the standard nonparametric bootstrap is invalid. As a solution, we facilitate the Cattaneo et al. (2020) bootstrap technique to provide a consistent distributional approximation for the estimated linear regimes, validated further through simulations and real-world data applications from the eICU Collaborative Research Database.},
openreview = {7PRZ5O5Dt8},
section = {Papers},
}
@InProceedings{chenreddy24a,
title = {End-to-end Conditional Robust Optimization},
author = {Chenreddy, Abhilash Reddy and Delage, Erick},
pages = {736-748},
abstract = {The field of Contextual Optimization (CO) integrates machine learning and optimization to solve decision making problems under uncertainty. Recently, a risk sensitive variant of CO, known as Conditional Robust Optimization (CRO), combines uncertainty quantification with robust optimization in order to promote safety and reliability in high stake applications. Exploiting modern differentiable optimization methods, we propose a novel end-to-end approach to train a CRO model in a way that accounts for both the empirical risk of the prescribed decisions and the quality of conditional coverage of the contextual uncertainty set that supports them. While guarantees of success for the latter objective are impossible to obtain from the point of view of conformal prediction theory, high quality conditional coverage is achieved empirically by ingeniously employing a logistic regression differentiable layer within the calculation of coverage quality in our training loss.We show that the proposed training algorithms produce decisions that outperform the traditional estimate then optimize approaches.},
openreview = {Oe9ngGi8Gh},
section = {Papers},
}
@InProceedings{chikahara24a,
title = {Differentiable Pareto-Smoothed Weighting for High-Dimensional Heterogeneous Treatment Effect Estimation},
author = {Chikahara, Yoichi and Ushiyama, Kansei},
pages = {749-762},
abstract = {There is a growing interest in estimating heterogeneous treatment effects across individuals using their high-dimensional feature attributes. Achieving high performance in such high-dimensional heterogeneous treatment effect estimation is challenging because in this setup, it is usual that some features induce sample selection bias while others do not but are predictive of potential outcomes. To avoid losing such predictive feature information, existing methods learn separate feature representations using inverse probability weighting (IPW). However, due to their numerically unstable IPW weights, these methods suffer from estimation bias under a finite sample setup. To develop a numerically robust estimator by weighted representation learning, we propose a differentiable Pareto-smoothed weighting framework that replaces extreme weight values in an end-to-end fashion. Our experimental results show that by effectively correcting the weight values, our proposed method outperforms the existing ones, including traditional weighting schemes. Our code is available at [this https URL](https://github.com/ychika/DPSW).},
openreview = {o85WSGg0oB},
software = {https://github.com/ychika/DPSW},
section = {Papers},
}
@InProceedings{chumbalov24a,
title = {Fast Interactive Search under a Scale-Free Comparison Oracle},
author = {Chumbalov, Daniyar and Klein, Lars and Maystre, Lucas and Grossglauser, Matthias},
pages = {763-786},
abstract = {A comparison-based search algorithm lets a user find a target item $t$ in a database by answering queries of the form, ``Which of items $i$ and $j$ is closer to $t$?''
Instead of formulating an explicit query (such as one or several keywords), the user navigates towards the target via a sequence of such (typically noisy) queries.
We propose a scale-free probabilistic oracle model called $\gamma$-CKL for such similarity triplets $(i,j;t)$, which generalizes the CKL triplet model proposed in the literature.
The generalization affords independent control over the discriminating power of the oracle and the dimension of the feature space containing the items.
We develop a search algorithm with provably exponential rate of convergence under the $\gamma$-CKL oracle, thanks to a backtracking strategy that deals with the unavoidable errors in updating the belief region around the target.
We evaluate the performance of the algorithm both over the posited oracle and over several real-world triplet datasets.
We also report on a comprehensive user study, where human subjects navigate a database of face portraits.},
openreview = {dzqbQTm4bi},
section = {Papers},
}
@InProceedings{clarte24a,
title = {Analysis of Bootstrap and Subsampling in High-dimensional Regularized Regression},
author = {Clart\'e, Lucas and Vandenbroucque, Adrien and Dalle, Guillaume and Loureiro, Bruno and Krzakala, Florent and Zdeborov\'a, Lenka},
pages = {787-819},
abstract = {We investigate popular resampling methods for estimating the uncertainty of statistical models, such as subsampling, bootstrap and the jackknife, and their performance in high-dimensional supervised regression tasks. We provide a tight asymptotic description of the biases and variances estimated by these methods in the context of generalized linear models, such as ridge and logistic regression, taking the limit where the number of samples $n$ and dimension $d$ of the covariates grow at a comparable rate: $\alpha=n/d$ fixed. Our findings are three-fold: i) resampling methods are fraught with problems in high dimensions and exhibit the double-descent-like behavior typical of these situations; ii) only when $\alpha$ is large enough do they provide consistent and reliable error estimations (we give convergence rates); iii) in the over-parametrized regime $\alpha<1$ relevant to modern machine learning practice, their predictions are not consistent, even with optimal regularization.},
openreview = {yZaXk3OxVS},
software = {https://github.com/SPOC-group/BootstrapAsymptotics},
section = {Papers},
}
@InProceedings{clavier24a,
title = {Towards Minimax Optimality of Model-based Robust Reinforcement Learning},
author = {Clavier, Pierre and Le Pennec, Erwan and Geist, Matthieu},
pages = {820-855},
abstract = {We study the sample complexity of obtaining an $\epsilon$-optimal policy in <em>Robust</em> discounted Markov Decision Processes (RMDPs), given only access to a generative model of the nominal kernel. This problem is widely studied in the non-robust case, and it is known that any planning approach applied to an empirical MDP estimated with $\tilde{\mathcal{O}}(\frac{H^3 |S||A|}{\epsilon^2})$ samples provides an $\epsilon$-optimal policy, which is minimax optimal. Results in the robust case are much more scarce. For $sa$- (resp $s$-) rectangular uncertainty sets, until recently the best-known sample complexity was $\tilde{\mathcal{O}}(\frac{H^4 |S|^2|A|}{\epsilon^2})$ (resp. $\tilde{\mathcal{O}}(\frac{H^4 | S |^2| A |^2}{\epsilon^2})$), for specific algorithms and when the uncertainty set is based on the total variation (TV), the KL or the Chi-square divergences. In this paper, we consider uncertainty sets defined with an $L_p$-ball (recovering the TV case), and study the sample complexity of <em>any</em> planning algorithm (with high accuracy guarantee on the solution) applied to an empirical RMDP estimated using the generative model. In the general case, we prove a sample complexity of $\tilde{\mathcal{O}}(\frac{H^4 | S || A |}{\epsilon^2})$ for both the $sa$- and $s$-rectangular cases (improvements of $| S |$ and $| S || A |$ respectively). When the size of the uncertainty is small enough, we improve the sample complexity to $\tilde{\mathcal{O}}(\frac{H^3 | S || A | }{\epsilon^2})$, recovering the lower-bound for the non-robust case for the first time and a robust lower-bound. Finally, we also introduce simple and efficient algorithms for solving the studied $L_p$ robust MDPs.},
openreview = {mcmMbWLkfQ},
section = {Papers},
}
@InProceedings{clivio24a,
title = {Towards Representation Learning for Weighting Problems in Design-Based Causal Inference},
author = {Clivio, Oscar and Feller, Avi and Holmes, Chris},
pages = {856-880},
abstract = {Reweighting a distribution to minimize a distance to a target distribution is a powerful and flexible strategy for estimating a wide range of causal effects, but can be challenging in practice because optimal weights typically depend on knowledge of the underlying data generating process. In this paper, we focus on design-based weights, which do not incorporate outcome information; prominent examples include prospective cohort studies, survey weighting, and the weighting portion of augmented weighting estimators. In such applications, we explore the central role of representation learning in finding desirable weights in practice. Unlike the common approach of assuming a well-specified representation, we highlight the error due to the choice of a representation and outline a general framework for finding suitable representations that minimize this error. Building on recent work that combines balancing weights and neural networks, we propose an end-to-end estimation procedure that learns a flexible representation, while retaining promising theoretical properties. We show that this approach is competitive in a range of common causal inference tasks.},
openreview = {KNgBCZXJkY},
software = {https://github.com/oscarclivio/representations_weighting},
section = {Papers},
}
@InProceedings{colombo24a,
title = {Normalizing Flows for Conformal Regression},
author = {Colombo, Nicol\`o},
pages = {881-893},
abstract = {Conformal Prediction (CP) algorithms estimate the uncertainty of a prediction model by calibrating its outputs on labeled data. The same calibration scheme usually applies to any model and data without modifications. The obtained prediction intervals are valid by construction but could be inefficient, i.e. unnecessarily big, if the prediction errors are not uniformly distributed over the input space.
We present a general scheme to localize the intervals by training the calibration process. The standard prediction error is replaced by an optimized distance metric that depends explicitly on the object attributes. Learning the optimal metric is equivalent to training a Normalizing Flow that acts on the joint distribution of the errors and the inputs. Unlike the Error Re-weighting CP algorithm of Papadopoulos et al. (2008), the framework allows estimating the gap between nominal and empirical conditional validity. The approach is compatible with existing locally-adaptive CP strategies based on re-weighting the calibration samples and applies to any point-prediction model without retraining.},
openreview = {acgwLdoB3d},
section = {Papers},
}
@InProceedings{dam24a,
title = {Power Mean Estimation in Stochastic Monte-Carlo Tree Search},
author = {Dam, Tuan and Maillard, Odalric-Ambrym and Kaufmann, Emilie},
pages = {894-918},
abstract = {Monte-Carlo Tree Search (MCTS) is a widely-used online planning strategy that effectively combines Monte-Carlo sampling with forward tree search to make optimal decisions in various real-world scenarios. Its success hinges on the application of the Upper Confidence bound for Trees (UCT) algorithm, an extension of the Upper Confidence bound (UCB) method used in multi-arm bandits. However, theoretical investigations of UCT have been found incomplete due to an error in the estimated "logarithmic" bonus term used for action selection in the tree. This issue was addressed by introducing a "polynomial" exploration bonus, which effectively balances the exploration-exploitation trade-off. However, most theoretical studies have focused on deterministic Markov Decision Processes (MDPs), neglecting stochastic settings. In this paper, we introduce Stochastic-Power-UCT, an MCTS algorithm explicitly designed for stochastic MDPs using power mean as the value estimator. We conduct a comprehensive study of the polynomial convergence assurance for selecting the optimal action and estimating the value at the root node within our methodology. Our findings demonstrate that Stochastic-Power-UCT shares the same convergence rate as UCT, with UCT being a special case of Stochastic-Power-UCT. Furthermore, we validate our theoretical results through empirical assessments across diverse stochastic MDP environments, providing empirical evidence to support our method's theoretical claims},
openreview = {NE2S89Eb9e},
section = {Papers},
}
@InProceedings{damke24a,
title = {Linear Opinion Pooling for Uncertainty Quantification on Graphs},
author = {Damke, Clemens and H\"ullermeier, Eyke},
pages = {919-929},
abstract = {We address the problem of uncertainty quantification for graph-structured data, or, more specifically, the problem to quantify the predictive uncertainty in (semi-supervised) node classification. Key questions in this regard concern the distinction between two different types of uncertainty, aleatoric and epistemic, and how to support uncertainty quantification by leveraging the structural information provided by the graph topology. Challenging assumptions and postulates of state-of-the-art methods, we propose a novel approach that represents (epistemic) uncertainty in terms of mixtures of Dirichlet distributions and refers to the established principle of linear opinion pooling for propagating information between neighbored nodes in the graph. The effectiveness of this approach is demonstrated in a series of experiments on a variety of graph-structured datasets.},
openreview = {qLGkfpXTSn},
software = {https://github.com/Cortys/gpn-extensions},
section = {Papers},
}
@InProceedings{dang24a,
title = {Can we Defend Against the Unknown? An Empirical Study About Threshold Selection for Neural Network Monitoring},
author = {Dang, Khoi Tran and Delmas, Kevin and Guiochet, J\'er\'emie and Gu\'erin, Joris},
pages = {930-942},
abstract = {With the increasing use of neural networks in critical systems, runtime monitoring becomes essential to reject unsafe predictions during inference. Various techniques have emerged to establish rejection scores that maximize the separability between the distributions of safe and unsafe predictions. The efficacy of these approaches is mostly evaluated using threshold-agnostic metrics, such as the area under the receiver operating characteristic curve. However, in real-world applications, an effective monitor also requires identifying a good threshold to transform these scores into meaningful binary decisions. Despite the pivotal importance of threshold optimization, this problem has received little attention. A few studies touch upon this question, but they typically assume that the runtime data distribution mirrors the training distribution, which is a strong assumption as monitors are supposed to safeguard a system against potentially unforeseen threats. In this work, we present rigorous experiments on various image datasets to investigate: 1. The effectiveness of monitors in handling unforeseen threats, which are not available during threshold adjustments. 2. Whether integrating generic threats into the threshold optimization scheme can enhance the robustness of monitors.},
openreview = {nzxnE8t2Sz},
section = {Papers},
}
@InProceedings{de-bartolomeis24a,
title = {Detecting critical treatment effect bias in small subgroups},
author = {De Bartolomeis, Piersilvio and Abad, Javier and Donhauser, Konstantin and Yang, Fanny},
pages = {943-965},
abstract = {Randomized trials are considered the gold standard for making informed decisions in medicine. However, they are often not representative of the patient population in clinical practice. Observational studies, on the other hand, cover a broader patient population but are prone to various biases. Thus, before using observational data for any downstream task, it is crucial to benchmark its treatment effect estimates against a randomized trial.
We propose a novel strategy to benchmark observational studies on a subgroup level. First, we design a statistical test for the null hypothesis that the treatment effects -- conditioned on a subset of relevant features -- differ up to some tolerance value. Our test allows us to estimate an asymptotically valid lower bound on the maximum bias strength for any subgroup. We validate our lower bound in a real-world setting and show that it leads to conclusions that align with established medical knowledge.},
openreview = {A6K0r8bjNg},
software = {https://github.com/jaabmar/kernel-test-bias},
section = {Papers},
}
@InProceedings{decruyenaere24a,
title = {The Real Deal Behind the Artificial Appeal: Inferential Utility of Tabular Synthetic Data},
author = {Decruyenaere, Alexander and Dehaene, Heidelinde and Rabaey, Paloma and Polet, Christiaan and Decruyenaere, Johan and Vansteelandt, Stijn and Demeester, Thomas},
pages = {966-996},
abstract = {Recent advances in generative models facilitate the creation of synthetic data to be made available for research in privacy-sensitive contexts. However, the analysis of synthetic data raises a unique set of methodological challenges. In this work, we highlight the importance of inferential utility and provide empirical evidence against naive inference from synthetic data, whereby synthetic data are treated as if they were actually observed. Before publishing synthetic data, it is essential to develop statistical inference tools for such data. By means of a simulation study, we show that the rate of false-positive findings (type 1 error) will be unacceptably high, even when the estimates are unbiased. Despite the use of a previously proposed correction factor, this problem persists for deep generative models, in part due to slower convergence of estimators and resulting underestimation of the true standard error. We further demonstrate our findings through a case study.},
openreview = {OR9bNsVPWb},
software = {https://github.com/syndara-lab/inferential-utility},
section = {Papers},
}
@InProceedings{deleu24a,
title = {Discrete Probabilistic Inference as Control in Multi-path Environments},
author = {Deleu, Tristan and Nouri, Padideh and Malkin, Nikolay and Precup, Doina and Bengio, Yoshua},
pages = {997-1021},
abstract = {We consider the problem of sampling from a discrete and structured distribution as a sequential decision problem, where the objective is to find a stochastic policy such that objects are sampled at the end of this sequential process proportionally to some predefined reward. While we could use maximum entropy Reinforcement Learning (MaxEnt RL) to solve this problem for some distributions, it has been shown that in general, the distribution over states induced by the optimal policy may be biased in cases where there are multiple ways to generate the same object. To address this issue, Generative Flow Networks (GFlowNets) learn a stochastic policy that samples objects proportionally to their reward by approximately enforcing a conservation of flows across the whole Markov Decision Process (MDP). In this paper, we extend recent methods correcting the reward in order to guarantee that the marginal distribution induced by the optimal MaxEnt RL policy is proportional to the original reward, regardless of the structure of the underlying MDP. We also prove that some flow-matching objectives found in the GFlowNet literature are in fact equivalent to well-established MaxEnt RL algorithms with a corrected reward. Finally, we study empirically the performance of multiple MaxEnt RL and GFlowNet algorithms on multiple problems involving sampling from discrete distributions.},
openreview = {3C69sU1YkK},
software = {https://github.com/tristandeleu/gfn-maxent-rl},
section = {Papers},
}
@InProceedings{deng24a,
title = {On Convergence of Federated Averaging Langevin Dynamics},
author = {Deng, Wei and Zhang, Qian and Ma, Yian and Song, Zhao and Lin, Guang},
pages = {1022-1054},
abstract = {We propose a federated averaging Langevin algorithm (FA-LD) for uncertainty quantification and mean predictions with distributed clients. In particular, we generalize beyond normal posterior distributions and consider a general class of models. We develop theoretical guarantees for FA-LD for strongly log-concave distributions with non-i.i.d data and study how the injected noise and the stochastic-gradient noise, the heterogeneity of data, and the varying learning rates affect the convergence. Such an analysis sheds light on the optimal choice of local updates to minimize the communication cost. Important to our approach is that the communication efficiency does not deteriorate with the injected noise in the Langevin algorithms. In addition, we examine in our FA-LD algorithm both independent and correlated noise used over different clients. We observe that there is a trade-off between the pairs among communication, accuracy, and data privacy. As local devices may become inactive in federated networks, we also show convergence results based on different averaging schemes where only partial device updates are available. In such a case, we discover an additional bias that does not decay to zero.},
openreview = {EmQGdBsOPx},
section = {Papers},
}
@InProceedings{deng24b,
title = {Reflected Schr\"odinger Bridge for Constrained Generative Modeling},
author = {Deng, Wei and Chen, Yu and Yang, Nicole Tianjiao and Du, Hengrong and Feng, Qi and Chen, Ricky Tian Qi},
pages = {1055-1082},
abstract = {Diffusion models have become the go-to method for large-scale generative models in real-world applications. These applications often involve data distributions confined within bounded domains, typically requiring ad-hoc thresholding techniques for boundary enforcement. Reflected diffusion models aim to enhance generalizability by generating the data distribution through a backward process governed by reflected Brownian motion. However, reflected diffusion models may not easily adapt to diverse domains without the derivation of proper diffeomorphic mappings and do not guarantee optimal transport properties. To overcome these limitations, we introduce the Reflected Schr\"odinger Bridge algorithm{\textemdash}an entropy-regularized optimal transport approach tailored for generating data within diverse bounded domains. We derive elegant reflected forward-backward stochastic differential equations with Neumann and Robin boundary conditions, extend divergence-based likelihood training to bounded domains, and explore natural connections to entropic optimal transport for the study of approximate linear convergence{\textemdash}a valuable insight for practical training. Our algorithm yields robust generative modeling in diverse domains, and its scalability is demonstrated in real-world constrained generative modeling through standard image benchmarks.},
openreview = {8Wl7xRXUHK},
section = {Papers},
}
@InProceedings{deshpande24a,
title = {Calibrated and Conformal Propensity Scores for Causal Effect Estimation},
author = {Deshpande, Shachi and Kuleshov, Volodymyr},
pages = {1083-1111},
abstract = {Propensity scores are commonly used to balance observed covariates while estimating treatment effects. We argue that the probabilistic output of a learned propensity score model should be calibrated, i.e. a predictive treatment probability of 90% should correspond to 90% of individuals being assigned the treatment group. We propose simple recalibration techniques to ensure this property. We prove that calibration is a necessary condition for unbiased treatment effect estimation when using popular inverse propensity weighted and doubly robust estimators. We derive error bounds on causal effect estimates that directly relate to the quality of uncertainties provided by the probabilistic propensity score model and show that calibration strictly improves this error bound while also avoiding extreme propensity weights. We demonstrate improved causal effect estimation with calibrated propensity scores in several tasks including high-dimensional image covariates and genome-wide association studies (GWASs). Calibrated propensity scores improve the speed of GWAS analysis by more than two-fold by enabling the use of simpler models that are faster to train.},
openreview = {xgs9VpvVAC},
section = {Papers},
}
@InProceedings{ding24a,
title = {Learning to Rank for Active Learning via Multi-Task Bilevel Optimization},
author = {Ding, Zixin and Chen, Si and Jia, Ruoxi and Chen, Yuxin},
pages = {1112-1128},
abstract = {Active learning is a promising paradigm for reducing labeling costs by strategically requesting labels to improve model performance. However, existing active learning methods often rely on expensive acquisition functions, extensive model retraining, and multiple rounds of interaction with annotators. To address these limitations, we propose a novel approach for active learning, which aims to select batches of unlabeled instances through a learned surrogate model for data acquisition. A key challenge in this approach is to develop an acquisition function that generalizes well, as the history of data, which forms part of the utility function's input, grows over time. Our novel algorithmic contribution is a multi-task bilevel optimization framework that predicts the relative utility---measured by the validation accuracy---of different training sets, and ensures the learned acquisition function generalizes effectively. For cases where validation accuracy is expensive to evaluate, we introduce efficient interpolation-based surrogate models to estimate the utility function, reducing the evaluation cost. We demonstrate the performance of our approach through extensive experiments on standard active classification benchmarks.},
openreview = {o5Iw3kN9Eg},
section = {Papers},
}
@InProceedings{dinh24a,
title = {End-to-End Learning for Fair Multiobjective Optimization Under Uncertainty},
author = {Dinh, My H. and Kotary, James and Fioretto, Ferdinando},
pages = {1129-1145},
abstract = {Many decision processes in artificial intelligence and operations research are modeled by parametric optimization problems whose defining parameters are unknown and must be inferred from observable data. The Predict-Then-Optimize (PtO) paradigm in machine learning aims to maximize downstream decision quality by training the parametric inference model end-to-end with the subsequent constrained optimization. This requires backpropagation through the optimization problem using approximation techniques specific to the problem's form, especially for nondifferentiable linear and mixed-integer programs. This paper extends the PtO methodology to optimization problems with nondifferentiable Ordered Weighted Averaging (OWA) objectives, known for their ability to ensure properties of fairness and robustness in decision models. Through a collection of training techniques and proposed application settings, it shows how optimization of OWA functions can be effectively integrated with parametric prediction for fair and robust optimization under uncertainty.},
openreview = {ptbCobFKtX},
section = {Papers},
}
@InProceedings{dong24a,
title = {Online Policy Optimization for Robust Markov Decision Process},
author = {Dong, Jing and Li, Jingwei and Wang, Baoxiang and Zhang, Jingzhao},
pages = {1146-1175},
abstract = {Reinforcement learning (RL) has exceeded human performance in many synthetic settings such as video games and Go. However, real-world deployment of end-to-end RL models is less common, as RL models can be very sensitive to perturbations in the environment. The robust Markov decision process (MDP) framework---in which the transition probabilities belong to an uncertainty set around a nominal model---provides one way to develop robust models. While previous analysis for robust MDP shows RL algorithms are effective assuming access to a generative model, it remains unclear whether RL can be efficient under a more realistic online setting, which requires a careful balance between exploration and exploitation. In this work, we consider online robust MDP by interacting with an unknown nominal system. We propose a robust optimistic policy optimization algorithm that is provably efficient. To address the additional uncertainty caused by an adversarial environment, our model features a new optimistic update rule derived via Fenchel conjugates. Our analysis establishes the first regret bound for online robust MDPs.},
openreview = {VS8EPaCSY1},
section = {Papers},
}
@InProceedings{dong24b,
title = {Learning Distributionally Robust Tractable Probabilistic Models in Continuous Domains},
author = {Dong, Hailiang and Amato, James and Gogate, Vibhav and Ruozzi, Nicholas},
pages = {1176-1188},
abstract = {Tractable probabilistic models (TPMs) have attracted substantial research interest in recent years, particularly because of their ability to answer various reasoning queries in polynomial time. In this study, we focus on the distributionally robust learning of continuous TPMs and address the challenge of distribution shift at test time by tackling the adversarial risk minimization problem within the framework of distributionally robust learning. Specifically, we demonstrate that the adversarial risk minimization problem can be efficiently addressed when the model permits exact log-likelihood evaluation and efficient learning on weighted data. Our experimental results on several real-world datasets show that our approach achieves significantly higher log-likelihoods on adversarial test sets. Remarkably, we note that the model learned via distributionally robust learning can achieve higher average log-likelihood on the initial uncorrupted test set at times.},
openreview = {SlxO1NpLiE},
software = {https://github.com/LeonDong1993/UAI2024-RobustLearning},
section = {Papers},
}
@InProceedings{drago24a,
title = {Bandits with Knapsacks and Predictions},
author = {Drago, Davide and Celli, Andrea and Elias, Marek},
pages = {1189-1206},
abstract = {We study the Bandits with Knapsacks problem with the aim of designing a learning-augmented online learning algorithm upholding better regret guarantees than the state-of-the-art primal-dual algorithms with worst-case guarantees, under both stochastic and adversarial inputs. In the adversarial case, we obtain better competitive ratios when the input predictions are accurate, while also maintaining worst-case guarantees for imprecise predictions. We introduce two algorithms tailored for the full and bandit feedback settings, respectively. Both algorithms integrate a static prediction with a worst-case no-$\alpha$-regret algorithm. This yields an optimized competitive ratio of $(\pi + (1 -\pi)/\alpha)^{-1}$ in scenarios where the prediction is perfect, and a competitive ratio of $\alpha/(1 - \pi)$ in the case of highly imprecise predictions, where $\pi \in (0,1)$ is chosen by the learner and $\alpha$ is Slater's parameter. We complement this analysis by studying the stochastic setting under full feedback. We provide an algorithm which guarantees a pseudo-regret of $\widetilde{O}(\sqrt{T})$ with poor predictions, and 0 pseudo-regret with perfect predictions. We also characterize the smoothness of the algorithm.},
openreview = {pGnoioQ9z1},
software = {https://github.com/davidedrago0007/AdversarialBanditwithKnapsacksandPredictions},
section = {Papers},
}
@InProceedings{dyer24a,
title = {Approximate Bayesian Computation with Path Signatures},
author = {Dyer, Joel and Cannon, Patrick and Schmon, Sebastian M.},
pages = {1207-1231},
abstract = {Simulation models often lack tractable likelihood functions, making likelihood-free inference methods indispensable. Approximate Bayesian computation generates likelihood-free posterior samples by comparing simulated and observed data through some distance measure, but existing approaches are often poorly suited to time series simulators, for example due to an independent and identically distributed data assumption. In this paper, we propose to use path signatures in approximate Bayesian computation to handle the sequential nature of time series. We provide theoretical guarantees on the resultant posteriors and demonstrate competitive Bayesian parameter inference for simulators generating univariate, multivariate, and irregularly spaced sequences of non-iid data.},
openreview = {MsU6sKOcA4},
section = {Papers},
}
@InProceedings{elkady24a,
title = {Vertical Validation: Evaluating Implicit Generative Models for Graphs on Thin Support Regions},
author = {Elkady, Mai and Bui, Thu and Ribeiro, Bruno and Inouye, David},
pages = {1232-1256},
abstract = {There has been a growing excitement that implicit graph generative models could be used to design or discover new molecules for medicine or material design. Because these molecules have not been discovered, they naturally lie in unexplored or scarcely supported regions of the distribution of known molecules. However, prior evaluation methods for implicit graph generative models have focused on validating statistics computed from the thick support (e.g., mean and variance of a graph property). Therefore, there is a mismatch between the goal of generating novel graphs and the evaluation methods. To address this evaluation gap, we design a novel evaluation method called Vertical Validation (VV) that systematically creates thin support regions during the train-test splitting procedure and then reweights generated samples so that they can be compared to the held-out test data. This procedure can be seen as a generalization of the standard train-test procedure except that the splits are dependent on sample features. We demonstrate that our method can be used to perform model selection if performance on thin support regions is the desired goal. As a side benefit, we also show that our approach can better detect overfitting as exemplified by memorization.},
openreview = {u5zVjFaxSs},
section = {Papers},
}
@InProceedings{enomoto24a,
title = {EntProp: High Entropy Propagation for Improving Accuracy and Robustness},
author = {Enomoto, Shohei},
pages = {1257-1270},
abstract = {Deep neural networks (DNNs) struggle to generalize to out-of-distribution domains that are different from those in training despite their impressive performance.
In practical applications, it is important for DNNs to have both high standard accuracy and robustness against out-of-distribution domains.
One technique that achieves both of these improvements is disentangled learning with mixture distribution via auxiliary batch normalization layers (ABNs).
This technique treats clean and transformed samples as different domains, allowing a DNN to learn better features from mixed domains.
However, if we distinguish the domains of the samples based on entropy, we find that some transformed samples are drawn from the same domain as clean samples, and these samples are not completely different domains.
To generate samples drawn from a completely different domain than clean samples, we hypothesize that transforming clean high-entropy samples to further increase the entropy generates out-of-distribution samples that are much further away from the in-distribution domain.
On the basis of the hypothesis, we propose high entropy propagation~(EntProp), which feeds high-entropy samples to the network that uses ABNs.
We introduce two techniques, data augmentation and free adversarial training, that increase entropy and bring the sample further away from the in-distribution domain.
These techniques do not require additional training costs.
Our experimental results show that EntProp achieves higher standard accuracy and robustness with a lower training cost than the baseline methods.
In particular, EntProp is highly effective at training on small datasets.},
openreview = {kp27EuaR22},
section = {Papers},
}
@InProceedings{fan24a,
title = {Multi-fidelity Bayesian Optimization with Multiple Information Sources of Input-dependent Fidelity},
author = {Fan, Mingzhou and Yoon, Byung-Jun and Dougherty, Edward and Urban, Nathan and Alexander, Francis and Arr\'oyave, Raymundo and Qian, Xiaoning},
pages = {1271-1293},
abstract = {By querying approximate surrogate models of different fidelity as available information sources, Multi-Fidelity Bayesian Optimization (MFBO) aims at optimizing unknown functions that are costly if not infeasible to evaluate. Existing MFBO methods often assume that approximate surrogates have consistently high/low fidelity across the input domain. However, approximate evaluations from the same surrogate can have different fidelity at different input regions due to data availability and model constraints, especially when considering machine learning surrogates. In this work, we investigate MFBO when multi-fidelity approximations have input-dependent fidelity. By explicitly capturing input dependency for multi-fidelity queries in Gaussian Process (GP), our new input-dependent MFBO~(iMFBO) with learnable noise models better captures the fidelity of each information source in an intuitive way.
We further design a new acquisition function for iMFBO and prove that the queries selected by iMFBO have higher quality than those by naive MFBO methods, with the derived sub-linear regret bound. Experiments on both synthetic and real-world data demonstrate its superior empirical performance.},
openreview = {ZZBxbUMrP6},
section = {Papers},
}
@InProceedings{fang24a,
title = {Center-Based Relaxed Learning Against Membership Inference Attacks},
author = {Fang, Xingli and Kim, Jung-Eun},
pages = {1294-1306},
abstract = {Membership inference attacks (MIAs) are currently considered one of the main privacy attack strategies, and their defense mechanisms have also been extensively explored. However, there is still a gap between the existing defense approaches and ideal models in both performance and deployment costs. In particular, we observed that the privacy vulnerability of the model is closely correlated with the gap between the model's data-memorizing ability and generalization ability. To address it, we propose a new architecture-agnostic training paradigm called Center-based Relaxed Learning (CRL), which is adaptive to any classification model and provides privacy preservation by sacrificing a minimal or no loss of model generalizability. We emphasize that CRL can better maintain the model's consistency between member and non-member data. Through extensive experiments on common classification datasets, we empirically show that this approach exhibits comparable performance without requiring additional model capacity or data costs.},
openreview = {unlWrunFjg},
software = {https://github.com/JEKimLab/UAI24_CRL},
section = {Papers},
}
@InProceedings{fang24b,
title = {Enhancing Patient Recruitment Response in Clinical Trials: an Adaptive Learning Framework},
author = {Fang, Xinying and Zhou, Shouhao},
pages = {1307-1322},
abstract = {Patient recruitment remains a key challenge in contemporary clinical trials, often leading to trial failures due to insufficient recruitment rates. To address this issue, we introduce a novel adaptive learning framework that integrates machine learning methods to facilitate evidence-informed recruitment. Through dynamic testing, predictive learning, and adaptive pruning of recruitment plans, the proposed framework ensures superiority over the conventional random assignment approach. We discuss the practical considerations for implementing this framework and conduct a simulation study to assess the overall response rates and chances of improvement. The findings suggest that the proposed approach can substantially enhance patient recruitment efficiency. By systematically optimizing recruitment plan allocation, this adaptive learning framework shows promise in addressing recruitment challenges across broad clinical research settings, potentially transforming how patient recruitment is managed in clinical trials.},
openreview = {qjY8a8jRgy},
software = {https://github.com/vivid225/Integrating-Machine-Learning-Methods-in-Clinical-Trial-Recruitment},
section = {Papers},
}
@InProceedings{fargier24a,
title = {Generalized Expected Utility as a Universal Decision Rule -- A Step Forward},
author = {Fargier, H\'el\`ene and Pomeret-Coquot, Pierre},
pages = {1323-1338},
abstract = {In order to capture a larger range of decision rules, this paper extends the seminal work of [Friedman and Halpern, 1995, Chu and Halpern, 2003, 2004] about Generalized Expected Utility. We introduce the notion of algebraic mass function (and of algebraic M\"obius transform) and provide a new algebraic expression for expected utility based on such functions. This utility, that we call "XEU", generalizes Chu and Halpern's GEU to non-decomposable measures and allows for the representation of several rules that could not be captured up to this point, and noticeably, of the Choquet integral. A representation theorem is provided that shows that only a very weak condition is needed for a rule in order to be representable as a XEU.},
openreview = {OeOZO9rZj3},
section = {Papers},
}
@InProceedings{feng24a,
title = {Last-iterate Convergence Separation between Extra-gradient and Optimism in Constrained Periodic Games},
author = {Feng, Yi and Li, Ping and Panageas, Ioannis and Wang, Xiao},
pages = {1339-1370},
abstract = {Last-iterate behaviors of learning algorithms in repeated two-player zero-sum games have been extensively studied due to their wide applications in machine learning and related tasks. Typical algorithms that exhibit the last-iterate convergence property include optimistic and extra-gradient methods. However, most existing results establish these properties under the assumption that the game is time-independent. Recently, (Feng et al., 2023) studied the last-iterate behaviors of optimistic and extra-gradient methods in games with a time-varying payoff matrix, and proved that in an unconstrained periodic game, extra-gradient method converges to the equilibrium while optimistic method diverges. This finding challenges the conventional wisdom that these two methods are expected to behave similarly as they do in time-independent games. However, compared to unconstrained games, games with constrains are more common both in practical and theoretical studies. In this paper, we investigate the last-iterate behaviors of optimistic and extra-gradient methods in the constrained periodic games, demonstrating that similar separation results for last-iterate convergence also hold in this setting.},
openreview = {IaH9nprqiU},
section = {Papers},
}
@InProceedings{franco24a,
title = {Guaranteeing Robustness Against Real-World Perturbations In Time Series Classification Using Conformalized Randomized Smoothing},
author = {Franco, Nicola and Spiegelberg, Jakob and Lorenz, Jeanette Miriam and G\"unnemann, Stephan},
pages = {1371-1388},
abstract = {Certifying the robustness of machine learning models against domain shifts and input space perturbations is crucial for many applications, where high risk decisions are based on the model's predictions. Techniques such as randomized smoothing have partially addressed this issues with a focus on adversarial attacks in the past. In this paper, we generalize randomized smoothing to arbitrary transformations and extend it to conformal prediction. The proposed ansatz is demonstrated on a time series classifier connected to an automotive use case. We meticulously assess the robustness of smooth classifiers in environments subjected to various degrees and types of time series native perturbations and compare it against standard conformal predictors. The proposed method consistently offers superior resistance to perturbations, maintaining high classification accuracy and reliability. Additionally, we are able to bound the performance on new domains via calibrating generalization with configuration shifts in the training data. In combination, conformalized randomized smoothing may offer a model agnostic approach to construct robust classifiers tailored to perturbations in their respective applications - a crucial capability for AI assurance argumentation.},
openreview = {wu3JIjKmXQ},
section = {Papers},
}
@InProceedings{gao24a,
title = {Consistency Regularization for Domain Generalization with Logit Attribution Matching},
author = {Gao, Han and Li, Kaican and Xie, Weiyan and Lin, Zhi and Huang, Yongxiang and Wang, Luning and Cao, Caleb and Zhang, Nevin},
pages = {1389-1407},
abstract = {Domain generalization (DG) is about training models that generalize well under domain shift. Previous research on DG has been conducted mostly in single-source or multi-source settings. In this paper, we consider a third lesser-known setting where a training domain is endowed with a collection of pairs of examples that share the same semantic information. Such semantic sharing (SS) pairs can be created via data augmentation and then utilized for consistency regularization (CR). We present a theory showing CR is conducive to DG and propose a novel CR method called Logit Attribution Matching (LAM). We conduct experiments on five DG benchmarks and four pretrained models with SS pairs created by both generic and targeted data augmentation methods. LAM outperforms representative single/multi-source DG methods and various CR methods that leverage SS pairs. The code and data of this project are available at https://github.com/Gaohan123/LAM.},
openreview = {WNy1ooHYHx},
software = {https://github.com/Gaohan123/LAM},
section = {Papers},
}
@InProceedings{gedon24a,
title = {Uncertainty Estimation with Recursive Feature Machines},
author = {Gedon, Daniel and Abedsoltan, Amirhesam and Sch\"on, Thomas B. and Belkin, Mikhail},
pages = {1408-1437},
abstract = {In conventional regression analysis, predictions are typically represented as point estimates derived from covariates. The Gaussian Process (GP) offer a kernel-based framework that predicts and quantifies associated uncertainties. However, kernel-based methods often underperform ensemble-based decision tree approaches in regression tasks involving tabular and categorical data. Recently, Recursive Feature Machines (RFMs) were proposed as a novel feature-learning kernel which strengthens the capabilities of kernel machines. In this study, we harness the power of these RFMs in a probabilistic GP-based approach to enhance uncertainty estimation through feature extraction within kernel methods. We employ this learned kernel for in-depth uncertainty analysis. On tabular datasets, our RFM-based method surpasses other leading uncertainty estimation techniques, including NGBoost and CatBoost-ensemble. Additionally, when assessing out-of-distribution performance, we found that boosting-based methods are surpassed by our RFM-based approach.},
openreview = {TBKLXswKnO},
software = {https://github.com/dgedon/rfm_uncertainty},
section = {Papers},
}
@InProceedings{gigli24a,
title = {Bootstrap Your Conversions: Thompson Sampling for Partially Observable Delayed Rewards},
author = {Gigli, Marco and Stella, Fabio},
pages = {1438-1452},
abstract = {This paper presents a novel approach to address contextual bandit problems with partially observable, delayed feedback by introducing an approximate Thompson sampling technique. This is a common setting, with applications ranging from online marketing to vaccine trials. Leveraging Bootstrapped Thompson sampling (BTS), we obtain an approximate posterior distribution over delay distributions and conversion probabilities, thereby extending an Expectation-Maximisation (EM) model to the Bayesian domain. Unlike prior methodologies, our approach does not overlook uncertainty on delays. Within the EM framework, we employ the Kaplan-Meier estimator to place no restriction on delay distributions. Through extensive benchmarking against state-of-the-art techniques, our approach demonstrates superior performance across the majority of tested environments, with comparable performance in the remaining cases. Furthermore, our method offers practical implementation using off-the-shelf libraries, facilitating broader adoption. Our technique lays a foundation for extending to other bandit settings, such as non-contextual bandits or action-dependent delay distributions, promising wider applicability and versatility in real-world applications.},
openreview = {BhFp6cFwDq},
software = {https://github.com/MarcoGigli/bootstrap-conversions},
section = {Papers},
}
@InProceedings{gracyk24a,
title = {GeONet: a neural operator for learning the Wasserstein geodesic},
author = {Gracyk, Andrew and Chen, Xiaohui},
pages = {1453-1478},
abstract = {Optimal transport (OT) offers a versatile framework to compare complex data distributions in a geometrically meaningful way. Traditional methods for computing the Wasserstein distance and geodesic between probability measures require mesh-specific domain discretization and suffer from the curse-of-dimensionality. We present GeONet, a mesh-invariant deep neural operator network that learns the non-linear mapping from the input pair of initial and terminal distributions to the Wasserstein geodesic connecting the two endpoint distributions. In the offline training stage, GeONet learns the saddle point optimality conditions for the dynamic formulation of the OT problem in the primal and dual spaces that are characterized by a coupled PDE system. The subsequent inference stage is instantaneous and can be deployed for real-time predictions in the online learning setting. We demonstrate that GeONet achieves comparable testing accuracy to the standard OT solvers on simulation examples and the MNIST dataset with considerably reduced inference-stage computational cost by orders of magnitude.},
openreview = {huNOnN0UK5},
section = {Papers},
}
@InProceedings{gudovskiy24a,
title = {ContextFlow++: Generalist-Specialist Flow-based Generative Models with Mixed-variable Context Encoding},
author = {Gudovskiy, Denis and Okuno, Tomoyuki and Nakata, Yohei},
pages = {1479-1490},
abstract = {Normalizing flow-based generative models have been widely used in applications where the exact density estimation is of major importance. Recent research proposes numerous methods to improve their expressivity.
However, conditioning on a context is largely overlooked area in the bijective flow research. Conventional conditioning with the vector concatenation is limited to only a few flow types.
More importantly, this approach cannot support a practical setup where a set of context-conditioned (*specialist*) models are trained with the fixed pretrained general-knowledge (*generalist*) model. We propose ContextFlow++ approach to overcome these limitations using an additive conditioning with explicit generalist-specialist knowledge decoupling. Furthermore, we support discrete contexts by the proposed mixed-variable architecture with context encoders. Particularly, our context encoder for discrete variables is a surjective flow from which the context-conditioned continuous variables are sampled. Our experiments on rotated MNIST-R, corrupted CIFAR-10C, real-world ATM predictive maintenance and SMAP unsupervised anomaly detection benchmarks show that the proposed ContextFlow++ offers faster stable training and achieves higher performance metrics. Our code is publicly available at [github.com/gudovskiy/contextflow](https://github.com/gudovskiy/contextflow).},
openreview = {06nlLSkuuu},
software = {https://github.com/gudovskiy/contextflow},
section = {Papers},
}
@InProceedings{guha24a,
title = {One Shot Inverse Reinforcement Learning for Stochastic Linear Bandits},
author = {Guha, Etash and James, Jim and Acharya, Krishna and Muthukumar, Vidya and Pananjady, Ashwin},
pages = {1491-1512},
abstract = {The paradigm of inverse reinforcement learning (IRL) is used to specify the reward function of an agent purely from its actions and is critical for value alignment and AI safety. While IRL is successful in practice, theoretical guarantees remain nascent. Motivated by the need for IRL in large action spaces with limited data, we consider as a first step the problem of learning from a single sequence of actions (i.e., a demonstration) of a stochastic linear bandit algorithm. When the demonstrator employs the Phased Elimination algorithm, we develop a simple inverse learning procedure that estimates the linear reward function consistently in the time horizon with just a <em>single</em> demonstration. In particular, we show that our inverse learner approximates the true reward parameter within a error of $\mathcal{O}(T^{-\frac{\omega - 1}{2\omega }})$ (where $T$ is the length of the demonstrator's trajectory and $\omega$ is a constant that depends on the geometry of the action set). We complement this result with an information-theoretic lower bound for any inverse learning procedure. We corroborate our theoretical results with simulations on synthetic data and a demonstration constructed from the MovieLens dataset.},
openreview = {kQQ20pTbxI},
software = {https://github.com/jimtjames/InvPhasedElim},
section = {Papers},
}
@InProceedings{han24a,
title = {Characterizing Data Point Vulnerability as Average-Case Robustness},
author = {Han, Tessa and Srinivas, Suraj and Lakkaraju, Himabindu},
pages = {1513-1540},
abstract = {Studying the robustness of machine learning models is important to ensure consistent model behaviour across real-world settings. To this end, adversarial robustness is a standard framework, which views robustness of predictions through a binary lens: either a worst-case adversarial perturbation exists in the local region around an input, or it does not. However, this binary perspective does not account for the degrees of vulnerability, as data points with a larger number of misclassified examples in their neighborhoods are more vulnerable. In this work, we consider a complementary framework for robustness, called average-case robustness, which measures the fraction of points in a local region that provides consistent predictions. However, computing this quantity is hard, as standard Monte Carlo approaches are inefficient especially for high-dimensional inputs. In this work, we propose the first analytical estimators for average-case robustness for multi-class classifiers. We show empirically that our estimators are accurate and efficient for standard deep learning models and demonstrate their usefulness for identifying vulnerable data points, and well as quantifying robustness bias of models. Overall, our tools provide a complementary view to robustness, improving our ability to characterize model behaviour.},
openreview = {iLZUHWbDkq},
software = {https://github.com/AI4LIFE-GROUP/average-case-robustness},
section = {Papers},
}
@InProceedings{han24b,
title = {No-Regret Learning of Nash Equilibrium for Black-Box Games via Gaussian Processes},
author = {Han, Minbiao and Zhang, Fengxue and Chen, Yuxin},
pages = {1541-1557},
abstract = {This paper investigates the challenge of learning in black-box games, where the underlying utility function is unknown to any of the agents. While there is an extensive body of literature on the theoretical analysis of algorithms for computing the Nash equilibrium with *complete information* about the game, studies on Nash equilibrium in *black-box* games are less common. In this paper, we focus on learning the Nash equilibrium when the only available information about an agent's payoff comes in the form of empirical queries. We provide a no-regret learning algorithm that utilizes Gaussian processes to identify equilibria in such games. Our approach not only ensures a theoretical convergence rate but also demonstrates effectiveness across a variety collection of games through experimental validation.},
openreview = {LMcHRkpSKZ},
software = {https://github.com/SchroDeCat/UAI24-ARISE},
section = {Papers},
}
@InProceedings{harviainen24a,
title = {Faster Perfect Sampling of Bayesian Network Structures},
author = {Harviainen, Juha and Koivisto, Mikko},
pages = {1558-1568},
abstract = {Bayesian inference of a Bayesian network structure amounts to averaging over directed acyclic graphs (DAGs) on a given set of $n$ variables, each DAG weighted by its posterior probability. In practice, save some special inference tasks, one averages over a sample of DAGs generated perfectly or approximately from the posterior. For the hard problem of perfect sampling, we give an algorithm that runs in $O(2.829^n)$ expected time, getting below $O(3^n)$ for the first time. Our algorithm reduces the problem into two smaller sampling problems whose outputs are combined; followed by a simple rejection step, perfect samples are obtained. Subsequent samples can be generated considerably faster. Empirically, we observe speedups of several orders of magnitude over the state of the art.},
openreview = {eq2rjvKbaG},
section = {Papers},
}
@InProceedings{henckel24a,
title = {Adjustment Identification Distance: A gadjid for Causal Structure Learning},
author = {Henckel, Leonard and W\"urtzen, Theo and Weichwald, Sebastian},
pages = {1569-1598},
abstract = {Evaluating graphs learned by causal discovery algorithms is difficult: The number of edges that differ between two graphs does not reflect how the graphs differ with respect to the identifying formulas they suggest for causal effects. We introduce a framework for developing causal distances between graphs which includes the structural intervention distance for directed acyclic graphs as a special case. We use this framework to develop improved adjustment-based distances as well as extensions to completed partially directed acyclic graphs and causal orders. We develop new reachability algorithms to compute the distances efficiently and to prove their low polynomial time complexity. In our package gadjid (open source at https://github.com/CausalDisco/gadjid), we provide implementations of our distances; they are orders of magnitude faster with proven lower time complexity than the structural intervention distance and thereby provide a success metric for causal discovery that scales to graph sizes that were previously prohibitive.},
openreview = {jO5UNNrjJr},
software = {https://github.com/CausalDisco/gadjid},
section = {Papers},
}
@InProceedings{herlihy24a,
title = {On Overcoming Miscalibrated Conversational Priors in LLM-based ChatBots},
author = {Herlihy, Christine and Neville, Jennifer and Schnabel, Tobias and Swaminathan, Adith},
pages = {1599-1620},
abstract = {We explore the use of Large Language Model (LLM-based) chatbots to power recommender systems. We observe that the chatbots respond poorly when they encounter under-specified requests (e.g., they make incorrect assumptions, hedge with a long response, or refuse to answer). We conjecture that such miscalibrated response tendencies (i.e., conversational priors) can be attributed to LLM fine-tuning by annotators --- single-turn annotations may not capture multi-turn conversation utility, and the annotators' preferences may not even be representative of users interacting with a recommender system.
We first analyze public LLM chat logs to conclude that query under-specification is common. Next, we study synthetic recommendation problems with known but latent item utilities, and frame them as Partially Observed Decision Processes (PODP). We find that pre-trained LLMs can be sub-optimal for PODPs and derive better policies that clarify under-specified queries when appropriate. Then, we re-calibrate LLMs by prompting them with learned control messages to approximate the improved policy. Finally, we show empirically that our lightweight learning approach effectively uses logged conversation data to re-calibrate the response strategies of LLM-based chatbots for recommendation tasks.},
openreview = {iXQIgIsmIr},
section = {Papers},
}
@InProceedings{heuillet24a,
title = {Neural Active Learning Meets the Partial Monitoring Framework},
author = {Heuillet, Maxime and Ahmad, Ola and Durand, Audrey},
pages = {1621-1639},
abstract = {We focus on the online-based active learning (OAL) setting where an agent operates over a stream of observations and trades-off between the costly acquisition of information (labelled observations) and the cost of prediction errors.
We propose a novel foundation for OAL tasks based on partial monitoring, a theoretical framework specialized in online learning from partially informative actions.
We show that previously studied binary and multi-class OAL tasks are instances of partial monitoring.
We expand the real-world potential of OAL by introducing a new class of cost-sensitive OAL tasks.
We propose NeuralCBP, the first PM strategy that accounts for predictive uncertainty with deep neural networks.
Our extensive empirical evaluation on open source datasets shows that NeuralCBP has competitive performance against state-of-the-art baselines on multiple binary, multi-class and cost-sensitive OAL tasks.},
openreview = {K99ZSDvGiD},
software = {https://github.com/MaxHeuillet/neuralCBPside},
section = {Papers},
}
@InProceedings{hikima24a,
title = {Quantum Kernelized Bandits},
author = {Hikima, Yasunari and Murao, Kazunori and Takemori, Sho and Umeda, Yuhei},
pages = {1640-1657},
abstract = {We consider the quantum kernelized bandit problem, where the player observes information of rewards through quantum circuits termed the quantum reward oracle, and the mean reward function belongs to a reproducing kernel Hilbert space (RKHS). We propose a UCB-type algorithm that utilizes the quantum Monte Carlo (QMC) method and provide regret bounds in terms of the decay rate of eigenvalues of the Mercer operator of the kernel. Our algorithm achieves $\widetilde{O}\left( T^{\frac{3}{1 + \beta_p}} \log\left(\frac{1}{\delta} \right)\right)$ and $\widetilde{O} \left( \log^{3(1 + \beta_e^{-1})/2} (T) \log\left(\frac{1 }{\delta} \right) \right)$ cumulative regret bounds with probability at least $1-\delta$ if the kernel has a $\beta_p$-polynomial eigendecay and $\beta_e$-exponential eigendecay, respectively. In particular, in the case of the exponential eigendecay, our regret bounds exponentially improve that of classical algorithms. Moreover, our results indicate that our regret bound is better than the lower bound in the classical kernelized bandit problem if the rate of decay is sufficiently fast.},
openreview = {3GtCwa9nky},
section = {Papers},
}
@InProceedings{ho24a,
title = {Recursively-Constrained Partially Observable Markov Decision Processes},
author = {Ho, Qi Heng and Becker, Tyler and Kraske, Benjamin and Laouar, Zakariya and Feather, Martin and Rossi, Federico and Lahijanian, Morteza and Sunberg, Zachary},
pages = {1658-1680},
abstract = {Many sequential decision problems involve optimizing one objective function while imposing constraints on other objectives. Constrained Partially Observable Markov Decision Processes (C-POMDP) model this case with transition uncertainty and partial observability. In this work, we first show that C-POMDPs violate the optimal substructure property over successive decision steps and thus may exhibit behaviors that are undesirable for some (e.g., safety critical) applications. Additionally, online re-planning in C-POMDPs is often ineffective due to the inconsistency resulting from this violation. To address these drawbacks, we introduce the Recursively-Constrained POMDP (RC-POMDP), which imposes additional history-dependent cost constraints on the C-POMDP. We show that, unlike C-POMDPs, RC-POMDPs always have deterministic optimal policies and that optimal policies obey Bellman's principle of optimality. We also present a point-based dynamic programming algorithm for RC-POMDPs. Evaluations on benchmark problems demonstrate the efficacy of our algorithm and show that policies for RC-POMDPs produce more desirable behaviors than policies for C-POMDPs.},
openreview = {cC2c4KhHni},
software = {https://github.com/CU-ADCL/RC-PBVI.jl},
section = {Papers},
}
@InProceedings{ho24b,
title = {Sound Heuristic Search Value Iteration for Undiscounted POMDPs with Reachability Objectives},
author = {Ho, Qi Heng and Feather, Martin and Rossi, Federico and Sunberg, Zachary and Lahijanian, Morteza},
pages = {1681-1697},
abstract = {Partially Observable Markov Decision Processes (POMDPs) are powerful models for sequential decision making under transition and observation uncertainties. This paper studies the challenging yet important problem in POMDPs known as the (indefinite-horizon) Maximal Reachability Probability Problem (MRPP), where the goal is to maximize the probability of reaching some target states. This is also a core problem in model checking with logical specifications and is naturally undiscounted (discount factor is one). Inspired by the success of point-based methods developed for discounted problems, we study their extensions to MRPP. Specifically, we focus on trial-based heuristic search value iteration techniques and present a novel algorithm that leverages the strengths of these techniques for efficient exploration of the belief space (informed search via value bounds) while addressing their drawbacks in handling loops for indefinite-horizon problems. The algorithm produces policies with two-sided bounds on optimal reachability probabilities. We prove convergence to an optimal policy from below under certain conditions. Experimental evaluations on a suite of benchmarks show that our algorithm outperforms existing methods in almost all cases in both probability guarantees and computation time.},
openreview = {3zSiuXYtqf},
software = {https://github.com/aria-systems-group/HSVI-RP},
section = {Papers},
}
@InProceedings{hochsprung24a,
title = {A Global Markov Property for Solutions of Stochastic Difference Equations and the corresponding Full Time Graphs},
author = {Hochsprung, Tom and Runge, Jakob and Gerhardus, Andreas},
pages = {1698-1726},
abstract = {Structural Causal Models (SCMs) are an important tool in causal inference. They induce a graph and if the graph is acyclic, a unique observational distribution. A standard result states that in this acyclic case, the induced observational distribution satisfies a d-separation global Markov property relative to the induced graph.
Time series can also be modelled like SCMs: One just interprets the stochastic difference equations that a time series solves as structural equations. However, technical problems arise when time series "start" at minus infinity. In particular, a d-separation global Markov property for time series and the corresponding infinite graphs, the so-called full time graphs, has thus far only been shown for stable vector autoregressive processes with independent finite-second-moment noise.
In this paper, we prove a much more general version of this Markov property. We discuss our assumptions and study violations of them.
Doing so hints at several pitfalls at the intersection of time series analysis and causal inference.
Moreover, we introduce a new projection procedure for these infinite graphs which might be of independent interest.},
openreview = {o5pOj2IDYB},
section = {Papers},
}
@InProceedings{hong24a,
title = {Revisiting Convergence of AdaGrad with Relaxed Assumptions},
author = {Hong, Yusu and Lin, Junhong},
pages = {1727-1750},
abstract = {In this study, we revisit the convergence of AdaGrad with momentum (covering AdaGrad as a special case) on non-convex smooth optimization problems. We consider a general noise model where the noise magnitude is controlled by the function value gap together with the gradient magnitude. This model encompasses a broad range of noises including bounded noise, sub-Gaussian noise, affine variance noise and the expected smoothness, and it has been shown to be more realistic in many practical applications. Our analysis yields a probabilistic convergence rate which, under the general noise, could reach at $\tilde{\mathcal{O}}(1/\sqrt{T})$. This rate does not rely on prior knowledge of problem-parameters and could accelerate to $\tilde{\mathcal{O}}(1/T)$ where $T$ denotes the total number iterations, when the noise parameters related to the function value gap and noise level are sufficiently small. The convergence rate thus matches the lower rate for stochastic first-order methods over non-convex smooth landscape up to logarithm terms [Arjevani et al., 2023]. We further derive a convergence bound for AdaGrad with momentum, considering the generalized smoothness where the local smoothness is controlled by a first-order function of the gradient norm.},
openreview = {AlYO5cq1fG},
section = {Papers},
}
@InProceedings{irfan24a,
title = {Equilibrium Computation in Multidimensional Congestion Games: CSP and Learning Dynamics Approaches},
author = {Irfan, Mohammad T. and Chan, Hau and Soundy, Jared},
pages = {1751-1779},
abstract = {We present algorithms of two flavors{\textemdash}one rooted in constraint satisfaction problems (CSPs) and the other in learning dynamics{\textemdash}to compute pure-strategy Nash equilibrium (PSNE) in k-dimensional congestion games (k-DCGs) and their variants. The two algorithmic approaches are driven by whether or not a PSNE is guaranteed to exist. We first show that deciding the existence of a PSNE in a k-DCG is NP-complete even when players have binary and unit demand vectors. For general cost functions (potentially non-monotonic), we devise a new CSP-inspired algorithmic framework for PSNE computation, leading to algorithms that run in polynomial time under certain assumptions while offering exponential savings over standard CSP algorithms. We further refine these algorithms for variants of k-DCGs. Our experiments demonstrate the effectiveness of this new CSP framework for hard, non-monotonic k-DCGs. We then provide learning dynamics-based PSNE computation algorithms for linear and exponential cost functions. These algorithms run in polynomial time under certain assumptions. For general cost, we give a learning dynamics algorithm for an (\ensuremath{(\alpha, \beta)})-approximate PSNE (for certain \ensuremath{\alpha} and \ensuremath{\beta}). Lastly, we also devise polynomial-time algorithms for structured demands and cost functions.},
openreview = {9Sw5gGPzQ1},
section = {Papers},
}
@InProceedings{jazbec24a,
title = {Early-Exit Neural Networks with Nested Prediction Sets},
author = {Jazbec, Metod and Forr\'e, Patrick and Mandt, Stephan and Zhang, Dan and Nalisnick, Eric},
pages = {1780-1796},
abstract = {Early-exit neural networks (EENNs) facilitate adaptive inference by producing predictions at multiple stages of the forward pass. In safety-critical applications, these predictions are only meaningful when complemented with reliable uncertainty estimates. Yet, due to their sequential structure, an EENN's uncertainty estimates should also be *consistent*: labels that are deemed improbable at one exit should not reappear within the confidence interval / set of later exits. We show that standard uncertainty quantification techniques, like Bayesian methods or conformal prediction, can lead to inconsistency across exits. We address this problem by applying anytime-valid confidence sequences (AVCSs) to the exits of EENNs. By design, AVCSs maintain consistency across exits. We examine the theoretical and practical challenges of applying AVCSs to EENNs and empirically validate our approach on both regression and classification tasks.},
openreview = {mAjIKFHa2P},
software = {https://github.com/metodj/EENN-AVCS},
section = {Papers},
}
@InProceedings{jiang24a,
title = {On the Convergence of Hierarchical Federated Learning with Partial Worker Participation},
author = {Jiang, Xiaohan and Zhu, Hongbin},
pages = {1797-1824},
abstract = {Hierarchical federated learning (HFL) has emerged as the architecture of choice for multi-level communication networks, mainly because of its data privacy protection and low communication cost.
However, existing studies on the convergence analysis for HFL are limited to the assumptions of full worker participation and/or i.i.d. datasets across workers, both of which rarely hold in practice.
Motivated by this, we in this work propose a unified convergence analysis framework for HFL covering both full and partial worker participation with non-i.i.d. data, non-convex objective function and stochastic gradient.
We correspondingly develop a three-sided learning rates algorithm to mitigate data divergences issue, thereby realizing better convergence performance.
Our theoretical results provide key insights of why partial participation of HFL is beneficial in significantly reducing the data divergences compared to standard FL.
Besides, the convergence analysis allows certain individualization for each cluster in HFL indicating that adjusting the worker sampling ratio and round period can improve the convergence behavior.},
openreview = {fbOg2MB3Uy},
software = {https://github.com/cardistryj/HFL},
section = {Papers},
}
@InProceedings{kairgeldin24a,
title = {Adaptive Softmax Trees for Many-Class Classification},
author = {Kairgeldin, Rasul and Gabidolla, Magzhan and Carreira-Perpi\~n\'an, Miguel},
pages = {1825-1841},
abstract = {NLP tasks such as language models or document classification involve classification problems with thousands of classes. In these situations, it is difficult to get high predictive accuracy and the resulting model can be huge in number of parameters and inference time. A recent, successful approach is the softmax tree (ST): a decision tree having sparse hyperplane splits at the decision nodes (which make hard, not soft, decisions) and small softmax classifiers at the leaves. Inference here is very fast because only a small subset of class probabilities need to be computed, yet the model is quite accurate. However, a significant drawback is that it assumes a complete tree, whose size grows exponentially with depth. We propose a new algorithm to train a ST of arbitrary structure. The tree structure itself is learned optimally by interleaving steps that grow the structure with steps that optimize the parameters of the current structure. This makes it possible to learn STs that can grow much deeper but in an irregular way, adapting to the data distribution. The resulting STs improve considerably the predictive accuracy while reducing the model size and inference time even further, as demonstrated in datasets with thousands of classes. In addition, they are interpretable to some extent.},
openreview = {WK7rXij5VC},
section = {Papers},
}
@InProceedings{kampen24a,
title = {Towards Scalable Bayesian Transformers: Investigating stochastic subset selection for NLP},
author = {Kampen, Peter Johannes Tejlgaard and Als, Gustav Ragnar Stoettrup and Andersen, Michael Riis},
pages = {1842-1862},
abstract = {Bayesian deep learning provides a framework for quantifying uncertainty. However, the scale of modern neural networks applied in Natural Language Processing (NLP) limits the usability of Bayesian methods. Subnetwork inference aims to approximate the posterior by selecting a stochastic parameter subset for inference, thereby allowing scalable posterior approximations. Determining the optimal parameter space for subnetwork inference is far from trivial. In this paper, we study partially stochastic Bayesian neural networks in the context of transformer models for NLP tasks for the Laplace approximation (LA) and Stochastic weight averaging - Gaussian (SWAG). We propose heuristics for selecting which layers to include in the stochastic subset. We show that norm-based selection is promising for small subsets, and random selection is superior for larger subsets. Moreover, we propose Sparse-KFAC (S-KFAC), an extension of KFAC LA, which selects dense stochastic substructures of linear layers based on parameter magnitudes. S-KFAC retains performance while requiring substantially fewer stochastic parameters and, therefore, drastically limits memory footprint.},
openreview = {ba3McobvmG},
software = {https://github.com/GustavAls/PartialNLP},
section = {Papers},
}
@InProceedings{kang24a,
title = {Low-rank Matrix Bandits with Heavy-tailed Rewards},
author = {Kang, Yue and Hsieh, Cho-Jui and Lee, Thomas Chun Man},
pages = {1863-1889},
abstract = {In stochastic low-rank matrix bandit, the expected reward of an arm is equal to the inner product between its feature matrix and some unknown $d_1$ by $d_2$ low-rank parameter matrix $\Theta^*$ with rank $r \ll d_1\wedge d_2$. While all prior studies assume the payoffs are mixed with sub-Gaussian noises, in this work we loosen this strict assumption and consider the new problem of low-rank matrix bandit with heavy-tailed rewards (LowHTR), where the rewards only have finite $(1+\delta)$ moment for some $\delta \in (0,1]$. By utilizing the truncation on observed payoffs and the dynamic exploration, we propose a novel algorithm called LOTUS attaining the regret bound of order $\tilde O(d^\frac{3}{2}r^\frac{1}{2}T^\frac{1}{1+\delta}/\tilde{D}_{rr})$ without knowing $T$, which matches the state-of-the-art regret bound under sub-Gaussian noises~\citep{lu2021low,kang2022efficient} with $\delta = 1$. Moreover, we establish a lower bound of the order $\Omega(d^\frac{\delta}{1+\delta} r^\frac{\delta}{1+\delta} T^\frac{1}{1+\delta}) = \Omega(T^\frac{1}{1+\delta})$ for LowHTR, which indicates our LOTUS is nearly optimal in the order of $T$. In addition, we improve LOTUS so that it does not require knowledge of the rank $r$ with $\tilde O(dr^\frac{3}{2}T^\frac{1+\delta}{1+2\delta})$ regret bound, and it is efficient under the high-dimensional scenario. We also conduct simulations to demonstrate the practical superiority of our algorithm.},
openreview = {TG4fKjfvxC},
section = {Papers},
}
@InProceedings{kassem-sbeyti24a,
title = {Cost-Sensitive Uncertainty-Based Failure Recognition for Object Detection},
author = {Kassem-Sbeyti, Moussa and Karg, Michelle and Wirth, Christian and Klein, Nadja and Albayrak, Sahin},
pages = {1890-1900},
abstract = {Object detectors in real-world applications often fail to detect objects due to varying factors such as weather conditions and noisy input. Therefore, a process that mitigates false detections is crucial for both safety and accuracy. While uncertainty-based thresholding shows promise, previous works demonstrate an imperfect correlation between uncertainty and detection errors. This hinders ideal thresholding, prompting us to further investigate the correlation and associated cost with different types of uncertainty. We therefore propose a cost-sensitive framework for object detection tailored to user-defined budgets on the two types of errors, missing and false detections. We derive minimum thresholding requirements to prevent performance degradation and define metrics to assess the applicability of uncertainty for failure recognition. Furthermore, we automate and optimize the thresholding process to maximize the failure recognition rate w.r.t. the specified budget. Evaluation on three autonomous driving datasets demonstrates that our approach significantly enhances safety, particularly in challenging scenarios. Leveraging localization aleatoric uncertainty and softmax-based entropy only, our method boosts the failure recognition rate by 36-60\% compared to conventional approaches. Code is available at https://mos-ks.github.io/publications.},
openreview = {HuibNFkaoi},
software = {https://mos-ks.github.io/publications},
section = {Papers},
}
@InProceedings{kawakami24a,
title = {Probabilities of Causation for Continuous and Vector Variables},
author = {Kawakami, Yuta and Kuroki, Manabu and Tian, Jin},
pages = {1901-1921},
abstract = {*Probabilities of causation* (PoC) are valuable concepts for explainable artificial intelligence and practical decision-making. PoC are originally defined for scalar binary variables. In this paper, we extend the concept of PoC to continuous treatment and outcome variables, and further generalize PoC to capture causal effects between multiple treatments and multiple outcomes. In addition, we consider PoC for a sub-population and PoC with multi-hypothetical terms to capture more sophisticated counterfactual information useful for decision-making. We provide a nonparametric identification theorem for each type of PoC we introduce. Finally, we illustrate the application of our results on a real-world dataset about education.},
openreview = {em3fzm95So},
section = {Papers},
}
@InProceedings{kawakami24b,
title = {Identification and Estimation of Conditional Average Partial Causal Effects via Instrumental Variable},
author = {Kawakami, Yuta and Kuroki, Manabu and Tian, Jin},
pages = {1922-1952},
abstract = {There has been considerable recent interest in estimating heterogeneous causal effects. In this paper, we study conditional average partial causal effects (CAPCE) to reveal the heterogeneity of causal effects with continuous treatment. We provide conditions for identifying CAPCE in an instrumental variable setting. Notably, CAPCE is identifiable under a weaker assumption than required by a commonly used measure for estimating heterogeneous causal effects of continuous treatment. We develop three families of CAPCE estimators: sieve, parametric, and reproducing kernel Hilbert space (RKHS)-based, and analyze their statistical properties. We illustrate the proposed CAPCE estimators on synthetic and real-world data.},
openreview = {StHUKqZGNs},
section = {Papers},
}
@InProceedings{kekic24a,
title = {Targeted Reduction of Causal Models},
author = {Keki\'c, Armin and Sch\"olkopf, Bernhard and Besserve, Michel},
pages = {1953-1980},
abstract = {Why does a phenomenon occur? Addressing this question is central to most scientific inquiries and often relies on simulations of scientific models. As models become more intricate, deciphering the causes behind phenomena in high-dimensional spaces of interconnected variables becomes increasingly challenging. Causal Representation Learning (CRL) offers a promising avenue to uncover interpretable causal patterns within these simulations through an interventional lens. However, developing general CRL frameworks suitable for practical applications remains an open challenge. We introduce _Targeted Causal Reduction_ (TCR), a method for condensing complex intervenable models into a concise set of causal factors that explain a specific target phenomenon. We propose an information theoretic objective to learn TCR from interventional data of simulations, establish identifiability for continuous variables under shift interventions and present a practical algorithm for learning TCRs. Its ability to generate interpretable high-level explanations from complex models is demonstrated on toy and mechanical systems, illustrating its potential to assist scientists in the study of complex phenomena in a broad range of disciplines.},
openreview = {CFHpI53xmb},
software = {https://github.com/akekic/targeted-causal-reduction},
section = {Papers},
}
@InProceedings{khong24a,
title = {Active Learning Framework for Incomplete Networks},
author = {Khong, Tung and Tran, Cong and Pham, Cuong},
pages = {1981-1998},
abstract = {Significant progression has been made in active learning algorithms for graph networks in various tasks. However real-world applications frequently involve incomplete graphs with missing links, which pose the challenge that existing approaches might not adequately address. This paper presents an active learning approach tailored specifically for handling incomplete graphs, termed ALIN. Our algorithm employs graph neural networks (GNN) to generate node embeddings and calculates losses for both node classification and link prediction tasks. The losses are combined with appropriate weights and iteratively updating the GNN, ALIN efficiently queries nodes in batches, thereby achieving a balance between training feedbacks and resource utilization. Our empirical experiments have shown ALIN can surpass state-of-the-art baselines on Cora, Citeseer, Pubmed, and Coauthor-CS datasets.},
openreview = {n6wh6WiWvV},
software = {https://github.com/manhtung001/ALIN},
video = {https://www.youtube.com/watch?v=-UHUPH_DPZ4&t=153s},
section = {Papers},
}
@InProceedings{kim24a,
title = {Causal Discovery with Deductive Reasoning: One Less Problem},
author = {Kim, Jonghwan and Hwang, Inwoo and Lee, Sanghack},
pages = {1999-2017},
abstract = {Constraint-based causal discovery algorithms aim to extract causal relationships between variables of interest by using conditional independence tests (CITs). However, CITs with large conditioning sets often lead to unreliable results due to their low statistical power, propagating errors throughout the course of causal discovery. As the reliability of CITs is crucial for their practical applicability, recent approaches rely on either tricky heuristics or complicated routines with high computational costs to tackle inconsistent test results. Against this background, we propose a principled, simple, yet effective method, coined \textsc{deduce-dep}, which corrects unreliable conditional independence statements by replacing them with deductively reasoned results from lower-order CITs. An appealing property of \textsc{deduce-dep} is that it can be seamlessly plugged into existing constraint-based methods and serves as a modular subroutine. In particular, we showcase the integration of \textsc{deduce-dep} into representative algorithms such as HITON-PC and PC, illustrating its practicality. Empirical evaluation demonstrates that our method properly corrects unreliable CITs, leading to improved performance in causal structure learning.},
openreview = {HmhAFOD1Bz},
software = {https://github.com/snu-causality-lab/deduce-dep},
section = {Papers},
}
@InProceedings{kong24a,
title = {ILP-FORMER: Solving Integer Linear Programming with Sequence to Multi-Label Learning},
author = {Kong, Shufeng and Liu, Caihua and Gomes, Carla},
pages = {2018-2028},
abstract = {Integer Linear Programming (ILP) is an essential class of combinatorial optimization problems (COPs). Its inherent NP-hardness has fostered considerable efforts towards the development of heuristic strategies. An emerging approach involves leveraging data-driven methods to automatically learn these heuristics. For example, using deep (reinforcement) learning to recurrently reoptimize an initial solution with Large Neighborhood Search (LNS) has demonstrated exceptional performance across numerous applications. A pivotal challenge within LNS lies in identifying an optimal subset of variables for reoptimization at each stage. Existing methods typically learn a policy to select a subset, either by maintaining a fixed cardinality or by decomposing the subset into independent binary decisions for each variable. However, such strategies overlook the modeling of LNS's sequential processes and fail to explore the correlations inherent in variable selection. To overcome these shortcomings, we introduce ILP-FORMER, an innovative model that reimagines policy learning as a sequence-to-multi-label classification (MLC) problem. Our approach uniquely integrates a causal transformer encoder to capture the sequential nature of LNS. Additionally, we employ an MLC decoder with contrastive learning to exploit the correlations in variable selection. Our extensive experiments confirm that ILP-FORMER delivers state-of-the-art anytime performance on several ILP benchmarks. Furthermore, ILP-FORMER exhibits impressive generalization capabilities when dealing with larger problem instances.},
openreview = {3H90d2FnY0},
section = {Papers},
}
@InProceedings{kook24a,
title = {How Inverse Conditional Flows Can Serve as a Substitute for Distributional Regression},
author = {Kook, Lucas and Kolb, Chris and Schiele, Philipp and Dold, Daniel and Arpogaus, Marcel and Fritz, Cornelius and Baumann, Philipp and Kopper, Philipp and Pielok, Tobias and Dorigatti, Emilio and R\"ugamer, David},
pages = {2029-2046},
abstract = {Neural network representations of simple models, such as linear regression, are being studied increasingly to better understand the underlying principles of deep learning algorithms. However, neural representations of distributional regression models, such as the Cox model, have received little attention so far. We close this gap by proposing a framework for distributional regression using inverse flow transformations (DRIFT), which includes neural representations of the aforementioned models. We empirically demonstrate that the neural representations of models in DRIFT can serve as a substitute for their classical statistical counterparts in several applications involving continuous, ordered, time-series, and survival outcomes. We confirm that models in DRIFT empirically match the performance of several statistical methods in terms of estimation of partial effects, prediction, and aleatoric uncertainty quantification. DRIFT covers both interpretable statistical models and flexible neural networks opening up new avenues in both statistical modeling and deep learning.},
openreview = {jd5DhbTsde},
section = {Papers},
}
@InProceedings{kuiper24a,
title = {Distributionally Robust Optimization as a Scalable Framework to Characterize Extreme Value Distributions},
author = {Kuiper, Patrick and Hasan, Ali and Yang, Wenhao and Ng, Yuting and Bidkhori, Hoda and Blanchet, Jose and Tarokh, Vahid},
pages = {2047-2063},