% AUTHORS: Proceedings Chairs, SACS, Program Chairs, General Chairs
@Proceedings{chil2024,
booktitle = {Proceedings of the fifth Conference on Health, Inference, and Learning},
name = {Conference on Health, Inference, and Learning},
shortname = {CHIL},
year = {2024},
editor = {Pollard, Tom and Choi, Edward and Singhal, Pankhuri and Hughes, Michael and Sizikova, Elena and Mortazavi, Bobak and Chen, Irene and Wang, Fei and Sarker, Tasmie and McDermott, Matthew and Ghassemi, Marzyeh},
volume = {248},
start = {2024-06-27},
end = {2024-06-28},
publisher = {PMLR},
published = {2024-07-24},
series = {Proceedings of Machine Learning Research},
address = {Cornell Tech, 2 West Loop Rd, New York, NY 10044.},
conference_url = {https://www.chilconference.org/},
conference_number = {5}
}
% p0
@InProceedings{pollard24,
title = {Conference on Health, Inference, and Learning (CHIL) 2024},
author = {Pollard, Tom and Choi, Edward and Singhal, Pankhuri and Hughes, Michael and Sizikova, Elena and Mortazavi, Bobak and Chen, Irene and Wang, Fei and Sarker, Tasmie and McDermott, Matthew and Ghassemi, Marzyeh},
pages = {1-6},
abstract = {The Conference on Health, Inference, and Learning (CHIL) brings together a cross-disciplinary group of clinicians and researchers, from industry and academia, with the goal of advancing machine learning for health. CHIL has been an official conference of the Association for Health Learning and Inference (AHLI) since 2022. This volume contains proceedings of the fifth annual CHIL conference, held at Cornell Tech in New York City, US.},
year = {2024},
volume = {248},
publisher = {PMLR},
series = {Proceedings of Machine Learning Research},
software = {https://github.com/ahli-org/chil-proceedings-2024}
}
% p5
@InProceedings{han24,
title = {Interpretation of Intracardiac Electrograms Through Textual Representations},
author = {Han, William and Guadalupe Gomez, Diana and Alok, Avi and Duan, Chaojing and Rosenberg, Michael A and Weber, Douglas J and Liu, Emerson and Zhao, Ding},
pages = {7-23},
abstract = {Understanding the irregular electrical activity of atrial fibrillation (AFib) has been a key challenge in electrocardiography. For serious cases of AFib, catheter ablations are performed to collect intracardiac electrograms (EGMs). EGMs offer intricately detailed and localized electrical activity of the heart and are an ideal modality for interpretable cardiac studies. Recent advancements in artificial intelligence (AI) have allowed some works to utilize deep learning frameworks to interpret EGMs during AFib. Additionally, language models (LMs) have shown exceptional performance in being able to generalize to unseen domains, especially in healthcare. In this study, we are the first to leverage pretrained LMs for finetuning of EGM interpolation and AFib classification via masked language modeling. We formulate the EGM as a textual sequence and present competitive performances on AFib classification compared against other representations. Lastly, we provide a comprehensive interpretability study to provide a multi-perspective intuition of the model's behavior, which could greatly benefit clinical use.},
year = {2024},
volume = {248},
publisher = {PMLR},
series = {Proceedings of Machine Learning Research},
software = {https://github.com/willxxy/Text-EGM}
}
% p7
@InProceedings{schwarz24,
title = {DDoS: A Graph Neural Network Based Drug Synergy Prediction Algorithm},
author = {Schwarz, Kyriakos and Pliego Mendieta, Alicia and Mollaysa, Amina and Planas-Paz, Lara and Pauli, Chantal and Allam, Ahmed and Krauthammer, Michael},
pages = {24-38},
abstract = {Drug synergy arises when the combined impact of two drugs exceeds the sum of their individual effects. While single-drug effects on cell lines are well-documented, the scarcity of data on drug synergy, considering the vast array of potential drug combinations, prompts a growing interest in computational approaches for predicting synergies in untested drug pairs. We introduce a Graph Neural Network (\textit{GNN}) based model for drug synergy prediction, which utilizes drug chemical structures and cell line gene expression data. We extract data from the largest available drug combination database (DrugComb) and generate multiple synergy scores (commonly used in the literature) to create seven datasets that serve as a reliable benchmark with high confidence. In contrast to conventional models relying on pre-computed chemical features, our GNN-based approach learns task-specific drug representations directly from the graph structure of the drugs, providing superior performance in predicting drug synergies. Our work suggests that learning task-specific drug representations and leveraging a diverse dataset is a promising approach to advancing our understanding of drug-drug interaction and synergy.},
year = {2024},
volume = {248},
publisher = {PMLR},
series = {Proceedings of Machine Learning Research},
software = {https://github.com/uzh-dqbm-cmi/graphnn}
}
% p23
@InProceedings{zhang24,
title = {Daily Physical Activity Monitoring: Adaptive Learning from Multi-source Motion Sensor Data},
author = {Zhang, Haoting and Zhan, Donglin and Lin, Yunduan and He, Jinghai and Zhu, Qing and Shen, Zuo-Jun and Zheng, Zeyu},
pages = {39-54},
abstract = {In healthcare applications, there is a growing need to develop machine learning models that use data from a single source, such as that from a wrist wearable device, to monitor physical activities, assess health risks, and provide immediate health recommendations or interventions. However, the limitation of using single-source data often compromises the model's accuracy, as it fails to capture the full scope of human activities. While a more comprehensive dataset can be gathered in a lab setting using multiple sensors attached to various body parts, this approach is not practical for everyday use due to the impracticality of wearing multiple sensors. To address this challenge, we introduce a transfer learning framework that optimizes machine learning models for everyday applications by leveraging multi-source data collected in a laboratory setting. We introduce a novel metric to leverage the inherent relationship between these multiple data sources, as they are all paired to capture aspects of the same physical activity. Through numerical experiments, our framework outperforms existing methods in classification accuracy and robustness to noise, offering a promising avenue for the enhancement of daily activity monitoring.},
year = {2024},
volume = {248},
publisher = {PMLR},
series = {Proceedings of Machine Learning Research},
software = {https://github.com/Oceanjinghai/HealthTimeSerial}
}
% p29
@InProceedings{sakai24,
title = {Enhancing Collaborative Medical Outcomes through Private Synthetic Hypercube Augmentation: PriSHA},
author = {Nakamura Sakai, Shinpei and Shung, Dennis and Sekhon, Jasjeet S},
pages = {55-71},
abstract = {Data sharing across multiple health systems has the significant challenge of maintaining data privacy. Access to detailed, high-quality data is important for machine learning models trained to predict clinically relevant outcomes to generalize across different patient populations. However, health systems often are limited to patient data within their networks, which may not adequately represent the breadth of patient populations. This limitation is especially pronounced in the case of patients with rare or unique characteristics, resulting in decreased accuracy for this minority group. To address these challenges, our work introduces a framework designed to enhance existing clinical models, Private Synthetic Hypercube Augmentation (PriSHA). We use generative models to produce synthetic data as a means to augment these models while adhering to strict privacy standards. This approach has the potential to improve model performance without compromising patient confidentiality. To our knowledge, our framework is the first synthetic data augmentation framework that merges privacy-preserving tabular data and real data from multiple sources.},
year = {2024},
volume = {248},
publisher = {PMLR},
series = {Proceedings of Machine Learning Research}
}
% p34
@InProceedings{kim24a,
title = {Integrating ChatGPT into Secure Hospital Networks: A Case Study on Improving Radiology Report Analysis},
author = {Kim, Kyungsu and Park, Junhyun and Langarica, Saul and Mahmoud Alkhadrawi, Adham and Do, Synho},
pages = {72-87},
abstract = {This study demonstrates the first in-hospital adaptation of a cloud-based AI, similar to ChatGPT, into a secure model for analyzing radiology reports, prioritizing patient data privacy. By employing a unique sentence-level knowledge distillation method through contrastive learning, we achieve over 95\% accuracy in detecting anomalies. The model also accurately flags uncertainties in its predictions, enhancing its reliability and interpretability for physicians with certainty indicators. Despite limitations in data privacy during the training phase, such as requiring de-identification or IRB permission, our study is significant in addressing this issue in the inference phase (once the local model is trained), without the need for human annotation throughout the entire process. These advancements represent a new direction for developing secure and efficient AI tools for healthcare with minimal supervision, paving the way for a promising future of in-hospital AI applications.},
year = {2024},
volume = {248},
publisher = {PMLR},
series = {Proceedings of Machine Learning Research},
software = {https://github.com/MGH-LMIC/LLM_normal}
}
% p35
@InProceedings{krishnamoorthy24,
title = {Multiple Instance Learning with Absolute Position Information},
author = {Krishnamoorthy, Meera and Wiens, Jenna},
pages = {88-104},
abstract = {Most past work in multiple instance learning (MIL), which maps a group or bag of instances to a classification label, has focused on settings in which the order of instances does not contain information. In this paper, we define MIL with \textit{absolute} position information: tasks in which instances of importance remain in similar positions across bags. Such problems arise, for example, in MIL with medical images in which there exists a common global alignment across images (e.g., in chest x-rays the heart is in a similar location). We also evaluate the performance of existing MIL methods on a set of new benchmark tasks and two real data tasks with varying amounts of absolute position information. We find that, despite being less computationally efficient than other approaches, transformer-based MIL methods are more accurate at classifying tasks with absolute position information. Thus, we investigate the ability of positional encodings, a mechanism typically only used in transformers, to improve the accuracy of other MIL approaches. Applied to the task of identifying pathological findings in chest x-rays, when augmented with positional encodings, standard MIL approaches perform significantly better than without (AUROC of 0.799, 95\% CI: [0.791, 0.806] vs. 0.782, 95\% CI: [0.774, 0.789]) and on-par with transformer-based methods (AUROC of 0.797, 95\% CI: [0.790, 0.804]) while being 10 times faster. Our results suggest that one can efficiently and accurately classify MIL data with absolute position information using standard approaches by simply including positional encodings.},
year = {2024},
volume = {248},
publisher = {PMLR},
series = {Proceedings of Machine Learning Research},
software = {https://github.com/MLD3/MILwAPI}
}
% p41
@InProceedings{yan24,
title = {SQUWA: Signal Quality Aware DNN Architecture for Enhanced Accuracy in Atrial Fibrillation Detection from Noisy PPG Signals},
author = {Yan, Runze and Ding, Cheng and Xiao, Ran and Fedorov, Alex and Lee, Randall J and Nahab, Fadi and Hu, Xiao},
pages = {105-119},
abstract = {Atrial fibrillation (AF), a common cardiac arrhythmia, significantly increases the risk of stroke, heart disease, and mortality. Photoplethysmography (PPG) offers a promising solution for continuous AF monitoring, due to its cost efficiency and integration into wearable devices. Nonetheless, PPG signals are susceptible to corruption from motion artifacts and other factors often encountered in ambulatory settings. Conventional approaches typically discard corrupted segments or attempt to reconstruct original signals, allowing for the use of standard machine learning techniques. However, this reduces dataset size and introduces biases, compromising prediction accuracy and the effectiveness of continuous monitoring. We propose a novel deep learning model, \textbf{\underline{S}}ignal \textbf{\underline{Qu}}ality \textbf{\underline{W}}eighted Fusion of \textbf{\underline{A}}ttentional Convolution and Recurrent Neural Network (SQUWA), designed to learn how to retain accurate predictions from partially corrupted PPG. Specifically, SQUWA innovatively integrates an attention mechanism that directly considers signal quality during the learning process, dynamically adjusting the weights of time series segments based on their quality. This approach enhances the influence of higher-quality segments while reducing that of lower-quality ones, effectively utilizing partially corrupted segments. This approach represents a departure from the conventional methods that exclude such segments, enabling the utilization of a broader range of data, which has great implications for less disruption when monitoring AF risks and more accurate estimation of AF burdens. Moreover, SQUWA utilizes variable-sized convolutional kernels to capture complex PPG signal patterns across different resolutions for enhanced learning. Our extensive experiments show that SQUWA outperforms existing PPG-based models, achieving the highest AUCPR of 0.89 with label noise mitigation. This also exceeds the 0.86 AUCPR of models trained using both electrocardiogram (ECG) and PPG data.},
year = {2024},
volume = {248},
publisher = {PMLR},
series = {Proceedings of Machine Learning Research},
software = {https://github.com/Runz96/SQUWA}
}
% p52
@InProceedings{blanks24,
title = {An Improved Bayesian Permutation Entropy Estimator with Wasserstein-Optimized Hierarchical Priors},
author = {Blanks, Zachary and Brown, Donald E and Adams, Marc A and Angadi, Siddhartha S},
pages = {120-136},
abstract = {We introduce a novel hierarchical Bayesian permutation entropy (PermEn) estimator designed to improve biomedical time series entropy assessments, especially for short signals. Unlike existing methods requiring a substantial number of observations or which impose restrictive priors, our non-centered, Wasserstein optimized hierarchical approach enables efficient MCMC inference and a broader range of PermEn priors. Evaluations on synthetic and secondary benchmark data demonstrate superior performance over the current state-of-the-art, including 13.33-63.67\% lower estimation error, 8.16-47.77\% lower posterior variance, and 47-60.83\% lower prior construction error ($p \leq 2.42 \times 10^{-10}$). Applied to cardiopulmonary exercise test oxygen uptake signals, we reveal a previously unreported 1.55\% (95\% credible interval: [0.62\%, 2.52\%]) entropy difference between obese and lean subjects that diminishes as exercise capacity increases. For individuals capable of completing at least 7.5 minutes of testing, the 95\% credible interval contained zero, suggesting potential insights into physiological complexity, exercise tolerance, and obesity. Our estimator refines biomedical signal PermEn estimation and underscores entropy's potential value as a health biomarker, opening avenues for further physiological and biomedical exploration.},
year = {2024},
volume = {248},
publisher = {PMLR},
series = {Proceedings of Machine Learning Research},
software = {}
}
% p61
@InProceedings{wei24,
title = {Temporally Multi-Scale Sparse Self-Attention for Physical Activity Data Imputation},
author = {Wei, Hui and Xu, Maxwell A and Samplawski, Colin and Rehg, James Matthew and Kumar, Santosh and Marlin, Benjamin},
pages = {137-154},
abstract = {Wearable sensors enable health researchers to continuously collect data pertaining to the physiological state of individuals in real-world settings. However, such data can be subject to extensive missingness due to a complex combination of factors. In this work, we study the problem of imputation of missing step count data, one of the most ubiquitous forms of wearable sensor data. We construct a novel and large scale data set consisting of a training set with over 3 million hourly step count observations and a test set with over 2.5 million hourly step count observations. We propose a domain knowledge-informed sparse self-attention model for this task that captures the temporal multi-scale nature of step-count data. We assess the performance of the model relative to baselines and conduct ablation studies to verify our specific model designs.},
year = {2024},
volume = {248},
publisher = {PMLR},
series = {Proceedings of Machine Learning Research},
software = {https://github.com/reml-lab/allofus-imputation}
}
% p95
@InProceedings{nilsson24,
title = {Regularizing and Interpreting Vision Transformer by Patch Selection on Echocardiography Data},
author = {Nilsson, Alfred and Azizpour, Hossein},
pages = {155-168},
abstract = {This work introduces a novel approach to model regularization and explanation in \Glspl{vit}, particularly beneficial for small-scale but high-dimensional data regimes, such as in healthcare. We introduce stochastic embedded feature selection in the context of echocardiography video analysis, specifically focusing on the EchoNet-Dynamic dataset for the prediction of \gls{lvef}. Our proposed method, termed \Glspl{gvit}, augments \Glspl{vvit}, a performant transformer architecture for videos with \Glspl{cae}, a common dataset-level feature selection technique, to enhance \gls{vvit}'s generalization and interpretability. The key contribution lies in the incorporation of stochastic token selection individually for each video frame during training. Such token selection regularizes the training of \gls{vvit}, improves its interpretability, and is achieved by differentiable sampling of categoricals using the Gumbel-Softmax distribution. Our experiments on EchoNet-Dynamic demonstrate a consistent and notable regularization effect. The \gls{gvit} model outperforms both a random selection baseline and standard \gls{vvit} using multiple evaluation metrics. The \gls{gvit} is also compared against recent works on EchoNet-Dynamic where it exhibits state-of-the-art performance among end-to-end learned methods. Finally, we explore model explainability by visualizing selected patches, providing insights into how the \gls{gvit} utilizes regions known to be crucial for \gls{lvef} prediction for humans. This proposed approach, therefore, extends beyond regularization, offering enhanced interpretability for \gls{vit}s.},
year = {2024},
volume = {248},
publisher = {PMLR},
series = {Proceedings of Machine Learning Research},
software = {}
}
% p96
@InProceedings{ravva24,
title = {A Machine Learning Approach for Predicting Upper Limb Motion Intentions with Multimodal Data},
author = {Ravva, Pavan Uttej and Kullu, Pinar and Abrar, Mohammad Fahim and Barmaki, Roghayeh Leila},
pages = {169-181},
abstract = {Over the last decade, there has been significant progress in the field of interactive virtual rehabilitation. Physical therapy (PT) stands as a highly effective approach for enhancing physical impairments. However, patient motivation and progress tracking in rehabilitation outcomes remain a challenge. This work addresses the gap through a machine learning-based approach to objectively measure outcomes of the upper limb virtual therapy system in a user study with non-clinical participants. In this study, we use virtual reality to perform several tracing tasks while collecting motion and movement data using a KinArm robot and a custom-made wearable sleeve sensor. We introduce a two-step machine learning architecture to predict the motion intention of participants. The first step predicts \textbf{reaching task segments} to which the participant-marked points belonged using gaze, while the second step employs a Long Short-Term Memory (LSTM) model to predict \textbf{directional movements} based on resistance change values from the wearable sensor and the KinArm. We specifically propose to transpose our raw resistance data to the time-domain which significantly improves the accuracy of the models by 34.6\%.
To evaluate the effectiveness of our model, we compared different classification techniques with various data configurations. The results show that our proposed computational method is exceptional at predicting participants' actions with accuracy values of 96.72\% for the diamond reaching task and 97.44\% for the circle reaching task, which demonstrates the great promise of using multimodal data, including eye-tracking and resistance change, to objectively measure the performance and intention in virtual rehabilitation settings.},
year = {2024},
volume = {248},
publisher = {PMLR},
series = {Proceedings of Machine Learning Research},
software = {https://github.com/pavanravva/A-Machine-Learning-Approach-for-Predicting-Upper-Limb-Motion-Intentions-with-Multimodal-Data}
}
% p100
@InProceedings{xu24,
title = {From Basic to Extra Features: Hypergraph Transformer Pretrain-then-Finetuning for Balanced Clinical Predictions on EHR},
author = {Xu, Ran and Lu, Yiwen and Liu, Chang and Chen, Yong and Sun, Yan and Hu, Xiao and Ho, Joyce C and Yang, Carl},
pages = {182-197},
abstract = {Electronic Health Records (EHRs) contain rich patient information and are crucial for clinical research and practice.
In recent years, deep learning models have been applied to EHRs, but they often rely on massive features, which may not be readily available for all patients.
We propose HTP-Star (short for \textbf{H}ypergraph \textbf{T}ransformer \textbf{P}retrain-then-Finetuning with \textbf{S}moo\textbf{t}hness-induced regularization \textbf{a}nd \textbf{R}eweighting), which leverages hypergraph structures with a pretrain-then-finetune framework for modeling EHR data, enabling seamless integration of additional features. Additionally, we design two techniques, namely (1) \emph{Smoothness-inducing Regularization} and (2) \emph{Group-balanced Reweighting},
to enhance the model's robustness during finetuning. Through experiments conducted on two real EHR datasets, we demonstrate that HTP-Star consistently outperforms various baselines.},
year = {2024},
volume = {248},
publisher = {PMLR},
series = {Proceedings of Machine Learning Research},
software = {https://github.com/ritaranx/HTP-Star}
}
% p102
@InProceedings{fong24,
title = {Explainable and Privacy-Preserving Machine Learning via Domain-Aware Symbolic Regression},
author = {Fong, Kei Sen and Motani, Mehul},
pages = {198-216},
abstract = {Explainability and privacy are the top concerns in machine learning (ML) for medical applications. In this paper, we propose a novel method, Domain-Aware Symbolic Regression with Homomorphic Encryption (DASR-HE), that addresses both concerns simultaneously by: (i) producing domain-aware, intuitive and explainable models that do not require the end-user to possess ML expertise and (ii) training only on securely encrypted data without access to actual data values or model parameters. DASR-HE is based on Symbolic Regression (SR), which is a first-class ML approach that produces simple and concise equations for regression, requiring no ML expertise to interpret. In our work, we improve the performance of SR algorithms by using existing domain-specific medical equations to augment the search space of equations, decreasing the search complexity and producing equations that are similar in structure to those used in practice. To preserve the privacy of the medical data, we enable our algorithm to learn on data that is homomorphically encrypted (HE), meaning that arithmetic operations can be done in the encrypted space. This makes HE suitable for machine learning algorithms to learn models without access to the actual data values or model parameters. We evaluate DASR-HE on three medical tasks, namely predicting glomerular filtration rate, endotracheal tube (ETT) internal diameter and ETT depth and find that DASR-HE outperforms existing medical equations, other SR ML algorithms and other explainable ML algorithms. },
year = {2024},
volume = {248},
publisher = {PMLR},
series = {Proceedings of Machine Learning Research},
software = {https://github.com/kentridgeai/DASR}
}
% p104
@InProceedings{toye24,
title = {Simulation of Health Time Series with Nonstationarity},
author = {Toye, Adedolapo Aishat and Gomez, Louis and Kleinberg, Samantha},
pages = {217-232},
abstract = {Limited access to health data remains a challenge for developing machine learning (ML) models. Health data is difficult to share due to privacy concerns and often does not have ground truth. Simulated data is often used for evaluating algorithms, as it can be shared freely and generated with ground truth. However, for simulated data to be used as an alternative to real data, algorithmic performance must be similar to that of real data. Existing simulation approaches are either black boxes or rely solely on expert knowledge, which may be incomplete. These methods generate data that often overstates performance, as they do not simulate many of the properties that make real data challenging. Nonstationarity, where a system's properties or parameters change over time, is pervasive in health data with changing health status of patients, standards of care, and populations. This makes ML challenging and can lead to reduced model generalizability, yet there have not been ways to systematically simulate realistic nonstationary data. This paper introduces a modular approach for learning dataset-specific models of nonstationarity in real data and augmenting simulated data with these properties to generate realistic synthetic datasets. We show that our simulation approach brings performance closer to that of real data in stress classification and glucose forecasting in people with diabetes.},
year = {2024},
volume = {248},
publisher = {PMLR},
series = {Proceedings of Machine Learning Research},
software = {https://github.com/health-ai-lab/Nonstationarity-Simulation}
}
% p123
@InProceedings{behrouz24,
title = {Brain-Mamba: Encoding Brain Activity via Selective State Space Models},
author = {Behrouz, Ali and Hashemi, Farnoosh},
pages = {233-250},
abstract = {Representation learning of brain activity is a key step toward unleashing machine learning models for use in the diagnosis of neurological diseases/disorders. Diagnosis of different neurological diseases/disorders, however, might require paying more attention to either spatial or temporal resolutions of brain activity. Accordingly, a generalized brain activity learner requires the ability to learn from both resolutions. Most existing studies, however, use domain knowledge to design brain encoders, and so are limited to a single neuroimage modality (e.g., EEG or fMRI) and its single resolution. Furthermore, their architecture design either: (1) uses a self-attention mechanism with quadratic time with respect to input size, making its scalability limited, (2) is \emph{purely} based on message-passing graph neural networks, missing long-range dependencies and temporal resolution, and/or (3) encodes brain activity in each unit of the brain (e.g., voxel) separately, missing the dependencies of brain regions. In this study, we present BrainMamba, an attention-free, scalable, and powerful framework to learn brain activity multivariate timeseries. BrainMamba uses two modules: (i) A novel multivariate timeseries encoder that leverages an MLP to fuse information across variates and a Selective Structured State Space (S4) architecture to encode each timeseries. (ii) A novel graph learning framework that leverages message-passing neural networks along with the S4 architecture to selectively choose important brain regions. Our experiments on 7 real-world datasets with 3 modalities show that BrainMamba attains outstanding performance and outperforms all baselines in different downstream tasks.},
year = {2024},
volume = {248},
publisher = {PMLR},
series = {Proceedings of Machine Learning Research},
software = {https://github.com/GraphMamba/BrainMamba}
}
% p8
@InProceedings{naumzik24,
title = {Data-driven Subgrouping of Patient Trajectories with Chronic Diseases: Evidence from Low Back Pain},
author = {Naumzik, Christof Friedrich and Kongsted, Alice and Vach, Werner and Feuerriegel, Stefan},
pages = {251-279},
abstract = {Clinical data informs the personalization of health care with a potential for more effective disease management. In practice, this is achieved by \emph{subgrouping}, whereby clusters with similar patient characteristics are identified and then receive customized treatment plans with the goal of targeting subgroup-specific disease dynamics. In this paper, we propose a novel mixture hidden Markov model for subgrouping patient trajectories from \emph{chronic diseases}. Our model is probabilistic and carefully designed to capture different trajectory phases of chronic diseases (i.e., ``severe'', ``moderate'', and ``mild'') through tailored latent states. We demonstrate our subgrouping framework based on a longitudinal study across 847 patients with non-specific low back pain. Here, our subgrouping framework identifies 8 subgroups. Further, we show that our subgrouping framework outperforms common baselines in terms of cluster validity indices. Finally, we discuss the applicability of the model to other chronic and long-lasting diseases. },
year = {2024},
volume = {248},
publisher = {PMLR},
series = {Proceedings of Machine Learning Research},
software = {https://github.com/sfeuerriegel/PatientSubgrouping}
}
% p13
@InProceedings{lee24,
title = {Vision-Language Generative Model for View-Specific Chest X-ray Generation},
author = {Lee, Hyungyung and Lee, Da Young and Kim, Wonjae and Kim, Jin-Hwa and Kim, Tackeun and Kim, Jihang and Sunwoo, Leonard and Choi, Edward},
pages = {280-296},
abstract = {Synthetic medical data generation has opened up new possibilities in the healthcare domain, offering a powerful tool for simulating clinical scenarios, enhancing diagnostic and treatment quality, gaining granular medical knowledge, and accelerating the development of unbiased algorithms. In this context, we present a novel approach called ViewXGen, designed to overcome the limitations of existing methods that rely on general domain pipelines using only radiology reports to generate frontal-view chest X-rays. Our approach takes into consideration the diverse view positions found in the dataset, enabling the generation of chest X-rays with specific views, which marks a significant advancement in the field. To achieve this, we introduce a set of specially designed tokens for each view position, tailoring the generation process to the user's preferences. Furthermore, we leverage multi-view chest X-rays as input, incorporating valuable information from different views within the same study. This integration rectifies potential errors and contributes to faithfully capturing abnormal findings in chest X-ray generation. To validate the effectiveness of our approach, we conducted statistical analyses, evaluating its performance in a clinical efficacy metric on the MIMIC-CXR dataset. Also, human evaluation demonstrates the remarkable capabilities of ViewXGen, particularly in producing realistic view-specific X-rays that closely resemble the original images.},
year = {2024},
volume = {248},
publisher = {PMLR},
series = {Proceedings of Machine Learning Research},
software = {https://github.com/ttumyche/UniXGen}
}
% p18
@InProceedings{hoche24,
title = {FAMEWS: a Fairness Auditing tool for Medical Early-Warning Systems},
author = {Hoche, Marine and Mineeva, Olga and Burger, Manuel and Blasimme, Alessandro and Ratsch, Gunnar},
pages = {297-311},
abstract = {Machine learning applications hold promise to aid clinicians in a wide range of clinical tasks, from diagnosis to prognosis, treatment, and patient monitoring. These potential applications are accompanied by a surge of ethical concerns surrounding the use of Machine Learning (ML) models in healthcare, especially regarding fairness and non-discrimination. While there is an increasing number of regulatory policies to ensure the ethical and safe integration of such systems, the translation from policies to practices remains an open challenge. Algorithmic frameworks, aiming to bridge this gap, should be tailored to the application to enable the translation from fundamental human-right principles into accurate statistical analysis, capturing the inherent complexity and risks associated with the system.
In this work, we propose a set of fairness impartial checks especially adapted to ML early-warning systems in the medical context, comprising, on top of standard fairness metrics, an analysis of clinical outcomes, and a screening of potential sources of bias in the pipeline. Our analysis is further fortified by the inclusion of event-based and prevalence-corrected metrics, as well as statistical tests to measure biases. Additionally, we emphasize the importance of considering subgroups beyond the conventional demographic attributes.
Finally, to facilitate operationalization, we present an open-source tool FAMEWS to generate comprehensive fairness reports. These reports address the diverse needs and interests of the stakeholders involved in integrating ML into medical practice. The use of FAMEWS has the potential to reveal critical insights that might otherwise remain obscured. This can lead to improved model design, which in turn may translate into enhanced health outcomes.},
year = {2024},
volume = {248},
publisher = {PMLR},
series = {Proceedings of Machine Learning Research},
software = {https://github.com/ratschlab/famews}
}
% p22
@InProceedings{en24,
title = {Unsupervised Domain Adaptation for Medical Image Segmentation with Dynamic Prototype-based Contrastive Learning},
author = {En, Qing and Guo, Yuhong},
pages = {312-325},
abstract = {Medical image segmentation typically requires numerous dense annotations in the target domain to train models, which is time-consuming and labour-intensive. To alleviate this challenge, unsupervised domain adaptation (UDA) has emerged to enhance model generalization in the target domain by harnessing labeled data from the source domain along with unlabeled data from the target domain. In this paper, we introduce a novel Dynamic Prototype Contrastive Learning (DPCL) framework for UDA on medical image segmentation, which dynamically updates cross-domain global prototypes and excavates implicit discrepancy information in a contrastive manner. DPCL learns cross-domain global feature representations while enhancing the discriminative capability of the segmentation model. Specifically, we design a novel cross-domain prototype evolution module that generates evolved cross-domain prototypes by employing dynamic updating and evolutionary strategies. This module facilitates a gradual transition from the source to the target domain while acquiring cross-domain global guidance knowledge. Moreover, we devise a cross-domain embedding contrastive module that establishes contrastive relationships within the embedding space. This module captures both homogeneous and heterogeneous information within the same category and among different categories, thereby enhancing the discriminative capability of the segmentation model. Experimental results demonstrate that the proposed DPCL is effective and outperforms the state-of-the-art methods.},
year = {2024},
volume = {248},
publisher = {PMLR},
series = {Proceedings of Machine Learning Research},
software = {https://github.com/EnQing626/DPCL}
}
% p36
@InProceedings{bini24,
title = {FlowCyt: A Comparative Study of Deep Learning Approaches for Multi-Class Classification in Flow Cytometry Benchmarking},
author = {Bini, Lorenzo and Mojarrad, Fatemeh Nassajian and Liarou, Margarita and Matthes, Thomas and Marchand-Maillet, Stephane},
pages = {326-338},
abstract = {This paper presents FlowCyt, the first comprehensive benchmark for multi-class single-cell classification in flow cytometry data. The dataset comprises bone marrow samples from 30 patients, with each cell characterized by twelve markers. Ground truth labels identify five hematological cell types: T lymphocytes, B lymphocytes, Monocytes, Mast cells, and Hematopoietic Stem/Progenitor Cells (HSPCs). Experiments utilize supervised inductive learning and semi-supervised transductive learning on up to 1 million cells per patient. Baseline methods include Gaussian Mixture Models, XGBoost, Random Forests, Deep Neural Networks, and Graph Neural Networks (GNNs). GNNs demonstrate superior performance by exploiting spatial relationships in graph-encoded data. The benchmark allows standardized evaluation of clinically relevant classification tasks, along with exploratory analyses to gain insights into hematological cell phenotypes. This represents the first public flow cytometry benchmark with a richly annotated, heterogeneous dataset. It will empower the development and rigorous assessment of novel methodologies for single-cell analysis.},
year = {2024},
volume = {248},
publisher = {PMLR},
series = {Proceedings of Machine Learning Research},
software = {https://viper.unige.ch/flowcyt}
}
% p37
@InProceedings{hegselmann24,
title = {A Data-Centric Approach To Generate Faithful and High Quality Patient Summaries with Large Language Models},
author = {Hegselmann, Stefan and Shen, Zejiang and Gierse, Florian and Agrawal, Monica and Sontag, David and Jiang, Xiaoyi},
pages = {339-379},
abstract = {Patients often face difficulties in understanding their hospitalizations, while healthcare workers have limited resources to provide explanations. In this work, we investigate the potential of large language models to generate patient summaries based on doctors' notes and study the effect of training data on the faithfulness and quality of the generated summaries. To this end, we release (i) a rigorous labeling protocol for errors in medical texts and (ii) a publicly available dataset of annotated hallucinations in 100 doctor-written and 100 generated summaries. We show that fine-tuning on hallucination-free data effectively reduces hallucinations from 2.60 to 1.55 per summary for Llama 2, while preserving relevant information. We observe a similar effect on GPT-4 (0.70 to 0.40), when the few-shot examples are hallucination-free. We also conduct a qualitative evaluation using hallucination-free and improved training data. We find that common quantitative metrics do not correlate well with faithfulness and quality. Finally, we test GPT-4 for automatic hallucination detection, which clearly outperforms common baselines.},
year = {2024},
volume = {248},
publisher = {PMLR},
series = {Proceedings of Machine Learning Research},
software = {https://github.com/stefanhgm/patient_summaries_with_llms}
}
% p45
@InProceedings{wang24,
title = {Addressing Wearable Sleep Tracking Inequity: A New Dataset and Novel Methods for a Population with Sleep Disorders},
author = {Wang, Will Ke and Yang, Jiamu and Hershkovich, Leeor and Jeong, Hayoung and Chen, Bill and Singh, Karnika and Roghanizad, Ali R and Shandhi, Md Mobashir Hasan and Spector, Andrew R and Dunn, Jessilyn},
pages = {380-396},
abstract = {Sleep is crucial for health, and recent advances in wearable technology and machine learning offer promising methods for monitoring sleep outside the clinical setting. However, sleep tracking using wearables is challenging, particularly for those with irregular sleep patterns or sleep disorders. In this study, we introduce a dataset collected from 100 patients from the Duke Sleep Disorders Center who wore an Empatica E4 smartwatch during an overnight sleep study with concurrent clinical-grade polysomnography (PSG) recording. This dataset encompasses diverse demographics and medical conditions. We further introduce a new methodology that addresses the limitations of existing modeling methods when applied on patients with sleep disorders. Namely, we address the inability of existing models to account for 1) temporal relationships while leveraging relatively small data, by introducing a LSTM post-processing method, and 2) group-wise characteristics that impact classification task performance (i.e., random effects), by ensembling mixed-effects boosted tree models. This approach was highly successful for sleep onset and wakefulness detection in this sleep disordered population, achieving an F1 score of 0.823 ± 0.019, an AUROC of 0.926 ± 0.016, and a 0.695 ± 0.025 Cohen's Kappa. Overall, we demonstrate the utility of both the data that we collected, as well as our unique approach to address the existing gap in wearable-based sleep tracking in sleep disordered populations.},
year = {2024},
volume = {248},
publisher = {PMLR},
series = {Proceedings of Machine Learning Research},
software = {https://github.com/WillKeWang/DREAMT_FE}
}
% p48
@InProceedings{amirshahi24,
title = {FETCH: A Fast and Efficient Technique for Channel Selection in EEG Wearable Systems},
author = {Amirshahi, Alireza and Dan, Jonathan and Miranda, Jose Angel and Aminifar, Amir and Atienza, David},
pages = {397-409},
abstract = {The rapid development of wearable biomedical systems now enables real-time monitoring of electroencephalography (EEG) signals. Acquisition of these signals relies on electrodes. These systems must meet the design challenge of selecting an optimal set of electrodes that balances performance and usability constraints. The search for the optimal subset of electrodes from a larger set is a problem with combinatorial complexity. While existing research has primarily focused on search strategies that only explore limited combinations, our methodology proposes a computationally efficient way to explore all combinations. To avoid the computational burden associated with training the model for each combination, we leverage an innovative approach inspired by few-shot learning. Remarkably, this strategy covers all the wearable electrode combinations while significantly reducing training time compared to retraining the network on each possible combination. In the context of an epileptic seizure detection task, the proposed method achieves an AUC value of 0.917 with configurations using eight electrodes. This performance matches that of prior research but is achieved in significantly less time, transforming a process that would span months into a matter of hours on a single GPU device. Our work allows comprehensive exploration of electrode configurations in wearable biomedical device design, yielding insights that enhance performance and real-world feasibility.},
year = {2024},
volume = {248},
publisher = {PMLR},
series = {Proceedings of Machine Learning Research},
software = {https://github.com/esl-epfl/FETCH}
}
% p50
@InProceedings{balve24,
title = {Interpretable breast cancer classification using CNNs on mammographic images},
author = {Balve, Ann-Kristin and Hendrix, Peter},
pages = {410-426},
abstract = {Deep learning models have achieved promising results in breast cancer classification, yet their 'black-box' nature raises interpretability concerns. This research addresses the crucial need to gain insights into the decision-making process of convolutional neural networks (CNNs) for mammogram classification, specifically focusing on the underlying reasons for the CNN's predictions of breast cancer. For CNNs trained on the Mammographic Image Analysis Society (MIAS) dataset, we compared the post-hoc interpretability techniques LIME, Grad-CAM, and Kernel SHAP in terms of explanatory depth and computational efficiency. The results of this analysis indicate that Grad-CAM, in particular, provides comprehensive insights into the behavior of the CNN, revealing distinctive patterns in normal, benign, and malignant breast tissue. We discuss the implications of the current findings for the use of machine learning models and interpretation techniques in clinical practice.},
year = {2024},
volume = {248},
publisher = {PMLR},
series = {Proceedings of Machine Learning Research},
software = {https://github.com/annkristinbalve/Interpretable_Breast_Cancer_Classification}
}
% p55
@InProceedings{lau24,
title = {Using Expert Gaze for Self-Supervised and Supervised Contrastive Learning of Glaucoma from OCT Data},
author = {Lau, Wai Tak and Tian, Ye and Kenia, Roshan and Aima, Saanvi and Thakoor, Kaveri A},
pages = {427-445},
abstract = {In this work, we address the challenge of limited data availability common in healthcare settings by using clinician (ophthalmologist) gaze data on optical coherence tomography (OCT) report images as they diagnose glaucoma, a top cause of irreversible blindness worldwide. We directly learn gaze representations with our `GazeFormerMD' model to generate pseudo-labels using a novel multi-task objective, combining triplet and cross-entropy losses. We use these pseudo-labels for weakly supervised contrastive learning (WSupCon) to detect glaucoma from a partially-labeled dataset of OCT report images. Our natural-language-inspired region-based-encoding GazeFormerMD model pseudo-labels, trained using our multi-task objective, enable downstream glaucoma detection accuracy via WSupCon exceeding 91\% even with only 70\% labeled training data. Furthermore, a model pre-trained with GazeFormerMD-generated pseudo-labels and used for linear evaluation on an unseen OCT-report dataset achieved comparable performance to a fully-supervised, trained-from-scratch model while using only 25\% labeled data.},
year = {2024},
volume = {248},
publisher = {PMLR},
series = {Proceedings of Machine Learning Research},
software = {https://github.com/AI4VSLab/Expert-Gaze-4-Supervised-Contrastive-Learning}
}
% p58
@InProceedings{mahdi24,
title = {Tuning In: Comparative Analysis of Audio Classifier Performance in Clinical Settings with Limited Data},
author = {Mahdi, Hamza and Nashnoush, Eptehal and Saab, Rami and Balachandar, Arjun and Dagli, Rishit and Perri, Lucas and Khosravani, Houman},
pages = {446-460},
abstract = {This study assesses deep learning models for audio classification in a clinical setting with the constraint of small datasets reflecting the prospective collection of real-world data. We analyze CNNs, including DenseNet and ConvNeXt, alongside transformer models like ViT and SWIN, and compare them against pretrained audio models such as AST, YAMNet and VGGish. Our method highlights the benefits of pretraining on large datasets before fine-tuning on specific clinical data. We prospectively collected two first-of-their-kind patient audio datasets from stroke patients. We investigated various preprocessing techniques, finding that RGB and grayscale spectrogram transformations affect model performance differently based on the priors they learn from pretraining. Our findings indicate CNNs can match or exceed transformer models in small dataset contexts, with DenseNet-Contrastive and AST models showing notable performance. This study highlights the significance of incremental marginal gains through model selection, pretraining, and preprocessing in sound classification; this offers valuable insights for clinical diagnostics that rely on audio classification.},
year = {2024},
volume = {248},
publisher = {PMLR},
series = {Proceedings of Machine Learning Research},
software = {https://github.com/UofTNeurology}
}
% p59
@InProceedings{tandon24,
title = {s-SuStaIn: Scaling subtype and stage inference via simultaneous clustering of subjects and biomarkers},
author = {Tandon, Raghav and Lah, James J and Mitchell, Cassie S},
pages = {461-476},
abstract = {Event-based models (EBM) provide an important platform for modeling disease progression. This work successfully extends previous EBM approaches to work with larger sets of biomarkers while simultaneously modeling heterogeneity in disease progression trajectories. We develop and validate the s-SuStaIn method for scalable event-based modeling of disease progression subtypes using large numbers of features. s-SuStaIn is typically an order of magnitude faster than its predecessor (SuStaIn). Moreover, we perform a case study with s-SuStaIn using open access cross-sectional Alzheimer's Disease Neuroimaging Initiative (ADNI) data to stage AD patients into four subtypes based on dynamic disease progression. s-SuStaIn shows that the inferred subtypes and stages predict progression to AD among MCI subjects. The subtypes show differences in AD incidence rates and reveal clinically meaningful progression trajectories when mapped to a brain atlas.},
year = {2024},
volume = {248},
publisher = {PMLR},
series = {Proceedings of Machine Learning Research},
software = {https://github.com/pathology-dynamics/s-SuStaIn}
}
% p69
@InProceedings{wu24,
title = {Regulating AI Adaptation: An Analysis of AI Medical Device Updates},
author = {Wu, Kevin and Wu, Eric and Rodolfa, Kit and Ho, Daniel E and Zou, James},
pages = {477-488},
abstract = {While the pace of development of AI has rapidly progressed in recent years, the implementation of safe and effective regulatory frameworks has lagged behind. In particular, the adaptive nature of AI models presents unique challenges to regulators as updating a model can improve its performance but also introduce safety risks. In the US, the Food and Drug Administration (FDA) has been a forerunner in regulating and approving hundreds of AI medical devices. To better understand how AI is updated and its regulatory considerations, we systematically analyze the frequency and nature of updates in FDA-approved AI medical devices. We find that less than 2\% of all devices report having been updated by being re-trained on new data. Meanwhile, nearly a quarter of devices report updates in the form of new functionality and marketing claims. As an illustrative case study, we analyze pneumothorax detection models and find that while model performance can degrade by as much as 0.18 AUC when evaluated on new sites, re-training on site-specific data can mitigate this performance drop, recovering up to 0.23 AUC. However, we also observed significant degradation on the original site after re-training using data from new sites, providing insight from one example that challenges the current one-model-fits-all approach to regulatory approvals. Our analysis provides an in-depth look at the current state of FDA-approved AI device updates and insights for future regulatory policies toward model updating and adaptive AI.},
year = {2024},
volume = {248},
publisher = {PMLR},
series = {Proceedings of Machine Learning Research},
software = {https://github.com/kevinwu23/AIUpdating}
}
% p76
@InProceedings{ahsan24,
title = {Retrieving Evidence from EHRs with LLMs: Possibilities and Challenges},
author = {Ahsan, Hiba and McInerney, Denis Jered and Kim, Jisoo and Potter, Christopher A and Young, Geoffrey and Amir, Silvio and Wallace, Byron C},
pages = {489-505},
abstract = {Unstructured data in Electronic Health Records (EHRs) often contains critical information---complementary to imaging---that could inform radiologists' diagnoses. But the large volume of notes often associated with patients together with time constraints renders manually identifying relevant evidence practically infeasible. In this work we propose and evaluate a zero-shot strategy for using LLMs as a mechanism to efficiently retrieve and summarize unstructured evidence in patient EHR relevant to a given query. Our method entails tasking an LLM to infer whether a patient has, or is at risk of, a particular condition on the basis of associated notes; if so, we ask the model to summarize the supporting evidence. Under expert evaluation, we find that this LLM-based approach provides outputs consistently preferred to a pre-LLM information retrieval baseline. Manual evaluation is expensive, so we also propose and validate a method using an LLM to evaluate (other) LLM outputs for this task, allowing us to scale up evaluation. Our findings indicate the promise of LLMs as interfaces to EHR, but also highlight the outstanding challenge posed by ``hallucinations''. In this setting, however, we show that model confidence in outputs strongly correlates with faithful summaries, offering a practical means to limit confabulations.},
year = {2024},
volume = {248},
publisher = {PMLR},
series = {Proceedings of Machine Learning Research},
software = {https://github.com/hibaahsan/chil_diagnosis_evidence/}
}
% p79
@InProceedings{raymond24,
title = {Development of Error Passing Network for Optimizing the Prediction of VO$_2$ peak in Childhood Acute Leukemia Survivors},
author = {Raymond, Nicolas and Laribi, Hakima and Caru, Maxime and Mitiche, Mehdi and Marcil, Valerie and Krajinovic, Maja and Curnier, Daniel and Sinnett, Daniel and Valli\`eres, Martin},
pages = {506-521},
abstract = {Approximately two-thirds of survivors of childhood acute lymphoblastic leukemia (ALL) cancer develop late adverse effects post-treatment. Prior studies explored prediction models for personalized follow-up, but none integrated the usage of neural networks to date. In this work, we propose the Error Passing Network (EPN), a graph-based method that leverages relationships between samples to propagate residuals and adjust predictions of any machine learning model. We tested our approach to estimate patients' VO$_2$ peak, a reliable indicator of their cardiac health. We used the EPN in conjunction with several baseline models and observed up to 12.16\% improvement in the mean average percentage error compared to the last established equation predicting VO$_2$ peak in childhood ALL survivors. Along with this performance improvement, our final model is more efficient considering that it relies only on clinical variables that can be self-reported by patients, therefore removing the previous need to execute a resource-consuming physical test.},
year = {2024},
volume = {248},
publisher = {PMLR},
series = {Proceedings of Machine Learning Research},
software = {https://github.com/Rayn2402/ErrorPassingNetwork}
}
% p86
@InProceedings{kim24b,
title = {Health-LLM: Large Language Models for Health Prediction via Wearable Sensor Data},
author = {Kim, Yubin and Xu, Xuhai and McDuff, Daniel and Breazeal, Cynthia and Park, Hae Won},
pages = {522-539},
abstract = {Large language models (LLMs) are capable of many natural language tasks, yet they are far from perfect. In health applications, grounding and interpreting domain-specific and non-linguistic data is crucial. This paper investigates the capacity of LLMs to make inferences about health based on contextual information (e.g., user demographics, health knowledge) and physiological data (e.g., resting heart rate, sleep minutes). We present a comprehensive evaluation of 12 state-of-the-art LLMs with prompting and fine-tuning techniques on four public health datasets (PMData, LifeSnaps, GLOBEM and AW\_FB). Our experiments cover 10 consumer health prediction tasks in mental health, activity, metabolic, and sleep assessment. Our fine-tuned model, HealthAlpaca, exhibits comparable performance to much larger models (GPT-3.5, GPT-4 and Gemini-Pro), achieving the best performance in \textbf{8 out of 10} tasks. Ablation studies highlight the effectiveness of context enhancement strategies. Notably, we observe that our context enhancement can yield up to \textbf{23.8\%} improvement in performance. While constructing contextually rich prompts (combining user context, health knowledge and temporal information) exhibits synergistic improvement, the inclusion of health knowledge context in prompts significantly enhances overall performance.},
year = {2024},
volume = {248},
publisher = {PMLR},
series = {Proceedings of Machine Learning Research},
software = {https://github.com/mitmedialab/Health-LLM}
}
% p87
@InProceedings{yeche24,
title = {Dynamic Survival Analysis for Early Event Prediction},
author = {Y\`eche, Hugo and Burger, Manuel and Veshchezerova, Dinara and R\"atsch, Gunnar},
pages = {540-557},
abstract = {This study advances Early Event Prediction (EEP) in healthcare through Dynamic Survival Analysis (DSA), offering a novel approach by integrating risk localization into alarm policies to enhance clinical event metrics. By adapting and evaluating DSA models against traditional EEP benchmarks, our research demonstrates their ability to match EEP models on a time-step level and significantly improve event-level metrics through a new alarm prioritization scheme (up to 11\% AuPRC difference). This approach represents a significant step forward in predictive healthcare, providing a more nuanced and actionable framework for early event prediction and management.},
year = {2024},
volume = {248},
publisher = {PMLR},
series = {Proceedings of Machine Learning Research},
software = {https://github.com/ratschlab/dsa-for-eep}
}
% p93
@InProceedings{sidulova24,
title = {Contextual Unsupervised Deep Clustering in Digital Pathology},
author = {Sidulova, Mariia and Kahaki, Seyed and Hagemann, Ian and Gossmann, Alexej},
pages = {558-565},
abstract = {Clustering can be used in medical imaging research to identify different domains within a specific dataset, aiding in a better understanding of subgroups or strata that may not have been annotated. Moreover, in digital pathology, clustering can be used to effectively sample image patches from whole slide images (WSI). In this work, we conduct a comparative analysis of three deep clustering algorithms -- a simple two-step approach applying K-means onto a learned feature space, an end-to-end deep clustering method (DEC), and a Graph Convolutional Network (GCN) based method -- in application to a digital pathology dataset of endometrial biopsy WSIs. For consistency, all methods use the same Autoencoder (AE) architecture backbone that extracts features from image patches. The GCN-based model, specifically, stands out as a deep clustering algorithm that considers spatial contextual information in predicting clusters. Our study highlights the computation of graphs for WSIs and emphasizes the impact of these graphs on the formation of clusters. The main finding of our research indicates that GCN-based deep clustering demonstrates heightened spatial awareness compared to the other methods, resulting in higher cluster agreement with previous clinical annotations of WSIs.},
year = {2024},
volume = {248},
publisher = {PMLR},
series = {Proceedings of Machine Learning Research},
software = {https://github.com/DIDSR/DomId}
}
% p94
@InProceedings{nzeyimana24,
title = {DoseMate: A Real-world Evaluation of Machine Learning Classification of Pill Taking Using Wrist-worn Motion Sensors},
author = {Nzeyimana, Antoine and Campbell, Anthony and Scanlan, James M and Stekler, Joanne D and Marquard, Jenna and Saver, Barry G and Gummeson, Jeremy},
pages = {566-581},
abstract = {Non-adherence to medication is a complex behavioral issue that costs hundreds of billions of dollars annually in the United States alone. Existing solutions to improve medication adherence are limited in their effectiveness and require significant user involvement. To address this, a minimally invasive mobile health system called DoseMate is proposed, which can provide quantifiable adherence data and imposes minimal user burden. To classify a motion time-series that defines pill-taking, we adopt transfer-learning and data-augmentation-based techniques that use captured pill-taking gestures along with other open datasets that represent negative labels of other wrist motions. The paper also provides a design methodology that generalizes to other systems and describes a first-of-its-kind, in-the-wild, unobtrusively obtained dataset that contains unrestricted pill-related motion data from a diverse set of users.},
year = {2024},
volume = {248},
publisher = {PMLR},
series = {Proceedings of Machine Learning Research},
software = {https://gitlab.com/umass-smelt-lab/dosemate}
}
% p119
@InProceedings{visy24,
title = {Systematic Evaluation of Self-Supervised Learning Approaches for Wearable-Based Fatigue Recognition},
author = {Visy, Tam\'as and Kuznetsova, Rita and Holz, Christian and Gashi, Shkurta},
pages = {582-596},
abstract = {Fatigue is one of the most prevalent symptoms of chronic diseases, such as Multiple Sclerosis, Alzheimer's, and Parkinson's. Recently, researchers have explored unobtrusive and continuous ways of fatigue monitoring using mobile and wearable devices. However, data quality and limited labeled data availability in the wearable health domain pose significant challenges to progress in the field. In this work, we perform a systematic evaluation of self-supervised learning (SSL) tasks for fatigue recognition using wearable sensor data. To establish our benchmark, we use Homekit2020, which is a large-scale dataset collected using Fitbit devices in everyday life settings. Our results show that the majority of the SSL tasks outperform fully supervised baselines for fatigue recognition, even in limited labeled data scenarios. In particular, the domain features and multi-task learning achieve 0.7371 and 0.7323 AUROC, which are higher than the other SSL tasks and supervised learning baselines. In most of the pre-training tasks, the performance is higher when using at least one data augmentation that reflects the potentially low quality of wearable data (e.g., missing data). Our findings open up promising opportunities for continuous assessment of fatigue in real settings and can be used to guide the design and development of health monitoring systems.},
year = {2024},
volume = {248},
publisher = {PMLR},
series = {Proceedings of Machine Learning Research},
software = {https://github.com/tamas-visy/SSLA-WBFR}
}
% p31
@InProceedings{mandyam24,
title = {Adaptive Interventions with User-Defined Goals for Health Behavior Change},
author = {Mandyam, Aishwarya and J\"orke, Matthew and Denton, William and Engelhardt, Barbara E. and Brunskill, Emma},
pages = {597-618},
abstract = {Promoting healthy lifestyle behaviors remains a major public health concern, particularly due to their crucial role in preventing chronic conditions such as cancer, heart disease, and type 2 diabetes. Mobile health applications present a promising avenue for low-cost, scalable health behavior change promotion. Researchers are increasingly exploring adaptive algorithms that personalize interventions to each person's unique context. However, in empirical studies, mobile health applications often suffer from small effect sizes and low adherence rates, particularly in comparison to human coaching. Tailoring advice to a person's unique goals, preferences, and life circumstances is a critical component of health coaching that has been underutilized in adaptive algorithms for mobile health interventions. To address this, we introduce a new Thompson sampling algorithm that can accommodate personalized reward functions (i.e., goals, preferences, and constraints), while also leveraging data sharing across individuals to provide effective recommendations more quickly. We prove that our modification incurs only a constant penalty on cumulative regret while preserving the sample complexity benefits of data sharing. We present empirical results on synthetic and semi-synthetic physical activity simulators; for the latter, we conducted an online survey to solicit preference data relating to physical activity, which we use to construct realistic reward models that leverage historical data from another study. Our algorithm achieves substantial performance improvements compared to baselines that do not share data or do not optimize for individualized rewards.},
year = {2024},
volume = {248},
publisher = {PMLR},
series = {Proceedings of Machine Learning Research},
software = {https://github.com/StanfordAI4HI/adaptive-interventions-with-goals}
}
% p90
@InProceedings{cusick24,
title = {Algorithmic Changes Are Not Enough: Evaluating the Removal of Race Adjustment From the eGFR Equation},
author = {Cusick, Marika M and Chertow, Glenn M and Owens, Douglas K and Williams, Michelle Y and Rose, Sherri},
pages = {619-643},
abstract = {Changing clinical algorithms to remove race adjustment has been proposed and implemented for multiple health conditions. Removing race adjustment from estimated glomerular filtration rate (eGFR) equations may reduce disparities in chronic kidney disease (CKD), but has not been studied in clinical practice after implementation. Here, we assessed whether implementing an eGFR equation (CKD-EPI 2021) without adjustment for Black or African American race modified quarterly rates of nephrology referrals and visits within a single healthcare system, Stanford Health Care (SHC). Our cohort study analyzed 547,194 adult patients aged 21 and older who had at least one recorded serum creatinine or serum cystatin C between January 1, 2019 and September 1, 2023. During the study period, implementation of CKD-EPI 2021 did not modify rates of quarterly nephrology referrals in those documented as Black or African American or in the overall cohort. After adjusting for capacity at SHC nephrology clinics, estimated rates of nephrology referrals and visits with CKD-EPI 2021 were 34 (95\% CI 29, 39) and 188 (175, 201) per 10,000 patients documented as Black or African American. If race adjustment had not been removed, estimated rates would have been nearly identical: 38 (95\% CI: 28, 53) and 189 (165, 218) per 10,000 patients. Changes to the eGFR equation are likely insufficient to achieve health equity in CKD care decision-making, as many other structural inequities remain.},
year = {2024},
volume = {248},
publisher = {PMLR},
series = {Proceedings of Machine Learning Research},
software = {https://github.com/StanfordHPDS/egfr_equation_shc}
}
% p97
@InProceedings{kasl24,
title = {A cross-study Analysis of Wearable Datasets and the Generalizability of Acute Illness Monitoring Models},
author = {Kasl, Patrick and Soltani, Severine and Keeler Bruce, Lauryn and Kumar Viswanath, Varun and Hartogensis, Wendy and Gupta, Amarnath and Altintas, Ilkay and Dilchert, Stephan and Hecht, Frederick M and Mason, Ashley and Smarr, Benjamin L},
pages = {644-682},
abstract = {Large-scale wearable datasets are increasingly being used for biomedical research and to develop machine learning (ML) models for longitudinal health monitoring applications. However, it is largely unknown whether biases in these datasets lead to findings that do not generalize. Here, we present the first comparison of the data underlying multiple longitudinal, wearable-device-based datasets. We examine participant-level resting heart rate (HR) from four studies, each with thousands of wearable device users. We demonstrate that multiple regression, a community standard statistical approach, leads to conflicting conclusions about important demographic variables (age vs resting HR) and significant intra- and inter-dataset differences in HR. We then directly test the cross-dataset generalizability of a commonly used ML model trained for three existing day-level monitoring tasks: prediction of testing positive for a respiratory virus, flu symptoms, and fever symptoms. Regardless of task, most models showed relative performance loss on external datasets; most of this performance change can be attributed to concept shift between datasets. These findings suggest that research using large-scale, pre-existing wearable datasets might face bias and generalizability challenges similar to research in more established biomedical and ML disciplines. We hope that the findings from this study will encourage discussion in the wearable-ML community around standards that anticipate and account for challenges in dataset bias and model generalizability.},
year = {2024},
volume = {248},
publisher = {PMLR},
series = {Proceedings of Machine Learning Research},
software = {https://github.com/chil-submission/wearable_generalizability}
}
% p120
@InProceedings{telukunta24,
title = {Learning Social Fairness Preferences from Non-Expert Stakeholder Opinions in Kidney Placement},
author = {Telukunta, Mukund and Rao, Sukruth and Stickney, Gabriella and Nadendla, Venkata Sriram Siddhardh and Canfield, Casey},
pages = {683-695},
abstract = {Modern kidney placement incorporates several intelligent recommendation systems which exhibit social discrimination due to biases inherited from training data. Although initial attempts were made in the literature to study algorithmic fairness in kidney placement, these methods replace true outcomes with surgeons' decisions due to the long delays involved in recording such outcomes reliably. However, the replacement of true outcomes with surgeons' decisions disregards expert stakeholders' biases as well as social opinions of other stakeholders who do not possess medical expertise. This paper alleviates the latter concern and designs a novel fairness feedback survey to evaluate an acceptance rate predictor (ARP) that predicts a kidney's acceptance rate in a given kidney-match pair. The survey is launched on Prolific, a crowdsourcing platform, and public opinions are collected from 85 anonymous crowd participants. A novel social fairness preference learning algorithm is proposed based on minimizing social feedback regret computed using a novel logit-based fairness feedback model. The proposed model and learning algorithm are both validated using simulation experiments as well as Prolific data. Public preferences towards group fairness notions in the context of kidney placement have been estimated and discussed in detail. The specific ARP tested in the Prolific survey has been deemed fair by the participants.},
year = {2024},
volume = {248},
publisher = {PMLR},
series = {Proceedings of Machine Learning Research},
software = {}
}