@proceedings{icbinb-2023,
booktitle = {Proceedings on ``I Can't Believe It's Not Better: Failure
Modes in the Age of Foundation Models'' at NeurIPS 2023 Workshops},
editor = {Antor\'an, Javier and Blaas, Arno and Buchanan, Kelly and
Feng, Fan and Fortuin, Vincent and Ghalebikesabi, Sahra and
Kriegler, Andreas and Mason, Ian and Rohde, David and
Ruiz, Francisco J. R. and Uelwer, Tobias and
Xie, Yubin and Yang, Rui
},
year = 2023,
shortname = {ICBINB 23},
volume = 239,
start = {2023-12-16},
end = {2023-12-16},
published = {2023-04-24},
address = {New Orleans, Louisiana, USA},
conference_url = {https://sites.google.com/view/icbinb-2023/},
conference_number = 4,
}
@InProceedings{alazraki23,
title = {How (not) to ensemble LVLMs for VQA},
author = {Alazraki, Lisa and Castrejon, Lluis and Dehghani, Mostafa and
Huot, Fantine and Uijlings, Jasper and Mensink, Thomas},
pages = {1-20},
abstract = {This paper studies ensembling in the era of Large Vision-Language Models (LVLMs). Ensembling is a classical method to combine different models to get increased performance. In the recent work on Encyclopedic-VQA the authors examine a wide variety of models to solve their task: from vanilla LVLMs, to models including the caption as extra context, to models augmented with Lens-based retrieval of Wikipedia pages. Intuitively these models are highly complementary, which should make them ideal for ensembling. Indeed, an oracle experiment (Fig. 1) shows potential gains from 48.8\% accuracy (the best single model) all the way up to 67\% (best possible ensemble). So it is a trivial exercise to create an ensemble with substantial real gains. Or is it?},
}
@InProceedings{hsu23,
title = {Can Visual Scratchpads With Diagrammatic Abstractions Augment LLM Reasoning?},
author = {Hsu, Joy and Poesia, Gabriel and Wu, Jiajun and Goodman, Noah},
pages = {21-28},
abstract = {When humans reason about complex text-based questions, we leverage diagrammatic abstractions drawn on a visual scratchpad. In this paper, we introduce and explore the capabilities of Visual-Scratchpad, a method that augments a large language foundation model (LLM) with diagrammatic execution and readout. We enable the LLM to generate drawing commands and to readout abstractions from the resulting picture. The visual readout operation uses a visual foundation model, optionally finetuned with expert iteration. Here, we show that although Visual-Scratchpad outperforms an inference-only LLM, it surprisingly yields worse performance compared to a single finetuned LLM. Through experiments, we propose that this gap is due to the failure mode of vision foundation models in understanding abstractions in diagrams.},
}
@InProceedings{lazovich23,
title = {Filter bubbles and affective polarization in user-personalized large language model outputs},
author = {Lazovich, Tomo},
pages = {29-37},
abstract = {Echoing the history of search engines and social media content rankings, the advent of large language models (LLMs) has led to a push for increased personalization of model outputs to individual users. In the past, personalized recommendations and ranking systems have been linked to the development of filter bubbles (serving content that may confirm a user's existing biases) and affective polarization (strong negative sentiment towards those with differing views). In this work, we explore how prompting a leading large language model, ChatGPT-3.5, with a user's political affiliation prior to asking factual questions about public figures and organizations leads to differing results. We observe that left-leaning users tend to receive more positive statements about left-leaning political figures and media outlets, while right-leaning users see more positive statements about right-leaning entities. This pattern holds across presidential candidates, members of the U.S. Senate, and media organizations with ratings from AllSides. When qualitatively evaluating some of these outputs, there is evidence that particular facts are included or excluded based on the user's political affiliation. These results illustrate that personalizing LLMs based on user demographics carries the same risks of affective polarization and filter bubbles that have been seen in other personalized internet technologies. This ``failure mode'' should be monitored closely as there are more attempts to monetize and personalize these models.},
}
@InProceedings{mohta23,
title = {Are large language models good annotators?},
author = {Mohta, Jay and Ak, Kenan and Xu, Yan and Shen, Mingwei},
pages = {38-48},
abstract = {Numerous Natural Language Processing (NLP) tasks require precisely labeled data to ensure effective model training and achieve optimal performance. However, data annotation is marked by substantial costs and time requirements, especially when requiring specialized domain expertise or annotating a large number of samples. In this study, we investigate the feasibility of employing large language models (LLMs) as replacements for human annotators. We assess the zero-shot performance of various LLMs of different sizes to determine their viability as substitutes. Furthermore, recognizing that human annotators have access to diverse modalities, we introduce an image-based modality using the BLIP-2 architecture to evaluate LLM annotation performance. Among the tested LLMs, Vicuna-13b demonstrates competitive performance across diverse tasks. To assess the potential for LLMs to replace human annotators, we train a supervised model using labels generated by LLMs and compare its performance with models trained using human-generated labels. However, our findings reveal that models trained with human labels consistently outperform those trained with LLM-generated labels. We also highlight the challenges faced by LLMs in multilingual settings, where their performance significantly diminishes for tasks in languages other than English.},
}
@InProceedings{ren23,
title = {Self-Evaluation Improves Selective Generation in Large Language Models},
author = {Ren, Jie and Zhao, Yao and Vu, Tu and Liu, Peter J. and Lakshminarayanan, Balaji},
pages = {49-64},
abstract = {Safe deployment of large language models (LLMs) may benefit from a reliable method for assessing their generated content to determine when to abstain or to selectively generate. While likelihood-based metrics such as perplexity are widely employed, recent research has demonstrated the limitations of using sequence-level probability estimates given by LLMs as reliable indicators of generation quality. Conversely, LLMs have demonstrated strong calibration at the token level, particularly when it comes to choosing correct answers in multiple-choice questions or evaluating true/false statements. In this work, we reformulate open-ended generation tasks into token-level prediction tasks, and leverage LLMs' superior calibration at the token level. We instruct an LLM to self-evaluate its answers, employing either a multi-way comparison or a point-wise evaluation approach, with the option to include a ``None of the above'' option to express the model's uncertainty explicitly. We benchmark a range of scoring methods based on self-evaluation and evaluate their performance in selective generation using TruthfulQA and TL;DR. Through extensive experiments with PaLM-2 and GPT-3, we demonstrate that self-evaluation based scores not only improve accuracy, but also correlate better with the overall quality of generated content.},
}
@InProceedings{rezk23,
title = {Is Scaling Learned Optimizers Worth It? Evaluating The Value of VeLO's 4000 TPU Months},
author = {Rezk, Fady and Antoniou, Antreas and Gouk, Henry and Hospedales, Timothy},
pages = {65-83},
abstract = {We analyze VeLO (versatile learned optimizer), the largest scale attempt to train a general purpose ``foundational'' optimizer to date. VeLO was trained on thousands of machine learning tasks over 4000 TPU months with the goal of producing an optimizer capable of generalizing to new problems while being hyper-parameter free, and outperforming industry standards such as Adam. We independently evaluate VeLO on the MLCommons optimizer benchmark suite. We find that contrary to initial claims: (1) VeLO has a critical hyper-parameter that needs problem-specific tuning, (2) VeLO does not necessarily outperform competitors in quality of solution found, and (3) VeLO is not faster than competing optimizers at reducing the training loss. These observations call into question VeLO's generality and the value of the investment in training it.},
}
@InProceedings{saravanan23,
title = {Exploring Social Bias in Downstream Applications of Text-to-Image Foundation Models},
author = {Saravanan, Adhithya Prakash and Kocielnik, Rafal and Jiang, Roy and Han, Pengrui and Anandkumar, Anima},
pages = {84-102},
abstract = {Text-to-image diffusion models have been adopted into key commercial workflows, such as art generation and image editing. Characterizing the implicit social biases they exhibit, such as gender and racial stereotypes, is a necessary first step in avoiding discriminatory outcomes. While existing studies on social bias focus on image generation, the biases exhibited in alternate applications of diffusion-based foundation models remain under-explored. We propose a framework that uses synthetic images to probe two applications of diffusion models, image editing and classification, for social bias. Using our framework, we uncover meaningful and significant intersectional social biases in Stable Diffusion, a state-of-the-art open-source text-to-image model. Our findings caution against the uninformed adoption of text-to-image foundation models for downstream tasks and services.},
}
@InProceedings{schwinn23,
title = {Adversarial Attacks and Defenses in Large Language Models: Old and New Threats},
author = {Schwinn, Leo and Dobre, David and G{\"u}nnemann, Stephan and Gidel, Gauthier},
pages = {103-117},
abstract = {Over the past decade, there has been extensive research aimed at enhancing the robustness of neural networks, yet this problem remains vastly unsolved. Here, one major impediment has been the overestimation of the robustness of new defense approaches due to faulty defense evaluations. Flawed robustness evaluations necessitate rectifications in subsequent works, dangerously slowing down the research and providing a false sense of security. In this context, we will face substantial challenges associated with an impending adversarial arms race in natural language processing, specifically with closed-source Large Language Models (LLMs), such as ChatGPT, Google Bard, or Anthropic’s Claude. We provide a first set of prerequisites to improve the robustness assessment of new approaches and reduce the number of faulty evaluations. Additionally, we identify embedding space attacks on LLMs as another viable threat model for the purposes of generating malicious content in open-sourced models. Finally, we demonstrate on a recently proposed defense that, without LLM-specific best practices in place, it is easy to overestimate the robustness of a new approach.},
}
@InProceedings{wu23,
title = {The Role of Linguistic Priors in Measuring Compositional Generalization of Vision-Language Models},
author = {Wu, Chenwei and Li, Li Erran and Ermon, Stefano and Haffner, Patrick and Ge, Rong and Zhang, Zaiwei},
pages = {118-126},
abstract = {Compositionality is a common property in many modalities including text and images, but the compositional generalization of multi-modal models is not well-understood. In this paper, we identify two sources of visual-linguistic compositionality: linguistic priors and the interplay between images and texts. We show that current attempts to improve compositional generalization rely on linguistic priors rather than on information in the image, as the strength of the language model in detecting sentences that are syntactically and semantically likely overwhelms the vision part of the model. We find in particular that a benchmark for compositionality mostly favors pure language models. Finally, we propose a new benchmark for compositionality without such linguistic priors.},
}
@InProceedings{zhang23,
title = {Pre-trained Language Models Do Not Help Auto-regressive Text-to-Image Generation},
author = {Zhang, Yuhui and McKinzie, Brandon and Gan, Zhe and Shankar, Vaishaal and Toshev, Alexander},
pages = {127-133},
abstract = {Recent advances in image tokenizers, such as VQ-VAE, have enabled text-to-image generation using auto-regressive methods, similar to language modeling. However, these methods have yet to leverage pre-trained language models, despite their adaptability to various downstream tasks. In this work, we explore this gap by adapting a pre-trained language model for auto-regressive text-to-image generation, and find that pre-trained language models offer limited help. We provide a two-fold explanation by analyzing tokens from each modality. First, we demonstrate that image tokens possess significantly different semantics compared to text tokens, rendering pre-trained language models no more effective in modeling them than randomly initialized ones. Second, the text tokens in the image-text datasets are too simple compared to normal language model pre-training data, which causes the catastrophic degradation of language models' capability.},
}