diff --git a/authors/elisa_bassignana/avatar.jpg b/authors/elisa_bassignana/avatar.jpg
new file mode 100644
index 00000000..6e7f8396
Binary files /dev/null and b/authors/elisa_bassignana/avatar.jpg differ
diff --git a/authors/elisa_bassignana/avatar_hu0203d7a3077547503af8e092de372d9e_14153485_270x270_fill_q90_lanczos_center.jpg b/authors/elisa_bassignana/avatar_hu0203d7a3077547503af8e092de372d9e_14153485_270x270_fill_q90_lanczos_center.jpg
new file mode 100644
index 00000000..050871ce
Binary files /dev/null and b/authors/elisa_bassignana/avatar_hu0203d7a3077547503af8e092de372d9e_14153485_270x270_fill_q90_lanczos_center.jpg differ
diff --git a/authors/elisa_bassignana/index.html b/authors/elisa_bassignana/index.html
new file mode 100644
index 00000000..d6116018
--- /dev/null
+++ b/authors/elisa_bassignana/index.html
@@ -0,0 +1,956 @@
+
+
+
+
+
+
diff --git a/index.json b/index.json
index f31869db..6b4aa85c 100644
--- a/index.json
+++ b/index.json
@@ -1 +1 @@
-[{"authors":["hovy_dirk"],"categories":null,"content":"Dirk Hovy is a Full Professor in the Computing Sciences Department of Bocconi University, and the scientific director of the Data and Marketing Insights research unit. Previously, he was faculty at the University of Copenhagen, got a PhD from USC\u0026rsquo;s Information Sciences Institute, and a linguistics master\u0026rsquo;s in Germany.\nDirk is interested in the interaction between language, society, and machine learning, or what language can tell us about society, and what computers can tell us about language. He is also interested in ethical questions of bias and algorithmic fairness in machine learning.\nHe has authored over 150 articles on these topics, including 3 best and one outstanding paper awards, and published two textbooks on NLP in Python for social scientists.\nDirk has co-founded and organized several workshops (on computational social science, and ethics in NLP), and was a local organizer for the EMNLP 2017 conference. He was awarded an ERC Starting Grant project 2020 for research on demographic bias in NLP.\nOutside of work, Dirk enjoys cooking, leather-crafting, and picking up heavy things just to put them back down.\n","date":-62135596800,"expirydate":-62135596800,"kind":"term","lang":"en","lastmod":-62135596800,"objectID":"2162c73950c747d1a2acd1061edae370","permalink":"https://milanlproc.github.io/authors/1_dirk_hovy/","publishdate":"0001-01-01T00:00:00Z","relpermalink":"/authors/1_dirk_hovy/","section":"authors","summary":"Dirk Hovy is a Full Professor in the Computing Sciences Department of Bocconi University, and the scientific director of the Data and Marketing Insights research unit. Previously, he was faculty at the University of Copenhagen, got a PhD from USC\u0026rsquo;s Information Sciences Institute, and a linguistics master\u0026rsquo;s in Germany.","tags":null,"title":"Dirk Hovy","type":"authors"},{"authors":["debora_nozza"],"categories":null,"content":"Debora Nozza (she/her) is an Assistant Professor in Computing Sciences at Bocconi University. She was awarded a €1.5m ERC Starting Grant project 2023 for research on personalized and subjective approaches to Natural Language Processing. Previously, she was awarded a €120,000 grant from Fondazione Cariplo for her project MONICA, which focuses on monitoring coverage, attitudes, and accessibility of Italian measures in response to COVID-19. Her research interests mainly focus on Natural Language Processing, specifically on the detection and counter-acting of hate speech and algorithmic bias on Social Media data in multilingual context.\nShe organized the 7th Workshop on Online Abuse and Harms (WOAH) at ACL 2023 and the ICWSM 2023 Data Challenge: Temporal social data at ICWSM 2023. She was one of the organizers of the task on Automatic Misogyny Identification (AMI) at Evalita 2018 and Evalita 2020, and one of the organizers of the HatEval Task 5 at SemEval 2019 on multilingual detection of hate speech against immigrants and women in Twitter.\n","date":-62135596800,"expirydate":-62135596800,"kind":"term","lang":"en","lastmod":-62135596800,"objectID":"6270b423d82bc34de53ce30c4d80baf9","permalink":"https://milanlproc.github.io/authors/2_debora_nozza/","publishdate":"0001-01-01T00:00:00Z","relpermalink":"/authors/2_debora_nozza/","section":"authors","summary":"Debora Nozza (she/her) is an Assistant Professor in Computing Sciences at Bocconi University. 
She was awarded a €1.5m ERC Starting Grant project 2023 for research on personalized and subjective approaches to Natural Language Processing.","tags":null,"title":"Debora Nozza","type":"authors"},{"authors":["amanda_cercas_curry"],"categories":null,"content":"","date":-62135596800,"expirydate":-62135596800,"kind":"term","lang":"en","lastmod":-62135596800,"objectID":"d18e690d2acda385ba9272ff4c77b099","permalink":"https://milanlproc.github.io/authors/amanda_cercas_curry/","publishdate":"0001-01-01T00:00:00Z","relpermalink":"/authors/amanda_cercas_curry/","section":"authors","summary":"","tags":null,"title":"Amanda Cercas Curry","type":"authors"},{"authors":["anne_lauscher"],"categories":null,"content":"","date":-62135596800,"expirydate":-62135596800,"kind":"term","lang":"en","lastmod":-62135596800,"objectID":"52ff7d721d80e8ff7b6937ad8d1aca31","permalink":"https://milanlproc.github.io/authors/anne_lauscher/","publishdate":"0001-01-01T00:00:00Z","relpermalink":"/authors/anne_lauscher/","section":"authors","summary":"","tags":null,"title":"Anne Lauscher","type":"authors"},{"authors":["arianna_muti"],"categories":null,"content":"Arianna Muti is a Postdoctoral Research Fellow at Bocconi University. Her research interests mainly focus on Natural Language Processing, specifically on the detection of cross-cultural and implicit misogyny on Social Media. She is currently working on the project PERSONAE to develop personalized language technologies.\nShe has co-organized the 13th edition of the CLEF Conference.\n","date":-62135596800,"expirydate":-62135596800,"kind":"term","lang":"en","lastmod":-62135596800,"objectID":"98ebcf466ab5994ecf159a2d980fb2f2","permalink":"https://milanlproc.github.io/authors/arianna_muti/","publishdate":"0001-01-01T00:00:00Z","relpermalink":"/authors/arianna_muti/","section":"authors","summary":"Arianna Muti is a Postdoctoral Research Fellow at Bocconi University. Her research interests mainly focus on Natural Language Processing, specifically on the detection of cross-cultural and implicit misogyny on Social Media.","tags":null,"title":"Arianna Muti","type":"authors"},{"authors":["donya_rooein"],"categories":null,"content":"Donya Rooein is a Postdoctoral Research Fellow at Bocconi University, where her work revolves around leveraging natural language processing for educational advancements. She explores the synergy between machine learning, linguistics, and practitioner insights to enhance our education system. Her primary focus is on developing interpretable and scalable measures through NLP systems, aimed at assessing learning effectiveness and fostering adaptive learning environments.\nShe completed her Ph.D. in Information Technology Engineering from Politecnico di Milano in October 2022. Her doctoral research was awarded by EIT Digital and centered on crafting a flexible and adaptable framework for educational chatbots. Throughout her Ph.D. journey, she specialized in creating highly customizable chatbot solutions tailored to the diverse educational requirements of teachers and students alike.\n","date":-62135596800,"expirydate":-62135596800,"kind":"term","lang":"en","lastmod":-62135596800,"objectID":"6edd0fb476ee4f8442b7db4ce05086cf","permalink":"https://milanlproc.github.io/authors/donya_rooein/","publishdate":"0001-01-01T00:00:00Z","relpermalink":"/authors/donya_rooein/","section":"authors","summary":"Donya Rooein is a Postdoctoral Research Fellow at Bocconi University, where her work revolves around leveraging natural language processing for educational advancements. 
She explores the synergy between machine learning, linguistics, and practitioner insights to enhance our education system.","tags":null,"title":"Donya Rooein","type":"authors"},{"authors":["emanuele_moscato"],"categories":null,"content":"Emanuele Moscato Emanuele Moscato is a Postdoctoral Researcher at Bocconi, where he applies NLP-inspired models to a variety of problems including statistical physics of probabilistic languages, bioinformatics and NLP itself.\nMore on his personal website.\n","date":-62135596800,"expirydate":-62135596800,"kind":"term","lang":"en","lastmod":-62135596800,"objectID":"39e58892e6f18d9be59b25b6560d25d5","permalink":"https://milanlproc.github.io/authors/emanuele_moscato/","publishdate":"0001-01-01T00:00:00Z","relpermalink":"/authors/emanuele_moscato/","section":"authors","summary":"Emanuele Moscato Emanuele Moscato is a Postdoctoral Researcher at Bocconi, where he applies NLP-inspired models to a variety of problems including statistical physics of probabilistic languages, bioinformatics and NLP itself.","tags":null,"title":"Emanuele Moscato","type":"authors"},{"authors":["federico_bianchi"],"categories":null,"content":"","date":-62135596800,"expirydate":-62135596800,"kind":"term","lang":"en","lastmod":-62135596800,"objectID":"1b44ab31c433ff8d06f13800865217d5","permalink":"https://milanlproc.github.io/authors/federico_bianchi/","publishdate":"0001-01-01T00:00:00Z","relpermalink":"/authors/federico_bianchi/","section":"authors","summary":"","tags":null,"title":"Federico Bianchi","type":"authors"},{"authors":["flor_plaza"],"categories":null,"content":"Flor Miriam Plaza-del-Arco is a Postdoctoral Research Fellow at Bocconi University. Her research interests mainly focus on Natural Language Processing, specifically on the detection of hate speech and the analysis of emotions on Social Media. During her PhD, she worked on offensive language detection on social media in both English and Spanish, specifically she created different resources including corpora and lexicons, as well as developed computational systems that benefit from different linguistic phenomena to detect offensive language more accurately.\nShe has co-organized the EmoEvalEs and MeOffendES shared tasks at IberLEF 2021 on offensive language detection and emotion detection. She has also co-organized the 36th and 37th editions of the SEPLN Conference.\n","date":-62135596800,"expirydate":-62135596800,"kind":"term","lang":"en","lastmod":-62135596800,"objectID":"3753fb7bdfa63528a46e9d82ba7c5812","permalink":"https://milanlproc.github.io/authors/flor_plaza/","publishdate":"0001-01-01T00:00:00Z","relpermalink":"/authors/flor_plaza/","section":"authors","summary":"Flor Miriam Plaza-del-Arco is a Postdoctoral Research Fellow at Bocconi University. 
Her research interests mainly focus on Natural Language Processing, specifically on the detection of hate speech and the analysis of emotions on Social Media.","tags":null,"title":"Flor Plaza","type":"authors"},{"authors":["fornaciari_tommaso"],"categories":null,"content":"","date":-62135596800,"expirydate":-62135596800,"kind":"term","lang":"en","lastmod":-62135596800,"objectID":"f93279484d263fb2d790380394be888c","permalink":"https://milanlproc.github.io/authors/fornaciari_tommaso/","publishdate":"0001-01-01T00:00:00Z","relpermalink":"/authors/fornaciari_tommaso/","section":"authors","summary":"","tags":null,"title":"Tommaso Fornaciari","type":"authors"},{"authors":["giuseppe_attanasio"],"categories":null,"content":"Giuseppe Attanasio Giuseppe Attanasio is a Postdoctoral Researcher at Bocconi, where he works on large-scale neural architectures for Natural Language Processing. His research focuses on understanding and regularizing models for debiasing and fairness purposes. He is actively working on project MONICA to characterize Italian measures in response to COVID 19.\nMore on his personal website.\n","date":-62135596800,"expirydate":-62135596800,"kind":"term","lang":"en","lastmod":-62135596800,"objectID":"c69033f31210e3828102d577df47e75c","permalink":"https://milanlproc.github.io/authors/giuseppe_attanasio/","publishdate":"0001-01-01T00:00:00Z","relpermalink":"/authors/giuseppe_attanasio/","section":"authors","summary":"Giuseppe Attanasio Giuseppe Attanasio is a Postdoctoral Researcher at Bocconi, where he works on large-scale neural architectures for Natural Language Processing. His research focuses on understanding and regularizing models for debiasing and fairness purposes.","tags":null,"title":"Giuseppe Attanasio","type":"authors"},{"authors":["jan_globisz"],"categories":null,"content":"Jan Globisz is a Master’s student of Data Science and Business Analytics at Bocconi University. As a research assistant, he worked on uncovering the demographic features from SCOTUS judgments, as well as monitoring the changing roles of males and females in Swedish parenthood. In his thesis, he explored the topic of corruption in public procurement and built a classification model for detecting corruption risks in European calls for tenders. In his spare time, he supports the work of an NGO helping refugees from Ukraine and the Polish - Belarusian border.\n","date":-62135596800,"expirydate":-62135596800,"kind":"term","lang":"en","lastmod":-62135596800,"objectID":"dbe84e29b948daf768241c073c4cf13b","permalink":"https://milanlproc.github.io/authors/jan_globisz/","publishdate":"0001-01-01T00:00:00Z","relpermalink":"/authors/jan_globisz/","section":"authors","summary":"Jan Globisz is a Master’s student of Data Science and Business Analytics at Bocconi University. 
As a research assistant, he worked on uncovering the demographic features from SCOTUS judgments, as well as monitoring the changing roles of males and females in Swedish parenthood.","tags":null,"title":"Jan Globisz","type":"authors"},{"authors":["kilian_theil"],"categories":null,"content":"","date":-62135596800,"expirydate":-62135596800,"kind":"term","lang":"en","lastmod":-62135596800,"objectID":"0011484030a6940fd202413d2e58ca73","permalink":"https://milanlproc.github.io/authors/kilian_theil/","publishdate":"0001-01-01T00:00:00Z","relpermalink":"/authors/kilian_theil/","section":"authors","summary":"","tags":null,"title":"Kilian Theil","type":"authors"},{"authors":["lorenzo_lupo"],"categories":null,"content":"Lorenzo Lupo is a postdoctoral research fellow at Bocconi University, working on natural language processing and its applications to social and economic challenges. He is collaborating on the MENTALISM project, combining text analysis and machine learning with survey data to track inequality.\nMore on his personal website.\n","date":-62135596800,"expirydate":-62135596800,"kind":"term","lang":"en","lastmod":-62135596800,"objectID":"7da364ecd1fe8524c3bf084438e08df4","permalink":"https://milanlproc.github.io/authors/lorenzo_lupo/","publishdate":"0001-01-01T00:00:00Z","relpermalink":"/authors/lorenzo_lupo/","section":"authors","summary":"Lorenzo Lupo is a postdoctoral research fellow at Bocconi University, working on natural language processing and its applications to social and economic challenges. He is collaborating on the MENTALISM project, combining text analysis and machine learning with survey data to track inequality.","tags":null,"title":"Lorenzo Lupo","type":"authors"},{"authors":["maria_nawrocka"],"categories":null,"content":"Maria Nawrocka is a PhD candidate at the Doctoral School of Social Sciences at the University of Warsaw. Her research focuses on the discourse surrounding refugees in Polish public television. Utilizing a customized NLP framework designed for the Polish language, she aims to uncover patterns and shifts in the representation of refugees over the past decade.\n","date":-62135596800,"expirydate":-62135596800,"kind":"term","lang":"en","lastmod":-62135596800,"objectID":"b3966a7776583dc30e9fa9c427145927","permalink":"https://milanlproc.github.io/authors/maria_nawrocka/","publishdate":"0001-01-01T00:00:00Z","relpermalink":"/authors/maria_nawrocka/","section":"authors","summary":"Maria Nawrocka is a PhD candidate at the Doctoral School of Social Sciences at the University of Warsaw. Her research focuses on the discourse surrounding refugees in Polish public television. 
Utilizing a customized NLP framework designed for the Polish language, she aims to uncover patterns and shifts in the representation of refugees over the past decade.","tags":null,"title":"Maria Nawrocka","type":"authors"},{"authors":["matthias_orlikowski"],"categories":null,"content":"","date":-62135596800,"expirydate":-62135596800,"kind":"term","lang":"en","lastmod":-62135596800,"objectID":"9e7557087ab3b8193a5c6f86cdcee2c0","permalink":"https://milanlproc.github.io/authors/matthias_orlikowski/","publishdate":"0001-01-01T00:00:00Z","relpermalink":"/authors/matthias_orlikowski/","section":"authors","summary":"","tags":null,"title":"Matthias Orlikowski","type":"authors"},{"authors":["nikita_soni"],"categories":null,"content":"","date":-62135596800,"expirydate":-62135596800,"kind":"term","lang":"en","lastmod":-62135596800,"objectID":"8d64189b871ea9a66d62606eeded552e","permalink":"https://milanlproc.github.io/authors/nikita_soni/","publishdate":"0001-01-01T00:00:00Z","relpermalink":"/authors/nikita_soni/","section":"authors","summary":"","tags":null,"title":"Nikita Soni","type":"authors"},{"authors":["paul_rottger"],"categories":null,"content":"I am a postdoctoral researcher in Dirk Hovy‘s MilaNLP Lab at Bocconi University. My work is located at the intersection of computation, language and society. Right now, I am particularly interested in evaluating and aligning social values in large language models.\nIn May 2023, I completed my PhD at the University of Oxford, where I was supervised by Janet Pierrehumbert and Helen Margetts. In my PhD, I worked on improving the evaluation and effectiveness of natural language processing models for hate speech detection. I also worked on general language modelling challenges like language change and annotator subjectivity. The HateCheck project that I led won the Stanford AI Audit Challenge.\nDuring my PhD, I also co-founded Rewire, a start-up building socially responsible AI for online safety. Over two years as CTO, I grew a technical team of 10+ people, working on large projects for Google, Meta and others. In March 2023, Rewire was acquired by ActiveFence.\nFor current updates, follow me on Twitter or visit my website.\n","date":-62135596800,"expirydate":-62135596800,"kind":"term","lang":"en","lastmod":-62135596800,"objectID":"c2616b530e000914ab4b7aeaa5b014ad","permalink":"https://milanlproc.github.io/authors/paul_rottger/","publishdate":"0001-01-01T00:00:00Z","relpermalink":"/authors/paul_rottger/","section":"authors","summary":"I am a postdoctoral researcher in Dirk Hovy‘s MilaNLP Lab at Bocconi University. My work is located at the intersection of computation, language and society. 
Right now, I am particularly interested in evaluating and aligning social values in large language models.","tags":null,"title":"Paul Röttger","type":"authors"},{"authors":["pieter_delobelle"],"categories":null,"content":"","date":-62135596800,"expirydate":-62135596800,"kind":"term","lang":"en","lastmod":-62135596800,"objectID":"7bad91faad0b4fa4f8ec60697e2bbf43","permalink":"https://milanlproc.github.io/authors/pieter_delobelle/","publishdate":"0001-01-01T00:00:00Z","relpermalink":"/authors/pieter_delobelle/","section":"authors","summary":"","tags":null,"title":"Pieter Delobelle","type":"authors"},{"authors":["pietro_lesci"],"categories":null,"content":"","date":-62135596800,"expirydate":-62135596800,"kind":"term","lang":"en","lastmod":-62135596800,"objectID":"faaed7ac08357878be8b53209c877ee1","permalink":"https://milanlproc.github.io/authors/pietro_lesci/","publishdate":"0001-01-01T00:00:00Z","relpermalink":"/authors/pietro_lesci/","section":"authors","summary":"","tags":null,"title":"Pietro Lesci","type":"authors"},{"authors":["tanise_ceron"],"categories":null,"content":"Tanise Ceron is a Postdoctoral Research Fellow at Bocconi University. Her research lies in understanding how algorithms, and more specifically language models, filter information, such as the types of biases embedded in large language models (LLMs) and how they manifest in downstream tasks. She is also keen on developing methods for modeling societal discourse. This involves tasks such as developing methods to mine ideologies, and more generally, opinions from texts. Lastly, she enjoys thinking about the implementation of her research in real-world applications, such as news recommenders.\nShe co-leads the project MULTIVIEW, which investigates methods for diversifying news recommendations in terms of perspectives.\n","date":-62135596800,"expirydate":-62135596800,"kind":"term","lang":"en","lastmod":-62135596800,"objectID":"17e17e09710126f90b1391272ffc5d27","permalink":"https://milanlproc.github.io/authors/tanise_ceron/","publishdate":"0001-01-01T00:00:00Z","relpermalink":"/authors/tanise_ceron/","section":"authors","summary":"Tanise Ceron is a Postdoctoral Research Fellow at Bocconi University. Her research lies in understanding how algorithms, and more specifically language models, filter information, such as the types of biases embedded in large language models (LLMs) and how they manifest in downstream tasks.","tags":null,"title":"Tanise Ceron","type":"authors"},{"authors":["tiancheng_hu"],"categories":null,"content":"I am a third-year PhD student in Computation, Cognition and Language at the Language Technology Lab at the University of Cambridge, supervised by Prof. Nigel Collier. I have broad interests in natural language processing and computational social science. Most recently, my research encompasses two aspects: 1) understanding the inherent biases of LLMs through text generation, and 2) employing LLMs to understand biased language use in human communication. Previously, I completed my master\u0026rsquo;s in Electrical Engineering and Information Technology at ETH Zürich. My master\u0026rsquo;s thesis is about quotative usage in U.S. political news. It was done at The Data Science Lab at EPFL, supervised by Manoel Horta Ribeiro, Prof. Andreas Spitz and Prof. Robert West.\nI obtained a Bachelor of Science in Electrical Engineering from The University of Texas at Dallas, advised by Prof. Carlos Busso. 
I worked on driver head pose estimation with 3D data.\n","date":-62135596800,"expirydate":-62135596800,"kind":"term","lang":"en","lastmod":-62135596800,"objectID":"7af1adac0e700a9bddfa9b6905f9afab","permalink":"https://milanlproc.github.io/authors/tiancheng_hu/","publishdate":"0001-01-01T00:00:00Z","relpermalink":"/authors/tiancheng_hu/","section":"authors","summary":"I am a third-year PhD student in Computation, Cognition and Language at the Language Technology Lab at the University of Cambridge, supervised by Prof. Nigel Collier. I have broad interests in natural language processing and computational social science.","tags":null,"title":"Tiancheng Hu","type":"authors"},{"authors":["Flor Miriam Plaza-del-Arco","Amanda Cercas Curry","Susanna Paoli","Alba Curry","Dirk Hovy"],"categories":[],"content":"","date":1726963200,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1726963200,"objectID":"8cf9a7741aced2c3989f8f303097b872","permalink":"https://milanlproc.github.io/publication/2024-divine-llamas-emotion-bias/","publishdate":"2023-04-12T17:19:53+01:00","relpermalink":"/publication/2024-divine-llamas-emotion-bias/","section":"publication","summary":"Emotions play important epistemological and cognitive roles in our lives, revealing our values and guiding our actions. Previous work has shown that LLMs display biases in emotion attribution along gender lines. However, unlike gender, which says little about our values, religion, as a socio-cultural system, prescribes a set of beliefs and values for its followers. Religions, therefore, cultivate certain emotions. Moreover, these rules are explicitly laid out and interpreted by religious leaders. Using emotion attribution, we explore how different religions are represented in LLMs. We find that: Major religions in the US and European countries are represented with more nuance, displaying a more shaded model of their beliefs. Eastern religions like Hinduism and Buddhism are strongly stereotyped. Judaism and Islam are stigmatized -- the models' refusal rates skyrocket. We ascribe these to cultural bias in LLMs and the scarcity of NLP literature on religion. In the rare instances where religion is discussed, it is often in the context of toxic language, perpetuating the perception of these religions as inherently toxic. This finding underscores the urgent need to address and rectify these biases. Our research underscores the crucial role emotions play in our lives and how our values influence them.","tags":["Emotion attribution","Religion","Bias","Stereotypes","Large Language Models"],"title":"Divine LLaMAs: Bias, Stereotypes, Stigmatization, and Emotion Representation of Religion in Large Language Models","type":"publication"},{"authors":["Fabio Pernisi","Dirk Hovy","Paul Röttger"],"categories":[],"content":"","date":1723334400,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1723334400,"objectID":"156b0278c4b0945d090f234ba8e4045f","permalink":"https://milanlproc.github.io/publication/2024-compromesso/","publishdate":"2023-07-12T14:48:20+01:00","relpermalink":"/publication/2024-compromesso/","section":"publication","summary":"As diverse linguistic communities and users adopt large language models (LLMs), assessing their safety across languages becomes critical. Despite ongoing efforts to make LLMs safe, they can still be made to behave unsafely with jailbreaking, a technique in which models are prompted to act outside their operational guidelines. 
Research on LLM safety and jailbreaking, however, has so far mostly focused on English, limiting our understanding of LLM safety in other languages. We contribute towards closing this gap by investigating the effectiveness of many-shot jailbreaking, where models are prompted with unsafe demonstrations to induce unsafe behaviour, in Italian. To enable our analysis, we create a new dataset of unsafe Italian question-answer pairs. With this dataset, we identify clear safety vulnerabilities in four families of open-weight LLMs. We find that the models exhibit unsafe behaviors even when prompted with few unsafe demonstrations, and -- more alarmingly -- that this tendency rapidly escalates with more demonstrations.","tags":["Large Language Models","AI Safety","NLP"],"title":"Compromesso! Italian Many-Shot Jailbreaks Undermine the Safety of Large Language Models","type":"publication"},{"authors":["Xinpeng Wang","Bolei Ma","Chengzhi Hu","Leon Weber-Genzel","Paul Röttger","Frauke Kreuter","Dirk Hovy","Barbara Plank"],"categories":[],"content":"","date":1723334400,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1723334400,"objectID":"7705723cc81af3d241d833404acc59a6","permalink":"https://milanlproc.github.io/publication/2024-myanswerisc/","publishdate":"2023-07-12T14:48:20+01:00","relpermalink":"/publication/2024-myanswerisc/","section":"publication","summary":"The open-ended nature of language generation makes the evaluation of autoregressive large language models (LLMs) challenging. One common evaluation approach uses multiple-choice questions to limit the response space. The model is then evaluated by ranking the candidate answers by the log probability of the first token prediction. However, first-tokens may not consistently reflect the final response output, due to model’s diverse response styles such as starting with “Sure” or refusing to answer. Consequently, first-token evaluation is not indicative of model behaviour when interacting with users. But by how much? We evaluate how aligned first-token evaluation is with the text output along several dimensions, namely final option choice, refusal rate, choice distribution and robustness under prompt perturbation. Our results show that the two approaches are severely misaligned on all dimensions, reaching mismatch rates over 60%. Models heavily fine-tuned on conversational or safety data are especially impacted. Crucially, models remain misaligned even when we increasingly constrain prompts, i.e., force them to start with an option letter or example template. Our findings i) underscore the importance of inspecting the text output as well and ii) caution against relying solely on first-token evaluation.","tags":["Large Language Models","Evaluation","NLP"],"title":"My Answer is C: First-Token Probabilities Do Not Match Text Answers in Instruction-Tuned Language Models","type":"publication"},{"authors":["Paul Röttger","Valentin Hofmann","Valentina Pyatkin","Musashi Hinck","Hannah Rose Kirk","Hinrich Schuetze","Dirk Hovy"],"categories":[],"content":"","date":1723334400,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1723334400,"objectID":"aba2a2e9644c22fb67872e026df2a6f8","permalink":"https://milanlproc.github.io/publication/2024-politicalcompass/","publishdate":"2023-07-12T14:48:20+01:00","relpermalink":"/publication/2024-politicalcompass/","section":"publication","summary":"Much recent work seeks to evaluate values and opinions in large language models (LLMs) using multiple-choice surveys and questionnaires. 
Most of this work is motivated by concerns around real-world LLM applications. For example, politically-biased LLMs may subtly influence society when they are used by millions of people. Such real-world concerns, however, stand in stark contrast to the artificiality of current evaluations: real users do not typically ask LLMs survey questions. Motivated by this discrepancy, we challenge the prevailing *constrained* evaluation paradigm for values and opinions in LLMs and explore more realistic *unconstrained* evaluations. As a case study, we focus on the popular Political Compass Test (PCT). In a systematic review, we find that most prior work using the PCT *forces* models to comply with the PCT’s multiple-choice format. We show that models give substantively different answers when not forced; that answers change depending on how models are forced; and that answers lack paraphrase robustness. Then, we demonstrate that models give different answers yet again in a more realistic open-ended answer setting. We distill these findings into recommendations and open challenges in evaluating values and opinions in LLMs.","tags":["Large Language Models","AI Alignment","NLP"],"title":"Political Compass or Spinning Arrow? Towards More Meaningful Evaluations for Values and Opinions in Large Language Models","type":"publication"},{"authors":["Paul Röttger","Hannah Rose Kirk","Bertie Vidgen","Giuseppe Attanasio","Federico Bianchi","Dirk Hovy"],"categories":[],"content":"","date":1721088000,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1721088000,"objectID":"a6fed1f8412ee969bf6cd5afc26ac5e8","permalink":"https://milanlproc.github.io/publication/2024-xstest/","publishdate":"2023-07-12T14:48:20+01:00","relpermalink":"/publication/2024-xstest/","section":"publication","summary":"Without proper safeguards, large language models will readily follow malicious instructions and generate toxic content. This risk motivates safety efforts such as red-teaming and large-scale feedback learning, which aim to make models both helpful and harmless. However, there is a tension between these two objectives, since harmlessness requires models to refuse to comply with unsafe prompts, and thus not be helpful. Recent anecdotal evidence suggests that some models may have struck a poor balance, so that even clearly safe prompts are refused if they use similar language to unsafe prompts or mention sensitive topics. In this paper, we introduce a new test suite called XSTest to identify such eXaggerated Safety behaviours in a systematic way. XSTest comprises 250 safe prompts across ten prompt types that well-calibrated models should not refuse to comply with, and 200 unsafe prompts as contrasts that models, for most applications, should refuse. 
We describe XSTest’s creation and composition, and then use the test suite to highlight systematic failure modes in state-of-the-art language models as well as more general challenges in building safer language models.","tags":["Large Language Models","AI Safety","NLP"],"title":"XSTest: A Test Suite for Identifying Exaggerated Safety Behaviours in Large Language Models","type":"publication"},{"authors":["Lorenzo Lupo","Paul Bose","Mahyar Habibi","Dirk Hovy","Carlo Schwarz"],"categories":[],"content":"","date":1716163200,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1716163200,"objectID":"e1b884510e6a32df5fabb54ecd4853bc","permalink":"https://milanlproc.github.io/publication/2024-dadit/","publishdate":"2024-02-23T14:48:20+01:00","relpermalink":"/publication/2024-dadit/","section":"publication","summary":"Social scientists increasingly use demographically stratified social media data to study the attitudes, beliefs, and behavior of the general public. To facilitate such analyses, we construct, validate, and release the representative DADIT dataset of 30M tweets of 20k Italian Twitter users, along with their bios and profile pictures. We enrich the user data with high-quality labels for gender, age, and location. This new dataset enables us to compare the performance of various state-of-the-art models for the prediction of the gender and age of social media users. In particular, we investigate if tweets contain valuable information for the prediction of user characteristics, since popular classifiers like M3 don't leverage them. Our best XLM-based classifier improves upon the commonly used competitor M3 by up to 53% F1. Especially for age prediction, classifiers profit from including tweets as features. We also confirm these findings on a German test set.","tags":["Twitter data","demographic prediction","language models","multimodal classification"],"title":"DADIT: A Dataset for Demographic Classification of Italian Twitter Users and a Comparison of Prediction Methods","type":"publication"},{"authors":["Donya Rooein","Paul Rottger","Anastassia Shaitarova","Dirk Hovy"],"categories":[],"content":"","date":1715817600,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1715817600,"objectID":"68e0ca6ce3b15d6e054e6e5e289b8963","permalink":"https://milanlproc.github.io/publication/2024-difficulty-classification/","publishdate":"2024-05-15T16:22:16+01:00","relpermalink":"/publication/2024-difficulty-classification/","section":"publication","summary":"Using large language models (LLMs) for educational applications like dialogue-based teaching is a hot topic. Effective teaching, however, requires teachers to adapt the difficulty of content and explanations to the education level of their students. Even the best LLMs today struggle to do this well. If we want to improve LLMs on this adaptation task, we need to be able to measure adaptation success reliably. However, current Static metrics for text difficulty, like the Flesch-Kincaid Reading Ease score, are known to be crude and brittle. We, therefore, introduce and evaluate a new set of Prompt-based metrics for text difficulty. Based on a user study, we create Prompt-based metrics as inputs for LLMs. They leverage LLM's general language understanding capabilities to capture more abstract and complex features than Static metrics. Regression experiments show that adding our Prompt-based metrics significantly improves text difficulty classification over Static metrics alone. 
Our results demonstrate the promise of using LLMs to evaluate text adaptation to different education levels. ","tags":["Difficulty Classification","Education","Large Language Models"],"title":"Beyond Flesch-Kincaid: Prompt-based Metrics Improve Difficulty Classification of Educational Texts","type":"publication"},{"authors":["Federico Bianchi","Mirac Suzgun","Giuseppe Attanasio","Paul Röttger","Dan Jurafsky","Tatsunori Hashimoto","James Zou"],"categories":[],"content":"","date":1715040000,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1715040000,"objectID":"b91aaf47580d3e1f9038e2c7901a17ae","permalink":"https://milanlproc.github.io/publication/2024-safetyllamas/","publishdate":"2023-07-12T14:48:20+01:00","relpermalink":"/publication/2024-safetyllamas/","section":"publication","summary":"Training large language models to follow instructions makes them perform better on a wide range of tasks, generally becoming more helpful. However, a perfectly helpful model will follow even the most malicious instructions and readily generate harmful content. In this paper, we raise concerns over the safety of models that only emphasize helpfulness, not safety, in their instruction-tuning. We show that several popular instruction-tuned models are highly unsafe. Moreover, we show that adding just 3% safety examples (a few hundred demonstrations) in the training set when fine-tuning a model like LLaMA can substantially improve their safety. Our safety-tuning does not make models significantly less capable or helpful as measured by standard benchmarks. However, we do find a behavior of exaggerated safety, where too much safety-tuning makes models refuse to respond to reasonable prompts that superficially resemble unsafe ones. Our study sheds light on trade-offs in training LLMs to follow instructions and exhibit safe behavior.","tags":["Large Language Models","AI Safety","NLP"],"title":"Safety-Tuned LLaMAs: Lessons From Improving the Safety of Large Language Models that Follow Instructions","type":"publication"},{"authors":["Paul Röttger","Fabio Pernisi","Bertie Vidgen","Dirk Hovy"],"categories":[],"content":"","date":1712534400,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1712534400,"objectID":"63bda1d9c59fa404b61557ca9db3376b","permalink":"https://milanlproc.github.io/publication/2024-safetyprompts/","publishdate":"2023-07-12T14:48:20+01:00","relpermalink":"/publication/2024-safetyprompts/","section":"publication","summary":"The last two years have seen a rapid growth in concerns around the safety of large language models (LLMs). Researchers and practitioners have met these concerns by introducing an abundance of new datasets for evaluating and improving LLM safety. However, much of this work has happened in parallel, and with very different goals in mind, ranging from the mitigation of near-term risks around bias and toxic content generation to the assessment of longer-term catastrophic risk potential. This makes it difficult for researchers and practitioners to find the most relevant datasets for a given use case, and to identify gaps in dataset coverage that future work may fill. To remedy these issues, we conduct a first systematic review of open datasets for evaluating and improving LLM safety. We review 102 datasets, which we identified through an iterative and community-driven process over the course of several months. 
We highlight patterns and trends, such as a trend towards fully synthetic datasets, as well as gaps in dataset coverage, such as a clear lack of non-English datasets. We also examine how LLM safety datasets are used in practice -- in LLM release publications and popular LLM benchmarks -- finding that current evaluation practices are highly idiosyncratic and make use of only a small fraction of available datasets. Our contributions are based on this http URL, a living catalogue of open datasets for LLM safety, which we commit to updating continuously as the field of LLM safety develops.","tags":["Large Language Models","AI Safety","NLP"],"title":"SafetyPrompts: a Systematic Review of Open Datasets for Evaluating and Improving Large Language Model Safety ","type":"publication"},{"authors":["Flor Miriam Plaza-del-Arco","Amanda Cercas Curry","Alba Curry","Gavin Abercrombie","Dirk Hovy"],"categories":[],"content":"","date":1711584000,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1711584000,"objectID":"72fdc8b25d816d74f056cd9b0f6f497b","permalink":"https://milanlproc.github.io/publication/2024-emotion-gender-stereotypes/","publishdate":"2023-04-12T17:19:53+01:00","relpermalink":"/publication/2024-emotion-gender-stereotypes/","section":"publication","summary":"Large language models (LLMs) reflect societal norms and biases, especially about gender. While societal biases and stereotypes have been extensively researched in various NLP applications, there is a surprising gap for emotion analysis. However, emotion and gender are closely linked in societal discourse. E.g., women are often thought of as more empathetic, while men's anger is more socially accepted. To fill this gap, we present the first comprehensive study of gendered emotion attribution in five state-of-the-art LLMs (open- and closed-source). We investigate whether emotions are gendered, and whether these variations are based on societal stereotypes. We prompt the models to adopt a gendered persona and attribute emotions to an event like 'When I had a serious argument with a dear person'. We then analyze the emotions generated by the models in relation to the gender-event pairs. We find that all models consistently exhibit gendered emotions, influenced by gender stereotypes. These findings are in line with established research in psychology and gender studies. Our study sheds light on the complex societal interplay between language, gender, and emotion. The reproduction of emotion stereotypes in LLMs allows us to use those models to study the topic in detail, but raises questions about the predictive use of those same LLMs for emotion applications.","tags":["Emotion attribution","Gender Bias","Large Language Models"],"title":"Angry Men, Sad Women: Large Language Models Reflect Gendered Stereotypes in Emotion Attribution","type":"publication"},{"authors":["Donya Rooein","Dirk Hovy"],"categories":[],"content":"","date":1711584000,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1711584000,"objectID":"4f5e4f0fe7cf32b88733c20420ae3e76","permalink":"https://milanlproc.github.io/publication/2024-conversations-data/","publishdate":"2023-04-12T17:19:53+01:00","relpermalink":"/publication/2024-conversations-data/","section":"publication","summary":"Open conversations are one of the most engaging forms of teaching. However, creating those conversations in educational software is a complex endeavor, especially if we want to address the needs of different audiences. 
While language models hold great promise for educational applications, there are substantial challenges in training them to engage in meaningful and effective conversational teaching, especially when considering the diverse needs of various audiences. No official data sets exist for this task to facilitate the training of language models for conversational teaching, considering the diverse needs of various audiences. This paper presents a novel source for facilitating conversational teaching of scientific concepts at various difficulty levels (from preschooler to expert), namely dialogues taken from video transcripts. We analyse this data source in various ways to show that it offers a diverse array of examples that can be used to generate contextually appropriate and natural responses to scientific topics for specific target audiences. It is a freely available valuable resource for training and evaluating conversation models, encompassing organically occurring dialogues. While the raw data is available online, we provide additional metadata for conversational analysis of dialogues at each level in all available videos. ","tags":["Education","Conversational Data","Adaptive Learning"],"title":"Conversations as a Source for Teaching Scientific Concepts at Different Education Levels","type":"publication"},{"authors":["Flor Miriam Plaza-del-Arco","Alba Curry","Amanda Cercas Curry","Dirk Hovy"],"categories":[],"content":"","date":1711584000,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1711584000,"objectID":"d9ced95d92a4ea8da9be921c1eb03a4d","permalink":"https://milanlproc.github.io/publication/2024-emotion-analysis-survey/","publishdate":"2023-04-12T17:19:53+01:00","relpermalink":"/publication/2024-emotion-analysis-survey/","section":"publication","summary":"Emotions are a central aspect of communication. Consequently, emotion analysis (EA) is a rapidly growing field in natural language processing (NLP). However, there is no consensus on scope, direction, or methods. In this paper, we conduct a thorough review of 154 relevant NLP publications from the last decade. Based on this review, we address four different questions: (1) How are EA tasks defined in NLP? (2) What are the most prominent emotion frameworks and which emotions are modeled? (3) Is the subjectivity of emotions considered in terms of demographics and cultural factors? and (4) What are the primary NLP applications for EA? We take stock of trends in EA and tasks, emotion frameworks used, existing datasets, methods, and applications. We then discuss four lacunae: (1) the absence of demographic and cultural aspects does not account for the variation in how emotions are perceived, but instead assumes they are universally experienced in the same manner; (2) the poor fit of emotion categories from the two main emotion theories to the task; (3) the lack of standardized EA terminology hinders gap identification, comparison, and future goals; and (4) the absence of interdisciplinary research isolates EA from insights in other fields. 
Our work will enable more focused research into EA and a more holistic approach to modeling emotions in NLP.","tags":["Emotion analysis","Survey","Natural Language Processing"],"title":"Emotion Analysis in NLP: Trends, Gaps and Roadmap for Future Directions","type":"publication"},{"authors":["Amanda Cercas Curry","Giuseppe Attanasio","Zeerak Talat","Dirk Hovy"],"categories":[],"content":"","date":1709596800,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1709596800,"objectID":"d22ae7873586110e84d9ca0f0c95c8d7","permalink":"https://milanlproc.github.io/publication/2024-socialclass-experiments/","publishdate":"2024-03-05T14:48:20+01:00","relpermalink":"/publication/2024-socialclass-experiments/","section":"publication","summary":"Since the foundational work of William Labov on the social stratification of language (Labov, 1964), linguistics has made concentrated efforts to explore the links between sociodemographic characteristics and language production and perception. But while there is strong evidence for socio-demographic characteristics in language, they are infrequently used in Natural Language Processing (NLP). Age and gender are somewhat well represented, but Labov's original target, socioeconomic status, is noticeably absent. And yet it matters. We show empirically that NLP disadvantages less-privileged socioeconomic groups. We annotate a corpus of 95K utterances from movies with social class, ethnicity and geographical language variety and measure the performance of NLP systems on three tasks: language modelling, automatic speech recognition, and grammar error correction. We find significant performance disparities that can be attributed to socioeconomic status as well as ethnicity and geographical differences. With NLP technologies becoming ever more ubiquitous and quotidian, they must accommodate all language varieties to avoid disadvantaging already marginalised groups. We argue for the inclusion of socioeconomic class in future language technologies.","tags":["Large Language Models","Fairness","NLP","Demographics"],"title":"Classist Tools: Social Class Correlates with Performance in NLP","type":"publication"},{"authors":["Amanda Cercas Curry","Zeerak Talat","Dirk Hovy"],"categories":[],"content":"","date":1709596800,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1709596800,"objectID":"d2459c7e7c738b73a9cc2ed6b169f672","permalink":"https://milanlproc.github.io/publication/2024-socialclass-survey/","publishdate":"2024-03-05T14:48:20+01:00","relpermalink":"/publication/2024-socialclass-survey/","section":"publication","summary":"Since Labov's (1964) foundational work on the social stratification of language, linguistics has dedicated concerted efforts towards understanding the relationships between socio-demographic factors and language production and perception. Despite the large body of evidence identifying significant relationships between socio-demographic factors and language production, relatively few of these factors have been investigated in the context of NLP technology. While age and gender are well covered, Labov's initial target, socio-economic class, is largely absent. We survey the existing Natural Language Processing (NLP) literature and find that only around 20 papers even mention socio-economic status. However, the majority of those papers do not engage with class beyond collecting information of annotator-demographics. 
Given this research lacuna, we provide a definition of class that can be operationalised by NLP researchers, and argue for including socio-economic class in future language technologies.","tags":["Social class","Fairness","NLP","Demographics"],"title":"Impoverished Language Technology: The Lack of (Social) Class in NLP","type":"publication"},{"authors":["Amanda Cercas Curry","Gavin Abercrombie","Zeerak Talat"],"categories":[],"content":"","date":1709596800,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1709596800,"objectID":"fc2eaff1942d5f10f348b498415d4298","permalink":"https://milanlproc.github.io/publication/2024-isms/","publishdate":"2024-03-05T14:48:20+01:00","relpermalink":"/publication/2024-isms/","section":"publication","summary":"Natural language processing research has begun to embrace the notion of annotator subjectivity, motivated by variations in labelling. This approach understands each annotator's view as valid, which can be highly suitable for tasks that embed subjectivity, e.g., sentiment analysis. However, this construction may be inappropriate for tasks such as hate speech detection, as it affords equal validity to all positions on e.g., sexism or racism. We argue that the conflation of hate and offence can invalidate findings on hate speech, and call for future work to be situated in theory, disentangling hate from its orthogonal concept, offence.","tags":["Hate speech","Subjectivity","Sexism","Offensiveness"],"title":"Subjective isms? On the Danger of Conflating Hate and Offence in Abusive Language Detection","type":"publication"},{"authors":["Giuseppe Attanasio","Flor Miriam Plaza-del-Arco","Debora Nozza","Anne Lauscher"],"categories":[],"content":"","date":1703721600,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1703721600,"objectID":"5ccaa05d51aad1c15b4187130af80d2d","permalink":"https://milanlproc.github.io/publication/2024-a-tale-of-pronouns/","publishdate":"2023-04-12T17:19:53+01:00","relpermalink":"/publication/2024-a-tale-of-pronouns/","section":"publication","summary":"Recent instruction fine-tuned models can solve multiple NLP tasks when prompted to do so, with machine translation (MT) being a prominent use case. However, current research often focuses on standard performance benchmarks, leaving compelling fairness and ethical considerations behind. In MT, this might lead to misgendered translations, resulting, among other harms, in the perpetuation of stereotypes and prejudices. In this work, we address this gap by investigating whether and to what extent such models exhibit gender bias in machine translation and how we can mitigate it. Concretely, we compute established gender bias metrics on the WinoMT corpus from English to German and Spanish. We discover that IFT models default to male-inflected translations, even disregarding female occupational stereotypes. Next, using interpretability methods, we unveil that models systematically overlook the pronoun indicating the gender of a target occupation in misgendered translations. 
Finally, based on this finding, we propose an easy-to-implement and effective bias mitigation solution based on few-shot learning that leads to significantly fairer translations.","tags":["Interpretability","Gender Bias","Machine Translation"],"title":"A Tale of Pronouns: Interpretability Informs Gender Bias Mitigation for Fairer Instruction-Tuned Machine Translation","type":"publication"},{"authors":["Gavin Abercrombie","Amanda Cercas Curry","Tanvi Dinkar","Verena Rieser","Zeerak Talat"],"categories":[],"content":"","date":1701734400,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1701734400,"objectID":"c69ca0738e769aa55302da9237c0bc50","permalink":"https://milanlproc.github.io/publication/2023-mirages-anthropomorphism/","publishdate":"2023-12-05T14:48:20+01:00","relpermalink":"/publication/2023-mirages-anthropomorphism/","section":"publication","summary":"Automated dialogue or conversational systems are anthropomorphised by developers and personified by users. While a degree of anthropomorphism is inevitable, conscious and unconscious design choices can guide users to personify them to varying degrees. Encouraging users to relate to automated systems as if they were human can lead to transparency and trust issues, and high risk scenarios caused by over-reliance on their outputs. As a result, natural language processing researchers have investigated the factors that induce personification and develop resources to mitigate such effects. However, these efforts are fragmented, and many aspects of anthropomorphism have yet to be explored. In this paper, we discuss the linguistic factors that contribute to the anthropomorphism of dialogue systems and the harms that can arise thereof, including reinforcing gender stereotypes and conceptions of acceptable language. We recommend that future efforts towards developing dialogue systems take particular care in their design, development, release, and description; and attend to the many linguistic cues that can elicit personification by users.","tags":["Dialogue systems","Anthropomorphism","Trust"],"title":"Mirages. On Anthropomorphism in Dialogue Systems","type":"publication"},{"authors":["Hannah Rose Kirk","Bertie Vidgen","Paul Röttger","Scott A. Hale"],"categories":[],"content":"","date":1700006400,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1700006400,"objectID":"095a97910f182f7e49e6d7e4530057a8","permalink":"https://milanlproc.github.io/publication/2023-alignmentparadigms/","publishdate":"2023-07-12T14:48:20+01:00","relpermalink":"/publication/2023-alignmentparadigms/","section":"publication","summary":"In this paper, we address the concept of 'alignment' in large language models (LLMs) through the lens of post-structuralist socio-political theory, specifically examining its parallels to empty signifiers. To establish a shared vocabulary around how abstract concepts of alignment are operationalised in empirical datasets, we propose a framework that demarcates: 1) which dimensions of model behaviour are considered important, then 2) how meanings and definitions are ascribed to these dimensions, and by whom. We situate existing empirical literature and provide guidance on deciding which paradigm to follow. 
Through this framework, we aim to foster a culture of transparency and critical evaluation, aiding the community in navigating the complexities of aligning LLMs with human populations.","tags":["Large Language Models","Alignment","NLP"],"title":"The Empty Signifier Problem: Towards Clearer Paradigms for Operationalising 'Alignment' in Large Language Models","type":"publication"},{"authors":["Bertie Vidgen","Hannah Rose Kirk","Rebecca Qian","Nino Scherrer","Anand Kannappan","Scott A. Hale","Paul Röttger"],"categories":[],"content":"","date":1699920000,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1699920000,"objectID":"629614984bc611fb8f0195a8b91ef894","permalink":"https://milanlproc.github.io/publication/2023-simplesafetytests/","publishdate":"2023-07-12T14:48:20+01:00","relpermalink":"/publication/2023-simplesafetytests/","section":"publication","summary":"The past year has seen rapid acceleration in the development of large language models (LLMs). For many tasks, there is now a wide range of open-source and open-access LLMs that are viable alternatives to proprietary models like ChatGPT. Without proper steering and safeguards, however, LLMs will readily follow malicious instructions, provide unsafe advice, and generate toxic content. This is a critical safety risk for businesses and developers. We introduce SimpleSafetyTests as a new test suite for rapidly and systematically identifying such critical safety risks. The test suite comprises 100 test prompts across five harm areas that LLMs, for the vast majority of applications, should refuse to comply with. We test 11 popular open LLMs and find critical safety weaknesses in several of them. While some LLMs do not give a single unsafe response, most models we test respond unsafely on more than 20% of cases, with over 50% unsafe responses in the extreme. Prepending a safety-emphasising system prompt substantially reduces the occurrence of unsafe responses, but does not completely stop them from happening. We recommend that developers use such system prompts as a first line of defence against critical safety risks.","tags":["Large Language Models","AI Safety","NLP"],"title":"SimpleSafetyTests: a Test Suite for Identifying Critical Safety Risks in Large Language Models","type":"publication"},{"authors":null,"categories":null,"content":"We love hosting talented researchers for research visits at MilaNLP. If you would like to apply for a visit, read on.\nWe have two main visiting periods: between April and July, and between September and December. We try to assess all applications for a period together and make a decision with enough time to spare, usually in January and July.\nMost of our visitors are PhD students or postdocs, i.e., people who do not have their own funding. This program is designed to help them come for a visit. If you do have your own funding, please feel free to reach out directly.\nDue to limited office space, we will only be able to host 1–2 people per period.\nNOTE: Due to the overwhelming response, we have already filled all slots for 2024, but will run another round in 2025. 
Please keep an eye out on social media for the announcement.\n","date":1698019200,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1698019200,"objectID":"eaab68c9b73be5f959ac98c857109899","permalink":"https://milanlproc.github.io/open_positions/visiting_researcher/","publishdate":"2023-10-23T00:00:00Z","relpermalink":"/open_positions/visiting_researcher/","section":"open_positions","summary":"Open Application for Research Visits with MilaNLP","tags":null,"title":"Visiting Researcher","type":"open_positions"},{"authors":["Hannah Rose Kirk","Andrew M. Bean","Bertie Vidgen","Paul Röttger","Scott A. Hale"],"categories":[],"content":"","date":1696982400,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1696982400,"objectID":"40048bf816282f959fe4fbf4b0c69019","permalink":"https://milanlproc.github.io/publication/2023-human-feedback-learning-survey/","publishdate":"2023-07-12T14:48:20+01:00","relpermalink":"/publication/2023-human-feedback-learning-survey/","section":"publication","summary":"Human feedback is increasingly used to steer the behaviours of Large Language Models (LLMs). However, it is unclear how to collect and incorporate feedback in a way that is efficient, effective and unbiased, especially for highly subjective human preferences and values. In this paper, we survey existing approaches for learning from human feedback, drawing on 95 papers primarily from the ACL and arXiv repositories. First, we summarise the past, pre-LLM trends for integrating human feedback into language models. Second, we give an overview of present techniques and practices, as well as the motivations for using feedback; conceptual frameworks for defining values and preferences; and how feedback is collected and from whom. Finally, we encourage a better future of feedback learning in LLMs by raising five unresolved conceptual and practical challenges.","tags":["Large Language Models","Human Feedback","NLP"],"title":"The Past, Present and Better Future of Feedback Learning in Large Language Models for Subjective Human Preferences and Values","type":"publication"},{"authors":["Flor Miriam Plaza-del-Arco","Debora Nozza","Dirk Hovy"],"categories":[],"content":"","date":1690156800,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1690156800,"objectID":"af13235c20bbbfc9b999e446ca7c1bb9","permalink":"https://milanlproc.github.io/publication/2023-label-variation-llms/","publishdate":"2023-07-24T14:48:20+01:00","relpermalink":"/publication/2023-label-variation-llms/","section":"publication","summary":"Large Language Models (LLMs) exhibit remarkable text classification capabilities, excelling in zero- and few-shot learning (ZSL and FSL) scenarios. However, since they are trained on different datasets, performance varies widely across tasks between those models. Recent studies emphasize the importance of considering human label variation in data annotation. However, how this human label variation also applies to LLMs remains unexplored. Given this likely model specialization, we ask: Do aggregate LLM labels improve over individual models (as for human annotators)? We evaluate four recent instruction-tuned LLMs as annotators on five subjective tasks across four languages. We use ZSL and FSL setups and label aggregation from human annotation. Aggregations are indeed substantially better than any individual model, benefiting from specialization in diverse tasks or languages. Surprisingly, FSL does not surpass ZSL, as it depends on the quality of the selected examples. 
However, there seems to be no good information-theoretical strategy to select those. We find that no LLM method rivals even simple supervised models. We also discuss the tradeoffs in accuracy, cost, and moral/ethical considerations between LLM and human annotation.","tags":["NLP","LLMs","annotation"],"title":"Wisdom of Instruction-Tuned Language Model Crowds. Exploring Model Label Variation","type":"publication"},{"authors":["Gabriele Ruggeri","Debora Nozza"],"categories":[],"content":"","date":1689120000,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1689120000,"objectID":"63001a4b70ad72b7258ca6178b221fa1","permalink":"https://milanlproc.github.io/publication/2023-multidimensional-bias-vision-language-models/","publishdate":"2023-07-12T14:48:20+01:00","relpermalink":"/publication/2023-multidimensional-bias-vision-language-models/","section":"publication","summary":"In recent years, joint Vision-Language (VL) models have increased in popularity and capability. Very few studies have attempted to investigate bias in VL models, even though it is a well-known issue in both individual modalities.This paper presents the first multi-dimensional analysis of bias in English VL models, focusing on gender, ethnicity, and age as dimensions.When subjects are input as images, pre-trained VL models complete a neutral template with a hurtful word 5% of the time, with higher percentages for female and young subjects.Bias presence in downstream models has been tested on Visual Question Answering. We developed a novel bias metric called the Vision-Language Association Test based on questions designed to elicit biased associations between stereotypical concepts and targets. Our findings demonstrate that pre-trained VL models contain biases that are perpetuated in downstream tasks.","tags":["Fairness","NLP","multimodal"],"title":"A Multi-dimensional study on Bias in Vision-Language models","type":"publication"},{"authors":["Amanda Cercas Curry","Giuseppe Attanasio","Debora Nozza","Dirk Hovy"],"categories":[],"content":"","date":1689120000,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1689120000,"objectID":"f06c4346370b070a759172242d7d10a1","permalink":"https://milanlproc.github.io/publication/2023-milanlp-semeval-2023-task-10-ensembling-domain-adapted-regularized-pretrained-language-models-robust-sexism-detection/","publishdate":"2023-07-12T14:48:20+01:00","relpermalink":"/publication/2023-milanlp-semeval-2023-task-10-ensembling-domain-adapted-regularized-pretrained-language-models-robust-sexism-detection/","section":"publication","summary":"We present the system proposed by the MilaNLP team for the Explainable Detection of Online Sexism (EDOS) shared task. 
We propose an ensemble modeling approach to combine different classifiers trained with domain adaptation objectives and standard fine-tuning.Our results show that the ensemble is more robust than individual models and that regularized models generate more “conservative” predictions, mitigating the effects of lexical overfitting.However, our error analysis also finds that many of the misclassified instances are debatable, raising questions about the objective annotatability of hate speech data.","tags":["Hate Speech","NLP","domain adaptation","language models"],"title":"MilaNLP at SemEval-2023 Task 10: Ensembling Domain-Adapted and Regularized Pretrained Language Models for Robust Sexism Detection","type":"publication"},{"authors":["Flor Miriam Plaza-del-Arco","Debora Nozza","Dirk Hovy"],"categories":[],"content":"","date":1689120000,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1689120000,"objectID":"3f2cc27a6d89ea664c434ace9993294a","permalink":"https://milanlproc.github.io/publication/2023-zero-shot-prompting-hate-speech/","publishdate":"2023-07-12T14:48:20+01:00","relpermalink":"/publication/2023-zero-shot-prompting-hate-speech/","section":"publication","summary":"Hate speech detection faces two significant challenges: 1) the limited availability of labeled data and 2) the high variability of hate speech across different contexts and languages. Prompting brings a ray of hope to these challenges. It allows injecting a model with task-specific knowledge without relying on labeled data. This paper explores zero-shot learning with prompting for hate speech detection. We investigate how well zero-shot learning can detect hate speech in 3 languages with limited labeled data. We experiment with various large language models and verbalizers on 8 benchmark datasets. Our findings highlight the impact of prompt selection on the results. They also suggest that prompting, specifically with recent large language models, can achieve performance comparable to and surpass fine-tuned models, making it a promising alternative for under-resourced languages. Our findings highlight the potential of prompting for hate speech detection and show how both the prompt and the model have a significant impact on achieving more accurate predictions in this task.","tags":["Hate Speech","NLP","multilingual"],"title":"Respectful or Toxic? Using Zero-Shot Learning with Language Models to Detect Hate Speech","type":"publication"},{"authors":["Gavin Abercrombie","Dirk Hovy","Vinodkumar Prabhakaran"],"categories":[],"content":"","date":1689120000,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1689120000,"objectID":"6cffcdc288e8545b5e096b0c26db8ca6","permalink":"https://milanlproc.github.io/publication/2023-temporal-second-language-influence-intra-annotator-agreement-stability-hate-speech-labelling/","publishdate":"2023-07-12T14:48:20+01:00","relpermalink":"/publication/2023-temporal-second-language-influence-intra-annotator-agreement-stability-hate-speech-labelling/","section":"publication","summary":"Much work in natural language processing (NLP) relies on human annotation. The majority of this implicitly assumes that annotator’s labels are temporally stable, although the reality is that human judgements are rarely consistent over time. As a subjective annotation task, hate speech labels depend on annotator’s emotional and moral reactions to the language used to convey the message. 
Studies in Cognitive Science reveal a ‘foreign language effect’, whereby people take differing moral positions and perceive offensive phrases to be weaker in their second languages. Does this affect annotations as well? We conduct an experiment to investigate the impacts of (1) time and (2) different language conditions (English and German) on measurements of intra-annotator agreement in a hate speech labelling task. While we do not observe the expected lower stability in the different language condition, we find that overall agreement is significantly lower than is implicitly assumed in annotation tasks, which has important implications for dataset reproducibility in NLP.","tags":["annotation","NLP","sociodemographics"],"title":"Temporal and Second Language Influence on Intra-Annotator Agreement and Stability in Hate Speech Labelling","type":"publication"},{"authors":["Matthias Orlikowski","Paul Röttger","Philipp Cimiano","Dirk Hovy"],"categories":[],"content":"","date":1689120000,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1689120000,"objectID":"4abddd30922258b0fac152dcdf4b69f0","permalink":"https://milanlproc.github.io/publication/2023-ecological-fallacy-annotation-modeling-human-label-variation-goes-beyond-sociodemographics/","publishdate":"2023-07-12T14:48:20+01:00","relpermalink":"/publication/2023-ecological-fallacy-annotation-modeling-human-label-variation-goes-beyond-sociodemographics/","section":"publication","summary":"Many NLP tasks exhibit human label variation, where different annotators give different labels to the same texts. This variation is known to depend, at least in part, on the sociodemographics of annotators. Recent research aims to model individual annotator behaviour rather than predicting aggregated labels, and we would expect that sociodemographic information is useful for these models. On the other hand, the ecological fallacy states that aggregate group behaviour, such as the behaviour of the average female annotator, does not necessarily explain individual behaviour. To account for sociodemographics in models of individual annotator behaviour, we introduce group-specific layers to multi-annotator models. In a series of experiments for toxic content detection, we find that explicitly accounting for sociodemographic attributes in this way does not significantly improve model performance. This result shows that individual annotation behaviour depends on much more than just sociodemographics.","tags":["annotation","NLP","sociodemographics"],"title":"The Ecological Fallacy in Annotation: Modeling Human Label Variation goes beyond Sociodemographics","type":"publication"},{"authors":["Debora Nozza","Dirk Hovy"],"categories":[],"content":"","date":1689120000,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1689120000,"objectID":"1ebebe2dde0f3ca3bcb2976010570452","permalink":"https://milanlproc.github.io/publication/2023-prof-profanity-obfuscation-nlp/","publishdate":"2023-07-12T14:48:20+01:00","relpermalink":"/publication/2023-prof-profanity-obfuscation-nlp/","section":"publication","summary":"Work on hate speech has made considering rude and harmful examples in scientific publications inevitable. This situation raises various problems, such as whether or not to obscure profanities. While science must accurately disclose what it does, the unwarranted spread of hate speech can harm readers and increases its internet frequency. 
While maintaining publications’ professional appearance, obfuscating profanities makes it challenging to evaluate the content, especially for non-native speakers. Surveying 150 ACL papers, we discovered that obfuscation is usually used for English but not other languages, and even then, quite unevenly. We discuss the problems with obfuscation and suggest a multilingual community resource called PrOf with a Python module to standardize profanity obfuscation processes. We believe PrOf can help scientific publication policies to make hate speech work accessible and comparable, irrespective of language.","tags":["Hate Speech","NLP","multilingual"],"title":"The State of Profanity Obfuscation in Natural Language Processing Scientific Publications","type":"publication"},{"authors":["Anne Lauscher","Debora Nozza","Ehm Miltersen","Archie Crowley","Dirk Hovy"],"categories":[],"content":"","date":1689120000,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1689120000,"objectID":"ff9d8520053fa8f944c7faa36dfc35da","permalink":"https://milanlproc.github.io/publication/2023-commercial-machine-translation-fail-neopronouns/","publishdate":"2023-07-12T14:48:20+01:00","relpermalink":"/publication/2023-commercial-machine-translation-fail-neopronouns/","section":"publication","summary":"As 3rd-person pronoun usage shifts to include novel forms, e.g., neopronouns, we need more research on identity-inclusive NLP. Exclusion is particularly harmful in one of the most popular NLP applications, machine translation (MT). Wrong pronoun translations can discriminate against marginalized groups, e.g., non-binary individuals (Dev et al., 2021). In this “reality check”, we study how three commercial MT systems translate 3rd-person pronouns. Concretely, we compare the translations of gendered vs. gender-neutral pronouns from English to five other languages (Danish, Farsi, French, German, Italian), and vice versa, from Danish to English.Our error analysis shows that the presence of a gender-neutral pronoun often leads to grammatical and semantic translation errors. Similarly, gender neutrality is often not preserved. By surveying the opinions of affected native speakers from diverse languages, we provide recommendations to address the issue in future MT research.","tags":["NLP","pronouns","fairness","ethics"],"title":"What about ''em''? How Commercial Machine Translation Fails to Handle (Neo-)Pronouns","type":"publication"},{"authors":["Anne Lauscher","Debora Nozza","Ehm Miltersen","Archie Crowley","Dirk Hovy"],"categories":[],"content":"","date":1689120000,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1689120000,"objectID":"fd19c1177957eaded55b19f54d5290bc","permalink":"https://milanlproc.github.io/publication/2023-interpretability-for-fairer-machine-translation/","publishdate":"2023-07-12T14:48:20+01:00","relpermalink":"/publication/2023-interpretability-for-fairer-machine-translation/","section":"publication","summary":"As 3rd-person pronoun usage shifts to include novel forms, e.g., neopronouns, we need more research on identity-inclusive NLP. Exclusion is particularly harmful in one of the most popular NLP applications, machine translation (MT). Wrong pronoun translations can discriminate against marginalized groups, e.g., non-binary individuals (Dev et al., 2021). In this “reality check”, we study how three commercial MT systems translate 3rd-person pronouns. Concretely, we compare the translations of gendered vs. 
gender-neutral pronouns from English to five other languages (Danish, Farsi, French, German, Italian), and vice versa, from Danish to English.Our error analysis shows that the presence of a gender-neutral pronoun often leads to grammatical and semantic translation errors. Similarly, gender neutrality is often not preserved. By surveying the opinions of affected native speakers from diverse languages, we provide recommendations to address the issue in future MT research.","tags":["NLP","pronouns","fairness","ethics"],"title":"What about ''em''? How Commercial Machine Translation Fails to Handle (Neo-)Pronouns","type":"publication"},{"authors":["Alba Curry","Amanda Cercas Curry"],"categories":[],"content":"","date":1685923200,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1685923200,"objectID":"d0eb857a6a57b8204d7de17a9a7d5e25","permalink":"https://milanlproc.github.io/publication/2023-computer-says-no/","publishdate":"2023-06-05T14:48:20+01:00","relpermalink":"/publication/2023-computer-says-no/","section":"publication","summary":"Emotions are an integral part of human cognition and they guide not only our understanding of the world but also our actions within it. As such, whether we soothe or flame an emotion is not inconsequential. Recent work in conversational AI has focused on responding empathetically to users, validating and soothing their emotions without a real basis. This AI-aided emotional regulation can have negative consequences for users and society, tending towards a one-noted happiness defined as only the absence of “negative” emotions. We argue that we must carefully consider whether and how to respond to users’ emotions.","tags":["Dialogue systems","Empathy","Anthropomorphism","Trust"],"title":"Computer says “No”: The Case Against Empathetic Conversational AI","type":"publication"},{"authors":["Tommaso Fornaciari","Luca Luceri","Emilio Ferrara","Dirk Hovy"],"categories":[],"content":"","date":1685923200,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1685923200,"objectID":"d77d8b1e10205c823e23d8f2019f4585","permalink":"https://milanlproc.github.io/publication/2023-leveraging-social-interactions-detect-misinformation-social-media/","publishdate":"2023-06-05T14:48:20+01:00","relpermalink":"/publication/2023-leveraging-social-interactions-detect-misinformation-social-media/","section":"publication","summary":"Detecting misinformation threads is crucial to guarantee a healthy environment on social media. We address the problem using the data set created during the COVID-19 pandemic. It contains cascades of tweets discussing information weakly labeled as reliable or unreliable, based on a previous evaluation of the information source. The models identifying unreliable threads usually rely on textual features. But reliability is not just what is said, but by whom and to whom. We additionally leverage on network information. Following the homophily principle, we hypothesize that users who interact are generally interested in similar topics and spreading similar kind of news, which in turn is generally reliable or not. We test several methods to learn representations of the social interactions within the cascades, combining them with deep neural language models in a Multi-Input (MI) framework. 
Keeping track of the sequence of the interactions over time, we improve over previous state-of-the-art models.","tags":["misinformation","NLP","social media","networks"],"title":"Leveraging Social Interactions to Detect Misinformation on Social Media","type":"publication"},{"authors":["Davide Locatelli","Greta Damo","Debora Nozza"],"categories":[],"content":"","date":1683849600,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1683849600,"objectID":"51e9a40f92fa680cf8527ec7904a72b4","permalink":"https://milanlproc.github.io/publication/2023-cross-lingual-study-homotransphobia/","publishdate":"2023-05-12T14:48:20+01:00","relpermalink":"/publication/2023-cross-lingual-study-homotransphobia/","section":"publication","summary":"We present a cross-lingual study of homotransphobia on Twitter, examining the prevalence and forms of homotransphobic content in tweets related to LGBT issues in seven languages. Our findings reveal that homotransphobia is a global problem that takes on distinct cultural expressions, influenced by factors such as misinformation, cultural prejudices, and religious beliefs. To aid the detection of hate speech, we also devise a taxonomy that classifies public discourse around LGBT issues. By contributing to the growing body of research on online hate speech, our study provides valuable insights for creating effective strategies to combat homotransphobia on social media.","tags":["Hate Speech","NLP","multilingual"],"title":"A Cross-Lingual Study of Homotransphobia on Twitter","type":"publication"},{"authors":["Federico Bianchi","Pratyusha Kalluri","Esin Durmus","Faisal Ladhak","Myra Cheng","Debora Nozza","Tatsunori Hashimoto","Dan Jurafsky","James Zou","Aylin Caliskan"],"categories":[],"content":"","date":1683417600,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1683417600,"objectID":"7483eea9fc94959cf7db5c17264fa5e3","permalink":"https://milanlproc.github.io/publication/2023-text-to-image-stereotypes/","publishdate":"2023-05-07T14:48:20+01:00","relpermalink":"/publication/2023-text-to-image-stereotypes/","section":"publication","summary":"Machine learning models are now able to convert user-written text descriptions into naturalistic images. These models are available to anyone online and are being used to generate millions of images a day. We investigate these models and find that they amplify dangerous and complex stereotypes. Moreover, we find that the amplified stereotypes are difficult to predict and not easily mitigated by users or model owners. The extent to which these image-generation models perpetuate and amplify stereotypes and their mass deployment is cause for serious concern.","tags":["Vision","NLP","Bias","Fairness"],"title":"Easily Accessible Text-to-Image Generation Amplifies Demographic Stereotypes at Large Scale","type":"publication"},{"authors":["Sunipa Dev","Vinodkumar Prabhakaran","David Adelani","Dirk Hovy","Luciana Benotti"],"categories":[],"content":"","date":1683331200,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1683331200,"objectID":"9e1fac152996630e6fb17446c2ddd42a","permalink":"https://milanlproc.github.io/publication/2023-proceedings-c3nlp/","publishdate":"2023-05-06T14:48:20+01:00","relpermalink":"/publication/2023-proceedings-c3nlp/","section":"publication","summary":"Natural Language Processing has seen impressive gains in recent years. 
This research includes the demonstration that NLP models have turned into useful technologies with improved capabilities, measured in terms of how well they match human behavior captured in web-scale language data or through annotations. However, human behavior is inherently shaped by the cultural contexts humans are embedded in, the values and beliefs they hold, and the social practices they follow, part of which will be reflected in the data used to train NLP models, and the behavior these NLP models exhibit. This workshop will bring together NLP researchers invested in this work, along with a community of scholars with multi-disciplinary expertise spanning linguistics, social sciences, and cultural anthropology.","tags":["computational social science","NLP","culture"],"title":"Proceedings of the First Workshop on Cross-Cultural Considerations in NLP (C3NLP)","type":"publication"},{"authors":["Chia-chien Hung","Anne Lauscher","Dirk Hovy","Simone Paolo Ponzetto","Goran Glavaš"],"categories":[],"content":"","date":1682985600,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1682985600,"objectID":"3c029d51f2b8c8bba4e070b7cafee2cc","permalink":"https://milanlproc.github.io/publication/2023-can-demographic-factors-improve-text-classification/","publishdate":"2023-05-02T14:48:20+01:00","relpermalink":"/publication/2023-can-demographic-factors-improve-text-classification/","section":"publication","summary":"Demographic factors (e.g., gender or age) shape our language. Previous work showed that incorporating demographic factors can consistently improve performance for various NLP tasks with traditional NLP models. In this work, we investigate whether these previous findings still hold with state-of-the-art pretrained Transformer-based language models (PLMs). We use three common specialization methods proven effective for incorporating external knowledge into pretrained Transformers (e.g., domain-specific or geographic knowledge). We adapt the language representations for the demographic dimensions of gender and age, using continuous language modeling and dynamic multi-task learning for adaptation, where we couple language modeling objectives with the prediction of demographic classes. Our results, when employing a multilingual PLM, show substantial gains in task performance across four languages (English, German, French, and Danish), which is consistent with the results of previous work. However, controlling for confounding factors – primarily domain and language proficiency of Transformer-based PLMs – shows that downstream performance gains from our demographic adaptation do not actually stem from demographic knowledge. Our results indicate that demographic specialization of PLMs, while holding promise for positive societal impact, still represents an unsolved problem for (modern) NLP.","tags":["NLP","language models","demographics"],"title":"Can Demographic Factors Improve Text Classification? 
Revisiting Demographic Adaptation in the Age of Transformers","type":"publication"},{"authors":["Giuseppe Attanasio","Eliana Pastor","Chiara Di Bonaventura","Debora Nozza"],"categories":[],"content":"","date":1682985600,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1682985600,"objectID":"32074e80da1ca4ed7fe78f30fa8cc384","permalink":"https://milanlproc.github.io/publication/2023-ferret-explainers-transformers/","publishdate":"2023-05-02T14:48:20+01:00","relpermalink":"/publication/2023-ferret-explainers-transformers/","section":"publication","summary":"As Transformers are increasingly relied upon to solve complex NLP problems, there is an increased need for their decisions to be humanly interpretable. While several explainable AI (XAI) techniques for interpreting the outputs of transformer-based models have been proposed, there is still a lack of easy access to using and comparing them.We introduce ferret, a Python library to simplify the use and comparisons of XAI methods on transformer-based classifiers.With ferret, users can visualize and compare transformers-based models output explanations using state-of-the-art XAI methods on any free-text or existing XAI corpora. Moreover, users can also evaluate ad-hoc XAI metrics to select the most faithful and plausible explanations. To align with the recently consolidated process of sharing and using transformers-based models from Hugging Face, ferret interfaces directly with its Python library.In this paper, we showcase ferret to benchmark XAI methods used on transformers for sentiment analysis and hate speech detection. We show how specific methods provide consistently better explanations and are preferable in the context of transformer models.","tags":["BERT","NLP","interpretability"],"title":"ferret: a Framework for Benchmarking Explainers on Transformers","type":"publication"},{"authors":null,"categories":["NLP","hate speech"],"content":"Hate speech is one of the most central problems of online life, with real-life consequences: various hate crimes started as online hate. 1 in 4 users have been harassed online (Pew Research), 63% of the targets are women (Cox commission). The pandemic-related increase of online activity has only intensified this problem: over 500 million messages are sent each day. To address this problem, content providers and policymakers need automated assistance in spotting and addressing hateful comments. INDOMITA will provide those methods.\nBut hate speech is complex. What is considered offensive varies by social norms and user demographics. \u0026ldquo;Yo, a**hole!\u0026rdquo; is acceptable among friends, but problematic with strangers. But current hate speech detection only uses the words in a message to determine whether it is hate speech or not. It does not consider who says those words and to whom, potentially missing subtle forms of hate speech and mislabeling harmless interactions due to overreliance on keywords. This overly simplified approach is a significant limitation. Our user-based approach will address that.\nBut \u0026ldquo;better\u0026rdquo; detection is subjective: people have very different thresholds for what they find offensive. Current evaluation metrics do not allow for such nuance. Any tool that improves the overall detection rate will be judged sufficient. But a tool that works great for most users, but fails for some other groups might achieve good performance. It still fails in the task it was designed to do. 
Our fairness metrics will correct this.\nBut detection alone does not solve the problem. Interventions like counterspeech or education have a lasting impact on abusive users. It can be enough to alert them to the hurtful nature of their message. At other times, they will only respond if someone they perceive as authoritative engages in a discussion. This decision requires an understanding of the abusive user\u0026rsquo;s social context. Our user-based counterspeech approach facilitates this.\nOur novel, user-centered approach will address hate speech in three ways:\n(1) comprehensively modeling a complex issue to improve detection across input formats (text, images, and video), by incorporating socio-demographic context into the model; (2) developing methods to automate counterspeech and to address abusive users effectively; and (3) developing evaluation metrics that assess fairness and performance and account for the subjective nature of hate speech. In sum, our user focus will revolutionize existing research on hate speech detection, both in Italian and other languages, to give authorities and media providers better ways to assess content for immediate countermeasures. It will allow us to bridge language differences more easily than purely text-based methods, as we capture socio-behavioral patterns that generalize across languages. It will generate revolutionary insights into the complex dynamics between online actors and the generation of online hate.\nINDOMITA is supported by a MUR FARE 2020 initiative under grant agreement Prot. R20YSMBZ8S.\n","date":1681344000,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1681344000,"objectID":"5af9db37d16b9da1095a77d2e9b2ea3b","permalink":"https://milanlproc.github.io/project/indomita/","publishdate":"2023-04-13T00:00:00Z","relpermalink":"/project/indomita/","section":"project","summary":"Innovative Demographically-aware Hate Speech Detection in Online Media in Italian","tags":["demographic","social media","NLP"],"title":"INDOMITA","type":"project"},{"authors":["Donya Rooein","Amanda Cercas Curry","Dirk Hovy"],"categories":[],"content":"","date":1681257600,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1681257600,"objectID":"d47bea93b523ae78237df41065e8b24f","permalink":"https://milanlproc.github.io/publication/2023-know-your-audience-education/","publishdate":"2023-04-12T17:19:53+01:00","relpermalink":"/publication/2023-know-your-audience-education/","section":"publication","summary":"Large language models (LLMs) offer a range of new possibilities, including adapting the text to different audiences and their reading needs. But how well do they adapt? We evaluate the readability of answers generated by four state-of-the-art LLMs (commercial and open-source) to science questions when prompted to target different age groups and education levels. To assess the adaptability of LLMs to diverse audiences, we compare the readability scores of the generated responses against the recommended comprehension level of each age and education group. We find large variations in the readability of the answers by different LLMs. Our results suggest LLM answers need to be better adapted to the intended audience demographics to be more comprehensible. They underline the importance of enhancing the adaptability of LLMs in education settings to cater to diverse age and education levels. Overall, current LLMs have set readability ranges and do not adapt well to different audiences, even when prompted. 
That limits their potential for educational purposes.","tags":["Education","NLP","LLMs"],"title":"Know Your Audience: Do LLMs Adapt to Different Age and Education Levels?","type":"publication"},{"authors":["Rishav Hada","Amir Ebrahimi Fard","Sarah Shugars","Federico Bianchi","Patricia Rossini","Dirk Hovy","Rebekah Tromble","Nava Tintarev"],"categories":[],"content":"","date":1677456000,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1677456000,"objectID":"0dd3066b1e69e63f2f33c17401858cd2","permalink":"https://milanlproc.github.io/publication/2023-beyond-digital-echo-chambers-role-viewpoint-diversity-political-discussion/","publishdate":"2023-02-27T14:48:20+01:00","relpermalink":"/publication/2023-beyond-digital-echo-chambers-role-viewpoint-diversity-political-discussion/","section":"publication","summary":"Increasingly taking place in online spaces, modern political conversations are typically perceived to be unproductively affirming---siloed in so-called 'echo chambers' of exclusively like-minded discussants. Yet, to date we lack sufficient means to measure viewpoint diversity in conversations. To this end, in this paper, we operationalize two viewpoint metrics proposed for recommender systems and adapt them to the context of social media conversations. This is the first study to apply these two metrics (Representation and Fragmentation) to real-world data and to consider the implications for online conversations specifically. We apply these measures to two topics---daylight savings time (DST), which serves as a control, and the more politically polarized topic of immigration. We find that the diversity scores for both Fragmentation and Representation are lower for immigration than for DST. Further, we find that while pro-immigrant views receive consistent pushback on the platform, anti-immigrant views largely operate within echo chambers. We observe less severe yet similar patterns for DST. Taken together, Representation and Fragmentation paint a meaningful and important new picture of viewpoint diversity.","tags":["NLP","computational social science","political science","echo chambers"],"title":"Beyond Digital 'Echo Chambers': The Role of Viewpoint Diversity in Political Discussion","type":"publication"},{"authors":null,"categories":["nlp"],"content":"Debora Nozza has recently been awarded a €1.5m ERC Starting Grant project 2023 for her project PERSONAE.\nPERSONAE will make language technology (LT) accessible and valuable to everyone. I will revolutionize research in subjective tasks in NLP such as abusive language detection and sentiment and emotion analysis by developing a new field called personal NLP, yielding new datasets, tasks, and algorithms. This new research area will explore subjective tasks from the perspective of the individual as information receiver, making users active actors in the creation of LTs instead of mere recipients. This will allow for a more tailored, effective approach to NLP model design, resulting in better models overall.\nEach person has their own interests and preferences based on their background and experience. These factors impact their views of what makes them happy, angry, or depressed over time. Language technologies (LTs) can consider individual preferences. 
However, current research presumes a static view of subjectivity: that a single ground truth underlies subjective tasks such as abusive language detection, an assumption that lacks human variability and prevents universal access to LTs.\nLanguage-based AI such as virtual assistants is widely available. But despite significant scientific advances, most LT applications are inaccessible to individuals and their public\u0026rsquo;s opinion has become increasingly negative. GPT-3\u0026rsquo;s 2020 release boosted business-oriented applications such as copywriting and chatbots, yet few that let people improve their lives—for example, by controlling what they see on social media. This gap becomes more pronounced for subjective tasks.\nPERSONAE will help design subjective LTs that can be adapted by individuals at will over time. Based on an ambitious meta approach able to generalize from existing, disconnected work, PERSONAE will rely on fully personalizable privacy-aware algorithms that can be used by anyone. It will reveal benefits of LT far beyond those of existing systems, paving the way for future applications.\n🌏🌏 Check out the web article on my project!\n🎙️🎙️ Check out my latest interview on Radio 24 in Italian!\n","date":1677456000,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1677456000,"objectID":"f4fd2ab1f739b9ebb14a4a53c0a9ae87","permalink":"https://milanlproc.github.io/project/personae/","publishdate":"2023-02-27T00:00:00Z","relpermalink":"/project/personae/","section":"project","summary":"Personalized and Subjective approaches to Natural Language Processing","tags":["hate speech","subjectivity","nlp"],"title":"PERSONAE","type":"project"},{"authors":["Federico Bianchi","Amanda Cercas Curry","Dirk Hovy"],"categories":[],"content":"","date":1673136000,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1673136000,"objectID":"c607865224483ec3caddadf624167af7","permalink":"https://milanlproc.github.io/publication/2023-ai-normal-accidents-waiting-happen/","publishdate":"2023-01-08T14:48:20+01:00","relpermalink":"/publication/2023-ai-normal-accidents-waiting-happen/","section":"publication","summary":"Artificial Intelligence (AI) is at a crucial point in its development: stable enough to be used in production systems, and increasingly pervasive in our lives. What does that mean for its safety? In his book Normal Accidents, the sociologist Charles Perrow proposed a framework to analyze new technologies and the risks they entail. He showed that major accidents are nearly unavoidable in complex systems with tightly coupled components if they are run long enough. In this essay, we apply and extend Perrow’s framework to AI to assess its potential risks. Today’s AI systems are already highly complex, and their complexity is steadily increasing. As they become more ubiquitous, different algorithms will interact directly, leading to tightly coupled systems whose capacity to cause harm we will be unable to predict. 
We argue that under the current paradigm, Perrow’s normal accidents apply to AI systems and it is only a matter of time before one occurs.","tags":["AI","models","sociology"],"title":"Viewpoint: Artificial Intelligence Accidents Waiting to Happen?","type":"publication"},{"authors":["Federico Bianchi","Stefanie Hills","Patricia Rossini","Dirk Hovy","Rebekah Tromble","Nava Tintarev"],"categories":[],"content":"","date":1670803200,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1670803200,"objectID":"281aa24403feb3bfd4b39889e2df47e3","permalink":"https://milanlproc.github.io/publication/2022-not_just_hate/","publishdate":"2022-12-12T14:48:20+01:00","relpermalink":"/publication/2022-not_just_hate/","section":"publication","summary":"Well-annotated data is a prerequisite for good Natural Language Processing models. Too often, though, annotation decisions are governed by optimizing time or annotator agreement. We make a case for nuanced efforts in an interdisciplinary setting for annotating offensive online speech. Detecting offensive content is rapidly becoming one of the most important real-world NLP tasks. However, most datasets use a single binary label, e.g., for hate or incivility, even though each concept is multi-faceted. This modeling choice severely limits nuanced insights, but also performance. We show that a more fine-grained multi-label approach to predicting incivility and hateful or intolerant content addresses both conceptual and performance issues. We release a novel dataset of over 40,000 tweets about immigration from the US and UK, annotated with six labels for different aspects of incivility and intolerance. Our dataset not only allows for a more nuanced understanding of harmful speech online; models trained on it also outperform or match performance on benchmark datasets.","tags":["Hate Speech","NLP","dataset"],"title":"It's Not Just Hate: A Multi-Dimensional Perspective on Detecting Harmful Speech Online","type":"publication"},{"authors":["Federico Bianchi","Vincenzo Cutrona","Dirk Hovy"],"categories":[],"content":"","date":1670803200,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1670803200,"objectID":"3ac3335b591cf77ff6390c5bf2664375","permalink":"https://milanlproc.github.io/publication/2022-twitter_demographer/","publishdate":"2022-12-12T14:48:20+01:00","relpermalink":"/publication/2022-twitter_demographer/","section":"publication","summary":"Twitter data have become essential to Natural Language Processing (NLP) and social science research, driving various scientific discoveries in recent years. However, the textual data alone are often not enough to conduct studies: social scientists, especially, need more variables to perform their analysis and control for various factors. How we augment this information, such as users’ location, age, or tweet sentiment, has ramifications for anonymity and reproducibility, and requires dedicated effort. This paper describes Twitter-Demographer, a simple, flow-based tool to enrich Twitter data with additional information about tweets and users. The tool is aimed at NLP practitioners, psycho-linguists, and (computational) social scientists who want to enrich their datasets with aggregated information, facilitating reproducibility, and providing algorithmic privacy-by-design measures for pseudo-anonymity. We discuss our design choices, inspired by the flow-based programming paradigm, to use black-box components that can easily be chained together and extended. 
We also analyze the ethical issues related to the use of this tool, and the built-in measures to facilitate pseudo-anonymity.","tags":["Social Media","NLP","dataset","Twitter"],"title":"Twitter-Demographer: A Flow-based Tool to Enrich Twitter Data","type":"publication"},{"authors":["Marius Hessenthaler","Emma Strubell","Dirk Hovy","Anne Lauscher"],"categories":[],"content":"","date":1670630400,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1670630400,"objectID":"c8e28f24b48da4ce4375ff8698f4155c","permalink":"https://milanlproc.github.io/publication/2022-bridging_fairness_and_environmental_sustainability_in_natural_language_processing/","publishdate":"2022-12-12T14:48:20+01:00","relpermalink":"/publication/2022-bridging_fairness_and_environmental_sustainability_in_natural_language_processing/","section":"publication","summary":"Fairness and environmental impact are important research directions for the sustainable development of artificial intelligence. However, while each topic is an active research area in natural language processing (NLP), there is a surprising lack of research on the interplay between the two fields. This lacuna is highly problematic, since there is increasing evidence that an exclusive focus on fairness can actually hinder environmental sustainability, and vice versa. In this work, we shed light on this crucial intersection in NLP by (1) investigating the efficiency of current fairness approaches through surveying example methods for reducing unfair stereotypical bias from the literature, and (2) evaluating a common technique to reduce energy consumption (and thus environmental impact) of English NLP models, knowledge distillation (KD), for its impact on fairness. In this case study, we evaluate the effect of important KD factors, including layer and dimensionality reduction, with respect to: (a) performance on the distillation task (natural language inference and semantic similarity prediction), and (b) multiple measures and dimensions of stereotypical bias (e.g., gender bias measured via the Word Embedding Association Test). Our results lead us to clarify current assumptions regarding the effect of KD on unfair bias: contrary to other findings, we show that KD can actually decrease model fairness.","tags":["NLP","fairness","sustainability"],"title":"Bridging Fairness and Environmental Sustainability in Natural Language Processing","type":"publication"},{"authors":["Anne Lauscher","Federico Bianchi","Samuel R. Bowman","Dirk Hovy"],"categories":[],"content":"","date":1670630400,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1670630400,"objectID":"d3d511c9d6fd6b178e0bf4de6bc057c3","permalink":"https://milanlproc.github.io/publication/2022-socioprobe_what_when_where_language_models_learn_about_sociodemographics/","publishdate":"2022-12-10T14:48:20+01:00","relpermalink":"/publication/2022-socioprobe_what_when_where_language_models_learn_about_sociodemographics/","section":"publication","summary":"Pre-trained language models (PLMs) have outperformed other NLP models on a wide range of tasks. Opting for a more thorough understanding of their capabilities and inner workings, researchers have established the extent to which they capture lower-level knowledge like grammaticality, and mid-level semantic knowledge like factual understanding. However, there is still little understanding of their knowledge of higher-level aspects of language. 
In particular, despite the importance of sociodemographic aspects in shaping our language, the questions of whether, where, and how PLMs encode these aspects, e.g., gender or age, is still unexplored. We address this research gap by probing the sociodemographic knowledge of different single-GPU PLMs on multiple English data sets via traditional classifier probing and information-theoretic minimum description length probing. Our results show that PLMs do encode these sociodemographics, and that this knowledge is sometimes spread across the layers of some of the tested PLMs. We further conduct a multilingual analysis and investigate the effect of supplementary training to further explore to what extent, where, and with what amount of pre-training data the knowledge is encoded. Our overall results indicate that sociodemographic knowledge is still a major challenge for NLP. PLMs require large amounts of pre-training data to acquire the knowledge and models that excel in general language understanding do not seem to own more knowledge about these aspects.","tags":["NLP","sociodemographics","transformers","language models"],"title":"SocioProbe: What, When, and Where Language Models Learn about Sociodemographics","type":"publication"},{"authors":["Samia Touileb","Debora Nozza"],"categories":[],"content":"","date":1670544000,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1670544000,"objectID":"5d92f30200b04a92ce9594e899951021","permalink":"https://milanlproc.github.io/publication/2022-honest-harmful-scandinavian-language-model/","publishdate":"2022-12-09T14:48:20+01:00","relpermalink":"/publication/2022-honest-harmful-scandinavian-language-model/","section":"publication","summary":"Scandinavian countries are perceived as role-models when it comes to gender equality. With the advent of pre-trained language models and their widespread usage, we investigate to what extent gender-based harmful and toxic content exist in selected Scandinavian language models. We examine nine models, covering Danish, Swedish, and Norwegian, by manually creating template-based sentences and probing the models for completion. We evaluate the completions using two methods for measuring harmful and toxic completions and provide a thorough analysis of the results. We show that Scandinavian pre-trained language models contain harmful and gender-based stereotypes with similar values across all languages. This finding goes against the general expectations related to gender equality in Scandinavian countries and shows the possible problematic outcomes of using such models in real-world settings.","tags":["Hate Speech","BERT","NLP","dataset","multilingual"],"title":"Measuring Harmful Representations in Scandinavian Language Models","type":"publication"},{"authors":["Paul Röttger","Debora Nozza","Federico Bianchi","Dirk Hovy"],"categories":[],"content":"","date":1666224000,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1666224000,"objectID":"a2ca0d19f38470541be45569c6948729","permalink":"https://milanlproc.github.io/publication/2022-strategies-hate-speech-detection-under-resourced-languages/","publishdate":"2022-10-20T14:48:20+01:00","relpermalink":"/publication/2022-strategies-hate-speech-detection-under-resourced-languages/","section":"publication","summary":"Hate speech is a global phenomenon, but most hate speech datasets so far focus on English-language content. 
This hinders the development of more effective hate speech detection models in hundreds of languages spoken by billions across the world. More data is needed, but annotating hateful content is expensive, time-consuming and potentially harmful to annotators. To mitigate these issues, we explore data-efficient strategies for expanding hate speech detection into under-resourced languages. In a series of experiments with mono- and multilingual models across five non-English languages, we find that 1) a small amount of target-language fine-tuning data is needed to achieve strong performance, 2) the benefits of using more such data decrease exponentially, and 3) initial fine-tuning on readily-available English data can partially substitute target-language data and improve model generalisability. Based on these findings, we formulate actionable recommendations for hate speech detection in low-resource language settings.","tags":["Hate Speech","NLP","multilingual"],"title":"Data-Efficient Strategies for Expanding Hate Speech Detection into Under-Resourced Languages","type":"publication"},{"authors":["Giuseppe Attanasio","Debora Nozza","Federico Bianchi","Dirk Hovy"],"categories":[],"content":"","date":1665619200,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1665619200,"objectID":"bcb8bb12e133a3d8b26d9091bc35a703","permalink":"https://milanlproc.github.io/publication/2022-limitation-diachronic-continuous-training/","publishdate":"2022-10-13T14:48:20+01:00","relpermalink":"/publication/2022-limitation-diachronic-continuous-training/","section":"publication","summary":"Language is constantly changing and evolving, leaving language models to quickly become outdated, both factually and linguistically. Recent research proposes we continuously update our models using new data. Continuous training allows us to teach language models about new events and facts and changing norms. However, continuous training also means continuous costs. We show there is currently limited evidence for the benefits of continuous training, be it for the actual downstream performance or the environmental cost. Our results show continuous training does not significantly improve performance. While it is clear that, sooner or later, our language models need to be updated, it is unclear when this effort is worth the cost. We call for a critical reflection about when and how to use continuous training and for more benchmarks to support this research direction.","tags":["NLP","BERT"],"title":"Is It Worth the (Environmental) Cost? Limited Evidence for the Benefits of Diachronic Continuous Training","type":"publication"},{"authors":["Anne Lauscher","Archie Crowley","Dirk Hovy"],"categories":[],"content":"","date":1665532800,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1665532800,"objectID":"a7cbc519326b6c1f4579513c619bb3ac","permalink":"https://milanlproc.github.io/publication/2022-welcome_modern_world_pronouns_identity-inclusive_natural_language_processing_beyond_gender/","publishdate":"2022-10-12T14:48:20+01:00","relpermalink":"/publication/2022-welcome_modern_world_pronouns_identity-inclusive_natural_language_processing_beyond_gender/","section":"publication","summary":"The world of pronouns is changing – from a closed word class with few members to an open set of terms to reflect identities. 
However, Natural Language Processing (NLP) barely reflects this linguistic shift, resulting in the possible exclusion of non-binary users, even though recent work outlined the harms of gender-exclusive language technology. The current modeling of 3rd person pronouns is particularly problematic. It largely ignores various phenomena like neopronouns, i.e., novel pronoun sets that are not (yet) widely established. This omission contributes to the discrimination of marginalized and underrepresented groups, e.g., non-binary individuals. It thus prevents gender equality, one of the UN’s sustainable development goals (goal 5). Further, other identity-expressions beyond gender are ignored by current NLP technology. This paper provides an overview of 3rd person pronoun issues for NLP. Based on our observations and ethical considerations, we define a series of five desiderata for modeling pronouns in language technology, which we validate through a survey. We evaluate existing and novel modeling approaches w.r.t. these desiderata qualitatively and quantify the impact of a more discrimination-free approach on an established benchmark dataset.","tags":["NLP","pronouns","fairness","ethics"],"title":"Welcome to the Modern World of Pronouns: Identity-Inclusive Natural Language Processing beyond Gender","type":"publication"},{"authors":["A. Stevie Bergman","Gavin Abercrombie","Shannon Spruit","Dirk Hovy","Emily Dinan","Y-Lan Boureau","Verena Rieser"],"categories":[],"content":"","date":1662940800,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1662940800,"objectID":"3acc9d4e5de91b3e10aeb9ecb187107d","permalink":"https://milanlproc.github.io/publication/2022-guiding_release_safer_e2e_conversational_ai_through_value_sensitive_design/","publishdate":"2022-09-12T14:48:20+01:00","relpermalink":"/publication/2022-guiding_release_safer_e2e_conversational_ai_through_value_sensitive_design/","section":"publication","summary":"Over the last several years, end-to-end neural conversational agents have vastly improved their ability to carry unrestricted, open-domain conversations with humans. However, these models are often trained on large datasets from the Internet and, as a result, may learn undesirable behaviours from this data, such as toxic or otherwise harmful language. Thus, researchers must wrestle with how and when to release these models. In this paper, we survey recent and related work to highlight tensions between values, potential positive impact, and potential harms. We also provide a framework to support practitioners in deciding whether and how to release these models, following the tenets of value-sensitive design.","tags":["NLP","NLG","fairness","ethics","value sensitive design","chatbots"],"title":"Guiding the Release of Safer E2E Conversational AI through Value Sensitive Design","type":"publication"},{"authors":["Debora Nozza","Federico Bianchi","Giuseppe Attanasio"],"categories":[],"content":"","date":1657584000,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1657584000,"objectID":"e1aa310a2b39f2b3d80eb484d6acf1be","permalink":"https://milanlproc.github.io/publication/2022-hate-speech-detection-italian-social-media/","publishdate":"2022-07-12T14:48:20+01:00","relpermalink":"/publication/2022-hate-speech-detection-italian-social-media/","section":"publication","summary":"Online hate speech is a dangerous phenomenon that can (and should) be promptly counteracted properly. 
While Natural Language Processing supplies appropriate algorithms for trying to reach this objective, all research efforts are directed toward the English language. This strongly limits the classification power on non-English languages. In this paper, we test several learning frameworks for identifying hate speech in Italian text. We release HATE-ITA, a multi-language model trained on a large set of English data and available Italian datasets. HATE-ITA performs better than mono-lingual models and seems to adapt well also on language-specific slurs. We hope our findings will encourage the research in other mid-to-low resource communities and provide a valuable benchmarking tool for the Italian community.","tags":["Hate Speech","BERT","NLP"],"title":"HATE-ITA: Hate Speech Detection in Italian Social Media Text","type":"publication"},{"authors":["Paul Röttger","Haitham Seelawi","Debora Nozza","Zeerak Talat","Bertie Vidgen"],"categories":[],"content":"","date":1657584000,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1657584000,"objectID":"44e6c8311312a4c0b80faea18522c363","permalink":"https://milanlproc.github.io/publication/2022-multilingual-hatecheck-hate-speech-functional-tests/","publishdate":"2022-07-12T14:48:20+01:00","relpermalink":"/publication/2022-multilingual-hatecheck-hate-speech-functional-tests/","section":"publication","summary":"Hate speech detection models are typically evaluated on held-out test sets. However, this risks painting an incomplete and potentially misleading picture of model performance because of increasingly well-documented systematic gaps and biases in hate speech datasets. To enable more targeted diagnostic insights, recent research has thus introduced functional tests for hate speech detection models. However, these tests currently only exist for English-language content, which means that they cannot support the development of more effective models in other languages spoken by billions across the world. To help address this issue, we introduce Multilingual HateCheck (MHC), a suite of functional tests for multilingual hate speech detection models. MHC covers 34 functionalities across ten languages, which is more languages than any other hate speech dataset. To illustrate MHC’s utility, we train and test a high-performing multilingual hate speech detection model, and reveal critical model weaknesses for monolingual and cross-lingual applications.","tags":["Hate Speech","BERT","NLP"],"title":"Multilingual HateCheck: Functional Tests for Multilingual Hate Speech Detection Models","type":"publication"},{"authors":["Tommaso Fornaciari","Alexandra Uma","Massimo Poesio","Dirk Hovy"],"categories":[],"content":"","date":1652313600,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1652313600,"objectID":"9827bbbeabf9916ea04af9f70ca7d9ed","permalink":"https://milanlproc.github.io/publication/2022-hard_soft_evaluation_nlp_models_bootstrap_sampling-boostsa/","publishdate":"2022-05-12T14:48:20+01:00","relpermalink":"/publication/2022-hard_soft_evaluation_nlp_models_bootstrap_sampling-boostsa/","section":"publication","summary":"Natural Language Processing (NLP) ‘s applied nature makes it necessary to select the most effective and robust models. Producing slightly higher performance is insufficient; we want to know whether this advantage will carry over to other data sets. Bootstrapped significance tests can indicate that ability.So while necessary, computing the significance of models’ performance differences has many levels of complexity. 
It can be tedious, especially when the experimental design has many conditions to compare and several experimental runs. We present BooStSa, a tool that makes it easy to compute significance levels with the BOOtSTrap SAmpling procedure to evaluate models that predict not only standard hard labels but also soft labels (i.e., probability distributions over different classes).","tags":["NLP","bootstrap sampling","stats","p-value"],"title":"Hard and Soft Evaluation of NLP models with BOOtSTrap SAmpling - BooStSa","type":"publication"},{"authors":["Giuseppe Attanasio","Debora Nozza","Federico Bianchi"],"categories":[],"content":"","date":1651276800,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1651276800,"objectID":"8358f2dc7dc8713df3a0102aca07e999","permalink":"https://milanlproc.github.io/publication/2022-semeval-mami-perceiverio-misogyny-multimodal-meme/","publishdate":"2022-04-30T14:48:20+01:00","relpermalink":"/publication/2022-semeval-mami-perceiverio-misogyny-multimodal-meme/","section":"publication","summary":"In this paper, we describe the system proposed by the MilaNLP team for the Multimedia Automatic Misogyny Identification (MAMI) challenge. We use Perceiver IO as a multimodal late fusion over unimodal streams to address both sub-tasks A and B. We build unimodal embeddings using Vision Transformer (image) and RoBERTa (text transcript). We enrich the input representation using face and demographic recognition, image captioning, and detection of adult content and web entities. To the best of our knowledge, this work is the first to use Perceiver IO combining text and image modalities. The proposed approach outperforms unimodal and multimodal baselines.","tags":["Misogyny","Meme","Multimodal","PerceiverIO","Architectures"],"title":"MilaNLP at SemEval-2022 Task 5: Using Perceiver IO for Detecting Misogynous Memes with Text and Image Modalities","type":"publication"},{"authors":["Federico Bianchi","Debora Nozza","Dirk Hovy"],"categories":[],"content":"","date":1650499200,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1650499200,"objectID":"100083027ef95cdeecf37cf18bd6bf97","permalink":"https://milanlproc.github.io/publication/2022-language-invariant-properties-nlp/","publishdate":"2022-04-12T14:48:20+01:00","relpermalink":"/publication/2022-language-invariant-properties-nlp/","section":"publication","summary":"Meaning is context-dependent, but many properties of language (should) remain the same even if we transform the context. For example, sentiment, entailment, or speaker properties should be the same in a translation and original of a text. We introduce language invariant properties: i.e., properties that should not change when we transform text, and how they can be used to quantitatively evaluate the robustness of transformation algorithms. We use translation and paraphrasing as transformation examples, but our findings apply more broadly to any transformation. Our results indicate that many NLP transformations change properties like author characteristics, i.e., make them sound more male. We believe that studying these properties will allow NLP to address both social factors and pragmatic aspects of language. 
We also release an application suite that can be used to evaluate the invariance of transformation applications.","tags":["NLP","Language Invariant Properties","Meaning"],"title":"Language Invariant Properties in Natural Language Processing","type":"publication"},{"authors":null,"categories":["NLP","computational social science"],"content":"Over the last decade, discontent in democracy, mistrust in institutions, and the rise of populist parties have strained European societies. Underlying these tensions are often increasing inequalities in Western countries, which fuel the discontent of individuals. The Covid pandemic further exacerbated these problems, as anti-Covid measures taken by governments differently impacted societal groups.\nThe MENTALISM project, funded by Fondazione Cariplo under grant agreement 2022-1480, combines modern social media analysis with traditional survey data to track inequality across Italy through the lens of the pandemic.\nOur ground-breaking mixed-methods approach uses machine learning and text analysis to trace online grievances in a vast corpus of social media data. We combine these methods with survey protocols and econometric analysis to validate the findings and provide actionable policy advice. MENTALISM combines the advantages of social media data (high-frequency, individual-level information) with the strength of socio-economic surveys (representativeness). Our novel interdisciplinary approach will critically evaluate the value of social media monitoring for policy feedback. Moreover, it will establish protocols for policymakers to better respond to growing grievances brought on by inequality at various steps in the process.\nThis interdisciplinary project is led by Profs. Carlo Schwarz (economics), and Dirk Hovy (NLP).\n","date":1649808000,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1649808000,"objectID":"f1efa90206391b8dcd0df63edbc66767","permalink":"https://milanlproc.github.io/project/mentalism/","publishdate":"2022-04-13T00:00:00Z","relpermalink":"/project/mentalism/","section":"project","summary":"Measuring, Tracking, and Analyzing Inequality using Social Media","tags":["demographic","inequality","economics","social media","NLP"],"title":"MENTALISM","type":"project"},{"authors":["Giuseppe Attanasio","Debora Nozza","Eliana Pastor","Dirk Hovy"],"categories":[],"content":"","date":1649721600,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1649721600,"objectID":"1bc8b1ca2c825938f3fb71def878b8a1","permalink":"https://milanlproc.github.io/publication/2022-interpretability-transformer-mysogyny-detection/","publishdate":"2022-04-12T14:48:20+01:00","relpermalink":"/publication/2022-interpretability-transformer-mysogyny-detection/","section":"publication","summary":"Transformer-based Natural Language Processing models have become the standard for hate speech detection. However, the unconscious use of these techniques for such a critical task comes with negative consequences. Various works have demonstrated that hate speech classifiers are biased. These findings have prompted efforts to explain classifiers, mainly using attribution methods. In this paper, **we provide the first benchmark study of interpretability approaches for hate speech detection**. We cover four post-hoc token attribution approaches to explain the predictions of Transformer-based misogyny classifiers in English and Italian. Further, we compare generated attributions to attention analysis. 
We find that only two algorithms provide faithful explanations aligned with human expectations. Gradient-based methods and attention, however, show inconsistent outputs, making their value for explanations questionable for hate speech detection tasks.","tags":["Hate Speech","BERT","NLP"],"title":"Benchmarking Post-Hoc Interpretability Approaches for Transformer-based Misogyny Detection","type":"publication"},{"authors":["Debora Nozza","Federico Bianchi","Anne Lauscher","Dirk Hovy"],"categories":[],"content":"","date":1649721600,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1649721600,"objectID":"9088977c974224d138af8fd8ed78b98e","permalink":"https://milanlproc.github.io/publication/2022-honest-hurtful-language-model-lgbtqia+/","publishdate":"2022-04-12T14:48:20+01:00","relpermalink":"/publication/2022-honest-hurtful-language-model-lgbtqia+/","section":"publication","summary":"Current language technology is ubiquitous and directly influences individuals' lives worldwide. Given the recent trend in AI of training and constantly releasing new and powerful large language models (LLMs), there is a need to assess their biases and potential concrete consequences. While some studies have highlighted the shortcomings of these models, there is only little work on the negative impact of LLMs on LGBTQIA+ individuals. In this paper, we investigated a state-of-the-art template-based approach for measuring the harmfulness of English LLM sentence completions when the subjects belong to the LGBTQIA+ community. Our findings show that, on average, **the most likely LLM-generated completion is an identity attack 13% of the time**. Our results raise serious concerns about the applicability of these models in production environments.","tags":["Hate Speech","BERT","NLP"],"title":"Measuring Harmful Sentence Completion in Language Models for LGBTQIA+ Individuals","type":"publication"},{"authors":["Debora Nozza","Federico Bianchi","Dirk Hovy"],"categories":[],"content":"","date":1649721600,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1649721600,"objectID":"caa2e3beea8b00d14fdcf83df519c588","permalink":"https://milanlproc.github.io/publication/2022-pipelines-social-bias-testing-language-models/","publishdate":"2022-04-12T14:48:20+01:00","relpermalink":"/publication/2022-pipelines-social-bias-testing-language-models/","section":"publication","summary":"The maturity level of language models is now at a stage in which many companies rely on them to solve various tasks. However, while research has shown how biased and harmful these models are, **systematic ways of integrating social bias tests into development pipelines are still lacking. This short paper suggests how to use these verification techniques in development pipelines.** We take inspiration from software testing and suggest addressing social bias evaluation as software testing. We hope to open a discussion on the best methodologies to handle social bias testing in language models.","tags":["Hate Speech","BERT","NLP"],"title":"Pipelines for Social Bias Testing of Large Language Models","type":"publication"},{"authors":["Paul Röttger","Bertie Vidgen","Dirk Hovy","Janet B. 
Pierrehumbert"],"categories":[],"content":"","date":1649721600,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1649721600,"objectID":"805f462ec62d4458e8694ccd8b6293f5","permalink":"https://milanlproc.github.io/publication/2022-two-contrasting-data-annotation-paradigms-subjective-nlp-tasks/","publishdate":"2022-04-12T14:48:20+01:00","relpermalink":"/publication/2022-two-contrasting-data-annotation-paradigms-subjective-nlp-tasks/","section":"publication","summary":"Labelled data is the foundation of most natural language processing tasks. However, labelling data is difficult and there often are diverse valid beliefs about what the correct data labels should be. So far, dataset creators have acknowledged annotator subjectivity, but rarely actively managed it in the annotation process. This has led to partly-subjective datasets that fail to serve a clear downstream use. To address this issue, we propose two contrasting paradigms for data annotation. The descriptive paradigm encourages annotator subjectivity, whereas the prescriptive paradigm discourages it. Descriptive annotation allows for the surveying and modelling of different beliefs, whereas prescriptive annotation enables the training of models that consistently apply one belief. We discuss benefits and challenges in implementing both paradigms, and argue that dataset creators should explicitly aim for one or the other to facilitate the intended use of their dataset. Lastly, we conduct an annotation experiment using hate speech data that illustrates the contrast between the two paradigms.","tags":["Annotation","NLP","dataset"],"title":"Two Contrasting Data Annotation Paradigms for Subjective NLP Tasks","type":"publication"},{"authors":["Federico Bianchi","Debora Nozza","Dirk Hovy"],"categories":[],"content":"","date":1649721600,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1649721600,"objectID":"c798f81abd0bcd706c51aa0ea04a2d12","permalink":"https://milanlproc.github.io/publication/2022-xlmemo-multilingual-emotion-prediction/","publishdate":"2022-04-12T14:48:20+01:00","relpermalink":"/publication/2022-xlmemo-multilingual-emotion-prediction/","section":"publication","summary":"Detecting emotion in text allows social and computational scientists to study how people behave and react to online events. However, developing these tools for different languages requires data that is not always available. This paper collects the available emotion detection datasets across 19 languages. We train a multilingual emotion prediction model for social media data, XLM-EMO. The model shows competitive performance in a zero-shot setting, suggesting it is helpful in the context of low-resource languages. 
We release our model to the community so that interested researchers can directly use it.","tags":["Sentiment Analysis","Emotion Detection","Italian","BERT","NLP","dataset","multilingual"],"title":"XLM-EMO: Multilingual Emotion Prediction in Social Media Text","type":"publication"},{"authors":["Chia-Chien Hung, Anne Lauscher, Simone Paolo Ponzetto, Goran Glavaš"],"categories":[],"content":"","date":1649376000,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1649376000,"objectID":"7c36894900b37eb709e17f232c12092e","permalink":"https://milanlproc.github.io/publication/2022-acl-findings-dstod/","publishdate":"2022-04-07T11:48:20+01:00","relpermalink":"/publication/2022-acl-findings-dstod/","section":"publication","summary":" Recent work has shown that self-supervised dialog-specific pretraining on large conversational datasets yields substantial gains over traditional language modeling (LM) pretraining in downstream task-oriented dialog (TOD). These approaches, however, exploit general dialogic corpora (e.g., Reddit) and thus presumably fail to reliably embed domain-specific knowledge useful for concrete downstream TOD domains. In this work, we investigate the effects of domain specialization of pretrained language models (PLMs) for task-oriented dialog. Within our DS-TOD framework, we first automatically extract salient domain-specific terms, and then use them to construct DomainCC and DomainReddit -- resources that we leverage for domain-specific pretraining, based on (i) masked language modeling (MLM) and (ii) response selection (RS) objectives, respectively. We further propose a resource-efficient and modular domain specialization by means of domain adapters -- additional parameter-light layers in which we encode the domain knowledge. Our experiments with two prominent TOD tasks -- dialog state tracking (DST) and response retrieval (RR) -- encompassing five domains from the MultiWOZ TOD benchmark demonstrate the effectiveness of our domain specialization approach. Moreover, we show that the light-weight adapter-based specialization (1) performs comparably to full fine-tuning in single-domain setups and (2) is particularly suitable for multi-domain specialization, in which, besides advantageous computational footprint, it can offer better downstream performance. ","tags":["Domain Specialization","Conversational AI","NLP"],"title":"DS-TOD: Efficient Domain Specialization for Task Oriented Dialog","type":"publication"},{"authors":["Carolin Holtermann, Anne Lauscher, Simone Paolo Ponzetto"],"categories":[],"content":"","date":1649376000,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1649376000,"objectID":"a85dc7b1f28949d457eb563059b0d86a","permalink":"https://milanlproc.github.io/publication/2022-acl-argumentation/","publishdate":"2022-04-07T11:48:20+01:00","relpermalink":"/publication/2022-acl-argumentation/","section":"publication","summary":" Although much work in NLP has focused on measuring and mitigating stereotypical bias in semantic spaces, research addressing bias in computational argumentation is still in its infancy. In this paper, we address this research gap and conduct a thorough investigation of bias in argumentative language models. To this end, we introduce ABBA, a novel resource for bias measurement specifically tailored to argumentation. 
We employ our resource to assess the effect of argumentative fine-tuning and debiasing on the intrinsic bias found in transformer-based language models using a lightweight adapter-based approach that is more sustainable and parameter-efficient than full fine-tuning. Finally, we analyze the potential impact of language model debiasing on the performance in argument quality prediction, a downstream task of computational argumentation. Our results show that we are able to successfully and sustainably remove bias in general and argumentative language models while preserving (and sometimes improving) model performance in downstream tasks. We make all experimental code and data available at https://github.com/umanlp/FairArgumentativeLM.","tags":["Fairness","Computational Argumentation","NLP"],"title":"Fair and Argumentative Language Modeling for Computational Argumentation","type":"publication"},{"authors":["Giuseppe Attanasio","Debora Nozza","Dirk Hovy","Elena Baralis"],"categories":[],"content":"","date":1647216000,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1647216000,"objectID":"b9f7adfa2c36e4d8da97d41a76dcdbf8","permalink":"https://milanlproc.github.io/publication/2022-entropy-attention-regularization-bias/","publishdate":"2022-03-17T14:48:20+01:00","relpermalink":"/publication/2022-entropy-attention-regularization-bias/","section":"publication","summary":"Natural Language Processing (NLP) models risk overfitting to specific terms in the training data, thereby reducing their performance, fairness, and generalizability. E.g., neural hate speech detection models are strongly influenced by identity terms like gay, or women, resulting in false positives, severe unintended bias, and lower performance. Most mitigation techniques use lists of identity terms or samples from the target domain during training. However, this approach requires a-priori knowledge and introduces further bias if important terms are neglected. Instead, we propose a knowledge-free Entropy-based Attention Regularization (EAR) to discourage overfitting to training-specific terms. An additional objective function penalizes tokens with low self-attention entropy. We fine-tune BERT via EAR: the resulting model matches or exceeds state-of-the-art performance for hate speech classification and bias metrics on three benchmark corpora in English and Italian. EAR also reveals overfitting terms, i.e., terms most likely to induce bias, to help identify their effect on the model, task, and predictions.","tags":["Hate Speech","Bias","Entropy","Attention","Regularization","NLP"],"title":"Entropy-based Attention Regularization Frees Unintended Bias Mitigation from Lists","type":"publication"},{"authors":["Emily Dinan","Gavin Abercrombie","A. Stevie Bergman","Shannon Spruit","Dirk Hovy","Y-Lan Boureau","Verena Rieser"],"categories":[],"content":"","date":1647216000,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1647216000,"objectID":"75ad76dd011e594c70e4a27a81805308","permalink":"https://milanlproc.github.io/publication/2022-safetykit-first-aid-measuring-safety-open-domain-conversational-systems/","publishdate":"2022-03-17T14:48:20+01:00","relpermalink":"/publication/2022-safetykit-first-aid-measuring-safety-open-domain-conversational-systems/","section":"publication","summary":"The social impact of natural language processing and its applications has received increasing attention. In this position paper, we focus on the problem of safety for end-to-end conversational AI. 
We survey the problem landscape therein, introducing a taxonomy of three observed phenomena: the Instigator, Yea-Sayer, and Impostor effects. We then empirically assess the extent to which current tools can measure these effects and current systems display them. We release these tools as part of a “first aid kit” (SAFETYKIT) to quickly assess apparent safety concerns. Our results show that, while current tools are able to provide an estimate of the relative safety of systems in various settings, they still have several shortcomings. We suggest several future directions and discuss ethical considerations.","tags":["dialog","Bias","conversational AI","NLG","NLP"],"title":"SAFETYKIT: First Aid for Measuring Safety in Open-domain Conversational Systems","type":"publication"},{"authors":["Dirk Hovy"],"categories":[],"content":"","date":1642291200,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1642291200,"objectID":"eb70cdde35ea1df981863c7b83b5fa20","permalink":"https://milanlproc.github.io/publication/2022_nlpss2/","publishdate":"2020-02-29T14:48:20+01:00","relpermalink":"/publication/2022_nlpss2/","section":"publication","summary":"Text contains a wealth of information about a wide variety of sociocultural constructs. Automated prediction methods can infer these quantities (sentiment analysis is probably the most well-known application). However, there is virtually no limit to the kinds of things we can predict from text: power, trust, and misogyny are all signaled in language. These algorithms easily scale to corpus sizes infeasible for manual analysis. Prediction algorithms have become steadily more powerful, especially with the advent of neural network methods. However, applying these techniques usually requires profound programming knowledge and machine learning expertise. As a result, many social scientists do not apply them. This Element provides the working social scientist with an overview of the most common methods for text classification, an intuition of their applicability, and Python code to execute them. It covers both the ethical foundations of such work as well as the emerging potential of neural network methods.","tags":["text analysis","social science","NLP","Python","classification"],"title":"Text Analysis in Python for Social Scientists – Prediction and Classification","type":"publication"},{"authors":["Alexandra N Uma","Tommaso Fornaciari","Dirk Hovy","Silviu Paun","Barbara Plank","Massimo Poesio"],"categories":[],"content":"","date":1640563200,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1640563200,"objectID":"89043ca549ca8b57f7d851ddb9c709fd","permalink":"https://milanlproc.github.io/publication/2021-learning_from_disagreement_survey/","publishdate":"2021-12-27T14:48:20+01:00","relpermalink":"/publication/2021-learning_from_disagreement_survey/","section":"publication","summary":"Many tasks in Natural Language Processing (NLP) and Computer Vision (CV) offer evidence that humans disagree, from objective tasks such as part-of-speech tagging to more subjective tasks such as classifying an image or deciding whether a proposition follows from certain premises. While most learning in artificial intelligence (AI) still relies on the assumption that a single (gold) interpretation exists for each item, a growing body of research aims to develop learning methods that do not rely on this assumption. 
In this survey, we review the evidence for disagreements on NLP and CV tasks, focusing on tasks for which substantial datasets containing this information have been created. We discuss the most popular approaches to training models from datasets containing multiple judgments potentially in disagreement. We systematically compare these different approaches by training them with each of the available datasets, considering several ways to evaluate the resulting models. Finally, we discuss the results in depth, focusing on four key research questions, and assess how the type of evaluation and the characteristics of a dataset determine the answers to these questions. Our results suggest, first of all, that even if we abandon the assumption of a gold standard, it is still essential to reach a consensus on how to evaluate models. This is because the relative performance of the various training methods is critically affected by the chosen form of evaluation. Secondly, we observed a strong dataset effect. With substantial datasets, providing many judgments by high-quality coders for each item, training directly with soft labels achieved better results than training from aggregated or even gold labels. This result holds for both hard and soft evaluation. But when the above conditions do not hold, leveraging both gold and soft labels generally achieved the best results in the hard evaluation. All datasets and models employed in this paper are freely available as supplementary materials.","tags":["annotation","NLP","disagreement","agreement"],"title":"Learning from Disagreement: A Survey","type":"publication"},{"authors":["Dirk Hovy","Shrimai Prabhumoye"],"categories":[],"content":"","date":1628208000,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1628208000,"objectID":"2c3bf2de81d5732843e3bf56900eee2d","permalink":"https://milanlproc.github.io/publication/2021-five-sources-bias/","publishdate":"2021-05-06T01:41:26+01:00","relpermalink":"/publication/2021-five-sources-bias/","section":"publication","summary":"Recently, there has been an increased interest in demographically grounded bias in natural language processing (NLP) applications. Much of the recent work has focused on describing bias and providing an overview of bias in a larger context. Here, we provide a simple, actionable summary of this recent work. We outline five sources where bias can occur in NLP systems: (1) the data, (2) the annotation process, (3) the input representations, (4) the models, and finally (5) the research design (or how we conceptualize our research). We explore each of the bias sources in detail in this article, including examples and links to related work, as well as potential counter-measures.","tags":["Position Paper","Issues","NLP","bias"],"title":"Five sources of bias in natural language processing","type":"publication"},{"authors":["Federico Bianchi","Dirk Hovy"],"categories":[],"content":"","date":1628208000,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1628208000,"objectID":"ae8849cc748767803b8f7f12282a9f1e","permalink":"https://milanlproc.github.io/publication/2021-gap-between-understanding-adoption/","publishdate":"2021-05-06T01:41:26+01:00","relpermalink":"/publication/2021-gap-between-understanding-adoption/","section":"publication","summary":"There are some issues with current research trends in NLP that can hamper the free development of scientific research. 
We identify five of particular concern: 1) the early adoption of methods without sufficient understanding or analysis; 2) the preference for computational methods regardless of risks associated with their limitations; 3) the resulting bias in the papers we publish; 4) the impossibility of re-running some experiments due to their cost; 5) the dangers of unexplainable methods. If these issues are not addressed, we risk a loss of reproducibility, reputability, and subsequently public trust in our field. In this position paper, we outline each of these points and suggest ways forward.","tags":["Position Paper","Issues","NLP"],"title":"On the Gap between Adoption and Understanding in NLP","type":"publication"},{"authors":["Federico Bianchi","Silvia Terragni","Dirk Hovy"],"categories":[],"content":"","date":1628208000,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1628208000,"objectID":"e3b6d4f859c0f40521fc0ad7439aa5b6","permalink":"https://milanlproc.github.io/publication/2021-contextualized-improve-topic-models-coherence/","publishdate":"2021-05-06T01:41:26+01:00","relpermalink":"/publication/2021-contextualized-improve-topic-models-coherence/","section":"publication","summary":"Topic models extract groups of words from documents, whose interpretation as a topic hopefully allows for a better understanding of the data. However, the resulting word groups are often not coherent, making them harder to interpret. Recently, neural topic models have shown improvements in overall coherence. Concurrently, contextual embeddings have advanced the state of the art of neural models in general. In this paper, we combine contextualized BERT representations with neural topic models. We find that our approach produces more meaningful and coherent topics than traditional bag-of-word topic models and recent neural models. Our results indicate that future improvements in language models will translate into better topic models.","tags":["Topic Modeling","Coherence","NLP"],"title":"Pre-training is a Hot Topic: Contextualized Document Embeddings Improve Topic Coherence","type":"publication"},{"authors":["Debora Nozza"],"categories":[],"content":"","date":1627862400,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1627862400,"objectID":"21f8b8f0a896aaf439bea0d701f2ea7d","permalink":"https://milanlproc.github.io/publication/2021-zeroshot-crosslingual-hate-speech/","publishdate":"2021-05-06T14:48:20+01:00","relpermalink":"/publication/2021-zeroshot-crosslingual-hate-speech/","section":"publication","summary":"Reducing and counter-acting hate speech on Social Media is a significant concern. Most of the proposed automatic methods are conducted exclusively on English and very few consistently labeled, non-English resources have been proposed. Learning to detect hate speech on English and transferring to unseen languages seems an immediate solution. This work is the first to shed light on the limits of this zero-shot, cross-lingual transfer learning framework for hate speech detection. We use benchmark data sets in English, Italian, and Spanish to detect hate speech towards immigrants and women. Investigating post-hoc explanations of the model, we discover that non-hateful, language-specific taboo interjections are misinterpreted as signals of hate speech. 
Our findings demonstrate that zero-shot, cross-lingual models cannot be used as they are, but need to be carefully designed.","tags":["Hate Speech","BERT","NLP"],"title":"Exposing the limits of Zero-shot Cross-lingual Hate Speech Detection","type":"publication"},{"authors":["Tommaso Fornaciari","Dirk Hovy","Elin Naurin","Julia Runeson","Robert Thomson","Pankaj Adhikari"],"categories":[],"content":"","date":1627776000,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1627776000,"objectID":"c7b4b48cb786e0a2763d7f93ab0ed3f5","permalink":"https://milanlproc.github.io/publication/2021-aclfindings-mimac/","publishdate":"2021-05-06T01:41:26+01:00","relpermalink":"/publication/2021-aclfindings-mimac/","section":"publication","summary":"In an election campaign, political parties pledge to implement various projects--should they be elected. But do they follow through? To track election pledges from parties' election manifestos, we need to distinguish between pledges and general statements. In this paper, we use election manifestos of Swedish and Indian political parties to learn neural models that distinguish actual pledges from generic political positions. Since pledges might vary by election year and party, we implement a Multi-Task Learning (MTL) setup, predicting election year and manifesto's party as auxiliary tasks. Pledges can also span several sentences, so we use hierarchical models that incorporate contextual information. Lastly, we evaluate the models in a Zero-Shot Learning (ZSL) framework across countries and languages. Our results indicate that year and party have predictive power even in ZSL, while context introduces some noise. We finally discuss the linguistic features of pledges.","tags":["Election pledges","Zero-Shot Learning","NLP"],"title":"'We will Reduce Taxes' - Identifying Election Pledges with Language Models","type":"publication"},{"authors":["Debora Nozza","Federico Bianchi","Dirk Hovy"],"categories":[],"content":"","date":1622937600,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1622937600,"objectID":"3c6a04000acf7b38009176972f9bf596","permalink":"https://milanlproc.github.io/publication/2021-honest-hurtful-language-model/","publishdate":"2021-03-29T14:48:20+01:00","relpermalink":"/publication/2021-honest-hurtful-language-model/","section":"publication","summary":"Language models have revolutionized the field of NLP. However, language models capture and proliferate hurtful stereotypes, especially in text generation. Our results show that **4.3% of the time, language models complete a sentence with a hurtful word**. These cases are not random, but follow language and gender-specific patterns. We propose a score to measure hurtful sentence completions in language models (HONEST). It uses a systematic template- and lexicon-based bias evaluation methodology for six languages. Our findings suggest that these models replicate and amplify deep-seated societal stereotypes about gender roles. Sentence completions refer to sexual promiscuity when the target is female in 9% of the time, and in 4% to homosexuality when the target is male. 
The results raise questions about the use of these models in production settings.","tags":["Hate Speech","BERT","NLP"],"title":"HONEST: Measuring Hurtful Sentence Completion in Language Models","type":"publication"},{"authors":["Dirk Hovy","Diyi Yang"],"categories":[],"content":"","date":1622937600,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1622937600,"objectID":"9ebb2f8b0e03f6f8b03537d2944bf15e","permalink":"https://milanlproc.github.io/publication/2021-importance-modeling-social-factors-language/","publishdate":"2021-05-06T14:48:20+01:00","relpermalink":"/publication/2021-importance-modeling-social-factors-language/","section":"publication","summary":"Natural language processing (NLP) applications are now more powerful and ubiquitous than ever before. With rapidly developing (neural) models and ever-more available data, current NLP models have access to more information than any human speaker during their life. Still, it would be hard to argue that NLP models have reached human-level capacity. In this position paper, we argue that the reason for the current limitations is a focus on information content while ignoring language's social factors. We show that current NLP systems systematically break down when faced with interpreting the social factors of language. This limits applications to a subset of information-related tasks and prevents NLP from reaching human-level performance. At the same time, systems that incorporate even a minimum of social factors already show remarkable improvements. We formalize a taxonomy of seven social factors based on linguistic theory and exemplify current failures and emerging successes for each of them. We suggest that the NLP community address social factors to get closer to the goal of human-like language understanding. ","tags":["social factors","computational linguistics","NLP"],"title":"The Importance of Modeling Social Factors of Language: Theory and Practice","type":"publication"},{"authors":["Federico Bianchi","Ciro Greco","Jacopo Tagliabue"],"categories":null,"content":"","date":1622592000,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1622592000,"objectID":"b46929d332157b5fc41b82ef8d8c28dd","permalink":"https://milanlproc.github.io/publication/2021-language-in-a-search-box/","publishdate":"2021-03-02T00:00:00Z","relpermalink":"/publication/2021-language-in-a-search-box/","section":"publication","summary":"We investigate grounded language learning through real-world data, by modelling a teacher-learner dynamics through the natural interactions occurring between users and search engines.","tags":["NLP","Meaning","Linguistics","BERT","Embeddings","Language Models"],"title":"Language in a (Search) Box: Grounding Language Learning in Real-World Human-Machine Interaction","type":"publication"},{"authors":["Federico Bianchi","Debora Nozza","Dirk Hovy"],"categories":[],"content":"","date":1621123200,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1621123200,"objectID":"ede87cdeecd57ecdb6e7b020668ace0c","permalink":"https://milanlproc.github.io/publication/2021-feelit-italian-sentiment-emotion/","publishdate":"2021-03-28T14:48:20+01:00","relpermalink":"/publication/2021-feelit-italian-sentiment-emotion/","section":"publication","summary":"Sentiment analysis is a common task to understand people's reactions online. Still, we often need more nuanced information: is the post negative because the user is angry or because they are sad? An abundance of approaches has been introduced for tackling both tasks. 
However, at least for Italian, they all treat only one of the tasks at a time. We introduce FEEL-IT, a novel benchmark corpus of Italian Twitter posts annotated with four basic emotions: **anger**, **fear**, **joy**, **sadness**. By collapsing them, we can also do sentiment analysis. We evaluate our corpus on benchmark datasets for both emotion and sentiment classification, obtaining competitive results. We release an [open-source Python library](https://github.com/MilaNLProc/feel-it), so researchers can use a model trained on FEEL-IT for inferring both sentiments and emotions from Italian text.","tags":["Sentiment Analysis","Emotion Detection","Italian","BERT","NLP","dataset"],"title":"FEEL-IT: Emotion and Sentiment Classification for the Italian Language","type":"publication"},{"authors":["Tommaso Fornaciari","Federico Bianchi","Debora Nozza","Dirk Hovy"],"categories":[],"content":"","date":1621123200,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1621123200,"objectID":"3bc06910533e6771816b1c186a6e9ade","permalink":"https://milanlproc.github.io/publication/2021-wassa-emotion-multitask/","publishdate":"2021-03-27T14:48:20+01:00","relpermalink":"/publication/2021-wassa-emotion-multitask/","section":"publication","summary":"The paper describes the MilaNLP team’s submission (Bocconi University, Milan) in the WASSA 2021 Shared Task on Empathy Detection and Emotion Classification. We focus on Track 2 - Emotion Classification - which consists of predicting the emotion of reactions to English news stories at the essay-level. We test different models based on multi-task and multi-input frameworks. The goal was to better exploit all the correlated information given in the data set. We find, though, that empathy as an auxiliary task in multi-task learning and demographic attributes as additional input provide worse performance with respect to single-task learning. 
While the result is competitive in the context of the competition, our findings suggest that emotion and empathy are not related tasks - at least for the purpose of prediction.","tags":["Emotion Detection","BERT","NLP"],"title":"MilaNLP @ WASSA: Does BERT Feel Sad When You Cry?","type":"publication"},{"authors":null,"categories":null,"content":"We are delighted to announce that our group has four papers accepted at ACL-IJCNLP 2021 main conference and Findings of ACL!\n Title: Pre-training is a Hot Topic: Contextualized Document Embeddings Improve Topic Coherence\nAuthors: Federico Bianchi, Silvia Terragni, Dirk Hovy\n Title: Exposing the limits of Zero-shot Cross-lingual Hate Speech Detection\nAuthors: Debora Nozza\n Title: On the Gap between Adoption and Understanding in NLP\nAuthors: Federico Bianchi, Dirk Hovy\n Title: \u0026lsquo;We will Reduce Taxes\u0026rsquo; - Identifying Election Pledges with Language Models\nAuthors: Tommaso Fornaciari, Dirk Hovy, Elin Naurin, Julia Runeson, Robert Thomson, Pankaj Adhikari\n ","date":1620259200,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1620259200,"objectID":"49e0aca1b8b895a3f2d2861b558d2e0e","permalink":"https://milanlproc.github.io/post/2021-acl-acceptance/","publishdate":"2021-05-06T00:00:00Z","relpermalink":"/post/2021-acl-acceptance/","section":"post","summary":"We are delighted to announce that our group has four papers accepted at ACL-IJCNLP 2021 main conference and Findings of ACL!\n Title: Pre-training is a Hot Topic: Contextualized Document Embeddings Improve Topic Coherence","tags":null,"title":"Four papers accepted at ACL","type":"post"},{"authors":["Tommaso Fornaciari","Alexandra Uma","Silviu Paun","Barbara Plank","Dirk Hovy and Massimo Poesio"],"categories":[],"content":"","date":1620172800,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1620172800,"objectID":"23b4a371f95c37c5b13f4d6d76389ec6","permalink":"https://milanlproc.github.io/publication/2021-naacl-softlabels/","publishdate":"2021-05-06T01:41:26+01:00","relpermalink":"/publication/2021-naacl-softlabels/","section":"publication","summary":"Supervised learning assumes that a ground truth label exists. However, the reliability of this ground truth depends on human annotators, who often disagree. Prior work has shown that this disagreement can be helpful in training models. We propose a novel method to incorporate this disagreement as information: in addition to the standard error computation, we use soft labels (i.e., probability distributions over the annotator labels) as an auxiliary task in a multi-task neural network. We measure the divergence between the predictions and the target soft labels with several loss functions and evaluate the models on various NLP tasks. We find that the soft-label prediction auxiliary task reduces the penalty for errors on ambiguous entities and thereby mitigates overfitting. 
It significantly improves performance across tasks beyond the standard approach and prior work.","tags":["Soft-labels","Agreement","NLP"],"title":"Beyond Black \u0026 White: Leveraging Annotator Disagreement via Soft-Label Multi-Task Learning","type":"publication"},{"authors":["Sotiris Lamprinidis","Federico Bianchi","Daniel Hardt","Dirk Hovy"],"categories":[],"content":"","date":1618531200,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1618531200,"objectID":"97f586c2e6f028d5128baf1d6a1aaa97","permalink":"https://milanlproc.github.io/publication/2021-universal-joy/","publishdate":"2021-03-27T14:48:20+01:00","relpermalink":"/publication/2021-universal-joy/","section":"publication","summary":"While emotions are universal aspects of human psychology, they are expressed differently across different languages and cultures. We introduce a new data set of over 530k anonymized public Facebook posts across 18 languages, labeled with five different emotions. Using multilingual BERT embeddings, we show that emotions can be reliably inferred both within and across languages. Zero-shot learning produces promising results for low-resource languages. Following established theories of basic emotions, we provide a detailed analysis of the possibilities and limits of cross-lingual emotion classification. We find that structural and typological similarity between languages facilitates cross-lingual learning, as well as linguistic diversity of training data. Our results suggest that there are commonalities underlying the expression of emotion in different languages. We publicly release the anonymized data for future research.","tags":["Emotion Detection","BERT","NLP","data set"],"title":"Universal Joy A Data Set and Results for Classifying Emotions Across Languages","type":"publication"},{"authors":["Tommaso Fornaciari","Federico Bianchi","Dirk Hovy","Massimo Poesio"],"categories":[],"content":"","date":1617926400,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1617926400,"objectID":"3a875d0db3d135d98a7b5d26b50805ed","permalink":"https://milanlproc.github.io/publication/2021_eacl_decour/","publishdate":"2021-04-09T01:41:26+01:00","relpermalink":"/publication/2021_eacl_decour/","section":"publication","summary":"Spotting a lie is challenging but has an enormous potential impact on security as well as private and public safety. Several NLP methods have been proposed to classify texts as truthful or deceptive. In most cases, however, the target texts’ preceding context is not considered. This is a severe limitation, as any communication takes place in context, not in a vacuum, and context can help to detect deception. We study a corpus of Italian dialogues containing deceptive statements and implement deep neural models that incorporate various linguistic contexts. We establish a new state-of-the-art identifying deception and find that not all context is equally useful to the task. Only the texts closest to the target, if from the same speaker (rather than questions by an interlocutor), boost performance. We also find that the semantic information in language models such as BERT contributes to the performance. 
However, BERT alone does not capture the implicit knowledge of deception cues: its contribution is conditional on the concurrent use of attention to learn cues from BERT’s representations.","tags":["deception detection","dataset","NLP"],"title":"BERTective: Language Models and Contextual Information for Deception Detection","type":"publication"},{"authors":["Federico Bianchi","Silvia Terragni","Dirk Hovy","Debora Nozza","Elisabetta Fersini"],"categories":null,"content":"","date":1614556800,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1614556800,"objectID":"b4e481bdc36e151c5f7c537366aa81d6","permalink":"https://milanlproc.github.io/publication/2021-crosslingual-topic-model/","publishdate":"2021-03-01T00:00:00Z","relpermalink":"/publication/2021-crosslingual-topic-model/","section":"publication","summary":"We introduce a novel topic modeling method that can make use of contextualized embeddings (e.g., BERT) to do zero-shot cross-lingual topic modeling.","tags":["NLP","Topic Modeling","BERT","Language Models"],"title":"Cross-lingual Contextualized Topic Models with Zero-shot Learning","type":"publication"},{"authors":["Dirk Hovy"],"categories":[],"content":"","date":1608076800,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1608076800,"objectID":"5cb6c78d7dfa74acb0b3dc3d0145145c","permalink":"https://milanlproc.github.io/publication/2020_nlpss/","publishdate":"2020-02-29T14:48:20+01:00","relpermalink":"/publication/2020_nlpss/","section":"publication","summary":"Text is everywhere, and it is a fantastic resource for social scientists. However, because it is so abundant, and because language is so variable, it is often difficult to extract the information we want. There is a whole subfield of AI concerned with text analysis (natural language processing). Many of the basic analysis methods developed are now readily available as Python implementations. This Element will teach you when to use which method, the mathematical background of how it works, and the Python code to implement it.","tags":["text analysis","social science","NLP","Python"],"title":"Text Analysis in Python for Social Scientists – Discovery and Exploration","type":"publication"},{"authors":["Deven Santosh Shah","H. Andrew Schwartz","Dirk Hovy"],"categories":[],"content":"","date":1593561600,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1593561600,"objectID":"15d5ca711d194a2336fb3c44dc0ea869","permalink":"https://milanlproc.github.io/publication/2020_bias/","publishdate":"2020-02-29T14:48:20+01:00","relpermalink":"/publication/2020_bias/","section":"publication","summary":"An increasing number of natural language processing papers address the effect of bias on predictions, introducing mitigation techniques at different parts of the standard NLP pipeline (data and models). However, these works have been conducted individually, without a unifying framework to organize efforts within the field. This situation leads to repetitive approaches, and focuses overly on bias symptoms/effects, rather than on their origins, which could limit the development of effective countermeasures. In this paper, we propose a unifying predictive bias framework for NLP. We summarize the NLP literature and suggest general mathematical definitions of predictive bias. We differentiate two consequences of bias: outcome disparities and error disparities, as well as four potential origins of biases: label bias, selection bias, model overamplification, and semantic bias. 
Our framework serves as an overview of predictive bias in NLP, integrating existing work into a single structure, and providing a conceptual baseline for improved frameworks.","tags":["bias","ethics","NLP"],"title":"Predictive Biases in Natural Language Processing Models: A Conceptual Framework and Overview","type":"publication"},{"authors":["Dirk Hovy","Federico Bianchi","Tommaso Fornaciari"],"categories":[],"content":"","date":1593561600,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1593561600,"objectID":"38a575da835464063eb667eb335f436f","permalink":"https://milanlproc.github.io/publication/2020_mt/","publishdate":"2020-02-29T14:48:20+01:00","relpermalink":"/publication/2020_mt/","section":"publication","summary":"The main goal of machine translation has been to convey the correct content. Stylistic considerations have been at best secondary. We show that as a consequence, the output of three commercial machine translation systems (Bing, DeepL, Google) makes demographically diverse samples from five languages “sound” older and more male than the original. Our findings suggest that translation models reflect demographic bias in the training data. This opens up interesting new research avenues in machine translation to take stylistic considerations into account.","tags":["bias","ethics","machine translation","NLP"],"title":"“You Sound Just Like Your Father” Commercial Machine Translation Systems Include Stylistic Biases","type":"publication"},{"authors":["Dirk Hovy","Afshin Rahimi","Timothy Baldwin","Julian Brooke"],"categories":[],"content":"","date":1584835200,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1584835200,"objectID":"196496b369b51faaabbe3f66aa224c90","permalink":"https://milanlproc.github.io/publication/2020_eutwitter/","publishdate":"2020-02-29T14:48:20+01:00","relpermalink":"/publication/2020_eutwitter/","section":"publication","summary":"Geotagged Twitter data allows us to investigate correlations of geographic language variation, both at an interlingual and intralingual level. Based on data-driven studies of such relationships, this paper investigates regional variation of language usage on Twitter across Europe and compares it to traditional research of regional variation. This paper presents a novel method to process large amounts of data and to capture gradual differences in language variation. Visualizing the results by deterministically translating linguistic features into color hues presents a novel view of language variation across Europe, as it is reflected on Twitter. The technique is easy to apply to large amounts of data and provides a fast visual reference that can serve as input for further qualitative studies. The general applicability is demonstrated on a number of studies both across and within national languages. 
This paper also discusses the unique challenges of large-scale analysis and visualization, as well as the complementary nature of traditional qualitative and data-driven quantitative methods, and argues for their possible synthesis.","tags":["computational sociolinguistics","sociolinguistics","NLP","representation learning","embeddings"],"title":"Visualizing Regional Language Variation Across Europe on Twitter","type":"publication"},{"authors":["Farzana Rashid","Tommaso Fornaciari","Dirk Hovy","Eduardo Blanco","Fernando Vega-Redondo"],"categories":[],"content":"","date":1583020800,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1583020800,"objectID":"a7aa9d88e9614128120dc2a841ac10ab","permalink":"https://milanlproc.github.io/publication/2020_helpful/","publishdate":"2020-02-29T14:48:20+01:00","relpermalink":"/publication/2020_helpful/","section":"publication","summary":"When interacting with each other, we motivate, advise, inform, show love or power towards our peers. However, the way we interact may also hold some indication of how successful we are, as people often try to help each other to achieve their goals. We study the chat interactions of thousands of aspiring entrepreneurs who discuss and develop business models. We manually annotate a set of about 5,500 chat interactions with four dimensions of interaction styles (motivation, cooperation, equality, advice). We find that these styles can be reliably predicted, and that the communication styles can be used to predict a number of indices of business success. Our findings indicate that successful communicators are also successful in other domains.","tags":["conversation","style","communication","NLP"],"title":"Helpful or Hierarchical? Predicting the Communicative Strategies of Chat Participants, and their Impact on Success","type":"publication"},{"authors":["Debora Nozza","Federico Bianchi","Dirk Hovy"],"categories":[],"content":"","date":1583020800,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1583020800,"objectID":"39527c2d2966939ebd36a97e84e5382f","permalink":"https://milanlproc.github.io/publication/2020-bertlang-language-specific-bert/","publishdate":"2020-02-29T14:48:20+01:00","relpermalink":"/publication/2020-bertlang-language-specific-bert/","section":"publication","summary":"Recently, Natural Language Processing (NLP) has witnessed impressive progress in many areas, due to the advent of novel, pretrained contextual representation models. In particular, Devlin et al. (2019) proposed a model, called BERT (Bidirectional Encoder Representations from Transformers), which enables researchers to obtain state-of-the-art performance on numerous NLP tasks by fine-tuning the representations on their data set and task, without the need for developing and training highly-specific architectures. The authors also released multilingual BERT (mBERT), a model trained on a corpus of 104 languages, which can serve as a universal language model. This model obtained impressive results on a zero-shot cross-lingual natural language inference task. Driven by the potential of BERT models, the NLP community has started to investigate and generate an abundant number of BERT models that are trained on a particular language, and tested on a specific data domain and task. This allows us to evaluate the true potential of mBERT as a universal language model, by comparing it to the performance of these more specific models. 
This paper presents the current state of the art in language-specific BERT models, providing an overall picture with respect to different dimensions (i.e., architectures, data domains, and tasks). Our aim is to provide an immediate and straightforward overview of the commonalities and differences between language-specific BERT models and mBERT. We also provide an interactive and constantly updated website that can be used to explore the information we have collected, at [https://bertlang.unibocconi.it](https://bertlang.unibocconi.it/).","tags":["multilingual","BERT","representation learning","NLP"],"title":"What the [MASK]? Making Sense of Language-Specific BERT Models","type":"publication"},{"authors":null,"categories":["demographic"],"content":"Dirk Hovy, scientific director of DMI and Professor of computer science, has won an ERC Starting Grant of €1.5 million. His project INTEGRATOR, funded under grant agreement 949944, introduces demographic factors into language processing systems, which will improve algorithmic performance, avoid racism, sexism, and ageism, and open up new applications. What if I wrote that “winning an ERC Grant, Dirk Hovy got a sick result”? Those familiar with the use of “sick” as a synonym for “great” or “awesome” among teenagers would think that Bocconi Knowledge hired a very young writer (or someone posing as such). The rest would think I went crazy. Current artificial intelligence-based language systems wouldn’t have a clue. “Natural language processing (NLP) technologies,” Prof. Hovy says, “fail to account for demographics both in understanding language and in generating it. And this failure prevents us from reaching human-like performance. It limits possible future applications and it introduces systematic bias against underrepresented demographic groups”.\n🗞️🗞️ Related articles featured in Corriere Innovazione and Bocconi News.\n ","date":1580083200,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1580083200,"objectID":"d6b57eefc5eca031cd0bb3edb943a34f","permalink":"https://milanlproc.github.io/project/integrator/","publishdate":"2020-01-27T00:00:00Z","relpermalink":"/project/integrator/","section":"project","summary":"Incorporating Demographic Factors into Natural Language Processing Models","tags":["demographic","NLP"],"title":"INTEGRATOR","type":"project"},{"authors":null,"categories":["computational social science","political science","nlp"],"content":"In this interdisciplinary project, Dirk Hovy and Tommaso Fornaciari team up with an international team of political scientists (led by the University of Gothenburg) to develop mixed methods for analyzing political parties’ promises to voters during election campaigns. For democracy to function effectively, political parties must offer clear choices to voters during election campaigns. However, as parties’ communication with voters has become increasingly fragmented and targeted, it is much harder for citizens to keep track of what parties are promising. This threatens the quality of democratic representation. It also challenges established research methods for studying parties’ campaign promises. This project will develop new methods for studying parties’ promises in modern election campaigns. The project will integrate existing qualitative methods in political science and develop new research tools based on NLP. These AI-powered tools will enable researchers to examine parties’ campaign promises in large amounts of text and speech. 
The resulting research will be of significant benefit to citizens, who will receive greater clarity on the choices that parties are offering. These existing and new methods are highly relevant to research on text and speech in a wide range of social science fields. Until now, progress in this field has been stifled by limited dialogue among the proponents of different qualitative and quantitative methods. The project includes established experts on parties’ campaign promises, new media, qualitative and quantitative methods for analyzing political texts, and machine learning and natural language processing. The project is funded by the Swedish Riksbankens Jubileumsfond for 12M SEK.\n","date":1580083200,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1580083200,"objectID":"cc64e6761fb9bc89dcd828508f7b467b","permalink":"https://milanlproc.github.io/project/mimac/","publishdate":"2020-01-27T00:00:00Z","relpermalink":"/project/mimac/","section":"project","summary":"Mixed methods for analyzing political parties’ promises to voters during election campaigns","tags":["computational social science","political science","nlp"],"title":"MiMac","type":"project"},{"authors":null,"categories":null,"content":"PI: Debora Nozza\nco-PI: Dirk Hovy and Nicoletta Balbo\n The interdisciplinary MONICA project will create a digital barometer of Italians’ attitudes towards the government measures implemented in response to COVID-19. The pandemic has plunged millions of vulnerable people into abject poverty. The government created financial measures to improve the economic situation and social inclusion. However, it is unclear whether these measures reach those who need them most. To find out, we will uncover the public perception of these measures and provide concrete metrics for three related dimensions: 1) coverage of the potential beneficiaries, 2) attitudes of the Italian population stratified by different demographic factors, and 3) accessibility of the information. MONICA will provide citizens with a tool to automatically rank and simplify articles about requirements and steps to access these initiatives. MONICA will enable policymakers to understand which segments of the vulnerable population are not accessing these initiatives and why.\nSee this project featured in the news. 🗞️🗞️\n 🇮🇹 Italian version\nPer fronteggiare le ripercussioni economiche della pandemia, il governo italiano ha adottato diverse misure finanziarie volte ad arginare gli effetti della crisi, migliorando l’inclusione sociale delle persone che negli ultimi mesi si sono trovate in difficoltà. Tuttavia, tali misure hanno inaspettatamente riscosso una partecipazione inferiore alle aspettative e la loro efficacia, soprattutto in termini di raggiungimento dei soggetti più bisognosi, risulta difficilmente verificabile. MONICA, analizzando una grande mole di dati tramite tecniche di data science, fornirà delle metriche atte a valutare tali misure in termini di: 1) capacità di raggiungimento dei soggetti bisognosi, 2) sentiment dell’opinione pubblica, differenziata in base a fattori demografici, 3) accessibilità delle informazioni. 
MONICA, inoltre, automaticamente cercherà e creerà versioni semplificate di articoli e procedure inerenti alle misure, garantendo l’accessibilità di tali informazioni a tutti i cittadini.\n","date":1580083200,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1580083200,"objectID":"0c9656761b6df1422a95e6f71eb8ae37","permalink":"https://milanlproc.github.io/project/monitoring_italian_measures_response_covid19/","publishdate":"2020-01-27T00:00:00Z","relpermalink":"/project/monitoring_italian_measures_response_covid19/","section":"project","summary":"MONItoring Coverage, Attitudes and Accessibility of Italian measures in response to COVID-19","tags":["computational social sciences","nlp"],"title":"MONICA","type":"project"},{"authors":null,"categories":["political science","nlp","social media"],"content":"Echo chambers and online abuse are two significant problems affecting the health of conversations on social media. This interdisciplinary, multi-institutional project (led by George Washington University) helps Twitter tackle these issues by developing metrics and algorithms to measure various uncivil behaviors. Given the concerns about growing polarization and the spread of misinformation, our first two metrics, mutual recognition and diversity of perspectives, will help Twitter diagnose issues that arise when users isolate themselves from those who hold differing opinions. Mutual recognition measures whether and to what extent people on opposing sides of an issue acknowledge and engage with rival claims. When recognition occurs, a public sphere is established. When there is no recognition, echo chambers result. Diversity of perspectives measures the range of claims made on the platform, how likely users are to encounter (as opposed to engaging with) divergent and unfamiliar claims, and how polarized the debate is.\nOur second two metrics, incivility, and intolerance, will help Twitter identify and address abuse and targeted harassment. Incivility measures the presence of anti-normative intensity in conversation, including the use of profanity and vulgarity. However, recognizing that such anti-normative communication sometimes serves justifiable\u0026ndash;and in some cases, even beneficial\u0026ndash;ends, we distinguish this concept from intolerance. Targeted attacks on individuals or groups, particularly when carried out based on gender, sexuality, race, ethnicity, religion, or ability, threaten the fundamental democratic principles of equality and freedom.\nTo classify these measures at scale, we draw upon existing work in various computational fields, notably natural language processing and network analysis, but take this work further in addressing the metrics outlined here. Moreover, beyond merely detecting and measuring mutual recognition, diversity of perspectives, incivility, and intolerance, we propose to study the effects these four phenomena have on users. 
In doing so, we offer a theoretically and empirically driven approach that will help Twitter diagnose the conversation\u0026rsquo;s relative health on its platform.\n","date":1580083200,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1580083200,"objectID":"b687a82907bfa3278ec0305ce0470f43","permalink":"https://milanlproc.github.io/project/twitterhealth/","publishdate":"2020-01-27T00:00:00Z","relpermalink":"/project/twitterhealth/","section":"project","summary":"Devising Metrics for Assessing Echo Chambers, Incivility, and Intolerance on Twitter","tags":["social media","political science","nlp"],"title":"Twitter Healthy Conversations","type":"project"},{"authors":["Alexandra Uma","Tommaso Fornaciari","Dirk Hovy","Silviu Paun","Barbara Plank","Massimo Poesio"],"categories":[],"content":"","date":1577836800,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1577836800,"objectID":"a5fe34b8d82f43aca4cded8dbebd251c","permalink":"https://milanlproc.github.io/publication/2020_aaai_softlabels/","publishdate":"2020-01-01T00:00:00Z","relpermalink":"/publication/2020_aaai_softlabels/","section":"publication","summary":"Recently, Peterson et al. provided evidence of the benefits of using probabilistic soft labels generated from crowd annotations for training a computer vision model, showing that using such labels maximizes performance of the models over unseen data. In this paper, we generalize these results by showing that training with soft labels is an effective method for using crowd annotations in several other AI tasks besides the one studied by Peterson et al., and also when their performance is compared with that of state-of-the-art methods for learning from crowdsourced data. ","tags":["annotation","disagreement","loss function","NLP"],"title":"A Case for Soft Loss Functions","type":"publication"},{"authors":["Tommaso Fornaciari","Letitia Cagnina","Paolo Rosso","Massimo Poesio"],"categories":[],"content":"","date":1577836800,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1577836800,"objectID":"4d9157d4612c24b48becd4300cc37f9c","permalink":"https://milanlproc.github.io/publication/2020_lre/","publishdate":"2020-01-01T00:00:00Z","relpermalink":"/publication/2020_lre/","section":"publication","summary":"Identifying deceptive online reviews is a challenging task for Natural Language Processing (NLP). Collecting corpora for the task is difficult, because normally it is not possible to know whether reviews are genuine. A common workaround involves collecting (supposedly) truthful reviews online and adding them to a set of deceptive reviews obtained through crowdsourcing services. Models trained this way are generally successful at discriminating between ‘genuine’ online reviews and the crowdsourced deceptive reviews. It has been argued that the deceptive reviews obtained via crowdsourcing are very different from real fake reviews, but the claim has never been properly tested. In this paper, we compare (false) crowdsourced reviews with a set of ‘real’ fake reviews published online. We evaluate their degree of similarity and their usefulness in training models for the detection of untrustworthy reviews. We find that the deceptive reviews collected via crowdsourcing are significantly different from the fake reviews published online. 
In the case of the artificially produced deceptive texts, it turns out that their domain similarity with the targets affects the models’ performance, much more than their","tags":["dataset","deception detection","NLP"],"title":"Fake opinion detection: how similar are crowdsourced datasets to real data?","type":"publication"},{"authors":["Tommaso Fornaciari","Dirk Hovy"],"categories":[],"content":"","date":1572739200,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1572739200,"objectID":"e6f7d976c82c2e5990707d0efc2d07b8","permalink":"https://milanlproc.github.io/publication/2019_m2v/","publishdate":"2019-10-31T01:35:54+01:00","relpermalink":"/publication/2019_m2v/","section":"publication","summary":"Prior research has shown that geolocation can be substantially improved by including user network information. While effective, it suffers from the curse of dimensionality, since networks are usually represented as sparse adjacency matrices of connections, which grow exponentially with the number of users. In order to incorporate this information, we therefore need to limit the network size, in turn limiting performance and risking sample bias. In this paper, we address these limitations by instead using dense network representations. We explore two methods to learn continuous node representations from either 1) the network structure with node2vec (Grover and Leskovec, 2016), or 2) textual user mentions via doc2vec (Le and Mikolov, 2014). We combine both methods with input from social media posts in an attention-based convolutional neural network and evaluate the contribution of each component on geolocation performance. Our method enables us to incorporate arbitrarily large networks in a fixed-length vector, without limiting the network size. Our models achieve competitive results with similar state-of-the-art methods, but with much fewer model parameters, while being applicable to networks of virtually any size. ","tags":["geolocation","representation learning","NLP"],"title":"Dense Node Representation for Geolocation","type":"publication"},{"authors":["Tommaso Fornaciari","Dirk Hovy"],"categories":[],"content":"","date":1572739200,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1572739200,"objectID":"c99ae1a9dbc181af0fec07ee706ff45e","permalink":"https://milanlproc.github.io/publication/2019_geo_mtl/","publishdate":"2019-10-31T01:25:36+01:00","relpermalink":"/publication/2019_geo_mtl/","section":"publication","summary":"Geolocation, predicting the location of a post based on text and other information, has a huge potential for several social media applications. Typically, the problem is modeled as either multi-class classification or regression. In the first case, the classes are geographic areas previously identified; in the second, the models directly predict geographic coordinates. The former requires discretization of the coordinates, but yields better performance. The latter is potentially more precise and true to the nature of the problem, but often results in worse performance. We propose to combine the two approaches in an attention-based multitask convolutional neural network that jointly predicts both discrete locations and continuous geographic coordinates. We evaluate the multi-task (MTL) model against single-task models and prior work. 
We find that MTL significantly improves performance, reporting large gains on one data set, but also note that the correlation between labels and coordinates has a marked impact on the effectiveness of including a regression task.","tags":["geolocation","multitask learning","NLP"],"title":"Geolocation with Attention-Based Multitask Learning Models","type":"publication"},{"authors":["Hanh Nguyen","Dirk Hovy"],"categories":[],"content":"","date":1572739200,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1572739200,"objectID":"da268d55d2d3daa3188db963df72f794","permalink":"https://milanlproc.github.io/publication/2019_siri/","publishdate":"2019-10-31T01:35:54+01:00","relpermalink":"/publication/2019_siri/","section":"publication","summary":"User reviews provide a significant source of information for companies to understand their market and audience. In order to discover broad trends in this source, researchers have typically used topic models such as Latent Dirichlet Allocation (LDA). However, while there are metrics to choose the “best” number of topics, it is not clear whether the resulting topics can also provide in-depth, actionable product analysis. Our paper examines this issue by analyzing user reviews from the Best Buy US website for smart speakers. Using coherence scores to choose topics, we test whether the results help us to understand user interests and concerns. We find that while coherence scores are a good starting point to identify a number of topics, it still requires manual adaptation based on domain knowledge to provide market insights. We show that the resulting dimensions capture brand performance and differences, and differentiate the market into two distinct groups with different properties.","tags":["NLP","smart speakers","topic modeling"],"title":"Hey Siri. Ok Google. Alexa: A topic modeling of user reviews for smart speakers","type":"publication"},{"authors":["Tommaso Fornaciari","Dirk Hovy"],"categories":[],"content":"","date":1572739200,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1572739200,"objectID":"f03e8b5dae24f99c3b8d42cc770514c9","permalink":"https://milanlproc.github.io/publication/2019_p2c/","publishdate":"2019-10-31T01:38:23+01:00","relpermalink":"/publication/2019_p2c/","section":"publication","summary":" Geolocating social media posts relies on the assumption that language carries sufficient geographic information. However, locations are usually given as continuous latitude/longitude tuples, so we first need to define discrete geographic regions that can serve as labels. Most studies use some form of clustering to discretize the continuous coordinates (Han et al., 2016). However, the resulting regions do not always correspond to existing linguistic areas. Consequently, accuracy at 100 miles tends to be good, but degrades for finer-grained distinctions, when different linguistic regions get lumped together. We describe a new algorithm, Point-to-City (P2C), an iterative k-d tree-based method for clustering geographic coordinates and associating them with towns. We create three sets of labels at different levels of granularity, and compare performance of a state-of-the-art geolocation model trained and tested with P2C labels to one with regular k-d tree labels. Even though P2C results in substantially more labels than the baseline, model accuracy increases significantly over using traditional labels at the fine-grained level, while staying comparable at 100 miles. 
The results suggest that identifying meaningful linguistic areas is crucial for improving geolocation at a fine-grained level.","tags":["geolocation","NLP","clustering"],"title":"Identifying Linguistic Areas for Geolocation","type":"publication"},{"authors":["Aparna Garimella","Carmen Banea","Dirk Hovy","Rada Mihalcea"],"categories":[],"content":"","date":1562112000,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1562112000,"objectID":"5dec16fd019afc36d15e76cab7ab6ea4","permalink":"https://milanlproc.github.io/publication/2019-gender-bias-part-of-speech-tagging-dependency-parsing/","publishdate":"2019-10-31T01:35:54+01:00","relpermalink":"/publication/2019-gender-bias-part-of-speech-tagging-dependency-parsing/","section":"publication","summary":"Several linguistic studies have shown the prevalence of various lexical and grammatical patterns in texts authored by a person of a particular gender, but models for part-of-speech tagging and dependency parsing have still not adapted to account for these differences. To address this, we annotate the Wall Street Journal part of the Penn Treebank with the gender information of the articles’ authors, and build taggers and parsers trained on this data that show performance differences in text written by men and women. Further analyses reveal numerous part-of-speech tags and syntactic relations whose prediction performances benefit from the prevalence of a specific gender in the training data. The results underscore the importance of accounting for gendered differences in syntactic tasks, and outline future avenues for developing more accurate taggers and parsers. We release our data to the research community.","tags":["pos tagging","parsing","NLP","bias"],"title":"Women’s Syntactic Resilience and Men’s Grammatical Luck: Gender-Bias in Part-of-Speech Tagging and Dependency Parsing","type":"publication"},{"authors":null,"categories":null,"content":"","date":1546300800,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1546300800,"objectID":"6d99026b9e19e4fa43d5aadf147c7176","permalink":"https://milanlproc.github.io/contact/","publishdate":"2019-01-01T00:00:00Z","relpermalink":"/contact/","section":"","summary":"A little more about me and how to get in touch","tags":null,"title":"About / Contact","type":"widget_page"},{"authors":null,"categories":null,"content":"","date":1546300800,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1546300800,"objectID":"be566fdb6f0fa08cfea50d77a89a6b5a","permalink":"https://milanlproc.github.io/data/","publishdate":"2019-01-01T00:00:00Z","relpermalink":"/data/","section":"","summary":"","tags":null,"title":"How to participate","type":"widget_page"},{"authors":null,"categories":null,"content":"","date":1546300800,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1546300800,"objectID":"8795eba9bd19b87b0616d17da3c16590","permalink":"https://milanlproc.github.io/join/","publishdate":"2019-01-01T00:00:00Z","relpermalink":"/join/","section":"","summary":"Open Positions @ MilaNLP lab Bocconi University","tags":null,"title":"Join","type":"widget_page"},{"authors":["Fernando Vega-Redondo","Paolo Pin","Diego Ubfal","Cristiana Benedetti-Fasil","Charles Brummitt","Gaia Rubera","Dirk Hovy","Tommaso 
Fornaciari"],"categories":[],"content":"","date":1546300800,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1546300800,"objectID":"12bbc46c9ef5b1283e70d7fd9cdc0c89","permalink":"https://milanlproc.github.io/publication/2019_adansonia/","publishdate":"2019-01-01T00:00:00Z","relpermalink":"/publication/2019_adansonia/","section":"publication","summary":"Can large-scale peer interaction foster entrepreneurship and innovation? We conducted an RCT involving almost 5,000 entrepreneurs from 49 African countries. All were enrolled in an online business course, and the treatment involved random assignment to either face-to-face or virtual (Internet-mediated) interaction. We find positive treatment effects on both the submission of business plans and their quality, provided interaction displays some intermediate diversity. Network effects are also significant on both outcomes, although diversity plays a different role for each. This shows that effective peer interaction can be feasibly implemented quite broadly but must also be designed carefully, in view of the pursued objectives.","tags":["social science","economics","text analysis"],"title":"Peer networks and entrepreneurship: A Pan-African RCT","type":"publication"},{"authors":null,"categories":null,"content":"","date":1546300800,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1546300800,"objectID":"6087c0ef875554f4409ac52928d79279","permalink":"https://milanlproc.github.io/projects/","publishdate":"2019-01-01T00:00:00Z","relpermalink":"/projects/","section":"","summary":"See some of the projects I have worked on","tags":null,"title":"Projects","type":"widget_page"},{"authors":["Dirk Hovy","Tommaso Fornaciari"],"categories":[],"content":"","date":1541203200,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1541203200,"objectID":"ff3dfca9f7c85f2d365ce66a6da7f88f","permalink":"https://milanlproc.github.io/publication/2018_emnlp_retro/","publishdate":"2019-10-31T01:41:26+01:00","relpermalink":"/publication/2018_emnlp_retro/","section":"publication","summary":"","tags":[],"title":"Increasing In-Class Similarity by Retrofitting Embeddings with Demographic Information","type":"publication"},{"authors":["Dirk Hovy","Christoph Purschke"],"categories":[],"content":"","date":1540166400,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1540166400,"objectID":"66a0f211e4d88095c64e9e6679ad64a7","permalink":"https://milanlproc.github.io/publication/2018-capturing-regional-variation-distributed-representations-geographic-retrofitting/","publishdate":"2020-02-29T14:48:20+01:00","relpermalink":"/publication/2018-capturing-regional-variation-distributed-representations-geographic-retrofitting/","section":"publication","summary":"Dialects are one of the main drivers of language variation, a major challenge for natural language processing tools. In most languages, dialects exist along a continuum, and are commonly discretized by combining the extent of several preselected linguistic variables. However, the selection of these variables is theory-driven and itself insensitive to change. We use Doc2Vec on a corpus of 16.8M anonymous online posts in the German-speaking area to learn continuous document representations of cities. These representations capture continuous regional linguistic distinctions, and can serve as input to downstream NLP tasks sensitive to regional variation. 
By incorporating geographic information via retrofitting and agglomerative clustering with structure, we recover dialect areas at various levels of granularity. Evaluating these clusters against an existing dialect map, we achieve a match of up to 0.77 V-score (harmonic mean of cluster completeness and homogeneity). Our results show that representation learning with retrofitting offers a robust general method to automatically expose dialectal differences and regional variation at a finer granularity than was previously possible.","tags":["computational sociolinguistics","sociolinguistics","NLP","representation learning","embeddings","retrofitting"],"title":"Capturing Regional Variation with Distributed Place Representations and Geographic Retrofitting","type":"publication"},{"authors":["Silviu Paun","Bob Carpenter","Jon Chamberlain","Dirk Hovy","Udo Kruschwitz","Massimo Poesio"],"categories":[],"content":"","date":1540166400,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1540166400,"objectID":"ac97468b9625b50fc42dc47858c6d8d2","permalink":"https://milanlproc.github.io/publication/2018-comparing-bayesian-models-annotation/","publishdate":"2020-02-29T14:48:20+01:00","relpermalink":"/publication/2018-comparing-bayesian-models-annotation/","section":"publication","summary":"The analysis of crowdsourced annotations in natural language processing is concerned with identifying (1) gold standard labels, (2) annotator accuracies and biases, and (3) item difficulties and error patterns. Traditionally, majority voting was used for (1), and coefficients of agreement for (2) and (3). Lately, model-based analyses of corpus annotations have proven better at all three tasks. But there has been relatively little work comparing them on the same datasets. This paper aims to fill this gap by analyzing six models of annotation, covering different approaches to annotator ability, item difficulty, and parameter pooling (tying) across annotators and items. We evaluate these models along four aspects: comparison to gold labels, predictive accuracy for new annotations, annotator characterization, and item difficulty, using four datasets with varying degrees of noise in the form of random (spammy) annotators. We conclude with guidelines for model selection, application, and implementation.","tags":["NLP","annotation","generative models","disagreement"],"title":"Comparing Bayesian Models of Annotation","type":"publication"},{"authors":["Sotiris Lamprinidis","Daniel Hardt","Dirk Hovy"],"categories":[],"content":"","date":1540166400,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1540166400,"objectID":"376dee532427df70e4cf017de36dccec","permalink":"https://milanlproc.github.io/publication/2018-predicting-news-headline-popularity-syntactic-semantic-knowledge-multitask-learning/","publishdate":"2020-02-29T14:48:20+01:00","relpermalink":"/publication/2018-predicting-news-headline-popularity-syntactic-semantic-knowledge-multitask-learning/","section":"publication","summary":"Newspapers need to attract readers with headlines, anticipating their readers’ preferences. These preferences rely on topical, structural, and lexical factors. We model each of these factors in a multi-task GRU network to predict headline popularity. We find that pre-trained word embeddings provide significant improvements over untrained embeddings, as does the combination of two auxiliary tasks, news-section prediction and part-of-speech tagging. 
However, we also find that performance is very similar to that of a simple Logistic Regression model over character n-grams. Feature analysis reveals structural patterns of headline popularity, including the use of forward-looking deictic expressions and second person pronouns.","tags":["NLP","multitask learning","text classification"],"title":"Predicting News Headline Popularity with Syntactic and Semantic Knowledge Using Multi-Task Learning","type":"publication"},{"authors":null,"categories":null,"content":" \u0026ldquo;Have a negroni. Have two. Be open to a world where you may not understand or agree with the person next to you, but have a drink with them anyways.\u0026rdquo; \u0026ndash;Anthony Bourdain\n The Coding Aperitivo is our take on a weekly seminar series. We end the working week and wind down with some relaxed academic chatter, a drink and some snacks.\nFormat We usually host external speakers on Fridays at 4pm Milan time. Talks are mostly virtual, and sometimes in person. We encourage our guests to try different formats with us such as guided discussions, hands-on activities, debates or just a nice academic chat. Got some research ideas and want a sounding board? We are very happy to discuss ongoing or upcoming research.\nPast Guests 2021 Emily Sheng: \u0026ldquo;Biases in NLG and Dialogue Systems\u0026rdquo; Nedjma Ousidhoum: \u0026ldquo;Expectations vs. Reality when Working on Toxic Content Detection in NLP\u0026rdquo; Nils Reimers: \u0026ldquo;Training State-of-the-Art Text Embedding \u0026amp; Neural Search Models\u0026rdquo; Maarten Sap: \u0026ldquo;Detecting and Rewriting Socially Biased Language\u0026rdquo; Sunipa Dev: \u0026ldquo;Towards Interpretable, Fair and Socially-Aware Language Representations\u0026rdquo; Alba Curry: \u0026ldquo;Philosophy of Emotion and Sentiment Detection\u0026rdquo; Rob van der Goot: \u0026ldquo;Multi-lingual and Multi-task learning: from Dataset Creation to Modeling\u0026rdquo; Su Lin Blodgett: \u0026ldquo;Social and Ethical Implications of NLP Technologies\u0026rdquo; Gabriele Sarti: \u0026ldquo;Interpreting Neural Language Models for Linguistic Complexity Assessment\u0026rdquo; Paul Röttger: \u0026ldquo;Two Contrasting Data Annotation Paradigms for Subjective NLP Tasks\u0026rdquo; Chia-Chien Hung: \u0026ldquo;Multi-domain and Multilingual Dialog\u0026rdquo; Anna Wegmann: \u0026ldquo;Does It Capture STEL? A Modular, Similarity-based Linguistic Style Evaluation Framework\u0026rdquo; Abhilasha Ravichander: \u0026ldquo;Probing the Probing Paradigm: Does Probing Accuracy Entail Task Relevance?\u0026rdquo; Samson Tan (AWS AI Research \u0026amp; Education): \u0026ldquo;Towards Sociolinguistically-Inclusive NLP: An Adversarial Approach\u0026rdquo; 2022 Christine de Kock: \u0026ldquo;I Beg to Differ: A study of constructive disagreement in online conversations\u0026rdquo; Eliana Pastor: \u0026ldquo;Pattern-based algorithms for Explainable AI\u0026rdquo; Dave Howcroft: \u0026ldquo;Low-Resource NLG\u0026rdquo; Zeerak Talat: \u0026ldquo;Ethics and Bias\u0026rdquo; Christopher Klamm: \u0026ldquo;Defining and Measuring Polarisation Across Disciplines\u0026rdquo; Swabha Swayamdipta: \u0026ldquo;Annotation Challenges in NLP\u0026rdquo; Carlo Schwarz: \u0026ldquo;How Polarized are Citizens? 
Measuring Ideology from the Ground-Up\u0026rdquo; Lorenzo Bertolini: \u0026ldquo;Testing Language Models on Compositionality\u0026rdquo; Alessandro Raganato Mark Dingemanse and Andreas Liesenfeld: \u0026ldquo;Language Diversity in Conversational AI Research\u0026rdquo; Agostina Calabrese: \u0026ldquo;If Data Patterns is the Answer, What was the Question?\u0026rdquo; Aida Mostafazadeh: \u0026ldquo;Incorporating annotators' psychological profiles into modeling language classification tasks\u0026rdquo; Myrthe Reuver: \u0026ldquo;Viewpoint diversity in news recommendation: Theories, Models, and Tasks to support democracy\u0026rdquo; Tommaso Caselli: \u0026ldquo;Language Resources to Monitor Abusive Language in Dutch\u0026rdquo; Valentin Hoffman: \u0026ldquo;Semantic Diffusion: Deep Learning Sense of network\u0026rdquo; Beatrice Savoldi: \u0026ldquo;Designing a course for Ethics in NLP\u0026rdquo; Hannah Rose Kirk: \u0026ldquo;Bias harms and mitigation\u0026rdquo; Juan Manuel Perez: \u0026ldquo;Assessing the impact of contextual information in hate speech detection\u0026rdquo; Daryna Dementieva: \u0026ldquo;Text detoxification\u0026rdquo; Fabio Tollon: \u0026ldquo;From designed properties to possibilities for action\u0026rdquo; Ryan Cotterell: \u0026ldquo;Some Thoughts on Compositionality\u0026rdquo; William Agnew: \u0026ldquo;Values, Ethics and NLP\u0026rdquo; Rami Aly: \u0026ldquo;Automatic fact checking\u0026rdquo; Indira Sen: \u0026ldquo;Measuring social constructs with NLP: Two case studies of abusive language and workplace depression\u0026rdquo; 2023 Maurice Jakesch: \u0026ldquo;Assessing the Effects and Risks of Large Language Models in AI-Mediated Communication\u0026rdquo; Marco del Tredici: \u0026ldquo;Current trends in NLP\u0026rdquo; Fatma Elsafoury: \u0026ldquo;Hate Speech and Toxicity\u0026rdquo; Mor Geva: \u0026ldquo;Annotation bias sources and prevention\u0026rdquo; Emanuele Bugliarello: \u0026ldquo;Language modelling as pixels\u0026rdquo; Tess Buckley: \u0026ldquo;Computational creativity and the ethics of AI-generated music\u0026rdquo; Marina Rizzi: \u0026ldquo;Self-regulation and the Evolution of Content: A Cross-Platform Analysis\u0026rdquo; Giovanni Cassani, Marco Bragoni, and Paul Schreiber: \u0026ldquo;Multimodal Representations for Words that Don’t Exist Yet\u0026rdquo; Laura Vasquez-Rodriguez: \u0026ldquo;Introduction to text simplification with NLP\u0026rdquo; Raj Ammanabrolu: \u0026ldquo;Interactive Language Learning\u0026rdquo; Suchin Gururangan: \u0026ldquo;All things language models, open-sourcing and regulation\u0026rdquo; Giada Pistilli: \u0026ldquo;Ethics in NLP\u0026rdquo; Edoardo Ponti: \u0026ldquo;Modular Deep Learning\u0026rdquo; Julie-Anne Meaney: \u0026ldquo;Demographically-aware Computational Humour\u0026rdquo; Giorgio Franceschelli: \u0026ldquo;Creativity and machine learning\u0026rdquo; Aubrie Amstutz: \u0026ldquo;Managing toxicity and hate speech in the private sector\u0026rdquo; Tom McCoy: \u0026ldquo;Embers of Autoregression: Understanding Large Language Models Through the Problem They are Trained to Solve\u0026rdquo; Camilo Carvajal Reyes: \u0026ldquo;EthicApp: analysing and understanding how people debate ethical issues\u0026rdquo; Tanvi Dinkar: \u0026ldquo;Safety and robustness in conversational AI\u0026rdquo; 2024 Emanuele La Malfa: \u0026ldquo;Code Simulation Challenges for Large Language Models\u0026rdquo; Enrico Liscio: \u0026ldquo;Context-Specific Value Inference via Hybrid Intelligence\u0026rdquo; Eve Fleisig: \u0026ldquo;When the Majority is 
Wrong: Modeling Annotator Disagreement for Language Tasks\u0026rdquo; Vishakh Padmakumar: \u0026ldquo;Does Writing with Language Models Reduce Content Diversity?\u0026rdquo; Enrico Bertino: \u0026ldquo;AI at a Milanese Chatbot Start-Up\u0026rdquo; Fangru Lin: \u0026ldquo;Graph-enhanced Large Language Models in Asynchronous Plan Reasoning\u0026rdquo; Xuhui Zhou: \u0026ldquo;Towards Socially Aware and Interactional NLP Systems\u0026rdquo; Minje Choi: \u0026ldquo;Towards Evaluating and Measuring the Social Capabilities of Large Language Models\u0026rdquo; Sachin Kumar: \u0026ldquo;Adapting Language Models to Improve Reliability: Experiments with Refusals and Diverse Preference Modeling\u0026rdquo; Nino Scherrer: \u0026ldquo;Evaluating (Moral) Beliefs Encoded in LLMs\u0026rdquo; Mary Sanford: \u0026ldquo;Political Discourse on Climate Change in EU Party Manifestos: A Computational Text Analysis Approach\u0026rdquo; Anna Rogers, Faeze Brahman and Elman Mansimov: Workshop on LLMs in Research and Industry Eugenia Stamboliev: \u0026ldquo;Can we Explain AI? On the Pitfalls of XAI\u0026rdquo; Maria Antoniak: \u0026ldquo;Computational Approaches to Narratives\u0026rdquo; Lucy Li: \u0026ldquo;AboutMe: Using Self-Descriptions in Webpages to Document the Effects of English Pretraining Data Filters\u0026rdquo; Jasmijn Bastings: \u0026ldquo;Bits, Bats \u0026amp; Bots: Deconstructing Gender in Language Technology\u0026rdquo; Fatma Elsafoury: \u0026ldquo;On the Sources of Bias in NLP Models: Origin, Impact, Mitigation, and the Ways Forward\u0026rdquo; Caleb Ziems: \u0026ldquo;How to Use Large Language Models for Computational Social Science\u0026rdquo; Rose Wang: \u0026ldquo;Scaling Expertise via Language Models with Applications to Education\u0026rdquo; Amin al Hazwani: \u0026ldquo;Collaborating to Create a Language-Independent Encyclopedia\u0026rdquo; Luna De Bruyne: \u0026ldquo;Emotions without Borders: Challenges in Multilingual Emotion Detection\u0026rdquo; Dirk\u0026rsquo;s Drinks When in Milan, drink as the Milanese. There are many excellent drink options, but they all start with a bitter and a red vermouth. The big names here are Campari and Martini, but there are plenty of other options worth exploring. Though the official recipes call for equal parts bitter and red vermouth, here we opt for a punchier taste, heavier on the bitter.\nBase: 3 parts bitter 2 parts red vermouth Options: You can now take this into several directions, by adding different mixers:\n 3 parts sparkling water (or fill it up) will get you an Americano (not to be confused with the coffee drink of the same name) For an interesting and refreshing twist, try tonic water instead of sparkling 3 parts prosecco get you a negroni sbagliato (the \u0026ldquo;messed up negroni\u0026rdquo;) 3 parts gin get you the original negroni 3 parts bourbon get you a boulevardier Pour the ingredients into a mixing glass with some ice and stir until the glass feels very cold. Strain into a glass with a large ice cube (the larger the better: it will melt more slowly) and a twist of orange or lemon peel (and rub the glass rim with it). 
Enjoy!\n","date":1530144000,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1530144000,"objectID":"8f31c223c05e11c1ed02d06c6cc14b7b","permalink":"https://milanlproc.github.io/coding_aperitivo/","publishdate":"2018-06-28T00:00:00Z","relpermalink":"/coding_aperitivo/","section":"","summary":"Here we describe how to add a page to your site.","tags":null,"title":"Coding Aperitivo","type":"page"},{"authors":null,"categories":null,"content":"The Reading Group is our weekly meeting to present and discuss exciting contributions from the community.\nIt currently takes place every Thursday at 12:00 PM (Milan). For more info, feel free to reach out.\nUpcoming Program Date Presenter Paper Nov-21 Yujie Does Fine-Tuning LLMs on New Knowledge Encourage Hallucinations? Nov-28 Roberta Dec-5 Flor Why AI Is WEIRD and Should Not Be This Way: Towards AI For Everyone, With Everyone, By Everyone Dec-12 Dirk Dec-19 Ariana Dec-26 Debora ","date":1530144000,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1530144000,"objectID":"b5c4438decf60aef693d14fa16033443","permalink":"https://milanlproc.github.io/reading_group/","publishdate":"2018-06-28T00:00:00Z","relpermalink":"/reading_group/","section":"","summary":"Papers and program of the MilaNLP Reading Group","tags":null,"title":"Reading Group","type":"page"},{"authors":["Dirk Hovy"],"categories":[],"content":"","date":1529625600,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1529625600,"objectID":"f18513a3f10ae12b8977d1caf7da095f","permalink":"https://milanlproc.github.io/publication/2018-social-neural-network/","publishdate":"2020-02-29T14:48:20+01:00","relpermalink":"/publication/2018-social-neural-network/","section":"publication","summary":"Over the years, natural language processing has increasingly focused on tasks that can be solved by statistical models, but ignored the social aspects of language. These limitations are in large part due to historically available data and the limitations of the models, but have narrowed our focus and biased the tools demographically. However, with the increased availability of data sets including socio-demographic information and more expressive (neural) models, we have the opportunity to address both issues. I argue that this combination can broaden the focus of NLP to solve a whole new range of tasks, enable us to generate novel linguistic insights, and provide fairer tools for everyone.","tags":["NLP","computational sociolinguistics","retrofitting","representation learning"],"title":"The Social and the Neural Network: How to Make Natural Language Processing about People again","type":"publication"}]
\ No newline at end of file
+[{"authors":["hovy_dirk"],"categories":null,"content":"Dirk Hovy is a Full Professor in the Computing Sciences Department of Bocconi University, and the scientific director of the Data and Marketing Insights research unit. Previously, he was faculty at the University of Copenhagen, got a PhD from USC\u0026rsquo;s Information Sciences Institute, and a linguistics master\u0026rsquo;s in Germany.\nDirk is interested in the interaction between language, society, and machine learning, or what language can tell us about society, and what computers can tell us about language. He is also interested in ethical questions of bias and algorithmic fairness in machine learning.\nHe has authored over 150 articles on these topics, including 3 best and one outstanding paper awards, and published two textbooks on NLP in Python for social scientists.\nDirk has co-founded and organized several workshops (on computational social science, and ethics in NLP), and was a local organizer for the EMNLP 2017 conference. He was awarded an ERC Starting Grant project 2020 for research on demographic bias in NLP.\nOutside of work, Dirk enjoys cooking, leather-crafting, and picking up heavy things just to put them back down.\n","date":-62135596800,"expirydate":-62135596800,"kind":"term","lang":"en","lastmod":-62135596800,"objectID":"2162c73950c747d1a2acd1061edae370","permalink":"https://milanlproc.github.io/authors/1_dirk_hovy/","publishdate":"0001-01-01T00:00:00Z","relpermalink":"/authors/1_dirk_hovy/","section":"authors","summary":"Dirk Hovy is a Full Professor in the Computing Sciences Department of Bocconi University, and the scientific director of the Data and Marketing Insights research unit. Previously, he was faculty at the University of Copenhagen, got a PhD from USC\u0026rsquo;s Information Sciences Institute, and a linguistics master\u0026rsquo;s in Germany.","tags":null,"title":"Dirk Hovy","type":"authors"},{"authors":["debora_nozza"],"categories":null,"content":"Debora Nozza (she/her) is an Assistant Professor in Computing Sciences at Bocconi University. She was awarded a €1.5m ERC Starting Grant project 2023 for research on personalized and subjective approaches to Natural Language Processing. Previously, she was awarded a €120,000 grant from Fondazione Cariplo for her project MONICA, which focuses on monitoring coverage, attitudes, and accessibility of Italian measures in response to COVID-19. Her research interests mainly focus on Natural Language Processing, specifically on the detection and counter-acting of hate speech and algorithmic bias on Social Media data in multilingual context.\nShe organized the 7th Workshop on Online Abuse and Harms (WOAH) at ACL 2023 and the ICWSM 2023 Data Challenge: Temporal social data at ICWSM 2023. She was one of the organizers of the task on Automatic Misogyny Identification (AMI) at Evalita 2018 and Evalita 2020, and one of the organizers of the HatEval Task 5 at SemEval 2019 on multilingual detection of hate speech against immigrants and women in Twitter.\n","date":-62135596800,"expirydate":-62135596800,"kind":"term","lang":"en","lastmod":-62135596800,"objectID":"6270b423d82bc34de53ce30c4d80baf9","permalink":"https://milanlproc.github.io/authors/2_debora_nozza/","publishdate":"0001-01-01T00:00:00Z","relpermalink":"/authors/2_debora_nozza/","section":"authors","summary":"Debora Nozza (she/her) is an Assistant Professor in Computing Sciences at Bocconi University. 
She was awarded a €1.5m ERC Starting Grant project 2023 for research on personalized and subjective approaches to Natural Language Processing.","tags":null,"title":"Debora Nozza","type":"authors"},{"authors":["amanda_cercas_curry"],"categories":null,"content":"","date":-62135596800,"expirydate":-62135596800,"kind":"term","lang":"en","lastmod":-62135596800,"objectID":"d18e690d2acda385ba9272ff4c77b099","permalink":"https://milanlproc.github.io/authors/amanda_cercas_curry/","publishdate":"0001-01-01T00:00:00Z","relpermalink":"/authors/amanda_cercas_curry/","section":"authors","summary":"","tags":null,"title":"Amanda Cercas Curry","type":"authors"},{"authors":["anne_lauscher"],"categories":null,"content":"","date":-62135596800,"expirydate":-62135596800,"kind":"term","lang":"en","lastmod":-62135596800,"objectID":"52ff7d721d80e8ff7b6937ad8d1aca31","permalink":"https://milanlproc.github.io/authors/anne_lauscher/","publishdate":"0001-01-01T00:00:00Z","relpermalink":"/authors/anne_lauscher/","section":"authors","summary":"","tags":null,"title":"Anne Lauscher","type":"authors"},{"authors":["arianna_muti"],"categories":null,"content":"Arianna Muti is a Postdoctoral Research Fellow at Bocconi University. Her research interests mainly focus on Natural Language Processing, specifically on the detection of cross-cultural and implicit misogyny on Social Media. She is currently working on the project PERSONAE to develop personalized language technologies.\nShe has co-organized the 13th edition of the CLEF Conference.\n","date":-62135596800,"expirydate":-62135596800,"kind":"term","lang":"en","lastmod":-62135596800,"objectID":"98ebcf466ab5994ecf159a2d980fb2f2","permalink":"https://milanlproc.github.io/authors/arianna_muti/","publishdate":"0001-01-01T00:00:00Z","relpermalink":"/authors/arianna_muti/","section":"authors","summary":"Arianna Muti is a Postdoctoral Research Fellow at Bocconi University. Her research interests mainly focus on Natural Language Processing, specifically on the detection of cross-cultural and implicit misogyny on Social Media.","tags":null,"title":"Arianna Muti","type":"authors"},{"authors":["donya_rooein"],"categories":null,"content":"Donya Rooein is a Postdoctoral Research Fellow at Bocconi University, where her work revolves around leveraging natural language processing for educational advancements. She explores the synergy between machine learning, linguistics, and practitioner insights to enhance our education system. Her primary focus is on developing interpretable and scalable measures through NLP systems, aimed at assessing learning effectiveness and fostering adaptive learning environments.\nShe completed her Ph.D. in Information Technology Engineering from Politecnico di Milano in October 2022. Her doctoral research was awarded by EIT Digital and centered on crafting a flexible and adaptable framework for educational chatbots. Throughout her Ph.D. journey, she specialized in creating highly customizable chatbot solutions tailored to the diverse educational requirements of teachers and students alike.\n","date":-62135596800,"expirydate":-62135596800,"kind":"term","lang":"en","lastmod":-62135596800,"objectID":"6edd0fb476ee4f8442b7db4ce05086cf","permalink":"https://milanlproc.github.io/authors/donya_rooein/","publishdate":"0001-01-01T00:00:00Z","relpermalink":"/authors/donya_rooein/","section":"authors","summary":"Donya Rooein is a Postdoctoral Research Fellow at Bocconi University, where her work revolves around leveraging natural language processing for educational advancements. 
She explores the synergy between machine learning, linguistics, and practitioner insights to enhance our education system.","tags":null,"title":"Donya Rooein","type":"authors"},{"authors":["elisa_bassignana"],"categories":null,"content":"Elisa Bassignana is a Postdoctoral Research Fellow at the NLPnorth research group at the IT University of Copenhagen and an affiliated member at the Pioneer Centre for Artificial Intelligence. Her research interest lies at the intersection between Natural Language Processing and Computational Social Science. Specifically, she\u0026rsquo;s interested in analyzing social phenomena through the lens of language, by using NLP technologies. During her PhD, she worked on developing computational systems for Information Extraction with strong abilities to generalize over unseen data sources and label spaces. More specifically, she worked on cross-domain Relation Extraction. Before that, she worked in the field of Computational Social Science (hate speech and personality detection).\nElisa serves as part of the EACL Student Board and has co-organized the EACL Student Research Workshop 2023.\n","date":-62135596800,"expirydate":-62135596800,"kind":"term","lang":"en","lastmod":-62135596800,"objectID":"c7dcee2a0f13effb5254b1881e990eec","permalink":"https://milanlproc.github.io/authors/elisa_bassignana/","publishdate":"0001-01-01T00:00:00Z","relpermalink":"/authors/elisa_bassignana/","section":"authors","summary":"Elisa Bassignana is a Postdoctoral Research Fellow at the NLPnorth research group at the IT University of Copenhagen and an affiliated member at the Pioneer Centre for Artificial Intelligence. Her research interest lies at the intersection between Natural Language Processing and Computational Social Science.","tags":null,"title":"Elisa Bassignana","type":"authors"},{"authors":["emanuele_moscato"],"categories":null,"content":"Emanuele Moscato is a Postdoctoral Researcher at Bocconi, where he applies NLP-inspired models to a variety of problems including statistical physics of probabilistic languages, bioinformatics and NLP itself.\nMore on his personal website.\n","date":-62135596800,"expirydate":-62135596800,"kind":"term","lang":"en","lastmod":-62135596800,"objectID":"39e58892e6f18d9be59b25b6560d25d5","permalink":"https://milanlproc.github.io/authors/emanuele_moscato/","publishdate":"0001-01-01T00:00:00Z","relpermalink":"/authors/emanuele_moscato/","section":"authors","summary":"Emanuele Moscato is a Postdoctoral Researcher at Bocconi, where he applies NLP-inspired models to a variety of problems including statistical physics of probabilistic languages, bioinformatics and NLP itself.","tags":null,"title":"Emanuele Moscato","type":"authors"},{"authors":["federico_bianchi"],"categories":null,"content":"","date":-62135596800,"expirydate":-62135596800,"kind":"term","lang":"en","lastmod":-62135596800,"objectID":"1b44ab31c433ff8d06f13800865217d5","permalink":"https://milanlproc.github.io/authors/federico_bianchi/","publishdate":"0001-01-01T00:00:00Z","relpermalink":"/authors/federico_bianchi/","section":"authors","summary":"","tags":null,"title":"Federico Bianchi","type":"authors"},{"authors":["flor_plaza"],"categories":null,"content":"Flor Miriam Plaza-del-Arco is a Postdoctoral Research Fellow at Bocconi University. Her research interests mainly focus on Natural Language Processing, specifically on the detection of hate speech and the analysis of emotions on Social Media. 
During her PhD, she worked on offensive language detection on social media in both English and Spanish; specifically, she created different resources, including corpora and lexicons, and developed computational systems that benefit from different linguistic phenomena to detect offensive language more accurately.\nShe has co-organized the EmoEvalEs and MeOffendES shared tasks at IberLEF 2021 on offensive language detection and emotion detection. She has also co-organized the 36th and 37th editions of the SEPLN Conference.\n","date":-62135596800,"expirydate":-62135596800,"kind":"term","lang":"en","lastmod":-62135596800,"objectID":"3753fb7bdfa63528a46e9d82ba7c5812","permalink":"https://milanlproc.github.io/authors/flor_plaza/","publishdate":"0001-01-01T00:00:00Z","relpermalink":"/authors/flor_plaza/","section":"authors","summary":"Flor Miriam Plaza-del-Arco is a Postdoctoral Research Fellow at Bocconi University. Her research interests mainly focus on Natural Language Processing, specifically on the detection of hate speech and the analysis of emotions on Social Media.","tags":null,"title":"Flor Plaza","type":"authors"},{"authors":["fornaciari_tommaso"],"categories":null,"content":"","date":-62135596800,"expirydate":-62135596800,"kind":"term","lang":"en","lastmod":-62135596800,"objectID":"f93279484d263fb2d790380394be888c","permalink":"https://milanlproc.github.io/authors/fornaciari_tommaso/","publishdate":"0001-01-01T00:00:00Z","relpermalink":"/authors/fornaciari_tommaso/","section":"authors","summary":"","tags":null,"title":"Tommaso Fornaciari","type":"authors"},{"authors":["giuseppe_attanasio"],"categories":null,"content":"Giuseppe Attanasio is a Postdoctoral Researcher at Bocconi, where he works on large-scale neural architectures for Natural Language Processing. His research focuses on understanding and regularizing models for debiasing and fairness purposes. He is actively working on project MONICA to characterize Italian measures in response to COVID-19.\nMore on his personal website.\n","date":-62135596800,"expirydate":-62135596800,"kind":"term","lang":"en","lastmod":-62135596800,"objectID":"c69033f31210e3828102d577df47e75c","permalink":"https://milanlproc.github.io/authors/giuseppe_attanasio/","publishdate":"0001-01-01T00:00:00Z","relpermalink":"/authors/giuseppe_attanasio/","section":"authors","summary":"Giuseppe Attanasio is a Postdoctoral Researcher at Bocconi, where he works on large-scale neural architectures for Natural Language Processing. His research focuses on understanding and regularizing models for debiasing and fairness purposes.","tags":null,"title":"Giuseppe Attanasio","type":"authors"},{"authors":["jan_globisz"],"categories":null,"content":"Jan Globisz is a Master’s student of Data Science and Business Analytics at Bocconi University. As a research assistant, he worked on uncovering the demographic features from SCOTUS judgments, as well as monitoring the changing roles of males and females in Swedish parenthood. In his thesis, he explored the topic of corruption in public procurement and built a classification model for detecting corruption risks in European calls for tenders. 
In his spare time, he supports the work of an NGO helping refugees from Ukraine and the Polish-Belarusian border.\n","date":-62135596800,"expirydate":-62135596800,"kind":"term","lang":"en","lastmod":-62135596800,"objectID":"dbe84e29b948daf768241c073c4cf13b","permalink":"https://milanlproc.github.io/authors/jan_globisz/","publishdate":"0001-01-01T00:00:00Z","relpermalink":"/authors/jan_globisz/","section":"authors","summary":"Jan Globisz is a Master’s student of Data Science and Business Analytics at Bocconi University. As a research assistant, he worked on uncovering the demographic features from SCOTUS judgments, as well as monitoring the changing roles of males and females in Swedish parenthood.","tags":null,"title":"Jan Globisz","type":"authors"},{"authors":["kilian_theil"],"categories":null,"content":"","date":-62135596800,"expirydate":-62135596800,"kind":"term","lang":"en","lastmod":-62135596800,"objectID":"0011484030a6940fd202413d2e58ca73","permalink":"https://milanlproc.github.io/authors/kilian_theil/","publishdate":"0001-01-01T00:00:00Z","relpermalink":"/authors/kilian_theil/","section":"authors","summary":"","tags":null,"title":"Kilian Theil","type":"authors"},{"authors":["lorenzo_lupo"],"categories":null,"content":"Lorenzo Lupo is a postdoctoral research fellow at Bocconi University, working on natural language processing and its applications to social and economic challenges. He is collaborating on the MENTALISM project, combining text analysis and machine learning with survey data to track inequality.\nMore on his personal website.\n","date":-62135596800,"expirydate":-62135596800,"kind":"term","lang":"en","lastmod":-62135596800,"objectID":"7da364ecd1fe8524c3bf084438e08df4","permalink":"https://milanlproc.github.io/authors/lorenzo_lupo/","publishdate":"0001-01-01T00:00:00Z","relpermalink":"/authors/lorenzo_lupo/","section":"authors","summary":"Lorenzo Lupo is a postdoctoral research fellow at Bocconi University, working on natural language processing and its applications to social and economic challenges. He is collaborating on the MENTALISM project, combining text analysis and machine learning with survey data to track inequality.","tags":null,"title":"Lorenzo Lupo","type":"authors"},{"authors":["maria_nawrocka"],"categories":null,"content":"Maria Nawrocka is a PhD candidate at the Doctoral School of Social Sciences at the University of Warsaw. Her research focuses on the discourse surrounding refugees in Polish public television. Utilizing a customized NLP framework designed for the Polish language, she aims to uncover patterns and shifts in the representation of refugees over the past decade.\n","date":-62135596800,"expirydate":-62135596800,"kind":"term","lang":"en","lastmod":-62135596800,"objectID":"b3966a7776583dc30e9fa9c427145927","permalink":"https://milanlproc.github.io/authors/maria_nawrocka/","publishdate":"0001-01-01T00:00:00Z","relpermalink":"/authors/maria_nawrocka/","section":"authors","summary":"Maria Nawrocka is a PhD candidate at the Doctoral School of Social Sciences at the University of Warsaw. Her research focuses on the discourse surrounding refugees in Polish public television. 
Utilizing a customized NLP framework designed for the Polish language, she aims to uncover patterns and shifts in the representation of refugees over the past decade.","tags":null,"title":"Maria Nawrocka","type":"authors"},{"authors":["matthias_orlikowski"],"categories":null,"content":"","date":-62135596800,"expirydate":-62135596800,"kind":"term","lang":"en","lastmod":-62135596800,"objectID":"9e7557087ab3b8193a5c6f86cdcee2c0","permalink":"https://milanlproc.github.io/authors/matthias_orlikowski/","publishdate":"0001-01-01T00:00:00Z","relpermalink":"/authors/matthias_orlikowski/","section":"authors","summary":"","tags":null,"title":"Matthias Orlikowski","type":"authors"},{"authors":["nikita_soni"],"categories":null,"content":"","date":-62135596800,"expirydate":-62135596800,"kind":"term","lang":"en","lastmod":-62135596800,"objectID":"8d64189b871ea9a66d62606eeded552e","permalink":"https://milanlproc.github.io/authors/nikita_soni/","publishdate":"0001-01-01T00:00:00Z","relpermalink":"/authors/nikita_soni/","section":"authors","summary":"","tags":null,"title":"Nikita Soni","type":"authors"},{"authors":["paul_rottger"],"categories":null,"content":"I am a postdoctoral researcher in Dirk Hovy‘s MilaNLP Lab at Bocconi University. My work is located at the intersection of computation, language and society. Right now, I am particularly interested in evaluating and aligning social values in large language models.\nIn May 2023, I completed my PhD at the University of Oxford, where I was supervised by Janet Pierrehumbert and Helen Margetts. In my PhD, I worked on improving the evaluation and effectiveness of natural language processing models for hate speech detection. I also worked on general language modelling challenges like language change and annotator subjectivity. The HateCheck project that I led won the Stanford AI Audit Challenge.\nDuring my PhD, I also co-founded Rewire, a start-up building socially responsible AI for online safety. Over two years as CTO, I grew a technical team of 10+ people, working on large projects for Google, Meta and others. In March 2023, Rewire was acquired by ActiveFence.\nFor current updates, follow me on Twitter or visit my website.\n","date":-62135596800,"expirydate":-62135596800,"kind":"term","lang":"en","lastmod":-62135596800,"objectID":"c2616b530e000914ab4b7aeaa5b014ad","permalink":"https://milanlproc.github.io/authors/paul_rottger/","publishdate":"0001-01-01T00:00:00Z","relpermalink":"/authors/paul_rottger/","section":"authors","summary":"I am a postdoctoral researcher in Dirk Hovy‘s MilaNLP Lab at Bocconi University. My work is located at the intersection of computation, language and society. 
Right now, I am particularly interested in evaluating and aligning social values in large language models.","tags":null,"title":"Paul Röttger","type":"authors"},{"authors":["pieter_delobelle"],"categories":null,"content":"","date":-62135596800,"expirydate":-62135596800,"kind":"term","lang":"en","lastmod":-62135596800,"objectID":"7bad91faad0b4fa4f8ec60697e2bbf43","permalink":"https://milanlproc.github.io/authors/pieter_delobelle/","publishdate":"0001-01-01T00:00:00Z","relpermalink":"/authors/pieter_delobelle/","section":"authors","summary":"","tags":null,"title":"Pieter Delobelle","type":"authors"},{"authors":["pietro_lesci"],"categories":null,"content":"","date":-62135596800,"expirydate":-62135596800,"kind":"term","lang":"en","lastmod":-62135596800,"objectID":"faaed7ac08357878be8b53209c877ee1","permalink":"https://milanlproc.github.io/authors/pietro_lesci/","publishdate":"0001-01-01T00:00:00Z","relpermalink":"/authors/pietro_lesci/","section":"authors","summary":"","tags":null,"title":"Pietro Lesci","type":"authors"},{"authors":["tanise_ceron"],"categories":null,"content":"Tanise Ceron is a Postdoctoral Research Fellow at Bocconi University. Her research lies in understanding how algorithms, and more specifically language models, filter information, such as the types of biases embedded in large language models (LLMs) and how they manifest in downstream tasks. She is also keen on developing methods for modeling societal discourse. This involves tasks such as developing methods to mine ideologies, and more generally, opinions from texts. Lastly, she enjoys thinking about the implementation of her research in real-world applications, such as news recommenders.\nShe co-leads the project MULTIVIEW, which investigates methods for diversifying news recommendations in terms of perspectives.\n","date":-62135596800,"expirydate":-62135596800,"kind":"term","lang":"en","lastmod":-62135596800,"objectID":"17e17e09710126f90b1391272ffc5d27","permalink":"https://milanlproc.github.io/authors/tanise_ceron/","publishdate":"0001-01-01T00:00:00Z","relpermalink":"/authors/tanise_ceron/","section":"authors","summary":"Tanise Ceron is a Postdoctoral Research Fellow at Bocconi University. Her research lies in understanding how algorithms, and more specifically language models, filter information, such as the types of biases embedded in large language models (LLMs) and how they manifest in downstream tasks.","tags":null,"title":"Tanise Ceron","type":"authors"},{"authors":["tiancheng_hu"],"categories":null,"content":"I am a third-year PhD student in Computation, Cognition and Language at the Language Technology Lab at the University of Cambridge, supervised by Prof. Nigel Collier. I have broad interests in natural language processing and computational social science. Most recently, my research encompasses two aspects: 1) understanding the inherent biases of LLMs through text generation, and 2) employing LLMs to understand biased language use in human communication. Previously, I completed my master\u0026rsquo;s in Electrical Engineering and Information Technology at ETH Zürich. My master\u0026rsquo;s thesis is about quotative usage in U.S. political news. It was done at The Data Science Lab at EPFL, supervised by Manoel Horta Ribeiro, Prof. Andreas Spitz and Prof. Robert West.\nI obtained a Bachelor of Science in Electrical Engineering from The University of Texas at Dallas, advised by Prof. Carlos Busso. 
I worked on driver head pose estimation with 3D data.\n","date":-62135596800,"expirydate":-62135596800,"kind":"term","lang":"en","lastmod":-62135596800,"objectID":"7af1adac0e700a9bddfa9b6905f9afab","permalink":"https://milanlproc.github.io/authors/tiancheng_hu/","publishdate":"0001-01-01T00:00:00Z","relpermalink":"/authors/tiancheng_hu/","section":"authors","summary":"I am a third-year PhD student in Computation, Cognition and Language at the Language Technology Lab at the University of Cambridge, supervised by Prof. Nigel Collier. I have broad interests in natural language processing and computational social science.","tags":null,"title":"Tiancheng Hu","type":"authors"},{"authors":["Flor Miriam Plaza-del-Arco","Amanda Cercas Curry","Susanna Paoli","Alba Curry","Dirk Hovy"],"categories":[],"content":"","date":1726963200,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1726963200,"objectID":"8cf9a7741aced2c3989f8f303097b872","permalink":"https://milanlproc.github.io/publication/2024-divine-llamas-emotion-bias/","publishdate":"2023-04-12T17:19:53+01:00","relpermalink":"/publication/2024-divine-llamas-emotion-bias/","section":"publication","summary":"Emotions play important epistemological and cognitive roles in our lives, revealing our values and guiding our actions. Previous work has shown that LLMs display biases in emotion attribution along gender lines. However, unlike gender, which says little about our values, religion, as a socio-cultural system, prescribes a set of beliefs and values for its followers. Religions, therefore, cultivate certain emotions. Moreover, these rules are explicitly laid out and interpreted by religious leaders. Using emotion attribution, we explore how different religions are represented in LLMs. We find that: Major religions in the US and European countries are represented with more nuance, displaying a more shaded model of their beliefs. Eastern religions like Hinduism and Buddhism are strongly stereotyped. Judaism and Islam are stigmatized -- the models' refusals skyrocket. We ascribe these to cultural bias in LLMs and the scarcity of NLP literature on religion. In the rare instances where religion is discussed, it is often in the context of toxic language, perpetuating the perception of these religions as inherently toxic. This finding underscores the urgent need to address and rectify these biases. Our research underscores the crucial role emotions play in our lives and how our values influence them.","tags":["Emotion attribution","Religion","Bias","Stereotypes","Large Language Models"],"title":"Divine LLaMAs: Bias, Stereotypes, Stigmatization, and Emotion Representation of Religion in Large Language Models","type":"publication"},{"authors":["Fabio Pernisi","Dirk Hovy","Paul Röttger"],"categories":[],"content":"","date":1723334400,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1723334400,"objectID":"156b0278c4b0945d090f234ba8e4045f","permalink":"https://milanlproc.github.io/publication/2024-compromesso/","publishdate":"2023-07-12T14:48:20+01:00","relpermalink":"/publication/2024-compromesso/","section":"publication","summary":"As diverse linguistic communities and users adopt large language models (LLMs), assessing their safety across languages becomes critical. Despite ongoing efforts to make LLMs safe, they can still be made to behave unsafely with jailbreaking, a technique in which models are prompted to act outside their operational guidelines. 
Research on LLM safety and jailbreaking, however, has so far mostly focused on English, limiting our understanding of LLM safety in other languages. We contribute towards closing this gap by investigating the effectiveness of many-shot jailbreaking, where models are prompted with unsafe demonstrations to induce unsafe behaviour, in Italian. To enable our analysis, we create a new dataset of unsafe Italian question-answer pairs. With this dataset, we identify clear safety vulnerabilities in four families of open-weight LLMs. We find that the models exhibit unsafe behaviors even when prompted with few unsafe demonstrations, and -- more alarmingly -- that this tendency rapidly escalates with more demonstrations.","tags":["Large Language Models","AI Safety","NLP"],"title":"Compromesso! Italian Many-Shot Jailbreaks Undermine the Safety of Large Language Models","type":"publication"},{"authors":["Xinpeng Wang","Bolei Ma","Chengzhi Hu","Leon Weber-Genzel","Paul Röttger","Frauke Kreuter","Dirk Hovy","Barbara Plank"],"categories":[],"content":"","date":1723334400,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1723334400,"objectID":"7705723cc81af3d241d833404acc59a6","permalink":"https://milanlproc.github.io/publication/2024-myanswerisc/","publishdate":"2023-07-12T14:48:20+01:00","relpermalink":"/publication/2024-myanswerisc/","section":"publication","summary":"The open-ended nature of language generation makes the evaluation of autoregressive large language models (LLMs) challenging. One common evaluation approach uses multiple-choice questions to limit the response space. The model is then evaluated by ranking the candidate answers by the log probability of the first token prediction. However, first-tokens may not consistently reflect the final response output, due to model’s diverse response styles such as starting with “Sure” or refusing to answer. Consequently, first-token evaluation is not indicative of model behaviour when interacting with users. But by how much? We evaluate how aligned first-token evaluation is with the text output along several dimensions, namely final option choice, refusal rate, choice distribution and robustness under prompt perturbation. Our results show that the two approaches are severely misaligned on all dimensions, reaching mismatch rates over 60%. Models heavily fine-tuned on conversational or safety data are especially impacted. Crucially, models remain misaligned even when we increasingly constrain prompts, i.e., force them to start with an option letter or example template. Our findings i) underscore the importance of inspecting the text output as well and ii) caution against relying solely on first-token evaluation.","tags":["Large Language Models","Evaluation","NLP"],"title":"My Answer is C: First-Token Probabilities Do Not Match Text Answers in Instruction-Tuned Language Models","type":"publication"},{"authors":["Paul Röttger","Valentin Hofmann","Valentina Pyatkin","Musashi Hinck","Hannah Rose Kirk","Hinrich Schuetze","Dirk Hovy"],"categories":[],"content":"","date":1723334400,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1723334400,"objectID":"aba2a2e9644c22fb67872e026df2a6f8","permalink":"https://milanlproc.github.io/publication/2024-politicalcompass/","publishdate":"2023-07-12T14:48:20+01:00","relpermalink":"/publication/2024-politicalcompass/","section":"publication","summary":"Much recent work seeks to evaluate values and opinions in large language models (LLMs) using multiple-choice surveys and questionnaires. 
Most of this work is motivated by concerns around real-world LLM applications. For example, politically-biased LLMs may subtly influence society when they are used by millions of people. Such real-world concerns, however, stand in stark contrast to the artificiality of current evaluations: real users do not typically ask LLMs survey questions. Motivated by this discrepancy, we challenge the prevailing *constrained* evaluation paradigm for values and opinions in LLMs and explore more realistic *unconstrained* evaluations. As a case study, we focus on the popular Political Compass Test (PCT). In a systematic review, we find that most prior work using the PCT *forces* models to comply with the PCT’s multiple-choice format. We show that models give substantively different answers when not forced; that answers change depending on how models are forced; and that answers lack paraphrase robustness. Then, we demonstrate that models give different answers yet again in a more realistic open-ended answer setting. We distill these findings into recommendations and open challenges in evaluating values and opinions in LLMs.","tags":["Large Language Models","AI Alignment","NLP"],"title":"Political Compass or Spinning Arrow? Towards More Meaningful Evaluations for Values and Opinions in Large Language Models","type":"publication"},{"authors":["Paul Röttger","Hannah Rose Kirk","Bertie Vidgen","Giuseppe Attanasio","Federico Bianchi","Dirk Hovy"],"categories":[],"content":"","date":1721088000,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1721088000,"objectID":"a6fed1f8412ee969bf6cd5afc26ac5e8","permalink":"https://milanlproc.github.io/publication/2024-xstest/","publishdate":"2023-07-12T14:48:20+01:00","relpermalink":"/publication/2024-xstest/","section":"publication","summary":"Without proper safeguards, large language models will readily follow malicious instructions and generate toxic content. This risk motivates safety efforts such as red-teaming and large-scale feedback learning, which aim to make models both helpful and harmless. However, there is a tension between these two objectives, since harmlessness requires models to refuse to comply with unsafe prompts, and thus not be helpful. Recent anecdotal evidence suggests that some models may have struck a poor balance, so that even clearly safe prompts are refused if they use similar language to unsafe prompts or mention sensitive topics. In this paper, we introduce a new test suite called XSTest to identify such eXaggerated Safety behaviours in a systematic way. XSTest comprises 250 safe prompts across ten prompt types that well-calibrated models should not refuse to comply with, and 200 unsafe prompts as contrasts that models, for most applications, should refuse. 
We describe XSTest’s creation and composition, and then use the test suite to highlight systematic failure modes in state-of-the-art language models as well as more general challenges in building safer language models.","tags":["Large Language Models","AI Safety","NLP"],"title":"XSTest: A Test Suite for Identifying Exaggerated Safety Behaviours in Large Language Models","type":"publication"},{"authors":["Lorenzo Lupo","Paul Bose","Mahyar Habibi","Dirk Hovy","Carlo Schwarz"],"categories":[],"content":"","date":1716163200,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1716163200,"objectID":"e1b884510e6a32df5fabb54ecd4853bc","permalink":"https://milanlproc.github.io/publication/2024-dadit/","publishdate":"2024-02-23T14:48:20+01:00","relpermalink":"/publication/2024-dadit/","section":"publication","summary":"Social scientists increasingly use demographically stratified social media data to study the attitudes, beliefs, and behavior of the general public. To facilitate such analyses, we construct, validate, and release the representative DADIT dataset of 30M tweets of 20k Italian Twitter users, along with their bios and profile pictures. We enrich the user data with high-quality labels for gender, age, and location. This new dataset enables us to compare the performance of various state-of-the-art models for the prediction of the gender and age of social media users. In particular, we investigate if tweets contain valuable information for the prediction of user characteristics, since popular classifiers like M3 don't leverage them. Our best XLM-based classifier improves upon the commonly used competitor M3 by up to 53% F1. Especially for age prediction, classifiers profit from including tweets as features. We also confirm these findings on a German test set.","tags":["Twitter data","demographic prediction","language models","multimodal classification"],"title":"DADIT: A Dataset for Demographic Classification of Italian Twitter Users and a Comparison of Prediction Methods","type":"publication"},{"authors":["Donya Rooein","Paul Rottger","Anastassia Shaitarova","Dirk Hovy"],"categories":[],"content":"","date":1715817600,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1715817600,"objectID":"68e0ca6ce3b15d6e054e6e5e289b8963","permalink":"https://milanlproc.github.io/publication/2024-difficulty-classification/","publishdate":"2024-05-15T16:22:16+01:00","relpermalink":"/publication/2024-difficulty-classification/","section":"publication","summary":"Using large language models (LLMs) for educational applications like dialogue-based teaching is a hot topic. Effective teaching, however, requires teachers to adapt the difficulty of content and explanations to the education level of their students. Even the best LLMs today struggle to do this well. If we want to improve LLMs on this adaptation task, we need to be able to measure adaptation success reliably. However, current Static metrics for text difficulty, like the Flesch-Kincaid Reading Ease score, are known to be crude and brittle. We, therefore, introduce and evaluate a new set of Prompt-based metrics for text difficulty. Based on a user study, we create Prompt-based metrics as inputs for LLMs. They leverage LLM's general language understanding capabilities to capture more abstract and complex features than Static metrics. Regression experiments show that adding our Prompt-based metrics significantly improves text difficulty classification over Static metrics alone. 
Our results demonstrate the promise of using LLMs to evaluate text adaptation to different education levels. ","tags":["Difficulty Classification","Education","Large Language Models"],"title":"Beyond Flesch-Kincaid: Prompt-based Metrics Improve Difficulty Classification of Educational Texts","type":"publication"},{"authors":["Federico Bianchi","Mirac Suzgun","Giuseppe Attanasio","Paul Röttger","Dan Jurafsky","Tatsunori Hashimoto","James Zou"],"categories":[],"content":"","date":1715040000,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1715040000,"objectID":"b91aaf47580d3e1f9038e2c7901a17ae","permalink":"https://milanlproc.github.io/publication/2024-safetyllamas/","publishdate":"2023-07-12T14:48:20+01:00","relpermalink":"/publication/2024-safetyllamas/","section":"publication","summary":"Training large language models to follow instructions makes them perform better on a wide range of tasks, generally becoming more helpful. However, a perfectly helpful model will follow even the most malicious instructions and readily generate harmful content. In this paper, we raise concerns over the safety of models that only emphasize helpfulness, not safety, in their instruction-tuning. We show that several popular instruction-tuned models are highly unsafe. Moreover, we show that adding just 3% safety examples (a few hundred demonstrations) in the training set when fine-tuning a model like LLaMA can substantially improve their safety. Our safety-tuning does not make models significantly less capable or helpful as measured by standard benchmarks. However, we do find a behavior of exaggerated safety, where too much safety-tuning makes models refuse to respond to reasonable prompts that superficially resemble unsafe ones. Our study sheds light on trade-offs in training LLMs to follow instructions and exhibit safe behavior.","tags":["Large Language Models","AI Safety","NLP"],"title":"Safety-Tuned LLaMAs: Lessons From Improving the Safety of Large Language Models that Follow Instructions","type":"publication"},{"authors":["Paul Röttger","Fabio Pernisi","Bertie Vidgen","Dirk Hovy"],"categories":[],"content":"","date":1712534400,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1712534400,"objectID":"63bda1d9c59fa404b61557ca9db3376b","permalink":"https://milanlproc.github.io/publication/2024-safetyprompts/","publishdate":"2023-07-12T14:48:20+01:00","relpermalink":"/publication/2024-safetyprompts/","section":"publication","summary":"The last two years have seen a rapid growth in concerns around the safety of large language models (LLMs). Researchers and practitioners have met these concerns by introducing an abundance of new datasets for evaluating and improving LLM safety. However, much of this work has happened in parallel, and with very different goals in mind, ranging from the mitigation of near-term risks around bias and toxic content generation to the assessment of longer-term catastrophic risk potential. This makes it difficult for researchers and practitioners to find the most relevant datasets for a given use case, and to identify gaps in dataset coverage that future work may fill. To remedy these issues, we conduct a first systematic review of open datasets for evaluating and improving LLM safety. We review 102 datasets, which we identified through an iterative and community-driven process over the course of several months. 
We highlight patterns and trends, such as a trend towards fully synthetic datasets, as well as gaps in dataset coverage, such as a clear lack of non-English datasets. We also examine how LLM safety datasets are used in practice -- in LLM release publications and popular LLM benchmarks -- finding that current evaluation practices are highly idiosyncratic and make use of only a small fraction of available datasets. Our contributions are based on this http URL, a living catalogue of open datasets for LLM safety, which we commit to updating continuously as the field of LLM safety develops.","tags":["Large Language Models","AI Safety","NLP"],"title":"SafetyPrompts: a Systematic Review of Open Datasets for Evaluating and Improving Large Language Model Safety ","type":"publication"},{"authors":["Flor Miriam Plaza-del-Arco","Amanda Cercas Curry","Alba Curry","Gavin Abercrombie","Dirk Hovy"],"categories":[],"content":"","date":1711584000,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1711584000,"objectID":"72fdc8b25d816d74f056cd9b0f6f497b","permalink":"https://milanlproc.github.io/publication/2024-emotion-gender-stereotypes/","publishdate":"2023-04-12T17:19:53+01:00","relpermalink":"/publication/2024-emotion-gender-stereotypes/","section":"publication","summary":"Large language models (LLMs) reflect societal norms and biases, especially about gender. While societal biases and stereotypes have been extensively researched in various NLP applications, there is a surprising gap for emotion analysis. However, emotion and gender are closely linked in societal discourse. E.g., women are often thought of as more empathetic, while men's anger is more socially accepted. To fill this gap, we present the first comprehensive study of gendered emotion attribution in five state-of-the-art LLMs (open- and closed-source). We investigate whether emotions are gendered, and whether these variations are based on societal stereotypes. We prompt the models to adopt a gendered persona and attribute emotions to an event like 'When I had a serious argument with a dear person'. We then analyze the emotions generated by the models in relation to the gender-event pairs. We find that all models consistently exhibit gendered emotions, influenced by gender stereotypes. These findings are in line with established research in psychology and gender studies. Our study sheds light on the complex societal interplay between language, gender, and emotion. The reproduction of emotion stereotypes in LLMs allows us to use those models to study the topic in detail, but raises questions about the predictive use of those same LLMs for emotion applications.","tags":["Emotion attribution","Gender Bias","Large Language Models"],"title":"Angry Men, Sad Women: Large Language Models Reflect Gendered Stereotypes in Emotion Attribution","type":"publication"},{"authors":["Donya Rooein","Dirk Hovy"],"categories":[],"content":"","date":1711584000,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1711584000,"objectID":"4f5e4f0fe7cf32b88733c20420ae3e76","permalink":"https://milanlproc.github.io/publication/2024-conversations-data/","publishdate":"2023-04-12T17:19:53+01:00","relpermalink":"/publication/2024-conversations-data/","section":"publication","summary":"Open conversations are one of the most engaging forms of teaching. However, creating those conversations in educational software is a complex endeavor, especially if we want to address the needs of different audiences. 
While language models hold great promise for educational applications, there are substantial challenges in training them to engage in meaningful and effective conversational teaching, especially when considering the diverse needs of various audiences. No official data sets exist for this task to facilitate the training of language models for conversational teaching, considering the diverse needs of various audiences. This paper presents a novel source for facilitating conversational teaching of scientific concepts at various difficulty levels (from preschooler to expert), namely dialogues taken from video transcripts. We analyse this data source in various ways to show that it offers a diverse array of examples that can be used to generate contextually appropriate and natural responses to scientific topics for specific target audiences. It is a freely available valuable resource for training and evaluating conversation models, encompassing organically occurring dialogues. While the raw data is available online, we provide additional metadata for conversational analysis of dialogues at each level in all available videos. ","tags":["Education","Conversational Data","Adaptive Learning"],"title":"Conversations as a Source for Teaching Scientific Concepts at Different Education Levels","type":"publication"},{"authors":["Flor Miriam Plaza-del-Arco","Alba Curry","Amanda Cercas Curry","Dirk Hovy"],"categories":[],"content":"","date":1711584000,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1711584000,"objectID":"d9ced95d92a4ea8da9be921c1eb03a4d","permalink":"https://milanlproc.github.io/publication/2024-emotion-analysis-survey/","publishdate":"2023-04-12T17:19:53+01:00","relpermalink":"/publication/2024-emotion-analysis-survey/","section":"publication","summary":"Emotions are a central aspect of communication. Consequently, emotion analysis (EA) is a rapidly growing field in natural language processing (NLP). However, there is no consensus on scope, direction, or methods. In this paper, we conduct a thorough review of 154 relevant NLP publications from the last decade. Based on this review, we address four different questions: (1) How are EA tasks defined in NLP? (2) What are the most prominent emotion frameworks and which emotions are modeled? (3) Is the subjectivity of emotions considered in terms of demographics and cultural factors? and (4) What are the primary NLP applications for EA? We take stock of trends in EA and tasks, emotion frameworks used, existing datasets, methods, and applications. We then discuss four lacunae: (1) the absence of demographic and cultural aspects does not account for the variation in how emotions are perceived, but instead assumes they are universally experienced in the same manner; (2) the poor fit of emotion categories from the two main emotion theories to the task; (3) the lack of standardized EA terminology hinders gap identification, comparison, and future goals; and (4) the absence of interdisciplinary research isolates EA from insights in other fields. 
Our work will enable more focused research into EA and a more holistic approach to modeling emotions in NLP.","tags":["Emotion analysis","Survey","Natural Language Processing"],"title":"Emotion Analysis in NLP: Trends, Gaps and Roadmap for Future Directions","type":"publication"},{"authors":["Amanda Cercas Curry","Giuseppe Attanasio","Zeerak Talat","Dirk Hovy"],"categories":[],"content":"","date":1709596800,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1709596800,"objectID":"d22ae7873586110e84d9ca0f0c95c8d7","permalink":"https://milanlproc.github.io/publication/2024-socialclass-experiments/","publishdate":"2024-03-05T14:48:20+01:00","relpermalink":"/publication/2024-socialclass-experiments/","section":"publication","summary":"Since the foundational work of William Labov on the social stratification of language (Labov, 1964), linguistics has made concentrated efforts to explore the links between sociodemographic characteristics and language production and perception. But while there is strong evidence for socio-demographic characteristics in language, they are infrequently used in Natural Language Processing (NLP). Age and gender are somewhat well represented, but Labov's original target, socioeconomic status, is noticeably absent. And yet it matters. We show empirically that NLP disadvantages less-privileged socioeconomic groups. We annotate a corpus of 95K utterances from movies with social class, ethnicity and geographical language variety and measure the performance of NLP systems on three tasks: language modelling, automatic speech recognition, and grammar error correction. We find significant performance disparities that can be attributed to socioeconomic status as well as ethnicity and geographical differences. With NLP technologies becoming ever more ubiquitous and quotidian, they must accommodate all language varieties to avoid disadvantaging already marginalised groups. We argue for the inclusion of socioeconomic class in future language technologies.","tags":["Large Language Models","Fairness","NLP","Demographics"],"title":"Classist Tools: Social Class Correlates with Performance in NLP","type":"publication"},{"authors":["Amanda Cercas Curry","Zeerak Talat","Dirk Hovy"],"categories":[],"content":"","date":1709596800,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1709596800,"objectID":"d2459c7e7c738b73a9cc2ed6b169f672","permalink":"https://milanlproc.github.io/publication/2024-socialclass-survey/","publishdate":"2024-03-05T14:48:20+01:00","relpermalink":"/publication/2024-socialclass-survey/","section":"publication","summary":"Since Labov's (1964) foundational work on the social stratification of language, linguistics has dedicated concerted efforts towards understanding the relationships between socio-demographic factors and language production and perception. Despite the large body of evidence identifying significant relationships between socio-demographic factors and language production, relatively few of these factors have been investigated in the context of NLP technology. While age and gender are well covered, Labov's initial target, socio-economic class, is largely absent. We survey the existing Natural Language Processing (NLP) literature and find that only around 20 papers even mention socio-economic status. However, the majority of those papers do not engage with class beyond collecting information of annotator-demographics. 
Given this research lacuna, we provide a definition of class that can be operationalised by NLP researchers, and argue for including socio-economic class in future language technologies.","tags":["Social class","Fairness","NLP","Demographics"],"title":"Impoverished Language Technology: The Lack of (Social) Class in NLP","type":"publication"},{"authors":["Amanda Cercas Curry","Gavin Abercrombie","Zeerak Talat"],"categories":[],"content":"","date":1709596800,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1709596800,"objectID":"fc2eaff1942d5f10f348b498415d4298","permalink":"https://milanlproc.github.io/publication/2024-isms/","publishdate":"2024-03-05T14:48:20+01:00","relpermalink":"/publication/2024-isms/","section":"publication","summary":"Natural language processing research has begun to embrace the notion of annotator subjectivity, motivated by variations in labelling. This approach understands each annotator's view as valid, which can be highly suitable for tasks that embed subjectivity, e.g., sentiment analysis. However, this construction may be inappropriate for tasks such as hate speech detection, as it affords equal validity to all positions on e.g., sexism or racism. We argue that the conflation of hate and offence can invalidate findings on hate speech, and call for future work to be situated in theory, disentangling hate from its orthogonal concept, offence.","tags":["Hate speech","Subjectivity","Sexism","Offensiveness"],"title":"Subjective isms? On the Danger of Conflating Hate and Offence in Abusive Language Detection","type":"publication"},{"authors":["Giuseppe Attanasio","Flor Miriam Plaza-del-Arco","Debora Nozza","Anne Lauscher"],"categories":[],"content":"","date":1703721600,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1703721600,"objectID":"5ccaa05d51aad1c15b4187130af80d2d","permalink":"https://milanlproc.github.io/publication/2024-a-tale-of-pronouns/","publishdate":"2023-04-12T17:19:53+01:00","relpermalink":"/publication/2024-a-tale-of-pronouns/","section":"publication","summary":"Recent instruction fine-tuned models can solve multiple NLP tasks when prompted to do so, with machine translation (MT) being a prominent use case. However, current research often focuses on standard performance benchmarks, leaving compelling fairness and ethical considerations behind. In MT, this might lead to misgendered translations, resulting, among other harms, in the perpetuation of stereotypes and prejudices. In this work, we address this gap by investigating whether and to what extent such models exhibit gender bias in machine translation and how we can mitigate it. Concretely, we compute established gender bias metrics on the WinoMT corpus from English to German and Spanish. We discover that IFT models default to male-inflected translations, even disregarding female occupational stereotypes. Next, using interpretability methods, we unveil that models systematically overlook the pronoun indicating the gender of a target occupation in misgendered translations. 
Finally, based on this finding, we propose an easy-to-implement and effective bias mitigation solution based on few-shot learning that leads to significantly fairer translations.","tags":["Interpretability","Gender Bias","Machine Translation"],"title":"A Tale of Pronouns: Interpretability Informs Gender Bias Mitigation for Fairer Instruction-Tuned Machine Translation","type":"publication"},{"authors":["Gavin Abercrombie","Amanda Cercas Curry","Tanvi Dinkar","Verena Rieser","Zeerak Talat"],"categories":[],"content":"","date":1701734400,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1701734400,"objectID":"c69ca0738e769aa55302da9237c0bc50","permalink":"https://milanlproc.github.io/publication/2023-mirages-anthropomorphism/","publishdate":"2023-12-05T14:48:20+01:00","relpermalink":"/publication/2023-mirages-anthropomorphism/","section":"publication","summary":"Automated dialogue or conversational systems are anthropomorphised by developers and personified by users. While a degree of anthropomorphism is inevitable, conscious and unconscious design choices can guide users to personify them to varying degrees. Encouraging users to relate to automated systems as if they were human can lead to transparency and trust issues, and high risk scenarios caused by over-reliance on their outputs. As a result, natural language processing researchers have investigated the factors that induce personification and develop resources to mitigate such effects. However, these efforts are fragmented, and many aspects of anthropomorphism have yet to be explored. In this paper, we discuss the linguistic factors that contribute to the anthropomorphism of dialogue systems and the harms that can arise thereof, including reinforcing gender stereotypes and conceptions of acceptable language. We recommend that future efforts towards developing dialogue systems take particular care in their design, development, release, and description; and attend to the many linguistic cues that can elicit personification by users.","tags":["Dialogue systems","Anthropomorphism","Trust"],"title":"Mirages. On Anthropomorphism in Dialogue Systems","type":"publication"},{"authors":["Hannah Rose Kirk","Bertie Vidgen","Paul Röttger","Scott A. Hale"],"categories":[],"content":"","date":1700006400,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1700006400,"objectID":"095a97910f182f7e49e6d7e4530057a8","permalink":"https://milanlproc.github.io/publication/2023-alignmentparadigms/","publishdate":"2023-07-12T14:48:20+01:00","relpermalink":"/publication/2023-alignmentparadigms/","section":"publication","summary":"In this paper, we address the concept of 'alignment' in large language models (LLMs) through the lens of post-structuralist socio-political theory, specifically examining its parallels to empty signifiers. To establish a shared vocabulary around how abstract concepts of alignment are operationalised in empirical datasets, we propose a framework that demarcates: 1) which dimensions of model behaviour are considered important, then 2) how meanings and definitions are ascribed to these dimensions, and by whom. We situate existing empirical literature and provide guidance on deciding which paradigm to follow. 
Through this framework, we aim to foster a culture of transparency and critical evaluation, aiding the community in navigating the complexities of aligning LLMs with human populations.","tags":["Large Language Models","Alignment","NLP"],"title":"The Empty Signifier Problem: Towards Clearer Paradigms for Operationalising 'Alignment' in Large Language Models","type":"publication"},{"authors":["Bertie Vidgen","Hannah Rose Kirk","Rebecca Qian","Nino Scherrer","Anand Kannappan","Scott A. Hale","Paul Röttger"],"categories":[],"content":"","date":1699920000,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1699920000,"objectID":"629614984bc611fb8f0195a8b91ef894","permalink":"https://milanlproc.github.io/publication/2023-simplesafetytests/","publishdate":"2023-07-12T14:48:20+01:00","relpermalink":"/publication/2023-simplesafetytests/","section":"publication","summary":"The past year has seen rapid acceleration in the development of large language models (LLMs). For many tasks, there is now a wide range of open-source and open-access LLMs that are viable alternatives to proprietary models like ChatGPT. Without proper steering and safeguards, however, LLMs will readily follow malicious instructions, provide unsafe advice, and generate toxic content. This is a critical safety risk for businesses and developers. We introduce SimpleSafetyTests as a new test suite for rapidly and systematically identifying such critical safety risks. The test suite comprises 100 test prompts across five harm areas that LLMs, for the vast majority of applications, should refuse to comply with. We test 11 popular open LLMs and find critical safety weaknesses in several of them. While some LLMs do not give a single unsafe response, most models we test respond unsafely on more than 20% of cases, with over 50% unsafe responses in the extreme. Prepending a safety-emphasising system prompt substantially reduces the occurrence of unsafe responses, but does not completely stop them from happening. We recommend that developers use such system prompts as a first line of defence against critical safety risks.","tags":["Large Language Models","AI Safety","NLP"],"title":"SimpleSafetyTests: a Test Suite for Identifying Critical Safety Risks in Large Language Models","type":"publication"},{"authors":null,"categories":null,"content":"We love hosting talented researchers for research visits at MilaNLP. If you would like to apply for a visit, read on.\nWe have two main visiting periods: between April and July, and between September and December. We try to assess all applications for a period together and make a decision with enough time to spare, usually in January and July.\nMost of our visitors are PhD students or postdocs, i.e., people who do not have their own funding. This program is designed to help them come for a visit. If you do have your own funding, please feel free to reach out directly.\nDue to limited office space, we will only be able to host 1–2 people per period.\nNOTE: Due to the overwhelming response, we have already filled all slots for 2024, but will run another round in 2025. 
Please keep an eye out on social media for the announcement.\n","date":1698019200,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1698019200,"objectID":"eaab68c9b73be5f959ac98c857109899","permalink":"https://milanlproc.github.io/open_positions/visiting_researcher/","publishdate":"2023-10-23T00:00:00Z","relpermalink":"/open_positions/visiting_researcher/","section":"open_positions","summary":"Open Application for Research Visits with MilaNLP","tags":null,"title":"Visiting Researcher","type":"open_positions"},{"authors":["Hannah Rose Kirk","Andrew M. Bean","Bertie Vidgen","Paul Röttger","Scott A. Hale"],"categories":[],"content":"","date":1696982400,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1696982400,"objectID":"40048bf816282f959fe4fbf4b0c69019","permalink":"https://milanlproc.github.io/publication/2023-human-feedback-learning-survey/","publishdate":"2023-07-12T14:48:20+01:00","relpermalink":"/publication/2023-human-feedback-learning-survey/","section":"publication","summary":"Human feedback is increasingly used to steer the behaviours of Large Language Models (LLMs). However, it is unclear how to collect and incorporate feedback in a way that is efficient, effective and unbiased, especially for highly subjective human preferences and values. In this paper, we survey existing approaches for learning from human feedback, drawing on 95 papers primarily from the ACL and arXiv repositories. First, we summarise the past, pre-LLM trends for integrating human feedback into language models. Second, we give an overview of present techniques and practices, as well as the motivations for using feedback; conceptual frameworks for defining values and preferences; and how feedback is collected and from whom. Finally, we encourage a better future of feedback learning in LLMs by raising five unresolved conceptual and practical challenges.","tags":["Large Language Models","Human Feedback","NLP"],"title":"The Past, Present and Better Future of Feedback Learning in Large Language Models for Subjective Human Preferences and Values","type":"publication"},{"authors":["Flor Miriam Plaza-del-Arco","Debora Nozza","Dirk Hovy"],"categories":[],"content":"","date":1690156800,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1690156800,"objectID":"af13235c20bbbfc9b999e446ca7c1bb9","permalink":"https://milanlproc.github.io/publication/2023-label-variation-llms/","publishdate":"2023-07-24T14:48:20+01:00","relpermalink":"/publication/2023-label-variation-llms/","section":"publication","summary":"Large Language Models (LLMs) exhibit remarkable text classification capabilities, excelling in zero- and few-shot learning (ZSL and FSL) scenarios. However, since they are trained on different datasets, performance varies widely across tasks between those models. Recent studies emphasize the importance of considering human label variation in data annotation. However, how this human label variation also applies to LLMs remains unexplored. Given this likely model specialization, we ask: Do aggregate LLM labels improve over individual models (as for human annotators)? We evaluate four recent instruction-tuned LLMs as annotators on five subjective tasks across four languages. We use ZSL and FSL setups and label aggregation from human annotation. Aggregations are indeed substantially better than any individual model, benefiting from specialization in diverse tasks or languages. Surprisingly, FSL does not surpass ZSL, as it depends on the quality of the selected examples. 
However, there seems to be no good information-theoretical strategy to select those. We find that no LLM method rivals even simple supervised models. We also discuss the tradeoffs in accuracy, cost, and moral/ethical considerations between LLM and human annotation.","tags":["NLP","LLMs","annotation"],"title":"Wisdom of Instruction-Tuned Language Model Crowds. Exploring Model Label Variation","type":"publication"},{"authors":["Gabriele Ruggeri","Debora Nozza"],"categories":[],"content":"","date":1689120000,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1689120000,"objectID":"63001a4b70ad72b7258ca6178b221fa1","permalink":"https://milanlproc.github.io/publication/2023-multidimensional-bias-vision-language-models/","publishdate":"2023-07-12T14:48:20+01:00","relpermalink":"/publication/2023-multidimensional-bias-vision-language-models/","section":"publication","summary":"In recent years, joint Vision-Language (VL) models have increased in popularity and capability. Very few studies have attempted to investigate bias in VL models, even though it is a well-known issue in both individual modalities. This paper presents the first multi-dimensional analysis of bias in English VL models, focusing on gender, ethnicity, and age as dimensions. When subjects are input as images, pre-trained VL models complete a neutral template with a hurtful word 5% of the time, with higher percentages for female and young subjects. Bias presence in downstream models has been tested on Visual Question Answering. We developed a novel bias metric called the Vision-Language Association Test based on questions designed to elicit biased associations between stereotypical concepts and targets. Our findings demonstrate that pre-trained VL models contain biases that are perpetuated in downstream tasks.","tags":["Fairness","NLP","multimodal"],"title":"A Multi-dimensional study on Bias in Vision-Language models","type":"publication"},{"authors":["Amanda Cercas Curry","Giuseppe Attanasio","Debora Nozza","Dirk Hovy"],"categories":[],"content":"","date":1689120000,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1689120000,"objectID":"f06c4346370b070a759172242d7d10a1","permalink":"https://milanlproc.github.io/publication/2023-milanlp-semeval-2023-task-10-ensembling-domain-adapted-regularized-pretrained-language-models-robust-sexism-detection/","publishdate":"2023-07-12T14:48:20+01:00","relpermalink":"/publication/2023-milanlp-semeval-2023-task-10-ensembling-domain-adapted-regularized-pretrained-language-models-robust-sexism-detection/","section":"publication","summary":"We present the system proposed by the MilaNLP team for the Explainable Detection of Online Sexism (EDOS) shared task. 
We propose an ensemble modeling approach to combine different classifiers trained with domain adaptation objectives and standard fine-tuning. Our results show that the ensemble is more robust than individual models and that regularized models generate more “conservative” predictions, mitigating the effects of lexical overfitting. However, our error analysis also finds that many of the misclassified instances are debatable, raising questions about the objective annotatability of hate speech data.","tags":["Hate Speech","NLP","domain adaptation","language models"],"title":"MilaNLP at SemEval-2023 Task 10: Ensembling Domain-Adapted and Regularized Pretrained Language Models for Robust Sexism Detection","type":"publication"},{"authors":["Flor Miriam Plaza-del-Arco","Debora Nozza","Dirk Hovy"],"categories":[],"content":"","date":1689120000,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1689120000,"objectID":"3f2cc27a6d89ea664c434ace9993294a","permalink":"https://milanlproc.github.io/publication/2023-zero-shot-prompting-hate-speech/","publishdate":"2023-07-12T14:48:20+01:00","relpermalink":"/publication/2023-zero-shot-prompting-hate-speech/","section":"publication","summary":"Hate speech detection faces two significant challenges: 1) the limited availability of labeled data and 2) the high variability of hate speech across different contexts and languages. Prompting brings a ray of hope to these challenges. It allows injecting a model with task-specific knowledge without relying on labeled data. This paper explores zero-shot learning with prompting for hate speech detection. We investigate how well zero-shot learning can detect hate speech in 3 languages with limited labeled data. We experiment with various large language models and verbalizers on 8 benchmark datasets. Our findings highlight the impact of prompt selection on the results. They also suggest that prompting, specifically with recent large language models, can achieve performance comparable to and surpass fine-tuned models, making it a promising alternative for under-resourced languages. Our findings highlight the potential of prompting for hate speech detection and show how both the prompt and the model have a significant impact on achieving more accurate predictions in this task.","tags":["Hate Speech","NLP","multilingual"],"title":"Respectful or Toxic? Using Zero-Shot Learning with Language Models to Detect Hate Speech","type":"publication"},{"authors":["Gavin Abercrombie","Dirk Hovy","Vinodkumar Prabhakaran"],"categories":[],"content":"","date":1689120000,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1689120000,"objectID":"6cffcdc288e8545b5e096b0c26db8ca6","permalink":"https://milanlproc.github.io/publication/2023-temporal-second-language-influence-intra-annotator-agreement-stability-hate-speech-labelling/","publishdate":"2023-07-12T14:48:20+01:00","relpermalink":"/publication/2023-temporal-second-language-influence-intra-annotator-agreement-stability-hate-speech-labelling/","section":"publication","summary":"Much work in natural language processing (NLP) relies on human annotation. The majority of this implicitly assumes that annotators’ labels are temporally stable, although the reality is that human judgements are rarely consistent over time. As a subjective annotation task, hate speech labels depend on annotators’ emotional and moral reactions to the language used to convey the message. 
Studies in Cognitive Science reveal a ‘foreign language effect’, whereby people take differing moral positions and perceive offensive phrases to be weaker in their second languages. Does this affect annotations as well? We conduct an experiment to investigate the impacts of (1) time and (2) different language conditions (English and German) on measurements of intra-annotator agreement in a hate speech labelling task. While we do not observe the expected lower stability in the different language condition, we find that overall agreement is significantly lower than is implicitly assumed in annotation tasks, which has important implications for dataset reproducibility in NLP.","tags":["annotation","NLP","sociodemographics"],"title":"Temporal and Second Language Influence on Intra-Annotator Agreement and Stability in Hate Speech Labelling","type":"publication"},{"authors":["Matthias Orlikowski","Paul Röttger","Philipp Cimiano","Dirk Hovy"],"categories":[],"content":"","date":1689120000,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1689120000,"objectID":"4abddd30922258b0fac152dcdf4b69f0","permalink":"https://milanlproc.github.io/publication/2023-ecological-fallacy-annotation-modeling-human-label-variation-goes-beyond-sociodemographics/","publishdate":"2023-07-12T14:48:20+01:00","relpermalink":"/publication/2023-ecological-fallacy-annotation-modeling-human-label-variation-goes-beyond-sociodemographics/","section":"publication","summary":"Many NLP tasks exhibit human label variation, where different annotators give different labels to the same texts. This variation is known to depend, at least in part, on the sociodemographics of annotators. Recent research aims to model individual annotator behaviour rather than predicting aggregated labels, and we would expect that sociodemographic information is useful for these models. On the other hand, the ecological fallacy states that aggregate group behaviour, such as the behaviour of the average female annotator, does not necessarily explain individual behaviour. To account for sociodemographics in models of individual annotator behaviour, we introduce group-specific layers to multi-annotator models. In a series of experiments for toxic content detection, we find that explicitly accounting for sociodemographic attributes in this way does not significantly improve model performance. This result shows that individual annotation behaviour depends on much more than just sociodemographics.","tags":["annotation","NLP","sociodemographics"],"title":"The Ecological Fallacy in Annotation: Modeling Human Label Variation goes beyond Sociodemographics","type":"publication"},{"authors":["Debora Nozza","Dirk Hovy"],"categories":[],"content":"","date":1689120000,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1689120000,"objectID":"1ebebe2dde0f3ca3bcb2976010570452","permalink":"https://milanlproc.github.io/publication/2023-prof-profanity-obfuscation-nlp/","publishdate":"2023-07-12T14:48:20+01:00","relpermalink":"/publication/2023-prof-profanity-obfuscation-nlp/","section":"publication","summary":"Work on hate speech has made considering rude and harmful examples in scientific publications inevitable. This situation raises various problems, such as whether or not to obscure profanities. While science must accurately disclose what it does, the unwarranted spread of hate speech can harm readers and increases its internet frequency. 
While maintaining publications’ professional appearance, obfuscating profanities makes it challenging to evaluate the content, especially for non-native speakers. Surveying 150 ACL papers, we discovered that obfuscation is usually used for English but not other languages, and even then, quite unevenly. We discuss the problems with obfuscation and suggest a multilingual community resource called PrOf with a Python module to standardize profanity obfuscation processes. We believe PrOf can help scientific publication policies to make hate speech work accessible and comparable, irrespective of language.","tags":["Hate Speech","NLP","multilingual"],"title":"The State of Profanity Obfuscation in Natural Language Processing Scientific Publications","type":"publication"},{"authors":["Anne Lauscher","Debora Nozza","Ehm Miltersen","Archie Crowley","Dirk Hovy"],"categories":[],"content":"","date":1689120000,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1689120000,"objectID":"ff9d8520053fa8f944c7faa36dfc35da","permalink":"https://milanlproc.github.io/publication/2023-commercial-machine-translation-fail-neopronouns/","publishdate":"2023-07-12T14:48:20+01:00","relpermalink":"/publication/2023-commercial-machine-translation-fail-neopronouns/","section":"publication","summary":"As 3rd-person pronoun usage shifts to include novel forms, e.g., neopronouns, we need more research on identity-inclusive NLP. Exclusion is particularly harmful in one of the most popular NLP applications, machine translation (MT). Wrong pronoun translations can discriminate against marginalized groups, e.g., non-binary individuals (Dev et al., 2021). In this “reality check”, we study how three commercial MT systems translate 3rd-person pronouns. Concretely, we compare the translations of gendered vs. gender-neutral pronouns from English to five other languages (Danish, Farsi, French, German, Italian), and vice versa, from Danish to English. Our error analysis shows that the presence of a gender-neutral pronoun often leads to grammatical and semantic translation errors. Similarly, gender neutrality is often not preserved. By surveying the opinions of affected native speakers from diverse languages, we provide recommendations to address the issue in future MT research.","tags":["NLP","pronouns","fairness","ethics"],"title":"What about ''em''? How Commercial Machine Translation Fails to Handle (Neo-)Pronouns","type":"publication"},{"authors":["Anne Lauscher","Debora Nozza","Ehm Miltersen","Archie Crowley","Dirk Hovy"],"categories":[],"content":"","date":1689120000,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1689120000,"objectID":"fd19c1177957eaded55b19f54d5290bc","permalink":"https://milanlproc.github.io/publication/2023-interpretability-for-fairer-machine-translation/","publishdate":"2023-07-12T14:48:20+01:00","relpermalink":"/publication/2023-interpretability-for-fairer-machine-translation/","section":"publication","summary":"As 3rd-person pronoun usage shifts to include novel forms, e.g., neopronouns, we need more research on identity-inclusive NLP. Exclusion is particularly harmful in one of the most popular NLP applications, machine translation (MT). Wrong pronoun translations can discriminate against marginalized groups, e.g., non-binary individuals (Dev et al., 2021). In this “reality check”, we study how three commercial MT systems translate 3rd-person pronouns. Concretely, we compare the translations of gendered vs. 
gender-neutral pronouns from English to five other languages (Danish, Farsi, French, German, Italian), and vice versa, from Danish to English. Our error analysis shows that the presence of a gender-neutral pronoun often leads to grammatical and semantic translation errors. Similarly, gender neutrality is often not preserved. By surveying the opinions of affected native speakers from diverse languages, we provide recommendations to address the issue in future MT research.","tags":["NLP","pronouns","fairness","ethics"],"title":"What about ''em''? How Commercial Machine Translation Fails to Handle (Neo-)Pronouns","type":"publication"},{"authors":["Alba Curry","Amanda Cercas Curry"],"categories":[],"content":"","date":1685923200,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1685923200,"objectID":"d0eb857a6a57b8204d7de17a9a7d5e25","permalink":"https://milanlproc.github.io/publication/2023-computer-says-no/","publishdate":"2023-06-05T14:48:20+01:00","relpermalink":"/publication/2023-computer-says-no/","section":"publication","summary":"Emotions are an integral part of human cognition and they guide not only our understanding of the world but also our actions within it. As such, whether we soothe or flame an emotion is not inconsequential. Recent work in conversational AI has focused on responding empathetically to users, validating and soothing their emotions without a real basis. This AI-aided emotional regulation can have negative consequences for users and society, tending towards a one-noted happiness defined as only the absence of “negative” emotions. We argue that we must carefully consider whether and how to respond to users’ emotions.","tags":["Dialogue systems","Empathy","Anthropomorphism","Trust"],"title":"Computer says “No”: The Case Against Empathetic Conversational AI","type":"publication"},{"authors":["Tommaso Fornaciari","Luca Luceri","Emilio Ferrara","Dirk Hovy"],"categories":[],"content":"","date":1685923200,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1685923200,"objectID":"d77d8b1e10205c823e23d8f2019f4585","permalink":"https://milanlproc.github.io/publication/2023-leveraging-social-interactions-detect-misinformation-social-media/","publishdate":"2023-06-05T14:48:20+01:00","relpermalink":"/publication/2023-leveraging-social-interactions-detect-misinformation-social-media/","section":"publication","summary":"Detecting misinformation threads is crucial to guarantee a healthy environment on social media. We address the problem using the data set created during the COVID-19 pandemic. It contains cascades of tweets discussing information weakly labeled as reliable or unreliable, based on a previous evaluation of the information source. The models identifying unreliable threads usually rely on textual features. But reliability is not just what is said, but by whom and to whom. We additionally leverage network information. Following the homophily principle, we hypothesize that users who interact are generally interested in similar topics and spreading similar kinds of news, which in turn is generally reliable or not. We test several methods to learn representations of the social interactions within the cascades, combining them with deep neural language models in a Multi-Input (MI) framework. 
Keeping track of the sequence of interactions over time, we improve over previous state-of-the-art models.","tags":["misinformation","NLP","social media","networks"],"title":"Leveraging Social Interactions to Detect Misinformation on Social Media","type":"publication"},{"authors":["Davide Locatelli","Greta Damo","Debora Nozza"],"categories":[],"content":"","date":1683849600,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1683849600,"objectID":"51e9a40f92fa680cf8527ec7904a72b4","permalink":"https://milanlproc.github.io/publication/2023-cross-lingual-study-homotransphobia/","publishdate":"2023-05-12T14:48:20+01:00","relpermalink":"/publication/2023-cross-lingual-study-homotransphobia/","section":"publication","summary":"We present a cross-lingual study of homotransphobia on Twitter, examining the prevalence and forms of homotransphobic content in tweets related to LGBT issues in seven languages. Our findings reveal that homotransphobia is a global problem that takes on distinct cultural expressions, influenced by factors such as misinformation, cultural prejudices, and religious beliefs. To aid the detection of hate speech, we also devise a taxonomy that classifies public discourse around LGBT issues. By contributing to the growing body of research on online hate speech, our study provides valuable insights for creating effective strategies to combat homotransphobia on social media.","tags":["Hate Speech","NLP","multilingual"],"title":"A Cross-Lingual Study of Homotransphobia on Twitter","type":"publication"},{"authors":["Federico Bianchi","Pratyusha Kalluri","Esin Durmus","Faisal Ladhak","Myra Cheng","Debora Nozza","Tatsunori Hashimoto","Dan Jurafsky","James Zou","Aylin Caliskan"],"categories":[],"content":"","date":1683417600,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1683417600,"objectID":"7483eea9fc94959cf7db5c17264fa5e3","permalink":"https://milanlproc.github.io/publication/2023-text-to-image-stereotypes/","publishdate":"2023-05-07T14:48:20+01:00","relpermalink":"/publication/2023-text-to-image-stereotypes/","section":"publication","summary":"Machine learning models are now able to convert user-written text descriptions into naturalistic images. These models are available to anyone online and are being used to generate millions of images a day. We investigate these models and find that they amplify dangerous and complex stereotypes. Moreover, we find that the amplified stereotypes are difficult to predict and not easily mitigated by users or model owners. The extent to which these image-generation models perpetuate and amplify stereotypes and their mass deployment is cause for serious concern.","tags":["Vision","NLP","Bias","Fairness"],"title":"Easily Accessible Text-to-Image Generation Amplifies Demographic Stereotypes at Large Scale","type":"publication"},{"authors":["Sunipa Dev","Vinodkumar Prabhakaran","David Adelani","Dirk Hovy","Luciana Benotti"],"categories":[],"content":"","date":1683331200,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1683331200,"objectID":"9e1fac152996630e6fb17446c2ddd42a","permalink":"https://milanlproc.github.io/publication/2023-proceedings-c3nlp/","publishdate":"2023-05-06T14:48:20+01:00","relpermalink":"/publication/2023-proceedings-c3nlp/","section":"publication","summary":"Natural Language Processing has seen impressive gains in recent years.
This research has demonstrated that NLP models have turned into useful technologies with improved capabilities, measured in terms of how well they match human behavior captured in web-scale language data or through annotations. However, human behavior is inherently shaped by the cultural contexts humans are embedded in, the values and beliefs they hold, and the social practices they follow, part of which will be reflected in the data used to train NLP models, and the behavior these NLP models exhibit. This workshop will bring together NLP researchers invested in this work, along with a community of scholars with multi-disciplinary expertise spanning linguistics, social sciences, and cultural anthropology.","tags":["computational social science","NLP","culture"],"title":"Proceedings of the First Workshop on Cross-Cultural Considerations in NLP (C3NLP)","type":"publication"},{"authors":["Chia-chien Hung","Anne Lauscher","Dirk Hovy","Simone Paolo Ponzetto","Goran Glavaš"],"categories":[],"content":"","date":1682985600,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1682985600,"objectID":"3c029d51f2b8c8bba4e070b7cafee2cc","permalink":"https://milanlproc.github.io/publication/2023-can-demographic-factors-improve-text-classification/","publishdate":"2023-05-02T14:48:20+01:00","relpermalink":"/publication/2023-can-demographic-factors-improve-text-classification/","section":"publication","summary":"Demographic factors (e.g., gender or age) shape our language. Previous work showed that incorporating demographic factors can consistently improve performance for various NLP tasks with traditional NLP models. In this work, we investigate whether these previous findings still hold with state-of-the-art pretrained Transformer-based language models (PLMs). We use three common specialization methods proven effective for incorporating external knowledge into pretrained Transformers (e.g., domain-specific or geographic knowledge). We adapt the language representations for the demographic dimensions of gender and age, using continuous language modeling and dynamic multi-task learning for adaptation, where we couple language modeling objectives with the prediction of demographic classes. Our results, when employing a multilingual PLM, show substantial gains in task performance across four languages (English, German, French, and Danish), which is consistent with the results of previous work. However, controlling for confounding factors – primarily domain and language proficiency of Transformer-based PLMs – shows that downstream performance gains from our demographic adaptation do not actually stem from demographic knowledge. Our results indicate that demographic specialization of PLMs, while holding promise for positive societal impact, still represents an unsolved problem for (modern) NLP.","tags":["NLP","language models","demographics"],"title":"Can Demographic Factors Improve Text Classification?
Revisiting Demographic Adaptation in the Age of Transformers","type":"publication"},{"authors":["Giuseppe Attanasio","Eliana Pastor","Chiara Di Bonaventura","Debora Nozza"],"categories":[],"content":"","date":1682985600,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1682985600,"objectID":"32074e80da1ca4ed7fe78f30fa8cc384","permalink":"https://milanlproc.github.io/publication/2023-ferret-explainers-transformers/","publishdate":"2023-05-02T14:48:20+01:00","relpermalink":"/publication/2023-ferret-explainers-transformers/","section":"publication","summary":"As Transformers are increasingly relied upon to solve complex NLP problems, there is an increased need for their decisions to be humanly interpretable. While several explainable AI (XAI) techniques for interpreting the outputs of transformer-based models have been proposed, there is still a lack of easy access to using and comparing them. We introduce ferret, a Python library to simplify the use and comparisons of XAI methods on transformer-based classifiers. With ferret, users can visualize and compare transformer-based models' output explanations using state-of-the-art XAI methods on any free-text or existing XAI corpora. Moreover, users can also evaluate ad-hoc XAI metrics to select the most faithful and plausible explanations. To align with the recently consolidated process of sharing and using transformer-based models from Hugging Face, ferret interfaces directly with its Python library. In this paper, we showcase ferret to benchmark XAI methods used on transformers for sentiment analysis and hate speech detection. We show how specific methods provide consistently better explanations and are preferable in the context of transformer models.","tags":["BERT","NLP","interpretability"],"title":"ferret: a Framework for Benchmarking Explainers on Transformers","type":"publication"},{"authors":null,"categories":["NLP","hate speech"],"content":"Hate speech is one of the most central problems of online life, with real-life consequences: various hate crimes started as online hate. 1 in 4 users have been harassed online (Pew Research), 63% of the targets are women (Cox commission). The pandemic-related increase in online activity has only intensified this problem: over 500 million messages are sent each day. To address this problem, content providers and policymakers need automated assistance in spotting and addressing hateful comments. INDOMITA will provide those methods.\nBut hate speech is complex. What is considered offensive varies by social norms and user demographics. \u0026ldquo;Yo, a**hole!\u0026rdquo; is acceptable among friends, but problematic with strangers. But current hate speech detection only uses the words in a message to determine whether it is hate speech or not. It does not consider who says those words and to whom, potentially missing subtle forms of hate speech and mislabeling harmless interactions due to overreliance on keywords. This overly simplified approach is a significant limitation. Our user-based approach will address that.\nBut \u0026ldquo;better\u0026rdquo; detection is subjective: people have very different thresholds for what they find offensive. Current evaluation metrics do not allow for such nuance. Any tool that improves the overall detection rate will be judged sufficient. But a tool that works well for most users, yet fails for some groups, might still achieve good overall performance. It nevertheless fails at the task it was designed to do.
Our fairness metrics will correct this.\nBut detection alone does not solve the problem. Interventions like counterspeech or education have a lasting impact on abusive users. Sometimes it is enough to alert them to the hurtful nature of their message. At other times, they will only respond if someone they perceive as authoritative engages in a discussion. This decision requires an understanding of the abusive user\u0026rsquo;s social context. Our user-based counterspeech approach facilitates this.\nOur novel, user-centered approach will address hate speech in three ways:\n comprehensively modeling a complex issue to improve detection across input formats (text, images, and video), by incorporating socio-demographic context into the model. developing methods to automate counterspeech and to address abusive users effectively. developing evaluation metrics that assess fairness and performance and account for the subjective nature of hate speech. In sum, our user focus will revolutionize existing research on hate speech detection, both in Italian and other languages, to give authorities and media providers better ways to assess content for immediate countermeasures. It will allow us to bridge language differences more easily than purely text-based methods, as we capture socio-behavioral patterns that generalize across languages. It will generate revolutionary insights into the complex dynamics between online actors and the generation of online hate.\nINDOMITA is supported by a MUR FARE 2020 initiative under grant agreement Prot. R20YSMBZ8S.\n","date":1681344000,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1681344000,"objectID":"5af9db37d16b9da1095a77d2e9b2ea3b","permalink":"https://milanlproc.github.io/project/indomita/","publishdate":"2023-04-13T00:00:00Z","relpermalink":"/project/indomita/","section":"project","summary":"Innovative Demographically-aware Hate Speech Detection in Online Media in Italian","tags":["demographic","social media","NLP"],"title":"INDOMITA","type":"project"},{"authors":["Donya Rooein","Amanda Cercas Curry","Dirk Hovy"],"categories":[],"content":"","date":1681257600,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1681257600,"objectID":"d47bea93b523ae78237df41065e8b24f","permalink":"https://milanlproc.github.io/publication/2023-know-your-audience-education/","publishdate":"2023-04-12T17:19:53+01:00","relpermalink":"/publication/2023-know-your-audience-education/","section":"publication","summary":"Large language models (LLMs) offer a range of new possibilities, including adapting the text to different audiences and their reading needs. But how well do they adapt? We evaluate the readability of answers generated by four state-of-the-art LLMs (commercial and open-source) to science questions when prompted to target different age groups and education levels. To assess the adaptability of LLMs to diverse audiences, we compare the readability scores of the generated responses against the recommended comprehension level of each age and education group. We find large variations in the readability of the answers by different LLMs. Our results suggest LLM answers need to be better adapted to the intended audience demographics to be more comprehensible. They underline the importance of enhancing the adaptability of LLMs in education settings to cater to diverse age and education levels. Overall, current LLMs have set readability ranges and do not adapt well to different audiences, even when prompted.
That limits their potential for educational purposes.","tags":["Education","NLP","LLMs"],"title":"Know Your Audience: Do LLMs Adapt to Different Age and Education Levels?","type":"publication"},{"authors":["Rishav Hada","Amir Ebrahimi Fard","Sarah Shugars","Federico Bianchi","Patricia Rossini","Dirk Hovy","Rebekah Tromble","Nava Tintarev"],"categories":[],"content":"","date":1677456000,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1677456000,"objectID":"0dd3066b1e69e63f2f33c17401858cd2","permalink":"https://milanlproc.github.io/publication/2023-beyond-digital-echo-chambers-role-viewpoint-diversity-political-discussion/","publishdate":"2023-02-27T14:48:20+01:00","relpermalink":"/publication/2023-beyond-digital-echo-chambers-role-viewpoint-diversity-political-discussion/","section":"publication","summary":"Increasingly taking place in online spaces, modern political conversations are typically perceived to be unproductively affirming---siloed in so-called 'echo chambers' of exclusively like-minded discussants. Yet, to date we lack sufficient means to measure viewpoint diversity in conversations. To this end, in this paper, we operationalize two viewpoint metrics proposed for recommender systems and adapt them to the context of social media conversations. This is the first study to apply these two metrics (Representation and Fragmentation) to real world data and to consider the implications for online conversations specifically. We apply these measures to two topics---daylight savings time (DST), which serves as a control, and the more politically polarized topic of immigration. We find that the diversity scores for both Fragmentation and Representation are lower for immigration than for DST. Further, we find that while pro-immigrant views receive consistent pushback on the platform, anti-immigrant views largely operate within echo chambers. We observe less severe yet similar patterns for DST. Taken together, Representation and Fragmentation paint a meaningful and important new picture of viewpoint diversity.","tags":["NLP","computational social science","political science","echo chambers"],"title":"Beyond Digital 'Echo Chambers': The Role of Viewpoint Diversity in Political Discussion","type":"publication"},{"authors":null,"categories":["nlp"],"content":"Debora Nozza has recently been awarded a €1.5m ERC Starting Grant project 2023 for her project PERSONAE.\nPERSONAE will make language technology (LT) accessible and valuable to everyone. It will revolutionize research in subjective tasks in NLP such as abusive language detection and sentiment and emotion analysis by developing a new field called personal NLP, yielding new datasets, tasks, and algorithms. This new research area will explore subjective tasks from the perspective of the individual as information receiver, making users active actors in the creation of LTs instead of mere recipients. This will allow for a more tailored, effective approach to NLP model design, resulting in better models overall.\nEach person has their own interests and preferences based on their background and experience. These factors impact their views of what makes them happy, angry, or depressed over time. Language technologies (LTs) can consider individual preferences.
However, current research presumes a static view of subjectivity: that a single ground truth underlies subjective tasks such as abusive language detection, an assumption that ignores human variability and prevents universal access to LTs.\nLanguage-based AI such as virtual assistants is widely available. But despite significant scientific advances, most LT applications are inaccessible to individuals, and public opinion of them has become increasingly negative. GPT-3\u0026rsquo;s 2020 release boosted business-oriented applications such as copywriting and chatbots, but few applications that let people improve their lives—for example, by controlling what they see on social media. This gap becomes more pronounced for subjective tasks.\nPERSONAE will help design subjective LTs that can be adapted by individuals at will over time. Based on an ambitious meta approach able to generalize from existing, disconnected work, PERSONAE will rely on fully personalizable privacy-aware algorithms that can be used by anyone. It will reveal benefits of LT far beyond those of existing systems, paving the way for future applications.\n🌏🌏 Check out the web article on my project!\n🎙️🎙️ Check out my latest interview on Radio 24 in Italian!\n","date":1677456000,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1677456000,"objectID":"f4fd2ab1f739b9ebb14a4a53c0a9ae87","permalink":"https://milanlproc.github.io/project/personae/","publishdate":"2023-02-27T00:00:00Z","relpermalink":"/project/personae/","section":"project","summary":"Personalized and Subjective approaches to Natural Language Processing","tags":["hate speech","subjectivity","nlp"],"title":"PERSONAE","type":"project"},{"authors":["Federico Bianchi","Amanda Cercas Curry","Dirk Hovy"],"categories":[],"content":"","date":1673136000,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1673136000,"objectID":"c607865224483ec3caddadf624167af7","permalink":"https://milanlproc.github.io/publication/2023-ai-normal-accidents-waiting-happen/","publishdate":"2023-01-08T14:48:20+01:00","relpermalink":"/publication/2023-ai-normal-accidents-waiting-happen/","section":"publication","summary":"Artificial Intelligence (AI) is at a crucial point in its development: stable enough to be used in production systems, and increasingly pervasive in our lives. What does that mean for its safety? In his book Normal Accidents, the sociologist Charles Perrow proposed a framework to analyze new technologies and the risks they entail. He showed that major accidents are nearly unavoidable in complex systems with tightly coupled components if they are run long enough. In this essay, we apply and extend Perrow’s framework to AI to assess its potential risks. Today’s AI systems are already highly complex, and their complexity is steadily increasing. As they become more ubiquitous, different algorithms will interact directly, leading to tightly coupled systems whose capacity to cause harm we will be unable to predict.
We argue that under the current paradigm, Perrow’s normal accidents apply to AI systems and it is only a matter of time before one occurs.","tags":["AI","models","sociology"],"title":"Viewpoint: Artificial Intelligence Accidents Waiting to Happen?","type":"publication"},{"authors":["Federico Bianchi","Stefanie Hills","Patricia Rossini","Dirk Hovy","Rebekah Tromble","Nava Tintarev"],"categories":[],"content":"","date":1670803200,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1670803200,"objectID":"281aa24403feb3bfd4b39889e2df47e3","permalink":"https://milanlproc.github.io/publication/2022-not_just_hate/","publishdate":"2022-12-12T14:48:20+01:00","relpermalink":"/publication/2022-not_just_hate/","section":"publication","summary":"Well-annotated data is a prerequisite for good Natural Language Processing models. Too often, though, annotation decisions are governed by optimizing time or annotator agreement. We make a case for nuanced efforts in an interdisciplinary setting for annotating offensive online speech. Detecting offensive content is rapidly becoming one of the most important real-world NLP tasks. However, most datasets use a single binary label, e.g., for hate or incivility, even though each concept is multi-faceted. This modeling choice severely limits nuanced insights, but also performance. We show that a more fine-grained multi-label approach to predicting incivility and hateful or intolerant content addresses both conceptual and performance issues. We release a novel dataset of over 40,000 tweets about immigration from the US and UK, annotated with six labels for different aspects of incivility and intolerance. Our dataset not only allows for a more nuanced understanding of harmful speech online; models trained on it also outperform or match performance on benchmark datasets.","tags":["Hate Speech","NLP","dataset"],"title":"It's Not Just Hate: A Multi-Dimensional Perspective on Detecting Harmful Speech Online","type":"publication"},{"authors":["Federico Bianchi","Vincenzo Cutrona","Dirk Hovy"],"categories":[],"content":"","date":1670803200,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1670803200,"objectID":"3ac3335b591cf77ff6390c5bf2664375","permalink":"https://milanlproc.github.io/publication/2022-twitter_demographer/","publishdate":"2022-12-12T14:48:20+01:00","relpermalink":"/publication/2022-twitter_demographer/","section":"publication","summary":"Twitter data have become essential to Natural Language Processing (NLP) and social science research, driving various scientific discoveries in recent years. However, the textual data alone are often not enough to conduct studies: in particular, social scientists need more variables to perform their analysis and control for various factors. How we augment this information, such as users’ location, age, or tweet sentiment, has ramifications for anonymity and reproducibility, and requires dedicated effort. This paper describes Twitter-Demographer, a simple, flow-based tool to enrich Twitter data with additional information about tweets and users. The tool is aimed at NLP practitioners, psycho-linguists, and (computational) social scientists who want to enrich their datasets with aggregated information, facilitating reproducibility, and providing algorithmic privacy-by-design measures for pseudo-anonymity. We discuss our design choices, inspired by the flow-based programming paradigm, to use black-box components that can easily be chained together and extended.
We also analyze the ethical issues related to the use of this tool, and the built-in measures to facilitate pseudo-anonymity.","tags":["Social Media","NLP","dataset","Twitter"],"title":"Twitter-Demographer: A Flow-based Tool to Enrich Twitter Data","type":"publication"},{"authors":["Marius Hessenthaler","Emma Strubell","Dirk Hovy","Anne Lauscher"],"categories":[],"content":"","date":1670630400,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1670630400,"objectID":"c8e28f24b48da4ce4375ff8698f4155c","permalink":"https://milanlproc.github.io/publication/2022-bridging_fairness_and_environmental_sustainability_in_natural_language_processing/","publishdate":"2022-12-12T14:48:20+01:00","relpermalink":"/publication/2022-bridging_fairness_and_environmental_sustainability_in_natural_language_processing/","section":"publication","summary":"Fairness and environmental impact are important research directions for the sustainable development of artificial intelligence. However, while each topic is an active research area in natural language processing (NLP), there is a surprising lack of research on the interplay between the two fields. This lacuna is highly problematic, since there is increasing evidence that an exclusive focus on fairness can actually hinder environmental sustainability, and vice versa. In this work, we shed light on this crucial intersection in NLP by (1) investigating the efficiency of current fairness approaches through surveying example methods for reducing unfair stereotypical bias from the literature, and (2) evaluating a common technique to reduce energy consumption (and thus environmental impact) of English NLP models, knowledge distillation (KD), for its impact on fairness. In this case study, we evaluate the effect of important KD factors, including layer and dimensionality reduction, with respect to: (a) performance on the distillation task (natural language inference and semantic similarity prediction), and (b) multiple measures and dimensions of stereotypical bias (e.g., gender bias measured via the Word Embedding Association Test). Our results lead us to clarify current assumptions regarding the effect of KD on unfair bias: contrary to other findings, we show that KD can actually decrease model fairness.","tags":["NLP","fairness","sustainability"],"title":"Bridging Fairness and Environmental Sustainability in Natural Language Processing","type":"publication"},{"authors":["Anne Lauscher","Federico Bianchi","Samuel R. Bowman","Dirk Hovy"],"categories":[],"content":"","date":1670630400,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1670630400,"objectID":"d3d511c9d6fd6b178e0bf4de6bc057c3","permalink":"https://milanlproc.github.io/publication/2022-socioprobe_what_when_where_language_models_learn_about_sociodemographics/","publishdate":"2022-12-10T14:48:20+01:00","relpermalink":"/publication/2022-socioprobe_what_when_where_language_models_learn_about_sociodemographics/","section":"publication","summary":"Pre-trained language models (PLMs) have outperformed other NLP models on a wide range of tasks. Aiming for a more thorough understanding of their capabilities and inner workings, researchers have established the extent to which they capture lower-level knowledge like grammaticality, and mid-level semantic knowledge like factual understanding. However, there is still little understanding of their knowledge of higher-level aspects of language.
In particular, despite the importance of sociodemographic aspects in shaping our language, the questions of whether, where, and how PLMs encode these aspects, e.g., gender or age, are still unexplored. We address this research gap by probing the sociodemographic knowledge of different single-GPU PLMs on multiple English data sets via traditional classifier probing and information-theoretic minimum description length probing. Our results show that PLMs do encode these sociodemographics, and that this knowledge is sometimes spread across the layers of some of the tested PLMs. We further conduct a multilingual analysis and investigate the effect of supplementary training to further explore to what extent, where, and with what amount of pre-training data the knowledge is encoded. Our overall results indicate that sociodemographic knowledge is still a major challenge for NLP. PLMs require large amounts of pre-training data to acquire this knowledge, and models that excel at general language understanding do not seem to possess more knowledge about these aspects.","tags":["NLP","sociodemographics","transformers","language models"],"title":"SocioProbe: What, When, and Where Language Models Learn about Sociodemographics","type":"publication"},{"authors":["Samia Touileb","Debora Nozza"],"categories":[],"content":"","date":1670544000,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1670544000,"objectID":"5d92f30200b04a92ce9594e899951021","permalink":"https://milanlproc.github.io/publication/2022-honest-harmful-scandinavian-language-model/","publishdate":"2022-12-09T14:48:20+01:00","relpermalink":"/publication/2022-honest-harmful-scandinavian-language-model/","section":"publication","summary":"Scandinavian countries are perceived as role models when it comes to gender equality. With the advent of pre-trained language models and their widespread usage, we investigate to what extent gender-based harmful and toxic content exists in selected Scandinavian language models. We examine nine models, covering Danish, Swedish, and Norwegian, by manually creating template-based sentences and probing the models for completion. We evaluate the completions using two methods for measuring harmful and toxic completions and provide a thorough analysis of the results. We show that Scandinavian pre-trained language models contain harmful and gender-based stereotypes with similar values across all languages. This finding goes against the general expectations related to gender equality in Scandinavian countries and shows the possible problematic outcomes of using such models in real-world settings.","tags":["Hate Speech","BERT","NLP","dataset","multilingual"],"title":"Measuring Harmful Representations in Scandinavian Language Models","type":"publication"},{"authors":["Paul Röttger","Debora Nozza","Federico Bianchi","Dirk Hovy"],"categories":[],"content":"","date":1666224000,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1666224000,"objectID":"a2ca0d19f38470541be45569c6948729","permalink":"https://milanlproc.github.io/publication/2022-strategies-hate-speech-detection-under-resourced-languages/","publishdate":"2022-10-20T14:48:20+01:00","relpermalink":"/publication/2022-strategies-hate-speech-detection-under-resourced-languages/","section":"publication","summary":"Hate speech is a global phenomenon, but most hate speech datasets so far focus on English-language content.
This hinders the development of more effective hate speech detection models in hundreds of languages spoken by billions across the world. More data is needed, but annotating hateful content is expensive, time-consuming and potentially harmful to annotators. To mitigate these issues, we explore data-efficient strategies for expanding hate speech detection into under-resourced languages. In a series of experiments with mono- and multilingual models across five non-English languages, we find that 1) a small amount of target-language fine-tuning data is needed to achieve strong performance, 2) the benefits of using more such data decrease exponentially, and 3) initial fine-tuning on readily-available English data can partially substitute target-language data and improve model generalisability. Based on these findings, we formulate actionable recommendations for hate speech detection in low-resource language settings.","tags":["Hate Speech","NLP","multilingual"],"title":"Data-Efficient Strategies for Expanding Hate Speech Detection into Under-Resourced Languages","type":"publication"},{"authors":["Giuseppe Attanasio","Debora Nozza","Federico Bianchi","Dirk Hovy"],"categories":[],"content":"","date":1665619200,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1665619200,"objectID":"bcb8bb12e133a3d8b26d9091bc35a703","permalink":"https://milanlproc.github.io/publication/2022-limitation-diachronic-continuous-training/","publishdate":"2022-10-13T14:48:20+01:00","relpermalink":"/publication/2022-limitation-diachronic-continuous-training/","section":"publication","summary":"Language is constantly changing and evolving, leaving language models to quickly become outdated, both factually and linguistically. Recent research proposes we continuously update our models using new data. Continuous training allows us to teach language models about new events and facts and changing norms. However, continuous training also means continuous costs. We show there is currently limited evidence for the benefits of continuous training, be it for the actual downstream performance or the environmental cost. Our results show continuous training does not significantly improve performance. While it is clear that, sooner or later, our language models need to be updated, it is unclear when this effort is worth the cost. We call for a critical reflection about when and how to use continuous training and for more benchmarks to support this research direction.","tags":["NLP","BERT"],"title":"Is It Worth the (Environmental) Cost? Limited Evidence for the Benefits of Diachronic Continuous Training","type":"publication"},{"authors":["Anne Lauscher","Archie Crowley","Dirk Hovy"],"categories":[],"content":"","date":1665532800,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1665532800,"objectID":"a7cbc519326b6c1f4579513c619bb3ac","permalink":"https://milanlproc.github.io/publication/2022-welcome_modern_world_pronouns_identity-inclusive_natural_language_processing_beyond_gender/","publishdate":"2022-10-12T14:48:20+01:00","relpermalink":"/publication/2022-welcome_modern_world_pronouns_identity-inclusive_natural_language_processing_beyond_gender/","section":"publication","summary":"The world of pronouns is changing – from a closed word class with few members to an open set of terms to reflect identities. 
However, Natural Language Processing (NLP) barely reflects this linguistic shift, resulting in the possible exclusion of non-binary users, even though recent work outlined the harms of gender-exclusive language technology. The current modeling of 3rd person pronouns is particularly problematic. It largely ignores various phenomena like neopronouns, i.e., novel pronoun sets that are not (yet) widely established. This omission contributes to the discrimination of marginalized and underrepresented groups, e.g., non-binary individuals. It thus prevents gender equality, one of the UN’s sustainable development goals (goal 5). Further, other identity-expressions beyond gender are ignored by current NLP technology. This paper provides an overview of 3rd person pronoun issues for NLP. Based on our observations and ethical considerations, we define a series of five desiderata for modeling pronouns in language technology, which we validate through a survey. We evaluate existing and novel modeling approaches w.r.t. these desiderata qualitatively and quantify the impact of a more discrimination-free approach on an established benchmark dataset.","tags":["NLP","pronouns","fairness","ethics"],"title":"Welcome to the Modern World of Pronouns: Identity-Inclusive Natural Language Processing beyond Gender","type":"publication"},{"authors":["A. Stevie Bergman","Gavin Abercrombie","Shannon Spruit","Dirk Hovy","Emily Dinan","Y-Lan Boureau","Verena Rieser"],"categories":[],"content":"","date":1662940800,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1662940800,"objectID":"3acc9d4e5de91b3e10aeb9ecb187107d","permalink":"https://milanlproc.github.io/publication/2022-guiding_release_safer_e2e_conversational_ai_through_value_sensitive_design/","publishdate":"2022-09-12T14:48:20+01:00","relpermalink":"/publication/2022-guiding_release_safer_e2e_conversational_ai_through_value_sensitive_design/","section":"publication","summary":"Over the last several years, end-to-end neural conversational agents have vastly improved their ability to carry unrestricted, open-domain conversations with humans. However, these models are often trained on large datasets from the Internet and, as a result, may learn undesirable behaviours from this data, such as toxic or otherwise harmful language. Thus, researchers must wrestle with how and when to release these models. In this paper, we survey recent and related work to highlight tensions between values, potential positive impact, and potential harms. We also provide a framework to support practitioners in deciding whether and how to release these models, following the tenets of value-sensitive design.","tags":["NLP","NLG","fairness","ethics","value sensitive design","chatbots"],"title":"Guiding the Release of Safer E2E Conversational AI through Value Sensitive Design","type":"publication"},{"authors":["Debora Nozza","Federico Bianchi","Giuseppe Attanasio"],"categories":[],"content":"","date":1657584000,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1657584000,"objectID":"e1aa310a2b39f2b3d80eb484d6acf1be","permalink":"https://milanlproc.github.io/publication/2022-hate-speech-detection-italian-social-media/","publishdate":"2022-07-12T14:48:20+01:00","relpermalink":"/publication/2022-hate-speech-detection-italian-social-media/","section":"publication","summary":"Online hate speech is a dangerous phenomenon that can (and should) be promptly counteracted properly. 
While Natural Language Processing supplies appropriate algorithms for trying to reach this objective, all research efforts are directed toward the English language. This strongly limits the classification power on non-English languages. In this paper, we test several learning frameworks for identifying hate speech in Italian text. We release HATE-ITA, a multi-language model trained on a large set of English data and available Italian datasets. HATE-ITA performs better than mono-lingual models and seems to adapt well also to language-specific slurs. We hope our findings will encourage research in other mid-to-low-resource communities and provide a valuable benchmarking tool for the Italian community.","tags":["Hate Speech","BERT","NLP"],"title":"HATE-ITA: Hate Speech Detection in Italian Social Media Text","type":"publication"},{"authors":["Paul Röttger","Haitham Seelawi","Debora Nozza","Zeerak Talat","Bertie Vidgen"],"categories":[],"content":"","date":1657584000,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1657584000,"objectID":"44e6c8311312a4c0b80faea18522c363","permalink":"https://milanlproc.github.io/publication/2022-multilingual-hatecheck-hate-speech-functional-tests/","publishdate":"2022-07-12T14:48:20+01:00","relpermalink":"/publication/2022-multilingual-hatecheck-hate-speech-functional-tests/","section":"publication","summary":"Hate speech detection models are typically evaluated on held-out test sets. However, this risks painting an incomplete and potentially misleading picture of model performance because of increasingly well-documented systematic gaps and biases in hate speech datasets. To enable more targeted diagnostic insights, recent research has thus introduced functional tests for hate speech detection models. However, these tests currently only exist for English-language content, which means that they cannot support the development of more effective models in other languages spoken by billions across the world. To help address this issue, we introduce Multilingual HateCheck (MHC), a suite of functional tests for multilingual hate speech detection models. MHC covers 34 functionalities across ten languages, which is more languages than any other hate speech dataset. To illustrate MHC’s utility, we train and test a high-performing multilingual hate speech detection model, and reveal critical model weaknesses for monolingual and cross-lingual applications.","tags":["Hate Speech","BERT","NLP"],"title":"Multilingual HateCheck: Functional Tests for Multilingual Hate Speech Detection Models","type":"publication"},{"authors":["Tommaso Fornaciari","Alexandra Uma","Massimo Poesio","Dirk Hovy"],"categories":[],"content":"","date":1652313600,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1652313600,"objectID":"9827bbbeabf9916ea04af9f70ca7d9ed","permalink":"https://milanlproc.github.io/publication/2022-hard_soft_evaluation_nlp_models_bootstrap_sampling-boostsa/","publishdate":"2022-05-12T14:48:20+01:00","relpermalink":"/publication/2022-hard_soft_evaluation_nlp_models_bootstrap_sampling-boostsa/","section":"publication","summary":"The applied nature of Natural Language Processing (NLP) makes it necessary to select the most effective and robust models. Producing slightly higher performance is insufficient; we want to know whether this advantage will carry over to other data sets. Bootstrapped significance tests can indicate that ability. So, while necessary, computing the significance of models’ performance differences has many levels of complexity.
It can be tedious, especially when the experimental design has many conditions to compare and several runs of experiments. We present BooStSa, a tool that makes it easy to compute significance levels with the BOOtSTrap SAmpling procedure to evaluate models that predict not only standard hard labels but also soft labels (i.e., probability distributions over different classes).","tags":["NLP","bootstrap sampling","stats","p-value"],"title":"Hard and Soft Evaluation of NLP models with BOOtSTrap SAmpling - BooStSa","type":"publication"},{"authors":["Giuseppe Attanasio","Debora Nozza","Federico Bianchi"],"categories":[],"content":"","date":1651276800,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1651276800,"objectID":"8358f2dc7dc8713df3a0102aca07e999","permalink":"https://milanlproc.github.io/publication/2022-semeval-mami-perceiverio-misogyny-multimodal-meme/","publishdate":"2022-04-30T14:48:20+01:00","relpermalink":"/publication/2022-semeval-mami-perceiverio-misogyny-multimodal-meme/","section":"publication","summary":"In this paper, we describe the system proposed by the MilaNLP team for the Multimedia Automatic Misogyny Identification (MAMI) challenge. We use Perceiver IO as a multimodal late fusion over unimodal streams to address both sub-tasks A and B. We build unimodal embeddings using Vision Transformer (image) and RoBERTa (text transcript). We enrich the input representation using face and demographic recognition, image captioning, and detection of adult content and web entities. To the best of our knowledge, this work is the first to use Perceiver IO combining text and image modalities. The proposed approach outperforms unimodal and multimodal baselines.","tags":["Misogyny","Meme","Multimodal","PerceiverIO","Architectures"],"title":"MilaNLP at SemEval-2022 Task 5: Using Perceiver IO for Detecting Misogynous Memes with Text and Image Modalities","type":"publication"},{"authors":["Federico Bianchi","Debora Nozza","Dirk Hovy"],"categories":[],"content":"","date":1650499200,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1650499200,"objectID":"100083027ef95cdeecf37cf18bd6bf97","permalink":"https://milanlproc.github.io/publication/2022-language-invariant-properties-nlp/","publishdate":"2022-04-12T14:48:20+01:00","relpermalink":"/publication/2022-language-invariant-properties-nlp/","section":"publication","summary":"Meaning is context-dependent, but many properties of language (should) remain the same even if we transform the context. For example, sentiment, entailment, or speaker properties should be the same in a translation and original of a text. We introduce language invariant properties: i.e., properties that should not change when we transform text, and how they can be used to quantitatively evaluate the robustness of transformation algorithms. We use translation and paraphrasing as transformation examples, but our findings apply more broadly to any transformation. Our results indicate that many NLP transformations change properties like author characteristics, i.e., make them sound more male. We believe that studying these properties will allow NLP to address both social factors and pragmatic aspects of language.
We also release an application suite that can be used to evaluate the invariance of transformation applications.","tags":["NLP","Language Invariant Properties","Meaning"],"title":"Language Invariant Properties in Natural Language Processing","type":"publication"},{"authors":null,"categories":["NLP","computational social science"],"content":"Over the last decade, discontent in democracy, mistrust in institutions, and the rise of populist parties have strained European societies. Underlying these tensions are often increasing inequalities in Western countries, which fuel the discontent of individuals. The Covid pandemic further exacerbated these problems, as anti-Covid measures taken by governments differently impacted societal groups.\nThe MENTALISM project, funded by Fondazione Cariplo under grant agreement 2022-1480, combines modern social media analysis with traditional survey data to track inequality across Italy through the lens of the pandemic.\nOur ground-breaking mixed-methods approach uses machine learning and text analysis to trace online grievances in a vast corpus of social media data. We combine these methods with survey protocols and econometric analysis to validate the findings and provide actionable policy advice. MENTALISM combines the advantages of social media data (high-frequency, individual-level information) with the strength of socio-economic surveys (representativeness). Our novel interdisciplinary approach will critically evaluate the value of social media monitoring for policy feedback. Moreover, it will establish protocols for policymakers to better respond to growing grievances brought on by inequality at various steps in the process.\nThis interdisciplinary project is led by Profs. Carlo Schwarz (economics), and Dirk Hovy (NLP).\n","date":1649808000,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1649808000,"objectID":"f1efa90206391b8dcd0df63edbc66767","permalink":"https://milanlproc.github.io/project/mentalism/","publishdate":"2022-04-13T00:00:00Z","relpermalink":"/project/mentalism/","section":"project","summary":"Measuring, Tracking, and Analyzing Inequality using Social Media","tags":["demographic","inequality","economics","social media","NLP"],"title":"MENTALISM","type":"project"},{"authors":["Giuseppe Attanasio","Debora Nozza","Eliana Pastor","Dirk Hovy"],"categories":[],"content":"","date":1649721600,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1649721600,"objectID":"1bc8b1ca2c825938f3fb71def878b8a1","permalink":"https://milanlproc.github.io/publication/2022-interpretability-transformer-mysogyny-detection/","publishdate":"2022-04-12T14:48:20+01:00","relpermalink":"/publication/2022-interpretability-transformer-mysogyny-detection/","section":"publication","summary":"Transformer-based Natural Language Processing models have become the standard for hate speech detection. However, the unconscious use of these techniques for such a critical task comes with negative consequences. Various works have demonstrated that hate speech classifiers are biased. These findings have prompted efforts to explain classifiers, mainly using attribution methods. In this paper, **we provide the first benchmark study of interpretability approaches for hate speech detection**. We cover four post-hoc token attribution approaches to explain the predictions of Transformer-based misogyny classifiers in English and Italian. Further, we compare generated attributions to attention analysis. 
We find that only two algorithms provide faithful explanations aligned with human expectations. Gradient-based methods and attention, however, show inconsistent outputs, making their value for explanations questionable for hate speech detection tasks.","tags":["Hate Speech","BERT","NLP"],"title":"Benchmarking Post-Hoc Interpretability Approaches for Transformer-based Misogyny Detection","type":"publication"},{"authors":["Debora Nozza","Federico Bianchi","Anne Lauscher","Dirk Hovy"],"categories":[],"content":"","date":1649721600,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1649721600,"objectID":"9088977c974224d138af8fd8ed78b98e","permalink":"https://milanlproc.github.io/publication/2022-honest-hurtful-language-model-lgbtqia+/","publishdate":"2022-04-12T14:48:20+01:00","relpermalink":"/publication/2022-honest-hurtful-language-model-lgbtqia+/","section":"publication","summary":"Current language technology is ubiquitous and directly influences individuals' lives worldwide. Given the recent trend in AI on training and constantly releasing new and powerful large language models (LLMs), there is a need to assess their biases and potential concrete consequences. While some studies have highlighted the shortcomings of these models, there is only little on the negative impact of LLMs on LGBTQIA+ individuals. In this paper, we investigated a state-of-the-art template-based approach for measuring the harmfulness of English LLMs sentence completion when the subjects belong to the LGBTQIA+ community. Our findings show that, on average, **the most likely LLM-generated completion is an identity attack 13% of the time**. Our results raise serious concerns about the applicability of these models in production environments.","tags":["Hate Speech","BERT","NLP"],"title":"Measuring Harmful Sentence Completion in Language Models for LGBTQIA+ Individuals","type":"publication"},{"authors":["Debora Nozza","Federico Bianchi","Dirk Hovy"],"categories":[],"content":"","date":1649721600,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1649721600,"objectID":"caa2e3beea8b00d14fdcf83df519c588","permalink":"https://milanlproc.github.io/publication/2022-pipelines-social-bias-testing-language-models/","publishdate":"2022-04-12T14:48:20+01:00","relpermalink":"/publication/2022-pipelines-social-bias-testing-language-models/","section":"publication","summary":"The maturity level of language models is now at a stage in which many companies rely on them to solve various tasks. However, while research has shown how biased and harmful these models are, **systematic ways of integrating social bias tests into development pipelines are still lacking. This short paper suggests how to use these verification techniques in development pipelines.** We take inspiration from software testing and suggest addressing social bias evaluation as software testing. We hope to open a discussion on the best methodologies to handle social bias testing in language models.","tags":["Hate Speech","BERT","NLP"],"title":"Pipelines for Social Bias Testing of Large Language Models","type":"publication"},{"authors":["Paul Röttger","Bertie Vidgen","Dirk Hovy","Janet B. 
Pierrehumbert"],"categories":[],"content":"","date":1649721600,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1649721600,"objectID":"805f462ec62d4458e8694ccd8b6293f5","permalink":"https://milanlproc.github.io/publication/2022-two-contrasting-data-annotation-paradigms-subjective-nlp-tasks/","publishdate":"2022-04-12T14:48:20+01:00","relpermalink":"/publication/2022-two-contrasting-data-annotation-paradigms-subjective-nlp-tasks/","section":"publication","summary":"Labelled data is the foundation of most natural language processing tasks. However, labelling data is difficult and there often are diverse valid beliefs about what the correct data labels should be. So far, dataset creators have acknowledged annotator subjectivity, but rarely actively managed it in the annotation process. This has led to partly-subjective datasets that fail to serve a clear downstream use. To address this issue, we propose two contrasting paradigms for data annotation. The descriptive paradigm encourages annotator subjectivity, whereas the prescriptive paradigm discourages it. Descriptive annotation allows for the surveying and modelling of different beliefs, whereas prescriptive annotation enables the training of models that consistently apply one belief. We discuss benefits and challenges in implementing both paradigms, and argue that dataset creators should explicitly aim for one or the other to facilitate the intended use of their dataset. Lastly, we conduct an annotation experiment using hate speech data that illustrates the contrast between the two paradigms.","tags":["Annotation","NLP","dataset"],"title":"Two Contrasting Data Annotation Paradigms for Subjective NLP Tasks","type":"publication"},{"authors":["Federico Bianchi","Debora Nozza","Dirk Hovy"],"categories":[],"content":"","date":1649721600,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1649721600,"objectID":"c798f81abd0bcd706c51aa0ea04a2d12","permalink":"https://milanlproc.github.io/publication/2022-xlmemo-multilingual-emotion-prediction/","publishdate":"2022-04-12T14:48:20+01:00","relpermalink":"/publication/2022-xlmemo-multilingual-emotion-prediction/","section":"publication","summary":"Detecting emotion in text allows social and computational scientists to study how people behave and react to online events. However, developing these tools for different languages requires data that is not always available. This paper collects the available emotion detection datasets across 19 languages. We train a multilingual emotion prediction model for social media data, XLM-EMO. The model shows competitive performance in a zero-shot setting, suggesting it is helpful in the context of low-resource languages. 
We release our model to the community so that interested researchers can directly use it.","tags":["Sentiment Analysis","Emotion Detection","Italian","BERT","NLP","dataset","multilingual"],"title":"XLM-EMO: Multilingual Emotion Prediction in Social Media Text","type":"publication"},{"authors":["Chia-Chien Hung, Anne Lauscher, Simone Paolo Ponzetto, Goran Glavaš"],"categories":[],"content":"","date":1649376000,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1649376000,"objectID":"7c36894900b37eb709e17f232c12092e","permalink":"https://milanlproc.github.io/publication/2022-acl-findings-dstod/","publishdate":"2022-04-07T11:48:20+01:00","relpermalink":"/publication/2022-acl-findings-dstod/","section":"publication","summary":" Recent work has shown that self-supervised dialog-specific pretraining on large conversational datasets yields substantial gains over traditional language modeling (LM) pretraining in downstream task-oriented dialog (TOD). These approaches, however, exploit general dialogic corpora (e.g., Reddit) and thus presumably fail to reliably embed domain-specific knowledge useful for concrete downstream TOD domains. In this work, we investigate the effects of domain specialization of pretrained language models (PLMs) for task-oriented dialog. Within our DS-TOD framework, we first automatically extract salient domain-specific terms, and then use them to construct DomainCC and DomainReddit -- resources that we leverage for domain-specific pretraining, based on (i) masked language modeling (MLM) and (ii) response selection (RS) objectives, respectively. We further propose a resource-efficient and modular domain specialization by means of domain adapters -- additional parameter-light layers in which we encode the domain knowledge. Our experiments with two prominent TOD tasks -- dialog state tracking (DST) and response retrieval (RR) -- encompassing five domains from the MultiWOZ TOD benchmark demonstrate the effectiveness of our domain specialization approach. Moreover, we show that the light-weight adapter-based specialization (1) performs comparably to full fine-tuning in single-domain setups and (2) is particularly suitable for multi-domain specialization, in which, besides advantageous computational footprint, it can offer better downstream performance. ","tags":["Domain Specialization","Conversational AI","NLP"],"title":"DS-TOD: Efficient Domain Specialization for Task Oriented Dialog","type":"publication"},{"authors":["Carolin Holtermann, Anne Lauscher, Simone Paolo Ponzetto"],"categories":[],"content":"","date":1649376000,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1649376000,"objectID":"a85dc7b1f28949d457eb563059b0d86a","permalink":"https://milanlproc.github.io/publication/2022-acl-argumentation/","publishdate":"2022-04-07T11:48:20+01:00","relpermalink":"/publication/2022-acl-argumentation/","section":"publication","summary":" Although much work in NLP has focused on measuring and mitigating stereotypical bias in semantic spaces, research addressing bias in computational argumentation is still in its infancy. In this paper, we address this research gap and conduct a thorough investigation of bias in argumentative language models. To this end, we introduce ABBA, a novel resource for bias measurement specifically tailored to argumentation. 
We employ our resource to assess the effect of argumentative fine-tuning and debiasing on the intrinsic bias found in transformer-based language models using a lightweight adapter-based approach that is more sustainable and parameter-efficient than full fine-tuning. Finally, we analyze the potential impact of language model debiasing on the performance in argument quality prediction, a downstream task of computational argumentation. Our results show that we are able to successfully and sustainably remove bias in general and argumentative language models while preserving (and sometimes improving) model performance in downstream tasks. We make all experimental code and data available at https://github.com/umanlp/FairArgumentativeLM.","tags":["Fairness","Computational Argumentation","NLP"],"title":"Fair and Argumentative Language Modeling for Computational Argumentation","type":"publication"},{"authors":["Giuseppe Attanasio","Debora Nozza","Dirk Hovy","Elena Baralis"],"categories":[],"content":"","date":1647216000,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1647216000,"objectID":"b9f7adfa2c36e4d8da97d41a76dcdbf8","permalink":"https://milanlproc.github.io/publication/2022-entropy-attention-regularization-bias/","publishdate":"2022-03-17T14:48:20+01:00","relpermalink":"/publication/2022-entropy-attention-regularization-bias/","section":"publication","summary":"Natural Language Processing (NLP) models risk overfitting to specific terms in the training data, thereby reducing their performance, fairness, and generalizability. E.g., neural hate speech detection models are strongly influenced by identity terms like gay, or women, resulting in false positives, severe unintended bias, and lower performance. Most mitigation techniques use lists of identity terms or samples from the target domain during training. However, this approach requires a-priori knowledge and introduces further bias if important terms are neglected. Instead, we propose a knowledge-free Entropy-based Attention Regularization (EAR) to discourage overfitting to training-specific terms. An additional objective function penalizes tokens with low self-attention entropy. We fine-tune BERT via EAR: the resulting model matches or exceeds state-of-the-art performance for hate speech classification and bias metrics on three benchmark corpora in English and Italian. EAR also reveals overfitting terms, i.e., terms most likely to induce bias, to help identify their effect on the model, task, and predictions.","tags":["Hate Speech","Bias","Entropy","Attention","Regularization","NLP"],"title":"Entropy-based Attention Regularization Frees Unintended Bias Mitigation from Lists","type":"publication"},{"authors":["Emily Dinan","Gavin Abercrombie","A. Stevie Bergman","Shannon Spruit","Dirk Hovy","Y-Lan Boureau","Verena Rieser"],"categories":[],"content":"","date":1647216000,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1647216000,"objectID":"75ad76dd011e594c70e4a27a81805308","permalink":"https://milanlproc.github.io/publication/2022-safetykit-first-aid-measuring-safety-open-domain-conversational-systems/","publishdate":"2022-03-17T14:48:20+01:00","relpermalink":"/publication/2022-safetykit-first-aid-measuring-safety-open-domain-conversational-systems/","section":"publication","summary":"The social impact of natural language processing and its applications has received increasing attention. In this position paper, we focus on the problem of safety for end-to-end conversational AI. 
We survey the problem landscape therein, introducing a taxonomy of three observed phenomena: the Instigator, Yea-Sayer, and Impostor effects. We then empirically assess the extent to which current tools can measure these effects and current systems display them. We release these tools as part of a ``first aid kit'' (SAFETYKIT) to quickly assess apparent safety concerns. Our results show that, while current tools are able to provide an estimate of the relative safety of systems in various settings, they still have several shortcomings. We suggest several future directions and discuss ethical considerations.","tags":["dialog","Bias","conversational AI","NLG","NLP"],"title":"SAFETYKIT: First Aid for Measuring Safety in Open-domain Conversational Systems","type":"publication"},{"authors":["Dirk Hovy"],"categories":[],"content":"","date":1642291200,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1642291200,"objectID":"eb70cdde35ea1df981863c7b83b5fa20","permalink":"https://milanlproc.github.io/publication/2022_nlpss2/","publishdate":"2020-02-29T14:48:20+01:00","relpermalink":"/publication/2022_nlpss2/","section":"publication","summary":"Text contains a wealth of information about a wide variety of sociocultural constructs. Automated prediction methods can infer these quantities (sentiment analysis is probably the most well-known application). However, there is virtually no limit to the kinds of things we can predict from text: power, trust, and misogyny are all signaled in language. These algorithms easily scale to corpus sizes infeasible for manual analysis. Prediction algorithms have become steadily more powerful, especially with the advent of neural network methods. However, applying these techniques usually requires profound programming knowledge and machine learning expertise. As a result, many social scientists do not apply them. This Element provides the working social scientist with an overview of the most common methods for text classification, an intuition of their applicability, and Python code to execute them. It covers both the ethical foundations of such work as well as the emerging potential of neural network methods.","tags":["text analysis","social science","NLP","Python","classification"],"title":"Text Analysis in Python for Social Scientists – Prediction and Classification","type":"publication"},{"authors":["Alexandra N Uma","Tommaso Fornaciari","Dirk Hovy","Silviu Paun","Barbara Plank","Massimo Poesio"],"categories":[],"content":"","date":1640563200,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1640563200,"objectID":"89043ca549ca8b57f7d851ddb9c709fd","permalink":"https://milanlproc.github.io/publication/2021-learning_from_disagreement_survey/","publishdate":"2021-12-27T14:48:20+01:00","relpermalink":"/publication/2021-learning_from_disagreement_survey/","section":"publication","summary":"Many tasks in Natural Language Processing (NLP) and Computer Vision (CV) offer evidence that humans disagree, from objective tasks such as part-of-speech tagging to more subjective tasks such as classifying an image or deciding whether a proposition follows from certain premises. While most learning in artificial intelligence (AI) still relies on the assumption that a single (gold) interpretation exists for each item, a growing body of research aims to develop learning methods that do not rely on this assumption. 
In this survey, we review the evidence for disagreements on NLP and CV tasks, focusing on tasks for which substantial datasets containing this information have been created. We discuss the most popular approaches to training models from datasets containing multiple judgments potentially in disagreement. We systematically compare these different approaches by training them with each of the available datasets, considering several ways to evaluate the resulting models. Finally, we discuss the results in depth, focusing on four key research questions, and assess how the type of evaluation and the characteristics of a dataset determine the answers to these questions. Our results suggest, first of all, that even if we abandon the assumption of a gold standard, it is still essential to reach a consensus on how to evaluate models. This is because the relative performance of the various training methods is critically affected by the chosen form of evaluation. Secondly, we observed a strong dataset effect. With substantial datasets, providing many judgments by high-quality coders for each item, training directly with soft labels achieved better results than training from aggregated or even gold labels. This result holds for both hard and soft evaluation. But when the above conditions do not hold, leveraging both gold and soft labels generally achieved the best results in the hard evaluation. All datasets and models employed in this paper are freely available as supplementary materials.","tags":["annotation","NLP","disagreement","agreement"],"title":"Learning from Disagreement: A Survey","type":"publication"},{"authors":["Dirk Hovy","Shrimai Prabhumoye"],"categories":[],"content":"","date":1628208000,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1628208000,"objectID":"2c3bf2de81d5732843e3bf56900eee2d","permalink":"https://milanlproc.github.io/publication/2021-five-sources-bias/","publishdate":"2021-05-06T01:41:26+01:00","relpermalink":"/publication/2021-five-sources-bias/","section":"publication","summary":"Recently, there has been an increased interest in demographically grounded bias in natural language processing (NLP) applications. Much of the recent work has focused on describing bias and providing an overview of bias in a larger context. Here, we provide a simple, actionable summary of this recent work. We outline five sources where bias can occur in NLP systems: (1) the data, (2) the annotation process, (3) the input representations, (4) the models, and finally (5) the research design (or how we conceptualize our research). We explore each of the bias sources in detail in this article, including examples and links to related work, as well as potential counter-measures.","tags":["Position Paper","Issues","NLP","bias"],"title":"Five sources of bias in natural language processing","type":"publication"},{"authors":["Federico Bianchi","Dirk Hovy"],"categories":[],"content":"","date":1628208000,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1628208000,"objectID":"ae8849cc748767803b8f7f12282a9f1e","permalink":"https://milanlproc.github.io/publication/2021-gap-between-understanding-adoption/","publishdate":"2021-05-06T01:41:26+01:00","relpermalink":"/publication/2021-gap-between-understanding-adoption/","section":"publication","summary":"There are some issues with current research trends in NLP that can hamper the free development of scientific research. 
We identify five of particular concern: 1) the early adoption of methods without sufficient understanding or analysis; 2) the preference for computational methods regardless of risks associated with their limitations; 3) the resulting bias in the papers we publish; 4) the impossibility of re-running some experiments due to their cost; 5) the dangers of unexplainable methods. If these issues are not addressed, we risk a loss of reproducibility, reputability, and subsequently public trust in our field. In this position paper, we outline each of these points and suggest ways forward.","tags":["Position Paper","Issues","NLP"],"title":"On the Gap between Adoption and Understanding in NLP","type":"publication"},{"authors":["Federico Bianchi","Silvia Terragni","Dirk Hovy"],"categories":[],"content":"","date":1628208000,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1628208000,"objectID":"e3b6d4f859c0f40521fc0ad7439aa5b6","permalink":"https://milanlproc.github.io/publication/2021-contextualized-improve-topic-models-coherence/","publishdate":"2021-05-06T01:41:26+01:00","relpermalink":"/publication/2021-contextualized-improve-topic-models-coherence/","section":"publication","summary":"Topic models extract groups of words from documents, whose interpretation as a topic hopefully allows for a better understanding of the data. However, the resulting word groups are often not coherent, making them harder to interpret. Recently, neural topic models have shown improvements in overall coherence. Concurrently, contextual embeddings have advanced the state of the art of neural models in general. In this paper, we combine contextualized BERT representations with neural topic models. We find that our approach produces more meaningful and coherent topics than traditional bag-of-word topic models and recent neural models. Our results indicate that future improvements in language models will translate into better topic models.","tags":["Topic Modeling","Coherence","NLP"],"title":"Pre-training is a Hot Topic: Contextualized Document Embeddings Improve Topic Coherence","type":"publication"},{"authors":["Debora Nozza"],"categories":[],"content":"","date":1627862400,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1627862400,"objectID":"21f8b8f0a896aaf439bea0d701f2ea7d","permalink":"https://milanlproc.github.io/publication/2021-zeroshot-crosslingual-hate-speech/","publishdate":"2021-05-06T14:48:20+01:00","relpermalink":"/publication/2021-zeroshot-crosslingual-hate-speech/","section":"publication","summary":"Reducing and counter-acting hate speech on Social Media is a significant concern. Most of the proposed automatic methods are conducted exclusively on English and very few consistently labeled, non-English resources have been proposed. Learning to detect hate speech on English and transferring to unseen languages seems an immediate solution. This work is the first to shed light on the limits of this zero-shot, cross-lingual transfer learning framework for hate speech detection. We use benchmark data sets in English, Italian, and Spanish to detect hate speech towards immigrants and women. Investigating post-hoc explanations of the model, we discover that non-hateful, language-specific taboo interjections are misinterpreted as signals of hate speech. 
Our findings demonstrate that zero-shot, cross-lingual models cannot be used as they are, but need to be carefully designed.","tags":["Hate Speech","BERT","NLP"],"title":"Exposing the limits of Zero-shot Cross-lingual Hate Speech Detection","type":"publication"},{"authors":["Tommaso Fornaciari","Dirk Hovy","Elin Naurin","Julia Runeson","Robert Thomson","Pankaj Adhikari"],"categories":[],"content":"","date":1627776000,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1627776000,"objectID":"c7b4b48cb786e0a2763d7f93ab0ed3f5","permalink":"https://milanlproc.github.io/publication/2021-aclfindings-mimac/","publishdate":"2021-05-06T01:41:26+01:00","relpermalink":"/publication/2021-aclfindings-mimac/","section":"publication","summary":"In an election campaign, political parties pledge to implement various projects--should they be elected. But do they follow through? To track election pledges from parties' election manifestos, we need to distinguish between pledges and general statements. In this paper, we use election manifestos of Swedish and Indian political parties to learn neural models that distinguish actual pledges from generic political positions. Since pledges might vary by election year and party, we implement a Multi-Task Learning (MTL) setup, predicting election year and manifesto's party as auxiliary tasks. Pledges can also span several sentences, so we use hierarchical models that incorporate contextual information. Lastly, we evaluate the models in a Zero-Shot Learning (ZSL) framework across countries and languages. Our results indicate that year and party have predictive power even in ZSL, while context introduces some noise. We finally discuss the linguistic features of pledges.","tags":["Election pledges","Zero-Shot Learning","NLP"],"title":"'We will Reduce Taxes' - Identifying Election Pledges with Language Models","type":"publication"},{"authors":["Debora Nozza","Federico Bianchi","Dirk Hovy"],"categories":[],"content":"","date":1622937600,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1622937600,"objectID":"3c6a04000acf7b38009176972f9bf596","permalink":"https://milanlproc.github.io/publication/2021-honest-hurtful-language-model/","publishdate":"2021-03-29T14:48:20+01:00","relpermalink":"/publication/2021-honest-hurtful-language-model/","section":"publication","summary":"Language models have revolutionized the field of NLP. However, language models capture and proliferate hurtful stereotypes, especially in text generation. Our results show that **4.3% of the time, language models complete a sentence with a hurtful word**. These cases are not random, but follow language and gender-specific patterns. We propose a score to measure hurtful sentence completions in language models (HONEST). It uses a systematic template- and lexicon-based bias evaluation methodology for six languages. Our findings suggest that these models replicate and amplify deep-seated societal stereotypes about gender roles. Sentence completions refer to sexual promiscuity 9% of the time when the target is female, and to homosexuality 4% of the time when the target is male. 
The results raise questions about the use of these models in production settings.","tags":["Hate Speech","BERT","NLP"],"title":"HONEST: Measuring Hurtful Sentence Completion in Language Models","type":"publication"},{"authors":["Dirk Hovy","Diyi Yang"],"categories":[],"content":"","date":1622937600,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1622937600,"objectID":"9ebb2f8b0e03f6f8b03537d2944bf15e","permalink":"https://milanlproc.github.io/publication/2021-importance-modeling-social-factors-language/","publishdate":"2021-05-06T14:48:20+01:00","relpermalink":"/publication/2021-importance-modeling-social-factors-language/","section":"publication","summary":"Natural language processing (NLP) applications are now more powerful and ubiquitous than ever before. With rapidly developing (neural) models and ever-more available data, current NLP models have access to more information than any human speaker during their life. Still, it would be hard to argue that NLP models have reached human-level capacity. In this position paper, we argue that the reason for the current limitations is a focus on information content while ignoring language's social factors. We show that current NLP systems systematically break down when faced with interpreting the social factors of language. This limits applications to a subset of information-related tasks and prevents NLP from reaching human-level performance. At the same time, systems that incorporate even a minimum of social factors already show remarkable improvements. We formalize a taxonomy of seven social factors based on linguistic theory and exemplify current failures and emerging successes for each of them. We suggest that the NLP community address social factors to get closer to the goal of human-like language understanding. ","tags":["social factors","computational linguistics","NLP"],"title":"The Importance of Modeling Social Factors of Language: Theory and Practice","type":"publication"},{"authors":["Federico Bianchi","Ciro Greco","Jacopo Tagliabue"],"categories":null,"content":"","date":1622592000,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1622592000,"objectID":"b46929d332157b5fc41b82ef8d8c28dd","permalink":"https://milanlproc.github.io/publication/2021-language-in-a-search-box/","publishdate":"2021-03-02T00:00:00Z","relpermalink":"/publication/2021-language-in-a-search-box/","section":"publication","summary":"We investigate grounded language learning through real-world data, by modelling a teacher-learner dynamics through the natural interactions occurring between users and search engines.","tags":["NLP","Meaning","Linguistics","BERT","Embeddings","Language Models"],"title":"Language in a (Search) Box: Grounding Language Learning in Real-World Human-Machine Interaction","type":"publication"},{"authors":["Federico Bianchi","Debora Nozza","Dirk Hovy"],"categories":[],"content":"","date":1621123200,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1621123200,"objectID":"ede87cdeecd57ecdb6e7b020668ace0c","permalink":"https://milanlproc.github.io/publication/2021-feelit-italian-sentiment-emotion/","publishdate":"2021-03-28T14:48:20+01:00","relpermalink":"/publication/2021-feelit-italian-sentiment-emotion/","section":"publication","summary":"Sentiment analysis is a common task to understand people's reactions online. Still, we often need more nuanced information: is the post negative because the user is angry or because they are sad? An abundance of approaches has been introduced for tackling both tasks. 
However, at least for Italian, they all treat only one of the tasks at a time. We introduce FEEL-IT, a novel benchmark corpus of Italian Twitter posts annotated with four basic emotions: **anger**, **fear**, **joy**, **sadness**. By collapsing them, we can also do sentiment analysis. We evaluate our corpus on benchmark datasets for both emotion and sentiment classification, obtaining competitive results. We release an [open-source Python library](https://github.com/MilaNLProc/feel-it), so researchers can use a model trained on FEEL-IT for inferring both sentiments and emotions from Italian text.","tags":["Sentiment Analysis","Emotion Detection","Italian","BERT","NLP","dataset"],"title":"FEEL-IT: Emotion and Sentiment Classification for the Italian Language","type":"publication"},{"authors":["Tommaso Fornaciari","Federico Bianchi","Debora Nozza","Dirk Hovy"],"categories":[],"content":"","date":1621123200,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1621123200,"objectID":"3bc06910533e6771816b1c186a6e9ade","permalink":"https://milanlproc.github.io/publication/2021-wassa-emotion-multitask/","publishdate":"2021-03-27T14:48:20+01:00","relpermalink":"/publication/2021-wassa-emotion-multitask/","section":"publication","summary":"The paper describes the MilaNLP team’s submission (Bocconi University, Milan) in the WASSA 2021 Shared Task on Empathy Detection and Emotion Classification. We focus on Track 2 - Emotion Classification - which consists of predicting the emotion of reactions to English news stories at the essay-level. We test different models based on multi-task and multi-input frameworks. The goal was to better exploit all the correlated information given in the data set. We find, though, that empathy as an auxiliary task in multi-task learning and demographic attributes as additional input provide worse performance with respect to single-task learning. 
While the result is competitive in terms of the competition, our results suggest that emotion and empathy are not related tasks - at least for the purpose of prediction.","tags":["Emotion Detection","BERT","NLP"],"title":"MilaNLP @ WASSA: Does BERT Feel Sad When You Cry?","type":"publication"},{"authors":null,"categories":null,"content":"We are delighted to announce that our group has four papers accepted at the ACL-IJCNLP 2021 main conference and Findings of ACL!\n Title: Pre-training is a Hot Topic: Contextualized Document Embeddings Improve Topic Coherence\nAuthors: Federico Bianchi, Silvia Terragni, Dirk Hovy\n Title: Exposing the limits of Zero-shot Cross-lingual Hate Speech Detection\nAuthors: Debora Nozza\n Title: On the Gap between Adoption and Understanding in NLP\nAuthors: Federico Bianchi, Dirk Hovy\n Title: \u0026lsquo;We will Reduce Taxes\u0026rsquo; - Identifying Election Pledges with Language Models\nAuthors: Tommaso Fornaciari, Dirk Hovy, Elin Naurin, Julia Runeson, Robert Thomson, Pankaj Adhikari\n ","date":1620259200,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1620259200,"objectID":"49e0aca1b8b895a3f2d2861b558d2e0e","permalink":"https://milanlproc.github.io/post/2021-acl-acceptance/","publishdate":"2021-05-06T00:00:00Z","relpermalink":"/post/2021-acl-acceptance/","section":"post","summary":"We are delighted to announce that our group has four papers accepted at the ACL-IJCNLP 2021 main conference and Findings of ACL!\n Title: Pre-training is a Hot Topic: Contextualized Document Embeddings Improve Topic Coherence","tags":null,"title":"Four papers accepted at ACL","type":"post"},{"authors":["Tommaso Fornaciari","Alexandra Uma","Silviu Paun","Barbara Plank","Dirk Hovy","Massimo Poesio"],"categories":[],"content":"","date":1620172800,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1620172800,"objectID":"23b4a371f95c37c5b13f4d6d76389ec6","permalink":"https://milanlproc.github.io/publication/2021-naacl-softlabels/","publishdate":"2021-05-06T01:41:26+01:00","relpermalink":"/publication/2021-naacl-softlabels/","section":"publication","summary":"Supervised learning assumes that a ground truth label exists. However, the reliability of this ground truth depends on human annotators, who often disagree. Prior work has shown that this disagreement can be helpful in training models. We propose a novel method to incorporate this disagreement as information: in addition to the standard error computation, we use soft labels (i.e., probability distributions over the annotator labels) as an auxiliary task in a multi-task neural network. We measure the divergence between the predictions and the target soft labels with several loss-functions and evaluate the models on various NLP tasks. We find that the soft-label prediction auxiliary task reduces the penalty for errors on ambiguous entities and thereby mitigates overfitting. 
It significantly improves performance across tasks beyond the standard approach and prior work.","tags":["Soft-labels","Agreement","NLP"],"title":"Beyond Black \u0026 White: Leveraging Annotator Disagreement via Soft-Label Multi-Task Learning","type":"publication"},{"authors":["Sotiris Lamprinidis","Federico Bianchi","Daniel Hardt","Dirk Hovy"],"categories":[],"content":"","date":1618531200,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1618531200,"objectID":"97f586c2e6f028d5128baf1d6a1aaa97","permalink":"https://milanlproc.github.io/publication/2021-universal-joy/","publishdate":"2021-03-27T14:48:20+01:00","relpermalink":"/publication/2021-universal-joy/","section":"publication","summary":"While emotions are universal aspects of human psychology, they are expressed differently across different languages and cultures. We introduce a new data set of over 530k anonymized public Facebook posts across 18 languages, labeled with five different emotions. Using multilingual BERT embeddings, we show that emotions can be reliably inferred both within and across languages. Zero-shot learning produces promising results for low-resource languages. Following established theories of basic emotions, we provide a detailed analysis of the possibilities and limits of cross-lingual emotion classification. We find that structural and typological similarity between languages facilitates cross-lingual learning, as well as linguistic diversity of training data. Our results suggest that there are commonalities underlying the expression of emotion in different languages. We publicly release the anonymized data for future research.","tags":["Emotion Detection","BERT","NLP","data set"],"title":"Universal Joy A Data Set and Results for Classifying Emotions Across Languages","type":"publication"},{"authors":["Tommaso Fornaciari","Federico Bianchi","Dirk Hovy","Massimo Poesio"],"categories":[],"content":"","date":1617926400,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1617926400,"objectID":"3a875d0db3d135d98a7b5d26b50805ed","permalink":"https://milanlproc.github.io/publication/2021_eacl_decour/","publishdate":"2021-04-09T01:41:26+01:00","relpermalink":"/publication/2021_eacl_decour/","section":"publication","summary":"Spotting a lie is challenging but has an enormous potential impact on security as well as private and public safety. Several NLP methods have been proposed to classify texts as truthful or deceptive. In most cases, however, the target texts’ preceding context is not considered. This is a severe limitation, as any communication takes place in context, not in a vacuum, and context can help to detect deception. We study a corpus of Italian dialogues containing deceptive statements and implement deep neural models that incorporate various linguistic contexts. We establish a new state-of-the-art identifying deception and find that not all context is equally useful to the task. Only the texts closest to the target, if from the same speaker (rather than questions by an interlocutor), boost performance. We also find that the semantic information in language models such as BERT contributes to the performance. 
However, BERT alone does not capture the implicit knowledge of deception cues: its contribution is conditional on the concurrent use of attention to learn cues from BERT’s representations.","tags":["deception detection","dataset","NLP"],"title":"BERTective: Language Models and Contextual Information for Deception Detection","type":"publication"},{"authors":["Federico Bianchi","Silvia Terragni","Dirk Hovy","Debora Nozza","Elisabetta Fersini"],"categories":null,"content":"","date":1614556800,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1614556800,"objectID":"b4e481bdc36e151c5f7c537366aa81d6","permalink":"https://milanlproc.github.io/publication/2021-crosslingual-topic-model/","publishdate":"2021-03-01T00:00:00Z","relpermalink":"/publication/2021-crosslingual-topic-model/","section":"publication","summary":"We introduce a novel topic modeling method that can make use of contextualized embeddings (e.g., BERT) to do zero-shot cross-lingual topic modeling.","tags":["NLP","Topic Modeling","BERT","Language Models"],"title":"Cross-lingual Contextualized Topic Models with Zero-shot Learning","type":"publication"},{"authors":["Dirk Hovy"],"categories":[],"content":"","date":1608076800,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1608076800,"objectID":"5cb6c78d7dfa74acb0b3dc3d0145145c","permalink":"https://milanlproc.github.io/publication/2020_nlpss/","publishdate":"2020-02-29T14:48:20+01:00","relpermalink":"/publication/2020_nlpss/","section":"publication","summary":"Text is everywhere, and it is a fantastic resource for social scientists. However, because it is so abundant, and because language is so variable, it is often difficult to extract the information we want. There is a whole subfield of AI concerned with text analysis (natural language processing). Many of the basic analysis methods developed are now readily available as Python implementations. This Element will teach you when to use which method, the mathematical background of how it works, and the Python code to implement it.","tags":["text analysis","social science","NLP","Python"],"title":"Text Analysis in Python for Social Scientists – Discovery and Exploration","type":"publication"},{"authors":["Deven Santosh Shah","H. Andrew Schwartz","Dirk Hovy"],"categories":[],"content":"","date":1593561600,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1593561600,"objectID":"15d5ca711d194a2336fb3c44dc0ea869","permalink":"https://milanlproc.github.io/publication/2020_bias/","publishdate":"2020-02-29T14:48:20+01:00","relpermalink":"/publication/2020_bias/","section":"publication","summary":"An increasing number of natural language processing papers address the effect of bias on predictions, introducing mitigation techniques at different parts of the standard NLP pipeline (data and models). However, these works have been conducted individually, without a unifying framework to organize efforts within the field. This situation leads to repetitive approaches, and focuses overly on bias symptoms/effects, rather than on their origins, which could limit the development of effective countermeasures. In this paper, we propose a unifying predictive bias framework for NLP. We summarize the NLP literature and suggest general mathematical definitions of predictive bias. We differentiate two consequences of bias: outcome disparities and error disparities, as well as four potential origins of biases: label bias, selection bias, model overamplification, and semantic bias. 
Our framework serves as an overview of predictive bias in NLP, integrating existing work into a single structure, and providing a conceptual baseline for improved frameworks.","tags":["bias","ethics","NLP"],"title":"Predictive Biases in Natural Language Processing Models: A Conceptual Framework and Overview","type":"publication"},{"authors":["Dirk Hovy","Federico Bianchi","Tommaso Fornaciari"],"categories":[],"content":"","date":1593561600,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1593561600,"objectID":"38a575da835464063eb667eb335f436f","permalink":"https://milanlproc.github.io/publication/2020_mt/","publishdate":"2020-02-29T14:48:20+01:00","relpermalink":"/publication/2020_mt/","section":"publication","summary":"The main goal of machine translation has been to convey the correct content. Stylistic considerations have been at best secondary. We show that as a consequence, the output of three commercial machine translation systems (Bing, DeepL, Google) makes demographically diverse samples from five languages “sound” older and more male than the original. Our findings suggest that translation models reflect demographic bias in the training data. This opens up interesting new research avenues in machine translation to take stylistic considerations into account.","tags":["bias","ethics","machine translation","NLP"],"title":"“You Sound Just Like Your Father” Commercial Machine Translation Systems Include Stylistic Biases","type":"publication"},{"authors":["Dirk Hovy","Afshin Rahimi","Timothy Baldwin","Julian Brooke"],"categories":[],"content":"","date":1584835200,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1584835200,"objectID":"196496b369b51faaabbe3f66aa224c90","permalink":"https://milanlproc.github.io/publication/2020_eutwitter/","publishdate":"2020-02-29T14:48:20+01:00","relpermalink":"/publication/2020_eutwitter/","section":"publication","summary":"Geotagged Twitter data allows us to investigate correlations of geographic language variation, both at an interlingual and intralingual level. Based on data-driven studies of such relationships, this paper investigates regional variation of language usage on Twitter across Europe and compares it to traditional research of regional variation. This paper presents a novel method to process large amounts of data and to capture gradual differences in language variation. Visualizing the results by deterministically translating linguistic features into color hues presents a novel view of language variation across Europe, as it is reflected on Twitter. The technique is easy to apply to large amounts of data and provides a fast visual reference that can serve as input for further qualitative studies. The general applicability is demonstrated on a number of studies both across and within national languages. 
This paper also discusses the unique challenges of large-scale analysis and visualization, and the complementary nature of traditional qualitative and data-driven quantitative methods, and argues for their possible synthesis.","tags":["computational sociolinguistics","sociolinguistics","NLP","representation learning","embeddings"],"title":"Visualizing Regional Language Variation Across Europe on Twitter","type":"publication"},{"authors":["Farzana Rashid","Tommaso Fornaciari","Dirk Hovy","Eduardo Blanco","Fernando Vega-Redondo"],"categories":[],"content":"","date":1583020800,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1583020800,"objectID":"a7aa9d88e9614128120dc2a841ac10ab","permalink":"https://milanlproc.github.io/publication/2020_helpful/","publishdate":"2020-02-29T14:48:20+01:00","relpermalink":"/publication/2020_helpful/","section":"publication","summary":"When interacting with each other, we motivate, advise, inform, show love or power towards our peers. However, the way we interact may also hold some indication on how successful we are, as people often try to help each other to achieve their goals. We study the chat interactions of thousands of aspiring entrepreneurs who discuss and develop business models. We manually annotate a set of about 5,500 chat interactions with four dimensions of interaction styles (motivation, cooperation, equality, advice). We find that these styles can be reliably predicted, and that the communication styles can be used to predict a number of indices of business success. Our findings indicate that successful communicators are also successful in other domains.","tags":["conversation","style","communication","NLP"],"title":"Helpful or Hierarchical? Predicting the Communicative Strategies of Chat Participants, and their Impact on Success","type":"publication"},{"authors":["Debora Nozza","Federico Bianchi","Dirk Hovy"],"categories":[],"content":"","date":1583020800,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1583020800,"objectID":"39527c2d2966939ebd36a97e84e5382f","permalink":"https://milanlproc.github.io/publication/2020-bertlang-language-specific-bert/","publishdate":"2020-02-29T14:48:20+01:00","relpermalink":"/publication/2020-bertlang-language-specific-bert/","section":"publication","summary":"Recently, Natural Language Processing (NLP) has witnessed an impressive progress in many areas, due to the advent of novel, pretrained contextual representation models. In particular, Devlin et al. (2019) proposed a model, called BERT (Bidirectional Encoder Representations from Transformers), which enables researchers to obtain state-of-the art performance on numerous NLP tasks by fine-tuning the representations on their data set and task, without the need for developing and training highly-specific architectures. The authors also released multilingual BERT (mBERT), a model trained on a corpus of 104 languages, which can serve as a universal language model. This model obtained impressive results on a zero-shot cross-lingual natural inference task. Driven by the potential of BERT models, the NLP community has started to investigate and generate an abundant number of BERT models that are trained on a particular language, and tested on a specific data domain and task. This allows us to evaluate the true potential of mBERT as a universal language model, by comparing it to the performance of these more specific models. 
This paper presents the current state of the art in language-specific BERT models, providing an overall picture with respect to different dimensions (i.e. architectures, data domains, and tasks). Our aim is to provide an immediate and straightforward overview of the commonalities and differences between Language-Specific BERT models and mBERT. We also provide an interactive and constantly updated website that can be used to explore the information we have collected, at [https://bertlang.unibocconi.it](https://bertlang.unibocconi.it/).","tags":["multilingual","BERT","representation learning","NLP"],"title":"What the [MASK]? Making Sense of Language-Specific BERT Models","type":"publication"},{"authors":null,"categories":["demographic"],"content":"Dirk Hovy, scientific director of DMI and Professor of computer science, has won an ERC starting grant of 1.5 million euros. His project INTEGRATOR, funded under grant agreement 949944, introduces demographic factors into language processing systems, which will improve algorithmic performance, avoid racism, sexism, and ageism, and open up new applications. What if I wrote that “winning an ERC Grant, Dirk Hovy got a sick result”? Those familiar with the use of “sick” as a synonym for “great” or “awesome” among teenagers would think that Bocconi Knowledge hired a very young writer (or someone posing as such). The rest would think I went crazy. Current artificial intelligence-based language systems wouldn’t have a clue. “Natural language processing (NLP) technologies,” Prof. Hovy says, “fail to account for demographics both in understanding language and in generating it. And this failure prevents us from reaching human-like performance. It limits possible future applications and it introduces systematic bias against underrepresented demographic groups”.\n🗞️🗞️ Related articles featured in Corriere Innovazione and Bocconi News.\n ","date":1580083200,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1580083200,"objectID":"d6b57eefc5eca031cd0bb3edb943a34f","permalink":"https://milanlproc.github.io/project/integrator/","publishdate":"2020-01-27T00:00:00Z","relpermalink":"/project/integrator/","section":"project","summary":"Incorporating Demographic Factors into Natural Language Processing Models","tags":["demographic","NLP"],"title":"INTEGRATOR","type":"project"},{"authors":null,"categories":["computational social science","political science","nlp"],"content":"In this interdisciplinary project, Dirk Hovy and Tommaso Fornaciari team up with an international team of political scientists (led by the University of Gothenburg) to develop mixed methods for analyzing political parties’ promises to voters during election campaigns. For democracy to function effectively, political parties must offer clear choices to voters during election campaigns. However, as parties’ communication with voters has become increasingly fragmented and targeted, it is much harder for citizens to keep track of what parties are promising. This threatens the quality of democratic representation. It also challenges established research methods for studying parties’ campaign promises. This project will develop new methods for studying parties’ promises in modern election campaigns. The project will integrate existing qualitative methods in political science and develop new research tools based on NLP. These AI-powered tools will enable researchers to examine parties’ campaign promises in large amounts of text and speech. 
The resulting research will be of significant benefit to citizens, who will receive greater clarity on the choices that parties are offering. These existing and new methods are highly relevant to research on text and speech in a wide range of social science fields. Until now, progress in this field has been stifled by limited dialogue among the proponents of different qualitative and quantitative methods. The project includes established experts on parties’ campaign promises, new media, qualitative and quantitative methods for analyzing political texts, and machine learning and natural language processing. The project is funded by the Swedish Riksbankens Jubileumsfond for 12M SEK.\n","date":1580083200,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1580083200,"objectID":"cc64e6761fb9bc89dcd828508f7b467b","permalink":"https://milanlproc.github.io/project/mimac/","publishdate":"2020-01-27T00:00:00Z","relpermalink":"/project/mimac/","section":"project","summary":"Mixed methods for analyzing political parties’ promises to voters during election campaigns","tags":["computational social science","political science","nlp"],"title":"MiMac","type":"project"},{"authors":null,"categories":null,"content":"PI: Debora Nozza\nco-PI: Dirk Hovy and Nicoletta Balbo\n The interdisciplinary MONICA project will create a digital barometer of Italians’ attitudes towards the government measures implemented in response to COVID-19. The pandemic has plunged millions of vulnerable people into abject poverty. The government created financial measures to improve the economic situation and social inclusion. However, it is unclear whether these measures reach those who need them most. To find out, we will uncover the public perception of these measures and provide concrete metrics for three related dimensions: 1) coverage of the potential beneficiaries, 2) attitudes of the Italian population stratified by different demographic factors, and 3) accessibility of the information. MONICA will provide citizens with a tool to automatically rank and simplify articles about requirements and steps to access these initiatives. MONICA will enable policymakers to understand which segments of the vulnerable population are not accessing these initiatives and why.\nSee this project featured in the news. 🗞️🗞️\n 🇮🇹 Italian version\nPer fronteggiare le ripercussioni economiche della pandemia, il governo italiano ha adottato diverse misure finanziarie volte ad arginare gli effetti della crisi, migliorando l’inclusione sociale delle persone che negli ultimi mesi si sono trovate in difficoltà. Tuttavia, tali misure hanno inaspettatamente riscosso una partecipazione inferiore alle aspettative e la loro efficacia, soprattutto in termini di raggiungimento dei soggetti più bisognosi, risulta difficilmente verificabile. MONICA, analizzando una grande mole di dati tramite tecniche di data science, fornirà delle metriche atte a valutare tali misure in termini di: 1) capacità di raggiungimento dei soggetti bisognosi, 2) sentiment dell’opinione pubblica, differenziata in base a fattori demografici, 3) accessibilità delle informazioni. 
MONICA, inoltre, automaticamente cercherà e creerà versioni semplificate di articoli e procedure inerenti alle misure, garantendo l’accessibilità di tali informazioni a tutti i cittadini.\n","date":1580083200,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1580083200,"objectID":"0c9656761b6df1422a95e6f71eb8ae37","permalink":"https://milanlproc.github.io/project/monitoring_italian_measures_response_covid19/","publishdate":"2020-01-27T00:00:00Z","relpermalink":"/project/monitoring_italian_measures_response_covid19/","section":"project","summary":"MONItoring Coverage, Attitudes and Accessibility of Italian measures in response to COVID-19","tags":["computational social sciences","nlp"],"title":"MONICA","type":"project"},{"authors":null,"categories":["political science","nlp","social media"],"content":"Echo chambers and online abuse are two significant problems affecting the health of conversations on social media. This interdisciplinary, multi-institutional project (led by George Washington University) helps Twitter tackle these issues by developing metrics and algorithms to measure various uncivil behaviors. Given the concerns about growing polarization and the spread of misinformation, our first two metrics, mutual recognition and diversity of perspectives, will help Twitter diagnose issues that arise when users isolate themselves from those who hold differing opinions. Mutual recognition measures whether and to what extent people on opposing sides of an issue acknowledge and engage with rival claims. When recognition occurs, a public sphere is established. When there is no recognition, echo chambers result. Diversity of perspectives measures the range of claims made on the platform, how likely users are to encounter (as opposed to engaging with) divergent and unfamiliar claims, and how polarized the debate is.\nOur second two metrics, incivility, and intolerance, will help Twitter identify and address abuse and targeted harassment. Incivility measures the presence of anti-normative intensity in conversation, including the use of profanity and vulgarity. However, recognizing that such anti-normative communication sometimes serves justifiable\u0026ndash;and in some cases, even beneficial\u0026ndash;ends, we distinguish this concept from intolerance. Targeted attacks on individuals or groups, particularly when carried out based on gender, sexuality, race, ethnicity, religion, or ability, threaten the fundamental democratic principles of equality and freedom.\nTo classify these measures at scale, we draw upon existing work in various computational fields, notably natural language processing and network analysis, but take this work further in addressing the metrics outlined here. Moreover, beyond merely detecting and measuring mutual recognition, diversity of perspectives, incivility, and intolerance, we propose to study the effects these four phenomena have on users. 
In doing so, we offer a theoretically and empirically driven approach that will help Twitter diagnose the conversation\u0026rsquo;s relative health on its platform.\n","date":1580083200,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1580083200,"objectID":"b687a82907bfa3278ec0305ce0470f43","permalink":"https://milanlproc.github.io/project/twitterhealth/","publishdate":"2020-01-27T00:00:00Z","relpermalink":"/project/twitterhealth/","section":"project","summary":"Devising Metrics for Assessing Echo Chambers, Incivility, and Intolerance on Twitter","tags":["social media","political science","nlp"],"title":"Twitter Healthy Conversations","type":"project"},{"authors":["Alexandra Uma","Tommaso Fornaciari","Dirk Hovy","Silviu Paun","Barbara Plank","Massimo Poesio"],"categories":[],"content":"","date":1577836800,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1577836800,"objectID":"a5fe34b8d82f43aca4cded8dbebd251c","permalink":"https://milanlproc.github.io/publication/2020_aaai_softlabels/","publishdate":"2020-01-01T00:00:00Z","relpermalink":"/publication/2020_aaai_softlabels/","section":"publication","summary":"Recently, Peterson et al. provided evidence of the benefits of using probabilistic soft labels generated from crowd annotations for training a computer vision model, showing that using such labels maximizes performance of the models over unseen data. In this paper, we generalize these results by showing that training with soft labels is an effective method for using crowd annotations in several other AI tasks besides the one studied by Peterson et al., and also when their performance is compared with that of state-of-the-art methods for learning from crowdsourced data. ","tags":["annotation","disagreement","loss function","NLP"],"title":"A Case for Soft Loss Functions","type":"publication"},{"authors":["Tommaso Fornaciari","Letitia Cagnina","Paolo Rosso","Massimo Poesio"],"categories":[],"content":"","date":1577836800,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1577836800,"objectID":"4d9157d4612c24b48becd4300cc37f9c","permalink":"https://milanlproc.github.io/publication/2020_lre/","publishdate":"2020-01-01T00:00:00Z","relpermalink":"/publication/2020_lre/","section":"publication","summary":"Identifying deceptive online reviews is a challenging tasks for Natural Language Processing (NLP). Collecting corpora for the task is difficult, because normally it is not possible to know whether reviews are genuine. A common workaround involves collecting (supposedly) truthful reviews online and adding them to a set of deceptive reviews obtained through crowdsourcing services. Models trained this way are generally successful at discriminating between ‘genuine’ online reviews and the crowdsourced deceptive reviews. It has been argued that the deceptive reviews obtained via crowdsourcing are very different from real fake reviews, but the claim has never been properly tested. In this paper, we compare (false) crowdsourced reviews with a set of ‘real’ fake reviews published on line. We evaluate their degree of similarity and their usefulness in training models for the detection of untrustworthy reviews. We find that the deceptive reviews collected via crowdsourcing are significantly different from the fake reviews published online. 
In the case of the artificially produced deceptive texts, it turns out that their domain similarity with the targets affects the models’ performance, much more than their","tags":["dataset","deception detection","NLP"],"title":"Fake opinion detection: how similar are crowdsourced datasets to real data?","type":"publication"},{"authors":["Tommaso Fornaciari","Dirk Hovy"],"categories":[],"content":"","date":1572739200,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1572739200,"objectID":"e6f7d976c82c2e5990707d0efc2d07b8","permalink":"https://milanlproc.github.io/publication/2019_m2v/","publishdate":"2019-10-31T01:35:54+01:00","relpermalink":"/publication/2019_m2v/","section":"publication","summary":"Prior research has shown that geolocation can be substantially improved by including user network information. While effective, it suffers from the curse of dimensionality, since networks are usually represented as sparse adjacency matrices of connections, which grow exponentially with the number of users. In order to incorporate this information, we therefore need to limit the network size, in turn limiting performance and risking sample bias. In this paper, we address these limitations by instead using dense network representations. We explore two methods to learn continuous node representations from either 1) the network structure with node2vec (Grover and Leskovec, 2016), or 2) textual user mentions via doc2vec (Le and Mikolov, 2014). We combine both methods with input from social media posts in an attention-based convolutional neural network and evaluate the contribution of each component on geolocation performance. Our method enables us to incorporate arbitrarily large networks in a fixed-length vector, without limiting the network size. Our models achieve competitive results with similar state-of-the-art methods, but with much fewer model parameters, while being applicable to networks of virtually any size. ","tags":["geolocation","representation learning","NLP"],"title":"Dense Node Representation for Geolocation","type":"publication"},{"authors":["Tommaso Fornaciari","Dirk Hovy"],"categories":[],"content":"","date":1572739200,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1572739200,"objectID":"c99ae1a9dbc181af0fec07ee706ff45e","permalink":"https://milanlproc.github.io/publication/2019_geo_mtl/","publishdate":"2019-10-31T01:25:36+01:00","relpermalink":"/publication/2019_geo_mtl/","section":"publication","summary":"Geolocation, predicting the location of a post based on text and other information, has a huge potential for several social media applications. Typically, the problem is modeled as either multi-class classification or regression. In the first case, the classes are geographic areas previously identified; in the second, the models directly predict geographic coordinates. The former requires discretization of the coordinates, but yields better performance. The latter is potentially more precise and true to the nature of the problem, but often results in worse performance. We propose to combine the two approaches in an attention-based multitask convolutional neural network that jointly predicts both discrete locations and continuous geographic coordinates. We evaluate the multi-task (MTL) model against single-task models and prior work. 
We find that MTL significantly improves performance, reporting large gains on one data set, but also note that the correlation between labels and coordinates has a marked impact on the effectiveness of including a regression task.","tags":["geolocation","multitask learning","NLP"],"title":"Geolocation with Attention-Based Multitask Learning Models","type":"publication"},{"authors":["Hanh Nguyen","Dirk Hovy"],"categories":[],"content":"","date":1572739200,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1572739200,"objectID":"da268d55d2d3daa3188db963df72f794","permalink":"https://milanlproc.github.io/publication/2019_siri/","publishdate":"2019-10-31T01:35:54+01:00","relpermalink":"/publication/2019_siri/","section":"publication","summary":"User reviews provide a significant source of information for companies to understand their market and audience. In order to discover broad trends in this source, researchers have typically used topic models such as Latent Dirichlet Allocation (LDA). However, while there are metrics to choose the “best” number of topics, it is not clear whether the resulting topics can also provide in-depth, actionable product analysis. Our paper examines this issue by analyzing user reviews from the Best Buy US website for smart speakers. Using coherence scores to choose topics, we test whether the results help us to understand user interests and concerns. We find that while coherence scores are a good starting point to identify a number of topics, it still requires manual adaptation based on domain knowledge to provide market insights. We show that the resulting dimensions capture brand performance and differences, and differentiate the market into two distinct groups with different properties.","tags":["NLP","smart speakers","topic modeling"],"title":"Hey Siri. Ok Google. Alexa: A topic modeling of user reviews for smart speakers","type":"publication"},{"authors":["Tommaso Fornaciari","Dirk Hovy"],"categories":[],"content":"","date":1572739200,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1572739200,"objectID":"f03e8b5dae24f99c3b8d42cc770514c9","permalink":"https://milanlproc.github.io/publication/2019_p2c/","publishdate":"2019-10-31T01:38:23+01:00","relpermalink":"/publication/2019_p2c/","section":"publication","summary":" Geolocating social media posts relies on the assumption that language carries sufficient geographic information. However, locations are usually given as continuous latitude/longitude tuples, so we first need to define discrete geographic regions that can serve as labels. Most studies use some form of clustering to discretize the continuous coordinates (Han et al., 2016). However, the resulting regions do not always correspond to existing linguistic areas. Consequently, accuracy at 100 miles tends to be good, but degrades for finer-grained distinctions, when different linguistic regions get lumped together. We describe a new algorithm, Point-to-City (P2C), an iterative k-d tree-based method for clustering geographic coordinates and associating them with towns. We create three sets of labels at different levels of granularity, and compare performance of a state-of-the-art geolocation model trained and tested with P2C labels to one with regular k-d tree labels. Even though P2C results in substantially more labels than the baseline, model accuracy increases significantly over using traditional labels at the fine-grained level, while staying comparable at 100 miles. 
The results suggest that identifying meaningful linguistic areas is crucial for improving geolocation at a fine-grained level.","tags":["geolocation","NLP","clustering"],"title":"Identifying Linguistic Areas for Geolocation","type":"publication"},{"authors":["Aparna Garimella","Carmen Banea","Dirk Hovy","Rada Mihalcea"],"categories":[],"content":"","date":1562112000,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1562112000,"objectID":"5dec16fd019afc36d15e76cab7ab6ea4","permalink":"https://milanlproc.github.io/publication/2019-gender-bias-part-of-speech-tagging-dependency-parsing/","publishdate":"2019-10-31T01:35:54+01:00","relpermalink":"/publication/2019-gender-bias-part-of-speech-tagging-dependency-parsing/","section":"publication","summary":"Several linguistic studies have shown the prevalence of various lexical and grammatical patterns in texts authored by a person of a particular gender, but models for part-of-speech tagging and dependency parsing have still not adapted to account for these differences. To address this, we annotate the Wall Street Journal part of the Penn Treebank with the gender information of the articles’ authors, and build taggers and parsers trained on this data that show performance differences in text written by men and women. Further analyses reveal numerous part-of-speech tags and syntactic relations whose prediction performances benefit from the prevalence of a specific gender in the training data. The results underscore the importance of accounting for gendered differences in syntactic tasks, and outline future avenues for developing more accurate taggers and parsers. We release our data to the research community.","tags":["pos tagging","parsing","NLP","bias"],"title":"Women’s Syntactic Resilience and Men’s Grammatical Luck: Gender-Bias in Part-of-Speech Tagging and Dependency Parsing","type":"publication"},{"authors":null,"categories":null,"content":"","date":1546300800,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1546300800,"objectID":"6d99026b9e19e4fa43d5aadf147c7176","permalink":"https://milanlproc.github.io/contact/","publishdate":"2019-01-01T00:00:00Z","relpermalink":"/contact/","section":"","summary":"A little more about me and how to get in touch","tags":null,"title":"About / Contact","type":"widget_page"},{"authors":null,"categories":null,"content":"","date":1546300800,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1546300800,"objectID":"be566fdb6f0fa08cfea50d77a89a6b5a","permalink":"https://milanlproc.github.io/data/","publishdate":"2019-01-01T00:00:00Z","relpermalink":"/data/","section":"","summary":"","tags":null,"title":"How to participate","type":"widget_page"},{"authors":null,"categories":null,"content":"","date":1546300800,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1546300800,"objectID":"8795eba9bd19b87b0616d17da3c16590","permalink":"https://milanlproc.github.io/join/","publishdate":"2019-01-01T00:00:00Z","relpermalink":"/join/","section":"","summary":"Open Positions @ MilaNLP lab Bocconi University","tags":null,"title":"Join","type":"widget_page"},{"authors":["Fernando Vega-Redondo","Paolo Pin","Diego Ubfal","Cristiana Benedetti-Fasil","Charles Brummitt","Gaia Rubera","Dirk Hovy","Tommaso 
Fornaciari"],"categories":[],"content":"","date":1546300800,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1546300800,"objectID":"12bbc46c9ef5b1283e70d7fd9cdc0c89","permalink":"https://milanlproc.github.io/publication/2019_adansonia/","publishdate":"2019-01-01T00:00:00Z","relpermalink":"/publication/2019_adansonia/","section":"publication","summary":"Can large-scale peer interaction foster entrepreneurship and innovation? We conducted an RCT involving almost 5,000 entrepreneurs from 49 African countries. All were enrolled in an online business course, and the treatment involved random assignment to either face-to-face or virtual (Internet-mediated) interaction. We find positive treatment effects on both the submission of business plans and their quality, provided interaction displays some intermediate diversity. Network effects are also significant on both outcomes, although diversity plays a different role for each. This shows that effective peer interaction can be feasibly implemented quite broadly but must also be designed carefully, in view of the pursued objectives.","tags":["social science","economics","text analysis"],"title":"Peer networks and entrepreneurship: A Pan-African RCT","type":"publication"},{"authors":null,"categories":null,"content":"","date":1546300800,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1546300800,"objectID":"6087c0ef875554f4409ac52928d79279","permalink":"https://milanlproc.github.io/projects/","publishdate":"2019-01-01T00:00:00Z","relpermalink":"/projects/","section":"","summary":"See some of the projects I have worked on","tags":null,"title":"Projects","type":"widget_page"},{"authors":["Dirk Hovy","Tommaso Fornaciari"],"categories":[],"content":"","date":1541203200,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1541203200,"objectID":"ff3dfca9f7c85f2d365ce66a6da7f88f","permalink":"https://milanlproc.github.io/publication/2018_emnlp_retro/","publishdate":"2019-10-31T01:41:26+01:00","relpermalink":"/publication/2018_emnlp_retro/","section":"publication","summary":"","tags":[],"title":"Increasing In-Class Similarity by Retrofitting Embeddings with Demographic Information","type":"publication"},{"authors":["Dirk Hovy","Christoph Purschke"],"categories":[],"content":"","date":1540166400,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1540166400,"objectID":"66a0f211e4d88095c64e9e6679ad64a7","permalink":"https://milanlproc.github.io/publication/2018-capturing-regional-variation-distributed-representations-geographic-retrofitting/","publishdate":"2020-02-29T14:48:20+01:00","relpermalink":"/publication/2018-capturing-regional-variation-distributed-representations-geographic-retrofitting/","section":"publication","summary":"Dialects are one of the main drivers of language variation, a major challenge for natural language processing tools. In most languages, dialects exist along a continuum, and are commonly discretized by combining the extent of several preselected linguistic variables. However, the selection of these variables is theory-driven and itself insensitive to change. We use Doc2Vec on a corpus of 16.8M anonymous online posts in the German-speaking area to learn continuous document representations of cities. These representations capture continuous regional linguistic distinctions, and can serve as input to downstream NLP tasks sensitive to regional variation. 
By incorporating geographic information via retrofitting and agglomerative clustering with structure, we recover dialect areas at various levels of granularity. Evaluating these clusters against an existing dialect map, we achieve a match of up to 0.77 V-score (harmonic mean of cluster completeness and homogeneity). Our results show that representation learning with retrofitting offers a robust general method to automatically expose dialectal differences and regional variation at a finer granularity than was previously possible.","tags":["computational sociolinguistics","sociolinguistics","NLP","representation learning","embeddings","retrofitting"],"title":"Capturing Regional Variation with Distributed Place Representations and Geographic Retrofitting","type":"publication"},{"authors":["Silviu Paun","Bob Carpenter","Jon Chamberlain","Dirk Hovy","Udo Kruschwitz","Massimo Poesio"],"categories":[],"content":"","date":1540166400,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1540166400,"objectID":"ac97468b9625b50fc42dc47858c6d8d2","permalink":"https://milanlproc.github.io/publication/2018-comparing-bayesian-models-annotation/","publishdate":"2020-02-29T14:48:20+01:00","relpermalink":"/publication/2018-comparing-bayesian-models-annotation/","section":"publication","summary":"The analysis of crowdsourced annotations in natural language processing is concerned with identifying (1) gold standard labels, (2) annotator accuracies and biases, and (3) item difficulties and error patterns. Traditionally, majority voting was used for 1, and coefficients of agreement for 2 and 3. Lately, model-based analysis of corpus annotations have proven better at all three tasks. But there has been relatively little work comparing them on the same datasets. This paper aims to fill this gap by analyzing six models of annotation, covering different approaches to annotator ability, item difficulty, and parameter pooling (tying) across annotators and items. We evaluate these models along four aspects: comparison to gold labels, predictive accuracy for new annotations, annotator characterization, and item difficulty, using four datasets with varying degrees of noise in the form of random (spammy) annotators. We conclude with guidelines for model selection, application, and implementation.","tags":["NLP","annotation","generative models","disagreement"],"title":"Comparing Bayesian Models of Annotation","type":"publication"},{"authors":["Sotiris Lamprinidis","Daniel Hardt","Dirk Hovy"],"categories":[],"content":"","date":1540166400,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1540166400,"objectID":"376dee532427df70e4cf017de36dccec","permalink":"https://milanlproc.github.io/publication/2018-predicting-news-headline-popularity-syntactic-semantic-knowledge-multitask-learning/","publishdate":"2020-02-29T14:48:20+01:00","relpermalink":"/publication/2018-predicting-news-headline-popularity-syntactic-semantic-knowledge-multitask-learning/","section":"publication","summary":"Newspapers need to attract readers with headlines, anticipating their readers’ preferences. These preferences rely on topical, structural, and lexical factors. We model each of these factors in a multi-task GRU network to predict headline popularity. We find that pre-trained word embeddings provide significant improvements over untrained embeddings, as do the combination of two auxiliary tasks, news-section prediction and part-of-speech tagging. 
However, we also find that performance is very similar to that of a simple Logistic Regression model over character n-grams. Feature analysis reveals structural patterns of headline popularity, including the use of forward-looking deictic expressions and second person pronouns.","tags":["NLP","multitask learning","text classification"],"title":"Predicting News Headline Popularity with Syntactic and Semantic Knowledge Using Multi-Task Learning","type":"publication"},{"authors":null,"categories":null,"content":" \u0026ldquo;Have a negroni. Have two. Be open to a world where you may not understand or agree with the person next to you, but have a drink with them anyways.\u0026quot; \u0026ndash;Anthony Bourdain\n The Coding Aperitivo is our take on a weekly seminar series. We end the working week and wind down with some relaxed academic chatter, a drink and some snacks.\nFormat We usually host external speakers on Fridays at 4pm Milan time. Talks are mostly virtual, and sometimes in person. We encourage our guests to try different formats with us such as guided discussions, hands-on activities, debates or just a nice academic chat. Got some research ideas and want a sounding board? We are very happy to discuss ongoing or upcoming research.\nPast Guests 2021 Emily Sheng: \u0026ldquo;Biases in NLG and Dialogue Systems\u0026rdquo; Nedjma Ousidhoum: \u0026ldquo;Expectations vs. Reality when Working on Toxic Content Detection in NLP\u0026rdquo; Nils Reimers: \u0026ldquo;Training State-of-the-Art Text Embedding \u0026amp; Neural Search Models\u0026rdquo; Maarten Sap: \u0026ldquo;Detecting and Rewriting Socially Biased Language\u0026rdquo; Sunipa Dev: \u0026ldquo;Towards Interpretable, Fair and Socially-Aware Language Representations\u0026rdquo; Alba Curry: \u0026ldquo;Philosophy of Emotion and Sentiment Detection\u0026rdquo; Rob van der Goot: \u0026ldquo;Multi-lingual and Multi-task learning: from Dataset Creation to Modeling\u0026rdquo; Su Lin Blodgett: \u0026ldquo;Social and Ethical Implications of NLP Technologies\u0026rdquo; Gabriele Sarti: \u0026ldquo;Interpreting Neural Language Models for Linguistic Complexity Assessment\u0026rdquo; Paul Röttger: \u0026ldquo;Two Contrasting Data Annotation Paradigms for Subjective NLP Tasks\u0026rdquo; Chia-Chien Hung: \u0026ldquo;Multi-domain and Multilingual Dialog\u0026rdquo; Anna Wegmann: \u0026ldquo;Does It Capture STEL? A Modular, Similarity-based Linguistic Style Evaluation Framework\u0026rdquo; Abhilasha Ravichander: \u0026ldquo;Probing the Probing Paradigm: Does Probing Accuracy Entail Task Relevance?\u0026rdquo; Samson Tan (AWS AI Research \u0026amp; Education): \u0026ldquo;Towards Sociolinguistically-Inclusive NLP: An Adversarial Approach\u0026rdquo; 2022 Christine de Kock: \u0026ldquo;I Beg to Differ: A study of constructive disagreement in online conversations\u0026rdquo; Eliana Pastor: \u0026ldquo;Pattern-based algorithms for Explainable AI\u0026rdquo; Dave Howcroft: \u0026ldquo;Low-Resource NLG\u0026rdquo; Zeerak Talat: \u0026ldquo;Ethics and Bias\u0026rdquo; Christopher Klamm: \u0026ldquo;Defining and Measuring Polarisation Across Disciplines\u0026rdquo; Swabha Swayamdipta: \u0026ldquo;Annotation Challenges in NLP\u0026rdquo; Carlo Schwarz: \u0026ldquo;How Polarized are Citizens? 
Measuring Ideology from the Ground-Up\u0026rdquo; Lorenzo Bertolini: \u0026ldquo;Testing Language Models on Compositionality\u0026rdquo; Alessandro Raganato Mark Dingemanse and Andreas Liesenfeld: \u0026ldquo;Language Diversity in Conversational AI Research\u0026rdquo; Agostina Calabrese: \u0026ldquo;If Data Patterns is the Answer, What was the Question?\u0026rdquo; Aida Mostafazadeh: \u0026ldquo;Incorporating annotators' psychological profiles into modeling language classification tasks\u0026rdquo; Myrthe Reuver: \u0026ldquo;Viewpoint diversity in news recommendation: Theories, Models, and Tasks to support democracy\u0026rdquo; Tommaso Caselli: \u0026ldquo;Language Resources to Monitor Abusive Language in Dutch\u0026rdquo; Valentin Hoffman: \u0026ldquo;Semantic Diffusion: Deep Learning Sense of network\u0026rdquo; Beatrice Savoldi: \u0026ldquo;Designing a course for Ethics in NLP\u0026rdquo; Hannah Rose Kirk: \u0026ldquo;Bias harms and mitigation\u0026rdquo; Juan Manuel Perez: \u0026ldquo;Assessing the impact of contextual information in hate speech detection\u0026rdquo; Daryna Dementieva: \u0026ldquo;Text detoxification\u0026rdquo; Fabio Tollon: \u0026ldquo;From designed properties to possibilities for action\u0026rdquo; Ryan Cotterell: \u0026ldquo;Some Thoughts on Compositionality\u0026rdquo; William Agnew: \u0026ldquo;Values, Ethics and NLP\u0026rdquo; Rami Aly: \u0026ldquo;Automatic fact checking\u0026rdquo; Indira Sen: \u0026ldquo;Measuring social constructs with NLP: Two case studies of abusive language and workplace depression\u0026rdquo; 2023 Maurice Jakesch: \u0026ldquo;Assessing the Effects and Risks of Large Language Models in AI-Mediated Communication\u0026rdquo; Marco del Tredici: \u0026ldquo;Current trends in NLP\u0026rdquo; Fatma Elsafoury: \u0026ldquo;Hate Speech and Toxicity\u0026rdquo; Mor Geva: \u0026ldquo;Annotation bias sources and prevention\u0026rdquo; Emanuele Bugliarello: \u0026ldquo;Language modelling as pixels\u0026rdquo; Tess Buckley: \u0026ldquo;Computational creativity and the ethics of AI-generated music\u0026rdquo; Marina Rizzi: \u0026ldquo;Self-regulation and the Evolution of Content: A Cross-Platform Analysis\u0026rdquo; Giovanni Cassani, Marco Bragoni, and Paul Schreiber: \u0026ldquo;Multimodal Representations for Words that Don’t Exist Yet\u0026rdquo; Laura Vasquez-Rodriguez: \u0026ldquo;Introduction to text simplification with NLP\u0026rdquo; Raj Ammanabrolu: \u0026ldquo;Interactive Language Learning\u0026rdquo; Suchin Gururangan: \u0026ldquo;All things language models, open-sourcing and regulation\u0026rdquo; Giada Pistilli: \u0026ldquo;Ethics in NLP\u0026rdquo; Edoardo Ponti: \u0026ldquo;Modular Deep Learning\u0026rdquo; Julie-Anne Meaney: \u0026ldquo;Demographically-aware Computational Humour\u0026rdquo; Giorgio Franceschelli: \u0026ldquo;Creativity and machine learning\u0026rdquo; Aubrie Amstutz: \u0026ldquo;Managing toxicity and hate speech in the private sector\u0026rdquo; Tom McCoy: \u0026ldquo;Embers of Autoregression: Understanding Large Language Models Through the Problem They are Trained to Solve\u0026rdquo; Camilo Carvajal Reyes: \u0026ldquo;EthicApp: analysing and understanding how people debate ethical issues\u0026rdquo; Tanvi Dinkar: \u0026ldquo;Safety and robustness in conversational AI\u0026rdquo; 2024 Emanuele La Malfa: \u0026ldquo;Code Simulation Challenges for Large Language Models\u0026rdquo; Enrico Liscio: \u0026ldquo;Context-Specific Value Inference via Hybrid Intelligence\u0026rdquo; Eve Fleisig: \u0026ldquo;When the Majority is 
Wrong: Modeling Annotator Disagreement for Language Tasks\u0026rdquo; Vishakh Padmakumar: \u0026ldquo;Does Writing with Language Models Reduce Content Diversity?\u0026rdquo; Enrico Bertino: \u0026ldquo;AI at a Milanese Chatbot Start-Up\u0026rdquo; Fangru Lin: \u0026ldquo;Graph-enhanced Large Language Models in Asynchronous Plan Reasoning\u0026rdquo; Xuhui Zhou: \u0026ldquo;Towards Socially Aware and Interactional NLP Systems\u0026rdquo; Minje Choi: \u0026ldquo;Towards Evaluating and Measuring the Social Capabilities of Large Language Models\u0026rdquo; Sachin Kumar: \u0026ldquo;Adapting Language Models to Improve Reliability: Experiments with Refusals and Diverse Preference Modeling\u0026rdquo; Nino Scherrer: \u0026ldquo;Evaluating (Moral) Beliefs Encoded in LLMs\u0026rdquo; Mary Sanford: \u0026ldquo;Political Discourse on Climate Change in EU Party Manifestos: A Computational Text Analysis Approach\u0026rdquo; Anna Rogers, Faeze Brahman and Elman Mansimov: Workshop on LLMs in Research and Industry Eugenia Stamboliev: \u0026ldquo;Can we Explain AI? On the Pitfalls of XAI\u0026rdquo; Maria Antoniak: \u0026ldquo;Computational Approaches to Narratives\u0026rdquo; Lucy Li: \u0026ldquo;AboutMe: Using Self-Descriptions in Webpages to Document the Effects of English Pretraining Data Filters\u0026rdquo; Jasmijn Bastings: \u0026ldquo;Bits, Bats \u0026amp; Bots: Deconstructing Gender in Language Technology\u0026rdquo; Fatma Elsafoury: \u0026ldquo;On the Sources of Bias in NLP Models: Origin, Impact, Mitigation, and the Ways Forward\u0026rdquo; Caleb Ziems: \u0026ldquo;How to Use Large Language Models for Computational Social Science\u0026rdquo; Rose Wang: \u0026ldquo;Scaling Expertise via Language Models with Applications to Education\u0026rdquo; Amin al Hazwani: \u0026ldquo;Collaborating to Create a Language-Independent Encyclopedia\u0026rdquo; Luna De Bruyne: \u0026ldquo;Emotions without Borders: Challenges in Multilingual Emotion Detection\u0026rdquo; Dirk\u0026rsquo;s Drinks When in Milan, drink as the Milanese. There are many excellent drink options, but they all start with a bitter and a red vermouth. The big names here are Campari and Martini, but there are plenty of other options worth exploring. Though the official recipes call for equal parts bitter and red vermouth, here we opt for a punchier taste, heavier on the bitter.\nBase: 3 parts bitter 2 parts red vermouth Options: You can now take this into several directions, by adding different mixers:\n 3 parts sparkling water (or fill it up) will get you an Americano (not to be confused with the coffee drink of the same name) For an interesting and refreshing twist, try tonic water instead of sparkling 3 parts prosecco get you a negroni sbagliato (the \u0026ldquo;messed up negroni\u0026rdquo;) 3 parts gin get you the original negroni 3 parts bourbon get you a boulevardier Pour the ingredients into a mixing glass with some ice and stir until the glass feels very cold. Strain into a glass with a large ice cube (the larger the better: it will melt more slowly) and a twist of orange or lemon peel (and rub the glass rim with it). 
Enjoy!\n","date":1530144000,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1530144000,"objectID":"8f31c223c05e11c1ed02d06c6cc14b7b","permalink":"https://milanlproc.github.io/coding_aperitivo/","publishdate":"2018-06-28T00:00:00Z","relpermalink":"/coding_aperitivo/","section":"","summary":"Here we describe how to add a page to your site.","tags":null,"title":"Coding Aperitivo","type":"page"},{"authors":null,"categories":null,"content":"The Reading Group is our weekly meeting to present and discuss exciting contributions from the community.\nIt currently takes place every Thursday at 12:00 PM (Milan). For more info, feel free to reach out.\nUpcoming Program Date Presenter Paper Nov-21 Yujie Does Fine-Tuning LLMs on New Knowledge Encourage Hallucinations? Nov-28 Roberta Dec-5 Flor Why AI Is WEIRD and Should Not Be This Way: Towards AI For Everyone, With Everyone, By Everyone Dec-12 Dirk Dec-19 Ariana Dec-26 Debora ","date":1530144000,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1530144000,"objectID":"b5c4438decf60aef693d14fa16033443","permalink":"https://milanlproc.github.io/reading_group/","publishdate":"2018-06-28T00:00:00Z","relpermalink":"/reading_group/","section":"","summary":"Papers and program of the MilaNLP Reading Group","tags":null,"title":"Reading Group","type":"page"},{"authors":["Dirk Hovy"],"categories":[],"content":"","date":1529625600,"expirydate":-62135596800,"kind":"page","lang":"en","lastmod":1529625600,"objectID":"f18513a3f10ae12b8977d1caf7da095f","permalink":"https://milanlproc.github.io/publication/2018-social-neural-network/","publishdate":"2020-02-29T14:48:20+01:00","relpermalink":"/publication/2018-social-neural-network/","section":"publication","summary":"Over the years, natural language processing has increasingly focused on tasks that can be solved by statistical models, but ignored the social aspects of language. These limitations are in large part due to historically available data and the limitations of the models, but have narrowed our focus and biased the tools demographically. However, with the increased availability of data sets including socio-demographic information and more expressive (neural) models, we have the opportunity to address both issues. I argue that this combination can broaden the focus of NLP to solve a whole new range of tasks, enable us to generate novel linguistic insights, and provide fairer tools for everyone.","tags":["NLP","computational sociolinguistics","retrofitting","representation learning"],"title":"The Social and the Neural Network: How to Make Natural Language Processing about People again","type":"publication"}]
\ No newline at end of file
diff --git a/project/indomita/index.html b/project/indomita/index.html
index 4feb1093..4219a3ae 100644
--- a/project/indomita/index.html
+++ b/project/indomita/index.html
@@ -875,7 +875,7 @@
Related
Beyond Digital 'Echo Chambers': The Role of Viewpoint Diversity in Political Discussion
-
It's Not Just Hate: A Multi-Dimensional Perspective on Detecting Harmful Speech Online
+
Bridging Fairness and Environmental Sustainability in Natural Language Processing
diff --git a/project/integrator/index.html b/project/integrator/index.html
index 5c07ff37..a84c4ee2 100644
--- a/project/integrator/index.html
+++ b/project/integrator/index.html
@@ -861,7 +861,7 @@