From ac8f0b9b4b93978bd2634a78f6e15aa3e6d13113 Mon Sep 17 00:00:00 2001 From: Kc Jordan Date: Mon, 11 Dec 2023 15:49:58 -0500 Subject: [PATCH] Added new Transformer 'OpennessRuleScore' that performs a rule-based metric on questions to determine their 'openness' --- .../OpennessRuleScoreScript.ipynb | 472 ++++++++++++++++++ convokit/openness_rule/opennessRuleScore.py | 159 ++++++ convokit/openness_rule/pull_request.md | 20 + 3 files changed, 651 insertions(+) create mode 100644 convokit/openness_rule/OpennessRuleScoreScript.ipynb create mode 100644 convokit/openness_rule/opennessRuleScore.py create mode 100644 convokit/openness_rule/pull_request.md diff --git a/convokit/openness_rule/OpennessRuleScoreScript.ipynb b/convokit/openness_rule/OpennessRuleScoreScript.ipynb new file mode 100644 index 00000000..31906ddd --- /dev/null +++ b/convokit/openness_rule/OpennessRuleScoreScript.ipynb @@ -0,0 +1,472 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 60, + "id": "57a1b3ff-d9cb-4d1d-9b64-5e1693e4ba63", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 393 + }, + "executionInfo": { + "elapsed": 2258, + "status": "error", + "timestamp": 1702236835431, + "user": { + "displayName": "Kassandra Jordan", + "userId": "09149191759086076767" + }, + "user_tz": 300 + }, + "id": "57a1b3ff-d9cb-4d1d-9b64-5e1693e4ba63", + "outputId": "ac0d5397-a5d8-4e5a-f82c-1ca46e40b405" + }, + "outputs": [], + "source": [ + "from collections import defaultdict\n", + "from tqdm import tqdm\n", + "from itertools import permutations\n", + "from nltk.tokenize import word_tokenize, sent_tokenize\n", + "from convokit import Corpus, download\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import random\n", + "import language_tool_python\n", + "from convokit import text_processing\n", + "from convokit.transformer import Transformer\n", + "# import nltk" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "id": 
"32fb77e3-236c-4877-ae47-19ced3321af5", + "metadata": { + "executionInfo": { + "elapsed": 8, + "status": "aborted", + "timestamp": 1702236835432, + "user": { + "displayName": "Kassandra Jordan", + "userId": "09149191759086076767" + }, + "user_tz": 300 + }, + "id": "32fb77e3-236c-4877-ae47-19ced3321af5" + }, + "outputs": [], + "source": [ + "PATH = '/Users/kcjordan/Code/cs4350/CANDOR-corpus-cliffhanger'\n", + "data_dir = f'{PATH}'\n", + "\n", + "corpus = Corpus(filename=data_dir)" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "id": "a085c56b-aed9-4ad5-875d-f3c5fc9dbd54", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Utterance(id: '263955', conversation_id: c4d50c68-5b6d-4180-a693-25cab706ada4, reply-to: 263954, speaker: Speaker(id: '5f4fd45ac62ac40f6bdf8ee1', vectors: [], meta: ConvoKitMeta({'sex': 'male', 'politics': 2.0, 'race': 'black_or_african_american', 'edu': 'masters_degree', 'employ': 'unemployed', 'age': 28.0})), timestamp: 203.14, text: 'Yeah. Hello. 
Yeah how you doing?', vectors: [], meta: ConvoKitMeta({'turn_id': 1, 'start': 203.14, 'stop': 211.55, 'interval': -0.7200000000000273, 'delta': 8.410000000000025, 'questions': 1, 'end_question': True, 'overlap': True, 'n_words': 6}))\n" + ] + } + ], + "source": [ + "#Example utterance\n", + "convo = corpus.random_conversation()\n", + "for utt in convo.iter_utterances():\n", + " if utt.meta['questions'] > 0:\n", + " before = utt\n", + " break\n", + "print(before)" + ] + }, + { + "cell_type": "markdown", + "id": "06e90f1a-c39c-4c26-a9ae-51c93a1a1c42", + "metadata": { + "id": "06e90f1a-c39c-4c26-a9ae-51c93a1a1c42" + }, + "source": [ + "# Transformer" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "id": "99d640b1-f6c5-4958-9bcf-0308124dae82", + "metadata": { + "executionInfo": { + "elapsed": 7, + "status": "aborted", + "timestamp": 1702236835432, + "user": { + "displayName": "Kassandra Jordan", + "userId": "09149191759086076767" + }, + "user_tz": 300 + }, + "id": "99d640b1-f6c5-4958-9bcf-0308124dae82" + }, + "outputs": [], + "source": [ + "class OpennessRuleScore(Transformer):\n", + " \"\"\"\n", + " A simple transformer to count all the usage of \"open\" or \"closed\" indicator words in the last question posed.\n", + "\n", + " :param obj_type: type of Corpus object to calculate: 'conversation', 'speaker', or 'utterance', default to be 'utterance'\n", + " :param input_field: Input fields from every utterance object. Will default to reading 'utt.text'. If a string is provided, than consider metadata with field name input_field.\n", + " :param output_field: field for writing the computed output in metadata. Will default to write to utterance metadata with name 'openness'.\n", + " :param input_filter: a boolean function of signature `input_filter(utterance, aux_input)`. attributes will only be computed for utterances where `input_filter` returns `True`. 
By default, will always return `True`, meaning that attributes will be computed for all utterances.\n", + " :param verbosity: frequency at which to print status messages when computing attributes.\n", + " \"\"\"\n", + "\n", + " def __init__(\n", + " self,\n", + " obj_type='utterance',\n", + " output_field='openness_score',\n", + " input_field=None,\n", + " input_filter=None,\n", + " verbosity=10000,\n", + " ):\n", + " if input_filter:\n", + " if len(signature(input_filter).parameters) == 1:\n", + " self.input_filter = lambda utt: input_filter(utt)\n", + " else:\n", + " self.input_filter = input_filter\n", + " else:\n", + " self.input_filter = lambda utt: True\n", + " self.obj_type = obj_type\n", + " self.input_field = input_field\n", + " self.output_field = output_field\n", + " self.verbosity = verbosity\n", + " self.open = [\n", + " \"what\",\n", + " \"do\",\n", + " \"does\",\n", + " \"wonder\",\n", + " \"think\",\n", + " \"you\",\n", + " \"why\",\n", + " \"how\",\n", + " \"should\",\n", + " \"your\"]\n", + " self.closed = [\n", + " \"I\",\n", + " \"right\",\n", + " \"know\",\n", + " \"can\",\n", + " \"or\",\n", + " \"much\",\n", + " \"many\",\n", + " \"long\",\n", + " \"have\",\n", + " \"where\",\n", + " \"when\",\n", + " \"who\",\n", + " \"which\",\n", + " \"yes\",\n", + " \"yeah\",\n", + " \"no\"]\n", + "\n", + " def _print_output(self, i):\n", + " return (self.verbosity > 0) and (i > 0) and (i % self.verbosity == 0)\n", + "\n", + " def find_last_question(self, text):\n", + " #we return the last question\n", + " sent = sent_tokenize(text)\n", + " for sent_tok in reversed(sent):\n", + " if sent_tok[-1]==\"?\":\n", + " return sent_tok\n", + " #no question in this utterance\n", + " return \"\"\n", + "\n", + " def rule_score_for_utterance(self, text):\n", + " score = 0\n", + " for el in text:\n", + " if el in self.closed:\n", + " score -=1\n", + " if el in self.open:\n", + " score +=1\n", + " return score\n", + "\n", + " def transform(self, corpus: Corpus) -> 
Corpus:\n", + " \"\"\"\n", + " Score the amount of \"openness_score\" and annotate in the corresponding object metadata fields.\n", + "\n", + " :param corpus: Corpus\n", + " :return: the corpus\n", + " \"\"\"\n", + " if self.obj_type == 'utterance':\n", + " total = len(list(corpus.iter_utterances()))\n", + "\n", + " for idx, utterance in enumerate(corpus.iter_utterances()):\n", + " if self._print_output(idx):\n", + " print(f\"%03d/%03d {self.obj_type} processed\" % (idx, total))\n", + " if not self.input_filter(utterance):\n", + " continue\n", + " if self.input_field is None:\n", + " text_entry = utterance.text\n", + " elif isinstance(self.input_field, str):\n", + " text_entry = utterance.meta(self.input_field)\n", + " if text_entry is None:\n", + " continue\n", + "\n", + " sent = self.find_last_question(text_entry.lower())\n", + " tok = word_tokenize(sent)\n", + " catch = self.rule_score_for_utterance(tok)\n", + "\n", + " utterance.add_meta(self.output_field, catch)\n", + "\n", + " elif self.obj_type == 'conversation':\n", + " total = len(list(corpus.iter_conversations()))\n", + "\n", + " for idx, convo in enumerate(corpus.iter_conversations()):\n", + " if self._print_output(idx):\n", + " print(f\"%03d/%03d {self.obj_type} processed\" % (idx, total))\n", + "\n", + " if not self.input_filter(convo):\n", + " continue\n", + "\n", + " if self.input_field is None:\n", + " utt_lst = convo.get_utterance_ids()\n", + " text_entry = ' '.join([corpus.get_utterance(x).text for x in utt_lst])\n", + " elif isinstance(self.input_field, str):\n", + " text_entry = convo.meta(self.input_field)\n", + " if text_entry is None:\n", + " continue\n", + "\n", + " sent = self.find_last_question(text_entry.lower())\n", + " tok = word_tokenize(sent)\n", + " catch = self.rule_score_for_utterance(tok)\n", + "\n", + " convo.add_meta(self.output_field, catch)\n", + "\n", + " elif self.obj_type == 'speaker':\n", + " total = len(list(corpus.iter_speakers()))\n", + "\n", + " for idx, sp in 
enumerate(corpus.iter_speakers()):\n", + " if self._print_output(idx):\n", + " print(f\"%03d/%03d {self.obj_type} processed\" % (idx, total))\n", + "\n", + " if not self.input_filter(sp):\n", + " continue\n", + "\n", + " if self.input_field is None:\n", + " utt_lst = sp.get_utterance_ids()\n", + " text_entry = ' '.join([corpus.get_utterance(x).text for x in utt_lst])\n", + " elif isinstance(self.input_field, str):\n", + " text_entry = sp.meta(self.input_field)\n", + " if text_entry is None:\n", + " continue\n", + "\n", + " sent = self.find_last_question(text_entry.lower())\n", + " tok = word_tokenize(sent)\n", + " catch = self.rule_score_for_utterance(tok)\n", + "\n", + " sp.add_meta(self.output_field, catch)\n", + "\n", + " else:\n", + " raise KeyError('obj_type must be utterance, conversation, or speaker')\n", + "\n", + "\n", + " if self.verbosity > 0:\n", + " print(f\"%03d/%03d {self.obj_type} processed\" % (total, total))\n", + " return corpus" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "id": "d3deaa5a-6817-4964-be8f-f2040e03d1ed", + "metadata": { + "executionInfo": { + "elapsed": 7, + "status": "aborted", + "timestamp": 1702236835432, + "user": { + "displayName": "Kassandra Jordan", + "userId": "09149191759086076767" + }, + "user_tz": 300 + }, + "id": "d3deaa5a-6817-4964-be8f-f2040e03d1ed" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "10000/527869 utterance processed\n", + "20000/527869 utterance processed\n", + "30000/527869 utterance processed\n", + "40000/527869 utterance processed\n", + "50000/527869 utterance processed\n", + "60000/527869 utterance processed\n", + "70000/527869 utterance processed\n", + "80000/527869 utterance processed\n", + "90000/527869 utterance processed\n", + "100000/527869 utterance processed\n", + "110000/527869 utterance processed\n", + "120000/527869 utterance processed\n", + "130000/527869 utterance processed\n", + "140000/527869 utterance processed\n", + 
"150000/527869 utterance processed\n", + "160000/527869 utterance processed\n", + "170000/527869 utterance processed\n", + "180000/527869 utterance processed\n", + "190000/527869 utterance processed\n", + "200000/527869 utterance processed\n", + "210000/527869 utterance processed\n", + "220000/527869 utterance processed\n", + "230000/527869 utterance processed\n", + "240000/527869 utterance processed\n", + "250000/527869 utterance processed\n", + "260000/527869 utterance processed\n", + "270000/527869 utterance processed\n", + "280000/527869 utterance processed\n", + "290000/527869 utterance processed\n", + "300000/527869 utterance processed\n", + "310000/527869 utterance processed\n", + "320000/527869 utterance processed\n", + "330000/527869 utterance processed\n", + "340000/527869 utterance processed\n", + "350000/527869 utterance processed\n", + "360000/527869 utterance processed\n", + "370000/527869 utterance processed\n", + "380000/527869 utterance processed\n", + "390000/527869 utterance processed\n", + "400000/527869 utterance processed\n", + "410000/527869 utterance processed\n", + "420000/527869 utterance processed\n", + "430000/527869 utterance processed\n", + "440000/527869 utterance processed\n", + "450000/527869 utterance processed\n", + "460000/527869 utterance processed\n", + "470000/527869 utterance processed\n", + "480000/527869 utterance processed\n", + "490000/527869 utterance processed\n", + "500000/527869 utterance processed\n", + "510000/527869 utterance processed\n", + "520000/527869 utterance processed\n", + "527869/527869 utterance processed\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 64, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "opencounter = OpennessRuleScore(obj_type='utterance')\n", + "opencounter.transform(corpus)" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "id": "0cffd1b8-0b5c-45b9-aa71-7961791d783a", + "metadata": {}, + "outputs": [ + { + 
"name": "stdout", + "output_type": "stream", + "text": [ + "Yeah. Hello. Yeah how you doing?\n", + "Utterance(id: '263955', conversation_id: c4d50c68-5b6d-4180-a693-25cab706ada4, reply-to: 263954, speaker: Speaker(id: '5f4fd45ac62ac40f6bdf8ee1', vectors: [], meta: ConvoKitMeta({'sex': 'male', 'politics': 2.0, 'race': 'black_or_african_american', 'edu': 'masters_degree', 'employ': 'unemployed', 'age': 28.0})), timestamp: 203.14, text: 'Yeah. Hello. Yeah how you doing?', vectors: [], meta: ConvoKitMeta({'turn_id': 1, 'start': 203.14, 'stop': 211.55, 'interval': -0.7200000000000273, 'delta': 8.410000000000025, 'questions': 1, 'end_question': True, 'overlap': True, 'n_words': 6, 'openness_score': 1}))\n" + ] + } + ], + "source": [ + "after = corpus.get_utterance(before.id)\n", + "print(after.text)\n", + "print(after)" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "id": "6b0091e4-36e5-4e7d-a079-19ece0ce5b19", + "metadata": { + "executionInfo": { + "elapsed": 7, + "status": "aborted", + "timestamp": 1702236835432, + "user": { + "displayName": "Kassandra Jordan", + "userId": "09149191759086076767" + }, + "user_tz": 300 + }, + "id": "6b0091e4-36e5-4e7d-a079-19ece0ce5b19" + }, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "open = []\n", + "text = []\n", + "max = (0, \"\")\n", + "min = (0, \"\")\n", + "for utt in corpus.iter_utterances():\n", + " if utt.meta['questions'] > 0:\n", + " score = utt.meta['openness_score']\n", + " open.append(score)\n", + " if score > max[0]:\n", + " max = (score, utt.text)\n", + " if score < min[0]:\n", + " min = (score, utt.text)\n", + " if score == 0:\n", + " text.append(utt.text)\n", + "\n", + "plt.hist(open)\n", + "plt.show()" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/convokit/openness_rule/opennessRuleScore.py b/convokit/openness_rule/opennessRuleScore.py new file mode 100644 index 00000000..d43f0def --- /dev/null +++ b/convokit/openness_rule/opennessRuleScore.py @@ -0,0 +1,159 @@ +from convokit.transformer import Transformer +from inspect import signature +from nltk.tokenize import word_tokenize, sent_tokenize +from convokit.model import Corpus + + +class OpennessRuleScore(Transformer): + """ + A simple transformer to count all the usage of "open" or "closed" indicator words in the last question posed. + + :param obj_type: type of Corpus object to calculate: 'conversation', 'speaker', or 'utterance', default to be 'utterance' + :param input_field: Input fields from every utterance object. Will default to reading 'utt.text'. If a string is provided, than consider metadata with field name input_field. + :param output_field: field for writing the computed output in metadata. 
Will default to write to utterance metadata with name 'openness'. + :param input_filter: a boolean function of signature `input_filter(utterance, aux_input)`. attributes will only be computed for utterances where `input_filter` returns `True`. By default, will always return `True`, meaning that attributes will be computed for all utterances. + :param verbosity: frequency at which to print status messages when computing attributes. + """ + + def __init__( + self, + obj_type="utterance", + output_field="openness_score", + input_field=None, + input_filter=None, + verbosity=10000, + ): + if input_filter: + if len(signature(input_filter).parameters) == 1: + self.input_filter = lambda utt: input_filter(utt) + else: + self.input_filter = input_filter + else: + self.input_filter = lambda utt: True + self.obj_type = obj_type + self.input_field = input_field + self.output_field = output_field + self.verbosity = verbosity + self.open = ["what", "do", "does", "wonder", "think", "you", "why", "how", "should", "your"] + self.closed = [ + "I", + "right", + "know", + "can", + "or", + "much", + "many", + "long", + "have", + "where", + "when", + "who", + "which", + "yes", + "yeah", + "no", + ] + + def _print_output(self, i): + return (self.verbosity > 0) and (i > 0) and (i % self.verbosity == 0) + + def find_last_question(self, text): + # we return the last question + sent = sent_tokenize(text) + for sent_tok in reversed(sent): + if sent_tok[-1] == "?": + return sent_tok + # no question in this utterance + return "" + + def rule_score_for_utterance(self, text): + score = 0 + for el in text: + if el in self.closed: + score -= 1 + if el in self.open: + score += 1 + return score + + def transform(self, corpus: Corpus) -> Corpus: + """ + Score the amount of "openness_score" and annotate in the corresponding object metadata fields. 
+ + :param corpus: Corpus + :return: the corpus + """ + if self.obj_type == "utterance": + total = len(list(corpus.iter_utterances())) + + for idx, utterance in enumerate(corpus.iter_utterances()): + if self._print_output(idx): + print(f"%03d/%03d {self.obj_type} processed" % (idx, total)) + if not self.input_filter(utterance): + continue + if self.input_field is None: + text_entry = utterance.text + elif isinstance(self.input_field, str): + text_entry = utterance.meta(self.input_field) + if text_entry is None: + continue + + sent = self.find_last_question(text_entry.lower()) + tok = word_tokenize(sent) + catch = self.rule_score_for_utterance(tok) + + utterance.add_meta(self.output_field, catch) + + elif self.obj_type == "conversation": + total = len(list(corpus.iter_conversations())) + + for idx, convo in enumerate(corpus.iter_conversations()): + if self._print_output(idx): + print(f"%03d/%03d {self.obj_type} processed" % (idx, total)) + + if not self.input_filter(convo): + continue + + if self.input_field is None: + utt_lst = convo.get_utterance_ids() + text_entry = " ".join([corpus.get_utterance(x).text for x in utt_lst]) + elif isinstance(self.input_field, str): + text_entry = convo.meta(self.input_field) + if text_entry is None: + continue + + sent = self.find_last_question(text_entry.lower()) + tok = word_tokenize(sent) + catch = self.rule_score_for_utterance(tok) + + convo.add_meta(self.output_field, catch) + + elif self.obj_type == "speaker": + total = len(list(corpus.iter_speakers())) + + for idx, sp in enumerate(corpus.iter_speakers()): + if self._print_output(idx): + print(f"%03d/%03d {self.obj_type} processed" % (idx, total)) + + if not self.input_filter(sp): + continue + + if self.input_field is None: + utt_lst = sp.get_utterance_ids() + text_entry = " ".join([corpus.get_utterance(x).text for x in utt_lst]) + elif isinstance(self.input_field, str): + text_entry = sp.meta(self.input_field) + if text_entry is None: + continue + + sent = 
self.find_last_question(text_entry.lower()) + tok = word_tokenize(sent) + catch = self.rule_score_for_utterance(tok) + + sp.add_meta(self.output_field, catch) + + else: + raise KeyError("obj_type must be utterance, conversation, or speaker") + + if self.verbosity > 0: + print(f"%03d/%03d {self.obj_type} processed" % (total, total)) + return corpus diff --git a/convokit/openness_rule/pull_request.md b/convokit/openness_rule/pull_request.md new file mode 100644 index 00000000..ba6f5720 --- /dev/null +++ b/convokit/openness_rule/pull_request.md @@ -0,0 +1,20 @@ +### Description + +Feature: This PR introduces a new transformer that scores utterances with questions (0 otherwise). + +The rule-based method checks to see if there are keywords associated with "open" or "closed" and adds or subtracts a point respectively. The list of keywords associated with questions is finite. This list includes the wh-questions previously explained through Pomerantz. For example, what, why, and how are all considered indicators that a question is "more open." While who, where, when, and which are all considered "more closed." + +This takes from both the distinction that Robinson and Rackstraw make, but also Dohrenwend's idea that closed questions consist of the (1) selection question, (2) yes-no question, and (3) identification question. + +Another notable category of indicator words are words that invoke opinion such as "wonder," "think," or "you." These tokens come from the human annotation tasks. We also relegate "yeah," "yes," and "no" as indicators of Dohrenwend's yes-no questions and rhetorical questions. + +When checking against the utterance, we only score the last question. This is due to the fact that when asked a series of questions, respondents will usually answer the last one posed. This also helps us avoid scoring multiple questions in one utterance. 
+ +### Motivation and Context +We seek to examine the phenomenon of question "openness" and how it might change throughout a conversation. + +### How Has This Been Tested? +We ran sanity checks by identifying a few utterances in a corpus (CANDOR) and checking that the score performs as expected once the transformer is applied. + +### Other information: +N/A