# Patch header preserved verbatim from the original text:
# diff --git a/convokit/conversation_smoothness/conversationSmoothness.py b/convokit/conversation_smoothness/conversationSmoothness.py new file mode 100644 index 00000000..97caf050 --- /dev/null +++ b/convokit/conversation_smoothness/conversationSmoothness.py @@ -0,0 +1,246 @@
from collections import deque
from inspect import signature

from scipy.stats import kendalltau

from convokit import Corpus, PolitenessStrategies, TextParser
from convokit.transformer import Transformer

# Weight applied to each politeness-strategy feature when scoring an utterance.
# Built once at import time instead of on every call.
# NOTE(review): the provenance of these weights is not recorded in this file.
_POLITENESS_WEIGHTS = {
    "feature_politeness_==Please==": 0.49,
    "feature_politeness_==Please_start==": -0.3,
    "feature_politeness_==HASHEDGE==": 0,
    "feature_politeness_==Indirect_(btw)==": 0.63,
    "feature_politeness_==Hedges==": 0.14,
    "feature_politeness_==Factuality==": -0.38,
    "feature_politeness_==Deference==": 0.78,
    "feature_politeness_==Gratitude==": 0.87,
    "feature_politeness_==Apologizing==": 0.36,
    "feature_politeness_==1st_person_pl.==": 0.08,
    "feature_politeness_==1st_person==": 0.08,
    "feature_politeness_==1st_person_start==": 0.12,
    "feature_politeness_==2nd_person==": 0.05,
    "feature_politeness_==2nd_person_start==": -0.3,
    "feature_politeness_==Indirect_(greeting)==": 0.43,
    "feature_politeness_==Direct_question==": -0.27,
    "feature_politeness_==Direct_start==": -0.43,
    "feature_politeness_==HASPOSITIVE==": 0.12,
    "feature_politeness_==HASNEGATIVE==": -0.13,
    "feature_politeness_==SUBJUNCTIVE==": 0,
    "feature_politeness_==INDICATIVE==": 0,
}

_HAS_POSITIVE = "feature_politeness_==HASPOSITIVE=="
_HAS_NEGATIVE = "feature_politeness_==HASNEGATIVE=="
_VALID_METRICS = ("ratio", "ratio_old", "decline", "tone")
_NAN = float("nan")


def calculate_politeness_score(polite_strat):
    """Return a weighted politeness score for one politeness-strategy map.

    Args:
        polite_strat (dict): maps politeness feature names (for example
            ``"feature_politeness_==Gratitude=="``) to their values, as stored
            by the PolitenessStrategies transformer.

    Returns:
        float: the sum of ``weight * value`` over all features. Features that
        have no entry in the weight table contribute 0 instead of raising
        ``KeyError``; an empty map scores 0.
    """
    return sum(
        _POLITENESS_WEIGHTS.get(feature, 0) * value
        for feature, value in polite_strat.items()
    )


def _length_ratio(first, second):
    """Return the smaller of two lengths divided by the larger (1.0 if equal)."""
    low, high = (first, second) if first <= second else (second, first)
    if low == high:
        # Covers the 0 / 0 case, which would otherwise raise ZeroDivisionError.
        return 1.0
    return low / high


class ConversationSmoothness(Transformer):
    """
    A transformer that labels each conversation with a "smoothness" score
    computed from the utterances at the end of the conversation.

    Every utterance in the inspected window must carry the metadata fields
    ``delta``, ``start`` and ``stop`` (in the CANDOR sample shown in the demo
    notebook, ``delta`` equals ``stop - start``). The original author notes
    that the transformer will only work on the Candor corpus.

    :param metric: which computation to use. One of:

        - ``'ratio'`` (default): mean of min/max ``delta`` over every pair of
          adjacent utterances in the window.
        - ``'ratio_old'``: the same ratio, averaged over the disjoint pairs
          (1st, 2nd), (3rd, 4th), ...
        - ``'decline'``: Kendall's tau between the chronological order of the
          disjoint pairs and their order by decreasing ``|delta difference|``;
          1.0 means the differences shrink steadily towards the end.
        - ``'tone'``: politeness-based statistics comparing the first and the
          second member of each disjoint pair.
    :param end_len: the number of utterances to take from the conversation end
        (must be an even number). If the window holds an odd number of
        utterances, the final one has no partner and is ignored by the
        pair-based metrics.
    :param output_field: prefix of the conversation metadata fields that are
        written. Defaults to ``'smoothness'``.
    :param input_filter: a boolean function of signature
        `input_filter(conversation)` or `input_filter(conversation, aux_input)`.
        Attributes will only be computed for conversations where `input_filter`
        returns `True`. By default every conversation is processed. Two-argument
        filters receive an empty dict as ``aux_input``.
    :param verbosity: frequency at which to print status messages when
        computing attributes.
    """

    def __init__(
        self,
        metric="ratio",
        end_len=12,
        output_field="smoothness",
        input_filter=None,
        verbosity=200,
    ):
        if input_filter is None:
            self.input_filter = lambda convo: True
        elif len(signature(input_filter).parameters) == 1:
            self.input_filter = input_filter
        else:
            # transform() always calls the filter with one argument, so adapt
            # the documented two-argument form instead of failing with TypeError.
            self.input_filter = lambda convo: input_filter(convo, {})
        self.metric = metric
        self.end_len = end_len
        self.output_field = output_field
        self.verbosity = verbosity
        self.ps = PolitenessStrategies(verbose=0)
        self.parser = TextParser(verbosity=0)

    def _print_output(self, i):
        """Return True when a progress message is due for conversation index ``i``."""
        return (self.verbosity > 0) and (i > 0) and (i % self.verbosity == 0)

    @staticmethod
    def _mean_length_ratio(pairs):
        """Return the mean ``delta`` ratio over ``pairs`` (NaN when there are none)."""
        if not pairs:
            return _NAN
        total = sum(
            _length_ratio(first.meta["delta"], second.meta["delta"])
            for first, second in pairs
        )
        return total / len(pairs)

    @staticmethod
    def _decline_tau(pairs):
        """Return Kendall's tau between time order and decreasing-gap order of ``pairs``."""
        gaps = [abs(first.meta["delta"] - second.meta["delta"]) for first, second in pairs]
        if len(gaps) < 2:
            # Tau is undefined for fewer than two observations.
            return _NAN
        chronological = list(range(len(gaps)))
        # Rank pairs by position rather than by utterance-id strings, so the
        # result does not depend on how ids happen to sort lexicographically.
        by_gap_desc = sorted(
            chronological, key=lambda pos: (gaps[pos], pos), reverse=True
        )
        tau, _ = kendalltau(by_gap_desc, chronological)
        return float(tau)

    def _annotate_tone(self, convo, pairs):
        """Write the politeness-based 'tone' statistics for ``pairs`` onto ``convo``."""
        # Index 0 aggregates the first member of every pair, index 1 the second.
        pos_counts = [0, 0]
        neg_counts = [0, 0]
        politeness = [0, 0]

        for pair in pairs:
            for side, utt in enumerate(pair):
                # Politeness strategies need the parsed text, so parse first.
                self.parser.transform_utterance(utt)
                annotated = self.ps.transform_utterance(utt, markers=True)
                strategies = annotated.meta["politeness_strategies"]
                pos_counts[side] += strategies[_HAS_POSITIVE]
                neg_counts[side] += strategies[_HAS_NEGATIVE]
                # Accumulate: the totals are averaged over the pair count below.
                politeness[side] += calculate_politeness_score(strategies)

        n_pairs = len(pairs)

        def mean_gap(totals):
            """Absolute difference between the two per-pair averages."""
            if not n_pairs:
                return _NAN
            return abs(totals[0] / n_pairs - totals[1] / n_pairs)

        prefix = f"{self.output_field}_{self.metric}"
        convo.add_meta(f"{prefix}_pos_count1", pos_counts[0])
        convo.add_meta(f"{prefix}_neg_count1", neg_counts[0])
        convo.add_meta(f"{prefix}_pos_count2", pos_counts[1])
        convo.add_meta(f"{prefix}_neg_count2", neg_counts[1])
        convo.add_meta(f"{prefix}_pos", mean_gap(pos_counts))
        convo.add_meta(f"{prefix}_neg", mean_gap(neg_counts))
        convo.add_meta(f"{prefix}_politeness1", politeness[0])
        convo.add_meta(f"{prefix}_politeness2", politeness[1])
        convo.add_meta(f"{prefix}_politeness_diff", mean_gap(politeness))

    def transform(self, corpus: Corpus) -> Corpus:
        """
        Compute the configured smoothness metric for every conversation that
        passes the input filter and store it in the conversation metadata.

        For the ``ratio``, ``ratio_old`` and ``decline`` metrics the score is
        written to ``<output_field>_<metric>``; for ``tone`` several
        ``<output_field>_tone_*`` fields are written. In all cases
        ``<output_field>_last_utts_time`` receives ``stop`` of the last minus
        ``start`` of the first utterance in the window. Scores that cannot be
        computed (too few utterances) are stored as NaN.

        :param corpus: Corpus
        :return: the corpus
        :raises KeyError: if ``metric`` is not one of the supported names.
        """
        if self.metric not in _VALID_METRICS:
            raise KeyError(
                f"metric must be ratio, ratio_old, decline, or tone; got {self.metric!r}"
            )

        total = sum(1 for _ in corpus.iter_conversations())

        for idx, convo in enumerate(corpus.iter_conversations()):
            if self._print_output(idx):
                print(f"{idx:03d}/{total:03d} conversations processed")

            if not self.input_filter(convo):
                continue

            # NOTE(review): this assumes get_utterance_ids() yields ids in
            # chronological order — confirm against the corpus in use.
            window_ids = convo.get_utterance_ids()[-self.end_len :]
            window = [corpus.get_utterance(utt_id) for utt_id in window_ids]
            if not window:
                continue

            # Disjoint pairs (1st, 2nd), (3rd, 4th), ...; an unpaired trailing
            # utterance is dropped by zip().
            disjoint_pairs = list(zip(window[0::2], window[1::2]))

            if self.metric == "tone":
                self._annotate_tone(convo, disjoint_pairs)
            else:
                if self.metric == "ratio":
                    adjacent_pairs = list(zip(window, window[1:]))
                    score = self._mean_length_ratio(adjacent_pairs)
                elif self.metric == "ratio_old":
                    score = self._mean_length_ratio(disjoint_pairs)
                else:  # "decline"
                    score = self._decline_tau(disjoint_pairs)
                convo.add_meta(f"{self.output_field}_{self.metric}", score)

            window_duration = window[-1].meta["stop"] - window[0].meta["start"]
            convo.add_meta(f"{self.output_field}_last_utts_time", window_duration)

        return corpus


# Patch header for the next file in this patch, preserved verbatim (it continues on the following line):
# diff --git a/convokit/conversation_smoothness/demos/Conversation_Smoothness_Transformer.ipynb b/convokit/conversation_smoothness/demos/Conversation_Smoothness_Transformer.ipynb new file mode 100644 index 00000000..a2b9de61 --- /dev/null +++
b/convokit/conversation_smoothness/demos/Conversation_Smoothness_Transformer.ipynb @@ -0,0 +1,667 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "YCcTEDDlW0UP" + }, + "outputs": [], + "source": [ + "from convokit import Corpus, PolitenessStrategies, TextParser\n", + "from convokit.transformer import Transformer\n", + "from inspect import signature\n", + "from collections import deque" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "V_e23nMDyqb9" + }, + "outputs": [], + "source": [ + "path_to_folder = \"/content/gdrive/My Drive/INFO 4350 Final Project/Data and Analysis\"\n", + "# change this to wherever the CANDOR corpus is located in the file system" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "x6c1cZ1UdMhB" + }, + "outputs": [], + "source": [ + "# load the Candor dataset in from the folder we have on Drive\n", + "corpus = Corpus(filename=f\"{path_to_folder}/CANDOR-corpus-cliffhanger\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "4KKu541sgM4P", + "outputId": "f1bb28a6-9ccf-461d-9e6c-c224868ae31e" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Utterance Text: Yeah, they emailed me about that.\n", + "Utterance Meta: ConvoKitMeta({'turn_id': 300, 'start': 1437.44, 'stop': 1440.96, 'interval': -2.3199999999999363, 'delta': 3.5199999999999823, 'questions': 0, 'end_question': False, 'overlap': True, 'n_words': 6})\n", + "Conversation('id': '3231ee9a-483f-464c-b563-da35de30594c', 'utterances': ['229458', '229459', '229460', '229461', '229462', '229463', '229464', '229465', '229466', '229467', '229468', '229469', '229470', '229471', '229472', '229473', '229474', '229475', '229476', '229477', '229478', '229479', '229480', '229481', '229482', '229483', '229484', '229485', '229486', '229487', '229488', 
'229489', '229490', '229491', '229492', '229493', '229494', '229495', '229496', '229497', '229498', '229499', '229500', '229501', '229502', '229503', '229504', '229505', '229506', '229507', '229508', '229509', '229510', '229511', '229512', '229513', '229514', '229515', '229516', '229517', '229518', '229519', '229520', '229521', '229522', '229523', '229524', '229525', '229526', '229527', '229528', '229529', '229530', '229531', '229532', '229533', '229534', '229535', '229536', '229537', '229538', '229539', '229540', '229541', '229542', '229543', '229544', '229545', '229546', '229547', '229548', '229549', '229550', '229551', '229552', '229553', '229554', '229555', '229556', '229557', '229558', '229559', '229560', '229561', '229562', '229563', '229564', '229565', '229566', '229567', '229568', '229569', '229570', '229571', '229572', '229573', '229574', '229575', '229576', '229577', '229578', '229579', '229580', '229581', '229582', '229583', '229584', '229585', '229586', '229587', '229588', '229589', '229590', '229591', '229592', '229593', '229594', '229595', '229596', '229597', '229598', '229599', '229600', '229601', '229602', '229603', '229604', '229605', '229606', '229607', '229608', '229609', '229610', '229611', '229612', '229613', '229614', '229615', '229616', '229617', '229618', '229619', '229620', '229621', '229622', '229623', '229624', '229625', '229626', '229627', '229628', '229629', '229630', '229631', '229632', '229633', '229634', '229635', '229636', '229637', '229638', '229639', '229640', '229641', '229642', '229643', '229644', '229645', '229646', '229647', '229648', '229649', '229650', '229651', '229652', '229653', '229654', '229655', '229656', '229657', '229658', '229659', '229660', '229661', '229662', '229663', '229664', '229665', '229666', '229667', '229668', '229669', '229670', '229671', '229672', '229673', '229674', '229675', '229676', '229677', '229678', '229679', '229680', '229681', '229682', '229683', '229684', '229685', '229686', '229687', '229688', 
'229689', '229690', '229691', '229692', '229693', '229694', '229695', '229696', '229697', '229698', '229699', '229700', '229701', '229702', '229703', '229704', '229705', '229706', '229707', '229708', '229709', '229710', '229711', '229712', '229713', '229714', '229715', '229716', '229717', '229718', '229719', '229720', '229721', '229722', '229723', '229724', '229725', '229726', '229727', '229728', '229729', '229730', '229731', '229732', '229733', '229734', '229735', '229736', '229737', '229738', '229739', '229740', '229741', '229742', '229743', '229744'], 'meta': ConvoKitMeta({'partner_id': {'5dd4bd7ae4f4a0499548a810': '5dd352c51c219b35931aefd1', '5dd352c51c219b35931aefd1': '5dd4bd7ae4f4a0499548a810'}, 'date': {'5dd4bd7ae4f4a0499548a810': datetime.datetime(2020, 1, 10, 0, 0), '5dd352c51c219b35931aefd1': datetime.datetime(2020, 1, 10, 0, 0)}, 'survey_duration_in_seconds': {'5dd4bd7ae4f4a0499548a810': 3372, '5dd352c51c219b35931aefd1': 4401}, 'time_zone': {'5dd4bd7ae4f4a0499548a810': 8.0, '5dd352c51c219b35931aefd1': 8.0}, 'pre_affect': {'5dd4bd7ae4f4a0499548a810': 7.0, '5dd352c51c219b35931aefd1': 6.0}, 'pre_arousal': {'5dd4bd7ae4f4a0499548a810': 4.0, '5dd352c51c219b35931aefd1': 6.0}, 'technical_quality': {'5dd4bd7ae4f4a0499548a810': 1.0, '5dd352c51c219b35931aefd1': 1.0}, 'conv_length': {'5dd4bd7ae4f4a0499548a810': 33.0, '5dd352c51c219b35931aefd1': 30.0}, 'affect': {'5dd4bd7ae4f4a0499548a810': 7.0, '5dd352c51c219b35931aefd1': 7.0}, 'arousal': {'5dd4bd7ae4f4a0499548a810': 5.0, '5dd352c51c219b35931aefd1': 7.0}, 'overall_affect': {'5dd4bd7ae4f4a0499548a810': 5.0, '5dd352c51c219b35931aefd1': 8.0}, 'overall_arousal': {'5dd4bd7ae4f4a0499548a810': 5.0, '5dd352c51c219b35931aefd1': 7.0}, 'overall_memory_rating': {'5dd4bd7ae4f4a0499548a810': nan, '5dd352c51c219b35931aefd1': nan}, 'begin_affect': {'5dd4bd7ae4f4a0499548a810': 7.0, '5dd352c51c219b35931aefd1': 6.0}, 'begin_arousal': {'5dd4bd7ae4f4a0499548a810': 6.0, '5dd352c51c219b35931aefd1': 6.0}, 'begin_memory_rating': 
{'5dd4bd7ae4f4a0499548a810': nan, '5dd352c51c219b35931aefd1': nan}, 'begin_memory_text': {'5dd4bd7ae4f4a0499548a810': nan, '5dd352c51c219b35931aefd1': nan}, 'middle_affect': {'5dd4bd7ae4f4a0499548a810': 5.0, '5dd352c51c219b35931aefd1': 8.0}, 'middle_arousal': {'5dd4bd7ae4f4a0499548a810': 5.0, '5dd352c51c219b35931aefd1': 8.0}, 'middle_memory_rating': {'5dd4bd7ae4f4a0499548a810': nan, '5dd352c51c219b35931aefd1': nan}, 'middle_memory_text': {'5dd4bd7ae4f4a0499548a810': nan, '5dd352c51c219b35931aefd1': nan}, 'end_affect': {'5dd4bd7ae4f4a0499548a810': 7.0, '5dd352c51c219b35931aefd1': 8.0}, 'end_arousal': {'5dd4bd7ae4f4a0499548a810': 5.0, '5dd352c51c219b35931aefd1': 8.0}, 'end_memory_rating': {'5dd4bd7ae4f4a0499548a810': nan, '5dd352c51c219b35931aefd1': nan}, 'end_memory_text': {'5dd4bd7ae4f4a0499548a810': nan, '5dd352c51c219b35931aefd1': nan}, 'worst_affect': {'5dd4bd7ae4f4a0499548a810': 3.0, '5dd352c51c219b35931aefd1': 7.0}, 'worst_arousal': {'5dd4bd7ae4f4a0499548a810': 4.0, '5dd352c51c219b35931aefd1': 7.0}, 'best_affect': {'5dd4bd7ae4f4a0499548a810': 8.0, '5dd352c51c219b35931aefd1': 9.0}, 'best_arousal': {'5dd4bd7ae4f4a0499548a810': 7.0, '5dd352c51c219b35931aefd1': 9.0}, 'how_enjoyable': {'5dd4bd7ae4f4a0499548a810': 6.0, '5dd352c51c219b35931aefd1': 9.0}, 'i_like_you': {'5dd4bd7ae4f4a0499548a810': 5.0, '5dd352c51c219b35931aefd1': 7.0}, 'you_like_me': {'5dd4bd7ae4f4a0499548a810': 5.0, '5dd352c51c219b35931aefd1': 6.0}, 'in_common': {'5dd4bd7ae4f4a0499548a810': 6.0, '5dd352c51c219b35931aefd1': 8.0}, 'conversationalist': {'5dd4bd7ae4f4a0499548a810': 40.0, '5dd352c51c219b35931aefd1': 95.0}, 'next_seven_days': {'5dd4bd7ae4f4a0499548a810': 1.0, '5dd352c51c219b35931aefd1': 1.0}, 'my_friends_like_you': {'5dd4bd7ae4f4a0499548a810': 5.0, '5dd352c51c219b35931aefd1': 65.0}, 'good_for_advice': {'5dd4bd7ae4f4a0499548a810': 0.0, '5dd352c51c219b35931aefd1': 75.0}, 'i_felt_close_to_my_partner': {'5dd4bd7ae4f4a0499548a810': nan, '5dd352c51c219b35931aefd1': nan}, 
'i_would_like_to_become_friends': {'5dd4bd7ae4f4a0499548a810': nan, '5dd352c51c219b35931aefd1': nan}, 'i_paid_attention_to_my_partner': {'5dd4bd7ae4f4a0499548a810': nan, '5dd352c51c219b35931aefd1': nan}, 'my_partner_paid_attention_to_me': {'5dd4bd7ae4f4a0499548a810': nan, '5dd352c51c219b35931aefd1': nan}, 'my_partner_was_clear_and_coherent': {'5dd4bd7ae4f4a0499548a810': nan, '5dd352c51c219b35931aefd1': nan}, 'interested_in_exchanging_contact_info': {'5dd4bd7ae4f4a0499548a810': nan, '5dd352c51c219b35931aefd1': nan}, 'you_are_intelligent': {'5dd4bd7ae4f4a0499548a810': 3.0, '5dd352c51c219b35931aefd1': 8.0}, 'you_are_quickwitted': {'5dd4bd7ae4f4a0499548a810': 3.0, '5dd352c51c219b35931aefd1': 8.0}, 'you_are_competent': {'5dd4bd7ae4f4a0499548a810': 4.0, '5dd352c51c219b35931aefd1': 9.0}, 'you_are_kind': {'5dd4bd7ae4f4a0499548a810': 7.0, '5dd352c51c219b35931aefd1': 9.0}, 'you_are_friendly': {'5dd4bd7ae4f4a0499548a810': 7.0, '5dd352c51c219b35931aefd1': 9.0}, 'you_are_warm': {'5dd4bd7ae4f4a0499548a810': 7.0, '5dd352c51c219b35931aefd1': 9.0}, 'you_think_i_am_intelligent': {'5dd4bd7ae4f4a0499548a810': 7.0, '5dd352c51c219b35931aefd1': 8.0}, 'you_think_i_am_quickwitted': {'5dd4bd7ae4f4a0499548a810': 3.0, '5dd352c51c219b35931aefd1': 8.0}, 'you_think_i_am_competent': {'5dd4bd7ae4f4a0499548a810': 6.0, '5dd352c51c219b35931aefd1': 8.0}, 'you_think_i_am_kind': {'5dd4bd7ae4f4a0499548a810': 6.0, '5dd352c51c219b35931aefd1': 8.0}, 'you_think_i_am_friendly': {'5dd4bd7ae4f4a0499548a810': 5.0, '5dd352c51c219b35931aefd1': 8.0}, 'you_think_i_am_warm': {'5dd4bd7ae4f4a0499548a810': 5.0, '5dd352c51c219b35931aefd1': 7.0}, 'you_are_humble': {'5dd4bd7ae4f4a0499548a810': 5.0, '5dd352c51c219b35931aefd1': 8.0}, 'you_are_giving': {'5dd4bd7ae4f4a0499548a810': 6.0, '5dd352c51c219b35931aefd1': 8.0}, 'you_are_fair': {'5dd4bd7ae4f4a0499548a810': 6.0, '5dd352c51c219b35931aefd1': 9.0}, 'you_are_trustworthy': {'5dd4bd7ae4f4a0499548a810': 6.0, '5dd352c51c219b35931aefd1': 9.0}, 'you_are_agreeable': 
{'5dd4bd7ae4f4a0499548a810': 6.0, '5dd352c51c219b35931aefd1': 9.0}, 'you_are_playful': {'5dd4bd7ae4f4a0499548a810': 5.0, '5dd352c51c219b35931aefd1': 8.0}, 'i_am_intelligent': {'5dd4bd7ae4f4a0499548a810': nan, '5dd352c51c219b35931aefd1': nan}, 'i_am_quickwitted': {'5dd4bd7ae4f4a0499548a810': nan, '5dd352c51c219b35931aefd1': nan}, 'i_am_competent': {'5dd4bd7ae4f4a0499548a810': nan, '5dd352c51c219b35931aefd1': nan}, 'i_am_kind': {'5dd4bd7ae4f4a0499548a810': nan, '5dd352c51c219b35931aefd1': nan}, 'i_am_friendly': {'5dd4bd7ae4f4a0499548a810': nan, '5dd352c51c219b35931aefd1': nan}, 'i_am_warm': {'5dd4bd7ae4f4a0499548a810': nan, '5dd352c51c219b35931aefd1': nan}, 'i_am_humble': {'5dd4bd7ae4f4a0499548a810': nan, '5dd352c51c219b35931aefd1': nan}, 'i_am_giving': {'5dd4bd7ae4f4a0499548a810': nan, '5dd352c51c219b35931aefd1': nan}, 'i_am_fair': {'5dd4bd7ae4f4a0499548a810': nan, '5dd352c51c219b35931aefd1': nan}, 'i_am_trustworthy': {'5dd4bd7ae4f4a0499548a810': nan, '5dd352c51c219b35931aefd1': nan}, 'i_am_agreeable': {'5dd4bd7ae4f4a0499548a810': nan, '5dd352c51c219b35931aefd1': nan}, 'i_am_playful': {'5dd4bd7ae4f4a0499548a810': nan, '5dd352c51c219b35931aefd1': nan}, 'rest_of_day_open': {'5dd4bd7ae4f4a0499548a810': 'I see that tomorrow will be a good day to finish off the week and then I will have an enjoyable weekend spending time with my family and attending church. I see myself sleeping in and being better rested, continuing my exercises to be in less pain and enjoying 2 days off of work.', '5dd352c51c219b35931aefd1': \"It will be good. It might snow a little so that will be fun. I stocked up on groceries so in case I get stuck at my house I'll have food to eat. I have natural gas so if my power goes out I can cook. I hope my handyman finishes the bathroom remodel project soon. \"}, 'critical_positive': {'5dd4bd7ae4f4a0499548a810': 'Having things in common worked well. Being able to expand on things that were similar and explain in further detail. 
Being open to listening and hearing the other person and trying to find further similarities was very helpful. And then keeping the conversation going and avoiding dead space was very good so when we did find something in common, we kept talking about it.', '5dd352c51c219b35931aefd1': \"We had similar work experiences. She worked in occupational therapy and I worked in social services so we could relate with each other as far as dealing with people's problems and burn out and and all that. We both have kids. We took turns talking and relating with each other about kids and jobs. \"}, 'critical_negative': {'5dd4bd7ae4f4a0499548a810': \"I think one of the biggest things that didn't work well was that she didn't remember what I had said earlier in the conversation, I'm not sure if it was memory or she wasn't paying attention well. I also did the same thing and didn't remember that she only had 1 daughter so she had to repeat herself. I think it was also awkward to have a conversation with someone I haven't seen before and have to keep it going.\", '5dd352c51c219b35931aefd1': \"Everything went well. She talked about her husband who helps with the kids and I thought that was really cool. She was younger than I am but that's ok. We both agreed that the West Coast isn't what it used to be. She had not heard of Bret Weinstein from Evergreen State College but that's ok. 
\"}, 'smiles': {'5dd4bd7ae4f4a0499548a810': 3.0, '5dd352c51c219b35931aefd1': 5.0}, 'laughter': {'5dd4bd7ae4f4a0499548a810': 5.0, '5dd352c51c219b35931aefd1': 4.0}, 'questions': {'5dd4bd7ae4f4a0499548a810': 7.0, '5dd352c51c219b35931aefd1': 5.0}, 'nods': {'5dd4bd7ae4f4a0499548a810': 4.0, '5dd352c51c219b35931aefd1': 4.0}, 'verbal_feedback': {'5dd4bd7ae4f4a0499548a810': 4.0, '5dd352c51c219b35931aefd1': 5.0}, 'speak_quickly': {'5dd4bd7ae4f4a0499548a810': 3.0, '5dd352c51c219b35931aefd1': 5.0}, 'my_turn_length': {'5dd4bd7ae4f4a0499548a810': 5.0, '5dd352c51c219b35931aefd1': 5.0}, 'your_turn_length': {'5dd4bd7ae4f4a0499548a810': 5.0, '5dd352c51c219b35931aefd1': 5.0}, 'topic_diversity': {'5dd4bd7ae4f4a0499548a810': 5.0, '5dd352c51c219b35931aefd1': 4.0}, 'turn_overlap_gap': {'5dd4bd7ae4f4a0499548a810': 5.0, '5dd352c51c219b35931aefd1': 5.0}, 'i_disclosed': {'5dd4bd7ae4f4a0499548a810': 6.0, '5dd352c51c219b35931aefd1': 6.0}, 'you_disclosed': {'5dd4bd7ae4f4a0499548a810': 6.0, '5dd352c51c219b35931aefd1': 6.0}, 'i_am_good_listener': {'5dd4bd7ae4f4a0499548a810': 7.0, '5dd352c51c219b35931aefd1': 7.0}, 'you_are_good_listener': {'5dd4bd7ae4f4a0499548a810': 5.0, '5dd352c51c219b35931aefd1': 7.0}, 'conv_leader': {'5dd4bd7ae4f4a0499548a810': 7.0, '5dd352c51c219b35931aefd1': 4.0}, 'talk_time_100_1': {'5dd4bd7ae4f4a0499548a810': 40.0, '5dd352c51c219b35931aefd1': 50.0}, 'you_total_talk_time': {'5dd4bd7ae4f4a0499548a810': 60.0, '5dd352c51c219b35931aefd1': 50.0}, 'conv_pace': {'5dd4bd7ae4f4a0499548a810': 5.0, '5dd352c51c219b35931aefd1': 5.0}, 'my_mind_wander': {'5dd4bd7ae4f4a0499548a810': 6.0, '5dd352c51c219b35931aefd1': 6.0}, 'your_mind_wander': {'5dd4bd7ae4f4a0499548a810': 6.0, '5dd352c51c219b35931aefd1': 6.0}, 'i_am_funny': {'5dd4bd7ae4f4a0499548a810': 2.0, '5dd352c51c219b35931aefd1': 5.0}, 'you_are_funny': {'5dd4bd7ae4f4a0499548a810': 2.0, '5dd352c51c219b35931aefd1': 5.0}, 'i_am_polite': {'5dd4bd7ae4f4a0499548a810': 8.0, '5dd352c51c219b35931aefd1': 7.0}, 'you_are_polite': 
{'5dd4bd7ae4f4a0499548a810': 8.0, '5dd352c51c219b35931aefd1': 7.0}, 'i_tried_to_impress': {'5dd4bd7ae4f4a0499548a810': 6.0, '5dd352c51c219b35931aefd1': 6.0}, 'you_tried_to_impress': {'5dd4bd7ae4f4a0499548a810': 6.0, '5dd352c51c219b35931aefd1': 6.0}, 'responsive_1': {'5dd4bd7ae4f4a0499548a810': 4.0, '5dd352c51c219b35931aefd1': 6.0}, 'responsive_2': {'5dd4bd7ae4f4a0499548a810': 4.0, '5dd352c51c219b35931aefd1': 6.0}, 'responsive_3': {'5dd4bd7ae4f4a0499548a810': 4.0, '5dd352c51c219b35931aefd1': 6.0}, 'end_you': {'5dd4bd7ae4f4a0499548a810': 4.0, '5dd352c51c219b35931aefd1': 5.0}, 'end_you_time_1': {'5dd4bd7ae4f4a0499548a810': 2.0, '5dd352c51c219b35931aefd1': nan}, 'longer_self': {'5dd4bd7ae4f4a0499548a810': nan, '5dd352c51c219b35931aefd1': 2.0}, 'how_long_you': {'5dd4bd7ae4f4a0499548a810': nan, '5dd352c51c219b35931aefd1': nan}, 'end_other': {'5dd4bd7ae4f4a0499548a810': 5.0, '5dd352c51c219b35931aefd1': 4.0}, 'end_other_time_1': {'5dd4bd7ae4f4a0499548a810': nan, '5dd352c51c219b35931aefd1': 25.0}, 'longer_other': {'5dd4bd7ae4f4a0499548a810': 1.0, '5dd352c51c219b35931aefd1': nan}, 'how_long_other': {'5dd4bd7ae4f4a0499548a810': 3.0, '5dd352c51c219b35931aefd1': nan}, 'our_thoughts_synced_up_sr1': {'5dd4bd7ae4f4a0499548a810': 3.0, '5dd352c51c219b35931aefd1': 6.0}, 'developed_joint_perspective_sr2': {'5dd4bd7ae4f4a0499548a810': 4.0, '5dd352c51c219b35931aefd1': 6.0}, 'shared_thoughts_feels_sr3': {'5dd4bd7ae4f4a0499548a810': 5.0, '5dd352c51c219b35931aefd1': 6.0}, 'discussed_real_things_sr4': {'5dd4bd7ae4f4a0499548a810': 5.0, '5dd352c51c219b35931aefd1': 7.0}, 'thoughts_became_more_alike_sr5': {'5dd4bd7ae4f4a0499548a810': 5.0, '5dd352c51c219b35931aefd1': 6.0}, 'anticipated_each_other_sr6': {'5dd4bd7ae4f4a0499548a810': 4.0, '5dd352c51c219b35931aefd1': 5.0}, 'became_certain_of_perception_sr7': {'5dd4bd7ae4f4a0499548a810': 4.0, '5dd352c51c219b35931aefd1': 6.0}, 'saw_world_in_same_way_sr8': {'5dd4bd7ae4f4a0499548a810': 5.0, '5dd352c51c219b35931aefd1': 6.0}, 'i_think_my_status': 
{'5dd4bd7ae4f4a0499548a810': 7.0, '5dd352c51c219b35931aefd1': 3.0}, 'i_think_your_status': {'5dd4bd7ae4f4a0499548a810': 3.0, '5dd352c51c219b35931aefd1': 6.0}, 'you_think_my_status': {'5dd4bd7ae4f4a0499548a810': 5.0, '5dd352c51c219b35931aefd1': 4.0}, 'my_bfi_1': {'5dd4bd7ae4f4a0499548a810': 4.0, '5dd352c51c219b35931aefd1': 3.0}, 'my_bfi_2': {'5dd4bd7ae4f4a0499548a810': 4.0, '5dd352c51c219b35931aefd1': 4.0}, 'my_bfi_3': {'5dd4bd7ae4f4a0499548a810': 5.0, '5dd352c51c219b35931aefd1': 3.0}, 'my_bfi_4': {'5dd4bd7ae4f4a0499548a810': 5.0, '5dd352c51c219b35931aefd1': 3.0}, 'my_bfi_5': {'5dd4bd7ae4f4a0499548a810': 2.0, '5dd352c51c219b35931aefd1': 4.0}, 'my_bfi_6': {'5dd4bd7ae4f4a0499548a810': 2.0, '5dd352c51c219b35931aefd1': 3.0}, 'my_bfi_7': {'5dd4bd7ae4f4a0499548a810': 3.0, '5dd352c51c219b35931aefd1': 3.0}, 'my_bfi_8': {'5dd4bd7ae4f4a0499548a810': 5.0, '5dd352c51c219b35931aefd1': 3.0}, 'my_bfi_9': {'5dd4bd7ae4f4a0499548a810': 4.0, '5dd352c51c219b35931aefd1': 2.0}, 'my_bfi_10': {'5dd4bd7ae4f4a0499548a810': 4.0, '5dd352c51c219b35931aefd1': 2.0}, 'my_bfi_11': {'5dd4bd7ae4f4a0499548a810': 3.0, '5dd352c51c219b35931aefd1': 3.0}, 'my_bfi_12': {'5dd4bd7ae4f4a0499548a810': 1.0, '5dd352c51c219b35931aefd1': 3.0}, 'my_bfi_13': {'5dd4bd7ae4f4a0499548a810': 4.0, '5dd352c51c219b35931aefd1': 4.0}, 'my_bfi_14': {'5dd4bd7ae4f4a0499548a810': 2.0, '5dd352c51c219b35931aefd1': 4.0}, 'my_bfi_15': {'5dd4bd7ae4f4a0499548a810': 3.0, '5dd352c51c219b35931aefd1': 3.0}, 'your_bfi_1': {'5dd4bd7ae4f4a0499548a810': 2.0, '5dd352c51c219b35931aefd1': 3.0}, 'your_bfi_2': {'5dd4bd7ae4f4a0499548a810': 4.0, '5dd352c51c219b35931aefd1': 5.0}, 'your_bfi_3': {'5dd4bd7ae4f4a0499548a810': 5.0, '5dd352c51c219b35931aefd1': 3.0}, 'your_bfi_4': {'5dd4bd7ae4f4a0499548a810': 4.0, '5dd352c51c219b35931aefd1': 3.0}, 'your_bfi_5': {'5dd4bd7ae4f4a0499548a810': 3.0, '5dd352c51c219b35931aefd1': 4.0}, 'your_bfi_6': {'5dd4bd7ae4f4a0499548a810': 4.0, '5dd352c51c219b35931aefd1': 3.0}, 'your_bfi_7': {'5dd4bd7ae4f4a0499548a810': 3.0, 
'5dd352c51c219b35931aefd1': 2.0}, 'your_bfi_8': {'5dd4bd7ae4f4a0499548a810': 4.0, '5dd352c51c219b35931aefd1': 3.0}, 'your_bfi_9': {'5dd4bd7ae4f4a0499548a810': 4.0, '5dd352c51c219b35931aefd1': 2.0}, 'your_bfi_10': {'5dd4bd7ae4f4a0499548a810': 4.0, '5dd352c51c219b35931aefd1': 2.0}, 'your_bfi_11': {'5dd4bd7ae4f4a0499548a810': 3.0, '5dd352c51c219b35931aefd1': 3.0}, 'your_bfi_12': {'5dd4bd7ae4f4a0499548a810': 4.0, '5dd352c51c219b35931aefd1': 4.0}, 'your_bfi_13': {'5dd4bd7ae4f4a0499548a810': 3.0, '5dd352c51c219b35931aefd1': 5.0}, 'your_bfi_14': {'5dd4bd7ae4f4a0499548a810': 2.0, '5dd352c51c219b35931aefd1': 5.0}, 'your_bfi_15': {'5dd4bd7ae4f4a0499548a810': 2.0, '5dd352c51c219b35931aefd1': 3.0}, 'you_lack_companionship': {'5dd4bd7ae4f4a0499548a810': nan, '5dd352c51c219b35931aefd1': nan}, 'you_feel_left_out': {'5dd4bd7ae4f4a0499548a810': nan, '5dd352c51c219b35931aefd1': nan}, 'you_feel_isolated': {'5dd4bd7ae4f4a0499548a810': nan, '5dd352c51c219b35931aefd1': nan}, 'sex': {'5dd4bd7ae4f4a0499548a810': 'female', '5dd352c51c219b35931aefd1': 'female'}, 'politics': {'5dd4bd7ae4f4a0499548a810': 1.0, '5dd352c51c219b35931aefd1': 2.0}, 'race': {'5dd4bd7ae4f4a0499548a810': 'white', '5dd352c51c219b35931aefd1': 'other'}, 'edu': {'5dd4bd7ae4f4a0499548a810': 'bachelors_degree', '5dd352c51c219b35931aefd1': 'bachelors_degree'}, 'employ': {'5dd4bd7ae4f4a0499548a810': 1.0, '5dd352c51c219b35931aefd1': 1.0}, 'employ_7_TEXT': {'5dd4bd7ae4f4a0499548a810': nan, '5dd352c51c219b35931aefd1': nan}, 'sleep_today': {'5dd4bd7ae4f4a0499548a810': nan, '5dd352c51c219b35931aefd1': nan}, 'sleep_usual': {'5dd4bd7ae4f4a0499548a810': nan, '5dd352c51c219b35931aefd1': nan}, 'know_partner_1': {'5dd4bd7ae4f4a0499548a810': 2.0, '5dd352c51c219b35931aefd1': 2.0}, 'how_know_partner': {'5dd4bd7ae4f4a0499548a810': nan, '5dd352c51c219b35931aefd1': nan}, 'realtime': {'5dd4bd7ae4f4a0499548a810': False, '5dd352c51c219b35931aefd1': False}, 'did': {'5dd4bd7ae4f4a0499548a810': nan, '5dd352c51c219b35931aefd1': nan}, 'guided': 
{'5dd4bd7ae4f4a0499548a810': nan, '5dd352c51c219b35931aefd1': nan}, 'age': {'5dd4bd7ae4f4a0499548a810': 36.0, '5dd352c51c219b35931aefd1': 56.0}, 'conversation_count_past24': {'5dd4bd7ae4f4a0499548a810': nan, '5dd352c51c219b35931aefd1': nan}, 'conversation_hours_past24': {'5dd4bd7ae4f4a0499548a810': nan, '5dd352c51c219b35931aefd1': nan}, 'convo_who_family': {'5dd4bd7ae4f4a0499548a810': nan, '5dd352c51c219b35931aefd1': nan}, 'convo_who_romantic_partner': {'5dd4bd7ae4f4a0499548a810': nan, '5dd352c51c219b35931aefd1': nan}, 'convo_who_friends': {'5dd4bd7ae4f4a0499548a810': nan, '5dd352c51c219b35931aefd1': nan}, 'convo_who_work_colleagues': {'5dd4bd7ae4f4a0499548a810': nan, '5dd352c51c219b35931aefd1': nan}, 'convo_who_other': {'5dd4bd7ae4f4a0499548a810': nan, '5dd352c51c219b35931aefd1': nan}, 'convo_format_in_person': {'5dd4bd7ae4f4a0499548a810': nan, '5dd352c51c219b35931aefd1': nan}, 'convo_format_video_chat': {'5dd4bd7ae4f4a0499548a810': nan, '5dd352c51c219b35931aefd1': nan}, 'convo_format_phone_call': {'5dd4bd7ae4f4a0499548a810': nan, '5dd352c51c219b35931aefd1': nan}, 'convo_format_text': {'5dd4bd7ae4f4a0499548a810': nan, '5dd352c51c219b35931aefd1': nan}, 'convo_format_other': {'5dd4bd7ae4f4a0499548a810': nan, '5dd352c51c219b35931aefd1': nan}, 'my_isolation_pre_covid': {'5dd4bd7ae4f4a0499548a810': nan, '5dd352c51c219b35931aefd1': nan}, 'my_isolation_post_covid': {'5dd4bd7ae4f4a0499548a810': nan, '5dd352c51c219b35931aefd1': nan}, 'my_stress_pre_covid': {'5dd4bd7ae4f4a0499548a810': nan, '5dd352c51c219b35931aefd1': nan}, 'my_distracted_pre_covid': {'5dd4bd7ae4f4a0499548a810': nan, '5dd352c51c219b35931aefd1': nan}, 'my_sad_pre_covid': {'5dd4bd7ae4f4a0499548a810': nan, '5dd352c51c219b35931aefd1': nan}, 'my_stress_post_covid': {'5dd4bd7ae4f4a0499548a810': nan, '5dd352c51c219b35931aefd1': nan}, 'my_distracted_post_covid': {'5dd4bd7ae4f4a0499548a810': nan, '5dd352c51c219b35931aefd1': nan}, 'my_sad_post_covid': {'5dd4bd7ae4f4a0499548a810': nan, '5dd352c51c219b35931aefd1': 
nan}, 'n_living_with_me_pre_covid': {'5dd4bd7ae4f4a0499548a810': nan, '5dd352c51c219b35931aefd1': nan}, 'n_living_with_me_post_covid': {'5dd4bd7ae4f4a0499548a810': nan, '5dd352c51c219b35931aefd1': nan}, 'am_i_sheltering_in_place_now': {'5dd4bd7ae4f4a0499548a810': nan, '5dd352c51c219b35931aefd1': nan}, 'how_long_shelter_in_place': {'5dd4bd7ae4f4a0499548a810': nan, '5dd352c51c219b35931aefd1': nan}, 'i_lack_companionship': {'5dd4bd7ae4f4a0499548a810': nan, '5dd352c51c219b35931aefd1': nan}, 'i_feel_left_out': {'5dd4bd7ae4f4a0499548a810': nan, '5dd352c51c219b35931aefd1': nan}, 'i_feel_isolated': {'5dd4bd7ae4f4a0499548a810': nan, '5dd352c51c219b35931aefd1': nan}, 'shared_reality': {'5dd4bd7ae4f4a0499548a810': 4.375, '5dd352c51c219b35931aefd1': 6.0}, 'responsive': {'5dd4bd7ae4f4a0499548a810': 4.0, '5dd352c51c219b35931aefd1': 6.0}, 'my_loneliness': {'5dd4bd7ae4f4a0499548a810': nan, '5dd352c51c219b35931aefd1': nan}, 'your_loneliness': {'5dd4bd7ae4f4a0499548a810': nan, '5dd352c51c219b35931aefd1': nan}, 'RMET_score': {'5dd4bd7ae4f4a0499548a810': 0.0, '5dd352c51c219b35931aefd1': 0.0}, 'IRI_PT_score': {'5dd4bd7ae4f4a0499548a810': 0.0, '5dd352c51c219b35931aefd1': 0.0}, 'IRI_EC_score': {'5dd4bd7ae4f4a0499548a810': 0.0, '5dd352c51c219b35931aefd1': 0.0}, 'AQ_social_skill_score': {'5dd4bd7ae4f4a0499548a810': 0.0, '5dd352c51c219b35931aefd1': 0.0}, 'AQ_communication_score': {'5dd4bd7ae4f4a0499548a810': 0.0, '5dd352c51c219b35931aefd1': 0.0}, 'BFNE_score': {'5dd4bd7ae4f4a0499548a810': 0.0, '5dd352c51c219b35931aefd1': 0.0}, 'my_extraversion': {'5dd4bd7ae4f4a0499548a810': 2.333333333333333, '5dd352c51c219b35931aefd1': 3.0}, 'my_agreeable': {'5dd4bd7ae4f4a0499548a810': 2.6666666666666665, '5dd352c51c219b35931aefd1': 3.333333333333333}, 'my_conscientious': {'5dd4bd7ae4f4a0499548a810': 2.0, '5dd352c51c219b35931aefd1': 3.333333333333333}, 'my_neurotic': {'5dd4bd7ae4f4a0499548a810': 4.333333333333333, '5dd352c51c219b35931aefd1': 2.333333333333333}, 'my_open': {'5dd4bd7ae4f4a0499548a810': 
2.333333333333333, '5dd352c51c219b35931aefd1': 3.6666666666666665}, 'your_extraversion': {'5dd4bd7ae4f4a0499548a810': 3.6666666666666665, '5dd352c51c219b35931aefd1': 3.0}, 'your_agreeable': {'5dd4bd7ae4f4a0499548a810': 3.6666666666666665, '5dd352c51c219b35931aefd1': 4.333333333333333}, 'your_conscientious': {'5dd4bd7ae4f4a0499548a810': 2.0, '5dd352c51c219b35931aefd1': 3.6666666666666665}, 'your_neurotic': {'5dd4bd7ae4f4a0499548a810': 4.0, '5dd352c51c219b35931aefd1': 2.0}, 'your_open': {'5dd4bd7ae4f4a0499548a810': 2.333333333333333, '5dd352c51c219b35931aefd1': 3.6666666666666665}, 'who_i_talked_to_most_past24': {'5dd4bd7ae4f4a0499548a810': nan, '5dd352c51c219b35931aefd1': nan}, 'most_common_format_past24': {'5dd4bd7ae4f4a0499548a810': nan, '5dd352c51c219b35931aefd1': nan}}))\n" + ] + } + ], + "source": [ + "utt = corpus.random_utterance()\n", + "print(\"Utterance Text: \", utt.text)\n", + "print(\"Utterance Meta: \", utt.meta)\n", + "\n", + "convo = corpus.random_conversation()\n", + "print(convo)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tgT24xNYRql9" + }, + "source": [ + "### The transform( ) function\n", + "\n", + "The transform is to compute and add something to the corpus, mostly metadata. Here we give a simple example of how a transform function would look like." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "iYciBXisixJ5" + }, + "outputs": [], + "source": [ + "def calculate_politeness_score(polite_strat):\n", + " politeness_mapping = {\n", + " 'feature_politeness_==Please==': 0.49,\n", + " 'feature_politeness_==Please_start==': -0.3,\n", + " 'feature_politeness_==HASHEDGE==': 0,\n", + " 'feature_politeness_==Indirect_(btw)==': 0.63,\n", + " 'feature_politeness_==Hedges==': 0.14,\n", + " 'feature_politeness_==Factuality==': -0.38,\n", + " 'feature_politeness_==Deference==': 0.78,\n", + " 'feature_politeness_==Gratitude==': 0.87,\n", + " 'feature_politeness_==Apologizing==': 0.36,\n", + " 'feature_politeness_==1st_person_pl.==': 0.08,\n", + " 'feature_politeness_==1st_person==': 0.08,\n", + " 'feature_politeness_==1st_person_start==': 0.12,\n", + " 'feature_politeness_==2nd_person==': 0.05,\n", + " 'feature_politeness_==2nd_person_start==': -0.3,\n", + " 'feature_politeness_==Indirect_(greeting)==': 0.43,\n", + " 'feature_politeness_==Direct_question==': -0.27,\n", + " 'feature_politeness_==Direct_start==': -0.43,\n", + " 'feature_politeness_==HASPOSITIVE==': 0.12,\n", + " 'feature_politeness_==HASNEGATIVE==': -0.13,\n", + " 'feature_politeness_==SUBJUNCTIVE==': 0,\n", + " 'feature_politeness_==INDICATIVE==': 0,\n", + " }\n", + "\n", + " politeness = 0\n", + "\n", + " for key, value in polite_strat.items():\n", + " politeness += politeness_mapping[key] * value\n", + "\n", + " return politeness\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Sw2fVmmwXBYP" + }, + "outputs": [], + "source": [ + "from scipy.stats import kendalltau\n", + "\n", + "class ConversationSmoothness(Transformer):\n", + " \"\"\"\n", + " A simple transformer to label a Corpus on a conversation level\n", + "\n", + " Will only work on the Candor corpus\n", + " :param metric: a string that chooses which computation method to use to compute smoothness. 
It will either be 'ratio', 'decline', or 'tone'. By default, it is 'ratio'\n", + " :param end_len: the number of utterances to take from the conversation end (must be an even number)\n", + " :param output_field: field for writing the computed output in metadata. Will default to write to conversation metadata with name 'smoothness'.\n", + " :param input_filter: a boolean function of signature `input_filter(conversation)`. Attributes will only be computed for conversations where `input_filter` returns `True`. By default, will always return `True`, meaning that attributes will be computed for all conversations.\n", + " :param verbosity: frequency at which to print status messages when computing attributes.\n", + "\n", + " (previous params for the object in the demo for reference, you can ignore)\n", + " obj_type: type of Corpus object to calculate: 'conversation', 'speaker', or 'utterance', default to be 'utterance'\n", + " input_field: Input fields from every utterance object. Will default to reading 'utt.text'. If a string is provided, then consider metadata with field name input_field.\n", + " output_field: field for writing the computed output in metadata. Will default to write to utterance metadata with name 'capitalization'.\n", + " input_filter: a boolean function of signature `input_filter(utterance, aux_input)`. attributes will only be computed for utterances where `input_filter` returns `True`. 
By default, will always return `True`, meaning that attributes will be computed for all utterances.\n", + " verbosity: frequency at which to print status messages when computing attributes.\n", + " \"\"\"\n", + "\n", + " def __init__(\n", + " self,\n", + " metric='ratio',\n", + " end_len=12,\n", + " output_field='smoothness',\n", + " input_filter=None,\n", + " verbosity=200,\n", + " ):\n", + " if input_filter:\n", + " if len(signature(input_filter).parameters) == 1:\n", + " self.input_filter = lambda convo: input_filter(convo)\n", + " else:\n", + " self.input_filter = input_filter\n", + " else:\n", + " self.input_filter = lambda convo: True\n", + " self.metric = metric\n", + " self.end_len = end_len\n", + " self.output_field = output_field\n", + " self.verbosity = verbosity\n", + " self.ps = PolitenessStrategies(verbose=0)\n", + " self.parser = TextParser(verbosity=0)\n", + "\n", + "\n", + " def _print_output(self, i):\n", + " return (self.verbosity > 0) and (i > 0) and (i % self.verbosity == 0)\n", + "\n", + " def transform(self, corpus: Corpus) -> Corpus:\n", + " \"\"\"\n", + " Computes the configured smoothness metric for each conversation and annotates the corresponding conversation metadata fields.\n", + "\n", + " :param corpus: Corpus\n", + " :return: the corpus\n", + " \"\"\"\n", + "\n", + " total = len(list(corpus.iter_conversations()))\n", + "\n", + " for idx, convo in enumerate(corpus.iter_conversations()):\n", + " if self._print_output(idx):\n", + " print(f\"%03d/%03d conversations processed\" % (idx, total))\n", + "\n", + " if not self.input_filter(convo):\n", + " continue\n", + "\n", + " last_utts = convo.get_utterance_ids()[-self.end_len:]\n", + " len_last_utts = len(last_utts)\n", + "\n", + " # for the calculation\n", + " calc = 0\n", + "\n", + " # difference for pairs in decline metric\n", + " diffs = deque([])\n", + "\n", + " # has pos and has neg freqs\n", + " has_pos = [0, 0]\n", + " has_neg = [0, 0]\n", + "\n", + " politeness1 = 0\n", + " politeness2 = 0\n", + "\n", + " # for tau decline metric\n", + " 
canonical_ordering = []\n", + " ordering = []\n", + "\n", + " # for metric calculations\n", + "\n", + " # new for loop here for the ratio metric\n", + " if self.metric == 'ratio':\n", + " for i in range(len_last_utts - 1):\n", + " utt = corpus.get_utterance(last_utts[i])\n", + " next_utt = corpus.get_utterance(last_utts[i + 1])\n", + " utt1len, utt2len = utt.meta['delta'], next_utt.meta['delta']\n", + " ratio = utt1len / utt2len if utt1len <= utt2len else utt2len / utt1len\n", + " calc += ratio\n", + "\n", + " # here are the other loops\n", + " for i in range(len_last_utts // 2):\n", + " utt = corpus.get_utterance(last_utts[i])\n", + " paired_utt = corpus.get_utterance(last_utts[i + 1])\n", + "\n", + " if self.metric == 'ratio':\n", + " # old metric\n", + " # # get your pairs (only look at even numbers)\n", + " # utt1len, utt2len = utt.meta['delta'], paired_utt.meta['delta']\n", + " # ratio = utt1len / utt2len if utt1len <= utt2len else utt2len / utt1len\n", + " # calc += ratio\n", + " pass\n", + "\n", + " elif self.metric == 'decline':\n", + " # append the differences\n", + " diffs.append(abs(utt.meta['delta'] - paired_utt.meta['delta']))\n", + " # calculate the difference of differences when possible\n", + " if len(diffs) == 2:\n", + " # remove last element and calculate the most recent element\n", + " popped = diffs.popleft()\n", + " calc += abs(popped - diffs[0])\n", + "\n", + " # NEW DECLINE METRIC\n", + " ordering.append((abs(utt.meta['delta'] - paired_utt.meta['delta']), (utt.id, paired_utt.id)))\n", + " canonical_ordering.append((utt.id, paired_utt.id))\n", + "\n", + " elif self.metric == 'tone':\n", + " # old metric\n", + "\n", + " # run the text transformer for this utterance\n", + " self.parser.transform_utterance(utt)\n", + " self.parser.transform_utterance(paired_utt)\n", + " # run politeness on here\n", + " utt_polite = self.ps.transform_utterance(utt, markers=True)\n", + " paired_utt_polite = self.ps.transform_utterance(paired_utt, markers=True)\n", 
+ " # find the ratios\n", + "\n", + " has_pos[0] += utt_polite.meta[\"politeness_strategies\"]['feature_politeness_==HASPOSITIVE==']\n", + " has_pos[1] += paired_utt_polite.meta[\"politeness_strategies\"]['feature_politeness_==HASPOSITIVE==']\n", + " has_neg[0] += utt_polite.meta[\"politeness_strategies\"]['feature_politeness_==HASNEGATIVE==']\n", + " has_neg[1] += paired_utt_polite.meta[\"politeness_strategies\"]['feature_politeness_==HASNEGATIVE==']\n", + " # Returns (1) absolute difference between Has Positive prevalences and (2) absolute difference between Has Negative prevalences\n", + "\n", + " # difference in politeness score\n", + " politeness1 = calculate_politeness_score(utt_polite.meta[\"politeness_strategies\"])\n", + " politeness2 = calculate_politeness_score(paired_utt_polite.meta[\"politeness_strategies\"])\n", + "\n", + " else:\n", + " raise KeyError('metric must be ratio, decline, or tone')\n", + "\n", + " if self.metric == 'decline':\n", + " ordering = [pair for ratio, pair in sorted(ordering, reverse=True)]\n", + " tau, _ = kendalltau(ordering, canonical_ordering)\n", + " calc = tau\n", + "\n", + " if self.metric == 'tone':\n", + " pos_diff = abs(has_pos[0] / (len_last_utts // 2) - has_pos[1] / (len_last_utts // 2))\n", + " neg_diff = abs(has_neg[0] / (len_last_utts // 2) - has_neg[1] / (len_last_utts // 2))\n", + " convo.add_meta(f'{self.output_field}_{self.metric}_pos_count1', has_pos[0])\n", + " convo.add_meta(f'{self.output_field}_{self.metric}_neg_count1', has_neg[0])\n", + " convo.add_meta(f'{self.output_field}_{self.metric}_pos_count2', has_pos[1])\n", + " convo.add_meta(f'{self.output_field}_{self.metric}_neg_count2', has_neg[1])\n", + " convo.add_meta(f'{self.output_field}_{self.metric}_pos', pos_diff)\n", + " convo.add_meta(f'{self.output_field}_{self.metric}_neg', neg_diff)\n", + " convo.add_meta(f'{self.output_field}_{self.metric}_politeness1', politeness1)\n", + " convo.add_meta(f'{self.output_field}_{self.metric}_politeness2', 
politeness2)\n", + " convo.add_meta(f'{self.output_field}_{self.metric}_politeness_diff', abs(politeness1 / (len_last_utts // 2) - politeness2 / (len_last_utts // 2)))\n", + " else:\n", + " # take the average of all summed components\n", + " calc /= (len_last_utts - 1) if self.metric == \"ratio\" else 1\n", + " # do the catching and add to output_field\n", + " convo.add_meta(f'{self.output_field}_{self.metric}', calc)\n", + "\n", + " last_utt_time_delta = corpus.get_utterance(last_utts[-1]).meta[\"stop\"] - corpus.get_utterance(last_utts[0]).meta[\"start\"]\n", + " convo.add_meta(f'{self.output_field}_last_utts_time', last_utt_time_delta)\n", + "\n", + " return corpus" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "0aoH4fqyc_FE", + "outputId": "ed4e5f24-01ec-44f0-ce8f-bffa45adede5" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "200/1650 conversations processed\n", + "400/1650 conversations processed\n", + "600/1650 conversations processed\n", + "800/1650 conversations processed\n", + "1000/1650 conversations processed\n", + "1200/1650 conversations processed\n", + "1400/1650 conversations processed\n", + "1600/1650 conversations processed\n", + "200/1650 conversations processed\n", + "400/1650 conversations processed\n", + "600/1650 conversations processed\n", + "800/1650 conversations processed\n", + "1000/1650 conversations processed\n", + "1200/1650 conversations processed\n", + "1400/1650 conversations processed\n", + "1600/1650 conversations processed\n", + "200/1650 conversations processed\n", + "400/1650 conversations processed\n", + "600/1650 conversations processed\n", + "800/1650 conversations processed\n", + "1000/1650 conversations processed\n", + "1200/1650 conversations processed\n", + "1400/1650 conversations processed\n", + "1600/1650 conversations processed\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + 
}, + "execution_count": 107, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ratio_transformer = ConversationSmoothness()\n", + "ratio_transformer.transform(corpus)\n", + "\n", + "decline_transformer = ConversationSmoothness(metric=\"decline\")\n", + "decline_transformer.transform(corpus)\n", + "\n", + "tone_transformer = ConversationSmoothness(metric=\"tone\")\n", + "tone_transformer.transform(corpus)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "D2cywith_eb6" + }, + "outputs": [], + "source": [ + "def print_conversation_ending(convo):\n", + " print(convo.id)\n", + " speaker_map = {}\n", + " curr_speaker = 1\n", + " last_utts = convo.get_utterance_ids()[-12:]\n", + " for utt_id in last_utts:\n", + " utt = corpus.get_utterance(utt_id)\n", + " if utt.speaker.id not in speaker_map:\n", + " speaker_map[utt.speaker.id] = curr_speaker\n", + " curr_speaker += 1\n", + " print(f\"SPEAKER {speaker_map[utt.speaker.id]}: {utt.text}\")\n", + " print()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "fF5hsUghdZNC", + "outputId": "da5466cd-eb38-4884-f13b-878d87d20661" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "3231ee9a-483f-464c-b563-da35de30594c\n", + "SPEAKER 1: I think we made it, we make it.\n", + "SPEAKER 2: Okay.\n", + "SPEAKER 1: Yeah.\n", + "SPEAKER 2: Well you're probably you're probably tired so.\n", + "SPEAKER 1: I think we did it. I am after working all day.\n", + "SPEAKER 2: Uh huh. Yeah.\n", + "SPEAKER 1: Yeah.\n", + "SPEAKER 2: Oh my gosh. Okay well have a good rest of the day, Week whatever.\n", + "SPEAKER 1: Yeah. Thank you. 
You too.\n", + "SPEAKER 2: All right thank you.\n", + "SPEAKER 1: Bye.\n", + "SPEAKER 2: Goodbye.\n", + "\n", + "Smoothness (ratio, the higher the number, the more smooth it will be): 0.45962043967974253\n", + "Smoothness (decline, the higher the number, the more smooth it will be): -0.11475409836065575\n", + "Smoothness (pos tone, the lower the number, the more smooth it will be): 0.0\n", + "Smoothness (neg tone, the lower the number, the more smooth it will be): 0.0\n", + "Smoothness (politeness, the lower the number, the more smooth it will be): 0.0\n" + ] + } + ], + "source": [ + "print_conversation_ending(convo)\n", + "print(\"Smoothness (ratio, the higher the number, the more smooth it will be): \", convo.meta['smoothness_ratio'])\n", + "print(\"Smoothness (decline, the higher the number, the more smooth it will be): \", convo.meta['smoothness_decline'])\n", + "print(\"Smoothness (pos tone, the lower the number, the more smooth it will be): \", convo.meta['smoothness_tone_pos'])\n", + "print(\"Smoothness (neg tone, the lower the number, the more smooth it will be): \", convo.meta['smoothness_tone_neg'])\n", + "print(\"Smoothness (politeness, the lower the number, the more smooth it will be): \", convo.meta['smoothness_tone_politeness_diff'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "xTvDin9Twyho", + "outputId": "50d76e71-f6b0-4e26-cc45-0e27394353da" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "06b83c0a-7bf1-4cf9-9e72-034615d97050\n", + "SPEAKER 1: We're supposed to be oh yeah we're way past.\n", + "SPEAKER 2: I think we've passed yeah, we've passed the time. Yeah, I think we're I think we're good, but yeah.\n", + "SPEAKER 1: Okay. Yeah. We had 45 minutes probably. We only need 25.\n", + "SPEAKER 2: Okay. Well it's so nice talking to you.\n", + "SPEAKER 1: That's great talking to you. 
I don't even It didn't seem like 45 minutes.\n", + "SPEAKER 2: Yeah, this is really great. I learned so much and like thank you for all the great advice to.\n", + "SPEAKER 1: Thank you. Always say yes. Go for all the adventures. You can nice talking to you too.\n", + "SPEAKER 2: Yeah, it was really nice talking to you.\n", + "SPEAKER 1: Um Have fun.\n", + "SPEAKER 2: Okay, well, goodbye. Thank you.\n", + "SPEAKER 1: Right.\n", + "SPEAKER 2: Bye.\n", + "\n", + "Smoothness (ratio, the higher the number, the more smooth it will be): 0.49963096484501934\n", + "Smoothness (decline, the higher the number, the more smooth it will be): 0.459016393442623\n", + "Smoothness (pos tone, the lower the number, the more smooth it will be): 0.16666666666666674\n", + "Smoothness (neg tone, the lower the number, the more smooth it will be): 0.0\n", + "Smoothness (politeness, the lower the number, the more smooth it will be): 0.006666666666666654\n" + ] + } + ], + "source": [ + "convo = corpus.random_conversation()\n", + "print_conversation_ending(convo)\n", + "print(\"Smoothness (ratio, the higher the number, the more smooth it will be): \", convo.meta['smoothness_ratio'])\n", + "print(\"Smoothness (decline, the higher the number, the more smooth it will be): \", convo.meta['smoothness_decline'])\n", + "print(\"Smoothness (pos tone, the lower the number, the more smooth it will be): \", convo.meta['smoothness_tone_pos'])\n", + "print(\"Smoothness (neg tone, the lower the number, the more smooth it will be): \", convo.meta['smoothness_tone_neg'])\n", + "print(\"Smoothness (politeness, the lower the number, the more smooth it will be): \", convo.meta['smoothness_tone_politeness_diff'])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_XJse2GVmaPr" + }, + "source": [ + "### Metrics" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "UfF5qLuopZ4V" + }, + "source": [ + "#### Generate Histograms" + ] + }, + { + "cell_type": "code", + "execution_count": null, 
+ "metadata": { + "id": "8MxF3GFimmuM" + }, + "outputs": [], + "source": [ + "import matplotlib\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "qXe0p2DN0ScO" + }, + "outputs": [], + "source": [ + "def graph_plot(metric, metric_good_name, filename, bins=10, range=None):\n", + " dist = []\n", + "\n", + " for convo in corpus.iter_conversations():\n", + " dist.append(convo.meta[f\"smoothness_{metric}\"])\n", + "\n", + " print(\"Min:\", np.min(dist))\n", + " print(\"Max:\", np.max(dist))\n", + " print(\"Mean:\", np.mean(dist))\n", + " print(\"SD:\", np.std(dist))\n", + "\n", + " if range:\n", + " plt.hist(dist, density=True, bins=bins, range=range)\n", + " else:\n", + " plt.hist(dist, density=True, bins=bins)\n", + " plt.ylabel('Count')\n", + " plt.xlabel(metric_good_name)\n", + "\n", + " plt.savefig(f'{path_to_folder}/Outputs/{filename}.png')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "BxpY6yerrQw7" + }, + "outputs": [], + "source": [ + "# for ratio\n", + "graph_plot(\"ratio\", \"Ratio\", \"ratio_hist\", bins=20)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ynh0JWn2sq0A" + }, + "outputs": [], + "source": [ + "# for decline\n", + "graph_plot(\"decline\", \"Decline\", \"decline_hist\", bins=5, range=[0, 40])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "TLCgCd05srNe" + }, + "outputs": [], + "source": [ + "# for pos tone\n", + "graph_plot(\"tone_pos\", \"Positive Tone\", \"tone_pos_hist\", bins=5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "LvAREsZ9QqNT" + }, + "outputs": [], + "source": [ + "# for neg tone\n", + "graph_plot(\"tone_neg\", \"Negative Tone\", \"tone_neg_hist\", bins=5)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "hMoAfLTZquhf" + }, + "source": [ + "#### Get 
the annotated data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "MbfFIJXWqzVH" + }, + "outputs": [], + "source": [ + "annotated_ids = [\n", + " '1c82d05c-19ce-4d2a-83db-a54021c9196d',\n", + " '0203bb21-da17-416b-8e2f-018b99689616',\n", + " 'fffda3e6-7d99-4db8-aa12-16e99fa454c2',\n", + " '826248c3-018b-4b56-8844-ef762f5b60cd',\n", + " '11cb78ed-49fb-4634-8a7a-3c59109563b5',\n", + " '33528414-6a77-4fde-a01a-aebbad5fc3d8',\n", + " '2d7f1113-de9d-4e61-bdbe-38a9bd2a1121',\n", + " '29b7edd5-d78d-4edb-bcce-4f6c9a166455',\n", + " '2608d293-6af3-4f26-959c-e0f6a2597a37',\n", + " '5d895bf7-4efd-4a5d-ad62-57fa820ad746',\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "iO5yEWPRskc5" + }, + "outputs": [], + "source": [ + "for id in annotated_ids:\n", + " print_conversation_ending(corpus.get_conversation(id))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "A8T30qokzq9F" + }, + "outputs": [], + "source": [ + "for id in annotated_ids:\n", + " convo = corpus.get_conversation(id)\n", + " print(id)\n", + " print(\"Smoothness (ratio, the higher the number, the more smooth it will be): \", convo.meta['smoothness_ratio'])\n", + " print(\"Smoothness (decline, the lower the number, the more smooth it will be): \", convo.meta['smoothness_decline'])\n", + " print(\"Smoothness (pos tone, the lower the number, the more smooth it will be): \", convo.meta['smoothness_tone_pos'])\n", + " print(\"Smoothness (neg tone, the lower the number, the more smooth it will be): \", convo.meta['smoothness_tone_neg'])\n", + " print()" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +}