RasaHQ · twerkmeister · Feb 1, 2024 · Feb 1, 2024 · Feb 4, 2024 · Feb 4, 2024
diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md
@@ -0,0 +1,4 @@
+## Description
+
+## TODOs
+- [ ] compared flaky tests with the [known list of flaky test steps](https://www.notion.so/rasa/Flaky-E2E-Test-Steps-63864d3d8c7b4427a0f3df8052e39f21)
diff --git a/.github/workflows/continous-integration.yml b/.github/workflows/continous-integration.yml
@@ -69,7 +69,7 @@ jobs:
           OPENAI_API_KEY: ${{secrets.OPENAI_API_KEY}}
           RASA_PRO_LICENSE: ${{secrets.RASA_PRO_LICENSE}}
           RASA_PRO_BETA_INTENTLESS: true
-          DUCKLING_URL: ${{secrets.DUCKLING_URL}}
+          RASA_DUCKLING_HTTP_URL: ${{secrets.DUCKLING_URL}}
         run: |
           make train
 
@@ -160,16 +160,28 @@ jobs:
       env:
         OPENAI_API_KEY: ${{secrets.OPENAI_API_KEY}}
         RASA_PRO_LICENSE: ${{secrets.RASA_PRO_LICENSE}}
+        RASA_DUCKLING_HTTP_URL: ${{secrets.DUCKLING_URL}}
         RASA_PRO_BETA_INTENTLESS: true
       run: |
         make actions &
         make test-passing
 
+    - name: Run e2e flaky tests  # non-blocking: the "|| true" below discards the test exit status
+      env:
+        OPENAI_API_KEY: ${{secrets.OPENAI_API_KEY}}
+        RASA_PRO_LICENSE: ${{secrets.RASA_PRO_LICENSE}}
+        RASA_DUCKLING_HTTP_URL: ${{secrets.DUCKLING_URL}}
+        RASA_PRO_BETA_INTENTLESS: true
+      run: |
+        make actions &
+        make test-flaky || true
+
     - name: Run e2e failing tests
       env:
         OPENAI_API_KEY: ${{secrets.OPENAI_API_KEY}}
         RASA_PRO_LICENSE: ${{secrets.RASA_PRO_LICENSE}}
+        RASA_DUCKLING_HTTP_URL: ${{secrets.DUCKLING_URL}}
         RASA_PRO_BETA_INTENTLESS: true
       run: |
         make actions &
-        make test-failing | grep '0 passed'
+        make test-failing | grep -E '(^|[^0-9])0 passed'  # anchor the 0 so "10 passed", "20 passed", ... cannot satisfy the check
diff --git a/Makefile b/Makefile
@@ -28,6 +28,9 @@ actions:
 test-passing: .EXPORT_ALL_VARIABLES
 	poetry run rasa test e2e e2e_tests/passing
 
+test-flaky: .EXPORT_ALL_VARIABLES
+	poetry run rasa test e2e e2e_tests/flaky
+
 test-failing: .EXPORT_ALL_VARIABLES
 	poetry run rasa test e2e e2e_tests/failing
 

diff --git a/README.md b/README.md
@@ -170,6 +170,7 @@ Prerequisites:
   `poetry self update`
 - python (3.10.12), e.g. using [pyenv](https://github.com/pyenv/pyenv) 
   `pyenv install 3.10.12`
+- a [Duckling](https://github.com/facebook/duckling) server that is set up and running
 
 After you cloned the repository and are authenticated, follow the installation steps:
 
@@ -191,6 +192,7 @@ After you cloned the repository and are authenticated, follow the installation s
    ```bash
    RASA_PRO_LICENSE=<your rasa pro license key>
    OPENAI_API_KEY=<your openai api key>
+   RASA_DUCKLING_HTTP_URL=<url to the duckling server>
    ```
 
 ### Training the bot

diff --git a/actions/entity_extractor.py b/actions/entity_extractor.py
@@ -4,7 +4,7 @@
 from rasa.nlu.extractors.duckling_entity_extractor import DucklingEntityExtractor
 
 load_dotenv()
-duckling_url = os.environ.get("DUCKLING_URL")
+duckling_url = os.environ.get("RASA_DUCKLING_HTTP_URL")
 
 duckling_config = {
     **DucklingEntityExtractor.get_default_config(),

diff --git a/config.yml b/config.yml
@@ -7,6 +7,8 @@ pipeline:
   llm:
     model_name: gpt-4
     request_timeout: 7
+    temperature: 0.0
+    top_p: 0.0
 
 policies:
 - name: FlowPolicy

diff --git a/data/flows/add_card.yml b/data/flows/add_card.yml
@@ -0,0 +1,6 @@
+flows:
+  add_card:
+    description: add a card to your account
+    name: add a card
+    steps:
+      - action: utter_card_added  # single step: the flow only utters this response
diff --git a/data/flows/patterns.yml b/data/flows/patterns.yml
@@ -25,11 +25,13 @@ flows:
     steps:
       - action: action_trigger_chitchat
 
+  # action_trigger_chitchat (commented out below) can be swapped in so that intentless is used for better testability
   pattern_search:
     description: handle knowledge-based requests using enterprise search
     steps:
+#      - action: action_trigger_chitchat
       - action: action_trigger_search
-  
+
   pattern_cancel_flow:
     description: A meta flow that's started when a flow is cancelled.
     steps:

diff --git a/data/prompts/PROMPT_README.md b/data/prompts/PROMPT_README.md
@@ -0,0 +1,28 @@
+# Prompts README
+
+This README provides information on how to use the prompts and includes the results of the end-to-end (e2e) tests for different models.
+
+## Usage
+
+```yaml
+- name: LLMCommandGenerator
+  prompt: path_to_prompt_file.jinja2
+```
+This [component](https://rasa.com/docs/rasa-pro/concepts/dialogue-understanding/#using-the-llmcommandgenerator) generates commands using an LLM based on the given prompt file and should be included in the `pipeline` section of the `config.yml` file.
+
+## E2E Test Results
+
+The `e2e_tests` folder contains the test cases across different conversation categories that are used to evaluate the models.
+
+The conversations are modeled using `flows`. The `domain` file contains the definition of bot utterances, slots, and actions that are used in the test cases.
+
+The following are the results of the e2e tests conducted for different models using designated prompts. 
+
+| Model   | Accuracy | Prompt file |
+|---------|----------|-------------|
+| gpt-4   | 88.09%   | default     |
+| gpt-4-1106-preview | 71.42%      | default     |
+| gpt-4-0125-preview | 67.86%      | default     |
+| gpt-3.5-turbo | 63.1%      | [data/prompts/gpt_3-5_turbo_cmd_gen_prompt.jinja2](gpt_3-5_turbo_cmd_gen_prompt.jinja2)     |
+| gpt-3.5-turbo-1106 | 52.38%      | [data/prompts/gpt_3-5_turbo_1106_cmd_gen_prompt.jinja2](gpt_3-5_turbo_1106_cmd_gen_prompt.jinja2)     |
+| mistral-medium | 44.05%      | [data/prompts/mistral_medium_cmd_gen_prompt.jinja2](mistral_medium_cmd_gen_prompt.jinja2)     |
diff --git a/data/prompts/gpt_3-5_turbo_1106_cmd_gen_prompt.jinja2 b/data/prompts/gpt_3-5_turbo_1106_cmd_gen_prompt.jinja2
@@ -0,0 +1,60 @@
+Your task is to analyze the current conversation context and generate a list of actions to start new business processes that we call flows, to extract slots, or respond to small talk and knowledge requests.
+Believe in your abilities and strive for excellence. Your hard work will yield remarkable results. You can do it!
+
+These are the flows that can be started, with their description and slots:
+{% for flow in available_flows %}
+{{ flow.name }}: {{ flow.description }}
+    {% for slot in flow.slots -%}
+    slot: {{ slot.name }}{% if slot.description %} ({{ slot.description }}){% endif %}{% if slot.allowed_values %}, allowed values: {{ slot.allowed_values }}{% endif %}
+    {% endfor %}
+{%- endfor %}
+
+===
+{% if current_flow != None %}
+You are currently in the flow "{{ current_flow }}".
+You have just asked the user for the slot "{{ current_slot }}"{% if current_slot_description %} ({{ current_slot_description }}){% endif %}.
+
+{% if flow_slots|length > 0 %}
+Here are the slots of the currently active flow:
+{% for slot in flow_slots -%}
+- name: {{ slot.name }}, value: {{ slot.value }}, type: {{ slot.type }}, description: {{ slot.description}}{% if slot.allowed_values %}, allowed values: {{ slot.allowed_values }}{% endif %}
+{% endfor %}
+{% endif %}
+{% else %}
+You are currently not in any flow and so there are no active slots.
+This means you can only set a slot if you first start a flow that requires that slot.
+{% endif %}
+If you start a flow, first start the flow and then optionally fill that flow's slots with information the user provided in their message.
+
+===
+Based on this information generate a list of actions you want to take. Any logic of what happens afterwards is handled by the flow engine. These are your available actions:
+* Slot setting, described by "SetSlot(slot_name, slot_value)". An example would be "SetSlot(recipient, Freddy)". Only set a slot when it is explicitly mentioned by the user, do not set a slot with abstract or unspecific values.
+* Starting a flow, described by "StartFlow(flow_name)". An example would be "StartFlow(transfer_money)".
+* Canceling/Stopping the current flow, described by "CancelFlow()". Examples of user canceling flow phrases are: "stop that", "cancel this".
+* Clarifying which flow should be started. An example would be Clarify(list_contacts, add_contact, remove_contact) if the user just wrote "contacts" and there are multiple potential candidates. It also works with a single flow name to confirm you understood correctly, as in Clarify(transfer_money).
+* Intercepting and handle user messages with the intent to bypass the current step in the flow, described by "SkipQuestion()". Examples of user skip phrases are: "Go to the next question", "Ask me something else".
+* Responding to knowledge-oriented user messages, that needs further information from a knowledge base, described by "SearchAndReply()".
+* Responding to a casual, non-task-oriented user message, described by "ChitChat()". Do not predict "ChitChat()" if the message contains valuable information, such as slots.
+* Handing off to a human, in case the user seems frustrated or explicitly asks to speak to one, described by "HumanHandoff()".
+
+===
+Do not fill slots with abstract values or placeholders.
+You can only fill a slot when a flow is active.
+Only use information provided by the user.
+If the user asks for two things which seem contradictory, clarify before starting a flow.
+If it's not clear whether the user wants to skip the step or to cancel the flow, cancel the flow.
+Strictly adhere to the provided action types listed above.
+Focus on the last message and take it one step at a time.
+Use the previous conversation steps only to aid understanding.
+Only predict "ChitChat()" if there is no other action to take.
+A flow can be interrupted by another flow.
+
+===
+Here is what happened previously in the conversation:
+{{ current_conversation }}
+
+The user just said """{{ user_message }}""".
+
+===
+Think this through step by step manner, go through the context, surfacing important information that could be useful, and first write an analysis of the last user message. Pay close attention to the descriptions of slots. Do not fill slots with abstract values before the user has mentioned or referenced the values. Do not add any unnecessary actions.
+Afterwards, write out the actions you want to take, one per line.
diff --git a/data/prompts/gpt_3-5_turbo_cmd_gen_prompt.jinja2 b/data/prompts/gpt_3-5_turbo_cmd_gen_prompt.jinja2
@@ -0,0 +1,59 @@
+Your task is to analyze the current conversation context and generate a list of actions to start new business processes that we call flows, to extract slots, or respond to small talk and knowledge requests.
+
+These are the flows that can be started, with their description and slots:
+{% for flow in available_flows %}
+{{ flow.name }}: {{ flow.description }}
+    {% for slot in flow.slots -%}
+    slot: {{ slot.name }}{% if slot.description %} ({{ slot.description }}){% endif %}{% if slot.allowed_values %}, allowed values: {{ slot.allowed_values }}{% endif %}
+    {% endfor %}
+{%- endfor %}
+
+===
+{% if current_flow != None %}
+You are currently in the flow "{{ current_flow }}".
+You have just asked the user for the slot "{{ current_slot }}"{% if current_slot_description %} ({{ current_slot_description }}){% endif %}.
+
+{% if flow_slots|length > 0 %}
+Here are the slots of the currently active flow:
+{% for slot in flow_slots -%}
+- name: {{ slot.name }}, value: {{ slot.value }}, type: {{ slot.type }}, description: {{ slot.description}}{% if slot.allowed_values %}, allowed values: {{ slot.allowed_values }}{% endif %}
+{% endfor %}
+{% endif %}
+{% else %}
+You are currently not in any flow and so there are no active slots.
+This means you can only set a slot if you first start a flow that requires that slot.
+{% endif %}
+If you start a flow, first start the flow and then optionally fill that flow's slots with information the user provided in their message.
+
+===
+Based on this information generate a list of actions you want to take. Any logic of what happens afterwards is handled by the flow engine. These are your available actions:
+* Slot setting, described by "SetSlot(slot_name, slot_value)". An example would be "SetSlot(recipient, Freddy)". Only set a slot when it is explicitly mentioned by the user, do not set a slot with abstract or unspecific values.
+* Starting a flow, described by "StartFlow(flow_name)". An example would be "StartFlow(transfer_money)".
+* Canceling/Stopping the current flow, described by "CancelFlow()". Examples of user canceling flow phrases are: "stop that", "cancel this".
+* Clarifying which flow should be started. An example would be Clarify(list_contacts, add_contact, remove_contact) if the user just wrote "contacts" and there are multiple potential candidates. It also works with a single flow name to confirm you understood correctly, as in Clarify(transfer_money).
+* Intercepting and handle user messages with the intent to bypass the current step in the flow, described by "SkipQuestion()". Examples of user skip phrases are: "Go to the next question", "Ask me something else".
+* Responding to knowledge-oriented user messages, that needs further information from a knowledge base, described by "SearchAndReply()".
+* Responding to a casual, non-task-oriented user message, described by "ChitChat()". Do not predict "ChitChat()" if the message contains valuable information, such as slots.
+* Handing off to a human, in case the user seems frustrated or explicitly asks to speak to one, described by "HumanHandoff()".
+
+===
+Do not fill slots with abstract values or placeholders.
+You can only fill a slot when a flow is active.
+Only use information provided by the user.
+If the user asks for two things which seem contradictory, clarify before starting a flow.
+If it's not clear whether the user wants to skip the step or to cancel the flow, cancel the flow.
+Strictly adhere to the provided action types listed above.
+Focus on the last message and take it one step at a time.
+Use the previous conversation steps only to aid understanding.
+Only predict "ChitChat()" if there is no other action to take.
+A flow can be interrupted by another flow.
+
+===
+Here is what happened previously in the conversation:
+{{ current_conversation }}
+
+The user just said """{{ user_message }}""".
+
+===
+Think this through step by step manner, go through the context, surfacing important information that could be useful, and first write an analysis of the last user message. Pay close attention to the descriptions of slots. Do not fill slots with abstract values before the user has mentioned or referenced the values. Do not add any unnecessary actions.
+Afterwards, write out the actions you want to take, one per line.
diff --git a/data/prompts/mistral_medium_cmd_gen_prompt.jinja2 b/data/prompts/mistral_medium_cmd_gen_prompt.jinja2
@@ -0,0 +1,62 @@
+<s>[INST] Your task is to analyze the current conversation context and generate a list of actions to start new business processes that we call flows, to extract slots, or respond to small talk and knowledge requests.
+Believe in your abilities and strive for excellence. Your hard work will yield remarkable results. You can do it!
+
+These are the flows that can be started, with their description and slots:
+{% for flow in available_flows %}
+{{ flow.name }}: {{ flow.description }}
+    {% for slot in flow.slots -%}
+    slot: {{ slot.name }}{% if slot.description %} ({{ slot.description }}){% endif %}{% if slot.allowed_values %}, allowed values: {{ slot.allowed_values }}{% endif %}
+    {% endfor %}
+{%- endfor %}
+
+===
+{% if current_flow != None %}
+You are currently in the flow "{{ current_flow }}".
+You have just asked the user for the slot "{{ current_slot }}"{% if current_slot_description %} ({{ current_slot_description }}){% endif %}.
+
+{% if flow_slots|length > 0 %}
+Here are the slots of the currently active flow:
+{% for slot in flow_slots -%}
+- name: {{ slot.name }}, value: {{ slot.value }}, type: {{ slot.type }}, description: {{ slot.description}}{% if slot.allowed_values %}, allowed values: {{ slot.allowed_values }}{% endif %}
+{% endfor %}
+{% endif %}
+{% else %}
+You are currently not in any flow and so there are no active slots.
+This means you can only set a slot if you first start a flow that requires that slot.
+{% endif %}
+If you start a flow, first start the flow and then optionally fill that flow's slots with information the user provided in their message.
+
+===
+Based on this information generate a list of actions you want to take. Any logic of what happens afterwards is handled by the flow engine. These are your available actions:
+* Slot setting, described by "SetSlot(slot_name, slot_value)". An example would be "SetSlot(recipient, Freddy)". Only set a slot when it is explicitly mentioned by the user, do not set a slot with abstract or unspecific values.
+* Starting a flow, described by "StartFlow(flow_name)". An example would be "StartFlow(transfer_money)".
+* Canceling/Stopping the current flow, described by "CancelFlow()". Examples of user canceling flow phrases are: "stop that", "cancel this".
+* Clarifying which flow should be started. An example would be Clarify(list_contacts, add_contact, remove_contact) if the user just wrote "contacts" and there are multiple potential candidates. It also works with a single flow name to confirm you understood correctly, as in Clarify(transfer_money).
+* Intercepting and handle user messages with the intent to bypass the current step in the flow, described by "SkipQuestion()". Examples of user skip phrases are: "Go to the next question", "Ask me something else".
+* Responding to knowledge-oriented user messages, that needs further information from a knowledge base, described by "SearchAndReply()".
+* Responding to a casual, non-task-oriented user message, described by "ChitChat()". Do not predict "ChitChat()" if the message contains valuable information, such as slots.
+* Handing off to a human, in case the user seems frustrated or explicitly asks to speak to one, described by "HumanHandoff()".
+
+===
+Do not fill slots with abstract values or placeholders.
+You can only fill a slot when a flow is active.
+Only use information provided by the user.
+If the user asks for two things which seem contradictory, clarify before starting a flow.
+If it's not clear whether the user wants to skip the step or to cancel the flow, cancel the flow.
+Strictly adhere to the provided action types listed above.
+Focus on the last message and take it one step at a time.
+Use the previous conversation steps only to aid understanding.
+Only predict "ChitChat()" if there is no other action to take.
+A flow can be interrupted by another flow.
+[/INST]
+===
+Here is what happened previously in the conversation:
+{{ current_conversation }}
+
+The user just said """{{ user_message }}""".
+</s>
+===
+[INST]
+Think this through step by step manner, go through the context, surfacing important information that could be useful, and first write an analysis of the last user message. Pay close attention to the descriptions of slots. Do not fill slots with abstract values before the user has mentioned or referenced the values. Do not add any unnecessary actions.
+Afterwards, write out the actions you want to take, one per line.
+[/INST]
diff --git a/domain/add_card.yml b/domain/add_card.yml
@@ -0,0 +1,5 @@
+version: "3.1"
+
+responses:
+  utter_card_added:  # uttered as the only step of the add_card flow
+    - text: "Okay, added another card."
diff --git a/domain/patterns.yml b/domain/patterns.yml
@@ -14,12 +14,11 @@ responses:
           title: Yes
         - payload: no
           title: No, please keep the previous information
-      metadata: 
+      metadata:
         rephrase: True
         template: jinja
-  
+
   utter_not_corrected_previous_input:
     - text: "Ok, I did not correct the previous input."
-      metadata: 
+      metadata:
         rephrase: True
-
diff --git a/...sions/user_asks_for_a_moment_to_think.yml → ...sions/user_asks_for_a_moment_to_think.yml b/...sions/user_asks_for_a_moment_to_think.yml → ...sions/user_asks_for_a_moment_to_think.yml
@@ -17,7 +17,7 @@ test_cases:
           - set_slot:
             - transfer_money_amount_of_money: "100"
       - utter: utter_ask_transfer_money_final_confirmation
-      - user: Yes
+      - user: "Yes"
       - commands:
           - set_slot:
             - transfer_money_final_confirmation: "True"

diff --git a/e2e_tests/flaky/happy_path/user_sets_up_recurrent_payment.yml b/e2e_tests/flaky/happy_path/user_sets_up_recurrent_payment.yml
@@ -0,0 +1,12 @@
+test_cases:
+  - test_case: user wants to set up a new recurrent payment, but specifies the type incompletely, example 3
+    steps:
+      - user: I want to set up a new recurrent payment
+      - commands:
+        - start_flow: setup_recurrent_payment
+      - utter: utter_ask_recurrent_payment_type
+      - user: stand order  # incomplete on purpose; must still map to "standing order" below
+      - commands:
+        - set_slot:
+          - recurrent_payment_type: "standing order"
+      - utter: utter_ask_recipient